xref: /illumos-gate/usr/src/cmd/bhyve/net_backends.c (revision 763f1f5f97e4c16840af2ced98915f0ed0f46616)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2019 Vincenzo Maffione <vmaffione@FreeBSD.org>
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
19  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
20  * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
21  * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
24  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
25  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  * $FreeBSD$
28  */
29 
30 /*
31  * This file implements multiple network backends (tap, netmap, ...),
32  * to be used by network frontends such as virtio-net and e1000.
33  * The API to access the backend (e.g. send/receive packets, negotiate
34  * features) is exported by net_backends.h.
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include <sys/types.h>		/* u_short etc */
41 #ifndef WITHOUT_CAPSICUM
42 #include <sys/capsicum.h>
43 #endif
44 #include <sys/ioctl.h>
45 #include <sys/mman.h>
46 #include <sys/uio.h>
47 
48 #include <net/if.h>
49 #ifdef __FreeBSD__
50 #if defined(INET6) || defined(INET)
51 #include <net/if_tap.h>
52 #endif
53 #include <net/netmap.h>
54 #include <net/netmap_virt.h>
55 #define NETMAP_WITH_LIBS
56 #include <net/netmap_user.h>
57 #endif /* __FreeBSD__ */
58 
59 #ifndef WITHOUT_CAPSICUM
60 #include <capsicum_helpers.h>
61 #endif
62 #include <err.h>
63 #include <errno.h>
64 #include <fcntl.h>
65 #include <stdio.h>
66 #include <stdlib.h>
67 #include <stdint.h>
68 #include <string.h>
69 #include <unistd.h>
70 #include <sysexits.h>
71 #include <assert.h>
72 #include <pthread.h>
73 #include <pthread_np.h>
74 #include <poll.h>
75 #include <assert.h>
76 
77 #ifdef NETGRAPH
78 #include <sys/param.h>
79 #include <sys/sysctl.h>
80 #include <netgraph.h>
81 #endif
82 
83 #ifndef __FreeBSD__
84 #include <libdlpi.h>
85 #include <net/ethernet.h>
86 #endif
87 
88 #include "config.h"
89 #include "debug.h"
90 #include "iov.h"
91 #include "mevent.h"
92 #include "net_backends.h"
93 #include "pci_emul.h"
94 
95 #include <sys/linker_set.h>
96 
97 /*
98  * Each network backend registers a set of function pointers that are
99  * used to implement the net backends API.
100  * This might need to be exposed if we implement backends in separate files.
101  */
struct net_backend {
	const char *prefix;	/* prefix matching this backend */

	/*
	 * Routines used to initialize and cleanup the resources needed
	 * by a backend. The cleanup function is used internally,
	 * and should not be called by the frontend.
	 */
	int (*init)(struct net_backend *be, const char *devname,
	    nvlist_t *nvl, net_be_rxeof_t cb, void *param);
	void (*cleanup)(struct net_backend *be);

	/*
	 * Called to serve a guest transmit request. The scatter-gather
	 * vector provided by the caller has 'iovcnt' elements and contains
	 * the packet to send.
	 */
	ssize_t (*send)(struct net_backend *be, const struct iovec *iov,
	    int iovcnt);

	/*
	 * Get the length of the next packet that can be received from
	 * the backend. If no packets are currently available, this
	 * function returns 0.
	 */
	ssize_t (*peek_recvlen)(struct net_backend *be);

	/*
	 * Called to receive a packet from the backend. When the function
	 * returns a positive value 'len', the scatter-gather vector
	 * provided by the caller contains a packet with such length.
	 * The function returns 0 if the backend doesn't have a new packet to
	 * receive.
	 */
	ssize_t (*recv)(struct net_backend *be, const struct iovec *iov,
	    int iovcnt);

	/*
	 * Ask the backend to enable or disable receive operation in the
	 * backend. On return from a disable operation, it is guaranteed
	 * that the receive callback won't be called until receive is
	 * enabled again. Note however that it is up to the caller to make
	 * sure that netbe_recv() is not currently being executed by another
	 * thread.
	 */
	void (*recv_enable)(struct net_backend *be);
	void (*recv_disable)(struct net_backend *be);

	/*
	 * Ask the backend for the virtio-net features it is able to
	 * support. Possible features are TSO, UFO and checksum offloading
	 * in both rx and tx direction and for both IPv4 and IPv6.
	 */
	uint64_t (*get_cap)(struct net_backend *be);

	/*
	 * Tell the backend to enable/disable the specified virtio-net
	 * features (capabilities).
	 */
	int (*set_cap)(struct net_backend *be, uint64_t features,
	    unsigned int vnet_hdr_len);

#ifndef __FreeBSD__
	/* illumos-only: copy the device MAC address into the buffer. */
	int (*get_mac)(struct net_backend *be, void *, size_t *);
#endif

	struct pci_vtnet_softc *sc;	/* owning virtio-net frontend */
	int fd;				/* backend file descriptor, -1 if closed */

	/*
	 * Length of the virtio-net header used by the backend and the
	 * frontend, respectively. A zero value means that the header
	 * is not used.
	 */
	unsigned int be_vnet_hdr_len;
	unsigned int fe_vnet_hdr_len;

	/* Size of backend-specific private data. */
	size_t priv_size;

	/* Backend-specific private data follows. */
};
184 
/* Pointer to the backend-specific private area appended after the struct. */
#define	NET_BE_PRIV(be)		((void *)((be) + 1))
/* Total allocation size of a backend: base struct plus private area. */
#define	NET_BE_SIZE(be)		(sizeof(*be) + (be)->priv_size)

SET_DECLARE(net_backend_set, struct net_backend);

/* Length of the virtio-net header exchanged with the frontend. */
#define VNET_HDR_LEN	sizeof(struct virtio_net_rxhdr)

/* Warning printf; 'params' must be a fully parenthesized argument list. */
#define WPRINTF(params) PRINTLN params
193 
194 #ifdef __FreeBSD__
195 
196 /*
197  * The tap backend
198  */
199 
#if defined(INET6) || defined(INET)
/*
 * Protocol families to try, in order, when opening the control socket
 * used to bring the tap interface UP (see tap_init()).
 */
static const int pf_list[] = {
#if defined(INET6)
	PF_INET6,
#endif
#if defined(INET)
	PF_INET,
#endif
};
#endif

struct tap_priv {
	struct mevent *mevp;	/* read-event registration with the loop */
	/*
	 * A bounce buffer that allows us to implement the peek_recvlen
	 * callback. In the future we may get the same information from
	 * the kevent data.
	 */
	char bbuf[1 << 16];
	ssize_t bbuflen;	/* length of the buffered packet; 0 = empty */
};
221 
222 static void
223 tap_cleanup(struct net_backend *be)
224 {
225 	struct tap_priv *priv = NET_BE_PRIV(be);
226 
227 	if (priv->mevp) {
228 		mevent_delete(priv->mevp);
229 	}
230 	if (be->fd != -1) {
231 		close(be->fd);
232 		be->fd = -1;
233 	}
234 }
235 
/*
 * Initialize the tap backend: open /dev/<devname>, make it non-blocking,
 * try to bring the interface UP, apply Capsicum limits and register the
 * (initially disabled) read event. Returns 0 on success, -1 on failure
 * with any partial state released via tap_cleanup().
 */
static int
tap_init(struct net_backend *be, const char *devname,
    nvlist_t *nvl __unused, net_be_rxeof_t cb, void *param)
{
	struct tap_priv *priv = NET_BE_PRIV(be);
	char tbuf[80];
	int opt = 1;
#if defined(INET6) || defined(INET)
	struct ifreq ifrq;
	int s;
#endif
#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
#endif

	/* Receive is driven by the event loop, so a callback is mandatory. */
	if (cb == NULL) {
		WPRINTF(("TAP backend requires non-NULL callback"));
		return (-1);
	}

	/* Build the device path, e.g. "/dev/tap0". */
	strcpy(tbuf, "/dev/");
	strlcat(tbuf, devname, sizeof(tbuf));

	be->fd = open(tbuf, O_RDWR);
	if (be->fd == -1) {
		WPRINTF(("open of tap device %s failed", tbuf));
		goto error;
	}

	/*
	 * Set non-blocking and register for read
	 * notifications with the event loop
	 */
	if (ioctl(be->fd, FIONBIO, &opt) < 0) {
		WPRINTF(("tap device O_NONBLOCK failed"));
		goto error;
	}

#if defined(INET6) || defined(INET)
	/*
	 * Try to UP the interface rather than relying on
	 * net.link.tap.up_on_open.
	 */
	bzero(&ifrq, sizeof(ifrq));
	if (ioctl(be->fd, TAPGIFNAME, &ifrq) < 0) {
		WPRINTF(("Could not get interface name"));
		goto error;
	}

	/* Open a control socket in the first protocol family available. */
	s = -1;
	for (size_t i = 0; s == -1 && i < nitems(pf_list); i++)
		s = socket(pf_list[i], SOCK_DGRAM, 0);
	if (s == -1) {
		WPRINTF(("Could open socket"));
		goto error;
	}

	if (ioctl(s, SIOCGIFFLAGS, &ifrq) < 0) {
		(void)close(s);
		WPRINTF(("Could not get interface flags"));
		goto error;
	}
	ifrq.ifr_flags |= IFF_UP;
	if (ioctl(s, SIOCSIFFLAGS, &ifrq) < 0) {
		(void)close(s);
		WPRINTF(("Could not set interface flags"));
		goto error;
	}
	(void)close(s);
#endif

#ifndef WITHOUT_CAPSICUM
	/* Restrict the descriptor to event/read/write under Capsicum. */
	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
	if (caph_rights_limit(be->fd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	/* Start with an empty bounce buffer (see tap_peek_recvlen()). */
	memset(priv->bbuf, 0, sizeof(priv->bbuf));
	priv->bbuflen = 0;

	/* Rx stays disabled until the frontend calls recv_enable. */
	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
	if (priv->mevp == NULL) {
		WPRINTF(("Could not register event"));
		goto error;
	}

	return (0);

error:
	tap_cleanup(be);
	return (-1);
}
328 
329 /*
330  * Called to send a buffer chain out to the tap device
331  */
332 static ssize_t
333 tap_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
334 {
335 	return (writev(be->fd, iov, iovcnt));
336 }
337 
338 static ssize_t
339 tap_peek_recvlen(struct net_backend *be)
340 {
341 	struct tap_priv *priv = NET_BE_PRIV(be);
342 	ssize_t ret;
343 
344 	if (priv->bbuflen > 0) {
345 		/*
346 		 * We already have a packet in the bounce buffer.
347 		 * Just return its length.
348 		 */
349 		return priv->bbuflen;
350 	}
351 
352 	/*
353 	 * Read the next packet (if any) into the bounce buffer, so
354 	 * that we get to know its length and we can return that
355 	 * to the caller.
356 	 */
357 	ret = read(be->fd, priv->bbuf, sizeof(priv->bbuf));
358 	if (ret < 0 && errno == EWOULDBLOCK) {
359 		return (0);
360 	}
361 
362 	if (ret > 0)
363 		priv->bbuflen = ret;
364 
365 	return (ret);
366 }
367 
368 static ssize_t
369 tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
370 {
371 	struct tap_priv *priv = NET_BE_PRIV(be);
372 	ssize_t ret;
373 
374 	if (priv->bbuflen > 0) {
375 		/*
376 		 * A packet is available in the bounce buffer, so
377 		 * we read it from there.
378 		 */
379 		ret = buf_to_iov(priv->bbuf, priv->bbuflen,
380 		    iov, iovcnt, 0);
381 
382 		/* Mark the bounce buffer as empty. */
383 		priv->bbuflen = 0;
384 
385 		return (ret);
386 	}
387 
388 	ret = readv(be->fd, iov, iovcnt);
389 	if (ret < 0 && errno == EWOULDBLOCK) {
390 		return (0);
391 	}
392 
393 	return (ret);
394 }
395 
396 static void
397 tap_recv_enable(struct net_backend *be)
398 {
399 	struct tap_priv *priv = NET_BE_PRIV(be);
400 
401 	mevent_enable(priv->mevp);
402 }
403 
404 static void
405 tap_recv_disable(struct net_backend *be)
406 {
407 	struct tap_priv *priv = NET_BE_PRIV(be);
408 
409 	mevent_disable(priv->mevp);
410 }
411 
412 static uint64_t
413 tap_get_cap(struct net_backend *be __unused)
414 {
415 
416 	return (0); /* no capabilities for now */
417 }
418 
419 static int
420 tap_set_cap(struct net_backend *be __unused, uint64_t features,
421     unsigned vnet_hdr_len)
422 {
423 
424 	return ((features || vnet_hdr_len) ? -1 : 0);
425 }
426 
/* The tap backend descriptor; matched against the "tap" device prefix. */
static struct net_backend tap_backend = {
	.prefix = "tap",
	.priv_size = sizeof(struct tap_priv),
	.init = tap_init,
	.cleanup = tap_cleanup,
	.send = tap_send,
	.peek_recvlen = tap_peek_recvlen,
	.recv = tap_recv,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,
};

/* A clone of the tap backend, with a different prefix. */
static struct net_backend vmnet_backend = {
	.prefix = "vmnet",
	.priv_size = sizeof(struct tap_priv),
	.init = tap_init,
	.cleanup = tap_cleanup,
	.send = tap_send,
	.peek_recvlen = tap_peek_recvlen,
	.recv = tap_recv,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,
};

/* Register both descriptors in the net_backend_set linker set. */
DATA_SET(net_backend_set, tap_backend);
DATA_SET(net_backend_set, vmnet_backend);
458 
459 #ifdef NETGRAPH
460 
461 /*
462  * Netgraph backend
463  */
464 
465 #define NG_SBUF_MAX_SIZE (4 * 1024 * 1024)
466 
/*
 * Initialize the netgraph backend: create an ng_socket(4) node, connect
 * its hook to the configured peer, make the data socket non-blocking,
 * enlarge the socket buffers, apply Capsicum limits and register the
 * (initially disabled) read event. Returns 0 on success, -1 on failure
 * with partial state released via tap_cleanup() (the netgraph backend
 * reuses the tap priv layout and I/O callbacks).
 */
static int
ng_init(struct net_backend *be, const char *devname __unused,
	 nvlist_t *nvl, net_be_rxeof_t cb, void *param)
{
	struct tap_priv *p = NET_BE_PRIV(be);
	struct ngm_connect ngc;
	const char *value, *nodename;
	int sbsz;
	int ctrl_sock;
	int flags;
	unsigned long maxsbsz;
	size_t msbsz;
#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
#endif

	/* Receive is driven by the event loop, so a callback is mandatory. */
	if (cb == NULL) {
		WPRINTF(("Netgraph backend requires non-NULL callback"));
		return (-1);
	}

	be->fd = -1;

	memset(&ngc, 0, sizeof(ngc));

	/* "path" (destination node) is required. */
	value = get_config_value_node(nvl, "path");
	if (value == NULL) {
		WPRINTF(("path must be provided"));
		return (-1);
	}
	strncpy(ngc.path, value, NG_PATHSIZ - 1);

	/* Our hook name defaults to "vmlink". */
	value = get_config_value_node(nvl, "hook");
	if (value == NULL)
		value = "vmlink";
	strncpy(ngc.ourhook, value, NG_HOOKSIZ - 1);

	/* The peer hook is required. */
	value = get_config_value_node(nvl, "peerhook");
	if (value == NULL) {
		WPRINTF(("peer hook must be provided"));
		return (-1);
	}
	strncpy(ngc.peerhook, value, NG_HOOKSIZ - 1);

	/* Create the socket node; be->fd becomes the data socket. */
	nodename = get_config_value_node(nvl, "socket");
	if (NgMkSockNode(nodename,
		&ctrl_sock, &be->fd) < 0) {
		WPRINTF(("can't get Netgraph sockets"));
		return (-1);
	}

	/* Wire our hook to the peer; the control socket is only needed here. */
	if (NgSendMsg(ctrl_sock, ".",
		NGM_GENERIC_COOKIE,
		NGM_CONNECT, &ngc, sizeof(ngc)) < 0) {
		WPRINTF(("can't connect to node"));
		close(ctrl_sock);
		goto error;
	}

	close(ctrl_sock);

	flags = fcntl(be->fd, F_GETFL);

	if (flags < 0) {
		WPRINTF(("can't get socket flags"));
		goto error;
	}

	if (fcntl(be->fd, F_SETFL, flags | O_NONBLOCK) < 0) {
		WPRINTF(("can't set O_NONBLOCK flag"));
		goto error;
	}

	/*
	 * The default ng_socket(4) buffer's size is too low.
	 * Calculate the minimum value between NG_SBUF_MAX_SIZE
	 * and kern.ipc.maxsockbuf.
	 */
	msbsz = sizeof(maxsbsz);
	if (sysctlbyname("kern.ipc.maxsockbuf", &maxsbsz, &msbsz,
		NULL, 0) < 0) {
		WPRINTF(("can't get 'kern.ipc.maxsockbuf' value"));
		goto error;
	}

	/*
	 * We can't set the socket buffer size to kern.ipc.maxsockbuf value,
	 * as it takes into account the mbuf(9) overhead.
	 */
	maxsbsz = maxsbsz * MCLBYTES / (MSIZE + MCLBYTES);

	sbsz = MIN(NG_SBUF_MAX_SIZE, maxsbsz);

	if (setsockopt(be->fd, SOL_SOCKET, SO_SNDBUF, &sbsz,
		sizeof(sbsz)) < 0) {
		WPRINTF(("can't set TX buffer size"));
		goto error;
	}

	if (setsockopt(be->fd, SOL_SOCKET, SO_RCVBUF, &sbsz,
		sizeof(sbsz)) < 0) {
		WPRINTF(("can't set RX buffer size"));
		goto error;
	}

#ifndef WITHOUT_CAPSICUM
	/* Restrict the data socket to event/read/write under Capsicum. */
	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
	if (caph_rights_limit(be->fd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	/* Start with an empty bounce buffer (see tap_peek_recvlen()). */
	memset(p->bbuf, 0, sizeof(p->bbuf));
	p->bbuflen = 0;

	/* Rx stays disabled until the frontend calls recv_enable. */
	p->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
	if (p->mevp == NULL) {
		WPRINTF(("Could not register event"));
		goto error;
	}

	return (0);

error:
	tap_cleanup(be);
	return (-1);
}
593 
/*
 * The netgraph backend descriptor. Only init differs from tap: the data
 * path goes through the same read/write callbacks, reusing tap_priv.
 */
static struct net_backend ng_backend = {
	.prefix = "netgraph",
	.priv_size = sizeof(struct tap_priv),
	.init = ng_init,
	.cleanup = tap_cleanup,
	.send = tap_send,
	.peek_recvlen = tap_peek_recvlen,
	.recv = tap_recv,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,
};

DATA_SET(net_backend_set, ng_backend);
609 
610 #endif /* NETGRAPH */
611 
612 /*
613  * The netmap backend
614  */
615 
/* The virtio-net features supported by netmap. */
#define NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \
		VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \
		VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 | \
		VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO)

struct netmap_priv {
	char ifname[IFNAMSIZ];		/* port name passed to nm_open() */
	struct nm_desc *nmd;		/* descriptor returned by nm_open() */
	uint16_t memid;			/* netmap memory id (req.nr_arg2) */
	struct netmap_ring *rx;		/* first RX ring of the port */
	struct netmap_ring *tx;		/* first TX ring of the port */
	struct mevent *mevp;		/* read-event registration */
	net_be_rxeof_t cb;		/* frontend receive callback */
	void *cb_param;			/* opaque argument for cb */
};
632 
633 static void
634 nmreq_init(struct nmreq *req, char *ifname)
635 {
636 
637 	memset(req, 0, sizeof(*req));
638 	strlcpy(req->nr_name, ifname, sizeof(req->nr_name));
639 	req->nr_version = NETMAP_API;
640 }
641 
642 static int
643 netmap_set_vnet_hdr_len(struct net_backend *be, int vnet_hdr_len)
644 {
645 	int err;
646 	struct nmreq req;
647 	struct netmap_priv *priv = NET_BE_PRIV(be);
648 
649 	nmreq_init(&req, priv->ifname);
650 	req.nr_cmd = NETMAP_BDG_VNET_HDR;
651 	req.nr_arg1 = vnet_hdr_len;
652 	err = ioctl(be->fd, NIOCREGIF, &req);
653 	if (err) {
654 		WPRINTF(("Unable to set vnet header length %d",
655 				vnet_hdr_len));
656 		return (err);
657 	}
658 
659 	be->be_vnet_hdr_len = vnet_hdr_len;
660 
661 	return (0);
662 }
663 
/*
 * Check whether the port accepts the given vnet header length, by
 * probing with netmap_set_vnet_hdr_len() and rolling back on success.
 * Returns 1 if supported, 0 otherwise.
 */
static int
netmap_has_vnet_hdr_len(struct net_backend *be, unsigned vnet_hdr_len)
{
	unsigned prev_hdr_len = be->be_vnet_hdr_len;
	int ret;

	/* The requested length is already in effect. */
	if (vnet_hdr_len == prev_hdr_len) {
		return (1);
	}

	/* Probe by trying to switch to the requested length. */
	ret = netmap_set_vnet_hdr_len(be, vnet_hdr_len);
	if (ret) {
		return (0);
	}

	/* Supported: restore the previous length before reporting so. */
	netmap_set_vnet_hdr_len(be, prev_hdr_len);

	return (1);
}
683 
684 static uint64_t
685 netmap_get_cap(struct net_backend *be)
686 {
687 
688 	return (netmap_has_vnet_hdr_len(be, VNET_HDR_LEN) ?
689 	    NETMAP_FEATURES : 0);
690 }
691 
692 static int
693 netmap_set_cap(struct net_backend *be, uint64_t features __unused,
694     unsigned vnet_hdr_len)
695 {
696 
697 	return (netmap_set_vnet_hdr_len(be, vnet_hdr_len));
698 }
699 
/*
 * Initialize the netmap backend: open the port, cache the first TX/RX
 * rings and register the (initially disabled) read event.
 * Returns 0 on success, -1 on failure.
 */
static int
netmap_init(struct net_backend *be, const char *devname,
    nvlist_t *nvl __unused, net_be_rxeof_t cb, void *param)
{
	struct netmap_priv *priv = NET_BE_PRIV(be);

	strlcpy(priv->ifname, devname, sizeof(priv->ifname));
	priv->ifname[sizeof(priv->ifname) - 1] = '\0';

	/* NETMAP_NO_TX_POLL: we sync TX explicitly in netmap_send(). */
	priv->nmd = nm_open(priv->ifname, NULL, NETMAP_NO_TX_POLL, NULL);
	if (priv->nmd == NULL) {
		WPRINTF(("Unable to nm_open(): interface '%s', errno (%s)",
			devname, strerror(errno)));
		return (-1);
	}

	priv->memid = priv->nmd->req.nr_arg2;
	priv->tx = NETMAP_TXRING(priv->nmd->nifp, 0);
	priv->rx = NETMAP_RXRING(priv->nmd->nifp, 0);
	priv->cb = cb;
	priv->cb_param = param;
	be->fd = priv->nmd->fd;

	/* Rx stays disabled until the frontend calls recv_enable. */
	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
	if (priv->mevp == NULL) {
		WPRINTF(("Could not register event"));
		return (-1);
	}

	return (0);
}
731 
732 static void
733 netmap_cleanup(struct net_backend *be)
734 {
735 	struct netmap_priv *priv = NET_BE_PRIV(be);
736 
737 	if (priv->mevp) {
738 		mevent_delete(priv->mevp);
739 	}
740 	if (priv->nmd) {
741 		nm_close(priv->nmd);
742 	}
743 	be->fd = -1;
744 }
745 
/*
 * Transmit a guest packet through the netmap TX ring, spreading the
 * iovec fragments over as many slots as needed (chained with
 * NS_MOREFRAG). On ring exhaustion the packet is dropped, but the
 * accumulated length is still returned after a TX sync. Returns the
 * total number of bytes taken from the iovec.
 */
static ssize_t
netmap_send(struct net_backend *be, const struct iovec *iov,
	    int iovcnt)
{
	struct netmap_priv *priv = NET_BE_PRIV(be);
	struct netmap_ring *ring;
	ssize_t totlen = 0;
	int nm_buf_size;
	int nm_buf_len;
	uint32_t head;
	uint8_t *nm_buf;
	int j;

	ring = priv->tx;
	head = ring->head;
	if (head == ring->tail) {
		/* Ring is full: drop the packet. */
		WPRINTF(("No space, drop %zu bytes", count_iov(iov, iovcnt)));
		goto txsync;
	}
	nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
	nm_buf_size = ring->nr_buf_size;
	nm_buf_len = 0;

	for (j = 0; j < iovcnt; j++) {
		uint8_t *iov_frag_buf = iov[j].iov_base;
		int iov_frag_size = iov[j].iov_len;

		totlen += iov_frag_size;

		/*
		 * Split each iovec fragment over more netmap slots, if
		 * necessary.
		 */
		for (;;) {
			int copylen;

			copylen = iov_frag_size < nm_buf_size ? iov_frag_size : nm_buf_size;
			memcpy(nm_buf, iov_frag_buf, copylen);

			iov_frag_buf += copylen;
			iov_frag_size -= copylen;
			nm_buf += copylen;
			nm_buf_size -= copylen;
			nm_buf_len += copylen;

			if (iov_frag_size == 0) {
				/* Fragment consumed; move to the next one. */
				break;
			}

			/* Current slot is full: chain another one. */
			ring->slot[head].len = nm_buf_len;
			ring->slot[head].flags = NS_MOREFRAG;
			head = nm_ring_next(ring, head);
			if (head == ring->tail) {
				/*
				 * We ran out of netmap slots while
				 * splitting the iovec fragments.
				 */
				WPRINTF(("No space, drop %zu bytes",
				   count_iov(iov, iovcnt)));
				goto txsync;
			}
			nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
			nm_buf_size = ring->nr_buf_size;
			nm_buf_len = 0;
		}
	}

	/* Complete the last slot, which must not have NS_MOREFRAG set. */
	ring->slot[head].len = nm_buf_len;
	ring->slot[head].flags = 0;
	head = nm_ring_next(ring, head);

	/* Now update ring->head and ring->cur. */
	ring->head = ring->cur = head;
txsync:
	ioctl(be->fd, NIOCTXSYNC, NULL);

	return (totlen);
}
825 
826 static ssize_t
827 netmap_peek_recvlen(struct net_backend *be)
828 {
829 	struct netmap_priv *priv = NET_BE_PRIV(be);
830 	struct netmap_ring *ring = priv->rx;
831 	uint32_t head = ring->head;
832 	ssize_t totlen = 0;
833 
834 	while (head != ring->tail) {
835 		struct netmap_slot *slot = ring->slot + head;
836 
837 		totlen += slot->len;
838 		if ((slot->flags & NS_MOREFRAG) == 0)
839 			break;
840 		head = nm_ring_next(ring, head);
841 	}
842 
843 	return (totlen);
844 }
845 
/*
 * Copy the next packet from the RX ring into the caller's iovec,
 * following the NS_MOREFRAG chain across slots. Slots are released
 * back to netmap only after the whole packet has been copied.
 * Returns the packet length, 0 if the ring is empty, or -ENOSPC if
 * the iovec is too small to hold the packet.
 */
static ssize_t
netmap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
{
	struct netmap_priv *priv = NET_BE_PRIV(be);
	struct netmap_slot *slot = NULL;
	struct netmap_ring *ring;
	uint8_t *iov_frag_buf;
	int iov_frag_size;
	ssize_t totlen = 0;
	uint32_t head;

	assert(iovcnt);

	ring = priv->rx;
	head = ring->head;
	iov_frag_buf = iov->iov_base;
	iov_frag_size = iov->iov_len;

	do {
		uint8_t *nm_buf;
		int nm_buf_len;

		if (head == ring->tail) {
			/* Ring empty: nothing to receive. */
			return (0);
		}

		slot = ring->slot + head;
		nm_buf = NETMAP_BUF(ring, slot->buf_idx);
		nm_buf_len = slot->len;

		/* Copy this slot into as many iovec fragments as needed. */
		for (;;) {
			int copylen = nm_buf_len < iov_frag_size ?
			    nm_buf_len : iov_frag_size;

			memcpy(iov_frag_buf, nm_buf, copylen);
			nm_buf += copylen;
			nm_buf_len -= copylen;
			iov_frag_buf += copylen;
			iov_frag_size -= copylen;
			totlen += copylen;

			if (nm_buf_len == 0) {
				break;
			}

			iov++;
			iovcnt--;
			if (iovcnt == 0) {
				/* No space to receive. */
				WPRINTF(("Short iov, drop %zd bytes",
				    totlen));
				return (-ENOSPC);
			}
			iov_frag_buf = iov->iov_base;
			iov_frag_size = iov->iov_len;
		}

		head = nm_ring_next(ring, head);

	} while (slot->flags & NS_MOREFRAG);

	/* Release slots to netmap. */
	ring->head = ring->cur = head;

	return (totlen);
}
912 
913 static void
914 netmap_recv_enable(struct net_backend *be)
915 {
916 	struct netmap_priv *priv = NET_BE_PRIV(be);
917 
918 	mevent_enable(priv->mevp);
919 }
920 
921 static void
922 netmap_recv_disable(struct net_backend *be)
923 {
924 	struct netmap_priv *priv = NET_BE_PRIV(be);
925 
926 	mevent_disable(priv->mevp);
927 }
928 
/* The netmap backend descriptor; matched against the "netmap" prefix. */
static struct net_backend netmap_backend = {
	.prefix = "netmap",
	.priv_size = sizeof(struct netmap_priv),
	.init = netmap_init,
	.cleanup = netmap_cleanup,
	.send = netmap_send,
	.peek_recvlen = netmap_peek_recvlen,
	.recv = netmap_recv,
	.recv_enable = netmap_recv_enable,
	.recv_disable = netmap_recv_disable,
	.get_cap = netmap_get_cap,
	.set_cap = netmap_set_cap,
};

/* A clone of the netmap backend, with a different prefix. */
static struct net_backend vale_backend = {
	.prefix = "vale",
	.priv_size = sizeof(struct netmap_priv),
	.init = netmap_init,
	.cleanup = netmap_cleanup,
	.send = netmap_send,
	.peek_recvlen = netmap_peek_recvlen,
	.recv = netmap_recv,
	.recv_enable = netmap_recv_enable,
	.recv_disable = netmap_recv_disable,
	.get_cap = netmap_get_cap,
	.set_cap = netmap_set_cap,
};

/* Register both descriptors in the net_backend_set linker set. */
DATA_SET(net_backend_set, netmap_backend);
DATA_SET(net_backend_set, vale_backend);
960 
961 #else /* __FreeBSD__ */
962 
963 /*
964  * The illumos dlpi backend
965  */
966 
967 /*
968  * The size of the bounce buffer used to implement the peek callback.
969  * This value should be big enough to accommodate the largest of all possible
970  * frontend packet lengths. The value here matches the definition of
971  * VTNET_MAX_PKT_LEN in pci_virtio_net.c
972  */
#define	DLPI_BBUF_SIZE (65536 + 64)

typedef struct be_dlpi_priv {
	dlpi_handle_t bdp_dhp;		/* handle returned by dlpi_open() */
	struct mevent *bdp_mevp;	/* read-event registration */
	/*
	 * A bounce buffer that allows us to implement the peek_recvlen
	 * callback. Each structure is only used by a single thread so
	 * one is enough.
	 */
	uint8_t bdp_bbuf[DLPI_BBUF_SIZE];
	ssize_t bdp_bbuflen;		/* buffered packet length; 0 = empty */
} be_dlpi_priv_t;
986 
987 static void
988 be_dlpi_cleanup(net_backend_t *be)
989 {
990 	be_dlpi_priv_t *priv = NET_BE_PRIV(be);
991 
992 	if (priv->bdp_dhp != NULL)
993 		dlpi_close(priv->bdp_dhp);
994 	priv->bdp_dhp = NULL;
995 
996 	if (priv->bdp_mevp != NULL)
997 		mevent_delete(priv->bdp_mevp);
998 	priv->bdp_mevp = NULL;
999 
1000 	priv->bdp_bbuflen = 0;
1001 	be->fd = -1;
1002 }
1003 
/* Log a libdlpi failure for 'dev', decoding 'ret' via dlpi_strerror(). */
static void
be_dlpi_err(int ret, const char *dev, char *msg)
{
	WPRINTF(("%s: %s (%s)", dev, msg, dlpi_strerror(ret)));
}
1009 
1010 static int
1011 be_dlpi_init(net_backend_t *be, const char *devname __unused,
1012      nvlist_t *nvl, net_be_rxeof_t cb, void *param)
1013 {
1014 	be_dlpi_priv_t *priv = NET_BE_PRIV(be);
1015 	const char *vnic;
1016 	int ret;
1017 
1018 	if (cb == NULL) {
1019 		WPRINTF(("dlpi backend requires non-NULL callback"));
1020 		return (-1);
1021 	}
1022 
1023 	vnic = get_config_value_node(nvl, "vnic");
1024 	if (vnic == NULL) {
1025 		WPRINTF(("dlpi backend requires a VNIC"));
1026 		return (-1);
1027 	}
1028 
1029 	priv->bdp_bbuflen = 0;
1030 
1031 	ret = dlpi_open(vnic, &priv->bdp_dhp, DLPI_RAW);
1032 
1033 	if (ret != DLPI_SUCCESS) {
1034 		be_dlpi_err(ret, vnic, "open failed");
1035 		goto error;
1036 	}
1037 
1038 	if ((ret = dlpi_bind(priv->bdp_dhp, DLPI_ANY_SAP, NULL)) !=
1039 	    DLPI_SUCCESS) {
1040 		be_dlpi_err(ret, vnic, "bind failed");
1041 		goto error;
1042 	}
1043 
1044 	if (get_config_bool_node_default(nvl, "promiscrxonly", true)) {
1045 		if ((ret = dlpi_promiscon(priv->bdp_dhp, DL_PROMISC_RX_ONLY)) !=
1046 		    DLPI_SUCCESS) {
1047 			be_dlpi_err(ret, vnic,
1048 			    "enable promiscuous mode(rxonly) failed");
1049 			goto error;
1050 		}
1051 	}
1052 	if (get_config_bool_node_default(nvl, "promiscphys", false)) {
1053 		if ((ret = dlpi_promiscon(priv->bdp_dhp, DL_PROMISC_PHYS)) !=
1054 		    DLPI_SUCCESS) {
1055 			be_dlpi_err(ret, vnic,
1056 			    "enable promiscuous mode(physical) failed");
1057 			goto error;
1058 		}
1059 	}
1060 	if (get_config_bool_node_default(nvl, "promiscsap", true)) {
1061 		if ((ret = dlpi_promiscon(priv->bdp_dhp, DL_PROMISC_SAP)) !=
1062 		    DLPI_SUCCESS) {
1063 			be_dlpi_err(ret, vnic,
1064 			    "enable promiscuous mode(SAP) failed");
1065 			goto error;
1066 		}
1067 	}
1068 	if (get_config_bool_node_default(nvl, "promiscmulti", true)) {
1069 		if ((ret = dlpi_promiscon(priv->bdp_dhp, DL_PROMISC_MULTI)) !=
1070 		    DLPI_SUCCESS) {
1071 			be_dlpi_err(ret, vnic,
1072 			    "enable promiscuous mode(muticast) failed");
1073 			goto error;
1074 		}
1075 	}
1076 
1077         be->fd = dlpi_fd(priv->bdp_dhp);
1078 
1079         if (fcntl(be->fd, F_SETFL, O_NONBLOCK) < 0) {
1080                 WPRINTF(("%s: enable O_NONBLOCK failed", vnic));
1081 		goto error;
1082         }
1083 
1084 	priv->bdp_mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
1085 	if (priv->bdp_mevp == NULL) {
1086 		WPRINTF(("Could not register event"));
1087 		goto error;
1088 	}
1089 
1090 	return (0);
1091 
1092 error:
1093 	be_dlpi_cleanup(be);
1094 	return (-1);
1095 }
1096 
1097 /*
1098  * Called to send a buffer chain out to the dlpi device
1099  */
1100 static ssize_t
1101 be_dlpi_send(net_backend_t *be, const struct iovec *iov, int iovcnt)
1102 {
1103 	be_dlpi_priv_t *priv = NET_BE_PRIV(be);
1104 	ssize_t len = 0;
1105 	int ret;
1106 
1107 	if (iovcnt == 1) {
1108 		len = iov[0].iov_len;
1109 		ret = dlpi_send(priv->bdp_dhp, NULL, 0, iov[0].iov_base, len,
1110 		    NULL);
1111 	} else {
1112 		void *buf = NULL;
1113 
1114 		len = iov_to_buf(iov, iovcnt, &buf);
1115 
1116 		if (len <= 0 || buf == NULL)
1117 			return (-1);
1118 
1119 		ret = dlpi_send(priv->bdp_dhp, NULL, 0, buf, len, NULL);
1120 		free(buf);
1121 	}
1122 
1123 	if (ret != DLPI_SUCCESS)
1124 		return (-1);
1125 
1126 	return (len);
1127 }
1128 
1129 static ssize_t
1130 be_dlpi_peek_recvlen(net_backend_t *be)
1131 {
1132 	be_dlpi_priv_t *priv = NET_BE_PRIV(be);
1133 	dlpi_recvinfo_t recv;
1134 	size_t len;
1135 	int ret;
1136 
1137 	/*
1138 	 * We already have a packet in the bounce buffer.
1139 	 * Just return its length.
1140 	 */
1141 	if (priv->bdp_bbuflen > 0)
1142 		return (priv->bdp_bbuflen);
1143 
1144 	/*
1145 	 * Read the next packet (if any) into the bounce buffer, so
1146 	 * that we get to know its length and we can return that
1147 	 * to the caller.
1148 	 */
1149 	len = sizeof (priv->bdp_bbuf);
1150 	ret = dlpi_recv(priv->bdp_dhp, NULL, NULL, priv->bdp_bbuf, &len,
1151 	    0, &recv);
1152 	if (ret == DL_SYSERR) {
1153 		if (errno == EWOULDBLOCK)
1154 			return (0);
1155 		return (-1);
1156 	} else if (ret == DLPI_ETIMEDOUT) {
1157 		return (0);
1158 	} else if (ret != DLPI_SUCCESS) {
1159 		return (-1);
1160 	}
1161 
1162 	if (recv.dri_totmsglen > sizeof (priv->bdp_bbuf)) {
1163 		EPRINTLN("DLPI bounce buffer was too small! - needed %x bytes",
1164 		    recv.dri_totmsglen);
1165 	}
1166 
1167 	priv->bdp_bbuflen = len;
1168 
1169 	return (len);
1170 }
1171 
/*
 * Copy the next packet into the caller's iovec, serving it from the
 * bounce buffer when a peek already consumed it from the device.
 * Returns the packet length, 0 if none is pending, -1 on error.
 */
static ssize_t
be_dlpi_recv(net_backend_t *be, const struct iovec *iov, int iovcnt)
{
	be_dlpi_priv_t *priv = NET_BE_PRIV(be);
	size_t len;
	int ret;

	if (priv->bdp_bbuflen > 0) {
		/*
		 * A packet is available in the bounce buffer, so
		 * we read it from there.
		 */
		len = buf_to_iov(priv->bdp_bbuf, priv->bdp_bbuflen,
		    iov, iovcnt, 0);

		/* Mark the bounce buffer as empty. */
		priv->bdp_bbuflen = 0;

		return (len);
	}

	/*
	 * NOTE(review): only the first iovec entry is filled here;
	 * callers presumably pass a first fragment large enough for a
	 * whole packet — confirm against the frontend.
	 */
	len = iov[0].iov_len;
	ret = dlpi_recv(priv->bdp_dhp, NULL, NULL,
	    (uint8_t *)iov[0].iov_base, &len, 0, NULL);
	if (ret == DL_SYSERR) {
		if (errno == EWOULDBLOCK)
			return (0);
		return (-1);
	} else if (ret == DLPI_ETIMEDOUT) {
		return (0);
	} else if (ret != DLPI_SUCCESS) {
		return (-1);
	}

	return (len);
}
1208 
1209 static void
1210 be_dlpi_recv_enable(net_backend_t *be)
1211 {
1212 	be_dlpi_priv_t *priv = NET_BE_PRIV(be);
1213 
1214 	mevent_enable(priv->bdp_mevp);
1215 }
1216 
1217 static void
1218 be_dlpi_recv_disable(net_backend_t *be)
1219 {
1220 	be_dlpi_priv_t *priv = NET_BE_PRIV(be);
1221 
1222 	mevent_disable(priv->bdp_mevp);
1223 }
1224 
1225 static uint64_t
1226 be_dlpi_get_cap(net_backend_t *be)
1227 {
1228 	return (0); /* no capabilities for now */
1229 }
1230 
1231 static int
1232 be_dlpi_set_cap(net_backend_t *be, uint64_t features,
1233     unsigned vnet_hdr_len)
1234 {
1235 	return ((features || vnet_hdr_len) ? -1 : 0);
1236 }
1237 
1238 static int
1239 be_dlpi_get_mac(net_backend_t *be, void *buf, size_t *buflen)
1240 {
1241 	be_dlpi_priv_t *priv = NET_BE_PRIV(be);
1242 	uchar_t physaddr[DLPI_PHYSADDR_MAX];
1243 	size_t physaddrlen = DLPI_PHYSADDR_MAX;
1244 	int ret;
1245 
1246 	if ((ret = dlpi_get_physaddr(priv->bdp_dhp, DL_CURR_PHYS_ADDR,
1247 	    physaddr, &physaddrlen)) != DLPI_SUCCESS) {
1248 		be_dlpi_err(ret, dlpi_linkname(priv->bdp_dhp),
1249 		    "read MAC address failed");
1250 		return (EINVAL);
1251 	}
1252 
1253 	if (physaddrlen != ETHERADDRL) {
1254 		WPRINTF(("%s: bad MAC address len %d",
1255 		    dlpi_linkname(priv->bdp_dhp), physaddrlen));
1256 		return (EINVAL);
1257 	}
1258 
1259 	if (physaddrlen > *buflen) {
1260 		WPRINTF(("%s: MAC address too long (%d bytes required)",
1261 		    dlpi_linkname(priv->bdp_dhp), physaddrlen));
1262 		return (ENOMEM);
1263 	}
1264 
1265 	*buflen = physaddrlen;
1266 	memcpy(buf, physaddr, *buflen);
1267 
1268 	return (0);
1269 }
1270 
/*
 * Operation table for the illumos DLPI backend.  It is registered in
 * the net_backend_set linker set below, so netbe_init() can locate it
 * by matching the "dlpi" prefix of the backend name.
 */
static struct net_backend dlpi_backend = {
	.prefix = "dlpi",
	.priv_size = sizeof(struct be_dlpi_priv),
	.init = be_dlpi_init,
	.cleanup = be_dlpi_cleanup,
	.send = be_dlpi_send,
	.peek_recvlen = be_dlpi_peek_recvlen,
	.recv = be_dlpi_recv,
	.recv_enable = be_dlpi_recv_enable,
	.recv_disable = be_dlpi_recv_disable,
	.get_cap = be_dlpi_get_cap,
	.set_cap = be_dlpi_set_cap,
	.get_mac = be_dlpi_get_mac,
};

DATA_SET(net_backend_set, dlpi_backend);
1287 
1288 #endif /* __FreeBSD__ */
1289 
1290 #ifdef __FreeBSD__
1291 int
1292 netbe_legacy_config(nvlist_t *nvl, const char *opts)
1293 {
1294 	char *backend, *cp;
1295 
1296 	if (opts == NULL)
1297 		return (0);
1298 
1299 	cp = strchr(opts, ',');
1300 	if (cp == NULL) {
1301 		set_config_value_node(nvl, "backend", opts);
1302 		return (0);
1303 	}
1304 	backend = strndup(opts, cp - opts);
1305 	set_config_value_node(nvl, "backend", backend);
1306 	free(backend);
1307 	return (pci_parse_legacy_config(nvl, cp + 1));
1308 }
1309 #else
1310 int
1311 netbe_legacy_config(nvlist_t *nvl, const char *opts)
1312 {
1313 	char *config, *name, *tofree, *value;
1314 
1315 	if (opts == NULL)
1316 		return (0);
1317 
1318 	/* Default to the 'dlpi' backend - can still be overridden by opts */
1319 	set_config_value_node(nvl, "backend", "dlpi");
1320 	set_config_value_node(nvl, "type", "dlpi");
1321 
1322 	config = tofree = strdup(opts);
1323 	if (config == NULL)
1324 		err(4, "netbe_legacy_config strdup()");
1325 	while ((name = strsep(&config, ",")) != NULL) {
1326 		value = strchr(name, '=');
1327 		if (value != NULL) {
1328 			*value++ = '\0';
1329 			set_config_value_node(nvl, name, value);
1330 		} else {
1331 			set_config_value_node(nvl, "vnic", name);
1332 		}
1333 	}
1334 	free(tofree);
1335 
1336 	return (0);
1337 }
1338 #endif
1339 
1340 /*
1341  * Initialize a backend and attach to the frontend.
1342  * This is called during frontend initialization.
1343  *  @ret is a pointer to the backend to be initialized
1344  *  @devname is the backend-name as supplied on the command line,
1345  * 	e.g. -s 2:0,frontend-name,backend-name[,other-args]
1346  *  @cb is the receive callback supplied by the frontend,
1347  *	and it is invoked in the event loop when a receive
1348  *	event is generated in the hypervisor,
1349  *  @param is a pointer to the frontend, and normally used as
1350  *	the argument for the callback.
1351  */
1352 int
1353 netbe_init(struct net_backend **ret, nvlist_t *nvl, net_be_rxeof_t cb,
1354     void *param)
1355 {
1356 	struct net_backend **pbe, *nbe, *tbe = NULL;
1357 	const char *value, *type;
1358 	char *devname;
1359 	int err;
1360 
1361 	value = get_config_value_node(nvl, "backend");
1362 	if (value == NULL) {
1363 		return (-1);
1364 	}
1365 	devname = strdup(value);
1366 
1367 	/*
1368 	 * Use the type given by configuration if exists; otherwise
1369 	 * use the prefix of the backend as the type.
1370 	 */
1371 	type = get_config_value_node(nvl, "type");
1372 	if (type == NULL)
1373 		type = devname;
1374 
1375 	/*
1376 	 * Find the network backend that matches the user-provided
1377 	 * device name. net_backend_set is built using a linker set.
1378 	 */
1379 	SET_FOREACH(pbe, net_backend_set) {
1380 		if (strncmp(type, (*pbe)->prefix,
1381 		    strlen((*pbe)->prefix)) == 0) {
1382 			tbe = *pbe;
1383 			assert(tbe->init != NULL);
1384 			assert(tbe->cleanup != NULL);
1385 			assert(tbe->send != NULL);
1386 			assert(tbe->recv != NULL);
1387 			assert(tbe->get_cap != NULL);
1388 			assert(tbe->set_cap != NULL);
1389 			break;
1390 		}
1391 	}
1392 
1393 	*ret = NULL;
1394 	if (tbe == NULL) {
1395 		free(devname);
1396 		return (EINVAL);
1397 	}
1398 
1399 	nbe = calloc(1, NET_BE_SIZE(tbe));
1400 	*nbe = *tbe;	/* copy the template */
1401 	nbe->fd = -1;
1402 	nbe->sc = param;
1403 	nbe->be_vnet_hdr_len = 0;
1404 	nbe->fe_vnet_hdr_len = 0;
1405 
1406 	/* Initialize the backend. */
1407 	err = nbe->init(nbe, devname, nvl, cb, param);
1408 	if (err) {
1409 		free(devname);
1410 		free(nbe);
1411 		return (err);
1412 	}
1413 
1414 	*ret = nbe;
1415 	free(devname);
1416 
1417 	return (0);
1418 }
1419 
1420 void
1421 netbe_cleanup(struct net_backend *be)
1422 {
1423 
1424 	if (be != NULL) {
1425 		be->cleanup(be);
1426 		free(be);
1427 	}
1428 }
1429 
1430 uint64_t
1431 netbe_get_cap(struct net_backend *be)
1432 {
1433 
1434 	assert(be != NULL);
1435 	return (be->get_cap(be));
1436 }
1437 
1438 int
1439 netbe_set_cap(struct net_backend *be, uint64_t features,
1440 	      unsigned vnet_hdr_len)
1441 {
1442 	int ret;
1443 
1444 	assert(be != NULL);
1445 
1446 	/* There are only three valid lengths, i.e., 0, 10 and 12. */
1447 	if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN
1448 		&& vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t)))
1449 		return (-1);
1450 
1451 	be->fe_vnet_hdr_len = vnet_hdr_len;
1452 
1453 	ret = be->set_cap(be, features, vnet_hdr_len);
1454 	assert(be->be_vnet_hdr_len == 0 ||
1455 	       be->be_vnet_hdr_len == be->fe_vnet_hdr_len);
1456 
1457 	return (ret);
1458 }
1459 
1460 ssize_t
1461 netbe_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
1462 {
1463 
1464 	return (be->send(be, iov, iovcnt));
1465 }
1466 
1467 ssize_t
1468 netbe_peek_recvlen(struct net_backend *be)
1469 {
1470 
1471 	return (be->peek_recvlen(be));
1472 }
1473 
1474 /*
1475  * Try to read a packet from the backend, without blocking.
1476  * If no packets are available, return 0. In case of success, return
1477  * the length of the packet just read. Return -1 in case of errors.
1478  */
1479 ssize_t
1480 netbe_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
1481 {
1482 
1483 	return (be->recv(be, iov, iovcnt));
1484 }
1485 
1486 /*
1487  * Read a packet from the backend and discard it.
1488  * Returns the size of the discarded packet or zero if no packet was available.
1489  * A negative error code is returned in case of read error.
1490  */
1491 ssize_t
1492 netbe_rx_discard(struct net_backend *be)
1493 {
1494 	/*
1495 	 * MP note: the dummybuf is only used to discard frames,
1496 	 * so there is no need for it to be per-vtnet or locked.
1497 	 * We only make it large enough for TSO-sized segment.
1498 	 */
1499 	static uint8_t dummybuf[65536 + 64];
1500 	struct iovec iov;
1501 
1502 #ifdef __FreeBSD__
1503 	iov.iov_base = dummybuf;
1504 #else
1505 	iov.iov_base = (caddr_t)dummybuf;
1506 #endif
1507 	iov.iov_len = sizeof(dummybuf);
1508 
1509 	return netbe_recv(be, &iov, 1);
1510 }
1511 
1512 void
1513 netbe_rx_disable(struct net_backend *be)
1514 {
1515 
1516 	return be->recv_disable(be);
1517 }
1518 
1519 void
1520 netbe_rx_enable(struct net_backend *be)
1521 {
1522 
1523 	return be->recv_enable(be);
1524 }
1525 
1526 size_t
1527 netbe_get_vnet_hdr_len(struct net_backend *be)
1528 {
1529 
1530 	return (be->be_vnet_hdr_len);
1531 }
1532 
1533 #ifndef __FreeBSD__
1534 int
1535 netbe_get_mac(net_backend_t *be, void *buf, size_t *buflen)
1536 {
1537 	if (be->get_mac == NULL)
1538 		return (ENOTSUP);
1539 	return (be->get_mac(be, buf, buflen));
1540 }
1541 #endif
1542