xref: /freebsd/usr.sbin/bhyve/net_backends.c (revision b17b639832e707aab0e9514cf94727498e2d67bd)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2019 Vincenzo Maffione <vmaffione@FreeBSD.org>
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
19  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
20  * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
21  * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
24  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
25  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  * $FreeBSD$
28  */
29 
30 /*
31  * This file implements multiple network backends (tap, netmap, ...),
32  * to be used by network frontends such as virtio-net and e1000.
33  * The API to access the backend (e.g. send/receive packets, negotiate
34  * features) is exported by net_backends.h.
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include <sys/types.h>		/* u_short etc */
41 #ifndef WITHOUT_CAPSICUM
42 #include <sys/capsicum.h>
43 #endif
44 #include <sys/ioctl.h>
45 #include <sys/mman.h>
46 #include <sys/uio.h>
47 
48 #include <net/if.h>
49 #if defined(INET6) || defined(INET)
50 #include <net/if_tap.h>
51 #endif
52 #include <net/netmap.h>
53 #include <net/netmap_virt.h>
54 #define NETMAP_WITH_LIBS
55 #include <net/netmap_user.h>
56 
57 #ifndef WITHOUT_CAPSICUM
58 #include <capsicum_helpers.h>
59 #endif
60 #include <err.h>
61 #include <errno.h>
62 #include <fcntl.h>
63 #include <stdio.h>
64 #include <stdlib.h>
65 #include <stdint.h>
66 #include <string.h>
67 #include <unistd.h>
68 #include <sysexits.h>
69 #include <assert.h>
70 #include <pthread.h>
71 #include <pthread_np.h>
72 #include <poll.h>
73 #include <assert.h>
74 
75 #ifdef NETGRAPH
76 #include <sys/param.h>
77 #include <sys/sysctl.h>
78 #include <netgraph.h>
79 #endif
80 
81 #include "config.h"
82 #include "debug.h"
83 #include "iov.h"
84 #include "mevent.h"
85 #include "net_backends.h"
86 #include "pci_emul.h"
87 
88 #include <sys/linker_set.h>
89 
90 /*
91  * Each network backend registers a set of function pointers that are
92  * used to implement the net backends API.
93  * This might need to be exposed if we implement backends in separate files.
94  */
/*
 * Generic backend descriptor. The statically defined instances in this
 * file act as templates: netbe_init() copies the matching template into
 * a freshly allocated object that also has room for 'priv_size' bytes of
 * backend-private state placed right after this structure.
 */
struct net_backend {
	const char *prefix;	/* prefix matching this backend */

	/*
	 * Routines used to initialize and cleanup the resources needed
	 * by a backend. The cleanup function is used internally,
	 * and should not be called by the frontend.
	 */
	int (*init)(struct net_backend *be, const char *devname,
	    nvlist_t *nvl, net_be_rxeof_t cb, void *param);
	void (*cleanup)(struct net_backend *be);

	/*
	 * Called to serve a guest transmit request. The scatter-gather
	 * vector provided by the caller has 'iovcnt' elements and contains
	 * the packet to send.
	 */
	ssize_t (*send)(struct net_backend *be, const struct iovec *iov,
	    int iovcnt);

	/*
	 * Get the length of the next packet that can be received from
	 * the backend. If no packets are currently available, this
	 * function returns 0.
	 */
	ssize_t (*peek_recvlen)(struct net_backend *be);

	/*
	 * Called to receive a packet from the backend. When the function
	 * returns a positive value 'len', the scatter-gather vector
	 * provided by the caller contains a packet with such length.
	 * The function returns 0 if the backend doesn't have a new packet to
	 * receive.
	 */
	ssize_t (*recv)(struct net_backend *be, const struct iovec *iov,
	    int iovcnt);

	/*
	 * Ask the backend to enable or disable receive operation in the
	 * backend. On return from a disable operation, it is guaranteed
	 * that the receive callback won't be called until receive is
	 * enabled again. Note however that it is up to the caller to make
	 * sure that netbe_recv() is not currently being executed by another
	 * thread.
	 */
	void (*recv_enable)(struct net_backend *be);
	void (*recv_disable)(struct net_backend *be);

	/*
	 * Ask the backend for the virtio-net features it is able to
	 * support. Possible features are TSO, UFO and checksum offloading
	 * in both rx and tx direction and for both IPv4 and IPv6.
	 */
	uint64_t (*get_cap)(struct net_backend *be);

	/*
	 * Tell the backend to enable/disable the specified virtio-net
	 * features (capabilities).
	 */
	int (*set_cap)(struct net_backend *be, uint64_t features,
	    unsigned int vnet_hdr_len);

	/* Set by netbe_init() from its 'param' argument. */
	struct pci_vtnet_softc *sc;
	/*
	 * Backend file descriptor. netbe_init() presets it to -1 and the
	 * backend's init routine stores the real descriptor here.
	 */
	int fd;

	/*
	 * Length of the virtio-net header used by the backend and the
	 * frontend, respectively. A zero value means that the header
	 * is not used.
	 */
	unsigned int be_vnet_hdr_len;
	unsigned int fe_vnet_hdr_len;

	/* Size of backend-specific private data. */
	size_t priv_size;

	/* Backend-specific private data follows. */
};
173 
/*
 * Accessors for the backend-private area, which lives in the same
 * allocation immediately after the generic struct net_backend.
 *
 * NET_BE_PRIV(be) yields a pointer to the first byte past '*be'.
 * NET_BE_SIZE(be) yields the total allocation size (generic part plus
 * 'priv_size' bytes of private data). The argument is fully parenthesized
 * so that pointer expressions such as 'p + 1' expand correctly.
 */
#define	NET_BE_PRIV(be)		((void *)((be) + 1))
#define	NET_BE_SIZE(be)		(sizeof(*(be)) + (be)->priv_size)
176 
/*
 * Linker set holding pointers to the backend templates registered with
 * DATA_SET() in this file; netbe_init() walks it to find a backend.
 */
SET_DECLARE(net_backend_set, struct net_backend);

/*
 * Size of struct virtio_net_rxhdr. netbe_set_cap() accepts this value,
 * this value minus sizeof(uint16_t), or zero as the vnet header length.
 */
#define VNET_HDR_LEN	sizeof(struct virtio_net_rxhdr)

/*
 * Diagnostic output helper. 'params' must be a parenthesized argument
 * list, which is handed as-is to PRINTLN.
 */
#define WPRINTF(params) PRINTLN params
182 
183 /*
184  * The tap backend
185  */
186 
#if defined(INET6) || defined(INET)
/*
 * Protocol families that tap_init() tries, in order, when it needs a
 * datagram socket to issue interface-flag ioctls. Only the families
 * enabled at build time are listed.
 */
static const int pf_list[] = {
#if defined(INET6)
	PF_INET6,
#endif
#if defined(INET)
	PF_INET,
#endif
};
#endif
197 
/*
 * Private state of the tap-style backends (tap, vmnet and, when built
 * with NETGRAPH, netgraph).
 */
struct tap_priv {
	/* Read event registered on the backend file descriptor. */
	struct mevent *mevp;
	/*
	 * A bounce buffer that allows us to implement the peek_recvlen
	 * callback. In the future we may get the same information from
	 * the kevent data.
	 */
	char bbuf[1 << 16];
	/* Length of the packet held in bbuf; 0 means the buffer is empty. */
	ssize_t bbuflen;
};
208 
209 static void
210 tap_cleanup(struct net_backend *be)
211 {
212 	struct tap_priv *priv = NET_BE_PRIV(be);
213 
214 	if (priv->mevp) {
215 		mevent_delete(priv->mevp);
216 	}
217 	if (be->fd != -1) {
218 		close(be->fd);
219 		be->fd = -1;
220 	}
221 }
222 
223 static int
224 tap_init(struct net_backend *be, const char *devname,
225     nvlist_t *nvl __unused, net_be_rxeof_t cb, void *param)
226 {
227 	struct tap_priv *priv = NET_BE_PRIV(be);
228 	char tbuf[80];
229 	int opt = 1;
230 #if defined(INET6) || defined(INET)
231 	struct ifreq ifrq;
232 	int s;
233 #endif
234 #ifndef WITHOUT_CAPSICUM
235 	cap_rights_t rights;
236 #endif
237 
238 	if (cb == NULL) {
239 		WPRINTF(("TAP backend requires non-NULL callback"));
240 		return (-1);
241 	}
242 
243 	strcpy(tbuf, "/dev/");
244 	strlcat(tbuf, devname, sizeof(tbuf));
245 
246 	be->fd = open(tbuf, O_RDWR);
247 	if (be->fd == -1) {
248 		WPRINTF(("open of tap device %s failed", tbuf));
249 		goto error;
250 	}
251 
252 	/*
253 	 * Set non-blocking and register for read
254 	 * notifications with the event loop
255 	 */
256 	if (ioctl(be->fd, FIONBIO, &opt) < 0) {
257 		WPRINTF(("tap device O_NONBLOCK failed"));
258 		goto error;
259 	}
260 
261 #if defined(INET6) || defined(INET)
262 	/*
263 	 * Try to UP the interface rather than relying on
264 	 * net.link.tap.up_on_open.
265 	  */
266 	bzero(&ifrq, sizeof(ifrq));
267 	if (ioctl(be->fd, TAPGIFNAME, &ifrq) < 0) {
268 		WPRINTF(("Could not get interface name"));
269 		goto error;
270 	}
271 
272 	s = -1;
273 	for (size_t i = 0; s == -1 && i < nitems(pf_list); i++)
274 		s = socket(pf_list[i], SOCK_DGRAM, 0);
275 	if (s == -1) {
276 		WPRINTF(("Could open socket"));
277 		goto error;
278 	}
279 
280 	if (ioctl(s, SIOCGIFFLAGS, &ifrq) < 0) {
281 		(void)close(s);
282 		WPRINTF(("Could not get interface flags"));
283 		goto error;
284 	}
285 	ifrq.ifr_flags |= IFF_UP;
286 	if (ioctl(s, SIOCSIFFLAGS, &ifrq) < 0) {
287 		(void)close(s);
288 		WPRINTF(("Could not set interface flags"));
289 		goto error;
290 	}
291 	(void)close(s);
292 #endif
293 
294 #ifndef WITHOUT_CAPSICUM
295 	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
296 	if (caph_rights_limit(be->fd, &rights) == -1)
297 		errx(EX_OSERR, "Unable to apply rights for sandbox");
298 #endif
299 
300 	memset(priv->bbuf, 0, sizeof(priv->bbuf));
301 	priv->bbuflen = 0;
302 
303 	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
304 	if (priv->mevp == NULL) {
305 		WPRINTF(("Could not register event"));
306 		goto error;
307 	}
308 
309 	return (0);
310 
311 error:
312 	tap_cleanup(be);
313 	return (-1);
314 }
315 
316 /*
317  * Called to send a buffer chain out to the tap device
318  */
319 static ssize_t
320 tap_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
321 {
322 	return (writev(be->fd, iov, iovcnt));
323 }
324 
325 static ssize_t
326 tap_peek_recvlen(struct net_backend *be)
327 {
328 	struct tap_priv *priv = NET_BE_PRIV(be);
329 	ssize_t ret;
330 
331 	if (priv->bbuflen > 0) {
332 		/*
333 		 * We already have a packet in the bounce buffer.
334 		 * Just return its length.
335 		 */
336 		return priv->bbuflen;
337 	}
338 
339 	/*
340 	 * Read the next packet (if any) into the bounce buffer, so
341 	 * that we get to know its length and we can return that
342 	 * to the caller.
343 	 */
344 	ret = read(be->fd, priv->bbuf, sizeof(priv->bbuf));
345 	if (ret < 0 && errno == EWOULDBLOCK) {
346 		return (0);
347 	}
348 
349 	if (ret > 0)
350 		priv->bbuflen = ret;
351 
352 	return (ret);
353 }
354 
355 static ssize_t
356 tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
357 {
358 	struct tap_priv *priv = NET_BE_PRIV(be);
359 	ssize_t ret;
360 
361 	if (priv->bbuflen > 0) {
362 		/*
363 		 * A packet is available in the bounce buffer, so
364 		 * we read it from there.
365 		 */
366 		ret = buf_to_iov(priv->bbuf, priv->bbuflen,
367 		    iov, iovcnt, 0);
368 
369 		/* Mark the bounce buffer as empty. */
370 		priv->bbuflen = 0;
371 
372 		return (ret);
373 	}
374 
375 	ret = readv(be->fd, iov, iovcnt);
376 	if (ret < 0 && errno == EWOULDBLOCK) {
377 		return (0);
378 	}
379 
380 	return (ret);
381 }
382 
383 static void
384 tap_recv_enable(struct net_backend *be)
385 {
386 	struct tap_priv *priv = NET_BE_PRIV(be);
387 
388 	mevent_enable(priv->mevp);
389 }
390 
391 static void
392 tap_recv_disable(struct net_backend *be)
393 {
394 	struct tap_priv *priv = NET_BE_PRIV(be);
395 
396 	mevent_disable(priv->mevp);
397 }
398 
/* Report the supported virtio-net features: none for now. */
static uint64_t
tap_get_cap(struct net_backend *be __unused)
{
	const uint64_t no_caps = 0;

	return (no_caps);
}
405 
/*
 * Accept only the "nothing enabled" configuration: succeeds (0) when
 * both the feature mask and the vnet header length are zero, and fails
 * (-1) otherwise.
 */
static int
tap_set_cap(struct net_backend *be __unused, uint64_t features,
    unsigned vnet_hdr_len)
{

	if (features == 0 && vnet_hdr_len == 0)
		return (0);

	return (-1);
}
413 
/*
 * Template for the "tap" backend. netbe_init() copies the template into
 * each backend instance it creates and reserves priv_size bytes of
 * private state (struct tap_priv) behind it.
 */
static struct net_backend tap_backend = {
	.prefix = "tap",
	.priv_size = sizeof(struct tap_priv),
	.init = tap_init,
	.cleanup = tap_cleanup,
	.send = tap_send,
	.peek_recvlen = tap_peek_recvlen,
	.recv = tap_recv,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,
};
427 
/*
 * A clone of the tap backend, with a different prefix ("vmnet"); every
 * callback is shared with tap_backend.
 */
static struct net_backend vmnet_backend = {
	.prefix = "vmnet",
	.priv_size = sizeof(struct tap_priv),
	.init = tap_init,
	.cleanup = tap_cleanup,
	.send = tap_send,
	.peek_recvlen = tap_peek_recvlen,
	.recv = tap_recv,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,
};

/* Add both templates to net_backend_set, which netbe_init() searches. */
DATA_SET(net_backend_set, tap_backend);
DATA_SET(net_backend_set, vmnet_backend);
445 
446 #ifdef NETGRAPH
447 
448 /*
449  * Netgraph backend
450  */
451 
452 #define NG_SBUF_MAX_SIZE (4 * 1024 * 1024)
453 
454 static int
455 ng_init(struct net_backend *be, const char *devname __unused,
456 	 nvlist_t *nvl, net_be_rxeof_t cb, void *param)
457 {
458 	struct tap_priv *p = NET_BE_PRIV(be);
459 	struct ngm_connect ngc;
460 	const char *value, *nodename;
461 	int sbsz;
462 	int ctrl_sock;
463 	int flags;
464 	unsigned long maxsbsz;
465 	size_t msbsz;
466 #ifndef WITHOUT_CAPSICUM
467 	cap_rights_t rights;
468 #endif
469 
470 	if (cb == NULL) {
471 		WPRINTF(("Netgraph backend requires non-NULL callback"));
472 		return (-1);
473 	}
474 
475 	be->fd = -1;
476 
477 	memset(&ngc, 0, sizeof(ngc));
478 
479 	value = get_config_value_node(nvl, "path");
480 	if (value == NULL) {
481 		WPRINTF(("path must be provided"));
482 		return (-1);
483 	}
484 	strncpy(ngc.path, value, NG_PATHSIZ - 1);
485 
486 	value = get_config_value_node(nvl, "hook");
487 	if (value == NULL)
488 		value = "vmlink";
489 	strncpy(ngc.ourhook, value, NG_HOOKSIZ - 1);
490 
491 	value = get_config_value_node(nvl, "peerhook");
492 	if (value == NULL) {
493 		WPRINTF(("peer hook must be provided"));
494 		return (-1);
495 	}
496 	strncpy(ngc.peerhook, value, NG_HOOKSIZ - 1);
497 
498 	nodename = get_config_value_node(nvl, "socket");
499 	if (NgMkSockNode(nodename,
500 		&ctrl_sock, &be->fd) < 0) {
501 		WPRINTF(("can't get Netgraph sockets"));
502 		return (-1);
503 	}
504 
505 	if (NgSendMsg(ctrl_sock, ".",
506 		NGM_GENERIC_COOKIE,
507 		NGM_CONNECT, &ngc, sizeof(ngc)) < 0) {
508 		WPRINTF(("can't connect to node"));
509 		close(ctrl_sock);
510 		goto error;
511 	}
512 
513 	close(ctrl_sock);
514 
515 	flags = fcntl(be->fd, F_GETFL);
516 
517 	if (flags < 0) {
518 		WPRINTF(("can't get socket flags"));
519 		goto error;
520 	}
521 
522 	if (fcntl(be->fd, F_SETFL, flags | O_NONBLOCK) < 0) {
523 		WPRINTF(("can't set O_NONBLOCK flag"));
524 		goto error;
525 	}
526 
527 	/*
528 	 * The default ng_socket(4) buffer's size is too low.
529 	 * Calculate the minimum value between NG_SBUF_MAX_SIZE
530 	 * and kern.ipc.maxsockbuf.
531 	 */
532 	msbsz = sizeof(maxsbsz);
533 	if (sysctlbyname("kern.ipc.maxsockbuf", &maxsbsz, &msbsz,
534 		NULL, 0) < 0) {
535 		WPRINTF(("can't get 'kern.ipc.maxsockbuf' value"));
536 		goto error;
537 	}
538 
539 	/*
540 	 * We can't set the socket buffer size to kern.ipc.maxsockbuf value,
541 	 * as it takes into account the mbuf(9) overhead.
542 	 */
543 	maxsbsz = maxsbsz * MCLBYTES / (MSIZE + MCLBYTES);
544 
545 	sbsz = MIN(NG_SBUF_MAX_SIZE, maxsbsz);
546 
547 	if (setsockopt(be->fd, SOL_SOCKET, SO_SNDBUF, &sbsz,
548 		sizeof(sbsz)) < 0) {
549 		WPRINTF(("can't set TX buffer size"));
550 		goto error;
551 	}
552 
553 	if (setsockopt(be->fd, SOL_SOCKET, SO_RCVBUF, &sbsz,
554 		sizeof(sbsz)) < 0) {
555 		WPRINTF(("can't set RX buffer size"));
556 		goto error;
557 	}
558 
559 #ifndef WITHOUT_CAPSICUM
560 	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
561 	if (caph_rights_limit(be->fd, &rights) == -1)
562 		errx(EX_OSERR, "Unable to apply rights for sandbox");
563 #endif
564 
565 	memset(p->bbuf, 0, sizeof(p->bbuf));
566 	p->bbuflen = 0;
567 
568 	p->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
569 	if (p->mevp == NULL) {
570 		WPRINTF(("Could not register event"));
571 		goto error;
572 	}
573 
574 	return (0);
575 
576 error:
577 	tap_cleanup(be);
578 	return (-1);
579 }
580 
/*
 * Template for the "netgraph" backend. Only the init routine is specific
 * to Netgraph; the private state and all other callbacks are shared with
 * the tap backend.
 */
static struct net_backend ng_backend = {
	.prefix = "netgraph",
	.priv_size = sizeof(struct tap_priv),
	.init = ng_init,
	.cleanup = tap_cleanup,
	.send = tap_send,
	.peek_recvlen = tap_peek_recvlen,
	.recv = tap_recv,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,
};

/* Add the template to net_backend_set, which netbe_init() searches. */
DATA_SET(net_backend_set, ng_backend);
596 
597 #endif /* NETGRAPH */
598 
599 /*
600  * The netmap backend
601  */
602 
/*
 * The virtio-net features supported by netmap. netmap_get_cap() reports
 * this mask when the port accepts a vnet header of VNET_HDR_LEN bytes.
 */
#define NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \
		VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \
		VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 | \
		VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO)
608 
/* Private state of the netmap-style backends (netmap and vale). */
struct netmap_priv {
	char ifname[IFNAMSIZ];		/* copy of the device name */
	struct nm_desc *nmd;		/* descriptor returned by nm_open() */
	uint16_t memid;			/* taken from nmd->req.nr_arg2 */
	struct netmap_ring *rx;		/* RX ring 0 of the port */
	struct netmap_ring *tx;		/* TX ring 0 of the port */
	struct mevent *mevp;		/* read event on the netmap fd */
	net_be_rxeof_t cb;		/* receive callback from netbe_init() */
	void *cb_param;			/* argument passed to the callback */
};
619 
620 static void
621 nmreq_init(struct nmreq *req, char *ifname)
622 {
623 
624 	memset(req, 0, sizeof(*req));
625 	strlcpy(req->nr_name, ifname, sizeof(req->nr_name));
626 	req->nr_version = NETMAP_API;
627 }
628 
629 static int
630 netmap_set_vnet_hdr_len(struct net_backend *be, int vnet_hdr_len)
631 {
632 	int err;
633 	struct nmreq req;
634 	struct netmap_priv *priv = NET_BE_PRIV(be);
635 
636 	nmreq_init(&req, priv->ifname);
637 	req.nr_cmd = NETMAP_BDG_VNET_HDR;
638 	req.nr_arg1 = vnet_hdr_len;
639 	err = ioctl(be->fd, NIOCREGIF, &req);
640 	if (err) {
641 		WPRINTF(("Unable to set vnet header length %d",
642 				vnet_hdr_len));
643 		return (err);
644 	}
645 
646 	be->be_vnet_hdr_len = vnet_hdr_len;
647 
648 	return (0);
649 }
650 
651 static int
652 netmap_has_vnet_hdr_len(struct net_backend *be, unsigned vnet_hdr_len)
653 {
654 	unsigned prev_hdr_len = be->be_vnet_hdr_len;
655 	int ret;
656 
657 	if (vnet_hdr_len == prev_hdr_len) {
658 		return (1);
659 	}
660 
661 	ret = netmap_set_vnet_hdr_len(be, vnet_hdr_len);
662 	if (ret) {
663 		return (0);
664 	}
665 
666 	netmap_set_vnet_hdr_len(be, prev_hdr_len);
667 
668 	return (1);
669 }
670 
671 static uint64_t
672 netmap_get_cap(struct net_backend *be)
673 {
674 
675 	return (netmap_has_vnet_hdr_len(be, VNET_HDR_LEN) ?
676 	    NETMAP_FEATURES : 0);
677 }
678 
679 static int
680 netmap_set_cap(struct net_backend *be, uint64_t features __unused,
681     unsigned vnet_hdr_len)
682 {
683 
684 	return (netmap_set_vnet_hdr_len(be, vnet_hdr_len));
685 }
686 
687 static int
688 netmap_init(struct net_backend *be, const char *devname,
689     nvlist_t *nvl __unused, net_be_rxeof_t cb, void *param)
690 {
691 	struct netmap_priv *priv = NET_BE_PRIV(be);
692 
693 	strlcpy(priv->ifname, devname, sizeof(priv->ifname));
694 	priv->ifname[sizeof(priv->ifname) - 1] = '\0';
695 
696 	priv->nmd = nm_open(priv->ifname, NULL, NETMAP_NO_TX_POLL, NULL);
697 	if (priv->nmd == NULL) {
698 		WPRINTF(("Unable to nm_open(): interface '%s', errno (%s)",
699 			devname, strerror(errno)));
700 		return (-1);
701 	}
702 
703 	priv->memid = priv->nmd->req.nr_arg2;
704 	priv->tx = NETMAP_TXRING(priv->nmd->nifp, 0);
705 	priv->rx = NETMAP_RXRING(priv->nmd->nifp, 0);
706 	priv->cb = cb;
707 	priv->cb_param = param;
708 	be->fd = priv->nmd->fd;
709 
710 	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
711 	if (priv->mevp == NULL) {
712 		WPRINTF(("Could not register event"));
713 		return (-1);
714 	}
715 
716 	return (0);
717 }
718 
719 static void
720 netmap_cleanup(struct net_backend *be)
721 {
722 	struct netmap_priv *priv = NET_BE_PRIV(be);
723 
724 	if (priv->mevp) {
725 		mevent_delete(priv->mevp);
726 	}
727 	if (priv->nmd) {
728 		nm_close(priv->nmd);
729 	}
730 	be->fd = -1;
731 }
732 
/*
 * Copy the packet described by the scatter-gather vector into the netmap
 * TX ring and ask netmap to transmit.
 *
 * The packet is spread over as many consecutive slots as needed; every
 * slot but the last is flagged NS_MOREFRAG. If the ring has no free slot
 * (either up front or part-way through the copy) the packet is dropped:
 * ring->head and ring->cur are left untouched, so partially filled slots
 * are not handed over to netmap. A TX sync is issued in every case.
 *
 * Returns the number of bytes in the iovec elements visited so far,
 * which is the full packet length on success and 0 when the ring was
 * already full on entry.
 */
static ssize_t
netmap_send(struct net_backend *be, const struct iovec *iov,
	    int iovcnt)
{
	struct netmap_priv *priv = NET_BE_PRIV(be);
	struct netmap_ring *ring;
	ssize_t totlen = 0;
	int nm_buf_size;	/* space left in the current slot buffer */
	int nm_buf_len;		/* bytes already stored in the current slot */
	uint32_t head;
	uint8_t *nm_buf;
	int j;

	ring = priv->tx;
	head = ring->head;
	if (head == ring->tail) {
		/* No free TX slot at all. */
		WPRINTF(("No space, drop %zu bytes", count_iov(iov, iovcnt)));
		goto txsync;
	}
	nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
	nm_buf_size = ring->nr_buf_size;
	nm_buf_len = 0;

	for (j = 0; j < iovcnt; j++) {
		uint8_t *iov_frag_buf = iov[j].iov_base;
		int iov_frag_size = iov[j].iov_len;

		totlen += iov_frag_size;

		/*
		 * Split each iovec fragment over more netmap slots, if
		 * necessary.
		 */
		for (;;) {
			int copylen;

			/* Copy as much as fits in the current slot. */
			copylen = iov_frag_size < nm_buf_size ? iov_frag_size : nm_buf_size;
			memcpy(nm_buf, iov_frag_buf, copylen);

			iov_frag_buf += copylen;
			iov_frag_size -= copylen;
			nm_buf += copylen;
			nm_buf_size -= copylen;
			nm_buf_len += copylen;

			if (iov_frag_size == 0) {
				/* Fragment consumed; move to the next one. */
				break;
			}

			/*
			 * The slot is full but the fragment is not done:
			 * close this slot and continue in the next one.
			 */
			ring->slot[head].len = nm_buf_len;
			ring->slot[head].flags = NS_MOREFRAG;
			head = nm_ring_next(ring, head);
			if (head == ring->tail) {
				/*
				 * We ran out of netmap slots while
				 * splitting the iovec fragments.
				 */
				WPRINTF(("No space, drop %zu bytes",
				   count_iov(iov, iovcnt)));
				goto txsync;
			}
			nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
			nm_buf_size = ring->nr_buf_size;
			nm_buf_len = 0;
		}
	}

	/* Complete the last slot, which must not have NS_MOREFRAG set. */
	ring->slot[head].len = nm_buf_len;
	ring->slot[head].flags = 0;
	head = nm_ring_next(ring, head);

	/* Now update ring->head and ring->cur. */
	ring->head = ring->cur = head;
txsync:
	/* The result of the sync request is not checked. */
	ioctl(be->fd, NIOCTXSYNC, NULL);

	return (totlen);
}
812 
813 static ssize_t
814 netmap_peek_recvlen(struct net_backend *be)
815 {
816 	struct netmap_priv *priv = NET_BE_PRIV(be);
817 	struct netmap_ring *ring = priv->rx;
818 	uint32_t head = ring->head;
819 	ssize_t totlen = 0;
820 
821 	while (head != ring->tail) {
822 		struct netmap_slot *slot = ring->slot + head;
823 
824 		totlen += slot->len;
825 		if ((slot->flags & NS_MOREFRAG) == 0)
826 			break;
827 		head = nm_ring_next(ring, head);
828 	}
829 
830 	return (totlen);
831 }
832 
/*
 * Copy the next packet from the netmap RX ring into the caller's
 * scatter-gather vector ('iovcnt' must be non-zero).
 *
 * A packet may span several slots chained with NS_MOREFRAG. The slots
 * are handed back to netmap (ring->head/ring->cur advanced) only after
 * the whole packet has been copied.
 *
 * Returns the packet length on success, 0 if ring->tail is reached
 * before a complete packet was seen, and -ENOSPC if the vector is too
 * small for the packet. In the last two cases the ring is not advanced.
 */
static ssize_t
netmap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
{
	struct netmap_priv *priv = NET_BE_PRIV(be);
	struct netmap_slot *slot = NULL;
	struct netmap_ring *ring;
	uint8_t *iov_frag_buf;	/* write position in the current iovec */
	int iov_frag_size;	/* space left in the current iovec */
	ssize_t totlen = 0;
	uint32_t head;

	assert(iovcnt);

	ring = priv->rx;
	head = ring->head;
	iov_frag_buf = iov->iov_base;
	iov_frag_size = iov->iov_len;

	do {
		uint8_t *nm_buf;
		int nm_buf_len;	/* bytes of this slot still to be copied */

		if (head == ring->tail) {
			/* No (further) slot available. */
			return (0);
		}

		slot = ring->slot + head;
		nm_buf = NETMAP_BUF(ring, slot->buf_idx);
		nm_buf_len = slot->len;

		/* Spread this slot over as many iovec elements as needed. */
		for (;;) {
			int copylen = nm_buf_len < iov_frag_size ?
			    nm_buf_len : iov_frag_size;

			memcpy(iov_frag_buf, nm_buf, copylen);
			nm_buf += copylen;
			nm_buf_len -= copylen;
			iov_frag_buf += copylen;
			iov_frag_size -= copylen;
			totlen += copylen;

			if (nm_buf_len == 0) {
				/* Slot fully copied. */
				break;
			}

			/* Current iovec element is full; take the next one. */
			iov++;
			iovcnt--;
			if (iovcnt == 0) {
				/* No space to receive. */
				WPRINTF(("Short iov, drop %zd bytes",
				    totlen));
				return (-ENOSPC);
			}
			iov_frag_buf = iov->iov_base;
			iov_frag_size = iov->iov_len;
		}

		head = nm_ring_next(ring, head);

	} while (slot->flags & NS_MOREFRAG);

	/* Release slots to netmap. */
	ring->head = ring->cur = head;

	return (totlen);
}
899 
900 static void
901 netmap_recv_enable(struct net_backend *be)
902 {
903 	struct netmap_priv *priv = NET_BE_PRIV(be);
904 
905 	mevent_enable(priv->mevp);
906 }
907 
908 static void
909 netmap_recv_disable(struct net_backend *be)
910 {
911 	struct netmap_priv *priv = NET_BE_PRIV(be);
912 
913 	mevent_disable(priv->mevp);
914 }
915 
/*
 * Template for the "netmap" backend. netbe_init() copies the template
 * into each backend instance it creates and reserves priv_size bytes of
 * private state (struct netmap_priv) behind it.
 */
static struct net_backend netmap_backend = {
	.prefix = "netmap",
	.priv_size = sizeof(struct netmap_priv),
	.init = netmap_init,
	.cleanup = netmap_cleanup,
	.send = netmap_send,
	.peek_recvlen = netmap_peek_recvlen,
	.recv = netmap_recv,
	.recv_enable = netmap_recv_enable,
	.recv_disable = netmap_recv_disable,
	.get_cap = netmap_get_cap,
	.set_cap = netmap_set_cap,
};
929 
/*
 * A clone of the netmap backend, with a different prefix ("vale"); every
 * callback is shared with netmap_backend.
 */
static struct net_backend vale_backend = {
	.prefix = "vale",
	.priv_size = sizeof(struct netmap_priv),
	.init = netmap_init,
	.cleanup = netmap_cleanup,
	.send = netmap_send,
	.peek_recvlen = netmap_peek_recvlen,
	.recv = netmap_recv,
	.recv_enable = netmap_recv_enable,
	.recv_disable = netmap_recv_disable,
	.get_cap = netmap_get_cap,
	.set_cap = netmap_set_cap,
};

/* Add both templates to net_backend_set, which netbe_init() searches. */
DATA_SET(net_backend_set, netmap_backend);
DATA_SET(net_backend_set, vale_backend);
947 
948 int
949 netbe_legacy_config(nvlist_t *nvl, const char *opts)
950 {
951 	char *backend, *cp;
952 
953 	if (opts == NULL)
954 		return (0);
955 
956 	cp = strchr(opts, ',');
957 	if (cp == NULL) {
958 		set_config_value_node(nvl, "backend", opts);
959 		return (0);
960 	}
961 	backend = strndup(opts, cp - opts);
962 	set_config_value_node(nvl, "backend", backend);
963 	free(backend);
964 	return (pci_parse_legacy_config(nvl, cp + 1));
965 }
966 
967 /*
968  * Initialize a backend and attach to the frontend.
969  * This is called during frontend initialization.
970  *  @ret is a pointer to the backend to be initialized
971  *  @devname is the backend-name as supplied on the command line,
972  * 	e.g. -s 2:0,frontend-name,backend-name[,other-args]
973  *  @cb is the receive callback supplied by the frontend,
974  *	and it is invoked in the event loop when a receive
975  *	event is generated in the hypervisor,
976  *  @param is a pointer to the frontend, and normally used as
977  *	the argument for the callback.
978  */
979 int
980 netbe_init(struct net_backend **ret, nvlist_t *nvl, net_be_rxeof_t cb,
981     void *param)
982 {
983 	struct net_backend **pbe, *nbe, *tbe = NULL;
984 	const char *value, *type;
985 	char *devname;
986 	int err;
987 
988 	value = get_config_value_node(nvl, "backend");
989 	if (value == NULL) {
990 		return (-1);
991 	}
992 	devname = strdup(value);
993 
994 	/*
995 	 * Use the type given by configuration if exists; otherwise
996 	 * use the prefix of the backend as the type.
997 	 */
998 	type = get_config_value_node(nvl, "type");
999 	if (type == NULL)
1000 		type = devname;
1001 
1002 	/*
1003 	 * Find the network backend that matches the user-provided
1004 	 * device name. net_backend_set is built using a linker set.
1005 	 */
1006 	SET_FOREACH(pbe, net_backend_set) {
1007 		if (strncmp(type, (*pbe)->prefix,
1008 		    strlen((*pbe)->prefix)) == 0) {
1009 			tbe = *pbe;
1010 			assert(tbe->init != NULL);
1011 			assert(tbe->cleanup != NULL);
1012 			assert(tbe->send != NULL);
1013 			assert(tbe->recv != NULL);
1014 			assert(tbe->get_cap != NULL);
1015 			assert(tbe->set_cap != NULL);
1016 			break;
1017 		}
1018 	}
1019 
1020 	*ret = NULL;
1021 	if (tbe == NULL) {
1022 		free(devname);
1023 		return (EINVAL);
1024 	}
1025 
1026 	nbe = calloc(1, NET_BE_SIZE(tbe));
1027 	*nbe = *tbe;	/* copy the template */
1028 	nbe->fd = -1;
1029 	nbe->sc = param;
1030 	nbe->be_vnet_hdr_len = 0;
1031 	nbe->fe_vnet_hdr_len = 0;
1032 
1033 	/* Initialize the backend. */
1034 	err = nbe->init(nbe, devname, nvl, cb, param);
1035 	if (err) {
1036 		free(devname);
1037 		free(nbe);
1038 		return (err);
1039 	}
1040 
1041 	*ret = nbe;
1042 	free(devname);
1043 
1044 	return (0);
1045 }
1046 
1047 void
1048 netbe_cleanup(struct net_backend *be)
1049 {
1050 
1051 	if (be != NULL) {
1052 		be->cleanup(be);
1053 		free(be);
1054 	}
1055 }
1056 
1057 uint64_t
1058 netbe_get_cap(struct net_backend *be)
1059 {
1060 
1061 	assert(be != NULL);
1062 	return (be->get_cap(be));
1063 }
1064 
1065 int
1066 netbe_set_cap(struct net_backend *be, uint64_t features,
1067 	      unsigned vnet_hdr_len)
1068 {
1069 	int ret;
1070 
1071 	assert(be != NULL);
1072 
1073 	/* There are only three valid lengths, i.e., 0, 10 and 12. */
1074 	if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN
1075 		&& vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t)))
1076 		return (-1);
1077 
1078 	be->fe_vnet_hdr_len = vnet_hdr_len;
1079 
1080 	ret = be->set_cap(be, features, vnet_hdr_len);
1081 	assert(be->be_vnet_hdr_len == 0 ||
1082 	       be->be_vnet_hdr_len == be->fe_vnet_hdr_len);
1083 
1084 	return (ret);
1085 }
1086 
1087 ssize_t
1088 netbe_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
1089 {
1090 
1091 	return (be->send(be, iov, iovcnt));
1092 }
1093 
1094 ssize_t
1095 netbe_peek_recvlen(struct net_backend *be)
1096 {
1097 
1098 	return (be->peek_recvlen(be));
1099 }
1100 
1101 /*
1102  * Try to read a packet from the backend, without blocking.
1103  * If no packets are available, return 0. In case of success, return
1104  * the length of the packet just read. Return -1 in case of errors.
1105  */
1106 ssize_t
1107 netbe_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
1108 {
1109 
1110 	return (be->recv(be, iov, iovcnt));
1111 }
1112 
/*
 * Receive one packet from the backend into a scratch buffer and throw
 * it away. The netbe_recv() result is returned: the size of the dropped
 * packet, zero if there was nothing to read, or a negative value on a
 * read error.
 */
ssize_t
netbe_rx_discard(struct net_backend *be)
{
	/*
	 * MP note: the scratch buffer is only ever written to, never
	 * read, so it does not need to be per-vtnet or locked.
	 * It is just large enough for a TSO-sized segment.
	 */
	static uint8_t dummybuf[65536 + 64];
	struct iovec iov = {
		.iov_base = dummybuf,
		.iov_len = sizeof(dummybuf),
	};

	return (netbe_recv(be, &iov, 1));
}
1134 
1135 void
1136 netbe_rx_disable(struct net_backend *be)
1137 {
1138 
1139 	return be->recv_disable(be);
1140 }
1141 
1142 void
1143 netbe_rx_enable(struct net_backend *be)
1144 {
1145 
1146 	return be->recv_enable(be);
1147 }
1148 
1149 size_t
1150 netbe_get_vnet_hdr_len(struct net_backend *be)
1151 {
1152 
1153 	return (be->be_vnet_hdr_len);
1154 }
1155