xref: /freebsd/usr.sbin/bhyve/net_backends.c (revision d7d962ead0b6e5e8a39202d0590022082bf5bfb6)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2019 Vincenzo Maffione <vmaffione@FreeBSD.org>
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
19  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
20  * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
21  * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
24  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
25  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  * $FreeBSD$
28  */
29 
30 /*
31  * This file implements multiple network backends (tap, netmap, ...),
32  * to be used by network frontends such as virtio-net and e1000.
33  * The API to access the backend (e.g. send/receive packets, negotiate
34  * features) is exported by net_backends.h.
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include <sys/types.h>		/* u_short etc */
41 #ifndef WITHOUT_CAPSICUM
42 #include <sys/capsicum.h>
43 #endif
44 #include <sys/ioctl.h>
45 #include <sys/mman.h>
46 #include <sys/uio.h>
47 
48 #include <net/if.h>
49 #if defined(INET6) || defined(INET)
50 #include <net/if_tap.h>
51 #endif
52 #include <net/netmap.h>
53 #include <net/netmap_virt.h>
54 #define NETMAP_WITH_LIBS
55 #include <net/netmap_user.h>
56 
57 #ifndef WITHOUT_CAPSICUM
58 #include <capsicum_helpers.h>
59 #endif
60 #include <err.h>
61 #include <errno.h>
62 #include <fcntl.h>
63 #include <stdio.h>
64 #include <stdlib.h>
65 #include <stdint.h>
66 #include <string.h>
67 #include <unistd.h>
68 #include <sysexits.h>
69 #include <assert.h>
70 #include <pthread.h>
71 #include <pthread_np.h>
72 #include <poll.h>
73 #include <assert.h>
74 
75 #ifdef NETGRAPH
76 #include <sys/param.h>
77 #include <sys/sysctl.h>
78 #include <netgraph.h>
79 #endif
80 
81 #include "config.h"
82 #include "debug.h"
83 #include "iov.h"
84 #include "mevent.h"
85 #include "net_backends.h"
86 #include "pci_emul.h"
87 
88 #include <sys/linker_set.h>
89 
90 /*
91  * Each network backend registers a set of function pointers that are
92  * used to implement the net backends API.
93  * This might need to be exposed if we implement backends in separate files.
94  */
95 struct net_backend {
96 	const char *prefix;	/* prefix matching this backend */
97 
98 	/*
99 	 * Routines used to initialize and cleanup the resources needed
100 	 * by a backend. The cleanup function is used internally,
101 	 * and should not be called by the frontend.
102 	 */
103 	int (*init)(struct net_backend *be, const char *devname,
104 	    nvlist_t *nvl, net_be_rxeof_t cb, void *param);
105 	void (*cleanup)(struct net_backend *be);
106 
107 	/*
108 	 * Called to serve a guest transmit request. The scatter-gather
109 	 * vector provided by the caller has 'iovcnt' elements and contains
110 	 * the packet to send.
111 	 */
112 	ssize_t (*send)(struct net_backend *be, const struct iovec *iov,
113 	    int iovcnt);
114 
115 	/*
116 	 * Get the length of the next packet that can be received from
117 	 * the backend. If no packets are currently available, this
118 	 * function returns 0.
119 	 */
120 	ssize_t (*peek_recvlen)(struct net_backend *be);
121 
122 	/*
123 	 * Called to receive a packet from the backend. When the function
124 	 * returns a positive value 'len', the scatter-gather vector
125 	 * provided by the caller contains a packet with such length.
126 	 * The function returns 0 if the backend doesn't have a new packet to
127 	 * receive.
128 	 */
129 	ssize_t (*recv)(struct net_backend *be, const struct iovec *iov,
130 	    int iovcnt);
131 
132 	/*
133 	 * Ask the backend to enable or disable receive operation in the
134 	 * backend. On return from a disable operation, it is guaranteed
135 	 * that the receive callback won't be called until receive is
136 	 * enabled again. Note however that it is up to the caller to make
137 	 * sure that netbe_recv() is not currently being executed by another
138 	 * thread.
139 	 */
140 	void (*recv_enable)(struct net_backend *be);
141 	void (*recv_disable)(struct net_backend *be);
142 
143 	/*
144 	 * Ask the backend for the virtio-net features it is able to
145 	 * support. Possible features are TSO, UFO and checksum offloading
146 	 * in both rx and tx direction and for both IPv4 and IPv6.
147 	 */
148 	uint64_t (*get_cap)(struct net_backend *be);
149 
150 	/*
151 	 * Tell the backend to enable/disable the specified virtio-net
152 	 * features (capabilities).
153 	 */
154 	int (*set_cap)(struct net_backend *be, uint64_t features,
155 	    unsigned int vnet_hdr_len);
156 
157 	struct pci_vtnet_softc *sc;
158 	int fd;
159 
160 	/*
161 	 * Length of the virtio-net header used by the backend and the
162 	 * frontend, respectively. A zero value means that the header
163 	 * is not used.
164 	 */
165 	unsigned int be_vnet_hdr_len;
166 	unsigned int fe_vnet_hdr_len;
167 
168 	/* Size of backend-specific private data. */
169 	size_t priv_size;
170 
171 	/* Room for backend-specific data. */
172 	char opaque[0];
173 };
174 
175 SET_DECLARE(net_backend_set, struct net_backend);
176 
177 #define VNET_HDR_LEN	sizeof(struct virtio_net_rxhdr)
178 
179 #define WPRINTF(params) PRINTLN params
180 
181 /*
182  * The tap backend
183  */
184 
#if defined(INET) || defined(INET6)
/*
 * Protocol families tried, in array order, when tap_init() opens the
 * helper socket used to bring the interface up. IPv6 is listed before
 * IPv4 when both are compiled in.
 */
const int pf_list[] = {
#ifdef INET6
	PF_INET6,
#endif
#ifdef INET
	PF_INET,
#endif
};
#endif
195 
/* Private state shared by the tap, vmnet and netgraph backends. */
struct tap_priv {
	/* Read event registered with the mevent loop for the backend fd. */
	struct mevent *mevp;

	/*
	 * Bounce buffer backing the peek_recvlen callback: a packet is read
	 * here first so that its length is known, then handed out by the
	 * next recv. In the future the same information may come from the
	 * kevent data instead.
	 */
	char bbuf[65536];

	/* Number of valid bytes in bbuf; 0 means the buffer is empty. */
	ssize_t bbuflen;
};
206 
207 static void
208 tap_cleanup(struct net_backend *be)
209 {
210 	struct tap_priv *priv = (struct tap_priv *)be->opaque;
211 
212 	if (priv->mevp) {
213 		mevent_delete(priv->mevp);
214 	}
215 	if (be->fd != -1) {
216 		close(be->fd);
217 		be->fd = -1;
218 	}
219 }
220 
221 static int
222 tap_init(struct net_backend *be, const char *devname,
223 	 nvlist_t *nvl, net_be_rxeof_t cb, void *param)
224 {
225 	struct tap_priv *priv = (struct tap_priv *)be->opaque;
226 	char tbuf[80];
227 	int opt = 1;
228 #if defined(INET6) || defined(INET)
229 	struct ifreq ifrq;
230 	int i, s;
231 #endif
232 #ifndef WITHOUT_CAPSICUM
233 	cap_rights_t rights;
234 #endif
235 
236 	if (cb == NULL) {
237 		WPRINTF(("TAP backend requires non-NULL callback"));
238 		return (-1);
239 	}
240 
241 	strcpy(tbuf, "/dev/");
242 	strlcat(tbuf, devname, sizeof(tbuf));
243 
244 	be->fd = open(tbuf, O_RDWR);
245 	if (be->fd == -1) {
246 		WPRINTF(("open of tap device %s failed", tbuf));
247 		goto error;
248 	}
249 
250 	/*
251 	 * Set non-blocking and register for read
252 	 * notifications with the event loop
253 	 */
254 	if (ioctl(be->fd, FIONBIO, &opt) < 0) {
255 		WPRINTF(("tap device O_NONBLOCK failed"));
256 		goto error;
257 	}
258 
259 #if defined(INET6) || defined(INET)
260 	/*
261 	 * Try to UP the interface rather than relying on
262 	 * net.link.tap.up_on_open.
263 	  */
264 	bzero(&ifrq, sizeof(ifrq));
265 	if (ioctl(be->fd, TAPGIFNAME, &ifrq) < 0) {
266 		WPRINTF(("Could not get interface name"));
267 		goto error;
268 	}
269 
270 	s = -1;
271 	for (i = 0; s == -1 && i < nitems(pf_list); i++)
272 		s = socket(pf_list[i], SOCK_DGRAM, 0);
273 	if (s == -1) {
274 		WPRINTF(("Could open socket"));
275 		goto error;
276 	}
277 
278 	if (ioctl(s, SIOCGIFFLAGS, &ifrq) < 0) {
279 		(void)close(s);
280 		WPRINTF(("Could not get interface flags"));
281 		goto error;
282 	}
283 	ifrq.ifr_flags |= IFF_UP;
284 	if (ioctl(s, SIOCSIFFLAGS, &ifrq) < 0) {
285 		(void)close(s);
286 		WPRINTF(("Could not set interface flags"));
287 		goto error;
288 	}
289 	(void)close(s);
290 #endif
291 
292 #ifndef WITHOUT_CAPSICUM
293 	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
294 	if (caph_rights_limit(be->fd, &rights) == -1)
295 		errx(EX_OSERR, "Unable to apply rights for sandbox");
296 #endif
297 
298 	memset(priv->bbuf, 0, sizeof(priv->bbuf));
299 	priv->bbuflen = 0;
300 
301 	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
302 	if (priv->mevp == NULL) {
303 		WPRINTF(("Could not register event"));
304 		goto error;
305 	}
306 
307 	return (0);
308 
309 error:
310 	tap_cleanup(be);
311 	return (-1);
312 }
313 
314 /*
315  * Called to send a buffer chain out to the tap device
316  */
317 static ssize_t
318 tap_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
319 {
320 	return (writev(be->fd, iov, iovcnt));
321 }
322 
323 static ssize_t
324 tap_peek_recvlen(struct net_backend *be)
325 {
326 	struct tap_priv *priv = (struct tap_priv *)be->opaque;
327 	ssize_t ret;
328 
329 	if (priv->bbuflen > 0) {
330 		/*
331 		 * We already have a packet in the bounce buffer.
332 		 * Just return its length.
333 		 */
334 		return priv->bbuflen;
335 	}
336 
337 	/*
338 	 * Read the next packet (if any) into the bounce buffer, so
339 	 * that we get to know its length and we can return that
340 	 * to the caller.
341 	 */
342 	ret = read(be->fd, priv->bbuf, sizeof(priv->bbuf));
343 	if (ret < 0 && errno == EWOULDBLOCK) {
344 		return (0);
345 	}
346 
347 	if (ret > 0)
348 		priv->bbuflen = ret;
349 
350 	return (ret);
351 }
352 
353 static ssize_t
354 tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
355 {
356 	struct tap_priv *priv = (struct tap_priv *)be->opaque;
357 	ssize_t ret;
358 
359 	if (priv->bbuflen > 0) {
360 		/*
361 		 * A packet is available in the bounce buffer, so
362 		 * we read it from there.
363 		 */
364 		ret = buf_to_iov(priv->bbuf, priv->bbuflen,
365 		    iov, iovcnt, 0);
366 
367 		/* Mark the bounce buffer as empty. */
368 		priv->bbuflen = 0;
369 
370 		return (ret);
371 	}
372 
373 	ret = readv(be->fd, iov, iovcnt);
374 	if (ret < 0 && errno == EWOULDBLOCK) {
375 		return (0);
376 	}
377 
378 	return (ret);
379 }
380 
381 static void
382 tap_recv_enable(struct net_backend *be)
383 {
384 	struct tap_priv *priv = (struct tap_priv *)be->opaque;
385 
386 	mevent_enable(priv->mevp);
387 }
388 
389 static void
390 tap_recv_disable(struct net_backend *be)
391 {
392 	struct tap_priv *priv = (struct tap_priv *)be->opaque;
393 
394 	mevent_disable(priv->mevp);
395 }
396 
/* The tap-style backends advertise no virtio-net features for now. */
static uint64_t
tap_get_cap(struct net_backend *be)
{
	const uint64_t no_features = 0;

	return (no_features);
}
403 
/*
 * Accept only the "no features, no virtio-net header" configuration.
 * Returns 0 in that case and -1 for any other request.
 */
static int
tap_set_cap(struct net_backend *be, uint64_t features,
		unsigned vnet_hdr_len)
{

	if (features != 0 || vnet_hdr_len != 0)
		return (-1);

	return (0);
}
411 
412 static struct net_backend tap_backend = {
413 	.prefix = "tap",
414 	.priv_size = sizeof(struct tap_priv),
415 	.init = tap_init,
416 	.cleanup = tap_cleanup,
417 	.send = tap_send,
418 	.peek_recvlen = tap_peek_recvlen,
419 	.recv = tap_recv,
420 	.recv_enable = tap_recv_enable,
421 	.recv_disable = tap_recv_disable,
422 	.get_cap = tap_get_cap,
423 	.set_cap = tap_set_cap,
424 };
425 
426 /* A clone of the tap backend, with a different prefix. */
427 static struct net_backend vmnet_backend = {
428 	.prefix = "vmnet",
429 	.priv_size = sizeof(struct tap_priv),
430 	.init = tap_init,
431 	.cleanup = tap_cleanup,
432 	.send = tap_send,
433 	.peek_recvlen = tap_peek_recvlen,
434 	.recv = tap_recv,
435 	.recv_enable = tap_recv_enable,
436 	.recv_disable = tap_recv_disable,
437 	.get_cap = tap_get_cap,
438 	.set_cap = tap_set_cap,
439 };
440 
441 DATA_SET(net_backend_set, tap_backend);
442 DATA_SET(net_backend_set, vmnet_backend);
443 
444 #ifdef NETGRAPH
445 
446 /*
447  * Netgraph backend
448  */
449 
450 #define NG_SBUF_MAX_SIZE (4 * 1024 * 1024)
451 
452 static int
453 ng_init(struct net_backend *be, const char *devname,
454 	 nvlist_t *nvl, net_be_rxeof_t cb, void *param)
455 {
456 	struct tap_priv *p = (struct tap_priv *)be->opaque;
457 	struct ngm_connect ngc;
458 	const char *value, *nodename;
459 	int sbsz;
460 	int ctrl_sock;
461 	int flags;
462 	unsigned long maxsbsz;
463 	size_t msbsz;
464 #ifndef WITHOUT_CAPSICUM
465 	cap_rights_t rights;
466 #endif
467 
468 	if (cb == NULL) {
469 		WPRINTF(("Netgraph backend requires non-NULL callback"));
470 		return (-1);
471 	}
472 
473 	be->fd = -1;
474 
475 	memset(&ngc, 0, sizeof(ngc));
476 
477 	value = get_config_value_node(nvl, "path");
478 	if (value == NULL) {
479 		WPRINTF(("path must be provided"));
480 		return (-1);
481 	}
482 	strncpy(ngc.path, value, NG_PATHSIZ - 1);
483 
484 	value = get_config_value_node(nvl, "hook");
485 	if (value == NULL)
486 		value = "vmlink";
487 	strncpy(ngc.ourhook, value, NG_HOOKSIZ - 1);
488 
489 	value = get_config_value_node(nvl, "peerhook");
490 	if (value == NULL) {
491 		WPRINTF(("peer hook must be provided"));
492 		return (-1);
493 	}
494 	strncpy(ngc.peerhook, value, NG_HOOKSIZ - 1);
495 
496 	nodename = get_config_value_node(nvl, "socket");
497 	if (NgMkSockNode(nodename,
498 		&ctrl_sock, &be->fd) < 0) {
499 		WPRINTF(("can't get Netgraph sockets"));
500 		return (-1);
501 	}
502 
503 	if (NgSendMsg(ctrl_sock, ".",
504 		NGM_GENERIC_COOKIE,
505 		NGM_CONNECT, &ngc, sizeof(ngc)) < 0) {
506 		WPRINTF(("can't connect to node"));
507 		close(ctrl_sock);
508 		goto error;
509 	}
510 
511 	close(ctrl_sock);
512 
513 	flags = fcntl(be->fd, F_GETFL);
514 
515 	if (flags < 0) {
516 		WPRINTF(("can't get socket flags"));
517 		goto error;
518 	}
519 
520 	if (fcntl(be->fd, F_SETFL, flags | O_NONBLOCK) < 0) {
521 		WPRINTF(("can't set O_NONBLOCK flag"));
522 		goto error;
523 	}
524 
525 	/*
526 	 * The default ng_socket(4) buffer's size is too low.
527 	 * Calculate the minimum value between NG_SBUF_MAX_SIZE
528 	 * and kern.ipc.maxsockbuf.
529 	 */
530 	msbsz = sizeof(maxsbsz);
531 	if (sysctlbyname("kern.ipc.maxsockbuf", &maxsbsz, &msbsz,
532 		NULL, 0) < 0) {
533 		WPRINTF(("can't get 'kern.ipc.maxsockbuf' value"));
534 		goto error;
535 	}
536 
537 	/*
538 	 * We can't set the socket buffer size to kern.ipc.maxsockbuf value,
539 	 * as it takes into account the mbuf(9) overhead.
540 	 */
541 	maxsbsz = maxsbsz * MCLBYTES / (MSIZE + MCLBYTES);
542 
543 	sbsz = MIN(NG_SBUF_MAX_SIZE, maxsbsz);
544 
545 	if (setsockopt(be->fd, SOL_SOCKET, SO_SNDBUF, &sbsz,
546 		sizeof(sbsz)) < 0) {
547 		WPRINTF(("can't set TX buffer size"));
548 		goto error;
549 	}
550 
551 	if (setsockopt(be->fd, SOL_SOCKET, SO_RCVBUF, &sbsz,
552 		sizeof(sbsz)) < 0) {
553 		WPRINTF(("can't set RX buffer size"));
554 		goto error;
555 	}
556 
557 #ifndef WITHOUT_CAPSICUM
558 	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
559 	if (caph_rights_limit(be->fd, &rights) == -1)
560 		errx(EX_OSERR, "Unable to apply rights for sandbox");
561 #endif
562 
563 	memset(p->bbuf, 0, sizeof(p->bbuf));
564 	p->bbuflen = 0;
565 
566 	p->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
567 	if (p->mevp == NULL) {
568 		WPRINTF(("Could not register event"));
569 		goto error;
570 	}
571 
572 	return (0);
573 
574 error:
575 	tap_cleanup(be);
576 	return (-1);
577 }
578 
579 static struct net_backend ng_backend = {
580 	.prefix = "netgraph",
581 	.priv_size = sizeof(struct tap_priv),
582 	.init = ng_init,
583 	.cleanup = tap_cleanup,
584 	.send = tap_send,
585 	.peek_recvlen = tap_peek_recvlen,
586 	.recv = tap_recv,
587 	.recv_enable = tap_recv_enable,
588 	.recv_disable = tap_recv_disable,
589 	.get_cap = tap_get_cap,
590 	.set_cap = tap_set_cap,
591 };
592 
593 DATA_SET(net_backend_set, ng_backend);
594 
595 #endif /* NETGRAPH */
596 
597 /*
598  * The netmap backend
599  */
600 
/*
 * The virtio-net features supported by netmap: checksum, TSO (IPv4 and
 * IPv6) and UFO offloads, in both the host and the guest flavours.
 */
#define NETMAP_FEATURES \
		(VIRTIO_NET_F_CSUM | VIRTIO_NET_F_GUEST_CSUM | \
		VIRTIO_NET_F_HOST_TSO4 | VIRTIO_NET_F_GUEST_TSO4 | \
		VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_GUEST_TSO6 | \
		VIRTIO_NET_F_HOST_UFO | VIRTIO_NET_F_GUEST_UFO)
606 
607 struct netmap_priv {
608 	char ifname[IFNAMSIZ];
609 	struct nm_desc *nmd;
610 	uint16_t memid;
611 	struct netmap_ring *rx;
612 	struct netmap_ring *tx;
613 	struct mevent *mevp;
614 	net_be_rxeof_t cb;
615 	void *cb_param;
616 };
617 
618 static void
619 nmreq_init(struct nmreq *req, char *ifname)
620 {
621 
622 	memset(req, 0, sizeof(*req));
623 	strlcpy(req->nr_name, ifname, sizeof(req->nr_name));
624 	req->nr_version = NETMAP_API;
625 }
626 
627 static int
628 netmap_set_vnet_hdr_len(struct net_backend *be, int vnet_hdr_len)
629 {
630 	int err;
631 	struct nmreq req;
632 	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
633 
634 	nmreq_init(&req, priv->ifname);
635 	req.nr_cmd = NETMAP_BDG_VNET_HDR;
636 	req.nr_arg1 = vnet_hdr_len;
637 	err = ioctl(be->fd, NIOCREGIF, &req);
638 	if (err) {
639 		WPRINTF(("Unable to set vnet header length %d",
640 				vnet_hdr_len));
641 		return (err);
642 	}
643 
644 	be->be_vnet_hdr_len = vnet_hdr_len;
645 
646 	return (0);
647 }
648 
649 static int
650 netmap_has_vnet_hdr_len(struct net_backend *be, unsigned vnet_hdr_len)
651 {
652 	int prev_hdr_len = be->be_vnet_hdr_len;
653 	int ret;
654 
655 	if (vnet_hdr_len == prev_hdr_len) {
656 		return (1);
657 	}
658 
659 	ret = netmap_set_vnet_hdr_len(be, vnet_hdr_len);
660 	if (ret) {
661 		return (0);
662 	}
663 
664 	netmap_set_vnet_hdr_len(be, prev_hdr_len);
665 
666 	return (1);
667 }
668 
669 static uint64_t
670 netmap_get_cap(struct net_backend *be)
671 {
672 
673 	return (netmap_has_vnet_hdr_len(be, VNET_HDR_LEN) ?
674 	    NETMAP_FEATURES : 0);
675 }
676 
/*
 * Apply the negotiated configuration. Only the header length is passed on
 * to netmap; 'features' is not examined here.
 */
static int
netmap_set_cap(struct net_backend *be, uint64_t features,
	       unsigned vnet_hdr_len)
{
	int rc;

	rc = netmap_set_vnet_hdr_len(be, vnet_hdr_len);
	return (rc);
}
684 
685 static int
686 netmap_init(struct net_backend *be, const char *devname,
687 	    nvlist_t *nvl, net_be_rxeof_t cb, void *param)
688 {
689 	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
690 
691 	strlcpy(priv->ifname, devname, sizeof(priv->ifname));
692 	priv->ifname[sizeof(priv->ifname) - 1] = '\0';
693 
694 	priv->nmd = nm_open(priv->ifname, NULL, NETMAP_NO_TX_POLL, NULL);
695 	if (priv->nmd == NULL) {
696 		WPRINTF(("Unable to nm_open(): interface '%s', errno (%s)",
697 			devname, strerror(errno)));
698 		free(priv);
699 		return (-1);
700 	}
701 
702 	priv->memid = priv->nmd->req.nr_arg2;
703 	priv->tx = NETMAP_TXRING(priv->nmd->nifp, 0);
704 	priv->rx = NETMAP_RXRING(priv->nmd->nifp, 0);
705 	priv->cb = cb;
706 	priv->cb_param = param;
707 	be->fd = priv->nmd->fd;
708 
709 	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
710 	if (priv->mevp == NULL) {
711 		WPRINTF(("Could not register event"));
712 		return (-1);
713 	}
714 
715 	return (0);
716 }
717 
718 static void
719 netmap_cleanup(struct net_backend *be)
720 {
721 	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
722 
723 	if (priv->mevp) {
724 		mevent_delete(priv->mevp);
725 	}
726 	if (priv->nmd) {
727 		nm_close(priv->nmd);
728 	}
729 	be->fd = -1;
730 }
731 
732 static ssize_t
733 netmap_send(struct net_backend *be, const struct iovec *iov,
734 	    int iovcnt)
735 {
736 	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
737 	struct netmap_ring *ring;
738 	ssize_t totlen = 0;
739 	int nm_buf_size;
740 	int nm_buf_len;
741 	uint32_t head;
742 	void *nm_buf;
743 	int j;
744 
745 	ring = priv->tx;
746 	head = ring->head;
747 	if (head == ring->tail) {
748 		WPRINTF(("No space, drop %zu bytes", count_iov(iov, iovcnt)));
749 		goto txsync;
750 	}
751 	nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
752 	nm_buf_size = ring->nr_buf_size;
753 	nm_buf_len = 0;
754 
755 	for (j = 0; j < iovcnt; j++) {
756 		int iov_frag_size = iov[j].iov_len;
757 		void *iov_frag_buf = iov[j].iov_base;
758 
759 		totlen += iov_frag_size;
760 
761 		/*
762 		 * Split each iovec fragment over more netmap slots, if
763 		 * necessary.
764 		 */
765 		for (;;) {
766 			int copylen;
767 
768 			copylen = iov_frag_size < nm_buf_size ? iov_frag_size : nm_buf_size;
769 			memcpy(nm_buf, iov_frag_buf, copylen);
770 
771 			iov_frag_buf += copylen;
772 			iov_frag_size -= copylen;
773 			nm_buf += copylen;
774 			nm_buf_size -= copylen;
775 			nm_buf_len += copylen;
776 
777 			if (iov_frag_size == 0) {
778 				break;
779 			}
780 
781 			ring->slot[head].len = nm_buf_len;
782 			ring->slot[head].flags = NS_MOREFRAG;
783 			head = nm_ring_next(ring, head);
784 			if (head == ring->tail) {
785 				/*
786 				 * We ran out of netmap slots while
787 				 * splitting the iovec fragments.
788 				 */
789 				WPRINTF(("No space, drop %zu bytes",
790 				   count_iov(iov, iovcnt)));
791 				goto txsync;
792 			}
793 			nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
794 			nm_buf_size = ring->nr_buf_size;
795 			nm_buf_len = 0;
796 		}
797 	}
798 
799 	/* Complete the last slot, which must not have NS_MOREFRAG set. */
800 	ring->slot[head].len = nm_buf_len;
801 	ring->slot[head].flags = 0;
802 	head = nm_ring_next(ring, head);
803 
804 	/* Now update ring->head and ring->cur. */
805 	ring->head = ring->cur = head;
806 txsync:
807 	ioctl(be->fd, NIOCTXSYNC, NULL);
808 
809 	return (totlen);
810 }
811 
812 static ssize_t
813 netmap_peek_recvlen(struct net_backend *be)
814 {
815 	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
816 	struct netmap_ring *ring = priv->rx;
817 	uint32_t head = ring->head;
818 	ssize_t totlen = 0;
819 
820 	while (head != ring->tail) {
821 		struct netmap_slot *slot = ring->slot + head;
822 
823 		totlen += slot->len;
824 		if ((slot->flags & NS_MOREFRAG) == 0)
825 			break;
826 		head = nm_ring_next(ring, head);
827 	}
828 
829 	return (totlen);
830 }
831 
832 static ssize_t
833 netmap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
834 {
835 	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
836 	struct netmap_slot *slot = NULL;
837 	struct netmap_ring *ring;
838 	void *iov_frag_buf;
839 	int iov_frag_size;
840 	ssize_t totlen = 0;
841 	uint32_t head;
842 
843 	assert(iovcnt);
844 
845 	ring = priv->rx;
846 	head = ring->head;
847 	iov_frag_buf = iov->iov_base;
848 	iov_frag_size = iov->iov_len;
849 
850 	do {
851 		int nm_buf_len;
852 		void *nm_buf;
853 
854 		if (head == ring->tail) {
855 			return (0);
856 		}
857 
858 		slot = ring->slot + head;
859 		nm_buf = NETMAP_BUF(ring, slot->buf_idx);
860 		nm_buf_len = slot->len;
861 
862 		for (;;) {
863 			int copylen = nm_buf_len < iov_frag_size ?
864 			    nm_buf_len : iov_frag_size;
865 
866 			memcpy(iov_frag_buf, nm_buf, copylen);
867 			nm_buf += copylen;
868 			nm_buf_len -= copylen;
869 			iov_frag_buf += copylen;
870 			iov_frag_size -= copylen;
871 			totlen += copylen;
872 
873 			if (nm_buf_len == 0) {
874 				break;
875 			}
876 
877 			iov++;
878 			iovcnt--;
879 			if (iovcnt == 0) {
880 				/* No space to receive. */
881 				WPRINTF(("Short iov, drop %zd bytes",
882 				    totlen));
883 				return (-ENOSPC);
884 			}
885 			iov_frag_buf = iov->iov_base;
886 			iov_frag_size = iov->iov_len;
887 		}
888 
889 		head = nm_ring_next(ring, head);
890 
891 	} while (slot->flags & NS_MOREFRAG);
892 
893 	/* Release slots to netmap. */
894 	ring->head = ring->cur = head;
895 
896 	return (totlen);
897 }
898 
899 static void
900 netmap_recv_enable(struct net_backend *be)
901 {
902 	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
903 
904 	mevent_enable(priv->mevp);
905 }
906 
907 static void
908 netmap_recv_disable(struct net_backend *be)
909 {
910 	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
911 
912 	mevent_disable(priv->mevp);
913 }
914 
915 static struct net_backend netmap_backend = {
916 	.prefix = "netmap",
917 	.priv_size = sizeof(struct netmap_priv),
918 	.init = netmap_init,
919 	.cleanup = netmap_cleanup,
920 	.send = netmap_send,
921 	.peek_recvlen = netmap_peek_recvlen,
922 	.recv = netmap_recv,
923 	.recv_enable = netmap_recv_enable,
924 	.recv_disable = netmap_recv_disable,
925 	.get_cap = netmap_get_cap,
926 	.set_cap = netmap_set_cap,
927 };
928 
929 /* A clone of the netmap backend, with a different prefix. */
930 static struct net_backend vale_backend = {
931 	.prefix = "vale",
932 	.priv_size = sizeof(struct netmap_priv),
933 	.init = netmap_init,
934 	.cleanup = netmap_cleanup,
935 	.send = netmap_send,
936 	.peek_recvlen = netmap_peek_recvlen,
937 	.recv = netmap_recv,
938 	.recv_enable = netmap_recv_enable,
939 	.recv_disable = netmap_recv_disable,
940 	.get_cap = netmap_get_cap,
941 	.set_cap = netmap_set_cap,
942 };
943 
944 DATA_SET(net_backend_set, netmap_backend);
945 DATA_SET(net_backend_set, vale_backend);
946 
947 int
948 netbe_legacy_config(nvlist_t *nvl, const char *opts)
949 {
950 	char *backend, *cp;
951 
952 	if (opts == NULL)
953 		return (0);
954 
955 	cp = strchr(opts, ',');
956 	if (cp == NULL) {
957 		set_config_value_node(nvl, "backend", opts);
958 		return (0);
959 	}
960 	backend = strndup(opts, cp - opts);
961 	set_config_value_node(nvl, "backend", backend);
962 	free(backend);
963 	return (pci_parse_legacy_config(nvl, cp + 1));
964 }
965 
966 /*
967  * Initialize a backend and attach to the frontend.
968  * This is called during frontend initialization.
969  *  @ret is a pointer to the backend to be initialized
970  *  @devname is the backend-name as supplied on the command line,
971  * 	e.g. -s 2:0,frontend-name,backend-name[,other-args]
972  *  @cb is the receive callback supplied by the frontend,
973  *	and it is invoked in the event loop when a receive
974  *	event is generated in the hypervisor,
975  *  @param is a pointer to the frontend, and normally used as
976  *	the argument for the callback.
977  */
978 int
979 netbe_init(struct net_backend **ret, nvlist_t *nvl, net_be_rxeof_t cb,
980     void *param)
981 {
982 	struct net_backend **pbe, *nbe, *tbe = NULL;
983 	const char *value;
984 	char *devname;
985 	int err;
986 
987 	value = get_config_value_node(nvl, "backend");
988 	if (value == NULL) {
989 		return (-1);
990 	}
991 	devname = strdup(value);
992 
993 	/*
994 	 * Find the network backend that matches the user-provided
995 	 * device name. net_backend_set is built using a linker set.
996 	 */
997 	SET_FOREACH(pbe, net_backend_set) {
998 		if (strncmp(devname, (*pbe)->prefix,
999 		    strlen((*pbe)->prefix)) == 0) {
1000 			tbe = *pbe;
1001 			assert(tbe->init != NULL);
1002 			assert(tbe->cleanup != NULL);
1003 			assert(tbe->send != NULL);
1004 			assert(tbe->recv != NULL);
1005 			assert(tbe->get_cap != NULL);
1006 			assert(tbe->set_cap != NULL);
1007 			break;
1008 		}
1009 	}
1010 
1011 	*ret = NULL;
1012 	if (tbe == NULL) {
1013 		free(devname);
1014 		return (EINVAL);
1015 	}
1016 
1017 	nbe = calloc(1, sizeof(*nbe) + tbe->priv_size);
1018 	*nbe = *tbe;	/* copy the template */
1019 	nbe->fd = -1;
1020 	nbe->sc = param;
1021 	nbe->be_vnet_hdr_len = 0;
1022 	nbe->fe_vnet_hdr_len = 0;
1023 
1024 	/* Initialize the backend. */
1025 	err = nbe->init(nbe, devname, nvl, cb, param);
1026 	if (err) {
1027 		free(devname);
1028 		free(nbe);
1029 		return (err);
1030 	}
1031 
1032 	*ret = nbe;
1033 	free(devname);
1034 
1035 	return (0);
1036 }
1037 
1038 void
1039 netbe_cleanup(struct net_backend *be)
1040 {
1041 
1042 	if (be != NULL) {
1043 		be->cleanup(be);
1044 		free(be);
1045 	}
1046 }
1047 
1048 uint64_t
1049 netbe_get_cap(struct net_backend *be)
1050 {
1051 
1052 	assert(be != NULL);
1053 	return (be->get_cap(be));
1054 }
1055 
1056 int
1057 netbe_set_cap(struct net_backend *be, uint64_t features,
1058 	      unsigned vnet_hdr_len)
1059 {
1060 	int ret;
1061 
1062 	assert(be != NULL);
1063 
1064 	/* There are only three valid lengths, i.e., 0, 10 and 12. */
1065 	if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN
1066 		&& vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t)))
1067 		return (-1);
1068 
1069 	be->fe_vnet_hdr_len = vnet_hdr_len;
1070 
1071 	ret = be->set_cap(be, features, vnet_hdr_len);
1072 	assert(be->be_vnet_hdr_len == 0 ||
1073 	       be->be_vnet_hdr_len == be->fe_vnet_hdr_len);
1074 
1075 	return (ret);
1076 }
1077 
1078 ssize_t
1079 netbe_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
1080 {
1081 
1082 	return (be->send(be, iov, iovcnt));
1083 }
1084 
1085 ssize_t
1086 netbe_peek_recvlen(struct net_backend *be)
1087 {
1088 
1089 	return (be->peek_recvlen(be));
1090 }
1091 
1092 /*
1093  * Try to read a packet from the backend, without blocking.
1094  * If no packets are available, return 0. In case of success, return
1095  * the length of the packet just read. Return -1 in case of errors.
1096  */
1097 ssize_t
1098 netbe_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
1099 {
1100 
1101 	return (be->recv(be, iov, iovcnt));
1102 }
1103 
/*
 * Read a packet from the backend and discard it.
 * Returns the size of the discarded packet or zero if no packet was available.
 * A negative error code is returned in case of read error.
 */
ssize_t
netbe_rx_discard(struct net_backend *be)
{
	/*
	 * MP note: the dummybuf is only used to discard frames,
	 * so there is no need for it to be per-vtnet or locked.
	 * We only make it large enough for TSO-sized segment.
	 */
	static uint8_t dummybuf[65536 + 64];
	struct iovec sink = {
		.iov_base = dummybuf,
		.iov_len = sizeof(dummybuf),
	};

	return (netbe_recv(be, &sink, 1));
}
1125 
1126 void
1127 netbe_rx_disable(struct net_backend *be)
1128 {
1129 
1130 	return be->recv_disable(be);
1131 }
1132 
1133 void
1134 netbe_rx_enable(struct net_backend *be)
1135 {
1136 
1137 	return be->recv_enable(be);
1138 }
1139 
1140 size_t
1141 netbe_get_vnet_hdr_len(struct net_backend *be)
1142 {
1143 
1144 	return (be->be_vnet_hdr_len);
1145 }
1146