xref: /freebsd/usr.sbin/bhyve/net_backends.c (revision b3e7694832e81d7a904a10f525f8797b753bf0d3)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2019 Vincenzo Maffione <vmaffione@FreeBSD.org>
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
19  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
20  * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
21  * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
24  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
25  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 /*
29  * This file implements multiple network backends (tap, netmap, ...),
30  * to be used by network frontends such as virtio-net and e1000.
31  * The API to access the backend (e.g. send/receive packets, negotiate
32  * features) is exported by net_backends.h.
33  */
34 
35 #include <sys/cdefs.h>
36 __FBSDID("$FreeBSD$");
37 
38 #include <sys/types.h>		/* u_short etc */
39 #ifndef WITHOUT_CAPSICUM
40 #include <sys/capsicum.h>
41 #endif
42 #include <sys/ioctl.h>
43 #include <sys/mman.h>
44 #include <sys/uio.h>
45 
46 #include <net/if.h>
47 #if defined(INET6) || defined(INET)
48 #include <net/if_tap.h>
49 #endif
50 #include <net/netmap.h>
51 #include <net/netmap_virt.h>
52 #define NETMAP_WITH_LIBS
53 #include <net/netmap_user.h>
54 
55 #ifndef WITHOUT_CAPSICUM
56 #include <capsicum_helpers.h>
57 #endif
58 #include <err.h>
59 #include <errno.h>
60 #include <fcntl.h>
61 #include <stdio.h>
62 #include <stdlib.h>
63 #include <stdint.h>
64 #include <string.h>
65 #include <unistd.h>
66 #include <sysexits.h>
67 #include <assert.h>
68 #include <pthread.h>
69 #include <pthread_np.h>
70 #include <poll.h>
71 #include <assert.h>
72 
73 #ifdef NETGRAPH
74 #include <sys/param.h>
75 #include <sys/sysctl.h>
76 #include <netgraph.h>
77 #endif
78 
79 #include "config.h"
80 #include "debug.h"
81 #include "iov.h"
82 #include "mevent.h"
83 #include "net_backends.h"
84 #include "pci_emul.h"
85 
86 #include <sys/linker_set.h>
87 
88 /*
89  * Each network backend registers a set of function pointers that are
90  * used to implement the net backends API.
91  * This might need to be exposed if we implement backends in separate files.
92  */
struct net_backend {
	const char *prefix;	/* prefix matching this backend */

	/*
	 * Routines used to initialize and cleanup the resources needed
	 * by a backend. The cleanup function is used internally,
	 * and should not be called by the frontend.
	 */
	int (*init)(struct net_backend *be, const char *devname,
	    nvlist_t *nvl, net_be_rxeof_t cb, void *param);
	void (*cleanup)(struct net_backend *be);

	/*
	 * Called to serve a guest transmit request. The scatter-gather
	 * vector provided by the caller has 'iovcnt' elements and contains
	 * the packet to send.
	 */
	ssize_t (*send)(struct net_backend *be, const struct iovec *iov,
	    int iovcnt);

	/*
	 * Get the length of the next packet that can be received from
	 * the backend. If no packets are currently available, this
	 * function returns 0.
	 */
	ssize_t (*peek_recvlen)(struct net_backend *be);

	/*
	 * Called to receive a packet from the backend. When the function
	 * returns a positive value 'len', the scatter-gather vector
	 * provided by the caller contains a packet with such length.
	 * The function returns 0 if the backend doesn't have a new packet to
	 * receive.
	 */
	ssize_t (*recv)(struct net_backend *be, const struct iovec *iov,
	    int iovcnt);

	/*
	 * Ask the backend to enable or disable receive operation in the
	 * backend. On return from a disable operation, it is guaranteed
	 * that the receive callback won't be called until receive is
	 * enabled again. Note however that it is up to the caller to make
	 * sure that netbe_recv() is not currently being executed by another
	 * thread.
	 */
	void (*recv_enable)(struct net_backend *be);
	void (*recv_disable)(struct net_backend *be);

	/*
	 * Ask the backend for the virtio-net features it is able to
	 * support. Possible features are TSO, UFO and checksum offloading
	 * in both rx and tx direction and for both IPv4 and IPv6.
	 */
	uint64_t (*get_cap)(struct net_backend *be);

	/*
	 * Tell the backend to enable/disable the specified virtio-net
	 * features (capabilities).
	 */
	int (*set_cap)(struct net_backend *be, uint64_t features,
	    unsigned int vnet_hdr_len);

	struct pci_vtnet_softc *sc;	/* softc of the attached frontend */
	int fd;				/* backend file descriptor, -1 if unused */

	/*
	 * Length of the virtio-net header used by the backend and the
	 * frontend, respectively. A zero value means that the header
	 * is not used.
	 */
	unsigned int be_vnet_hdr_len;
	unsigned int fe_vnet_hdr_len;

	/* Size of backend-specific private data. */
	size_t priv_size;

	/* Backend-specific private data follows. */
};
171 
/* Pointer to a backend's private area, placed right after the struct. */
#define	NET_BE_PRIV(be)		((void *)((be) + 1))
/* Total allocation size for a backend instance: header plus private area. */
#define	NET_BE_SIZE(be)		(sizeof(*be) + (be)->priv_size)

/* Linker set collecting all backend templates registered via DATA_SET(). */
SET_DECLARE(net_backend_set, struct net_backend);

/* Full virtio-net header length (12 bytes, includes num_buffers field). */
#define VNET_HDR_LEN	sizeof(struct virtio_net_rxhdr)

/* Warning printout helper; params must be a parenthesized argument list. */
#define WPRINTF(params) PRINTLN params
180 
181 /*
182  * The tap backend
183  */
184 
#if defined(INET6) || defined(INET)
/*
 * Protocol families to try, in order, when creating the control socket
 * used by tap_init() to bring the tap interface UP.
 */
static const int pf_list[] = {
#if defined(INET6)
	PF_INET6,
#endif
#if defined(INET)
	PF_INET,
#endif
};
#endif
195 
/* Private state of the tap backend (also reused by the netgraph backend). */
struct tap_priv {
	struct mevent *mevp;	/* read event registered with the event loop */
	/*
	 * A bounce buffer that allows us to implement the peek_recvlen
	 * callback. In the future we may get the same information from
	 * the kevent data.
	 */
	char bbuf[1 << 16];
	ssize_t bbuflen;	/* length of the buffered packet, 0 if empty */
};
206 
207 static void
208 tap_cleanup(struct net_backend *be)
209 {
210 	struct tap_priv *priv = NET_BE_PRIV(be);
211 
212 	if (priv->mevp) {
213 		mevent_delete(priv->mevp);
214 	}
215 	if (be->fd != -1) {
216 		close(be->fd);
217 		be->fd = -1;
218 	}
219 }
220 
221 static int
222 tap_init(struct net_backend *be, const char *devname,
223     nvlist_t *nvl __unused, net_be_rxeof_t cb, void *param)
224 {
225 	struct tap_priv *priv = NET_BE_PRIV(be);
226 	char tbuf[80];
227 	int opt = 1;
228 #if defined(INET6) || defined(INET)
229 	struct ifreq ifrq;
230 	int s;
231 #endif
232 #ifndef WITHOUT_CAPSICUM
233 	cap_rights_t rights;
234 #endif
235 
236 	if (cb == NULL) {
237 		WPRINTF(("TAP backend requires non-NULL callback"));
238 		return (-1);
239 	}
240 
241 	strcpy(tbuf, "/dev/");
242 	strlcat(tbuf, devname, sizeof(tbuf));
243 
244 	be->fd = open(tbuf, O_RDWR);
245 	if (be->fd == -1) {
246 		WPRINTF(("open of tap device %s failed", tbuf));
247 		goto error;
248 	}
249 
250 	/*
251 	 * Set non-blocking and register for read
252 	 * notifications with the event loop
253 	 */
254 	if (ioctl(be->fd, FIONBIO, &opt) < 0) {
255 		WPRINTF(("tap device O_NONBLOCK failed"));
256 		goto error;
257 	}
258 
259 #if defined(INET6) || defined(INET)
260 	/*
261 	 * Try to UP the interface rather than relying on
262 	 * net.link.tap.up_on_open.
263 	  */
264 	bzero(&ifrq, sizeof(ifrq));
265 	if (ioctl(be->fd, TAPGIFNAME, &ifrq) < 0) {
266 		WPRINTF(("Could not get interface name"));
267 		goto error;
268 	}
269 
270 	s = -1;
271 	for (size_t i = 0; s == -1 && i < nitems(pf_list); i++)
272 		s = socket(pf_list[i], SOCK_DGRAM, 0);
273 	if (s == -1) {
274 		WPRINTF(("Could open socket"));
275 		goto error;
276 	}
277 
278 	if (ioctl(s, SIOCGIFFLAGS, &ifrq) < 0) {
279 		(void)close(s);
280 		WPRINTF(("Could not get interface flags"));
281 		goto error;
282 	}
283 	ifrq.ifr_flags |= IFF_UP;
284 	if (ioctl(s, SIOCSIFFLAGS, &ifrq) < 0) {
285 		(void)close(s);
286 		WPRINTF(("Could not set interface flags"));
287 		goto error;
288 	}
289 	(void)close(s);
290 #endif
291 
292 #ifndef WITHOUT_CAPSICUM
293 	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
294 	if (caph_rights_limit(be->fd, &rights) == -1)
295 		errx(EX_OSERR, "Unable to apply rights for sandbox");
296 #endif
297 
298 	memset(priv->bbuf, 0, sizeof(priv->bbuf));
299 	priv->bbuflen = 0;
300 
301 	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
302 	if (priv->mevp == NULL) {
303 		WPRINTF(("Could not register event"));
304 		goto error;
305 	}
306 
307 	return (0);
308 
309 error:
310 	tap_cleanup(be);
311 	return (-1);
312 }
313 
314 /*
315  * Called to send a buffer chain out to the tap device
316  */
317 static ssize_t
318 tap_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
319 {
320 	return (writev(be->fd, iov, iovcnt));
321 }
322 
323 static ssize_t
324 tap_peek_recvlen(struct net_backend *be)
325 {
326 	struct tap_priv *priv = NET_BE_PRIV(be);
327 	ssize_t ret;
328 
329 	if (priv->bbuflen > 0) {
330 		/*
331 		 * We already have a packet in the bounce buffer.
332 		 * Just return its length.
333 		 */
334 		return priv->bbuflen;
335 	}
336 
337 	/*
338 	 * Read the next packet (if any) into the bounce buffer, so
339 	 * that we get to know its length and we can return that
340 	 * to the caller.
341 	 */
342 	ret = read(be->fd, priv->bbuf, sizeof(priv->bbuf));
343 	if (ret < 0 && errno == EWOULDBLOCK) {
344 		return (0);
345 	}
346 
347 	if (ret > 0)
348 		priv->bbuflen = ret;
349 
350 	return (ret);
351 }
352 
353 static ssize_t
354 tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
355 {
356 	struct tap_priv *priv = NET_BE_PRIV(be);
357 	ssize_t ret;
358 
359 	if (priv->bbuflen > 0) {
360 		/*
361 		 * A packet is available in the bounce buffer, so
362 		 * we read it from there.
363 		 */
364 		ret = buf_to_iov(priv->bbuf, priv->bbuflen,
365 		    iov, iovcnt, 0);
366 
367 		/* Mark the bounce buffer as empty. */
368 		priv->bbuflen = 0;
369 
370 		return (ret);
371 	}
372 
373 	ret = readv(be->fd, iov, iovcnt);
374 	if (ret < 0 && errno == EWOULDBLOCK) {
375 		return (0);
376 	}
377 
378 	return (ret);
379 }
380 
381 static void
382 tap_recv_enable(struct net_backend *be)
383 {
384 	struct tap_priv *priv = NET_BE_PRIV(be);
385 
386 	mevent_enable(priv->mevp);
387 }
388 
389 static void
390 tap_recv_disable(struct net_backend *be)
391 {
392 	struct tap_priv *priv = NET_BE_PRIV(be);
393 
394 	mevent_disable(priv->mevp);
395 }
396 
397 static uint64_t
398 tap_get_cap(struct net_backend *be __unused)
399 {
400 
401 	return (0); /* no capabilities for now */
402 }
403 
404 static int
405 tap_set_cap(struct net_backend *be __unused, uint64_t features,
406     unsigned vnet_hdr_len)
407 {
408 
409 	return ((features || vnet_hdr_len) ? -1 : 0);
410 }
411 
/* Template for the tap backend; copied per-instance by netbe_init(). */
static struct net_backend tap_backend = {
	.prefix = "tap",
	.priv_size = sizeof(struct tap_priv),
	.init = tap_init,
	.cleanup = tap_cleanup,
	.send = tap_send,
	.peek_recvlen = tap_peek_recvlen,
	.recv = tap_recv,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,
};
425 
/* A clone of the tap backend, with a different prefix. */
static struct net_backend vmnet_backend = {
	.prefix = "vmnet",
	.priv_size = sizeof(struct tap_priv),
	.init = tap_init,
	.cleanup = tap_cleanup,
	.send = tap_send,
	.peek_recvlen = tap_peek_recvlen,
	.recv = tap_recv,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,
};

/* Register both templates in the net_backend_set linker set. */
DATA_SET(net_backend_set, tap_backend);
DATA_SET(net_backend_set, vmnet_backend);
443 
444 #ifdef NETGRAPH
445 
/*
 * Netgraph backend
 */

/* Cap for the ng_socket(4) send/receive buffer size (4 MiB). */
#define NG_SBUF_MAX_SIZE (4 * 1024 * 1024)

/*
 * Initialize the netgraph backend: create a ng_socket(4) node, connect
 * its hook to the node/hook named in the configuration, make the data
 * socket non-blocking, enlarge its buffers, sandbox it and register a
 * (disabled) read event. The data socket behaves like a tap descriptor,
 * so this backend reuses struct tap_priv and the tap_* callbacks.
 * Returns 0 on success, -1 on failure (cleanup via tap_cleanup()).
 */
static int
ng_init(struct net_backend *be, const char *devname __unused,
	 nvlist_t *nvl, net_be_rxeof_t cb, void *param)
{
	struct tap_priv *p = NET_BE_PRIV(be);
	struct ngm_connect ngc;
	const char *value, *nodename;
	int sbsz;
	int ctrl_sock;
	int flags;
	unsigned long maxsbsz;
	size_t msbsz;
#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
#endif

	if (cb == NULL) {
		WPRINTF(("Netgraph backend requires non-NULL callback"));
		return (-1);
	}

	be->fd = -1;

	memset(&ngc, 0, sizeof(ngc));

	/* Destination node path is mandatory. */
	value = get_config_value_node(nvl, "path");
	if (value == NULL) {
		WPRINTF(("path must be provided"));
		return (-1);
	}
	strncpy(ngc.path, value, NG_PATHSIZ - 1);

	/* Our hook name defaults to "vmlink". */
	value = get_config_value_node(nvl, "hook");
	if (value == NULL)
		value = "vmlink";
	strncpy(ngc.ourhook, value, NG_HOOKSIZ - 1);

	/* The peer's hook name is mandatory. */
	value = get_config_value_node(nvl, "peerhook");
	if (value == NULL) {
		WPRINTF(("peer hook must be provided"));
		return (-1);
	}
	strncpy(ngc.peerhook, value, NG_HOOKSIZ - 1);

	/* Create the socket node: control socket + data socket (be->fd). */
	nodename = get_config_value_node(nvl, "socket");
	if (NgMkSockNode(nodename,
		&ctrl_sock, &be->fd) < 0) {
		WPRINTF(("can't get Netgraph sockets"));
		return (-1);
	}

	if (NgSendMsg(ctrl_sock, ".",
		NGM_GENERIC_COOKIE,
		NGM_CONNECT, &ngc, sizeof(ngc)) < 0) {
		WPRINTF(("can't connect to node"));
		close(ctrl_sock);
		goto error;
	}

	/* The control socket is only needed for the connect message. */
	close(ctrl_sock);

	flags = fcntl(be->fd, F_GETFL);

	if (flags < 0) {
		WPRINTF(("can't get socket flags"));
		goto error;
	}

	if (fcntl(be->fd, F_SETFL, flags | O_NONBLOCK) < 0) {
		WPRINTF(("can't set O_NONBLOCK flag"));
		goto error;
	}

	/*
	 * The default ng_socket(4) buffer's size is too low.
	 * Calculate the minimum value between NG_SBUF_MAX_SIZE
	 * and kern.ipc.maxsockbuf.
	 */
	msbsz = sizeof(maxsbsz);
	if (sysctlbyname("kern.ipc.maxsockbuf", &maxsbsz, &msbsz,
		NULL, 0) < 0) {
		WPRINTF(("can't get 'kern.ipc.maxsockbuf' value"));
		goto error;
	}

	/*
	 * We can't set the socket buffer size to kern.ipc.maxsockbuf value,
	 * as it takes into account the mbuf(9) overhead.
	 */
	maxsbsz = maxsbsz * MCLBYTES / (MSIZE + MCLBYTES);

	sbsz = MIN(NG_SBUF_MAX_SIZE, maxsbsz);

	if (setsockopt(be->fd, SOL_SOCKET, SO_SNDBUF, &sbsz,
		sizeof(sbsz)) < 0) {
		WPRINTF(("can't set TX buffer size"));
		goto error;
	}

	if (setsockopt(be->fd, SOL_SOCKET, SO_RCVBUF, &sbsz,
		sizeof(sbsz)) < 0) {
		WPRINTF(("can't set RX buffer size"));
		goto error;
	}

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
	if (caph_rights_limit(be->fd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	memset(p->bbuf, 0, sizeof(p->bbuf));
	p->bbuflen = 0;

	p->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
	if (p->mevp == NULL) {
		WPRINTF(("Could not register event"));
		goto error;
	}

	return (0);

error:
	/* tap_cleanup() works here because this backend uses tap_priv. */
	tap_cleanup(be);
	return (-1);
}
578 
/*
 * Netgraph backend template. The data socket behaves like a tap
 * descriptor, so all callbacks except init are shared with tap.
 */
static struct net_backend ng_backend = {
	.prefix = "netgraph",
	.priv_size = sizeof(struct tap_priv),
	.init = ng_init,
	.cleanup = tap_cleanup,
	.send = tap_send,
	.peek_recvlen = tap_peek_recvlen,
	.recv = tap_recv,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,
};

DATA_SET(net_backend_set, ng_backend);
594 
595 #endif /* NETGRAPH */
596 
597 /*
598  * The netmap backend
599  */
600 
/* The virtio-net features supported by netmap. */
#define NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \
		VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \
		VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 | \
		VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO)

/* Private state of the netmap backend. */
struct netmap_priv {
	char ifname[IFNAMSIZ];		/* netmap port name */
	struct nm_desc *nmd;		/* descriptor returned by nm_open() */
	uint16_t memid;			/* netmap memory region id */
	struct netmap_ring *rx;		/* first RX ring */
	struct netmap_ring *tx;		/* first TX ring */
	struct mevent *mevp;		/* read event in the event loop */
	net_be_rxeof_t cb;		/* frontend rx callback */
	void *cb_param;			/* argument for the callback */
};
617 
618 static void
619 nmreq_init(struct nmreq *req, char *ifname)
620 {
621 
622 	memset(req, 0, sizeof(*req));
623 	strlcpy(req->nr_name, ifname, sizeof(req->nr_name));
624 	req->nr_version = NETMAP_API;
625 }
626 
627 static int
628 netmap_set_vnet_hdr_len(struct net_backend *be, int vnet_hdr_len)
629 {
630 	int err;
631 	struct nmreq req;
632 	struct netmap_priv *priv = NET_BE_PRIV(be);
633 
634 	nmreq_init(&req, priv->ifname);
635 	req.nr_cmd = NETMAP_BDG_VNET_HDR;
636 	req.nr_arg1 = vnet_hdr_len;
637 	err = ioctl(be->fd, NIOCREGIF, &req);
638 	if (err) {
639 		WPRINTF(("Unable to set vnet header length %d",
640 				vnet_hdr_len));
641 		return (err);
642 	}
643 
644 	be->be_vnet_hdr_len = vnet_hdr_len;
645 
646 	return (0);
647 }
648 
/*
 * Check whether the netmap port supports a virtio-net header of the
 * given length. The check is done by probing: temporarily set the new
 * length and, if that succeeds, restore the previous one. Returns 1 if
 * the length is supported, 0 otherwise.
 */
static int
netmap_has_vnet_hdr_len(struct net_backend *be, unsigned vnet_hdr_len)
{
	unsigned prev_hdr_len = be->be_vnet_hdr_len;
	int ret;

	/* The current length is trivially supported. */
	if (vnet_hdr_len == prev_hdr_len) {
		return (1);
	}

	/* Probe by actually setting the requested length. */
	ret = netmap_set_vnet_hdr_len(be, vnet_hdr_len);
	if (ret) {
		return (0);
	}

	/* The probe succeeded; restore the original length. */
	netmap_set_vnet_hdr_len(be, prev_hdr_len);

	return (1);
}
668 
669 static uint64_t
670 netmap_get_cap(struct net_backend *be)
671 {
672 
673 	return (netmap_has_vnet_hdr_len(be, VNET_HDR_LEN) ?
674 	    NETMAP_FEATURES : 0);
675 }
676 
677 static int
678 netmap_set_cap(struct net_backend *be, uint64_t features __unused,
679     unsigned vnet_hdr_len)
680 {
681 
682 	return (netmap_set_vnet_hdr_len(be, vnet_hdr_len));
683 }
684 
685 static int
686 netmap_init(struct net_backend *be, const char *devname,
687     nvlist_t *nvl __unused, net_be_rxeof_t cb, void *param)
688 {
689 	struct netmap_priv *priv = NET_BE_PRIV(be);
690 
691 	strlcpy(priv->ifname, devname, sizeof(priv->ifname));
692 	priv->ifname[sizeof(priv->ifname) - 1] = '\0';
693 
694 	priv->nmd = nm_open(priv->ifname, NULL, NETMAP_NO_TX_POLL, NULL);
695 	if (priv->nmd == NULL) {
696 		WPRINTF(("Unable to nm_open(): interface '%s', errno (%s)",
697 			devname, strerror(errno)));
698 		return (-1);
699 	}
700 
701 	priv->memid = priv->nmd->req.nr_arg2;
702 	priv->tx = NETMAP_TXRING(priv->nmd->nifp, 0);
703 	priv->rx = NETMAP_RXRING(priv->nmd->nifp, 0);
704 	priv->cb = cb;
705 	priv->cb_param = param;
706 	be->fd = priv->nmd->fd;
707 
708 	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
709 	if (priv->mevp == NULL) {
710 		WPRINTF(("Could not register event"));
711 		return (-1);
712 	}
713 
714 	return (0);
715 }
716 
717 static void
718 netmap_cleanup(struct net_backend *be)
719 {
720 	struct netmap_priv *priv = NET_BE_PRIV(be);
721 
722 	if (priv->mevp) {
723 		mevent_delete(priv->mevp);
724 	}
725 	if (priv->nmd) {
726 		nm_close(priv->nmd);
727 	}
728 	be->fd = -1;
729 }
730 
/*
 * Transmit a guest packet through the netmap TX ring, splitting the
 * scatter-gather vector over as many slots as needed (NS_MOREFRAG
 * chains the fragments). The ring is synced with NIOCTXSYNC whether
 * or not the packet fit; if it did not fit it is silently dropped
 * (the reported length still counts the bytes seen so far).
 */
static ssize_t
netmap_send(struct net_backend *be, const struct iovec *iov,
	    int iovcnt)
{
	struct netmap_priv *priv = NET_BE_PRIV(be);
	struct netmap_ring *ring;
	ssize_t totlen = 0;
	int nm_buf_size;	/* bytes left in the current slot buffer */
	int nm_buf_len;		/* bytes already written to the current slot */
	uint32_t head;
	uint8_t *nm_buf;
	int j;

	ring = priv->tx;
	head = ring->head;
	if (head == ring->tail) {
		/* Ring full: drop the packet, but still sync below. */
		WPRINTF(("No space, drop %zu bytes", count_iov(iov, iovcnt)));
		goto txsync;
	}
	nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
	nm_buf_size = ring->nr_buf_size;
	nm_buf_len = 0;

	for (j = 0; j < iovcnt; j++) {
		uint8_t *iov_frag_buf = iov[j].iov_base;
		int iov_frag_size = iov[j].iov_len;

		totlen += iov_frag_size;

		/*
		 * Split each iovec fragment over more netmap slots, if
		 * necessary.
		 */
		for (;;) {
			int copylen;

			copylen = iov_frag_size < nm_buf_size ? iov_frag_size : nm_buf_size;
			memcpy(nm_buf, iov_frag_buf, copylen);

			iov_frag_buf += copylen;
			iov_frag_size -= copylen;
			nm_buf += copylen;
			nm_buf_size -= copylen;
			nm_buf_len += copylen;

			if (iov_frag_size == 0) {
				/* This fragment is done; move to the next one. */
				break;
			}

			/* Current slot is full: chain another one. */
			ring->slot[head].len = nm_buf_len;
			ring->slot[head].flags = NS_MOREFRAG;
			head = nm_ring_next(ring, head);
			if (head == ring->tail) {
				/*
				 * We ran out of netmap slots while
				 * splitting the iovec fragments.
				 */
				WPRINTF(("No space, drop %zu bytes",
				   count_iov(iov, iovcnt)));
				goto txsync;
			}
			nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
			nm_buf_size = ring->nr_buf_size;
			nm_buf_len = 0;
		}
	}

	/* Complete the last slot, which must not have NS_MOREFRAG set. */
	ring->slot[head].len = nm_buf_len;
	ring->slot[head].flags = 0;
	head = nm_ring_next(ring, head);

	/* Now update ring->head and ring->cur. */
	ring->head = ring->cur = head;
txsync:
	ioctl(be->fd, NIOCTXSYNC, NULL);

	return (totlen);
}
810 
811 static ssize_t
812 netmap_peek_recvlen(struct net_backend *be)
813 {
814 	struct netmap_priv *priv = NET_BE_PRIV(be);
815 	struct netmap_ring *ring = priv->rx;
816 	uint32_t head = ring->head;
817 	ssize_t totlen = 0;
818 
819 	while (head != ring->tail) {
820 		struct netmap_slot *slot = ring->slot + head;
821 
822 		totlen += slot->len;
823 		if ((slot->flags & NS_MOREFRAG) == 0)
824 			break;
825 		head = nm_ring_next(ring, head);
826 	}
827 
828 	return (totlen);
829 }
830 
/*
 * Receive one packet from the netmap RX ring into the caller's
 * scatter-gather vector, reassembling NS_MOREFRAG slot chains. Returns
 * the packet length, 0 if the ring is empty, or -ENOSPC if the iovec
 * is too small for the packet (which is then dropped).
 */
static ssize_t
netmap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
{
	struct netmap_priv *priv = NET_BE_PRIV(be);
	struct netmap_slot *slot = NULL;
	struct netmap_ring *ring;
	uint8_t *iov_frag_buf;
	int iov_frag_size;
	ssize_t totlen = 0;
	uint32_t head;

	assert(iovcnt);

	ring = priv->rx;
	head = ring->head;
	iov_frag_buf = iov->iov_base;
	iov_frag_size = iov->iov_len;

	do {
		uint8_t *nm_buf;
		int nm_buf_len;

		if (head == ring->tail) {
			/* Ring empty (or chain truncated): nothing to return. */
			return (0);
		}

		slot = ring->slot + head;
		nm_buf = NETMAP_BUF(ring, slot->buf_idx);
		nm_buf_len = slot->len;

		/* Copy this slot's payload, spilling into further iov entries. */
		for (;;) {
			int copylen = nm_buf_len < iov_frag_size ?
			    nm_buf_len : iov_frag_size;

			memcpy(iov_frag_buf, nm_buf, copylen);
			nm_buf += copylen;
			nm_buf_len -= copylen;
			iov_frag_buf += copylen;
			iov_frag_size -= copylen;
			totlen += copylen;

			if (nm_buf_len == 0) {
				break;
			}

			iov++;
			iovcnt--;
			if (iovcnt == 0) {
				/* No space to receive. */
				WPRINTF(("Short iov, drop %zd bytes",
				    totlen));
				return (-ENOSPC);
			}
			iov_frag_buf = iov->iov_base;
			iov_frag_size = iov->iov_len;
		}

		head = nm_ring_next(ring, head);

	} while (slot->flags & NS_MOREFRAG);

	/* Release slots to netmap. */
	ring->head = ring->cur = head;

	return (totlen);
}
897 
898 static void
899 netmap_recv_enable(struct net_backend *be)
900 {
901 	struct netmap_priv *priv = NET_BE_PRIV(be);
902 
903 	mevent_enable(priv->mevp);
904 }
905 
906 static void
907 netmap_recv_disable(struct net_backend *be)
908 {
909 	struct netmap_priv *priv = NET_BE_PRIV(be);
910 
911 	mevent_disable(priv->mevp);
912 }
913 
/* Template for the netmap backend; copied per-instance by netbe_init(). */
static struct net_backend netmap_backend = {
	.prefix = "netmap",
	.priv_size = sizeof(struct netmap_priv),
	.init = netmap_init,
	.cleanup = netmap_cleanup,
	.send = netmap_send,
	.peek_recvlen = netmap_peek_recvlen,
	.recv = netmap_recv,
	.recv_enable = netmap_recv_enable,
	.recv_disable = netmap_recv_disable,
	.get_cap = netmap_get_cap,
	.set_cap = netmap_set_cap,
};

/* A clone of the netmap backend, with a different prefix. */
static struct net_backend vale_backend = {
	.prefix = "vale",
	.priv_size = sizeof(struct netmap_priv),
	.init = netmap_init,
	.cleanup = netmap_cleanup,
	.send = netmap_send,
	.peek_recvlen = netmap_peek_recvlen,
	.recv = netmap_recv,
	.recv_enable = netmap_recv_enable,
	.recv_disable = netmap_recv_disable,
	.get_cap = netmap_get_cap,
	.set_cap = netmap_set_cap,
};

/* Register both templates in the net_backend_set linker set. */
DATA_SET(net_backend_set, netmap_backend);
DATA_SET(net_backend_set, vale_backend);
945 
946 int
947 netbe_legacy_config(nvlist_t *nvl, const char *opts)
948 {
949 	char *backend, *cp;
950 
951 	if (opts == NULL)
952 		return (0);
953 
954 	cp = strchr(opts, ',');
955 	if (cp == NULL) {
956 		set_config_value_node(nvl, "backend", opts);
957 		return (0);
958 	}
959 	backend = strndup(opts, cp - opts);
960 	set_config_value_node(nvl, "backend", backend);
961 	free(backend);
962 	return (pci_parse_legacy_config(nvl, cp + 1));
963 }
964 
965 /*
966  * Initialize a backend and attach to the frontend.
967  * This is called during frontend initialization.
968  *  @ret is a pointer to the backend to be initialized
969  *  @devname is the backend-name as supplied on the command line,
970  * 	e.g. -s 2:0,frontend-name,backend-name[,other-args]
971  *  @cb is the receive callback supplied by the frontend,
972  *	and it is invoked in the event loop when a receive
973  *	event is generated in the hypervisor,
974  *  @param is a pointer to the frontend, and normally used as
975  *	the argument for the callback.
976  */
977 int
978 netbe_init(struct net_backend **ret, nvlist_t *nvl, net_be_rxeof_t cb,
979     void *param)
980 {
981 	struct net_backend **pbe, *nbe, *tbe = NULL;
982 	const char *value, *type;
983 	char *devname;
984 	int err;
985 
986 	value = get_config_value_node(nvl, "backend");
987 	if (value == NULL) {
988 		return (-1);
989 	}
990 	devname = strdup(value);
991 
992 	/*
993 	 * Use the type given by configuration if exists; otherwise
994 	 * use the prefix of the backend as the type.
995 	 */
996 	type = get_config_value_node(nvl, "type");
997 	if (type == NULL)
998 		type = devname;
999 
1000 	/*
1001 	 * Find the network backend that matches the user-provided
1002 	 * device name. net_backend_set is built using a linker set.
1003 	 */
1004 	SET_FOREACH(pbe, net_backend_set) {
1005 		if (strncmp(type, (*pbe)->prefix,
1006 		    strlen((*pbe)->prefix)) == 0) {
1007 			tbe = *pbe;
1008 			assert(tbe->init != NULL);
1009 			assert(tbe->cleanup != NULL);
1010 			assert(tbe->send != NULL);
1011 			assert(tbe->recv != NULL);
1012 			assert(tbe->get_cap != NULL);
1013 			assert(tbe->set_cap != NULL);
1014 			break;
1015 		}
1016 	}
1017 
1018 	*ret = NULL;
1019 	if (tbe == NULL) {
1020 		free(devname);
1021 		return (EINVAL);
1022 	}
1023 
1024 	nbe = calloc(1, NET_BE_SIZE(tbe));
1025 	*nbe = *tbe;	/* copy the template */
1026 	nbe->fd = -1;
1027 	nbe->sc = param;
1028 	nbe->be_vnet_hdr_len = 0;
1029 	nbe->fe_vnet_hdr_len = 0;
1030 
1031 	/* Initialize the backend. */
1032 	err = nbe->init(nbe, devname, nvl, cb, param);
1033 	if (err) {
1034 		free(devname);
1035 		free(nbe);
1036 		return (err);
1037 	}
1038 
1039 	*ret = nbe;
1040 	free(devname);
1041 
1042 	return (0);
1043 }
1044 
1045 void
1046 netbe_cleanup(struct net_backend *be)
1047 {
1048 
1049 	if (be != NULL) {
1050 		be->cleanup(be);
1051 		free(be);
1052 	}
1053 }
1054 
1055 uint64_t
1056 netbe_get_cap(struct net_backend *be)
1057 {
1058 
1059 	assert(be != NULL);
1060 	return (be->get_cap(be));
1061 }
1062 
1063 int
1064 netbe_set_cap(struct net_backend *be, uint64_t features,
1065 	      unsigned vnet_hdr_len)
1066 {
1067 	int ret;
1068 
1069 	assert(be != NULL);
1070 
1071 	/* There are only three valid lengths, i.e., 0, 10 and 12. */
1072 	if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN
1073 		&& vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t)))
1074 		return (-1);
1075 
1076 	be->fe_vnet_hdr_len = vnet_hdr_len;
1077 
1078 	ret = be->set_cap(be, features, vnet_hdr_len);
1079 	assert(be->be_vnet_hdr_len == 0 ||
1080 	       be->be_vnet_hdr_len == be->fe_vnet_hdr_len);
1081 
1082 	return (ret);
1083 }
1084 
1085 ssize_t
1086 netbe_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
1087 {
1088 
1089 	return (be->send(be, iov, iovcnt));
1090 }
1091 
1092 ssize_t
1093 netbe_peek_recvlen(struct net_backend *be)
1094 {
1095 
1096 	return (be->peek_recvlen(be));
1097 }
1098 
1099 /*
1100  * Try to read a packet from the backend, without blocking.
1101  * If no packets are available, return 0. In case of success, return
1102  * the length of the packet just read. Return -1 in case of errors.
1103  */
1104 ssize_t
1105 netbe_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
1106 {
1107 
1108 	return (be->recv(be, iov, iovcnt));
1109 }
1110 
/*
 * Read a packet from the backend and discard it.
 * Returns the size of the discarded packet or zero if no packet was available.
 * A negative error code is returned in case of read error.
 */
ssize_t
netbe_rx_discard(struct net_backend *be)
{
	/*
	 * MP note: the dummybuf is only used to discard frames,
	 * so there is no need for it to be per-vtnet or locked.
	 * We only make it large enough for TSO-sized segment.
	 */
	static uint8_t dummybuf[65536 + 64];
	struct iovec iov = {
		.iov_base = dummybuf,
		.iov_len = sizeof(dummybuf),
	};

	return (netbe_recv(be, &iov, 1));
}
1132 
1133 void
1134 netbe_rx_disable(struct net_backend *be)
1135 {
1136 
1137 	return be->recv_disable(be);
1138 }
1139 
1140 void
1141 netbe_rx_enable(struct net_backend *be)
1142 {
1143 
1144 	return be->recv_enable(be);
1145 }
1146 
1147 size_t
1148 netbe_get_vnet_hdr_len(struct net_backend *be)
1149 {
1150 
1151 	return (be->be_vnet_hdr_len);
1152 }
1153