xref: /illumos-gate/usr/src/cmd/bhyve/net_backends.c (revision 915894ef19890baaed00080f85f6b69e225cda98)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2019 Vincenzo Maffione <vmaffione@FreeBSD.org>
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
19  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
20  * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
21  * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
24  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
25  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  *
27  * $FreeBSD$
28  */
29 
30 /*
31  * This file implements multiple network backends (tap, netmap, ...),
32  * to be used by network frontends such as virtio-net and e1000.
33  * The API to access the backend (e.g. send/receive packets, negotiate
34  * features) is exported by net_backends.h.
35  */
36 
37 #include <sys/cdefs.h>
38 __FBSDID("$FreeBSD$");
39 
40 #include <sys/types.h>		/* u_short etc */
41 #ifndef WITHOUT_CAPSICUM
42 #include <sys/capsicum.h>
43 #endif
44 #include <sys/ioctl.h>
45 #include <sys/mman.h>
46 #include <sys/uio.h>
47 
48 #include <net/if.h>
49 #include <net/netmap.h>
50 #include <net/netmap_virt.h>
51 #define NETMAP_WITH_LIBS
52 #include <net/netmap_user.h>
53 
54 #ifndef WITHOUT_CAPSICUM
55 #include <capsicum_helpers.h>
56 #endif
57 #include <err.h>
58 #include <errno.h>
59 #include <fcntl.h>
60 #include <stdio.h>
61 #include <stdlib.h>
62 #include <stdint.h>
63 #include <string.h>
64 #include <unistd.h>
65 #include <sysexits.h>
66 #include <assert.h>
67 #include <pthread.h>
68 #include <pthread_np.h>
69 #include <poll.h>
70 #include <assert.h>
71 
72 #ifdef NETGRAPH
73 #include <sys/param.h>
74 #include <sys/sysctl.h>
75 #include <netgraph.h>
76 #endif
77 
78 #include "debug.h"
79 #include "iov.h"
80 #include "mevent.h"
81 #include "net_backends.h"
82 
83 #include <sys/linker_set.h>
84 
85 /*
86  * Each network backend registers a set of function pointers that are
87  * used to implement the net backends API.
88  * This might need to be exposed if we implement backends in separate files.
89  */
90 struct net_backend {
91 	const char *prefix;	/* prefix matching this backend */
92 
93 	/*
94 	 * Routines used to initialize and cleanup the resources needed
95 	 * by a backend. The cleanup function is used internally,
96 	 * and should not be called by the frontend.
97 	 */
98 	int (*init)(struct net_backend *be, const char *devname,
99 	    const char *opts, net_be_rxeof_t cb, void *param);
100 	void (*cleanup)(struct net_backend *be);
101 
102 	/*
103 	 * Called to serve a guest transmit request. The scatter-gather
104 	 * vector provided by the caller has 'iovcnt' elements and contains
105 	 * the packet to send.
106 	 */
107 	ssize_t (*send)(struct net_backend *be, const struct iovec *iov,
108 	    int iovcnt);
109 
110 	/*
111 	 * Get the length of the next packet that can be received from
112 	 * the backend. If no packets are currently available, this
113 	 * function returns 0.
114 	 */
115 	ssize_t (*peek_recvlen)(struct net_backend *be);
116 
117 	/*
118 	 * Called to receive a packet from the backend. When the function
119 	 * returns a positive value 'len', the scatter-gather vector
120 	 * provided by the caller contains a packet with such length.
121 	 * The function returns 0 if the backend doesn't have a new packet to
122 	 * receive.
123 	 */
124 	ssize_t (*recv)(struct net_backend *be, const struct iovec *iov,
125 	    int iovcnt);
126 
127 	/*
128 	 * Ask the backend to enable or disable receive operation in the
129 	 * backend. On return from a disable operation, it is guaranteed
130 	 * that the receive callback won't be called until receive is
131 	 * enabled again. Note however that it is up to the caller to make
132 	 * sure that netbe_recv() is not currently being executed by another
133 	 * thread.
134 	 */
135 	void (*recv_enable)(struct net_backend *be);
136 	void (*recv_disable)(struct net_backend *be);
137 
138 	/*
139 	 * Ask the backend for the virtio-net features it is able to
140 	 * support. Possible features are TSO, UFO and checksum offloading
141 	 * in both rx and tx direction and for both IPv4 and IPv6.
142 	 */
143 	uint64_t (*get_cap)(struct net_backend *be);
144 
145 	/*
146 	 * Tell the backend to enable/disable the specified virtio-net
147 	 * features (capabilities).
148 	 */
149 	int (*set_cap)(struct net_backend *be, uint64_t features,
150 	    unsigned int vnet_hdr_len);
151 
152 	struct pci_vtnet_softc *sc;
153 	int fd;
154 
155 	/*
156 	 * Length of the virtio-net header used by the backend and the
157 	 * frontend, respectively. A zero value means that the header
158 	 * is not used.
159 	 */
160 	unsigned int be_vnet_hdr_len;
161 	unsigned int fe_vnet_hdr_len;
162 
163 	/* Size of backend-specific private data. */
164 	size_t priv_size;
165 
166 	/* Room for backend-specific data. */
167 	char opaque[0];
168 };
169 
170 SET_DECLARE(net_backend_set, struct net_backend);
171 
172 #define VNET_HDR_LEN	sizeof(struct virtio_net_rxhdr)
173 
174 #define WPRINTF(params) PRINTLN params
175 
176 /*
177  * The tap backend
178  */
179 
struct tap_priv {
	/* Read event registered for the backend descriptor. */
	struct mevent *mevp;
	/*
	 * Bounce buffer backing the peek_recvlen callback: a packet is
	 * read here to learn its length and handed out by the next
	 * receive. The same information may come from the kevent data
	 * in the future.
	 */
	char bbuf[1 << 16];
	/* Length of the packet held in bbuf; 0 when the buffer is empty. */
	ssize_t bbuflen;
};
190 
191 static void
192 tap_cleanup(struct net_backend *be)
193 {
194 	struct tap_priv *priv = (struct tap_priv *)be->opaque;
195 
196 	if (priv->mevp) {
197 		mevent_delete(priv->mevp);
198 	}
199 	if (be->fd != -1) {
200 		close(be->fd);
201 		be->fd = -1;
202 	}
203 }
204 
205 static int
206 tap_init(struct net_backend *be, const char *devname,
207 	 const char *opts, net_be_rxeof_t cb, void *param)
208 {
209 	struct tap_priv *priv = (struct tap_priv *)be->opaque;
210 	char tbuf[80];
211 	int opt = 1;
212 #ifndef WITHOUT_CAPSICUM
213 	cap_rights_t rights;
214 #endif
215 
216 	if (cb == NULL) {
217 		WPRINTF(("TAP backend requires non-NULL callback"));
218 		return (-1);
219 	}
220 
221 	strcpy(tbuf, "/dev/");
222 	strlcat(tbuf, devname, sizeof(tbuf));
223 
224 	be->fd = open(tbuf, O_RDWR);
225 	if (be->fd == -1) {
226 		WPRINTF(("open of tap device %s failed", tbuf));
227 		goto error;
228 	}
229 
230 	/*
231 	 * Set non-blocking and register for read
232 	 * notifications with the event loop
233 	 */
234 	if (ioctl(be->fd, FIONBIO, &opt) < 0) {
235 		WPRINTF(("tap device O_NONBLOCK failed"));
236 		goto error;
237 	}
238 
239 #ifndef WITHOUT_CAPSICUM
240 	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
241 	if (caph_rights_limit(be->fd, &rights) == -1)
242 		errx(EX_OSERR, "Unable to apply rights for sandbox");
243 #endif
244 
245 	memset(priv->bbuf, 0, sizeof(priv->bbuf));
246 	priv->bbuflen = 0;
247 
248 	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
249 	if (priv->mevp == NULL) {
250 		WPRINTF(("Could not register event"));
251 		goto error;
252 	}
253 
254 	return (0);
255 
256 error:
257 	tap_cleanup(be);
258 	return (-1);
259 }
260 
261 /*
262  * Called to send a buffer chain out to the tap device
263  */
264 static ssize_t
265 tap_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
266 {
267 	return (writev(be->fd, iov, iovcnt));
268 }
269 
270 static ssize_t
271 tap_peek_recvlen(struct net_backend *be)
272 {
273 	struct tap_priv *priv = (struct tap_priv *)be->opaque;
274 	ssize_t ret;
275 
276 	if (priv->bbuflen > 0) {
277 		/*
278 		 * We already have a packet in the bounce buffer.
279 		 * Just return its length.
280 		 */
281 		return priv->bbuflen;
282 	}
283 
284 	/*
285 	 * Read the next packet (if any) into the bounce buffer, so
286 	 * that we get to know its length and we can return that
287 	 * to the caller.
288 	 */
289 	ret = read(be->fd, priv->bbuf, sizeof(priv->bbuf));
290 	if (ret < 0 && errno == EWOULDBLOCK) {
291 		return (0);
292 	}
293 
294 	if (ret > 0)
295 		priv->bbuflen = ret;
296 
297 	return (ret);
298 }
299 
300 static ssize_t
301 tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
302 {
303 	struct tap_priv *priv = (struct tap_priv *)be->opaque;
304 	ssize_t ret;
305 
306 	if (priv->bbuflen > 0) {
307 		/*
308 		 * A packet is available in the bounce buffer, so
309 		 * we read it from there.
310 		 */
311 		ret = buf_to_iov(priv->bbuf, priv->bbuflen,
312 		    iov, iovcnt, 0);
313 
314 		/* Mark the bounce buffer as empty. */
315 		priv->bbuflen = 0;
316 
317 		return (ret);
318 	}
319 
320 	ret = readv(be->fd, iov, iovcnt);
321 	if (ret < 0 && errno == EWOULDBLOCK) {
322 		return (0);
323 	}
324 
325 	return (ret);
326 }
327 
328 static void
329 tap_recv_enable(struct net_backend *be)
330 {
331 	struct tap_priv *priv = (struct tap_priv *)be->opaque;
332 
333 	mevent_enable(priv->mevp);
334 }
335 
336 static void
337 tap_recv_disable(struct net_backend *be)
338 {
339 	struct tap_priv *priv = (struct tap_priv *)be->opaque;
340 
341 	mevent_disable(priv->mevp);
342 }
343 
/* The tap backend advertises no virtio-net capabilities for now. */
static uint64_t
tap_get_cap(struct net_backend *be)
{
	const uint64_t no_caps = 0;

	return (no_caps);
}
350 
/*
 * Accept only the empty configuration: returns 0 when no feature is
 * requested and the virtio-net header length is zero, -1 otherwise.
 */
static int
tap_set_cap(struct net_backend *be, uint64_t features,
		unsigned vnet_hdr_len)
{

	if (features != 0 || vnet_hdr_len != 0)
		return (-1);

	return (0);
}
358 
359 static struct net_backend tap_backend = {
360 	.prefix = "tap",
361 	.priv_size = sizeof(struct tap_priv),
362 	.init = tap_init,
363 	.cleanup = tap_cleanup,
364 	.send = tap_send,
365 	.peek_recvlen = tap_peek_recvlen,
366 	.recv = tap_recv,
367 	.recv_enable = tap_recv_enable,
368 	.recv_disable = tap_recv_disable,
369 	.get_cap = tap_get_cap,
370 	.set_cap = tap_set_cap,
371 };
372 
373 /* A clone of the tap backend, with a different prefix. */
374 static struct net_backend vmnet_backend = {
375 	.prefix = "vmnet",
376 	.priv_size = sizeof(struct tap_priv),
377 	.init = tap_init,
378 	.cleanup = tap_cleanup,
379 	.send = tap_send,
380 	.peek_recvlen = tap_peek_recvlen,
381 	.recv = tap_recv,
382 	.recv_enable = tap_recv_enable,
383 	.recv_disable = tap_recv_disable,
384 	.get_cap = tap_get_cap,
385 	.set_cap = tap_set_cap,
386 };
387 
388 DATA_SET(net_backend_set, tap_backend);
389 DATA_SET(net_backend_set, vmnet_backend);
390 
391 #ifdef NETGRAPH
392 
393 /*
394  * Netgraph backend
395  */
396 
397 #define NG_SBUF_MAX_SIZE (4 * 1024 * 1024)
398 
399 static int
400 ng_init(struct net_backend *be, const char *devname,
401 	 const char *opts, net_be_rxeof_t cb, void *param)
402 {
403 	struct tap_priv *p = (struct tap_priv *)be->opaque;
404 	struct ngm_connect ngc;
405 	char *ngopts, *tofree;
406 	char nodename[NG_NODESIZ];
407 	int sbsz;
408 	int ctrl_sock;
409 	int flags;
410 	int path_provided;
411 	int peerhook_provided;
412 	int socket_provided;
413 	unsigned long maxsbsz;
414 	size_t msbsz;
415 #ifndef WITHOUT_CAPSICUM
416 	cap_rights_t rights;
417 #endif
418 
419 	if (cb == NULL) {
420 		WPRINTF(("Netgraph backend requires non-NULL callback"));
421 		return (-1);
422 	}
423 
424 	be->fd = -1;
425 
426 	memset(&ngc, 0, sizeof(ngc));
427 
428 	strncpy(ngc.ourhook, "vmlink", NG_HOOKSIZ - 1);
429 
430 	tofree = ngopts = strdup(opts);
431 
432 	if (ngopts == NULL) {
433 		WPRINTF(("strdup error"));
434 		return (-1);
435 	}
436 
437 	socket_provided = 0;
438 	path_provided = 0;
439 	peerhook_provided = 0;
440 
441 	while (ngopts != NULL) {
442 		char *value = ngopts;
443 		char *key;
444 
445 		key = strsep(&value, "=");
446 		if (value == NULL)
447 			break;
448 		ngopts = value;
449 		(void) strsep(&ngopts, ",");
450 
451 		if (strcmp(key, "socket") == 0) {
452 			strncpy(nodename, value, NG_NODESIZ - 1);
453 			socket_provided = 1;
454 		} else if (strcmp(key, "path") == 0) {
455 			strncpy(ngc.path, value, NG_PATHSIZ - 1);
456 			path_provided = 1;
457 		} else if (strcmp(key, "hook") == 0) {
458 			strncpy(ngc.ourhook, value, NG_HOOKSIZ - 1);
459 		} else if (strcmp(key, "peerhook") == 0) {
460 			strncpy(ngc.peerhook, value, NG_HOOKSIZ - 1);
461 			peerhook_provided = 1;
462 		}
463 	}
464 
465 	free(tofree);
466 
467 	if (!path_provided) {
468 		WPRINTF(("path must be provided"));
469 		return (-1);
470 	}
471 
472 	if (!peerhook_provided) {
473 		WPRINTF(("peer hook must be provided"));
474 		return (-1);
475 	}
476 
477 	if (NgMkSockNode(socket_provided ? nodename : NULL,
478 		&ctrl_sock, &be->fd) < 0) {
479 		WPRINTF(("can't get Netgraph sockets"));
480 		return (-1);
481 	}
482 
483 	if (NgSendMsg(ctrl_sock, ".",
484 		NGM_GENERIC_COOKIE,
485 		NGM_CONNECT, &ngc, sizeof(ngc)) < 0) {
486 		WPRINTF(("can't connect to node"));
487 		close(ctrl_sock);
488 		goto error;
489 	}
490 
491 	close(ctrl_sock);
492 
493 	flags = fcntl(be->fd, F_GETFL);
494 
495 	if (flags < 0) {
496 		WPRINTF(("can't get socket flags"));
497 		goto error;
498 	}
499 
500 	if (fcntl(be->fd, F_SETFL, flags | O_NONBLOCK) < 0) {
501 		WPRINTF(("can't set O_NONBLOCK flag"));
502 		goto error;
503 	}
504 
505 	/*
506 	 * The default ng_socket(4) buffer's size is too low.
507 	 * Calculate the minimum value between NG_SBUF_MAX_SIZE
508 	 * and kern.ipc.maxsockbuf.
509 	 */
510 	msbsz = sizeof(maxsbsz);
511 	if (sysctlbyname("kern.ipc.maxsockbuf", &maxsbsz, &msbsz,
512 		NULL, 0) < 0) {
513 		WPRINTF(("can't get 'kern.ipc.maxsockbuf' value"));
514 		goto error;
515 	}
516 
517 	/*
518 	 * We can't set the socket buffer size to kern.ipc.maxsockbuf value,
519 	 * as it takes into account the mbuf(9) overhead.
520 	 */
521 	maxsbsz = maxsbsz * MCLBYTES / (MSIZE + MCLBYTES);
522 
523 	sbsz = MIN(NG_SBUF_MAX_SIZE, maxsbsz);
524 
525 	if (setsockopt(be->fd, SOL_SOCKET, SO_SNDBUF, &sbsz,
526 		sizeof(sbsz)) < 0) {
527 		WPRINTF(("can't set TX buffer size"));
528 		goto error;
529 	}
530 
531 	if (setsockopt(be->fd, SOL_SOCKET, SO_RCVBUF, &sbsz,
532 		sizeof(sbsz)) < 0) {
533 		WPRINTF(("can't set RX buffer size"));
534 		goto error;
535 	}
536 
537 #ifndef WITHOUT_CAPSICUM
538 	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
539 	if (caph_rights_limit(be->fd, &rights) == -1)
540 		errx(EX_OSERR, "Unable to apply rights for sandbox");
541 #endif
542 
543 	memset(p->bbuf, 0, sizeof(p->bbuf));
544 	p->bbuflen = 0;
545 
546 	p->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
547 	if (p->mevp == NULL) {
548 		WPRINTF(("Could not register event"));
549 		goto error;
550 	}
551 
552 	return (0);
553 
554 error:
555 	tap_cleanup(be);
556 	return (-1);
557 }
558 
559 static struct net_backend ng_backend = {
560 	.prefix = "netgraph",
561 	.priv_size = sizeof(struct tap_priv),
562 	.init = ng_init,
563 	.cleanup = tap_cleanup,
564 	.send = tap_send,
565 	.peek_recvlen = tap_peek_recvlen,
566 	.recv = tap_recv,
567 	.recv_enable = tap_recv_enable,
568 	.recv_disable = tap_recv_disable,
569 	.get_cap = tap_get_cap,
570 	.set_cap = tap_set_cap,
571 };
572 
573 DATA_SET(net_backend_set, ng_backend);
574 
575 #endif /* NETGRAPH */
576 
577 /*
578  * The netmap backend
579  */
580 
/*
 * Set of virtio-net features the netmap backend reports as supported:
 * checksum, TSO (IPv4/IPv6) and UFO, in host and guest flavours.
 */
#define NETMAP_FEATURES (						\
		VIRTIO_NET_F_CSUM |					\
		VIRTIO_NET_F_HOST_TSO4 |				\
		VIRTIO_NET_F_HOST_TSO6 |				\
		VIRTIO_NET_F_HOST_UFO |					\
		VIRTIO_NET_F_GUEST_CSUM |				\
		VIRTIO_NET_F_GUEST_TSO4 |				\
		VIRTIO_NET_F_GUEST_TSO6 |				\
		VIRTIO_NET_F_GUEST_UFO)
586 
587 struct netmap_priv {
588 	char ifname[IFNAMSIZ];
589 	struct nm_desc *nmd;
590 	uint16_t memid;
591 	struct netmap_ring *rx;
592 	struct netmap_ring *tx;
593 	struct mevent *mevp;
594 	net_be_rxeof_t cb;
595 	void *cb_param;
596 };
597 
598 static void
599 nmreq_init(struct nmreq *req, char *ifname)
600 {
601 
602 	memset(req, 0, sizeof(*req));
603 	strlcpy(req->nr_name, ifname, sizeof(req->nr_name));
604 	req->nr_version = NETMAP_API;
605 }
606 
607 static int
608 netmap_set_vnet_hdr_len(struct net_backend *be, int vnet_hdr_len)
609 {
610 	int err;
611 	struct nmreq req;
612 	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
613 
614 	nmreq_init(&req, priv->ifname);
615 	req.nr_cmd = NETMAP_BDG_VNET_HDR;
616 	req.nr_arg1 = vnet_hdr_len;
617 	err = ioctl(be->fd, NIOCREGIF, &req);
618 	if (err) {
619 		WPRINTF(("Unable to set vnet header length %d",
620 				vnet_hdr_len));
621 		return (err);
622 	}
623 
624 	be->be_vnet_hdr_len = vnet_hdr_len;
625 
626 	return (0);
627 }
628 
629 static int
630 netmap_has_vnet_hdr_len(struct net_backend *be, unsigned vnet_hdr_len)
631 {
632 	int prev_hdr_len = be->be_vnet_hdr_len;
633 	int ret;
634 
635 	if (vnet_hdr_len == prev_hdr_len) {
636 		return (1);
637 	}
638 
639 	ret = netmap_set_vnet_hdr_len(be, vnet_hdr_len);
640 	if (ret) {
641 		return (0);
642 	}
643 
644 	netmap_set_vnet_hdr_len(be, prev_hdr_len);
645 
646 	return (1);
647 }
648 
649 static uint64_t
650 netmap_get_cap(struct net_backend *be)
651 {
652 
653 	return (netmap_has_vnet_hdr_len(be, VNET_HDR_LEN) ?
654 	    NETMAP_FEATURES : 0);
655 }
656 
/*
 * Apply the negotiated configuration. Only the virtio-net header length
 * is passed on to netmap; 'features' is not examined here.
 */
static int
netmap_set_cap(struct net_backend *be, uint64_t features,
	       unsigned vnet_hdr_len)
{
	int rc;

	rc = netmap_set_vnet_hdr_len(be, vnet_hdr_len);

	return (rc);
}
664 
665 static int
666 netmap_init(struct net_backend *be, const char *devname,
667 	    const char *opts, net_be_rxeof_t cb, void *param)
668 {
669 	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
670 
671 	strlcpy(priv->ifname, devname, sizeof(priv->ifname));
672 	priv->ifname[sizeof(priv->ifname) - 1] = '\0';
673 
674 	priv->nmd = nm_open(priv->ifname, NULL, NETMAP_NO_TX_POLL, NULL);
675 	if (priv->nmd == NULL) {
676 		WPRINTF(("Unable to nm_open(): interface '%s', errno (%s)",
677 			devname, strerror(errno)));
678 		free(priv);
679 		return (-1);
680 	}
681 
682 	priv->memid = priv->nmd->req.nr_arg2;
683 	priv->tx = NETMAP_TXRING(priv->nmd->nifp, 0);
684 	priv->rx = NETMAP_RXRING(priv->nmd->nifp, 0);
685 	priv->cb = cb;
686 	priv->cb_param = param;
687 	be->fd = priv->nmd->fd;
688 
689 	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
690 	if (priv->mevp == NULL) {
691 		WPRINTF(("Could not register event"));
692 		return (-1);
693 	}
694 
695 	return (0);
696 }
697 
698 static void
699 netmap_cleanup(struct net_backend *be)
700 {
701 	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
702 
703 	if (priv->mevp) {
704 		mevent_delete(priv->mevp);
705 	}
706 	if (priv->nmd) {
707 		nm_close(priv->nmd);
708 	}
709 	be->fd = -1;
710 }
711 
712 static ssize_t
713 netmap_send(struct net_backend *be, const struct iovec *iov,
714 	    int iovcnt)
715 {
716 	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
717 	struct netmap_ring *ring;
718 	ssize_t totlen = 0;
719 	int nm_buf_size;
720 	int nm_buf_len;
721 	uint32_t head;
722 	void *nm_buf;
723 	int j;
724 
725 	ring = priv->tx;
726 	head = ring->head;
727 	if (head == ring->tail) {
728 		WPRINTF(("No space, drop %zu bytes", count_iov(iov, iovcnt)));
729 		goto txsync;
730 	}
731 	nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
732 	nm_buf_size = ring->nr_buf_size;
733 	nm_buf_len = 0;
734 
735 	for (j = 0; j < iovcnt; j++) {
736 		int iov_frag_size = iov[j].iov_len;
737 		void *iov_frag_buf = iov[j].iov_base;
738 
739 		totlen += iov_frag_size;
740 
741 		/*
742 		 * Split each iovec fragment over more netmap slots, if
743 		 * necessary.
744 		 */
745 		for (;;) {
746 			int copylen;
747 
748 			copylen = iov_frag_size < nm_buf_size ? iov_frag_size : nm_buf_size;
749 			memcpy(nm_buf, iov_frag_buf, copylen);
750 
751 			iov_frag_buf += copylen;
752 			iov_frag_size -= copylen;
753 			nm_buf += copylen;
754 			nm_buf_size -= copylen;
755 			nm_buf_len += copylen;
756 
757 			if (iov_frag_size == 0) {
758 				break;
759 			}
760 
761 			ring->slot[head].len = nm_buf_len;
762 			ring->slot[head].flags = NS_MOREFRAG;
763 			head = nm_ring_next(ring, head);
764 			if (head == ring->tail) {
765 				/*
766 				 * We ran out of netmap slots while
767 				 * splitting the iovec fragments.
768 				 */
769 				WPRINTF(("No space, drop %zu bytes",
770 				   count_iov(iov, iovcnt)));
771 				goto txsync;
772 			}
773 			nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
774 			nm_buf_size = ring->nr_buf_size;
775 			nm_buf_len = 0;
776 		}
777 	}
778 
779 	/* Complete the last slot, which must not have NS_MOREFRAG set. */
780 	ring->slot[head].len = nm_buf_len;
781 	ring->slot[head].flags = 0;
782 	head = nm_ring_next(ring, head);
783 
784 	/* Now update ring->head and ring->cur. */
785 	ring->head = ring->cur = head;
786 txsync:
787 	ioctl(be->fd, NIOCTXSYNC, NULL);
788 
789 	return (totlen);
790 }
791 
792 static ssize_t
793 netmap_peek_recvlen(struct net_backend *be)
794 {
795 	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
796 	struct netmap_ring *ring = priv->rx;
797 	uint32_t head = ring->head;
798 	ssize_t totlen = 0;
799 
800 	while (head != ring->tail) {
801 		struct netmap_slot *slot = ring->slot + head;
802 
803 		totlen += slot->len;
804 		if ((slot->flags & NS_MOREFRAG) == 0)
805 			break;
806 		head = nm_ring_next(ring, head);
807 	}
808 
809 	return (totlen);
810 }
811 
812 static ssize_t
813 netmap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
814 {
815 	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
816 	struct netmap_slot *slot = NULL;
817 	struct netmap_ring *ring;
818 	void *iov_frag_buf;
819 	int iov_frag_size;
820 	ssize_t totlen = 0;
821 	uint32_t head;
822 
823 	assert(iovcnt);
824 
825 	ring = priv->rx;
826 	head = ring->head;
827 	iov_frag_buf = iov->iov_base;
828 	iov_frag_size = iov->iov_len;
829 
830 	do {
831 		int nm_buf_len;
832 		void *nm_buf;
833 
834 		if (head == ring->tail) {
835 			return (0);
836 		}
837 
838 		slot = ring->slot + head;
839 		nm_buf = NETMAP_BUF(ring, slot->buf_idx);
840 		nm_buf_len = slot->len;
841 
842 		for (;;) {
843 			int copylen = nm_buf_len < iov_frag_size ?
844 			    nm_buf_len : iov_frag_size;
845 
846 			memcpy(iov_frag_buf, nm_buf, copylen);
847 			nm_buf += copylen;
848 			nm_buf_len -= copylen;
849 			iov_frag_buf += copylen;
850 			iov_frag_size -= copylen;
851 			totlen += copylen;
852 
853 			if (nm_buf_len == 0) {
854 				break;
855 			}
856 
857 			iov++;
858 			iovcnt--;
859 			if (iovcnt == 0) {
860 				/* No space to receive. */
861 				WPRINTF(("Short iov, drop %zd bytes",
862 				    totlen));
863 				return (-ENOSPC);
864 			}
865 			iov_frag_buf = iov->iov_base;
866 			iov_frag_size = iov->iov_len;
867 		}
868 
869 		head = nm_ring_next(ring, head);
870 
871 	} while (slot->flags & NS_MOREFRAG);
872 
873 	/* Release slots to netmap. */
874 	ring->head = ring->cur = head;
875 
876 	return (totlen);
877 }
878 
879 static void
880 netmap_recv_enable(struct net_backend *be)
881 {
882 	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
883 
884 	mevent_enable(priv->mevp);
885 }
886 
887 static void
888 netmap_recv_disable(struct net_backend *be)
889 {
890 	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
891 
892 	mevent_disable(priv->mevp);
893 }
894 
895 static struct net_backend netmap_backend = {
896 	.prefix = "netmap",
897 	.priv_size = sizeof(struct netmap_priv),
898 	.init = netmap_init,
899 	.cleanup = netmap_cleanup,
900 	.send = netmap_send,
901 	.peek_recvlen = netmap_peek_recvlen,
902 	.recv = netmap_recv,
903 	.recv_enable = netmap_recv_enable,
904 	.recv_disable = netmap_recv_disable,
905 	.get_cap = netmap_get_cap,
906 	.set_cap = netmap_set_cap,
907 };
908 
909 /* A clone of the netmap backend, with a different prefix. */
910 static struct net_backend vale_backend = {
911 	.prefix = "vale",
912 	.priv_size = sizeof(struct netmap_priv),
913 	.init = netmap_init,
914 	.cleanup = netmap_cleanup,
915 	.send = netmap_send,
916 	.peek_recvlen = netmap_peek_recvlen,
917 	.recv = netmap_recv,
918 	.recv_enable = netmap_recv_enable,
919 	.recv_disable = netmap_recv_disable,
920 	.get_cap = netmap_get_cap,
921 	.set_cap = netmap_set_cap,
922 };
923 
924 DATA_SET(net_backend_set, netmap_backend);
925 DATA_SET(net_backend_set, vale_backend);
926 
927 /*
928  * Initialize a backend and attach to the frontend.
929  * This is called during frontend initialization.
930  *  @pbe is a pointer to the backend to be initialized
931  *  @devname is the backend-name as supplied on the command line,
932  * 	e.g. -s 2:0,frontend-name,backend-name[,other-args]
933  *  @cb is the receive callback supplied by the frontend,
934  *	and it is invoked in the event loop when a receive
935  *	event is generated in the hypervisor,
936  *  @param is a pointer to the frontend, and normally used as
937  *	the argument for the callback.
938  */
939 int
940 netbe_init(struct net_backend **ret, const char *opts, net_be_rxeof_t cb,
941     void *param)
942 {
943 	struct net_backend **pbe, *nbe, *tbe = NULL;
944 	char *devname;
945 	char *options;
946 	int err;
947 
948 	devname = options = strdup(opts);
949 
950 	if (devname == NULL) {
951 		return (-1);
952 	}
953 
954 	devname = strsep(&options, ",");
955 
956 	/*
957 	 * Find the network backend that matches the user-provided
958 	 * device name. net_backend_set is built using a linker set.
959 	 */
960 	SET_FOREACH(pbe, net_backend_set) {
961 		if (strncmp(devname, (*pbe)->prefix,
962 		    strlen((*pbe)->prefix)) == 0) {
963 			tbe = *pbe;
964 			assert(tbe->init != NULL);
965 			assert(tbe->cleanup != NULL);
966 			assert(tbe->send != NULL);
967 			assert(tbe->recv != NULL);
968 			assert(tbe->get_cap != NULL);
969 			assert(tbe->set_cap != NULL);
970 			break;
971 		}
972 	}
973 
974 	*ret = NULL;
975 	if (tbe == NULL) {
976 		free(devname);
977 		return (EINVAL);
978 	}
979 
980 	nbe = calloc(1, sizeof(*nbe) + tbe->priv_size);
981 	*nbe = *tbe;	/* copy the template */
982 	nbe->fd = -1;
983 	nbe->sc = param;
984 	nbe->be_vnet_hdr_len = 0;
985 	nbe->fe_vnet_hdr_len = 0;
986 
987 	/* Initialize the backend. */
988 	err = nbe->init(nbe, devname, options, cb, param);
989 	if (err) {
990 		free(devname);
991 		free(nbe);
992 		return (err);
993 	}
994 
995 	*ret = nbe;
996 	free(devname);
997 
998 	return (0);
999 }
1000 
1001 void
1002 netbe_cleanup(struct net_backend *be)
1003 {
1004 
1005 	if (be != NULL) {
1006 		be->cleanup(be);
1007 		free(be);
1008 	}
1009 }
1010 
1011 uint64_t
1012 netbe_get_cap(struct net_backend *be)
1013 {
1014 
1015 	assert(be != NULL);
1016 	return (be->get_cap(be));
1017 }
1018 
1019 int
1020 netbe_set_cap(struct net_backend *be, uint64_t features,
1021 	      unsigned vnet_hdr_len)
1022 {
1023 	int ret;
1024 
1025 	assert(be != NULL);
1026 
1027 	/* There are only three valid lengths, i.e., 0, 10 and 12. */
1028 	if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN
1029 		&& vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t)))
1030 		return (-1);
1031 
1032 	be->fe_vnet_hdr_len = vnet_hdr_len;
1033 
1034 	ret = be->set_cap(be, features, vnet_hdr_len);
1035 	assert(be->be_vnet_hdr_len == 0 ||
1036 	       be->be_vnet_hdr_len == be->fe_vnet_hdr_len);
1037 
1038 	return (ret);
1039 }
1040 
1041 ssize_t
1042 netbe_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
1043 {
1044 
1045 	return (be->send(be, iov, iovcnt));
1046 }
1047 
1048 ssize_t
1049 netbe_peek_recvlen(struct net_backend *be)
1050 {
1051 
1052 	return (be->peek_recvlen(be));
1053 }
1054 
1055 /*
1056  * Try to read a packet from the backend, without blocking.
1057  * If no packets are available, return 0. In case of success, return
1058  * the length of the packet just read. Return -1 in case of errors.
1059  */
1060 ssize_t
1061 netbe_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
1062 {
1063 
1064 	return (be->recv(be, iov, iovcnt));
1065 }
1066 
/*
 * Receive a packet from the backend and throw it away.
 * Returns the length of the dropped packet, zero if there was nothing to
 * receive, or a negative error code if the read failed.
 */
ssize_t
netbe_rx_discard(struct net_backend *be)
{
	/*
	 * MP note: the scratch area only ever receives frames that are
	 * being discarded, so it needs neither to be per-vtnet nor to be
	 * locked. It is sized for a TSO-sized segment and nothing larger.
	 */
	static uint8_t dummybuf[65536 + 64];
	struct iovec sink = {
		.iov_base = dummybuf,
		.iov_len = sizeof(dummybuf),
	};

	return (netbe_recv(be, &sink, 1));
}
1088 
1089 void
1090 netbe_rx_disable(struct net_backend *be)
1091 {
1092 
1093 	return be->recv_disable(be);
1094 }
1095 
1096 void
1097 netbe_rx_enable(struct net_backend *be)
1098 {
1099 
1100 	return be->recv_enable(be);
1101 }
1102 
1103 size_t
1104 netbe_get_vnet_hdr_len(struct net_backend *be)
1105 {
1106 
1107 	return (be->be_vnet_hdr_len);
1108 }
1109