xref: /freebsd/usr.sbin/bhyve/net_backends.c (revision 608da65de9552d5678c1000776ed69da04a45983)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2019 Vincenzo Maffione <vmaffione@FreeBSD.org>
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
19  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
20  * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
21  * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
24  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
25  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 /*
29  * This file implements multiple network backends (tap, netmap, ...),
30  * to be used by network frontends such as virtio-net and e1000.
31  * The API to access the backend (e.g. send/receive packets, negotiate
32  * features) is exported by net_backends.h.
33  */
34 
35 #include <sys/cdefs.h>
36 #include <sys/types.h>		/* u_short etc */
37 #ifndef WITHOUT_CAPSICUM
38 #include <sys/capsicum.h>
39 #endif
40 #include <sys/ioctl.h>
41 #include <sys/mman.h>
42 #include <sys/uio.h>
43 
44 #include <net/if.h>
45 #include <net/if_tap.h>
46 #include <net/netmap.h>
47 #include <net/netmap_virt.h>
48 #define NETMAP_WITH_LIBS
49 #include <net/netmap_user.h>
50 
51 #ifndef WITHOUT_CAPSICUM
52 #include <capsicum_helpers.h>
53 #endif
54 #include <err.h>
55 #include <errno.h>
56 #include <fcntl.h>
57 #include <stdio.h>
58 #include <stdlib.h>
59 #include <stdint.h>
60 #include <string.h>
61 #include <unistd.h>
62 #include <sysexits.h>
63 #include <assert.h>
64 #include <pthread.h>
65 #include <pthread_np.h>
66 #include <poll.h>
67 #include <assert.h>
68 
69 #ifdef NETGRAPH
70 #include <sys/param.h>
71 #include <sys/sysctl.h>
72 #include <netgraph.h>
73 #endif
74 
75 #include "config.h"
76 #include "debug.h"
77 #include "iov.h"
78 #include "mevent.h"
79 #include "net_backends.h"
80 #include "pci_emul.h"
81 
82 #include <sys/linker_set.h>
83 
84 /*
85  * Each network backend registers a set of function pointers that are
86  * used to implement the net backends API.
87  * This might need to be exposed if we implement backends in separate files.
88  */
89 struct net_backend {
90 	const char *prefix;	/* prefix matching this backend */
91 
92 	/*
93 	 * Routines used to initialize and cleanup the resources needed
94 	 * by a backend. The cleanup function is used internally,
95 	 * and should not be called by the frontend.
96 	 */
97 	int (*init)(struct net_backend *be, const char *devname,
98 	    nvlist_t *nvl, net_be_rxeof_t cb, void *param);
99 	void (*cleanup)(struct net_backend *be);
100 
101 	/*
102 	 * Called to serve a guest transmit request. The scatter-gather
103 	 * vector provided by the caller has 'iovcnt' elements and contains
104 	 * the packet to send.
105 	 */
106 	ssize_t (*send)(struct net_backend *be, const struct iovec *iov,
107 	    int iovcnt);
108 
109 	/*
110 	 * Get the length of the next packet that can be received from
111 	 * the backend. If no packets are currently available, this
112 	 * function returns 0.
113 	 */
114 	ssize_t (*peek_recvlen)(struct net_backend *be);
115 
116 	/*
117 	 * Called to receive a packet from the backend. When the function
118 	 * returns a positive value 'len', the scatter-gather vector
119 	 * provided by the caller contains a packet with such length.
120 	 * The function returns 0 if the backend doesn't have a new packet to
121 	 * receive.
122 	 */
123 	ssize_t (*recv)(struct net_backend *be, const struct iovec *iov,
124 	    int iovcnt);
125 
126 	/*
127 	 * Ask the backend to enable or disable receive operation in the
128 	 * backend. On return from a disable operation, it is guaranteed
129 	 * that the receive callback won't be called until receive is
130 	 * enabled again. Note however that it is up to the caller to make
131 	 * sure that netbe_recv() is not currently being executed by another
132 	 * thread.
133 	 */
134 	void (*recv_enable)(struct net_backend *be);
135 	void (*recv_disable)(struct net_backend *be);
136 
137 	/*
138 	 * Ask the backend for the virtio-net features it is able to
139 	 * support. Possible features are TSO, UFO and checksum offloading
140 	 * in both rx and tx direction and for both IPv4 and IPv6.
141 	 */
142 	uint64_t (*get_cap)(struct net_backend *be);
143 
144 	/*
145 	 * Tell the backend to enable/disable the specified virtio-net
146 	 * features (capabilities).
147 	 */
148 	int (*set_cap)(struct net_backend *be, uint64_t features,
149 	    unsigned int vnet_hdr_len);
150 
151 	struct pci_vtnet_softc *sc;
152 	int fd;
153 
154 	/*
155 	 * Length of the virtio-net header used by the backend and the
156 	 * frontend, respectively. A zero value means that the header
157 	 * is not used.
158 	 */
159 	unsigned int be_vnet_hdr_len;
160 	unsigned int fe_vnet_hdr_len;
161 
162 	/* Size of backend-specific private data. */
163 	size_t priv_size;
164 
165 	/* Backend-specific private data follows. */
166 };
167 
/*
 * NET_BE_PRIV: address of the private data stored right after the
 * backend descriptor.
 * NET_BE_SIZE: total allocation size (descriptor plus private data).
 * The macro argument is fully parenthesized so that any pointer
 * expression (e.g. 'array + 1') can be passed safely.
 */
#define	NET_BE_PRIV(be)		((void *)((be) + 1))
#define	NET_BE_SIZE(be)		(sizeof(*(be)) + (be)->priv_size)
170 
171 SET_DECLARE(net_backend_set, struct net_backend);
172 
173 #define VNET_HDR_LEN	sizeof(struct virtio_net_rxhdr)
174 
175 #define WPRINTF(params) PRINTLN params
176 
177 /*
178  * The tap backend
179  */
180 
/* Private state of the tap-style backends. */
struct tap_priv {
	/* Read event registered with the event loop. */
	struct mevent *mevp;
	/*
	 * Bounce buffer backing the peek_recvlen callback: a packet is
	 * read here first so that its length becomes known. The kevent
	 * data may provide the same information in the future.
	 */
	char bbuf[1 << 16];
	/* Length of the packet held in bbuf; not positive when empty. */
	ssize_t bbuflen;
};
191 
192 static void
193 tap_cleanup(struct net_backend *be)
194 {
195 	struct tap_priv *priv = NET_BE_PRIV(be);
196 
197 	if (priv->mevp) {
198 		mevent_delete(priv->mevp);
199 	}
200 	if (be->fd != -1) {
201 		close(be->fd);
202 		be->fd = -1;
203 	}
204 }
205 
206 static int
207 tap_init(struct net_backend *be, const char *devname,
208     nvlist_t *nvl __unused, net_be_rxeof_t cb, void *param)
209 {
210 	struct tap_priv *priv = NET_BE_PRIV(be);
211 	char tbuf[80];
212 	int opt = 1, up = IFF_UP;
213 
214 #ifndef WITHOUT_CAPSICUM
215 	cap_rights_t rights;
216 #endif
217 
218 	if (cb == NULL) {
219 		WPRINTF(("TAP backend requires non-NULL callback"));
220 		return (-1);
221 	}
222 
223 	strcpy(tbuf, "/dev/");
224 	strlcat(tbuf, devname, sizeof(tbuf));
225 
226 	be->fd = open(tbuf, O_RDWR);
227 	if (be->fd == -1) {
228 		WPRINTF(("open of tap device %s failed", tbuf));
229 		goto error;
230 	}
231 
232 	/*
233 	 * Set non-blocking and register for read
234 	 * notifications with the event loop
235 	 */
236 	if (ioctl(be->fd, FIONBIO, &opt) < 0) {
237 		WPRINTF(("tap device O_NONBLOCK failed"));
238 		goto error;
239 	}
240 
241 	if (ioctl(be->fd, VMIO_SIOCSIFFLAGS, &up)) {
242 		WPRINTF(("tap device link up failed"));
243 		goto error;
244 	}
245 
246 #ifndef WITHOUT_CAPSICUM
247 	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
248 	if (caph_rights_limit(be->fd, &rights) == -1)
249 		errx(EX_OSERR, "Unable to apply rights for sandbox");
250 #endif
251 
252 	memset(priv->bbuf, 0, sizeof(priv->bbuf));
253 	priv->bbuflen = 0;
254 
255 	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
256 	if (priv->mevp == NULL) {
257 		WPRINTF(("Could not register event"));
258 		goto error;
259 	}
260 
261 	return (0);
262 
263 error:
264 	tap_cleanup(be);
265 	return (-1);
266 }
267 
268 /*
269  * Called to send a buffer chain out to the tap device
270  */
271 static ssize_t
272 tap_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
273 {
274 	return (writev(be->fd, iov, iovcnt));
275 }
276 
277 static ssize_t
278 tap_peek_recvlen(struct net_backend *be)
279 {
280 	struct tap_priv *priv = NET_BE_PRIV(be);
281 	ssize_t ret;
282 
283 	if (priv->bbuflen > 0) {
284 		/*
285 		 * We already have a packet in the bounce buffer.
286 		 * Just return its length.
287 		 */
288 		return priv->bbuflen;
289 	}
290 
291 	/*
292 	 * Read the next packet (if any) into the bounce buffer, so
293 	 * that we get to know its length and we can return that
294 	 * to the caller.
295 	 */
296 	ret = read(be->fd, priv->bbuf, sizeof(priv->bbuf));
297 	if (ret < 0 && errno == EWOULDBLOCK) {
298 		return (0);
299 	}
300 
301 	if (ret > 0)
302 		priv->bbuflen = ret;
303 
304 	return (ret);
305 }
306 
307 static ssize_t
308 tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
309 {
310 	struct tap_priv *priv = NET_BE_PRIV(be);
311 	ssize_t ret;
312 
313 	if (priv->bbuflen > 0) {
314 		/*
315 		 * A packet is available in the bounce buffer, so
316 		 * we read it from there.
317 		 */
318 		ret = buf_to_iov(priv->bbuf, priv->bbuflen,
319 		    iov, iovcnt, 0);
320 
321 		/* Mark the bounce buffer as empty. */
322 		priv->bbuflen = 0;
323 
324 		return (ret);
325 	}
326 
327 	ret = readv(be->fd, iov, iovcnt);
328 	if (ret < 0 && errno == EWOULDBLOCK) {
329 		return (0);
330 	}
331 
332 	return (ret);
333 }
334 
335 static void
336 tap_recv_enable(struct net_backend *be)
337 {
338 	struct tap_priv *priv = NET_BE_PRIV(be);
339 
340 	mevent_enable(priv->mevp);
341 }
342 
343 static void
344 tap_recv_disable(struct net_backend *be)
345 {
346 	struct tap_priv *priv = NET_BE_PRIV(be);
347 
348 	mevent_disable(priv->mevp);
349 }
350 
351 static uint64_t
352 tap_get_cap(struct net_backend *be __unused)
353 {
354 
355 	return (0); /* no capabilities for now */
356 }
357 
358 static int
359 tap_set_cap(struct net_backend *be __unused, uint64_t features,
360     unsigned vnet_hdr_len)
361 {
362 
363 	return ((features || vnet_hdr_len) ? -1 : 0);
364 }
365 
366 static struct net_backend tap_backend = {
367 	.prefix = "tap",
368 	.priv_size = sizeof(struct tap_priv),
369 	.init = tap_init,
370 	.cleanup = tap_cleanup,
371 	.send = tap_send,
372 	.peek_recvlen = tap_peek_recvlen,
373 	.recv = tap_recv,
374 	.recv_enable = tap_recv_enable,
375 	.recv_disable = tap_recv_disable,
376 	.get_cap = tap_get_cap,
377 	.set_cap = tap_set_cap,
378 };
379 
380 /* A clone of the tap backend, with a different prefix. */
381 static struct net_backend vmnet_backend = {
382 	.prefix = "vmnet",
383 	.priv_size = sizeof(struct tap_priv),
384 	.init = tap_init,
385 	.cleanup = tap_cleanup,
386 	.send = tap_send,
387 	.peek_recvlen = tap_peek_recvlen,
388 	.recv = tap_recv,
389 	.recv_enable = tap_recv_enable,
390 	.recv_disable = tap_recv_disable,
391 	.get_cap = tap_get_cap,
392 	.set_cap = tap_set_cap,
393 };
394 
395 DATA_SET(net_backend_set, tap_backend);
396 DATA_SET(net_backend_set, vmnet_backend);
397 
398 #ifdef NETGRAPH
399 
400 /*
401  * Netgraph backend
402  */
403 
404 #define NG_SBUF_MAX_SIZE (4 * 1024 * 1024)
405 
406 static int
407 ng_init(struct net_backend *be, const char *devname __unused,
408 	 nvlist_t *nvl, net_be_rxeof_t cb, void *param)
409 {
410 	struct tap_priv *p = NET_BE_PRIV(be);
411 	struct ngm_connect ngc;
412 	const char *value, *nodename;
413 	int sbsz;
414 	int ctrl_sock;
415 	int flags;
416 	unsigned long maxsbsz;
417 	size_t msbsz;
418 #ifndef WITHOUT_CAPSICUM
419 	cap_rights_t rights;
420 #endif
421 
422 	if (cb == NULL) {
423 		WPRINTF(("Netgraph backend requires non-NULL callback"));
424 		return (-1);
425 	}
426 
427 	be->fd = -1;
428 
429 	memset(&ngc, 0, sizeof(ngc));
430 
431 	value = get_config_value_node(nvl, "path");
432 	if (value == NULL) {
433 		WPRINTF(("path must be provided"));
434 		return (-1);
435 	}
436 	strncpy(ngc.path, value, NG_PATHSIZ - 1);
437 
438 	value = get_config_value_node(nvl, "hook");
439 	if (value == NULL)
440 		value = "vmlink";
441 	strncpy(ngc.ourhook, value, NG_HOOKSIZ - 1);
442 
443 	value = get_config_value_node(nvl, "peerhook");
444 	if (value == NULL) {
445 		WPRINTF(("peer hook must be provided"));
446 		return (-1);
447 	}
448 	strncpy(ngc.peerhook, value, NG_HOOKSIZ - 1);
449 
450 	nodename = get_config_value_node(nvl, "socket");
451 	if (NgMkSockNode(nodename,
452 		&ctrl_sock, &be->fd) < 0) {
453 		WPRINTF(("can't get Netgraph sockets"));
454 		return (-1);
455 	}
456 
457 	if (NgSendMsg(ctrl_sock, ".",
458 		NGM_GENERIC_COOKIE,
459 		NGM_CONNECT, &ngc, sizeof(ngc)) < 0) {
460 		WPRINTF(("can't connect to node"));
461 		close(ctrl_sock);
462 		goto error;
463 	}
464 
465 	close(ctrl_sock);
466 
467 	flags = fcntl(be->fd, F_GETFL);
468 
469 	if (flags < 0) {
470 		WPRINTF(("can't get socket flags"));
471 		goto error;
472 	}
473 
474 	if (fcntl(be->fd, F_SETFL, flags | O_NONBLOCK) < 0) {
475 		WPRINTF(("can't set O_NONBLOCK flag"));
476 		goto error;
477 	}
478 
479 	/*
480 	 * The default ng_socket(4) buffer's size is too low.
481 	 * Calculate the minimum value between NG_SBUF_MAX_SIZE
482 	 * and kern.ipc.maxsockbuf.
483 	 */
484 	msbsz = sizeof(maxsbsz);
485 	if (sysctlbyname("kern.ipc.maxsockbuf", &maxsbsz, &msbsz,
486 		NULL, 0) < 0) {
487 		WPRINTF(("can't get 'kern.ipc.maxsockbuf' value"));
488 		goto error;
489 	}
490 
491 	/*
492 	 * We can't set the socket buffer size to kern.ipc.maxsockbuf value,
493 	 * as it takes into account the mbuf(9) overhead.
494 	 */
495 	maxsbsz = maxsbsz * MCLBYTES / (MSIZE + MCLBYTES);
496 
497 	sbsz = MIN(NG_SBUF_MAX_SIZE, maxsbsz);
498 
499 	if (setsockopt(be->fd, SOL_SOCKET, SO_SNDBUF, &sbsz,
500 		sizeof(sbsz)) < 0) {
501 		WPRINTF(("can't set TX buffer size"));
502 		goto error;
503 	}
504 
505 	if (setsockopt(be->fd, SOL_SOCKET, SO_RCVBUF, &sbsz,
506 		sizeof(sbsz)) < 0) {
507 		WPRINTF(("can't set RX buffer size"));
508 		goto error;
509 	}
510 
511 #ifndef WITHOUT_CAPSICUM
512 	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
513 	if (caph_rights_limit(be->fd, &rights) == -1)
514 		errx(EX_OSERR, "Unable to apply rights for sandbox");
515 #endif
516 
517 	memset(p->bbuf, 0, sizeof(p->bbuf));
518 	p->bbuflen = 0;
519 
520 	p->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
521 	if (p->mevp == NULL) {
522 		WPRINTF(("Could not register event"));
523 		goto error;
524 	}
525 
526 	return (0);
527 
528 error:
529 	tap_cleanup(be);
530 	return (-1);
531 }
532 
533 static struct net_backend ng_backend = {
534 	.prefix = "netgraph",
535 	.priv_size = sizeof(struct tap_priv),
536 	.init = ng_init,
537 	.cleanup = tap_cleanup,
538 	.send = tap_send,
539 	.peek_recvlen = tap_peek_recvlen,
540 	.recv = tap_recv,
541 	.recv_enable = tap_recv_enable,
542 	.recv_disable = tap_recv_disable,
543 	.get_cap = tap_get_cap,
544 	.set_cap = tap_set_cap,
545 };
546 
547 DATA_SET(net_backend_set, ng_backend);
548 
549 #endif /* NETGRAPH */
550 
551 /*
552  * The netmap backend
553  */
554 
/*
 * Set of virtio-net features that the netmap backend supports:
 * checksum, TSO (IPv4 and IPv6) and UFO, on the host and guest side.
 */
#define NETMAP_FEATURES ( \
		VIRTIO_NET_F_CSUM | \
		VIRTIO_NET_F_HOST_TSO4 | \
		VIRTIO_NET_F_HOST_TSO6 | \
		VIRTIO_NET_F_HOST_UFO | \
		VIRTIO_NET_F_GUEST_CSUM | \
		VIRTIO_NET_F_GUEST_TSO4 | \
		VIRTIO_NET_F_GUEST_TSO6 | \
		VIRTIO_NET_F_GUEST_UFO)
560 
561 struct netmap_priv {
562 	char ifname[IFNAMSIZ];
563 	struct nm_desc *nmd;
564 	uint16_t memid;
565 	struct netmap_ring *rx;
566 	struct netmap_ring *tx;
567 	struct mevent *mevp;
568 	net_be_rxeof_t cb;
569 	void *cb_param;
570 };
571 
572 static void
573 nmreq_init(struct nmreq *req, char *ifname)
574 {
575 
576 	memset(req, 0, sizeof(*req));
577 	strlcpy(req->nr_name, ifname, sizeof(req->nr_name));
578 	req->nr_version = NETMAP_API;
579 }
580 
581 static int
582 netmap_set_vnet_hdr_len(struct net_backend *be, int vnet_hdr_len)
583 {
584 	int err;
585 	struct nmreq req;
586 	struct netmap_priv *priv = NET_BE_PRIV(be);
587 
588 	nmreq_init(&req, priv->ifname);
589 	req.nr_cmd = NETMAP_BDG_VNET_HDR;
590 	req.nr_arg1 = vnet_hdr_len;
591 	err = ioctl(be->fd, NIOCREGIF, &req);
592 	if (err) {
593 		WPRINTF(("Unable to set vnet header length %d",
594 				vnet_hdr_len));
595 		return (err);
596 	}
597 
598 	be->be_vnet_hdr_len = vnet_hdr_len;
599 
600 	return (0);
601 }
602 
603 static int
604 netmap_has_vnet_hdr_len(struct net_backend *be, unsigned vnet_hdr_len)
605 {
606 	unsigned prev_hdr_len = be->be_vnet_hdr_len;
607 	int ret;
608 
609 	if (vnet_hdr_len == prev_hdr_len) {
610 		return (1);
611 	}
612 
613 	ret = netmap_set_vnet_hdr_len(be, vnet_hdr_len);
614 	if (ret) {
615 		return (0);
616 	}
617 
618 	netmap_set_vnet_hdr_len(be, prev_hdr_len);
619 
620 	return (1);
621 }
622 
623 static uint64_t
624 netmap_get_cap(struct net_backend *be)
625 {
626 
627 	return (netmap_has_vnet_hdr_len(be, VNET_HDR_LEN) ?
628 	    NETMAP_FEATURES : 0);
629 }
630 
631 static int
632 netmap_set_cap(struct net_backend *be, uint64_t features __unused,
633     unsigned vnet_hdr_len)
634 {
635 
636 	return (netmap_set_vnet_hdr_len(be, vnet_hdr_len));
637 }
638 
639 static int
640 netmap_init(struct net_backend *be, const char *devname,
641     nvlist_t *nvl __unused, net_be_rxeof_t cb, void *param)
642 {
643 	struct netmap_priv *priv = NET_BE_PRIV(be);
644 
645 	strlcpy(priv->ifname, devname, sizeof(priv->ifname));
646 	priv->ifname[sizeof(priv->ifname) - 1] = '\0';
647 
648 	priv->nmd = nm_open(priv->ifname, NULL, NETMAP_NO_TX_POLL, NULL);
649 	if (priv->nmd == NULL) {
650 		WPRINTF(("Unable to nm_open(): interface '%s', errno (%s)",
651 			devname, strerror(errno)));
652 		return (-1);
653 	}
654 
655 	priv->memid = priv->nmd->req.nr_arg2;
656 	priv->tx = NETMAP_TXRING(priv->nmd->nifp, 0);
657 	priv->rx = NETMAP_RXRING(priv->nmd->nifp, 0);
658 	priv->cb = cb;
659 	priv->cb_param = param;
660 	be->fd = priv->nmd->fd;
661 
662 	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
663 	if (priv->mevp == NULL) {
664 		WPRINTF(("Could not register event"));
665 		return (-1);
666 	}
667 
668 	return (0);
669 }
670 
671 static void
672 netmap_cleanup(struct net_backend *be)
673 {
674 	struct netmap_priv *priv = NET_BE_PRIV(be);
675 
676 	if (priv->mevp) {
677 		mevent_delete(priv->mevp);
678 	}
679 	if (priv->nmd) {
680 		nm_close(priv->nmd);
681 	}
682 	be->fd = -1;
683 }
684 
685 static ssize_t
686 netmap_send(struct net_backend *be, const struct iovec *iov,
687 	    int iovcnt)
688 {
689 	struct netmap_priv *priv = NET_BE_PRIV(be);
690 	struct netmap_ring *ring;
691 	ssize_t totlen = 0;
692 	int nm_buf_size;
693 	int nm_buf_len;
694 	uint32_t head;
695 	uint8_t *nm_buf;
696 	int j;
697 
698 	ring = priv->tx;
699 	head = ring->head;
700 	if (head == ring->tail) {
701 		WPRINTF(("No space, drop %zu bytes", count_iov(iov, iovcnt)));
702 		goto txsync;
703 	}
704 	nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
705 	nm_buf_size = ring->nr_buf_size;
706 	nm_buf_len = 0;
707 
708 	for (j = 0; j < iovcnt; j++) {
709 		uint8_t *iov_frag_buf = iov[j].iov_base;
710 		int iov_frag_size = iov[j].iov_len;
711 
712 		totlen += iov_frag_size;
713 
714 		/*
715 		 * Split each iovec fragment over more netmap slots, if
716 		 * necessary.
717 		 */
718 		for (;;) {
719 			int copylen;
720 
721 			copylen = iov_frag_size < nm_buf_size ? iov_frag_size : nm_buf_size;
722 			memcpy(nm_buf, iov_frag_buf, copylen);
723 
724 			iov_frag_buf += copylen;
725 			iov_frag_size -= copylen;
726 			nm_buf += copylen;
727 			nm_buf_size -= copylen;
728 			nm_buf_len += copylen;
729 
730 			if (iov_frag_size == 0) {
731 				break;
732 			}
733 
734 			ring->slot[head].len = nm_buf_len;
735 			ring->slot[head].flags = NS_MOREFRAG;
736 			head = nm_ring_next(ring, head);
737 			if (head == ring->tail) {
738 				/*
739 				 * We ran out of netmap slots while
740 				 * splitting the iovec fragments.
741 				 */
742 				WPRINTF(("No space, drop %zu bytes",
743 				   count_iov(iov, iovcnt)));
744 				goto txsync;
745 			}
746 			nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
747 			nm_buf_size = ring->nr_buf_size;
748 			nm_buf_len = 0;
749 		}
750 	}
751 
752 	/* Complete the last slot, which must not have NS_MOREFRAG set. */
753 	ring->slot[head].len = nm_buf_len;
754 	ring->slot[head].flags = 0;
755 	head = nm_ring_next(ring, head);
756 
757 	/* Now update ring->head and ring->cur. */
758 	ring->head = ring->cur = head;
759 txsync:
760 	ioctl(be->fd, NIOCTXSYNC, NULL);
761 
762 	return (totlen);
763 }
764 
765 static ssize_t
766 netmap_peek_recvlen(struct net_backend *be)
767 {
768 	struct netmap_priv *priv = NET_BE_PRIV(be);
769 	struct netmap_ring *ring = priv->rx;
770 	uint32_t head = ring->head;
771 	ssize_t totlen = 0;
772 
773 	while (head != ring->tail) {
774 		struct netmap_slot *slot = ring->slot + head;
775 
776 		totlen += slot->len;
777 		if ((slot->flags & NS_MOREFRAG) == 0)
778 			break;
779 		head = nm_ring_next(ring, head);
780 	}
781 
782 	return (totlen);
783 }
784 
785 static ssize_t
786 netmap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
787 {
788 	struct netmap_priv *priv = NET_BE_PRIV(be);
789 	struct netmap_slot *slot = NULL;
790 	struct netmap_ring *ring;
791 	uint8_t *iov_frag_buf;
792 	int iov_frag_size;
793 	ssize_t totlen = 0;
794 	uint32_t head;
795 
796 	assert(iovcnt);
797 
798 	ring = priv->rx;
799 	head = ring->head;
800 	iov_frag_buf = iov->iov_base;
801 	iov_frag_size = iov->iov_len;
802 
803 	do {
804 		uint8_t *nm_buf;
805 		int nm_buf_len;
806 
807 		if (head == ring->tail) {
808 			return (0);
809 		}
810 
811 		slot = ring->slot + head;
812 		nm_buf = NETMAP_BUF(ring, slot->buf_idx);
813 		nm_buf_len = slot->len;
814 
815 		for (;;) {
816 			int copylen = nm_buf_len < iov_frag_size ?
817 			    nm_buf_len : iov_frag_size;
818 
819 			memcpy(iov_frag_buf, nm_buf, copylen);
820 			nm_buf += copylen;
821 			nm_buf_len -= copylen;
822 			iov_frag_buf += copylen;
823 			iov_frag_size -= copylen;
824 			totlen += copylen;
825 
826 			if (nm_buf_len == 0) {
827 				break;
828 			}
829 
830 			iov++;
831 			iovcnt--;
832 			if (iovcnt == 0) {
833 				/* No space to receive. */
834 				WPRINTF(("Short iov, drop %zd bytes",
835 				    totlen));
836 				return (-ENOSPC);
837 			}
838 			iov_frag_buf = iov->iov_base;
839 			iov_frag_size = iov->iov_len;
840 		}
841 
842 		head = nm_ring_next(ring, head);
843 
844 	} while (slot->flags & NS_MOREFRAG);
845 
846 	/* Release slots to netmap. */
847 	ring->head = ring->cur = head;
848 
849 	return (totlen);
850 }
851 
852 static void
853 netmap_recv_enable(struct net_backend *be)
854 {
855 	struct netmap_priv *priv = NET_BE_PRIV(be);
856 
857 	mevent_enable(priv->mevp);
858 }
859 
860 static void
861 netmap_recv_disable(struct net_backend *be)
862 {
863 	struct netmap_priv *priv = NET_BE_PRIV(be);
864 
865 	mevent_disable(priv->mevp);
866 }
867 
868 static struct net_backend netmap_backend = {
869 	.prefix = "netmap",
870 	.priv_size = sizeof(struct netmap_priv),
871 	.init = netmap_init,
872 	.cleanup = netmap_cleanup,
873 	.send = netmap_send,
874 	.peek_recvlen = netmap_peek_recvlen,
875 	.recv = netmap_recv,
876 	.recv_enable = netmap_recv_enable,
877 	.recv_disable = netmap_recv_disable,
878 	.get_cap = netmap_get_cap,
879 	.set_cap = netmap_set_cap,
880 };
881 
882 /* A clone of the netmap backend, with a different prefix. */
883 static struct net_backend vale_backend = {
884 	.prefix = "vale",
885 	.priv_size = sizeof(struct netmap_priv),
886 	.init = netmap_init,
887 	.cleanup = netmap_cleanup,
888 	.send = netmap_send,
889 	.peek_recvlen = netmap_peek_recvlen,
890 	.recv = netmap_recv,
891 	.recv_enable = netmap_recv_enable,
892 	.recv_disable = netmap_recv_disable,
893 	.get_cap = netmap_get_cap,
894 	.set_cap = netmap_set_cap,
895 };
896 
897 DATA_SET(net_backend_set, netmap_backend);
898 DATA_SET(net_backend_set, vale_backend);
899 
900 int
901 netbe_legacy_config(nvlist_t *nvl, const char *opts)
902 {
903 	char *backend, *cp;
904 
905 	if (opts == NULL)
906 		return (0);
907 
908 	cp = strchr(opts, ',');
909 	if (cp == NULL) {
910 		set_config_value_node(nvl, "backend", opts);
911 		return (0);
912 	}
913 	backend = strndup(opts, cp - opts);
914 	set_config_value_node(nvl, "backend", backend);
915 	free(backend);
916 	return (pci_parse_legacy_config(nvl, cp + 1));
917 }
918 
919 /*
920  * Initialize a backend and attach to the frontend.
921  * This is called during frontend initialization.
922  *  @ret is a pointer to the backend to be initialized
923  *  @devname is the backend-name as supplied on the command line,
924  * 	e.g. -s 2:0,frontend-name,backend-name[,other-args]
925  *  @cb is the receive callback supplied by the frontend,
926  *	and it is invoked in the event loop when a receive
927  *	event is generated in the hypervisor,
928  *  @param is a pointer to the frontend, and normally used as
929  *	the argument for the callback.
930  */
931 int
932 netbe_init(struct net_backend **ret, nvlist_t *nvl, net_be_rxeof_t cb,
933     void *param)
934 {
935 	struct net_backend **pbe, *nbe, *tbe = NULL;
936 	const char *value, *type;
937 	char *devname;
938 	int err;
939 
940 	value = get_config_value_node(nvl, "backend");
941 	if (value == NULL) {
942 		return (-1);
943 	}
944 	devname = strdup(value);
945 
946 	/*
947 	 * Use the type given by configuration if exists; otherwise
948 	 * use the prefix of the backend as the type.
949 	 */
950 	type = get_config_value_node(nvl, "type");
951 	if (type == NULL)
952 		type = devname;
953 
954 	/*
955 	 * Find the network backend that matches the user-provided
956 	 * device name. net_backend_set is built using a linker set.
957 	 */
958 	SET_FOREACH(pbe, net_backend_set) {
959 		if (strncmp(type, (*pbe)->prefix,
960 		    strlen((*pbe)->prefix)) == 0) {
961 			tbe = *pbe;
962 			assert(tbe->init != NULL);
963 			assert(tbe->cleanup != NULL);
964 			assert(tbe->send != NULL);
965 			assert(tbe->recv != NULL);
966 			assert(tbe->get_cap != NULL);
967 			assert(tbe->set_cap != NULL);
968 			break;
969 		}
970 	}
971 
972 	*ret = NULL;
973 	if (tbe == NULL) {
974 		free(devname);
975 		return (EINVAL);
976 	}
977 
978 	nbe = calloc(1, NET_BE_SIZE(tbe));
979 	*nbe = *tbe;	/* copy the template */
980 	nbe->fd = -1;
981 	nbe->sc = param;
982 	nbe->be_vnet_hdr_len = 0;
983 	nbe->fe_vnet_hdr_len = 0;
984 
985 	/* Initialize the backend. */
986 	err = nbe->init(nbe, devname, nvl, cb, param);
987 	if (err) {
988 		free(devname);
989 		free(nbe);
990 		return (err);
991 	}
992 
993 	*ret = nbe;
994 	free(devname);
995 
996 	return (0);
997 }
998 
999 void
1000 netbe_cleanup(struct net_backend *be)
1001 {
1002 
1003 	if (be != NULL) {
1004 		be->cleanup(be);
1005 		free(be);
1006 	}
1007 }
1008 
1009 uint64_t
1010 netbe_get_cap(struct net_backend *be)
1011 {
1012 
1013 	assert(be != NULL);
1014 	return (be->get_cap(be));
1015 }
1016 
1017 int
1018 netbe_set_cap(struct net_backend *be, uint64_t features,
1019 	      unsigned vnet_hdr_len)
1020 {
1021 	int ret;
1022 
1023 	assert(be != NULL);
1024 
1025 	/* There are only three valid lengths, i.e., 0, 10 and 12. */
1026 	if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN
1027 		&& vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t)))
1028 		return (-1);
1029 
1030 	be->fe_vnet_hdr_len = vnet_hdr_len;
1031 
1032 	ret = be->set_cap(be, features, vnet_hdr_len);
1033 	assert(be->be_vnet_hdr_len == 0 ||
1034 	       be->be_vnet_hdr_len == be->fe_vnet_hdr_len);
1035 
1036 	return (ret);
1037 }
1038 
1039 ssize_t
1040 netbe_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
1041 {
1042 
1043 	return (be->send(be, iov, iovcnt));
1044 }
1045 
1046 ssize_t
1047 netbe_peek_recvlen(struct net_backend *be)
1048 {
1049 
1050 	return (be->peek_recvlen(be));
1051 }
1052 
1053 /*
1054  * Try to read a packet from the backend, without blocking.
1055  * If no packets are available, return 0. In case of success, return
1056  * the length of the packet just read. Return -1 in case of errors.
1057  */
1058 ssize_t
1059 netbe_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
1060 {
1061 
1062 	return (be->recv(be, iov, iovcnt));
1063 }
1064 
/*
 * Receive one packet from the backend and throw it away.
 * Returns the length of the discarded packet, 0 if there was no packet,
 * or a negative error code if the read failed.
 */
ssize_t
netbe_rx_discard(struct net_backend *be)
{
	/*
	 * MP note: the contents of this scratch buffer are never looked
	 * at, so it is neither per-vtnet nor protected by a lock.
	 * It is only sized to hold a TSO-sized segment.
	 */
	static uint8_t dummybuf[65536 + 64];
	struct iovec sink = {
		.iov_base = dummybuf,
		.iov_len = sizeof(dummybuf),
	};

	return (netbe_recv(be, &sink, 1));
}
1086 
1087 void
1088 netbe_rx_disable(struct net_backend *be)
1089 {
1090 
1091 	return be->recv_disable(be);
1092 }
1093 
1094 void
1095 netbe_rx_enable(struct net_backend *be)
1096 {
1097 
1098 	return be->recv_enable(be);
1099 }
1100 
1101 size_t
1102 netbe_get_vnet_hdr_len(struct net_backend *be)
1103 {
1104 
1105 	return (be->be_vnet_hdr_len);
1106 }
1107