xref: /freebsd/usr.sbin/bhyve/net_backends.c (revision 1aad95345237424918e5f6b18464df4dbc2aa1d8)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2019 Vincenzo Maffione <vmaffione@FreeBSD.org>
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
19  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
20  * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
21  * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
24  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
25  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 /*
29  * This file implements multiple network backends (tap, netmap, ...),
30  * to be used by network frontends such as virtio-net and e1000.
31  * The API to access the backend (e.g. send/receive packets, negotiate
32  * features) is exported by net_backends.h.
33  */
34 
35 #include <sys/types.h>
36 #ifndef WITHOUT_CAPSICUM
37 #include <sys/capsicum.h>
38 #endif
39 #include <sys/ioctl.h>
40 #include <sys/mman.h>
41 #include <sys/uio.h>
42 
43 #include <net/if.h>
44 #include <net/if_tap.h>
45 
46 #include <assert.h>
47 #ifndef WITHOUT_CAPSICUM
48 #include <capsicum_helpers.h>
49 #endif
50 #include <err.h>
51 #include <errno.h>
52 #include <fcntl.h>
53 #include <poll.h>
54 #include <pthread.h>
55 #include <pthread_np.h>
56 #include <stdio.h>
57 #include <stdlib.h>
58 #include <stdint.h>
59 #include <string.h>
60 #include <sysexits.h>
61 #include <unistd.h>
62 
63 #include "config.h"
64 #include "debug.h"
65 #include "iov.h"
66 #include "mevent.h"
67 #include "net_backends.h"
68 #include "net_backends_priv.h"
69 #include "pci_emul.h"
70 
/*
 * Number of bytes needed for one backend instance: the structure that
 * `be` points to plus `priv_size` bytes of backend-private data.
 * The argument is parenthesized everywhere so that the macro also
 * expands correctly when given a pointer expression such as `p + 1`.
 */
#define	NET_BE_SIZE(be)		(sizeof(*(be)) + (be)->priv_size)
72 
73 void
tap_cleanup(struct net_backend * be)74 tap_cleanup(struct net_backend *be)
75 {
76 	struct tap_priv *priv = NET_BE_PRIV(be);
77 
78 	if (priv->mevp) {
79 		mevent_delete(priv->mevp);
80 	}
81 	if (be->fd != -1) {
82 		close(be->fd);
83 		be->fd = -1;
84 	}
85 }
86 
87 static int
tap_init(struct net_backend * be,const char * devname,nvlist_t * nvl __unused,net_be_rxeof_t cb,void * param)88 tap_init(struct net_backend *be, const char *devname,
89     nvlist_t *nvl __unused, net_be_rxeof_t cb, void *param)
90 {
91 	struct tap_priv *priv = NET_BE_PRIV(be);
92 	char tbuf[80];
93 	int opt = 1, up = IFF_UP;
94 
95 #ifndef WITHOUT_CAPSICUM
96 	cap_rights_t rights;
97 #endif
98 
99 	if (cb == NULL) {
100 		EPRINTLN("TAP backend requires non-NULL callback");
101 		return (-1);
102 	}
103 
104 	strcpy(tbuf, "/dev/");
105 	strlcat(tbuf, devname, sizeof(tbuf));
106 
107 	be->fd = open(tbuf, O_RDWR);
108 	if (be->fd == -1) {
109 		EPRINTLN("open of tap device %s failed", tbuf);
110 		goto error;
111 	}
112 
113 	/*
114 	 * Set non-blocking and register for read
115 	 * notifications with the event loop
116 	 */
117 	if (ioctl(be->fd, FIONBIO, &opt) < 0) {
118 		EPRINTLN("tap device O_NONBLOCK failed");
119 		goto error;
120 	}
121 
122 	if (strncmp("ngd", be->prefix, 3) &&
123 	    ioctl(be->fd, VMIO_SIOCSIFFLAGS, up)) {
124 		EPRINTLN("tap device link up failed");
125 		goto error;
126 	}
127 
128 #ifndef WITHOUT_CAPSICUM
129 	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
130 	if (caph_rights_limit(be->fd, &rights) == -1)
131 		errx(EX_OSERR, "Unable to apply rights for sandbox");
132 #endif
133 
134 	memset(priv->bbuf, 0, sizeof(priv->bbuf));
135 	priv->bbuflen = 0;
136 
137 	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
138 	if (priv->mevp == NULL) {
139 		EPRINTLN("Could not register event");
140 		goto error;
141 	}
142 
143 	return (0);
144 
145 error:
146 	tap_cleanup(be);
147 	return (-1);
148 }
149 
150 /*
151  * Called to send a buffer chain out to the tap device
152  */
153 ssize_t
tap_send(struct net_backend * be,const struct iovec * iov,int iovcnt)154 tap_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
155 {
156 	return (writev(be->fd, iov, iovcnt));
157 }
158 
159 ssize_t
tap_peek_recvlen(struct net_backend * be)160 tap_peek_recvlen(struct net_backend *be)
161 {
162 	struct tap_priv *priv = NET_BE_PRIV(be);
163 	ssize_t ret;
164 
165 	if (priv->bbuflen > 0) {
166 		/*
167 		 * We already have a packet in the bounce buffer.
168 		 * Just return its length.
169 		 */
170 		return priv->bbuflen;
171 	}
172 
173 	/*
174 	 * Read the next packet (if any) into the bounce buffer, so
175 	 * that we get to know its length and we can return that
176 	 * to the caller.
177 	 */
178 	ret = read(be->fd, priv->bbuf, sizeof(priv->bbuf));
179 	if (ret < 0 && errno == EWOULDBLOCK) {
180 		return (0);
181 	}
182 
183 	if (ret > 0)
184 		priv->bbuflen = ret;
185 
186 	return (ret);
187 }
188 
189 ssize_t
tap_recv(struct net_backend * be,const struct iovec * iov,int iovcnt)190 tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
191 {
192 	struct tap_priv *priv = NET_BE_PRIV(be);
193 	ssize_t ret;
194 
195 	if (priv->bbuflen > 0) {
196 		/*
197 		 * A packet is available in the bounce buffer, so
198 		 * we read it from there.
199 		 */
200 		ret = buf_to_iov(priv->bbuf, priv->bbuflen,
201 		    iov, iovcnt, 0);
202 
203 		/* Mark the bounce buffer as empty. */
204 		priv->bbuflen = 0;
205 
206 		return (ret);
207 	}
208 
209 	ret = readv(be->fd, iov, iovcnt);
210 	if (ret < 0 && errno == EWOULDBLOCK) {
211 		return (0);
212 	}
213 
214 	return (ret);
215 }
216 
217 void
tap_recv_enable(struct net_backend * be)218 tap_recv_enable(struct net_backend *be)
219 {
220 	struct tap_priv *priv = NET_BE_PRIV(be);
221 
222 	mevent_enable(priv->mevp);
223 }
224 
225 void
tap_recv_disable(struct net_backend * be)226 tap_recv_disable(struct net_backend *be)
227 {
228 	struct tap_priv *priv = NET_BE_PRIV(be);
229 
230 	mevent_disable(priv->mevp);
231 }
232 
/*
 * Report the capabilities offered by the tap backend: currently none,
 * so the result is always 0.
 */
uint64_t
tap_get_cap(struct net_backend *be __unused)
{
	const uint64_t caps = 0;	/* no capabilities for now */

	return (caps);
}
239 
/*
 * Accept a capability request from the frontend.  The tap backend
 * supports neither features nor a virtio-net header, so anything other
 * than "no features, zero header length" is rejected with -1.
 */
int
tap_set_cap(struct net_backend *be __unused, uint64_t features,
    unsigned vnet_hdr_len)
{

	if (features != 0 || vnet_hdr_len != 0)
		return (-1);

	return (0);
}
247 
248 static struct net_backend tap_backend = {
249 	.prefix = "tap",
250 	.priv_size = sizeof(struct tap_priv),
251 	.init = tap_init,
252 	.cleanup = tap_cleanup,
253 	.send = tap_send,
254 	.peek_recvlen = tap_peek_recvlen,
255 	.recv = tap_recv,
256 	.recv_enable = tap_recv_enable,
257 	.recv_disable = tap_recv_disable,
258 	.get_cap = tap_get_cap,
259 	.set_cap = tap_set_cap,
260 };
261 
262 /* A clone of the tap backend, with a different prefix. */
263 static struct net_backend vmnet_backend = {
264 	.prefix = "vmnet",
265 	.priv_size = sizeof(struct tap_priv),
266 	.init = tap_init,
267 	.cleanup = tap_cleanup,
268 	.send = tap_send,
269 	.peek_recvlen = tap_peek_recvlen,
270 	.recv = tap_recv,
271 	.recv_enable = tap_recv_enable,
272 	.recv_disable = tap_recv_disable,
273 	.get_cap = tap_get_cap,
274 	.set_cap = tap_set_cap,
275 };
276 
277 /* A clone of the tap backend, with a different prefix. */
278 static struct net_backend ngd_backend = {
279 	.prefix = "ngd",
280 	.priv_size = sizeof(struct tap_priv),
281 	.init = tap_init,
282 	.cleanup = tap_cleanup,
283 	.send = tap_send,
284 	.peek_recvlen = tap_peek_recvlen,
285 	.recv = tap_recv,
286 	.recv_enable = tap_recv_enable,
287 	.recv_disable = tap_recv_disable,
288 	.get_cap = tap_get_cap,
289 	.set_cap = tap_set_cap,
290 };
291 
/*
 * Add the tap-family templates to net_backend_set, the linker set that
 * netbe_init() walks when looking for a backend by prefix.
 */
DATA_SET(net_backend_set, tap_backend);
DATA_SET(net_backend_set, vmnet_backend);
DATA_SET(net_backend_set, ngd_backend);
295 
296 int
netbe_legacy_config(nvlist_t * nvl,const char * opts)297 netbe_legacy_config(nvlist_t *nvl, const char *opts)
298 {
299 	char *backend, *cp;
300 
301 	if (opts == NULL)
302 		return (0);
303 
304 	cp = strchr(opts, ',');
305 	if (cp == NULL) {
306 		set_config_value_node(nvl, "backend", opts);
307 		return (0);
308 	}
309 	backend = strndup(opts, cp - opts);
310 	set_config_value_node(nvl, "backend", backend);
311 	free(backend);
312 	return (pci_parse_legacy_config(nvl, cp + 1));
313 }
314 
315 /*
316  * Initialize a backend and attach to the frontend.
317  * This is called during frontend initialization.
318  *  @ret is a pointer to the backend to be initialized
319  *  @devname is the backend-name as supplied on the command line,
320  * 	e.g. -s 2:0,frontend-name,backend-name[,other-args]
321  *  @cb is the receive callback supplied by the frontend,
322  *	and it is invoked in the event loop when a receive
323  *	event is generated in the hypervisor,
324  *  @param is a pointer to the frontend, and normally used as
325  *	the argument for the callback.
326  */
327 int
netbe_init(struct net_backend ** ret,nvlist_t * nvl,net_be_rxeof_t cb,void * param)328 netbe_init(struct net_backend **ret, nvlist_t *nvl, net_be_rxeof_t cb,
329     void *param)
330 {
331 	struct net_backend **pbe, *nbe, *tbe = NULL;
332 	const char *value, *type;
333 	char *devname;
334 	int err;
335 
336 	value = get_config_value_node(nvl, "backend");
337 	if (value == NULL) {
338 		return (-1);
339 	}
340 	devname = strdup(value);
341 
342 	/*
343 	 * Use the type given by configuration if exists; otherwise
344 	 * use the prefix of the backend as the type.
345 	 */
346 	type = get_config_value_node(nvl, "type");
347 	if (type == NULL)
348 		type = devname;
349 
350 	/*
351 	 * Find the network backend that matches the user-provided
352 	 * device name. net_backend_set is built using a linker set.
353 	 */
354 	SET_FOREACH(pbe, net_backend_set) {
355 		if (strncmp(type, (*pbe)->prefix,
356 		    strlen((*pbe)->prefix)) == 0) {
357 			tbe = *pbe;
358 			assert(tbe->init != NULL);
359 			assert(tbe->cleanup != NULL);
360 			assert(tbe->send != NULL);
361 			assert(tbe->recv != NULL);
362 			assert(tbe->get_cap != NULL);
363 			assert(tbe->set_cap != NULL);
364 			break;
365 		}
366 	}
367 
368 	*ret = NULL;
369 	if (tbe == NULL) {
370 		free(devname);
371 		return (EINVAL);
372 	}
373 
374 	nbe = calloc(1, NET_BE_SIZE(tbe));
375 	*nbe = *tbe;	/* copy the template */
376 	nbe->fd = -1;
377 	nbe->sc = param;
378 	nbe->be_vnet_hdr_len = 0;
379 	nbe->fe_vnet_hdr_len = 0;
380 
381 	/* Initialize the backend. */
382 	err = nbe->init(nbe, devname, nvl, cb, param);
383 	if (err) {
384 		free(devname);
385 		free(nbe);
386 		return (err);
387 	}
388 
389 	*ret = nbe;
390 	free(devname);
391 
392 	return (0);
393 }
394 
395 void
netbe_cleanup(struct net_backend * be)396 netbe_cleanup(struct net_backend *be)
397 {
398 
399 	if (be != NULL) {
400 		be->cleanup(be);
401 		free(be);
402 	}
403 }
404 
405 uint64_t
netbe_get_cap(struct net_backend * be)406 netbe_get_cap(struct net_backend *be)
407 {
408 
409 	assert(be != NULL);
410 	return (be->get_cap(be));
411 }
412 
413 int
netbe_set_cap(struct net_backend * be,uint64_t features,unsigned vnet_hdr_len)414 netbe_set_cap(struct net_backend *be, uint64_t features,
415 	      unsigned vnet_hdr_len)
416 {
417 	int ret;
418 
419 	assert(be != NULL);
420 
421 	/* There are only three valid lengths, i.e., 0, 10 and 12. */
422 	if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN
423 		&& vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t)))
424 		return (-1);
425 
426 	be->fe_vnet_hdr_len = vnet_hdr_len;
427 
428 	ret = be->set_cap(be, features, vnet_hdr_len);
429 	assert(be->be_vnet_hdr_len == 0 ||
430 	       be->be_vnet_hdr_len == be->fe_vnet_hdr_len);
431 
432 	return (ret);
433 }
434 
435 ssize_t
netbe_send(struct net_backend * be,const struct iovec * iov,int iovcnt)436 netbe_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
437 {
438 
439 	return (be->send(be, iov, iovcnt));
440 }
441 
442 ssize_t
netbe_peek_recvlen(struct net_backend * be)443 netbe_peek_recvlen(struct net_backend *be)
444 {
445 
446 	return (be->peek_recvlen(be));
447 }
448 
449 /*
450  * Try to read a packet from the backend, without blocking.
451  * If no packets are available, return 0. In case of success, return
452  * the length of the packet just read. Return -1 in case of errors.
453  */
454 ssize_t
netbe_recv(struct net_backend * be,const struct iovec * iov,int iovcnt)455 netbe_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
456 {
457 
458 	return (be->recv(be, iov, iovcnt));
459 }
460 
/*
 * Read one packet from the backend and throw it away.
 * The return value is whatever netbe_recv() reports: the size of the
 * discarded packet, zero if no packet was available, or a negative
 * value in case of read error.
 */
ssize_t
netbe_rx_discard(struct net_backend *be)
{
	/*
	 * MP note: the dummybuf is only used to discard frames,
	 * so there is no need for it to be per-vtnet or locked.
	 * We only make it large enough for TSO-sized segment.
	 */
	static uint8_t dummybuf[65536 + 64];
	struct iovec sink = {
		.iov_base = dummybuf,
		.iov_len = sizeof(dummybuf),
	};

	return (netbe_recv(be, &sink, 1));
}
482 
483 void
netbe_rx_disable(struct net_backend * be)484 netbe_rx_disable(struct net_backend *be)
485 {
486 
487 	return be->recv_disable(be);
488 }
489 
490 void
netbe_rx_enable(struct net_backend * be)491 netbe_rx_enable(struct net_backend *be)
492 {
493 
494 	return be->recv_enable(be);
495 }
496 
497 size_t
netbe_get_vnet_hdr_len(struct net_backend * be)498 netbe_get_vnet_hdr_len(struct net_backend *be)
499 {
500 
501 	return (be->be_vnet_hdr_len);
502 }
503