xref: /illumos-gate/usr/src/cmd/bhyve/common/net_backends.c (revision 5c4a5fe16715fb423db76577a6883b5bbecdbe45)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2019 Vincenzo Maffione <vmaffione@FreeBSD.org>
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
19  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
20  * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
21  * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
24  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
25  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 /*
29  * This file implements multiple network backends (tap, netmap, ...),
30  * to be used by network frontends such as virtio-net and e1000.
31  * The API to access the backend (e.g. send/receive packets, negotiate
32  * features) is exported by net_backends.h.
33  */
34 
35 #include <sys/types.h>
36 #ifndef WITHOUT_CAPSICUM
37 #include <sys/capsicum.h>
38 #endif
39 #include <sys/ioctl.h>
40 #include <sys/mman.h>
41 #include <sys/uio.h>
42 
43 #include <net/if.h>
44 #ifdef __FreeBSD__
45 #include <net/if_tap.h>
46 #endif
47 
48 #include <assert.h>
49 #ifndef WITHOUT_CAPSICUM
50 #include <capsicum_helpers.h>
51 #endif
52 #include <err.h>
53 #include <errno.h>
54 #include <fcntl.h>
55 #include <poll.h>
56 #include <pthread.h>
57 #include <pthread_np.h>
58 #include <stdio.h>
59 #include <stdlib.h>
60 #include <stdint.h>
61 #include <string.h>
62 #include <unistd.h>
63 #include <sysexits.h>
64 #include <unistd.h>
65 
66 #include "config.h"
67 #include "debug.h"
68 #include "iov.h"
69 #include "mevent.h"
70 #include "net_backends.h"
71 #include "net_backends_priv.h"
72 #include "pci_emul.h"
73 
/*
 * Total allocation size of a backend instance: the generic descriptor
 * followed by the backend's private area of priv_size bytes. The argument
 * is fully parenthesized so that pointer expressions (e.g. "p + 1") expand
 * correctly.
 */
#define	NET_BE_SIZE(be)		(sizeof(*(be)) + (be)->priv_size)
75 
76 #ifdef __FreeBSD__
77 void
tap_cleanup(struct net_backend * be)78 tap_cleanup(struct net_backend *be)
79 {
80 	struct tap_priv *priv = NET_BE_PRIV(be);
81 
82 	if (priv->mevp) {
83 		mevent_delete(priv->mevp);
84 	}
85 	if (be->fd != -1) {
86 		close(be->fd);
87 		be->fd = -1;
88 	}
89 }
90 
91 static int
tap_init(struct net_backend * be,const char * devname,nvlist_t * nvl __unused,net_be_rxeof_t cb,void * param)92 tap_init(struct net_backend *be, const char *devname,
93     nvlist_t *nvl __unused, net_be_rxeof_t cb, void *param)
94 {
95 	struct tap_priv *priv = NET_BE_PRIV(be);
96 	char tbuf[80];
97 	int opt = 1, up = IFF_UP;
98 
99 #ifndef WITHOUT_CAPSICUM
100 	cap_rights_t rights;
101 #endif
102 
103 	if (cb == NULL) {
104 		EPRINTLN("TAP backend requires non-NULL callback");
105 		return (-1);
106 	}
107 
108 	strcpy(tbuf, "/dev/");
109 	strlcat(tbuf, devname, sizeof(tbuf));
110 
111 	be->fd = open(tbuf, O_RDWR);
112 	if (be->fd == -1) {
113 		EPRINTLN("open of tap device %s failed", tbuf);
114 		goto error;
115 	}
116 
117 	/*
118 	 * Set non-blocking and register for read
119 	 * notifications with the event loop
120 	 */
121 	if (ioctl(be->fd, FIONBIO, &opt) < 0) {
122 		EPRINTLN("tap device O_NONBLOCK failed");
123 		goto error;
124 	}
125 
126 	if (ioctl(be->fd, VMIO_SIOCSIFFLAGS, up)) {
127 		EPRINTLN("tap device link up failed");
128 		goto error;
129 	}
130 
131 #ifndef WITHOUT_CAPSICUM
132 	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
133 	if (caph_rights_limit(be->fd, &rights) == -1)
134 		errx(EX_OSERR, "Unable to apply rights for sandbox");
135 #endif
136 
137 	memset(priv->bbuf, 0, sizeof(priv->bbuf));
138 	priv->bbuflen = 0;
139 
140 	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
141 	if (priv->mevp == NULL) {
142 		EPRINTLN("Could not register event");
143 		goto error;
144 	}
145 
146 	return (0);
147 
148 error:
149 	tap_cleanup(be);
150 	return (-1);
151 }
152 
153 /*
154  * Called to send a buffer chain out to the tap device
155  */
156 ssize_t
tap_send(struct net_backend * be,const struct iovec * iov,int iovcnt)157 tap_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
158 {
159 	return (writev(be->fd, iov, iovcnt));
160 }
161 
162 ssize_t
tap_peek_recvlen(struct net_backend * be)163 tap_peek_recvlen(struct net_backend *be)
164 {
165 	struct tap_priv *priv = NET_BE_PRIV(be);
166 	ssize_t ret;
167 
168 	if (priv->bbuflen > 0) {
169 		/*
170 		 * We already have a packet in the bounce buffer.
171 		 * Just return its length.
172 		 */
173 		return priv->bbuflen;
174 	}
175 
176 	/*
177 	 * Read the next packet (if any) into the bounce buffer, so
178 	 * that we get to know its length and we can return that
179 	 * to the caller.
180 	 */
181 	ret = read(be->fd, priv->bbuf, sizeof(priv->bbuf));
182 	if (ret < 0 && errno == EWOULDBLOCK) {
183 		return (0);
184 	}
185 
186 	if (ret > 0)
187 		priv->bbuflen = ret;
188 
189 	return (ret);
190 }
191 
192 ssize_t
tap_recv(struct net_backend * be,const struct iovec * iov,int iovcnt)193 tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
194 {
195 	struct tap_priv *priv = NET_BE_PRIV(be);
196 	ssize_t ret;
197 
198 	if (priv->bbuflen > 0) {
199 		/*
200 		 * A packet is available in the bounce buffer, so
201 		 * we read it from there.
202 		 */
203 		ret = buf_to_iov(priv->bbuf, priv->bbuflen,
204 		    iov, iovcnt, 0);
205 
206 		/* Mark the bounce buffer as empty. */
207 		priv->bbuflen = 0;
208 
209 		return (ret);
210 	}
211 
212 	ret = readv(be->fd, iov, iovcnt);
213 	if (ret < 0 && errno == EWOULDBLOCK) {
214 		return (0);
215 	}
216 
217 	return (ret);
218 }
219 
220 void
tap_recv_enable(struct net_backend * be)221 tap_recv_enable(struct net_backend *be)
222 {
223 	struct tap_priv *priv = NET_BE_PRIV(be);
224 
225 	mevent_enable(priv->mevp);
226 }
227 
228 void
tap_recv_disable(struct net_backend * be)229 tap_recv_disable(struct net_backend *be)
230 {
231 	struct tap_priv *priv = NET_BE_PRIV(be);
232 
233 	mevent_disable(priv->mevp);
234 }
235 
236 uint64_t
tap_get_cap(struct net_backend * be __unused)237 tap_get_cap(struct net_backend *be __unused)
238 {
239 
240 	return (0); /* no capabilities for now */
241 }
242 
243 int
tap_set_cap(struct net_backend * be __unused,uint64_t features,unsigned vnet_hdr_len)244 tap_set_cap(struct net_backend *be __unused, uint64_t features,
245     unsigned vnet_hdr_len)
246 {
247 
248 	return ((features || vnet_hdr_len) ? -1 : 0);
249 }
250 
251 static struct net_backend tap_backend = {
252 	.prefix = "tap",
253 	.priv_size = sizeof(struct tap_priv),
254 	.init = tap_init,
255 	.cleanup = tap_cleanup,
256 	.send = tap_send,
257 	.peek_recvlen = tap_peek_recvlen,
258 	.recv = tap_recv,
259 	.recv_enable = tap_recv_enable,
260 	.recv_disable = tap_recv_disable,
261 	.get_cap = tap_get_cap,
262 	.set_cap = tap_set_cap,
263 };
264 
265 /* A clone of the tap backend, with a different prefix. */
266 static struct net_backend vmnet_backend = {
267 	.prefix = "vmnet",
268 	.priv_size = sizeof(struct tap_priv),
269 	.init = tap_init,
270 	.cleanup = tap_cleanup,
271 	.send = tap_send,
272 	.peek_recvlen = tap_peek_recvlen,
273 	.recv = tap_recv,
274 	.recv_enable = tap_recv_enable,
275 	.recv_disable = tap_recv_disable,
276 	.get_cap = tap_get_cap,
277 	.set_cap = tap_set_cap,
278 };
279 
280 DATA_SET(net_backend_set, tap_backend);
281 DATA_SET(net_backend_set, vmnet_backend);
282 #endif /* __FreeBSD__ */
283 
284 #ifdef __FreeBSD__
285 int
netbe_legacy_config(nvlist_t * nvl,const char * opts)286 netbe_legacy_config(nvlist_t *nvl, const char *opts)
287 {
288 	char *backend, *cp;
289 
290 	if (opts == NULL)
291 		return (0);
292 
293 	cp = strchr(opts, ',');
294 	if (cp == NULL) {
295 		set_config_value_node(nvl, "backend", opts);
296 		return (0);
297 	}
298 	backend = strndup(opts, cp - opts);
299 	set_config_value_node(nvl, "backend", backend);
300 	free(backend);
301 	return (pci_parse_legacy_config(nvl, cp + 1));
302 }
303 #else
304 int
netbe_legacy_config(nvlist_t * nvl,const char * opts)305 netbe_legacy_config(nvlist_t *nvl, const char *opts)
306 {
307 	char *config, *name, *tofree, *value;
308 
309 	if (opts == NULL)
310 		return (0);
311 
312 	/* Default to the 'dlpi' backend - can still be overridden by opts */
313 	set_config_value_node(nvl, "backend", "dlpi");
314 	set_config_value_node(nvl, "type", "dlpi");
315 
316 	config = tofree = strdup(opts);
317 	if (config == NULL)
318 		err(4, "netbe_legacy_config strdup()");
319 	while ((name = strsep(&config, ",")) != NULL) {
320 		value = strchr(name, '=');
321 		if (value != NULL) {
322 			*value++ = '\0';
323 			set_config_value_node(nvl, name, value);
324 		} else {
325 			set_config_value_node(nvl, "vnic", name);
326 		}
327 	}
328 	free(tofree);
329 
330 	return (0);
331 }
332 #endif
333 
334 /*
335  * Initialize a backend and attach to the frontend.
336  * This is called during frontend initialization.
337  *  @ret is a pointer to the backend to be initialized
338  *  @devname is the backend-name as supplied on the command line,
339  * 	e.g. -s 2:0,frontend-name,backend-name[,other-args]
340  *  @cb is the receive callback supplied by the frontend,
341  *	and it is invoked in the event loop when a receive
342  *	event is generated in the hypervisor,
343  *  @param is a pointer to the frontend, and normally used as
344  *	the argument for the callback.
345  */
346 int
netbe_init(struct net_backend ** ret,nvlist_t * nvl,net_be_rxeof_t cb,void * param)347 netbe_init(struct net_backend **ret, nvlist_t *nvl, net_be_rxeof_t cb,
348     void *param)
349 {
350 	struct net_backend **pbe, *nbe, *tbe = NULL;
351 	const char *value, *type;
352 	char *devname;
353 	int err;
354 
355 	value = get_config_value_node(nvl, "backend");
356 	if (value == NULL) {
357 		return (-1);
358 	}
359 	devname = strdup(value);
360 
361 	/*
362 	 * Use the type given by configuration if exists; otherwise
363 	 * use the prefix of the backend as the type.
364 	 */
365 	type = get_config_value_node(nvl, "type");
366 	if (type == NULL)
367 		type = devname;
368 
369 	/*
370 	 * Find the network backend that matches the user-provided
371 	 * device name. net_backend_set is built using a linker set.
372 	 */
373 	SET_FOREACH(pbe, net_backend_set) {
374 		if (strncmp(type, (*pbe)->prefix,
375 		    strlen((*pbe)->prefix)) == 0) {
376 			tbe = *pbe;
377 			assert(tbe->init != NULL);
378 			assert(tbe->cleanup != NULL);
379 			assert(tbe->send != NULL);
380 			assert(tbe->recv != NULL);
381 			assert(tbe->get_cap != NULL);
382 			assert(tbe->set_cap != NULL);
383 			break;
384 		}
385 	}
386 
387 	*ret = NULL;
388 	if (tbe == NULL) {
389 		free(devname);
390 		return (EINVAL);
391 	}
392 
393 	nbe = calloc(1, NET_BE_SIZE(tbe));
394 	*nbe = *tbe;	/* copy the template */
395 	nbe->fd = -1;
396 	nbe->sc = param;
397 	nbe->be_vnet_hdr_len = 0;
398 	nbe->fe_vnet_hdr_len = 0;
399 
400 	/* Initialize the backend. */
401 	err = nbe->init(nbe, devname, nvl, cb, param);
402 	if (err) {
403 		free(devname);
404 		free(nbe);
405 		return (err);
406 	}
407 
408 	*ret = nbe;
409 	free(devname);
410 
411 	return (0);
412 }
413 
414 void
netbe_cleanup(struct net_backend * be)415 netbe_cleanup(struct net_backend *be)
416 {
417 
418 	if (be != NULL) {
419 		be->cleanup(be);
420 		free(be);
421 	}
422 }
423 
424 uint64_t
netbe_get_cap(struct net_backend * be)425 netbe_get_cap(struct net_backend *be)
426 {
427 
428 	assert(be != NULL);
429 	return (be->get_cap(be));
430 }
431 
432 int
netbe_set_cap(struct net_backend * be,uint64_t features,unsigned vnet_hdr_len)433 netbe_set_cap(struct net_backend *be, uint64_t features,
434 	      unsigned vnet_hdr_len)
435 {
436 	int ret;
437 
438 	assert(be != NULL);
439 
440 	/* There are only three valid lengths, i.e., 0, 10 and 12. */
441 	if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN
442 		&& vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t)))
443 		return (-1);
444 
445 	be->fe_vnet_hdr_len = vnet_hdr_len;
446 
447 	ret = be->set_cap(be, features, vnet_hdr_len);
448 	assert(be->be_vnet_hdr_len == 0 ||
449 	       be->be_vnet_hdr_len == be->fe_vnet_hdr_len);
450 
451 	return (ret);
452 }
453 
454 ssize_t
netbe_send(struct net_backend * be,const struct iovec * iov,int iovcnt)455 netbe_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
456 {
457 
458 	return (be->send(be, iov, iovcnt));
459 }
460 
461 ssize_t
netbe_peek_recvlen(struct net_backend * be)462 netbe_peek_recvlen(struct net_backend *be)
463 {
464 
465 	return (be->peek_recvlen(be));
466 }
467 
468 /*
469  * Try to read a packet from the backend, without blocking.
470  * If no packets are available, return 0. In case of success, return
471  * the length of the packet just read. Return -1 in case of errors.
472  */
473 ssize_t
netbe_recv(struct net_backend * be,const struct iovec * iov,int iovcnt)474 netbe_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
475 {
476 
477 	return (be->recv(be, iov, iovcnt));
478 }
479 
480 /*
481  * Read a packet from the backend and discard it.
482  * Returns the size of the discarded packet or zero if no packet was available.
483  * A negative error code is returned in case of read error.
484  */
485 ssize_t
netbe_rx_discard(struct net_backend * be)486 netbe_rx_discard(struct net_backend *be)
487 {
488 	/*
489 	 * MP note: the dummybuf is only used to discard frames,
490 	 * so there is no need for it to be per-vtnet or locked.
491 	 * We only make it large enough for TSO-sized segment.
492 	 */
493 	static uint8_t dummybuf[65536 + 64];
494 	struct iovec iov;
495 
496 #ifdef __FreeBSD__
497 	iov.iov_base = dummybuf;
498 #else
499 	iov.iov_base = (caddr_t)dummybuf;
500 #endif
501 	iov.iov_len = sizeof(dummybuf);
502 
503 	return netbe_recv(be, &iov, 1);
504 }
505 
506 void
netbe_rx_disable(struct net_backend * be)507 netbe_rx_disable(struct net_backend *be)
508 {
509 
510 	return be->recv_disable(be);
511 }
512 
513 void
netbe_rx_enable(struct net_backend * be)514 netbe_rx_enable(struct net_backend *be)
515 {
516 
517 	return be->recv_enable(be);
518 }
519 
520 size_t
netbe_get_vnet_hdr_len(struct net_backend * be)521 netbe_get_vnet_hdr_len(struct net_backend *be)
522 {
523 
524 	return (be->be_vnet_hdr_len);
525 }
526 
527 #ifndef __FreeBSD__
528 int
netbe_get_mac(net_backend_t * be,void * buf,size_t * buflen)529 netbe_get_mac(net_backend_t *be, void *buf, size_t *buflen)
530 {
531 	if (be->get_mac == NULL)
532 		return (ENOTSUP);
533 	return (be->get_mac(be, buf, buflen));
534 }
535 #endif
536