1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2019 Vincenzo Maffione <vmaffione@FreeBSD.org>
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND
16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
19 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
20 * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
21 * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
24 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
25 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 /*
29 * This file implements multiple network backends (tap, netmap, ...),
30 * to be used by network frontends such as virtio-net and e1000.
31 * The API to access the backend (e.g. send/receive packets, negotiate
32 * features) is exported by net_backends.h.
33 */
34
35 #include <sys/types.h>
36 #ifndef WITHOUT_CAPSICUM
37 #include <sys/capsicum.h>
38 #endif
39 #include <sys/ioctl.h>
40 #include <sys/mman.h>
41 #include <sys/uio.h>
42
43 #include <net/if.h>
44 #include <net/if_tap.h>
45
46 #include <assert.h>
47 #ifndef WITHOUT_CAPSICUM
48 #include <capsicum_helpers.h>
49 #endif
50 #include <err.h>
51 #include <errno.h>
52 #include <fcntl.h>
53 #include <poll.h>
54 #include <pthread.h>
55 #include <pthread_np.h>
56 #include <stdio.h>
57 #include <stdlib.h>
58 #include <stdint.h>
59 #include <string.h>
60 #include <sysexits.h>
61 #include <unistd.h>
62
63 #include "config.h"
64 #include "debug.h"
65 #include "iov.h"
66 #include "mevent.h"
67 #include "net_backends.h"
68 #include "net_backends_priv.h"
69 #include "pci_emul.h"
70
/*
 * Number of bytes to allocate for a backend instance: the size of the
 * structure that be points to plus priv_size additional bytes.
 * The argument is fully parenthesized so that any pointer expression
 * (e.g. "p + 1") may be passed.
 */
#define NET_BE_SIZE(be) (sizeof(*(be)) + (be)->priv_size)
72
73 void
tap_cleanup(struct net_backend * be)74 tap_cleanup(struct net_backend *be)
75 {
76 struct tap_priv *priv = NET_BE_PRIV(be);
77
78 if (priv->mevp) {
79 mevent_delete(priv->mevp);
80 }
81 if (be->fd != -1) {
82 close(be->fd);
83 be->fd = -1;
84 }
85 }
86
87 static int
tap_init(struct net_backend * be,const char * devname,nvlist_t * nvl __unused,net_be_rxeof_t cb,void * param)88 tap_init(struct net_backend *be, const char *devname,
89 nvlist_t *nvl __unused, net_be_rxeof_t cb, void *param)
90 {
91 struct tap_priv *priv = NET_BE_PRIV(be);
92 char tbuf[80];
93 int opt = 1, up = IFF_UP;
94
95 #ifndef WITHOUT_CAPSICUM
96 cap_rights_t rights;
97 #endif
98
99 if (cb == NULL) {
100 EPRINTLN("TAP backend requires non-NULL callback");
101 return (-1);
102 }
103
104 strcpy(tbuf, "/dev/");
105 strlcat(tbuf, devname, sizeof(tbuf));
106
107 be->fd = open(tbuf, O_RDWR);
108 if (be->fd == -1) {
109 EPRINTLN("open of tap device %s failed", tbuf);
110 goto error;
111 }
112
113 /*
114 * Set non-blocking and register for read
115 * notifications with the event loop
116 */
117 if (ioctl(be->fd, FIONBIO, &opt) < 0) {
118 EPRINTLN("tap device O_NONBLOCK failed");
119 goto error;
120 }
121
122 if (ioctl(be->fd, VMIO_SIOCSIFFLAGS, up)) {
123 EPRINTLN("tap device link up failed");
124 goto error;
125 }
126
127 #ifndef WITHOUT_CAPSICUM
128 cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
129 if (caph_rights_limit(be->fd, &rights) == -1)
130 errx(EX_OSERR, "Unable to apply rights for sandbox");
131 #endif
132
133 memset(priv->bbuf, 0, sizeof(priv->bbuf));
134 priv->bbuflen = 0;
135
136 priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
137 if (priv->mevp == NULL) {
138 EPRINTLN("Could not register event");
139 goto error;
140 }
141
142 return (0);
143
144 error:
145 tap_cleanup(be);
146 return (-1);
147 }
148
149 /*
150 * Called to send a buffer chain out to the tap device
151 */
152 ssize_t
tap_send(struct net_backend * be,const struct iovec * iov,int iovcnt)153 tap_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
154 {
155 return (writev(be->fd, iov, iovcnt));
156 }
157
158 ssize_t
tap_peek_recvlen(struct net_backend * be)159 tap_peek_recvlen(struct net_backend *be)
160 {
161 struct tap_priv *priv = NET_BE_PRIV(be);
162 ssize_t ret;
163
164 if (priv->bbuflen > 0) {
165 /*
166 * We already have a packet in the bounce buffer.
167 * Just return its length.
168 */
169 return priv->bbuflen;
170 }
171
172 /*
173 * Read the next packet (if any) into the bounce buffer, so
174 * that we get to know its length and we can return that
175 * to the caller.
176 */
177 ret = read(be->fd, priv->bbuf, sizeof(priv->bbuf));
178 if (ret < 0 && errno == EWOULDBLOCK) {
179 return (0);
180 }
181
182 if (ret > 0)
183 priv->bbuflen = ret;
184
185 return (ret);
186 }
187
188 ssize_t
tap_recv(struct net_backend * be,const struct iovec * iov,int iovcnt)189 tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
190 {
191 struct tap_priv *priv = NET_BE_PRIV(be);
192 ssize_t ret;
193
194 if (priv->bbuflen > 0) {
195 /*
196 * A packet is available in the bounce buffer, so
197 * we read it from there.
198 */
199 ret = buf_to_iov(priv->bbuf, priv->bbuflen,
200 iov, iovcnt, 0);
201
202 /* Mark the bounce buffer as empty. */
203 priv->bbuflen = 0;
204
205 return (ret);
206 }
207
208 ret = readv(be->fd, iov, iovcnt);
209 if (ret < 0 && errno == EWOULDBLOCK) {
210 return (0);
211 }
212
213 return (ret);
214 }
215
216 void
tap_recv_enable(struct net_backend * be)217 tap_recv_enable(struct net_backend *be)
218 {
219 struct tap_priv *priv = NET_BE_PRIV(be);
220
221 mevent_enable(priv->mevp);
222 }
223
224 void
tap_recv_disable(struct net_backend * be)225 tap_recv_disable(struct net_backend *be)
226 {
227 struct tap_priv *priv = NET_BE_PRIV(be);
228
229 mevent_disable(priv->mevp);
230 }
231
/*
 * Report the capabilities offered by the tap backend.  None are
 * offered at present, so the result is always 0.
 */
uint64_t
tap_get_cap(struct net_backend *be __unused)
{
	const uint64_t no_caps = 0;

	return (no_caps);
}
238
/*
 * Apply the capabilities requested by the frontend.  The tap backend
 * supports none, so the request is accepted (0) only when both
 * features and vnet_hdr_len are zero; anything else yields -1.
 */
int
tap_set_cap(struct net_backend *be __unused, uint64_t features,
    unsigned vnet_hdr_len)
{
	if (features != 0 || vnet_hdr_len != 0)
		return (-1);

	return (0);
}
246
247 static struct net_backend tap_backend = {
248 .prefix = "tap",
249 .priv_size = sizeof(struct tap_priv),
250 .init = tap_init,
251 .cleanup = tap_cleanup,
252 .send = tap_send,
253 .peek_recvlen = tap_peek_recvlen,
254 .recv = tap_recv,
255 .recv_enable = tap_recv_enable,
256 .recv_disable = tap_recv_disable,
257 .get_cap = tap_get_cap,
258 .set_cap = tap_set_cap,
259 };
260
261 /* A clone of the tap backend, with a different prefix. */
262 static struct net_backend vmnet_backend = {
263 .prefix = "vmnet",
264 .priv_size = sizeof(struct tap_priv),
265 .init = tap_init,
266 .cleanup = tap_cleanup,
267 .send = tap_send,
268 .peek_recvlen = tap_peek_recvlen,
269 .recv = tap_recv,
270 .recv_enable = tap_recv_enable,
271 .recv_disable = tap_recv_disable,
272 .get_cap = tap_get_cap,
273 .set_cap = tap_set_cap,
274 };
275
276 DATA_SET(net_backend_set, tap_backend);
277 DATA_SET(net_backend_set, vmnet_backend);
278
279 int
netbe_legacy_config(nvlist_t * nvl,const char * opts)280 netbe_legacy_config(nvlist_t *nvl, const char *opts)
281 {
282 char *backend, *cp;
283
284 if (opts == NULL)
285 return (0);
286
287 cp = strchr(opts, ',');
288 if (cp == NULL) {
289 set_config_value_node(nvl, "backend", opts);
290 return (0);
291 }
292 backend = strndup(opts, cp - opts);
293 set_config_value_node(nvl, "backend", backend);
294 free(backend);
295 return (pci_parse_legacy_config(nvl, cp + 1));
296 }
297
298 /*
299 * Initialize a backend and attach to the frontend.
300 * This is called during frontend initialization.
301 * @ret is a pointer to the backend to be initialized
302 * @devname is the backend-name as supplied on the command line,
303 * e.g. -s 2:0,frontend-name,backend-name[,other-args]
304 * @cb is the receive callback supplied by the frontend,
305 * and it is invoked in the event loop when a receive
306 * event is generated in the hypervisor,
307 * @param is a pointer to the frontend, and normally used as
308 * the argument for the callback.
309 */
310 int
netbe_init(struct net_backend ** ret,nvlist_t * nvl,net_be_rxeof_t cb,void * param)311 netbe_init(struct net_backend **ret, nvlist_t *nvl, net_be_rxeof_t cb,
312 void *param)
313 {
314 struct net_backend **pbe, *nbe, *tbe = NULL;
315 const char *value, *type;
316 char *devname;
317 int err;
318
319 value = get_config_value_node(nvl, "backend");
320 if (value == NULL) {
321 return (-1);
322 }
323 devname = strdup(value);
324
325 /*
326 * Use the type given by configuration if exists; otherwise
327 * use the prefix of the backend as the type.
328 */
329 type = get_config_value_node(nvl, "type");
330 if (type == NULL)
331 type = devname;
332
333 /*
334 * Find the network backend that matches the user-provided
335 * device name. net_backend_set is built using a linker set.
336 */
337 SET_FOREACH(pbe, net_backend_set) {
338 if (strncmp(type, (*pbe)->prefix,
339 strlen((*pbe)->prefix)) == 0) {
340 tbe = *pbe;
341 assert(tbe->init != NULL);
342 assert(tbe->cleanup != NULL);
343 assert(tbe->send != NULL);
344 assert(tbe->recv != NULL);
345 assert(tbe->get_cap != NULL);
346 assert(tbe->set_cap != NULL);
347 break;
348 }
349 }
350
351 *ret = NULL;
352 if (tbe == NULL) {
353 free(devname);
354 return (EINVAL);
355 }
356
357 nbe = calloc(1, NET_BE_SIZE(tbe));
358 *nbe = *tbe; /* copy the template */
359 nbe->fd = -1;
360 nbe->sc = param;
361 nbe->be_vnet_hdr_len = 0;
362 nbe->fe_vnet_hdr_len = 0;
363
364 /* Initialize the backend. */
365 err = nbe->init(nbe, devname, nvl, cb, param);
366 if (err) {
367 free(devname);
368 free(nbe);
369 return (err);
370 }
371
372 *ret = nbe;
373 free(devname);
374
375 return (0);
376 }
377
378 void
netbe_cleanup(struct net_backend * be)379 netbe_cleanup(struct net_backend *be)
380 {
381
382 if (be != NULL) {
383 be->cleanup(be);
384 free(be);
385 }
386 }
387
388 uint64_t
netbe_get_cap(struct net_backend * be)389 netbe_get_cap(struct net_backend *be)
390 {
391
392 assert(be != NULL);
393 return (be->get_cap(be));
394 }
395
396 int
netbe_set_cap(struct net_backend * be,uint64_t features,unsigned vnet_hdr_len)397 netbe_set_cap(struct net_backend *be, uint64_t features,
398 unsigned vnet_hdr_len)
399 {
400 int ret;
401
402 assert(be != NULL);
403
404 /* There are only three valid lengths, i.e., 0, 10 and 12. */
405 if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN
406 && vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t)))
407 return (-1);
408
409 be->fe_vnet_hdr_len = vnet_hdr_len;
410
411 ret = be->set_cap(be, features, vnet_hdr_len);
412 assert(be->be_vnet_hdr_len == 0 ||
413 be->be_vnet_hdr_len == be->fe_vnet_hdr_len);
414
415 return (ret);
416 }
417
418 ssize_t
netbe_send(struct net_backend * be,const struct iovec * iov,int iovcnt)419 netbe_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
420 {
421
422 return (be->send(be, iov, iovcnt));
423 }
424
425 ssize_t
netbe_peek_recvlen(struct net_backend * be)426 netbe_peek_recvlen(struct net_backend *be)
427 {
428
429 return (be->peek_recvlen(be));
430 }
431
432 /*
433 * Try to read a packet from the backend, without blocking.
434 * If no packets are available, return 0. In case of success, return
435 * the length of the packet just read. Return -1 in case of errors.
436 */
437 ssize_t
netbe_recv(struct net_backend * be,const struct iovec * iov,int iovcnt)438 netbe_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
439 {
440
441 return (be->recv(be, iov, iovcnt));
442 }
443
/*
 * Read a packet from the backend and throw it away.
 * The packet is received with netbe_recv() into a scratch buffer, and
 * the result of that call is returned: the size of the discarded
 * packet, zero if no packet was available, or a negative value in case
 * of read error.
 */
ssize_t
netbe_rx_discard(struct net_backend *be)
{
	/*
	 * MP note: the scratch buffer is only used to discard frames,
	 * so there is no need for it to be per-vtnet or locked.
	 * It is only made large enough for a TSO-sized segment.
	 */
	static uint8_t dummybuf[65536 + 64];
	struct iovec iov = {
		.iov_base = dummybuf,
		.iov_len = sizeof(dummybuf),
	};

	return (netbe_recv(be, &iov, 1));
}
465
466 void
netbe_rx_disable(struct net_backend * be)467 netbe_rx_disable(struct net_backend *be)
468 {
469
470 return be->recv_disable(be);
471 }
472
473 void
netbe_rx_enable(struct net_backend * be)474 netbe_rx_enable(struct net_backend *be)
475 {
476
477 return be->recv_enable(be);
478 }
479
480 size_t
netbe_get_vnet_hdr_len(struct net_backend * be)481 netbe_get_vnet_hdr_len(struct net_backend *be)
482 {
483
484 return (be->be_vnet_hdr_len);
485 }
486