1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2019 Vincenzo Maffione <vmaffione@FreeBSD.org> 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS 19 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, 20 * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT 21 * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 22 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 23 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 24 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 25 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28 /* 29 * This file implements multiple network backends (tap, netmap, ...), 30 * to be used by network frontends such as virtio-net and e1000. 31 * The API to access the backend (e.g. send/receive packets, negotiate 32 * features) is exported by net_backends.h. 33 */ 34 35 #include <sys/types.h> 36 #ifndef WITHOUT_CAPSICUM 37 #include <sys/capsicum.h> 38 #endif 39 #include <sys/ioctl.h> 40 #include <sys/mman.h> 41 #include <sys/uio.h> 42 43 #include <net/if.h> 44 #include <net/if_tap.h> 45 46 #include <assert.h> 47 #ifndef WITHOUT_CAPSICUM 48 #include <capsicum_helpers.h> 49 #endif 50 #include <err.h> 51 #include <errno.h> 52 #include <fcntl.h> 53 #include <poll.h> 54 #include <pthread.h> 55 #include <pthread_np.h> 56 #include <stdio.h> 57 #include <stdlib.h> 58 #include <stdint.h> 59 #include <string.h> 60 #include <sysexits.h> 61 #include <unistd.h> 62 63 #include "config.h" 64 #include "debug.h" 65 #include "iov.h" 66 #include "mevent.h" 67 #include "net_backends.h" 68 #include "net_backends_priv.h" 69 #include "pci_emul.h" 70 71 #define NET_BE_SIZE(be) (sizeof(*be) + (be)->priv_size) 72 73 void 74 tap_cleanup(struct net_backend *be) 75 { 76 struct tap_priv *priv = NET_BE_PRIV(be); 77 78 if (priv->mevp) { 79 mevent_delete(priv->mevp); 80 } 81 if (be->fd != -1) { 82 close(be->fd); 83 be->fd = -1; 84 } 85 } 86 87 static int 88 tap_init(struct net_backend *be, const char *devname, 89 nvlist_t *nvl __unused, net_be_rxeof_t cb, void *param) 90 { 91 struct tap_priv *priv = NET_BE_PRIV(be); 92 char tbuf[80]; 93 int opt = 1, up = IFF_UP; 94 95 #ifndef WITHOUT_CAPSICUM 96 cap_rights_t rights; 97 #endif 98 99 if (cb == NULL) { 100 EPRINTLN("TAP backend requires non-NULL callback"); 101 return (-1); 102 } 103 104 strcpy(tbuf, "/dev/"); 105 strlcat(tbuf, devname, sizeof(tbuf)); 106 107 be->fd = open(tbuf, O_RDWR); 108 if (be->fd == -1) { 109 EPRINTLN("open of tap device %s failed", tbuf); 110 goto error; 111 } 112 113 /* 114 * Set non-blocking and register for read 115 * notifications with the event loop 116 */ 117 if (ioctl(be->fd, FIONBIO, &opt) < 0) { 118 EPRINTLN("tap device O_NONBLOCK failed"); 119 goto error; 120 } 121 122 if (strncmp("ngd", be->prefix, 3) && 123 ioctl(be->fd, VMIO_SIOCSIFFLAGS, up)) { 124 EPRINTLN("tap device link up failed"); 125 goto error; 126 } 127 128 #ifndef WITHOUT_CAPSICUM 129 cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); 130 if (caph_rights_limit(be->fd, &rights) == -1) 131 errx(EX_OSERR, "Unable to apply rights for sandbox"); 132 #endif 133 134 memset(priv->bbuf, 0, sizeof(priv->bbuf)); 135 priv->bbuflen = 0; 136 137 priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param); 138 if (priv->mevp == NULL) { 139 EPRINTLN("Could not register event"); 140 goto error; 141 } 142 143 return (0); 144 145 error: 146 tap_cleanup(be); 147 return (-1); 148 } 149 150 /* 151 * Called to send a buffer chain out to the tap device 152 */ 153 ssize_t 154 tap_send(struct net_backend *be, const struct iovec *iov, int iovcnt) 155 { 156 return (writev(be->fd, iov, iovcnt)); 157 } 158 159 ssize_t 160 tap_peek_recvlen(struct net_backend *be) 161 { 162 struct tap_priv *priv = NET_BE_PRIV(be); 163 ssize_t ret; 164 165 if (priv->bbuflen > 0) { 166 /* 167 * We already have a packet in the bounce buffer. 168 * Just return its length. 169 */ 170 return priv->bbuflen; 171 } 172 173 /* 174 * Read the next packet (if any) into the bounce buffer, so 175 * that we get to know its length and we can return that 176 * to the caller. 177 */ 178 ret = read(be->fd, priv->bbuf, sizeof(priv->bbuf)); 179 if (ret < 0 && errno == EWOULDBLOCK) { 180 return (0); 181 } 182 183 if (ret > 0) 184 priv->bbuflen = ret; 185 186 return (ret); 187 } 188 189 ssize_t 190 tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt) 191 { 192 struct tap_priv *priv = NET_BE_PRIV(be); 193 ssize_t ret; 194 195 if (priv->bbuflen > 0) { 196 /* 197 * A packet is available in the bounce buffer, so 198 * we read it from there. 199 */ 200 ret = buf_to_iov(priv->bbuf, priv->bbuflen, 201 iov, iovcnt, 0); 202 203 /* Mark the bounce buffer as empty. */ 204 priv->bbuflen = 0; 205 206 return (ret); 207 } 208 209 ret = readv(be->fd, iov, iovcnt); 210 if (ret < 0 && errno == EWOULDBLOCK) { 211 return (0); 212 } 213 214 return (ret); 215 } 216 217 void 218 tap_recv_enable(struct net_backend *be) 219 { 220 struct tap_priv *priv = NET_BE_PRIV(be); 221 222 mevent_enable(priv->mevp); 223 } 224 225 void 226 tap_recv_disable(struct net_backend *be) 227 { 228 struct tap_priv *priv = NET_BE_PRIV(be); 229 230 mevent_disable(priv->mevp); 231 } 232 233 uint64_t 234 tap_get_cap(struct net_backend *be __unused) 235 { 236 237 return (0); /* no capabilities for now */ 238 } 239 240 int 241 tap_set_cap(struct net_backend *be __unused, uint64_t features, 242 unsigned vnet_hdr_len) 243 { 244 245 return ((features || vnet_hdr_len) ? -1 : 0); 246 } 247 248 static struct net_backend tap_backend = { 249 .prefix = "tap", 250 .priv_size = sizeof(struct tap_priv), 251 .init = tap_init, 252 .cleanup = tap_cleanup, 253 .send = tap_send, 254 .peek_recvlen = tap_peek_recvlen, 255 .recv = tap_recv, 256 .recv_enable = tap_recv_enable, 257 .recv_disable = tap_recv_disable, 258 .get_cap = tap_get_cap, 259 .set_cap = tap_set_cap, 260 }; 261 262 /* A clone of the tap backend, with a different prefix. */ 263 static struct net_backend vmnet_backend = { 264 .prefix = "vmnet", 265 .priv_size = sizeof(struct tap_priv), 266 .init = tap_init, 267 .cleanup = tap_cleanup, 268 .send = tap_send, 269 .peek_recvlen = tap_peek_recvlen, 270 .recv = tap_recv, 271 .recv_enable = tap_recv_enable, 272 .recv_disable = tap_recv_disable, 273 .get_cap = tap_get_cap, 274 .set_cap = tap_set_cap, 275 }; 276 277 /* A clone of the tap backend, with a different prefix. */ 278 static struct net_backend ngd_backend = { 279 .prefix = "ngd", 280 .priv_size = sizeof(struct tap_priv), 281 .init = tap_init, 282 .cleanup = tap_cleanup, 283 .send = tap_send, 284 .peek_recvlen = tap_peek_recvlen, 285 .recv = tap_recv, 286 .recv_enable = tap_recv_enable, 287 .recv_disable = tap_recv_disable, 288 .get_cap = tap_get_cap, 289 .set_cap = tap_set_cap, 290 }; 291 292 DATA_SET(net_backend_set, tap_backend); 293 DATA_SET(net_backend_set, vmnet_backend); 294 DATA_SET(net_backend_set, ngd_backend); 295 296 int 297 netbe_legacy_config(nvlist_t *nvl, const char *opts) 298 { 299 char *backend, *cp; 300 301 if (opts == NULL) 302 return (0); 303 304 cp = strchr(opts, ','); 305 if (cp == NULL) { 306 set_config_value_node(nvl, "backend", opts); 307 return (0); 308 } 309 backend = strndup(opts, cp - opts); 310 set_config_value_node(nvl, "backend", backend); 311 free(backend); 312 return (pci_parse_legacy_config(nvl, cp + 1)); 313 } 314 315 /* 316 * Initialize a backend and attach to the frontend. 317 * This is called during frontend initialization. 318 * @ret is a pointer to the backend to be initialized 319 * @devname is the backend-name as supplied on the command line, 320 * e.g. -s 2:0,frontend-name,backend-name[,other-args] 321 * @cb is the receive callback supplied by the frontend, 322 * and it is invoked in the event loop when a receive 323 * event is generated in the hypervisor, 324 * @param is a pointer to the frontend, and normally used as 325 * the argument for the callback. 326 */ 327 int 328 netbe_init(struct net_backend **ret, nvlist_t *nvl, net_be_rxeof_t cb, 329 void *param) 330 { 331 struct net_backend **pbe, *nbe, *tbe = NULL; 332 const char *value, *type; 333 char *devname; 334 int err; 335 336 value = get_config_value_node(nvl, "backend"); 337 if (value == NULL) { 338 return (-1); 339 } 340 devname = strdup(value); 341 342 /* 343 * Use the type given by configuration if exists; otherwise 344 * use the prefix of the backend as the type. 345 */ 346 type = get_config_value_node(nvl, "type"); 347 if (type == NULL) 348 type = devname; 349 350 /* 351 * Find the network backend that matches the user-provided 352 * device name. net_backend_set is built using a linker set. 353 */ 354 SET_FOREACH(pbe, net_backend_set) { 355 if (strncmp(type, (*pbe)->prefix, 356 strlen((*pbe)->prefix)) == 0) { 357 tbe = *pbe; 358 assert(tbe->init != NULL); 359 assert(tbe->cleanup != NULL); 360 assert(tbe->send != NULL); 361 assert(tbe->recv != NULL); 362 assert(tbe->get_cap != NULL); 363 assert(tbe->set_cap != NULL); 364 break; 365 } 366 } 367 368 *ret = NULL; 369 if (tbe == NULL) { 370 free(devname); 371 return (EINVAL); 372 } 373 374 nbe = calloc(1, NET_BE_SIZE(tbe)); 375 *nbe = *tbe; /* copy the template */ 376 nbe->fd = -1; 377 nbe->sc = param; 378 nbe->be_vnet_hdr_len = 0; 379 nbe->fe_vnet_hdr_len = 0; 380 381 /* Initialize the backend. */ 382 err = nbe->init(nbe, devname, nvl, cb, param); 383 if (err) { 384 free(devname); 385 free(nbe); 386 return (err); 387 } 388 389 *ret = nbe; 390 free(devname); 391 392 return (0); 393 } 394 395 void 396 netbe_cleanup(struct net_backend *be) 397 { 398 399 if (be != NULL) { 400 be->cleanup(be); 401 free(be); 402 } 403 } 404 405 uint64_t 406 netbe_get_cap(struct net_backend *be) 407 { 408 409 assert(be != NULL); 410 return (be->get_cap(be)); 411 } 412 413 int 414 netbe_set_cap(struct net_backend *be, uint64_t features, 415 unsigned vnet_hdr_len) 416 { 417 int ret; 418 419 assert(be != NULL); 420 421 /* There are only three valid lengths, i.e., 0, 10 and 12. */ 422 if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN 423 && vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t))) 424 return (-1); 425 426 be->fe_vnet_hdr_len = vnet_hdr_len; 427 428 ret = be->set_cap(be, features, vnet_hdr_len); 429 assert(be->be_vnet_hdr_len == 0 || 430 be->be_vnet_hdr_len == be->fe_vnet_hdr_len); 431 432 return (ret); 433 } 434 435 ssize_t 436 netbe_send(struct net_backend *be, const struct iovec *iov, int iovcnt) 437 { 438 439 return (be->send(be, iov, iovcnt)); 440 } 441 442 ssize_t 443 netbe_peek_recvlen(struct net_backend *be) 444 { 445 446 return (be->peek_recvlen(be)); 447 } 448 449 /* 450 * Try to read a packet from the backend, without blocking. 451 * If no packets are available, return 0. In case of success, return 452 * the length of the packet just read. Return -1 in case of errors. 453 */ 454 ssize_t 455 netbe_recv(struct net_backend *be, const struct iovec *iov, int iovcnt) 456 { 457 458 return (be->recv(be, iov, iovcnt)); 459 } 460 461 /* 462 * Read a packet from the backend and discard it. 463 * Returns the size of the discarded packet or zero if no packet was available. 464 * A negative error code is returned in case of read error. 465 */ 466 ssize_t 467 netbe_rx_discard(struct net_backend *be) 468 { 469 /* 470 * MP note: the dummybuf is only used to discard frames, 471 * so there is no need for it to be per-vtnet or locked. 472 * We only make it large enough for TSO-sized segment. 473 */ 474 static uint8_t dummybuf[65536 + 64]; 475 struct iovec iov; 476 477 iov.iov_base = dummybuf; 478 iov.iov_len = sizeof(dummybuf); 479 480 return netbe_recv(be, &iov, 1); 481 } 482 483 void 484 netbe_rx_disable(struct net_backend *be) 485 { 486 487 return be->recv_disable(be); 488 } 489 490 void 491 netbe_rx_enable(struct net_backend *be) 492 { 493 494 return be->recv_enable(be); 495 } 496 497 size_t 498 netbe_get_vnet_hdr_len(struct net_backend *be) 499 { 500 501 return (be->be_vnet_hdr_len); 502 } 503