1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2019 Vincenzo Maffione <vmaffione@FreeBSD.org> 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS 19 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, 20 * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT 21 * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 22 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 23 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 24 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 25 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28 /* 29 * This file implements multiple network backends (tap, netmap, ...), 30 * to be used by network frontends such as virtio-net and e1000. 31 * The API to access the backend (e.g. send/receive packets, negotiate 32 * features) is exported by net_backends.h. 33 */ 34 35 #include <sys/types.h> 36 #ifndef WITHOUT_CAPSICUM 37 #include <sys/capsicum.h> 38 #endif 39 #include <sys/ioctl.h> 40 #include <sys/mman.h> 41 #include <sys/uio.h> 42 43 #include <net/if.h> 44 #ifdef __FreeBSD__ 45 #include <net/if_tap.h> 46 #endif 47 48 #include <assert.h> 49 #ifndef WITHOUT_CAPSICUM 50 #include <capsicum_helpers.h> 51 #endif 52 #include <err.h> 53 #include <errno.h> 54 #include <fcntl.h> 55 #include <poll.h> 56 #include <pthread.h> 57 #include <pthread_np.h> 58 #include <stdio.h> 59 #include <stdlib.h> 60 #include <stdint.h> 61 #include <string.h> 62 #include <unistd.h> 63 #include <sysexits.h> 64 #include <unistd.h> 65 66 #include "config.h" 67 #include "debug.h" 68 #include "iov.h" 69 #include "mevent.h" 70 #include "net_backends.h" 71 #include "net_backends_priv.h" 72 #include "pci_emul.h" 73 74 #define NET_BE_SIZE(be) (sizeof(*be) + (be)->priv_size) 75 76 #ifdef __FreeBSD__ 77 void 78 tap_cleanup(struct net_backend *be) 79 { 80 struct tap_priv *priv = NET_BE_PRIV(be); 81 82 if (priv->mevp) { 83 mevent_delete(priv->mevp); 84 } 85 if (be->fd != -1) { 86 close(be->fd); 87 be->fd = -1; 88 } 89 } 90 91 static int 92 tap_init(struct net_backend *be, const char *devname, 93 nvlist_t *nvl __unused, net_be_rxeof_t cb, void *param) 94 { 95 struct tap_priv *priv = NET_BE_PRIV(be); 96 char tbuf[80]; 97 int opt = 1, up = IFF_UP; 98 99 #ifndef WITHOUT_CAPSICUM 100 cap_rights_t rights; 101 #endif 102 103 if (cb == NULL) { 104 EPRINTLN("TAP backend requires non-NULL callback"); 105 return (-1); 106 } 107 108 strcpy(tbuf, "/dev/"); 109 strlcat(tbuf, devname, sizeof(tbuf)); 110 111 be->fd = open(tbuf, O_RDWR); 112 if (be->fd == -1) { 113 EPRINTLN("open of tap device %s failed", tbuf); 114 goto error; 115 } 116 117 /* 118 * Set non-blocking and register for read 119 * notifications with the event loop 120 */ 121 if (ioctl(be->fd, FIONBIO, &opt) < 0) { 122 EPRINTLN("tap device O_NONBLOCK failed"); 123 goto error; 124 } 125 126 if (ioctl(be->fd, VMIO_SIOCSIFFLAGS, up)) { 127 EPRINTLN("tap device link up failed"); 128 goto error; 129 } 130 131 #ifndef WITHOUT_CAPSICUM 132 cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); 133 if (caph_rights_limit(be->fd, &rights) == -1) 134 errx(EX_OSERR, "Unable to apply rights for sandbox"); 135 #endif 136 137 memset(priv->bbuf, 0, sizeof(priv->bbuf)); 138 priv->bbuflen = 0; 139 140 priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param); 141 if (priv->mevp == NULL) { 142 EPRINTLN("Could not register event"); 143 goto error; 144 } 145 146 return (0); 147 148 error: 149 tap_cleanup(be); 150 return (-1); 151 } 152 153 /* 154 * Called to send a buffer chain out to the tap device 155 */ 156 ssize_t 157 tap_send(struct net_backend *be, const struct iovec *iov, int iovcnt) 158 { 159 return (writev(be->fd, iov, iovcnt)); 160 } 161 162 ssize_t 163 tap_peek_recvlen(struct net_backend *be) 164 { 165 struct tap_priv *priv = NET_BE_PRIV(be); 166 ssize_t ret; 167 168 if (priv->bbuflen > 0) { 169 /* 170 * We already have a packet in the bounce buffer. 171 * Just return its length. 172 */ 173 return priv->bbuflen; 174 } 175 176 /* 177 * Read the next packet (if any) into the bounce buffer, so 178 * that we get to know its length and we can return that 179 * to the caller. 180 */ 181 ret = read(be->fd, priv->bbuf, sizeof(priv->bbuf)); 182 if (ret < 0 && errno == EWOULDBLOCK) { 183 return (0); 184 } 185 186 if (ret > 0) 187 priv->bbuflen = ret; 188 189 return (ret); 190 } 191 192 ssize_t 193 tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt) 194 { 195 struct tap_priv *priv = NET_BE_PRIV(be); 196 ssize_t ret; 197 198 if (priv->bbuflen > 0) { 199 /* 200 * A packet is available in the bounce buffer, so 201 * we read it from there. 202 */ 203 ret = buf_to_iov(priv->bbuf, priv->bbuflen, 204 iov, iovcnt, 0); 205 206 /* Mark the bounce buffer as empty. */ 207 priv->bbuflen = 0; 208 209 return (ret); 210 } 211 212 ret = readv(be->fd, iov, iovcnt); 213 if (ret < 0 && errno == EWOULDBLOCK) { 214 return (0); 215 } 216 217 return (ret); 218 } 219 220 void 221 tap_recv_enable(struct net_backend *be) 222 { 223 struct tap_priv *priv = NET_BE_PRIV(be); 224 225 mevent_enable(priv->mevp); 226 } 227 228 void 229 tap_recv_disable(struct net_backend *be) 230 { 231 struct tap_priv *priv = NET_BE_PRIV(be); 232 233 mevent_disable(priv->mevp); 234 } 235 236 uint64_t 237 tap_get_cap(struct net_backend *be __unused) 238 { 239 240 return (0); /* no capabilities for now */ 241 } 242 243 int 244 tap_set_cap(struct net_backend *be __unused, uint64_t features, 245 unsigned vnet_hdr_len) 246 { 247 248 return ((features || vnet_hdr_len) ? -1 : 0); 249 } 250 251 static struct net_backend tap_backend = { 252 .prefix = "tap", 253 .priv_size = sizeof(struct tap_priv), 254 .init = tap_init, 255 .cleanup = tap_cleanup, 256 .send = tap_send, 257 .peek_recvlen = tap_peek_recvlen, 258 .recv = tap_recv, 259 .recv_enable = tap_recv_enable, 260 .recv_disable = tap_recv_disable, 261 .get_cap = tap_get_cap, 262 .set_cap = tap_set_cap, 263 }; 264 265 /* A clone of the tap backend, with a different prefix. */ 266 static struct net_backend vmnet_backend = { 267 .prefix = "vmnet", 268 .priv_size = sizeof(struct tap_priv), 269 .init = tap_init, 270 .cleanup = tap_cleanup, 271 .send = tap_send, 272 .peek_recvlen = tap_peek_recvlen, 273 .recv = tap_recv, 274 .recv_enable = tap_recv_enable, 275 .recv_disable = tap_recv_disable, 276 .get_cap = tap_get_cap, 277 .set_cap = tap_set_cap, 278 }; 279 280 DATA_SET(net_backend_set, tap_backend); 281 DATA_SET(net_backend_set, vmnet_backend); 282 #endif /* __FreeBSD__ */ 283 284 #ifdef __FreeBSD__ 285 int 286 netbe_legacy_config(nvlist_t *nvl, const char *opts) 287 { 288 char *backend, *cp; 289 290 if (opts == NULL) 291 return (0); 292 293 cp = strchr(opts, ','); 294 if (cp == NULL) { 295 set_config_value_node(nvl, "backend", opts); 296 return (0); 297 } 298 backend = strndup(opts, cp - opts); 299 set_config_value_node(nvl, "backend", backend); 300 free(backend); 301 return (pci_parse_legacy_config(nvl, cp + 1)); 302 } 303 #else 304 int 305 netbe_legacy_config(nvlist_t *nvl, const char *opts) 306 { 307 char *config, *name, *tofree, *value; 308 309 if (opts == NULL) 310 return (0); 311 312 /* Default to the 'dlpi' backend - can still be overridden by opts */ 313 set_config_value_node(nvl, "backend", "dlpi"); 314 set_config_value_node(nvl, "type", "dlpi"); 315 316 config = tofree = strdup(opts); 317 if (config == NULL) 318 err(4, "netbe_legacy_config strdup()"); 319 while ((name = strsep(&config, ",")) != NULL) { 320 value = strchr(name, '='); 321 if (value != NULL) { 322 *value++ = '\0'; 323 set_config_value_node(nvl, name, value); 324 } else { 325 set_config_value_node(nvl, "vnic", name); 326 } 327 } 328 free(tofree); 329 330 return (0); 331 } 332 #endif 333 334 /* 335 * Initialize a backend and attach to the frontend. 336 * This is called during frontend initialization. 337 * @ret is a pointer to the backend to be initialized 338 * @devname is the backend-name as supplied on the command line, 339 * e.g. -s 2:0,frontend-name,backend-name[,other-args] 340 * @cb is the receive callback supplied by the frontend, 341 * and it is invoked in the event loop when a receive 342 * event is generated in the hypervisor, 343 * @param is a pointer to the frontend, and normally used as 344 * the argument for the callback. 345 */ 346 int 347 netbe_init(struct net_backend **ret, nvlist_t *nvl, net_be_rxeof_t cb, 348 void *param) 349 { 350 struct net_backend **pbe, *nbe, *tbe = NULL; 351 const char *value, *type; 352 char *devname; 353 int err; 354 355 value = get_config_value_node(nvl, "backend"); 356 if (value == NULL) { 357 return (-1); 358 } 359 devname = strdup(value); 360 361 /* 362 * Use the type given by configuration if exists; otherwise 363 * use the prefix of the backend as the type. 364 */ 365 type = get_config_value_node(nvl, "type"); 366 if (type == NULL) 367 type = devname; 368 369 /* 370 * Find the network backend that matches the user-provided 371 * device name. net_backend_set is built using a linker set. 372 */ 373 SET_FOREACH(pbe, net_backend_set) { 374 if (strncmp(type, (*pbe)->prefix, 375 strlen((*pbe)->prefix)) == 0) { 376 tbe = *pbe; 377 assert(tbe->init != NULL); 378 assert(tbe->cleanup != NULL); 379 assert(tbe->send != NULL); 380 assert(tbe->recv != NULL); 381 assert(tbe->get_cap != NULL); 382 assert(tbe->set_cap != NULL); 383 break; 384 } 385 } 386 387 *ret = NULL; 388 if (tbe == NULL) { 389 free(devname); 390 return (EINVAL); 391 } 392 393 nbe = calloc(1, NET_BE_SIZE(tbe)); 394 *nbe = *tbe; /* copy the template */ 395 nbe->fd = -1; 396 nbe->sc = param; 397 nbe->be_vnet_hdr_len = 0; 398 nbe->fe_vnet_hdr_len = 0; 399 400 /* Initialize the backend. */ 401 err = nbe->init(nbe, devname, nvl, cb, param); 402 if (err) { 403 free(devname); 404 free(nbe); 405 return (err); 406 } 407 408 *ret = nbe; 409 free(devname); 410 411 return (0); 412 } 413 414 void 415 netbe_cleanup(struct net_backend *be) 416 { 417 418 if (be != NULL) { 419 be->cleanup(be); 420 free(be); 421 } 422 } 423 424 uint64_t 425 netbe_get_cap(struct net_backend *be) 426 { 427 428 assert(be != NULL); 429 return (be->get_cap(be)); 430 } 431 432 int 433 netbe_set_cap(struct net_backend *be, uint64_t features, 434 unsigned vnet_hdr_len) 435 { 436 int ret; 437 438 assert(be != NULL); 439 440 /* There are only three valid lengths, i.e., 0, 10 and 12. */ 441 if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN 442 && vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t))) 443 return (-1); 444 445 be->fe_vnet_hdr_len = vnet_hdr_len; 446 447 ret = be->set_cap(be, features, vnet_hdr_len); 448 assert(be->be_vnet_hdr_len == 0 || 449 be->be_vnet_hdr_len == be->fe_vnet_hdr_len); 450 451 return (ret); 452 } 453 454 ssize_t 455 netbe_send(struct net_backend *be, const struct iovec *iov, int iovcnt) 456 { 457 458 return (be->send(be, iov, iovcnt)); 459 } 460 461 ssize_t 462 netbe_peek_recvlen(struct net_backend *be) 463 { 464 465 return (be->peek_recvlen(be)); 466 } 467 468 /* 469 * Try to read a packet from the backend, without blocking. 470 * If no packets are available, return 0. In case of success, return 471 * the length of the packet just read. Return -1 in case of errors. 472 */ 473 ssize_t 474 netbe_recv(struct net_backend *be, const struct iovec *iov, int iovcnt) 475 { 476 477 return (be->recv(be, iov, iovcnt)); 478 } 479 480 /* 481 * Read a packet from the backend and discard it. 482 * Returns the size of the discarded packet or zero if no packet was available. 483 * A negative error code is returned in case of read error. 484 */ 485 ssize_t 486 netbe_rx_discard(struct net_backend *be) 487 { 488 /* 489 * MP note: the dummybuf is only used to discard frames, 490 * so there is no need for it to be per-vtnet or locked. 491 * We only make it large enough for TSO-sized segment. 492 */ 493 static uint8_t dummybuf[65536 + 64]; 494 struct iovec iov; 495 496 #ifdef __FreeBSD__ 497 iov.iov_base = dummybuf; 498 #else 499 iov.iov_base = (caddr_t)dummybuf; 500 #endif 501 iov.iov_len = sizeof(dummybuf); 502 503 return netbe_recv(be, &iov, 1); 504 } 505 506 void 507 netbe_rx_disable(struct net_backend *be) 508 { 509 510 return be->recv_disable(be); 511 } 512 513 void 514 netbe_rx_enable(struct net_backend *be) 515 { 516 517 return be->recv_enable(be); 518 } 519 520 size_t 521 netbe_get_vnet_hdr_len(struct net_backend *be) 522 { 523 524 return (be->be_vnet_hdr_len); 525 } 526 527 #ifndef __FreeBSD__ 528 int 529 netbe_get_mac(net_backend_t *be, void *buf, size_t *buflen) 530 { 531 if (be->get_mac == NULL) 532 return (ENOTSUP); 533 return (be->get_mac(be, buf, buflen)); 534 } 535 #endif 536