1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2019 Vincenzo Maffione <vmaffione@FreeBSD.org> 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS 19 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, 20 * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT 21 * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 22 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 23 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 24 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 25 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28 /* 29 * This file implements multiple network backends (tap, netmap, ...), 30 * to be used by network frontends such as virtio-net and e1000. 31 * The API to access the backend (e.g. send/receive packets, negotiate 32 * features) is exported by net_backends.h. 33 */ 34 35 #include <sys/types.h> 36 #ifndef WITHOUT_CAPSICUM 37 #include <sys/capsicum.h> 38 #endif 39 #include <sys/ioctl.h> 40 #include <sys/mman.h> 41 #include <sys/uio.h> 42 43 #include <net/if.h> 44 #include <net/if_tap.h> 45 46 #include <assert.h> 47 #ifndef WITHOUT_CAPSICUM 48 #include <capsicum_helpers.h> 49 #endif 50 #include <err.h> 51 #include <errno.h> 52 #include <fcntl.h> 53 #include <poll.h> 54 #include <pthread.h> 55 #include <pthread_np.h> 56 #include <stdio.h> 57 #include <stdlib.h> 58 #include <stdint.h> 59 #include <string.h> 60 #include <sysexits.h> 61 #include <unistd.h> 62 63 #include "config.h" 64 #include "debug.h" 65 #include "iov.h" 66 #include "mevent.h" 67 #include "net_backends.h" 68 #include "net_backends_priv.h" 69 #include "pci_emul.h" 70 71 #define NET_BE_SIZE(be) (sizeof(*be) + (be)->priv_size) 72 73 void 74 tap_cleanup(struct net_backend *be) 75 { 76 struct tap_priv *priv = NET_BE_PRIV(be); 77 78 if (priv->mevp) { 79 mevent_delete(priv->mevp); 80 } 81 if (be->fd != -1) { 82 close(be->fd); 83 be->fd = -1; 84 } 85 } 86 87 static int 88 tap_init(struct net_backend *be, const char *devname, 89 nvlist_t *nvl __unused, net_be_rxeof_t cb, void *param) 90 { 91 struct tap_priv *priv = NET_BE_PRIV(be); 92 char tbuf[80]; 93 int opt = 1, up = IFF_UP; 94 95 #ifndef WITHOUT_CAPSICUM 96 cap_rights_t rights; 97 #endif 98 99 if (cb == NULL) { 100 EPRINTLN("TAP backend requires non-NULL callback"); 101 return (-1); 102 } 103 104 strcpy(tbuf, "/dev/"); 105 strlcat(tbuf, devname, sizeof(tbuf)); 106 107 be->fd = open(tbuf, O_RDWR); 108 if (be->fd == -1) { 109 EPRINTLN("open of tap device %s failed", tbuf); 110 goto error; 111 } 112 113 /* 114 * Set non-blocking and register for read 115 * notifications with the event loop 116 */ 117 if (ioctl(be->fd, FIONBIO, &opt) < 0) { 118 EPRINTLN("tap device O_NONBLOCK failed"); 119 goto error; 120 } 121 122 if (ioctl(be->fd, VMIO_SIOCSIFFLAGS, up)) { 123 EPRINTLN("tap device link up failed"); 124 goto error; 125 } 126 127 #ifndef WITHOUT_CAPSICUM 128 cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); 129 if (caph_rights_limit(be->fd, &rights) == -1) 130 errx(EX_OSERR, "Unable to apply rights for sandbox"); 131 #endif 132 133 memset(priv->bbuf, 0, sizeof(priv->bbuf)); 134 priv->bbuflen = 0; 135 136 priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param); 137 if (priv->mevp == NULL) { 138 EPRINTLN("Could not register event"); 139 goto error; 140 } 141 142 return (0); 143 144 error: 145 tap_cleanup(be); 146 return (-1); 147 } 148 149 /* 150 * Called to send a buffer chain out to the tap device 151 */ 152 ssize_t 153 tap_send(struct net_backend *be, const struct iovec *iov, int iovcnt) 154 { 155 return (writev(be->fd, iov, iovcnt)); 156 } 157 158 ssize_t 159 tap_peek_recvlen(struct net_backend *be) 160 { 161 struct tap_priv *priv = NET_BE_PRIV(be); 162 ssize_t ret; 163 164 if (priv->bbuflen > 0) { 165 /* 166 * We already have a packet in the bounce buffer. 167 * Just return its length. 168 */ 169 return priv->bbuflen; 170 } 171 172 /* 173 * Read the next packet (if any) into the bounce buffer, so 174 * that we get to know its length and we can return that 175 * to the caller. 176 */ 177 ret = read(be->fd, priv->bbuf, sizeof(priv->bbuf)); 178 if (ret < 0 && errno == EWOULDBLOCK) { 179 return (0); 180 } 181 182 if (ret > 0) 183 priv->bbuflen = ret; 184 185 return (ret); 186 } 187 188 ssize_t 189 tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt) 190 { 191 struct tap_priv *priv = NET_BE_PRIV(be); 192 ssize_t ret; 193 194 if (priv->bbuflen > 0) { 195 /* 196 * A packet is available in the bounce buffer, so 197 * we read it from there. 198 */ 199 ret = buf_to_iov(priv->bbuf, priv->bbuflen, 200 iov, iovcnt, 0); 201 202 /* Mark the bounce buffer as empty. */ 203 priv->bbuflen = 0; 204 205 return (ret); 206 } 207 208 ret = readv(be->fd, iov, iovcnt); 209 if (ret < 0 && errno == EWOULDBLOCK) { 210 return (0); 211 } 212 213 return (ret); 214 } 215 216 void 217 tap_recv_enable(struct net_backend *be) 218 { 219 struct tap_priv *priv = NET_BE_PRIV(be); 220 221 mevent_enable(priv->mevp); 222 } 223 224 void 225 tap_recv_disable(struct net_backend *be) 226 { 227 struct tap_priv *priv = NET_BE_PRIV(be); 228 229 mevent_disable(priv->mevp); 230 } 231 232 uint64_t 233 tap_get_cap(struct net_backend *be __unused) 234 { 235 236 return (0); /* no capabilities for now */ 237 } 238 239 int 240 tap_set_cap(struct net_backend *be __unused, uint64_t features, 241 unsigned vnet_hdr_len) 242 { 243 244 return ((features || vnet_hdr_len) ? -1 : 0); 245 } 246 247 static struct net_backend tap_backend = { 248 .prefix = "tap", 249 .priv_size = sizeof(struct tap_priv), 250 .init = tap_init, 251 .cleanup = tap_cleanup, 252 .send = tap_send, 253 .peek_recvlen = tap_peek_recvlen, 254 .recv = tap_recv, 255 .recv_enable = tap_recv_enable, 256 .recv_disable = tap_recv_disable, 257 .get_cap = tap_get_cap, 258 .set_cap = tap_set_cap, 259 }; 260 261 /* A clone of the tap backend, with a different prefix. */ 262 static struct net_backend vmnet_backend = { 263 .prefix = "vmnet", 264 .priv_size = sizeof(struct tap_priv), 265 .init = tap_init, 266 .cleanup = tap_cleanup, 267 .send = tap_send, 268 .peek_recvlen = tap_peek_recvlen, 269 .recv = tap_recv, 270 .recv_enable = tap_recv_enable, 271 .recv_disable = tap_recv_disable, 272 .get_cap = tap_get_cap, 273 .set_cap = tap_set_cap, 274 }; 275 276 DATA_SET(net_backend_set, tap_backend); 277 DATA_SET(net_backend_set, vmnet_backend); 278 279 int 280 netbe_legacy_config(nvlist_t *nvl, const char *opts) 281 { 282 char *backend, *cp; 283 284 if (opts == NULL) 285 return (0); 286 287 cp = strchr(opts, ','); 288 if (cp == NULL) { 289 set_config_value_node(nvl, "backend", opts); 290 return (0); 291 } 292 backend = strndup(opts, cp - opts); 293 set_config_value_node(nvl, "backend", backend); 294 free(backend); 295 return (pci_parse_legacy_config(nvl, cp + 1)); 296 } 297 298 /* 299 * Initialize a backend and attach to the frontend. 300 * This is called during frontend initialization. 301 * @ret is a pointer to the backend to be initialized 302 * @devname is the backend-name as supplied on the command line, 303 * e.g. -s 2:0,frontend-name,backend-name[,other-args] 304 * @cb is the receive callback supplied by the frontend, 305 * and it is invoked in the event loop when a receive 306 * event is generated in the hypervisor, 307 * @param is a pointer to the frontend, and normally used as 308 * the argument for the callback. 309 */ 310 int 311 netbe_init(struct net_backend **ret, nvlist_t *nvl, net_be_rxeof_t cb, 312 void *param) 313 { 314 struct net_backend **pbe, *nbe, *tbe = NULL; 315 const char *value, *type; 316 char *devname; 317 int err; 318 319 value = get_config_value_node(nvl, "backend"); 320 if (value == NULL) { 321 return (-1); 322 } 323 devname = strdup(value); 324 325 /* 326 * Use the type given by configuration if exists; otherwise 327 * use the prefix of the backend as the type. 328 */ 329 type = get_config_value_node(nvl, "type"); 330 if (type == NULL) 331 type = devname; 332 333 /* 334 * Find the network backend that matches the user-provided 335 * device name. net_backend_set is built using a linker set. 336 */ 337 SET_FOREACH(pbe, net_backend_set) { 338 if (strncmp(type, (*pbe)->prefix, 339 strlen((*pbe)->prefix)) == 0) { 340 tbe = *pbe; 341 assert(tbe->init != NULL); 342 assert(tbe->cleanup != NULL); 343 assert(tbe->send != NULL); 344 assert(tbe->recv != NULL); 345 assert(tbe->get_cap != NULL); 346 assert(tbe->set_cap != NULL); 347 break; 348 } 349 } 350 351 *ret = NULL; 352 if (tbe == NULL) { 353 free(devname); 354 return (EINVAL); 355 } 356 357 nbe = calloc(1, NET_BE_SIZE(tbe)); 358 *nbe = *tbe; /* copy the template */ 359 nbe->fd = -1; 360 nbe->sc = param; 361 nbe->be_vnet_hdr_len = 0; 362 nbe->fe_vnet_hdr_len = 0; 363 364 /* Initialize the backend. */ 365 err = nbe->init(nbe, devname, nvl, cb, param); 366 if (err) { 367 free(devname); 368 free(nbe); 369 return (err); 370 } 371 372 *ret = nbe; 373 free(devname); 374 375 return (0); 376 } 377 378 void 379 netbe_cleanup(struct net_backend *be) 380 { 381 382 if (be != NULL) { 383 be->cleanup(be); 384 free(be); 385 } 386 } 387 388 uint64_t 389 netbe_get_cap(struct net_backend *be) 390 { 391 392 assert(be != NULL); 393 return (be->get_cap(be)); 394 } 395 396 int 397 netbe_set_cap(struct net_backend *be, uint64_t features, 398 unsigned vnet_hdr_len) 399 { 400 int ret; 401 402 assert(be != NULL); 403 404 /* There are only three valid lengths, i.e., 0, 10 and 12. */ 405 if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN 406 && vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t))) 407 return (-1); 408 409 be->fe_vnet_hdr_len = vnet_hdr_len; 410 411 ret = be->set_cap(be, features, vnet_hdr_len); 412 assert(be->be_vnet_hdr_len == 0 || 413 be->be_vnet_hdr_len == be->fe_vnet_hdr_len); 414 415 return (ret); 416 } 417 418 ssize_t 419 netbe_send(struct net_backend *be, const struct iovec *iov, int iovcnt) 420 { 421 422 return (be->send(be, iov, iovcnt)); 423 } 424 425 ssize_t 426 netbe_peek_recvlen(struct net_backend *be) 427 { 428 429 return (be->peek_recvlen(be)); 430 } 431 432 /* 433 * Try to read a packet from the backend, without blocking. 434 * If no packets are available, return 0. In case of success, return 435 * the length of the packet just read. Return -1 in case of errors. 436 */ 437 ssize_t 438 netbe_recv(struct net_backend *be, const struct iovec *iov, int iovcnt) 439 { 440 441 return (be->recv(be, iov, iovcnt)); 442 } 443 444 /* 445 * Read a packet from the backend and discard it. 446 * Returns the size of the discarded packet or zero if no packet was available. 447 * A negative error code is returned in case of read error. 448 */ 449 ssize_t 450 netbe_rx_discard(struct net_backend *be) 451 { 452 /* 453 * MP note: the dummybuf is only used to discard frames, 454 * so there is no need for it to be per-vtnet or locked. 455 * We only make it large enough for TSO-sized segment. 456 */ 457 static uint8_t dummybuf[65536 + 64]; 458 struct iovec iov; 459 460 iov.iov_base = dummybuf; 461 iov.iov_len = sizeof(dummybuf); 462 463 return netbe_recv(be, &iov, 1); 464 } 465 466 void 467 netbe_rx_disable(struct net_backend *be) 468 { 469 470 return be->recv_disable(be); 471 } 472 473 void 474 netbe_rx_enable(struct net_backend *be) 475 { 476 477 return be->recv_enable(be); 478 } 479 480 size_t 481 netbe_get_vnet_hdr_len(struct net_backend *be) 482 { 483 484 return (be->be_vnet_hdr_len); 485 } 486