/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2019 Vincenzo Maffione <vmaffione@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
 * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
 * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * $FreeBSD$
 */

/*
 * This file implements multiple network backends (tap, netmap, ...),
 * to be used by network frontends such as virtio-net and e1000.
 * The API to access the backend (e.g. send/receive packets, negotiate
 * features) is exported by net_backends.h.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/types.h>		/* u_short etc */
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/uio.h>

#include <net/if.h>
#if defined(INET6) || defined(INET)
#include <net/if_tap.h>
#endif
#include <net/netmap.h>
#include <net/netmap_virt.h>
#define NETMAP_WITH_LIBS
#include <net/netmap_user.h>

#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sysexits.h>
#include <assert.h>
#include <pthread.h>
#include <pthread_np.h>
#include <poll.h>

#ifdef NETGRAPH
#include <sys/param.h>
#include <sys/sysctl.h>
#include <netgraph.h>
#endif

#include "config.h"
#include "debug.h"
#include "iov.h"
#include "mevent.h"
#include "net_backends.h"
#include "pci_emul.h"

#include <sys/linker_set.h>

/*
 * Each network backend registers a set of function pointers that are
 * used to implement the net backends API.
 * This might need to be exposed if we implement backends in separate files.
 */
struct net_backend {
	const char *prefix;	/* prefix matching this backend */

	/*
	 * Routines used to initialize and cleanup the resources needed
	 * by a backend. The cleanup function is used internally,
	 * and should not be called by the frontend.
	 */
	int (*init)(struct net_backend *be, const char *devname,
	    nvlist_t *nvl, net_be_rxeof_t cb, void *param);
	void (*cleanup)(struct net_backend *be);
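
	/*
	 * Note: the 'cb' argument passed to init() is dispatched from the
	 * mevent loop. A minimal sketch of a frontend callback, assuming
	 * the net_be_rxeof_t typedef in net_backends.h follows the usual
	 * mevent signature:
	 *
	 *	static void
	 *	my_rxeof(int fd, enum ev_type type, void *param)
	 *	{
	 *		struct my_softc *sc = param;	// hypothetical softc
	 *
	 *		// drain pending packets with netbe_recv() here
	 *	}
	 */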

	/*
	 * Called to serve a guest transmit request. The scatter-gather
	 * vector provided by the caller has 'iovcnt' elements and contains
	 * the packet to send.
	 */
	ssize_t (*send)(struct net_backend *be, const struct iovec *iov,
	    int iovcnt);

	/*
	 * Get the length of the next packet that can be received from
	 * the backend. If no packets are currently available, this
	 * function returns 0.
	 */
	ssize_t (*peek_recvlen)(struct net_backend *be);

	/*
	 * Called to receive a packet from the backend. When the function
	 * returns a positive value 'len', the scatter-gather vector
	 * provided by the caller contains a packet of that length.
	 * The function returns 0 if the backend doesn't have a new packet to
	 * receive.
	 */
	ssize_t (*recv)(struct net_backend *be, const struct iovec *iov,
	    int iovcnt);

	/*
	 * Ask the backend to enable or disable receive operation in the
	 * backend. On return from a disable operation, it is guaranteed
	 * that the receive callback won't be called until receive is
	 * enabled again. Note however that it is up to the caller to make
	 * sure that netbe_recv() is not currently being executed by another
	 * thread.
	 */
	void (*recv_enable)(struct net_backend *be);
	void (*recv_disable)(struct net_backend *be);

	/*
	 * Ask the backend for the virtio-net features it is able to
	 * support. Possible features are TSO, UFO and checksum offloading
	 * in both rx and tx direction and for both IPv4 and IPv6.
	 */
	uint64_t (*get_cap)(struct net_backend *be);

	/*
	 * Tell the backend to enable/disable the specified virtio-net
	 * features (capabilities).
	 */
	int (*set_cap)(struct net_backend *be, uint64_t features,
	    unsigned int vnet_hdr_len);

	struct pci_vtnet_softc *sc;
	int fd;

	/*
	 * Length of the virtio-net header used by the backend and the
	 * frontend, respectively. A zero value means that the header
	 * is not used.
	 */
	unsigned int be_vnet_hdr_len;
	unsigned int fe_vnet_hdr_len;

	/* Size of backend-specific private data. */
	size_t priv_size;

	/* Backend-specific private data follows. */
};

#define	NET_BE_PRIV(be)		((void *)((be) + 1))
#define	NET_BE_SIZE(be)		(sizeof(*be) + (be)->priv_size)

SET_DECLARE(net_backend_set, struct net_backend);

#define	VNET_HDR_LEN	sizeof(struct virtio_net_rxhdr)

#define	WPRINTF(params) PRINTLN params

/*
 * The tap backend
 */

#if defined(INET6) || defined(INET)
static const int pf_list[] = {
#if defined(INET6)
	PF_INET6,
#endif
#if defined(INET)
	PF_INET,
#endif
};
#endif

struct tap_priv {
	struct mevent *mevp;
	/*
	 * A bounce buffer that allows us to implement the peek_recvlen
	 * callback. In the future we may get the same information from
	 * the kevent data.
	 */
	char bbuf[1 << 16];
	ssize_t bbuflen;
};
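
/*
 * struct tap_priv above is one instance of the private area addressed by
 * NET_BE_PRIV(). A sketch of the allocation performed in netbe_init():
 *
 *	be = calloc(1, NET_BE_SIZE(tbe));
 *
 *	  +---------------------------+------------------------+
 *	  | struct net_backend        | priv_size bytes        |
 *	  +---------------------------+------------------------+
 *	  ^be                         ^NET_BE_PRIV(be)
 *
 * i.e. the generic header and the backend-specific state live in one
 * contiguous allocation, and NET_BE_PRIV(be) points just past the header.
 */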

static void
tap_cleanup(struct net_backend *be)
{
	struct tap_priv *priv = NET_BE_PRIV(be);

	if (priv->mevp) {
		mevent_delete(priv->mevp);
	}
	if (be->fd != -1) {
		close(be->fd);
		be->fd = -1;
	}
}

static int
tap_init(struct net_backend *be, const char *devname,
    nvlist_t *nvl __unused, net_be_rxeof_t cb, void *param)
{
	struct tap_priv *priv = NET_BE_PRIV(be);
	char tbuf[80];
	int opt = 1;
#if defined(INET6) || defined(INET)
	struct ifreq ifrq;
	int s;
#endif
#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
#endif

	if (cb == NULL) {
		WPRINTF(("TAP backend requires non-NULL callback"));
		return (-1);
	}

	strcpy(tbuf, "/dev/");
	strlcat(tbuf, devname, sizeof(tbuf));

	be->fd = open(tbuf, O_RDWR);
	if (be->fd == -1) {
		WPRINTF(("open of tap device %s failed", tbuf));
		goto error;
	}

	/*
	 * Set non-blocking and register for read
	 * notifications with the event loop
	 */
	if (ioctl(be->fd, FIONBIO, &opt) < 0) {
		WPRINTF(("tap device O_NONBLOCK failed"));
		goto error;
	}

#if defined(INET6) || defined(INET)
	/*
	 * Try to UP the interface rather than relying on
	 * net.link.tap.up_on_open.
	 */
	bzero(&ifrq, sizeof(ifrq));
	if (ioctl(be->fd, TAPGIFNAME, &ifrq) < 0) {
		WPRINTF(("Could not get interface name"));
		goto error;
	}

	s = -1;
	for (size_t i = 0; s == -1 && i < nitems(pf_list); i++)
		s = socket(pf_list[i], SOCK_DGRAM, 0);
	if (s == -1) {
		WPRINTF(("Could not open socket"));
		goto error;
	}

	if (ioctl(s, SIOCGIFFLAGS, &ifrq) < 0) {
		(void)close(s);
		WPRINTF(("Could not get interface flags"));
		goto error;
	}
	ifrq.ifr_flags |= IFF_UP;
	if (ioctl(s, SIOCSIFFLAGS, &ifrq) < 0) {
		(void)close(s);
		WPRINTF(("Could not set interface flags"));
		goto error;
	}
	(void)close(s);
#endif

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
	if (caph_rights_limit(be->fd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	memset(priv->bbuf, 0, sizeof(priv->bbuf));
	priv->bbuflen = 0;

	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
	if (priv->mevp == NULL) {
		WPRINTF(("Could not register event"));
		goto error;
	}

	return (0);

error:
	tap_cleanup(be);
	return (-1);
}

/*
 * Called to send a buffer chain out to the tap device
 */
static ssize_t
tap_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
{
	return (writev(be->fd, iov, iovcnt));
}

static ssize_t
tap_peek_recvlen(struct net_backend *be)
{
	struct tap_priv *priv = NET_BE_PRIV(be);
	ssize_t ret;

	if (priv->bbuflen > 0) {
		/*
		 * We already have a packet in the bounce buffer.
		 * Just return its length.
		 */
		return (priv->bbuflen);
	}

	/*
	 * Read the next packet (if any) into the bounce buffer, so
	 * that we get to know its length and we can return that
	 * to the caller.
	 */
	ret = read(be->fd, priv->bbuf, sizeof(priv->bbuf));
	if (ret < 0 && errno == EWOULDBLOCK) {
		return (0);
	}

	if (ret > 0)
		priv->bbuflen = ret;

	return (ret);
}
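
/*
 * Illustrative call sequence (a sketch of how a frontend would typically
 * pair the two operations; details vary by frontend):
 *
 *	len = netbe_peek_recvlen(be);		// may pull a packet into bbuf
 *	if (len > 0)
 *		netbe_recv(be, iov, iovcnt);	// drains bbuf, resets bbuflen
 *
 * The bounce buffer guarantees that a packet "peeked" here is not lost:
 * the subsequent recv is served from bbuf instead of another read().
 */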

static ssize_t
tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
{
	struct tap_priv *priv = NET_BE_PRIV(be);
	ssize_t ret;

	if (priv->bbuflen > 0) {
		/*
		 * A packet is available in the bounce buffer, so
		 * we read it from there.
		 */
		ret = buf_to_iov(priv->bbuf, priv->bbuflen,
		    iov, iovcnt, 0);

		/* Mark the bounce buffer as empty. */
		priv->bbuflen = 0;

		return (ret);
	}

	ret = readv(be->fd, iov, iovcnt);
	if (ret < 0 && errno == EWOULDBLOCK) {
		return (0);
	}

	return (ret);
}

static void
tap_recv_enable(struct net_backend *be)
{
	struct tap_priv *priv = NET_BE_PRIV(be);

	mevent_enable(priv->mevp);
}

static void
tap_recv_disable(struct net_backend *be)
{
	struct tap_priv *priv = NET_BE_PRIV(be);

	mevent_disable(priv->mevp);
}

static uint64_t
tap_get_cap(struct net_backend *be __unused)
{

	return (0); /* no capabilities for now */
}

static int
tap_set_cap(struct net_backend *be __unused, uint64_t features,
    unsigned vnet_hdr_len)
{

	return ((features || vnet_hdr_len) ? -1 : 0);
}

static struct net_backend tap_backend = {
	.prefix = "tap",
	.priv_size = sizeof(struct tap_priv),
	.init = tap_init,
	.cleanup = tap_cleanup,
	.send = tap_send,
	.peek_recvlen = tap_peek_recvlen,
	.recv = tap_recv,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,
};
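
/*
 * Adding a new backend follows the same pattern: fill in a template and
 * drop it into the linker set. A minimal sketch (the "foo" backend and
 * its callbacks are hypothetical):
 *
 *	static struct net_backend foo_backend = {
 *		.prefix = "foo",
 *		.priv_size = sizeof(struct foo_priv),
 *		.init = foo_init,
 *		.cleanup = foo_cleanup,
 *		.send = foo_send,
 *		.peek_recvlen = foo_peek_recvlen,
 *		.recv = foo_recv,
 *		.recv_enable = foo_recv_enable,
 *		.recv_disable = foo_recv_disable,
 *		.get_cap = foo_get_cap,
 *		.set_cap = foo_set_cap,
 *	};
 *	DATA_SET(net_backend_set, foo_backend);
 *
 * netbe_init() then selects it whenever the configured backend name
 * starts with "foo".
 */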

/* A clone of the tap backend, with a different prefix. */
static struct net_backend vmnet_backend = {
	.prefix = "vmnet",
	.priv_size = sizeof(struct tap_priv),
	.init = tap_init,
	.cleanup = tap_cleanup,
	.send = tap_send,
	.peek_recvlen = tap_peek_recvlen,
	.recv = tap_recv,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,
};

DATA_SET(net_backend_set, tap_backend);
DATA_SET(net_backend_set, vmnet_backend);

#ifdef NETGRAPH

/*
 * Netgraph backend
 */

#define NG_SBUF_MAX_SIZE (4 * 1024 * 1024)

static int
ng_init(struct net_backend *be, const char *devname __unused,
    nvlist_t *nvl, net_be_rxeof_t cb, void *param)
{
	struct tap_priv *p = NET_BE_PRIV(be);
	struct ngm_connect ngc;
	const char *value, *nodename;
	int sbsz;
	int ctrl_sock;
	int flags;
	unsigned long maxsbsz;
	size_t msbsz;
#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
#endif

	if (cb == NULL) {
		WPRINTF(("Netgraph backend requires non-NULL callback"));
		return (-1);
	}

	be->fd = -1;

	memset(&ngc, 0, sizeof(ngc));

	value = get_config_value_node(nvl, "path");
	if (value == NULL) {
		WPRINTF(("path must be provided"));
		return (-1);
	}
	strncpy(ngc.path, value, NG_PATHSIZ - 1);

	value = get_config_value_node(nvl, "hook");
	if (value == NULL)
		value = "vmlink";
	strncpy(ngc.ourhook, value, NG_HOOKSIZ - 1);

	value = get_config_value_node(nvl, "peerhook");
	if (value == NULL) {
		WPRINTF(("peer hook must be provided"));
		return (-1);
	}
	strncpy(ngc.peerhook, value, NG_HOOKSIZ - 1);

	nodename = get_config_value_node(nvl, "socket");
	if (NgMkSockNode(nodename,
	    &ctrl_sock, &be->fd) < 0) {
		WPRINTF(("can't get Netgraph sockets"));
		return (-1);
	}

	if (NgSendMsg(ctrl_sock, ".",
	    NGM_GENERIC_COOKIE,
	    NGM_CONNECT, &ngc, sizeof(ngc)) < 0) {
		WPRINTF(("can't connect to node"));
		close(ctrl_sock);
		goto error;
	}

	close(ctrl_sock);

	flags = fcntl(be->fd, F_GETFL);

	if (flags < 0) {
		WPRINTF(("can't get socket flags"));
		goto error;
	}

	if (fcntl(be->fd, F_SETFL, flags | O_NONBLOCK) < 0) {
		WPRINTF(("can't set O_NONBLOCK flag"));
		goto error;
	}

	/*
	 * The default ng_socket(4) buffer size is too low.
	 * Calculate the minimum value between NG_SBUF_MAX_SIZE
	 * and kern.ipc.maxsockbuf.
	 */
	msbsz = sizeof(maxsbsz);
	if (sysctlbyname("kern.ipc.maxsockbuf", &maxsbsz, &msbsz,
	    NULL, 0) < 0) {
		WPRINTF(("can't get 'kern.ipc.maxsockbuf' value"));
		goto error;
	}

	/*
	 * We can't set the socket buffer size to the kern.ipc.maxsockbuf
	 * value, as it takes into account the mbuf(9) overhead.
	 */
	maxsbsz = maxsbsz * MCLBYTES / (MSIZE + MCLBYTES);

	sbsz = MIN(NG_SBUF_MAX_SIZE, maxsbsz);
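
	/*
	 * Worked example (illustrative; MSIZE, MCLBYTES and the sysctl
	 * default are platform-dependent): with MSIZE = 256 and
	 * MCLBYTES = 2048 the scaling factor is 2048 / 2304, i.e. ~8/9,
	 * so a 2 MiB kern.ipc.maxsockbuf yields maxsbsz ~= 1864135.
	 * That is below NG_SBUF_MAX_SIZE (4 MiB), so sbsz ends up at
	 * roughly 1.86 MB.
	 */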

	if (setsockopt(be->fd, SOL_SOCKET, SO_SNDBUF, &sbsz,
	    sizeof(sbsz)) < 0) {
		WPRINTF(("can't set TX buffer size"));
		goto error;
	}

	if (setsockopt(be->fd, SOL_SOCKET, SO_RCVBUF, &sbsz,
	    sizeof(sbsz)) < 0) {
		WPRINTF(("can't set RX buffer size"));
		goto error;
	}

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
	if (caph_rights_limit(be->fd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	memset(p->bbuf, 0, sizeof(p->bbuf));
	p->bbuflen = 0;

	p->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
	if (p->mevp == NULL) {
		WPRINTF(("Could not register event"));
		goto error;
	}

	return (0);

error:
	tap_cleanup(be);
	return (-1);
}

static struct net_backend ng_backend = {
	.prefix = "netgraph",
	.priv_size = sizeof(struct tap_priv),
	.init = ng_init,
	.cleanup = tap_cleanup,
	.send = tap_send,
	.peek_recvlen = tap_peek_recvlen,
	.recv = tap_recv,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,
};

DATA_SET(net_backend_set, ng_backend);

#endif /* NETGRAPH */

/*
 * The netmap backend
 */

/* The virtio-net features supported by netmap. */
#define NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \
		VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \
		VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 | \
		VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO)

struct netmap_priv {
	char ifname[IFNAMSIZ];
	struct nm_desc *nmd;
	uint16_t memid;
	struct netmap_ring *rx;
	struct netmap_ring *tx;
	struct mevent *mevp;
	net_be_rxeof_t cb;
	void *cb_param;
};

static void
nmreq_init(struct nmreq *req, char *ifname)
{

	memset(req, 0, sizeof(*req));
	strlcpy(req->nr_name, ifname, sizeof(req->nr_name));
	req->nr_version = NETMAP_API;
}

static int
netmap_set_vnet_hdr_len(struct net_backend *be, int vnet_hdr_len)
{
	int err;
	struct nmreq req;
	struct netmap_priv *priv = NET_BE_PRIV(be);

	nmreq_init(&req, priv->ifname);
	req.nr_cmd = NETMAP_BDG_VNET_HDR;
	req.nr_arg1 = vnet_hdr_len;
	err = ioctl(be->fd, NIOCREGIF, &req);
	if (err) {
		WPRINTF(("Unable to set vnet header length %d",
		    vnet_hdr_len));
		return (err);
	}

	be->be_vnet_hdr_len = vnet_hdr_len;

	return (0);
}

static int
netmap_has_vnet_hdr_len(struct net_backend *be, unsigned vnet_hdr_len)
{
	unsigned prev_hdr_len = be->be_vnet_hdr_len;
	int ret;

	if (vnet_hdr_len == prev_hdr_len) {
		return (1);
	}

	/*
	 * Probe by temporarily setting the new length; if the probe
	 * succeeds, restore the previous value before reporting success.
	 */
	ret = netmap_set_vnet_hdr_len(be, vnet_hdr_len);
	if (ret) {
		return (0);
	}

	netmap_set_vnet_hdr_len(be, prev_hdr_len);

	return (1);
}

static uint64_t
netmap_get_cap(struct net_backend *be)
{

	return (netmap_has_vnet_hdr_len(be, VNET_HDR_LEN) ?
	    NETMAP_FEATURES : 0);
}
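
/*
 * For reference (the lengths follow from the virtio-net spec and from the
 * checks in netbe_set_cap() below): VNET_HDR_LEN is the 12-byte header
 * used when VIRTIO_NET_F_MRG_RXBUF is negotiated; without the trailing
 * num_buffers field the header shrinks to 10 bytes
 * (VNET_HDR_LEN - sizeof(uint16_t)), and 0 means no virtio-net header at
 * all. netmap_get_cap() only reports NETMAP_FEATURES when the port
 * accepts the 12-byte variant.
 */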

static int
netmap_set_cap(struct net_backend *be, uint64_t features __unused,
    unsigned vnet_hdr_len)
{

	return (netmap_set_vnet_hdr_len(be, vnet_hdr_len));
}

static int
netmap_init(struct net_backend *be, const char *devname,
    nvlist_t *nvl __unused, net_be_rxeof_t cb, void *param)
{
	struct netmap_priv *priv = NET_BE_PRIV(be);

	strlcpy(priv->ifname, devname, sizeof(priv->ifname));
	priv->ifname[sizeof(priv->ifname) - 1] = '\0';

	priv->nmd = nm_open(priv->ifname, NULL, NETMAP_NO_TX_POLL, NULL);
	if (priv->nmd == NULL) {
		WPRINTF(("Unable to nm_open(): interface '%s', errno (%s)",
		    devname, strerror(errno)));
		return (-1);
	}

	priv->memid = priv->nmd->req.nr_arg2;
	priv->tx = NETMAP_TXRING(priv->nmd->nifp, 0);
	priv->rx = NETMAP_RXRING(priv->nmd->nifp, 0);
	priv->cb = cb;
	priv->cb_param = param;
	be->fd = priv->nmd->fd;

	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
	if (priv->mevp == NULL) {
		WPRINTF(("Could not register event"));
		return (-1);
	}

	return (0);
}

static void
netmap_cleanup(struct net_backend *be)
{
	struct netmap_priv *priv = NET_BE_PRIV(be);

	if (priv->mevp) {
		mevent_delete(priv->mevp);
	}
	if (priv->nmd) {
		nm_close(priv->nmd);
	}
	be->fd = -1;
}

static ssize_t
netmap_send(struct net_backend *be, const struct iovec *iov,
    int iovcnt)
{
	struct netmap_priv *priv = NET_BE_PRIV(be);
	struct netmap_ring *ring;
	ssize_t totlen = 0;
	int nm_buf_size;
	int nm_buf_len;
	uint32_t head;
	uint8_t *nm_buf;
	int j;

	ring = priv->tx;
	head = ring->head;
	if (head == ring->tail) {
		WPRINTF(("No space, drop %zu bytes", count_iov(iov, iovcnt)));
		goto txsync;
	}
	nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
	nm_buf_size = ring->nr_buf_size;
	nm_buf_len = 0;

	for (j = 0; j < iovcnt; j++) {
		uint8_t *iov_frag_buf = iov[j].iov_base;
		int iov_frag_size = iov[j].iov_len;

		totlen += iov_frag_size;

		/*
		 * Split each iovec fragment over more netmap slots, if
		 * necessary.
		 */
		for (;;) {
			int copylen;

			copylen = iov_frag_size < nm_buf_size ?
			    iov_frag_size : nm_buf_size;
			memcpy(nm_buf, iov_frag_buf, copylen);

			iov_frag_buf += copylen;
			iov_frag_size -= copylen;
			nm_buf += copylen;
			nm_buf_size -= copylen;
			nm_buf_len += copylen;

			if (iov_frag_size == 0) {
				break;
			}

			ring->slot[head].len = nm_buf_len;
			ring->slot[head].flags = NS_MOREFRAG;
			head = nm_ring_next(ring, head);
			if (head == ring->tail) {
				/*
				 * We ran out of netmap slots while
				 * splitting the iovec fragments.
				 */
				WPRINTF(("No space, drop %zu bytes",
				    count_iov(iov, iovcnt)));
				goto txsync;
			}
			nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
			nm_buf_size = ring->nr_buf_size;
			nm_buf_len = 0;
		}
	}

	/* Complete the last slot, which must not have NS_MOREFRAG set. */
	ring->slot[head].len = nm_buf_len;
	ring->slot[head].flags = 0;
	head = nm_ring_next(ring, head);

	/* Now update ring->head and ring->cur. */
	ring->head = ring->cur = head;
txsync:
	ioctl(be->fd, NIOCTXSYNC, NULL);

	return (totlen);
}
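
/*
 * Worked example for the slot chaining above (illustrative; a 2048-byte
 * nr_buf_size is typical but not guaranteed): a single 3000-byte iovec
 * fragment would occupy two TX slots,
 *
 *	slot[head]   : len = 2048, flags = NS_MOREFRAG
 *	slot[head+1] : len =  952, flags = 0
 *
 * and netmap_peek_recvlen()/netmap_recv() below walk the same chain on
 * the RX side, summing slot lengths until a slot without NS_MOREFRAG
 * terminates the packet.
 */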

static ssize_t
netmap_peek_recvlen(struct net_backend *be)
{
	struct netmap_priv *priv = NET_BE_PRIV(be);
	struct netmap_ring *ring = priv->rx;
	uint32_t head = ring->head;
	ssize_t totlen = 0;

	while (head != ring->tail) {
		struct netmap_slot *slot = ring->slot + head;

		totlen += slot->len;
		if ((slot->flags & NS_MOREFRAG) == 0)
			break;
		head = nm_ring_next(ring, head);
	}

	return (totlen);
}

static ssize_t
netmap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
{
	struct netmap_priv *priv = NET_BE_PRIV(be);
	struct netmap_slot *slot = NULL;
	struct netmap_ring *ring;
	uint8_t *iov_frag_buf;
	int iov_frag_size;
	ssize_t totlen = 0;
	uint32_t head;

	assert(iovcnt);

	ring = priv->rx;
	head = ring->head;
	iov_frag_buf = iov->iov_base;
	iov_frag_size = iov->iov_len;

	do {
		uint8_t *nm_buf;
		int nm_buf_len;

		if (head == ring->tail) {
			return (0);
		}

		slot = ring->slot + head;
		nm_buf = NETMAP_BUF(ring, slot->buf_idx);
		nm_buf_len = slot->len;

		for (;;) {
			int copylen = nm_buf_len < iov_frag_size ?
			    nm_buf_len : iov_frag_size;

			memcpy(iov_frag_buf, nm_buf, copylen);
			nm_buf += copylen;
			nm_buf_len -= copylen;
			iov_frag_buf += copylen;
			iov_frag_size -= copylen;
			totlen += copylen;

			if (nm_buf_len == 0) {
				break;
			}

			iov++;
			iovcnt--;
			if (iovcnt == 0) {
				/* No space to receive. */
				WPRINTF(("Short iov, drop %zd bytes",
				    totlen));
				return (-ENOSPC);
			}
			iov_frag_buf = iov->iov_base;
			iov_frag_size = iov->iov_len;
		}

		head = nm_ring_next(ring, head);

	} while (slot->flags & NS_MOREFRAG);

	/* Release slots to netmap. */
	ring->head = ring->cur = head;

	return (totlen);
}

static void
netmap_recv_enable(struct net_backend *be)
{
	struct netmap_priv *priv = NET_BE_PRIV(be);

	mevent_enable(priv->mevp);
}

static void
netmap_recv_disable(struct net_backend *be)
{
	struct netmap_priv *priv = NET_BE_PRIV(be);

	mevent_disable(priv->mevp);
}

static struct net_backend netmap_backend = {
	.prefix = "netmap",
	.priv_size = sizeof(struct netmap_priv),
	.init = netmap_init,
	.cleanup = netmap_cleanup,
	.send = netmap_send,
	.peek_recvlen = netmap_peek_recvlen,
	.recv = netmap_recv,
	.recv_enable = netmap_recv_enable,
	.recv_disable = netmap_recv_disable,
	.get_cap = netmap_get_cap,
	.set_cap = netmap_set_cap,
};
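
/*
 * Naming note (a sketch of typical usage; see nm_open() in
 * net/netmap_user.h for the authoritative syntax): the device name given
 * on the command line is passed to nm_open() verbatim, so these backends
 * are normally selected with names such as
 *
 *	-s 2:0,virtio-net,netmap:em0	(attach to a host interface)
 *	-s 2:0,virtio-net,vale0:1	(attach to a VALE switch port)
 *
 * The "netmap"/"vale" prefixes below are what netbe_init() matches on.
 */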

/* A clone of the netmap backend, with a different prefix. */
static struct net_backend vale_backend = {
	.prefix = "vale",
	.priv_size = sizeof(struct netmap_priv),
	.init = netmap_init,
	.cleanup = netmap_cleanup,
	.send = netmap_send,
	.peek_recvlen = netmap_peek_recvlen,
	.recv = netmap_recv,
	.recv_enable = netmap_recv_enable,
	.recv_disable = netmap_recv_disable,
	.get_cap = netmap_get_cap,
	.set_cap = netmap_set_cap,
};

DATA_SET(net_backend_set, netmap_backend);
DATA_SET(net_backend_set, vale_backend);

int
netbe_legacy_config(nvlist_t *nvl, const char *opts)
{
	char *backend, *cp;

	if (opts == NULL)
		return (0);

	cp = strchr(opts, ',');
	if (cp == NULL) {
		set_config_value_node(nvl, "backend", opts);
		return (0);
	}
	backend = strndup(opts, cp - opts);
	set_config_value_node(nvl, "backend", backend);
	free(backend);
	return (pci_parse_legacy_config(nvl, cp + 1));
}

/*
 * Initialize a backend and attach to the frontend.
 * This is called during frontend initialization.
 *  @ret is a pointer to the backend to be initialized
 *  @devname is the backend-name as supplied on the command line,
 * 	e.g. -s 2:0,frontend-name,backend-name[,other-args]
 *  @cb is the receive callback supplied by the frontend,
 *	and it is invoked in the event loop when a receive
 *	event is generated in the hypervisor,
 *  @param is a pointer to the frontend, and normally used as
 *	the argument for the callback.
 */
int
netbe_init(struct net_backend **ret, nvlist_t *nvl, net_be_rxeof_t cb,
    void *param)
{
	struct net_backend **pbe, *nbe, *tbe = NULL;
	const char *value, *type;
	char *devname;
	int err;

	value = get_config_value_node(nvl, "backend");
	if (value == NULL) {
		return (-1);
	}
	devname = strdup(value);

	/*
	 * Use the type given by the configuration if it exists; otherwise
	 * use the prefix of the backend as the type.
	 */
	type = get_config_value_node(nvl, "type");
	if (type == NULL)
		type = devname;

	/*
	 * Find the network backend that matches the user-provided
	 * device name. net_backend_set is built using a linker set.
	 */
	SET_FOREACH(pbe, net_backend_set) {
		if (strncmp(type, (*pbe)->prefix,
		    strlen((*pbe)->prefix)) == 0) {
			tbe = *pbe;
			assert(tbe->init != NULL);
			assert(tbe->cleanup != NULL);
			assert(tbe->send != NULL);
			assert(tbe->recv != NULL);
			assert(tbe->get_cap != NULL);
			assert(tbe->set_cap != NULL);
			break;
		}
	}

	*ret = NULL;
	if (tbe == NULL) {
		free(devname);
		return (EINVAL);
	}

	nbe = calloc(1, NET_BE_SIZE(tbe));
	*nbe = *tbe;	/* copy the template */
	nbe->fd = -1;
	nbe->sc = param;
	nbe->be_vnet_hdr_len = 0;
	nbe->fe_vnet_hdr_len = 0;

	/* Initialize the backend. */
	err = nbe->init(nbe, devname, nvl, cb, param);
	if (err) {
		free(devname);
		free(nbe);
		return (err);
	}

	*ret = nbe;
	free(devname);

	return (0);
}
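
/*
 * Typical frontend usage (a sketch, not lifted from any particular
 * frontend; error handling omitted):
 *
 *	struct net_backend *be;
 *
 *	if (netbe_init(&be, nvl, my_rxeof, sc) != 0)
 *		return (-1);
 *	features = netbe_get_cap(be);	// offer these to the guest
 *	...feature negotiation with the guest...
 *	netbe_set_cap(be, negotiated, vnet_hdr_len);
 *	netbe_rx_enable(be);		// start delivering rx events
 */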

void
netbe_cleanup(struct net_backend *be)
{

	if (be != NULL) {
		be->cleanup(be);
		free(be);
	}
}

uint64_t
netbe_get_cap(struct net_backend *be)
{

	assert(be != NULL);
	return (be->get_cap(be));
}

int
netbe_set_cap(struct net_backend *be, uint64_t features,
    unsigned vnet_hdr_len)
{
	int ret;

	assert(be != NULL);

	/* There are only three valid lengths, i.e., 0, 10 and 12. */
	if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN
	    && vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t)))
		return (-1);

	be->fe_vnet_hdr_len = vnet_hdr_len;

	ret = be->set_cap(be, features, vnet_hdr_len);
	assert(be->be_vnet_hdr_len == 0 ||
	    be->be_vnet_hdr_len == be->fe_vnet_hdr_len);

	return (ret);
}

ssize_t
netbe_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
{

	return (be->send(be, iov, iovcnt));
}

ssize_t
netbe_peek_recvlen(struct net_backend *be)
{

	return (be->peek_recvlen(be));
}

/*
 * Try to read a packet from the backend, without blocking.
 * If no packets are available, return 0. In case of success, return
 * the length of the packet just read. Return -1 in case of errors.
 */
ssize_t
netbe_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
{

	return (be->recv(be, iov, iovcnt));
}

/*
 * Read a packet from the backend and discard it.
 * Returns the size of the discarded packet or zero if no packet was
 * available. A negative error code is returned in case of read error.
 */
ssize_t
netbe_rx_discard(struct net_backend *be)
{
	/*
	 * MP note: the dummybuf is only used to discard frames,
	 * so there is no need for it to be per-vtnet or locked.
	 * We only make it large enough for a TSO-sized segment.
	 */
	static uint8_t dummybuf[65536 + 64];
	struct iovec iov;

	iov.iov_base = dummybuf;
	iov.iov_len = sizeof(dummybuf);

	return (netbe_recv(be, &iov, 1));
}

void
netbe_rx_disable(struct net_backend *be)
{

	be->recv_disable(be);
}

void
netbe_rx_enable(struct net_backend *be)
{

	be->recv_enable(be);
}

size_t
netbe_get_vnet_hdr_len(struct net_backend *be)
{

	return (be->be_vnet_hdr_len);
}
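
/*
 * Putting the receive-side helpers together, a frontend's rx callback
 * might be structured like this (a sketch under assumed frontend state,
 * not taken from a real frontend):
 *
 *	for (;;) {
 *		ssize_t len = netbe_peek_recvlen(be);
 *
 *		if (len == 0)
 *			break;			// nothing pending
 *		if (no guest buffer of at least 'len' bytes is available) {
 *			netbe_rx_discard(be);	// drop to make progress
 *			continue;
 *		}
 *		netbe_recv(be, iov, iovcnt);	// deliver to the guest
 *	}
 */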