/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2019 Vincenzo Maffione <vmaffione@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
 * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
 * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * This file implements multiple network backends (tap, netmap, ...),
 * to be used by network frontends such as virtio-net and e1000.
 * The API to access the backend (e.g. send/receive packets, negotiate
 * features) is exported by net_backends.h.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/types.h>		/* u_short etc */
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/uio.h>

#include <net/if.h>
#if defined(INET6) || defined(INET)
#include <net/if_tap.h>
#endif
#include <net/netmap.h>
#include <net/netmap_virt.h>
#define NETMAP_WITH_LIBS
#include <net/netmap_user.h>

#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <assert.h>
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <pthread.h>
#include <pthread_np.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sysexits.h>
#include <unistd.h>

#ifdef NETGRAPH
#include <sys/param.h>
#include <sys/sysctl.h>
#include <netgraph.h>
#endif

#include "config.h"
#include "debug.h"
#include "iov.h"
#include "mevent.h"
#include "net_backends.h"
#include "pci_emul.h"

#include <sys/linker_set.h>

/*
 * Each network backend registers a set of function pointers that are
 * used to implement the net backends API.  This structure might need
 * to be exposed if we implement backends in separate files.
 */
struct net_backend {
	const char *prefix;	/* prefix matching this backend */

	/*
	 * Routines used to initialize and cleanup the resources needed
	 * by a backend.  The cleanup function is used internally,
	 * and should not be called by the frontend.
	 */
	int (*init)(struct net_backend *be, const char *devname,
	    nvlist_t *nvl, net_be_rxeof_t cb, void *param);
	void (*cleanup)(struct net_backend *be);

	/*
	 * Called to serve a guest transmit request.  The scatter-gather
	 * vector provided by the caller has 'iovcnt' elements and contains
	 * the packet to send.
	 */
	ssize_t (*send)(struct net_backend *be, const struct iovec *iov,
	    int iovcnt);

	/*
	 * Get the length of the next packet that can be received from
	 * the backend.  If no packets are currently available, this
	 * function returns 0.
	 */
	ssize_t (*peek_recvlen)(struct net_backend *be);

	/*
	 * Called to receive a packet from the backend.  When the function
	 * returns a positive value 'len', the scatter-gather vector
	 * provided by the caller contains a packet of that length.
	 * The function returns 0 if the backend doesn't have a new packet
	 * to receive.
	 */
	ssize_t (*recv)(struct net_backend *be, const struct iovec *iov,
	    int iovcnt);

	/*
	 * Ask the backend to enable or disable receive operation.
	 * On return from a disable operation, it is guaranteed that the
	 * receive callback won't be called until receive is enabled again.
	 * Note however that it is up to the caller to make sure that
	 * netbe_recv() is not currently being executed by another thread.
	 */
	void (*recv_enable)(struct net_backend *be);
	void (*recv_disable)(struct net_backend *be);

	/*
	 * Ask the backend for the virtio-net features it is able to
	 * support.  Possible features are TSO, UFO and checksum offloading
	 * in both rx and tx direction and for both IPv4 and IPv6.
	 */
	uint64_t (*get_cap)(struct net_backend *be);

	/*
	 * Tell the backend to enable/disable the specified virtio-net
	 * features (capabilities).
	 */
	int (*set_cap)(struct net_backend *be, uint64_t features,
	    unsigned int vnet_hdr_len);

	struct pci_vtnet_softc *sc;
	int fd;

	/*
	 * Length of the virtio-net header used by the backend and the
	 * frontend, respectively.  A zero value means that the header
	 * is not used.
	 */
	unsigned int be_vnet_hdr_len;
	unsigned int fe_vnet_hdr_len;

	/* Size of backend-specific private data. */
	size_t priv_size;

	/* Backend-specific private data follows. */
};

#define	NET_BE_PRIV(be)		((void *)((be) + 1))
#define	NET_BE_SIZE(be)		(sizeof(*be) + (be)->priv_size)

SET_DECLARE(net_backend_set, struct net_backend);

#define	VNET_HDR_LEN	sizeof(struct virtio_net_rxhdr)

#define	WPRINTF(params)	PRINTLN params
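
/*
 * Illustrative sketch (not compiled in): how a new backend would plug
 * into the linker set declared above.  The "null" backend below is
 * hypothetical, not part of this file; it discards transmitted packets
 * and never receives anything.  Real backends follow the same pattern
 * (see tap_backend and netmap_backend later in this file).
 */
#if 0
struct null_priv {
	int unused;		/* private state would live here */
};

static int
null_init(struct net_backend *be, const char *devname __unused,
    nvlist_t *nvl __unused, net_be_rxeof_t cb __unused, void *param __unused)
{
	be->fd = -1;		/* no file descriptor to poll */
	return (0);
}

static void
null_cleanup(struct net_backend *be __unused)
{
}

static ssize_t
null_send(struct net_backend *be __unused, const struct iovec *iov,
    int iovcnt)
{
	/* Pretend the whole chain was transmitted. */
	return ((ssize_t)count_iov(iov, iovcnt));
}

static ssize_t
null_peek_recvlen(struct net_backend *be __unused)
{
	return (0);		/* never has a packet pending */
}

static ssize_t
null_recv(struct net_backend *be __unused, const struct iovec *iov __unused,
    int iovcnt __unused)
{
	return (0);
}

static void
null_recv_enable(struct net_backend *be __unused)
{
}

static void
null_recv_disable(struct net_backend *be __unused)
{
}

static uint64_t
null_get_cap(struct net_backend *be __unused)
{
	return (0);
}

static int
null_set_cap(struct net_backend *be __unused, uint64_t features,
    unsigned vnet_hdr_len)
{
	return ((features || vnet_hdr_len) ? -1 : 0);
}

static struct net_backend null_backend = {
	.prefix = "null",
	.priv_size = sizeof(struct null_priv),
	.init = null_init,
	.cleanup = null_cleanup,
	.send = null_send,
	.peek_recvlen = null_peek_recvlen,
	.recv = null_recv,
	.recv_enable = null_recv_enable,
	.recv_disable = null_recv_disable,
	.get_cap = null_get_cap,
	.set_cap = null_set_cap,
};

DATA_SET(net_backend_set, null_backend);
#endif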

/*
 * The tap backend
 */

#if defined(INET6) || defined(INET)
static const int pf_list[] = {
#if defined(INET6)
	PF_INET6,
#endif
#if defined(INET)
	PF_INET,
#endif
};
#endif

struct tap_priv {
	struct mevent *mevp;
	/*
	 * A bounce buffer that allows us to implement the peek_recvlen
	 * callback.  In the future we may get the same information from
	 * the kevent data.
	 */
	char bbuf[1 << 16];
	ssize_t bbuflen;
};

static void
tap_cleanup(struct net_backend *be)
{
	struct tap_priv *priv = NET_BE_PRIV(be);

	if (priv->mevp) {
		mevent_delete(priv->mevp);
	}
	if (be->fd != -1) {
		close(be->fd);
		be->fd = -1;
	}
}

static int
tap_init(struct net_backend *be, const char *devname,
    nvlist_t *nvl __unused, net_be_rxeof_t cb, void *param)
{
	struct tap_priv *priv = NET_BE_PRIV(be);
	char tbuf[80];
	int opt = 1;
#if defined(INET6) || defined(INET)
	struct ifreq ifrq;
	int s;
#endif
#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
#endif

	if (cb == NULL) {
		WPRINTF(("TAP backend requires non-NULL callback"));
		return (-1);
	}

	strcpy(tbuf, "/dev/");
	strlcat(tbuf, devname, sizeof(tbuf));

	be->fd = open(tbuf, O_RDWR);
	if (be->fd == -1) {
		WPRINTF(("open of tap device %s failed", tbuf));
		goto error;
	}

	/*
	 * Set non-blocking and register for read
	 * notifications with the event loop.
	 */
	if (ioctl(be->fd, FIONBIO, &opt) < 0) {
		WPRINTF(("tap device O_NONBLOCK failed"));
		goto error;
	}

#if defined(INET6) || defined(INET)
	/*
	 * Try to UP the interface rather than relying on
	 * net.link.tap.up_on_open.
	 */
	bzero(&ifrq, sizeof(ifrq));
	if (ioctl(be->fd, TAPGIFNAME, &ifrq) < 0) {
		WPRINTF(("Could not get interface name"));
		goto error;
	}

	s = -1;
	for (size_t i = 0; s == -1 && i < nitems(pf_list); i++)
		s = socket(pf_list[i], SOCK_DGRAM, 0);
	if (s == -1) {
		WPRINTF(("Could not open socket"));
		goto error;
	}

	if (ioctl(s, SIOCGIFFLAGS, &ifrq) < 0) {
		(void)close(s);
		WPRINTF(("Could not get interface flags"));
		goto error;
	}
	ifrq.ifr_flags |= IFF_UP;
	if (ioctl(s, SIOCSIFFLAGS, &ifrq) < 0) {
		(void)close(s);
		WPRINTF(("Could not set interface flags"));
		goto error;
	}
	(void)close(s);
#endif

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
	if (caph_rights_limit(be->fd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	memset(priv->bbuf, 0, sizeof(priv->bbuf));
	priv->bbuflen = 0;

	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
	if (priv->mevp == NULL) {
		WPRINTF(("Could not register event"));
		goto error;
	}

	return (0);

error:
	tap_cleanup(be);
	return (-1);
}

/*
 * Called to send a buffer chain out to the tap device.
 */
static ssize_t
tap_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
{
	return (writev(be->fd, iov, iovcnt));
}

static ssize_t
tap_peek_recvlen(struct net_backend *be)
{
	struct tap_priv *priv = NET_BE_PRIV(be);
	ssize_t ret;

	if (priv->bbuflen > 0) {
		/*
		 * We already have a packet in the bounce buffer.
		 * Just return its length.
		 */
		return (priv->bbuflen);
	}

	/*
	 * Read the next packet (if any) into the bounce buffer, so
	 * that we get to know its length and we can return that
	 * to the caller.
	 */
	ret = read(be->fd, priv->bbuf, sizeof(priv->bbuf));
	if (ret < 0 && errno == EWOULDBLOCK) {
		return (0);
	}

	if (ret > 0)
		priv->bbuflen = ret;

	return (ret);
}

static ssize_t
tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
{
	struct tap_priv *priv = NET_BE_PRIV(be);
	ssize_t ret;

	if (priv->bbuflen > 0) {
		/*
		 * A packet is available in the bounce buffer, so
		 * we read it from there.
		 */
		ret = buf_to_iov(priv->bbuf, priv->bbuflen,
		    iov, iovcnt, 0);

		/* Mark the bounce buffer as empty. */
		priv->bbuflen = 0;

		return (ret);
	}

	ret = readv(be->fd, iov, iovcnt);
	if (ret < 0 && errno == EWOULDBLOCK) {
		return (0);
	}

	return (ret);
}

static void
tap_recv_enable(struct net_backend *be)
{
	struct tap_priv *priv = NET_BE_PRIV(be);

	mevent_enable(priv->mevp);
}

static void
tap_recv_disable(struct net_backend *be)
{
	struct tap_priv *priv = NET_BE_PRIV(be);

	mevent_disable(priv->mevp);
}

static uint64_t
tap_get_cap(struct net_backend *be __unused)
{

	return (0); /* no capabilities for now */
}

static int
tap_set_cap(struct net_backend *be __unused, uint64_t features,
    unsigned vnet_hdr_len)
{

	return ((features || vnet_hdr_len) ? -1 : 0);
}

static struct net_backend tap_backend = {
	.prefix = "tap",
	.priv_size = sizeof(struct tap_priv),
	.init = tap_init,
	.cleanup = tap_cleanup,
	.send = tap_send,
	.peek_recvlen = tap_peek_recvlen,
	.recv = tap_recv,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,
};

/* A clone of the tap backend, with a different prefix. */
static struct net_backend vmnet_backend = {
	.prefix = "vmnet",
	.priv_size = sizeof(struct tap_priv),
	.init = tap_init,
	.cleanup = tap_cleanup,
	.send = tap_send,
	.peek_recvlen = tap_peek_recvlen,
	.recv = tap_recv,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,
};

DATA_SET(net_backend_set, tap_backend);
DATA_SET(net_backend_set, vmnet_backend);
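
/*
 * Usage note (illustrative): with the registrations above in place, a
 * guest NIC can be attached to an existing tap(4) interface from the
 * bhyve command line, e.g.
 *
 *	bhyve -s 2:0,virtio-net,tap0 ...
 *
 * The "tap0" string is matched against the "tap" prefix and then passed
 * to tap_init() as the device name, so /dev/tap0 is opened.  The slot
 * and interface numbers here are just examples.
 */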

#ifdef NETGRAPH

/*
 * The netgraph backend
 */

#define	NG_SBUF_MAX_SIZE	(4 * 1024 * 1024)

static int
ng_init(struct net_backend *be, const char *devname __unused,
    nvlist_t *nvl, net_be_rxeof_t cb, void *param)
{
	struct tap_priv *p = NET_BE_PRIV(be);
	struct ngm_connect ngc;
	const char *value, *nodename;
	int sbsz;
	int ctrl_sock;
	int flags;
	unsigned long maxsbsz;
	size_t msbsz;
#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
#endif

	if (cb == NULL) {
		WPRINTF(("Netgraph backend requires non-NULL callback"));
		return (-1);
	}

	be->fd = -1;

	memset(&ngc, 0, sizeof(ngc));

	value = get_config_value_node(nvl, "path");
	if (value == NULL) {
		WPRINTF(("path must be provided"));
		return (-1);
	}
	strncpy(ngc.path, value, NG_PATHSIZ - 1);

	value = get_config_value_node(nvl, "hook");
	if (value == NULL)
		value = "vmlink";
	strncpy(ngc.ourhook, value, NG_HOOKSIZ - 1);

	value = get_config_value_node(nvl, "peerhook");
	if (value == NULL) {
		WPRINTF(("peer hook must be provided"));
		return (-1);
	}
	strncpy(ngc.peerhook, value, NG_HOOKSIZ - 1);

	nodename = get_config_value_node(nvl, "socket");
	if (NgMkSockNode(nodename,
	    &ctrl_sock, &be->fd) < 0) {
		WPRINTF(("can't get Netgraph sockets"));
		return (-1);
	}

	if (NgSendMsg(ctrl_sock, ".",
	    NGM_GENERIC_COOKIE,
	    NGM_CONNECT, &ngc, sizeof(ngc)) < 0) {
		WPRINTF(("can't connect to node"));
		close(ctrl_sock);
		goto error;
	}

	close(ctrl_sock);

	flags = fcntl(be->fd, F_GETFL);

	if (flags < 0) {
		WPRINTF(("can't get socket flags"));
		goto error;
	}

	if (fcntl(be->fd, F_SETFL, flags | O_NONBLOCK) < 0) {
		WPRINTF(("can't set O_NONBLOCK flag"));
		goto error;
	}

	/*
	 * The default ng_socket(4) buffer size is too low.
	 * Calculate the minimum value between NG_SBUF_MAX_SIZE
	 * and kern.ipc.maxsockbuf.
	 */
	msbsz = sizeof(maxsbsz);
	if (sysctlbyname("kern.ipc.maxsockbuf", &maxsbsz, &msbsz,
	    NULL, 0) < 0) {
		WPRINTF(("can't get 'kern.ipc.maxsockbuf' value"));
		goto error;
	}

	/*
	 * We can't set the socket buffer size to kern.ipc.maxsockbuf value,
	 * as it takes into account the mbuf(9) overhead.
	 */
	maxsbsz = maxsbsz * MCLBYTES / (MSIZE + MCLBYTES);

	sbsz = MIN(NG_SBUF_MAX_SIZE, maxsbsz);

	if (setsockopt(be->fd, SOL_SOCKET, SO_SNDBUF, &sbsz,
	    sizeof(sbsz)) < 0) {
		WPRINTF(("can't set TX buffer size"));
		goto error;
	}

	if (setsockopt(be->fd, SOL_SOCKET, SO_RCVBUF, &sbsz,
	    sizeof(sbsz)) < 0) {
		WPRINTF(("can't set RX buffer size"));
		goto error;
	}

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
	if (caph_rights_limit(be->fd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	memset(p->bbuf, 0, sizeof(p->bbuf));
	p->bbuflen = 0;

	p->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
	if (p->mevp == NULL) {
		WPRINTF(("Could not register event"));
		goto error;
	}

	return (0);

error:
	tap_cleanup(be);
	return (-1);
}

static struct net_backend ng_backend = {
	.prefix = "netgraph",
	.priv_size = sizeof(struct tap_priv),
	.init = ng_init,
	.cleanup = tap_cleanup,
	.send = tap_send,
	.peek_recvlen = tap_peek_recvlen,
	.recv = tap_recv,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,
};

DATA_SET(net_backend_set, ng_backend);

#endif /* NETGRAPH */
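
/*
 * Usage note (illustrative): the netgraph backend takes its parameters
 * from the device configuration rather than from the device name, e.g.
 *
 *	bhyve -s 2:0,virtio-net,netgraph,path=vmbridge:,peerhook=link2 ...
 *
 * "path" and "peerhook" are mandatory (see ng_init() above), "hook"
 * defaults to "vmlink", and "socket" optionally names the ng_socket(4)
 * node.  The "vmbridge:" and "link2" values here are made-up examples.
 */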

/*
 * The netmap backend
 */

/* The virtio-net features supported by netmap. */
#define	NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \
		VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \
		VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 | \
		VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO)

struct netmap_priv {
	char ifname[IFNAMSIZ];
	struct nm_desc *nmd;
	uint16_t memid;
	struct netmap_ring *rx;
	struct netmap_ring *tx;
	struct mevent *mevp;
	net_be_rxeof_t cb;
	void *cb_param;
};

static void
nmreq_init(struct nmreq *req, char *ifname)
{

	memset(req, 0, sizeof(*req));
	strlcpy(req->nr_name, ifname, sizeof(req->nr_name));
	req->nr_version = NETMAP_API;
}

static int
netmap_set_vnet_hdr_len(struct net_backend *be, int vnet_hdr_len)
{
	int err;
	struct nmreq req;
	struct netmap_priv *priv = NET_BE_PRIV(be);

	nmreq_init(&req, priv->ifname);
	req.nr_cmd = NETMAP_BDG_VNET_HDR;
	req.nr_arg1 = vnet_hdr_len;
	err = ioctl(be->fd, NIOCREGIF, &req);
	if (err) {
		WPRINTF(("Unable to set vnet header length %d",
		    vnet_hdr_len));
		return (err);
	}

	be->be_vnet_hdr_len = vnet_hdr_len;

	return (0);
}

static int
netmap_has_vnet_hdr_len(struct net_backend *be, unsigned vnet_hdr_len)
{
	unsigned prev_hdr_len = be->be_vnet_hdr_len;
	int ret;

	if (vnet_hdr_len == prev_hdr_len) {
		return (1);
	}

	/*
	 * Probe by temporarily setting the new length; if that works,
	 * restore the previous value before reporting success.
	 */
	ret = netmap_set_vnet_hdr_len(be, vnet_hdr_len);
	if (ret) {
		return (0);
	}

	netmap_set_vnet_hdr_len(be, prev_hdr_len);

	return (1);
}

static uint64_t
netmap_get_cap(struct net_backend *be)
{

	return (netmap_has_vnet_hdr_len(be, VNET_HDR_LEN) ?
	    NETMAP_FEATURES : 0);
}

static int
netmap_set_cap(struct net_backend *be, uint64_t features __unused,
    unsigned vnet_hdr_len)
{

	return (netmap_set_vnet_hdr_len(be, vnet_hdr_len));
}

static int
netmap_init(struct net_backend *be, const char *devname,
    nvlist_t *nvl __unused, net_be_rxeof_t cb, void *param)
{
	struct netmap_priv *priv = NET_BE_PRIV(be);

	strlcpy(priv->ifname, devname, sizeof(priv->ifname));
	priv->ifname[sizeof(priv->ifname) - 1] = '\0';

	priv->nmd = nm_open(priv->ifname, NULL, NETMAP_NO_TX_POLL, NULL);
	if (priv->nmd == NULL) {
		WPRINTF(("Unable to nm_open(): interface '%s', errno (%s)",
		    devname, strerror(errno)));
		return (-1);
	}

	priv->memid = priv->nmd->req.nr_arg2;
	priv->tx = NETMAP_TXRING(priv->nmd->nifp, 0);
	priv->rx = NETMAP_RXRING(priv->nmd->nifp, 0);
	priv->cb = cb;
	priv->cb_param = param;
	be->fd = priv->nmd->fd;

	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
	if (priv->mevp == NULL) {
		WPRINTF(("Could not register event"));
		return (-1);
	}

	return (0);
}

static void
netmap_cleanup(struct net_backend *be)
{
	struct netmap_priv *priv = NET_BE_PRIV(be);

	if (priv->mevp) {
		mevent_delete(priv->mevp);
	}
	if (priv->nmd) {
		nm_close(priv->nmd);
	}
	be->fd = -1;
}

static ssize_t
netmap_send(struct net_backend *be, const struct iovec *iov,
    int iovcnt)
{
	struct netmap_priv *priv = NET_BE_PRIV(be);
	struct netmap_ring *ring;
	ssize_t totlen = 0;
	int nm_buf_size;
	int nm_buf_len;
	uint32_t head;
	uint8_t *nm_buf;
	int j;

	ring = priv->tx;
	head = ring->head;
	if (head == ring->tail) {
		WPRINTF(("No space, drop %zu bytes", count_iov(iov, iovcnt)));
		goto txsync;
	}
	nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
	nm_buf_size = ring->nr_buf_size;
	nm_buf_len = 0;

	for (j = 0; j < iovcnt; j++) {
		uint8_t *iov_frag_buf = iov[j].iov_base;
		int iov_frag_size = iov[j].iov_len;

		totlen += iov_frag_size;

		/*
		 * Split each iovec fragment over more netmap slots, if
		 * necessary.
		 */
		for (;;) {
			int copylen;

			copylen = iov_frag_size < nm_buf_size ?
			    iov_frag_size : nm_buf_size;
			memcpy(nm_buf, iov_frag_buf, copylen);

			iov_frag_buf += copylen;
			iov_frag_size -= copylen;
			nm_buf += copylen;
			nm_buf_size -= copylen;
			nm_buf_len += copylen;

			if (iov_frag_size == 0) {
				break;
			}

			ring->slot[head].len = nm_buf_len;
			ring->slot[head].flags = NS_MOREFRAG;
			head = nm_ring_next(ring, head);
			if (head == ring->tail) {
				/*
				 * We ran out of netmap slots while
				 * splitting the iovec fragments.
				 */
				WPRINTF(("No space, drop %zu bytes",
				    count_iov(iov, iovcnt)));
				goto txsync;
			}
			nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
			nm_buf_size = ring->nr_buf_size;
			nm_buf_len = 0;
		}
	}

	/* Complete the last slot, which must not have NS_MOREFRAG set. */
	ring->slot[head].len = nm_buf_len;
	ring->slot[head].flags = 0;
	head = nm_ring_next(ring, head);

	/* Now update ring->head and ring->cur. */
	ring->head = ring->cur = head;
txsync:
	ioctl(be->fd, NIOCTXSYNC, NULL);

	return (totlen);
}

static ssize_t
netmap_peek_recvlen(struct net_backend *be)
{
	struct netmap_priv *priv = NET_BE_PRIV(be);
	struct netmap_ring *ring = priv->rx;
	uint32_t head = ring->head;
	ssize_t totlen = 0;

	/*
	 * Sum the lengths of all slots belonging to the next packet,
	 * following the NS_MOREFRAG chain.  The ring pointers are not
	 * advanced, so the packet stays in place for netmap_recv().
	 */
	while (head != ring->tail) {
		struct netmap_slot *slot = ring->slot + head;

		totlen += slot->len;
		if ((slot->flags & NS_MOREFRAG) == 0)
			break;
		head = nm_ring_next(ring, head);
	}

	return (totlen);
}

static ssize_t
netmap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
{
	struct netmap_priv *priv = NET_BE_PRIV(be);
	struct netmap_slot *slot = NULL;
	struct netmap_ring *ring;
	uint8_t *iov_frag_buf;
	int iov_frag_size;
	ssize_t totlen = 0;
	uint32_t head;

	assert(iovcnt);

	ring = priv->rx;
	head = ring->head;
	iov_frag_buf = iov->iov_base;
	iov_frag_size = iov->iov_len;

	do {
		uint8_t *nm_buf;
		int nm_buf_len;

		if (head == ring->tail) {
			return (0);
		}

		slot = ring->slot + head;
		nm_buf = NETMAP_BUF(ring, slot->buf_idx);
		nm_buf_len = slot->len;

		for (;;) {
			int copylen = nm_buf_len < iov_frag_size ?
			    nm_buf_len : iov_frag_size;

			memcpy(iov_frag_buf, nm_buf, copylen);
			nm_buf += copylen;
			nm_buf_len -= copylen;
			iov_frag_buf += copylen;
			iov_frag_size -= copylen;
			totlen += copylen;

			if (nm_buf_len == 0) {
				break;
			}

			iov++;
			iovcnt--;
			if (iovcnt == 0) {
				/* No space to receive. */
				WPRINTF(("Short iov, drop %zd bytes",
				    totlen));
				return (-ENOSPC);
			}
			iov_frag_buf = iov->iov_base;
			iov_frag_size = iov->iov_len;
		}

		head = nm_ring_next(ring, head);

	} while (slot->flags & NS_MOREFRAG);

	/* Release slots to netmap. */
	ring->head = ring->cur = head;

	return (totlen);
}

static void
netmap_recv_enable(struct net_backend *be)
{
	struct netmap_priv *priv = NET_BE_PRIV(be);

	mevent_enable(priv->mevp);
}

static void
netmap_recv_disable(struct net_backend *be)
{
	struct netmap_priv *priv = NET_BE_PRIV(be);

	mevent_disable(priv->mevp);
}

static struct net_backend netmap_backend = {
	.prefix = "netmap",
	.priv_size = sizeof(struct netmap_priv),
	.init = netmap_init,
	.cleanup = netmap_cleanup,
	.send = netmap_send,
	.peek_recvlen = netmap_peek_recvlen,
	.recv = netmap_recv,
	.recv_enable = netmap_recv_enable,
	.recv_disable = netmap_recv_disable,
	.get_cap = netmap_get_cap,
	.set_cap = netmap_set_cap,
};

/* A clone of the netmap backend, with a different prefix. */
static struct net_backend vale_backend = {
	.prefix = "vale",
	.priv_size = sizeof(struct netmap_priv),
	.init = netmap_init,
	.cleanup = netmap_cleanup,
	.send = netmap_send,
	.peek_recvlen = netmap_peek_recvlen,
	.recv = netmap_recv,
	.recv_enable = netmap_recv_enable,
	.recv_disable = netmap_recv_disable,
	.get_cap = netmap_get_cap,
	.set_cap = netmap_set_cap,
};

DATA_SET(net_backend_set, netmap_backend);
DATA_SET(net_backend_set, vale_backend);
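
/*
 * Usage note (illustrative): the device name is passed to nm_open()
 * unmodified, so both physical ports and VALE switch ports work, e.g.
 *
 *	bhyve -s 2:0,virtio-net,netmap:em0 ...
 *	bhyve -s 2:0,virtio-net,vale0:1 ...
 *
 * The port names above ("em0", "vale0:1") are just examples.
 */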

int
netbe_legacy_config(nvlist_t *nvl, const char *opts)
{
	char *backend, *cp;

	if (opts == NULL)
		return (0);

	cp = strchr(opts, ',');
	if (cp == NULL) {
		set_config_value_node(nvl, "backend", opts);
		return (0);
	}
	backend = strndup(opts, cp - opts);
	set_config_value_node(nvl, "backend", backend);
	free(backend);
	return (pci_parse_legacy_config(nvl, cp + 1));
}
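
/*
 * Example (illustrative): for a legacy option string such as
 *
 *	-s 2:0,virtio-net,tap0,mac=00:a0:98:01:02:03
 *
 * the "tap0,mac=..." part reaches this function as 'opts'; "tap0" is
 * stored under the "backend" key and the remainder is handed to
 * pci_parse_legacy_config() as ordinary key=value pairs.  The MAC value
 * shown is made up.
 */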

/*
 * Initialize a backend and attach to the frontend.
 * This is called during frontend initialization.
 *  @ret is a pointer to the backend to be initialized
 *  @devname is the backend-name as supplied on the command line,
 *	e.g. -s 2:0,frontend-name,backend-name[,other-args]
 *  @cb is the receive callback supplied by the frontend,
 *	and it is invoked in the event loop when a receive
 *	event is generated in the hypervisor,
 *  @param is a pointer to the frontend, and normally used as
 *	the argument for the callback.
 */
int
netbe_init(struct net_backend **ret, nvlist_t *nvl, net_be_rxeof_t cb,
    void *param)
{
	struct net_backend **pbe, *nbe, *tbe = NULL;
	const char *value, *type;
	char *devname;
	int err;

	value = get_config_value_node(nvl, "backend");
	if (value == NULL) {
		return (-1);
	}
	devname = strdup(value);

	/*
	 * Use the type given by the configuration if it exists; otherwise
	 * use the prefix of the backend as the type.
	 */
	type = get_config_value_node(nvl, "type");
	if (type == NULL)
		type = devname;

	/*
	 * Find the network backend that matches the user-provided
	 * device name.  net_backend_set is built using a linker set.
	 */
	SET_FOREACH(pbe, net_backend_set) {
		if (strncmp(type, (*pbe)->prefix,
		    strlen((*pbe)->prefix)) == 0) {
			tbe = *pbe;
			assert(tbe->init != NULL);
			assert(tbe->cleanup != NULL);
			assert(tbe->send != NULL);
			assert(tbe->recv != NULL);
			assert(tbe->get_cap != NULL);
			assert(tbe->set_cap != NULL);
			break;
		}
	}

	*ret = NULL;
	if (tbe == NULL) {
		free(devname);
		return (EINVAL);
	}

	nbe = calloc(1, NET_BE_SIZE(tbe));
	*nbe = *tbe;	/* copy the template */
	nbe->fd = -1;
	nbe->sc = param;
	nbe->be_vnet_hdr_len = 0;
	nbe->fe_vnet_hdr_len = 0;

	/* Initialize the backend. */
	err = nbe->init(nbe, devname, nvl, cb, param);
	if (err) {
		free(devname);
		free(nbe);
		return (err);
	}

	*ret = nbe;
	free(devname);

	return (0);
}

void
netbe_cleanup(struct net_backend *be)
{

	if (be != NULL) {
		be->cleanup(be);
		free(be);
	}
}

uint64_t
netbe_get_cap(struct net_backend *be)
{

	assert(be != NULL);
	return (be->get_cap(be));
}

int
netbe_set_cap(struct net_backend *be, uint64_t features,
    unsigned vnet_hdr_len)
{
	int ret;

	assert(be != NULL);

	/* There are only three valid lengths, i.e., 0, 10 and 12. */
	if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN
	    && vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t)))
		return (-1);

	be->fe_vnet_hdr_len = vnet_hdr_len;

	ret = be->set_cap(be, features, vnet_hdr_len);
	assert(be->be_vnet_hdr_len == 0 ||
	    be->be_vnet_hdr_len == be->fe_vnet_hdr_len);

	return (ret);
}

ssize_t
netbe_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
{

	return (be->send(be, iov, iovcnt));
}

ssize_t
netbe_peek_recvlen(struct net_backend *be)
{

	return (be->peek_recvlen(be));
}

/*
 * Try to read a packet from the backend, without blocking.
 * If no packets are available, return 0.  In case of success, return
 * the length of the packet just read.  Return -1 in case of errors.
 */
ssize_t
netbe_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
{

	return (be->recv(be, iov, iovcnt));
}

/*
 * Read a packet from the backend and discard it.
 * Returns the size of the discarded packet or zero if no packet was
 * available.  A negative error code is returned in case of read error.
 */
ssize_t
netbe_rx_discard(struct net_backend *be)
{
	/*
	 * MP note: the dummybuf is only used to discard frames,
	 * so there is no need for it to be per-vtnet or locked.
	 * We only make it large enough for a TSO-sized segment.
	 */
	static uint8_t dummybuf[65536 + 64];
	struct iovec iov;

	iov.iov_base = dummybuf;
	iov.iov_len = sizeof(dummybuf);

	return (netbe_recv(be, &iov, 1));
}

void
netbe_rx_disable(struct net_backend *be)
{

	be->recv_disable(be);
}

void
netbe_rx_enable(struct net_backend *be)
{

	be->recv_enable(be);
}

size_t
netbe_get_vnet_hdr_len(struct net_backend *be)
{

	return (be->be_vnet_hdr_len);
}
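
/*
 * Usage sketch (illustrative, not compiled in): the expected call
 * sequence from a frontend such as virtio-net.  'frontend_softc',
 * 'rx_callback' and 'frontend_attach' are hypothetical stand-ins for
 * frontend code, not names defined elsewhere.
 */
#if 0
static void
frontend_attach(struct frontend_softc *sc, nvlist_t *nvl)
{
	uint64_t features;

	if (netbe_init(&sc->be, nvl, rx_callback, sc) != 0)
		return;			/* no backend configured */

	/* Negotiate offloads and the vnet header length (0, 10 or 12). */
	features = netbe_get_cap(sc->be);
	netbe_set_cap(sc->be, features, VNET_HDR_LEN);

	/* Start delivering receive events to rx_callback(). */
	netbe_rx_enable(sc->be);
}
#endif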