/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2019 Vincenzo Maffione <vmaffione@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
 * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
 * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * This file implements multiple network backends (tap, netmap, ...),
 * to be used by network frontends such as virtio-net and e1000.
 * The API to access the backend (e.g. send/receive packets, negotiate
 * features) is exported by net_backends.h.
 */

#include <sys/cdefs.h>
#include <sys/types.h>		/* u_short etc */
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/uio.h>

#include <net/if.h>
#if defined(INET6) || defined(INET)
#include <net/if_tap.h>
#endif
#include <net/netmap.h>
#include <net/netmap_virt.h>
#define NETMAP_WITH_LIBS
#include <net/netmap_user.h>

#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sysexits.h>
#include <assert.h>
#include <pthread.h>
#include <pthread_np.h>
#include <poll.h>

#ifdef NETGRAPH
#include <sys/param.h>
#include <sys/sysctl.h>
#include <netgraph.h>
#endif

#include "config.h"
#include "debug.h"
#include "iov.h"
#include "mevent.h"
#include "net_backends.h"
#include "pci_emul.h"

#include <sys/linker_set.h>

/*
 * Each network backend registers a set of function pointers that are
 * used to implement the net backends API.
 * This might need to be exposed if we implement backends in separate files.
 */
struct net_backend {
	const char *prefix;	/* prefix matching this backend */

	/*
	 * Routines used to initialize and cleanup the resources needed
	 * by a backend. The cleanup function is used internally,
	 * and should not be called by the frontend.
	 */
	int (*init)(struct net_backend *be, const char *devname,
	    nvlist_t *nvl, net_be_rxeof_t cb, void *param);
	void (*cleanup)(struct net_backend *be);

	/*
	 * Called to serve a guest transmit request. The scatter-gather
	 * vector provided by the caller has 'iovcnt' elements and contains
	 * the packet to send.
	 */
	ssize_t (*send)(struct net_backend *be, const struct iovec *iov,
	    int iovcnt);

	/*
	 * Get the length of the next packet that can be received from
	 * the backend. If no packets are currently available, this
	 * function returns 0.
	 */
	ssize_t (*peek_recvlen)(struct net_backend *be);

	/*
	 * Called to receive a packet from the backend. When the function
	 * returns a positive value 'len', the scatter-gather vector
	 * provided by the caller contains a packet with such length.
	 * The function returns 0 if the backend doesn't have a new packet to
	 * receive.
	 */
	ssize_t (*recv)(struct net_backend *be, const struct iovec *iov,
	    int iovcnt);

	/*
	 * Ask the backend to enable or disable receive operation in the
	 * backend. On return from a disable operation, it is guaranteed
	 * that the receive callback won't be called until receive is
	 * enabled again. Note however that it is up to the caller to make
	 * sure that netbe_recv() is not currently being executed by another
	 * thread.
	 */
	void (*recv_enable)(struct net_backend *be);
	void (*recv_disable)(struct net_backend *be);

	/*
	 * Ask the backend for the virtio-net features it is able to
	 * support. Possible features are TSO, UFO and checksum offloading
	 * in both rx and tx directions and for both IPv4 and IPv6.
	 */
	uint64_t (*get_cap)(struct net_backend *be);

	/*
	 * Tell the backend to enable/disable the specified virtio-net
	 * features (capabilities).
	 */
	int (*set_cap)(struct net_backend *be, uint64_t features,
	    unsigned int vnet_hdr_len);

	struct pci_vtnet_softc *sc;
	int fd;

	/*
	 * Length of the virtio-net header used by the backend and the
	 * frontend, respectively. A zero value means that the header
	 * is not used.
	 */
	unsigned int be_vnet_hdr_len;
	unsigned int fe_vnet_hdr_len;

	/* Size of backend-specific private data. */
	size_t priv_size;

	/* Backend-specific private data follows. */
};

#define	NET_BE_PRIV(be)		((void *)((be) + 1))
#define	NET_BE_SIZE(be)		(sizeof(*be) + (be)->priv_size)

SET_DECLARE(net_backend_set, struct net_backend);

#define	VNET_HDR_LEN	sizeof(struct virtio_net_rxhdr)

#define	WPRINTF(params) PRINTLN params
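
/*
 * Illustrative sketch (not compiled in; all "foo" names are hypothetical):
 * a new backend defines its private state, fills in a struct net_backend
 * template and registers it on the linker set:
 *
 *	struct foo_priv { int foo_fd; };
 *
 *	static struct net_backend foo_backend = {
 *		.prefix = "foo",
 *		.priv_size = sizeof(struct foo_priv),
 *		.init = foo_init,
 *		...
 *	};
 *	DATA_SET(net_backend_set, foo_backend);
 *
 * netbe_init() below copies the matching template into a buffer of
 * NET_BE_SIZE(be) bytes, so the private area sits immediately after the
 * struct and is reached with NET_BE_PRIV(be).
 */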

/*
 * The tap backend
 */

#if defined(INET6) || defined(INET)
static const int pf_list[] = {
#if defined(INET6)
	PF_INET6,
#endif
#if defined(INET)
	PF_INET,
#endif
};
#endif

struct tap_priv {
	struct mevent *mevp;
	/*
	 * A bounce buffer that allows us to implement the peek_recvlen
	 * callback. In the future we may get the same information from
	 * the kevent data.
	 */
	char bbuf[1 << 16];
	ssize_t bbuflen;
};
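
/*
 * Why the bounce buffer: a read(2) on a tap device consumes the frame and
 * there is no MSG_PEEK-style way to query the next frame's length. So
 * tap_peek_recvlen() pulls the frame into bbuf and reports its length, and
 * the subsequent tap_recv() drains bbuf before reading the device again.
 */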

static void
tap_cleanup(struct net_backend *be)
{
	struct tap_priv *priv = NET_BE_PRIV(be);

	if (priv->mevp) {
		mevent_delete(priv->mevp);
	}
	if (be->fd != -1) {
		close(be->fd);
		be->fd = -1;
	}
}

static int
tap_init(struct net_backend *be, const char *devname,
    nvlist_t *nvl __unused, net_be_rxeof_t cb, void *param)
{
	struct tap_priv *priv = NET_BE_PRIV(be);
	char tbuf[80];
	int opt = 1;
#if defined(INET6) || defined(INET)
	struct ifreq ifrq;
	int s;
#endif
#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
#endif

	if (cb == NULL) {
		WPRINTF(("TAP backend requires non-NULL callback"));
		return (-1);
	}

	strcpy(tbuf, "/dev/");
	strlcat(tbuf, devname, sizeof(tbuf));

	be->fd = open(tbuf, O_RDWR);
	if (be->fd == -1) {
		WPRINTF(("open of tap device %s failed", tbuf));
		goto error;
	}

	/*
	 * Set non-blocking and register for read
	 * notifications with the event loop.
	 */
	if (ioctl(be->fd, FIONBIO, &opt) < 0) {
		WPRINTF(("tap device O_NONBLOCK failed"));
		goto error;
	}

#if defined(INET6) || defined(INET)
	/*
	 * Try to UP the interface rather than relying on
	 * net.link.tap.up_on_open.
	 */
	bzero(&ifrq, sizeof(ifrq));
	if (ioctl(be->fd, TAPGIFNAME, &ifrq) < 0) {
		WPRINTF(("Could not get interface name"));
		goto error;
	}

	s = -1;
	for (size_t i = 0; s == -1 && i < nitems(pf_list); i++)
		s = socket(pf_list[i], SOCK_DGRAM, 0);
	if (s == -1) {
		WPRINTF(("Could not open socket"));
		goto error;
	}

	if (ioctl(s, SIOCGIFFLAGS, &ifrq) < 0) {
		(void)close(s);
		WPRINTF(("Could not get interface flags"));
		goto error;
	}
	ifrq.ifr_flags |= IFF_UP;
	if (ioctl(s, SIOCSIFFLAGS, &ifrq) < 0) {
		(void)close(s);
		WPRINTF(("Could not set interface flags"));
		goto error;
	}
	(void)close(s);
#endif

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
	if (caph_rights_limit(be->fd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	memset(priv->bbuf, 0, sizeof(priv->bbuf));
	priv->bbuflen = 0;

	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
	if (priv->mevp == NULL) {
		WPRINTF(("Could not register event"));
		goto error;
	}

	return (0);

error:
	tap_cleanup(be);
	return (-1);
}
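
/*
 * Usage sketch (slot and device names are up to the user): the tap backend
 * is selected with a configuration such as
 *
 *	bhyve -s 2:0,virtio-net,tap0 ... vmname
 *
 * which makes tap_init() open /dev/tap0. The SIOCGIFFLAGS/SIOCSIFFLAGS
 * sequence above is the programmatic equivalent of "ifconfig tap0 up".
 */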

/*
 * Called to send a buffer chain out to the tap device
 */
static ssize_t
tap_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
{
	return (writev(be->fd, iov, iovcnt));
}

static ssize_t
tap_peek_recvlen(struct net_backend *be)
{
	struct tap_priv *priv = NET_BE_PRIV(be);
	ssize_t ret;

	if (priv->bbuflen > 0) {
		/*
		 * We already have a packet in the bounce buffer.
		 * Just return its length.
		 */
		return (priv->bbuflen);
	}

	/*
	 * Read the next packet (if any) into the bounce buffer, so
	 * that we get to know its length and we can return that
	 * to the caller.
	 */
	ret = read(be->fd, priv->bbuf, sizeof(priv->bbuf));
	if (ret < 0 && errno == EWOULDBLOCK) {
		return (0);
	}

	if (ret > 0)
		priv->bbuflen = ret;

	return (ret);
}

static ssize_t
tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
{
	struct tap_priv *priv = NET_BE_PRIV(be);
	ssize_t ret;

	if (priv->bbuflen > 0) {
		/*
		 * A packet is available in the bounce buffer, so
		 * we read it from there.
		 */
		ret = buf_to_iov(priv->bbuf, priv->bbuflen,
		    iov, iovcnt, 0);

		/* Mark the bounce buffer as empty. */
		priv->bbuflen = 0;

		return (ret);
	}

	ret = readv(be->fd, iov, iovcnt);
	if (ret < 0 && errno == EWOULDBLOCK) {
		return (0);
	}

	return (ret);
}

static void
tap_recv_enable(struct net_backend *be)
{
	struct tap_priv *priv = NET_BE_PRIV(be);

	mevent_enable(priv->mevp);
}

static void
tap_recv_disable(struct net_backend *be)
{
	struct tap_priv *priv = NET_BE_PRIV(be);

	mevent_disable(priv->mevp);
}

static uint64_t
tap_get_cap(struct net_backend *be __unused)
{

	return (0); /* no capabilities for now */
}

static int
tap_set_cap(struct net_backend *be __unused, uint64_t features,
    unsigned vnet_hdr_len)
{

	return ((features || vnet_hdr_len) ? -1 : 0);
}

static struct net_backend tap_backend = {
	.prefix = "tap",
	.priv_size = sizeof(struct tap_priv),
	.init = tap_init,
	.cleanup = tap_cleanup,
	.send = tap_send,
	.peek_recvlen = tap_peek_recvlen,
	.recv = tap_recv,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,
};

/* A clone of the tap backend, with a different prefix. */
static struct net_backend vmnet_backend = {
	.prefix = "vmnet",
	.priv_size = sizeof(struct tap_priv),
	.init = tap_init,
	.cleanup = tap_cleanup,
	.send = tap_send,
	.peek_recvlen = tap_peek_recvlen,
	.recv = tap_recv,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,
};

DATA_SET(net_backend_set, tap_backend);
DATA_SET(net_backend_set, vmnet_backend);
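
/*
 * Note on prefix matching: netbe_init() selects a backend by comparing the
 * user-supplied name against each registered prefix with strncmp(), so a
 * device name like "tap3" picks tap_backend while "vmnet0" picks the vmnet
 * clone above. On FreeBSD the vmnet devices are provided by the same
 * tap(4) driver, which is why the two backends can share all callbacks.
 */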

#ifdef NETGRAPH

/*
 * Netgraph backend
 */

#define NG_SBUF_MAX_SIZE (4 * 1024 * 1024)

static int
ng_init(struct net_backend *be, const char *devname __unused,
    nvlist_t *nvl, net_be_rxeof_t cb, void *param)
{
	struct tap_priv *p = NET_BE_PRIV(be);
	struct ngm_connect ngc;
	const char *value, *nodename;
	int sbsz;
	int ctrl_sock;
	int flags;
	unsigned long maxsbsz;
	size_t msbsz;
#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
#endif

	if (cb == NULL) {
		WPRINTF(("Netgraph backend requires non-NULL callback"));
		return (-1);
	}

	be->fd = -1;

	memset(&ngc, 0, sizeof(ngc));

	value = get_config_value_node(nvl, "path");
	if (value == NULL) {
		WPRINTF(("path must be provided"));
		return (-1);
	}
	strncpy(ngc.path, value, NG_PATHSIZ - 1);

	value = get_config_value_node(nvl, "hook");
	if (value == NULL)
		value = "vmlink";
	strncpy(ngc.ourhook, value, NG_HOOKSIZ - 1);

	value = get_config_value_node(nvl, "peerhook");
	if (value == NULL) {
		WPRINTF(("peer hook must be provided"));
		return (-1);
	}
	strncpy(ngc.peerhook, value, NG_HOOKSIZ - 1);

	nodename = get_config_value_node(nvl, "socket");
	if (NgMkSockNode(nodename,
	    &ctrl_sock, &be->fd) < 0) {
		WPRINTF(("can't get Netgraph sockets"));
		return (-1);
	}

	if (NgSendMsg(ctrl_sock, ".",
	    NGM_GENERIC_COOKIE,
	    NGM_CONNECT, &ngc, sizeof(ngc)) < 0) {
		WPRINTF(("can't connect to node"));
		close(ctrl_sock);
		goto error;
	}

	close(ctrl_sock);

	flags = fcntl(be->fd, F_GETFL);

	if (flags < 0) {
		WPRINTF(("can't get socket flags"));
		goto error;
	}

	if (fcntl(be->fd, F_SETFL, flags | O_NONBLOCK) < 0) {
		WPRINTF(("can't set O_NONBLOCK flag"));
		goto error;
	}

	/*
	 * The default ng_socket(4) buffer size is too low.
	 * Calculate the minimum value between NG_SBUF_MAX_SIZE
	 * and kern.ipc.maxsockbuf.
	 */
	msbsz = sizeof(maxsbsz);
	if (sysctlbyname("kern.ipc.maxsockbuf", &maxsbsz, &msbsz,
	    NULL, 0) < 0) {
		WPRINTF(("can't get 'kern.ipc.maxsockbuf' value"));
		goto error;
	}

	/*
	 * We can't set the socket buffer size to the kern.ipc.maxsockbuf
	 * value, as it takes into account the mbuf(9) overhead.
	 */
	maxsbsz = maxsbsz * MCLBYTES / (MSIZE + MCLBYTES);

	sbsz = MIN(NG_SBUF_MAX_SIZE, maxsbsz);
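	/*
	 * Worked example (assuming the common MSIZE = 256 and
	 * MCLBYTES = 2048): with the default kern.ipc.maxsockbuf of
	 * 2097152 bytes, the scaling above yields
	 * 2097152 * 2048 / 2304 = 1864135, so sbsz ends up below
	 * NG_SBUF_MAX_SIZE and well within the sysctl limit.
	 */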
	if (setsockopt(be->fd, SOL_SOCKET, SO_SNDBUF, &sbsz,
	    sizeof(sbsz)) < 0) {
		WPRINTF(("can't set TX buffer size"));
		goto error;
	}

	if (setsockopt(be->fd, SOL_SOCKET, SO_RCVBUF, &sbsz,
	    sizeof(sbsz)) < 0) {
		WPRINTF(("can't set RX buffer size"));
		goto error;
	}

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
	if (caph_rights_limit(be->fd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	memset(p->bbuf, 0, sizeof(p->bbuf));
	p->bbuflen = 0;

	p->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
	if (p->mevp == NULL) {
		WPRINTF(("Could not register event"));
		goto error;
	}

	return (0);

error:
	tap_cleanup(be);
	return (-1);
}

static struct net_backend ng_backend = {
	.prefix = "netgraph",
	.priv_size = sizeof(struct tap_priv),
	.init = ng_init,
	.cleanup = tap_cleanup,
	.send = tap_send,
	.peek_recvlen = tap_peek_recvlen,
	.recv = tap_recv,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,
};

DATA_SET(net_backend_set, ng_backend);

#endif /* NETGRAPH */
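
/*
 * Usage sketch for the netgraph backend (node and hook names here are made
 * up): given an existing ng_bridge(4) node "vmbridge", a configuration
 * such as
 *
 *	bhyve -s 2:0,virtio-net,netgraph,path=vmbridge:,peerhook=link2 ...
 *
 * makes ng_init() create an ng_socket(4) node, connect its local hook
 * (default "vmlink") to hook "link2" of "vmbridge", and exchange Ethernet
 * frames over the resulting data socket using the tap callbacks.
 */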

/*
 * The netmap backend
 */

/* The virtio-net features supported by netmap. */
#define NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \
		VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \
		VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 | \
		VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO)

struct netmap_priv {
	char ifname[IFNAMSIZ];
	struct nm_desc *nmd;
	uint16_t memid;
	struct netmap_ring *rx;
	struct netmap_ring *tx;
	struct mevent *mevp;
	net_be_rxeof_t cb;
	void *cb_param;
};

static void
nmreq_init(struct nmreq *req, char *ifname)
{

	memset(req, 0, sizeof(*req));
	strlcpy(req->nr_name, ifname, sizeof(req->nr_name));
	req->nr_version = NETMAP_API;
}

static int
netmap_set_vnet_hdr_len(struct net_backend *be, int vnet_hdr_len)
{
	int err;
	struct nmreq req;
	struct netmap_priv *priv = NET_BE_PRIV(be);

	nmreq_init(&req, priv->ifname);
	req.nr_cmd = NETMAP_BDG_VNET_HDR;
	req.nr_arg1 = vnet_hdr_len;
	err = ioctl(be->fd, NIOCREGIF, &req);
	if (err) {
		WPRINTF(("Unable to set vnet header length %d",
		    vnet_hdr_len));
		return (err);
	}

	be->be_vnet_hdr_len = vnet_hdr_len;

	return (0);
}

static int
netmap_has_vnet_hdr_len(struct net_backend *be, unsigned vnet_hdr_len)
{
	unsigned prev_hdr_len = be->be_vnet_hdr_len;
	int ret;

	if (vnet_hdr_len == prev_hdr_len) {
		return (1);
	}

	/*
	 * Probe by temporarily setting the requested length; on success,
	 * restore the previous value before reporting support.
	 */
	ret = netmap_set_vnet_hdr_len(be, vnet_hdr_len);
	if (ret) {
		return (0);
	}

	netmap_set_vnet_hdr_len(be, prev_hdr_len);

	return (1);
}

static uint64_t
netmap_get_cap(struct net_backend *be)
{

	return (netmap_has_vnet_hdr_len(be, VNET_HDR_LEN) ?
	    NETMAP_FEATURES : 0);
}

static int
netmap_set_cap(struct net_backend *be, uint64_t features __unused,
    unsigned vnet_hdr_len)
{

	return (netmap_set_vnet_hdr_len(be, vnet_hdr_len));
}

static int
netmap_init(struct net_backend *be, const char *devname,
    nvlist_t *nvl __unused, net_be_rxeof_t cb, void *param)
{
	struct netmap_priv *priv = NET_BE_PRIV(be);

	strlcpy(priv->ifname, devname, sizeof(priv->ifname));
	priv->ifname[sizeof(priv->ifname) - 1] = '\0';

	priv->nmd = nm_open(priv->ifname, NULL, NETMAP_NO_TX_POLL, NULL);
	if (priv->nmd == NULL) {
		WPRINTF(("Unable to nm_open(): interface '%s', errno (%s)",
		    devname, strerror(errno)));
		return (-1);
	}

	priv->memid = priv->nmd->req.nr_arg2;
	priv->tx = NETMAP_TXRING(priv->nmd->nifp, 0);
	priv->rx = NETMAP_RXRING(priv->nmd->nifp, 0);
	priv->cb = cb;
	priv->cb_param = param;
	be->fd = priv->nmd->fd;

	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
	if (priv->mevp == NULL) {
		WPRINTF(("Could not register event"));
		return (-1);
	}

	return (0);
}

static void
netmap_cleanup(struct net_backend *be)
{
	struct netmap_priv *priv = NET_BE_PRIV(be);

	if (priv->mevp) {
		mevent_delete(priv->mevp);
	}
	if (priv->nmd) {
		nm_close(priv->nmd);
	}
	be->fd = -1;
}

static ssize_t
netmap_send(struct net_backend *be, const struct iovec *iov,
    int iovcnt)
{
	struct netmap_priv *priv = NET_BE_PRIV(be);
	struct netmap_ring *ring;
	ssize_t totlen = 0;
	int nm_buf_size;
	int nm_buf_len;
	uint32_t head;
	uint8_t *nm_buf;
	int j;

	ring = priv->tx;
	head = ring->head;
	if (head == ring->tail) {
		WPRINTF(("No space, drop %zu bytes", count_iov(iov, iovcnt)));
		goto txsync;
	}
	nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
	nm_buf_size = ring->nr_buf_size;
	nm_buf_len = 0;

	for (j = 0; j < iovcnt; j++) {
		uint8_t *iov_frag_buf = iov[j].iov_base;
		int iov_frag_size = iov[j].iov_len;

		totlen += iov_frag_size;

		/*
		 * Split each iovec fragment over multiple netmap slots, if
		 * necessary.
		 */
		for (;;) {
			int copylen;

			copylen = iov_frag_size < nm_buf_size ?
			    iov_frag_size : nm_buf_size;
			memcpy(nm_buf, iov_frag_buf, copylen);

			iov_frag_buf += copylen;
			iov_frag_size -= copylen;
			nm_buf += copylen;
			nm_buf_size -= copylen;
			nm_buf_len += copylen;

			if (iov_frag_size == 0) {
				break;
			}

			ring->slot[head].len = nm_buf_len;
			ring->slot[head].flags = NS_MOREFRAG;
			head = nm_ring_next(ring, head);
			if (head == ring->tail) {
				/*
				 * We ran out of netmap slots while
				 * splitting the iovec fragments.
				 */
				WPRINTF(("No space, drop %zu bytes",
				    count_iov(iov, iovcnt)));
				goto txsync;
			}
			nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
			nm_buf_size = ring->nr_buf_size;
			nm_buf_len = 0;
		}
	}

	/* Complete the last slot, which must not have NS_MOREFRAG set. */
	ring->slot[head].len = nm_buf_len;
	ring->slot[head].flags = 0;
	head = nm_ring_next(ring, head);

	/* Now update ring->head and ring->cur. */
	ring->head = ring->cur = head;
txsync:
	ioctl(be->fd, NIOCTXSYNC, NULL);

	return (totlen);
}
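
/*
 * Slot layout sketch for netmap_send() (assuming the typical 2048-byte
 * netmap buffers, i.e. nr_buf_size == 2048): a 5000-byte packet occupies
 * three consecutive slots,
 *
 *	slot[h+0]: len = 2048, flags = NS_MOREFRAG
 *	slot[h+1]: len = 2048, flags = NS_MOREFRAG
 *	slot[h+2]: len =  904, flags = 0
 *
 * and netmap_peek_recvlen()/netmap_recv() below reassemble such chains by
 * walking the slots until one without NS_MOREFRAG is found.
 */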

static ssize_t
netmap_peek_recvlen(struct net_backend *be)
{
	struct netmap_priv *priv = NET_BE_PRIV(be);
	struct netmap_ring *ring = priv->rx;
	uint32_t head = ring->head;
	ssize_t totlen = 0;

	while (head != ring->tail) {
		struct netmap_slot *slot = ring->slot + head;

		totlen += slot->len;
		if ((slot->flags & NS_MOREFRAG) == 0)
			break;
		head = nm_ring_next(ring, head);
	}

	return (totlen);
}

static ssize_t
netmap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
{
	struct netmap_priv *priv = NET_BE_PRIV(be);
	struct netmap_slot *slot = NULL;
	struct netmap_ring *ring;
	uint8_t *iov_frag_buf;
	int iov_frag_size;
	ssize_t totlen = 0;
	uint32_t head;

	assert(iovcnt);

	ring = priv->rx;
	head = ring->head;
	iov_frag_buf = iov->iov_base;
	iov_frag_size = iov->iov_len;

	do {
		uint8_t *nm_buf;
		int nm_buf_len;

		if (head == ring->tail) {
			return (0);
		}

		slot = ring->slot + head;
		nm_buf = NETMAP_BUF(ring, slot->buf_idx);
		nm_buf_len = slot->len;

		for (;;) {
			int copylen = nm_buf_len < iov_frag_size ?
			    nm_buf_len : iov_frag_size;

			memcpy(iov_frag_buf, nm_buf, copylen);
			nm_buf += copylen;
			nm_buf_len -= copylen;
			iov_frag_buf += copylen;
			iov_frag_size -= copylen;
			totlen += copylen;

			if (nm_buf_len == 0) {
				break;
			}

			iov++;
			iovcnt--;
			if (iovcnt == 0) {
				/* No space to receive. */
				WPRINTF(("Short iov, drop %zd bytes",
				    totlen));
				return (-ENOSPC);
			}
			iov_frag_buf = iov->iov_base;
			iov_frag_size = iov->iov_len;
		}

		head = nm_ring_next(ring, head);

	} while (slot->flags & NS_MOREFRAG);

	/* Release slots to netmap. */
	ring->head = ring->cur = head;

	return (totlen);
}

static void
netmap_recv_enable(struct net_backend *be)
{
	struct netmap_priv *priv = NET_BE_PRIV(be);

	mevent_enable(priv->mevp);
}

static void
netmap_recv_disable(struct net_backend *be)
{
	struct netmap_priv *priv = NET_BE_PRIV(be);

	mevent_disable(priv->mevp);
}

static struct net_backend netmap_backend = {
	.prefix = "netmap",
	.priv_size = sizeof(struct netmap_priv),
	.init = netmap_init,
	.cleanup = netmap_cleanup,
	.send = netmap_send,
	.peek_recvlen = netmap_peek_recvlen,
	.recv = netmap_recv,
	.recv_enable = netmap_recv_enable,
	.recv_disable = netmap_recv_disable,
	.get_cap = netmap_get_cap,
	.set_cap = netmap_set_cap,
};

/* A clone of the netmap backend, with a different prefix. */
static struct net_backend vale_backend = {
	.prefix = "vale",
	.priv_size = sizeof(struct netmap_priv),
	.init = netmap_init,
	.cleanup = netmap_cleanup,
	.send = netmap_send,
	.peek_recvlen = netmap_peek_recvlen,
	.recv = netmap_recv,
	.recv_enable = netmap_recv_enable,
	.recv_disable = netmap_recv_disable,
	.get_cap = netmap_get_cap,
	.set_cap = netmap_set_cap,
};

DATA_SET(net_backend_set, netmap_backend);
DATA_SET(net_backend_set, vale_backend);
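
/*
 * Naming sketch: the device name is passed to nm_open() verbatim, so a
 * name like "netmap:em0" attaches to a host NIC in netmap mode, while a
 * name such as "vale0:vm1" (a hypothetical switch/port pair) creates or
 * joins a port of the vale0 software switch. The "vale" prefix above
 * exists only so that such names match a registered backend.
 */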

int
netbe_legacy_config(nvlist_t *nvl, const char *opts)
{
	char *backend, *cp;

	if (opts == NULL)
		return (0);

	cp = strchr(opts, ',');
	if (cp == NULL) {
		set_config_value_node(nvl, "backend", opts);
		return (0);
	}
	backend = strndup(opts, cp - opts);
	set_config_value_node(nvl, "backend", backend);
	free(backend);
	return (pci_parse_legacy_config(nvl, cp + 1));
}
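
/*
 * Worked example (MAC address made up): for the legacy option string
 * "tap0,mac=00:a0:98:11:22:33", everything before the first comma becomes
 * the value of the "backend" config node ("tap0"), and the remainder is
 * handed to pci_parse_legacy_config(), which parses the remaining
 * comma-separated key=value pairs into the same nvlist.
 */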

/*
 * Initialize a backend and attach to the frontend.
 * This is called during frontend initialization.
 *  @ret is a pointer to the backend to be initialized
 *  @devname is the backend-name as supplied on the command line,
 *	e.g. -s 2:0,frontend-name,backend-name[,other-args]
 *  @cb is the receive callback supplied by the frontend,
 *	and it is invoked in the event loop when a receive
 *	event is generated in the hypervisor,
 *  @param is a pointer to the frontend, and normally used as
 *	the argument for the callback.
 */
int
netbe_init(struct net_backend **ret, nvlist_t *nvl, net_be_rxeof_t cb,
    void *param)
{
	struct net_backend **pbe, *nbe, *tbe = NULL;
	const char *value, *type;
	char *devname;
	int err;

	value = get_config_value_node(nvl, "backend");
	if (value == NULL) {
		return (-1);
	}
	devname = strdup(value);

	/*
	 * Use the type given by the configuration if it exists; otherwise
	 * use the prefix of the backend as the type.
	 */
	type = get_config_value_node(nvl, "type");
	if (type == NULL)
		type = devname;

	/*
	 * Find the network backend that matches the user-provided
	 * device name. net_backend_set is built using a linker set.
	 */
	SET_FOREACH(pbe, net_backend_set) {
		if (strncmp(type, (*pbe)->prefix,
		    strlen((*pbe)->prefix)) == 0) {
			tbe = *pbe;
			assert(tbe->init != NULL);
			assert(tbe->cleanup != NULL);
			assert(tbe->send != NULL);
			assert(tbe->recv != NULL);
			assert(tbe->get_cap != NULL);
			assert(tbe->set_cap != NULL);
			break;
		}
	}

	*ret = NULL;
	if (tbe == NULL) {
		free(devname);
		return (EINVAL);
	}

	nbe = calloc(1, NET_BE_SIZE(tbe));
	*nbe = *tbe;	/* copy the template */
	nbe->fd = -1;
	nbe->sc = param;
	nbe->be_vnet_hdr_len = 0;
	nbe->fe_vnet_hdr_len = 0;

	/* Initialize the backend. */
	err = nbe->init(nbe, devname, nvl, cb, param);
	if (err) {
		free(devname);
		free(nbe);
		return (err);
	}

	*ret = nbe;
	free(devname);

	return (0);
}

void
netbe_cleanup(struct net_backend *be)
{

	if (be != NULL) {
		be->cleanup(be);
		free(be);
	}
}

uint64_t
netbe_get_cap(struct net_backend *be)
{

	assert(be != NULL);
	return (be->get_cap(be));
}

int
netbe_set_cap(struct net_backend *be, uint64_t features,
    unsigned vnet_hdr_len)
{
	int ret;

	assert(be != NULL);

	/*
	 * There are only three valid lengths, i.e., 0 (no header),
	 * 10 (struct virtio_net_hdr) and 12 (the same header plus the
	 * num_buffers field used with mergeable rx buffers).
	 */
	if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN
	    && vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t)))
		return (-1);

	be->fe_vnet_hdr_len = vnet_hdr_len;

	ret = be->set_cap(be, features, vnet_hdr_len);
	assert(be->be_vnet_hdr_len == 0 ||
	    be->be_vnet_hdr_len == be->fe_vnet_hdr_len);

	return (ret);
}

ssize_t
netbe_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
{

	return (be->send(be, iov, iovcnt));
}

ssize_t
netbe_peek_recvlen(struct net_backend *be)
{

	return (be->peek_recvlen(be));
}

/*
 * Try to read a packet from the backend, without blocking.
 * If no packets are available, return 0. In case of success, return
 * the length of the packet just read. Return -1 in case of errors.
 */
ssize_t
netbe_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
{

	return (be->recv(be, iov, iovcnt));
}

/*
 * Read a packet from the backend and discard it.
 * Returns the size of the discarded packet or zero if no packet was
 * available. A negative error code is returned in case of read error.
 */
ssize_t
netbe_rx_discard(struct net_backend *be)
{
	/*
	 * MP note: the dummybuf is only used to discard frames,
	 * so there is no need for it to be per-vtnet or locked.
	 * We only make it large enough for a TSO-sized segment.
	 */
	static uint8_t dummybuf[65536 + 64];
	struct iovec iov;

	iov.iov_base = dummybuf;
	iov.iov_len = sizeof(dummybuf);

	return (netbe_recv(be, &iov, 1));
}

void
netbe_rx_disable(struct net_backend *be)
{

	be->recv_disable(be);
}

void
netbe_rx_enable(struct net_backend *be)
{

	be->recv_enable(be);
}

size_t
netbe_get_vnet_hdr_len(struct net_backend *be)
{

	return (be->be_vnet_hdr_len);
}
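
/*
 * Lifecycle sketch, as seen from a frontend (grounded in the API above;
 * error handling omitted): the frontend calls netbe_init() with its rx
 * callback, negotiates offloads via netbe_get_cap()/netbe_set_cap(), then
 * enables reception with netbe_rx_enable(). The event loop invokes the
 * callback when the backend has traffic, and the callback typically uses
 * netbe_peek_recvlen() to size guest buffers before draining packets with
 * netbe_recv(), or netbe_rx_discard() when no guest buffers are available.
 * Transmit is a direct netbe_send(), and netbe_cleanup() tears everything
 * down.
 */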