/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2019 Vincenzo Maffione <vmaffione@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
 * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
 * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * $FreeBSD$
 */

/*
 * This file implements multiple network backends (tap, netmap, ...),
 * to be used by network frontends such as virtio-net and e1000.
 * The API to access the backend (e.g. send/receive packets, negotiate
 * features) is exported by net_backends.h.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/types.h>		/* u_short etc */
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/uio.h>

#include <net/if.h>
#if defined(INET6) || defined(INET)
#include <net/if_tap.h>
#endif
#include <net/netmap.h>
#include <net/netmap_virt.h>
#define NETMAP_WITH_LIBS
#include <net/netmap_user.h>

#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sysexits.h>
#include <assert.h>
#include <pthread.h>
#include <pthread_np.h>
#include <poll.h>

#ifdef NETGRAPH
#include <sys/param.h>
#include <sys/sysctl.h>
#include <netgraph.h>
#endif

#include "config.h"
#include "debug.h"
#include "iov.h"
#include "mevent.h"
#include "net_backends.h"
#include "pci_emul.h"

#include <sys/linker_set.h>

/*
 * Each network backend registers a set of function pointers that are
 * used to implement the net backends API.  This struct may need to be
 * exposed in a header if backends are ever implemented in separate
 * files.
 */
struct net_backend {
	const char *prefix;	/* prefix matching this backend */

	/*
	 * Routines used to initialize and cleanup the resources needed
	 * by a backend. The cleanup function is used internally,
	 * and should not be called by the frontend.
	 */
	int (*init)(struct net_backend *be, const char *devname,
	    nvlist_t *nvl, net_be_rxeof_t cb, void *param);
	void (*cleanup)(struct net_backend *be);

	/*
	 * Called to serve a guest transmit request. The scatter-gather
	 * vector provided by the caller has 'iovcnt' elements and contains
	 * the packet to send.
	 */
	ssize_t (*send)(struct net_backend *be, const struct iovec *iov,
	    int iovcnt);

	/*
	 * Get the length of the next packet that can be received from
	 * the backend. If no packets are currently available, this
	 * function returns 0.
	 */
	ssize_t (*peek_recvlen)(struct net_backend *be);

	/*
	 * Called to receive a packet from the backend. When the function
	 * returns a positive value 'len', the scatter-gather vector
	 * provided by the caller contains a packet with such length.
	 * The function returns 0 if the backend doesn't have a new packet
	 * to receive.
	 */
	ssize_t (*recv)(struct net_backend *be, const struct iovec *iov,
	    int iovcnt);

	/*
	 * Ask the backend to enable or disable receive operation in the
	 * backend. On return from a disable operation, it is guaranteed
	 * that the receive callback won't be called until receive is
	 * enabled again. Note however that it is up to the caller to make
	 * sure that netbe_recv() is not currently being executed by another
	 * thread.
	 */
	void (*recv_enable)(struct net_backend *be);
	void (*recv_disable)(struct net_backend *be);

	/*
	 * Ask the backend for the virtio-net features it is able to
	 * support. Possible features are TSO, UFO and checksum offloading
	 * in both rx and tx direction and for both IPv4 and IPv6.
	 */
	uint64_t (*get_cap)(struct net_backend *be);

	/*
	 * Tell the backend to enable/disable the specified virtio-net
	 * features (capabilities).
	 */
	int (*set_cap)(struct net_backend *be, uint64_t features,
	    unsigned int vnet_hdr_len);

	struct pci_vtnet_softc *sc;
	int fd;

	/*
	 * Length of the virtio-net header used by the backend and the
	 * frontend, respectively. A zero value means that the header
	 * is not used.
	 */
	unsigned int be_vnet_hdr_len;
	unsigned int fe_vnet_hdr_len;

	/* Size of backend-specific private data. */
	size_t priv_size;

	/* Room for backend-specific data. */
	char opaque[0];
};

SET_DECLARE(net_backend_set, struct net_backend);

#define VNET_HDR_LEN	sizeof(struct virtio_net_rxhdr)

#define WPRINTF(params) PRINTLN params

/*
 * The tap backend
 */

#if defined(INET6) || defined(INET)
static const int pf_list[] = {
#if defined(INET6)
	PF_INET6,
#endif
#if defined(INET)
	PF_INET,
#endif
};
#endif

struct tap_priv {
	struct mevent *mevp;
	/*
	 * A bounce buffer that allows us to implement the peek_recvlen
	 * callback. In the future we may get the same information from
	 * the kevent data.
	 */
	char bbuf[1 << 16];
	ssize_t bbuflen;
};

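/*
 * Note on the bounce buffer (descriptive, not normative): a tap file
 * descriptor offers no way to learn the length of the next packet
 * without consuming it, so tap_peek_recvlen() below read(2)s the
 * packet into bbuf, and a subsequent tap_recv() then drains the
 * bounce buffer instead of the descriptor.
 */
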
static void
tap_cleanup(struct net_backend *be)
{
	struct tap_priv *priv = (struct tap_priv *)be->opaque;

	if (priv->mevp) {
		mevent_delete(priv->mevp);
	}
	if (be->fd != -1) {
		close(be->fd);
		be->fd = -1;
	}
}

static int
tap_init(struct net_backend *be, const char *devname,
    nvlist_t *nvl, net_be_rxeof_t cb, void *param)
{
	struct tap_priv *priv = (struct tap_priv *)be->opaque;
	char tbuf[80];
	int opt = 1;
#if defined(INET6) || defined(INET)
	struct ifreq ifrq;
	int i, s;
#endif
#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
#endif

	if (cb == NULL) {
		WPRINTF(("TAP backend requires non-NULL callback"));
		return (-1);
	}

	strcpy(tbuf, "/dev/");
	strlcat(tbuf, devname, sizeof(tbuf));

	be->fd = open(tbuf, O_RDWR);
	if (be->fd == -1) {
		WPRINTF(("open of tap device %s failed", tbuf));
		goto error;
	}

	/*
	 * Set non-blocking and register for read
	 * notifications with the event loop.
	 */
	if (ioctl(be->fd, FIONBIO, &opt) < 0) {
		WPRINTF(("tap device O_NONBLOCK failed"));
		goto error;
	}

#if defined(INET6) || defined(INET)
	/*
	 * Try to UP the interface rather than relying on
	 * net.link.tap.up_on_open.
	 */
	bzero(&ifrq, sizeof(ifrq));
	if (ioctl(be->fd, TAPGIFNAME, &ifrq) < 0) {
		WPRINTF(("Could not get interface name"));
		goto error;
	}

	s = -1;
	for (i = 0; s == -1 && i < nitems(pf_list); i++)
		s = socket(pf_list[i], SOCK_DGRAM, 0);
	if (s == -1) {
		WPRINTF(("Could not open socket"));
		goto error;
	}

	if (ioctl(s, SIOCGIFFLAGS, &ifrq) < 0) {
		(void)close(s);
		WPRINTF(("Could not get interface flags"));
		goto error;
	}
	ifrq.ifr_flags |= IFF_UP;
	if (ioctl(s, SIOCSIFFLAGS, &ifrq) < 0) {
		(void)close(s);
		WPRINTF(("Could not set interface flags"));
		goto error;
	}
	(void)close(s);
#endif

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
	if (caph_rights_limit(be->fd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	memset(priv->bbuf, 0, sizeof(priv->bbuf));
	priv->bbuflen = 0;

	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
	if (priv->mevp == NULL) {
		WPRINTF(("Could not register event"));
		goto error;
	}

	return (0);

error:
	tap_cleanup(be);
	return (-1);
}

/*
 * Called to send a buffer chain out to the tap device
 */
static ssize_t
tap_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
{
	return (writev(be->fd, iov, iovcnt));
}

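/*
 * Illustrative use of the scatter-gather interface (variable names
 * are hypothetical): since tap_send() forwards the vector straight
 * to writev(2), a frontend can transmit a frame that is split across
 * buffers without coalescing it first; each call writes one frame:
 *
 *	struct iovec iov[2] = {
 *		{ .iov_base = frame_hdr,     .iov_len = hdrlen },
 *		{ .iov_base = frame_payload, .iov_len = paylen },
 *	};
 *	(void)netbe_send(be, iov, 2);
 */
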
static ssize_t
tap_peek_recvlen(struct net_backend *be)
{
	struct tap_priv *priv = (struct tap_priv *)be->opaque;
	ssize_t ret;

	if (priv->bbuflen > 0) {
		/*
		 * We already have a packet in the bounce buffer.
		 * Just return its length.
		 */
		return (priv->bbuflen);
	}

	/*
	 * Read the next packet (if any) into the bounce buffer, so
	 * that we get to know its length and we can return that
	 * to the caller.
	 */
	ret = read(be->fd, priv->bbuf, sizeof(priv->bbuf));
	if (ret < 0 && errno == EWOULDBLOCK) {
		return (0);
	}

	if (ret > 0)
		priv->bbuflen = ret;

	return (ret);
}

static ssize_t
tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
{
	struct tap_priv *priv = (struct tap_priv *)be->opaque;
	ssize_t ret;

	if (priv->bbuflen > 0) {
		/*
		 * A packet is available in the bounce buffer, so
		 * we read it from there.
		 */
		ret = buf_to_iov(priv->bbuf, priv->bbuflen,
		    iov, iovcnt, 0);

		/* Mark the bounce buffer as empty. */
		priv->bbuflen = 0;

		return (ret);
	}

	ret = readv(be->fd, iov, iovcnt);
	if (ret < 0 && errno == EWOULDBLOCK) {
		return (0);
	}

	return (ret);
}

static void
tap_recv_enable(struct net_backend *be)
{
	struct tap_priv *priv = (struct tap_priv *)be->opaque;

	mevent_enable(priv->mevp);
}

static void
tap_recv_disable(struct net_backend *be)
{
	struct tap_priv *priv = (struct tap_priv *)be->opaque;

	mevent_disable(priv->mevp);
}

static uint64_t
tap_get_cap(struct net_backend *be)
{

	return (0); /* no capabilities for now */
}

static int
tap_set_cap(struct net_backend *be, uint64_t features,
    unsigned vnet_hdr_len)
{

	return ((features || vnet_hdr_len) ? -1 : 0);
}

static struct net_backend tap_backend = {
	.prefix = "tap",
	.priv_size = sizeof(struct tap_priv),
	.init = tap_init,
	.cleanup = tap_cleanup,
	.send = tap_send,
	.peek_recvlen = tap_peek_recvlen,
	.recv = tap_recv,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,
};

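/*
 * For illustration only: a hypothetical new backend would be plugged
 * in by filling a template like the one above and adding it to the
 * net_backend_set linker set, after which netbe_init() can select it
 * by matching the device name against its prefix:
 *
 *	static struct net_backend foo_backend = {
 *		.prefix = "foo",
 *		.priv_size = sizeof(struct foo_priv),
 *		.init = foo_init,
 *		.cleanup = foo_cleanup,
 *		...
 *	};
 *	DATA_SET(net_backend_set, foo_backend);
 */
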
/* A clone of the tap backend, with a different prefix. */
static struct net_backend vmnet_backend = {
	.prefix = "vmnet",
	.priv_size = sizeof(struct tap_priv),
	.init = tap_init,
	.cleanup = tap_cleanup,
	.send = tap_send,
	.peek_recvlen = tap_peek_recvlen,
	.recv = tap_recv,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,
};

DATA_SET(net_backend_set, tap_backend);
DATA_SET(net_backend_set, vmnet_backend);

#ifdef NETGRAPH

/*
 * Netgraph backend
 */

#define NG_SBUF_MAX_SIZE (4 * 1024 * 1024)

static int
ng_init(struct net_backend *be, const char *devname,
    nvlist_t *nvl, net_be_rxeof_t cb, void *param)
{
	struct tap_priv *p = (struct tap_priv *)be->opaque;
	struct ngm_connect ngc;
	const char *value, *nodename;
	int sbsz;
	int ctrl_sock;
	int flags;
	unsigned long maxsbsz;
	size_t msbsz;
#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
#endif

	if (cb == NULL) {
		WPRINTF(("Netgraph backend requires non-NULL callback"));
		return (-1);
	}

	be->fd = -1;

	memset(&ngc, 0, sizeof(ngc));

	value = get_config_value_node(nvl, "path");
	if (value == NULL) {
		WPRINTF(("path must be provided"));
		return (-1);
	}
	strncpy(ngc.path, value, NG_PATHSIZ - 1);

	value = get_config_value_node(nvl, "hook");
	if (value == NULL)
		value = "vmlink";
	strncpy(ngc.ourhook, value, NG_HOOKSIZ - 1);

	value = get_config_value_node(nvl, "peerhook");
	if (value == NULL) {
		WPRINTF(("peer hook must be provided"));
		return (-1);
	}
	strncpy(ngc.peerhook, value, NG_HOOKSIZ - 1);

	nodename = get_config_value_node(nvl, "socket");
	if (NgMkSockNode(nodename,
	    &ctrl_sock, &be->fd) < 0) {
		WPRINTF(("can't get Netgraph sockets"));
		return (-1);
	}

	if (NgSendMsg(ctrl_sock, ".",
	    NGM_GENERIC_COOKIE,
	    NGM_CONNECT, &ngc, sizeof(ngc)) < 0) {
		WPRINTF(("can't connect to node"));
		close(ctrl_sock);
		goto error;
	}

	close(ctrl_sock);

	flags = fcntl(be->fd, F_GETFL);

	if (flags < 0) {
		WPRINTF(("can't get socket flags"));
		goto error;
	}

	if (fcntl(be->fd, F_SETFL, flags | O_NONBLOCK) < 0) {
		WPRINTF(("can't set O_NONBLOCK flag"));
		goto error;
	}

	/*
	 * The default ng_socket(4) buffer size is too low.
	 * Calculate the minimum value between NG_SBUF_MAX_SIZE
	 * and kern.ipc.maxsockbuf.
	 */
	msbsz = sizeof(maxsbsz);
	if (sysctlbyname("kern.ipc.maxsockbuf", &maxsbsz, &msbsz,
	    NULL, 0) < 0) {
		WPRINTF(("can't get 'kern.ipc.maxsockbuf' value"));
		goto error;
	}

	/*
	 * We can't set the socket buffer size to the kern.ipc.maxsockbuf
	 * value, as it takes into account the mbuf(9) overhead.
	 */
	maxsbsz = maxsbsz * MCLBYTES / (MSIZE + MCLBYTES);

	sbsz = MIN(NG_SBUF_MAX_SIZE, maxsbsz);

	if (setsockopt(be->fd, SOL_SOCKET, SO_SNDBUF, &sbsz,
	    sizeof(sbsz)) < 0) {
		WPRINTF(("can't set TX buffer size"));
		goto error;
	}

	if (setsockopt(be->fd, SOL_SOCKET, SO_RCVBUF, &sbsz,
	    sizeof(sbsz)) < 0) {
		WPRINTF(("can't set RX buffer size"));
		goto error;
	}

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
	if (caph_rights_limit(be->fd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	memset(p->bbuf, 0, sizeof(p->bbuf));
	p->bbuflen = 0;

	p->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
	if (p->mevp == NULL) {
		WPRINTF(("Could not register event"));
		goto error;
	}

	return (0);

error:
	tap_cleanup(be);
	return (-1);
}

static struct net_backend ng_backend = {
	.prefix = "netgraph",
	.priv_size = sizeof(struct tap_priv),
	.init = ng_init,
	.cleanup = tap_cleanup,
	.send = tap_send,
	.peek_recvlen = tap_peek_recvlen,
	.recv = tap_recv,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,
};

DATA_SET(net_backend_set, ng_backend);

#endif /* NETGRAPH */

/*
 * The netmap backend
 */

/* The virtio-net features supported by netmap. */
#define NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \
		VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \
		VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 | \
		VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO)

struct netmap_priv {
	char ifname[IFNAMSIZ];
	struct nm_desc *nmd;
	uint16_t memid;
	struct netmap_ring *rx;
	struct netmap_ring *tx;
	struct mevent *mevp;
	net_be_rxeof_t cb;
	void *cb_param;
};

static void
nmreq_init(struct nmreq *req, char *ifname)
{

	memset(req, 0, sizeof(*req));
	strlcpy(req->nr_name, ifname, sizeof(req->nr_name));
	req->nr_version = NETMAP_API;
}

static int
netmap_set_vnet_hdr_len(struct net_backend *be, int vnet_hdr_len)
{
	int err;
	struct nmreq req;
	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;

	nmreq_init(&req, priv->ifname);
	req.nr_cmd = NETMAP_BDG_VNET_HDR;
	req.nr_arg1 = vnet_hdr_len;
	err = ioctl(be->fd, NIOCREGIF, &req);
	if (err) {
		WPRINTF(("Unable to set vnet header length %d",
		    vnet_hdr_len));
		return (err);
	}

	be->be_vnet_hdr_len = vnet_hdr_len;

	return (0);
}

static int
netmap_has_vnet_hdr_len(struct net_backend *be, unsigned vnet_hdr_len)
{
	int prev_hdr_len = be->be_vnet_hdr_len;
	int ret;

	if (vnet_hdr_len == prev_hdr_len) {
		return (1);
	}

	ret = netmap_set_vnet_hdr_len(be, vnet_hdr_len);
	if (ret) {
		return (0);
	}

	netmap_set_vnet_hdr_len(be, prev_hdr_len);

	return (1);
}

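/*
 * netmap_get_cap() below uses netmap_has_vnet_hdr_len() as a
 * non-destructive probe: the helper tries to set the requested header
 * length and, on success, restores the previous value, leaving the
 * port state unchanged.  If the port accepts VNET_HDR_LEN, the full
 * NETMAP_FEATURES mask is advertised to the frontend; otherwise no
 * offloads are offered.
 */
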
static uint64_t
netmap_get_cap(struct net_backend *be)
{

	return (netmap_has_vnet_hdr_len(be, VNET_HDR_LEN) ?
	    NETMAP_FEATURES : 0);
}

static int
netmap_set_cap(struct net_backend *be, uint64_t features,
    unsigned vnet_hdr_len)
{

	return (netmap_set_vnet_hdr_len(be, vnet_hdr_len));
}

static int
netmap_init(struct net_backend *be, const char *devname,
    nvlist_t *nvl, net_be_rxeof_t cb, void *param)
{
	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;

	strlcpy(priv->ifname, devname, sizeof(priv->ifname));
	priv->ifname[sizeof(priv->ifname) - 1] = '\0';

	priv->nmd = nm_open(priv->ifname, NULL, NETMAP_NO_TX_POLL, NULL);
	if (priv->nmd == NULL) {
		WPRINTF(("Unable to nm_open(): interface '%s', errno (%s)",
		    devname, strerror(errno)));
		return (-1);
	}

	priv->memid = priv->nmd->req.nr_arg2;
	priv->tx = NETMAP_TXRING(priv->nmd->nifp, 0);
	priv->rx = NETMAP_RXRING(priv->nmd->nifp, 0);
	priv->cb = cb;
	priv->cb_param = param;
	be->fd = priv->nmd->fd;

	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
	if (priv->mevp == NULL) {
		WPRINTF(("Could not register event"));
		return (-1);
	}

	return (0);
}

static void
netmap_cleanup(struct net_backend *be)
{
	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;

	if (priv->mevp) {
		mevent_delete(priv->mevp);
	}
	if (priv->nmd) {
		nm_close(priv->nmd);
	}
	be->fd = -1;
}

static ssize_t
netmap_send(struct net_backend *be, const struct iovec *iov,
    int iovcnt)
{
	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
	struct netmap_ring *ring;
	ssize_t totlen = 0;
	int nm_buf_size;
	int nm_buf_len;
	uint32_t head;
	void *nm_buf;
	int j;

	ring = priv->tx;
	head = ring->head;
	if (head == ring->tail) {
		WPRINTF(("No space, drop %zu bytes", count_iov(iov, iovcnt)));
		goto txsync;
	}
	nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
	nm_buf_size = ring->nr_buf_size;
	nm_buf_len = 0;

	for (j = 0; j < iovcnt; j++) {
		int iov_frag_size = iov[j].iov_len;
		void *iov_frag_buf = iov[j].iov_base;

		totlen += iov_frag_size;

		/*
		 * Split each iovec fragment over more netmap slots, if
		 * necessary.
		 */
		for (;;) {
			int copylen;

			copylen = iov_frag_size < nm_buf_size ?
			    iov_frag_size : nm_buf_size;
			memcpy(nm_buf, iov_frag_buf, copylen);

			iov_frag_buf += copylen;
			iov_frag_size -= copylen;
			nm_buf += copylen;
			nm_buf_size -= copylen;
			nm_buf_len += copylen;

			if (iov_frag_size == 0) {
				break;
			}

			ring->slot[head].len = nm_buf_len;
			ring->slot[head].flags = NS_MOREFRAG;
			head = nm_ring_next(ring, head);
			if (head == ring->tail) {
				/*
				 * We ran out of netmap slots while
				 * splitting the iovec fragments.
				 */
				WPRINTF(("No space, drop %zu bytes",
				    count_iov(iov, iovcnt)));
				goto txsync;
			}
			nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
			nm_buf_size = ring->nr_buf_size;
			nm_buf_len = 0;
		}
	}

	/* Complete the last slot, which must not have NS_MOREFRAG set. */
	ring->slot[head].len = nm_buf_len;
	ring->slot[head].flags = 0;
	head = nm_ring_next(ring, head);

	/* Now update ring->head and ring->cur. */
	ring->head = ring->cur = head;
txsync:
	ioctl(be->fd, NIOCTXSYNC, NULL);

	return (totlen);
}

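/*
 * Worked example (hypothetical sizes): with 2048-byte netmap buffers,
 * a 3000-byte packet handed to netmap_send() occupies two slots: the
 * first with len = 2048 and NS_MOREFRAG set, the second with len = 952
 * and flags = 0, marking the end of the frame.  The receive routines
 * below reassemble such chains by walking the slots until one without
 * NS_MOREFRAG is found.
 */
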
static ssize_t
netmap_peek_recvlen(struct net_backend *be)
{
	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
	struct netmap_ring *ring = priv->rx;
	uint32_t head = ring->head;
	ssize_t totlen = 0;

	while (head != ring->tail) {
		struct netmap_slot *slot = ring->slot + head;

		totlen += slot->len;
		if ((slot->flags & NS_MOREFRAG) == 0)
			break;
		head = nm_ring_next(ring, head);
	}

	return (totlen);
}

static ssize_t
netmap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
{
	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
	struct netmap_slot *slot = NULL;
	struct netmap_ring *ring;
	void *iov_frag_buf;
	int iov_frag_size;
	ssize_t totlen = 0;
	uint32_t head;

	assert(iovcnt);

	ring = priv->rx;
	head = ring->head;
	iov_frag_buf = iov->iov_base;
	iov_frag_size = iov->iov_len;

	do {
		int nm_buf_len;
		void *nm_buf;

		if (head == ring->tail) {
			return (0);
		}

		slot = ring->slot + head;
		nm_buf = NETMAP_BUF(ring, slot->buf_idx);
		nm_buf_len = slot->len;

		for (;;) {
			int copylen = nm_buf_len < iov_frag_size ?
			    nm_buf_len : iov_frag_size;

			memcpy(iov_frag_buf, nm_buf, copylen);
			nm_buf += copylen;
			nm_buf_len -= copylen;
			iov_frag_buf += copylen;
			iov_frag_size -= copylen;
			totlen += copylen;

			if (nm_buf_len == 0) {
				break;
			}

			iov++;
			iovcnt--;
			if (iovcnt == 0) {
				/* No space to receive. */
				WPRINTF(("Short iov, drop %zd bytes",
				    totlen));
				return (-ENOSPC);
			}
			iov_frag_buf = iov->iov_base;
			iov_frag_size = iov->iov_len;
		}

		head = nm_ring_next(ring, head);

	} while (slot->flags & NS_MOREFRAG);

	/* Release slots to netmap. */
	ring->head = ring->cur = head;

	return (totlen);
}

static void
netmap_recv_enable(struct net_backend *be)
{
	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;

	mevent_enable(priv->mevp);
}

static void
netmap_recv_disable(struct net_backend *be)
{
	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;

	mevent_disable(priv->mevp);
}

static struct net_backend netmap_backend = {
	.prefix = "netmap",
	.priv_size = sizeof(struct netmap_priv),
	.init = netmap_init,
	.cleanup = netmap_cleanup,
	.send = netmap_send,
	.peek_recvlen = netmap_peek_recvlen,
	.recv = netmap_recv,
	.recv_enable = netmap_recv_enable,
	.recv_disable = netmap_recv_disable,
	.get_cap = netmap_get_cap,
	.set_cap = netmap_set_cap,
};

/* A clone of the netmap backend, with a different prefix. */
static struct net_backend vale_backend = {
	.prefix = "vale",
	.priv_size = sizeof(struct netmap_priv),
	.init = netmap_init,
	.cleanup = netmap_cleanup,
	.send = netmap_send,
	.peek_recvlen = netmap_peek_recvlen,
	.recv = netmap_recv,
	.recv_enable = netmap_recv_enable,
	.recv_disable = netmap_recv_disable,
	.get_cap = netmap_get_cap,
	.set_cap = netmap_set_cap,
};

DATA_SET(net_backend_set, netmap_backend);
DATA_SET(net_backend_set, vale_backend);

int
netbe_legacy_config(nvlist_t *nvl, const char *opts)
{
	char *backend, *cp;

	if (opts == NULL)
		return (0);

	cp = strchr(opts, ',');
	if (cp == NULL) {
		set_config_value_node(nvl, "backend", opts);
		return (0);
	}
	backend = strndup(opts, cp - opts);
	set_config_value_node(nvl, "backend", backend);
	free(backend);
	return (pci_parse_legacy_config(nvl, cp + 1));
}

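/*
 * Example (hypothetical command line): with
 *
 *	-s 2:0,virtio-net,tap0,mac=00:a0:98:11:22:33
 *
 * the frontend passes opts = "tap0,mac=00:a0:98:11:22:33" to
 * netbe_legacy_config(), which stores "tap0" under the "backend" key
 * and hands the remainder to pci_parse_legacy_config().
 */
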
/*
 * Initialize a backend and attach to the frontend.
 * This is called during frontend initialization.
 *  @ret is a pointer to the backend to be initialized
 *  @devname is the backend-name as supplied on the command line,
 *	e.g. -s 2:0,frontend-name,backend-name[,other-args]
 *  @cb is the receive callback supplied by the frontend,
 *	and it is invoked in the event loop when a receive
 *	event is generated in the hypervisor
 *  @param is a pointer to the frontend, and normally used as
 *	the argument for the callback.
 */
int
netbe_init(struct net_backend **ret, nvlist_t *nvl, net_be_rxeof_t cb,
    void *param)
{
	struct net_backend **pbe, *nbe, *tbe = NULL;
	const char *value, *type;
	char *devname;
	int err;

	value = get_config_value_node(nvl, "backend");
	if (value == NULL) {
		return (-1);
	}
	devname = strdup(value);

	/*
	 * Use the type given by the configuration if it exists; otherwise
	 * use the prefix of the backend name as the type.
	 */
	type = get_config_value_node(nvl, "type");
	if (type == NULL)
		type = devname;

	/*
	 * Find the network backend that matches the user-provided
	 * device name. net_backend_set is built using a linker set.
	 */
	SET_FOREACH(pbe, net_backend_set) {
		if (strncmp(type, (*pbe)->prefix,
		    strlen((*pbe)->prefix)) == 0) {
			tbe = *pbe;
			assert(tbe->init != NULL);
			assert(tbe->cleanup != NULL);
			assert(tbe->send != NULL);
			assert(tbe->recv != NULL);
			assert(tbe->get_cap != NULL);
			assert(tbe->set_cap != NULL);
			break;
		}
	}

	*ret = NULL;
	if (tbe == NULL) {
		free(devname);
		return (EINVAL);
	}

	nbe = calloc(1, sizeof(*nbe) + tbe->priv_size);
	*nbe = *tbe;	/* copy the template */
	nbe->fd = -1;
	nbe->sc = param;
	nbe->be_vnet_hdr_len = 0;
	nbe->fe_vnet_hdr_len = 0;

	/* Initialize the backend. */
	err = nbe->init(nbe, devname, nvl, cb, param);
	if (err) {
		free(devname);
		free(nbe);
		return (err);
	}

	*ret = nbe;
	free(devname);

	return (0);
}

void
netbe_cleanup(struct net_backend *be)
{

	if (be != NULL) {
		be->cleanup(be);
		free(be);
	}
}

uint64_t
netbe_get_cap(struct net_backend *be)
{

	assert(be != NULL);
	return (be->get_cap(be));
}

int
netbe_set_cap(struct net_backend *be, uint64_t features,
    unsigned vnet_hdr_len)
{
	int ret;

	assert(be != NULL);

	/* There are only three valid lengths, i.e., 0, 10 and 12. */
	if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN
	    && vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t)))
		return (-1);

	be->fe_vnet_hdr_len = vnet_hdr_len;

	ret = be->set_cap(be, features, vnet_hdr_len);
	assert(be->be_vnet_hdr_len == 0 ||
	    be->be_vnet_hdr_len == be->fe_vnet_hdr_len);

	return (ret);
}

ssize_t
netbe_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
{

	return (be->send(be, iov, iovcnt));
}

ssize_t
netbe_peek_recvlen(struct net_backend *be)
{

	return (be->peek_recvlen(be));
}

/*
 * Try to read a packet from the backend, without blocking.
 * If no packets are available, return 0. In case of success, return
 * the length of the packet just read. Return -1 in case of errors.
 */
ssize_t
netbe_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
{

	return (be->recv(be, iov, iovcnt));
}

/*
 * Read a packet from the backend and discard it.
 * Returns the size of the discarded packet or zero if no packet was
 * available. A negative error code is returned in case of read error.
 */
ssize_t
netbe_rx_discard(struct net_backend *be)
{
	/*
	 * MP note: the dummybuf is only used to discard frames,
	 * so there is no need for it to be per-vtnet or locked.
	 * We only make it large enough for a TSO-sized segment.
	 */
	static uint8_t dummybuf[65536 + 64];
	struct iovec iov;

	iov.iov_base = dummybuf;
	iov.iov_len = sizeof(dummybuf);

	return (netbe_recv(be, &iov, 1));
}

void
netbe_rx_disable(struct net_backend *be)
{

	be->recv_disable(be);
}

void
netbe_rx_enable(struct net_backend *be)
{

	be->recv_enable(be);
}

size_t
netbe_get_vnet_hdr_len(struct net_backend *be)
{

	return (be->be_vnet_hdr_len);
}
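
/*
 * Usage sketch for the exported API (illustrative only; identifiers
 * such as rx_callback and sc are hypothetical).  A frontend typically
 * drives a backend like this:
 *
 *	struct net_backend *be;
 *
 *	if (netbe_init(&be, nvl, rx_callback, sc) != 0)
 *		return (error);
 *	if (netbe_set_cap(be, negotiated_features, vnet_hdr_len) != 0)
 *		return (error);
 *	netbe_rx_enable(be);
 *
 * and, from rx_callback, peeks at the next packet and either receives
 * it into guest buffers or discards it:
 *
 *	while ((len = netbe_peek_recvlen(be)) > 0) {
 *		if (no_guest_buffers_available)
 *			(void)netbe_rx_discard(be);
 *		else
 *			(void)netbe_recv(be, iov, iovcnt);
 *	}
 *
 * netbe_cleanup(be) releases the backend on shutdown.
 */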