/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2019 Vincenzo Maffione <vmaffione@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
 * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
 * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * This file implements multiple network backends (tap, netmap, ...),
 * to be used by network frontends such as virtio-net and e1000.
 * The API to access the backend (e.g. send/receive packets, negotiate
 * features) is exported by net_backends.h.
 */

#include <sys/cdefs.h>
#include <sys/types.h>		/* u_short etc */
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/uio.h>

#include <net/if.h>
#include <net/if_tap.h>
#include <net/netmap.h>
#include <net/netmap_virt.h>
#define NETMAP_WITH_LIBS
#include <net/netmap_user.h>

#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sysexits.h>
#include <assert.h>
#include <pthread.h>
#include <pthread_np.h>
#include <poll.h>

#ifdef NETGRAPH
#include <sys/param.h>
#include <sys/sysctl.h>
#include <netgraph.h>
#endif

#include "config.h"
#include "debug.h"
#include "iov.h"
#include "mevent.h"
#include "net_backends.h"
#include "pci_emul.h"

#include <sys/linker_set.h>

/*
 * Each network backend registers a set of function pointers that are
 * used to implement the net backends API.
 * This might need to be exposed if we implement backends in separate files.
 */
struct net_backend {
	const char *prefix;	/* prefix matching this backend */

	/*
	 * Routines used to initialize and cleanup the resources needed
	 * by a backend. The cleanup function is used internally,
	 * and should not be called by the frontend.
	 */
	int (*init)(struct net_backend *be, const char *devname,
	    nvlist_t *nvl, net_be_rxeof_t cb, void *param);
	void (*cleanup)(struct net_backend *be);

	/*
	 * Called to serve a guest transmit request. The scatter-gather
	 * vector provided by the caller has 'iovcnt' elements and contains
	 * the packet to send.
	 */
	ssize_t (*send)(struct net_backend *be, const struct iovec *iov,
	    int iovcnt);

	/*
	 * Get the length of the next packet that can be received from
	 * the backend. If no packets are currently available, this
	 * function returns 0.
	 */
	ssize_t (*peek_recvlen)(struct net_backend *be);

	/*
	 * Called to receive a packet from the backend. When the function
	 * returns a positive value 'len', the scatter-gather vector
	 * provided by the caller contains a packet of that length.
	 * The function returns 0 if the backend doesn't have a new packet
	 * to receive.
	 */
	ssize_t (*recv)(struct net_backend *be, const struct iovec *iov,
	    int iovcnt);

	/*
	 * Ask the backend to enable or disable receive operation. On
	 * return from a disable operation, it is guaranteed that the
	 * receive callback won't be called until receive is enabled
	 * again. Note however that it is up to the caller to make sure
	 * that netbe_recv() is not currently being executed by another
	 * thread.
	 */
	void (*recv_enable)(struct net_backend *be);
	void (*recv_disable)(struct net_backend *be);

	/*
	 * Ask the backend for the virtio-net features it is able to
	 * support. Possible features are TSO, UFO and checksum offloading
	 * in both RX and TX directions, for both IPv4 and IPv6.
	 */
	uint64_t (*get_cap)(struct net_backend *be);

	/*
	 * Tell the backend to enable/disable the specified virtio-net
	 * features (capabilities).
	 */
	int (*set_cap)(struct net_backend *be, uint64_t features,
	    unsigned int vnet_hdr_len);

	struct pci_vtnet_softc *sc;
	int fd;

	/*
	 * Length of the virtio-net header used by the backend and the
	 * frontend, respectively. A zero value means that the header
	 * is not used.
	 */
	unsigned int be_vnet_hdr_len;
	unsigned int fe_vnet_hdr_len;

	/* Size of backend-specific private data. */
	size_t priv_size;

	/* Backend-specific private data follows. */
};

#define	NET_BE_PRIV(be)		((void *)((be) + 1))
#define	NET_BE_SIZE(be)		(sizeof(*be) + (be)->priv_size)

SET_DECLARE(net_backend_set, struct net_backend);

#define	VNET_HDR_LEN	sizeof(struct virtio_net_rxhdr)

#define	WPRINTF(params) PRINTLN params
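
/*
 * Illustrative sketch (comment only, not compiled): a new backend embeds
 * its private state after struct net_backend via priv_size, implements
 * the callbacks above and registers a template in the linker set. The
 * "null" names below are hypothetical.
 *
 *	struct null_priv {
 *		int dummy;
 *	};
 *
 *	static struct net_backend null_backend = {
 *		.prefix = "null",
 *		.priv_size = sizeof(struct null_priv),
 *		.init = null_init,
 *		.cleanup = null_cleanup,
 *		...
 *	};
 *	DATA_SET(net_backend_set, null_backend);
 *
 * netbe_init() matches the configured backend name against each prefix,
 * allocates NET_BE_SIZE() bytes and copies the matching template, so
 * NET_BE_PRIV() points at the private area right after the structure.
 */
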
/*
 * The tap backend
 */

struct tap_priv {
	struct mevent *mevp;
	/*
	 * A bounce buffer that allows us to implement the peek_recvlen
	 * callback. In the future we may get the same information from
	 * the kevent data.
	 */
	char bbuf[1 << 16];
	ssize_t bbuflen;
};

static void
tap_cleanup(struct net_backend *be)
{
	struct tap_priv *priv = NET_BE_PRIV(be);

	if (priv->mevp) {
		mevent_delete(priv->mevp);
	}
	if (be->fd != -1) {
		close(be->fd);
		be->fd = -1;
	}
}

static int
tap_init(struct net_backend *be, const char *devname,
    nvlist_t *nvl __unused, net_be_rxeof_t cb, void *param)
{
	struct tap_priv *priv = NET_BE_PRIV(be);
	char tbuf[80];
	int opt = 1, up = IFF_UP;

#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
#endif

	if (cb == NULL) {
		WPRINTF(("TAP backend requires non-NULL callback"));
		return (-1);
	}

	strcpy(tbuf, "/dev/");
	strlcat(tbuf, devname, sizeof(tbuf));

	be->fd = open(tbuf, O_RDWR);
	if (be->fd == -1) {
		WPRINTF(("open of tap device %s failed", tbuf));
		goto error;
	}

	/*
	 * Set non-blocking and register for read
	 * notifications with the event loop.
	 */
	if (ioctl(be->fd, FIONBIO, &opt) < 0) {
		WPRINTF(("tap device O_NONBLOCK failed"));
		goto error;
	}

	if (ioctl(be->fd, VMIO_SIOCSIFFLAGS, up)) {
		WPRINTF(("tap device link up failed"));
		goto error;
	}

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
	if (caph_rights_limit(be->fd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	memset(priv->bbuf, 0, sizeof(priv->bbuf));
	priv->bbuflen = 0;

	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
	if (priv->mevp == NULL) {
		WPRINTF(("Could not register event"));
		goto error;
	}

	return (0);

error:
	tap_cleanup(be);
	return (-1);
}

/*
 * Called to send a buffer chain out to the tap device
 */
static ssize_t
tap_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
{
	return (writev(be->fd, iov, iovcnt));
}

static ssize_t
tap_peek_recvlen(struct net_backend *be)
{
	struct tap_priv *priv = NET_BE_PRIV(be);
	ssize_t ret;

	if (priv->bbuflen > 0) {
		/*
		 * We already have a packet in the bounce buffer.
		 * Just return its length.
		 */
		return (priv->bbuflen);
	}

	/*
	 * Read the next packet (if any) into the bounce buffer, so
	 * that we get to know its length and we can return that
	 * to the caller.
	 */
	ret = read(be->fd, priv->bbuf, sizeof(priv->bbuf));
	if (ret < 0 && errno == EWOULDBLOCK) {
		return (0);
	}

	if (ret > 0)
		priv->bbuflen = ret;

	return (ret);
}

static ssize_t
tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
{
	struct tap_priv *priv = NET_BE_PRIV(be);
	ssize_t ret;

	if (priv->bbuflen > 0) {
		/*
		 * A packet is available in the bounce buffer, so
		 * we read it from there.
		 */
		ret = buf_to_iov(priv->bbuf, priv->bbuflen,
		    iov, iovcnt, 0);

		/* Mark the bounce buffer as empty. */
		priv->bbuflen = 0;

		return (ret);
	}

	ret = readv(be->fd, iov, iovcnt);
	if (ret < 0 && errno == EWOULDBLOCK) {
		return (0);
	}

	return (ret);
}

static void
tap_recv_enable(struct net_backend *be)
{
	struct tap_priv *priv = NET_BE_PRIV(be);

	mevent_enable(priv->mevp);
}

static void
tap_recv_disable(struct net_backend *be)
{
	struct tap_priv *priv = NET_BE_PRIV(be);

	mevent_disable(priv->mevp);
}

static uint64_t
tap_get_cap(struct net_backend *be __unused)
{

	return (0); /* no capabilities for now */
}

static int
tap_set_cap(struct net_backend *be __unused, uint64_t features,
    unsigned vnet_hdr_len)
{

	return ((features || vnet_hdr_len) ? -1 : 0);
}

static struct net_backend tap_backend = {
	.prefix = "tap",
	.priv_size = sizeof(struct tap_priv),
	.init = tap_init,
	.cleanup = tap_cleanup,
	.send = tap_send,
	.peek_recvlen = tap_peek_recvlen,
	.recv = tap_recv,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,
};

/* A clone of the tap backend, with a different prefix. */
static struct net_backend vmnet_backend = {
	.prefix = "vmnet",
	.priv_size = sizeof(struct tap_priv),
	.init = tap_init,
	.cleanup = tap_cleanup,
	.send = tap_send,
	.peek_recvlen = tap_peek_recvlen,
	.recv = tap_recv,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,
};

DATA_SET(net_backend_set, tap_backend);
DATA_SET(net_backend_set, vmnet_backend);
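
/*
 * Example (illustrative): the backend is chosen by prefix match on the
 * device name given on the bhyve command line, so
 *
 *	bhyve ... -s 2:0,virtio-net,tap0 ...
 *
 * selects the tap backend above and makes tap_init() open /dev/tap0,
 * switch it to non-blocking mode and bring the link up.
 */
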
#ifdef NETGRAPH

/*
 * Netgraph backend
 */

#define NG_SBUF_MAX_SIZE (4 * 1024 * 1024)

static int
ng_init(struct net_backend *be, const char *devname __unused,
    nvlist_t *nvl, net_be_rxeof_t cb, void *param)
{
	struct tap_priv *p = NET_BE_PRIV(be);
	struct ngm_connect ngc;
	const char *value, *nodename;
	int sbsz;
	int ctrl_sock;
	int flags;
	unsigned long maxsbsz;
	size_t msbsz;
#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
#endif

	if (cb == NULL) {
		WPRINTF(("Netgraph backend requires non-NULL callback"));
		return (-1);
	}

	be->fd = -1;

	memset(&ngc, 0, sizeof(ngc));

	value = get_config_value_node(nvl, "path");
	if (value == NULL) {
		WPRINTF(("path must be provided"));
		return (-1);
	}
	strncpy(ngc.path, value, NG_PATHSIZ - 1);

	value = get_config_value_node(nvl, "hook");
	if (value == NULL)
		value = "vmlink";
	strncpy(ngc.ourhook, value, NG_HOOKSIZ - 1);

	value = get_config_value_node(nvl, "peerhook");
	if (value == NULL) {
		WPRINTF(("peer hook must be provided"));
		return (-1);
	}
	strncpy(ngc.peerhook, value, NG_HOOKSIZ - 1);

	nodename = get_config_value_node(nvl, "socket");
	if (NgMkSockNode(nodename, &ctrl_sock, &be->fd) < 0) {
		WPRINTF(("can't get Netgraph sockets"));
		return (-1);
	}

	if (NgSendMsg(ctrl_sock, ".", NGM_GENERIC_COOKIE,
	    NGM_CONNECT, &ngc, sizeof(ngc)) < 0) {
		WPRINTF(("can't connect to node"));
		close(ctrl_sock);
		goto error;
	}

	close(ctrl_sock);

	flags = fcntl(be->fd, F_GETFL);
	if (flags < 0) {
		WPRINTF(("can't get socket flags"));
		goto error;
	}

	if (fcntl(be->fd, F_SETFL, flags | O_NONBLOCK) < 0) {
		WPRINTF(("can't set O_NONBLOCK flag"));
		goto error;
	}

	/*
	 * The default ng_socket(4) buffer size is too low.
	 * Calculate the minimum value between NG_SBUF_MAX_SIZE
	 * and kern.ipc.maxsockbuf.
	 */
	msbsz = sizeof(maxsbsz);
	if (sysctlbyname("kern.ipc.maxsockbuf", &maxsbsz, &msbsz,
	    NULL, 0) < 0) {
		WPRINTF(("can't get 'kern.ipc.maxsockbuf' value"));
		goto error;
	}

	/*
	 * We can't set the socket buffer size to the kern.ipc.maxsockbuf
	 * value, as it takes into account the mbuf(9) overhead.
	 */
	maxsbsz = maxsbsz * MCLBYTES / (MSIZE + MCLBYTES);

	sbsz = MIN(NG_SBUF_MAX_SIZE, maxsbsz);

	if (setsockopt(be->fd, SOL_SOCKET, SO_SNDBUF, &sbsz,
	    sizeof(sbsz)) < 0) {
		WPRINTF(("can't set TX buffer size"));
		goto error;
	}

	if (setsockopt(be->fd, SOL_SOCKET, SO_RCVBUF, &sbsz,
	    sizeof(sbsz)) < 0) {
		WPRINTF(("can't set RX buffer size"));
		goto error;
	}

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
	if (caph_rights_limit(be->fd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	memset(p->bbuf, 0, sizeof(p->bbuf));
	p->bbuflen = 0;

	p->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
	if (p->mevp == NULL) {
		WPRINTF(("Could not register event"));
		goto error;
	}

	return (0);

error:
	tap_cleanup(be);
	return (-1);
}

static struct net_backend ng_backend = {
	.prefix = "netgraph",
	.priv_size = sizeof(struct tap_priv),
	.init = ng_init,
	.cleanup = tap_cleanup,
	.send = tap_send,
	.peek_recvlen = tap_peek_recvlen,
	.recv = tap_recv,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,
};

DATA_SET(net_backend_set, ng_backend);

#endif /* NETGRAPH */
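
/*
 * Example (illustrative): ng_init() reads the "path", "hook" (default
 * "vmlink"), "peerhook" and optional "socket" configuration values, so
 * a guest could be attached to an existing ng_bridge(4) node with
 * something like
 *
 *	bhyve ... -s 2:0,virtio-net,netgraph,path=bridge0:,peerhook=link2 ...
 *
 * The node path and peer hook names here are hypothetical.
 */
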
/*
 * The netmap backend
 */

/* The virtio-net features supported by netmap. */
#define NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \
		VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \
		VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 | \
		VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO)

struct netmap_priv {
	char ifname[IFNAMSIZ];
	struct nm_desc *nmd;
	uint16_t memid;
	struct netmap_ring *rx;
	struct netmap_ring *tx;
	struct mevent *mevp;
	net_be_rxeof_t cb;
	void *cb_param;
};

static void
nmreq_init(struct nmreq *req, char *ifname)
{

	memset(req, 0, sizeof(*req));
	strlcpy(req->nr_name, ifname, sizeof(req->nr_name));
	req->nr_version = NETMAP_API;
}

static int
netmap_set_vnet_hdr_len(struct net_backend *be, int vnet_hdr_len)
{
	int err;
	struct nmreq req;
	struct netmap_priv *priv = NET_BE_PRIV(be);

	nmreq_init(&req, priv->ifname);
	req.nr_cmd = NETMAP_BDG_VNET_HDR;
	req.nr_arg1 = vnet_hdr_len;
	err = ioctl(be->fd, NIOCREGIF, &req);
	if (err) {
		WPRINTF(("Unable to set vnet header length %d",
		    vnet_hdr_len));
		return (err);
	}

	be->be_vnet_hdr_len = vnet_hdr_len;

	return (0);
}

static int
netmap_has_vnet_hdr_len(struct net_backend *be, unsigned vnet_hdr_len)
{
	unsigned prev_hdr_len = be->be_vnet_hdr_len;
	int ret;

	if (vnet_hdr_len == prev_hdr_len) {
		return (1);
	}

	ret = netmap_set_vnet_hdr_len(be, vnet_hdr_len);
	if (ret) {
		return (0);
	}

	/* The length is supported; restore the previous value. */
	netmap_set_vnet_hdr_len(be, prev_hdr_len);

	return (1);
}

static uint64_t
netmap_get_cap(struct net_backend *be)
{

	return (netmap_has_vnet_hdr_len(be, VNET_HDR_LEN) ?
	    NETMAP_FEATURES : 0);
}

static int
netmap_set_cap(struct net_backend *be, uint64_t features __unused,
    unsigned vnet_hdr_len)
{

	return (netmap_set_vnet_hdr_len(be, vnet_hdr_len));
}
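
/*
 * For reference, a sketch of the negotiation flow implemented above:
 * netmap_get_cap() checks whether the port accepts the full 12-byte
 * virtio-net header by trying to set it and then restoring the previous
 * value, while netmap_set_cap() commits the negotiated length.
 *
 *	if (netmap_has_vnet_hdr_len(be, VNET_HDR_LEN))
 *		features = NETMAP_FEATURES;	(probe, side-effect free)
 *	...
 *	netmap_set_cap(be, features, VNET_HDR_LEN);	(commit)
 */
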
static int
netmap_init(struct net_backend *be, const char *devname,
    nvlist_t *nvl __unused, net_be_rxeof_t cb, void *param)
{
	struct netmap_priv *priv = NET_BE_PRIV(be);

	strlcpy(priv->ifname, devname, sizeof(priv->ifname));
	priv->ifname[sizeof(priv->ifname) - 1] = '\0';

	priv->nmd = nm_open(priv->ifname, NULL, NETMAP_NO_TX_POLL, NULL);
	if (priv->nmd == NULL) {
		WPRINTF(("Unable to nm_open(): interface '%s', errno (%s)",
		    devname, strerror(errno)));
		return (-1);
	}

	priv->memid = priv->nmd->req.nr_arg2;
	priv->tx = NETMAP_TXRING(priv->nmd->nifp, 0);
	priv->rx = NETMAP_RXRING(priv->nmd->nifp, 0);
	priv->cb = cb;
	priv->cb_param = param;
	be->fd = priv->nmd->fd;

	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
	if (priv->mevp == NULL) {
		WPRINTF(("Could not register event"));
		return (-1);
	}

	return (0);
}

static void
netmap_cleanup(struct net_backend *be)
{
	struct netmap_priv *priv = NET_BE_PRIV(be);

	if (priv->mevp) {
		mevent_delete(priv->mevp);
	}
	if (priv->nmd) {
		nm_close(priv->nmd);
	}
	be->fd = -1;
}

static ssize_t
netmap_send(struct net_backend *be, const struct iovec *iov,
    int iovcnt)
{
	struct netmap_priv *priv = NET_BE_PRIV(be);
	struct netmap_ring *ring;
	ssize_t totlen = 0;
	int nm_buf_size;
	int nm_buf_len;
	uint32_t head;
	uint8_t *nm_buf;
	int j;

	ring = priv->tx;
	head = ring->head;
	if (head == ring->tail) {
		WPRINTF(("No space, drop %zu bytes",
		    count_iov(iov, iovcnt)));
		goto txsync;
	}
	nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
	nm_buf_size = ring->nr_buf_size;
	nm_buf_len = 0;

	for (j = 0; j < iovcnt; j++) {
		uint8_t *iov_frag_buf = iov[j].iov_base;
		int iov_frag_size = iov[j].iov_len;

		totlen += iov_frag_size;

		/*
		 * Split each iovec fragment over more netmap slots, if
		 * necessary.
		 */
		for (;;) {
			int copylen;

			copylen = iov_frag_size < nm_buf_size ?
			    iov_frag_size : nm_buf_size;
			memcpy(nm_buf, iov_frag_buf, copylen);

			iov_frag_buf += copylen;
			iov_frag_size -= copylen;
			nm_buf += copylen;
			nm_buf_size -= copylen;
			nm_buf_len += copylen;

			if (iov_frag_size == 0) {
				break;
			}

			ring->slot[head].len = nm_buf_len;
			ring->slot[head].flags = NS_MOREFRAG;
			head = nm_ring_next(ring, head);
			if (head == ring->tail) {
				/*
				 * We ran out of netmap slots while
				 * splitting the iovec fragments.
				 */
				WPRINTF(("No space, drop %zu bytes",
				    count_iov(iov, iovcnt)));
				goto txsync;
			}
			nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
			nm_buf_size = ring->nr_buf_size;
			nm_buf_len = 0;
		}
	}

	/* Complete the last slot, which must not have NS_MOREFRAG set. */
	ring->slot[head].len = nm_buf_len;
	ring->slot[head].flags = 0;
	head = nm_ring_next(ring, head);

	/* Now update ring->head and ring->cur. */
	ring->head = ring->cur = head;
txsync:
	ioctl(be->fd, NIOCTXSYNC, NULL);

	return (totlen);
}
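
/*
 * Slot layout produced by netmap_send() for a packet spanning three
 * netmap slots (illustrative; indices wrap via nm_ring_next()):
 *
 *	slot[head]:   len = nr_buf_size, flags = NS_MOREFRAG
 *	slot[head+1]: len = nr_buf_size, flags = NS_MOREFRAG
 *	slot[head+2]: len = residual,    flags = 0
 *
 * Only the last fragment of a packet has NS_MOREFRAG cleared; the RX
 * routines below rely on the same convention to reassemble packets.
 */
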
static ssize_t
netmap_peek_recvlen(struct net_backend *be)
{
	struct netmap_priv *priv = NET_BE_PRIV(be);
	struct netmap_ring *ring = priv->rx;
	uint32_t head = ring->head;
	ssize_t totlen = 0;

	while (head != ring->tail) {
		struct netmap_slot *slot = ring->slot + head;

		totlen += slot->len;
		if ((slot->flags & NS_MOREFRAG) == 0)
			break;
		head = nm_ring_next(ring, head);
	}

	return (totlen);
}

static ssize_t
netmap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
{
	struct netmap_priv *priv = NET_BE_PRIV(be);
	struct netmap_slot *slot = NULL;
	struct netmap_ring *ring;
	uint8_t *iov_frag_buf;
	int iov_frag_size;
	ssize_t totlen = 0;
	uint32_t head;

	assert(iovcnt);

	ring = priv->rx;
	head = ring->head;
	iov_frag_buf = iov->iov_base;
	iov_frag_size = iov->iov_len;

	do {
		uint8_t *nm_buf;
		int nm_buf_len;

		if (head == ring->tail) {
			return (0);
		}

		slot = ring->slot + head;
		nm_buf = NETMAP_BUF(ring, slot->buf_idx);
		nm_buf_len = slot->len;

		for (;;) {
			int copylen = nm_buf_len < iov_frag_size ?
			    nm_buf_len : iov_frag_size;

			memcpy(iov_frag_buf, nm_buf, copylen);
			nm_buf += copylen;
			nm_buf_len -= copylen;
			iov_frag_buf += copylen;
			iov_frag_size -= copylen;
			totlen += copylen;

			if (nm_buf_len == 0) {
				break;
			}

			iov++;
			iovcnt--;
			if (iovcnt == 0) {
				/* No space to receive. */
				WPRINTF(("Short iov, drop %zd bytes",
				    totlen));
				return (-ENOSPC);
			}
			iov_frag_buf = iov->iov_base;
			iov_frag_size = iov->iov_len;
		}

		head = nm_ring_next(ring, head);

	} while (slot->flags & NS_MOREFRAG);

	/* Release slots to netmap. */
	ring->head = ring->cur = head;

	return (totlen);
}

static void
netmap_recv_enable(struct net_backend *be)
{
	struct netmap_priv *priv = NET_BE_PRIV(be);

	mevent_enable(priv->mevp);
}

static void
netmap_recv_disable(struct net_backend *be)
{
	struct netmap_priv *priv = NET_BE_PRIV(be);

	mevent_disable(priv->mevp);
}

static struct net_backend netmap_backend = {
	.prefix = "netmap",
	.priv_size = sizeof(struct netmap_priv),
	.init = netmap_init,
	.cleanup = netmap_cleanup,
	.send = netmap_send,
	.peek_recvlen = netmap_peek_recvlen,
	.recv = netmap_recv,
	.recv_enable = netmap_recv_enable,
	.recv_disable = netmap_recv_disable,
	.get_cap = netmap_get_cap,
	.set_cap = netmap_set_cap,
};

/* A clone of the netmap backend, with a different prefix. */
static struct net_backend vale_backend = {
	.prefix = "vale",
	.priv_size = sizeof(struct netmap_priv),
	.init = netmap_init,
	.cleanup = netmap_cleanup,
	.send = netmap_send,
	.peek_recvlen = netmap_peek_recvlen,
	.recv = netmap_recv,
	.recv_enable = netmap_recv_enable,
	.recv_disable = netmap_recv_disable,
	.get_cap = netmap_get_cap,
	.set_cap = netmap_set_cap,
};

DATA_SET(net_backend_set, netmap_backend);
DATA_SET(net_backend_set, vale_backend);
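
/*
 * Example (illustrative): two guests can be plugged into the same VALE
 * switch by giving them distinct ports of a common switch name,
 *
 *	bhyve ... -s 2:0,virtio-net,vale0:vm1 ...
 *	bhyve ... -s 2:0,virtio-net,vale0:vm2 ...
 *
 * while a name such as netmap:em0 attaches to a physical interface
 * instead. In both cases nm_open() parses the name that netmap_init()
 * receives as the device name.
 */
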
int
netbe_legacy_config(nvlist_t *nvl, const char *opts)
{
	char *backend, *cp;

	if (opts == NULL)
		return (0);

	cp = strchr(opts, ',');
	if (cp == NULL) {
		set_config_value_node(nvl, "backend", opts);
		return (0);
	}
	backend = strndup(opts, cp - opts);
	set_config_value_node(nvl, "backend", backend);
	free(backend);
	return (pci_parse_legacy_config(nvl, cp + 1));
}
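
/*
 * Example (illustrative): given the legacy option string
 *
 *	"tap0,mac=00:a0:98:75:3c:ab"
 *
 * netbe_legacy_config() stores "tap0" under the "backend" key and lets
 * pci_parse_legacy_config() consume the remaining "mac=..." part on
 * behalf of the frontend. The MAC address value is hypothetical.
 */
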
/*
 * Initialize a backend and attach to the frontend.
 * This is called during frontend initialization.
 *  @ret is a pointer to the backend to be initialized
 *  @devname is the backend-name as supplied on the command line,
 *	e.g. -s 2:0,frontend-name,backend-name[,other-args]
 *  @cb is the receive callback supplied by the frontend,
 *	and it is invoked in the event loop when a receive
 *	event is generated in the hypervisor,
 *  @param is a pointer to the frontend, and normally used as
 *	the argument for the callback.
 */
int
netbe_init(struct net_backend **ret, nvlist_t *nvl, net_be_rxeof_t cb,
    void *param)
{
	struct net_backend **pbe, *nbe, *tbe = NULL;
	const char *value, *type;
	char *devname;
	int err;

	value = get_config_value_node(nvl, "backend");
	if (value == NULL) {
		return (-1);
	}
	devname = strdup(value);

	/*
	 * Use the type given by the configuration if it exists; otherwise
	 * use the prefix of the backend as the type.
	 */
	type = get_config_value_node(nvl, "type");
	if (type == NULL)
		type = devname;

	/*
	 * Find the network backend that matches the user-provided
	 * device name. net_backend_set is built using a linker set.
	 */
	SET_FOREACH(pbe, net_backend_set) {
		if (strncmp(type, (*pbe)->prefix,
		    strlen((*pbe)->prefix)) == 0) {
			tbe = *pbe;
			assert(tbe->init != NULL);
			assert(tbe->cleanup != NULL);
			assert(tbe->send != NULL);
			assert(tbe->recv != NULL);
			assert(tbe->get_cap != NULL);
			assert(tbe->set_cap != NULL);
			break;
		}
	}

	*ret = NULL;
	if (tbe == NULL) {
		free(devname);
		return (EINVAL);
	}

	nbe = calloc(1, NET_BE_SIZE(tbe));
	if (nbe == NULL) {
		free(devname);
		return (ENOMEM);
	}
	*nbe = *tbe;	/* copy the template */
	nbe->fd = -1;
	nbe->sc = param;
	nbe->be_vnet_hdr_len = 0;
	nbe->fe_vnet_hdr_len = 0;

	/* Initialize the backend. */
	err = nbe->init(nbe, devname, nvl, cb, param);
	if (err) {
		free(devname);
		free(nbe);
		return (err);
	}

	*ret = nbe;
	free(devname);

	return (0);
}

void
netbe_cleanup(struct net_backend *be)
{

	if (be != NULL) {
		be->cleanup(be);
		free(be);
	}
}

uint64_t
netbe_get_cap(struct net_backend *be)
{

	assert(be != NULL);
	return (be->get_cap(be));
}

int
netbe_set_cap(struct net_backend *be, uint64_t features,
    unsigned vnet_hdr_len)
{
	int ret;

	assert(be != NULL);

	/*
	 * There are only three valid lengths, i.e., 0, 10 and 12:
	 * no header, the virtio-net header without the num_buffers
	 * field, and the full header including it.
	 */
	if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN &&
	    vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t)))
		return (-1);

	be->fe_vnet_hdr_len = vnet_hdr_len;

	ret = be->set_cap(be, features, vnet_hdr_len);
	assert(be->be_vnet_hdr_len == 0 ||
	    be->be_vnet_hdr_len == be->fe_vnet_hdr_len);

	return (ret);
}

ssize_t
netbe_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
{

	return (be->send(be, iov, iovcnt));
}

ssize_t
netbe_peek_recvlen(struct net_backend *be)
{

	return (be->peek_recvlen(be));
}

/*
 * Try to read a packet from the backend, without blocking.
 * If no packets are available, return 0. In case of success, return
 * the length of the packet just read. Return -1 in case of errors.
 */
ssize_t
netbe_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
{

	return (be->recv(be, iov, iovcnt));
}

/*
 * Read a packet from the backend and discard it.
 * Returns the size of the discarded packet or zero if no packet was
 * available. A negative error code is returned in case of read error.
 */
ssize_t
netbe_rx_discard(struct net_backend *be)
{
	/*
	 * MP note: the dummybuf is only used to discard frames,
	 * so there is no need for it to be per-vtnet or locked.
	 * We only make it large enough for a TSO-sized segment.
	 */
	static uint8_t dummybuf[65536 + 64];
	struct iovec iov;

	iov.iov_base = dummybuf;
	iov.iov_len = sizeof(dummybuf);

	return (netbe_recv(be, &iov, 1));
}

void
netbe_rx_disable(struct net_backend *be)
{

	be->recv_disable(be);
}

void
netbe_rx_enable(struct net_backend *be)
{

	be->recv_enable(be);
}

size_t
netbe_get_vnet_hdr_len(struct net_backend *be)
{

	return (be->be_vnet_hdr_len);
}
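
/*
 * Illustrative frontend lifecycle (a sketch based on the API above;
 * rx_callback, sc, nvl, features and hdrlen are hypothetical names):
 *
 *	struct net_backend *be;
 *
 *	if (netbe_init(&be, nvl, rx_callback, sc) != 0)
 *		return;				(attach failed)
 *	features &= netbe_get_cap(be);		(negotiate offloads)
 *	netbe_set_cap(be, features, hdrlen);
 *	netbe_rx_enable(be);			(start RX events)
 *	...
 *	netbe_send(be, iov, iovcnt);		(TX path)
 *	len = netbe_recv(be, iov, iovcnt);	(RX path, from rx_callback)
 *	...
 *	netbe_cleanup(be);			(detach and free)
 */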