/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2019 Vincenzo Maffione <vmaffione@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
 * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
 * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * $FreeBSD$
 */

/*
 * This file implements multiple network backends (tap, netmap, ...),
 * to be used by network frontends such as virtio-net and e1000.
 * The API to access the backend (e.g. send/receive packets, negotiate
 * features) is exported by net_backends.h.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/types.h>		/* u_short etc */
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/uio.h>

#include <net/if.h>
#include <net/netmap.h>
#include <net/netmap_virt.h>
#define NETMAP_WITH_LIBS
#include <net/netmap_user.h>

#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sysexits.h>
#include <assert.h>
#include <pthread.h>
#include <pthread_np.h>
#include <poll.h>

#ifdef NETGRAPH
#include <sys/param.h>
#include <sys/sysctl.h>
#include <netgraph.h>
#endif

#include "config.h"
#include "debug.h"
#include "iov.h"
#include "mevent.h"
#include "net_backends.h"
#include "pci_emul.h"

#include <sys/linker_set.h>

/*
 * Each network backend registers a set of function pointers that are
 * used to implement the net backends API.
 * This might need to be exposed if we implement backends in separate files.
 */
struct net_backend {
	const char *prefix;	/* prefix matching this backend */

	/*
	 * Routines used to initialize and cleanup the resources needed
	 * by a backend. The cleanup function is used internally,
	 * and should not be called by the frontend.
	 */
	int (*init)(struct net_backend *be, const char *devname,
	    nvlist_t *nvl, net_be_rxeof_t cb, void *param);
	void (*cleanup)(struct net_backend *be);

	/*
	 * Called to serve a guest transmit request. The scatter-gather
	 * vector provided by the caller has 'iovcnt' elements and contains
	 * the packet to send.
	 */
	ssize_t (*send)(struct net_backend *be, const struct iovec *iov,
	    int iovcnt);

	/*
	 * Get the length of the next packet that can be received from
	 * the backend. If no packets are currently available, this
	 * function returns 0.
	 */
	ssize_t (*peek_recvlen)(struct net_backend *be);

	/*
	 * Called to receive a packet from the backend. When the function
	 * returns a positive value 'len', the scatter-gather vector
	 * provided by the caller contains a packet with such length.
	 * The function returns 0 if the backend doesn't have a new packet to
	 * receive.
	 */
	ssize_t (*recv)(struct net_backend *be, const struct iovec *iov,
	    int iovcnt);

	/*
	 * Ask the backend to enable or disable receive operation in the
	 * backend. On return from a disable operation, it is guaranteed
	 * that the receive callback won't be called until receive is
	 * enabled again. Note however that it is up to the caller to make
	 * sure that netbe_recv() is not currently being executed by another
	 * thread.
	 */
	void (*recv_enable)(struct net_backend *be);
	void (*recv_disable)(struct net_backend *be);

	/*
	 * Ask the backend for the virtio-net features it is able to
	 * support. Possible features are TSO, UFO and checksum offloading
	 * in both rx and tx direction and for both IPv4 and IPv6.
	 */
	uint64_t (*get_cap)(struct net_backend *be);

	/*
	 * Tell the backend to enable/disable the specified virtio-net
	 * features (capabilities).
	 */
	int (*set_cap)(struct net_backend *be, uint64_t features,
	    unsigned int vnet_hdr_len);

	struct pci_vtnet_softc *sc;
	int fd;

	/*
	 * Length of the virtio-net header used by the backend and the
	 * frontend, respectively. A zero value means that the header
	 * is not used.
	 */
	unsigned int be_vnet_hdr_len;
	unsigned int fe_vnet_hdr_len;

	/* Size of backend-specific private data. */
	size_t priv_size;

	/* Room for backend-specific data. */
	char opaque[0];
};

SET_DECLARE(net_backend_set, struct net_backend);
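/*
 * A minimal sketch of how a backend plugs into this linker set (the
 * "null" names below are illustrative, not backends in this file):
 *
 *	static struct net_backend null_backend = {
 *		.prefix = "null",
 *		.priv_size = 0,
 *		.init = null_init,
 *		... remaining mandatory callbacks ...
 *	};
 *	DATA_SET(net_backend_set, null_backend);
 *
 * netbe_init() walks net_backend_set and picks the first entry whose
 * prefix matches the device name given on the command line.
 */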
#define VNET_HDR_LEN	sizeof(struct virtio_net_rxhdr)

#define WPRINTF(params) PRINTLN params

/*
 * The tap backend
 */

struct tap_priv {
	struct mevent *mevp;
	/*
	 * A bounce buffer that allows us to implement the peek_recvlen
	 * callback. In the future we may get the same information from
	 * the kevent data.
	 */
	char bbuf[1 << 16];
	ssize_t bbuflen;
};

static void
tap_cleanup(struct net_backend *be)
{
	struct tap_priv *priv = (struct tap_priv *)be->opaque;

	if (priv->mevp) {
		mevent_delete(priv->mevp);
	}
	if (be->fd != -1) {
		close(be->fd);
		be->fd = -1;
	}
}
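/*
 * Example (hypothetical command line): with "-s 2:0,virtio-net,tap0",
 * netbe_init() matches the "tap" prefix and tap_init() below opens
 * /dev/tap0, makes it non-blocking and registers the receive callback
 * with the mevent loop.
 */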
static int
tap_init(struct net_backend *be, const char *devname,
    nvlist_t *nvl, net_be_rxeof_t cb, void *param)
{
	struct tap_priv *priv = (struct tap_priv *)be->opaque;
	char tbuf[80];
	int opt = 1;
#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
#endif

	if (cb == NULL) {
		WPRINTF(("TAP backend requires non-NULL callback"));
		return (-1);
	}

	strcpy(tbuf, "/dev/");
	strlcat(tbuf, devname, sizeof(tbuf));

	be->fd = open(tbuf, O_RDWR);
	if (be->fd == -1) {
		WPRINTF(("open of tap device %s failed", tbuf));
		goto error;
	}

	/*
	 * Set non-blocking and register for read
	 * notifications with the event loop
	 */
	if (ioctl(be->fd, FIONBIO, &opt) < 0) {
		WPRINTF(("tap device O_NONBLOCK failed"));
		goto error;
	}

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
	if (caph_rights_limit(be->fd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	memset(priv->bbuf, 0, sizeof(priv->bbuf));
	priv->bbuflen = 0;

	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
	if (priv->mevp == NULL) {
		WPRINTF(("Could not register event"));
		goto error;
	}

	return (0);

error:
	tap_cleanup(be);
	return (-1);
}

/*
 * Called to send a buffer chain out to the tap device
 */
static ssize_t
tap_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
{
	return (writev(be->fd, iov, iovcnt));
}

static ssize_t
tap_peek_recvlen(struct net_backend *be)
{
	struct tap_priv *priv = (struct tap_priv *)be->opaque;
	ssize_t ret;

	if (priv->bbuflen > 0) {
		/*
		 * We already have a packet in the bounce buffer.
		 * Just return its length.
		 */
		return (priv->bbuflen);
	}

	/*
	 * Read the next packet (if any) into the bounce buffer, so
	 * that we get to know its length and we can return that
	 * to the caller.
	 */
	ret = read(be->fd, priv->bbuf, sizeof(priv->bbuf));
	if (ret < 0 && errno == EWOULDBLOCK) {
		return (0);
	}

	if (ret > 0)
		priv->bbuflen = ret;

	return (ret);
}

static ssize_t
tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
{
	struct tap_priv *priv = (struct tap_priv *)be->opaque;
	ssize_t ret;

	if (priv->bbuflen > 0) {
		/*
		 * A packet is available in the bounce buffer, so
		 * we read it from there.
		 */
		ret = buf_to_iov(priv->bbuf, priv->bbuflen,
		    iov, iovcnt, 0);

		/* Mark the bounce buffer as empty. */
		priv->bbuflen = 0;

		return (ret);
	}

	ret = readv(be->fd, iov, iovcnt);
	if (ret < 0 && errno == EWOULDBLOCK) {
		return (0);
	}

	return (ret);
}
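/*
 * A sketch of how a frontend is expected to drive the receive path
 * (hypothetical caller code, not part of this file):
 *
 *	while ((len = netbe_peek_recvlen(be)) > 0) {
 *		... prepare an iovec with at least 'len' bytes ...
 *		netbe_recv(be, iov, iovcnt);
 *	}
 */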
static void
tap_recv_enable(struct net_backend *be)
{
	struct tap_priv *priv = (struct tap_priv *)be->opaque;

	mevent_enable(priv->mevp);
}

static void
tap_recv_disable(struct net_backend *be)
{
	struct tap_priv *priv = (struct tap_priv *)be->opaque;

	mevent_disable(priv->mevp);
}

static uint64_t
tap_get_cap(struct net_backend *be)
{

	return (0); /* no capabilities for now */
}

static int
tap_set_cap(struct net_backend *be, uint64_t features,
    unsigned vnet_hdr_len)
{

	return ((features || vnet_hdr_len) ? -1 : 0);
}

static struct net_backend tap_backend = {
	.prefix = "tap",
	.priv_size = sizeof(struct tap_priv),
	.init = tap_init,
	.cleanup = tap_cleanup,
	.send = tap_send,
	.peek_recvlen = tap_peek_recvlen,
	.recv = tap_recv,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,
};

/* A clone of the tap backend, with a different prefix. */
static struct net_backend vmnet_backend = {
	.prefix = "vmnet",
	.priv_size = sizeof(struct tap_priv),
	.init = tap_init,
	.cleanup = tap_cleanup,
	.send = tap_send,
	.peek_recvlen = tap_peek_recvlen,
	.recv = tap_recv,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,
};

DATA_SET(net_backend_set, tap_backend);
DATA_SET(net_backend_set, vmnet_backend);
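/*
 * The netgraph backend below attaches the VM to an arbitrary ng(4)
 * node. A hypothetical invocation (node and hook names are
 * illustrative):
 *
 *	bhyve ... -s 2:0,virtio-net,netgraph,path=vmbr:,peerhook=link0
 *
 * "path" and "peerhook" are required; "hook" defaults to "vmlink",
 * and "socket" optionally names the ng_socket(4) node.
 */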
#ifdef NETGRAPH

/*
 * Netgraph backend
 */

#define NG_SBUF_MAX_SIZE (4 * 1024 * 1024)

static int
ng_init(struct net_backend *be, const char *devname,
    nvlist_t *nvl, net_be_rxeof_t cb, void *param)
{
	struct tap_priv *p = (struct tap_priv *)be->opaque;
	struct ngm_connect ngc;
	const char *value, *nodename;
	int sbsz;
	int ctrl_sock;
	int flags;
	unsigned long maxsbsz;
	size_t msbsz;
#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
#endif

	if (cb == NULL) {
		WPRINTF(("Netgraph backend requires non-NULL callback"));
		return (-1);
	}

	be->fd = -1;

	memset(&ngc, 0, sizeof(ngc));

	value = get_config_value_node(nvl, "path");
	if (value == NULL) {
		WPRINTF(("path must be provided"));
		return (-1);
	}
	strncpy(ngc.path, value, NG_PATHSIZ - 1);

	value = get_config_value_node(nvl, "hook");
	if (value == NULL)
		value = "vmlink";
	strncpy(ngc.ourhook, value, NG_HOOKSIZ - 1);

	value = get_config_value_node(nvl, "peerhook");
	if (value == NULL) {
		WPRINTF(("peer hook must be provided"));
		return (-1);
	}
	strncpy(ngc.peerhook, value, NG_HOOKSIZ - 1);

	nodename = get_config_value_node(nvl, "socket");
	if (NgMkSockNode(nodename, &ctrl_sock, &be->fd) < 0) {
		WPRINTF(("can't get Netgraph sockets"));
		return (-1);
	}

	if (NgSendMsg(ctrl_sock, ".", NGM_GENERIC_COOKIE,
	    NGM_CONNECT, &ngc, sizeof(ngc)) < 0) {
		WPRINTF(("can't connect to node"));
		close(ctrl_sock);
		goto error;
	}

	close(ctrl_sock);

	flags = fcntl(be->fd, F_GETFL);

	if (flags < 0) {
		WPRINTF(("can't get socket flags"));
		goto error;
	}

	if (fcntl(be->fd, F_SETFL, flags | O_NONBLOCK) < 0) {
		WPRINTF(("can't set O_NONBLOCK flag"));
		goto error;
	}

	/*
	 * The default ng_socket(4) buffer size is too low.
	 * Calculate the minimum value between NG_SBUF_MAX_SIZE
	 * and kern.ipc.maxsockbuf.
	 */
	msbsz = sizeof(maxsbsz);
	if (sysctlbyname("kern.ipc.maxsockbuf", &maxsbsz, &msbsz,
	    NULL, 0) < 0) {
		WPRINTF(("can't get 'kern.ipc.maxsockbuf' value"));
		goto error;
	}

	/*
	 * We can't set the socket buffer size to kern.ipc.maxsockbuf value,
	 * as it takes into account the mbuf(9) overhead. For example, with
	 * the default maxsockbuf of 2097152 bytes, MSIZE = 256 and
	 * MCLBYTES = 2048, the scaled value is 2097152 * 2048 / 2304,
	 * i.e. about 1.78 MiB.
	 */
	maxsbsz = maxsbsz * MCLBYTES / (MSIZE + MCLBYTES);

	sbsz = MIN(NG_SBUF_MAX_SIZE, maxsbsz);

	if (setsockopt(be->fd, SOL_SOCKET, SO_SNDBUF, &sbsz,
	    sizeof(sbsz)) < 0) {
		WPRINTF(("can't set TX buffer size"));
		goto error;
	}

	if (setsockopt(be->fd, SOL_SOCKET, SO_RCVBUF, &sbsz,
	    sizeof(sbsz)) < 0) {
		WPRINTF(("can't set RX buffer size"));
		goto error;
	}

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
	if (caph_rights_limit(be->fd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	memset(p->bbuf, 0, sizeof(p->bbuf));
	p->bbuflen = 0;

	p->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
	if (p->mevp == NULL) {
		WPRINTF(("Could not register event"));
		goto error;
	}

	return (0);

error:
	tap_cleanup(be);
	return (-1);
}

static struct net_backend ng_backend = {
	.prefix = "netgraph",
	.priv_size = sizeof(struct tap_priv),
	.init = ng_init,
	.cleanup = tap_cleanup,
	.send = tap_send,
	.peek_recvlen = tap_peek_recvlen,
	.recv = tap_recv,
	.recv_enable = tap_recv_enable,
	.recv_disable = tap_recv_disable,
	.get_cap = tap_get_cap,
	.set_cap = tap_set_cap,
};

DATA_SET(net_backend_set, ng_backend);

#endif /* NETGRAPH */

/*
 * The netmap backend
 */

/* The virtio-net features supported by netmap. */
#define NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \
		VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \
		VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 | \
		VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO)

struct netmap_priv {
	char ifname[IFNAMSIZ];
	struct nm_desc *nmd;
	uint16_t memid;
	struct netmap_ring *rx;
	struct netmap_ring *tx;
	struct mevent *mevp;
	net_be_rxeof_t cb;
	void *cb_param;
};

static void
nmreq_init(struct nmreq *req, char *ifname)
{

	memset(req, 0, sizeof(*req));
	strlcpy(req->nr_name, ifname, sizeof(req->nr_name));
	req->nr_version = NETMAP_API;
}

static int
netmap_set_vnet_hdr_len(struct net_backend *be, int vnet_hdr_len)
{
	int err;
	struct nmreq req;
	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;

	nmreq_init(&req, priv->ifname);
	req.nr_cmd = NETMAP_BDG_VNET_HDR;
	req.nr_arg1 = vnet_hdr_len;
	err = ioctl(be->fd, NIOCREGIF, &req);
	if (err) {
		WPRINTF(("Unable to set vnet header length %d",
		    vnet_hdr_len));
		return (err);
	}

	be->be_vnet_hdr_len = vnet_hdr_len;

	return (0);
}
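/*
 * Check whether the netmap port can use the given virtio-net header
 * length, by trying to set it and, on success, restoring the previous
 * value. Returns 1 if the length is supported, 0 otherwise.
 */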
static int
netmap_has_vnet_hdr_len(struct net_backend *be, unsigned vnet_hdr_len)
{
	int prev_hdr_len = be->be_vnet_hdr_len;
	int ret;

	if (vnet_hdr_len == prev_hdr_len) {
		return (1);
	}

	ret = netmap_set_vnet_hdr_len(be, vnet_hdr_len);
	if (ret) {
		return (0);
	}

	netmap_set_vnet_hdr_len(be, prev_hdr_len);

	return (1);
}

static uint64_t
netmap_get_cap(struct net_backend *be)
{

	return (netmap_has_vnet_hdr_len(be, VNET_HDR_LEN) ?
	    NETMAP_FEATURES : 0);
}

static int
netmap_set_cap(struct net_backend *be, uint64_t features,
    unsigned vnet_hdr_len)
{

	return (netmap_set_vnet_hdr_len(be, vnet_hdr_len));
}

static int
netmap_init(struct net_backend *be, const char *devname,
    nvlist_t *nvl, net_be_rxeof_t cb, void *param)
{
	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;

	strlcpy(priv->ifname, devname, sizeof(priv->ifname));
	priv->ifname[sizeof(priv->ifname) - 1] = '\0';

	priv->nmd = nm_open(priv->ifname, NULL, NETMAP_NO_TX_POLL, NULL);
	if (priv->nmd == NULL) {
		WPRINTF(("Unable to nm_open(): interface '%s', errno (%s)",
		    devname, strerror(errno)));
		return (-1);
	}

	priv->memid = priv->nmd->req.nr_arg2;
	priv->tx = NETMAP_TXRING(priv->nmd->nifp, 0);
	priv->rx = NETMAP_RXRING(priv->nmd->nifp, 0);
	priv->cb = cb;
	priv->cb_param = param;
	be->fd = priv->nmd->fd;

	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
	if (priv->mevp == NULL) {
		WPRINTF(("Could not register event"));
		return (-1);
	}

	return (0);
}

static void
netmap_cleanup(struct net_backend *be)
{
	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;

	if (priv->mevp) {
		mevent_delete(priv->mevp);
	}
	if (priv->nmd) {
		nm_close(priv->nmd);
	}
	be->fd = -1;
}
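/*
 * Copy the guest packet into consecutive TX slots of the netmap ring,
 * chaining the slots with NS_MOREFRAG, and then kick the transmitter
 * with NIOCTXSYNC. If the ring runs out of slots the packet is dropped.
 */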
static ssize_t
netmap_send(struct net_backend *be, const struct iovec *iov,
    int iovcnt)
{
	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
	struct netmap_ring *ring;
	ssize_t totlen = 0;
	int nm_buf_size;
	int nm_buf_len;
	uint32_t head;
	void *nm_buf;
	int j;

	ring = priv->tx;
	head = ring->head;
	if (head == ring->tail) {
		WPRINTF(("No space, drop %zu bytes", count_iov(iov, iovcnt)));
		goto txsync;
	}
	nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
	nm_buf_size = ring->nr_buf_size;
	nm_buf_len = 0;

	for (j = 0; j < iovcnt; j++) {
		int iov_frag_size = iov[j].iov_len;
		void *iov_frag_buf = iov[j].iov_base;

		totlen += iov_frag_size;

		/*
		 * Split each iovec fragment over more netmap slots, if
		 * necessary.
		 */
		for (;;) {
			int copylen;

			copylen = iov_frag_size < nm_buf_size ?
			    iov_frag_size : nm_buf_size;
			memcpy(nm_buf, iov_frag_buf, copylen);

			iov_frag_buf += copylen;
			iov_frag_size -= copylen;
			nm_buf += copylen;
			nm_buf_size -= copylen;
			nm_buf_len += copylen;

			if (iov_frag_size == 0) {
				break;
			}

			ring->slot[head].len = nm_buf_len;
			ring->slot[head].flags = NS_MOREFRAG;
			head = nm_ring_next(ring, head);
			if (head == ring->tail) {
				/*
				 * We ran out of netmap slots while
				 * splitting the iovec fragments.
				 */
				WPRINTF(("No space, drop %zu bytes",
				    count_iov(iov, iovcnt)));
				goto txsync;
			}
			nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
			nm_buf_size = ring->nr_buf_size;
			nm_buf_len = 0;
		}
	}

	/* Complete the last slot, which must not have NS_MOREFRAG set. */
	ring->slot[head].len = nm_buf_len;
	ring->slot[head].flags = 0;
	head = nm_ring_next(ring, head);

	/* Now update ring->head and ring->cur. */
	ring->head = ring->cur = head;
txsync:
	ioctl(be->fd, NIOCTXSYNC, NULL);

	return (totlen);
}

static ssize_t
netmap_peek_recvlen(struct net_backend *be)
{
	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
	struct netmap_ring *ring = priv->rx;
	uint32_t head = ring->head;
	ssize_t totlen = 0;

	while (head != ring->tail) {
		struct netmap_slot *slot = ring->slot + head;

		totlen += slot->len;
		if ((slot->flags & NS_MOREFRAG) == 0)
			break;
		head = nm_ring_next(ring, head);
	}

	return (totlen);
}
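/*
 * Copy the next received packet out of the RX ring into the caller's
 * scatter-gather vector, reassembling NS_MOREFRAG chains, and then
 * return the slots to netmap by advancing ring->head.
 */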
static ssize_t
netmap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
{
	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
	struct netmap_slot *slot = NULL;
	struct netmap_ring *ring;
	void *iov_frag_buf;
	int iov_frag_size;
	ssize_t totlen = 0;
	uint32_t head;

	assert(iovcnt);

	ring = priv->rx;
	head = ring->head;
	iov_frag_buf = iov->iov_base;
	iov_frag_size = iov->iov_len;

	do {
		int nm_buf_len;
		void *nm_buf;

		if (head == ring->tail) {
			return (0);
		}

		slot = ring->slot + head;
		nm_buf = NETMAP_BUF(ring, slot->buf_idx);
		nm_buf_len = slot->len;

		for (;;) {
			int copylen = nm_buf_len < iov_frag_size ?
			    nm_buf_len : iov_frag_size;

			memcpy(iov_frag_buf, nm_buf, copylen);
			nm_buf += copylen;
			nm_buf_len -= copylen;
			iov_frag_buf += copylen;
			iov_frag_size -= copylen;
			totlen += copylen;

			if (nm_buf_len == 0) {
				break;
			}

			iov++;
			iovcnt--;
			if (iovcnt == 0) {
				/* No space to receive. */
				WPRINTF(("Short iov, drop %zd bytes",
				    totlen));
				return (-ENOSPC);
			}
			iov_frag_buf = iov->iov_base;
			iov_frag_size = iov->iov_len;
		}

		head = nm_ring_next(ring, head);

	} while (slot->flags & NS_MOREFRAG);

	/* Release slots to netmap. */
	ring->head = ring->cur = head;

	return (totlen);
}

static void
netmap_recv_enable(struct net_backend *be)
{
	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;

	mevent_enable(priv->mevp);
}

static void
netmap_recv_disable(struct net_backend *be)
{
	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;

	mevent_disable(priv->mevp);
}

static struct net_backend netmap_backend = {
	.prefix = "netmap",
	.priv_size = sizeof(struct netmap_priv),
	.init = netmap_init,
	.cleanup = netmap_cleanup,
	.send = netmap_send,
	.peek_recvlen = netmap_peek_recvlen,
	.recv = netmap_recv,
	.recv_enable = netmap_recv_enable,
	.recv_disable = netmap_recv_disable,
	.get_cap = netmap_get_cap,
	.set_cap = netmap_set_cap,
};

/* A clone of the netmap backend, with a different prefix. */
static struct net_backend vale_backend = {
	.prefix = "vale",
	.priv_size = sizeof(struct netmap_priv),
	.init = netmap_init,
	.cleanup = netmap_cleanup,
	.send = netmap_send,
	.peek_recvlen = netmap_peek_recvlen,
	.recv = netmap_recv,
	.recv_enable = netmap_recv_enable,
	.recv_disable = netmap_recv_disable,
	.get_cap = netmap_get_cap,
	.set_cap = netmap_set_cap,
};

DATA_SET(net_backend_set, netmap_backend);
DATA_SET(net_backend_set, vale_backend);
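/*
 * Parse the legacy, comma-separated option string into the nvlist
 * configuration. For example (illustrative values), opts
 * "tap0,mac=00:a0:98:00:00:01" stores backend="tap0" and hands
 * "mac=..." to pci_parse_legacy_config().
 */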
int
netbe_legacy_config(nvlist_t *nvl, const char *opts)
{
	char *backend, *cp;

	if (opts == NULL)
		return (0);

	cp = strchr(opts, ',');
	if (cp == NULL) {
		set_config_value_node(nvl, "backend", opts);
		return (0);
	}
	backend = strndup(opts, cp - opts);
	set_config_value_node(nvl, "backend", backend);
	free(backend);
	return (pci_parse_legacy_config(nvl, cp + 1));
}

/*
 * Initialize a backend and attach to the frontend.
 * This is called during frontend initialization.
 *  @ret is a pointer to the backend to be initialized
 *  @devname is the backend-name as supplied on the command line,
 *	e.g. -s 2:0,frontend-name,backend-name[,other-args]
 *  @cb is the receive callback supplied by the frontend,
 *	and it is invoked in the event loop when a receive
 *	event is generated in the hypervisor,
 *  @param is a pointer to the frontend, and normally used as
 *	the argument for the callback.
 */
int
netbe_init(struct net_backend **ret, nvlist_t *nvl, net_be_rxeof_t cb,
    void *param)
{
	struct net_backend **pbe, *nbe, *tbe = NULL;
	const char *value;
	char *devname;
	int err;

	value = get_config_value_node(nvl, "backend");
	if (value == NULL) {
		return (-1);
	}
	devname = strdup(value);

	/*
	 * Find the network backend that matches the user-provided
	 * device name. net_backend_set is built using a linker set.
	 */
	SET_FOREACH(pbe, net_backend_set) {
		if (strncmp(devname, (*pbe)->prefix,
		    strlen((*pbe)->prefix)) == 0) {
			tbe = *pbe;
			assert(tbe->init != NULL);
			assert(tbe->cleanup != NULL);
			assert(tbe->send != NULL);
			assert(tbe->recv != NULL);
			assert(tbe->get_cap != NULL);
			assert(tbe->set_cap != NULL);
			break;
		}
	}

	*ret = NULL;
	if (tbe == NULL) {
		free(devname);
		return (EINVAL);
	}

	nbe = calloc(1, sizeof(*nbe) + tbe->priv_size);
	*nbe = *tbe;	/* copy the template */
	nbe->fd = -1;
	nbe->sc = param;
	nbe->be_vnet_hdr_len = 0;
	nbe->fe_vnet_hdr_len = 0;

	/* Initialize the backend. */
	err = nbe->init(nbe, devname, nvl, cb, param);
	if (err) {
		free(devname);
		free(nbe);
		return (err);
	}

	*ret = nbe;
	free(devname);

	return (0);
}

void
netbe_cleanup(struct net_backend *be)
{

	if (be != NULL) {
		be->cleanup(be);
		free(be);
	}
}

uint64_t
netbe_get_cap(struct net_backend *be)
{

	assert(be != NULL);
	return (be->get_cap(be));
}

int
netbe_set_cap(struct net_backend *be, uint64_t features,
    unsigned vnet_hdr_len)
{
	int ret;

	assert(be != NULL);

	/* There are only three valid lengths, i.e., 0, 10 and 12. */
	if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN
		&& vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t)))
		return (-1);

	be->fe_vnet_hdr_len = vnet_hdr_len;

	ret = be->set_cap(be, features, vnet_hdr_len);
	assert(be->be_vnet_hdr_len == 0 ||
	    be->be_vnet_hdr_len == be->fe_vnet_hdr_len);

	return (ret);
}

ssize_t
netbe_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
{

	return (be->send(be, iov, iovcnt));
}

ssize_t
netbe_peek_recvlen(struct net_backend *be)
{

	return (be->peek_recvlen(be));
}

/*
 * Try to read a packet from the backend, without blocking.
 * If no packets are available, return 0. In case of success, return
 * the length of the packet just read. Return -1 in case of errors.
 */
ssize_t
netbe_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
{

	return (be->recv(be, iov, iovcnt));
}

/*
 * Read a packet from the backend and discard it.
 * Returns the size of the discarded packet or zero if no packet was available.
 * A negative error code is returned in case of read error.
 */
ssize_t
netbe_rx_discard(struct net_backend *be)
{
	/*
	 * MP note: the dummybuf is only used to discard frames,
	 * so there is no need for it to be per-vtnet or locked.
	 * We only make it large enough for a TSO-sized segment.
	 */
	static uint8_t dummybuf[65536 + 64];
	struct iovec iov;

	iov.iov_base = dummybuf;
	iov.iov_len = sizeof(dummybuf);

	return (netbe_recv(be, &iov, 1));
}

void
netbe_rx_disable(struct net_backend *be)
{

	be->recv_disable(be);
}

void
netbe_rx_enable(struct net_backend *be)
{

	be->recv_enable(be);
}

size_t
netbe_get_vnet_hdr_len(struct net_backend *be)
{

	return (be->be_vnet_hdr_len);
}