1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2019 Vincenzo Maffione <vmaffione@FreeBSD.org> 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS 19 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, 20 * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT 21 * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR 22 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 23 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE 24 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 25 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 * 27 * $FreeBSD$ 28 */ 29 30 /* 31 * This file implements multiple network backends (tap, netmap, ...), 32 * to be used by network frontends such as virtio-net and e1000. 33 * The API to access the backend (e.g. send/receive packets, negotiate 34 * features) is exported by net_backends.h. 35 */ 36 37 #include <sys/cdefs.h> 38 __FBSDID("$FreeBSD$"); 39 40 #include <sys/types.h> /* u_short etc */ 41 #ifndef WITHOUT_CAPSICUM 42 #include <sys/capsicum.h> 43 #endif 44 #include <sys/ioctl.h> 45 #include <sys/mman.h> 46 #include <sys/uio.h> 47 48 #include <net/if.h> 49 #include <net/netmap.h> 50 #include <net/netmap_virt.h> 51 #define NETMAP_WITH_LIBS 52 #include <net/netmap_user.h> 53 54 #ifndef WITHOUT_CAPSICUM 55 #include <capsicum_helpers.h> 56 #endif 57 #include <err.h> 58 #include <errno.h> 59 #include <fcntl.h> 60 #include <stdio.h> 61 #include <stdlib.h> 62 #include <stdint.h> 63 #include <string.h> 64 #include <unistd.h> 65 #include <sysexits.h> 66 #include <assert.h> 67 #include <pthread.h> 68 #include <pthread_np.h> 69 #include <poll.h> 70 #include <assert.h> 71 72 #ifdef NETGRAPH 73 #include <sys/param.h> 74 #include <sys/sysctl.h> 75 #include <netgraph.h> 76 #endif 77 78 #include "debug.h" 79 #include "iov.h" 80 #include "mevent.h" 81 #include "net_backends.h" 82 83 #include <sys/linker_set.h> 84 85 /* 86 * Each network backend registers a set of function pointers that are 87 * used to implement the net backends API. 88 * This might need to be exposed if we implement backends in separate files. 89 */ 90 struct net_backend { 91 const char *prefix; /* prefix matching this backend */ 92 93 /* 94 * Routines used to initialize and cleanup the resources needed 95 * by a backend. The cleanup function is used internally, 96 * and should not be called by the frontend. 97 */ 98 int (*init)(struct net_backend *be, const char *devname, 99 const char *opts, net_be_rxeof_t cb, void *param); 100 void (*cleanup)(struct net_backend *be); 101 102 /* 103 * Called to serve a guest transmit request. The scatter-gather 104 * vector provided by the caller has 'iovcnt' elements and contains 105 * the packet to send. 106 */ 107 ssize_t (*send)(struct net_backend *be, const struct iovec *iov, 108 int iovcnt); 109 110 /* 111 * Get the length of the next packet that can be received from 112 * the backend. If no packets are currently available, this 113 * function returns 0. 114 */ 115 ssize_t (*peek_recvlen)(struct net_backend *be); 116 117 /* 118 * Called to receive a packet from the backend. When the function 119 * returns a positive value 'len', the scatter-gather vector 120 * provided by the caller contains a packet with such length. 121 * The function returns 0 if the backend doesn't have a new packet to 122 * receive. 123 */ 124 ssize_t (*recv)(struct net_backend *be, const struct iovec *iov, 125 int iovcnt); 126 127 /* 128 * Ask the backend to enable or disable receive operation in the 129 * backend. On return from a disable operation, it is guaranteed 130 * that the receive callback won't be called until receive is 131 * enabled again. Note however that it is up to the caller to make 132 * sure that netbe_recv() is not currently being executed by another 133 * thread. 134 */ 135 void (*recv_enable)(struct net_backend *be); 136 void (*recv_disable)(struct net_backend *be); 137 138 /* 139 * Ask the backend for the virtio-net features it is able to 140 * support. Possible features are TSO, UFO and checksum offloading 141 * in both rx and tx direction and for both IPv4 and IPv6. 142 */ 143 uint64_t (*get_cap)(struct net_backend *be); 144 145 /* 146 * Tell the backend to enable/disable the specified virtio-net 147 * features (capabilities). 148 */ 149 int (*set_cap)(struct net_backend *be, uint64_t features, 150 unsigned int vnet_hdr_len); 151 152 struct pci_vtnet_softc *sc; 153 int fd; 154 155 /* 156 * Length of the virtio-net header used by the backend and the 157 * frontend, respectively. A zero value means that the header 158 * is not used. 159 */ 160 unsigned int be_vnet_hdr_len; 161 unsigned int fe_vnet_hdr_len; 162 163 /* Size of backend-specific private data. */ 164 size_t priv_size; 165 166 /* Room for backend-specific data. */ 167 char opaque[0]; 168 }; 169 170 SET_DECLARE(net_backend_set, struct net_backend); 171 172 #define VNET_HDR_LEN sizeof(struct virtio_net_rxhdr) 173 174 #define WPRINTF(params) PRINTLN params 175 176 /* 177 * The tap backend 178 */ 179 180 struct tap_priv { 181 struct mevent *mevp; 182 /* 183 * A bounce buffer that allows us to implement the peek_recvlen 184 * callback. In the future we may get the same information from 185 * the kevent data. 186 */ 187 char bbuf[1 << 16]; 188 ssize_t bbuflen; 189 }; 190 191 static void 192 tap_cleanup(struct net_backend *be) 193 { 194 struct tap_priv *priv = (struct tap_priv *)be->opaque; 195 196 if (priv->mevp) { 197 mevent_delete(priv->mevp); 198 } 199 if (be->fd != -1) { 200 close(be->fd); 201 be->fd = -1; 202 } 203 } 204 205 static int 206 tap_init(struct net_backend *be, const char *devname, 207 const char *opts, net_be_rxeof_t cb, void *param) 208 { 209 struct tap_priv *priv = (struct tap_priv *)be->opaque; 210 char tbuf[80]; 211 int opt = 1; 212 #ifndef WITHOUT_CAPSICUM 213 cap_rights_t rights; 214 #endif 215 216 if (cb == NULL) { 217 WPRINTF(("TAP backend requires non-NULL callback")); 218 return (-1); 219 } 220 221 strcpy(tbuf, "/dev/"); 222 strlcat(tbuf, devname, sizeof(tbuf)); 223 224 be->fd = open(tbuf, O_RDWR); 225 if (be->fd == -1) { 226 WPRINTF(("open of tap device %s failed", tbuf)); 227 goto error; 228 } 229 230 /* 231 * Set non-blocking and register for read 232 * notifications with the event loop 233 */ 234 if (ioctl(be->fd, FIONBIO, &opt) < 0) { 235 WPRINTF(("tap device O_NONBLOCK failed")); 236 goto error; 237 } 238 239 #ifndef WITHOUT_CAPSICUM 240 cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); 241 if (caph_rights_limit(be->fd, &rights) == -1) 242 errx(EX_OSERR, "Unable to apply rights for sandbox"); 243 #endif 244 245 memset(priv->bbuf, 0, sizeof(priv->bbuf)); 246 priv->bbuflen = 0; 247 248 priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param); 249 if (priv->mevp == NULL) { 250 WPRINTF(("Could not register event")); 251 goto error; 252 } 253 254 return (0); 255 256 error: 257 tap_cleanup(be); 258 return (-1); 259 } 260 261 /* 262 * Called to send a buffer chain out to the tap device 263 */ 264 static ssize_t 265 tap_send(struct net_backend *be, const struct iovec *iov, int iovcnt) 266 { 267 return (writev(be->fd, iov, iovcnt)); 268 } 269 270 static ssize_t 271 tap_peek_recvlen(struct net_backend *be) 272 { 273 struct tap_priv *priv = (struct tap_priv *)be->opaque; 274 ssize_t ret; 275 276 if (priv->bbuflen > 0) { 277 /* 278 * We already have a packet in the bounce buffer. 279 * Just return its length. 280 */ 281 return priv->bbuflen; 282 } 283 284 /* 285 * Read the next packet (if any) into the bounce buffer, so 286 * that we get to know its length and we can return that 287 * to the caller. 288 */ 289 ret = read(be->fd, priv->bbuf, sizeof(priv->bbuf)); 290 if (ret < 0 && errno == EWOULDBLOCK) { 291 return (0); 292 } 293 294 if (ret > 0) 295 priv->bbuflen = ret; 296 297 return (ret); 298 } 299 300 static ssize_t 301 tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt) 302 { 303 struct tap_priv *priv = (struct tap_priv *)be->opaque; 304 ssize_t ret; 305 306 if (priv->bbuflen > 0) { 307 /* 308 * A packet is available in the bounce buffer, so 309 * we read it from there. 310 */ 311 ret = buf_to_iov(priv->bbuf, priv->bbuflen, 312 iov, iovcnt, 0); 313 314 /* Mark the bounce buffer as empty. */ 315 priv->bbuflen = 0; 316 317 return (ret); 318 } 319 320 ret = readv(be->fd, iov, iovcnt); 321 if (ret < 0 && errno == EWOULDBLOCK) { 322 return (0); 323 } 324 325 return (ret); 326 } 327 328 static void 329 tap_recv_enable(struct net_backend *be) 330 { 331 struct tap_priv *priv = (struct tap_priv *)be->opaque; 332 333 mevent_enable(priv->mevp); 334 } 335 336 static void 337 tap_recv_disable(struct net_backend *be) 338 { 339 struct tap_priv *priv = (struct tap_priv *)be->opaque; 340 341 mevent_disable(priv->mevp); 342 } 343 344 static uint64_t 345 tap_get_cap(struct net_backend *be) 346 { 347 348 return (0); /* no capabilities for now */ 349 } 350 351 static int 352 tap_set_cap(struct net_backend *be, uint64_t features, 353 unsigned vnet_hdr_len) 354 { 355 356 return ((features || vnet_hdr_len) ? -1 : 0); 357 } 358 359 static struct net_backend tap_backend = { 360 .prefix = "tap", 361 .priv_size = sizeof(struct tap_priv), 362 .init = tap_init, 363 .cleanup = tap_cleanup, 364 .send = tap_send, 365 .peek_recvlen = tap_peek_recvlen, 366 .recv = tap_recv, 367 .recv_enable = tap_recv_enable, 368 .recv_disable = tap_recv_disable, 369 .get_cap = tap_get_cap, 370 .set_cap = tap_set_cap, 371 }; 372 373 /* A clone of the tap backend, with a different prefix. */ 374 static struct net_backend vmnet_backend = { 375 .prefix = "vmnet", 376 .priv_size = sizeof(struct tap_priv), 377 .init = tap_init, 378 .cleanup = tap_cleanup, 379 .send = tap_send, 380 .peek_recvlen = tap_peek_recvlen, 381 .recv = tap_recv, 382 .recv_enable = tap_recv_enable, 383 .recv_disable = tap_recv_disable, 384 .get_cap = tap_get_cap, 385 .set_cap = tap_set_cap, 386 }; 387 388 DATA_SET(net_backend_set, tap_backend); 389 DATA_SET(net_backend_set, vmnet_backend); 390 391 #ifdef NETGRAPH 392 393 /* 394 * Netgraph backend 395 */ 396 397 #define NG_SBUF_MAX_SIZE (4 * 1024 * 1024) 398 399 static int 400 ng_init(struct net_backend *be, const char *devname, 401 const char *opts, net_be_rxeof_t cb, void *param) 402 { 403 struct tap_priv *p = (struct tap_priv *)be->opaque; 404 struct ngm_connect ngc; 405 char *ngopts, *tofree; 406 char nodename[NG_NODESIZ]; 407 int sbsz; 408 int ctrl_sock; 409 int flags; 410 int path_provided; 411 int peerhook_provided; 412 int socket_provided; 413 unsigned long maxsbsz; 414 size_t msbsz; 415 #ifndef WITHOUT_CAPSICUM 416 cap_rights_t rights; 417 #endif 418 419 if (cb == NULL) { 420 WPRINTF(("Netgraph backend requires non-NULL callback")); 421 return (-1); 422 } 423 424 be->fd = -1; 425 426 memset(&ngc, 0, sizeof(ngc)); 427 428 strncpy(ngc.ourhook, "vmlink", NG_HOOKSIZ - 1); 429 430 tofree = ngopts = strdup(opts); 431 432 if (ngopts == NULL) { 433 WPRINTF(("strdup error")); 434 return (-1); 435 } 436 437 socket_provided = 0; 438 path_provided = 0; 439 peerhook_provided = 0; 440 441 while (ngopts != NULL) { 442 char *value = ngopts; 443 char *key; 444 445 key = strsep(&value, "="); 446 if (value == NULL) 447 break; 448 ngopts = value; 449 (void) strsep(&ngopts, ","); 450 451 if (strcmp(key, "socket") == 0) { 452 strncpy(nodename, value, NG_NODESIZ - 1); 453 socket_provided = 1; 454 } else if (strcmp(key, "path") == 0) { 455 strncpy(ngc.path, value, NG_PATHSIZ - 1); 456 path_provided = 1; 457 } else if (strcmp(key, "hook") == 0) { 458 strncpy(ngc.ourhook, value, NG_HOOKSIZ - 1); 459 } else if (strcmp(key, "peerhook") == 0) { 460 strncpy(ngc.peerhook, value, NG_HOOKSIZ - 1); 461 peerhook_provided = 1; 462 } 463 } 464 465 free(tofree); 466 467 if (!path_provided) { 468 WPRINTF(("path must be provided")); 469 return (-1); 470 } 471 472 if (!peerhook_provided) { 473 WPRINTF(("peer hook must be provided")); 474 return (-1); 475 } 476 477 if (NgMkSockNode(socket_provided ? nodename : NULL, 478 &ctrl_sock, &be->fd) < 0) { 479 WPRINTF(("can't get Netgraph sockets")); 480 return (-1); 481 } 482 483 if (NgSendMsg(ctrl_sock, ".", 484 NGM_GENERIC_COOKIE, 485 NGM_CONNECT, &ngc, sizeof(ngc)) < 0) { 486 WPRINTF(("can't connect to node")); 487 close(ctrl_sock); 488 goto error; 489 } 490 491 close(ctrl_sock); 492 493 flags = fcntl(be->fd, F_GETFL); 494 495 if (flags < 0) { 496 WPRINTF(("can't get socket flags")); 497 goto error; 498 } 499 500 if (fcntl(be->fd, F_SETFL, flags | O_NONBLOCK) < 0) { 501 WPRINTF(("can't set O_NONBLOCK flag")); 502 goto error; 503 } 504 505 /* 506 * The default ng_socket(4) buffer's size is too low. 507 * Calculate the minimum value between NG_SBUF_MAX_SIZE 508 * and kern.ipc.maxsockbuf. 509 */ 510 msbsz = sizeof(maxsbsz); 511 if (sysctlbyname("kern.ipc.maxsockbuf", &maxsbsz, &msbsz, 512 NULL, 0) < 0) { 513 WPRINTF(("can't get 'kern.ipc.maxsockbuf' value")); 514 goto error; 515 } 516 517 /* 518 * We can't set the socket buffer size to kern.ipc.maxsockbuf value, 519 * as it takes into account the mbuf(9) overhead. 520 */ 521 maxsbsz = maxsbsz * MCLBYTES / (MSIZE + MCLBYTES); 522 523 sbsz = MIN(NG_SBUF_MAX_SIZE, maxsbsz); 524 525 if (setsockopt(be->fd, SOL_SOCKET, SO_SNDBUF, &sbsz, 526 sizeof(sbsz)) < 0) { 527 WPRINTF(("can't set TX buffer size")); 528 goto error; 529 } 530 531 if (setsockopt(be->fd, SOL_SOCKET, SO_RCVBUF, &sbsz, 532 sizeof(sbsz)) < 0) { 533 WPRINTF(("can't set RX buffer size")); 534 goto error; 535 } 536 537 #ifndef WITHOUT_CAPSICUM 538 cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); 539 if (caph_rights_limit(be->fd, &rights) == -1) 540 errx(EX_OSERR, "Unable to apply rights for sandbox"); 541 #endif 542 543 memset(p->bbuf, 0, sizeof(p->bbuf)); 544 p->bbuflen = 0; 545 546 p->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param); 547 if (p->mevp == NULL) { 548 WPRINTF(("Could not register event")); 549 goto error; 550 } 551 552 return (0); 553 554 error: 555 tap_cleanup(be); 556 return (-1); 557 } 558 559 static struct net_backend ng_backend = { 560 .prefix = "netgraph", 561 .priv_size = sizeof(struct tap_priv), 562 .init = ng_init, 563 .cleanup = tap_cleanup, 564 .send = tap_send, 565 .peek_recvlen = tap_peek_recvlen, 566 .recv = tap_recv, 567 .recv_enable = tap_recv_enable, 568 .recv_disable = tap_recv_disable, 569 .get_cap = tap_get_cap, 570 .set_cap = tap_set_cap, 571 }; 572 573 DATA_SET(net_backend_set, ng_backend); 574 575 #endif /* NETGRAPH */ 576 577 /* 578 * The netmap backend 579 */ 580 581 /* The virtio-net features supported by netmap. */ 582 #define NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \ 583 VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \ 584 VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 | \ 585 VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO) 586 587 struct netmap_priv { 588 char ifname[IFNAMSIZ]; 589 struct nm_desc *nmd; 590 uint16_t memid; 591 struct netmap_ring *rx; 592 struct netmap_ring *tx; 593 struct mevent *mevp; 594 net_be_rxeof_t cb; 595 void *cb_param; 596 }; 597 598 static void 599 nmreq_init(struct nmreq *req, char *ifname) 600 { 601 602 memset(req, 0, sizeof(*req)); 603 strlcpy(req->nr_name, ifname, sizeof(req->nr_name)); 604 req->nr_version = NETMAP_API; 605 } 606 607 static int 608 netmap_set_vnet_hdr_len(struct net_backend *be, int vnet_hdr_len) 609 { 610 int err; 611 struct nmreq req; 612 struct netmap_priv *priv = (struct netmap_priv *)be->opaque; 613 614 nmreq_init(&req, priv->ifname); 615 req.nr_cmd = NETMAP_BDG_VNET_HDR; 616 req.nr_arg1 = vnet_hdr_len; 617 err = ioctl(be->fd, NIOCREGIF, &req); 618 if (err) { 619 WPRINTF(("Unable to set vnet header length %d", 620 vnet_hdr_len)); 621 return (err); 622 } 623 624 be->be_vnet_hdr_len = vnet_hdr_len; 625 626 return (0); 627 } 628 629 static int 630 netmap_has_vnet_hdr_len(struct net_backend *be, unsigned vnet_hdr_len) 631 { 632 int prev_hdr_len = be->be_vnet_hdr_len; 633 int ret; 634 635 if (vnet_hdr_len == prev_hdr_len) { 636 return (1); 637 } 638 639 ret = netmap_set_vnet_hdr_len(be, vnet_hdr_len); 640 if (ret) { 641 return (0); 642 } 643 644 netmap_set_vnet_hdr_len(be, prev_hdr_len); 645 646 return (1); 647 } 648 649 static uint64_t 650 netmap_get_cap(struct net_backend *be) 651 { 652 653 return (netmap_has_vnet_hdr_len(be, VNET_HDR_LEN) ? 654 NETMAP_FEATURES : 0); 655 } 656 657 static int 658 netmap_set_cap(struct net_backend *be, uint64_t features, 659 unsigned vnet_hdr_len) 660 { 661 662 return (netmap_set_vnet_hdr_len(be, vnet_hdr_len)); 663 } 664 665 static int 666 netmap_init(struct net_backend *be, const char *devname, 667 const char *opts, net_be_rxeof_t cb, void *param) 668 { 669 struct netmap_priv *priv = (struct netmap_priv *)be->opaque; 670 671 strlcpy(priv->ifname, devname, sizeof(priv->ifname)); 672 priv->ifname[sizeof(priv->ifname) - 1] = '\0'; 673 674 priv->nmd = nm_open(priv->ifname, NULL, NETMAP_NO_TX_POLL, NULL); 675 if (priv->nmd == NULL) { 676 WPRINTF(("Unable to nm_open(): interface '%s', errno (%s)", 677 devname, strerror(errno))); 678 free(priv); 679 return (-1); 680 } 681 682 priv->memid = priv->nmd->req.nr_arg2; 683 priv->tx = NETMAP_TXRING(priv->nmd->nifp, 0); 684 priv->rx = NETMAP_RXRING(priv->nmd->nifp, 0); 685 priv->cb = cb; 686 priv->cb_param = param; 687 be->fd = priv->nmd->fd; 688 689 priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param); 690 if (priv->mevp == NULL) { 691 WPRINTF(("Could not register event")); 692 return (-1); 693 } 694 695 return (0); 696 } 697 698 static void 699 netmap_cleanup(struct net_backend *be) 700 { 701 struct netmap_priv *priv = (struct netmap_priv *)be->opaque; 702 703 if (priv->mevp) { 704 mevent_delete(priv->mevp); 705 } 706 if (priv->nmd) { 707 nm_close(priv->nmd); 708 } 709 be->fd = -1; 710 } 711 712 static ssize_t 713 netmap_send(struct net_backend *be, const struct iovec *iov, 714 int iovcnt) 715 { 716 struct netmap_priv *priv = (struct netmap_priv *)be->opaque; 717 struct netmap_ring *ring; 718 ssize_t totlen = 0; 719 int nm_buf_size; 720 int nm_buf_len; 721 uint32_t head; 722 void *nm_buf; 723 int j; 724 725 ring = priv->tx; 726 head = ring->head; 727 if (head == ring->tail) { 728 WPRINTF(("No space, drop %zu bytes", count_iov(iov, iovcnt))); 729 goto txsync; 730 } 731 nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx); 732 nm_buf_size = ring->nr_buf_size; 733 nm_buf_len = 0; 734 735 for (j = 0; j < iovcnt; j++) { 736 int iov_frag_size = iov[j].iov_len; 737 void *iov_frag_buf = iov[j].iov_base; 738 739 totlen += iov_frag_size; 740 741 /* 742 * Split each iovec fragment over more netmap slots, if 743 * necessary. 744 */ 745 for (;;) { 746 int copylen; 747 748 copylen = iov_frag_size < nm_buf_size ? iov_frag_size : nm_buf_size; 749 memcpy(nm_buf, iov_frag_buf, copylen); 750 751 iov_frag_buf += copylen; 752 iov_frag_size -= copylen; 753 nm_buf += copylen; 754 nm_buf_size -= copylen; 755 nm_buf_len += copylen; 756 757 if (iov_frag_size == 0) { 758 break; 759 } 760 761 ring->slot[head].len = nm_buf_len; 762 ring->slot[head].flags = NS_MOREFRAG; 763 head = nm_ring_next(ring, head); 764 if (head == ring->tail) { 765 /* 766 * We ran out of netmap slots while 767 * splitting the iovec fragments. 768 */ 769 WPRINTF(("No space, drop %zu bytes", 770 count_iov(iov, iovcnt))); 771 goto txsync; 772 } 773 nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx); 774 nm_buf_size = ring->nr_buf_size; 775 nm_buf_len = 0; 776 } 777 } 778 779 /* Complete the last slot, which must not have NS_MOREFRAG set. */ 780 ring->slot[head].len = nm_buf_len; 781 ring->slot[head].flags = 0; 782 head = nm_ring_next(ring, head); 783 784 /* Now update ring->head and ring->cur. */ 785 ring->head = ring->cur = head; 786 txsync: 787 ioctl(be->fd, NIOCTXSYNC, NULL); 788 789 return (totlen); 790 } 791 792 static ssize_t 793 netmap_peek_recvlen(struct net_backend *be) 794 { 795 struct netmap_priv *priv = (struct netmap_priv *)be->opaque; 796 struct netmap_ring *ring = priv->rx; 797 uint32_t head = ring->head; 798 ssize_t totlen = 0; 799 800 while (head != ring->tail) { 801 struct netmap_slot *slot = ring->slot + head; 802 803 totlen += slot->len; 804 if ((slot->flags & NS_MOREFRAG) == 0) 805 break; 806 head = nm_ring_next(ring, head); 807 } 808 809 return (totlen); 810 } 811 812 static ssize_t 813 netmap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt) 814 { 815 struct netmap_priv *priv = (struct netmap_priv *)be->opaque; 816 struct netmap_slot *slot = NULL; 817 struct netmap_ring *ring; 818 void *iov_frag_buf; 819 int iov_frag_size; 820 ssize_t totlen = 0; 821 uint32_t head; 822 823 assert(iovcnt); 824 825 ring = priv->rx; 826 head = ring->head; 827 iov_frag_buf = iov->iov_base; 828 iov_frag_size = iov->iov_len; 829 830 do { 831 int nm_buf_len; 832 void *nm_buf; 833 834 if (head == ring->tail) { 835 return (0); 836 } 837 838 slot = ring->slot + head; 839 nm_buf = NETMAP_BUF(ring, slot->buf_idx); 840 nm_buf_len = slot->len; 841 842 for (;;) { 843 int copylen = nm_buf_len < iov_frag_size ? 844 nm_buf_len : iov_frag_size; 845 846 memcpy(iov_frag_buf, nm_buf, copylen); 847 nm_buf += copylen; 848 nm_buf_len -= copylen; 849 iov_frag_buf += copylen; 850 iov_frag_size -= copylen; 851 totlen += copylen; 852 853 if (nm_buf_len == 0) { 854 break; 855 } 856 857 iov++; 858 iovcnt--; 859 if (iovcnt == 0) { 860 /* No space to receive. */ 861 WPRINTF(("Short iov, drop %zd bytes", 862 totlen)); 863 return (-ENOSPC); 864 } 865 iov_frag_buf = iov->iov_base; 866 iov_frag_size = iov->iov_len; 867 } 868 869 head = nm_ring_next(ring, head); 870 871 } while (slot->flags & NS_MOREFRAG); 872 873 /* Release slots to netmap. */ 874 ring->head = ring->cur = head; 875 876 return (totlen); 877 } 878 879 static void 880 netmap_recv_enable(struct net_backend *be) 881 { 882 struct netmap_priv *priv = (struct netmap_priv *)be->opaque; 883 884 mevent_enable(priv->mevp); 885 } 886 887 static void 888 netmap_recv_disable(struct net_backend *be) 889 { 890 struct netmap_priv *priv = (struct netmap_priv *)be->opaque; 891 892 mevent_disable(priv->mevp); 893 } 894 895 static struct net_backend netmap_backend = { 896 .prefix = "netmap", 897 .priv_size = sizeof(struct netmap_priv), 898 .init = netmap_init, 899 .cleanup = netmap_cleanup, 900 .send = netmap_send, 901 .peek_recvlen = netmap_peek_recvlen, 902 .recv = netmap_recv, 903 .recv_enable = netmap_recv_enable, 904 .recv_disable = netmap_recv_disable, 905 .get_cap = netmap_get_cap, 906 .set_cap = netmap_set_cap, 907 }; 908 909 /* A clone of the netmap backend, with a different prefix. */ 910 static struct net_backend vale_backend = { 911 .prefix = "vale", 912 .priv_size = sizeof(struct netmap_priv), 913 .init = netmap_init, 914 .cleanup = netmap_cleanup, 915 .send = netmap_send, 916 .peek_recvlen = netmap_peek_recvlen, 917 .recv = netmap_recv, 918 .recv_enable = netmap_recv_enable, 919 .recv_disable = netmap_recv_disable, 920 .get_cap = netmap_get_cap, 921 .set_cap = netmap_set_cap, 922 }; 923 924 DATA_SET(net_backend_set, netmap_backend); 925 DATA_SET(net_backend_set, vale_backend); 926 927 /* 928 * Initialize a backend and attach to the frontend. 929 * This is called during frontend initialization. 930 * @pbe is a pointer to the backend to be initialized 931 * @devname is the backend-name as supplied on the command line, 932 * e.g. -s 2:0,frontend-name,backend-name[,other-args] 933 * @cb is the receive callback supplied by the frontend, 934 * and it is invoked in the event loop when a receive 935 * event is generated in the hypervisor, 936 * @param is a pointer to the frontend, and normally used as 937 * the argument for the callback. 938 */ 939 int 940 netbe_init(struct net_backend **ret, const char *opts, net_be_rxeof_t cb, 941 void *param) 942 { 943 struct net_backend **pbe, *nbe, *tbe = NULL; 944 char *devname; 945 char *options; 946 int err; 947 948 devname = options = strdup(opts); 949 950 if (devname == NULL) { 951 return (-1); 952 } 953 954 devname = strsep(&options, ","); 955 956 /* 957 * Find the network backend that matches the user-provided 958 * device name. net_backend_set is built using a linker set. 959 */ 960 SET_FOREACH(pbe, net_backend_set) { 961 if (strncmp(devname, (*pbe)->prefix, 962 strlen((*pbe)->prefix)) == 0) { 963 tbe = *pbe; 964 assert(tbe->init != NULL); 965 assert(tbe->cleanup != NULL); 966 assert(tbe->send != NULL); 967 assert(tbe->recv != NULL); 968 assert(tbe->get_cap != NULL); 969 assert(tbe->set_cap != NULL); 970 break; 971 } 972 } 973 974 *ret = NULL; 975 if (tbe == NULL) { 976 free(devname); 977 return (EINVAL); 978 } 979 980 nbe = calloc(1, sizeof(*nbe) + tbe->priv_size); 981 *nbe = *tbe; /* copy the template */ 982 nbe->fd = -1; 983 nbe->sc = param; 984 nbe->be_vnet_hdr_len = 0; 985 nbe->fe_vnet_hdr_len = 0; 986 987 /* Initialize the backend. */ 988 err = nbe->init(nbe, devname, options, cb, param); 989 if (err) { 990 free(devname); 991 free(nbe); 992 return (err); 993 } 994 995 *ret = nbe; 996 free(devname); 997 998 return (0); 999 } 1000 1001 void 1002 netbe_cleanup(struct net_backend *be) 1003 { 1004 1005 if (be != NULL) { 1006 be->cleanup(be); 1007 free(be); 1008 } 1009 } 1010 1011 uint64_t 1012 netbe_get_cap(struct net_backend *be) 1013 { 1014 1015 assert(be != NULL); 1016 return (be->get_cap(be)); 1017 } 1018 1019 int 1020 netbe_set_cap(struct net_backend *be, uint64_t features, 1021 unsigned vnet_hdr_len) 1022 { 1023 int ret; 1024 1025 assert(be != NULL); 1026 1027 /* There are only three valid lengths, i.e., 0, 10 and 12. */ 1028 if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN 1029 && vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t))) 1030 return (-1); 1031 1032 be->fe_vnet_hdr_len = vnet_hdr_len; 1033 1034 ret = be->set_cap(be, features, vnet_hdr_len); 1035 assert(be->be_vnet_hdr_len == 0 || 1036 be->be_vnet_hdr_len == be->fe_vnet_hdr_len); 1037 1038 return (ret); 1039 } 1040 1041 ssize_t 1042 netbe_send(struct net_backend *be, const struct iovec *iov, int iovcnt) 1043 { 1044 1045 return (be->send(be, iov, iovcnt)); 1046 } 1047 1048 ssize_t 1049 netbe_peek_recvlen(struct net_backend *be) 1050 { 1051 1052 return (be->peek_recvlen(be)); 1053 } 1054 1055 /* 1056 * Try to read a packet from the backend, without blocking. 1057 * If no packets are available, return 0. In case of success, return 1058 * the length of the packet just read. Return -1 in case of errors. 1059 */ 1060 ssize_t 1061 netbe_recv(struct net_backend *be, const struct iovec *iov, int iovcnt) 1062 { 1063 1064 return (be->recv(be, iov, iovcnt)); 1065 } 1066 1067 /* 1068 * Read a packet from the backend and discard it. 1069 * Returns the size of the discarded packet or zero if no packet was available. 1070 * A negative error code is returned in case of read error. 1071 */ 1072 ssize_t 1073 netbe_rx_discard(struct net_backend *be) 1074 { 1075 /* 1076 * MP note: the dummybuf is only used to discard frames, 1077 * so there is no need for it to be per-vtnet or locked. 1078 * We only make it large enough for TSO-sized segment. 1079 */ 1080 static uint8_t dummybuf[65536 + 64]; 1081 struct iovec iov; 1082 1083 iov.iov_base = dummybuf; 1084 iov.iov_len = sizeof(dummybuf); 1085 1086 return netbe_recv(be, &iov, 1); 1087 } 1088 1089 void 1090 netbe_rx_disable(struct net_backend *be) 1091 { 1092 1093 return be->recv_disable(be); 1094 } 1095 1096 void 1097 netbe_rx_enable(struct net_backend *be) 1098 { 1099 1100 return be->recv_enable(be); 1101 } 1102 1103 size_t 1104 netbe_get_vnet_hdr_len(struct net_backend *be) 1105 { 1106 1107 return (be->be_vnet_hdr_len); 1108 } 1109