/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2019 Vincenzo Maffione <vmaffione@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
 * OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
 * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * $FreeBSD$
 */

/*
 * This file implements multiple network backends (tap, netmap, ...),
 * to be used by network frontends such as virtio-net and e1000.
 * The API to access the backend (e.g. send/receive packets, negotiate
 * features) is exported by net_backends.h.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/types.h>		/* u_short etc */
#ifndef WITHOUT_CAPSICUM
#include <sys/capsicum.h>
#endif
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/uio.h>

#include <net/if.h>
#ifdef __FreeBSD__
#include <net/netmap.h>
#include <net/netmap_virt.h>
#define NETMAP_WITH_LIBS
#include <net/netmap_user.h>
#endif

#ifndef WITHOUT_CAPSICUM
#include <capsicum_helpers.h>
#endif
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sysexits.h>
#include <assert.h>
#include <pthread.h>
#include <pthread_np.h>
#include <poll.h>
/* NOTE(review): <assert.h> is included twice (also above); harmless but redundant. */
#include <assert.h>

#ifdef NETGRAPH
#include <sys/param.h>
#include <sys/sysctl.h>
#include <netgraph.h>
#endif

#ifndef __FreeBSD__
#include <libdlpi.h>
#include <net/ethernet.h>
#endif

#include "config.h"
#include "debug.h"
#include "iov.h"
#include "mevent.h"
#include "net_backends.h"
#include "pci_emul.h"

#include <sys/linker_set.h>

/*
 * Each network backend registers a set of function pointers that are
 * used to implement the net backends API.
 * This might need to be exposed if we implement backends in separate files.
 *
 * Instances of this struct act both as templates (the static, linker-set
 * registered descriptors below) and as live backends: netbe_init() copies
 * the matching template into a heap allocation that also provides
 * 'priv_size' bytes of trailing storage addressed through 'opaque'.
 */
struct net_backend {
    const char *prefix;	/* prefix matching this backend */

    /*
     * Routines used to initialize and cleanup the resources needed
     * by a backend. The cleanup function is used internally,
     * and should not be called by the frontend.
     */
    int (*init)(struct net_backend *be, const char *devname,
        nvlist_t *nvl, net_be_rxeof_t cb, void *param);
    void (*cleanup)(struct net_backend *be);

    /*
     * Called to serve a guest transmit request. The scatter-gather
     * vector provided by the caller has 'iovcnt' elements and contains
     * the packet to send.
     */
    ssize_t (*send)(struct net_backend *be, const struct iovec *iov,
        int iovcnt);

    /*
     * Get the length of the next packet that can be received from
     * the backend. If no packets are currently available, this
     * function returns 0.
     */
    ssize_t (*peek_recvlen)(struct net_backend *be);

    /*
     * Called to receive a packet from the backend. When the function
     * returns a positive value 'len', the scatter-gather vector
     * provided by the caller contains a packet with such length.
     * The function returns 0 if the backend doesn't have a new packet to
     * receive.
     */
    ssize_t (*recv)(struct net_backend *be, const struct iovec *iov,
        int iovcnt);

    /*
     * Ask the backend to enable or disable receive operation in the
     * backend. On return from a disable operation, it is guaranteed
     * that the receive callback won't be called until receive is
     * enabled again. Note however that it is up to the caller to make
     * sure that netbe_recv() is not currently being executed by another
     * thread.
     */
    void (*recv_enable)(struct net_backend *be);
    void (*recv_disable)(struct net_backend *be);

    /*
     * Ask the backend for the virtio-net features it is able to
     * support. Possible features are TSO, UFO and checksum offloading
     * in both rx and tx direction and for both IPv4 and IPv6.
     */
    uint64_t (*get_cap)(struct net_backend *be);

    /*
     * Tell the backend to enable/disable the specified virtio-net
     * features (capabilities).
     */
    int (*set_cap)(struct net_backend *be, uint64_t features,
        unsigned int vnet_hdr_len);

#ifndef __FreeBSD__
    /* illumos only: fetch the MAC address of the underlying link. */
    int (*get_mac)(struct net_backend *be, void *, size_t *);
#endif

    struct pci_vtnet_softc *sc;
    int fd;

    /*
     * Length of the virtio-net header used by the backend and the
     * frontend, respectively. A zero value means that the header
     * is not used.
     */
    unsigned int be_vnet_hdr_len;
    unsigned int fe_vnet_hdr_len;

    /* Size of backend-specific private data. */
    size_t priv_size;

    /*
     * Room for backend-specific data.
     * NOTE(review): 'char opaque[0]' is a GNU zero-length array; the
     * standard C99 spelling would be a flexible array member 'opaque[]'.
     */
    char opaque[0];
};

SET_DECLARE(net_backend_set, struct net_backend);

#define VNET_HDR_LEN	sizeof(struct virtio_net_rxhdr)

#define WPRINTF(params) PRINTLN params

#ifdef __FreeBSD__

/*
 * The tap backend
 */

struct tap_priv {
    struct mevent *mevp;
    /*
     * A bounce buffer that allows us to implement the peek_recvlen
     * callback. In the future we may get the same information from
     * the kevent data.
     */
    char bbuf[1 << 16];
    ssize_t bbuflen;
};

/*
 * Tear down the tap backend: remove the event-loop registration (if any)
 * and close the tap device file descriptor. Safe to call on a partially
 * initialized backend, as done from the tap_init()/ng_init() error paths.
 */
static void
tap_cleanup(struct net_backend *be)
{
    struct tap_priv *priv = (struct tap_priv *)be->opaque;

    if (priv->mevp) {
        mevent_delete(priv->mevp);
    }
    if (be->fd != -1) {
        close(be->fd);
        be->fd = -1;
    }
}

/*
 * Open /dev/<devname>, make it non-blocking, limit its capsicum rights
 * and register a (initially disabled) read event with the event loop.
 * Returns 0 on success, -1 on failure (resources released via tap_cleanup()).
 */
static int
tap_init(struct net_backend *be, const char *devname,
    nvlist_t *nvl, net_be_rxeof_t cb, void *param)
{
    struct tap_priv *priv = (struct tap_priv *)be->opaque;
    char tbuf[80];
    int opt = 1;
#ifndef WITHOUT_CAPSICUM
    cap_rights_t rights;
#endif

    if (cb == NULL) {
        WPRINTF(("TAP backend requires non-NULL callback"));
        return (-1);
    }

    /* strlcat() bounds the device name against tbuf's size. */
    strcpy(tbuf, "/dev/");
    strlcat(tbuf, devname, sizeof(tbuf));

    be->fd = open(tbuf, O_RDWR);
    if (be->fd == -1) {
        WPRINTF(("open of tap device %s failed", tbuf));
        goto error;
    }

    /*
     * Set non-blocking and register for read
     * notifications with the event loop
     */
    if (ioctl(be->fd, FIONBIO, &opt) < 0) {
        WPRINTF(("tap device O_NONBLOCK failed"));
        goto error;
    }

#ifndef WITHOUT_CAPSICUM
    cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
    if (caph_rights_limit(be->fd, &rights) == -1)
        errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

    memset(priv->bbuf, 0, sizeof(priv->bbuf));
    priv->bbuflen = 0;

    priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
    if (priv->mevp == NULL) {
        WPRINTF(("Could not register event"));
        goto error;
    }

    return (0);

error:
    tap_cleanup(be);
    return (-1);
}

/*
 * Called to send a buffer chain out to the tap device
 */
static ssize_t
tap_send(struct net_backend *be, const struct iovec *iov, int iovcnt)
{
    return (writev(be->fd, iov, iovcnt));
}

/*
 * Return the length of the next receivable packet (0 if none). A packet
 * read here is parked in the bounce buffer until tap_recv() consumes it.
 */
static ssize_t
tap_peek_recvlen(struct net_backend *be)
{
    struct tap_priv *priv = (struct tap_priv *)be->opaque;
    ssize_t ret;

    if (priv->bbuflen > 0) {
        /*
         * We already have a packet in the bounce buffer.
         * Just return its length.
         */
        return priv->bbuflen;
    }

    /*
     * Read the next packet (if any) into the bounce buffer, so
     * that we get to know its length and we can return that
     * to the caller.
     */
    ret = read(be->fd, priv->bbuf, sizeof(priv->bbuf));
    if (ret < 0 && errno == EWOULDBLOCK) {
        return (0);
    }

    if (ret > 0)
        priv->bbuflen = ret;

    return (ret);
}

/*
 * Receive a packet into the caller's scatter-gather vector, draining the
 * bounce buffer first if tap_peek_recvlen() already pulled a packet in.
 * Returns the packet length, 0 if nothing is available, or -1 on error.
 */
static ssize_t
tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
{
    struct tap_priv *priv = (struct tap_priv *)be->opaque;
    ssize_t ret;

    if (priv->bbuflen > 0) {
        /*
         * A packet is available in the bounce buffer, so
         * we read it from there.
         */
        ret = buf_to_iov(priv->bbuf, priv->bbuflen,
            iov, iovcnt, 0);

        /* Mark the bounce buffer as empty. */
        priv->bbuflen = 0;

        return (ret);
    }

    ret = readv(be->fd, iov, iovcnt);
    if (ret < 0 && errno == EWOULDBLOCK) {
        return (0);
    }

    return (ret);
}

/* Re-enable receive notifications from the event loop. */
static void
tap_recv_enable(struct net_backend *be)
{
    struct tap_priv *priv = (struct tap_priv *)be->opaque;

    mevent_enable(priv->mevp);
}

/* Suppress receive notifications from the event loop. */
static void
tap_recv_disable(struct net_backend *be)
{
    struct tap_priv *priv = (struct tap_priv *)be->opaque;

    mevent_disable(priv->mevp);
}

static uint64_t
tap_get_cap(struct net_backend *be)
{

    return (0); /* no capabilities for now */
}

/*
 * With no offload capabilities, only the "nothing requested" case
 * (no features, no vnet header) can succeed.
 */
static int
tap_set_cap(struct net_backend *be, uint64_t features,
    unsigned vnet_hdr_len)
{

    return ((features || vnet_hdr_len) ? -1 : 0);
}

static struct net_backend tap_backend = {
    .prefix = "tap",
    .priv_size = sizeof(struct tap_priv),
    .init = tap_init,
    .cleanup = tap_cleanup,
    .send = tap_send,
    .peek_recvlen = tap_peek_recvlen,
    .recv = tap_recv,
    .recv_enable = tap_recv_enable,
    .recv_disable = tap_recv_disable,
    .get_cap = tap_get_cap,
    .set_cap = tap_set_cap,
};

/* A clone of the tap backend, with a different prefix. */
static struct net_backend vmnet_backend = {
    .prefix = "vmnet",
    .priv_size = sizeof(struct tap_priv),
    .init = tap_init,
    .cleanup = tap_cleanup,
    .send = tap_send,
    .peek_recvlen = tap_peek_recvlen,
    .recv = tap_recv,
    .recv_enable = tap_recv_enable,
    .recv_disable = tap_recv_disable,
    .get_cap = tap_get_cap,
    .set_cap = tap_set_cap,
};

DATA_SET(net_backend_set, tap_backend);
DATA_SET(net_backend_set, vmnet_backend);

#ifdef NETGRAPH

/*
 * Netgraph backend
 */

#define NG_SBUF_MAX_SIZE (4 * 1024 * 1024)

/*
 * Create an ng_socket(4) node, connect its hook to the configured peer
 * node/hook, enlarge the socket buffers, and register the data socket
 * with the event loop. Reuses struct tap_priv and the tap data-path
 * routines, since the data socket behaves like a tap fd.
 */
static int
ng_init(struct net_backend *be, const char *devname,
    nvlist_t *nvl, net_be_rxeof_t cb, void *param)
{
    struct tap_priv *p = (struct tap_priv *)be->opaque;
    struct ngm_connect ngc;
    const char *value, *nodename;
    int sbsz;
    int ctrl_sock;
    int flags;
    unsigned long maxsbsz;
    size_t msbsz;
#ifndef WITHOUT_CAPSICUM
    cap_rights_t rights;
#endif

    if (cb == NULL) {
        WPRINTF(("Netgraph backend requires non-NULL callback"));
        return (-1);
    }

    be->fd = -1;

    /*
     * ngc is zeroed here, so the size-1 strncpy() calls below always
     * leave NUL-terminated strings.
     */
    memset(&ngc, 0, sizeof(ngc));

    value = get_config_value_node(nvl, "path");
    if (value == NULL) {
        WPRINTF(("path must be provided"));
        return (-1);
    }
    strncpy(ngc.path, value, NG_PATHSIZ - 1);

    value = get_config_value_node(nvl, "hook");
    if (value == NULL)
        value = "vmlink";
    strncpy(ngc.ourhook, value, NG_HOOKSIZ - 1);

    value = get_config_value_node(nvl, "peerhook");
    if (value == NULL) {
        WPRINTF(("peer hook must be provided"));
        return (-1);
    }
    strncpy(ngc.peerhook, value, NG_HOOKSIZ - 1);

    nodename = get_config_value_node(nvl, "socket");
    if (NgMkSockNode(nodename,
        &ctrl_sock, &be->fd) < 0) {
        WPRINTF(("can't get Netgraph sockets"));
        return (-1);
    }

    if (NgSendMsg(ctrl_sock, ".",
        NGM_GENERIC_COOKIE,
        NGM_CONNECT, &ngc, sizeof(ngc)) < 0) {
        WPRINTF(("can't connect to node"));
        close(ctrl_sock);
        goto error;
    }

    /* The control socket is only needed for the NGM_CONNECT above. */
    close(ctrl_sock);

    flags = fcntl(be->fd, F_GETFL);

    if (flags < 0) {
        WPRINTF(("can't get socket flags"));
        goto error;
    }

    if (fcntl(be->fd, F_SETFL, flags | O_NONBLOCK) < 0) {
        WPRINTF(("can't set O_NONBLOCK flag"));
        goto error;
    }

    /*
     * The default ng_socket(4) buffer's size is too low.
     * Calculate the minimum value between NG_SBUF_MAX_SIZE
     * and kern.ipc.maxsockbuf.
     */
    msbsz = sizeof(maxsbsz);
    if (sysctlbyname("kern.ipc.maxsockbuf", &maxsbsz, &msbsz,
        NULL, 0) < 0) {
        WPRINTF(("can't get 'kern.ipc.maxsockbuf' value"));
        goto error;
    }

    /*
     * We can't set the socket buffer size to kern.ipc.maxsockbuf value,
     * as it takes into account the mbuf(9) overhead.
     */
    maxsbsz = maxsbsz * MCLBYTES / (MSIZE + MCLBYTES);

    sbsz = MIN(NG_SBUF_MAX_SIZE, maxsbsz);

    if (setsockopt(be->fd, SOL_SOCKET, SO_SNDBUF, &sbsz,
        sizeof(sbsz)) < 0) {
        WPRINTF(("can't set TX buffer size"));
        goto error;
    }

    if (setsockopt(be->fd, SOL_SOCKET, SO_RCVBUF, &sbsz,
        sizeof(sbsz)) < 0) {
        WPRINTF(("can't set RX buffer size"));
        goto error;
    }

#ifndef WITHOUT_CAPSICUM
    cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
    if (caph_rights_limit(be->fd, &rights) == -1)
        errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

    memset(p->bbuf, 0, sizeof(p->bbuf));
    p->bbuflen = 0;

    p->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
    if (p->mevp == NULL) {
        WPRINTF(("Could not register event"));
        goto error;
    }

    return (0);

error:
    tap_cleanup(be);
    return (-1);
}

static struct net_backend ng_backend = {
    .prefix = "netgraph",
    .priv_size = sizeof(struct tap_priv),
    .init = ng_init,
    .cleanup = tap_cleanup,
    .send = tap_send,
    .peek_recvlen = tap_peek_recvlen,
    .recv = tap_recv,
    .recv_enable = tap_recv_enable,
    .recv_disable = tap_recv_disable,
    .get_cap = tap_get_cap,
    .set_cap = tap_set_cap,
};

DATA_SET(net_backend_set, ng_backend);

#endif /* NETGRAPH */

/*
 * The netmap backend
 */

/* The virtio-net features supported by netmap. */
#define NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \
		VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \
		VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 | \
		VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO)

struct netmap_priv {
    char ifname[IFNAMSIZ];
    struct nm_desc *nmd;
    uint16_t memid;
    struct netmap_ring *rx;	/* first RX ring */
    struct netmap_ring *tx;	/* first TX ring */
    struct mevent *mevp;
    net_be_rxeof_t cb;
    void *cb_param;
};

/* Zero a struct nmreq and fill in the interface name and API version. */
static void
nmreq_init(struct nmreq *req, char *ifname)
{

    memset(req, 0, sizeof(*req));
    strlcpy(req->nr_name, ifname, sizeof(req->nr_name));
    req->nr_version = NETMAP_API;
}

/*
 * Ask netmap (via NETMAP_BDG_VNET_HDR) to use the given virtio-net
 * header length on this port; caches the value in be_vnet_hdr_len on
 * success. Returns 0 on success or the ioctl() error.
 */
static int
netmap_set_vnet_hdr_len(struct net_backend *be, int vnet_hdr_len)
{
    int err;
    struct nmreq req;
    struct netmap_priv *priv = (struct netmap_priv *)be->opaque;

    nmreq_init(&req, priv->ifname);
    req.nr_cmd = NETMAP_BDG_VNET_HDR;
    req.nr_arg1 = vnet_hdr_len;
    err = ioctl(be->fd, NIOCREGIF, &req);
    if (err) {
        WPRINTF(("Unable to set vnet header length %d",
            vnet_hdr_len));
        return (err);
    }

    be->be_vnet_hdr_len = vnet_hdr_len;

    return (0);
}

/*
 * Probe whether the port supports the given vnet header length by trying
 * to set it (and restoring the previous value afterwards).
 * Returns 1 if supported, 0 otherwise.
 */
static int
netmap_has_vnet_hdr_len(struct net_backend *be, unsigned vnet_hdr_len)
{
    int prev_hdr_len = be->be_vnet_hdr_len;
    int ret;

    if (vnet_hdr_len == prev_hdr_len) {
        return (1);
    }

    ret = netmap_set_vnet_hdr_len(be, vnet_hdr_len);
    if (ret) {
        return (0);
    }

    netmap_set_vnet_hdr_len(be, prev_hdr_len);

    return (1);
}

/* Advertise the virtio-net offloads only if vnet headers are supported. */
static uint64_t
netmap_get_cap(struct net_backend *be)
{

    return (netmap_has_vnet_hdr_len(be, VNET_HDR_LEN) ?
        NETMAP_FEATURES : 0);
}

static int
netmap_set_cap(struct net_backend *be, uint64_t features,
    unsigned vnet_hdr_len)
{

    return (netmap_set_vnet_hdr_len(be, vnet_hdr_len));
}

/*
 * Open the netmap port named by devname (e.g. "netmap:em0" or "vale0:1"),
 * grab its first TX/RX rings and register the netmap fd with the event
 * loop (initially disabled).
 */
static int
netmap_init(struct net_backend *be, const char *devname,
    nvlist_t *nvl, net_be_rxeof_t cb, void *param)
{
    struct netmap_priv *priv = (struct netmap_priv *)be->opaque;

    strlcpy(priv->ifname, devname, sizeof(priv->ifname));
    priv->ifname[sizeof(priv->ifname) - 1] = '\0';

    priv->nmd = nm_open(priv->ifname, NULL, NETMAP_NO_TX_POLL, NULL);
    if (priv->nmd == NULL) {
        WPRINTF(("Unable to nm_open(): interface '%s', errno (%s)",
            devname, strerror(errno)));
        /*
         * NOTE(review): 'priv' points at be->opaque, i.e. into the
         * middle of the allocation made for 'be' itself — free()ing
         * an interior pointer is undefined behavior, and the caller
         * (netbe_init) frees 'be' again on failure. This free()
         * looks wrong and should probably be removed — confirm
         * against netbe_init()'s calloc/free logic.
         */
        free(priv);
        return (-1);
    }

    priv->memid = priv->nmd->req.nr_arg2;
    priv->tx = NETMAP_TXRING(priv->nmd->nifp, 0);
    priv->rx = NETMAP_RXRING(priv->nmd->nifp, 0);
    priv->cb = cb;
    priv->cb_param = param;
    be->fd = priv->nmd->fd;

    priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
    if (priv->mevp == NULL) {
        WPRINTF(("Could not register event"));
        return (-1);
    }

    return (0);
}

/* Unregister from the event loop and close the netmap port. */
static void
netmap_cleanup(struct net_backend *be)
{
    struct netmap_priv *priv = (struct netmap_priv *)be->opaque;

    if (priv->mevp) {
        mevent_delete(priv->mevp);
    }
    if (priv->nmd) {
        nm_close(priv->nmd);
    }
    be->fd = -1;
}

/*
 * Copy the packet in 'iov' into one or more TX slots (chained with
 * NS_MOREFRAG) and kick the ring with NIOCTXSYNC.
 * NOTE(review): on ring-full the packet is dropped, yet 'totlen' (the
 * bytes accumulated so far) is still returned — presumably fine for the
 * virtio frontends, which treat transmit as best-effort; confirm callers
 * don't rely on a drop indication.
 */
static ssize_t
netmap_send(struct net_backend *be, const struct iovec *iov,
    int iovcnt)
{
    struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
    struct netmap_ring *ring;
    ssize_t totlen = 0;
    int nm_buf_size;
    int nm_buf_len;
    uint32_t head;
    void *nm_buf;
    int j;

    ring = priv->tx;
    head = ring->head;
    if (head == ring->tail) {
        WPRINTF(("No space, drop %zu bytes", count_iov(iov, iovcnt)));
        goto txsync;
    }
    nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
    nm_buf_size = ring->nr_buf_size;
    nm_buf_len = 0;

    for (j = 0; j < iovcnt; j++) {
        int iov_frag_size = iov[j].iov_len;
        void *iov_frag_buf = iov[j].iov_base;

        totlen += iov_frag_size;

        /*
         * Split each iovec fragment over more netmap slots, if
         * necessary.
         */
        for (;;) {
            int copylen;

            copylen = iov_frag_size < nm_buf_size ? iov_frag_size : nm_buf_size;
            memcpy(nm_buf, iov_frag_buf, copylen);

            /* void-pointer arithmetic: GCC/Clang extension. */
            iov_frag_buf += copylen;
            iov_frag_size -= copylen;
            nm_buf += copylen;
            nm_buf_size -= copylen;
            nm_buf_len += copylen;

            if (iov_frag_size == 0) {
                break;
            }

            /* Current slot is full: chain it and move to the next. */
            ring->slot[head].len = nm_buf_len;
            ring->slot[head].flags = NS_MOREFRAG;
            head = nm_ring_next(ring, head);
            if (head == ring->tail) {
                /*
                 * We ran out of netmap slots while
                 * splitting the iovec fragments.
                 */
                WPRINTF(("No space, drop %zu bytes",
                    count_iov(iov, iovcnt)));
                goto txsync;
            }
            nm_buf = NETMAP_BUF(ring, ring->slot[head].buf_idx);
            nm_buf_size = ring->nr_buf_size;
            nm_buf_len = 0;
        }
    }

    /* Complete the last slot, which must not have NS_MOREFRAG set. */
    ring->slot[head].len = nm_buf_len;
    ring->slot[head].flags = 0;
    head = nm_ring_next(ring, head);

    /* Now update ring->head and ring->cur. */
    ring->head = ring->cur = head;
txsync:
    ioctl(be->fd, NIOCTXSYNC, NULL);

    return (totlen);
}

/*
 * Sum the lengths of the slots making up the next packet (an NS_MOREFRAG
 * chain) without consuming them. Returns 0 when the RX ring is empty.
 */
static ssize_t
netmap_peek_recvlen(struct net_backend *be)
{
    struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
    struct netmap_ring *ring = priv->rx;
    uint32_t head = ring->head;
    ssize_t totlen = 0;

    while (head != ring->tail) {
        struct netmap_slot *slot = ring->slot + head;

        totlen += slot->len;
        if ((slot->flags & NS_MOREFRAG) == 0)
            break;
        head = nm_ring_next(ring, head);
    }

    return (totlen);
}

/*
 * Copy the next packet (possibly spanning several NS_MOREFRAG slots) from
 * the RX ring into the caller's iovec. Returns the packet length, 0 if the
 * ring is empty, or -ENOSPC when the iovec is too small (the packet is
 * dropped in that case).
 */
static ssize_t
netmap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
{
    struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
    struct netmap_slot *slot = NULL;
    struct netmap_ring *ring;
    void *iov_frag_buf;
    int iov_frag_size;
    ssize_t totlen = 0;
    uint32_t head;

    assert(iovcnt);

    ring = priv->rx;
    head = ring->head;
    iov_frag_buf = iov->iov_base;
    iov_frag_size = iov->iov_len;

    do {
        int nm_buf_len;
        void *nm_buf;

        if (head == ring->tail) {
            return (0);
        }

        slot = ring->slot + head;
        nm_buf = NETMAP_BUF(ring, slot->buf_idx);
        nm_buf_len = slot->len;

        for (;;) {
            int copylen = nm_buf_len < iov_frag_size ?
                nm_buf_len : iov_frag_size;

            memcpy(iov_frag_buf, nm_buf, copylen);
            nm_buf += copylen;
            nm_buf_len -= copylen;
            iov_frag_buf += copylen;
            iov_frag_size -= copylen;
            totlen += copylen;

            if (nm_buf_len == 0) {
                break;
            }

            iov++;
            iovcnt--;
            if (iovcnt == 0) {
                /* No space to receive. */
                WPRINTF(("Short iov, drop %zd bytes",
                    totlen));
                return (-ENOSPC);
            }
            iov_frag_buf = iov->iov_base;
            iov_frag_size = iov->iov_len;
        }

        head = nm_ring_next(ring, head);

    } while (slot->flags & NS_MOREFRAG);

    /* Release slots to netmap. */
    ring->head = ring->cur = head;

    return (totlen);
}

/* Re-enable receive notifications from the event loop. */
static void
netmap_recv_enable(struct net_backend *be)
{
    struct netmap_priv *priv = (struct netmap_priv *)be->opaque;

    mevent_enable(priv->mevp);
}

/* Suppress receive notifications from the event loop. */
static void
netmap_recv_disable(struct net_backend *be)
{
    struct netmap_priv *priv = (struct netmap_priv *)be->opaque;

    mevent_disable(priv->mevp);
}

static struct net_backend netmap_backend = {
    .prefix = "netmap",
    .priv_size = sizeof(struct netmap_priv),
    .init = netmap_init,
    .cleanup = netmap_cleanup,
    .send = netmap_send,
    .peek_recvlen = netmap_peek_recvlen,
    .recv = netmap_recv,
    .recv_enable = netmap_recv_enable,
    .recv_disable = netmap_recv_disable,
    .get_cap = netmap_get_cap,
    .set_cap = netmap_set_cap,
};

/* A clone of the netmap backend, with a different prefix. */
static struct net_backend vale_backend = {
    .prefix = "vale",
    .priv_size = sizeof(struct netmap_priv),
    .init = netmap_init,
    .cleanup = netmap_cleanup,
    .send = netmap_send,
    .peek_recvlen = netmap_peek_recvlen,
    .recv = netmap_recv,
    .recv_enable = netmap_recv_enable,
    .recv_disable = netmap_recv_disable,
    .get_cap = netmap_get_cap,
    .set_cap = netmap_set_cap,
};

DATA_SET(net_backend_set, netmap_backend);
DATA_SET(net_backend_set, vale_backend);

#else /* __FreeBSD__ */

/*
 * The illumos dlpi backend
 */

/*
 * The size of the bounce buffer used to implement the peek callback.
 * This value should be big enough to accommodate the largest of all possible
 * frontend packet lengths. The value here matches the definition of
 * VTNET_MAX_PKT_LEN in pci_virtio_net.c
 */
#define	DLPI_BBUF_SIZE (65536 + 64)

typedef struct be_dlpi_priv {
    dlpi_handle_t bdp_dhp;
    struct mevent *bdp_mevp;
    /*
     * A bounce buffer that allows us to implement the peek_recvlen
     * callback. Each structure is only used by a single thread so
     * one is enough.
     */
    uint8_t bdp_bbuf[DLPI_BBUF_SIZE];
    ssize_t bdp_bbuflen;
} be_dlpi_priv_t;

/*
 * Close the DLPI handle and remove the event-loop registration. Safe to
 * call on a partially initialized backend (used by be_dlpi_init()'s error
 * path).
 */
static void
be_dlpi_cleanup(net_backend_t *be)
{
    be_dlpi_priv_t *priv = (be_dlpi_priv_t *)be->opaque;

    if (priv->bdp_dhp != NULL)
        dlpi_close(priv->bdp_dhp);
    priv->bdp_dhp = NULL;

    if (priv->bdp_mevp != NULL)
        mevent_delete(priv->bdp_mevp);
    priv->bdp_mevp = NULL;

    priv->bdp_bbuflen = 0;
    be->fd = -1;
}

/* Log a DLPI error together with its dlpi_strerror() text. */
static void
be_dlpi_err(int ret, const char *dev, char *msg)
{
    WPRINTF(("%s: %s (%s)", dev, msg, dlpi_strerror(ret)));
}

/*
 * Open the configured VNIC in raw DLPI mode, bind it, enable the
 * configured promiscuous modes, make the fd non-blocking and register it
 * with the event loop (initially disabled).
 */
static int
be_dlpi_init(net_backend_t *be, const char *devname __unused,
     nvlist_t *nvl, net_be_rxeof_t cb, void *param)
{
    be_dlpi_priv_t *priv = (be_dlpi_priv_t *)be->opaque;
    const char *vnic;
    int ret;

    if (cb == NULL) {
        WPRINTF(("dlpi backend requires non-NULL callback"));
        return (-1);
    }

    vnic = get_config_value_node(nvl, "vnic");
    if (vnic == NULL) {
        WPRINTF(("dlpi backend requires a VNIC"));
        return (-1);
    }

    priv->bdp_bbuflen = 0;

    ret = dlpi_open(vnic, &priv->bdp_dhp, DLPI_RAW);

    if (ret != DLPI_SUCCESS) {
        be_dlpi_err(ret, vnic, "open failed");
        goto error;
    }

    if ((ret = dlpi_bind(priv->bdp_dhp, DLPI_ANY_SAP, NULL)) !=
        DLPI_SUCCESS) {
        be_dlpi_err(ret, vnic, "bind failed");
        goto error;
    }

    if (get_config_bool_node_default(nvl, "promiscrxonly", true)) {
        if ((ret = dlpi_promiscon(priv->bdp_dhp, DL_PROMISC_RX_ONLY)) !=
            DLPI_SUCCESS) {
            be_dlpi_err(ret, vnic,
                "enable promiscuous mode(rxonly) failed");
            goto error;
        }
    }
    if (get_config_bool_node_default(nvl, "promiscphys", false)) {
        if ((ret = dlpi_promiscon(priv->bdp_dhp, DL_PROMISC_PHYS)) !=
            DLPI_SUCCESS) {
            be_dlpi_err(ret, vnic,
                "enable promiscuous mode(physical) failed");
            goto error;
        }
    }
    if (get_config_bool_node_default(nvl, "promiscsap", true)) {
        if ((ret = dlpi_promiscon(priv->bdp_dhp, DL_PROMISC_SAP)) !=
            DLPI_SUCCESS) {
            be_dlpi_err(ret, vnic,
                "enable promiscuous mode(SAP) failed");
            goto error;
        }
    }
    if (get_config_bool_node_default(nvl, "promiscmulti", true)) {
        if ((ret = dlpi_promiscon(priv->bdp_dhp, DL_PROMISC_MULTI)) !=
            DLPI_SUCCESS) {
            /* NOTE(review): "muticast" typo in the message below. */
            be_dlpi_err(ret, vnic,
                "enable promiscuous mode(muticast) failed");
            goto error;
        }
    }

    be->fd = dlpi_fd(priv->bdp_dhp);

    if (fcntl(be->fd, F_SETFL, O_NONBLOCK) < 0) {
        WPRINTF(("%s: enable O_NONBLOCK failed", vnic));
        goto error;
    }

    priv->bdp_mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
    if (priv->bdp_mevp == NULL) {
        WPRINTF(("Could not register event"));
        goto error;
    }

    return (0);

error:
    be_dlpi_cleanup(be);
    return (-1);
}

/*
 * Called to send a buffer chain out to the dlpi device
 */
static ssize_t
be_dlpi_send(net_backend_t *be, const struct iovec *iov, int iovcnt)
{
    be_dlpi_priv_t *priv = (be_dlpi_priv_t *)be->opaque;
    ssize_t len = 0;
    int ret;

    if (iovcnt == 1) {
        /* Common single-fragment case: send directly, no copy. */
        len = iov[0].iov_len;
        ret = dlpi_send(priv->bdp_dhp, NULL, 0, iov[0].iov_base, len,
            NULL);
    } else {
        /* Flatten the chain into a temporary contiguous buffer. */
        void *buf = NULL;

        len = iov_to_buf(iov, iovcnt, &buf);

        if (len <= 0 || buf == NULL)
            return (-1);

        ret = dlpi_send(priv->bdp_dhp, NULL, 0, buf, len, NULL);
        free(buf);
    }

    if (ret != DLPI_SUCCESS)
        return (-1);

    return (len);
}

/*
 * Return the length of the next receivable packet (0 if none). A packet
 * read here is parked in the bounce buffer until be_dlpi_recv() drains it.
 */
static ssize_t
be_dlpi_peek_recvlen(net_backend_t *be)
{
    be_dlpi_priv_t *priv = (be_dlpi_priv_t *)be->opaque;
    dlpi_recvinfo_t recv;
    size_t len;
    int ret;

    /*
     * We already have a packet in the bounce buffer.
     * Just return its length.
     */
    if (priv->bdp_bbuflen > 0)
        return (priv->bdp_bbuflen);

    /*
     * Read the next packet (if any) into the bounce buffer, so
     * that we get to know its length and we can return that
     * to the caller.
     */
    len = sizeof (priv->bdp_bbuf);
    ret = dlpi_recv(priv->bdp_dhp, NULL, NULL, priv->bdp_bbuf, &len,
        0, &recv);
    if (ret == DL_SYSERR) {
        if (errno == EWOULDBLOCK)
            return (0);
        return (-1);
    } else if (ret == DLPI_ETIMEDOUT) {
        return (0);
    } else if (ret != DLPI_SUCCESS) {
        return (-1);
    }

    if (recv.dri_totmsglen > sizeof (priv->bdp_bbuf)) {
        /* NOTE(review): %x prints dri_totmsglen in hex — %zu seems intended. */
        EPRINTLN("DLPI bounce buffer was too small! - needed %x bytes",
            recv.dri_totmsglen);
    }

    priv->bdp_bbuflen = len;

    return (len);
}

/*
 * Receive a packet, draining the bounce buffer first if
 * be_dlpi_peek_recvlen() already pulled one in. Note that the direct-read
 * path only fills the first iovec entry.
 */
static ssize_t
be_dlpi_recv(net_backend_t *be, const struct iovec *iov, int iovcnt)
{
    be_dlpi_priv_t *priv = (be_dlpi_priv_t *)be->opaque;
    size_t len;
    int ret;

    if (priv->bdp_bbuflen > 0) {
        /*
         * A packet is available in the bounce buffer, so
         * we read it from there.
         */
        len = buf_to_iov(priv->bdp_bbuf, priv->bdp_bbuflen,
            iov, iovcnt, 0);

        /* Mark the bounce buffer as empty. */
        priv->bdp_bbuflen = 0;

        return (len);
    }

    len = iov[0].iov_len;
    ret = dlpi_recv(priv->bdp_dhp, NULL, NULL,
        (uint8_t *)iov[0].iov_base, &len, 0, NULL);
    if (ret == DL_SYSERR) {
        if (errno == EWOULDBLOCK)
            return (0);
        return (-1);
    } else if (ret == DLPI_ETIMEDOUT) {
        return (0);
    } else if (ret != DLPI_SUCCESS) {
        return (-1);
    }

    return (len);
}

/* Re-enable receive notifications from the event loop. */
static void
be_dlpi_recv_enable(net_backend_t *be)
{
    be_dlpi_priv_t *priv = (be_dlpi_priv_t *)be->opaque;

    mevent_enable(priv->bdp_mevp);
}

/* Suppress receive notifications from the event loop. */
static void
be_dlpi_recv_disable(net_backend_t *be)
{
    be_dlpi_priv_t *priv = (be_dlpi_priv_t *)be->opaque;

    mevent_disable(priv->bdp_mevp);
}

static uint64_t
be_dlpi_get_cap(net_backend_t *be)
{
    return (0); /* no capabilities for now */
}

/* As with tap: no offloads, so only the "nothing requested" case succeeds. */
static int
be_dlpi_set_cap(net_backend_t *be, uint64_t features,
    unsigned vnet_hdr_len)
{
    return ((features || vnet_hdr_len) ? -1 : 0);
}

/*
 * Copy the link's current MAC address into buf (buflen is in/out).
 * Returns 0 on success, EINVAL on lookup failure or unexpected address
 * length, ENOMEM if the caller's buffer is too small.
 * NOTE(review): physaddrlen is a size_t but is printed with %d below —
 * %zu would be the matching conversion.
 */
static int
be_dlpi_get_mac(net_backend_t *be, void *buf, size_t *buflen)
{
    be_dlpi_priv_t *priv = (be_dlpi_priv_t *)be->opaque;
    uchar_t physaddr[DLPI_PHYSADDR_MAX];
    size_t physaddrlen = DLPI_PHYSADDR_MAX;
    int ret;

    if ((ret = dlpi_get_physaddr(priv->bdp_dhp, DL_CURR_PHYS_ADDR,
        physaddr, &physaddrlen)) != DLPI_SUCCESS) {
        be_dlpi_err(ret, dlpi_linkname(priv->bdp_dhp),
            "read MAC address failed");
        return (EINVAL);
    }

    if (physaddrlen != ETHERADDRL) {
        WPRINTF(("%s: bad MAC address len %d",
            dlpi_linkname(priv->bdp_dhp), physaddrlen));
        return (EINVAL);
    }

    if (physaddrlen > *buflen) {
        WPRINTF(("%s: MAC address too long (%d bytes required)",
            dlpi_linkname(priv->bdp_dhp), physaddrlen));
        return (ENOMEM);
    }

    *buflen = physaddrlen;
    memcpy(buf, physaddr, *buflen);

    return (0);
}

static struct net_backend dlpi_backend = {
    .prefix = "dlpi",
    .priv_size = sizeof(struct be_dlpi_priv),
    .init = be_dlpi_init,
    .cleanup = be_dlpi_cleanup,
    .send = be_dlpi_send,
    .peek_recvlen = be_dlpi_peek_recvlen,
    .recv = be_dlpi_recv,
    .recv_enable = be_dlpi_recv_enable,
    .recv_disable = be_dlpi_recv_disable,
    .get_cap = be_dlpi_get_cap,
    .set_cap = be_dlpi_set_cap,
    .get_mac = be_dlpi_get_mac,
};

DATA_SET(net_backend_set, dlpi_backend);

#endif /* __FreeBSD__ */

#ifdef __FreeBSD__
/*
 * Parse a legacy "backend[,key=value,...]" option string into the nvlist:
 * everything before the first comma is the backend name; the remainder is
 * handed to pci_parse_legacy_config().
 */
int
netbe_legacy_config(nvlist_t *nvl, const char *opts)
{
    char *backend, *cp;

    if (opts == NULL)
        return (0);

    cp = strchr(opts, ',');
    if (cp == NULL) {
        set_config_value_node(nvl, "backend", opts);
        return (0);
    }
    backend = strndup(opts, cp - opts);
    set_config_value_node(nvl, "backend", backend);
    free(backend);
    return (pci_parse_legacy_config(nvl, cp + 1));
}
#else
/*
 * illumos variant: comma-separated "key=value" pairs; a bare token is
 * taken to be the VNIC name. The backend defaults to "dlpi".
 */
int
netbe_legacy_config(nvlist_t *nvl, const char *opts)
{
    char *config, *name, *tofree, *value;

    if (opts == NULL)
        return (0);

    /* Default to the 'dlpi' backend - can still be overridden by opts */
    set_config_value_node(nvl, "backend", "dlpi");

    config = tofree = strdup(opts);
    if (config == NULL)
        err(4, "netbe_legacy_config strdup()");
    while ((name = strsep(&config, ",")) != NULL) {
        value = strchr(name, '=');
        if (value != NULL) {
            *value++ = '\0';
            set_config_value_node(nvl, name, value);
        } else {
            set_config_value_node(nvl, "vnic", name);
        }
    }
    free(tofree);

    return (0);
}
#endif

/*
 * Initialize a backend and attach to the frontend.
 * This is called during frontend initialization.
 *  @ret is a pointer to the backend to be initialized
 *  @devname is the backend-name as supplied on the command line,
 * 	e.g. -s 2:0,frontend-name,backend-name[,other-args]
 *  @cb is the receive callback supplied by the frontend,
 *	and it is invoked in the event loop when a receive
 *	event is generated in the hypervisor,
 *  @param is a pointer to the frontend, and normally used as
 *	the argument for the callback.
 */
int
netbe_init(struct net_backend **ret, nvlist_t *nvl, net_be_rxeof_t cb,
    void *param)
{
    struct net_backend **pbe, *nbe, *tbe = NULL;
    const char *value;
    char *devname;
    int err;

    value = get_config_value_node(nvl, "backend");
    if (value == NULL) {
        return (-1);
    }
    /* NOTE(review): strdup() result is not checked for NULL here. */
    devname = strdup(value);

    /*
     * Find the network backend that matches the user-provided
     * device name. net_backend_set is built using a linker set.
     */
    SET_FOREACH(pbe, net_backend_set) {
        if (strncmp(devname, (*pbe)->prefix,
            strlen((*pbe)->prefix)) == 0) {
            tbe = *pbe;
            assert(tbe->init != NULL);
            assert(tbe->cleanup != NULL);
            assert(tbe->send != NULL);
            assert(tbe->recv != NULL);
            assert(tbe->get_cap != NULL);
            assert(tbe->set_cap != NULL);
            break;
        }
    }

    *ret = NULL;
    if (tbe == NULL) {
        free(devname);
        return (EINVAL);
    }

    /*
     * Allocate the backend plus its trailing private area (addressed
     * through nbe->opaque).
     * NOTE(review): calloc() result is dereferenced without a NULL
     * check on the next line — an OOM here would crash; consider
     * checking and returning ENOMEM.
     */
    nbe = calloc(1, sizeof(*nbe) + tbe->priv_size);
    *nbe = *tbe;	/* copy the template */
    nbe->fd = -1;
    nbe->sc = param;
    nbe->be_vnet_hdr_len = 0;
    nbe->fe_vnet_hdr_len = 0;

    /* Initialize the backend. */
    err = nbe->init(nbe, devname, nvl, cb, param);
    if (err) {
        free(devname);
        free(nbe);
        return (err);
    }

    *ret = nbe;
    free(devname);

    return (0);
}

/* Release a backend created by netbe_init(); NULL is a no-op. */
void
netbe_cleanup(struct net_backend *be)
{

    if (be != NULL) {
        be->cleanup(be);
        free(be);
    }
}

/* Query the backend's supported virtio-net feature bits. */
uint64_t
netbe_get_cap(struct net_backend *be)
{

    assert(be != NULL);
    return (be->get_cap(be));
}

int
netbe_set_cap(struct net_backend *be, uint64_t features,
    unsigned vnet_hdr_len)
{
    int ret;

    assert(be != NULL);

    /* There are only three valid lengths, i.e., 0, 10 and 12.
*/ 1386 if (vnet_hdr_len && vnet_hdr_len != VNET_HDR_LEN 1387 && vnet_hdr_len != (VNET_HDR_LEN - sizeof(uint16_t))) 1388 return (-1); 1389 1390 be->fe_vnet_hdr_len = vnet_hdr_len; 1391 1392 ret = be->set_cap(be, features, vnet_hdr_len); 1393 assert(be->be_vnet_hdr_len == 0 || 1394 be->be_vnet_hdr_len == be->fe_vnet_hdr_len); 1395 1396 return (ret); 1397 } 1398 1399 ssize_t 1400 netbe_send(struct net_backend *be, const struct iovec *iov, int iovcnt) 1401 { 1402 1403 return (be->send(be, iov, iovcnt)); 1404 } 1405 1406 ssize_t 1407 netbe_peek_recvlen(struct net_backend *be) 1408 { 1409 1410 return (be->peek_recvlen(be)); 1411 } 1412 1413 /* 1414 * Try to read a packet from the backend, without blocking. 1415 * If no packets are available, return 0. In case of success, return 1416 * the length of the packet just read. Return -1 in case of errors. 1417 */ 1418 ssize_t 1419 netbe_recv(struct net_backend *be, const struct iovec *iov, int iovcnt) 1420 { 1421 1422 return (be->recv(be, iov, iovcnt)); 1423 } 1424 1425 /* 1426 * Read a packet from the backend and discard it. 1427 * Returns the size of the discarded packet or zero if no packet was available. 1428 * A negative error code is returned in case of read error. 1429 */ 1430 ssize_t 1431 netbe_rx_discard(struct net_backend *be) 1432 { 1433 /* 1434 * MP note: the dummybuf is only used to discard frames, 1435 * so there is no need for it to be per-vtnet or locked. 1436 * We only make it large enough for TSO-sized segment. 
1437 */ 1438 static uint8_t dummybuf[65536 + 64]; 1439 struct iovec iov; 1440 1441 #ifdef __FreeBSD__ 1442 iov.iov_base = dummybuf; 1443 #else 1444 iov.iov_base = (caddr_t)dummybuf; 1445 #endif 1446 iov.iov_len = sizeof(dummybuf); 1447 1448 return netbe_recv(be, &iov, 1); 1449 } 1450 1451 void 1452 netbe_rx_disable(struct net_backend *be) 1453 { 1454 1455 return be->recv_disable(be); 1456 } 1457 1458 void 1459 netbe_rx_enable(struct net_backend *be) 1460 { 1461 1462 return be->recv_enable(be); 1463 } 1464 1465 size_t 1466 netbe_get_vnet_hdr_len(struct net_backend *be) 1467 { 1468 1469 return (be->be_vnet_hdr_len); 1470 } 1471 1472 #ifndef __FreeBSD__ 1473 int 1474 netbe_get_mac(net_backend_t *be, void *buf, size_t *buflen) 1475 { 1476 if (be->get_mac == NULL) 1477 return (ENOTSUP); 1478 return (be->get_mac(be, buf, buflen)); 1479 } 1480 #endif 1481