1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2011 NetApp, Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 * 28 * $FreeBSD$ 29 */ 30 /* 31 * This file and its contents are supplied under the terms of the 32 * Common Development and Distribution License ("CDDL"), version 1.0. 33 * You may only use this file in accordance with the terms of version 34 * 1.0 of the CDDL. 35 * 36 * A full copy of the text of the CDDL should have accompanied this 37 * source. A copy of the CDDL is also available via the Internet at 38 * http://www.illumos.org/license/CDDL. 39 * 40 * Copyright 2013 Pluribus Networks Inc. 41 * Copyright 2018 Joyent, Inc. 
42 */ 43 44 #include <sys/cdefs.h> 45 __FBSDID("$FreeBSD$"); 46 47 #include <sys/param.h> 48 #ifndef WITHOUT_CAPSICUM 49 #include <sys/capsicum.h> 50 #endif 51 #include <sys/linker_set.h> 52 #include <sys/select.h> 53 #include <sys/uio.h> 54 #include <sys/ioctl.h> 55 #include <net/ethernet.h> 56 #ifdef __FreeBSD__ 57 #ifndef NETMAP_WITH_LIBS 58 #define NETMAP_WITH_LIBS 59 #endif 60 #include <net/netmap_user.h> 61 #endif 62 63 #ifndef WITHOUT_CAPSICUM 64 #include <capsicum_helpers.h> 65 #endif 66 #include <err.h> 67 #include <errno.h> 68 #include <fcntl.h> 69 #include <stdio.h> 70 #include <stdlib.h> 71 #include <stdint.h> 72 #include <string.h> 73 #include <strings.h> 74 #include <unistd.h> 75 #include <assert.h> 76 #include <md5.h> 77 #include <pthread.h> 78 #include <pthread_np.h> 79 #include <sysexits.h> 80 #ifndef __FreeBSD__ 81 #include <poll.h> 82 #include <libdlpi.h> 83 #endif 84 85 #include "bhyverun.h" 86 #include "pci_emul.h" 87 #ifdef __FreeBSD__ 88 #include "mevent.h" 89 #endif 90 #include "virtio.h" 91 #include "net_utils.h" 92 93 #define VTNET_RINGSZ 1024 94 95 #define VTNET_MAXSEGS 256 96 97 /* 98 * Host capabilities. Note that we only offer a few of these. 
 */

/* Virtio-net feature bits understood by this device model. */
#define	VIRTIO_NET_F_CSUM	(1 <<  0) /* host handles partial cksum */
#define	VIRTIO_NET_F_GUEST_CSUM	(1 <<  1) /* guest handles partial cksum */
#define	VIRTIO_NET_F_MAC	(1 <<  5) /* host supplies MAC */
#define	VIRTIO_NET_F_GSO_DEPREC	(1 <<  6) /* deprecated: host handles GSO */
#define	VIRTIO_NET_F_GUEST_TSO4	(1 <<  7) /* guest can rcv TSOv4 */
#define	VIRTIO_NET_F_GUEST_TSO6	(1 <<  8) /* guest can rcv TSOv6 */
#define	VIRTIO_NET_F_GUEST_ECN	(1 <<  9) /* guest can rcv TSO with ECN */
#define	VIRTIO_NET_F_GUEST_UFO	(1 << 10) /* guest can rcv UFO */
#define	VIRTIO_NET_F_HOST_TSO4	(1 << 11) /* host can rcv TSOv4 */
#define	VIRTIO_NET_F_HOST_TSO6	(1 << 12) /* host can rcv TSOv6 */
#define	VIRTIO_NET_F_HOST_ECN	(1 << 13) /* host can rcv TSO with ECN */
#define	VIRTIO_NET_F_HOST_UFO	(1 << 14) /* host can rcv UFO */
#define	VIRTIO_NET_F_MRG_RXBUF	(1 << 15) /* host can merge RX buffers */
#define	VIRTIO_NET_F_STATUS	(1 << 16) /* config status field available */
#define	VIRTIO_NET_F_CTRL_VQ	(1 << 17) /* control channel available */
#define	VIRTIO_NET_F_CTRL_RX	(1 << 18) /* control channel RX mode support */
#define	VIRTIO_NET_F_CTRL_VLAN	(1 << 19) /* control channel VLAN filtering */
#define	VIRTIO_NET_F_GUEST_ANNOUNCE \
				(1 << 21) /* guest can send gratuitous pkts */

/*
 * The subset we actually advertise: host-supplied MAC, mergeable RX
 * buffers and a link-status field, plus the generic virtio
 * NOTIFY_ON_EMPTY and indirect-descriptor capabilities.
 */
#define	VTNET_S_HOSTCAPS      \
  ( VIRTIO_NET_F_MAC | VIRTIO_NET_F_MRG_RXBUF | VIRTIO_NET_F_STATUS | \
    VIRTIO_F_NOTIFY_ON_EMPTY | VIRTIO_RING_F_INDIRECT_DESC)

/*
 * PCI config-space "registers"
 */
struct virtio_net_config {
	uint8_t  mac[6];	/* device MAC address */
	uint16_t status;	/* link status (VIRTIO_NET_F_STATUS) */
} __packed;

/*
 * Queue definitions.
134 */ 135 #define VTNET_RXQ 0 136 #define VTNET_TXQ 1 137 #define VTNET_CTLQ 2 /* NB: not yet supported */ 138 139 #define VTNET_MAXQ 3 140 141 /* 142 * Fixed network header size 143 */ 144 struct virtio_net_rxhdr { 145 uint8_t vrh_flags; 146 uint8_t vrh_gso_type; 147 uint16_t vrh_hdr_len; 148 uint16_t vrh_gso_size; 149 uint16_t vrh_csum_start; 150 uint16_t vrh_csum_offset; 151 uint16_t vrh_bufs; 152 } __packed; 153 154 /* 155 * Debug printf 156 */ 157 static int pci_vtnet_debug; 158 #define DPRINTF(params) if (pci_vtnet_debug) printf params 159 #define WPRINTF(params) printf params 160 161 /* 162 * Per-device softc 163 */ 164 struct pci_vtnet_softc { 165 struct virtio_softc vsc_vs; 166 struct vqueue_info vsc_queues[VTNET_MAXQ - 1]; 167 pthread_mutex_t vsc_mtx; 168 struct mevent *vsc_mevp; 169 170 #ifdef __FreeBSD 171 int vsc_tapfd; 172 #else 173 dlpi_handle_t vsc_dhp; 174 int vsc_dlpifd; 175 #endif 176 struct nm_desc *vsc_nmd; 177 178 int vsc_rx_ready; 179 int resetting; /* protected by tx_mtx */ 180 181 uint64_t vsc_features; /* negotiated features */ 182 183 struct virtio_net_config vsc_config; 184 185 pthread_mutex_t rx_mtx; 186 int rx_vhdrlen; 187 int rx_merge; /* merged rx bufs in use */ 188 189 pthread_t tx_tid; 190 pthread_mutex_t tx_mtx; 191 pthread_cond_t tx_cond; 192 int tx_in_progress; 193 194 void (*pci_vtnet_rx)(struct pci_vtnet_softc *sc); 195 void (*pci_vtnet_tx)(struct pci_vtnet_softc *sc, struct iovec *iov, 196 int iovcnt, int len); 197 }; 198 199 static void pci_vtnet_reset(void *); 200 /* static void pci_vtnet_notify(void *, struct vqueue_info *); */ 201 static int pci_vtnet_cfgread(void *, int, int, uint32_t *); 202 static int pci_vtnet_cfgwrite(void *, int, int, uint32_t); 203 static void pci_vtnet_neg_features(void *, uint64_t); 204 205 static struct virtio_consts vtnet_vi_consts = { 206 "vtnet", /* our name */ 207 VTNET_MAXQ - 1, /* we currently support 2 virtqueues */ 208 sizeof(struct virtio_net_config), /* config reg size */ 209 
	pci_vtnet_reset,	/* reset */
	NULL,			/* device-wide qnotify -- not used */
	pci_vtnet_cfgread,	/* read PCI config */
	pci_vtnet_cfgwrite,	/* write PCI config */
	pci_vtnet_neg_features,	/* apply negotiated features */
	VTNET_S_HOSTCAPS,	/* our capabilities */
};

/*
 * Device reset: quiesce the RX path and TX thread, then let the
 * generic virtio layer reset rings and negotiated state.
 */
static void
pci_vtnet_reset(void *vsc)
{
	struct pci_vtnet_softc *sc = vsc;

	DPRINTF(("vtnet: device reset requested !\n"));

	/* Acquire the RX lock to block RX processing. */
	pthread_mutex_lock(&sc->rx_mtx);

	/* Set sc->resetting and give a chance to the TX thread to stop. */
	pthread_mutex_lock(&sc->tx_mtx);
	sc->resetting = 1;
	while (sc->tx_in_progress) {
		/* Drop the lock so the TX thread can finish its chain. */
		pthread_mutex_unlock(&sc->tx_mtx);
		usleep(10000);
		pthread_mutex_lock(&sc->tx_mtx);
	}

	/* Back to pre-negotiation defaults: merged RX bufs, full header. */
	sc->vsc_rx_ready = 0;
	sc->rx_merge = 1;
	sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr);

	/*
	 * Now reset rings, MSI-X vectors, and negotiated capabilities.
	 * Do that with the TX lock held, since we need to reset
	 * sc->resetting.
	 */
	vi_reset_dev(&sc->vsc_vs);

	sc->resetting = 0;
	pthread_mutex_unlock(&sc->tx_mtx);
	pthread_mutex_unlock(&sc->rx_mtx);
}

/*
 * Called to send a buffer chain out to the tap device
 */
#ifdef __FreeBSD__
static void
pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
		 int len)
{
	static char pad[60]; /* all zero bytes */

	if (sc->vsc_tapfd == -1)
		return;

	/*
	 * If the length is < 60, pad out to that and add the
	 * extra zero'd segment to the iov. It is guaranteed that
	 * there is always an extra iov available by the caller.
	 */
	if (len < 60) {
		iov[iovcnt].iov_base = pad;
		iov[iovcnt].iov_len = 60 - len;
		iovcnt++;
	}
	/* Best-effort transmit; a short/failed writev drops the frame. */
	(void) writev(sc->vsc_tapfd, iov, iovcnt);
}
#else
/*
 * illumos TX path: hand each iovec segment to DLPI.  Errors are
 * deliberately ignored, matching the best-effort tap writev above.
 */
static void
pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
		 int len)
{
	int i;

	for (i = 0; i < iovcnt; i++) {
		(void) dlpi_send(sc->vsc_dhp, NULL, 0,
		    iov[i].iov_base, iov[i].iov_len, NULL);
	}
}
#endif /* __FreeBSD__ */

#ifdef __FreeBSD__
/*
 * Called when there is read activity on the tap file descriptor.
 * Each buffer posted by the guest is assumed to be able to contain
 * an entire ethernet frame + rx header.
 * MP note: the dummybuf is only used for discarding frames, so there
 * is no need for it to be per-vtnet or locked.
 */
static uint8_t dummybuf[2048];
#endif /* __FreeBSD__ */

/*
 * Skip the first tlen bytes (the virtio RX header) of an RX descriptor
 * chain, returning a pointer to the first iovec that carries packet
 * data and shrinking *niov if the leading segment is consumed entirely.
 */
static __inline struct iovec *
rx_iov_trim(struct iovec *iov, int *niov, int tlen)
{
	struct iovec *riov;

	/* XXX short-cut: assume first segment is >= tlen */
	assert(iov[0].iov_len >= tlen);

	iov[0].iov_len -= tlen;
	if (iov[0].iov_len == 0) {
		assert(*niov > 1);
		*niov -= 1;
		riov = &iov[1];
	} else {
		iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + tlen);
		riov = &iov[0];
	}

	return (riov);
}

/*
 * RX path: pull packets from the tap/DLPI backend into guest-posted
 * buffers.  Called with rx_mtx held (FreeBSD mevent callback) or
 * vsc_mtx held (illumos poll thread).
 */
static void
pci_vtnet_tap_rx(struct pci_vtnet_softc *sc)
{
	struct iovec iov[VTNET_MAXSEGS], *riov;
	struct vqueue_info *vq;
	void *vrx;
	int n;
#ifdef __FreeBSD__
	int len;
#else
	size_t len;
	int ret;
#endif
	uint16_t idx;

	/*
	 * Should never be called without a valid tap fd
	 */
#ifdef __FreeBSD__
	assert(sc->vsc_tapfd != -1);
#else
	assert(sc->vsc_dlpifd != -1);
#endif

	/*
	 * But, will be called when the rx ring hasn't yet
	 * been set up.
	 */
	if (!sc->vsc_rx_ready) {
#ifdef __FreeBSD__
		/*
		 * Drop the packet and try later.
		 */
		(void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
#endif
		return;
	}

	/*
	 * Check for available rx buffers
	 */
	vq = &sc->vsc_queues[VTNET_RXQ];
	if (!vq_has_descs(vq)) {
		/*
		 * Drop the packet and try later.  Interrupt on
		 * empty, if that's negotiated.
		 */
#ifdef __FreeBSD__
		(void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
#endif
		vq_endchains(vq, 1);
		return;
	}

	do {
		/*
		 * Get descriptor chain
		 */
		n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
		assert(n >= 1 && n <= VTNET_MAXSEGS);

		/*
		 * Get a pointer to the rx header, and use the
		 * data immediately following it for the packet buffer.
		 */
		vrx = iov[0].iov_base;
		riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen);
#ifdef __FreeBSD__
		len = readv(sc->vsc_tapfd, riov, n);
#else
		/*
		 * dlpi_recv() fills only the first data segment; emulate
		 * the tap path's EWOULDBLOCK convention on failure.
		 */
		len = riov[0].iov_len;
		ret = dlpi_recv(sc->vsc_dhp, NULL, NULL,
		    (uint8_t *)riov[0].iov_base, &len, 0, NULL);
		if (ret != DLPI_SUCCESS) {
			errno = EWOULDBLOCK;
			len = 0;
		}
#endif
		if (len <= 0 && errno == EWOULDBLOCK) {
			/*
			 * No more packets, but still some avail ring
			 * entries.  Return the unused chain and interrupt
			 * if needed/appropriate.
			 */
			vq_retchain(vq);
			vq_endchains(vq, 0);
			return;
		}

		/*
		 * The only valid field in the rx packet header is the
		 * number of buffers if merged rx bufs were negotiated.
		 */
		memset(vrx, 0, sc->rx_vhdrlen);

		if (sc->rx_merge) {
			struct virtio_net_rxhdr *vrxh;

			vrxh = vrx;
			vrxh->vrh_bufs = 1;
		}

		/*
		 * Release this chain and handle more chains.
		 */
		vq_relchain(vq, idx, len + sc->rx_vhdrlen);
	} while (vq_has_descs(vq));

	/* Interrupt if needed, including for NOTIFY_ON_EMPTY.
 */
	vq_endchains(vq, 1);
}

#ifdef __FreeBSD__
/*
 * Copy a TX chain into the next free slot of a netmap TX ring and sync.
 * Returns the number of bytes queued (0 if every ring was full).
 */
static __inline int
pci_vtnet_netmap_writev(struct nm_desc *nmd, struct iovec *iov, int iovcnt)
{
	int r, i;
	int len = 0;

	for (r = nmd->cur_tx_ring; ; ) {
		struct netmap_ring *ring = NETMAP_TXRING(nmd->nifp, r);
		uint32_t cur, idx;
		char *buf;

		if (nm_ring_empty(ring)) {
			/* This ring is full; try the next, wrapping once. */
			r++;
			if (r > nmd->last_tx_ring)
				r = nmd->first_tx_ring;
			if (r == nmd->cur_tx_ring)
				break;
			continue;
		}
		cur = ring->cur;
		idx = ring->slot[cur].buf_idx;
		buf = NETMAP_BUF(ring, idx);

		/*
		 * Pack the iovec into one netmap buffer; segments that
		 * would exceed the 2048-byte slot are dropped.
		 */
		for (i = 0; i < iovcnt; i++) {
			if (len + iov[i].iov_len > 2048)
				break;
			memcpy(&buf[len], iov[i].iov_base, iov[i].iov_len);
			len += iov[i].iov_len;
		}
		ring->slot[cur].len = len;
		ring->head = ring->cur = nm_ring_next(ring, cur);
		nmd->cur_tx_ring = r;
		ioctl(nmd->fd, NIOCTXSYNC, NULL);
		break;
	}

	return (len);
}

/*
 * Read one packet from a netmap RX ring into the supplied iovec,
 * truncating to the available chain space; unused iovecs get zero
 * length.  Returns the number of bytes copied (0 if nothing pending).
 */
static __inline int
pci_vtnet_netmap_readv(struct nm_desc *nmd, struct iovec *iov, int iovcnt)
{
	int len = 0;
	int i = 0;
	int r;

	for (r = nmd->cur_rx_ring; ; ) {
		struct netmap_ring *ring = NETMAP_RXRING(nmd->nifp, r);
		uint32_t cur, idx;
		char *buf;
		size_t left;

		if (nm_ring_empty(ring)) {
			/* This ring is empty; try the next, wrapping once. */
			r++;
			if (r > nmd->last_rx_ring)
				r = nmd->first_rx_ring;
			if (r == nmd->cur_rx_ring)
				break;
			continue;
		}
		cur = ring->cur;
		idx = ring->slot[cur].buf_idx;
		buf = NETMAP_BUF(ring, idx);
		left = ring->slot[cur].len;

		for (i = 0; i < iovcnt && left > 0; i++) {
			if (iov[i].iov_len > left)
				iov[i].iov_len = left;
			memcpy(iov[i].iov_base, &buf[len], iov[i].iov_len);
			len += iov[i].iov_len;
			left -= iov[i].iov_len;
		}
		ring->head = ring->cur = nm_ring_next(ring, cur);
		nmd->cur_rx_ring = r;
		ioctl(nmd->fd, NIOCRXSYNC, NULL);
		break;
	}
	/* Zero out the iovecs we did not fill. */
	for (; i < iovcnt; i++)
		iov[i].iov_len = 0;

	return (len);
}

/*
 * Called to send a buffer chain out to the vale port
 */
static void
pci_vtnet_netmap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
		    int len)
{
	static char pad[60]; /* all zero bytes */

	if (sc->vsc_nmd == NULL)
		return;

	/*
	 * If the length is < 60, pad out to that and add the
	 * extra zero'd segment to the iov. It is guaranteed that
	 * there is always an extra iov available by the caller.
	 */
	if (len < 60) {
		iov[iovcnt].iov_base = pad;
		iov[iovcnt].iov_len = 60 - len;
		iovcnt++;
	}
	(void) pci_vtnet_netmap_writev(sc->vsc_nmd, iov, iovcnt);
}

/*
 * RX path for the netmap/vale backend; mirrors pci_vtnet_tap_rx().
 */
static void
pci_vtnet_netmap_rx(struct pci_vtnet_softc *sc)
{
	struct iovec iov[VTNET_MAXSEGS], *riov;
	struct vqueue_info *vq;
	void *vrx;
	int len, n;
	uint16_t idx;

	/*
	 * Should never be called without a valid netmap descriptor
	 */
	assert(sc->vsc_nmd != NULL);

	/*
	 * But, will be called when the rx ring hasn't yet
	 * been set up.
	 */
	if (!sc->vsc_rx_ready) {
		/*
		 * Drop the packet and try later.
		 */
		(void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf);
		return;
	}

	/*
	 * Check for available rx buffers
	 */
	vq = &sc->vsc_queues[VTNET_RXQ];
	if (!vq_has_descs(vq)) {
		/*
		 * Drop the packet and try later.  Interrupt on
		 * empty, if that's negotiated.
		 */
		(void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf);
		vq_endchains(vq, 1);
		return;
	}

	do {
		/*
		 * Get descriptor chain.
		 */
		n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
		assert(n >= 1 && n <= VTNET_MAXSEGS);

		/*
		 * Get a pointer to the rx header, and use the
		 * data immediately following it for the packet buffer.
		 */
		vrx = iov[0].iov_base;
		riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen);

		len = pci_vtnet_netmap_readv(sc->vsc_nmd, riov, n);

		if (len == 0) {
			/*
			 * No more packets, but still some avail ring
			 * entries.  Return the unused chain and interrupt
			 * if needed/appropriate.
			 */
			vq_retchain(vq);
			vq_endchains(vq, 0);
			return;
		}

		/*
		 * The only valid field in the rx packet header is the
		 * number of buffers if merged rx bufs were negotiated.
		 */
		memset(vrx, 0, sc->rx_vhdrlen);

		if (sc->rx_merge) {
			struct virtio_net_rxhdr *vrxh;

			vrxh = vrx;
			vrxh->vrh_bufs = 1;
		}

		/*
		 * Release this chain and handle more chains.
		 */
		vq_relchain(vq, idx, len + sc->rx_vhdrlen);
	} while (vq_has_descs(vq));

	/* Interrupt if needed, including for NOTIFY_ON_EMPTY. */
	vq_endchains(vq, 1);
}
#endif /* __FreeBSD__ */

#ifdef __FreeBSD__
/*
 * mevent callback: backend fd became readable; run the RX path with
 * rx_mtx held.
 */
static void
pci_vtnet_rx_callback(int fd, enum ev_type type, void *param)
{
	struct pci_vtnet_softc *sc = param;

	pthread_mutex_lock(&sc->rx_mtx);
	sc->pci_vtnet_rx(sc);
	pthread_mutex_unlock(&sc->rx_mtx);

}
#else
/*
 * illumos RX poll thread: block in poll(2) on the DLPI notification fd
 * and run the tap RX path (under vsc_mtx) whenever data arrives.
 */
static void *
pci_vtnet_poll_thread(void *param)
{
	struct pci_vtnet_softc *sc = param;
	pollfd_t pollset;

	pollset.fd = sc->vsc_dlpifd;
	pollset.events = POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND;

	for (;;) {
		if (poll(&pollset, 1, -1) < 0) {
			if (errno == EINTR)
				continue;
			fprintf(stderr, "pci_vtnet_poll_thread poll() error %d\n", errno);
			continue;
		}
		pthread_mutex_lock(&sc->vsc_mtx);
		pci_vtnet_tap_rx(sc);
		pthread_mutex_unlock(&sc->vsc_mtx);
	}

	return (NULL);
}
#endif /* __FreeBSD__ */

/*
 * RX queue notify: the first kick marks the ring ready and disables
 * further RX kicks -- the backend drives RX from here on.
 */
static void
pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq)
{
	struct pci_vtnet_softc *sc = vsc;

	/*
	 * A qnotify means that the rx process can now begin
	 */
	if (sc->vsc_rx_ready == 0) {
		sc->vsc_rx_ready = 1;
		vq_kick_disable(vq);
	}
}

/*
 * Process one TX descriptor chain: segment 0 carries the virtio-net
 * header (not transmitted), the remaining segments form the packet
 * handed to the backend TX routine.
 */
static void
pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq)
{
	struct iovec iov[VTNET_MAXSEGS + 1];
	int i, n;
	int plen, tlen;
	uint16_t idx;

	/*
	 * Obtain chain of descriptors.  The first one is
	 * really the header descriptor, so we need to sum
	 * up two lengths: packet length and transfer length.
	 */
	n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
	assert(n >= 1 && n <= VTNET_MAXSEGS);
	plen = 0;
	tlen = iov[0].iov_len;
	for (i = 1; i < n; i++) {
		plen += iov[i].iov_len;
		tlen += iov[i].iov_len;
	}

	DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, n));
	sc->pci_vtnet_tx(sc, &iov[1], n - 1, plen);

	/* chain is processed, release it and set tlen */
	vq_relchain(vq, idx, tlen);
}

/*
 * TX queue notify: wake the TX thread if it is idle.
 */
static void
pci_vtnet_ping_txq(void *vsc, struct vqueue_info *vq)
{
	struct pci_vtnet_softc *sc = vsc;

	/*
	 * Any ring entries to process?
	 */
	if (!vq_has_descs(vq))
		return;

	/* Signal the tx thread for processing */
	pthread_mutex_lock(&sc->tx_mtx);
	vq_kick_disable(vq);
	if (sc->tx_in_progress == 0)
		pthread_cond_signal(&sc->tx_cond);
	pthread_mutex_unlock(&sc->tx_mtx);
}

/*
 * Thread which will handle processing of TX desc
 */
static void *
pci_vtnet_tx_thread(void *param)
{
	struct pci_vtnet_softc *sc = param;
	struct vqueue_info *vq;
	int error;

	vq = &sc->vsc_queues[VTNET_TXQ];

	/*
	 * Let us wait till the tx queue pointers get initialised &
	 * first tx signaled
	 */
	pthread_mutex_lock(&sc->tx_mtx);
	error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx);
	assert(error == 0);

	for (;;) {
		/* note - tx mutex is locked here */
		while (sc->resetting || !vq_has_descs(vq)) {
			/*
			 * Re-enable kicks before sleeping so a new TX
			 * notification will wake us, then re-check to close
			 * the race between the test above and enabling.
			 */
			vq_kick_enable(vq);
			if (!sc->resetting && vq_has_descs(vq))
				break;

			sc->tx_in_progress = 0;
			error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx);
			assert(error == 0);
		}
		vq_kick_disable(vq);
		sc->tx_in_progress = 1;
		pthread_mutex_unlock(&sc->tx_mtx);

		do {
			/*
			 * Run through entries, placing them into
			 * iovecs and sending when an end-of-packet
			 * is found
			 */
			pci_vtnet_proctx(sc, vq);
		} while (vq_has_descs(vq));

		/*
		 * Generate an interrupt if needed.
		 */
		vq_endchains(vq, 1);

		pthread_mutex_lock(&sc->tx_mtx);
	}
	return (NULL);
}

#ifdef __FreeBSD__
/* Control queue notify: not implemented, just log it. */
static void
pci_vtnet_ping_ctlq(void *vsc, struct vqueue_info *vq)
{

	DPRINTF(("vtnet: control qnotify!\n\r"));
}
#endif /* __FreeBSD__ */

/*
 * Open and configure the tap (FreeBSD) or DLPI vnic (illumos) backend
 * named by "devname", then wire up the tap RX/TX entry points.
 * Failures leave the backend disabled but are not fatal.
 */
static void
pci_vtnet_tap_setup(struct pci_vtnet_softc *sc, char *devname)
{
	char tbuf[80];
#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
#endif
#ifndef __FreeBSD__
	uchar_t physaddr[DLPI_PHYSADDR_MAX];
	size_t physaddrlen = DLPI_PHYSADDR_MAX;
	int error;
#endif

	strcpy(tbuf, "/dev/");
	strlcat(tbuf, devname, sizeof(tbuf));

	sc->pci_vtnet_rx = pci_vtnet_tap_rx;
	sc->pci_vtnet_tx = pci_vtnet_tap_tx;
#ifdef __FreeBSD__
	sc->vsc_tapfd = open(tbuf, O_RDWR);
	if (sc->vsc_tapfd == -1) {
		WPRINTF(("open of tap device %s failed\n", tbuf));
		return;
	}

	/*
	 * Set non-blocking and register for read
	 * notifications with the event loop
	 */
	int opt = 1;
	if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) {
		WPRINTF(("tap device O_NONBLOCK failed\n"));
		close(sc->vsc_tapfd);
		sc->vsc_tapfd = -1;
	}

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
	if (caph_rights_limit(sc->vsc_tapfd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	sc->vsc_mevp = mevent_add(sc->vsc_tapfd,
				  EVF_READ,
				  pci_vtnet_rx_callback,
				  sc);
	if (sc->vsc_mevp == NULL) {
		WPRINTF(("Could not register event\n"));
		close(sc->vsc_tapfd);
		sc->vsc_tapfd = -1;
	}
#else
	if (dlpi_open(devname, &sc->vsc_dhp, DLPI_RAW) != DLPI_SUCCESS) {
		WPRINTF(("open of vnic device %s failed\n", devname));
	}

	/* Use the vnic's own MAC as the guest-visible MAC. */
	if (dlpi_get_physaddr(sc->vsc_dhp, DL_CURR_PHYS_ADDR, physaddr,
	    &physaddrlen) != DLPI_SUCCESS) {
		WPRINTF(("read MAC address of vnic device %s failed\n",
		    devname));
	}
856 if (physaddrlen != ETHERADDRL) { 857 WPRINTF(("bad MAC address len %d on vnic device %s\n", 858 physaddrlen, devname)); 859 } 860 memcpy(sc->vsc_config.mac, physaddr, ETHERADDRL); 861 862 if (dlpi_bind(sc->vsc_dhp, DLPI_ANY_SAP, NULL) != DLPI_SUCCESS) { 863 WPRINTF(("bind of vnic device %s failed\n", devname)); 864 } 865 866 if (dlpi_promiscon(sc->vsc_dhp, DL_PROMISC_PHYS) != DLPI_SUCCESS) { 867 WPRINTF(("enable promiscous mode(physical) of vnic device %s " 868 "failed\n", devname)); 869 } 870 if (dlpi_promiscon(sc->vsc_dhp, DL_PROMISC_SAP) != DLPI_SUCCESS) { 871 WPRINTF(("enable promiscous mode(SAP) of vnic device %s " 872 "failed\n", devname)); 873 } 874 875 sc->vsc_dlpifd = dlpi_fd(sc->vsc_dhp); 876 877 if (fcntl(sc->vsc_dlpifd, F_SETFL, O_NONBLOCK) < 0) { 878 WPRINTF(("enable O_NONBLOCK of vnic device %s failed\n", 879 devname)); 880 dlpi_close(sc->vsc_dhp); 881 sc->vsc_dlpifd = -1; 882 } 883 884 error = pthread_create(NULL, NULL, pci_vtnet_poll_thread, sc); 885 assert(error == 0); 886 #endif 887 } 888 889 #ifdef __FreeBSD__ 890 static void 891 pci_vtnet_netmap_setup(struct pci_vtnet_softc *sc, char *ifname) 892 { 893 sc->pci_vtnet_rx = pci_vtnet_netmap_rx; 894 sc->pci_vtnet_tx = pci_vtnet_netmap_tx; 895 896 sc->vsc_nmd = nm_open(ifname, NULL, 0, 0); 897 if (sc->vsc_nmd == NULL) { 898 WPRINTF(("open of netmap device %s failed\n", ifname)); 899 return; 900 } 901 902 sc->vsc_mevp = mevent_add(sc->vsc_nmd->fd, 903 EVF_READ, 904 pci_vtnet_rx_callback, 905 sc); 906 if (sc->vsc_mevp == NULL) { 907 WPRINTF(("Could not register event\n")); 908 nm_close(sc->vsc_nmd); 909 sc->vsc_nmd = NULL; 910 } 911 } 912 #endif /* __FreeBSD__ */ 913 914 static int 915 pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) 916 { 917 char tname[MAXCOMLEN + 1]; 918 struct pci_vtnet_softc *sc; 919 const char *env_msi; 920 char *devname; 921 char *vtopts; 922 #ifdef __FreeBSD__ 923 int mac_provided; 924 #endif 925 int use_msix; 926 927 sc = calloc(1, sizeof(struct 
pci_vtnet_softc)); 928 929 pthread_mutex_init(&sc->vsc_mtx, NULL); 930 931 vi_softc_linkup(&sc->vsc_vs, &vtnet_vi_consts, sc, pi, sc->vsc_queues); 932 sc->vsc_vs.vs_mtx = &sc->vsc_mtx; 933 934 sc->vsc_queues[VTNET_RXQ].vq_qsize = VTNET_RINGSZ; 935 sc->vsc_queues[VTNET_RXQ].vq_notify = pci_vtnet_ping_rxq; 936 sc->vsc_queues[VTNET_TXQ].vq_qsize = VTNET_RINGSZ; 937 sc->vsc_queues[VTNET_TXQ].vq_notify = pci_vtnet_ping_txq; 938 #ifdef __FreeBSD__ 939 sc->vsc_queues[VTNET_CTLQ].vq_qsize = VTNET_RINGSZ; 940 sc->vsc_queues[VTNET_CTLQ].vq_notify = pci_vtnet_ping_ctlq; 941 #endif 942 943 /* 944 * Use MSI if set by user 945 */ 946 use_msix = 1; 947 if ((env_msi = getenv("BHYVE_USE_MSI")) != NULL) { 948 if (strcasecmp(env_msi, "yes") == 0) 949 use_msix = 0; 950 } 951 952 /* 953 * Attempt to open the tap device and read the MAC address 954 * if specified 955 */ 956 #ifdef __FreeBSD__ 957 mac_provided = 0; 958 sc->vsc_tapfd = -1; 959 #endif 960 sc->vsc_nmd = NULL; 961 if (opts != NULL) { 962 #ifdef __FreeBSD__ 963 int err; 964 #endif 965 966 devname = vtopts = strdup(opts); 967 (void) strsep(&vtopts, ","); 968 969 #ifdef __FreBSD__ 970 if (vtopts != NULL) { 971 err = net_parsemac(vtopts, sc->vsc_config.mac); 972 if (err != 0) { 973 free(devname); 974 return (err); 975 } 976 mac_provided = 1; 977 } 978 #endif 979 980 #ifdef __FreeBSD__ 981 if (strncmp(devname, "vale", 4) == 0) 982 pci_vtnet_netmap_setup(sc, devname); 983 #endif 984 if (strncmp(devname, "tap", 3) == 0 || 985 strncmp(devname, "vmnet", 5) == 0) 986 pci_vtnet_tap_setup(sc, devname); 987 988 free(devname); 989 } 990 991 #ifdef __FreeBSD__ 992 if (!mac_provided) { 993 net_genmac(pi, sc->vsc_config.mac); 994 } 995 #endif 996 997 /* initialize config space */ 998 pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET); 999 pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); 1000 pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK); 1001 pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET); 1002 pci_set_cfgdata16(pi, PCIR_SUBVEND_0, 
	    VIRTIO_VENDOR);

	/* Link is up if we managed to open tap device or vale port. */
#ifdef	__FreeBSD__
	sc->vsc_config.status = (opts == NULL || sc->vsc_tapfd >= 0 ||
#else
	sc->vsc_config.status = (opts == NULL || sc->vsc_dlpifd >= 0 ||
#endif
	    sc->vsc_nmd != NULL);

	/* use BAR 1 to map MSI-X table and PBA, if we're using MSI-X */
	if (vi_intr_init(&sc->vsc_vs, 1, use_msix))
		return (1);

	/* use BAR 0 to map config regs in IO space */
	vi_set_io_bar(&sc->vsc_vs, 0);

	sc->resetting = 0;

	/* Pre-negotiation defaults: merged RX buffers, full header. */
	sc->rx_merge = 1;
	sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr);
	pthread_mutex_init(&sc->rx_mtx, NULL);

	/*
	 * Initialize tx semaphore & spawn TX processing thread.
	 * As of now, only one thread for TX desc processing is
	 * spawned.
	 */
	sc->tx_in_progress = 0;
	pthread_mutex_init(&sc->tx_mtx, NULL);
	pthread_cond_init(&sc->tx_cond, NULL);
	pthread_create(&sc->tx_tid, NULL, pci_vtnet_tx_thread, (void *)sc);
	snprintf(tname, sizeof(tname), "vtnet-%d:%d tx", pi->pi_slot,
	    pi->pi_func);
	pthread_set_name_np(sc->tx_tid, tname);

	return (0);
}

/*
 * Guest write to device config space: only the 6-byte MAC is writable;
 * all other offsets are ignored.
 */
static int
pci_vtnet_cfgwrite(void *vsc, int offset, int size, uint32_t value)
{
	struct pci_vtnet_softc *sc = vsc;
	void *ptr;

	if (offset < 6) {
		assert(offset + size <= 6);
		/*
		 * The driver is allowed to change the MAC address
		 */
		ptr = &sc->vsc_config.mac[offset];
		memcpy(ptr, &value, size);
	} else {
		/* silently ignore other writes */
		DPRINTF(("vtnet: write to readonly reg %d\n\r", offset));
	}

	return (0);
}

/*
 * Guest read of device config space: straight copy out of vsc_config.
 */
static int
pci_vtnet_cfgread(void *vsc, int offset, int size, uint32_t *retval)
{
	struct pci_vtnet_softc *sc = vsc;
	void *ptr;

	ptr = (uint8_t *)&sc->vsc_config + offset;
	memcpy(retval, ptr, size);
	return (0);
}

static void
pci_vtnet_neg_features(void *vsc, uint64_t negotiated_features)
{
	/*
	 * Record the negotiated feature set and shrink the RX header if
	 * the guest did not accept mergeable RX buffers.
	 */
	struct pci_vtnet_softc *sc = vsc;

	sc->vsc_features = negotiated_features;

	if (!(sc->vsc_features & VIRTIO_NET_F_MRG_RXBUF)) {
		sc->rx_merge = 0;
		/* non-merge rx header is 2 bytes shorter */
		sc->rx_vhdrlen -= 2;
	}
}

/* Device model registration with the PCI emulation framework. */
struct pci_devemu pci_de_vnet = {
	.pe_emu = 	"virtio-net",
	.pe_init =	pci_vtnet_init,
	.pe_barwrite =	vi_pci_write,
	.pe_barread =	vi_pci_read
};
PCI_EMUL_SET(pci_de_vnet);