1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2011 NetApp, Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE 20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 * SUCH DAMAGE. 27 * 28 * $FreeBSD$ 29 */ 30 /* 31 * This file and its contents are supplied under the terms of the 32 * Common Development and Distribution License ("CDDL"), version 1.0. 33 * You may only use this file in accordance with the terms of version 34 * 1.0 of the CDDL. 35 * 36 * A full copy of the text of the CDDL should have accompanied this 37 * source. A copy of the CDDL is also available via the Internet at 38 * http://www.illumos.org/license/CDDL. 39 * 40 * Copyright 2013 Pluribus Networks Inc. 41 * Copyright 2018 Joyent, Inc. 
42 */ 43 44 #include <sys/cdefs.h> 45 __FBSDID("$FreeBSD$"); 46 47 #include <sys/param.h> 48 #ifndef WITHOUT_CAPSICUM 49 #include <sys/capsicum.h> 50 #endif 51 #include <sys/linker_set.h> 52 #include <sys/select.h> 53 #include <sys/uio.h> 54 #include <sys/ioctl.h> 55 #include <net/ethernet.h> 56 #ifdef __FreeBSD__ 57 #ifndef NETMAP_WITH_LIBS 58 #define NETMAP_WITH_LIBS 59 #endif 60 #include <net/netmap_user.h> 61 #endif 62 63 #ifndef WITHOUT_CAPSICUM 64 #include <capsicum_helpers.h> 65 #endif 66 #include <err.h> 67 #include <errno.h> 68 #include <fcntl.h> 69 #include <stdio.h> 70 #include <stdlib.h> 71 #include <stdint.h> 72 #include <string.h> 73 #include <strings.h> 74 #include <unistd.h> 75 #include <assert.h> 76 #include <md5.h> 77 #include <pthread.h> 78 #include <pthread_np.h> 79 #include <sysexits.h> 80 #ifndef __FreeBSD__ 81 #include <poll.h> 82 #include <libdlpi.h> 83 #endif 84 85 #include "bhyverun.h" 86 #include "debug.h" 87 #include "pci_emul.h" 88 #ifdef __FreeBSD__ 89 #include "mevent.h" 90 #endif 91 #include "virtio.h" 92 #include "net_utils.h" 93 94 #define VTNET_RINGSZ 1024 95 96 #define VTNET_MAXSEGS 256 97 98 /* 99 * Host capabilities. Note that we only offer a few of these. 
100 */ 101 #define VIRTIO_NET_F_CSUM (1 << 0) /* host handles partial cksum */ 102 #define VIRTIO_NET_F_GUEST_CSUM (1 << 1) /* guest handles partial cksum */ 103 #define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */ 104 #define VIRTIO_NET_F_GSO_DEPREC (1 << 6) /* deprecated: host handles GSO */ 105 #define VIRTIO_NET_F_GUEST_TSO4 (1 << 7) /* guest can rcv TSOv4 */ 106 #define VIRTIO_NET_F_GUEST_TSO6 (1 << 8) /* guest can rcv TSOv6 */ 107 #define VIRTIO_NET_F_GUEST_ECN (1 << 9) /* guest can rcv TSO with ECN */ 108 #define VIRTIO_NET_F_GUEST_UFO (1 << 10) /* guest can rcv UFO */ 109 #define VIRTIO_NET_F_HOST_TSO4 (1 << 11) /* host can rcv TSOv4 */ 110 #define VIRTIO_NET_F_HOST_TSO6 (1 << 12) /* host can rcv TSOv6 */ 111 #define VIRTIO_NET_F_HOST_ECN (1 << 13) /* host can rcv TSO with ECN */ 112 #define VIRTIO_NET_F_HOST_UFO (1 << 14) /* host can rcv UFO */ 113 #define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX buffers */ 114 #define VIRTIO_NET_F_STATUS (1 << 16) /* config status field available */ 115 #define VIRTIO_NET_F_CTRL_VQ (1 << 17) /* control channel available */ 116 #define VIRTIO_NET_F_CTRL_RX (1 << 18) /* control channel RX mode support */ 117 #define VIRTIO_NET_F_CTRL_VLAN (1 << 19) /* control channel VLAN filtering */ 118 #define VIRTIO_NET_F_GUEST_ANNOUNCE \ 119 (1 << 21) /* guest can send gratuitous pkts */ 120 121 #define VTNET_S_HOSTCAPS \ 122 ( VIRTIO_NET_F_MAC | VIRTIO_NET_F_MRG_RXBUF | VIRTIO_NET_F_STATUS | \ 123 VIRTIO_F_NOTIFY_ON_EMPTY | VIRTIO_RING_F_INDIRECT_DESC) 124 125 /* 126 * PCI config-space "registers" 127 */ 128 struct virtio_net_config { 129 uint8_t mac[6]; 130 uint16_t status; 131 uint16_t max_virtqueue_pairs; 132 uint16_t mtu; 133 } __packed; 134 135 /* 136 * Queue definitions. 
137 */ 138 #define VTNET_RXQ 0 139 #define VTNET_TXQ 1 140 #define VTNET_CTLQ 2 /* NB: not yet supported */ 141 142 #define VTNET_MAXQ 3 143 144 /* 145 * Fixed network header size 146 */ 147 struct virtio_net_rxhdr { 148 uint8_t vrh_flags; 149 uint8_t vrh_gso_type; 150 uint16_t vrh_hdr_len; 151 uint16_t vrh_gso_size; 152 uint16_t vrh_csum_start; 153 uint16_t vrh_csum_offset; 154 uint16_t vrh_bufs; 155 } __packed; 156 157 /* 158 * Debug printf 159 */ 160 static int pci_vtnet_debug; 161 #define DPRINTF(params) if (pci_vtnet_debug) PRINTLN params 162 #define WPRINTF(params) PRINTLN params 163 164 /* 165 * Per-device softc 166 */ 167 struct pci_vtnet_softc { 168 struct virtio_softc vsc_vs; 169 struct vqueue_info vsc_queues[VTNET_MAXQ - 1]; 170 pthread_mutex_t vsc_mtx; 171 struct mevent *vsc_mevp; 172 173 #ifdef __FreeBSD 174 int vsc_tapfd; 175 #else 176 dlpi_handle_t vsc_dhp; 177 int vsc_dlpifd; 178 #endif 179 struct nm_desc *vsc_nmd; 180 181 int vsc_rx_ready; 182 bool features_negotiated; /* protected by rx_mtx */ 183 int resetting; /* protected by tx_mtx */ 184 185 uint64_t vsc_features; /* negotiated features */ 186 187 struct virtio_net_config vsc_config; 188 struct virtio_consts vsc_consts; 189 190 pthread_mutex_t rx_mtx; 191 int rx_vhdrlen; 192 int rx_merge; /* merged rx bufs in use */ 193 194 pthread_t tx_tid; 195 pthread_mutex_t tx_mtx; 196 pthread_cond_t tx_cond; 197 int tx_in_progress; 198 199 void (*pci_vtnet_rx)(struct pci_vtnet_softc *sc); 200 void (*pci_vtnet_tx)(struct pci_vtnet_softc *sc, struct iovec *iov, 201 int iovcnt, int len); 202 }; 203 204 static void pci_vtnet_reset(void *); 205 /* static void pci_vtnet_notify(void *, struct vqueue_info *); */ 206 static int pci_vtnet_cfgread(void *, int, int, uint32_t *); 207 static int pci_vtnet_cfgwrite(void *, int, int, uint32_t); 208 static void pci_vtnet_neg_features(void *, uint64_t); 209 210 static struct virtio_consts vtnet_vi_consts = { 211 "vtnet", /* our name */ 212 VTNET_MAXQ - 1, /* we currently 
				   support 2 virtqueues */
	sizeof(struct virtio_net_config), /* config reg size */
	pci_vtnet_reset,	/* reset */
	NULL,			/* device-wide qnotify -- not used */
	pci_vtnet_cfgread,	/* read PCI config */
	pci_vtnet_cfgwrite,	/* write PCI config */
	pci_vtnet_neg_features,	/* apply negotiated features */
	VTNET_S_HOSTCAPS,	/* our capabilities */
};

/*
 * Device reset handler.
 *
 * Lock order is rx_mtx then tx_mtx: RX processing is blocked first, then
 * the TX thread is asked to quiesce before rings and negotiated state are
 * torn down.
 */
static void
pci_vtnet_reset(void *vsc)
{
	struct pci_vtnet_softc *sc = vsc;

	DPRINTF(("vtnet: device reset requested !"));

	/* Acquire the RX lock to block RX processing. */
	pthread_mutex_lock(&sc->rx_mtx);

	sc->features_negotiated = false;

	/* Set sc->resetting and give a chance to the TX thread to stop. */
	pthread_mutex_lock(&sc->tx_mtx);
	sc->resetting = 1;
	while (sc->tx_in_progress) {
		/*
		 * Drop tx_mtx while the TX thread finishes its current
		 * batch; it clears tx_in_progress under tx_mtx.
		 */
		pthread_mutex_unlock(&sc->tx_mtx);
		usleep(10000);
		pthread_mutex_lock(&sc->tx_mtx);
	}

	/* Back to pre-negotiation defaults: merged bufs, full-size header. */
	sc->vsc_rx_ready = 0;
	sc->rx_merge = 1;
	sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr);

	/*
	 * Now reset rings, MSI-X vectors, and negotiated capabilities.
	 * Do that with the TX lock held, since we need to reset
	 * sc->resetting.
	 */
	vi_reset_dev(&sc->vsc_vs);

	sc->resetting = 0;
	pthread_mutex_unlock(&sc->tx_mtx);
	pthread_mutex_unlock(&sc->rx_mtx);
}

/*
 * Called to send a buffer chain out to the tap device
 */
#ifdef __FreeBSD__
static void
pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
		 int len)
{
	static char pad[60]; /* all zero bytes */

	if (sc->vsc_tapfd == -1)
		return;

	/*
	 * If the length is < 60, pad out to that and add the
	 * extra zero'd segment to the iov. It is guaranteed that
	 * there is always an extra iov available by the caller.
	 */
	if (len < 60) {
		iov[iovcnt].iov_base = pad;
		iov[iovcnt].iov_len = 60 - len;
		iovcnt++;
	}
	/* Best-effort write; short writes/errors are deliberately ignored. */
	(void) writev(sc->vsc_tapfd, iov, iovcnt);
}
#else
/*
 * illumos variant: emit each iovec segment through the DLPI handle.
 * NOTE(review): unlike the FreeBSD writev() path, this sends one
 * dlpi_send() per segment rather than one frame per chain — presumably
 * the guest posts single-segment packets here; verify against callers.
 */
static void
pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
    int len)
{
	int i;

	for (i = 0; i < iovcnt; i++) {
		(void) dlpi_send(sc->vsc_dhp, NULL, 0,
		    iov[i].iov_base, iov[i].iov_len, NULL);
	}
}
#endif /* __FreeBSD__ */

#ifdef __FreeBSD__
/*
 * Called when there is read activity on the tap file descriptor.
 * Each buffer posted by the guest is assumed to be able to contain
 * an entire ethernet frame + rx header.
 * MP note: the dummybuf is only used for discarding frames, so there
 * is no need for it to be per-vtnet or locked.
 */
static uint8_t dummybuf[2048];
#endif /* __FreeBSD__ */

/*
 * Strip the first tlen bytes (the virtio rx header) off the front of
 * the descriptor chain, returning a pointer to the first iovec that
 * still has payload room.  *niov is decremented if the first segment
 * is consumed entirely.
 */
static __inline struct iovec *
rx_iov_trim(struct iovec *iov, int *niov, int tlen)
{
	struct iovec *riov;

	/* XXX short-cut: assume first segment is >= tlen */
	assert(iov[0].iov_len >= tlen);

	iov[0].iov_len -= tlen;
	if (iov[0].iov_len == 0) {
		/* First segment held exactly the header; drop it. */
		assert(*niov > 1);
		*niov -= 1;
		riov = &iov[1];
	} else {
		iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + tlen);
		riov = &iov[0];
	}

	return (riov);
}

static void
pci_vtnet_tap_rx(struct pci_vtnet_softc *sc)
{
	struct iovec iov[VTNET_MAXSEGS], *riov;
	struct vqueue_info *vq;
	void *vrx;
	int n;
#ifdef __FreeBSD__
	int len;
#else
	size_t len;
	int ret;
#endif
	uint16_t idx;

	/*
	 * Should never be called without a valid tap fd
	 */
#ifdef __FreeBSD__
	assert(sc->vsc_tapfd != -1);
#else
	assert(sc->vsc_dlpifd != -1);
#endif

	/* Features must be negotiated */
	if (!sc->features_negotiated) {
		return;
	}

	/*
	 * But, will be called when the rx ring hasn't yet
	 * been set
up.
	 */
	if (!sc->vsc_rx_ready) {
#ifdef __FreeBSD__
		/*
		 * Drop the packet and try later.
		 */
		(void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
#endif
		return;
	}

	/*
	 * Check for available rx buffers
	 */
	vq = &sc->vsc_queues[VTNET_RXQ];
	if (!vq_has_descs(vq)) {
		/*
		 * Drop the packet and try later.  Interrupt on
		 * empty, if that's negotiated.
		 */
#ifdef __FreeBSD__
		(void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
#endif
		vq_endchains(vq, 1);
		return;
	}

	do {
		/*
		 * Get descriptor chain
		 */
		n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
		assert(n >= 1 && n <= VTNET_MAXSEGS);

		/*
		 * Get a pointer to the rx header, and use the
		 * data immediately following it for the packet buffer.
		 */
		vrx = iov[0].iov_base;
		riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen);
#ifdef __FreeBSD__
		len = readv(sc->vsc_tapfd, riov, n);
#else
		/*
		 * dlpi_recv() fills only the first segment; on failure we
		 * fake an EWOULDBLOCK so the common exit path below runs.
		 * (len is size_t here, so "len <= 0" only matches len == 0.)
		 */
		len = riov[0].iov_len;
		ret = dlpi_recv(sc->vsc_dhp, NULL, NULL,
		    (uint8_t *)riov[0].iov_base, &len, 0, NULL);
		if (ret != DLPI_SUCCESS) {
			errno = EWOULDBLOCK;
			len = 0;
		}
#endif
		if (len <= 0 && errno == EWOULDBLOCK) {
			/*
			 * No more packets, but still some avail ring
			 * entries.  Interrupt if needed/appropriate.
			 */
			vq_retchains(vq, 1);
			vq_endchains(vq, 0);
			return;
		}

		/*
		 * The only valid field in the rx packet header is the
		 * number of buffers if merged rx bufs were negotiated.
		 */
		memset(vrx, 0, sc->rx_vhdrlen);

		if (sc->rx_merge) {
			struct virtio_net_rxhdr *vrxh;

			vrxh = vrx;
			vrxh->vrh_bufs = 1;
		}

		/*
		 * Release this chain and handle more chains.
		 */
		vq_relchain(vq, idx, len + sc->rx_vhdrlen);
	} while (vq_has_descs(vq));

	/* Interrupt if needed, including for NOTIFY_ON_EMPTY.
	 */
	vq_endchains(vq, 1);
}

#ifdef __FreeBSD__
/*
 * Copy a guest TX chain into one free netmap slot, scanning TX rings
 * round-robin starting at cur_tx_ring.  Returns the number of bytes
 * queued (0 if every ring was full).
 * NOTE(review): segments are dropped once the copy reaches 2048 bytes —
 * presumably the netmap buffer size; verify against the netmap config.
 */
static __inline int
pci_vtnet_netmap_writev(struct nm_desc *nmd, struct iovec *iov, int iovcnt)
{
	int r, i;
	int len = 0;

	for (r = nmd->cur_tx_ring; ; ) {
		struct netmap_ring *ring = NETMAP_TXRING(nmd->nifp, r);
		uint32_t cur, idx;
		char *buf;

		if (nm_ring_empty(ring)) {
			/* Ring full: advance (with wrap); stop after a
			 * full cycle back to the starting ring. */
			r++;
			if (r > nmd->last_tx_ring)
				r = nmd->first_tx_ring;
			if (r == nmd->cur_tx_ring)
				break;
			continue;
		}
		cur = ring->cur;
		idx = ring->slot[cur].buf_idx;
		buf = NETMAP_BUF(ring, idx);

		for (i = 0; i < iovcnt; i++) {
			if (len + iov[i].iov_len > 2048)
				break;
			memcpy(&buf[len], iov[i].iov_base, iov[i].iov_len);
			len += iov[i].iov_len;
		}
		ring->slot[cur].len = len;
		ring->head = ring->cur = nm_ring_next(ring, cur);
		nmd->cur_tx_ring = r;
		ioctl(nmd->fd, NIOCTXSYNC, NULL);
		break;
	}

	return (len);
}

/*
 * Copy one received netmap frame into the guest-supplied iovec array,
 * scanning RX rings round-robin.  Unused trailing iovecs have their
 * length zeroed so the caller can compute the frame size.  Returns the
 * number of bytes copied (0 if no frame was available).
 */
static __inline int
pci_vtnet_netmap_readv(struct nm_desc *nmd, struct iovec *iov, int iovcnt)
{
	int len = 0;
	int i = 0;
	int r;

	for (r = nmd->cur_rx_ring; ; ) {
		struct netmap_ring *ring = NETMAP_RXRING(nmd->nifp, r);
		uint32_t cur, idx;
		char *buf;
		size_t left;

		if (nm_ring_empty(ring)) {
			/* Ring empty: advance (with wrap); stop after a
			 * full cycle back to the starting ring. */
			r++;
			if (r > nmd->last_rx_ring)
				r = nmd->first_rx_ring;
			if (r == nmd->cur_rx_ring)
				break;
			continue;
		}
		cur = ring->cur;
		idx = ring->slot[cur].buf_idx;
		buf = NETMAP_BUF(ring, idx);
		left = ring->slot[cur].len;

		for (i = 0; i < iovcnt && left > 0; i++) {
			if (iov[i].iov_len > left)
				iov[i].iov_len = left;
			memcpy(iov[i].iov_base, &buf[len], iov[i].iov_len);
			len += iov[i].iov_len;
			left -= iov[i].iov_len;
		}
		ring->head = ring->cur = nm_ring_next(ring, cur);
		nmd->cur_rx_ring = r;
		ioctl(nmd->fd, NIOCRXSYNC, NULL);
		break;
	}
	/* Zero out the iovecs we did not fill. */
	for (; i < iovcnt; i++)
		iov[i].iov_len = 0;

	return (len);
}

/*
 * Called to send a buffer chain out to the vale port
 */
static void
pci_vtnet_netmap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
		    int len)
{
	static char pad[60]; /* all zero bytes */

	if (sc->vsc_nmd == NULL)
		return;

	/*
	 * If the length is < 60, pad out to that and add the
	 * extra zero'd segment to the iov. It is guaranteed that
	 * there is always an extra iov available by the caller.
	 */
	if (len < 60) {
		iov[iovcnt].iov_base = pad;
		iov[iovcnt].iov_len = 60 - len;
		iovcnt++;
	}
	(void) pci_vtnet_netmap_writev(sc->vsc_nmd, iov, iovcnt);
}

/*
 * RX path for the netmap backend; mirrors pci_vtnet_tap_rx() but pulls
 * frames via pci_vtnet_netmap_readv().
 */
static void
pci_vtnet_netmap_rx(struct pci_vtnet_softc *sc)
{
	struct iovec iov[VTNET_MAXSEGS], *riov;
	struct vqueue_info *vq;
	void *vrx;
	int len, n;
	uint16_t idx;

	/*
	 * Should never be called without a valid netmap descriptor
	 */
	assert(sc->vsc_nmd != NULL);

	/* Features must be negotiated */
	if (!sc->features_negotiated) {
		return;
	}

	/*
	 * But, will be called when the rx ring hasn't yet
	 * been set up.
	 */
	if (!sc->vsc_rx_ready) {
		/*
		 * Drop the packet and try later.
		 */
		(void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf);
		return;
	}

	/*
	 * Check for available rx buffers
	 */
	vq = &sc->vsc_queues[VTNET_RXQ];
	if (!vq_has_descs(vq)) {
		/*
		 * Drop the packet and try later.  Interrupt on
		 * empty, if that's negotiated.
		 */
		(void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf);
		vq_endchains(vq, 1);
		return;
	}

	do {
		/*
		 * Get descriptor chain.
		 */
		n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
		assert(n >= 1 && n <= VTNET_MAXSEGS);

		/*
		 * Get a pointer to the rx header, and use the
		 * data immediately following it for the packet buffer.
		 */
		vrx = iov[0].iov_base;
		riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen);

		len = pci_vtnet_netmap_readv(sc->vsc_nmd, riov, n);

		if (len == 0) {
			/*
			 * No more packets, but still some avail ring
			 * entries.  Interrupt if needed/appropriate.
			 */
			vq_retchain(vq);
			vq_endchains(vq, 0);
			return;
		}

		/*
		 * The only valid field in the rx packet header is the
		 * number of buffers if merged rx bufs were negotiated.
		 */
		memset(vrx, 0, sc->rx_vhdrlen);

		if (sc->rx_merge) {
			struct virtio_net_rxhdr *vrxh;

			vrxh = vrx;
			vrxh->vrh_bufs = 1;
		}

		/*
		 * Release this chain and handle more chains.
		 */
		vq_relchain(vq, idx, len + sc->rx_vhdrlen);
	} while (vq_has_descs(vq));

	/* Interrupt if needed, including for NOTIFY_ON_EMPTY. */
	vq_endchains(vq, 1);
}
#endif /* __FreeBSD__ */

#ifdef __FreeBSD__
/*
 * mevent callback: fires on read activity of the backend fd and
 * dispatches to the backend-specific RX handler under rx_mtx.
 */
static void
pci_vtnet_rx_callback(int fd, enum ev_type type, void *param)
{
	struct pci_vtnet_softc *sc = param;

	pthread_mutex_lock(&sc->rx_mtx);
	sc->pci_vtnet_rx(sc);
	pthread_mutex_unlock(&sc->rx_mtx);

}
#else
/*
 * illumos RX thread: blocks in poll(2) on the DLPI fd and feeds
 * pci_vtnet_tap_rx() on each wakeup.
 * NOTE(review): this path serializes on vsc_mtx while the FreeBSD
 * callback uses rx_mtx — confirm the rx_mtx-protected fields are safe
 * under this locking.
 */
static void *
pci_vtnet_poll_thread(void *param)
{
	struct pci_vtnet_softc *sc = param;
	pollfd_t pollset;

	pollset.fd = sc->vsc_dlpifd;
	pollset.events = POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND;

	for (;;) {
		if (poll(&pollset, 1, -1) < 0) {
			if (errno == EINTR)
				continue;
			fprintf(stderr, "pci_vtnet_poll_thread poll() error %d\n", errno);
			continue;
		}
		pthread_mutex_lock(&sc->vsc_mtx);
		pci_vtnet_tap_rx(sc);
		pthread_mutex_unlock(&sc->vsc_mtx);
	}

	return (NULL);
}
#endif /* __FreeBSD__ */

static void
pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq)
{
	struct pci_vtnet_softc *sc = vsc;

	/*
	 * A qnotify means that the rx process can now begin.
	 * Enable RX only if features are negotiated.
	 */
	pthread_mutex_lock(&sc->rx_mtx);
	if (sc->vsc_rx_ready == 0 && sc->features_negotiated) {
		sc->vsc_rx_ready = 1;
		/* Stop further RX kicks; polling/mevent drives RX now. */
		vq_kick_disable(vq);
	}
	pthread_mutex_unlock(&sc->rx_mtx);
}

/*
 * Process a single TX descriptor chain: hand the payload segments
 * (everything after the header descriptor) to the backend TX routine
 * and return the whole chain to the guest.
 */
static void
pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq)
{
	struct iovec iov[VTNET_MAXSEGS + 1];
	int i, n;
	int plen, tlen;
	uint16_t idx;

	/*
	 * Obtain chain of descriptors.  The first one is
	 * really the header descriptor, so we need to sum
	 * up two lengths: packet length and transfer length.
	 */
	n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL);
	assert(n >= 1 && n <= VTNET_MAXSEGS);
	plen = 0;
	tlen = iov[0].iov_len;
	for (i = 1; i < n; i++) {
		plen += iov[i].iov_len;
		tlen += iov[i].iov_len;
	}

	DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, n));
	/* Skip iov[0] (the virtio header); backends may append one pad iov. */
	sc->pci_vtnet_tx(sc, &iov[1], n - 1, plen);

	/* chain is processed, release it and set tlen */
	vq_relchain(vq, idx, tlen);
}

static void
pci_vtnet_ping_txq(void *vsc, struct vqueue_info *vq)
{
	struct pci_vtnet_softc *sc = vsc;

	/*
	 * Any ring entries to process?
	 */
	if (!vq_has_descs(vq))
		return;

	/* Signal the tx thread for processing */
	pthread_mutex_lock(&sc->tx_mtx);
	vq_kick_disable(vq);
	if (sc->tx_in_progress == 0)
		pthread_cond_signal(&sc->tx_cond);
	pthread_mutex_unlock(&sc->tx_mtx);
}

/*
 * Thread which will handle processing of TX desc
 */
static void *
pci_vtnet_tx_thread(void *param)
{
	struct pci_vtnet_softc *sc = param;
	struct vqueue_info *vq;
	int error;

	vq = &sc->vsc_queues[VTNET_TXQ];

	/*
	 * Let us wait till the tx queue pointers get initialised &
	 * first tx signaled
	 */
	pthread_mutex_lock(&sc->tx_mtx);
	error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx);
	assert(error == 0);

	for (;;) {
		/* note - tx mutex is locked here */
		while (sc->resetting || !vq_has_descs(vq)) {
			/*
			 * Re-enable guest kicks before sleeping, then
			 * re-check to close the race with a kick that
			 * arrived between the check and the enable.
			 */
			vq_kick_enable(vq);
			if (!sc->resetting && vq_has_descs(vq))
				break;

			/* tx_in_progress == 0 lets the reset path proceed. */
			sc->tx_in_progress = 0;
			error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx);
			assert(error == 0);
		}
		vq_kick_disable(vq);
		sc->tx_in_progress = 1;
		pthread_mutex_unlock(&sc->tx_mtx);

		do {
			/*
			 * Run through entries, placing them into
			 * iovecs and sending when an end-of-packet
			 * is found
			 */
			pci_vtnet_proctx(sc, vq);
		} while (vq_has_descs(vq));

		/*
		 * Generate an interrupt if needed.
		 */
		vq_endchains(vq, 1);

		pthread_mutex_lock(&sc->tx_mtx);
	}
	return (NULL);
}

#ifdef __FreeBSD__
/* Control queue notify: not implemented, just log. */
static void
pci_vtnet_ping_ctlq(void *vsc, struct vqueue_info *vq)
{

	DPRINTF(("vtnet: control qnotify!"));
}
#endif /* __FreeBSD__ */

/*
 * Open and configure the tap (FreeBSD) or vnic/DLPI (illumos) backend
 * named by devname, and wire up the backend RX/TX entry points.
 * Failures are reported via WPRINTF and leave the device link-down.
 */
static void
pci_vtnet_tap_setup(struct pci_vtnet_softc *sc, char *devname)
{
	char tbuf[80];
#ifndef WITHOUT_CAPSICUM
	cap_rights_t rights;
#endif
#ifndef __FreeBSD__
	uchar_t physaddr[DLPI_PHYSADDR_MAX];
	size_t physaddrlen = DLPI_PHYSADDR_MAX;
	int error;
#endif

	strcpy(tbuf, "/dev/");
	strlcat(tbuf, devname, sizeof(tbuf));

	sc->pci_vtnet_rx = pci_vtnet_tap_rx;
	sc->pci_vtnet_tx = pci_vtnet_tap_tx;
#ifdef __FreeBSD__
	sc->vsc_tapfd = open(tbuf, O_RDWR);
	if (sc->vsc_tapfd == -1) {
		WPRINTF(("open of tap device %s failed\n", tbuf));
		return;
	}

	/*
	 * Set non-blocking and register for read
	 * notifications with the event loop
	 */
	int opt = 1;
	if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) {
		WPRINTF(("tap device O_NONBLOCK failed\n"));
		close(sc->vsc_tapfd);
		sc->vsc_tapfd = -1;
	}

#ifndef WITHOUT_CAPSICUM
	cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE);
	if (caph_rights_limit(sc->vsc_tapfd, &rights) == -1)
		errx(EX_OSERR, "Unable to apply rights for sandbox");
#endif

	sc->vsc_mevp = mevent_add(sc->vsc_tapfd,
				  EVF_READ,
				  pci_vtnet_rx_callback,
				  sc);
	if (sc->vsc_mevp == NULL) {
		WPRINTF(("Could not register event\n"));
		close(sc->vsc_tapfd);
		sc->vsc_tapfd = -1;
	}
#else
	if (dlpi_open(devname, &sc->vsc_dhp, DLPI_RAW) != DLPI_SUCCESS) {
		WPRINTF(("open of vnic device %s failed\n", devname));
	}

	/* Read the vnic's MAC so it can be advertised to the guest. */
	if (dlpi_get_physaddr(sc->vsc_dhp, DL_CURR_PHYS_ADDR, physaddr,
	    &physaddrlen) != DLPI_SUCCESS) {
		WPRINTF(("read MAC address of vnic device %s failed\n",
		    devname));
	}

if (physaddrlen != ETHERADDRL) { 877 WPRINTF(("bad MAC address len %d on vnic device %s\n", 878 physaddrlen, devname)); 879 } 880 memcpy(sc->vsc_config.mac, physaddr, ETHERADDRL); 881 882 if (dlpi_bind(sc->vsc_dhp, DLPI_ANY_SAP, NULL) != DLPI_SUCCESS) { 883 WPRINTF(("bind of vnic device %s failed\n", devname)); 884 } 885 886 if (dlpi_promiscon(sc->vsc_dhp, DL_PROMISC_PHYS) != DLPI_SUCCESS) { 887 WPRINTF(("enable promiscous mode(physical) of vnic device %s " 888 "failed\n", devname)); 889 } 890 if (dlpi_promiscon(sc->vsc_dhp, DL_PROMISC_SAP) != DLPI_SUCCESS) { 891 WPRINTF(("enable promiscous mode(SAP) of vnic device %s " 892 "failed\n", devname)); 893 } 894 895 sc->vsc_dlpifd = dlpi_fd(sc->vsc_dhp); 896 897 if (fcntl(sc->vsc_dlpifd, F_SETFL, O_NONBLOCK) < 0) { 898 WPRINTF(("enable O_NONBLOCK of vnic device %s failed\n", 899 devname)); 900 dlpi_close(sc->vsc_dhp); 901 sc->vsc_dlpifd = -1; 902 } 903 904 error = pthread_create(NULL, NULL, pci_vtnet_poll_thread, sc); 905 assert(error == 0); 906 #endif 907 } 908 909 #ifdef __FreeBSD__ 910 static void 911 pci_vtnet_netmap_setup(struct pci_vtnet_softc *sc, char *ifname) 912 { 913 sc->pci_vtnet_rx = pci_vtnet_netmap_rx; 914 sc->pci_vtnet_tx = pci_vtnet_netmap_tx; 915 916 sc->vsc_nmd = nm_open(ifname, NULL, 0, 0); 917 if (sc->vsc_nmd == NULL) { 918 WPRINTF(("open of netmap device %s failed\n", ifname)); 919 return; 920 } 921 922 sc->vsc_mevp = mevent_add(sc->vsc_nmd->fd, 923 EVF_READ, 924 pci_vtnet_rx_callback, 925 sc); 926 if (sc->vsc_mevp == NULL) { 927 WPRINTF(("Could not register event\n")); 928 nm_close(sc->vsc_nmd); 929 sc->vsc_nmd = NULL; 930 } 931 } 932 #endif /* __FreeBSD__ */ 933 934 static int 935 pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) 936 { 937 struct pci_vtnet_softc *sc; 938 char tname[MAXCOMLEN + 1]; 939 #ifdef __FreeBSD__ 940 int mac_provided; 941 int mtu_provided; 942 unsigned long mtu = ETHERMTU; 943 #else 944 int use_msix = 1; 945 #endif 946 947 /* 948 * Allocate data 
structures for further virtio initializations. 949 * sc also contains a copy of vtnet_vi_consts, since capabilities 950 * change depending on the backend. 951 */ 952 sc = calloc(1, sizeof(struct pci_vtnet_softc)); 953 954 sc->vsc_consts = vtnet_vi_consts; 955 pthread_mutex_init(&sc->vsc_mtx, NULL); 956 957 sc->vsc_queues[VTNET_RXQ].vq_qsize = VTNET_RINGSZ; 958 sc->vsc_queues[VTNET_RXQ].vq_notify = pci_vtnet_ping_rxq; 959 sc->vsc_queues[VTNET_TXQ].vq_qsize = VTNET_RINGSZ; 960 sc->vsc_queues[VTNET_TXQ].vq_notify = pci_vtnet_ping_txq; 961 #ifdef notyet 962 sc->vsc_queues[VTNET_CTLQ].vq_qsize = VTNET_RINGSZ; 963 sc->vsc_queues[VTNET_CTLQ].vq_notify = pci_vtnet_ping_ctlq; 964 #endif 965 966 /* 967 * Attempt to open the backend device and read the MAC address 968 * if specified. 969 */ 970 #ifdef __FreeBSD__ 971 mac_provided = 0; 972 mtu_provided = 0; 973 #endif 974 if (opts != NULL) { 975 char *optscopy; 976 char *vtopts; 977 int err = 0; 978 979 /* Get the device name. */ 980 optscopy = vtopts = strdup(opts); 981 (void) strsep(&vtopts, ","); 982 983 #ifdef __FreeBSD__ 984 /* 985 * Parse the list of options in the form 986 * key1=value1,...,keyN=valueN. 
		 */
		while (vtopts != NULL) {
			char *value = vtopts;
			char *key;

			key = strsep(&value, "=");
			if (value == NULL)
				break;
			vtopts = value;
			(void) strsep(&vtopts, ",");

			if (strcmp(key, "mac") == 0) {
				err = net_parsemac(value, sc->vsc_config.mac);
				if (err)
					break;
				mac_provided = 1;
			} else if (strcmp(key, "mtu") == 0) {
				err = net_parsemtu(value, &mtu);
				if (err)
					break;

				/* Reject MTUs the device cannot honour. */
				if (mtu < VTNET_MIN_MTU || mtu > VTNET_MAX_MTU) {
					err = EINVAL;
					errno = EINVAL;
					break;
				}
				mtu_provided = 1;
			}
		}
#endif

#ifndef __FreeBSD__
		/* Use the already strsep(",")-ed optscopy */
		if (strncmp(optscopy, "tap", 3) == 0 ||
		    strncmp(optscopy, "vmnet", 5) == 0)
			pci_vtnet_tap_setup(sc, optscopy);
#endif

		free(optscopy);

		if (err) {
			free(sc);
			return (err);
		}

#ifdef __FreeBSD__
		/*
		 * NOTE(review): this netbe_* path references sc->vsc_be,
		 * which is not a member of the softc declared in this
		 * file — the FreeBSD side appears mid-conversion to the
		 * net backends API; confirm against the FreeBSD tree.
		 */
		err = netbe_init(&sc->vsc_be, opts, pci_vtnet_rx_callback,
		    sc);
		if (err) {
			free(sc);
			return (err);
		}

		sc->vsc_consts.vc_hv_caps |= VIRTIO_NET_F_MRG_RXBUF |
		    netbe_get_cap(sc->vsc_be);
#endif

	}

#ifdef __FreeBSD__
	if (!mac_provided) {
		net_genmac(pi, sc->vsc_config.mac);
	}

	sc->vsc_config.mtu = mtu;
	if (mtu_provided) {
		sc->vsc_consts.vc_hv_caps |= VIRTIO_NET_F_MTU;
	}
#endif

	/*
	 * Since we do not actually support multiqueue,
	 * set the maximum virtqueue pairs to 1.
	 */
	sc->vsc_config.max_virtqueue_pairs = 1;

	/* initialize config space */
	pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET);
	pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK);
	pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET);
	pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR);

	/* Link is up if we managed to open tap device or vale port.
*/ 1071 #ifdef __FreeBSD__ 1072 sc->vsc_config.status = (opts == NULL || sc->vsc_tapfd >= 0 || 1073 #else 1074 sc->vsc_config.status = (opts == NULL || sc->vsc_dlpifd >= 0 || 1075 sc->vsc_nmd != NULL); 1076 #endif 1077 1078 /* use BAR 1 to map MSI-X table and PBA, if we're using MSI-X */ 1079 if (vi_intr_init(&sc->vsc_vs, 1, use_msix)) 1080 return (1); 1081 1082 /* use BAR 0 to map config regs in IO space */ 1083 vi_set_io_bar(&sc->vsc_vs, 0); 1084 1085 sc->resetting = 0; 1086 1087 sc->rx_merge = 1; 1088 sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr); 1089 pthread_mutex_init(&sc->rx_mtx, NULL); 1090 1091 /* 1092 * Initialize tx semaphore & spawn TX processing thread. 1093 * As of now, only one thread for TX desc processing is 1094 * spawned. 1095 */ 1096 sc->tx_in_progress = 0; 1097 pthread_mutex_init(&sc->tx_mtx, NULL); 1098 pthread_cond_init(&sc->tx_cond, NULL); 1099 pthread_create(&sc->tx_tid, NULL, pci_vtnet_tx_thread, (void *)sc); 1100 snprintf(tname, sizeof(tname), "vtnet-%d:%d tx", pi->pi_slot, 1101 pi->pi_func); 1102 pthread_set_name_np(sc->tx_tid, tname); 1103 1104 return (0); 1105 } 1106 1107 static int 1108 pci_vtnet_cfgwrite(void *vsc, int offset, int size, uint32_t value) 1109 { 1110 struct pci_vtnet_softc *sc = vsc; 1111 void *ptr; 1112 1113 if (offset < 6) { 1114 assert(offset + size <= 6); 1115 /* 1116 * The driver is allowed to change the MAC address 1117 */ 1118 ptr = &sc->vsc_config.mac[offset]; 1119 memcpy(ptr, &value, size); 1120 } else { 1121 /* silently ignore other writes */ 1122 DPRINTF(("vtnet: write to readonly reg %d", offset)); 1123 } 1124 1125 return (0); 1126 } 1127 1128 static int 1129 pci_vtnet_cfgread(void *vsc, int offset, int size, uint32_t *retval) 1130 { 1131 struct pci_vtnet_softc *sc = vsc; 1132 void *ptr; 1133 1134 ptr = (uint8_t *)&sc->vsc_config + offset; 1135 memcpy(retval, ptr, size); 1136 return (0); 1137 } 1138 1139 static void 1140 pci_vtnet_neg_features(void *vsc, uint64_t negotiated_features) 1141 { 1142 struct 
pci_vtnet_softc *sc = vsc;

	sc->vsc_features = negotiated_features;

	if (!(sc->vsc_features & VIRTIO_NET_F_MRG_RXBUF)) {
		sc->rx_merge = 0;
		/* non-merge rx header is 2 bytes shorter */
		sc->rx_vhdrlen -= 2;
	}

	/* Publish negotiation completion to the RX path under rx_mtx. */
	pthread_mutex_lock(&sc->rx_mtx);
	sc->features_negotiated = true;
	pthread_mutex_unlock(&sc->rx_mtx);
}

/* Device emulation registration: "virtio-net" PCI slot option. */
struct pci_devemu pci_de_vnet = {
	.pe_emu = 	"virtio-net",
	.pe_init =	pci_vtnet_init,
	.pe_barwrite =	vi_pci_write,
	.pe_barread =	vi_pci_read
};
PCI_EMUL_SET(pci_de_vnet);