/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2013 Nexenta Inc. All rights reserved.
 * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
 * Copyright 2019 Joyent, Inc.
 * Copyright 2019 Joshua M. Clulow <josh@sysmgr.org>
 */

/* Based on the NetBSD virtio driver by Minoura Makoto. */
/*
 * Copyright (c) 2010 Minoura Makoto.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * VIRTIO NETWORK DRIVER
 */
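
/*
 * The driver registers itself with the MAC framework and drives the two
 * virtqueues provided by the device: one for received frames and one for
 * transmitted frames.  Buffers for both directions are preallocated and
 * tracked on free lists (see vioif_alloc_bufs()).  Small frames are copied
 * to or from those buffers; larger received frames may instead be loaned to
 * the networking stack (up to half of the receive buffers at a time), and
 * larger transmit frames are mapped directly with external DMA bindings.
 * Transmit descriptors are reclaimed opportunistically during transmit, by
 * a periodic timeout, and by the TX interrupt when the ring has been
 * exhausted.
 */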

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/param.h>
#include <sys/stropts.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/devops.h>
#include <sys/ksynch.h>
#include <sys/stat.h>
#include <sys/modctl.h>
#include <sys/debug.h>
#include <sys/pci.h>
#include <sys/ethernet.h>
#include <sys/vlan.h>
#include <sys/sysmacros.h>
#include <sys/smbios.h>

#include <sys/dlpi.h>
#include <sys/taskq.h>

#include <sys/pattr.h>
#include <sys/strsun.h>

#include <sys/random.h>
#include <sys/containerof.h>
#include <sys/stream.h>
#include <inet/tcp.h>

#include <sys/mac.h>
#include <sys/mac_provider.h>
#include <sys/mac_ether.h>

#include "virtio.h"
#include "vioif.h"


static int vioif_quiesce(dev_info_t *);
static int vioif_attach(dev_info_t *, ddi_attach_cmd_t);
static int vioif_detach(dev_info_t *, ddi_detach_cmd_t);
static boolean_t vioif_has_feature(vioif_t *, uint32_t);
static void vioif_reclaim_restart(vioif_t *);
static int vioif_m_stat(void *, uint_t, uint64_t *);
static void vioif_m_stop(void *);
static int vioif_m_start(void *);
static int vioif_m_multicst(void *, boolean_t, const uint8_t *);
static int vioif_m_setpromisc(void *, boolean_t);
static int vioif_m_unicst(void *, const uint8_t *);
static mblk_t *vioif_m_tx(void *, mblk_t *);
static int vioif_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
    const void *);
static int vioif_m_getprop(void *, const char *, mac_prop_id_t, uint_t, void *);
static void vioif_m_propinfo(void *, const char *, mac_prop_id_t,
    mac_prop_info_handle_t);
static boolean_t vioif_m_getcapab(void *, mac_capab_t, void *);
static uint_t vioif_add_rx(vioif_t *);


static struct cb_ops vioif_cb_ops = {
	.cb_rev = CB_REV,
	.cb_flag = D_MP | D_NEW,

	.cb_open = nulldev,
	.cb_close = nulldev,
	.cb_strategy = nodev,
	.cb_print = nodev,
	.cb_dump = nodev,
	.cb_read = nodev,
	.cb_write = nodev,
	.cb_ioctl = nodev,
	.cb_devmap = nodev,
	.cb_mmap = nodev,
	.cb_segmap = nodev,
	.cb_chpoll = nochpoll,
	.cb_prop_op = ddi_prop_op,
	.cb_str = NULL,
	.cb_aread = nodev,
	.cb_awrite = nodev,
};

static struct dev_ops vioif_dev_ops = {
	.devo_rev = DEVO_REV,
	.devo_refcnt = 0,

	.devo_attach = vioif_attach,
	.devo_detach = vioif_detach,
	.devo_quiesce = vioif_quiesce,

	.devo_cb_ops = &vioif_cb_ops,

	.devo_getinfo = NULL,
	.devo_identify = nulldev,
	.devo_probe = nulldev,
	.devo_reset = nodev,
	.devo_bus_ops = NULL,
	.devo_power = NULL,
};

static struct modldrv vioif_modldrv = {
	.drv_modops = &mod_driverops,
	.drv_linkinfo = "VIRTIO network driver",
	.drv_dev_ops = &vioif_dev_ops
};

static struct modlinkage vioif_modlinkage = {
	.ml_rev = MODREV_1,
	.ml_linkage = { &vioif_modldrv, NULL }
};

static mac_callbacks_t vioif_mac_callbacks = {
	.mc_getstat = vioif_m_stat,
	.mc_start = vioif_m_start,
	.mc_stop = vioif_m_stop,
	.mc_setpromisc = vioif_m_setpromisc,
	.mc_multicst = vioif_m_multicst,
	.mc_unicst = vioif_m_unicst,
	.mc_tx = vioif_m_tx,

	.mc_callbacks = (MC_GETCAPAB | MC_SETPROP |
	    MC_GETPROP | MC_PROPINFO),
	.mc_getcapab = vioif_m_getcapab,
	.mc_setprop = vioif_m_setprop,
	.mc_getprop = vioif_m_getprop,
	.mc_propinfo = vioif_m_propinfo,
};

static const uchar_t vioif_broadcast[ETHERADDRL] = {
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
};

/*
 * Interval for the periodic TX reclaim.
 */
uint_t vioif_reclaim_ms = 200;

/*
 * Allow the operator to override the kinds of interrupts we'll use for
 * vioif. This value defaults to -1 so that it can be overridden to 0 in
 * /etc/system.
 */
int vioif_allowed_int_types = -1;

/*
 * DMA attribute template for transmit and receive buffers. The SGL entry
 * count will be modified before using the template. Note that these
 * allocations are aligned so that VIOIF_HEADER_SKIP places the IP header in
 * received frames at the correct offset for the networking stack.
 */
ddi_dma_attr_t vioif_dma_attr_bufs = {
	.dma_attr_version = DMA_ATTR_V0,
	.dma_attr_addr_lo = 0x0000000000000000,
	.dma_attr_addr_hi = 0xFFFFFFFFFFFFFFFF,
	.dma_attr_count_max = 0x00000000FFFFFFFF,
	.dma_attr_align = VIOIF_HEADER_ALIGN,
	.dma_attr_burstsizes = 1,
	.dma_attr_minxfer = 1,
	.dma_attr_maxxfer = 0x00000000FFFFFFFF,
	.dma_attr_seg = 0x00000000FFFFFFFF,
	.dma_attr_sgllen = 0,
	.dma_attr_granular = 1,
	.dma_attr_flags = 0
};

/*
 * DMA attributes for mapping larger transmit buffers from the networking
 * stack. The requirements are quite loose, but note that the SGL entry length
 * field is 32-bit.
 */
ddi_dma_attr_t vioif_dma_attr_external = {
	.dma_attr_version = DMA_ATTR_V0,
	.dma_attr_addr_lo = 0x0000000000000000,
	.dma_attr_addr_hi = 0xFFFFFFFFFFFFFFFF,
	.dma_attr_count_max = 0x00000000FFFFFFFF,
	.dma_attr_align = 1,
	.dma_attr_burstsizes = 1,
	.dma_attr_minxfer = 1,
	.dma_attr_maxxfer = 0x00000000FFFFFFFF,
	.dma_attr_seg = 0x00000000FFFFFFFF,
	.dma_attr_sgllen = VIOIF_MAX_SEGS - 1,
	.dma_attr_granular = 1,
	.dma_attr_flags = 0
};


/*
 * VIRTIO NET MAC PROPERTIES
 */
#define	VIOIF_MACPROP_TXCOPY_THRESH	"_txcopy_thresh"
#define	VIOIF_MACPROP_TXCOPY_THRESH_DEF	300
#define	VIOIF_MACPROP_TXCOPY_THRESH_MAX	640

#define	VIOIF_MACPROP_RXCOPY_THRESH	"_rxcopy_thresh"
#define	VIOIF_MACPROP_RXCOPY_THRESH_DEF	300
#define	VIOIF_MACPROP_RXCOPY_THRESH_MAX	640

static char *vioif_priv_props[] = {
	VIOIF_MACPROP_TXCOPY_THRESH,
	VIOIF_MACPROP_RXCOPY_THRESH,
	NULL
};


static vioif_txbuf_t *
vioif_txbuf_alloc(vioif_t *vif)
{
	vioif_txbuf_t *tb;

	VERIFY(MUTEX_HELD(&vif->vif_mutex));

	if ((tb = list_remove_head(&vif->vif_txbufs)) != NULL) {
		vif->vif_ntxbufs_alloc++;
	}

	return (tb);
}

static void
vioif_txbuf_free(vioif_t *vif, vioif_txbuf_t *tb)
{
	VERIFY(MUTEX_HELD(&vif->vif_mutex));

	VERIFY3U(vif->vif_ntxbufs_alloc, >, 0);
	vif->vif_ntxbufs_alloc--;

	virtio_chain_clear(tb->tb_chain);
	list_insert_head(&vif->vif_txbufs, tb);
}

static vioif_rxbuf_t *
vioif_rxbuf_alloc(vioif_t *vif)
{
	vioif_rxbuf_t *rb;

	VERIFY(MUTEX_HELD(&vif->vif_mutex));

	if ((rb = list_remove_head(&vif->vif_rxbufs)) != NULL) {
		vif->vif_nrxbufs_alloc++;
	}

	return (rb);
}

static void
vioif_rxbuf_free(vioif_t *vif, vioif_rxbuf_t *rb)
{
	VERIFY(MUTEX_HELD(&vif->vif_mutex));

	VERIFY3U(vif->vif_nrxbufs_alloc, >, 0);
	vif->vif_nrxbufs_alloc--;

	virtio_chain_clear(rb->rb_chain);
	list_insert_head(&vif->vif_rxbufs, rb);
}
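
/*
 * This is the free routine supplied to desballoc(9F) (via rb_frtn) for
 * loaned receive buffers: it runs once the networking stack has released
 * such a buffer, returning it to the free list and attempting to replenish
 * the receive queue.
 */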
static void
vioif_rx_free_callback(caddr_t free_arg)
{
	vioif_rxbuf_t *rb = (vioif_rxbuf_t *)free_arg;
	vioif_t *vif = rb->rb_vioif;

	mutex_enter(&vif->vif_mutex);

	/*
	 * Return this receive buffer to the free list.
	 */
	vioif_rxbuf_free(vif, rb);

	VERIFY3U(vif->vif_nrxbufs_onloan, >, 0);
	vif->vif_nrxbufs_onloan--;

	/*
	 * Attempt to replenish the receive queue with at least the buffer we
	 * just freed. There isn't a great way to deal with failure here,
	 * though because we'll only loan at most half of the buffers there
	 * should always be at least some available even if this fails.
	 */
	(void) vioif_add_rx(vif);

	mutex_exit(&vif->vif_mutex);
}

static void
vioif_free_bufs(vioif_t *vif)
{
	VERIFY(MUTEX_HELD(&vif->vif_mutex));

	VERIFY3U(vif->vif_ntxbufs_alloc, ==, 0);
	for (uint_t i = 0; i < vif->vif_txbufs_capacity; i++) {
		vioif_txbuf_t *tb = &vif->vif_txbufs_mem[i];

		/*
		 * Ensure that this txbuf is now in the free list:
		 */
		VERIFY(list_link_active(&tb->tb_link));
		list_remove(&vif->vif_txbufs, tb);

		/*
		 * We should not have an mblk chain at this point.
		 */
		VERIFY3P(tb->tb_mp, ==, NULL);

		if (tb->tb_dma != NULL) {
			virtio_dma_free(tb->tb_dma);
			tb->tb_dma = NULL;
		}

		if (tb->tb_chain != NULL) {
			virtio_chain_free(tb->tb_chain);
			tb->tb_chain = NULL;
		}

		if (tb->tb_dmaext != NULL) {
			for (uint_t j = 0; j < tb->tb_dmaext_capacity; j++) {
				if (tb->tb_dmaext[j] != NULL) {
					virtio_dma_free(
					    tb->tb_dmaext[j]);
					tb->tb_dmaext[j] = NULL;
				}
			}

			kmem_free(tb->tb_dmaext,
			    sizeof (virtio_dma_t *) * tb->tb_dmaext_capacity);
			tb->tb_dmaext = NULL;
			tb->tb_dmaext_capacity = 0;
		}
	}
	VERIFY(list_is_empty(&vif->vif_txbufs));
	if (vif->vif_txbufs_mem != NULL) {
		kmem_free(vif->vif_txbufs_mem,
		    sizeof (vioif_txbuf_t) * vif->vif_txbufs_capacity);
		vif->vif_txbufs_mem = NULL;
		vif->vif_txbufs_capacity = 0;
	}

	VERIFY3U(vif->vif_nrxbufs_alloc, ==, 0);
	for (uint_t i = 0; i < vif->vif_rxbufs_capacity; i++) {
		vioif_rxbuf_t *rb = &vif->vif_rxbufs_mem[i];

		/*
		 * Ensure that this rxbuf is now in the free list:
		 */
		VERIFY(list_link_active(&rb->rb_link));
		list_remove(&vif->vif_rxbufs, rb);

		if (rb->rb_dma != NULL) {
			virtio_dma_free(rb->rb_dma);
			rb->rb_dma = NULL;
		}

		if (rb->rb_chain != NULL) {
			virtio_chain_free(rb->rb_chain);
			rb->rb_chain = NULL;
		}
	}
	VERIFY(list_is_empty(&vif->vif_rxbufs));
	if (vif->vif_rxbufs_mem != NULL) {
		kmem_free(vif->vif_rxbufs_mem,
		    sizeof (vioif_rxbuf_t) * vif->vif_rxbufs_capacity);
		vif->vif_rxbufs_mem = NULL;
		vif->vif_rxbufs_capacity = 0;
	}
}

static int
vioif_alloc_bufs(vioif_t *vif)
{
	VERIFY(MUTEX_HELD(&vif->vif_mutex));

	/*
	 * Allocate one contiguous chunk of memory for the transmit and receive
	 * buffer tracking objects. If the ring is unusually small, we'll
	 * reduce our target buffer count accordingly.
	 */
	vif->vif_txbufs_capacity = MIN(VIRTIO_NET_TX_BUFS,
	    virtio_queue_size(vif->vif_tx_vq));
	vif->vif_txbufs_mem = kmem_zalloc(
	    sizeof (vioif_txbuf_t) * vif->vif_txbufs_capacity, KM_SLEEP);
	list_create(&vif->vif_txbufs, sizeof (vioif_txbuf_t),
	    offsetof(vioif_txbuf_t, tb_link));

	vif->vif_rxbufs_capacity = MIN(VIRTIO_NET_RX_BUFS,
	    virtio_queue_size(vif->vif_rx_vq));
	vif->vif_rxbufs_mem = kmem_zalloc(
	    sizeof (vioif_rxbuf_t) * vif->vif_rxbufs_capacity, KM_SLEEP);
	list_create(&vif->vif_rxbufs, sizeof (vioif_rxbuf_t),
	    offsetof(vioif_rxbuf_t, rb_link));

	/*
	 * Do not loan more than half of our allocated receive buffers into
	 * the networking stack.
	 */
	vif->vif_nrxbufs_onloan_max = vif->vif_rxbufs_capacity / 2;

	/*
	 * Put everything in the free list straight away in order to simplify
	 * the use of vioif_free_bufs() for cleanup on allocation failure.
	 */
	for (uint_t i = 0; i < vif->vif_txbufs_capacity; i++) {
		list_insert_tail(&vif->vif_txbufs, &vif->vif_txbufs_mem[i]);
	}
	for (uint_t i = 0; i < vif->vif_rxbufs_capacity; i++) {
		list_insert_tail(&vif->vif_rxbufs, &vif->vif_rxbufs_mem[i]);
	}

	/*
	 * Start from the DMA attribute template common to both transmit and
	 * receive buffers. The SGL entry count will be modified for each
	 * buffer type.
	 */
	ddi_dma_attr_t attr = vioif_dma_attr_bufs;

	/*
	 * The transmit inline buffer is small (less than a page), so it's
	 * reasonable to request a single cookie.
	 */
	attr.dma_attr_sgllen = 1;

	for (vioif_txbuf_t *tb = list_head(&vif->vif_txbufs); tb != NULL;
	    tb = list_next(&vif->vif_txbufs, tb)) {
		if ((tb->tb_dma = virtio_dma_alloc(vif->vif_virtio,
		    VIOIF_TX_INLINE_SIZE, &attr,
		    DDI_DMA_STREAMING | DDI_DMA_WRITE, KM_SLEEP)) == NULL) {
			goto fail;
		}
		VERIFY3U(virtio_dma_ncookies(tb->tb_dma), ==, 1);

		if ((tb->tb_chain = virtio_chain_alloc(vif->vif_tx_vq,
		    KM_SLEEP)) == NULL) {
			goto fail;
		}
		virtio_chain_data_set(tb->tb_chain, tb);

		tb->tb_dmaext_capacity = VIOIF_MAX_SEGS - 1;
		tb->tb_dmaext = kmem_zalloc(
		    sizeof (virtio_dma_t *) * tb->tb_dmaext_capacity,
		    KM_SLEEP);
	}

	/*
	 * The receive buffers are larger, and we can tolerate a large number
	 * of segments. Adjust the SGL entry count, setting aside one segment
	 * for the virtio net header.
	 */
	attr.dma_attr_sgllen = VIOIF_MAX_SEGS - 1;

	for (vioif_rxbuf_t *rb = list_head(&vif->vif_rxbufs); rb != NULL;
	    rb = list_next(&vif->vif_rxbufs, rb)) {
		if ((rb->rb_dma = virtio_dma_alloc(vif->vif_virtio,
		    VIOIF_RX_BUF_SIZE, &attr, DDI_DMA_STREAMING | DDI_DMA_READ,
		    KM_SLEEP)) == NULL) {
			goto fail;
		}

		if ((rb->rb_chain = virtio_chain_alloc(vif->vif_rx_vq,
		    KM_SLEEP)) == NULL) {
			goto fail;
		}
		virtio_chain_data_set(rb->rb_chain, rb);

		/*
		 * Ensure that the first cookie is sufficient to cover the
		 * header skip region plus one byte.
		 */
		VERIFY3U(virtio_dma_cookie_size(rb->rb_dma, 0), >=,
		    VIOIF_HEADER_SKIP + 1);

		/*
		 * Ensure that the frame data begins at a location with a
		 * correctly aligned IP header.
		 */
		VERIFY3U((uintptr_t)virtio_dma_va(rb->rb_dma,
		    VIOIF_HEADER_SKIP) % 4, ==, 2);

		rb->rb_vioif = vif;
		rb->rb_frtn.free_func = vioif_rx_free_callback;
		rb->rb_frtn.free_arg = (caddr_t)rb;
	}

	return (0);

fail:
	vioif_free_bufs(vif);
	return (ENOMEM);
}

static int
vioif_m_multicst(void *arg, boolean_t add, const uint8_t *mcst_addr)
{
	/*
	 * Even though we currently do not have support for programming
	 * multicast filters, or even enabling promiscuous mode, we return
	 * success here to avoid the networking stack falling back to link
	 * layer broadcast for multicast traffic. Some hypervisors already
	 * pass received multicast frames onto the guest, so at least on those
	 * systems multicast will work as expected anyway.
	 */
	return (0);
}

static int
vioif_m_setpromisc(void *arg, boolean_t on)
{
	/*
	 * Even though we cannot currently enable promiscuous mode, we return
	 * success here to allow tools like snoop(1M) to continue to function.
	 */
	return (0);
}

static int
vioif_m_unicst(void *arg, const uint8_t *mac)
{
	return (ENOTSUP);
}

static uint_t
vioif_add_rx(vioif_t *vif)
{
	VERIFY(MUTEX_HELD(&vif->vif_mutex));

	if (vif->vif_runstate != VIOIF_RUNSTATE_RUNNING) {
		/*
		 * If the NIC is not running, do not give the device any
		 * receive buffers.
		 */
		return (0);
	}

	uint_t num_added = 0;

	vioif_rxbuf_t *rb;
	while ((rb = vioif_rxbuf_alloc(vif)) != NULL) {
		/*
		 * For legacy devices, and those that have not negotiated
		 * VIRTIO_F_ANY_LAYOUT, the virtio net header must appear in a
		 * separate descriptor entry to the rest of the buffer.
		 */
		if (virtio_chain_append(rb->rb_chain,
		    virtio_dma_cookie_pa(rb->rb_dma, 0),
		    sizeof (struct virtio_net_hdr),
		    VIRTIO_DIR_DEVICE_WRITES) != DDI_SUCCESS) {
			goto fail;
		}

		for (uint_t n = 0; n < virtio_dma_ncookies(rb->rb_dma); n++) {
			uint64_t pa = virtio_dma_cookie_pa(rb->rb_dma, n);
			size_t sz = virtio_dma_cookie_size(rb->rb_dma, n);

			if (n == 0) {
				pa += VIOIF_HEADER_SKIP;
				VERIFY3U(sz, >, VIOIF_HEADER_SKIP);
				sz -= VIOIF_HEADER_SKIP;
			}

			if (virtio_chain_append(rb->rb_chain, pa, sz,
			    VIRTIO_DIR_DEVICE_WRITES) != DDI_SUCCESS) {
				goto fail;
			}
		}

		virtio_chain_submit(rb->rb_chain, B_FALSE);
		num_added++;
		continue;

fail:
		vioif_rxbuf_free(vif, rb);
		vif->vif_norecvbuf++;
		break;
	}

	if (num_added > 0) {
		virtio_queue_flush(vif->vif_rx_vq);
	}

	return (num_added);
}

static uint_t
vioif_process_rx(vioif_t *vif)
{
	virtio_chain_t *vic;
	mblk_t *mphead = NULL, *lastmp = NULL, *mp;
	uint_t num_processed = 0;

	VERIFY(MUTEX_HELD(&vif->vif_mutex));

	while ((vic = virtio_queue_poll(vif->vif_rx_vq)) != NULL) {
		/*
		 * We have to use the chain received length here, as the device
		 * does not tell us the received frame length any other way.
		 * In a limited survey of hypervisors, virtio network devices
		 * appear to provide the right value here.
		 */
		size_t len = virtio_chain_received_length(vic);
		vioif_rxbuf_t *rb = virtio_chain_data(vic);

		virtio_dma_sync(rb->rb_dma, DDI_DMA_SYNC_FORCPU);

		/*
		 * If the NIC is not running, discard any received frames.
		 */
		if (vif->vif_runstate != VIOIF_RUNSTATE_RUNNING) {
			vioif_rxbuf_free(vif, rb);
			continue;
		}

		if (len < sizeof (struct virtio_net_hdr)) {
			vif->vif_rxfail_chain_undersize++;
			vif->vif_ierrors++;
			vioif_rxbuf_free(vif, rb);
			continue;
		}
		len -= sizeof (struct virtio_net_hdr);

		/*
		 * We copy small packets that happen to fit into a single
		 * cookie and reuse the buffers. For bigger ones, we loan
		 * the buffers upstream.
		 */
		if (len < vif->vif_rxcopy_thresh ||
		    vif->vif_nrxbufs_onloan >= vif->vif_nrxbufs_onloan_max) {
			mutex_exit(&vif->vif_mutex);
			if ((mp = allocb(len, 0)) == NULL) {
				mutex_enter(&vif->vif_mutex);
				vif->vif_norecvbuf++;
				vif->vif_ierrors++;

				vioif_rxbuf_free(vif, rb);
				continue;
			}

			bcopy(virtio_dma_va(rb->rb_dma, VIOIF_HEADER_SKIP),
			    mp->b_rptr, len);
			mp->b_wptr = mp->b_rptr + len;

			/*
			 * As the packet contents was copied rather than
			 * loaned, we can return the receive buffer resources
			 * to the free list.
			 */
			mutex_enter(&vif->vif_mutex);
			vioif_rxbuf_free(vif, rb);

		} else {
			mutex_exit(&vif->vif_mutex);
			if ((mp = desballoc(virtio_dma_va(rb->rb_dma,
			    VIOIF_HEADER_SKIP), len, 0,
			    &rb->rb_frtn)) == NULL) {
				mutex_enter(&vif->vif_mutex);
				vif->vif_norecvbuf++;
				vif->vif_ierrors++;

				vioif_rxbuf_free(vif, rb);
				continue;
			}
			mp->b_wptr = mp->b_rptr + len;

			mutex_enter(&vif->vif_mutex);
			vif->vif_nrxbufs_onloan++;
		}

		/*
		 * virtio-net does not tell us if this packet is multicast
		 * or broadcast, so we have to check it.
		 */
		if (mp->b_rptr[0] & 0x1) {
			if (bcmp(mp->b_rptr, vioif_broadcast, ETHERADDRL) != 0)
				vif->vif_multircv++;
			else
				vif->vif_brdcstrcv++;
		}

		vif->vif_rbytes += len;
		vif->vif_ipackets++;

		if (lastmp == NULL) {
			mphead = mp;
		} else {
			lastmp->b_next = mp;
		}
		lastmp = mp;
		num_processed++;
	}

	if (mphead != NULL) {
		if (vif->vif_runstate == VIOIF_RUNSTATE_RUNNING) {
			mutex_exit(&vif->vif_mutex);
			mac_rx(vif->vif_mac_handle, NULL, mphead);
			mutex_enter(&vif->vif_mutex);
		} else {
			/*
			 * The NIC was disabled part way through our execution,
			 * so free the messages we allocated.
			 */
			freemsgchain(mphead);
		}
	}

	return (num_processed);
}

static uint_t
vioif_reclaim_used_tx(vioif_t *vif)
{
	virtio_chain_t *vic;
	uint_t num_reclaimed = 0;

	VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex));

	while ((vic = virtio_queue_poll(vif->vif_tx_vq)) != NULL) {
		vioif_txbuf_t *tb = virtio_chain_data(vic);

		if (tb->tb_mp != NULL) {
			/*
			 * Unbind the external mapping.
			 */
			for (uint_t i = 0; i < tb->tb_dmaext_capacity; i++) {
				if (tb->tb_dmaext[i] == NULL) {
					continue;
				}

				virtio_dma_unbind(tb->tb_dmaext[i]);
			}

			freemsg(tb->tb_mp);
			tb->tb_mp = NULL;
		}

		/*
		 * Return this transmit buffer to the free list for reuse.
		 */
		mutex_enter(&vif->vif_mutex);
		vioif_txbuf_free(vif, tb);
		mutex_exit(&vif->vif_mutex);

		num_reclaimed++;
	}

	/* Return ring to transmitting state if descriptors were reclaimed. */
	if (num_reclaimed > 0) {
		boolean_t do_update = B_FALSE;

		mutex_enter(&vif->vif_mutex);
		vif->vif_stat_tx_reclaim += num_reclaimed;
		if (vif->vif_tx_corked) {
			/*
			 * TX was corked on a lack of available descriptors.
			 * That dire state has passed so the TX interrupt can
			 * be disabled and MAC can be notified that
			 * transmission is possible again.
			 */
			vif->vif_tx_corked = B_FALSE;
			virtio_queue_no_interrupt(vif->vif_tx_vq, B_TRUE);
			do_update = B_TRUE;
		}

		if (do_update) {
			mac_tx_update(vif->vif_mac_handle);
		}
		mutex_exit(&vif->vif_mutex);
	}

	return (num_reclaimed);
}

static void
vioif_reclaim_periodic(void *arg)
{
	vioif_t *vif = arg;
	uint_t num_reclaimed;

	num_reclaimed = vioif_reclaim_used_tx(vif);

	mutex_enter(&vif->vif_mutex);
	vif->vif_tx_reclaim_tid = 0;
	/*
	 * If used descriptors were reclaimed or TX descriptors appear to be
	 * outstanding, the ring is considered active and periodic reclamation
	 * is necessary for now.
	 */
	if (num_reclaimed != 0 || virtio_queue_nactive(vif->vif_tx_vq) != 0) {
		/* Do not reschedule if the ring is being drained. */
		if (!vif->vif_tx_drain) {
			vioif_reclaim_restart(vif);
		}
	}
	mutex_exit(&vif->vif_mutex);
}

static void
vioif_reclaim_restart(vioif_t *vif)
{
	VERIFY(MUTEX_HELD(&vif->vif_mutex));
	VERIFY(!vif->vif_tx_drain);

	if (vif->vif_tx_reclaim_tid == 0) {
		vif->vif_tx_reclaim_tid = timeout(vioif_reclaim_periodic, vif,
		    MSEC_TO_TICK_ROUNDUP(vioif_reclaim_ms));
	}
}

static void
vioif_tx_drain(vioif_t *vif)
{
	VERIFY(MUTEX_HELD(&vif->vif_mutex));
	VERIFY3S(vif->vif_runstate, ==, VIOIF_RUNSTATE_STOPPING);

	vif->vif_tx_drain = B_TRUE;
	/* Put a stop to the periodic reclaim if it is running */
	if (vif->vif_tx_reclaim_tid != 0) {
		timeout_id_t tid = vif->vif_tx_reclaim_tid;

		/*
		 * With vif_tx_drain set, there is no risk that a racing
		 * vioif_reclaim_periodic() call will reschedule itself.
		 *
		 * Being part of the mc_stop hook also guarantees that
		 * vioif_m_tx() will not be called to restart it.
		 */
		vif->vif_tx_reclaim_tid = 0;
		mutex_exit(&vif->vif_mutex);
		(void) untimeout(tid);
		mutex_enter(&vif->vif_mutex);
	}
	virtio_queue_no_interrupt(vif->vif_tx_vq, B_TRUE);

	/*
	 * Wait for all of the TX descriptors to be processed by the host so
	 * they can be reclaimed.
	 */
	while (vif->vif_ntxbufs_alloc > 0) {
		mutex_exit(&vif->vif_mutex);
		(void) vioif_reclaim_used_tx(vif);
		delay(5);
		mutex_enter(&vif->vif_mutex);
	}
	VERIFY(!vif->vif_tx_corked);
	VERIFY3U(vif->vif_tx_reclaim_tid, ==, 0);
	VERIFY3U(virtio_queue_nactive(vif->vif_tx_vq), ==, 0);
}

static int
vioif_tx_inline(vioif_t *vif, vioif_txbuf_t *tb, mblk_t *mp, size_t msg_size)
{
	VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex));

	VERIFY3U(msg_size, <=, virtio_dma_size(tb->tb_dma) - VIOIF_HEADER_SKIP);

	/*
	 * Copy the message into the inline buffer and then free the message.
	 */
	mcopymsg(mp, virtio_dma_va(tb->tb_dma, VIOIF_HEADER_SKIP));

	if (virtio_chain_append(tb->tb_chain,
	    virtio_dma_cookie_pa(tb->tb_dma, 0) + VIOIF_HEADER_SKIP,
	    msg_size, VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) {
		return (DDI_FAILURE);
	}

	return (DDI_SUCCESS);
}

static int
vioif_tx_external(vioif_t *vif, vioif_txbuf_t *tb, mblk_t *mp, size_t msg_size)
{
	VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex));

	mblk_t *nmp = mp;
	tb->tb_ndmaext = 0;

	while (nmp != NULL) {
		size_t len;

		if ((len = MBLKL(nmp)) == 0) {
			/*
			 * Skip any zero-length entries in the chain.
			 */
			nmp = nmp->b_cont;
			continue;
		}

		if (tb->tb_ndmaext >= tb->tb_dmaext_capacity) {
			mutex_enter(&vif->vif_mutex);
			vif->vif_txfail_indirect_limit++;
			vif->vif_notxbuf++;
			mutex_exit(&vif->vif_mutex);
			goto fail;
		}

		if (tb->tb_dmaext[tb->tb_ndmaext] == NULL) {
			/*
			 * Allocate a DMA handle for this slot.
			 */
			if ((tb->tb_dmaext[tb->tb_ndmaext] =
			    virtio_dma_alloc_nomem(vif->vif_virtio,
			    &vioif_dma_attr_external, KM_SLEEP)) == NULL) {
				mutex_enter(&vif->vif_mutex);
				vif->vif_notxbuf++;
				mutex_exit(&vif->vif_mutex);
				goto fail;
			}
		}
		virtio_dma_t *extdma = tb->tb_dmaext[tb->tb_ndmaext++];

		if (virtio_dma_bind(extdma, nmp->b_rptr, len,
		    DDI_DMA_WRITE | DDI_DMA_STREAMING, KM_SLEEP) !=
		    DDI_SUCCESS) {
			mutex_enter(&vif->vif_mutex);
			vif->vif_txfail_dma_bind++;
			mutex_exit(&vif->vif_mutex);
			goto fail;
		}

		for (uint_t n = 0; n < virtio_dma_ncookies(extdma); n++) {
			uint64_t pa = virtio_dma_cookie_pa(extdma, n);
			size_t sz = virtio_dma_cookie_size(extdma, n);

			if (virtio_chain_append(tb->tb_chain, pa, sz,
			    VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) {
				mutex_enter(&vif->vif_mutex);
				vif->vif_txfail_indirect_limit++;
				vif->vif_notxbuf++;
				mutex_exit(&vif->vif_mutex);
				goto fail;
			}
		}

		nmp = nmp->b_cont;
	}

	/*
	 * We need to keep the message around until we reclaim the buffer from
	 * the device before freeing it.
	 */
	tb->tb_mp = mp;

	return (DDI_SUCCESS);

fail:
	for (uint_t n = 0; n < tb->tb_ndmaext; n++) {
		if (tb->tb_dmaext[n] != NULL) {
			virtio_dma_unbind(tb->tb_dmaext[n]);
		}
	}
	tb->tb_ndmaext = 0;

	freemsg(mp);

	return (DDI_FAILURE);
}

static boolean_t
vioif_send(vioif_t *vif, mblk_t *mp)
{
	VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex));

	vioif_txbuf_t *tb = NULL;
	struct virtio_net_hdr *vnh = NULL;
	size_t msg_size = 0;
	uint32_t csum_start;
	uint32_t csum_stuff;
	uint32_t csum_flags;
	uint32_t lso_flags;
	uint32_t lso_mss;
	mblk_t *nmp;
	int ret;
	boolean_t lso_required = B_FALSE;
	struct ether_header *ether = (void *)mp->b_rptr;

	for (nmp = mp; nmp; nmp = nmp->b_cont)
		msg_size += MBLKL(nmp);

	if (vif->vif_tx_tso4 || vif->vif_tx_tso6) {
		mac_lso_get(mp, &lso_mss, &lso_flags);
		lso_required = (lso_flags & HW_LSO) != 0;
	}

	mutex_enter(&vif->vif_mutex);
	if ((tb = vioif_txbuf_alloc(vif)) == NULL) {
		vif->vif_notxbuf++;
		goto fail;
	}
	mutex_exit(&vif->vif_mutex);

	/*
	 * Use the inline buffer for the virtio net header. Zero the portion
	 * of our DMA allocation prior to the packet data.
	 */
	vnh = virtio_dma_va(tb->tb_dma, 0);
	bzero(vnh, VIOIF_HEADER_SKIP);

	/*
	 * For legacy devices, and those that have not negotiated
	 * VIRTIO_F_ANY_LAYOUT, the virtio net header must appear in a separate
	 * descriptor entry to the rest of the buffer.
	 */
	if (virtio_chain_append(tb->tb_chain,
	    virtio_dma_cookie_pa(tb->tb_dma, 0), sizeof (struct virtio_net_hdr),
	    VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) {
		mutex_enter(&vif->vif_mutex);
		vif->vif_notxbuf++;
		goto fail;
	}

	mac_hcksum_get(mp, &csum_start, &csum_stuff, NULL, NULL, &csum_flags);

	/*
	 * They want us to do the TCP/UDP csum calculation.
	 */
	if (csum_flags & HCK_PARTIALCKSUM) {
		int eth_hsize;

		/*
		 * Did we ask for it?
		 */
		ASSERT(vif->vif_tx_csum);

		/*
		 * We only asked for partial csum packets.
		 */
		ASSERT(!(csum_flags & HCK_IPV4_HDRCKSUM));
		ASSERT(!(csum_flags & HCK_FULLCKSUM));

		if (ether->ether_type == htons(ETHERTYPE_VLAN)) {
			eth_hsize = sizeof (struct ether_vlan_header);
		} else {
			eth_hsize = sizeof (struct ether_header);
		}

		vnh->vnh_flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
		vnh->vnh_csum_start = eth_hsize + csum_start;
		vnh->vnh_csum_offset = csum_stuff - csum_start;
	}

	/*
	 * Setup LSO fields if required.
	 */
	if (lso_required) {
		mac_ether_offload_flags_t needed;
		mac_ether_offload_info_t meo;
		uint32_t cksum;
		size_t len;
		mblk_t *pullmp = NULL;
		tcpha_t *tcpha;

		if (mac_ether_offload_info(mp, &meo) != 0) {
			goto fail;
		}

		needed = MEOI_L2INFO_SET | MEOI_L3INFO_SET | MEOI_L4INFO_SET;
		if ((meo.meoi_flags & needed) != needed) {
			goto fail;
		}

		if (meo.meoi_l4proto != IPPROTO_TCP) {
			goto fail;
		}

		if (meo.meoi_l3proto == ETHERTYPE_IP && vif->vif_tx_tso4) {
			vnh->vnh_gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
		} else if (meo.meoi_l3proto == ETHERTYPE_IPV6 &&
		    vif->vif_tx_tso6) {
			vnh->vnh_gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
		} else {
			goto fail;
		}

		/*
		 * The TCP stack does not include the length in the TCP
		 * pseudo-header when it is performing LSO since hardware
		 * generally asks for it to be removed (as it'll change).
		 * Unfortunately, for virtio, we actually need it. This means we
		 * need to go through and calculate the actual length and fix
		 * things up. Because the virtio spec cares about the ECN flag
		 * and indicating that, at least this means we'll have that
		 * available as well.
		 */
		if (MBLKL(mp) < vnh->vnh_hdr_len) {
			pullmp = msgpullup(mp, vnh->vnh_hdr_len);
			if (pullmp == NULL)
				goto fail;
			tcpha = (tcpha_t *)(pullmp->b_rptr + meo.meoi_l2hlen +
			    meo.meoi_l3hlen);
		} else {
			tcpha = (tcpha_t *)(mp->b_rptr + meo.meoi_l2hlen +
			    meo.meoi_l3hlen);
		}

		len = meo.meoi_len - meo.meoi_l2hlen - meo.meoi_l3hlen;
		cksum = ntohs(tcpha->tha_sum) + len;
		cksum = (cksum >> 16) + (cksum & 0xffff);
		cksum = (cksum >> 16) + (cksum & 0xffff);
		tcpha->tha_sum = htons(cksum);

		if (tcpha->tha_flags & TH_CWR) {
			vnh->vnh_gso_type |= VIRTIO_NET_HDR_GSO_ECN;
		}
		vnh->vnh_gso_size = (uint16_t)lso_mss;
		vnh->vnh_hdr_len = meo.meoi_l2hlen + meo.meoi_l3hlen +
		    meo.meoi_l4hlen;

		freemsg(pullmp);
	}

	/*
	 * The device does not maintain its own statistics about broadcast or
	 * multicast packets, so we have to check the destination address
	 * ourselves.
	 */
	if ((ether->ether_dhost.ether_addr_octet[0] & 0x01) != 0) {
		mutex_enter(&vif->vif_mutex);
		if (ether_cmp(&ether->ether_dhost, vioif_broadcast) == 0) {
			vif->vif_brdcstxmt++;
		} else {
			vif->vif_multixmt++;
		}
		mutex_exit(&vif->vif_mutex);
	}

	/*
	 * For small packets, copy into the preallocated inline buffer rather
	 * than incur the overhead of mapping. Note that both of these
	 * functions ensure that "mp" is freed before returning.
	 */
	if (msg_size < vif->vif_txcopy_thresh) {
		ret = vioif_tx_inline(vif, tb, mp, msg_size);
	} else {
		ret = vioif_tx_external(vif, tb, mp, msg_size);
	}
	mp = NULL;

	mutex_enter(&vif->vif_mutex);

	if (ret != DDI_SUCCESS) {
		goto fail;
	}

	vif->vif_opackets++;
	vif->vif_obytes += msg_size;
	mutex_exit(&vif->vif_mutex);

	virtio_dma_sync(tb->tb_dma, DDI_DMA_SYNC_FORDEV);
	virtio_chain_submit(tb->tb_chain, B_TRUE);

	return (B_TRUE);

fail:
	vif->vif_oerrors++;
	if (tb != NULL) {
		vioif_txbuf_free(vif, tb);
	}
	mutex_exit(&vif->vif_mutex);

	return (mp == NULL);
}

static mblk_t *
vioif_m_tx(void *arg, mblk_t *mp)
{
	vioif_t *vif = arg;
	mblk_t *nmp;

	/*
	 * Prior to attempting to send any more frames, do a reclaim to pick up
	 * any descriptors which have been processed by the host.
	 */
	if (virtio_queue_nactive(vif->vif_tx_vq) != 0) {
		(void) vioif_reclaim_used_tx(vif);
	}

	while (mp != NULL) {
		nmp = mp->b_next;
		mp->b_next = NULL;

		if (!vioif_send(vif, mp)) {
			/*
			 * If there are no descriptors available, try to
			 * reclaim some, allowing a retry of the send if some
			 * are found.
			 */
			mp->b_next = nmp;
			if (vioif_reclaim_used_tx(vif) != 0) {
				continue;
			}

			/*
			 * Otherwise, enable the TX ring interrupt so that as
			 * soon as a descriptor becomes available, transmission
			 * can begin again. For safety, make sure the periodic
			 * reclaim is running as well.
			 */
			mutex_enter(&vif->vif_mutex);
			vif->vif_tx_corked = B_TRUE;
			virtio_queue_no_interrupt(vif->vif_tx_vq, B_FALSE);
			vioif_reclaim_restart(vif);
			mutex_exit(&vif->vif_mutex);
			return (mp);
		}
		mp = nmp;
	}

	/* Ensure the periodic reclaim has been started. */
	mutex_enter(&vif->vif_mutex);
	vioif_reclaim_restart(vif);
	mutex_exit(&vif->vif_mutex);

	return (NULL);
}

static int
vioif_m_start(void *arg)
{
	vioif_t *vif = arg;

	mutex_enter(&vif->vif_mutex);

	VERIFY3S(vif->vif_runstate, ==, VIOIF_RUNSTATE_STOPPED);
	vif->vif_runstate = VIOIF_RUNSTATE_RUNNING;

	mac_link_update(vif->vif_mac_handle, LINK_STATE_UP);

	virtio_queue_no_interrupt(vif->vif_rx_vq, B_FALSE);

	/*
	 * Starting interrupts on the TX virtqueue is unnecessary at this time.
	 * Descriptor reclamation is handled during transmit, via a periodic
	 * timer, and when resources are tight, via the then-enabled interrupt.
	 */
	vif->vif_tx_drain = B_FALSE;

	/*
	 * Add as many receive buffers as we can to the receive queue. If we
	 * cannot add any, it may be because we have stopped and started again
	 * and the descriptors are all in the queue already.
	 */
	(void) vioif_add_rx(vif);

	mutex_exit(&vif->vif_mutex);
	return (DDI_SUCCESS);
}

static void
vioif_m_stop(void *arg)
{
	vioif_t *vif = arg;

	mutex_enter(&vif->vif_mutex);

	VERIFY3S(vif->vif_runstate, ==, VIOIF_RUNSTATE_RUNNING);
	vif->vif_runstate = VIOIF_RUNSTATE_STOPPING;

	/* Ensure all TX descriptors have been processed and reclaimed */
	vioif_tx_drain(vif);

	virtio_queue_no_interrupt(vif->vif_rx_vq, B_TRUE);

	vif->vif_runstate = VIOIF_RUNSTATE_STOPPED;
	mutex_exit(&vif->vif_mutex);
}

static int
vioif_m_stat(void *arg, uint_t stat, uint64_t *val)
{
	vioif_t *vif = arg;

	switch (stat) {
	case MAC_STAT_IERRORS:
		*val = vif->vif_ierrors;
		break;
	case MAC_STAT_OERRORS:
		*val = vif->vif_oerrors;
		break;
	case MAC_STAT_MULTIRCV:
		*val = vif->vif_multircv;
		break;
	case MAC_STAT_BRDCSTRCV:
		*val = vif->vif_brdcstrcv;
		break;
	case MAC_STAT_MULTIXMT:
		*val = vif->vif_multixmt;
		break;
	case MAC_STAT_BRDCSTXMT:
		*val = vif->vif_brdcstxmt;
		break;
	case MAC_STAT_IPACKETS:
		*val = vif->vif_ipackets;
		break;
	case MAC_STAT_RBYTES:
		*val = vif->vif_rbytes;
		break;
	case MAC_STAT_OPACKETS:
		*val = vif->vif_opackets;
		break;
	case MAC_STAT_OBYTES:
		*val = vif->vif_obytes;
		break;
	case MAC_STAT_NORCVBUF:
		*val = vif->vif_norecvbuf;
		break;
	case MAC_STAT_NOXMTBUF:
		*val = vif->vif_notxbuf;
		break;
	case MAC_STAT_IFSPEED:
		/* always 1 Gbit */
		*val = 1000000000ULL;
		break;
	case ETHER_STAT_LINK_DUPLEX:
		/* virtual device, always full-duplex */
		*val = LINK_DUPLEX_FULL;
		break;

	default:
		return (ENOTSUP);
	}

	return (DDI_SUCCESS);
}

static int
vioif_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
    uint_t pr_valsize, const void *pr_val)
{
	vioif_t *vif = arg;

	switch (pr_num) {
	case MAC_PROP_MTU: {
		int r;
		uint32_t mtu;
		if (pr_valsize < sizeof (mtu)) {
			return (EOVERFLOW);
		}
		bcopy(pr_val, &mtu, sizeof (mtu));

		if (mtu < ETHERMIN || mtu > vif->vif_mtu_max) {
			return (EINVAL);
		}

		mutex_enter(&vif->vif_mutex);
		if ((r = mac_maxsdu_update(vif->vif_mac_handle, mtu)) == 0) {
			vif->vif_mtu = mtu;
		}
		mutex_exit(&vif->vif_mutex);

		return (r);
	}

	case MAC_PROP_PRIVATE: {
		long max, result;
		uint_t *resp;
		char *endptr;

		if (strcmp(pr_name, VIOIF_MACPROP_TXCOPY_THRESH) == 0) {
			max = VIOIF_MACPROP_TXCOPY_THRESH_MAX;
			resp = &vif->vif_txcopy_thresh;
		} else if (strcmp(pr_name, VIOIF_MACPROP_RXCOPY_THRESH) == 0) {
			max = VIOIF_MACPROP_RXCOPY_THRESH_MAX;
			resp = &vif->vif_rxcopy_thresh;
		} else {
			return (ENOTSUP);
		}

		if (pr_val == NULL) {
			return (EINVAL);
		}

		if (ddi_strtol(pr_val, &endptr, 10, &result) != 0 ||
		    *endptr != '\0' || result < 0 || result > max) {
			return (EINVAL);
		}

		mutex_enter(&vif->vif_mutex);
		*resp = result;
		mutex_exit(&vif->vif_mutex);

		return (0);
	}

	default:
		return (ENOTSUP);
	}
}

static int
vioif_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num,
    uint_t pr_valsize, void *pr_val)
{
	vioif_t *vif = arg;

	switch (pr_num) {
	case MAC_PROP_PRIVATE: {
		uint_t value;

		if (strcmp(pr_name, VIOIF_MACPROP_TXCOPY_THRESH) == 0) {
			value = vif->vif_txcopy_thresh;
		} else if (strcmp(pr_name, VIOIF_MACPROP_RXCOPY_THRESH) == 0) {
			value = vif->vif_rxcopy_thresh;
		} else {
			return (ENOTSUP);
		}

		if (snprintf(pr_val, pr_valsize, "%u", value) >= pr_valsize) {
			return (EOVERFLOW);
		}

		return (0);
	}

	default:
		return (ENOTSUP);
	}
}

static void
vioif_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num,
    mac_prop_info_handle_t prh)
{
	vioif_t *vif = arg;
	char valstr[64];
	int value;

	switch (pr_num) {
	case MAC_PROP_MTU:
		mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW);
		mac_prop_info_set_range_uint32(prh, ETHERMIN, vif->vif_mtu_max);
		return;

	case MAC_PROP_PRIVATE:
		if (strcmp(pr_name, VIOIF_MACPROP_TXCOPY_THRESH) == 0) {
			value = VIOIF_MACPROP_TXCOPY_THRESH_DEF;
		} else if (strcmp(pr_name, VIOIF_MACPROP_RXCOPY_THRESH) == 0) {
			value = VIOIF_MACPROP_RXCOPY_THRESH_DEF;
		} else {
			/*
			 * We do not recognise this private property name.
			 */
			return;
		}
		mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW);
		(void) snprintf(valstr, sizeof (valstr), "%d", value);
		mac_prop_info_set_default_str(prh, valstr);
		return;

	default:
		return;
	}
}

static boolean_t
vioif_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
{
	vioif_t *vif = arg;

	switch (cap) {
	case MAC_CAPAB_HCKSUM: {
		if (!vif->vif_tx_csum) {
			return (B_FALSE);
		}

		*(uint32_t *)cap_data = HCKSUM_INET_PARTIAL;

		return (B_TRUE);
	}

	case MAC_CAPAB_LSO: {
		if (!vif->vif_tx_tso4) {
			return (B_FALSE);
		}

		mac_capab_lso_t *lso = cap_data;
		lso->lso_flags = LSO_TX_BASIC_TCP_IPV4 | LSO_TX_BASIC_TCP_IPV6;
		lso->lso_basic_tcp_ipv4.lso_max = VIOIF_RX_DATA_SIZE;
		lso->lso_basic_tcp_ipv6.lso_max = VIOIF_RX_DATA_SIZE;

		return (B_TRUE);
	}

	default:
		return (B_FALSE);
	}
}

static boolean_t
vioif_has_feature(vioif_t *vif, uint32_t feature)
{
	return (virtio_feature_present(vif->vif_virtio, feature));
}

/*
 * Read the primary MAC address from the device if one is provided. If not,
 * generate a random locally administered MAC address and write it back to the
 * device.
 */
static void
vioif_get_mac(vioif_t *vif)
{
	VERIFY(MUTEX_HELD(&vif->vif_mutex));

	if (vioif_has_feature(vif, VIRTIO_NET_F_MAC)) {
		for (uint_t i = 0; i < ETHERADDRL; i++) {
			vif->vif_mac[i] = virtio_dev_get8(vif->vif_virtio,
			    VIRTIO_NET_CONFIG_MAC + i);
		}
		vif->vif_mac_from_host = 1;

		return;
	}

	/* Get a few random bytes */
	(void) random_get_pseudo_bytes(vif->vif_mac, ETHERADDRL);
	/* Make sure it's a unicast MAC */
	vif->vif_mac[0] &= ~1;
	/* Set the "locally administered" bit */
	vif->vif_mac[1] |= 2;

	/*
	 * Write the random MAC address back to the device.
	 */
	for (uint_t i = 0; i < ETHERADDRL; i++) {
		virtio_dev_put8(vif->vif_virtio, VIRTIO_NET_CONFIG_MAC + i,
		    vif->vif_mac[i]);
	}
	vif->vif_mac_from_host = 0;

	dev_err(vif->vif_dip, CE_NOTE, "!Generated a random MAC address: "
	    "%02x:%02x:%02x:%02x:%02x:%02x",
	    (uint_t)vif->vif_mac[0], (uint_t)vif->vif_mac[1],
	    (uint_t)vif->vif_mac[2], (uint_t)vif->vif_mac[3],
	    (uint_t)vif->vif_mac[4], (uint_t)vif->vif_mac[5]);
}

/*
 * Virtqueue interrupt handlers
 */
static uint_t
vioif_rx_handler(caddr_t arg0, caddr_t arg1)
{
	vioif_t *vif = (vioif_t *)arg0;

	mutex_enter(&vif->vif_mutex);
	(void) vioif_process_rx(vif);

	/*
	 * Attempt to replenish the receive queue. If we cannot add any
	 * descriptors here, it may be because all of the recently received
	 * packets were loaned up to the networking stack.
	 */
	(void) vioif_add_rx(vif);
	mutex_exit(&vif->vif_mutex);

	return (DDI_INTR_CLAIMED);
}

static uint_t
vioif_tx_handler(caddr_t arg0, caddr_t arg1)
{
	vioif_t *vif = (vioif_t *)arg0;

	/*
	 * The TX interrupt could race with other reclamation activity, so
	 * interpreting the return value is unimportant.
	 */
	(void) vioif_reclaim_used_tx(vif);

	return (DDI_INTR_CLAIMED);
}

static void
vioif_check_features(vioif_t *vif)
{
	VERIFY(MUTEX_HELD(&vif->vif_mutex));

	vif->vif_tx_csum = 0;
	vif->vif_tx_tso4 = 0;
	vif->vif_tx_tso6 = 0;

	if (vioif_has_feature(vif, VIRTIO_NET_F_CSUM)) {
		/*
		 * The host will accept packets with partial checksums from us.
		 */
		vif->vif_tx_csum = 1;

		/*
		 * The legacy GSO feature represents the combination of
		 * HOST_TSO4, HOST_TSO6, and HOST_ECN.
		 */
		boolean_t gso = vioif_has_feature(vif, VIRTIO_NET_F_GSO);
		boolean_t tso4 = vioif_has_feature(vif, VIRTIO_NET_F_HOST_TSO4);
		boolean_t tso6 = vioif_has_feature(vif, VIRTIO_NET_F_HOST_TSO6);
		boolean_t ecn = vioif_has_feature(vif, VIRTIO_NET_F_HOST_ECN);

		/*
		 * Explicit congestion notification (ECN) is configured
		 * globally; see "tcp_ecn_permitted". As we cannot currently
		 * request that the stack disable ECN on a per interface basis,
		 * we require the device to support the combination of
		 * segmentation offload and ECN support.
		 */
		if (gso) {
			vif->vif_tx_tso4 = 1;
			vif->vif_tx_tso6 = 1;
		}
		if (tso4 && ecn) {
			vif->vif_tx_tso4 = 1;
		}
		if (tso6 && ecn) {
			vif->vif_tx_tso6 = 1;
		}
	}
}

static int
vioif_select_interrupt_types(void)
{
	id_t id;
	smbios_system_t sys;
	smbios_info_t info;

	if (vioif_allowed_int_types != -1) {
		/*
		 * If this value was tuned via /etc/system or the debugger,
		 * use the provided value directly.
		 */
		return (vioif_allowed_int_types);
	}

	if ((id = smbios_info_system(ksmbios, &sys)) == SMB_ERR ||
	    smbios_info_common(ksmbios, id, &info) == SMB_ERR) {
		/*
		 * The system may not have valid SMBIOS data, so ignore a
		 * failure here.
		 */
		return (0);
	}

	if (strcmp(info.smbi_manufacturer, "Google") == 0 &&
	    strcmp(info.smbi_product, "Google Compute Engine") == 0) {
		/*
		 * An undiagnosed issue with the Google Compute Engine (GCE)
		 * hypervisor exists. In this environment, no RX interrupts
		 * are received if MSI-X handlers are installed. This does not
		 * appear to be true for the Virtio SCSI driver. Fixed
		 * interrupts do appear to work, so we fall back for now:
		 */
		return (DDI_INTR_TYPE_FIXED);
	}

	return (0);
}

static int
vioif_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int ret;
	vioif_t *vif;
	virtio_t *vio;
	mac_register_t *macp = NULL;

	if (cmd != DDI_ATTACH) {
		return (DDI_FAILURE);
	}

	if ((vio = virtio_init(dip, VIRTIO_NET_WANTED_FEATURES, B_TRUE)) ==
	    NULL) {
		return (DDI_FAILURE);
	}

	vif = kmem_zalloc(sizeof (*vif), KM_SLEEP);
	vif->vif_dip = dip;
	vif->vif_virtio = vio;
	vif->vif_runstate = VIOIF_RUNSTATE_STOPPED;
	ddi_set_driver_private(dip, vif);

	if ((vif->vif_rx_vq = virtio_queue_alloc(vio, VIRTIO_NET_VIRTQ_RX,
	    "rx", vioif_rx_handler, vif, B_FALSE, VIOIF_MAX_SEGS)) == NULL ||
	    (vif->vif_tx_vq = virtio_queue_alloc(vio, VIRTIO_NET_VIRTQ_TX,
	    "tx", vioif_tx_handler, vif, B_FALSE, VIOIF_MAX_SEGS)) == NULL) {
		goto fail;
	}

	if (virtio_init_complete(vio, vioif_select_interrupt_types()) !=
	    DDI_SUCCESS) {
		dev_err(dip, CE_WARN, "failed to complete Virtio init");
		goto fail;
	}

	virtio_queue_no_interrupt(vif->vif_rx_vq, B_TRUE);
	virtio_queue_no_interrupt(vif->vif_tx_vq, B_TRUE);

	mutex_init(&vif->vif_mutex, NULL, MUTEX_DRIVER, virtio_intr_pri(vio));
	mutex_enter(&vif->vif_mutex);

	vioif_get_mac(vif);

	vif->vif_rxcopy_thresh = VIOIF_MACPROP_RXCOPY_THRESH_DEF;
	vif->vif_txcopy_thresh = VIOIF_MACPROP_TXCOPY_THRESH_DEF;

	if (vioif_has_feature(vif, VIRTIO_NET_F_MTU)) {
		vif->vif_mtu_max = virtio_dev_get16(vio, VIRTIO_NET_CONFIG_MTU);
	} else {
		vif->vif_mtu_max = ETHERMTU;
	}

	vif->vif_mtu = ETHERMTU;
	if (vif->vif_mtu > vif->vif_mtu_max) {
		vif->vif_mtu = vif->vif_mtu_max;
	}

	vioif_check_features(vif);

	if (vioif_alloc_bufs(vif) != 0) {
		mutex_exit(&vif->vif_mutex);
		dev_err(dip, CE_WARN, "failed to allocate memory");
		goto fail;
	}

	mutex_exit(&vif->vif_mutex);

	if (virtio_interrupts_enable(vio) != DDI_SUCCESS) {
		dev_err(dip, CE_WARN, "failed to enable interrupts");
		goto fail;
	}

	if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
		dev_err(dip, CE_WARN, "failed to allocate a mac_register");
		goto fail;
	}

	macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
	macp->m_driver = vif;
	macp->m_dip = dip;
	macp->m_src_addr = vif->vif_mac;
	macp->m_callbacks = &vioif_mac_callbacks;
	macp->m_min_sdu = 0;
	macp->m_max_sdu = vif->vif_mtu;
	macp->m_margin = VLAN_TAGSZ;
	macp->m_priv_props = vioif_priv_props;

	if ((ret = mac_register(macp, &vif->vif_mac_handle)) != 0) {
		dev_err(dip, CE_WARN, "mac_register() failed (%d)", ret);
		goto fail;
	}
	mac_free(macp);

	mac_link_update(vif->vif_mac_handle, LINK_STATE_UP);

	return (DDI_SUCCESS);

fail:
	vioif_free_bufs(vif);
	if (macp != NULL) {
		mac_free(macp);
	}
	(void) virtio_fini(vio, B_TRUE);
	kmem_free(vif, sizeof (*vif));
	return (DDI_FAILURE);
}

static int
vioif_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	int r;
	vioif_t *vif;

	if (cmd != DDI_DETACH) {
		return (DDI_FAILURE);
	}

	if ((vif = ddi_get_driver_private(dip)) == NULL) {
		return (DDI_FAILURE);
	}

	mutex_enter(&vif->vif_mutex);
	if (vif->vif_runstate != VIOIF_RUNSTATE_STOPPED) {
		dev_err(dip, CE_WARN, "!NIC still running, cannot detach");
		mutex_exit(&vif->vif_mutex);
		return (DDI_FAILURE);
	}

	/*
	 * There should be no outstanding transmit buffers once the NIC is
	 * completely stopped.
	 */
	VERIFY3U(vif->vif_ntxbufs_alloc, ==, 0);

	/*
	 * Though we cannot claw back all of the receive buffers until we reset
	 * the device, we must ensure all those loaned to MAC have been
	 * returned before calling mac_unregister().
	 */
	if (vif->vif_nrxbufs_onloan > 0) {
		dev_err(dip, CE_WARN, "!%u receive buffers still loaned, "
		    "cannot detach", vif->vif_nrxbufs_onloan);
		mutex_exit(&vif->vif_mutex);
		return (DDI_FAILURE);
	}

	if ((r = mac_unregister(vif->vif_mac_handle)) != 0) {
		dev_err(dip, CE_WARN, "!MAC unregister failed (%d)", r);
		return (DDI_FAILURE);
	}

	/*
	 * Shut down the device so that we can recover any previously
	 * submitted receive buffers.
	 */
	virtio_shutdown(vif->vif_virtio);
	for (;;) {
		virtio_chain_t *vic;

		if ((vic = virtio_queue_evacuate(vif->vif_rx_vq)) == NULL) {
			break;
		}

		vioif_rxbuf_t *rb = virtio_chain_data(vic);
		vioif_rxbuf_free(vif, rb);
	}

	/*
	 * vioif_free_bufs() must be called before virtio_fini()
	 * as it uses virtio_chain_free() which itself depends on some
	 * virtio data structures still being around.
	 */
	vioif_free_bufs(vif);
	(void) virtio_fini(vif->vif_virtio, B_FALSE);

	mutex_exit(&vif->vif_mutex);
	mutex_destroy(&vif->vif_mutex);

	kmem_free(vif, sizeof (*vif));

	return (DDI_SUCCESS);
}

static int
vioif_quiesce(dev_info_t *dip)
{
	vioif_t *vif;

	if ((vif = ddi_get_driver_private(dip)) == NULL)
		return (DDI_FAILURE);

	return (virtio_quiesce(vif->vif_virtio));
}

int
_init(void)
{
	int ret;

	mac_init_ops(&vioif_dev_ops, "vioif");

	if ((ret = mod_install(&vioif_modlinkage)) != DDI_SUCCESS) {
		mac_fini_ops(&vioif_dev_ops);
	}

	return (ret);
}

int
_fini(void)
{
	int ret;

	if ((ret = mod_remove(&vioif_modlinkage)) == DDI_SUCCESS) {
		mac_fini_ops(&vioif_dev_ops);
	}

	return (ret);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&vioif_modlinkage, modinfop));
}