1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2013 Nexenta Inc. All rights reserved. 14 * Copyright (c) 2014, 2016 by Delphix. All rights reserved. 15 * Copyright 2019 Joyent, Inc. 16 * Copyright 2019 Joshua M. Clulow <josh@sysmgr.org> 17 */ 18 19 /* Based on the NetBSD virtio driver by Minoura Makoto. */ 20 /* 21 * Copyright (c) 2010 Minoura Makoto. 22 * All rights reserved. 23 * 24 * Redistribution and use in source and binary forms, with or without 25 * modification, are permitted provided that the following conditions 26 * are met: 27 * 1. Redistributions of source code must retain the above copyright 28 * notice, this list of conditions and the following disclaimer. 29 * 2. Redistributions in binary form must reproduce the above copyright 30 * notice, this list of conditions and the following disclaimer in the 31 * documentation and/or other materials provided with the distribution. 32 * 33 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 34 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 35 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 36 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 37 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 38 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 39 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 40 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 41 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 42 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
43 */ 44 45 /* 46 * VIRTIO NETWORK DRIVER 47 */ 48 49 #include <sys/types.h> 50 #include <sys/errno.h> 51 #include <sys/param.h> 52 #include <sys/stropts.h> 53 #include <sys/stream.h> 54 #include <sys/strsubr.h> 55 #include <sys/kmem.h> 56 #include <sys/conf.h> 57 #include <sys/devops.h> 58 #include <sys/ksynch.h> 59 #include <sys/stat.h> 60 #include <sys/modctl.h> 61 #include <sys/debug.h> 62 #include <sys/pci.h> 63 #include <sys/ethernet.h> 64 #include <sys/vlan.h> 65 #include <sys/sysmacros.h> 66 #include <sys/smbios.h> 67 68 #include <sys/dlpi.h> 69 #include <sys/taskq.h> 70 71 #include <sys/pattr.h> 72 #include <sys/strsun.h> 73 74 #include <sys/random.h> 75 #include <sys/containerof.h> 76 #include <sys/stream.h> 77 78 #include <sys/mac.h> 79 #include <sys/mac_provider.h> 80 #include <sys/mac_ether.h> 81 82 #include "virtio.h" 83 #include "vioif.h" 84 85 86 static int vioif_quiesce(dev_info_t *); 87 static int vioif_attach(dev_info_t *, ddi_attach_cmd_t); 88 static int vioif_detach(dev_info_t *, ddi_detach_cmd_t); 89 static boolean_t vioif_has_feature(vioif_t *, uint32_t); 90 static void vioif_reclaim_restart(vioif_t *); 91 static int vioif_m_stat(void *, uint_t, uint64_t *); 92 static void vioif_m_stop(void *); 93 static int vioif_m_start(void *); 94 static int vioif_m_multicst(void *, boolean_t, const uint8_t *); 95 static int vioif_m_setpromisc(void *, boolean_t); 96 static int vioif_m_unicst(void *, const uint8_t *); 97 static mblk_t *vioif_m_tx(void *, mblk_t *); 98 static int vioif_m_setprop(void *, const char *, mac_prop_id_t, uint_t, 99 const void *); 100 static int vioif_m_getprop(void *, const char *, mac_prop_id_t, uint_t, void *); 101 static void vioif_m_propinfo(void *, const char *, mac_prop_id_t, 102 mac_prop_info_handle_t); 103 static boolean_t vioif_m_getcapab(void *, mac_capab_t, void *); 104 static uint_t vioif_add_rx(vioif_t *); 105 106 107 static struct cb_ops vioif_cb_ops = { 108 .cb_rev = CB_REV, 109 .cb_flag = D_MP | D_NEW, 110 111 .cb_open = nulldev, 112 .cb_close = nulldev, 113 .cb_strategy = nodev, 114 .cb_print = nodev, 115 .cb_dump = nodev, 116 .cb_read = nodev, 117 .cb_write = nodev, 118 .cb_ioctl = nodev, 119 .cb_devmap = nodev, 120 .cb_mmap = nodev, 121 .cb_segmap = nodev, 122 .cb_chpoll = nochpoll, 123 .cb_prop_op = ddi_prop_op, 124 .cb_str = NULL, 125 .cb_aread = nodev, 126 .cb_awrite = nodev, 127 }; 128 129 static struct dev_ops vioif_dev_ops = { 130 .devo_rev = DEVO_REV, 131 .devo_refcnt = 0, 132 133 .devo_attach = vioif_attach, 134 .devo_detach = vioif_detach, 135 .devo_quiesce = vioif_quiesce, 136 137 .devo_cb_ops = &vioif_cb_ops, 138 139 .devo_getinfo = NULL, 140 .devo_identify = nulldev, 141 .devo_probe = nulldev, 142 .devo_reset = nodev, 143 .devo_bus_ops = NULL, 144 .devo_power = NULL, 145 }; 146 147 static struct modldrv vioif_modldrv = { 148 .drv_modops = &mod_driverops, 149 .drv_linkinfo = "VIRTIO network driver", 150 .drv_dev_ops = &vioif_dev_ops 151 }; 152 153 static struct modlinkage vioif_modlinkage = { 154 .ml_rev = MODREV_1, 155 .ml_linkage = { &vioif_modldrv, NULL } 156 }; 157 158 static mac_callbacks_t vioif_mac_callbacks = { 159 .mc_getstat = vioif_m_stat, 160 .mc_start = vioif_m_start, 161 .mc_stop = vioif_m_stop, 162 .mc_setpromisc = vioif_m_setpromisc, 163 .mc_multicst = vioif_m_multicst, 164 .mc_unicst = vioif_m_unicst, 165 .mc_tx = vioif_m_tx, 166 167 .mc_callbacks = (MC_GETCAPAB | MC_SETPROP | 168 MC_GETPROP | MC_PROPINFO), 169 .mc_getcapab = vioif_m_getcapab, 170 .mc_setprop = vioif_m_setprop, 171 .mc_getprop = 
vioif_m_getprop, 172 .mc_propinfo = vioif_m_propinfo, 173 }; 174 175 static const uchar_t vioif_broadcast[ETHERADDRL] = { 176 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF 177 }; 178 179 /* 180 * Interval for the periodic TX reclaim. 181 */ 182 uint_t vioif_reclaim_ms = 200; 183 184 /* 185 * Allow the operator to override the kinds of interrupts we'll use for 186 * vioif. This value defaults to -1 so that it can be overridden to 0 in 187 * /etc/system. 188 */ 189 int vioif_allowed_int_types = -1; 190 191 /* 192 * DMA attribute template for transmit and receive buffers. The SGL entry 193 * count will be modified before using the template. Note that these 194 * allocations are aligned so that VIOIF_HEADER_SKIP places the IP header in 195 * received frames at the correct offset for the networking stack. 196 */ 197 ddi_dma_attr_t vioif_dma_attr_bufs = { 198 .dma_attr_version = DMA_ATTR_V0, 199 .dma_attr_addr_lo = 0x0000000000000000, 200 .dma_attr_addr_hi = 0xFFFFFFFFFFFFFFFF, 201 .dma_attr_count_max = 0x00000000FFFFFFFF, 202 .dma_attr_align = VIOIF_HEADER_ALIGN, 203 .dma_attr_burstsizes = 1, 204 .dma_attr_minxfer = 1, 205 .dma_attr_maxxfer = 0x00000000FFFFFFFF, 206 .dma_attr_seg = 0x00000000FFFFFFFF, 207 .dma_attr_sgllen = 0, 208 .dma_attr_granular = 1, 209 .dma_attr_flags = 0 210 }; 211 212 /* 213 * DMA attributes for mapping larger transmit buffers from the networking 214 * stack. The requirements are quite loose, but note that the SGL entry length 215 * field is 32-bit. 216 */ 217 ddi_dma_attr_t vioif_dma_attr_external = { 218 .dma_attr_version = DMA_ATTR_V0, 219 .dma_attr_addr_lo = 0x0000000000000000, 220 .dma_attr_addr_hi = 0xFFFFFFFFFFFFFFFF, 221 .dma_attr_count_max = 0x00000000FFFFFFFF, 222 .dma_attr_align = 1, 223 .dma_attr_burstsizes = 1, 224 .dma_attr_minxfer = 1, 225 .dma_attr_maxxfer = 0x00000000FFFFFFFF, 226 .dma_attr_seg = 0x00000000FFFFFFFF, 227 .dma_attr_sgllen = VIOIF_MAX_SEGS - 1, 228 .dma_attr_granular = 1, 229 .dma_attr_flags = 0 230 }; 231 232 233 /* 234 * VIRTIO NET MAC PROPERTIES 235 */ 236 #define VIOIF_MACPROP_TXCOPY_THRESH "_txcopy_thresh" 237 #define VIOIF_MACPROP_TXCOPY_THRESH_DEF 300 238 #define VIOIF_MACPROP_TXCOPY_THRESH_MAX 640 239 240 #define VIOIF_MACPROP_RXCOPY_THRESH "_rxcopy_thresh" 241 #define VIOIF_MACPROP_RXCOPY_THRESH_DEF 300 242 #define VIOIF_MACPROP_RXCOPY_THRESH_MAX 640 243 244 static char *vioif_priv_props[] = { 245 VIOIF_MACPROP_TXCOPY_THRESH, 246 VIOIF_MACPROP_RXCOPY_THRESH, 247 NULL 248 }; 249 250 251 static vioif_txbuf_t * 252 vioif_txbuf_alloc(vioif_t *vif) 253 { 254 vioif_txbuf_t *tb; 255 256 VERIFY(MUTEX_HELD(&vif->vif_mutex)); 257 258 if ((tb = list_remove_head(&vif->vif_txbufs)) != NULL) { 259 vif->vif_ntxbufs_alloc++; 260 } 261 262 return (tb); 263 } 264 265 static void 266 vioif_txbuf_free(vioif_t *vif, vioif_txbuf_t *tb) 267 { 268 VERIFY(MUTEX_HELD(&vif->vif_mutex)); 269 270 VERIFY3U(vif->vif_ntxbufs_alloc, >, 0); 271 vif->vif_ntxbufs_alloc--; 272 273 virtio_chain_clear(tb->tb_chain); 274 list_insert_head(&vif->vif_txbufs, tb); 275 } 276 277 static vioif_rxbuf_t * 278 vioif_rxbuf_alloc(vioif_t *vif) 279 { 280 vioif_rxbuf_t *rb; 281 282 VERIFY(MUTEX_HELD(&vif->vif_mutex)); 283 284 if ((rb = list_remove_head(&vif->vif_rxbufs)) != NULL) { 285 vif->vif_nrxbufs_alloc++; 286 } 287 288 return (rb); 289 } 290 291 static void 292 vioif_rxbuf_free(vioif_t *vif, vioif_rxbuf_t *rb) 293 { 294 VERIFY(MUTEX_HELD(&vif->vif_mutex)); 295 296 VERIFY3U(vif->vif_nrxbufs_alloc, >, 0); 297 vif->vif_nrxbufs_alloc--; 298 299 virtio_chain_clear(rb->rb_chain); 300 
list_insert_head(&vif->vif_rxbufs, rb); 301 } 302 303 static void 304 vioif_rx_free_callback(caddr_t free_arg) 305 { 306 vioif_rxbuf_t *rb = (vioif_rxbuf_t *)free_arg; 307 vioif_t *vif = rb->rb_vioif; 308 309 mutex_enter(&vif->vif_mutex); 310 311 /* 312 * Return this receive buffer to the free list. 313 */ 314 vioif_rxbuf_free(vif, rb); 315 316 VERIFY3U(vif->vif_nrxbufs_onloan, >, 0); 317 vif->vif_nrxbufs_onloan--; 318 319 /* 320 * Attempt to replenish the receive queue with at least the buffer we 321 * just freed. There isn't a great way to deal with failure here, 322 * though: because we'll only loan at most half of the buffers, there 323 * should always be at least some available even if this fails. 324 */ 325 (void) vioif_add_rx(vif); 326 327 mutex_exit(&vif->vif_mutex); 328 } 329 330 static void 331 vioif_free_bufs(vioif_t *vif) 332 { 333 VERIFY(MUTEX_HELD(&vif->vif_mutex)); 334 335 VERIFY3U(vif->vif_ntxbufs_alloc, ==, 0); 336 for (uint_t i = 0; i < vif->vif_txbufs_capacity; i++) { 337 vioif_txbuf_t *tb = &vif->vif_txbufs_mem[i]; 338 339 /* 340 * Ensure that this txbuf is now in the free list: 341 */ 342 VERIFY(list_link_active(&tb->tb_link)); 343 list_remove(&vif->vif_txbufs, tb); 344 345 /* 346 * We should not have an mblk chain at this point. 347 */ 348 VERIFY3P(tb->tb_mp, ==, NULL); 349 350 if (tb->tb_dma != NULL) { 351 virtio_dma_free(tb->tb_dma); 352 tb->tb_dma = NULL; 353 } 354 355 if (tb->tb_chain != NULL) { 356 virtio_chain_free(tb->tb_chain); 357 tb->tb_chain = NULL; 358 } 359 360 if (tb->tb_dmaext != NULL) { 361 for (uint_t j = 0; j < tb->tb_dmaext_capacity; j++) { 362 if (tb->tb_dmaext[j] != NULL) { 363 virtio_dma_free( 364 tb->tb_dmaext[j]); 365 tb->tb_dmaext[j] = NULL; 366 } 367 } 368 369 kmem_free(tb->tb_dmaext, 370 sizeof (virtio_dma_t *) * tb->tb_dmaext_capacity); 371 tb->tb_dmaext = NULL; 372 tb->tb_dmaext_capacity = 0; 373 } 374 } 375 VERIFY(list_is_empty(&vif->vif_txbufs)); 376 if (vif->vif_txbufs_mem != NULL) { 377 kmem_free(vif->vif_txbufs_mem, 378 sizeof (vioif_txbuf_t) * vif->vif_txbufs_capacity); 379 vif->vif_txbufs_mem = NULL; 380 vif->vif_txbufs_capacity = 0; 381 } 382 383 VERIFY3U(vif->vif_nrxbufs_alloc, ==, 0); 384 for (uint_t i = 0; i < vif->vif_rxbufs_capacity; i++) { 385 vioif_rxbuf_t *rb = &vif->vif_rxbufs_mem[i]; 386 387 /* 388 * Ensure that this rxbuf is now in the free list: 389 */ 390 VERIFY(list_link_active(&rb->rb_link)); 391 list_remove(&vif->vif_rxbufs, rb); 392 393 if (rb->rb_dma != NULL) { 394 virtio_dma_free(rb->rb_dma); 395 rb->rb_dma = NULL; 396 } 397 398 if (rb->rb_chain != NULL) { 399 virtio_chain_free(rb->rb_chain); 400 rb->rb_chain = NULL; 401 } 402 } 403 VERIFY(list_is_empty(&vif->vif_rxbufs)); 404 if (vif->vif_rxbufs_mem != NULL) { 405 kmem_free(vif->vif_rxbufs_mem, 406 sizeof (vioif_rxbuf_t) * vif->vif_rxbufs_capacity); 407 vif->vif_rxbufs_mem = NULL; 408 vif->vif_rxbufs_capacity = 0; 409 } 410 } 411 412 static int 413 vioif_alloc_bufs(vioif_t *vif) 414 { 415 VERIFY(MUTEX_HELD(&vif->vif_mutex)); 416 417 /* 418 * Allocate one contiguous chunk of memory for the transmit and receive 419 * buffer tracking objects. If the ring is unusually small, we'll 420 * reduce our target buffer count accordingly. 
421 */ 422 vif->vif_txbufs_capacity = MIN(VIRTIO_NET_TX_BUFS, 423 virtio_queue_size(vif->vif_tx_vq)); 424 vif->vif_txbufs_mem = kmem_zalloc( 425 sizeof (vioif_txbuf_t) * vif->vif_txbufs_capacity, KM_SLEEP); 426 list_create(&vif->vif_txbufs, sizeof (vioif_txbuf_t), 427 offsetof(vioif_txbuf_t, tb_link)); 428 429 vif->vif_rxbufs_capacity = MIN(VIRTIO_NET_RX_BUFS, 430 virtio_queue_size(vif->vif_rx_vq)); 431 vif->vif_rxbufs_mem = kmem_zalloc( 432 sizeof (vioif_rxbuf_t) * vif->vif_rxbufs_capacity, KM_SLEEP); 433 list_create(&vif->vif_rxbufs, sizeof (vioif_rxbuf_t), 434 offsetof(vioif_rxbuf_t, rb_link)); 435 436 /* 437 * Do not loan more than half of our allocated receive buffers into 438 * the networking stack. 439 */ 440 vif->vif_nrxbufs_onloan_max = vif->vif_rxbufs_capacity / 2; 441 442 /* 443 * Put everything in the free list straight away in order to simplify 444 * the use of vioif_free_bufs() for cleanup on allocation failure. 445 */ 446 for (uint_t i = 0; i < vif->vif_txbufs_capacity; i++) { 447 list_insert_tail(&vif->vif_txbufs, &vif->vif_txbufs_mem[i]); 448 } 449 for (uint_t i = 0; i < vif->vif_rxbufs_capacity; i++) { 450 list_insert_tail(&vif->vif_rxbufs, &vif->vif_rxbufs_mem[i]); 451 } 452 453 /* 454 * Start from the DMA attribute template common to both transmit and 455 * receive buffers. The SGL entry count will be modified for each 456 * buffer type. 457 */ 458 ddi_dma_attr_t attr = vioif_dma_attr_bufs; 459 460 /* 461 * The transmit inline buffer is small (less than a page), so it's 462 * reasonable to request a single cookie. 463 */ 464 attr.dma_attr_sgllen = 1; 465 466 for (vioif_txbuf_t *tb = list_head(&vif->vif_txbufs); tb != NULL; 467 tb = list_next(&vif->vif_txbufs, tb)) { 468 if ((tb->tb_dma = virtio_dma_alloc(vif->vif_virtio, 469 VIOIF_TX_INLINE_SIZE, &attr, 470 DDI_DMA_STREAMING | DDI_DMA_WRITE, KM_SLEEP)) == NULL) { 471 goto fail; 472 } 473 VERIFY3U(virtio_dma_ncookies(tb->tb_dma), ==, 1); 474 475 if ((tb->tb_chain = virtio_chain_alloc(vif->vif_tx_vq, 476 KM_SLEEP)) == NULL) { 477 goto fail; 478 } 479 virtio_chain_data_set(tb->tb_chain, tb); 480 481 tb->tb_dmaext_capacity = VIOIF_MAX_SEGS - 1; 482 tb->tb_dmaext = kmem_zalloc( 483 sizeof (virtio_dma_t *) * tb->tb_dmaext_capacity, 484 KM_SLEEP); 485 } 486 487 /* 488 * The receive buffers are larger, and we can tolerate a large number 489 * of segments. Adjust the SGL entry count, setting aside one segment 490 * for the virtio net header. 491 */ 492 attr.dma_attr_sgllen = VIOIF_MAX_SEGS - 1; 493 494 for (vioif_rxbuf_t *rb = list_head(&vif->vif_rxbufs); rb != NULL; 495 rb = list_next(&vif->vif_rxbufs, rb)) { 496 if ((rb->rb_dma = virtio_dma_alloc(vif->vif_virtio, 497 VIOIF_RX_BUF_SIZE, &attr, DDI_DMA_STREAMING | DDI_DMA_READ, 498 KM_SLEEP)) == NULL) { 499 goto fail; 500 } 501 502 if ((rb->rb_chain = virtio_chain_alloc(vif->vif_rx_vq, 503 KM_SLEEP)) == NULL) { 504 goto fail; 505 } 506 virtio_chain_data_set(rb->rb_chain, rb); 507 508 /* 509 * Ensure that the first cookie is sufficient to cover the 510 * header skip region plus one byte. 511 */ 512 VERIFY3U(virtio_dma_cookie_size(rb->rb_dma, 0), >=, 513 VIOIF_HEADER_SKIP + 1); 514 515 /* 516 * Ensure that the frame data begins at a location with a 517 * correctly aligned IP header. 
518 */ 519 VERIFY3U((uintptr_t)virtio_dma_va(rb->rb_dma, 520 VIOIF_HEADER_SKIP) % 4, ==, 2); 521 522 rb->rb_vioif = vif; 523 rb->rb_frtn.free_func = vioif_rx_free_callback; 524 rb->rb_frtn.free_arg = (caddr_t)rb; 525 } 526 527 return (0); 528 529 fail: 530 vioif_free_bufs(vif); 531 return (ENOMEM); 532 } 533 534 static int 535 vioif_m_multicst(void *arg, boolean_t add, const uint8_t *mcst_addr) 536 { 537 /* 538 * Even though we currently do not have support for programming 539 * multicast filters, or even enabling promiscuous mode, we return 540 * success here to avoid the networking stack falling back to link 541 * layer broadcast for multicast traffic. Some hypervisors already 542 * pass received multicast frames onto the guest, so at least on those 543 * systems multicast will work as expected anyway. 544 */ 545 return (0); 546 } 547 548 static int 549 vioif_m_setpromisc(void *arg, boolean_t on) 550 { 551 /* 552 * Even though we cannot currently enable promiscuous mode, we return 553 * success here to allow tools like snoop(1M) to continue to function. 554 */ 555 return (0); 556 } 557 558 static int 559 vioif_m_unicst(void *arg, const uint8_t *mac) 560 { 561 return (ENOTSUP); 562 } 563 564 static uint_t 565 vioif_add_rx(vioif_t *vif) 566 { 567 VERIFY(MUTEX_HELD(&vif->vif_mutex)); 568 569 if (vif->vif_runstate != VIOIF_RUNSTATE_RUNNING) { 570 /* 571 * If the NIC is not running, do not give the device any 572 * receive buffers. 573 */ 574 return (0); 575 } 576 577 uint_t num_added = 0; 578 579 vioif_rxbuf_t *rb; 580 while ((rb = vioif_rxbuf_alloc(vif)) != NULL) { 581 /* 582 * For legacy devices, and those that have not negotiated 583 * VIRTIO_F_ANY_LAYOUT, the virtio net header must appear in a 584 * separate descriptor entry to the rest of the buffer. 585 */ 586 if (virtio_chain_append(rb->rb_chain, 587 virtio_dma_cookie_pa(rb->rb_dma, 0), 588 sizeof (struct virtio_net_hdr), 589 VIRTIO_DIR_DEVICE_WRITES) != DDI_SUCCESS) { 590 goto fail; 591 } 592 593 for (uint_t n = 0; n < virtio_dma_ncookies(rb->rb_dma); n++) { 594 uint64_t pa = virtio_dma_cookie_pa(rb->rb_dma, n); 595 size_t sz = virtio_dma_cookie_size(rb->rb_dma, n); 596 597 if (n == 0) { 598 pa += VIOIF_HEADER_SKIP; 599 VERIFY3U(sz, >, VIOIF_HEADER_SKIP); 600 sz -= VIOIF_HEADER_SKIP; 601 } 602 603 if (virtio_chain_append(rb->rb_chain, pa, sz, 604 VIRTIO_DIR_DEVICE_WRITES) != DDI_SUCCESS) { 605 goto fail; 606 } 607 } 608 609 virtio_chain_submit(rb->rb_chain, B_FALSE); 610 num_added++; 611 continue; 612 613 fail: 614 vioif_rxbuf_free(vif, rb); 615 vif->vif_norecvbuf++; 616 break; 617 } 618 619 if (num_added > 0) { 620 virtio_queue_flush(vif->vif_rx_vq); 621 } 622 623 return (num_added); 624 } 625 626 static uint_t 627 vioif_process_rx(vioif_t *vif) 628 { 629 virtio_chain_t *vic; 630 mblk_t *mphead = NULL, *lastmp = NULL, *mp; 631 uint_t num_processed = 0; 632 633 VERIFY(MUTEX_HELD(&vif->vif_mutex)); 634 635 while ((vic = virtio_queue_poll(vif->vif_rx_vq)) != NULL) { 636 /* 637 * We have to use the chain received length here, as the device 638 * does not tell us the received frame length any other way. 639 * In a limited survey of hypervisors, virtio network devices 640 * appear to provide the right value here. 641 */ 642 size_t len = virtio_chain_received_length(vic); 643 vioif_rxbuf_t *rb = virtio_chain_data(vic); 644 645 virtio_dma_sync(rb->rb_dma, DDI_DMA_SYNC_FORCPU); 646 647 /* 648 * If the NIC is not running, discard any received frames. 
649 */ 650 if (vif->vif_runstate != VIOIF_RUNSTATE_RUNNING) { 651 vioif_rxbuf_free(vif, rb); 652 continue; 653 } 654 655 if (len < sizeof (struct virtio_net_hdr)) { 656 vif->vif_rxfail_chain_undersize++; 657 vif->vif_ierrors++; 658 vioif_rxbuf_free(vif, rb); 659 continue; 660 } 661 len -= sizeof (struct virtio_net_hdr); 662 663 /* 664 * We copy small packets that happen to fit into a single 665 * cookie and reuse the buffers. For bigger ones, we loan 666 * the buffers upstream. 667 */ 668 if (len < vif->vif_rxcopy_thresh || 669 vif->vif_nrxbufs_onloan >= vif->vif_nrxbufs_onloan_max) { 670 mutex_exit(&vif->vif_mutex); 671 if ((mp = allocb(len, 0)) == NULL) { 672 mutex_enter(&vif->vif_mutex); 673 vif->vif_norecvbuf++; 674 vif->vif_ierrors++; 675 676 vioif_rxbuf_free(vif, rb); 677 continue; 678 } 679 680 bcopy(virtio_dma_va(rb->rb_dma, VIOIF_HEADER_SKIP), 681 mp->b_rptr, len); 682 mp->b_wptr = mp->b_rptr + len; 683 684 /* 685 * As the packet contents was copied rather than 686 * loaned, we can return the receive buffer resources 687 * to the free list. 688 */ 689 mutex_enter(&vif->vif_mutex); 690 vioif_rxbuf_free(vif, rb); 691 692 } else { 693 mutex_exit(&vif->vif_mutex); 694 if ((mp = desballoc(virtio_dma_va(rb->rb_dma, 695 VIOIF_HEADER_SKIP), len, 0, 696 &rb->rb_frtn)) == NULL) { 697 mutex_enter(&vif->vif_mutex); 698 vif->vif_norecvbuf++; 699 vif->vif_ierrors++; 700 701 vioif_rxbuf_free(vif, rb); 702 continue; 703 } 704 mp->b_wptr = mp->b_rptr + len; 705 706 mutex_enter(&vif->vif_mutex); 707 vif->vif_nrxbufs_onloan++; 708 } 709 710 /* 711 * virtio-net does not tell us if this packet is multicast 712 * or broadcast, so we have to check it. 713 */ 714 if (mp->b_rptr[0] & 0x1) { 715 if (bcmp(mp->b_rptr, vioif_broadcast, ETHERADDRL) != 0) 716 vif->vif_multircv++; 717 else 718 vif->vif_brdcstrcv++; 719 } 720 721 vif->vif_rbytes += len; 722 vif->vif_ipackets++; 723 724 if (lastmp == NULL) { 725 mphead = mp; 726 } else { 727 lastmp->b_next = mp; 728 } 729 lastmp = mp; 730 num_processed++; 731 } 732 733 if (mphead != NULL) { 734 if (vif->vif_runstate == VIOIF_RUNSTATE_RUNNING) { 735 mutex_exit(&vif->vif_mutex); 736 mac_rx(vif->vif_mac_handle, NULL, mphead); 737 mutex_enter(&vif->vif_mutex); 738 } else { 739 /* 740 * The NIC was disabled part way through our execution, 741 * so free the messages we allocated. 742 */ 743 freemsgchain(mphead); 744 } 745 } 746 747 return (num_processed); 748 } 749 750 static uint_t 751 vioif_reclaim_used_tx(vioif_t *vif) 752 { 753 virtio_chain_t *vic; 754 uint_t num_reclaimed = 0; 755 756 VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex)); 757 758 while ((vic = virtio_queue_poll(vif->vif_tx_vq)) != NULL) { 759 vioif_txbuf_t *tb = virtio_chain_data(vic); 760 761 if (tb->tb_mp != NULL) { 762 /* 763 * Unbind the external mapping. 764 */ 765 for (uint_t i = 0; i < tb->tb_dmaext_capacity; i++) { 766 if (tb->tb_dmaext[i] == NULL) { 767 continue; 768 } 769 770 virtio_dma_unbind(tb->tb_dmaext[i]); 771 } 772 773 freemsg(tb->tb_mp); 774 tb->tb_mp = NULL; 775 } 776 777 /* 778 * Return this transmit buffer to the free list for reuse. 779 */ 780 mutex_enter(&vif->vif_mutex); 781 vioif_txbuf_free(vif, tb); 782 mutex_exit(&vif->vif_mutex); 783 784 num_reclaimed++; 785 } 786 787 /* Return ring to transmitting state if descriptors were reclaimed. */ 788 if (num_reclaimed > 0) { 789 boolean_t do_update = B_FALSE; 790 791 mutex_enter(&vif->vif_mutex); 792 vif->vif_stat_tx_reclaim += num_reclaimed; 793 if (vif->vif_tx_corked) { 794 /* 795 * TX was corked on a lack of available descriptors. 
796 * That dire state has passed so the TX interrupt can 797 * be disabled and MAC can be notified that 798 * transmission is possible again. 799 */ 800 vif->vif_tx_corked = B_FALSE; 801 virtio_queue_no_interrupt(vif->vif_tx_vq, B_TRUE); 802 do_update = B_TRUE; 803 } 804 805 if (do_update) { 806 mac_tx_update(vif->vif_mac_handle); 807 } 808 mutex_exit(&vif->vif_mutex); 809 } 810 811 return (num_reclaimed); 812 } 813 814 static void 815 vioif_reclaim_periodic(void *arg) 816 { 817 vioif_t *vif = arg; 818 uint_t num_reclaimed; 819 820 num_reclaimed = vioif_reclaim_used_tx(vif); 821 822 mutex_enter(&vif->vif_mutex); 823 vif->vif_tx_reclaim_tid = 0; 824 /* 825 * If used descriptors were reclaimed or TX descriptors appear to be 826 * outstanding, the ring is considered active and periodic reclamation 827 * is necessary for now. 828 */ 829 if (num_reclaimed != 0 || virtio_queue_nactive(vif->vif_tx_vq) != 0) { 830 /* Do not reschedule if the ring is being drained. */ 831 if (!vif->vif_tx_drain) { 832 vioif_reclaim_restart(vif); 833 } 834 } 835 mutex_exit(&vif->vif_mutex); 836 } 837 838 static void 839 vioif_reclaim_restart(vioif_t *vif) 840 { 841 VERIFY(MUTEX_HELD(&vif->vif_mutex)); 842 VERIFY(!vif->vif_tx_drain); 843 844 if (vif->vif_tx_reclaim_tid == 0) { 845 vif->vif_tx_reclaim_tid = timeout(vioif_reclaim_periodic, vif, 846 MSEC_TO_TICK_ROUNDUP(vioif_reclaim_ms)); 847 } 848 } 849 850 static void 851 vioif_tx_drain(vioif_t *vif) 852 { 853 VERIFY(MUTEX_HELD(&vif->vif_mutex)); 854 VERIFY3S(vif->vif_runstate, ==, VIOIF_RUNSTATE_STOPPING); 855 856 vif->vif_tx_drain = B_TRUE; 857 /* Put a stop to the periodic reclaim if it is running */ 858 if (vif->vif_tx_reclaim_tid != 0) { 859 timeout_id_t tid = vif->vif_tx_reclaim_tid; 860 861 /* 862 * With vif_tx_drain set, there is no risk that a racing 863 * vioif_reclaim_periodic() call will reschedule itself. 864 * 865 * Being part of the mc_stop hook also guarantees that 866 * vioif_m_tx() will not be called to restart it. 867 */ 868 vif->vif_tx_reclaim_tid = 0; 869 mutex_exit(&vif->vif_mutex); 870 (void) untimeout(tid); 871 mutex_enter(&vif->vif_mutex); 872 } 873 virtio_queue_no_interrupt(vif->vif_tx_vq, B_TRUE); 874 875 /* 876 * Wait for all of the TX descriptors to be processed by the host so 877 * they can be reclaimed. 878 */ 879 while (vif->vif_ntxbufs_alloc > 0) { 880 mutex_exit(&vif->vif_mutex); 881 (void) vioif_reclaim_used_tx(vif); 882 delay(5); 883 mutex_enter(&vif->vif_mutex); 884 } 885 VERIFY(!vif->vif_tx_corked); 886 VERIFY3U(vif->vif_tx_reclaim_tid, ==, 0); 887 VERIFY3U(virtio_queue_nactive(vif->vif_tx_vq), ==, 0); 888 } 889 890 static int 891 vioif_tx_inline(vioif_t *vif, vioif_txbuf_t *tb, mblk_t *mp, size_t msg_size) 892 { 893 VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex)); 894 895 VERIFY3U(msg_size, <=, virtio_dma_size(tb->tb_dma) - VIOIF_HEADER_SKIP); 896 897 /* 898 * Copy the message into the inline buffer and then free the message. 
899 */ 900 mcopymsg(mp, virtio_dma_va(tb->tb_dma, VIOIF_HEADER_SKIP)); 901 902 if (virtio_chain_append(tb->tb_chain, 903 virtio_dma_cookie_pa(tb->tb_dma, 0) + VIOIF_HEADER_SKIP, 904 msg_size, VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) { 905 return (DDI_FAILURE); 906 } 907 908 return (DDI_SUCCESS); 909 } 910 911 static int 912 vioif_tx_external(vioif_t *vif, vioif_txbuf_t *tb, mblk_t *mp, size_t msg_size) 913 { 914 VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex)); 915 916 mblk_t *nmp = mp; 917 tb->tb_ndmaext = 0; 918 919 while (nmp != NULL) { 920 size_t len; 921 922 if ((len = MBLKL(nmp)) == 0) { 923 /* 924 * Skip any zero-length entries in the chain. 925 */ 926 nmp = nmp->b_cont; 927 continue; 928 } 929 930 if (tb->tb_ndmaext >= tb->tb_dmaext_capacity) { 931 mutex_enter(&vif->vif_mutex); 932 vif->vif_txfail_indirect_limit++; 933 vif->vif_notxbuf++; 934 mutex_exit(&vif->vif_mutex); 935 goto fail; 936 } 937 938 if (tb->tb_dmaext[tb->tb_ndmaext] == NULL) { 939 /* 940 * Allocate a DMA handle for this slot. 941 */ 942 if ((tb->tb_dmaext[tb->tb_ndmaext] = 943 virtio_dma_alloc_nomem(vif->vif_virtio, 944 &vioif_dma_attr_external, KM_SLEEP)) == NULL) { 945 mutex_enter(&vif->vif_mutex); 946 vif->vif_notxbuf++; 947 mutex_exit(&vif->vif_mutex); 948 goto fail; 949 } 950 } 951 virtio_dma_t *extdma = tb->tb_dmaext[tb->tb_ndmaext++]; 952 953 if (virtio_dma_bind(extdma, nmp->b_rptr, len, 954 DDI_DMA_WRITE | DDI_DMA_STREAMING, KM_SLEEP) != 955 DDI_SUCCESS) { 956 mutex_enter(&vif->vif_mutex); 957 vif->vif_txfail_dma_bind++; 958 mutex_exit(&vif->vif_mutex); 959 goto fail; 960 } 961 962 for (uint_t n = 0; n < virtio_dma_ncookies(extdma); n++) { 963 uint64_t pa = virtio_dma_cookie_pa(extdma, n); 964 size_t sz = virtio_dma_cookie_size(extdma, n); 965 966 if (virtio_chain_append(tb->tb_chain, pa, sz, 967 VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) { 968 mutex_enter(&vif->vif_mutex); 969 vif->vif_txfail_indirect_limit++; 970 vif->vif_notxbuf++; 971 mutex_exit(&vif->vif_mutex); 972 goto fail; 973 } 974 } 975 976 nmp = nmp->b_cont; 977 } 978 979 /* 980 * We need to keep the message around until we reclaim the buffer from 981 * the device before freeing it. 982 */ 983 tb->tb_mp = mp; 984 985 return (DDI_SUCCESS); 986 987 fail: 988 for (uint_t n = 0; n < tb->tb_ndmaext; n++) { 989 if (tb->tb_dmaext[n] != NULL) { 990 virtio_dma_unbind(tb->tb_dmaext[n]); 991 } 992 } 993 tb->tb_ndmaext = 0; 994 995 freemsg(mp); 996 997 return (DDI_FAILURE); 998 } 999 1000 static boolean_t 1001 vioif_send(vioif_t *vif, mblk_t *mp) 1002 { 1003 VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex)); 1004 1005 vioif_txbuf_t *tb = NULL; 1006 struct virtio_net_hdr *vnh = NULL; 1007 size_t msg_size = 0; 1008 uint32_t csum_start; 1009 uint32_t csum_stuff; 1010 uint32_t csum_flags; 1011 uint32_t lso_flags; 1012 uint32_t lso_mss; 1013 mblk_t *nmp; 1014 int ret; 1015 boolean_t lso_required = B_FALSE; 1016 struct ether_header *ether = (void *)mp->b_rptr; 1017 1018 for (nmp = mp; nmp; nmp = nmp->b_cont) 1019 msg_size += MBLKL(nmp); 1020 1021 if (vif->vif_tx_tso4) { 1022 mac_lso_get(mp, &lso_mss, &lso_flags); 1023 lso_required = (lso_flags & HW_LSO) != 0; 1024 } 1025 1026 mutex_enter(&vif->vif_mutex); 1027 if ((tb = vioif_txbuf_alloc(vif)) == NULL) { 1028 vif->vif_notxbuf++; 1029 goto fail; 1030 } 1031 mutex_exit(&vif->vif_mutex); 1032 1033 /* 1034 * Use the inline buffer for the virtio net header. Zero the portion 1035 * of our DMA allocation prior to the packet data. 
1036 */ 1037 vnh = virtio_dma_va(tb->tb_dma, 0); 1038 bzero(vnh, VIOIF_HEADER_SKIP); 1039 1040 /* 1041 * For legacy devices, and those that have not negotiated 1042 * VIRTIO_F_ANY_LAYOUT, the virtio net header must appear in a separate 1043 * descriptor entry to the rest of the buffer. 1044 */ 1045 if (virtio_chain_append(tb->tb_chain, 1046 virtio_dma_cookie_pa(tb->tb_dma, 0), sizeof (struct virtio_net_hdr), 1047 VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) { 1048 mutex_enter(&vif->vif_mutex); 1049 vif->vif_notxbuf++; 1050 goto fail; 1051 } 1052 1053 mac_hcksum_get(mp, &csum_start, &csum_stuff, NULL, NULL, &csum_flags); 1054 1055 /* 1056 * They want us to do the TCP/UDP csum calculation. 1057 */ 1058 if (csum_flags & HCK_PARTIALCKSUM) { 1059 int eth_hsize; 1060 1061 /* 1062 * Did we ask for it? 1063 */ 1064 ASSERT(vif->vif_tx_csum); 1065 1066 /* 1067 * We only asked for partial csum packets. 1068 */ 1069 ASSERT(!(csum_flags & HCK_IPV4_HDRCKSUM)); 1070 ASSERT(!(csum_flags & HCK_FULLCKSUM)); 1071 1072 if (ether->ether_type == htons(ETHERTYPE_VLAN)) { 1073 eth_hsize = sizeof (struct ether_vlan_header); 1074 } else { 1075 eth_hsize = sizeof (struct ether_header); 1076 } 1077 1078 vnh->vnh_flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; 1079 vnh->vnh_csum_start = eth_hsize + csum_start; 1080 vnh->vnh_csum_offset = csum_stuff - csum_start; 1081 } 1082 1083 /* 1084 * Setup LSO fields if required. 1085 */ 1086 if (lso_required) { 1087 vnh->vnh_gso_type = VIRTIO_NET_HDR_GSO_TCPV4; 1088 vnh->vnh_gso_size = (uint16_t)lso_mss; 1089 } 1090 1091 /* 1092 * The device does not maintain its own statistics about broadcast or 1093 * multicast packets, so we have to check the destination address 1094 * ourselves. 1095 */ 1096 if ((ether->ether_dhost.ether_addr_octet[0] & 0x01) != 0) { 1097 mutex_enter(&vif->vif_mutex); 1098 if (ether_cmp(&ether->ether_dhost, vioif_broadcast) == 0) { 1099 vif->vif_brdcstxmt++; 1100 } else { 1101 vif->vif_multixmt++; 1102 } 1103 mutex_exit(&vif->vif_mutex); 1104 } 1105 1106 /* 1107 * For small packets, copy into the preallocated inline buffer rather 1108 * than incur the overhead of mapping. Note that both of these 1109 * functions ensure that "mp" is freed before returning. 1110 */ 1111 if (msg_size < vif->vif_txcopy_thresh) { 1112 ret = vioif_tx_inline(vif, tb, mp, msg_size); 1113 } else { 1114 ret = vioif_tx_external(vif, tb, mp, msg_size); 1115 } 1116 mp = NULL; 1117 1118 mutex_enter(&vif->vif_mutex); 1119 1120 if (ret != DDI_SUCCESS) { 1121 goto fail; 1122 } 1123 1124 vif->vif_opackets++; 1125 vif->vif_obytes += msg_size; 1126 mutex_exit(&vif->vif_mutex); 1127 1128 virtio_dma_sync(tb->tb_dma, DDI_DMA_SYNC_FORDEV); 1129 virtio_chain_submit(tb->tb_chain, B_TRUE); 1130 1131 return (B_TRUE); 1132 1133 fail: 1134 vif->vif_oerrors++; 1135 if (tb != NULL) { 1136 vioif_txbuf_free(vif, tb); 1137 } 1138 mutex_exit(&vif->vif_mutex); 1139 1140 return (mp == NULL); 1141 } 1142 1143 static mblk_t * 1144 vioif_m_tx(void *arg, mblk_t *mp) 1145 { 1146 vioif_t *vif = arg; 1147 mblk_t *nmp; 1148 1149 /* 1150 * Prior to attempting to send any more frames, do a reclaim to pick up 1151 * any descriptors which have been processed by the host. 1152 */ 1153 if (virtio_queue_nactive(vif->vif_tx_vq) != 0) { 1154 (void) vioif_reclaim_used_tx(vif); 1155 } 1156 1157 while (mp != NULL) { 1158 nmp = mp->b_next; 1159 mp->b_next = NULL; 1160 1161 if (!vioif_send(vif, mp)) { 1162 /* 1163 * If there are no descriptors available, try to 1164 * reclaim some, allowing a retry of the send if some 1165 * are found. 
1166 */ 1167 mp->b_next = nmp; 1168 if (vioif_reclaim_used_tx(vif) != 0) { 1169 continue; 1170 } 1171 1172 /* 1173 * Otherwise, enable the TX ring interrupt so that as 1174 * soon as a descriptor becomes available, transmission 1175 * can begin again. For safety, make sure the periodic 1176 * reclaim is running as well. 1177 */ 1178 mutex_enter(&vif->vif_mutex); 1179 vif->vif_tx_corked = B_TRUE; 1180 virtio_queue_no_interrupt(vif->vif_tx_vq, B_FALSE); 1181 vioif_reclaim_restart(vif); 1182 mutex_exit(&vif->vif_mutex); 1183 return (mp); 1184 } 1185 mp = nmp; 1186 } 1187 1188 /* Ensure the periodic reclaim has been started. */ 1189 mutex_enter(&vif->vif_mutex); 1190 vioif_reclaim_restart(vif); 1191 mutex_exit(&vif->vif_mutex); 1192 1193 return (NULL); 1194 } 1195 1196 static int 1197 vioif_m_start(void *arg) 1198 { 1199 vioif_t *vif = arg; 1200 1201 mutex_enter(&vif->vif_mutex); 1202 1203 VERIFY3S(vif->vif_runstate, ==, VIOIF_RUNSTATE_STOPPED); 1204 vif->vif_runstate = VIOIF_RUNSTATE_RUNNING; 1205 1206 mac_link_update(vif->vif_mac_handle, LINK_STATE_UP); 1207 1208 virtio_queue_no_interrupt(vif->vif_rx_vq, B_FALSE); 1209 1210 /* 1211 * Starting interrupts on the TX virtqueue is unnecessary at this time. 1212 * Descriptor reclamation is handled during transmit, via a periodic 1213 * timer, and when resources are tight, via the then-enabled interrupt. 1214 */ 1215 vif->vif_tx_drain = B_FALSE; 1216 1217 /* 1218 * Add as many receive buffers as we can to the receive queue. If we 1219 * cannot add any, it may be because we have stopped and started again 1220 * and the descriptors are all in the queue already. 1221 */ 1222 (void) vioif_add_rx(vif); 1223 1224 mutex_exit(&vif->vif_mutex); 1225 return (DDI_SUCCESS); 1226 } 1227 1228 static void 1229 vioif_m_stop(void *arg) 1230 { 1231 vioif_t *vif = arg; 1232 1233 mutex_enter(&vif->vif_mutex); 1234 1235 VERIFY3S(vif->vif_runstate, ==, VIOIF_RUNSTATE_RUNNING); 1236 vif->vif_runstate = VIOIF_RUNSTATE_STOPPING; 1237 1238 /* Ensure all TX descriptors have been processed and reclaimed */ 1239 vioif_tx_drain(vif); 1240 1241 virtio_queue_no_interrupt(vif->vif_rx_vq, B_TRUE); 1242 1243 vif->vif_runstate = VIOIF_RUNSTATE_STOPPED; 1244 mutex_exit(&vif->vif_mutex); 1245 } 1246 1247 static int 1248 vioif_m_stat(void *arg, uint_t stat, uint64_t *val) 1249 { 1250 vioif_t *vif = arg; 1251 1252 switch (stat) { 1253 case MAC_STAT_IERRORS: 1254 *val = vif->vif_ierrors; 1255 break; 1256 case MAC_STAT_OERRORS: 1257 *val = vif->vif_oerrors; 1258 break; 1259 case MAC_STAT_MULTIRCV: 1260 *val = vif->vif_multircv; 1261 break; 1262 case MAC_STAT_BRDCSTRCV: 1263 *val = vif->vif_brdcstrcv; 1264 break; 1265 case MAC_STAT_MULTIXMT: 1266 *val = vif->vif_multixmt; 1267 break; 1268 case MAC_STAT_BRDCSTXMT: 1269 *val = vif->vif_brdcstxmt; 1270 break; 1271 case MAC_STAT_IPACKETS: 1272 *val = vif->vif_ipackets; 1273 break; 1274 case MAC_STAT_RBYTES: 1275 *val = vif->vif_rbytes; 1276 break; 1277 case MAC_STAT_OPACKETS: 1278 *val = vif->vif_opackets; 1279 break; 1280 case MAC_STAT_OBYTES: 1281 *val = vif->vif_obytes; 1282 break; 1283 case MAC_STAT_NORCVBUF: 1284 *val = vif->vif_norecvbuf; 1285 break; 1286 case MAC_STAT_NOXMTBUF: 1287 *val = vif->vif_notxbuf; 1288 break; 1289 case MAC_STAT_IFSPEED: 1290 /* always 1 Gbit */ 1291 *val = 1000000000ULL; 1292 break; 1293 case ETHER_STAT_LINK_DUPLEX: 1294 /* virtual device, always full-duplex */ 1295 *val = LINK_DUPLEX_FULL; 1296 break; 1297 1298 default: 1299 return (ENOTSUP); 1300 } 1301 1302 return (DDI_SUCCESS); 1303 } 1304 1305 static int 
1306 vioif_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, 1307 uint_t pr_valsize, const void *pr_val) 1308 { 1309 vioif_t *vif = arg; 1310 1311 switch (pr_num) { 1312 case MAC_PROP_MTU: { 1313 int r; 1314 uint32_t mtu; 1315 if (pr_valsize < sizeof (mtu)) { 1316 return (EOVERFLOW); 1317 } 1318 bcopy(pr_val, &mtu, sizeof (mtu)); 1319 1320 if (mtu < ETHERMIN || mtu > vif->vif_mtu_max) { 1321 return (EINVAL); 1322 } 1323 1324 mutex_enter(&vif->vif_mutex); 1325 if ((r = mac_maxsdu_update(vif->vif_mac_handle, mtu)) == 0) { 1326 vif->vif_mtu = mtu; 1327 } 1328 mutex_exit(&vif->vif_mutex); 1329 1330 return (r); 1331 } 1332 1333 case MAC_PROP_PRIVATE: { 1334 long max, result; 1335 uint_t *resp; 1336 char *endptr; 1337 1338 if (strcmp(pr_name, VIOIF_MACPROP_TXCOPY_THRESH) == 0) { 1339 max = VIOIF_MACPROP_TXCOPY_THRESH_MAX; 1340 resp = &vif->vif_txcopy_thresh; 1341 } else if (strcmp(pr_name, VIOIF_MACPROP_RXCOPY_THRESH) == 0) { 1342 max = VIOIF_MACPROP_RXCOPY_THRESH_MAX; 1343 resp = &vif->vif_rxcopy_thresh; 1344 } else { 1345 return (ENOTSUP); 1346 } 1347 1348 if (pr_val == NULL) { 1349 return (EINVAL); 1350 } 1351 1352 if (ddi_strtol(pr_val, &endptr, 10, &result) != 0 || 1353 *endptr != '\0' || result < 0 || result > max) { 1354 return (EINVAL); 1355 } 1356 1357 mutex_enter(&vif->vif_mutex); 1358 *resp = result; 1359 mutex_exit(&vif->vif_mutex); 1360 1361 return (0); 1362 } 1363 1364 default: 1365 return (ENOTSUP); 1366 } 1367 } 1368 1369 static int 1370 vioif_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, 1371 uint_t pr_valsize, void *pr_val) 1372 { 1373 vioif_t *vif = arg; 1374 1375 switch (pr_num) { 1376 case MAC_PROP_PRIVATE: { 1377 uint_t value; 1378 1379 if (strcmp(pr_name, VIOIF_MACPROP_TXCOPY_THRESH) == 0) { 1380 value = vif->vif_txcopy_thresh; 1381 } else if (strcmp(pr_name, VIOIF_MACPROP_RXCOPY_THRESH) == 0) { 1382 value = vif->vif_rxcopy_thresh; 1383 } else { 1384 return (ENOTSUP); 1385 } 1386 1387 if (snprintf(pr_val, pr_valsize, "%u", value) >= pr_valsize) { 1388 return (EOVERFLOW); 1389 } 1390 1391 return (0); 1392 } 1393 1394 default: 1395 return (ENOTSUP); 1396 } 1397 } 1398 1399 static void 1400 vioif_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num, 1401 mac_prop_info_handle_t prh) 1402 { 1403 vioif_t *vif = arg; 1404 char valstr[64]; 1405 int value; 1406 1407 switch (pr_num) { 1408 case MAC_PROP_MTU: 1409 mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW); 1410 mac_prop_info_set_range_uint32(prh, ETHERMIN, vif->vif_mtu_max); 1411 return; 1412 1413 case MAC_PROP_PRIVATE: 1414 if (strcmp(pr_name, VIOIF_MACPROP_TXCOPY_THRESH) == 0) { 1415 value = VIOIF_MACPROP_TXCOPY_THRESH_DEF; 1416 } else if (strcmp(pr_name, VIOIF_MACPROP_RXCOPY_THRESH) == 0) { 1417 value = VIOIF_MACPROP_RXCOPY_THRESH_DEF; 1418 } else { 1419 /* 1420 * We do not recognise this private property name. 
1421 */ 1422 return; 1423 } 1424 mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW); 1425 (void) snprintf(valstr, sizeof (valstr), "%d", value); 1426 mac_prop_info_set_default_str(prh, valstr); 1427 return; 1428 1429 default: 1430 return; 1431 } 1432 } 1433 1434 static boolean_t 1435 vioif_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) 1436 { 1437 vioif_t *vif = arg; 1438 1439 switch (cap) { 1440 case MAC_CAPAB_HCKSUM: { 1441 if (!vif->vif_tx_csum) { 1442 return (B_FALSE); 1443 } 1444 1445 *(uint32_t *)cap_data = HCKSUM_INET_PARTIAL; 1446 1447 return (B_TRUE); 1448 } 1449 1450 case MAC_CAPAB_LSO: { 1451 if (!vif->vif_tx_tso4) { 1452 return (B_FALSE); 1453 } 1454 1455 mac_capab_lso_t *lso = cap_data; 1456 lso->lso_flags = LSO_TX_BASIC_TCP_IPV4; 1457 lso->lso_basic_tcp_ipv4.lso_max = VIOIF_RX_DATA_SIZE; 1458 1459 return (B_TRUE); 1460 } 1461 1462 default: 1463 return (B_FALSE); 1464 } 1465 } 1466 1467 static boolean_t 1468 vioif_has_feature(vioif_t *vif, uint32_t feature) 1469 { 1470 return (virtio_feature_present(vif->vif_virtio, feature)); 1471 } 1472 1473 /* 1474 * Read the primary MAC address from the device if one is provided. If not, 1475 * generate a random locally administered MAC address and write it back to the 1476 * device. 1477 */ 1478 static void 1479 vioif_get_mac(vioif_t *vif) 1480 { 1481 VERIFY(MUTEX_HELD(&vif->vif_mutex)); 1482 1483 if (vioif_has_feature(vif, VIRTIO_NET_F_MAC)) { 1484 for (uint_t i = 0; i < ETHERADDRL; i++) { 1485 vif->vif_mac[i] = virtio_dev_get8(vif->vif_virtio, 1486 VIRTIO_NET_CONFIG_MAC + i); 1487 } 1488 vif->vif_mac_from_host = 1; 1489 1490 return; 1491 } 1492 1493 /* Get a few random bytes */ 1494 (void) random_get_pseudo_bytes(vif->vif_mac, ETHERADDRL); 1495 /* Make sure it's a unicast MAC */ 1496 vif->vif_mac[0] &= ~1; 1497 /* Set the "locally administered" bit */ 1498 vif->vif_mac[1] |= 2; 1499 1500 /* 1501 * Write the random MAC address back to the device. 1502 */ 1503 for (uint_t i = 0; i < ETHERADDRL; i++) { 1504 virtio_dev_put8(vif->vif_virtio, VIRTIO_NET_CONFIG_MAC + i, 1505 vif->vif_mac[i]); 1506 } 1507 vif->vif_mac_from_host = 0; 1508 1509 dev_err(vif->vif_dip, CE_NOTE, "!Generated a random MAC address: " 1510 "%02x:%02x:%02x:%02x:%02x:%02x", 1511 (uint_t)vif->vif_mac[0], (uint_t)vif->vif_mac[1], 1512 (uint_t)vif->vif_mac[2], (uint_t)vif->vif_mac[3], 1513 (uint_t)vif->vif_mac[4], (uint_t)vif->vif_mac[5]); 1514 } 1515 1516 /* 1517 * Virtqueue interrupt handlers 1518 */ 1519 static uint_t 1520 vioif_rx_handler(caddr_t arg0, caddr_t arg1) 1521 { 1522 vioif_t *vif = (vioif_t *)arg0; 1523 1524 mutex_enter(&vif->vif_mutex); 1525 (void) vioif_process_rx(vif); 1526 1527 /* 1528 * Attempt to replenish the receive queue. If we cannot add any 1529 * descriptors here, it may be because all of the recently received 1530 * packets were loaned up to the networking stack. 1531 */ 1532 (void) vioif_add_rx(vif); 1533 mutex_exit(&vif->vif_mutex); 1534 1535 return (DDI_INTR_CLAIMED); 1536 } 1537 1538 static uint_t 1539 vioif_tx_handler(caddr_t arg0, caddr_t arg1) 1540 { 1541 vioif_t *vif = (vioif_t *)arg0; 1542 1543 /* 1544 * The TX interrupt could race with other reclamation activity, so 1545 * interpreting the return value is unimportant. 
1546 */ 1547 (void) vioif_reclaim_used_tx(vif); 1548 1549 return (DDI_INTR_CLAIMED); 1550 } 1551 1552 static void 1553 vioif_check_features(vioif_t *vif) 1554 { 1555 VERIFY(MUTEX_HELD(&vif->vif_mutex)); 1556 1557 vif->vif_tx_csum = 0; 1558 vif->vif_tx_tso4 = 0; 1559 1560 if (vioif_has_feature(vif, VIRTIO_NET_F_CSUM)) { 1561 /* 1562 * The host will accept packets with partial checksums from us. 1563 */ 1564 vif->vif_tx_csum = 1; 1565 1566 /* 1567 * The legacy GSO feature represents the combination of 1568 * HOST_TSO4, HOST_TSO6, and HOST_ECN. 1569 */ 1570 boolean_t gso = vioif_has_feature(vif, VIRTIO_NET_F_GSO); 1571 boolean_t tso4 = vioif_has_feature(vif, VIRTIO_NET_F_HOST_TSO4); 1572 boolean_t ecn = vioif_has_feature(vif, VIRTIO_NET_F_HOST_ECN); 1573 1574 /* 1575 * Explicit congestion notification (ECN) is configured 1576 * globally; see "tcp_ecn_permitted". As we cannot currently 1577 * request that the stack disable ECN on a per interface basis, 1578 * we require the device to support the combination of 1579 * segmentation offload and ECN support. 1580 */ 1581 if (gso || (tso4 && ecn)) { 1582 vif->vif_tx_tso4 = 1; 1583 } 1584 } 1585 } 1586 1587 static int 1588 vioif_select_interrupt_types(void) 1589 { 1590 id_t id; 1591 smbios_system_t sys; 1592 smbios_info_t info; 1593 1594 if (vioif_allowed_int_types != -1) { 1595 /* 1596 * If this value was tuned via /etc/system or the debugger, 1597 * use the provided value directly. 1598 */ 1599 return (vioif_allowed_int_types); 1600 } 1601 1602 if ((id = smbios_info_system(ksmbios, &sys)) == SMB_ERR || 1603 smbios_info_common(ksmbios, id, &info) == SMB_ERR) { 1604 /* 1605 * The system may not have valid SMBIOS data, so ignore a 1606 * failure here. 1607 */ 1608 return (0); 1609 } 1610 1611 if (strcmp(info.smbi_manufacturer, "Google") == 0 && 1612 strcmp(info.smbi_product, "Google Compute Engine") == 0) { 1613 /* 1614 * An undiagnosed issue with the Google Compute Engine (GCE) 1615 * hypervisor exists. In this environment, no RX interrupts 1616 * are received if MSI-X handlers are installed. This does not 1617 * appear to be true for the Virtio SCSI driver. 
Fixed 1618 * interrupts do appear to work, so we fall back for now: 1619 */ 1620 return (DDI_INTR_TYPE_FIXED); 1621 } 1622 1623 return (0); 1624 } 1625 1626 static int 1627 vioif_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 1628 { 1629 int ret; 1630 vioif_t *vif; 1631 virtio_t *vio; 1632 mac_register_t *macp = NULL; 1633 1634 if (cmd != DDI_ATTACH) { 1635 return (DDI_FAILURE); 1636 } 1637 1638 if ((vio = virtio_init(dip, VIRTIO_NET_WANTED_FEATURES, B_TRUE)) == 1639 NULL) { 1640 return (DDI_FAILURE); 1641 } 1642 1643 vif = kmem_zalloc(sizeof (*vif), KM_SLEEP); 1644 vif->vif_dip = dip; 1645 vif->vif_virtio = vio; 1646 vif->vif_runstate = VIOIF_RUNSTATE_STOPPED; 1647 ddi_set_driver_private(dip, vif); 1648 1649 if ((vif->vif_rx_vq = virtio_queue_alloc(vio, VIRTIO_NET_VIRTQ_RX, 1650 "rx", vioif_rx_handler, vif, B_FALSE, VIOIF_MAX_SEGS)) == NULL || 1651 (vif->vif_tx_vq = virtio_queue_alloc(vio, VIRTIO_NET_VIRTQ_TX, 1652 "tx", vioif_tx_handler, vif, B_FALSE, VIOIF_MAX_SEGS)) == NULL) { 1653 goto fail; 1654 } 1655 1656 if (virtio_init_complete(vio, vioif_select_interrupt_types()) != 1657 DDI_SUCCESS) { 1658 dev_err(dip, CE_WARN, "failed to complete Virtio init"); 1659 goto fail; 1660 } 1661 1662 virtio_queue_no_interrupt(vif->vif_rx_vq, B_TRUE); 1663 virtio_queue_no_interrupt(vif->vif_tx_vq, B_TRUE); 1664 1665 mutex_init(&vif->vif_mutex, NULL, MUTEX_DRIVER, virtio_intr_pri(vio)); 1666 mutex_enter(&vif->vif_mutex); 1667 1668 vioif_get_mac(vif); 1669 1670 vif->vif_rxcopy_thresh = VIOIF_MACPROP_RXCOPY_THRESH_DEF; 1671 vif->vif_txcopy_thresh = VIOIF_MACPROP_TXCOPY_THRESH_DEF; 1672 1673 if (vioif_has_feature(vif, VIRTIO_NET_F_MTU)) { 1674 vif->vif_mtu_max = virtio_dev_get16(vio, VIRTIO_NET_CONFIG_MTU); 1675 } else { 1676 vif->vif_mtu_max = ETHERMTU; 1677 } 1678 1679 vif->vif_mtu = ETHERMTU; 1680 if (vif->vif_mtu > vif->vif_mtu_max) { 1681 vif->vif_mtu = vif->vif_mtu_max; 1682 } 1683 1684 vioif_check_features(vif); 1685 1686 if (vioif_alloc_bufs(vif) != 0) { 1687 mutex_exit(&vif->vif_mutex); 1688 dev_err(dip, CE_WARN, "failed to allocate memory"); 1689 goto fail; 1690 } 1691 1692 mutex_exit(&vif->vif_mutex); 1693 1694 if (virtio_interrupts_enable(vio) != DDI_SUCCESS) { 1695 dev_err(dip, CE_WARN, "failed to enable interrupts"); 1696 goto fail; 1697 } 1698 1699 if ((macp = mac_alloc(MAC_VERSION)) == NULL) { 1700 dev_err(dip, CE_WARN, "failed to allocate a mac_register"); 1701 goto fail; 1702 } 1703 1704 macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER; 1705 macp->m_driver = vif; 1706 macp->m_dip = dip; 1707 macp->m_src_addr = vif->vif_mac; 1708 macp->m_callbacks = &vioif_mac_callbacks; 1709 macp->m_min_sdu = 0; 1710 macp->m_max_sdu = vif->vif_mtu; 1711 macp->m_margin = VLAN_TAGSZ; 1712 macp->m_priv_props = vioif_priv_props; 1713 1714 if ((ret = mac_register(macp, &vif->vif_mac_handle)) != 0) { 1715 dev_err(dip, CE_WARN, "mac_register() failed (%d)", ret); 1716 goto fail; 1717 } 1718 mac_free(macp); 1719 1720 mac_link_update(vif->vif_mac_handle, LINK_STATE_UP); 1721 1722 return (DDI_SUCCESS); 1723 1724 fail: 1725 vioif_free_bufs(vif); 1726 if (macp != NULL) { 1727 mac_free(macp); 1728 } 1729 (void) virtio_fini(vio, B_TRUE); 1730 kmem_free(vif, sizeof (*vif)); 1731 return (DDI_FAILURE); 1732 } 1733 1734 static int 1735 vioif_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 1736 { 1737 int r; 1738 vioif_t *vif; 1739 1740 if (cmd != DDI_DETACH) { 1741 return (DDI_FAILURE); 1742 } 1743 1744 if ((vif = ddi_get_driver_private(dip)) == NULL) { 1745 return (DDI_FAILURE); 1746 } 1747 1748 mutex_enter(&vif->vif_mutex); 
1749 if (vif->vif_runstate != VIOIF_RUNSTATE_STOPPED) { 1750 dev_err(dip, CE_WARN, "!NIC still running, cannot detach"); 1751 mutex_exit(&vif->vif_mutex); 1752 return (DDI_FAILURE); 1753 } 1754 1755 /* 1756 * There should be no outstanding transmit buffers once the NIC is 1757 * completely stopped. 1758 */ 1759 VERIFY3U(vif->vif_ntxbufs_alloc, ==, 0); 1760 1761 /* 1762 * Though we cannot claw back all of the receive buffers until we reset 1763 * the device, we must ensure all those loaned to MAC have been 1764 * returned before calling mac_unregister(). 1765 */ 1766 if (vif->vif_nrxbufs_onloan > 0) { 1767 dev_err(dip, CE_WARN, "!%u receive buffers still loaned, " 1768 "cannot detach", vif->vif_nrxbufs_onloan); 1769 mutex_exit(&vif->vif_mutex); 1770 return (DDI_FAILURE); 1771 } 1772 1773 if ((r = mac_unregister(vif->vif_mac_handle)) != 0) { 1774 dev_err(dip, CE_WARN, "!MAC unregister failed (%d)", r); 1775 return (DDI_FAILURE); 1776 } 1777 1778 /* 1779 * Shut down the device so that we can recover any previously 1780 * submitted receive buffers. 1781 */ 1782 virtio_shutdown(vif->vif_virtio); 1783 for (;;) { 1784 virtio_chain_t *vic; 1785 1786 if ((vic = virtio_queue_evacuate(vif->vif_rx_vq)) == NULL) { 1787 break; 1788 } 1789 1790 vioif_rxbuf_t *rb = virtio_chain_data(vic); 1791 vioif_rxbuf_free(vif, rb); 1792 } 1793 1794 /* 1795 * vioif_free_bufs() must be called before virtio_fini() 1796 * as it uses virtio_chain_free() which itself depends on some 1797 * virtio data structures still being around. 1798 */ 1799 vioif_free_bufs(vif); 1800 (void) virtio_fini(vif->vif_virtio, B_FALSE); 1801 1802 mutex_exit(&vif->vif_mutex); 1803 mutex_destroy(&vif->vif_mutex); 1804 1805 kmem_free(vif, sizeof (*vif)); 1806 1807 return (DDI_SUCCESS); 1808 } 1809 1810 static int 1811 vioif_quiesce(dev_info_t *dip) 1812 { 1813 vioif_t *vif; 1814 1815 if ((vif = ddi_get_driver_private(dip)) == NULL) 1816 return (DDI_FAILURE); 1817 1818 return (virtio_quiesce(vif->vif_virtio)); 1819 } 1820 1821 int 1822 _init(void) 1823 { 1824 int ret; 1825 1826 mac_init_ops(&vioif_dev_ops, "vioif"); 1827 1828 if ((ret = mod_install(&vioif_modlinkage)) != DDI_SUCCESS) { 1829 mac_fini_ops(&vioif_dev_ops); 1830 } 1831 1832 return (ret); 1833 } 1834 1835 int 1836 _fini(void) 1837 { 1838 int ret; 1839 1840 if ((ret = mod_remove(&vioif_modlinkage)) == DDI_SUCCESS) { 1841 mac_fini_ops(&vioif_dev_ops); 1842 } 1843 1844 return (ret); 1845 } 1846 1847 int 1848 _info(struct modinfo *modinfop) 1849 { 1850 return (mod_info(&vioif_modlinkage, modinfop)); 1851 } 1852