1 /* 2 * This file and its contents are supplied under the terms of the 3 * Common Development and Distribution License ("CDDL"), version 1.0. 4 * You may only use this file in accordance with the terms of version 5 * 1.0 of the CDDL. 6 * 7 * A full copy of the text of the CDDL should have accompanied this 8 * source. A copy of the CDDL is also available via the Internet at 9 * http://www.illumos.org/license/CDDL. 10 */ 11 12 /* 13 * Copyright 2013 Nexenta Inc. All rights reserved. 14 * Copyright (c) 2014, 2016 by Delphix. All rights reserved. 15 * Copyright 2019 Joyent, Inc. 16 */ 17 18 /* Based on the NetBSD virtio driver by Minoura Makoto. */ 19 /* 20 * Copyright (c) 2010 Minoura Makoto. 21 * All rights reserved. 22 * 23 * Redistribution and use in source and binary forms, with or without 24 * modification, are permitted provided that the following conditions 25 * are met: 26 * 1. Redistributions of source code must retain the above copyright 27 * notice, this list of conditions and the following disclaimer. 28 * 2. Redistributions in binary form must reproduce the above copyright 29 * notice, this list of conditions and the following disclaimer in the 30 * documentation and/or other materials provided with the distribution. 31 * 32 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 33 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 34 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 35 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 36 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 37 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 38 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 39 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 40 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 41 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 42 */ 43 44 /* 45 * VIRTIO NETWORK DRIVER 46 */ 47 48 #include <sys/types.h> 49 #include <sys/errno.h> 50 #include <sys/param.h> 51 #include <sys/stropts.h> 52 #include <sys/stream.h> 53 #include <sys/strsubr.h> 54 #include <sys/kmem.h> 55 #include <sys/conf.h> 56 #include <sys/devops.h> 57 #include <sys/ksynch.h> 58 #include <sys/stat.h> 59 #include <sys/modctl.h> 60 #include <sys/debug.h> 61 #include <sys/pci.h> 62 #include <sys/ethernet.h> 63 #include <sys/vlan.h> 64 #include <sys/sysmacros.h> 65 66 #include <sys/dlpi.h> 67 #include <sys/taskq.h> 68 69 #include <sys/pattr.h> 70 #include <sys/strsun.h> 71 72 #include <sys/random.h> 73 #include <sys/containerof.h> 74 #include <sys/stream.h> 75 76 #include <sys/mac.h> 77 #include <sys/mac_provider.h> 78 #include <sys/mac_ether.h> 79 80 #include "virtio.h" 81 #include "vioif.h" 82 83 84 static int vioif_quiesce(dev_info_t *); 85 static int vioif_attach(dev_info_t *, ddi_attach_cmd_t); 86 static int vioif_detach(dev_info_t *, ddi_detach_cmd_t); 87 static boolean_t vioif_has_feature(vioif_t *, uint32_t); 88 static void vioif_reclaim_restart(vioif_t *); 89 static int vioif_m_stat(void *, uint_t, uint64_t *); 90 static void vioif_m_stop(void *); 91 static int vioif_m_start(void *); 92 static int vioif_m_multicst(void *, boolean_t, const uint8_t *); 93 static int vioif_m_setpromisc(void *, boolean_t); 94 static int vioif_m_unicst(void *, const uint8_t *); 95 static mblk_t *vioif_m_tx(void *, mblk_t *); 96 static int vioif_m_setprop(void *, const char *, mac_prop_id_t, uint_t, 97 const void *); 98 static int vioif_m_getprop(void *, const char *, mac_prop_id_t, uint_t, void *); 99 static void vioif_m_propinfo(void *, const char *, mac_prop_id_t, 100 mac_prop_info_handle_t); 101 static boolean_t vioif_m_getcapab(void *, mac_capab_t, void *); 102 static uint_t vioif_add_rx(vioif_t *); 103 104 105 static struct cb_ops vioif_cb_ops = { 106 .cb_rev = CB_REV, 107 .cb_flag = D_MP | D_NEW, 108 109 .cb_open = nulldev, 110 .cb_close = nulldev, 111 .cb_strategy = nodev, 112 .cb_print = nodev, 113 .cb_dump = nodev, 114 .cb_read = nodev, 115 .cb_write = nodev, 116 .cb_ioctl = nodev, 117 .cb_devmap = nodev, 118 .cb_mmap = nodev, 119 .cb_segmap = nodev, 120 .cb_chpoll = nochpoll, 121 .cb_prop_op = ddi_prop_op, 122 .cb_str = NULL, 123 .cb_aread = nodev, 124 .cb_awrite = nodev, 125 }; 126 127 static struct dev_ops vioif_dev_ops = { 128 .devo_rev = DEVO_REV, 129 .devo_refcnt = 0, 130 131 .devo_attach = vioif_attach, 132 .devo_detach = vioif_detach, 133 .devo_quiesce = vioif_quiesce, 134 135 .devo_cb_ops = &vioif_cb_ops, 136 137 .devo_getinfo = NULL, 138 .devo_identify = nulldev, 139 .devo_probe = nulldev, 140 .devo_reset = nodev, 141 .devo_bus_ops = NULL, 142 .devo_power = NULL, 143 }; 144 145 static struct modldrv vioif_modldrv = { 146 .drv_modops = &mod_driverops, 147 .drv_linkinfo = "VIRTIO network driver", 148 .drv_dev_ops = &vioif_dev_ops 149 }; 150 151 static struct modlinkage vioif_modlinkage = { 152 .ml_rev = MODREV_1, 153 .ml_linkage = { &vioif_modldrv, NULL } 154 }; 155 156 static mac_callbacks_t vioif_mac_callbacks = { 157 .mc_getstat = vioif_m_stat, 158 .mc_start = vioif_m_start, 159 .mc_stop = vioif_m_stop, 160 .mc_setpromisc = vioif_m_setpromisc, 161 .mc_multicst = vioif_m_multicst, 162 .mc_unicst = vioif_m_unicst, 163 .mc_tx = vioif_m_tx, 164 165 .mc_callbacks = (MC_GETCAPAB | MC_SETPROP | 166 MC_GETPROP | MC_PROPINFO), 167 .mc_getcapab = vioif_m_getcapab, 168 .mc_setprop = vioif_m_setprop, 169 .mc_getprop = vioif_m_getprop, 170 .mc_propinfo = vioif_m_propinfo, 171 }; 172 173 static const uchar_t vioif_broadcast[ETHERADDRL] = { 174 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF 175 }; 176 177 /* 178 * Interval for the periodic TX reclaim. 179 */ 180 uint_t vioif_reclaim_ms = 200; 181 182 /* 183 * DMA attribute template for transmit and receive buffers. The SGL entry 184 * count will be modified before using the template. Note that these 185 * allocations are aligned so that VIOIF_HEADER_SKIP places the IP header in 186 * received frames at the correct offset for the networking stack. 187 */ 188 ddi_dma_attr_t vioif_dma_attr_bufs = { 189 .dma_attr_version = DMA_ATTR_V0, 190 .dma_attr_addr_lo = 0x0000000000000000, 191 .dma_attr_addr_hi = 0xFFFFFFFFFFFFFFFF, 192 .dma_attr_count_max = 0x00000000FFFFFFFF, 193 .dma_attr_align = VIOIF_HEADER_ALIGN, 194 .dma_attr_burstsizes = 1, 195 .dma_attr_minxfer = 1, 196 .dma_attr_maxxfer = 0x00000000FFFFFFFF, 197 .dma_attr_seg = 0x00000000FFFFFFFF, 198 .dma_attr_sgllen = 0, 199 .dma_attr_granular = 1, 200 .dma_attr_flags = 0 201 }; 202 203 /* 204 * DMA attributes for mapping larger transmit buffers from the networking 205 * stack. The requirements are quite loose, but note that the SGL entry length 206 * field is 32-bit. 207 */ 208 ddi_dma_attr_t vioif_dma_attr_external = { 209 .dma_attr_version = DMA_ATTR_V0, 210 .dma_attr_addr_lo = 0x0000000000000000, 211 .dma_attr_addr_hi = 0xFFFFFFFFFFFFFFFF, 212 .dma_attr_count_max = 0x00000000FFFFFFFF, 213 .dma_attr_align = 1, 214 .dma_attr_burstsizes = 1, 215 .dma_attr_minxfer = 1, 216 .dma_attr_maxxfer = 0x00000000FFFFFFFF, 217 .dma_attr_seg = 0x00000000FFFFFFFF, 218 .dma_attr_sgllen = VIOIF_MAX_SEGS - 1, 219 .dma_attr_granular = 1, 220 .dma_attr_flags = 0 221 }; 222 223 224 /* 225 * VIRTIO NET MAC PROPERTIES 226 */ 227 #define VIOIF_MACPROP_TXCOPY_THRESH "_txcopy_thresh" 228 #define VIOIF_MACPROP_TXCOPY_THRESH_DEF 300 229 #define VIOIF_MACPROP_TXCOPY_THRESH_MAX 640 230 231 #define VIOIF_MACPROP_RXCOPY_THRESH "_rxcopy_thresh" 232 #define VIOIF_MACPROP_RXCOPY_THRESH_DEF 300 233 #define VIOIF_MACPROP_RXCOPY_THRESH_MAX 640 234 235 static char *vioif_priv_props[] = { 236 VIOIF_MACPROP_TXCOPY_THRESH, 237 VIOIF_MACPROP_RXCOPY_THRESH, 238 NULL 239 }; 240 241 242 static vioif_txbuf_t * 243 vioif_txbuf_alloc(vioif_t *vif) 244 { 245 vioif_txbuf_t *tb; 246 247 VERIFY(MUTEX_HELD(&vif->vif_mutex)); 248 249 if ((tb = list_remove_head(&vif->vif_txbufs)) != NULL) { 250 vif->vif_ntxbufs_alloc++; 251 } 252 253 return (tb); 254 } 255 256 static void 257 vioif_txbuf_free(vioif_t *vif, vioif_txbuf_t *tb) 258 { 259 VERIFY(MUTEX_HELD(&vif->vif_mutex)); 260 261 VERIFY3U(vif->vif_ntxbufs_alloc, >, 0); 262 vif->vif_ntxbufs_alloc--; 263 264 virtio_chain_clear(tb->tb_chain); 265 list_insert_head(&vif->vif_txbufs, tb); 266 } 267 268 static vioif_rxbuf_t * 269 vioif_rxbuf_alloc(vioif_t *vif) 270 { 271 vioif_rxbuf_t *rb; 272 273 VERIFY(MUTEX_HELD(&vif->vif_mutex)); 274 275 if ((rb = list_remove_head(&vif->vif_rxbufs)) != NULL) { 276 vif->vif_nrxbufs_alloc++; 277 } 278 279 return (rb); 280 } 281 282 static void 283 vioif_rxbuf_free(vioif_t *vif, vioif_rxbuf_t *rb) 284 { 285 VERIFY(MUTEX_HELD(&vif->vif_mutex)); 286 287 VERIFY3U(vif->vif_nrxbufs_alloc, >, 0); 288 vif->vif_nrxbufs_alloc--; 289 290 virtio_chain_clear(rb->rb_chain); 291 list_insert_head(&vif->vif_rxbufs, rb); 292 } 293 294 static void 295 vioif_rx_free_callback(caddr_t free_arg) 296 { 297 vioif_rxbuf_t *rb = (vioif_rxbuf_t *)free_arg; 298 vioif_t *vif = rb->rb_vioif; 299 300 mutex_enter(&vif->vif_mutex); 301 302 /* 303 * Return this receive buffer to the free list. 304 */ 305 vioif_rxbuf_free(vif, rb); 306 307 VERIFY3U(vif->vif_nrxbufs_onloan, >, 0); 308 vif->vif_nrxbufs_onloan--; 309 310 /* 311 * Attempt to replenish the receive queue with at least the buffer we 312 * just freed. There isn't a great way to deal with failure here, 313 * though because we'll only loan at most half of the buffers there 314 * should always be at least some available even if this fails. 315 */ 316 (void) vioif_add_rx(vif); 317 318 mutex_exit(&vif->vif_mutex); 319 } 320 321 static void 322 vioif_free_bufs(vioif_t *vif) 323 { 324 VERIFY(MUTEX_HELD(&vif->vif_mutex)); 325 326 VERIFY3U(vif->vif_ntxbufs_alloc, ==, 0); 327 for (uint_t i = 0; i < vif->vif_txbufs_capacity; i++) { 328 vioif_txbuf_t *tb = &vif->vif_txbufs_mem[i]; 329 330 /* 331 * Ensure that this txbuf is now in the free list: 332 */ 333 VERIFY(list_link_active(&tb->tb_link)); 334 list_remove(&vif->vif_txbufs, tb); 335 336 /* 337 * We should not have an mblk chain at this point. 338 */ 339 VERIFY3P(tb->tb_mp, ==, NULL); 340 341 if (tb->tb_dma != NULL) { 342 virtio_dma_free(tb->tb_dma); 343 tb->tb_dma = NULL; 344 } 345 346 if (tb->tb_chain != NULL) { 347 virtio_chain_free(tb->tb_chain); 348 tb->tb_chain = NULL; 349 } 350 351 if (tb->tb_dmaext != NULL) { 352 for (uint_t j = 0; j < tb->tb_dmaext_capacity; j++) { 353 if (tb->tb_dmaext[j] != NULL) { 354 virtio_dma_free( 355 tb->tb_dmaext[j]); 356 tb->tb_dmaext[j] = NULL; 357 } 358 } 359 360 kmem_free(tb->tb_dmaext, 361 sizeof (virtio_dma_t *) * tb->tb_dmaext_capacity); 362 tb->tb_dmaext = NULL; 363 tb->tb_dmaext_capacity = 0; 364 } 365 } 366 VERIFY(list_is_empty(&vif->vif_txbufs)); 367 if (vif->vif_txbufs_mem != NULL) { 368 kmem_free(vif->vif_txbufs_mem, 369 sizeof (vioif_txbuf_t) * vif->vif_txbufs_capacity); 370 vif->vif_txbufs_mem = NULL; 371 vif->vif_txbufs_capacity = 0; 372 } 373 374 VERIFY3U(vif->vif_nrxbufs_alloc, ==, 0); 375 for (uint_t i = 0; i < vif->vif_rxbufs_capacity; i++) { 376 vioif_rxbuf_t *rb = &vif->vif_rxbufs_mem[i]; 377 378 /* 379 * Ensure that this rxbuf is now in the free list: 380 */ 381 VERIFY(list_link_active(&rb->rb_link)); 382 list_remove(&vif->vif_rxbufs, rb); 383 384 if (rb->rb_dma != NULL) { 385 virtio_dma_free(rb->rb_dma); 386 rb->rb_dma = NULL; 387 } 388 389 if (rb->rb_chain != NULL) { 390 virtio_chain_free(rb->rb_chain); 391 rb->rb_chain = NULL; 392 } 393 } 394 VERIFY(list_is_empty(&vif->vif_rxbufs)); 395 if (vif->vif_rxbufs_mem != NULL) { 396 kmem_free(vif->vif_rxbufs_mem, 397 sizeof (vioif_rxbuf_t) * vif->vif_rxbufs_capacity); 398 vif->vif_rxbufs_mem = NULL; 399 vif->vif_rxbufs_capacity = 0; 400 } 401 } 402 403 static int 404 vioif_alloc_bufs(vioif_t *vif) 405 { 406 VERIFY(MUTEX_HELD(&vif->vif_mutex)); 407 408 /* 409 * Allocate one contiguous chunk of memory for the transmit and receive 410 * buffer tracking objects. If the ring is unusually small, we'll 411 * reduce our target buffer count accordingly. 412 */ 413 vif->vif_txbufs_capacity = MIN(VIRTIO_NET_TX_BUFS, 414 virtio_queue_size(vif->vif_tx_vq)); 415 vif->vif_txbufs_mem = kmem_zalloc( 416 sizeof (vioif_txbuf_t) * vif->vif_txbufs_capacity, KM_SLEEP); 417 list_create(&vif->vif_txbufs, sizeof (vioif_txbuf_t), 418 offsetof(vioif_txbuf_t, tb_link)); 419 420 vif->vif_rxbufs_capacity = MIN(VIRTIO_NET_RX_BUFS, 421 virtio_queue_size(vif->vif_rx_vq)); 422 vif->vif_rxbufs_mem = kmem_zalloc( 423 sizeof (vioif_rxbuf_t) * vif->vif_rxbufs_capacity, KM_SLEEP); 424 list_create(&vif->vif_rxbufs, sizeof (vioif_rxbuf_t), 425 offsetof(vioif_rxbuf_t, rb_link)); 426 427 /* 428 * Do not loan more than half of our allocated receive buffers into 429 * the networking stack. 430 */ 431 vif->vif_nrxbufs_onloan_max = vif->vif_rxbufs_capacity / 2; 432 433 /* 434 * Put everything in the free list straight away in order to simplify 435 * the use of vioif_free_bufs() for cleanup on allocation failure. 436 */ 437 for (uint_t i = 0; i < vif->vif_txbufs_capacity; i++) { 438 list_insert_tail(&vif->vif_txbufs, &vif->vif_txbufs_mem[i]); 439 } 440 for (uint_t i = 0; i < vif->vif_rxbufs_capacity; i++) { 441 list_insert_tail(&vif->vif_rxbufs, &vif->vif_rxbufs_mem[i]); 442 } 443 444 /* 445 * Start from the DMA attribute template common to both transmit and 446 * receive buffers. The SGL entry count will be modified for each 447 * buffer type. 448 */ 449 ddi_dma_attr_t attr = vioif_dma_attr_bufs; 450 451 /* 452 * The transmit inline buffer is small (less than a page), so it's 453 * reasonable to request a single cookie. 454 */ 455 attr.dma_attr_sgllen = 1; 456 457 for (vioif_txbuf_t *tb = list_head(&vif->vif_txbufs); tb != NULL; 458 tb = list_next(&vif->vif_txbufs, tb)) { 459 if ((tb->tb_dma = virtio_dma_alloc(vif->vif_virtio, 460 VIOIF_TX_INLINE_SIZE, &attr, 461 DDI_DMA_STREAMING | DDI_DMA_WRITE, KM_SLEEP)) == NULL) { 462 goto fail; 463 } 464 VERIFY3U(virtio_dma_ncookies(tb->tb_dma), ==, 1); 465 466 if ((tb->tb_chain = virtio_chain_alloc(vif->vif_tx_vq, 467 KM_SLEEP)) == NULL) { 468 goto fail; 469 } 470 virtio_chain_data_set(tb->tb_chain, tb); 471 472 tb->tb_dmaext_capacity = VIOIF_MAX_SEGS - 1; 473 tb->tb_dmaext = kmem_zalloc( 474 sizeof (virtio_dma_t *) * tb->tb_dmaext_capacity, 475 KM_SLEEP); 476 } 477 478 /* 479 * The receive buffers are larger, and we can tolerate a large number 480 * of segments. Adjust the SGL entry count, setting aside one segment 481 * for the virtio net header. 482 */ 483 attr.dma_attr_sgllen = VIOIF_MAX_SEGS - 1; 484 485 for (vioif_rxbuf_t *rb = list_head(&vif->vif_rxbufs); rb != NULL; 486 rb = list_next(&vif->vif_rxbufs, rb)) { 487 if ((rb->rb_dma = virtio_dma_alloc(vif->vif_virtio, 488 VIOIF_RX_BUF_SIZE, &attr, DDI_DMA_STREAMING | DDI_DMA_READ, 489 KM_SLEEP)) == NULL) { 490 goto fail; 491 } 492 493 if ((rb->rb_chain = virtio_chain_alloc(vif->vif_rx_vq, 494 KM_SLEEP)) == NULL) { 495 goto fail; 496 } 497 virtio_chain_data_set(rb->rb_chain, rb); 498 499 /* 500 * Ensure that the first cookie is sufficient to cover the 501 * header skip region plus one byte. 502 */ 503 VERIFY3U(virtio_dma_cookie_size(rb->rb_dma, 0), >=, 504 VIOIF_HEADER_SKIP + 1); 505 506 /* 507 * Ensure that the frame data begins at a location with a 508 * correctly aligned IP header. 509 */ 510 VERIFY3U((uintptr_t)virtio_dma_va(rb->rb_dma, 511 VIOIF_HEADER_SKIP) % 4, ==, 2); 512 513 rb->rb_vioif = vif; 514 rb->rb_frtn.free_func = vioif_rx_free_callback; 515 rb->rb_frtn.free_arg = (caddr_t)rb; 516 } 517 518 return (0); 519 520 fail: 521 vioif_free_bufs(vif); 522 return (ENOMEM); 523 } 524 525 static int 526 vioif_m_multicst(void *arg, boolean_t add, const uint8_t *mcst_addr) 527 { 528 /* 529 * Even though we currently do not have support for programming 530 * multicast filters, or even enabling promiscuous mode, we return 531 * success here to avoid the networking stack falling back to link 532 * layer broadcast for multicast traffic. Some hypervisors already 533 * pass received multicast frames onto the guest, so at least on those 534 * systems multicast will work as expected anyway. 535 */ 536 return (0); 537 } 538 539 static int 540 vioif_m_setpromisc(void *arg, boolean_t on) 541 { 542 /* 543 * Even though we cannot currently enable promiscuous mode, we return 544 * success here to allow tools like snoop(1M) to continue to function. 545 */ 546 return (0); 547 } 548 549 static int 550 vioif_m_unicst(void *arg, const uint8_t *mac) 551 { 552 return (ENOTSUP); 553 } 554 555 static uint_t 556 vioif_add_rx(vioif_t *vif) 557 { 558 VERIFY(MUTEX_HELD(&vif->vif_mutex)); 559 560 if (vif->vif_runstate != VIOIF_RUNSTATE_RUNNING) { 561 /* 562 * If the NIC is not running, do not give the device any 563 * receive buffers. 564 */ 565 return (0); 566 } 567 568 uint_t num_added = 0; 569 570 vioif_rxbuf_t *rb; 571 while ((rb = vioif_rxbuf_alloc(vif)) != NULL) { 572 /* 573 * For legacy devices, and those that have not negotiated 574 * VIRTIO_F_ANY_LAYOUT, the virtio net header must appear in a 575 * separate descriptor entry to the rest of the buffer. 576 */ 577 if (virtio_chain_append(rb->rb_chain, 578 virtio_dma_cookie_pa(rb->rb_dma, 0), 579 sizeof (struct virtio_net_hdr), 580 VIRTIO_DIR_DEVICE_WRITES) != DDI_SUCCESS) { 581 goto fail; 582 } 583 584 for (uint_t n = 0; n < virtio_dma_ncookies(rb->rb_dma); n++) { 585 uint64_t pa = virtio_dma_cookie_pa(rb->rb_dma, n); 586 size_t sz = virtio_dma_cookie_size(rb->rb_dma, n); 587 588 if (n == 0) { 589 pa += VIOIF_HEADER_SKIP; 590 VERIFY3U(sz, >, VIOIF_HEADER_SKIP); 591 sz -= VIOIF_HEADER_SKIP; 592 } 593 594 if (virtio_chain_append(rb->rb_chain, pa, sz, 595 VIRTIO_DIR_DEVICE_WRITES) != DDI_SUCCESS) { 596 goto fail; 597 } 598 } 599 600 virtio_chain_submit(rb->rb_chain, B_FALSE); 601 num_added++; 602 continue; 603 604 fail: 605 vioif_rxbuf_free(vif, rb); 606 vif->vif_norecvbuf++; 607 break; 608 } 609 610 if (num_added > 0) { 611 virtio_queue_flush(vif->vif_rx_vq); 612 } 613 614 return (num_added); 615 } 616 617 static uint_t 618 vioif_process_rx(vioif_t *vif) 619 { 620 virtio_chain_t *vic; 621 mblk_t *mphead = NULL, *lastmp = NULL, *mp; 622 uint_t num_processed = 0; 623 624 VERIFY(MUTEX_HELD(&vif->vif_mutex)); 625 626 while ((vic = virtio_queue_poll(vif->vif_rx_vq)) != NULL) { 627 /* 628 * We have to use the chain received length here, as the device 629 * does not tell us the received frame length any other way. 630 * In a limited survey of hypervisors, virtio network devices 631 * appear to provide the right value here. 632 */ 633 size_t len = virtio_chain_received_length(vic); 634 vioif_rxbuf_t *rb = virtio_chain_data(vic); 635 636 virtio_dma_sync(rb->rb_dma, DDI_DMA_SYNC_FORCPU); 637 638 /* 639 * If the NIC is not running, discard any received frames. 640 */ 641 if (vif->vif_runstate != VIOIF_RUNSTATE_RUNNING) { 642 vioif_rxbuf_free(vif, rb); 643 continue; 644 } 645 646 if (len < sizeof (struct virtio_net_hdr)) { 647 vif->vif_rxfail_chain_undersize++; 648 vif->vif_ierrors++; 649 vioif_rxbuf_free(vif, rb); 650 continue; 651 } 652 len -= sizeof (struct virtio_net_hdr); 653 654 /* 655 * We copy small packets that happen to fit into a single 656 * cookie and reuse the buffers. For bigger ones, we loan 657 * the buffers upstream. 658 */ 659 if (len < vif->vif_rxcopy_thresh || 660 vif->vif_nrxbufs_onloan >= vif->vif_nrxbufs_onloan_max) { 661 mutex_exit(&vif->vif_mutex); 662 if ((mp = allocb(len, 0)) == NULL) { 663 mutex_enter(&vif->vif_mutex); 664 vif->vif_norecvbuf++; 665 vif->vif_ierrors++; 666 667 vioif_rxbuf_free(vif, rb); 668 continue; 669 } 670 671 bcopy(virtio_dma_va(rb->rb_dma, VIOIF_HEADER_SKIP), 672 mp->b_rptr, len); 673 mp->b_wptr = mp->b_rptr + len; 674 675 /* 676 * As the packet contents was copied rather than 677 * loaned, we can return the receive buffer resources 678 * to the free list. 679 */ 680 mutex_enter(&vif->vif_mutex); 681 vioif_rxbuf_free(vif, rb); 682 683 } else { 684 mutex_exit(&vif->vif_mutex); 685 if ((mp = desballoc(virtio_dma_va(rb->rb_dma, 686 VIOIF_HEADER_SKIP), len, 0, 687 &rb->rb_frtn)) == NULL) { 688 mutex_enter(&vif->vif_mutex); 689 vif->vif_norecvbuf++; 690 vif->vif_ierrors++; 691 692 vioif_rxbuf_free(vif, rb); 693 continue; 694 } 695 mp->b_wptr = mp->b_rptr + len; 696 697 mutex_enter(&vif->vif_mutex); 698 vif->vif_nrxbufs_onloan++; 699 } 700 701 /* 702 * virtio-net does not tell us if this packet is multicast 703 * or broadcast, so we have to check it. 704 */ 705 if (mp->b_rptr[0] & 0x1) { 706 if (bcmp(mp->b_rptr, vioif_broadcast, ETHERADDRL) != 0) 707 vif->vif_multircv++; 708 else 709 vif->vif_brdcstrcv++; 710 } 711 712 vif->vif_rbytes += len; 713 vif->vif_ipackets++; 714 715 if (lastmp == NULL) { 716 mphead = mp; 717 } else { 718 lastmp->b_next = mp; 719 } 720 lastmp = mp; 721 num_processed++; 722 } 723 724 if (mphead != NULL) { 725 if (vif->vif_runstate == VIOIF_RUNSTATE_RUNNING) { 726 mutex_exit(&vif->vif_mutex); 727 mac_rx(vif->vif_mac_handle, NULL, mphead); 728 mutex_enter(&vif->vif_mutex); 729 } else { 730 /* 731 * The NIC was disabled part way through our execution, 732 * so free the messages we allocated. 733 */ 734 freemsgchain(mphead); 735 } 736 } 737 738 return (num_processed); 739 } 740 741 static uint_t 742 vioif_reclaim_used_tx(vioif_t *vif) 743 { 744 virtio_chain_t *vic; 745 uint_t num_reclaimed = 0; 746 747 VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex)); 748 749 while ((vic = virtio_queue_poll(vif->vif_tx_vq)) != NULL) { 750 vioif_txbuf_t *tb = virtio_chain_data(vic); 751 752 if (tb->tb_mp != NULL) { 753 /* 754 * Unbind the external mapping. 755 */ 756 for (uint_t i = 0; i < tb->tb_dmaext_capacity; i++) { 757 if (tb->tb_dmaext[i] == NULL) { 758 continue; 759 } 760 761 virtio_dma_unbind(tb->tb_dmaext[i]); 762 } 763 764 freemsg(tb->tb_mp); 765 tb->tb_mp = NULL; 766 } 767 768 /* 769 * Return this transmit buffer to the free list for reuse. 770 */ 771 mutex_enter(&vif->vif_mutex); 772 vioif_txbuf_free(vif, tb); 773 mutex_exit(&vif->vif_mutex); 774 775 num_reclaimed++; 776 } 777 778 /* Return ring to transmitting state if descriptors were reclaimed. */ 779 if (num_reclaimed > 0) { 780 boolean_t do_update = B_FALSE; 781 782 mutex_enter(&vif->vif_mutex); 783 vif->vif_stat_tx_reclaim += num_reclaimed; 784 if (vif->vif_tx_corked) { 785 /* 786 * TX was corked on a lack of available descriptors. 787 * That dire state has passed so the TX interrupt can 788 * be disabled and MAC can be notified that 789 * transmission is possible again. 790 */ 791 vif->vif_tx_corked = B_FALSE; 792 virtio_queue_no_interrupt(vif->vif_tx_vq, B_TRUE); 793 do_update = B_TRUE; 794 } 795 796 if (do_update) { 797 mac_tx_update(vif->vif_mac_handle); 798 } 799 mutex_exit(&vif->vif_mutex); 800 } 801 802 return (num_reclaimed); 803 } 804 805 static void 806 vioif_reclaim_periodic(void *arg) 807 { 808 vioif_t *vif = arg; 809 uint_t num_reclaimed; 810 811 num_reclaimed = vioif_reclaim_used_tx(vif); 812 813 mutex_enter(&vif->vif_mutex); 814 vif->vif_tx_reclaim_tid = 0; 815 /* 816 * If used descriptors were reclaimed or TX descriptors appear to be 817 * outstanding, the ring is considered active and periodic reclamation 818 * is necessary for now. 819 */ 820 if (num_reclaimed != 0 || virtio_queue_nactive(vif->vif_tx_vq) != 0) { 821 /* Do not reschedule if the ring is being drained. */ 822 if (!vif->vif_tx_drain) { 823 vioif_reclaim_restart(vif); 824 } 825 } 826 mutex_exit(&vif->vif_mutex); 827 } 828 829 static void 830 vioif_reclaim_restart(vioif_t *vif) 831 { 832 VERIFY(MUTEX_HELD(&vif->vif_mutex)); 833 VERIFY(!vif->vif_tx_drain); 834 835 if (vif->vif_tx_reclaim_tid == 0) { 836 vif->vif_tx_reclaim_tid = timeout(vioif_reclaim_periodic, vif, 837 MSEC_TO_TICK_ROUNDUP(vioif_reclaim_ms)); 838 } 839 } 840 841 static void 842 vioif_tx_drain(vioif_t *vif) 843 { 844 VERIFY(MUTEX_HELD(&vif->vif_mutex)); 845 VERIFY3S(vif->vif_runstate, ==, VIOIF_RUNSTATE_STOPPING); 846 847 vif->vif_tx_drain = B_TRUE; 848 /* Put a stop to the periodic reclaim if it is running */ 849 if (vif->vif_tx_reclaim_tid != 0) { 850 timeout_id_t tid = vif->vif_tx_reclaim_tid; 851 852 /* 853 * With vif_tx_drain set, there is no risk that a racing 854 * vioif_reclaim_periodic() call will reschedule itself. 855 * 856 * Being part of the mc_stop hook also guarantees that 857 * vioif_m_tx() will not be called to restart it. 858 */ 859 vif->vif_tx_reclaim_tid = 0; 860 mutex_exit(&vif->vif_mutex); 861 (void) untimeout(tid); 862 mutex_enter(&vif->vif_mutex); 863 } 864 virtio_queue_no_interrupt(vif->vif_tx_vq, B_TRUE); 865 866 /* 867 * Wait for all of the TX descriptors to be processed by the host so 868 * they can be reclaimed. 869 */ 870 while (vif->vif_ntxbufs_alloc > 0) { 871 mutex_exit(&vif->vif_mutex); 872 (void) vioif_reclaim_used_tx(vif); 873 delay(5); 874 mutex_enter(&vif->vif_mutex); 875 } 876 VERIFY(!vif->vif_tx_corked); 877 VERIFY3U(vif->vif_tx_reclaim_tid, ==, 0); 878 VERIFY3U(virtio_queue_nactive(vif->vif_tx_vq), ==, 0); 879 } 880 881 static int 882 vioif_tx_inline(vioif_t *vif, vioif_txbuf_t *tb, mblk_t *mp, size_t msg_size) 883 { 884 VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex)); 885 886 VERIFY3U(msg_size, <=, virtio_dma_size(tb->tb_dma) - VIOIF_HEADER_SKIP); 887 888 /* 889 * Copy the message into the inline buffer and then free the message. 890 */ 891 mcopymsg(mp, virtio_dma_va(tb->tb_dma, VIOIF_HEADER_SKIP)); 892 893 if (virtio_chain_append(tb->tb_chain, 894 virtio_dma_cookie_pa(tb->tb_dma, 0) + VIOIF_HEADER_SKIP, 895 msg_size, VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) { 896 return (DDI_FAILURE); 897 } 898 899 return (DDI_SUCCESS); 900 } 901 902 static int 903 vioif_tx_external(vioif_t *vif, vioif_txbuf_t *tb, mblk_t *mp, size_t msg_size) 904 { 905 VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex)); 906 907 mblk_t *nmp = mp; 908 tb->tb_ndmaext = 0; 909 910 while (nmp != NULL) { 911 size_t len; 912 913 if ((len = MBLKL(nmp)) == 0) { 914 /* 915 * Skip any zero-length entries in the chain. 916 */ 917 nmp = nmp->b_cont; 918 continue; 919 } 920 921 if (tb->tb_ndmaext >= tb->tb_dmaext_capacity) { 922 mutex_enter(&vif->vif_mutex); 923 vif->vif_txfail_indirect_limit++; 924 vif->vif_notxbuf++; 925 mutex_exit(&vif->vif_mutex); 926 goto fail; 927 } 928 929 if (tb->tb_dmaext[tb->tb_ndmaext] == NULL) { 930 /* 931 * Allocate a DMA handle for this slot. 932 */ 933 if ((tb->tb_dmaext[tb->tb_ndmaext] = 934 virtio_dma_alloc_nomem(vif->vif_virtio, 935 &vioif_dma_attr_external, KM_SLEEP)) == NULL) { 936 mutex_enter(&vif->vif_mutex); 937 vif->vif_notxbuf++; 938 mutex_exit(&vif->vif_mutex); 939 goto fail; 940 } 941 } 942 virtio_dma_t *extdma = tb->tb_dmaext[tb->tb_ndmaext++]; 943 944 if (virtio_dma_bind(extdma, nmp->b_rptr, len, 945 DDI_DMA_WRITE | DDI_DMA_STREAMING, KM_SLEEP) != 946 DDI_SUCCESS) { 947 mutex_enter(&vif->vif_mutex); 948 vif->vif_txfail_dma_bind++; 949 mutex_exit(&vif->vif_mutex); 950 goto fail; 951 } 952 953 for (uint_t n = 0; n < virtio_dma_ncookies(extdma); n++) { 954 uint64_t pa = virtio_dma_cookie_pa(extdma, n); 955 size_t sz = virtio_dma_cookie_size(extdma, n); 956 957 if (virtio_chain_append(tb->tb_chain, pa, sz, 958 VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) { 959 mutex_enter(&vif->vif_mutex); 960 vif->vif_txfail_indirect_limit++; 961 vif->vif_notxbuf++; 962 mutex_exit(&vif->vif_mutex); 963 goto fail; 964 } 965 } 966 967 nmp = nmp->b_cont; 968 } 969 970 /* 971 * We need to keep the message around until we reclaim the buffer from 972 * the device before freeing it. 973 */ 974 tb->tb_mp = mp; 975 976 return (DDI_SUCCESS); 977 978 fail: 979 for (uint_t n = 0; n < tb->tb_ndmaext; n++) { 980 if (tb->tb_dmaext[n] != NULL) { 981 virtio_dma_unbind(tb->tb_dmaext[n]); 982 } 983 } 984 tb->tb_ndmaext = 0; 985 986 freemsg(mp); 987 988 return (DDI_FAILURE); 989 } 990 991 static boolean_t 992 vioif_send(vioif_t *vif, mblk_t *mp) 993 { 994 VERIFY(MUTEX_NOT_HELD(&vif->vif_mutex)); 995 996 vioif_txbuf_t *tb = NULL; 997 struct virtio_net_hdr *vnh = NULL; 998 size_t msg_size = 0; 999 uint32_t csum_start; 1000 uint32_t csum_stuff; 1001 uint32_t csum_flags; 1002 uint32_t lso_flags; 1003 uint32_t lso_mss; 1004 mblk_t *nmp; 1005 int ret; 1006 boolean_t lso_required = B_FALSE; 1007 struct ether_header *ether = (void *)mp->b_rptr; 1008 1009 for (nmp = mp; nmp; nmp = nmp->b_cont) 1010 msg_size += MBLKL(nmp); 1011 1012 if (vif->vif_tx_tso4) { 1013 mac_lso_get(mp, &lso_mss, &lso_flags); 1014 lso_required = (lso_flags & HW_LSO) != 0; 1015 } 1016 1017 mutex_enter(&vif->vif_mutex); 1018 if ((tb = vioif_txbuf_alloc(vif)) == NULL) { 1019 vif->vif_notxbuf++; 1020 goto fail; 1021 } 1022 mutex_exit(&vif->vif_mutex); 1023 1024 /* 1025 * Use the inline buffer for the virtio net header. Zero the portion 1026 * of our DMA allocation prior to the packet data. 1027 */ 1028 vnh = virtio_dma_va(tb->tb_dma, 0); 1029 bzero(vnh, VIOIF_HEADER_SKIP); 1030 1031 /* 1032 * For legacy devices, and those that have not negotiated 1033 * VIRTIO_F_ANY_LAYOUT, the virtio net header must appear in a separate 1034 * descriptor entry to the rest of the buffer. 1035 */ 1036 if (virtio_chain_append(tb->tb_chain, 1037 virtio_dma_cookie_pa(tb->tb_dma, 0), sizeof (struct virtio_net_hdr), 1038 VIRTIO_DIR_DEVICE_READS) != DDI_SUCCESS) { 1039 mutex_enter(&vif->vif_mutex); 1040 vif->vif_notxbuf++; 1041 goto fail; 1042 } 1043 1044 mac_hcksum_get(mp, &csum_start, &csum_stuff, NULL, NULL, &csum_flags); 1045 1046 /* 1047 * They want us to do the TCP/UDP csum calculation. 1048 */ 1049 if (csum_flags & HCK_PARTIALCKSUM) { 1050 int eth_hsize; 1051 1052 /* 1053 * Did we ask for it? 1054 */ 1055 ASSERT(vif->vif_tx_csum); 1056 1057 /* 1058 * We only asked for partial csum packets. 1059 */ 1060 ASSERT(!(csum_flags & HCK_IPV4_HDRCKSUM)); 1061 ASSERT(!(csum_flags & HCK_FULLCKSUM)); 1062 1063 if (ether->ether_type == htons(ETHERTYPE_VLAN)) { 1064 eth_hsize = sizeof (struct ether_vlan_header); 1065 } else { 1066 eth_hsize = sizeof (struct ether_header); 1067 } 1068 1069 vnh->vnh_flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; 1070 vnh->vnh_csum_start = eth_hsize + csum_start; 1071 vnh->vnh_csum_offset = csum_stuff - csum_start; 1072 } 1073 1074 /* 1075 * Setup LSO fields if required. 1076 */ 1077 if (lso_required) { 1078 vnh->vnh_gso_type = VIRTIO_NET_HDR_GSO_TCPV4; 1079 vnh->vnh_gso_size = (uint16_t)lso_mss; 1080 } 1081 1082 /* 1083 * The device does not maintain its own statistics about broadcast or 1084 * multicast packets, so we have to check the destination address 1085 * ourselves. 1086 */ 1087 if ((ether->ether_dhost.ether_addr_octet[0] & 0x01) != 0) { 1088 mutex_enter(&vif->vif_mutex); 1089 if (ether_cmp(ðer->ether_dhost, vioif_broadcast) == 0) { 1090 vif->vif_brdcstxmt++; 1091 } else { 1092 vif->vif_multixmt++; 1093 } 1094 mutex_exit(&vif->vif_mutex); 1095 } 1096 1097 /* 1098 * For small packets, copy into the preallocated inline buffer rather 1099 * than incur the overhead of mapping. Note that both of these 1100 * functions ensure that "mp" is freed before returning. 1101 */ 1102 if (msg_size < vif->vif_txcopy_thresh) { 1103 ret = vioif_tx_inline(vif, tb, mp, msg_size); 1104 } else { 1105 ret = vioif_tx_external(vif, tb, mp, msg_size); 1106 } 1107 mp = NULL; 1108 1109 mutex_enter(&vif->vif_mutex); 1110 1111 if (ret != DDI_SUCCESS) { 1112 goto fail; 1113 } 1114 1115 vif->vif_opackets++; 1116 vif->vif_obytes += msg_size; 1117 mutex_exit(&vif->vif_mutex); 1118 1119 virtio_dma_sync(tb->tb_dma, DDI_DMA_SYNC_FORDEV); 1120 virtio_chain_submit(tb->tb_chain, B_TRUE); 1121 1122 return (B_TRUE); 1123 1124 fail: 1125 vif->vif_oerrors++; 1126 if (tb != NULL) { 1127 vioif_txbuf_free(vif, tb); 1128 } 1129 mutex_exit(&vif->vif_mutex); 1130 1131 return (mp == NULL); 1132 } 1133 1134 static mblk_t * 1135 vioif_m_tx(void *arg, mblk_t *mp) 1136 { 1137 vioif_t *vif = arg; 1138 mblk_t *nmp; 1139 1140 /* 1141 * Prior to attempting to send any more frames, do a reclaim to pick up 1142 * any descriptors which have been processed by the host. 1143 */ 1144 if (virtio_queue_nactive(vif->vif_tx_vq) != 0) { 1145 (void) vioif_reclaim_used_tx(vif); 1146 } 1147 1148 while (mp != NULL) { 1149 nmp = mp->b_next; 1150 mp->b_next = NULL; 1151 1152 if (!vioif_send(vif, mp)) { 1153 /* 1154 * If there are no descriptors available, try to 1155 * reclaim some, allowing a retry of the send if some 1156 * are found. 1157 */ 1158 mp->b_next = nmp; 1159 if (vioif_reclaim_used_tx(vif) != 0) { 1160 continue; 1161 } 1162 1163 /* 1164 * Otherwise, enable the TX ring interrupt so that as 1165 * soon as a descriptor becomes available, transmission 1166 * can begin again. For safety, make sure the periodic 1167 * reclaim is running as well. 1168 */ 1169 mutex_enter(&vif->vif_mutex); 1170 vif->vif_tx_corked = B_TRUE; 1171 virtio_queue_no_interrupt(vif->vif_tx_vq, B_FALSE); 1172 vioif_reclaim_restart(vif); 1173 mutex_exit(&vif->vif_mutex); 1174 return (mp); 1175 } 1176 mp = nmp; 1177 } 1178 1179 /* Ensure the periodic reclaim has been started. */ 1180 mutex_enter(&vif->vif_mutex); 1181 vioif_reclaim_restart(vif); 1182 mutex_exit(&vif->vif_mutex); 1183 1184 return (NULL); 1185 } 1186 1187 static int 1188 vioif_m_start(void *arg) 1189 { 1190 vioif_t *vif = arg; 1191 1192 mutex_enter(&vif->vif_mutex); 1193 1194 VERIFY3S(vif->vif_runstate, ==, VIOIF_RUNSTATE_STOPPED); 1195 vif->vif_runstate = VIOIF_RUNSTATE_RUNNING; 1196 1197 mac_link_update(vif->vif_mac_handle, LINK_STATE_UP); 1198 1199 virtio_queue_no_interrupt(vif->vif_rx_vq, B_FALSE); 1200 1201 /* 1202 * Starting interrupts on the TX virtqueue is unnecessary at this time. 1203 * Descriptor reclamation is handling during transmit, via a periodic 1204 * timer, and when resources are tight, via the then-enabled interrupt. 1205 */ 1206 vif->vif_tx_drain = B_FALSE; 1207 1208 /* 1209 * Add as many receive buffers as we can to the receive queue. If we 1210 * cannot add any, it may be because we have stopped and started again 1211 * and the descriptors are all in the queue already. 1212 */ 1213 (void) vioif_add_rx(vif); 1214 1215 mutex_exit(&vif->vif_mutex); 1216 return (DDI_SUCCESS); 1217 } 1218 1219 static void 1220 vioif_m_stop(void *arg) 1221 { 1222 vioif_t *vif = arg; 1223 1224 mutex_enter(&vif->vif_mutex); 1225 1226 VERIFY3S(vif->vif_runstate, ==, VIOIF_RUNSTATE_RUNNING); 1227 vif->vif_runstate = VIOIF_RUNSTATE_STOPPING; 1228 1229 /* Ensure all TX descriptors have been processed and reclaimed */ 1230 vioif_tx_drain(vif); 1231 1232 virtio_queue_no_interrupt(vif->vif_rx_vq, B_TRUE); 1233 1234 vif->vif_runstate = VIOIF_RUNSTATE_STOPPED; 1235 mutex_exit(&vif->vif_mutex); 1236 } 1237 1238 static int 1239 vioif_m_stat(void *arg, uint_t stat, uint64_t *val) 1240 { 1241 vioif_t *vif = arg; 1242 1243 switch (stat) { 1244 case MAC_STAT_IERRORS: 1245 *val = vif->vif_ierrors; 1246 break; 1247 case MAC_STAT_OERRORS: 1248 *val = vif->vif_oerrors; 1249 break; 1250 case MAC_STAT_MULTIRCV: 1251 *val = vif->vif_multircv; 1252 break; 1253 case MAC_STAT_BRDCSTRCV: 1254 *val = vif->vif_brdcstrcv; 1255 break; 1256 case MAC_STAT_MULTIXMT: 1257 *val = vif->vif_multixmt; 1258 break; 1259 case MAC_STAT_BRDCSTXMT: 1260 *val = vif->vif_brdcstxmt; 1261 break; 1262 case MAC_STAT_IPACKETS: 1263 *val = vif->vif_ipackets; 1264 break; 1265 case MAC_STAT_RBYTES: 1266 *val = vif->vif_rbytes; 1267 break; 1268 case MAC_STAT_OPACKETS: 1269 *val = vif->vif_opackets; 1270 break; 1271 case MAC_STAT_OBYTES: 1272 *val = vif->vif_obytes; 1273 break; 1274 case MAC_STAT_NORCVBUF: 1275 *val = vif->vif_norecvbuf; 1276 break; 1277 case MAC_STAT_NOXMTBUF: 1278 *val = vif->vif_notxbuf; 1279 break; 1280 case MAC_STAT_IFSPEED: 1281 /* always 1 Gbit */ 1282 *val = 1000000000ULL; 1283 break; 1284 case ETHER_STAT_LINK_DUPLEX: 1285 /* virtual device, always full-duplex */ 1286 *val = LINK_DUPLEX_FULL; 1287 break; 1288 1289 default: 1290 return (ENOTSUP); 1291 } 1292 1293 return (DDI_SUCCESS); 1294 } 1295 1296 static int 1297 vioif_m_setprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, 1298 uint_t pr_valsize, const void *pr_val) 1299 { 1300 vioif_t *vif = arg; 1301 1302 switch (pr_num) { 1303 case MAC_PROP_MTU: { 1304 int r; 1305 uint32_t mtu; 1306 if (pr_valsize < sizeof (mtu)) { 1307 return (EOVERFLOW); 1308 } 1309 bcopy(pr_val, &mtu, sizeof (mtu)); 1310 1311 if (mtu < ETHERMIN || mtu > vif->vif_mtu_max) { 1312 return (EINVAL); 1313 } 1314 1315 mutex_enter(&vif->vif_mutex); 1316 if ((r = mac_maxsdu_update(vif->vif_mac_handle, mtu)) == 0) { 1317 vif->vif_mtu = mtu; 1318 } 1319 mutex_exit(&vif->vif_mutex); 1320 1321 return (r); 1322 } 1323 1324 case MAC_PROP_PRIVATE: { 1325 long max, result; 1326 uint_t *resp; 1327 char *endptr; 1328 1329 if (strcmp(pr_name, VIOIF_MACPROP_TXCOPY_THRESH) == 0) { 1330 max = VIOIF_MACPROP_TXCOPY_THRESH_MAX; 1331 resp = &vif->vif_txcopy_thresh; 1332 } else if (strcmp(pr_name, VIOIF_MACPROP_RXCOPY_THRESH) == 0) { 1333 max = VIOIF_MACPROP_RXCOPY_THRESH_MAX; 1334 resp = &vif->vif_rxcopy_thresh; 1335 } else { 1336 return (ENOTSUP); 1337 } 1338 1339 if (pr_val == NULL) { 1340 return (EINVAL); 1341 } 1342 1343 if (ddi_strtol(pr_val, &endptr, 10, &result) != 0 || 1344 *endptr != '\0' || result < 0 || result > max) { 1345 return (EINVAL); 1346 } 1347 1348 mutex_enter(&vif->vif_mutex); 1349 *resp = result; 1350 mutex_exit(&vif->vif_mutex); 1351 1352 return (0); 1353 } 1354 1355 default: 1356 return (ENOTSUP); 1357 } 1358 } 1359 1360 static int 1361 vioif_m_getprop(void *arg, const char *pr_name, mac_prop_id_t pr_num, 1362 uint_t pr_valsize, void *pr_val) 1363 { 1364 vioif_t *vif = arg; 1365 1366 switch (pr_num) { 1367 case MAC_PROP_PRIVATE: { 1368 uint_t value; 1369 1370 if (strcmp(pr_name, VIOIF_MACPROP_TXCOPY_THRESH) == 0) { 1371 value = vif->vif_txcopy_thresh; 1372 } else if (strcmp(pr_name, VIOIF_MACPROP_RXCOPY_THRESH) == 0) { 1373 value = vif->vif_rxcopy_thresh; 1374 } else { 1375 return (ENOTSUP); 1376 } 1377 1378 if (snprintf(pr_val, pr_valsize, "%u", value) >= pr_valsize) { 1379 return (EOVERFLOW); 1380 } 1381 1382 return (0); 1383 } 1384 1385 default: 1386 return (ENOTSUP); 1387 } 1388 } 1389 1390 static void 1391 vioif_m_propinfo(void *arg, const char *pr_name, mac_prop_id_t pr_num, 1392 mac_prop_info_handle_t prh) 1393 { 1394 vioif_t *vif = arg; 1395 char valstr[64]; 1396 int value; 1397 1398 switch (pr_num) { 1399 case MAC_PROP_MTU: 1400 mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW); 1401 mac_prop_info_set_range_uint32(prh, ETHERMIN, vif->vif_mtu_max); 1402 return; 1403 1404 case MAC_PROP_PRIVATE: 1405 if (strcmp(pr_name, VIOIF_MACPROP_TXCOPY_THRESH) == 0) { 1406 value = VIOIF_MACPROP_TXCOPY_THRESH_DEF; 1407 } else if (strcmp(pr_name, VIOIF_MACPROP_RXCOPY_THRESH) == 0) { 1408 value = VIOIF_MACPROP_RXCOPY_THRESH_DEF; 1409 } else { 1410 /* 1411 * We do not recognise this private property name. 1412 */ 1413 return; 1414 } 1415 mac_prop_info_set_perm(prh, MAC_PROP_PERM_RW); 1416 (void) snprintf(valstr, sizeof (valstr), "%d", value); 1417 mac_prop_info_set_default_str(prh, valstr); 1418 return; 1419 1420 default: 1421 return; 1422 } 1423 } 1424 1425 static boolean_t 1426 vioif_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) 1427 { 1428 vioif_t *vif = arg; 1429 1430 switch (cap) { 1431 case MAC_CAPAB_HCKSUM: { 1432 if (!vif->vif_tx_csum) { 1433 return (B_FALSE); 1434 } 1435 1436 *(uint32_t *)cap_data = HCKSUM_INET_PARTIAL; 1437 1438 return (B_TRUE); 1439 } 1440 1441 case MAC_CAPAB_LSO: { 1442 if (!vif->vif_tx_tso4) { 1443 return (B_FALSE); 1444 } 1445 1446 mac_capab_lso_t *lso = cap_data; 1447 lso->lso_flags = LSO_TX_BASIC_TCP_IPV4; 1448 lso->lso_basic_tcp_ipv4.lso_max = VIOIF_RX_DATA_SIZE; 1449 1450 return (B_TRUE); 1451 } 1452 1453 default: 1454 return (B_FALSE); 1455 } 1456 } 1457 1458 static boolean_t 1459 vioif_has_feature(vioif_t *vif, uint32_t feature) 1460 { 1461 return (virtio_feature_present(vif->vif_virtio, feature)); 1462 } 1463 1464 /* 1465 * Read the primary MAC address from the device if one is provided. If not, 1466 * generate a random locally administered MAC address and write it back to the 1467 * device. 1468 */ 1469 static void 1470 vioif_get_mac(vioif_t *vif) 1471 { 1472 VERIFY(MUTEX_HELD(&vif->vif_mutex)); 1473 1474 if (vioif_has_feature(vif, VIRTIO_NET_F_MAC)) { 1475 for (uint_t i = 0; i < ETHERADDRL; i++) { 1476 vif->vif_mac[i] = virtio_dev_get8(vif->vif_virtio, 1477 VIRTIO_NET_CONFIG_MAC + i); 1478 } 1479 vif->vif_mac_from_host = 1; 1480 1481 return; 1482 } 1483 1484 /* Get a few random bytes */ 1485 (void) random_get_pseudo_bytes(vif->vif_mac, ETHERADDRL); 1486 /* Make sure it's a unicast MAC */ 1487 vif->vif_mac[0] &= ~1; 1488 /* Set the "locally administered" bit */ 1489 vif->vif_mac[1] |= 2; 1490 1491 /* 1492 * Write the random MAC address back to the device. 1493 */ 1494 for (uint_t i = 0; i < ETHERADDRL; i++) { 1495 virtio_dev_put8(vif->vif_virtio, VIRTIO_NET_CONFIG_MAC + i, 1496 vif->vif_mac[i]); 1497 } 1498 vif->vif_mac_from_host = 0; 1499 1500 dev_err(vif->vif_dip, CE_NOTE, "!Generated a random MAC address: " 1501 "%02x:%02x:%02x:%02x:%02x:%02x", 1502 (uint_t)vif->vif_mac[0], (uint_t)vif->vif_mac[1], 1503 (uint_t)vif->vif_mac[2], (uint_t)vif->vif_mac[3], 1504 (uint_t)vif->vif_mac[4], (uint_t)vif->vif_mac[5]); 1505 } 1506 1507 /* 1508 * Virtqueue interrupt handlers 1509 */ 1510 static uint_t 1511 vioif_rx_handler(caddr_t arg0, caddr_t arg1) 1512 { 1513 vioif_t *vif = (vioif_t *)arg0; 1514 1515 mutex_enter(&vif->vif_mutex); 1516 (void) vioif_process_rx(vif); 1517 1518 /* 1519 * Attempt to replenish the receive queue. If we cannot add any 1520 * descriptors here, it may be because all of the recently received 1521 * packets were loaned up to the networking stack. 1522 */ 1523 (void) vioif_add_rx(vif); 1524 mutex_exit(&vif->vif_mutex); 1525 1526 return (DDI_INTR_CLAIMED); 1527 } 1528 1529 static uint_t 1530 vioif_tx_handler(caddr_t arg0, caddr_t arg1) 1531 { 1532 vioif_t *vif = (vioif_t *)arg0; 1533 1534 /* 1535 * The TX interrupt could race with other reclamation activity, so 1536 * interpreting the return value is unimportant. 1537 */ 1538 (void) vioif_reclaim_used_tx(vif); 1539 1540 return (DDI_INTR_CLAIMED); 1541 } 1542 1543 static void 1544 vioif_check_features(vioif_t *vif) 1545 { 1546 VERIFY(MUTEX_HELD(&vif->vif_mutex)); 1547 1548 vif->vif_tx_csum = 0; 1549 vif->vif_tx_tso4 = 0; 1550 1551 if (vioif_has_feature(vif, VIRTIO_NET_F_CSUM)) { 1552 /* 1553 * The host will accept packets with partial checksums from us. 1554 */ 1555 vif->vif_tx_csum = 1; 1556 1557 /* 1558 * The legacy GSO feature represents the combination of 1559 * HOST_TSO4, HOST_TSO6, and HOST_ECN. 1560 */ 1561 boolean_t gso = vioif_has_feature(vif, VIRTIO_NET_F_GSO); 1562 boolean_t tso4 = vioif_has_feature(vif, VIRTIO_NET_F_HOST_TSO4); 1563 boolean_t ecn = vioif_has_feature(vif, VIRTIO_NET_F_HOST_ECN); 1564 1565 /* 1566 * Explicit congestion notification (ECN) is configured 1567 * globally; see "tcp_ecn_permitted". As we cannot currently 1568 * request that the stack disable ECN on a per interface basis, 1569 * we require the device to support the combination of 1570 * segmentation offload and ECN support. 1571 */ 1572 if (gso || (tso4 && ecn)) { 1573 vif->vif_tx_tso4 = 1; 1574 } 1575 } 1576 } 1577 1578 static int 1579 vioif_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 1580 { 1581 int ret; 1582 vioif_t *vif; 1583 virtio_t *vio; 1584 mac_register_t *macp = NULL; 1585 1586 if (cmd != DDI_ATTACH) { 1587 return (DDI_FAILURE); 1588 } 1589 1590 if ((vio = virtio_init(dip, VIRTIO_NET_WANTED_FEATURES, B_TRUE)) == 1591 NULL) { 1592 return (DDI_FAILURE); 1593 } 1594 1595 vif = kmem_zalloc(sizeof (*vif), KM_SLEEP); 1596 vif->vif_dip = dip; 1597 vif->vif_virtio = vio; 1598 vif->vif_runstate = VIOIF_RUNSTATE_STOPPED; 1599 ddi_set_driver_private(dip, vif); 1600 1601 if ((vif->vif_rx_vq = virtio_queue_alloc(vio, VIRTIO_NET_VIRTQ_RX, 1602 "rx", vioif_rx_handler, vif, B_FALSE, VIOIF_MAX_SEGS)) == NULL || 1603 (vif->vif_tx_vq = virtio_queue_alloc(vio, VIRTIO_NET_VIRTQ_TX, 1604 "tx", vioif_tx_handler, vif, B_FALSE, VIOIF_MAX_SEGS)) == NULL) { 1605 goto fail; 1606 } 1607 1608 if (virtio_init_complete(vio, 0) != DDI_SUCCESS) { 1609 dev_err(dip, CE_WARN, "failed to complete Virtio init"); 1610 goto fail; 1611 } 1612 1613 virtio_queue_no_interrupt(vif->vif_rx_vq, B_TRUE); 1614 virtio_queue_no_interrupt(vif->vif_tx_vq, B_TRUE); 1615 1616 mutex_init(&vif->vif_mutex, NULL, MUTEX_DRIVER, virtio_intr_pri(vio)); 1617 mutex_enter(&vif->vif_mutex); 1618 1619 vioif_get_mac(vif); 1620 1621 vif->vif_rxcopy_thresh = VIOIF_MACPROP_RXCOPY_THRESH_DEF; 1622 vif->vif_txcopy_thresh = VIOIF_MACPROP_TXCOPY_THRESH_DEF; 1623 1624 if (vioif_has_feature(vif, VIRTIO_NET_F_MTU)) { 1625 vif->vif_mtu_max = virtio_dev_get16(vio, VIRTIO_NET_CONFIG_MTU); 1626 } else { 1627 vif->vif_mtu_max = ETHERMTU; 1628 } 1629 1630 vif->vif_mtu = ETHERMTU; 1631 if (vif->vif_mtu > vif->vif_mtu_max) { 1632 vif->vif_mtu = vif->vif_mtu_max; 1633 } 1634 1635 vioif_check_features(vif); 1636 1637 if (vioif_alloc_bufs(vif) != 0) { 1638 mutex_exit(&vif->vif_mutex); 1639 dev_err(dip, CE_WARN, "failed to allocate memory"); 1640 goto fail; 1641 } 1642 1643 mutex_exit(&vif->vif_mutex); 1644 1645 if (virtio_interrupts_enable(vio) != DDI_SUCCESS) { 1646 dev_err(dip, CE_WARN, "failed to enable interrupts"); 1647 goto fail; 1648 } 1649 1650 if ((macp = mac_alloc(MAC_VERSION)) == NULL) { 1651 dev_err(dip, CE_WARN, "failed to allocate a mac_register"); 1652 goto fail; 1653 } 1654 1655 macp->m_type_ident = MAC_PLUGIN_IDENT_ETHER; 1656 macp->m_driver = vif; 1657 macp->m_dip = dip; 1658 macp->m_src_addr = vif->vif_mac; 1659 macp->m_callbacks = &vioif_mac_callbacks; 1660 macp->m_min_sdu = 0; 1661 macp->m_max_sdu = vif->vif_mtu; 1662 macp->m_margin = VLAN_TAGSZ; 1663 macp->m_priv_props = vioif_priv_props; 1664 1665 if ((ret = mac_register(macp, &vif->vif_mac_handle)) != 0) { 1666 dev_err(dip, CE_WARN, "mac_register() failed (%d)", ret); 1667 goto fail; 1668 } 1669 mac_free(macp); 1670 1671 mac_link_update(vif->vif_mac_handle, LINK_STATE_UP); 1672 1673 return (DDI_SUCCESS); 1674 1675 fail: 1676 vioif_free_bufs(vif); 1677 if (macp != NULL) { 1678 mac_free(macp); 1679 } 1680 (void) virtio_fini(vio, B_TRUE); 1681 kmem_free(vif, sizeof (*vif)); 1682 return (DDI_FAILURE); 1683 } 1684 1685 static int 1686 vioif_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 1687 { 1688 int r; 1689 vioif_t *vif; 1690 1691 if (cmd != DDI_DETACH) { 1692 return (DDI_FAILURE); 1693 } 1694 1695 if ((vif = ddi_get_driver_private(dip)) == NULL) { 1696 return (DDI_FAILURE); 1697 } 1698 1699 mutex_enter(&vif->vif_mutex); 1700 if (vif->vif_runstate != VIOIF_RUNSTATE_STOPPED) { 1701 dev_err(dip, CE_WARN, "!NIC still running, cannot detach"); 1702 mutex_exit(&vif->vif_mutex); 1703 return (DDI_FAILURE); 1704 } 1705 1706 /* 1707 * There should be no outstanding transmit buffers once the NIC is 1708 * completely stopped. 1709 */ 1710 VERIFY3U(vif->vif_ntxbufs_alloc, ==, 0); 1711 1712 /* 1713 * Though we cannot claw back all of the receive buffers until we reset 1714 * the device, we must ensure all those loaned to MAC have been 1715 * returned before calling mac_unregister(). 1716 */ 1717 if (vif->vif_nrxbufs_onloan > 0) { 1718 dev_err(dip, CE_WARN, "!%u receive buffers still loaned, " 1719 "cannot detach", vif->vif_nrxbufs_onloan); 1720 mutex_exit(&vif->vif_mutex); 1721 return (DDI_FAILURE); 1722 } 1723 1724 if ((r = mac_unregister(vif->vif_mac_handle)) != 0) { 1725 dev_err(dip, CE_WARN, "!MAC unregister failed (%d)", r); 1726 return (DDI_FAILURE); 1727 } 1728 mac_free(vif->vif_macp); 1729 1730 /* 1731 * Shut down the device so that we can recover any previously 1732 * submitted receive buffers. 1733 */ 1734 virtio_shutdown(vif->vif_virtio); 1735 for (;;) { 1736 virtio_chain_t *vic; 1737 1738 if ((vic = virtio_queue_evacuate(vif->vif_rx_vq)) == NULL) { 1739 break; 1740 } 1741 1742 vioif_rxbuf_t *rb = virtio_chain_data(vic); 1743 vioif_rxbuf_free(vif, rb); 1744 } 1745 1746 (void) virtio_fini(vif->vif_virtio, B_FALSE); 1747 1748 vioif_free_bufs(vif); 1749 1750 mutex_exit(&vif->vif_mutex); 1751 mutex_destroy(&vif->vif_mutex); 1752 1753 kmem_free(vif, sizeof (*vif)); 1754 1755 return (DDI_SUCCESS); 1756 } 1757 1758 static int 1759 vioif_quiesce(dev_info_t *dip) 1760 { 1761 vioif_t *vif; 1762 1763 if ((vif = ddi_get_driver_private(dip)) == NULL) 1764 return (DDI_FAILURE); 1765 1766 return (virtio_quiesce(vif->vif_virtio)); 1767 } 1768 1769 int 1770 _init(void) 1771 { 1772 int ret; 1773 1774 mac_init_ops(&vioif_dev_ops, "vioif"); 1775 1776 if ((ret = mod_install(&vioif_modlinkage)) != DDI_SUCCESS) { 1777 mac_fini_ops(&vioif_dev_ops); 1778 } 1779 1780 return (ret); 1781 } 1782 1783 int 1784 _fini(void) 1785 { 1786 int ret; 1787 1788 if ((ret = mod_remove(&vioif_modlinkage)) == DDI_SUCCESS) { 1789 mac_fini_ops(&vioif_dev_ops); 1790 } 1791 1792 return (ret); 1793 } 1794 1795 int 1796 _info(struct modinfo *modinfop) 1797 { 1798 return (mod_info(&vioif_modlinkage, modinfop)); 1799 } 1800