1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright 2019 Joyent, Inc. 25 * Copyright 2017 OmniTI Computer Consulting, Inc. All rights reserved. 26 * Copyright 2020 RackTop Systems, Inc. 27 * Copyright 2024 Oxide Computer Company 28 */ 29 30 #include <sys/types.h> 31 #include <sys/conf.h> 32 #include <sys/id_space.h> 33 #include <sys/esunddi.h> 34 #include <sys/stat.h> 35 #include <sys/mkdev.h> 36 #include <sys/stream.h> 37 #include <sys/strsubr.h> 38 #include <sys/dlpi.h> 39 #include <sys/modhash.h> 40 #include <sys/mac.h> 41 #include <sys/mac_provider.h> 42 #include <sys/mac_impl.h> 43 #include <sys/mac_client_impl.h> 44 #include <sys/mac_client_priv.h> 45 #include <sys/mac_soft_ring.h> 46 #include <sys/mac_stat.h> 47 #include <sys/dld.h> 48 #include <sys/modctl.h> 49 #include <sys/fs/dv_node.h> 50 #include <sys/thread.h> 51 #include <sys/proc.h> 52 #include <sys/callb.h> 53 #include <sys/cpuvar.h> 54 #include <sys/atomic.h> 55 #include <sys/sdt.h> 56 #include <sys/mac_flow.h> 57 #include <sys/ddi_intr_impl.h> 58 #include <sys/disp.h> 59 #include <sys/sdt.h> 60 #include <sys/pattr.h> 61 #include <sys/strsun.h> 62 #include <sys/vlan.h> 63 #include <inet/ip.h> 64 #include <inet/tcp.h> 65 #include <netinet/udp.h> 66 #include <netinet/sctp.h> 67 68 /* 69 * MAC Provider Interface. 70 * 71 * Interface for GLDv3 compatible NIC drivers. 72 */ 73 74 static void i_mac_notify_thread(void *); 75 76 typedef void (*mac_notify_default_cb_fn_t)(mac_impl_t *); 77 78 static const mac_notify_default_cb_fn_t mac_notify_cb_list[MAC_NNOTE] = { 79 mac_fanout_recompute, /* MAC_NOTE_LINK */ 80 NULL, /* MAC_NOTE_UNICST */ 81 NULL, /* MAC_NOTE_TX */ 82 NULL, /* MAC_NOTE_DEVPROMISC */ 83 NULL, /* MAC_NOTE_FASTPATH_FLUSH */ 84 NULL, /* MAC_NOTE_SDU_SIZE */ 85 NULL, /* MAC_NOTE_MARGIN */ 86 NULL, /* MAC_NOTE_CAPAB_CHG */ 87 NULL /* MAC_NOTE_LOWLINK */ 88 }; 89 90 /* 91 * Driver support functions. 92 */ 93 94 /* REGISTRATION */ 95 96 mac_register_t * 97 mac_alloc(uint_t mac_version) 98 { 99 mac_register_t *mregp; 100 101 /* 102 * Make sure there isn't a version mismatch between the driver and 103 * the framework. In the future, if multiple versions are 104 * supported, this check could become more sophisticated. 105 */ 106 if (mac_version != MAC_VERSION) 107 return (NULL); 108 109 mregp = kmem_zalloc(sizeof (mac_register_t), KM_SLEEP); 110 mregp->m_version = mac_version; 111 return (mregp); 112 } 113 114 void 115 mac_free(mac_register_t *mregp) 116 { 117 kmem_free(mregp, sizeof (mac_register_t)); 118 } 119 120 /* 121 * Convert a MAC's offload features into the equivalent DB_CKSUMFLAGS 122 * value. 
123 */ 124 static uint16_t 125 mac_features_to_flags(mac_handle_t mh) 126 { 127 uint16_t flags = 0; 128 uint32_t cap_sum = 0; 129 mac_capab_lso_t cap_lso; 130 131 if (mac_capab_get(mh, MAC_CAPAB_HCKSUM, &cap_sum)) { 132 if (cap_sum & HCKSUM_IPHDRCKSUM) 133 flags |= HCK_IPV4_HDRCKSUM; 134 135 if (cap_sum & HCKSUM_INET_PARTIAL) 136 flags |= HCK_PARTIALCKSUM; 137 else if (cap_sum & (HCKSUM_INET_FULL_V4 | HCKSUM_INET_FULL_V6)) 138 flags |= HCK_FULLCKSUM; 139 } 140 141 /* 142 * We don't need the information stored in 'cap_lso', but we 143 * need to pass a non-NULL pointer to appease the driver. 144 */ 145 if (mac_capab_get(mh, MAC_CAPAB_LSO, &cap_lso)) 146 flags |= HW_LSO; 147 148 return (flags); 149 } 150 151 /* 152 * mac_register() is how drivers register new MACs with the GLDv3 153 * framework. The mregp argument is allocated by drivers using the 154 * mac_alloc() function, and can be freed using mac_free() immediately upon 155 * return from mac_register(). Upon success (0 return value), the mhp 156 * opaque pointer becomes the driver's handle to its MAC interface, and is 157 * the argument to all other mac module entry points. 158 */ 159 /* ARGSUSED */ 160 int 161 mac_register(mac_register_t *mregp, mac_handle_t *mhp) 162 { 163 mac_impl_t *mip; 164 mactype_t *mtype; 165 int err = EINVAL; 166 struct devnames *dnp = NULL; 167 uint_t instance; 168 boolean_t style1_created = B_FALSE; 169 boolean_t style2_created = B_FALSE; 170 char *driver; 171 minor_t minor = 0; 172 173 /* A successful call to mac_init_ops() sets the DN_GLDV3_DRIVER flag. */ 174 if (!GLDV3_DRV(ddi_driver_major(mregp->m_dip))) 175 return (EINVAL); 176 177 /* Find the required MAC-Type plugin. */ 178 if ((mtype = mactype_getplugin(mregp->m_type_ident)) == NULL) 179 return (EINVAL); 180 181 /* Create a mac_impl_t to represent this MAC. */ 182 mip = kmem_cache_alloc(i_mac_impl_cachep, KM_SLEEP); 183 184 /* 185 * The mac is not ready for open yet. 186 */ 187 mip->mi_state_flags |= MIS_DISABLED; 188 189 /* 190 * When a mac is registered, the m_instance field can be set to: 191 * 192 * 0: Get the mac's instance number from m_dip. 193 * This is usually used for physical device dips. 194 * 195 * [1 .. MAC_MAX_MINOR-1]: Use the value as the mac's instance number. 196 * For example, when an aggregation is created with the key option, 197 * "key" will be used as the instance number. 198 * 199 * -1: Assign an instance number from [MAC_MAX_MINOR .. MAXMIN-1]. 200 * This is often used when a MAC of a virtual link is registered 201 * (e.g., aggregation when "key" is not specified, or vnic). 202 * 203 * Note that the instance number is used to derive the mi_minor field 204 * of mac_impl_t, which will then be used to derive the name of kstats 205 * and the devfs nodes. The first 2 cases are needed to preserve 206 * backward compatibility. 
207 */ 208 switch (mregp->m_instance) { 209 case 0: 210 instance = ddi_get_instance(mregp->m_dip); 211 break; 212 case ((uint_t)-1): 213 minor = mac_minor_hold(B_TRUE); 214 if (minor == 0) { 215 err = ENOSPC; 216 goto fail; 217 } 218 instance = minor - 1; 219 break; 220 default: 221 instance = mregp->m_instance; 222 if (instance >= MAC_MAX_MINOR) { 223 err = EINVAL; 224 goto fail; 225 } 226 break; 227 } 228 229 mip->mi_minor = (minor_t)(instance + 1); 230 mip->mi_dip = mregp->m_dip; 231 mip->mi_clients_list = NULL; 232 mip->mi_nclients = 0; 233 234 /* Set the default IEEE Port VLAN Identifier */ 235 mip->mi_pvid = 1; 236 237 /* Default bridge link learning protection values */ 238 mip->mi_llimit = 1000; 239 mip->mi_ldecay = 200; 240 241 driver = (char *)ddi_driver_name(mip->mi_dip); 242 243 /* Construct the MAC name as <drvname><instance> */ 244 (void) snprintf(mip->mi_name, sizeof (mip->mi_name), "%s%d", 245 driver, instance); 246 247 mip->mi_driver = mregp->m_driver; 248 249 mip->mi_type = mtype; 250 mip->mi_margin = mregp->m_margin; 251 mip->mi_info.mi_media = mtype->mt_type; 252 mip->mi_info.mi_nativemedia = mtype->mt_nativetype; 253 if (mregp->m_max_sdu <= mregp->m_min_sdu) 254 goto fail; 255 if (mregp->m_multicast_sdu == 0) 256 mregp->m_multicast_sdu = mregp->m_max_sdu; 257 if (mregp->m_multicast_sdu < mregp->m_min_sdu || 258 mregp->m_multicast_sdu > mregp->m_max_sdu) 259 goto fail; 260 mip->mi_sdu_min = mregp->m_min_sdu; 261 mip->mi_sdu_max = mregp->m_max_sdu; 262 mip->mi_sdu_multicast = mregp->m_multicast_sdu; 263 mip->mi_info.mi_addr_length = mip->mi_type->mt_addr_length; 264 /* 265 * If the media supports a broadcast address, cache a pointer to it 266 * in the mac_info_t so that upper layers can use it. 267 */ 268 mip->mi_info.mi_brdcst_addr = mip->mi_type->mt_brdcst_addr; 269 270 mip->mi_v12n_level = mregp->m_v12n; 271 272 /* 273 * Copy the unicast source address into the mac_info_t, but only if 274 * the MAC-Type defines a non-zero address length. We need to 275 * handle MAC-Types that have an address length of 0 276 * (point-to-point protocol MACs for example). 277 */ 278 if (mip->mi_type->mt_addr_length > 0) { 279 if (mregp->m_src_addr == NULL) 280 goto fail; 281 mip->mi_info.mi_unicst_addr = 282 kmem_alloc(mip->mi_type->mt_addr_length, KM_SLEEP); 283 bcopy(mregp->m_src_addr, mip->mi_info.mi_unicst_addr, 284 mip->mi_type->mt_addr_length); 285 286 /* 287 * Copy the fixed 'factory' MAC address from the immutable 288 * info. This is taken to be the MAC address currently in 289 * use. 290 */ 291 bcopy(mip->mi_info.mi_unicst_addr, mip->mi_addr, 292 mip->mi_type->mt_addr_length); 293 294 /* 295 * At this point, we should set up the classification 296 * rules etc but we delay it till mac_open() so that 297 * the resource discovery has taken place and we 298 * know someone wants to use the device. Otherwise 299 * memory gets allocated for Rx ring structures even 300 * during probe. 301 */ 302 303 /* Copy the destination address if one is provided. */ 304 if (mregp->m_dst_addr != NULL) { 305 bcopy(mregp->m_dst_addr, mip->mi_dstaddr, 306 mip->mi_type->mt_addr_length); 307 mip->mi_dstaddr_set = B_TRUE; 308 } 309 } else if (mregp->m_src_addr != NULL) { 310 goto fail; 311 } 312 313 /* 314 * The format of the m_pdata is specific to the plugin. It is 315 * passed in as an argument to all of the plugin callbacks. The 316 * driver can update this information by calling 317 * mac_pdata_update(). 
	 */
	if (mip->mi_type->mt_ops.mtops_ops & MTOPS_PDATA_VERIFY) {
		/*
		 * Verify that the supplied plugin data is valid. Note that
		 * even if the caller passed in a NULL pointer as plugin data,
		 * we still need to verify if that's valid as the plugin may
		 * require plugin data to function.
		 */
		if (!mip->mi_type->mt_ops.mtops_pdata_verify(mregp->m_pdata,
		    mregp->m_pdata_size)) {
			goto fail;
		}
		if (mregp->m_pdata != NULL) {
			mip->mi_pdata =
			    kmem_alloc(mregp->m_pdata_size, KM_SLEEP);
			bcopy(mregp->m_pdata, mip->mi_pdata,
			    mregp->m_pdata_size);
			mip->mi_pdata_size = mregp->m_pdata_size;
		}
	} else if (mregp->m_pdata != NULL) {
		/*
		 * The caller supplied non-NULL plugin data, but the plugin
		 * does not recognize plugin data.
		 */
		err = EINVAL;
		goto fail;
	}

	/*
	 * Register the private properties.
	 */
	mac_register_priv_prop(mip, mregp->m_priv_props);

	/*
	 * Stash the driver callbacks into the mac_impl_t, but first sanity
	 * check to make sure all mandatory callbacks are set.
	 */
	if (mregp->m_callbacks->mc_getstat == NULL ||
	    mregp->m_callbacks->mc_start == NULL ||
	    mregp->m_callbacks->mc_stop == NULL ||
	    mregp->m_callbacks->mc_setpromisc == NULL ||
	    mregp->m_callbacks->mc_multicst == NULL) {
		goto fail;
	}
	mip->mi_callbacks = mregp->m_callbacks;

	if (mac_capab_get((mac_handle_t)mip, MAC_CAPAB_LEGACY,
	    &mip->mi_capab_legacy)) {
		mip->mi_state_flags |= MIS_LEGACY;
		mip->mi_phy_dev = mip->mi_capab_legacy.ml_dev;
	} else {
		mip->mi_phy_dev = makedevice(ddi_driver_major(mip->mi_dip),
		    mip->mi_minor);
	}

	/*
	 * Allocate a notification thread. thread_create() blocks for memory
	 * if needed; it never fails.
	 */
	mip->mi_notify_thread = thread_create(NULL, 0, i_mac_notify_thread,
	    mip, 0, &p0, TS_RUN, minclsyspri);

	/*
	 * Cache the DB_CKSUMFLAGS that this MAC supports.
	 */
	mip->mi_tx_cksum_flags = mac_features_to_flags((mac_handle_t)mip);

	/*
	 * Initialize the capabilities.
	 */
	bzero(&mip->mi_rx_rings_cap, sizeof (mac_capab_rings_t));
	bzero(&mip->mi_tx_rings_cap, sizeof (mac_capab_rings_t));

	if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_VNIC, NULL))
		mip->mi_state_flags |= MIS_IS_VNIC;

	if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_AGGR, NULL))
		mip->mi_state_flags |= MIS_IS_AGGR;

	if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_OVERLAY, NULL))
		mip->mi_state_flags |= MIS_IS_OVERLAY;

	mac_addr_factory_init(mip);

	mac_transceiver_init(mip);

	mac_led_init(mip);

	/*
	 * Enforce the virtualization level registered.
	 */
	if (mip->mi_v12n_level & MAC_VIRT_LEVEL1) {
		if (mac_init_rings(mip, MAC_RING_TYPE_RX) != 0 ||
		    mac_init_rings(mip, MAC_RING_TYPE_TX) != 0)
			goto fail;

		/*
		 * The driver needs to register at least rx rings for this
		 * virtualization level.
		 */
		if (mip->mi_rx_groups == NULL)
			goto fail;
	}

	/*
	 * The driver must set the mc_unicst entry point to NULL when it
	 * advertises CAP_RINGS for rx groups.
	 */
	if (mip->mi_rx_groups != NULL) {
		if (mregp->m_callbacks->mc_unicst != NULL)
			goto fail;
	} else {
		if (mregp->m_callbacks->mc_unicst == NULL)
			goto fail;
	}

	/*
	 * Initialize MAC addresses. Must be called after mac_init_rings().
436 */ 437 mac_init_macaddr(mip); 438 439 mip->mi_share_capab.ms_snum = 0; 440 if (mip->mi_v12n_level & MAC_VIRT_HIO) { 441 (void) mac_capab_get((mac_handle_t)mip, MAC_CAPAB_SHARES, 442 &mip->mi_share_capab); 443 } 444 445 /* 446 * Initialize the kstats for this device. 447 */ 448 mac_driver_stat_create(mip); 449 450 /* Zero out any properties. */ 451 bzero(&mip->mi_resource_props, sizeof (mac_resource_props_t)); 452 453 if (mip->mi_minor <= MAC_MAX_MINOR) { 454 /* Create a style-2 DLPI device */ 455 if (ddi_create_minor_node(mip->mi_dip, driver, S_IFCHR, 0, 456 DDI_NT_NET, CLONE_DEV) != DDI_SUCCESS) 457 goto fail; 458 style2_created = B_TRUE; 459 460 /* Create a style-1 DLPI device */ 461 if (ddi_create_minor_node(mip->mi_dip, mip->mi_name, S_IFCHR, 462 mip->mi_minor, DDI_NT_NET, 0) != DDI_SUCCESS) 463 goto fail; 464 style1_created = B_TRUE; 465 } 466 467 mac_flow_l2tab_create(mip, &mip->mi_flow_tab); 468 469 rw_enter(&i_mac_impl_lock, RW_WRITER); 470 if (mod_hash_insert(i_mac_impl_hash, 471 (mod_hash_key_t)mip->mi_name, (mod_hash_val_t)mip) != 0) { 472 rw_exit(&i_mac_impl_lock); 473 err = EEXIST; 474 goto fail; 475 } 476 477 DTRACE_PROBE2(mac__register, struct devnames *, dnp, 478 (mac_impl_t *), mip); 479 480 /* 481 * Mark the MAC to be ready for open. 482 */ 483 mip->mi_state_flags &= ~MIS_DISABLED; 484 rw_exit(&i_mac_impl_lock); 485 486 atomic_inc_32(&i_mac_impl_count); 487 488 cmn_err(CE_NOTE, "!%s registered", mip->mi_name); 489 *mhp = (mac_handle_t)mip; 490 return (0); 491 492 fail: 493 if (style1_created) 494 ddi_remove_minor_node(mip->mi_dip, mip->mi_name); 495 496 if (style2_created) 497 ddi_remove_minor_node(mip->mi_dip, driver); 498 499 mac_addr_factory_fini(mip); 500 501 /* Clean up registered MAC addresses */ 502 mac_fini_macaddr(mip); 503 504 /* Clean up registered rings */ 505 mac_free_rings(mip, MAC_RING_TYPE_RX); 506 mac_free_rings(mip, MAC_RING_TYPE_TX); 507 508 /* Clean up notification thread */ 509 if (mip->mi_notify_thread != NULL) 510 i_mac_notify_exit(mip); 511 512 if (mip->mi_info.mi_unicst_addr != NULL) { 513 kmem_free(mip->mi_info.mi_unicst_addr, 514 mip->mi_type->mt_addr_length); 515 mip->mi_info.mi_unicst_addr = NULL; 516 } 517 518 mac_driver_stat_delete(mip); 519 520 if (mip->mi_type != NULL) { 521 atomic_dec_32(&mip->mi_type->mt_ref); 522 mip->mi_type = NULL; 523 } 524 525 if (mip->mi_pdata != NULL) { 526 kmem_free(mip->mi_pdata, mip->mi_pdata_size); 527 mip->mi_pdata = NULL; 528 mip->mi_pdata_size = 0; 529 } 530 531 if (minor != 0) { 532 ASSERT(minor > MAC_MAX_MINOR); 533 mac_minor_rele(minor); 534 } 535 536 mip->mi_state_flags = 0; 537 mac_unregister_priv_prop(mip); 538 539 /* 540 * Clear the state before destroying the mac_impl_t 541 */ 542 mip->mi_state_flags = 0; 543 544 kmem_cache_free(i_mac_impl_cachep, mip); 545 return (err); 546 } 547 548 /* 549 * Unregister from the GLDv3 framework 550 */ 551 int 552 mac_unregister(mac_handle_t mh) 553 { 554 int err; 555 mac_impl_t *mip = (mac_impl_t *)mh; 556 mod_hash_val_t val; 557 mac_margin_req_t *mmr, *nextmmr; 558 559 /* Fail the unregister if there are any open references to this mac. */ 560 if ((err = mac_disable_nowait(mh)) != 0) 561 return (err); 562 563 /* 564 * Clean up notification thread and wait for it to exit. 565 */ 566 i_mac_notify_exit(mip); 567 568 /* 569 * Prior to acquiring the MAC perimeter, remove the MAC instance from 570 * the internal hash table. 
	 * Such removal means table-walkers that acquire the perimeter will
	 * not do so on behalf of what we are unregistering, which prevents
	 * a deadlock.
	 */
	rw_enter(&i_mac_impl_lock, RW_WRITER);
	(void) mod_hash_remove(i_mac_impl_hash,
	    (mod_hash_key_t)mip->mi_name, &val);
	rw_exit(&i_mac_impl_lock);
	ASSERT(mip == (mac_impl_t *)val);

	i_mac_perim_enter(mip);

	/*
	 * If there are still resource properties configured over this mac,
	 * re-enable the fastpath.
	 */
	if (mip->mi_resource_props.mrp_mask != 0)
		mac_fastpath_enable((mac_handle_t)mip);

	if (mip->mi_minor < MAC_MAX_MINOR + 1) {
		ddi_remove_minor_node(mip->mi_dip, mip->mi_name);
		ddi_remove_minor_node(mip->mi_dip,
		    (char *)ddi_driver_name(mip->mi_dip));
	}

	ASSERT(mip->mi_nactiveclients == 0 && !(mip->mi_state_flags &
	    MIS_EXCLUSIVE));

	mac_driver_stat_delete(mip);

	ASSERT(i_mac_impl_count > 0);
	atomic_dec_32(&i_mac_impl_count);

	if (mip->mi_pdata != NULL)
		kmem_free(mip->mi_pdata, mip->mi_pdata_size);
	mip->mi_pdata = NULL;
	mip->mi_pdata_size = 0;

	/*
	 * Free the list of margin requests.
	 */
	for (mmr = mip->mi_mmrp; mmr != NULL; mmr = nextmmr) {
		nextmmr = mmr->mmr_nextp;
		kmem_free(mmr, sizeof (mac_margin_req_t));
	}
	mip->mi_mmrp = NULL;

	mip->mi_linkstate = mip->mi_lowlinkstate = LINK_STATE_UNKNOWN;
	kmem_free(mip->mi_info.mi_unicst_addr, mip->mi_type->mt_addr_length);
	mip->mi_info.mi_unicst_addr = NULL;

	atomic_dec_32(&mip->mi_type->mt_ref);
	mip->mi_type = NULL;

	/*
	 * Free the primary MAC address.
	 */
	mac_fini_macaddr(mip);

	/*
	 * Free all rings.
	 */
	mac_free_rings(mip, MAC_RING_TYPE_RX);
	mac_free_rings(mip, MAC_RING_TYPE_TX);

	mac_addr_factory_fini(mip);

	bzero(mip->mi_addr, MAXMACADDRLEN);
	bzero(mip->mi_dstaddr, MAXMACADDRLEN);
	mip->mi_dstaddr_set = B_FALSE;

	/* and the flows */
	mac_flow_tab_destroy(mip->mi_flow_tab);
	mip->mi_flow_tab = NULL;

	if (mip->mi_minor > MAC_MAX_MINOR)
		mac_minor_rele(mip->mi_minor);

	cmn_err(CE_NOTE, "!%s unregistered", mip->mi_name);

	/*
	 * Reset the perimeter-related fields to default values before
	 * kmem_cache_free().
	 */
	i_mac_perim_exit(mip);
	mip->mi_state_flags = 0;

	mac_unregister_priv_prop(mip);

	ASSERT(mip->mi_bridge_link == NULL);
	kmem_cache_free(i_mac_impl_cachep, mip);

	return (0);
}

/* DATA RECEPTION */

/*
 * This function is invoked for packets received by the MAC driver in
 * interrupt context. The ring generation number provided by the driver
 * is matched with the ring generation number held in MAC. If they do not
 * match, received packets are considered stale packets coming from an older
 * assignment of the ring. Drop them.
 */
void
mac_rx_ring(mac_handle_t mh, mac_ring_handle_t mrh, mblk_t *mp_chain,
    uint64_t mr_gen_num)
{
	mac_ring_t *mr = (mac_ring_t *)mrh;

	if ((mr != NULL) && (mr->mr_gen_num != mr_gen_num)) {
		DTRACE_PROBE2(mac__rx__rings__stale__packet, uint64_t,
		    mr->mr_gen_num, uint64_t, mr_gen_num);
		freemsgchain(mp_chain);
		return;
	}
	mac_rx(mh, (mac_resource_handle_t)mrh, mp_chain);
}

/*
 * This function is invoked for each packet received by the underlying driver.
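 * Drivers that register Rx rings normally deliver their chains through
 * mac_rx_ring() above, so that packets carrying a stale ring generation
 * number are freed before reaching this point; mac_rx() is also the direct
 * entry point for providers without rings.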
690 */ 691 void 692 mac_rx(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain) 693 { 694 mac_impl_t *mip = (mac_impl_t *)mh; 695 696 /* 697 * Check if the link is part of a bridge. If not, then we don't need 698 * to take the lock to remain consistent. Make this common case 699 * lock-free and tail-call optimized. 700 */ 701 if (mip->mi_bridge_link == NULL) { 702 mac_rx_common(mh, mrh, mp_chain); 703 } else { 704 /* 705 * Once we take a reference on the bridge link, the bridge 706 * module itself can't unload, so the callback pointers are 707 * stable. 708 */ 709 mutex_enter(&mip->mi_bridge_lock); 710 if ((mh = mip->mi_bridge_link) != NULL) 711 mac_bridge_ref_cb(mh, B_TRUE); 712 mutex_exit(&mip->mi_bridge_lock); 713 if (mh == NULL) { 714 mac_rx_common((mac_handle_t)mip, mrh, mp_chain); 715 } else { 716 mac_bridge_rx_cb(mh, mrh, mp_chain); 717 mac_bridge_ref_cb(mh, B_FALSE); 718 } 719 } 720 } 721 722 /* 723 * Special case function: this allows snooping of packets transmitted and 724 * received by TRILL. By design, they go directly into the TRILL module. 725 */ 726 void 727 mac_trill_snoop(mac_handle_t mh, mblk_t *mp) 728 { 729 mac_impl_t *mip = (mac_impl_t *)mh; 730 731 if (mip->mi_promisc_list != NULL) 732 mac_promisc_dispatch(mip, mp, NULL, B_FALSE); 733 } 734 735 /* 736 * This is the upward reentry point for packets arriving from the bridging 737 * module and from mac_rx for links not part of a bridge. 738 */ 739 void 740 mac_rx_common(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain) 741 { 742 mac_impl_t *mip = (mac_impl_t *)mh; 743 mac_ring_t *mr = (mac_ring_t *)mrh; 744 mac_soft_ring_set_t *mac_srs; 745 mblk_t *bp = mp_chain; 746 747 /* 748 * If there are any promiscuous mode callbacks defined for 749 * this MAC, pass them a copy if appropriate. 750 */ 751 if (mip->mi_promisc_list != NULL) 752 mac_promisc_dispatch(mip, mp_chain, NULL, B_FALSE); 753 754 if (mr != NULL) { 755 /* 756 * If the SRS teardown has started, just return. The 'mr' 757 * continues to be valid until the driver unregisters the MAC. 758 * Hardware classified packets will not make their way up 759 * beyond this point once the teardown has started. The driver 760 * is never passed a pointer to a flow entry or SRS or any 761 * structure that can be freed much before mac_unregister. 762 */ 763 mutex_enter(&mr->mr_lock); 764 if ((mr->mr_state != MR_INUSE) || (mr->mr_flag & 765 (MR_INCIPIENT | MR_CONDEMNED | MR_QUIESCE))) { 766 mutex_exit(&mr->mr_lock); 767 freemsgchain(mp_chain); 768 return; 769 } 770 771 /* 772 * The ring is in passthru mode; pass the chain up to 773 * the pseudo ring. 774 */ 775 if (mr->mr_classify_type == MAC_PASSTHRU_CLASSIFIER) { 776 MR_REFHOLD_LOCKED(mr); 777 mutex_exit(&mr->mr_lock); 778 mr->mr_pt_fn(mr->mr_pt_arg1, mr->mr_pt_arg2, mp_chain, 779 B_FALSE); 780 MR_REFRELE(mr); 781 return; 782 } 783 784 /* 785 * The passthru callback should only be set when in 786 * MAC_PASSTHRU_CLASSIFIER mode. 787 */ 788 ASSERT3P(mr->mr_pt_fn, ==, NULL); 789 790 /* 791 * We check if an SRS is controlling this ring. 792 * If so, we can directly call the srs_lower_proc 793 * routine otherwise we need to go through mac_rx_classify 794 * to reach the right place. 795 */ 796 if (mr->mr_classify_type == MAC_HW_CLASSIFIER) { 797 MR_REFHOLD_LOCKED(mr); 798 mutex_exit(&mr->mr_lock); 799 ASSERT3P(mr->mr_srs, !=, NULL); 800 mac_srs = mr->mr_srs; 801 802 /* 803 * This is the fast path. All packets received 804 * on this ring are hardware classified and 805 * share the same MAC header info. 
806 */ 807 mac_srs->srs_rx.sr_lower_proc(mh, 808 (mac_resource_handle_t)mac_srs, mp_chain, B_FALSE); 809 MR_REFRELE(mr); 810 return; 811 } 812 813 mutex_exit(&mr->mr_lock); 814 /* We'll fall through to software classification */ 815 } else { 816 flow_entry_t *flent; 817 int err; 818 819 rw_enter(&mip->mi_rw_lock, RW_READER); 820 if (mip->mi_single_active_client != NULL) { 821 flent = mip->mi_single_active_client->mci_flent_list; 822 FLOW_TRY_REFHOLD(flent, err); 823 rw_exit(&mip->mi_rw_lock); 824 if (err == 0) { 825 (flent->fe_cb_fn)(flent->fe_cb_arg1, 826 flent->fe_cb_arg2, mp_chain, B_FALSE); 827 FLOW_REFRELE(flent); 828 return; 829 } 830 } else { 831 rw_exit(&mip->mi_rw_lock); 832 } 833 } 834 835 if (!FLOW_TAB_EMPTY(mip->mi_flow_tab)) { 836 if ((bp = mac_rx_flow(mh, mrh, bp)) == NULL) 837 return; 838 } 839 840 freemsgchain(bp); 841 } 842 843 /* DATA TRANSMISSION */ 844 845 /* 846 * A driver's notification to resume transmission, in case of a provider 847 * without TX rings. 848 */ 849 void 850 mac_tx_update(mac_handle_t mh) 851 { 852 mac_tx_ring_update(mh, NULL); 853 } 854 855 /* 856 * A driver's notification to resume transmission on the specified TX ring. 857 */ 858 void 859 mac_tx_ring_update(mac_handle_t mh, mac_ring_handle_t rh) 860 { 861 i_mac_tx_srs_notify((mac_impl_t *)mh, rh); 862 } 863 864 /* LINK STATE */ 865 /* 866 * Notify the MAC layer about a link state change 867 */ 868 void 869 mac_link_update(mac_handle_t mh, link_state_t link) 870 { 871 mac_impl_t *mip = (mac_impl_t *)mh; 872 873 /* 874 * Save the link state. 875 */ 876 mip->mi_lowlinkstate = link; 877 878 /* 879 * Send a MAC_NOTE_LOWLINK notification. This tells the notification 880 * thread to deliver both lower and upper notifications. 881 */ 882 i_mac_notify(mip, MAC_NOTE_LOWLINK); 883 } 884 885 /* 886 * Notify the MAC layer about a link state change due to bridging. 887 */ 888 void 889 mac_link_redo(mac_handle_t mh, link_state_t link) 890 { 891 mac_impl_t *mip = (mac_impl_t *)mh; 892 893 /* 894 * Save the link state. 895 */ 896 mip->mi_linkstate = link; 897 898 /* 899 * Send a MAC_NOTE_LINK notification. Only upper notifications are 900 * made. 901 */ 902 i_mac_notify(mip, MAC_NOTE_LINK); 903 } 904 905 /* MINOR NODE HANDLING */ 906 907 /* 908 * Given a dev_t, return the instance number (PPA) associated with it. 909 * Drivers can use this in their getinfo(9e) implementation to lookup 910 * the instance number (i.e. PPA) of the device, to use as an index to 911 * their own array of soft state structures. 912 * 913 * Returns -1 on error. 914 */ 915 int 916 mac_devt_to_instance(dev_t devt) 917 { 918 return (dld_devt_to_instance(devt)); 919 } 920 921 /* 922 * Drivers that make use of the private minor number space are expected to 923 * provide their own getinfo(9e) entry point. This function simply forwards 924 * to the default MAC framework getinfo(9e) implementation as a convenience 925 * if they don't need any special mapping (mac instance != ddi_get_instance()) 926 */ 927 int 928 mac_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp) 929 { 930 return (dld_getinfo(dip, cmd, arg, resp)); 931 } 932 933 /* 934 * This function returns the first minor number that is available for 935 * driver private use. All minor numbers smaller than this are 936 * reserved for GLDv3 use. 937 */ 938 minor_t 939 mac_private_minor(void) 940 { 941 return (MAC_PRIVATE_MINOR); 942 } 943 944 /* OTHER CONTROL INFORMATION */ 945 946 /* 947 * A driver notified us that its primary MAC address has changed. 
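 *
 * As an illustrative sketch (not drawn from any particular driver), a
 * hypothetical driver "xx" that detects such a change could pass the new
 * address up as:
 *
 *	mac_unicst_update(xxp->xx_mac_handle, xxp->xx_curr_addr);
 *
 * where xx_mac_handle and xx_curr_addr are hypothetical soft-state fields
 * holding the handle obtained from mac_register() and the current address.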
 */
void
mac_unicst_update(mac_handle_t mh, const uint8_t *addr)
{
	mac_impl_t *mip = (mac_impl_t *)mh;

	if (mip->mi_type->mt_addr_length == 0)
		return;

	i_mac_perim_enter(mip);

	/*
	 * If address changes, freshen the MAC address value and update
	 * all MAC clients that share this MAC address.
	 */
	if (bcmp(addr, mip->mi_addr, mip->mi_type->mt_addr_length) != 0) {
		mac_freshen_macaddr(mac_find_macaddr(mip, mip->mi_addr),
		    (uint8_t *)addr);
	}

	i_mac_perim_exit(mip);

	/*
	 * Send a MAC_NOTE_UNICST notification.
	 */
	i_mac_notify(mip, MAC_NOTE_UNICST);
}

void
mac_dst_update(mac_handle_t mh, const uint8_t *addr)
{
	mac_impl_t *mip = (mac_impl_t *)mh;

	if (mip->mi_type->mt_addr_length == 0)
		return;

	i_mac_perim_enter(mip);
	bcopy(addr, mip->mi_dstaddr, mip->mi_type->mt_addr_length);
	i_mac_perim_exit(mip);
	i_mac_notify(mip, MAC_NOTE_DEST);
}

/*
 * MAC plugin information changed.
 */
int
mac_pdata_update(mac_handle_t mh, void *mac_pdata, size_t dsize)
{
	mac_impl_t *mip = (mac_impl_t *)mh;

	/*
	 * Verify that the plugin supports MAC plugin data and that the
	 * supplied data is valid.
	 */
	if (!(mip->mi_type->mt_ops.mtops_ops & MTOPS_PDATA_VERIFY))
		return (EINVAL);
	if (!mip->mi_type->mt_ops.mtops_pdata_verify(mac_pdata, dsize))
		return (EINVAL);

	if (mip->mi_pdata != NULL)
		kmem_free(mip->mi_pdata, mip->mi_pdata_size);

	mip->mi_pdata = kmem_alloc(dsize, KM_SLEEP);
	bcopy(mac_pdata, mip->mi_pdata, dsize);
	mip->mi_pdata_size = dsize;

	/*
	 * Since the MAC plugin data is used to construct MAC headers that
	 * were cached in fast-path headers, we need to flush fast-path
	 * information for links associated with this mac.
	 */
	i_mac_notify(mip, MAC_NOTE_FASTPATH_FLUSH);
	return (0);
}
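/*
 * A hedged usage sketch (the names below are hypothetical): a driver whose
 * MAC-Type plugin carries plugin data would, after that data changes, call
 *
 *	(void) mac_pdata_update(mh, new_pdata, new_pdata_size);
 *
 * and the MAC_NOTE_FASTPATH_FLUSH notification issued above invalidates any
 * fast-path headers that were built from the old plugin data.
 */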
/*
 * The mac provider or mac framework calls this function when it wants
 * to notify upstream consumers that the capabilities have changed and
 * that they should modify their own internal state accordingly.
 *
 * We currently have no regard for the fact that a provider could
 * decide to drop capabilities which would invalidate pending traffic.
 * For example, if one were to disable the Tx checksum offload while
 * TCP/IP traffic was being sent by mac clients relying on that
 * feature, then those packets would hit the wire with missing or
 * partial checksums. A proper solution involves not only providing
 * notification, but also performing client quiescing. That is, a capab
 * change should be treated as an atomic transaction that forms a
 * barrier between traffic relying on the current capabs and traffic
 * relying on the new capabs. In practice, simnet is currently the
 * only provider that could hit this, and it's an easily avoidable
 * situation (and at worst it should only lead to some dropped
 * packets). But if we ever want better on-the-fly capab change to
 * actual hardware providers, then we should give this update
 * mechanism a proper implementation.
 */
void
mac_capab_update(mac_handle_t mh)
{
	/*
	 * Send a MAC_NOTE_CAPAB_CHG notification to alert upstream
	 * clients to renegotiate capabilities.
	 */
	i_mac_notify((mac_impl_t *)mh, MAC_NOTE_CAPAB_CHG);
}

/*
 * Used by normal drivers to update the max sdu size.
 * We need to handle the case of a smaller mi_sdu_multicast
 * since this is called by mac_set_mtu() even for drivers that
 * have differing unicast and multicast mtu and we don't want to
 * increase the multicast mtu by accident in that case.
 */
int
mac_maxsdu_update(mac_handle_t mh, uint_t sdu_max)
{
	mac_impl_t *mip = (mac_impl_t *)mh;

	if (sdu_max == 0 || sdu_max < mip->mi_sdu_min)
		return (EINVAL);
	mip->mi_sdu_max = sdu_max;
	if (mip->mi_sdu_multicast > mip->mi_sdu_max)
		mip->mi_sdu_multicast = mip->mi_sdu_max;

	/* Send a MAC_NOTE_SDU_SIZE notification. */
	i_mac_notify(mip, MAC_NOTE_SDU_SIZE);
	return (0);
}

/*
 * Version of the above function that is used by drivers that have a different
 * max sdu size for multicast/broadcast vs. unicast.
 */
int
mac_maxsdu_update2(mac_handle_t mh, uint_t sdu_max, uint_t sdu_multicast)
{
	mac_impl_t *mip = (mac_impl_t *)mh;

	if (sdu_max == 0 || sdu_max < mip->mi_sdu_min)
		return (EINVAL);
	if (sdu_multicast == 0)
		sdu_multicast = sdu_max;
	if (sdu_multicast > sdu_max || sdu_multicast < mip->mi_sdu_min)
		return (EINVAL);
	mip->mi_sdu_max = sdu_max;
	mip->mi_sdu_multicast = sdu_multicast;

	/* Send a MAC_NOTE_SDU_SIZE notification. */
	i_mac_notify(mip, MAC_NOTE_SDU_SIZE);
	return (0);
}
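/*
 * A minimal sketch of driver usage, assuming a hypothetical driver "xx"
 * that handles an MTU change in its mc_setprop entry point:
 *
 *	if (mac_maxsdu_update(xxp->xx_mac_handle, new_mtu) != 0)
 *		return (EINVAL);
 *
 * A driver with distinct unicast and multicast maximums would call
 * mac_maxsdu_update2() with both values instead.
 */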
static void
mac_ring_intr_retarget(mac_group_t *group, mac_ring_t *ring)
{
	mac_client_impl_t *mcip;
	flow_entry_t *flent;
	mac_soft_ring_set_t *mac_rx_srs;
	mac_cpus_t *srs_cpu;
	int i;

	if (((mcip = MAC_GROUP_ONLY_CLIENT(group)) != NULL) &&
	    (!ring->mr_info.mri_intr.mi_ddi_shared)) {
		/* interrupt can be re-targeted */
		ASSERT(group->mrg_state == MAC_GROUP_STATE_RESERVED);
		flent = mcip->mci_flent;
		if (ring->mr_type == MAC_RING_TYPE_RX) {
			for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
				mac_rx_srs = flent->fe_rx_srs[i];
				if (mac_rx_srs->srs_ring != ring)
					continue;
				srs_cpu = &mac_rx_srs->srs_cpu;
				mutex_enter(&cpu_lock);
				mac_rx_srs_retarget_intr(mac_rx_srs,
				    srs_cpu->mc_rx_intr_cpu);
				mutex_exit(&cpu_lock);
				break;
			}
		} else {
			if (flent->fe_tx_srs != NULL) {
				mutex_enter(&cpu_lock);
				mac_tx_srs_retarget_intr(
				    flent->fe_tx_srs);
				mutex_exit(&cpu_lock);
			}
		}
	}
}

/*
 * Clients like aggr create pseudo rings (mac_ring_t) and expose them to
 * their clients. There is a 1-1 mapping between a pseudo ring and the
 * hardware ring. DDI interrupt handles are exported from the hardware ring
 * to the pseudo ring. Thus when the interrupt handle changes, clients of
 * aggr that are using the handle need to use the new handle and
 * re-target their interrupts.
 */
static void
mac_pseudo_ring_intr_retarget(mac_impl_t *mip, mac_ring_t *ring,
    ddi_intr_handle_t ddh)
{
	mac_ring_t *pring;
	mac_group_t *pgroup;
	mac_impl_t *pmip;
	char macname[MAXNAMELEN];
	mac_perim_handle_t p_mph;
	uint64_t saved_gen_num;

again:
	pring = (mac_ring_t *)ring->mr_prh;
	pgroup = (mac_group_t *)pring->mr_gh;
	pmip = (mac_impl_t *)pgroup->mrg_mh;
	saved_gen_num = ring->mr_gen_num;
	(void) strlcpy(macname, pmip->mi_name, MAXNAMELEN);
	/*
	 * We need to enter aggr's perimeter. The locking hierarchy
	 * dictates that aggr's perimeter should be entered first
	 * and then the port's perimeter. So drop the port's
	 * perimeter, enter aggr's and then re-enter port's
	 * perimeter.
	 */
	i_mac_perim_exit(mip);
	/*
	 * While we know pmip is the aggr's mip, there is a
	 * possibility that aggr could have unregistered by
	 * the time we exit port's perimeter (mip) and
	 * enter aggr's perimeter (pmip). To avoid that
	 * scenario, enter aggr's perimeter using its name.
	 */
	if (mac_perim_enter_by_macname(macname, &p_mph) != 0)
		return;
	i_mac_perim_enter(mip);
	/*
	 * Check if the ring got assigned to another aggregation before
	 * we could enter aggr's and the port's perimeter. When a ring
	 * gets deleted from an aggregation, it calls mac_stop_ring()
	 * which increments the generation number. So checking the
	 * generation number is enough.
	 */
	if (ring->mr_gen_num != saved_gen_num && ring->mr_prh != NULL) {
		i_mac_perim_exit(mip);
		mac_perim_exit(p_mph);
		i_mac_perim_enter(mip);
		goto again;
	}

	/* Check if pseudo ring is still present */
	if (ring->mr_prh != NULL) {
		pring->mr_info.mri_intr.mi_ddi_handle = ddh;
		pring->mr_info.mri_intr.mi_ddi_shared =
		    ring->mr_info.mri_intr.mi_ddi_shared;
		if (ddh != NULL)
			mac_ring_intr_retarget(pgroup, pring);
	}
	i_mac_perim_exit(mip);
	mac_perim_exit(p_mph);
}

/*
 * API called by the driver to provide a new interrupt handle for TX/RX
 * rings. This usually happens when the IRM (Interrupt Resource Management)
 * framework either gives the driver more MSI-X interrupts or takes
 * away MSI-X interrupts from the driver.
1210 */ 1211 void 1212 mac_ring_intr_set(mac_ring_handle_t mrh, ddi_intr_handle_t ddh) 1213 { 1214 mac_ring_t *ring = (mac_ring_t *)mrh; 1215 mac_group_t *group = (mac_group_t *)ring->mr_gh; 1216 mac_impl_t *mip = (mac_impl_t *)group->mrg_mh; 1217 1218 i_mac_perim_enter(mip); 1219 ring->mr_info.mri_intr.mi_ddi_handle = ddh; 1220 if (ddh == NULL) { 1221 /* Interrupts being reset */ 1222 ring->mr_info.mri_intr.mi_ddi_shared = B_FALSE; 1223 if (ring->mr_prh != NULL) { 1224 mac_pseudo_ring_intr_retarget(mip, ring, ddh); 1225 return; 1226 } 1227 } else { 1228 /* New interrupt handle */ 1229 mac_compare_ddi_handle(mip->mi_rx_groups, 1230 mip->mi_rx_group_count, ring); 1231 if (!ring->mr_info.mri_intr.mi_ddi_shared) { 1232 mac_compare_ddi_handle(mip->mi_tx_groups, 1233 mip->mi_tx_group_count, ring); 1234 } 1235 if (ring->mr_prh != NULL) { 1236 mac_pseudo_ring_intr_retarget(mip, ring, ddh); 1237 return; 1238 } else { 1239 mac_ring_intr_retarget(group, ring); 1240 } 1241 } 1242 i_mac_perim_exit(mip); 1243 } 1244 1245 /* PRIVATE FUNCTIONS, FOR INTERNAL USE ONLY */ 1246 1247 /* 1248 * Updates the mac_impl structure with the current state of the link 1249 */ 1250 static void 1251 i_mac_log_link_state(mac_impl_t *mip) 1252 { 1253 /* 1254 * If no change, then it is not interesting. 1255 */ 1256 if (mip->mi_lastlowlinkstate == mip->mi_lowlinkstate) 1257 return; 1258 1259 switch (mip->mi_lowlinkstate) { 1260 case LINK_STATE_UP: 1261 if (mip->mi_type->mt_ops.mtops_ops & MTOPS_LINK_DETAILS) { 1262 char det[200]; 1263 1264 mip->mi_type->mt_ops.mtops_link_details(det, 1265 sizeof (det), (mac_handle_t)mip, mip->mi_pdata); 1266 1267 cmn_err(CE_NOTE, "!%s link up, %s", mip->mi_name, det); 1268 } else { 1269 cmn_err(CE_NOTE, "!%s link up", mip->mi_name); 1270 } 1271 break; 1272 1273 case LINK_STATE_DOWN: 1274 /* 1275 * Only transitions from UP to DOWN are interesting 1276 */ 1277 if (mip->mi_lastlowlinkstate != LINK_STATE_UNKNOWN) 1278 cmn_err(CE_NOTE, "!%s link down", mip->mi_name); 1279 break; 1280 1281 case LINK_STATE_UNKNOWN: 1282 /* 1283 * This case is normally not interesting. 1284 */ 1285 break; 1286 } 1287 mip->mi_lastlowlinkstate = mip->mi_lowlinkstate; 1288 } 1289 1290 /* 1291 * Main routine for the callbacks notifications thread 1292 */ 1293 static void 1294 i_mac_notify_thread(void *arg) 1295 { 1296 mac_impl_t *mip = arg; 1297 callb_cpr_t cprinfo; 1298 mac_cb_t *mcb; 1299 mac_cb_info_t *mcbi; 1300 mac_notify_cb_t *mncb; 1301 1302 mcbi = &mip->mi_notify_cb_info; 1303 CALLB_CPR_INIT(&cprinfo, mcbi->mcbi_lockp, callb_generic_cpr, 1304 "i_mac_notify_thread"); 1305 1306 mutex_enter(mcbi->mcbi_lockp); 1307 1308 for (;;) { 1309 uint32_t bits; 1310 uint32_t type; 1311 1312 bits = mip->mi_notify_bits; 1313 if (bits == 0) { 1314 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1315 cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp); 1316 CALLB_CPR_SAFE_END(&cprinfo, mcbi->mcbi_lockp); 1317 continue; 1318 } 1319 mip->mi_notify_bits = 0; 1320 if ((bits & (1 << MAC_NNOTE)) != 0) { 1321 /* request to quit */ 1322 ASSERT(mip->mi_state_flags & MIS_DISABLED); 1323 break; 1324 } 1325 1326 mutex_exit(mcbi->mcbi_lockp); 1327 1328 /* 1329 * Log link changes on the actual link, but then do reports on 1330 * synthetic state (if part of a bridge). 
1331 */ 1332 if ((bits & (1 << MAC_NOTE_LOWLINK)) != 0) { 1333 link_state_t newstate; 1334 mac_handle_t mh; 1335 1336 i_mac_log_link_state(mip); 1337 newstate = mip->mi_lowlinkstate; 1338 if (mip->mi_bridge_link != NULL) { 1339 mutex_enter(&mip->mi_bridge_lock); 1340 if ((mh = mip->mi_bridge_link) != NULL) { 1341 newstate = mac_bridge_ls_cb(mh, 1342 newstate); 1343 } 1344 mutex_exit(&mip->mi_bridge_lock); 1345 } 1346 if (newstate != mip->mi_linkstate) { 1347 mip->mi_linkstate = newstate; 1348 bits |= 1 << MAC_NOTE_LINK; 1349 } 1350 } 1351 1352 /* 1353 * Depending on which capabs have changed, the Tx 1354 * checksum flags may also need to be updated. 1355 */ 1356 if ((bits & (1 << MAC_NOTE_CAPAB_CHG)) != 0) { 1357 mac_perim_handle_t mph; 1358 mac_handle_t mh = (mac_handle_t)mip; 1359 1360 mac_perim_enter_by_mh(mh, &mph); 1361 mip->mi_tx_cksum_flags = mac_features_to_flags(mh); 1362 mac_perim_exit(mph); 1363 } 1364 1365 /* 1366 * Do notification callbacks for each notification type. 1367 */ 1368 for (type = 0; type < MAC_NNOTE; type++) { 1369 if ((bits & (1 << type)) == 0) { 1370 continue; 1371 } 1372 1373 if (mac_notify_cb_list[type] != NULL) 1374 (*mac_notify_cb_list[type])(mip); 1375 1376 /* 1377 * Walk the list of notifications. 1378 */ 1379 MAC_CALLBACK_WALKER_INC(&mip->mi_notify_cb_info); 1380 for (mcb = mip->mi_notify_cb_list; mcb != NULL; 1381 mcb = mcb->mcb_nextp) { 1382 mncb = (mac_notify_cb_t *)mcb->mcb_objp; 1383 mncb->mncb_fn(mncb->mncb_arg, type); 1384 } 1385 MAC_CALLBACK_WALKER_DCR(&mip->mi_notify_cb_info, 1386 &mip->mi_notify_cb_list); 1387 } 1388 1389 mutex_enter(mcbi->mcbi_lockp); 1390 } 1391 1392 mip->mi_state_flags |= MIS_NOTIFY_DONE; 1393 cv_broadcast(&mcbi->mcbi_cv); 1394 1395 /* CALLB_CPR_EXIT drops the lock */ 1396 CALLB_CPR_EXIT(&cprinfo); 1397 thread_exit(); 1398 } 1399 1400 /* 1401 * Signal the i_mac_notify_thread asking it to quit. 1402 * Then wait till it is done. 1403 */ 1404 void 1405 i_mac_notify_exit(mac_impl_t *mip) 1406 { 1407 mac_cb_info_t *mcbi; 1408 1409 mcbi = &mip->mi_notify_cb_info; 1410 1411 mutex_enter(mcbi->mcbi_lockp); 1412 mip->mi_notify_bits = (1 << MAC_NNOTE); 1413 cv_broadcast(&mcbi->mcbi_cv); 1414 1415 1416 while ((mip->mi_notify_thread != NULL) && 1417 !(mip->mi_state_flags & MIS_NOTIFY_DONE)) { 1418 cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp); 1419 } 1420 1421 /* Necessary clean up before doing kmem_cache_free */ 1422 mip->mi_state_flags &= ~MIS_NOTIFY_DONE; 1423 mip->mi_notify_bits = 0; 1424 mip->mi_notify_thread = NULL; 1425 mutex_exit(mcbi->mcbi_lockp); 1426 } 1427 1428 /* 1429 * Entry point invoked by drivers to dynamically add a ring to an 1430 * existing group. 1431 */ 1432 int 1433 mac_group_add_ring(mac_group_handle_t gh, int index) 1434 { 1435 mac_group_t *group = (mac_group_t *)gh; 1436 mac_impl_t *mip = (mac_impl_t *)group->mrg_mh; 1437 int ret; 1438 1439 i_mac_perim_enter(mip); 1440 ret = i_mac_group_add_ring(group, NULL, index); 1441 i_mac_perim_exit(mip); 1442 return (ret); 1443 } 1444 1445 /* 1446 * Entry point invoked by drivers to dynamically remove a ring 1447 * from an existing group. The specified ring handle must no longer 1448 * be used by the driver after a call to this function. 
1449 */ 1450 void 1451 mac_group_rem_ring(mac_group_handle_t gh, mac_ring_handle_t rh) 1452 { 1453 mac_group_t *group = (mac_group_t *)gh; 1454 mac_impl_t *mip = (mac_impl_t *)group->mrg_mh; 1455 1456 i_mac_perim_enter(mip); 1457 i_mac_group_rem_ring(group, (mac_ring_t *)rh, B_TRUE); 1458 i_mac_perim_exit(mip); 1459 } 1460 1461 /* 1462 * mac_prop_info_*() callbacks called from the driver's prefix_propinfo() 1463 * entry points. 1464 */ 1465 1466 void 1467 mac_prop_info_set_default_uint8(mac_prop_info_handle_t ph, uint8_t val) 1468 { 1469 mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph; 1470 1471 /* nothing to do if the caller doesn't want the default value */ 1472 if (pr->pr_default == NULL) 1473 return; 1474 1475 ASSERT(pr->pr_default_size >= sizeof (uint8_t)); 1476 1477 *(uint8_t *)(pr->pr_default) = val; 1478 pr->pr_flags |= MAC_PROP_INFO_DEFAULT; 1479 } 1480 1481 void 1482 mac_prop_info_set_default_uint64(mac_prop_info_handle_t ph, uint64_t val) 1483 { 1484 mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph; 1485 1486 /* nothing to do if the caller doesn't want the default value */ 1487 if (pr->pr_default == NULL) 1488 return; 1489 1490 ASSERT(pr->pr_default_size >= sizeof (uint64_t)); 1491 1492 bcopy(&val, pr->pr_default, sizeof (val)); 1493 1494 pr->pr_flags |= MAC_PROP_INFO_DEFAULT; 1495 } 1496 1497 void 1498 mac_prop_info_set_default_uint32(mac_prop_info_handle_t ph, uint32_t val) 1499 { 1500 mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph; 1501 1502 /* nothing to do if the caller doesn't want the default value */ 1503 if (pr->pr_default == NULL) 1504 return; 1505 1506 ASSERT(pr->pr_default_size >= sizeof (uint32_t)); 1507 1508 bcopy(&val, pr->pr_default, sizeof (val)); 1509 1510 pr->pr_flags |= MAC_PROP_INFO_DEFAULT; 1511 } 1512 1513 void 1514 mac_prop_info_set_default_str(mac_prop_info_handle_t ph, const char *str) 1515 { 1516 mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph; 1517 1518 /* nothing to do if the caller doesn't want the default value */ 1519 if (pr->pr_default == NULL) 1520 return; 1521 1522 if (strlen(str) >= pr->pr_default_size) 1523 pr->pr_errno = ENOBUFS; 1524 else 1525 (void) strlcpy(pr->pr_default, str, pr->pr_default_size); 1526 pr->pr_flags |= MAC_PROP_INFO_DEFAULT; 1527 } 1528 1529 void 1530 mac_prop_info_set_default_link_flowctrl(mac_prop_info_handle_t ph, 1531 link_flowctrl_t val) 1532 { 1533 mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph; 1534 1535 /* nothing to do if the caller doesn't want the default value */ 1536 if (pr->pr_default == NULL) 1537 return; 1538 1539 ASSERT(pr->pr_default_size >= sizeof (link_flowctrl_t)); 1540 1541 bcopy(&val, pr->pr_default, sizeof (val)); 1542 1543 pr->pr_flags |= MAC_PROP_INFO_DEFAULT; 1544 } 1545 1546 void 1547 mac_prop_info_set_default_fec(mac_prop_info_handle_t ph, link_fec_t val) 1548 { 1549 mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph; 1550 1551 /* nothing to do if the caller doesn't want the default value */ 1552 if (pr->pr_default == NULL) 1553 return; 1554 1555 ASSERT(pr->pr_default_size >= sizeof (link_fec_t)); 1556 1557 bcopy(&val, pr->pr_default, sizeof (val)); 1558 1559 pr->pr_flags |= MAC_PROP_INFO_DEFAULT; 1560 } 1561 1562 void 1563 mac_prop_info_set_range_uint32(mac_prop_info_handle_t ph, uint32_t min, 1564 uint32_t max) 1565 { 1566 mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph; 1567 mac_propval_range_t *range = pr->pr_range; 1568 mac_propval_uint32_range_t *range32; 1569 1570 /* nothing to do if the caller doesn't want the range info */ 1571 if 
(range == NULL) 1572 return; 1573 1574 if (pr->pr_range_cur_count++ == 0) { 1575 /* first range */ 1576 pr->pr_flags |= MAC_PROP_INFO_RANGE; 1577 range->mpr_type = MAC_PROPVAL_UINT32; 1578 } else { 1579 /* all ranges of a property should be of the same type */ 1580 ASSERT(range->mpr_type == MAC_PROPVAL_UINT32); 1581 if (pr->pr_range_cur_count > range->mpr_count) { 1582 pr->pr_errno = ENOSPC; 1583 return; 1584 } 1585 } 1586 1587 range32 = range->mpr_range_uint32; 1588 range32[pr->pr_range_cur_count - 1].mpur_min = min; 1589 range32[pr->pr_range_cur_count - 1].mpur_max = max; 1590 } 1591 1592 void 1593 mac_prop_info_set_perm(mac_prop_info_handle_t ph, uint8_t perm) 1594 { 1595 mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph; 1596 1597 pr->pr_perm = perm; 1598 pr->pr_flags |= MAC_PROP_INFO_PERM; 1599 } 1600 1601 void 1602 mac_hcksum_get(const mblk_t *mp, uint32_t *start, uint32_t *stuff, 1603 uint32_t *end, uint32_t *value, uint32_t *flags_ptr) 1604 { 1605 uint32_t flags; 1606 1607 ASSERT(DB_TYPE(mp) == M_DATA); 1608 1609 flags = DB_CKSUMFLAGS(mp) & HCK_FLAGS; 1610 if ((flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) != 0) { 1611 if (value != NULL) 1612 *value = (uint32_t)DB_CKSUM16(mp); 1613 if ((flags & HCK_PARTIALCKSUM) != 0) { 1614 if (start != NULL) 1615 *start = (uint32_t)DB_CKSUMSTART(mp); 1616 if (stuff != NULL) 1617 *stuff = (uint32_t)DB_CKSUMSTUFF(mp); 1618 if (end != NULL) 1619 *end = (uint32_t)DB_CKSUMEND(mp); 1620 } 1621 } 1622 1623 if (flags_ptr != NULL) 1624 *flags_ptr = flags; 1625 } 1626 1627 void 1628 mac_hcksum_set(mblk_t *mp, uint32_t start, uint32_t stuff, uint32_t end, 1629 uint32_t value, uint32_t flags) 1630 { 1631 ASSERT(DB_TYPE(mp) == M_DATA); 1632 1633 DB_CKSUMSTART(mp) = (intptr_t)start; 1634 DB_CKSUMSTUFF(mp) = (intptr_t)stuff; 1635 DB_CKSUMEND(mp) = (intptr_t)end; 1636 DB_CKSUMFLAGS(mp) = (uint16_t)flags; 1637 DB_CKSUM16(mp) = (uint16_t)value; 1638 } 1639 1640 void 1641 mac_hcksum_clone(const mblk_t *src, mblk_t *dst) 1642 { 1643 ASSERT3U(DB_TYPE(src), ==, M_DATA); 1644 ASSERT3U(DB_TYPE(dst), ==, M_DATA); 1645 1646 /* 1647 * Do these assignments unconditionally, rather than only when 1648 * flags is non-zero. This protects a situation where zeroed 1649 * hcksum data does not make the jump onto an mblk_t with 1650 * stale data in those fields. It's important to copy all 1651 * possible flags (HCK_* as well as HW_*) and not just the 1652 * checksum specific flags. Dropping flags during a clone 1653 * could result in dropped packets. If the caller has good 1654 * reason to drop those flags then it should do it manually, 1655 * after the clone. 
1656 */ 1657 DB_CKSUMFLAGS(dst) = DB_CKSUMFLAGS(src); 1658 DB_CKSUMSTART(dst) = DB_CKSUMSTART(src); 1659 DB_CKSUMSTUFF(dst) = DB_CKSUMSTUFF(src); 1660 DB_CKSUMEND(dst) = DB_CKSUMEND(src); 1661 DB_CKSUM16(dst) = DB_CKSUM16(src); 1662 DB_LSOMSS(dst) = DB_LSOMSS(src); 1663 } 1664 1665 void 1666 mac_lso_get(mblk_t *mp, uint32_t *mss, uint32_t *flags) 1667 { 1668 ASSERT(DB_TYPE(mp) == M_DATA); 1669 1670 if (flags != NULL) { 1671 *flags = DB_CKSUMFLAGS(mp) & HW_LSO; 1672 if ((*flags != 0) && (mss != NULL)) 1673 *mss = (uint32_t)DB_LSOMSS(mp); 1674 } 1675 } 1676 1677 void 1678 mac_transceiver_info_set_present(mac_transceiver_info_t *infop, 1679 boolean_t present) 1680 { 1681 infop->mti_present = present; 1682 } 1683 1684 void 1685 mac_transceiver_info_set_usable(mac_transceiver_info_t *infop, 1686 boolean_t usable) 1687 { 1688 infop->mti_usable = usable; 1689 } 1690 1691 /* 1692 * We should really keep track of our offset and not walk everything every 1693 * time. I can't imagine that this will be kind to us at high packet rates; 1694 * however, for the moment, let's leave that. 1695 * 1696 * This walks a message block chain without pulling up to fill in the context 1697 * information. Note that the data we care about could be hidden across more 1698 * than one mblk_t. 1699 */ 1700 static int 1701 mac_meoi_get_uint8(mblk_t *mp, off_t off, uint8_t *out) 1702 { 1703 size_t mpsize; 1704 uint8_t *bp; 1705 1706 mpsize = msgsize(mp); 1707 /* Check for overflow */ 1708 if (off + sizeof (uint16_t) > mpsize) 1709 return (-1); 1710 1711 mpsize = MBLKL(mp); 1712 while (off >= mpsize) { 1713 mp = mp->b_cont; 1714 off -= mpsize; 1715 mpsize = MBLKL(mp); 1716 } 1717 1718 bp = mp->b_rptr + off; 1719 *out = *bp; 1720 return (0); 1721 1722 } 1723 1724 static int 1725 mac_meoi_get_uint16(mblk_t *mp, off_t off, uint16_t *out) 1726 { 1727 size_t mpsize; 1728 uint8_t *bp; 1729 1730 mpsize = msgsize(mp); 1731 /* Check for overflow */ 1732 if (off + sizeof (uint16_t) > mpsize) 1733 return (-1); 1734 1735 mpsize = MBLKL(mp); 1736 while (off >= mpsize) { 1737 mp = mp->b_cont; 1738 off -= mpsize; 1739 mpsize = MBLKL(mp); 1740 } 1741 1742 /* 1743 * Data is in network order. Note the second byte of data might be in 1744 * the next mp. 1745 */ 1746 bp = mp->b_rptr + off; 1747 *out = *bp << 8; 1748 if (off + 1 == mpsize) { 1749 mp = mp->b_cont; 1750 bp = mp->b_rptr; 1751 } else { 1752 bp++; 1753 } 1754 1755 *out |= *bp; 1756 return (0); 1757 1758 } 1759 1760 static boolean_t 1761 mac_meoi_ip6eh_proto(uint8_t id) 1762 { 1763 switch (id) { 1764 case IPPROTO_HOPOPTS: 1765 case IPPROTO_ROUTING: 1766 case IPPROTO_FRAGMENT: 1767 case IPPROTO_AH: 1768 case IPPROTO_DSTOPTS: 1769 case IPPROTO_MH: 1770 case IPPROTO_HIP: 1771 case IPPROTO_SHIM6: 1772 /* Currently known extension headers */ 1773 return (B_TRUE); 1774 case IPPROTO_ESP: 1775 /* 1776 * While the IANA protocol numbers listing notes ESP as an IPv6 1777 * extension header, we cannot effectively parse it like one. 1778 * 1779 * For now, mac_ether_offload_info() will report it as the L4 1780 * protocol for a parsed packet containing this EH. 
1781 */ 1782 default: 1783 return (B_FALSE); 1784 } 1785 } 1786 1787 int 1788 mac_ether_offload_info(mblk_t *mp, mac_ether_offload_info_t *meoi) 1789 { 1790 size_t off; 1791 uint16_t ether, iplen; 1792 uint8_t ipproto, ip4verlen, l4len, maclen; 1793 1794 bzero(meoi, sizeof (mac_ether_offload_info_t)); 1795 1796 const size_t pktlen = msgsize(mp); 1797 meoi->meoi_len = pktlen; 1798 off = offsetof(struct ether_header, ether_type); 1799 if (mac_meoi_get_uint16(mp, off, ðer) != 0) 1800 return (-1); 1801 1802 if (ether == ETHERTYPE_VLAN) { 1803 off = offsetof(struct ether_vlan_header, ether_type); 1804 if (mac_meoi_get_uint16(mp, off, ðer) != 0) 1805 return (-1); 1806 meoi->meoi_flags |= MEOI_VLAN_TAGGED; 1807 maclen = sizeof (struct ether_vlan_header); 1808 } else { 1809 maclen = sizeof (struct ether_header); 1810 } 1811 if (maclen > pktlen) 1812 return (-1); 1813 meoi->meoi_flags |= MEOI_L2INFO_SET; 1814 meoi->meoi_l2hlen = maclen; 1815 meoi->meoi_l3proto = ether; 1816 1817 switch (ether) { 1818 case ETHERTYPE_IP: 1819 /* 1820 * For IPv4 we need to get the length of the header, as it can 1821 * be variable. 1822 */ 1823 off = offsetof(ipha_t, ipha_version_and_hdr_length) + maclen; 1824 if (mac_meoi_get_uint8(mp, off, &ip4verlen) != 0) 1825 return (-1); 1826 ip4verlen &= 0x0f; 1827 if (ip4verlen < 5 || ip4verlen > 0x0f) 1828 return (-1); 1829 iplen = ip4verlen * 4; 1830 off = offsetof(ipha_t, ipha_protocol) + maclen; 1831 if (mac_meoi_get_uint8(mp, off, &ipproto) == -1) 1832 return (-1); 1833 break; 1834 case ETHERTYPE_IPV6: 1835 iplen = sizeof (ip6_t); 1836 off = offsetof(ip6_t, ip6_nxt) + maclen; 1837 if (mac_meoi_get_uint8(mp, off, &ipproto) == -1) 1838 return (-1); 1839 /* Chase any extension headers present in packet */ 1840 while (mac_meoi_ip6eh_proto(ipproto)) { 1841 uint8_t len_val, next_proto; 1842 uint16_t eh_len; 1843 1844 off = maclen + iplen; 1845 if (mac_meoi_get_uint8(mp, off, &next_proto) == -1) 1846 return (-1); 1847 if (ipproto == IPPROTO_FRAGMENT) { 1848 /* 1849 * The Fragment extension header bears a 1850 * predefined fixed length, rather than 1851 * communicating it through the EH itself. 1852 */ 1853 eh_len = 8; 1854 } else if (ipproto == IPPROTO_AH) { 1855 /* 1856 * The length of the IP Authentication EH is 1857 * stored as (n + 2) * 32-bits, where 'n' is the 1858 * recorded EH length field 1859 */ 1860 off += 1; 1861 if (mac_meoi_get_uint8(mp, off, &len_val) == -1) 1862 return (-1); 1863 eh_len = ((uint16_t)len_val + 2) * 4; 1864 } else { 1865 /* 1866 * All other EHs should follow the sizing 1867 * formula of (n + 1) * 64-bits, where 'n' is 1868 * the recorded EH length field. 1869 */ 1870 off += 1; 1871 if (mac_meoi_get_uint8(mp, off, &len_val) == -1) 1872 return (-1); 1873 eh_len = ((uint16_t)len_val + 1) * 8; 1874 } 1875 /* 1876 * Protect against overflow in the case of a very 1877 * contrived packet. 
			 */
			if ((iplen + eh_len) < iplen) {
				return (-1);
			}

			iplen += eh_len;
			ipproto = next_proto;
		}
		break;
	default:
		return (0);
	}
	if (((size_t)maclen + (size_t)iplen) > pktlen)
		return (-1);
	meoi->meoi_l3hlen = iplen;
	meoi->meoi_l4proto = ipproto;
	meoi->meoi_flags |= MEOI_L3INFO_SET;

	switch (ipproto) {
	case IPPROTO_TCP:
		off = offsetof(tcph_t, th_offset_and_rsrvd) + maclen + iplen;
		if (mac_meoi_get_uint8(mp, off, &l4len) == -1)
			return (-1);
		l4len = (l4len & 0xf0) >> 4;
		if (l4len < 5 || l4len > 0xf)
			return (-1);
		l4len *= 4;
		break;
	case IPPROTO_UDP:
		l4len = sizeof (struct udphdr);
		break;
	case IPPROTO_SCTP:
		l4len = sizeof (sctp_hdr_t);
		break;
	default:
		return (0);
	}

	if (((size_t)maclen + (size_t)iplen + (size_t)l4len) > pktlen)
		return (-1);
	meoi->meoi_l4hlen = l4len;
	meoi->meoi_flags |= MEOI_L4INFO_SET;
	return (0);
}
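/*
 * A hedged example of how a consumer might use mac_ether_offload_info();
 * the variable names are illustrative rather than taken from an existing
 * caller:
 *
 *	mac_ether_offload_info_t meoi;
 *
 *	if (mac_ether_offload_info(mp, &meoi) == 0 &&
 *	    (meoi.meoi_flags & MEOI_L4INFO_SET) != 0) {
 *		size_t hdrlen = meoi.meoi_l2hlen + meoi.meoi_l3hlen +
 *		    meoi.meoi_l4hlen;
 *	}
 *
 * Note that a return value of 0 only means parsing stopped cleanly; the
 * MEOI_L2INFO_SET, MEOI_L3INFO_SET and MEOI_L4INFO_SET flags indicate how
 * far into the headers the parse actually got.
 */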