1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright 2019 Joyent, Inc. 25 * Copyright 2017 OmniTI Computer Consulting, Inc. All rights reserved. 26 * Copyright 2020 RackTop Systems, Inc. 
 * Copyright 2025 Oxide Computer Company
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/id_space.h>
#include <sys/esunddi.h>
#include <sys/stat.h>
#include <sys/mkdev.h>
#include <sys/stream.h>
#include <sys/strsubr.h>
#include <sys/dlpi.h>
#include <sys/modhash.h>
#include <sys/mac.h>
#include <sys/mac_provider.h>
#include <sys/mac_impl.h>
#include <sys/mac_client_impl.h>
#include <sys/mac_client_priv.h>
#include <sys/mac_soft_ring.h>
#include <sys/mac_stat.h>
#include <sys/dld.h>
#include <sys/modctl.h>
#include <sys/fs/dv_node.h>
#include <sys/thread.h>
#include <sys/proc.h>
#include <sys/callb.h>
#include <sys/cpuvar.h>
#include <sys/atomic.h>
#include <sys/sdt.h>
#include <sys/mac_flow.h>
#include <sys/ddi_intr_impl.h>
#include <sys/disp.h>
#include <sys/sdt.h>
#include <sys/stdbool.h>
#include <sys/pattr.h>
#include <sys/strsun.h>
#include <sys/vlan.h>
#include <inet/ip.h>
#include <inet/tcp.h>
#include <netinet/udp.h>
#include <netinet/sctp.h>
#include <netinet/ip_icmp.h>
#include <netinet/icmp6.h>

/*
 * MAC Provider Interface.
 *
 * Interface for GLDv3 compatible NIC drivers.
 */

/*
 * Entry point of the per-MAC notification thread that mac_register()
 * starts with thread_create().
 */
static void i_mac_notify_thread(void *);

/* Signature of a framework-default notification callback. */
typedef void (*mac_notify_default_cb_fn_t)(mac_impl_t *);

/*
 * Framework-default callbacks, one slot per notification type.  A NULL
 * slot means there is no default callback for that type; only
 * MAC_NOTE_LINK has one (mac_fanout_recompute).  Slots past the last
 * explicit initializer, if MAC_NNOTE is larger than the list below, are
 * implicitly NULL.
 *
 * NOTE(review): the per-entry comments assume the notification enum is
 * declared in exactly this order, and that the table is indexed by
 * notification type from the notification thread -- confirm against the
 * enum definition and i_mac_notify_thread().
 */
static const mac_notify_default_cb_fn_t mac_notify_cb_list[MAC_NNOTE] = {
	mac_fanout_recompute,	/* MAC_NOTE_LINK */
	NULL,		/* MAC_NOTE_UNICST */
	NULL,		/* MAC_NOTE_TX */
	NULL,		/* MAC_NOTE_DEVPROMISC */
	NULL,		/* MAC_NOTE_FASTPATH_FLUSH */
	NULL,		/* MAC_NOTE_SDU_SIZE */
	NULL,		/* MAC_NOTE_MARGIN */
	NULL,		/* MAC_NOTE_CAPAB_CHG */
	NULL		/* MAC_NOTE_LOWLINK */
};

/*
 * Driver support functions.
95 */ 96 97 /* REGISTRATION */ 98 99 mac_register_t * 100 mac_alloc(uint_t mac_version) 101 { 102 mac_register_t *mregp; 103 104 /* 105 * Make sure there isn't a version mismatch between the driver and 106 * the framework. In the future, if multiple versions are 107 * supported, this check could become more sophisticated. 108 */ 109 if (mac_version != MAC_VERSION) 110 return (NULL); 111 112 mregp = kmem_zalloc(sizeof (mac_register_t), KM_SLEEP); 113 mregp->m_version = mac_version; 114 return (mregp); 115 } 116 117 void 118 mac_free(mac_register_t *mregp) 119 { 120 kmem_free(mregp, sizeof (mac_register_t)); 121 } 122 123 /* 124 * Convert a MAC's offload features into the equivalent DB_CKSUMFLAGS 125 * value. 126 */ 127 static uint16_t 128 mac_features_to_flags(mac_handle_t mh) 129 { 130 uint16_t flags = 0; 131 uint32_t cap_sum = 0; 132 mac_capab_lso_t cap_lso; 133 134 if (mac_capab_get(mh, MAC_CAPAB_HCKSUM, &cap_sum)) { 135 if (cap_sum & HCKSUM_IPHDRCKSUM) 136 flags |= HCK_IPV4_HDRCKSUM; 137 138 if (cap_sum & HCKSUM_INET_PARTIAL) 139 flags |= HCK_PARTIALCKSUM; 140 else if (cap_sum & (HCKSUM_INET_FULL_V4 | HCKSUM_INET_FULL_V6)) 141 flags |= HCK_FULLCKSUM; 142 } 143 144 /* 145 * We don't need the information stored in 'cap_lso', but we 146 * need to pass a non-NULL pointer to appease the driver. 147 */ 148 if (mac_capab_get(mh, MAC_CAPAB_LSO, &cap_lso)) 149 flags |= HW_LSO; 150 151 return (flags); 152 } 153 154 /* 155 * mac_register() is how drivers register new MACs with the GLDv3 156 * framework. The mregp argument is allocated by drivers using the 157 * mac_alloc() function, and can be freed using mac_free() immediately upon 158 * return from mac_register(). Upon success (0 return value), the mhp 159 * opaque pointer becomes the driver's handle to its MAC interface, and is 160 * the argument to all other mac module entry points. 
161 */ 162 /* ARGSUSED */ 163 int 164 mac_register(mac_register_t *mregp, mac_handle_t *mhp) 165 { 166 mac_impl_t *mip; 167 mactype_t *mtype; 168 int err = EINVAL; 169 struct devnames *dnp = NULL; 170 uint_t instance; 171 boolean_t style1_created = B_FALSE; 172 boolean_t style2_created = B_FALSE; 173 char *driver; 174 minor_t minor = 0; 175 176 /* A successful call to mac_init_ops() sets the DN_GLDV3_DRIVER flag. */ 177 if (!GLDV3_DRV(ddi_driver_major(mregp->m_dip))) 178 return (EINVAL); 179 180 /* Find the required MAC-Type plugin. */ 181 if ((mtype = mactype_getplugin(mregp->m_type_ident)) == NULL) 182 return (EINVAL); 183 184 /* Create a mac_impl_t to represent this MAC. */ 185 mip = kmem_cache_alloc(i_mac_impl_cachep, KM_SLEEP); 186 187 /* 188 * The mac is not ready for open yet. 189 */ 190 mip->mi_state_flags |= MIS_DISABLED; 191 192 /* 193 * When a mac is registered, the m_instance field can be set to: 194 * 195 * 0: Get the mac's instance number from m_dip. 196 * This is usually used for physical device dips. 197 * 198 * [1 .. MAC_MAX_MINOR-1]: Use the value as the mac's instance number. 199 * For example, when an aggregation is created with the key option, 200 * "key" will be used as the instance number. 201 * 202 * -1: Assign an instance number from [MAC_MAX_MINOR .. MAXMIN-1]. 203 * This is often used when a MAC of a virtual link is registered 204 * (e.g., aggregation when "key" is not specified, or vnic). 205 * 206 * Note that the instance number is used to derive the mi_minor field 207 * of mac_impl_t, which will then be used to derive the name of kstats 208 * and the devfs nodes. The first 2 cases are needed to preserve 209 * backward compatibility. 
210 */ 211 switch (mregp->m_instance) { 212 case 0: 213 instance = ddi_get_instance(mregp->m_dip); 214 break; 215 case ((uint_t)-1): 216 minor = mac_minor_hold(B_TRUE); 217 if (minor == 0) { 218 err = ENOSPC; 219 goto fail; 220 } 221 instance = minor - 1; 222 break; 223 default: 224 instance = mregp->m_instance; 225 if (instance >= MAC_MAX_MINOR) { 226 err = EINVAL; 227 goto fail; 228 } 229 break; 230 } 231 232 mip->mi_minor = (minor_t)(instance + 1); 233 mip->mi_dip = mregp->m_dip; 234 mip->mi_clients_list = NULL; 235 mip->mi_nclients = 0; 236 237 /* Set the default IEEE Port VLAN Identifier */ 238 mip->mi_pvid = 1; 239 240 /* Default bridge link learning protection values */ 241 mip->mi_llimit = 1000; 242 mip->mi_ldecay = 200; 243 244 driver = (char *)ddi_driver_name(mip->mi_dip); 245 246 /* Construct the MAC name as <drvname><instance> */ 247 (void) snprintf(mip->mi_name, sizeof (mip->mi_name), "%s%d", 248 driver, instance); 249 250 mip->mi_driver = mregp->m_driver; 251 252 mip->mi_type = mtype; 253 mip->mi_margin = mregp->m_margin; 254 mip->mi_info.mi_media = mtype->mt_type; 255 mip->mi_info.mi_nativemedia = mtype->mt_nativetype; 256 if (mregp->m_max_sdu <= mregp->m_min_sdu) 257 goto fail; 258 if (mregp->m_multicast_sdu == 0) 259 mregp->m_multicast_sdu = mregp->m_max_sdu; 260 if (mregp->m_multicast_sdu < mregp->m_min_sdu || 261 mregp->m_multicast_sdu > mregp->m_max_sdu) 262 goto fail; 263 mip->mi_sdu_min = mregp->m_min_sdu; 264 mip->mi_sdu_max = mregp->m_max_sdu; 265 mip->mi_sdu_multicast = mregp->m_multicast_sdu; 266 mip->mi_info.mi_addr_length = mip->mi_type->mt_addr_length; 267 /* 268 * If the media supports a broadcast address, cache a pointer to it 269 * in the mac_info_t so that upper layers can use it. 270 */ 271 mip->mi_info.mi_brdcst_addr = mip->mi_type->mt_brdcst_addr; 272 273 mip->mi_v12n_level = mregp->m_v12n; 274 275 /* 276 * Copy the unicast source address into the mac_info_t, but only if 277 * the MAC-Type defines a non-zero address length. 
We need to 278 * handle MAC-Types that have an address length of 0 279 * (point-to-point protocol MACs for example). 280 */ 281 if (mip->mi_type->mt_addr_length > 0) { 282 if (mregp->m_src_addr == NULL) 283 goto fail; 284 mip->mi_info.mi_unicst_addr = 285 kmem_alloc(mip->mi_type->mt_addr_length, KM_SLEEP); 286 bcopy(mregp->m_src_addr, mip->mi_info.mi_unicst_addr, 287 mip->mi_type->mt_addr_length); 288 289 /* 290 * Copy the fixed 'factory' MAC address from the immutable 291 * info. This is taken to be the MAC address currently in 292 * use. 293 */ 294 bcopy(mip->mi_info.mi_unicst_addr, mip->mi_addr, 295 mip->mi_type->mt_addr_length); 296 297 /* 298 * At this point, we should set up the classification 299 * rules etc but we delay it till mac_open() so that 300 * the resource discovery has taken place and we 301 * know someone wants to use the device. Otherwise 302 * memory gets allocated for Rx ring structures even 303 * during probe. 304 */ 305 306 /* Copy the destination address if one is provided. */ 307 if (mregp->m_dst_addr != NULL) { 308 bcopy(mregp->m_dst_addr, mip->mi_dstaddr, 309 mip->mi_type->mt_addr_length); 310 mip->mi_dstaddr_set = B_TRUE; 311 } 312 } else if (mregp->m_src_addr != NULL) { 313 goto fail; 314 } 315 316 /* 317 * The format of the m_pdata is specific to the plugin. It is 318 * passed in as an argument to all of the plugin callbacks. The 319 * driver can update this information by calling 320 * mac_pdata_update(). 321 */ 322 if (mip->mi_type->mt_ops.mtops_ops & MTOPS_PDATA_VERIFY) { 323 /* 324 * Verify if the supplied plugin data is valid. Note that 325 * even if the caller passed in a NULL pointer as plugin data, 326 * we still need to verify if that's valid as the plugin may 327 * require plugin data to function. 
328 */ 329 if (!mip->mi_type->mt_ops.mtops_pdata_verify(mregp->m_pdata, 330 mregp->m_pdata_size)) { 331 goto fail; 332 } 333 if (mregp->m_pdata != NULL) { 334 mip->mi_pdata = 335 kmem_alloc(mregp->m_pdata_size, KM_SLEEP); 336 bcopy(mregp->m_pdata, mip->mi_pdata, 337 mregp->m_pdata_size); 338 mip->mi_pdata_size = mregp->m_pdata_size; 339 } 340 } else if (mregp->m_pdata != NULL) { 341 /* 342 * The caller supplied non-NULL plugin data, but the plugin 343 * does not recognize plugin data. 344 */ 345 err = EINVAL; 346 goto fail; 347 } 348 349 /* 350 * Register the private properties. 351 */ 352 mac_register_priv_prop(mip, mregp->m_priv_props); 353 354 /* 355 * Stash the driver callbacks into the mac_impl_t, but first sanity 356 * check to make sure all mandatory callbacks are set. 357 */ 358 if (mregp->m_callbacks->mc_getstat == NULL || 359 mregp->m_callbacks->mc_start == NULL || 360 mregp->m_callbacks->mc_stop == NULL || 361 mregp->m_callbacks->mc_setpromisc == NULL || 362 mregp->m_callbacks->mc_multicst == NULL) { 363 goto fail; 364 } 365 mip->mi_callbacks = mregp->m_callbacks; 366 367 if (mac_capab_get((mac_handle_t)mip, MAC_CAPAB_LEGACY, 368 &mip->mi_capab_legacy)) { 369 mip->mi_state_flags |= MIS_LEGACY; 370 mip->mi_phy_dev = mip->mi_capab_legacy.ml_dev; 371 } else { 372 mip->mi_phy_dev = makedevice(ddi_driver_major(mip->mi_dip), 373 mip->mi_minor); 374 } 375 376 /* 377 * Allocate a notification thread. thread_create blocks for memory 378 * if needed, it never fails. 379 */ 380 mip->mi_notify_thread = thread_create(NULL, 0, i_mac_notify_thread, 381 mip, 0, &p0, TS_RUN, minclsyspri); 382 383 /* 384 * Cache the DB_CKSUMFLAGS that this MAC supports. 
385 */ 386 mip->mi_tx_cksum_flags = mac_features_to_flags((mac_handle_t)mip); 387 388 /* 389 * Initialize the capabilities 390 */ 391 bzero(&mip->mi_rx_rings_cap, sizeof (mac_capab_rings_t)); 392 bzero(&mip->mi_tx_rings_cap, sizeof (mac_capab_rings_t)); 393 394 if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_VNIC, NULL)) 395 mip->mi_state_flags |= MIS_IS_VNIC; 396 397 if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_AGGR, NULL)) 398 mip->mi_state_flags |= MIS_IS_AGGR; 399 400 if (i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_OVERLAY, NULL)) 401 mip->mi_state_flags |= MIS_IS_OVERLAY; 402 403 mac_addr_factory_init(mip); 404 405 mac_transceiver_init(mip); 406 407 mac_led_init(mip); 408 409 /* 410 * Enforce the virtrualization level registered. 411 */ 412 if (mip->mi_v12n_level & MAC_VIRT_LEVEL1) { 413 if (mac_init_rings(mip, MAC_RING_TYPE_RX) != 0 || 414 mac_init_rings(mip, MAC_RING_TYPE_TX) != 0) 415 goto fail; 416 417 /* 418 * The driver needs to register at least rx rings for this 419 * virtualization level. 420 */ 421 if (mip->mi_rx_groups == NULL) 422 goto fail; 423 } 424 425 /* 426 * The driver must set mc_unicst entry point to NULL when it advertises 427 * CAP_RINGS for rx groups. 428 */ 429 if (mip->mi_rx_groups != NULL) { 430 if (mregp->m_callbacks->mc_unicst != NULL) 431 goto fail; 432 } else { 433 if (mregp->m_callbacks->mc_unicst == NULL) 434 goto fail; 435 } 436 437 /* 438 * Initialize MAC addresses. Must be called after mac_init_rings(). 439 */ 440 mac_init_macaddr(mip); 441 442 mip->mi_share_capab.ms_snum = 0; 443 if (mip->mi_v12n_level & MAC_VIRT_HIO) { 444 (void) mac_capab_get((mac_handle_t)mip, MAC_CAPAB_SHARES, 445 &mip->mi_share_capab); 446 } 447 448 /* 449 * Initialize the kstats for this device. 450 */ 451 mac_driver_stat_create(mip); 452 453 /* Zero out any properties. 
*/ 454 bzero(&mip->mi_resource_props, sizeof (mac_resource_props_t)); 455 456 if (mip->mi_minor <= MAC_MAX_MINOR) { 457 /* Create a style-2 DLPI device */ 458 if (ddi_create_minor_node(mip->mi_dip, driver, S_IFCHR, 0, 459 DDI_NT_NET, CLONE_DEV) != DDI_SUCCESS) 460 goto fail; 461 style2_created = B_TRUE; 462 463 /* Create a style-1 DLPI device */ 464 if (ddi_create_minor_node(mip->mi_dip, mip->mi_name, S_IFCHR, 465 mip->mi_minor, DDI_NT_NET, 0) != DDI_SUCCESS) 466 goto fail; 467 style1_created = B_TRUE; 468 } 469 470 mac_flow_l2tab_create(mip, &mip->mi_flow_tab); 471 472 rw_enter(&i_mac_impl_lock, RW_WRITER); 473 if (mod_hash_insert(i_mac_impl_hash, 474 (mod_hash_key_t)mip->mi_name, (mod_hash_val_t)mip) != 0) { 475 rw_exit(&i_mac_impl_lock); 476 err = EEXIST; 477 goto fail; 478 } 479 480 DTRACE_PROBE2(mac__register, struct devnames *, dnp, 481 (mac_impl_t *), mip); 482 483 /* 484 * Mark the MAC to be ready for open. 485 */ 486 mip->mi_state_flags &= ~MIS_DISABLED; 487 rw_exit(&i_mac_impl_lock); 488 489 atomic_inc_32(&i_mac_impl_count); 490 491 cmn_err(CE_NOTE, "!%s registered", mip->mi_name); 492 *mhp = (mac_handle_t)mip; 493 return (0); 494 495 fail: 496 if (style1_created) 497 ddi_remove_minor_node(mip->mi_dip, mip->mi_name); 498 499 if (style2_created) 500 ddi_remove_minor_node(mip->mi_dip, driver); 501 502 mac_addr_factory_fini(mip); 503 504 /* Clean up registered MAC addresses */ 505 mac_fini_macaddr(mip); 506 507 /* Clean up registered rings */ 508 mac_free_rings(mip, MAC_RING_TYPE_RX); 509 mac_free_rings(mip, MAC_RING_TYPE_TX); 510 511 /* Clean up notification thread */ 512 if (mip->mi_notify_thread != NULL) 513 i_mac_notify_exit(mip); 514 515 if (mip->mi_info.mi_unicst_addr != NULL) { 516 kmem_free(mip->mi_info.mi_unicst_addr, 517 mip->mi_type->mt_addr_length); 518 mip->mi_info.mi_unicst_addr = NULL; 519 } 520 521 mac_driver_stat_delete(mip); 522 523 if (mip->mi_type != NULL) { 524 atomic_dec_32(&mip->mi_type->mt_ref); 525 mip->mi_type = NULL; 526 } 527 528 
if (mip->mi_pdata != NULL) { 529 kmem_free(mip->mi_pdata, mip->mi_pdata_size); 530 mip->mi_pdata = NULL; 531 mip->mi_pdata_size = 0; 532 } 533 534 if (minor != 0) { 535 ASSERT(minor > MAC_MAX_MINOR); 536 mac_minor_rele(minor); 537 } 538 539 mip->mi_state_flags = 0; 540 mac_unregister_priv_prop(mip); 541 542 /* 543 * Clear the state before destroying the mac_impl_t 544 */ 545 mip->mi_state_flags = 0; 546 547 kmem_cache_free(i_mac_impl_cachep, mip); 548 return (err); 549 } 550 551 /* 552 * Unregister from the GLDv3 framework 553 */ 554 int 555 mac_unregister(mac_handle_t mh) 556 { 557 int err; 558 mac_impl_t *mip = (mac_impl_t *)mh; 559 mod_hash_val_t val; 560 mac_margin_req_t *mmr, *nextmmr; 561 562 /* Fail the unregister if there are any open references to this mac. */ 563 if ((err = mac_disable_nowait(mh)) != 0) 564 return (err); 565 566 /* 567 * Clean up notification thread and wait for it to exit. 568 */ 569 i_mac_notify_exit(mip); 570 571 /* 572 * Prior to acquiring the MAC perimeter, remove the MAC instance from 573 * the internal hash table. Such removal means table-walkers that 574 * acquire the perimeter will not do so on behalf of what we are 575 * unregistering, which prevents a deadlock. 576 */ 577 rw_enter(&i_mac_impl_lock, RW_WRITER); 578 (void) mod_hash_remove(i_mac_impl_hash, 579 (mod_hash_key_t)mip->mi_name, &val); 580 rw_exit(&i_mac_impl_lock); 581 ASSERT(mip == (mac_impl_t *)val); 582 583 i_mac_perim_enter(mip); 584 585 /* 586 * There is still resource properties configured over this mac. 
587 */ 588 if (mip->mi_resource_props.mrp_mask != 0) 589 mac_fastpath_enable((mac_handle_t)mip); 590 591 if (mip->mi_minor < MAC_MAX_MINOR + 1) { 592 ddi_remove_minor_node(mip->mi_dip, mip->mi_name); 593 ddi_remove_minor_node(mip->mi_dip, 594 (char *)ddi_driver_name(mip->mi_dip)); 595 } 596 597 ASSERT(mip->mi_nactiveclients == 0 && !(mip->mi_state_flags & 598 MIS_EXCLUSIVE)); 599 600 mac_driver_stat_delete(mip); 601 602 ASSERT(i_mac_impl_count > 0); 603 atomic_dec_32(&i_mac_impl_count); 604 605 if (mip->mi_pdata != NULL) 606 kmem_free(mip->mi_pdata, mip->mi_pdata_size); 607 mip->mi_pdata = NULL; 608 mip->mi_pdata_size = 0; 609 610 /* 611 * Free the list of margin request. 612 */ 613 for (mmr = mip->mi_mmrp; mmr != NULL; mmr = nextmmr) { 614 nextmmr = mmr->mmr_nextp; 615 kmem_free(mmr, sizeof (mac_margin_req_t)); 616 } 617 mip->mi_mmrp = NULL; 618 619 mip->mi_linkstate = mip->mi_lowlinkstate = LINK_STATE_UNKNOWN; 620 kmem_free(mip->mi_info.mi_unicst_addr, mip->mi_type->mt_addr_length); 621 mip->mi_info.mi_unicst_addr = NULL; 622 623 atomic_dec_32(&mip->mi_type->mt_ref); 624 mip->mi_type = NULL; 625 626 /* 627 * Free the primary MAC address. 
628 */ 629 mac_fini_macaddr(mip); 630 631 /* 632 * free all rings 633 */ 634 mac_free_rings(mip, MAC_RING_TYPE_RX); 635 mac_free_rings(mip, MAC_RING_TYPE_TX); 636 637 mac_addr_factory_fini(mip); 638 639 bzero(mip->mi_addr, MAXMACADDRLEN); 640 bzero(mip->mi_dstaddr, MAXMACADDRLEN); 641 mip->mi_dstaddr_set = B_FALSE; 642 643 /* and the flows */ 644 mac_flow_tab_destroy(mip->mi_flow_tab); 645 mip->mi_flow_tab = NULL; 646 647 if (mip->mi_minor > MAC_MAX_MINOR) 648 mac_minor_rele(mip->mi_minor); 649 650 cmn_err(CE_NOTE, "!%s unregistered", mip->mi_name); 651 652 /* 653 * Reset the perim related fields to default values before 654 * kmem_cache_free 655 */ 656 i_mac_perim_exit(mip); 657 mip->mi_state_flags = 0; 658 659 mac_unregister_priv_prop(mip); 660 661 ASSERT(mip->mi_bridge_link == NULL); 662 kmem_cache_free(i_mac_impl_cachep, mip); 663 664 return (0); 665 } 666 667 /* DATA RECEPTION */ 668 669 /* 670 * This function is invoked for packets received by the MAC driver in 671 * interrupt context. The ring generation number provided by the driver 672 * is matched with the ring generation number held in MAC. If they do not 673 * match, received packets are considered stale packets coming from an older 674 * assignment of the ring. Drop them. 675 */ 676 void 677 mac_rx_ring(mac_handle_t mh, mac_ring_handle_t mrh, mblk_t *mp_chain, 678 uint64_t mr_gen_num) 679 { 680 mac_ring_t *mr = (mac_ring_t *)mrh; 681 682 if ((mr != NULL) && (mr->mr_gen_num != mr_gen_num)) { 683 DTRACE_PROBE2(mac__rx__rings__stale__packet, uint64_t, 684 mr->mr_gen_num, uint64_t, mr_gen_num); 685 freemsgchain(mp_chain); 686 return; 687 } 688 mac_rx(mh, (mac_resource_handle_t)mrh, mp_chain); 689 } 690 691 /* 692 * This function is invoked for each packet received by the underlying driver. 693 */ 694 void 695 mac_rx(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain) 696 { 697 mac_impl_t *mip = (mac_impl_t *)mh; 698 699 /* 700 * Check if the link is part of a bridge. 
If not, then we don't need 701 * to take the lock to remain consistent. Make this common case 702 * lock-free and tail-call optimized. 703 */ 704 if (mip->mi_bridge_link == NULL) { 705 mac_rx_common(mh, mrh, mp_chain); 706 } else { 707 /* 708 * Once we take a reference on the bridge link, the bridge 709 * module itself can't unload, so the callback pointers are 710 * stable. 711 */ 712 mutex_enter(&mip->mi_bridge_lock); 713 if ((mh = mip->mi_bridge_link) != NULL) 714 mac_bridge_ref_cb(mh, B_TRUE); 715 mutex_exit(&mip->mi_bridge_lock); 716 if (mh == NULL) { 717 mac_rx_common((mac_handle_t)mip, mrh, mp_chain); 718 } else { 719 mac_bridge_rx_cb(mh, mrh, mp_chain); 720 mac_bridge_ref_cb(mh, B_FALSE); 721 } 722 } 723 } 724 725 /* 726 * Special case function: this allows snooping of packets transmitted and 727 * received by TRILL. By design, they go directly into the TRILL module. 728 */ 729 void 730 mac_trill_snoop(mac_handle_t mh, mblk_t *mp) 731 { 732 mac_impl_t *mip = (mac_impl_t *)mh; 733 734 if (mip->mi_promisc_list != NULL) 735 mac_promisc_dispatch(mip, mp, NULL, B_FALSE); 736 } 737 738 /* 739 * This is the upward reentry point for packets arriving from the bridging 740 * module and from mac_rx for links not part of a bridge. 741 */ 742 void 743 mac_rx_common(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain) 744 { 745 mac_impl_t *mip = (mac_impl_t *)mh; 746 mac_ring_t *mr = (mac_ring_t *)mrh; 747 mac_soft_ring_set_t *mac_srs; 748 mblk_t *bp = mp_chain; 749 750 /* 751 * If there are any promiscuous mode callbacks defined for 752 * this MAC, pass them a copy if appropriate. 753 */ 754 if (mip->mi_promisc_list != NULL) 755 mac_promisc_dispatch(mip, mp_chain, NULL, B_FALSE); 756 757 if (mr != NULL) { 758 /* 759 * If the SRS teardown has started, just return. The 'mr' 760 * continues to be valid until the driver unregisters the MAC. 761 * Hardware classified packets will not make their way up 762 * beyond this point once the teardown has started. 
The driver 763 * is never passed a pointer to a flow entry or SRS or any 764 * structure that can be freed much before mac_unregister. 765 */ 766 mutex_enter(&mr->mr_lock); 767 if ((mr->mr_state != MR_INUSE) || (mr->mr_flag & 768 (MR_INCIPIENT | MR_CONDEMNED | MR_QUIESCE))) { 769 mutex_exit(&mr->mr_lock); 770 freemsgchain(mp_chain); 771 return; 772 } 773 774 /* 775 * The ring is in passthru mode; pass the chain up to 776 * the pseudo ring. 777 */ 778 if (mr->mr_classify_type == MAC_PASSTHRU_CLASSIFIER) { 779 MR_REFHOLD_LOCKED(mr); 780 mutex_exit(&mr->mr_lock); 781 mr->mr_pt_fn(mr->mr_pt_arg1, mr->mr_pt_arg2, mp_chain, 782 B_FALSE); 783 MR_REFRELE(mr); 784 return; 785 } 786 787 /* 788 * The passthru callback should only be set when in 789 * MAC_PASSTHRU_CLASSIFIER mode. 790 */ 791 ASSERT3P(mr->mr_pt_fn, ==, NULL); 792 793 /* 794 * We check if an SRS is controlling this ring. 795 * If so, we can directly call the srs_lower_proc 796 * routine otherwise we need to go through mac_rx_classify 797 * to reach the right place. 798 */ 799 if (mr->mr_classify_type == MAC_HW_CLASSIFIER) { 800 MR_REFHOLD_LOCKED(mr); 801 mutex_exit(&mr->mr_lock); 802 ASSERT3P(mr->mr_srs, !=, NULL); 803 mac_srs = mr->mr_srs; 804 805 /* 806 * This is the fast path. All packets received 807 * on this ring are hardware classified and 808 * share the same MAC header info. 
809 */ 810 mac_srs->srs_rx.sr_lower_proc(mh, 811 (mac_resource_handle_t)mac_srs, mp_chain, B_FALSE); 812 MR_REFRELE(mr); 813 return; 814 } 815 816 mutex_exit(&mr->mr_lock); 817 /* We'll fall through to software classification */ 818 } else { 819 flow_entry_t *flent; 820 int err; 821 822 rw_enter(&mip->mi_rw_lock, RW_READER); 823 if (mip->mi_single_active_client != NULL) { 824 flent = mip->mi_single_active_client->mci_flent_list; 825 FLOW_TRY_REFHOLD(flent, err); 826 rw_exit(&mip->mi_rw_lock); 827 if (err == 0) { 828 (flent->fe_cb_fn)(flent->fe_cb_arg1, 829 flent->fe_cb_arg2, mp_chain, B_FALSE); 830 FLOW_REFRELE(flent); 831 return; 832 } 833 } else { 834 rw_exit(&mip->mi_rw_lock); 835 } 836 } 837 838 if (!FLOW_TAB_EMPTY(mip->mi_flow_tab)) { 839 if ((bp = mac_rx_flow(mh, mrh, bp)) == NULL) 840 return; 841 } 842 843 freemsgchain(bp); 844 } 845 846 /* DATA TRANSMISSION */ 847 848 /* 849 * A driver's notification to resume transmission, in case of a provider 850 * without TX rings. 851 */ 852 void 853 mac_tx_update(mac_handle_t mh) 854 { 855 mac_tx_ring_update(mh, NULL); 856 } 857 858 /* 859 * A driver's notification to resume transmission on the specified TX ring. 860 */ 861 void 862 mac_tx_ring_update(mac_handle_t mh, mac_ring_handle_t rh) 863 { 864 i_mac_tx_srs_notify((mac_impl_t *)mh, rh); 865 } 866 867 /* LINK STATE */ 868 /* 869 * Notify the MAC layer about a link state change 870 */ 871 void 872 mac_link_update(mac_handle_t mh, link_state_t link) 873 { 874 mac_impl_t *mip = (mac_impl_t *)mh; 875 876 /* 877 * Save the link state. 878 */ 879 mip->mi_lowlinkstate = link; 880 881 /* 882 * Send a MAC_NOTE_LOWLINK notification. This tells the notification 883 * thread to deliver both lower and upper notifications. 884 */ 885 i_mac_notify(mip, MAC_NOTE_LOWLINK); 886 } 887 888 /* 889 * Notify the MAC layer about a link state change due to bridging. 
890 */ 891 void 892 mac_link_redo(mac_handle_t mh, link_state_t link) 893 { 894 mac_impl_t *mip = (mac_impl_t *)mh; 895 896 /* 897 * Save the link state. 898 */ 899 mip->mi_linkstate = link; 900 901 /* 902 * Send a MAC_NOTE_LINK notification. Only upper notifications are 903 * made. 904 */ 905 i_mac_notify(mip, MAC_NOTE_LINK); 906 } 907 908 /* MINOR NODE HANDLING */ 909 910 /* 911 * Given a dev_t, return the instance number (PPA) associated with it. 912 * Drivers can use this in their getinfo(9e) implementation to lookup 913 * the instance number (i.e. PPA) of the device, to use as an index to 914 * their own array of soft state structures. 915 * 916 * Returns -1 on error. 917 */ 918 int 919 mac_devt_to_instance(dev_t devt) 920 { 921 return (dld_devt_to_instance(devt)); 922 } 923 924 /* 925 * Drivers that make use of the private minor number space are expected to 926 * provide their own getinfo(9e) entry point. This function simply forwards 927 * to the default MAC framework getinfo(9e) implementation as a convenience 928 * if they don't need any special mapping (mac instance != ddi_get_instance()) 929 */ 930 int 931 mac_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **resp) 932 { 933 return (dld_getinfo(dip, cmd, arg, resp)); 934 } 935 936 /* 937 * This function returns the first minor number that is available for 938 * driver private use. All minor numbers smaller than this are 939 * reserved for GLDv3 use. 940 */ 941 minor_t 942 mac_private_minor(void) 943 { 944 return (MAC_PRIVATE_MINOR); 945 } 946 947 /* OTHER CONTROL INFORMATION */ 948 949 /* 950 * A driver notified us that its primary MAC address has changed. 951 */ 952 void 953 mac_unicst_update(mac_handle_t mh, const uint8_t *addr) 954 { 955 mac_impl_t *mip = (mac_impl_t *)mh; 956 957 if (mip->mi_type->mt_addr_length == 0) 958 return; 959 960 i_mac_perim_enter(mip); 961 962 /* 963 * If address changes, freshen the MAC address value and update 964 * all MAC clients that share this MAC address. 
965 */ 966 if (bcmp(addr, mip->mi_addr, mip->mi_type->mt_addr_length) != 0) { 967 mac_freshen_macaddr(mac_find_macaddr(mip, mip->mi_addr), 968 (uint8_t *)addr); 969 } 970 971 i_mac_perim_exit(mip); 972 973 /* 974 * Send a MAC_NOTE_UNICST notification. 975 */ 976 i_mac_notify(mip, MAC_NOTE_UNICST); 977 } 978 979 void 980 mac_dst_update(mac_handle_t mh, const uint8_t *addr) 981 { 982 mac_impl_t *mip = (mac_impl_t *)mh; 983 984 if (mip->mi_type->mt_addr_length == 0) 985 return; 986 987 i_mac_perim_enter(mip); 988 bcopy(addr, mip->mi_dstaddr, mip->mi_type->mt_addr_length); 989 i_mac_perim_exit(mip); 990 i_mac_notify(mip, MAC_NOTE_DEST); 991 } 992 993 /* 994 * MAC plugin information changed. 995 */ 996 int 997 mac_pdata_update(mac_handle_t mh, void *mac_pdata, size_t dsize) 998 { 999 mac_impl_t *mip = (mac_impl_t *)mh; 1000 1001 /* 1002 * Verify that the plugin supports MAC plugin data and that the 1003 * supplied data is valid. 1004 */ 1005 if (!(mip->mi_type->mt_ops.mtops_ops & MTOPS_PDATA_VERIFY)) 1006 return (EINVAL); 1007 if (!mip->mi_type->mt_ops.mtops_pdata_verify(mac_pdata, dsize)) 1008 return (EINVAL); 1009 1010 if (mip->mi_pdata != NULL) 1011 kmem_free(mip->mi_pdata, mip->mi_pdata_size); 1012 1013 mip->mi_pdata = kmem_alloc(dsize, KM_SLEEP); 1014 bcopy(mac_pdata, mip->mi_pdata, dsize); 1015 mip->mi_pdata_size = dsize; 1016 1017 /* 1018 * Since the MAC plugin data is used to construct MAC headers that 1019 * were cached in fast-path headers, we need to flush fast-path 1020 * information for links associated with this mac. 1021 */ 1022 i_mac_notify(mip, MAC_NOTE_FASTPATH_FLUSH); 1023 return (0); 1024 } 1025 1026 /* 1027 * The mac provider or mac frameowrk calls this function when it wants 1028 * to notify upstream consumers that the capabilities have changed and 1029 * that they should modify their own internal state accordingly. 
1030 * 1031 * We currently have no regard for the fact that a provider could 1032 * decide to drop capabilities which would invalidate pending traffic. 1033 * For example, if one was to disable the Tx checksum offload while 1034 * TCP/IP traffic was being sent by mac clients relying on that 1035 * feature, then those packets would hit the write with missing or 1036 * partial checksums. A proper solution involves not only providing 1037 * notfication, but also performing client quiescing. That is, a capab 1038 * change should be treated as an atomic transaction that forms a 1039 * barrier between traffic relying on the current capabs and traffic 1040 * relying on the new capabs. In practice, simnet is currently the 1041 * only provider that could hit this, and it's an easily avoidable 1042 * situation (and at worst it should only lead to some dropped 1043 * packets). But if we ever want better on-the-fly capab change to 1044 * actual hardware providers, then we should give this update 1045 * mechanism a proper implementation. 1046 */ 1047 void 1048 mac_capab_update(mac_handle_t mh) 1049 { 1050 /* 1051 * Send a MAC_NOTE_CAPAB_CHG notification to alert upstream 1052 * clients to renegotiate capabilities. 1053 */ 1054 i_mac_notify((mac_impl_t *)mh, MAC_NOTE_CAPAB_CHG); 1055 } 1056 1057 /* 1058 * Used by normal drivers to update the max sdu size. 1059 * We need to handle the case of a smaller mi_sdu_multicast 1060 * since this is called by mac_set_mtu() even for drivers that 1061 * have differing unicast and multicast mtu and we don't want to 1062 * increase the multicast mtu by accident in that case. 
1063 */ 1064 int 1065 mac_maxsdu_update(mac_handle_t mh, uint_t sdu_max) 1066 { 1067 mac_impl_t *mip = (mac_impl_t *)mh; 1068 1069 if (sdu_max == 0 || sdu_max < mip->mi_sdu_min) 1070 return (EINVAL); 1071 mip->mi_sdu_max = sdu_max; 1072 if (mip->mi_sdu_multicast > mip->mi_sdu_max) 1073 mip->mi_sdu_multicast = mip->mi_sdu_max; 1074 1075 /* Send a MAC_NOTE_SDU_SIZE notification. */ 1076 i_mac_notify(mip, MAC_NOTE_SDU_SIZE); 1077 return (0); 1078 } 1079 1080 /* 1081 * Version of the above function that is used by drivers that have a different 1082 * max sdu size for multicast/broadcast vs. unicast. 1083 */ 1084 int 1085 mac_maxsdu_update2(mac_handle_t mh, uint_t sdu_max, uint_t sdu_multicast) 1086 { 1087 mac_impl_t *mip = (mac_impl_t *)mh; 1088 1089 if (sdu_max == 0 || sdu_max < mip->mi_sdu_min) 1090 return (EINVAL); 1091 if (sdu_multicast == 0) 1092 sdu_multicast = sdu_max; 1093 if (sdu_multicast > sdu_max || sdu_multicast < mip->mi_sdu_min) 1094 return (EINVAL); 1095 mip->mi_sdu_max = sdu_max; 1096 mip->mi_sdu_multicast = sdu_multicast; 1097 1098 /* Send a MAC_NOTE_SDU_SIZE notification. 
*/ 1099 i_mac_notify(mip, MAC_NOTE_SDU_SIZE); 1100 return (0); 1101 } 1102 1103 static void 1104 mac_ring_intr_retarget(mac_group_t *group, mac_ring_t *ring) 1105 { 1106 mac_client_impl_t *mcip; 1107 flow_entry_t *flent; 1108 mac_soft_ring_set_t *mac_rx_srs; 1109 mac_cpus_t *srs_cpu; 1110 int i; 1111 1112 if (((mcip = MAC_GROUP_ONLY_CLIENT(group)) != NULL) && 1113 (!ring->mr_info.mri_intr.mi_ddi_shared)) { 1114 /* interrupt can be re-targeted */ 1115 ASSERT(group->mrg_state == MAC_GROUP_STATE_RESERVED); 1116 flent = mcip->mci_flent; 1117 if (ring->mr_type == MAC_RING_TYPE_RX) { 1118 for (i = 0; i < flent->fe_rx_srs_cnt; i++) { 1119 mac_rx_srs = flent->fe_rx_srs[i]; 1120 if (mac_rx_srs->srs_ring != ring) 1121 continue; 1122 srs_cpu = &mac_rx_srs->srs_cpu; 1123 mutex_enter(&cpu_lock); 1124 mac_rx_srs_retarget_intr(mac_rx_srs, 1125 srs_cpu->mc_rx_intr_cpu); 1126 mutex_exit(&cpu_lock); 1127 break; 1128 } 1129 } else { 1130 if (flent->fe_tx_srs != NULL) { 1131 mutex_enter(&cpu_lock); 1132 mac_tx_srs_retarget_intr( 1133 flent->fe_tx_srs); 1134 mutex_exit(&cpu_lock); 1135 } 1136 } 1137 } 1138 } 1139 1140 /* 1141 * Clients like aggr create pseudo rings (mac_ring_t) and expose them to 1142 * their clients. There is a 1-1 mapping pseudo ring and the hardware 1143 * ring. ddi interrupt handles are exported from the hardware ring to 1144 * the pseudo ring. Thus when the interrupt handle changes, clients of 1145 * aggr that are using the handle need to use the new handle and 1146 * re-target their interrupts. 
 */
static void
mac_pseudo_ring_intr_retarget(mac_impl_t *mip, mac_ring_t *ring,
    ddi_intr_handle_t ddh)
{
	mac_ring_t *pring;
	mac_group_t *pgroup;
	mac_impl_t *pmip;
	char macname[MAXNAMELEN];
	mac_perim_handle_t p_mph;
	uint64_t saved_gen_num;

	/*
	 * The caller enters mip's perimeter before calling us. On every
	 * return path below that perimeter has been exited, so the caller
	 * must not exit it again.
	 */
again:
	/*
	 * Snapshot the pseudo ring, its group and owning MAC, the ring's
	 * generation number and the owner's name before mip's perimeter
	 * is dropped.
	 */
	pring = (mac_ring_t *)ring->mr_prh;
	pgroup = (mac_group_t *)pring->mr_gh;
	pmip = (mac_impl_t *)pgroup->mrg_mh;
	saved_gen_num = ring->mr_gen_num;
	(void) strlcpy(macname, pmip->mi_name, MAXNAMELEN);
	/*
	 * We need to enter aggr's perimeter. The locking hierarchy
	 * dictates that aggr's perimeter should be entered first
	 * and then the port's perimeter. So drop the port's
	 * perimeter, enter aggr's and then re-enter port's
	 * perimeter.
	 */
	i_mac_perim_exit(mip);
	/*
	 * While we know pmip is the aggr's mip, there is a
	 * possibility that aggr could have unregistered by
	 * the time we exit port's perimeter (mip) and
	 * enter aggr's perimeter (pmip). To avoid that
	 * scenario, enter aggr's perimeter using its name.
	 */
	if (mac_perim_enter_by_macname(macname, &p_mph) != 0)
		return;
	i_mac_perim_enter(mip);
	/*
	 * Check if the ring got assigned to another aggregation before
	 * we could enter aggr's and the port's perimeter. When a ring
	 * gets deleted from an aggregation, it calls mac_stop_ring()
	 * which increments the generation number. So checking
	 * generation number will be enough.
	 */
	if (ring->mr_gen_num != saved_gen_num && ring->mr_prh != NULL) {
		/*
		 * Release both perimeters, then re-take only the port's so
		 * that the retry starts from the same state as the first pass.
		 */
		i_mac_perim_exit(mip);
		mac_perim_exit(p_mph);
		i_mac_perim_enter(mip);
		goto again;
	}

	/* Check if pseudo ring is still present */
	if (ring->mr_prh != NULL) {
		/* Mirror the hardware ring's handle and shared state. */
		pring->mr_info.mri_intr.mi_ddi_handle = ddh;
		pring->mr_info.mri_intr.mi_ddi_shared =
		    ring->mr_info.mri_intr.mi_ddi_shared;
		/* Only a new (non-NULL) handle needs re-targeting. */
		if (ddh != NULL)
			mac_ring_intr_retarget(pgroup, pring);
	}
	i_mac_perim_exit(mip);
	mac_perim_exit(p_mph);
}
/*
 * API called by driver to provide new interrupt handle for TX/RX rings.
 * This usually happens when IRM (Interrupt Resource Management)
 * framework either gives the driver more MSI-x interrupts or takes
 * away MSI-x interrupts from the driver.
 *
 * `mrh` is the ring being updated; `ddh` is its new interrupt handle, or
 * NULL when the interrupt is being reset.
 */
void
mac_ring_intr_set(mac_ring_handle_t mrh, ddi_intr_handle_t ddh)
{
	mac_ring_t	*ring = (mac_ring_t *)mrh;
	mac_group_t	*group = (mac_group_t *)ring->mr_gh;
	mac_impl_t	*mip = (mac_impl_t *)group->mrg_mh;

	i_mac_perim_enter(mip);
	ring->mr_info.mri_intr.mi_ddi_handle = ddh;
	if (ddh == NULL) {
		/* Interrupts being reset */
		ring->mr_info.mri_intr.mi_ddi_shared = B_FALSE;
		if (ring->mr_prh != NULL) {
			/*
			 * mac_pseudo_ring_intr_retarget() exits mip's
			 * perimeter itself, hence the direct return.
			 */
			mac_pseudo_ring_intr_retarget(mip, ring, ddh);
			return;
		}
	} else {
		/* New interrupt handle */
		/*
		 * NOTE(review): mac_compare_ddi_handle() presumably marks the
		 * ring's interrupt as shared when another ring in the given
		 * groups uses the same handle -- confirm against its
		 * definition. The Tx groups are only checked when the Rx pass
		 * left mi_ddi_shared clear.
		 */
		mac_compare_ddi_handle(mip->mi_rx_groups,
		    mip->mi_rx_group_count, ring);
		if (!ring->mr_info.mri_intr.mi_ddi_shared) {
			mac_compare_ddi_handle(mip->mi_tx_groups,
			    mip->mi_tx_group_count, ring);
		}
		if (ring->mr_prh != NULL) {
			/* As above: the callee exits mip's perimeter. */
			mac_pseudo_ring_intr_retarget(mip, ring, ddh);
			return;
		} else {
			mac_ring_intr_retarget(group, ring);
		}
	}
	i_mac_perim_exit(mip);
}

/* PRIVATE FUNCTIONS, FOR INTERNAL USE ONLY */

/*
 * Log a change of the low-level link state (mi_lowlinkstate) and record it
 * in mi_lastlowlinkstate so that the same state is not logged twice.
 */
static void
i_mac_log_link_state(mac_impl_t *mip)
{
	/*
	 * If no change, then it is not interesting.
	 */
	if (mip->mi_lastlowlinkstate == mip->mi_lowlinkstate)
		return;

	switch (mip->mi_lowlinkstate) {
	case LINK_STATE_UP:
		if (mip->mi_type->mt_ops.mtops_ops & MTOPS_LINK_DETAILS) {
			/* Buffer filled in by the MAC type plugin. */
			char det[200];

			mip->mi_type->mt_ops.mtops_link_details(det,
			    sizeof (det), (mac_handle_t)mip, mip->mi_pdata);

			cmn_err(CE_NOTE, "!%s link up, %s", mip->mi_name, det);
		} else {
			cmn_err(CE_NOTE, "!%s link up", mip->mi_name);
		}
		break;

	case LINK_STATE_DOWN:
		/*
		 * Only transitions from UP to DOWN are interesting
		 */
		if (mip->mi_lastlowlinkstate != LINK_STATE_UNKNOWN)
			cmn_err(CE_NOTE, "!%s link down", mip->mi_name);
		break;

	case LINK_STATE_UNKNOWN:
		/*
		 * This case is normally not interesting.
		 */
		break;
	}
	mip->mi_lastlowlinkstate = mip->mi_lowlinkstate;
}

/*
 * Main routine for the callbacks notifications thread
 *
 * `arg` is the mac_impl_t whose mi_notify_bits are serviced. The thread
 * sleeps on mcbi_cv while no bits are pending, and exits once the quit bit
 * (1 << MAC_NNOTE) is seen.
 */
static void
i_mac_notify_thread(void *arg)
{
	mac_impl_t	*mip = arg;
	callb_cpr_t	cprinfo;
	mac_cb_t	*mcb;
	mac_cb_info_t	*mcbi;
	mac_notify_cb_t	*mncb;

	mcbi = &mip->mi_notify_cb_info;
	CALLB_CPR_INIT(&cprinfo, mcbi->mcbi_lockp, callb_generic_cpr,
	    "i_mac_notify_thread");

	mutex_enter(mcbi->mcbi_lockp);

	for (;;) {
		uint32_t	bits;
		uint32_t	type;

		/* mi_notify_bits is read and cleared under mcbi_lockp. */
		bits = mip->mi_notify_bits;
		if (bits == 0) {
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp);
			CALLB_CPR_SAFE_END(&cprinfo, mcbi->mcbi_lockp);
			continue;
		}
		mip->mi_notify_bits = 0;
		if ((bits & (1 << MAC_NNOTE)) != 0) {
			/* request to quit */
			ASSERT(mip->mi_state_flags & MIS_DISABLED);
			break;
		}

		/* Callbacks below run without mcbi_lockp held. */
		mutex_exit(mcbi->mcbi_lockp);

		/*
		 * Log link changes on the actual link, but then do reports on
		 * synthetic state (if part of a bridge).
		 */
		if ((bits & (1 << MAC_NOTE_LOWLINK)) != 0) {
			link_state_t newstate;
			mac_handle_t mh;

			i_mac_log_link_state(mip);
			newstate = mip->mi_lowlinkstate;
			/*
			 * Unlocked peek first; mi_bridge_link is re-read
			 * under mi_bridge_lock before it is used.
			 */
			if (mip->mi_bridge_link != NULL) {
				mutex_enter(&mip->mi_bridge_lock);
				if ((mh = mip->mi_bridge_link) != NULL) {
					newstate = mac_bridge_ls_cb(mh,
					    newstate);
				}
				mutex_exit(&mip->mi_bridge_lock);
			}
			/*
			 * A change in the reported state additionally raises
			 * MAC_NOTE_LINK for this pass.
			 */
			if (newstate != mip->mi_linkstate) {
				mip->mi_linkstate = newstate;
				bits |= 1 << MAC_NOTE_LINK;
			}
		}

		/*
		 * Depending on which capabs have changed, the Tx
		 * checksum flags may also need to be updated.
		 */
		if ((bits & (1 << MAC_NOTE_CAPAB_CHG)) != 0) {
			mac_perim_handle_t mph;
			mac_handle_t mh = (mac_handle_t)mip;

			mac_perim_enter_by_mh(mh, &mph);
			mip->mi_tx_cksum_flags = mac_features_to_flags(mh);
			mac_perim_exit(mph);
		}

		/*
		 * Do notification callbacks for each notification type.
		 */
		for (type = 0; type < MAC_NNOTE; type++) {
			if ((bits & (1 << type)) == 0) {
				continue;
			}

			/* Built-in handler for this type, if any. */
			if (mac_notify_cb_list[type] != NULL)
				(*mac_notify_cb_list[type])(mip);

			/*
			 * Walk the list of notifications.
			 */
			MAC_CALLBACK_WALKER_INC(&mip->mi_notify_cb_info);
			for (mcb = mip->mi_notify_cb_list; mcb != NULL;
			    mcb = mcb->mcb_nextp) {
				mncb = (mac_notify_cb_t *)mcb->mcb_objp;
				mncb->mncb_fn(mncb->mncb_arg, type);
			}
			MAC_CALLBACK_WALKER_DCR(&mip->mi_notify_cb_info,
			    &mip->mi_notify_cb_list);
		}

		mutex_enter(mcbi->mcbi_lockp);
	}

	/* Tell i_mac_notify_exit() (waiting on mcbi_cv) that we are done. */
	mip->mi_state_flags |= MIS_NOTIFY_DONE;
	cv_broadcast(&mcbi->mcbi_cv);

	/* CALLB_CPR_EXIT drops the lock */
	CALLB_CPR_EXIT(&cprinfo);
	thread_exit();
}

/*
 * Signal the i_mac_notify_thread asking it to quit.
 * Then wait till it is done.
1406 */ 1407 void 1408 i_mac_notify_exit(mac_impl_t *mip) 1409 { 1410 mac_cb_info_t *mcbi; 1411 1412 mcbi = &mip->mi_notify_cb_info; 1413 1414 mutex_enter(mcbi->mcbi_lockp); 1415 mip->mi_notify_bits = (1 << MAC_NNOTE); 1416 cv_broadcast(&mcbi->mcbi_cv); 1417 1418 1419 while ((mip->mi_notify_thread != NULL) && 1420 !(mip->mi_state_flags & MIS_NOTIFY_DONE)) { 1421 cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp); 1422 } 1423 1424 /* Necessary clean up before doing kmem_cache_free */ 1425 mip->mi_state_flags &= ~MIS_NOTIFY_DONE; 1426 mip->mi_notify_bits = 0; 1427 mip->mi_notify_thread = NULL; 1428 mutex_exit(mcbi->mcbi_lockp); 1429 } 1430 1431 /* 1432 * Entry point invoked by drivers to dynamically add a ring to an 1433 * existing group. 1434 */ 1435 int 1436 mac_group_add_ring(mac_group_handle_t gh, int index) 1437 { 1438 mac_group_t *group = (mac_group_t *)gh; 1439 mac_impl_t *mip = (mac_impl_t *)group->mrg_mh; 1440 int ret; 1441 1442 i_mac_perim_enter(mip); 1443 ret = i_mac_group_add_ring(group, NULL, index); 1444 i_mac_perim_exit(mip); 1445 return (ret); 1446 } 1447 1448 /* 1449 * Entry point invoked by drivers to dynamically remove a ring 1450 * from an existing group. The specified ring handle must no longer 1451 * be used by the driver after a call to this function. 1452 */ 1453 void 1454 mac_group_rem_ring(mac_group_handle_t gh, mac_ring_handle_t rh) 1455 { 1456 mac_group_t *group = (mac_group_t *)gh; 1457 mac_impl_t *mip = (mac_impl_t *)group->mrg_mh; 1458 1459 i_mac_perim_enter(mip); 1460 i_mac_group_rem_ring(group, (mac_ring_t *)rh, B_TRUE); 1461 i_mac_perim_exit(mip); 1462 } 1463 1464 /* 1465 * mac_prop_info_*() callbacks called from the driver's prefix_propinfo() 1466 * entry points. 
1467 */ 1468 1469 void 1470 mac_prop_info_set_default_uint8(mac_prop_info_handle_t ph, uint8_t val) 1471 { 1472 mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph; 1473 1474 /* nothing to do if the caller doesn't want the default value */ 1475 if (pr->pr_default == NULL) 1476 return; 1477 1478 ASSERT(pr->pr_default_size >= sizeof (uint8_t)); 1479 1480 *(uint8_t *)(pr->pr_default) = val; 1481 pr->pr_flags |= MAC_PROP_INFO_DEFAULT; 1482 } 1483 1484 void 1485 mac_prop_info_set_default_uint64(mac_prop_info_handle_t ph, uint64_t val) 1486 { 1487 mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph; 1488 1489 /* nothing to do if the caller doesn't want the default value */ 1490 if (pr->pr_default == NULL) 1491 return; 1492 1493 ASSERT(pr->pr_default_size >= sizeof (uint64_t)); 1494 1495 bcopy(&val, pr->pr_default, sizeof (val)); 1496 1497 pr->pr_flags |= MAC_PROP_INFO_DEFAULT; 1498 } 1499 1500 void 1501 mac_prop_info_set_default_uint32(mac_prop_info_handle_t ph, uint32_t val) 1502 { 1503 mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph; 1504 1505 /* nothing to do if the caller doesn't want the default value */ 1506 if (pr->pr_default == NULL) 1507 return; 1508 1509 ASSERT(pr->pr_default_size >= sizeof (uint32_t)); 1510 1511 bcopy(&val, pr->pr_default, sizeof (val)); 1512 1513 pr->pr_flags |= MAC_PROP_INFO_DEFAULT; 1514 } 1515 1516 void 1517 mac_prop_info_set_default_str(mac_prop_info_handle_t ph, const char *str) 1518 { 1519 mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph; 1520 1521 /* nothing to do if the caller doesn't want the default value */ 1522 if (pr->pr_default == NULL) 1523 return; 1524 1525 if (strlen(str) >= pr->pr_default_size) 1526 pr->pr_errno = ENOBUFS; 1527 else 1528 (void) strlcpy(pr->pr_default, str, pr->pr_default_size); 1529 pr->pr_flags |= MAC_PROP_INFO_DEFAULT; 1530 } 1531 1532 void 1533 mac_prop_info_set_default_link_flowctrl(mac_prop_info_handle_t ph, 1534 link_flowctrl_t val) 1535 { 1536 mac_prop_info_state_t *pr = 
(mac_prop_info_state_t *)ph; 1537 1538 /* nothing to do if the caller doesn't want the default value */ 1539 if (pr->pr_default == NULL) 1540 return; 1541 1542 ASSERT(pr->pr_default_size >= sizeof (link_flowctrl_t)); 1543 1544 bcopy(&val, pr->pr_default, sizeof (val)); 1545 1546 pr->pr_flags |= MAC_PROP_INFO_DEFAULT; 1547 } 1548 1549 void 1550 mac_prop_info_set_default_fec(mac_prop_info_handle_t ph, link_fec_t val) 1551 { 1552 mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph; 1553 1554 /* nothing to do if the caller doesn't want the default value */ 1555 if (pr->pr_default == NULL) 1556 return; 1557 1558 ASSERT(pr->pr_default_size >= sizeof (link_fec_t)); 1559 1560 bcopy(&val, pr->pr_default, sizeof (val)); 1561 1562 pr->pr_flags |= MAC_PROP_INFO_DEFAULT; 1563 } 1564 1565 void 1566 mac_prop_info_set_range_uint32(mac_prop_info_handle_t ph, uint32_t min, 1567 uint32_t max) 1568 { 1569 mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph; 1570 mac_propval_range_t *range = pr->pr_range; 1571 mac_propval_uint32_range_t *range32; 1572 1573 /* nothing to do if the caller doesn't want the range info */ 1574 if (range == NULL) 1575 return; 1576 1577 if (pr->pr_range_cur_count++ == 0) { 1578 /* first range */ 1579 pr->pr_flags |= MAC_PROP_INFO_RANGE; 1580 range->mpr_type = MAC_PROPVAL_UINT32; 1581 } else { 1582 /* all ranges of a property should be of the same type */ 1583 ASSERT(range->mpr_type == MAC_PROPVAL_UINT32); 1584 if (pr->pr_range_cur_count > range->mpr_count) { 1585 pr->pr_errno = ENOSPC; 1586 return; 1587 } 1588 } 1589 1590 range32 = range->mpr_range_uint32; 1591 range32[pr->pr_range_cur_count - 1].mpur_min = min; 1592 range32[pr->pr_range_cur_count - 1].mpur_max = max; 1593 } 1594 1595 void 1596 mac_prop_info_set_perm(mac_prop_info_handle_t ph, uint8_t perm) 1597 { 1598 mac_prop_info_state_t *pr = (mac_prop_info_state_t *)ph; 1599 1600 pr->pr_perm = perm; 1601 pr->pr_flags |= MAC_PROP_INFO_PERM; 1602 } 1603 1604 void 1605 mac_hcksum_get(const mblk_t 
*mp, uint32_t *start, uint32_t *stuff, 1606 uint32_t *end, uint32_t *value, uint32_t *flags_ptr) 1607 { 1608 uint32_t flags; 1609 1610 ASSERT(DB_TYPE(mp) == M_DATA); 1611 1612 flags = DB_CKSUMFLAGS(mp) & HCK_FLAGS; 1613 if ((flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) != 0) { 1614 if (value != NULL) 1615 *value = (uint32_t)DB_CKSUM16(mp); 1616 if ((flags & HCK_PARTIALCKSUM) != 0) { 1617 if (start != NULL) 1618 *start = (uint32_t)DB_CKSUMSTART(mp); 1619 if (stuff != NULL) 1620 *stuff = (uint32_t)DB_CKSUMSTUFF(mp); 1621 if (end != NULL) 1622 *end = (uint32_t)DB_CKSUMEND(mp); 1623 } 1624 } 1625 1626 if (flags_ptr != NULL) 1627 *flags_ptr = flags; 1628 } 1629 1630 void 1631 mac_hcksum_set(mblk_t *mp, uint32_t start, uint32_t stuff, uint32_t end, 1632 uint32_t value, uint32_t flags) 1633 { 1634 ASSERT(DB_TYPE(mp) == M_DATA); 1635 1636 DB_CKSUMSTART(mp) = (intptr_t)start; 1637 DB_CKSUMSTUFF(mp) = (intptr_t)stuff; 1638 DB_CKSUMEND(mp) = (intptr_t)end; 1639 DB_CKSUMFLAGS(mp) = (uint16_t)flags; 1640 DB_CKSUM16(mp) = (uint16_t)value; 1641 } 1642 1643 void 1644 mac_hcksum_clone(const mblk_t *src, mblk_t *dst) 1645 { 1646 ASSERT3U(DB_TYPE(src), ==, M_DATA); 1647 ASSERT3U(DB_TYPE(dst), ==, M_DATA); 1648 1649 /* 1650 * Do these assignments unconditionally, rather than only when 1651 * flags is non-zero. This protects a situation where zeroed 1652 * hcksum data does not make the jump onto an mblk_t with 1653 * stale data in those fields. It's important to copy all 1654 * possible flags (HCK_* as well as HW_*) and not just the 1655 * checksum specific flags. Dropping flags during a clone 1656 * could result in dropped packets. If the caller has good 1657 * reason to drop those flags then it should do it manually, 1658 * after the clone. 
1659 */ 1660 DB_CKSUMFLAGS(dst) = DB_CKSUMFLAGS(src); 1661 DB_CKSUMSTART(dst) = DB_CKSUMSTART(src); 1662 DB_CKSUMSTUFF(dst) = DB_CKSUMSTUFF(src); 1663 DB_CKSUMEND(dst) = DB_CKSUMEND(src); 1664 DB_CKSUM16(dst) = DB_CKSUM16(src); 1665 DB_LSOMSS(dst) = DB_LSOMSS(src); 1666 } 1667 1668 void 1669 mac_lso_get(mblk_t *mp, uint32_t *mss, uint32_t *flags) 1670 { 1671 ASSERT(DB_TYPE(mp) == M_DATA); 1672 1673 if (flags != NULL) { 1674 *flags = DB_CKSUMFLAGS(mp) & HW_LSO; 1675 if ((*flags != 0) && (mss != NULL)) 1676 *mss = (uint32_t)DB_LSOMSS(mp); 1677 } 1678 } 1679 1680 void 1681 mac_transceiver_info_set_present(mac_transceiver_info_t *infop, 1682 boolean_t present) 1683 { 1684 infop->mti_present = present; 1685 } 1686 1687 void 1688 mac_transceiver_info_set_usable(mac_transceiver_info_t *infop, 1689 boolean_t usable) 1690 { 1691 infop->mti_usable = usable; 1692 } 1693 1694 static bool 1695 mac_parse_is_ipv6eh(uint8_t id) 1696 { 1697 switch (id) { 1698 case IPPROTO_HOPOPTS: 1699 case IPPROTO_ROUTING: 1700 case IPPROTO_FRAGMENT: 1701 case IPPROTO_AH: 1702 case IPPROTO_DSTOPTS: 1703 case IPPROTO_MH: 1704 case IPPROTO_HIP: 1705 case IPPROTO_SHIM6: 1706 /* Currently known extension headers */ 1707 return (true); 1708 case IPPROTO_ESP: 1709 /* 1710 * While the IANA protocol numbers listing notes ESP as an IPv6 1711 * extension header, we cannot effectively parse it like one. 1712 * 1713 * For now, mac_ether_offload_info() will report it as the L4 1714 * protocol for a parsed packet containing this EH. 
1715 */ 1716 default: 1717 return (false); 1718 } 1719 } 1720 1721 typedef struct mac_mblk_cursor { 1722 mblk_t *mmc_head; 1723 mblk_t *mmc_cur; 1724 size_t mmc_off_total; 1725 size_t mmc_off_mp; 1726 } mac_mblk_cursor_t; 1727 1728 static void mac_mmc_advance(mac_mblk_cursor_t *, size_t); 1729 static void mac_mmc_reset(mac_mblk_cursor_t *); 1730 1731 static void 1732 mac_mmc_init(mac_mblk_cursor_t *cursor, mblk_t *mp) 1733 { 1734 cursor->mmc_head = mp; 1735 mac_mmc_reset(cursor); 1736 } 1737 1738 static void 1739 mac_mmc_reset(mac_mblk_cursor_t *cursor) 1740 { 1741 ASSERT(cursor->mmc_head != NULL); 1742 1743 cursor->mmc_cur = cursor->mmc_head; 1744 cursor->mmc_off_total = cursor->mmc_off_mp = 0; 1745 1746 /* Advance past any zero-length mblks at head */ 1747 mac_mmc_advance(cursor, 0); 1748 } 1749 1750 static inline size_t 1751 mac_mmc_mp_left(const mac_mblk_cursor_t *cursor) 1752 { 1753 if (cursor->mmc_cur != NULL) { 1754 const size_t mp_len = MBLKL(cursor->mmc_cur); 1755 1756 ASSERT3U(mp_len, >=, cursor->mmc_off_mp); 1757 1758 return (mp_len - cursor->mmc_off_mp); 1759 } else { 1760 return (0); 1761 } 1762 } 1763 1764 static inline uint8_t * 1765 mac_mmc_mp_ptr(const mac_mblk_cursor_t *cursor) 1766 { 1767 return (cursor->mmc_cur->b_rptr + cursor->mmc_off_mp); 1768 } 1769 1770 static inline size_t 1771 mac_mmc_offset(const mac_mblk_cursor_t *cursor) 1772 { 1773 return (cursor->mmc_off_total); 1774 } 1775 1776 /* 1777 * Advance cursor forward `len` bytes. 1778 * 1779 * The length to advance must be no greater than the number of bytes remaining 1780 * in the current mblk. If the position reaches (exactly) the end of the 1781 * current mblk, the cursor will be pushed forward to the next non-zero-length 1782 * mblk in the chain. 
1783 */ 1784 static inline void 1785 mac_mmc_advance(mac_mblk_cursor_t *cursor, size_t len) 1786 { 1787 ASSERT(cursor->mmc_cur != NULL); 1788 1789 const size_t mp_len = MBLKL(cursor->mmc_cur); 1790 1791 ASSERT3U(cursor->mmc_off_mp + len, <=, mp_len); 1792 1793 cursor->mmc_off_total += len; 1794 cursor->mmc_off_mp += len; 1795 1796 if (cursor->mmc_off_mp == mp_len) { 1797 cursor->mmc_off_mp = 0; 1798 cursor->mmc_cur = cursor->mmc_cur->b_cont; 1799 } 1800 1801 /* Skip over any 0-length mblks */ 1802 while (cursor->mmc_cur != NULL && MBLKL(cursor->mmc_cur) == 0) { 1803 cursor->mmc_cur = cursor->mmc_cur->b_cont; 1804 } 1805 } 1806 1807 /* 1808 * Attempt to seek to byte offset `off` in mblk chain. 1809 * 1810 * Returns true if the offset is <= the total chain length. 1811 */ 1812 static bool 1813 mac_mmc_seek(mac_mblk_cursor_t *cursor, const size_t off) 1814 { 1815 ASSERT(cursor->mmc_head != NULL); 1816 1817 if (off == cursor->mmc_off_total) { 1818 /* 1819 * Any prior init, reset, or seek operation will have advanced 1820 * past any zero-length mblks, making this short-circuit safe. 1821 */ 1822 return (true); 1823 } else if (off < cursor->mmc_off_total) { 1824 /* Rewind to beginning if offset precedes current position */ 1825 mac_mmc_reset(cursor); 1826 } 1827 1828 size_t seek_left = off - cursor->mmc_off_total; 1829 while (cursor->mmc_cur != NULL) { 1830 const size_t mp_left = mac_mmc_mp_left(cursor); 1831 1832 if (mp_left > seek_left) { 1833 /* Target position is within current mblk */ 1834 cursor->mmc_off_mp += seek_left; 1835 cursor->mmc_off_total += seek_left; 1836 return (true); 1837 } 1838 1839 /* Move on to the next mblk... */ 1840 mac_mmc_advance(cursor, mp_left); 1841 seek_left -= mp_left; 1842 } 1843 1844 /* 1845 * We have reached the end of the mblk chain, but there is a chance that 1846 * it corresponds to the target seek position. 
1847 */ 1848 return (cursor->mmc_off_total == off); 1849 } 1850 1851 /* 1852 * Attempt to read uint8_t at offset `pos` in mblk chain. 1853 * 1854 * Returns true (and sets value in `out`) if the offset is within the chain. 1855 */ 1856 static bool 1857 mac_mmc_get_uint8(mac_mblk_cursor_t *cursor, size_t pos, uint8_t *out) 1858 { 1859 if (!mac_mmc_seek(cursor, pos)) { 1860 return (false); 1861 } 1862 1863 if (mac_mmc_mp_left(cursor) != 0) { 1864 *out = *(mac_mmc_mp_ptr(cursor)); 1865 mac_mmc_advance(cursor, 1); 1866 return (true); 1867 } 1868 1869 return (false); 1870 } 1871 1872 /* 1873 * Attempt to read uint16_t at offset `pos` in mblk chain. The two 1874 * network-order bytes are converted into a host-order value. 1875 * 1876 * Returns true (and sets value in `out`) if the 16-bit region specified by the 1877 * offset is within the chain. 1878 */ 1879 static bool 1880 mac_mmc_get_uint16(mac_mblk_cursor_t *cursor, size_t pos, uint16_t *out) 1881 { 1882 if (!mac_mmc_seek(cursor, pos)) { 1883 return (false); 1884 } 1885 1886 const size_t mp_left = mac_mmc_mp_left(cursor); 1887 uint16_t result = 0; 1888 1889 if (mp_left >= 2) { 1890 uint8_t *bp = mac_mmc_mp_ptr(cursor); 1891 1892 result = (uint16_t)bp[0] << 8; 1893 result |= bp[1]; 1894 mac_mmc_advance(cursor, 2); 1895 *out = result; 1896 return (true); 1897 } else if (mp_left == 1) { 1898 result = (uint16_t)*(mac_mmc_mp_ptr(cursor)); 1899 mac_mmc_advance(cursor, 1); 1900 1901 if (mac_mmc_mp_left(cursor) == 0) { 1902 return (false); 1903 } 1904 1905 result = result << 8; 1906 result |= (uint16_t)*(mac_mmc_mp_ptr(cursor)); 1907 mac_mmc_advance(cursor, 1); 1908 *out = result; 1909 return (true); 1910 } 1911 1912 return (false); 1913 } 1914 1915 /* 1916 * Attempt to read `count` bytes at offset `pos` in mblk chain. 1917 * 1918 * Returns true (and copies data to `out`) if `count` length region is available 1919 * at offset within the chain. 
1920 */ 1921 static bool 1922 mac_mmc_get_bytes(mac_mblk_cursor_t *cursor, size_t pos, uint8_t *out, 1923 size_t count) 1924 { 1925 if (!mac_mmc_seek(cursor, pos)) { 1926 return (false); 1927 } 1928 1929 while (count > 0) { 1930 const size_t mp_left = mac_mmc_mp_left(cursor); 1931 1932 if (mp_left == 0) { 1933 return (false); 1934 } 1935 const size_t to_copy = MIN(mp_left, count); 1936 1937 bcopy(mac_mmc_mp_ptr(cursor), out, to_copy); 1938 out += to_copy; 1939 mac_mmc_advance(cursor, to_copy); 1940 count -= to_copy; 1941 } 1942 return (true); 1943 } 1944 1945 /* 1946 * Attempt to parse ethernet header (VLAN or not) from mblk chain. 1947 * 1948 * Returns true if header was successfully parsed. Parsing will begin at 1949 * current offset of `cursor`. Any non-NULL arguments for VLAN, SAP, and header 1950 * size will be populated on success. A value of MEOI_VLAN_TCI_INVALID will be 1951 * reported for the TCI if the header does not bear VLAN infomation. 1952 */ 1953 static bool 1954 mac_mmc_parse_ether(mac_mblk_cursor_t *cursor, uint8_t *dst_addrp, 1955 uint32_t *vlan_tcip, uint16_t *ethertypep, uint16_t *hdr_sizep) 1956 { 1957 const size_t l2_off = mac_mmc_offset(cursor); 1958 1959 if (dst_addrp != NULL) { 1960 if (!mac_mmc_get_bytes(cursor, l2_off, dst_addrp, ETHERADDRL)) { 1961 return (false); 1962 } 1963 } 1964 1965 uint16_t ethertype = 0; 1966 if (!mac_mmc_get_uint16(cursor, 1967 l2_off + offsetof(struct ether_header, ether_type), ðertype)) { 1968 return (false); 1969 } 1970 1971 uint32_t tci = MEOI_VLAN_TCI_INVALID; 1972 uint16_t hdrsize = sizeof (struct ether_header); 1973 1974 if (ethertype == ETHERTYPE_VLAN) { 1975 uint16_t tci_val; 1976 1977 if (!mac_mmc_get_uint16(cursor, 1978 l2_off + offsetof(struct ether_vlan_header, ether_tci), 1979 &tci_val)) { 1980 return (false); 1981 } 1982 if (!mac_mmc_get_uint16(cursor, 1983 l2_off + offsetof(struct ether_vlan_header, ether_type), 1984 ðertype)) { 1985 return (false); 1986 } 1987 hdrsize = sizeof (struct 
ether_vlan_header); 1988 tci = (uint32_t)tci_val; 1989 } 1990 1991 if (vlan_tcip != NULL) { 1992 *vlan_tcip = tci; 1993 } 1994 if (ethertypep != NULL) { 1995 *ethertypep = ethertype; 1996 } 1997 if (hdr_sizep != NULL) { 1998 *hdr_sizep = hdrsize; 1999 } 2000 return (true); 2001 } 2002 2003 /* 2004 * Attempt to parse L3 protocol header from mblk chain. 2005 * 2006 * The SAP/ethertype of the containing header must be specified by the caller. 2007 * 2008 * Returns true if header was successfully parsed. Parsing will begin at 2009 * current offset of `cursor`. Any non-NULL arguments for IP protocol and 2010 * header size will be populated on success. 2011 */ 2012 static bool 2013 mac_mmc_parse_l3(mac_mblk_cursor_t *cursor, uint16_t l3_sap, uint8_t *ipprotop, 2014 mac_ether_offload_flags_t *fragp, uint16_t *hdr_sizep) 2015 { 2016 const size_t l3_off = mac_mmc_offset(cursor); 2017 2018 if (l3_sap == ETHERTYPE_IP) { 2019 uint8_t verlen, ipproto; 2020 uint16_t frag_off; 2021 2022 if (!mac_mmc_get_uint8(cursor, l3_off, &verlen)) { 2023 return (false); 2024 } 2025 verlen &= 0x0f; 2026 if (verlen < 5 || verlen > 0x0f) { 2027 return (false); 2028 } 2029 2030 if (!mac_mmc_get_uint16(cursor, 2031 l3_off + offsetof(ipha_t, ipha_fragment_offset_and_flags), 2032 &frag_off)) { 2033 return (false); 2034 } 2035 2036 if (!mac_mmc_get_uint8(cursor, 2037 l3_off + offsetof(ipha_t, ipha_protocol), &ipproto)) { 2038 return (false); 2039 } 2040 2041 if (ipprotop != NULL) { 2042 *ipprotop = ipproto; 2043 } 2044 if (fragp != NULL) { 2045 mac_ether_offload_flags_t frag_flags = 0; 2046 if ((frag_off & IPH_MF) != 0) { 2047 frag_flags |= MEOI_L3_FRAG_MORE; 2048 } 2049 if ((frag_off & IPH_OFFSET) != 0) { 2050 frag_flags |= MEOI_L3_FRAG_OFFSET; 2051 } 2052 *fragp = frag_flags; 2053 } 2054 if (hdr_sizep != NULL) { 2055 *hdr_sizep = verlen * 4; 2056 } 2057 return (true); 2058 } 2059 if (l3_sap == ETHERTYPE_IPV6) { 2060 uint16_t ip_len = sizeof (ip6_t); 2061 uint8_t ipproto; 2062 
mac_ether_offload_flags_t frag_flags = 0; 2063 2064 if (!mac_mmc_get_uint8(cursor, 2065 l3_off + offsetof(ip6_t, ip6_nxt), &ipproto)) { 2066 return (false); 2067 } 2068 2069 /* Chase any extension headers present in packet */ 2070 while (mac_parse_is_ipv6eh(ipproto)) { 2071 uint8_t len_val, next_hdr; 2072 uint16_t eh_len; 2073 2074 const size_t hdr_off = l3_off + ip_len; 2075 if (!mac_mmc_get_uint8(cursor, hdr_off, &next_hdr)) { 2076 return (false); 2077 } 2078 2079 if (ipproto == IPPROTO_FRAGMENT) { 2080 /* 2081 * The Fragment extension header bears a 2082 * predefined fixed length, rather than 2083 * communicating it through the EH itself. 2084 */ 2085 eh_len = 8; 2086 2087 uint16_t frag_off; 2088 if (!mac_mmc_get_uint16(cursor, hdr_off + 2, 2089 &frag_off)) { 2090 return (false); 2091 } 2092 /* IP6F_* defines already in network order */ 2093 frag_off = htons(frag_off); 2094 if ((frag_off & IP6F_MORE_FRAG) != 0) { 2095 frag_flags |= MEOI_L3_FRAG_MORE; 2096 } 2097 if ((frag_off & IP6F_OFF_MASK) != 0) { 2098 frag_flags |= MEOI_L3_FRAG_OFFSET; 2099 } 2100 } else if (ipproto == IPPROTO_AH) { 2101 /* 2102 * The length of the IP Authentication EH is 2103 * stored as (n + 2) * 32-bits, where 'n' is the 2104 * recorded EH length field 2105 */ 2106 if (!mac_mmc_get_uint8(cursor, hdr_off + 1, 2107 &len_val)) { 2108 return (false); 2109 } 2110 eh_len = ((uint16_t)len_val + 2) * 4; 2111 } else { 2112 /* 2113 * All other EHs should follow the sizing 2114 * formula of (n + 1) * 64-bits, where 'n' is 2115 * the recorded EH length field. 2116 */ 2117 if (!mac_mmc_get_uint8(cursor, hdr_off + 1, 2118 &len_val)) { 2119 return (false); 2120 } 2121 eh_len = ((uint16_t)len_val + 1) * 8; 2122 } 2123 /* 2124 * Protect against overflow in the case of a very 2125 * contrived packet. 
2126 */ 2127 if ((ip_len + eh_len) < ip_len) { 2128 return (-1); 2129 } 2130 2131 ipproto = next_hdr; 2132 ip_len += eh_len; 2133 } 2134 2135 if (ipprotop != NULL) { 2136 *ipprotop = ipproto; 2137 } 2138 if (fragp != NULL) { 2139 *fragp = frag_flags; 2140 } 2141 if (hdr_sizep != NULL) { 2142 *hdr_sizep = ip_len; 2143 } 2144 return (true); 2145 } 2146 2147 return (false); 2148 } 2149 2150 /* 2151 * Attempt to parse L4 protocol header from mblk chain. 2152 * 2153 * The IP protocol of the containing header must be specified by the caller. 2154 * 2155 * Returns true if header was successfully parsed. Parsing will begin at 2156 * current offset of `cursor`. A non-NULL argument for header size will be 2157 * populated on success. 2158 */ 2159 static bool 2160 mac_mmc_parse_l4(mac_mblk_cursor_t *cursor, uint8_t ipproto, uint8_t *hdr_sizep) 2161 { 2162 ASSERT(hdr_sizep != NULL); 2163 2164 const size_t l4_off = mac_mmc_offset(cursor); 2165 uint8_t tcp_doff; 2166 2167 switch (ipproto) { 2168 case IPPROTO_TCP: 2169 if (!mac_mmc_get_uint8(cursor, 2170 l4_off + offsetof(tcph_t, th_offset_and_rsrvd), 2171 &tcp_doff)) { 2172 return (false); 2173 } 2174 tcp_doff = (tcp_doff & 0xf0) >> 4; 2175 if (tcp_doff < 5 || tcp_doff > 0xf) { 2176 return (false); 2177 } 2178 *hdr_sizep = tcp_doff * 4; 2179 return (true); 2180 case IPPROTO_UDP: 2181 *hdr_sizep = sizeof (struct udphdr); 2182 return (true); 2183 case IPPROTO_ICMP: 2184 /* 2185 * Only count the parts of the header which are common to 2186 * message types. 2187 */ 2188 *hdr_sizep = offsetof(struct icmp, icmp_hun); 2189 return (true); 2190 case IPPROTO_ICMPV6: 2191 *hdr_sizep = sizeof (icmp6_t); 2192 return (true); 2193 case IPPROTO_SCTP: 2194 *hdr_sizep = sizeof (sctp_hdr_t); 2195 return (true); 2196 default: 2197 return (false); 2198 } 2199 } 2200 2201 /* 2202 * Parse destination MAC address and VLAN TCI (if any) from mblk chain. 
2203 * 2204 * If packet ethertype does not indicate that a VLAN is present, 2205 * MEOI_VLAN_TCI_INVALID will be returned for the TCI. 2206 * 2207 * Returns B_TRUE if header could be parsed for destination MAC address and VLAN 2208 * TCI, otherwise B_FALSE. 2209 */ 2210 boolean_t 2211 mac_ether_l2_info(mblk_t *mp, uint8_t *dst_addrp, uint32_t *vlan_tcip) 2212 { 2213 mac_mblk_cursor_t cursor; 2214 2215 mac_mmc_init(&cursor, mp); 2216 if (!mac_mmc_parse_ether(&cursor, dst_addrp, vlan_tcip, NULL, NULL)) { 2217 return (B_FALSE); 2218 } 2219 2220 return (B_TRUE); 2221 } 2222 2223 /* 2224 * Perform a partial parsing of offload info from a frame and/or packet. 2225 * 2226 * Beginning at the provided byte offset (`off`) in the mblk, attempt to parse 2227 * any offload info which has not yet been populated in `meoi`. The contents of 2228 * `meoi_flags` upon entry will be considered as "already parsed", their 2229 * corresponding data fields will be considered valid. 2230 * 2231 * A motivating example: A non-Ethernet packet could be parsed for L3/L4 offload 2232 * information by setting MEOI_L2INFO_SET in `meoi_flags`, and the L3 SAP in 2233 * `meoi_l3_proto`. With a value in `meoi_l2hlen` that, when combined with the 2234 * provided `off`, will direct the parser to the start of the L3 header in the 2235 * mblk, the rest of the logic will be free to run. 2236 * 2237 * Alternatively, this could be used to parse the headers in an encapsulated 2238 * Ethernet packet by simply specifying the start of its header in `off`. 2239 * 2240 * The degree to which parsing was able to proceed is stored in `meoi_flags`. 
2241 */ 2242 void 2243 mac_partial_offload_info(mblk_t *mp, size_t off, mac_ether_offload_info_t *meoi) 2244 { 2245 mac_mblk_cursor_t cursor; 2246 2247 mac_mmc_init(&cursor, mp); 2248 2249 if (!mac_mmc_seek(&cursor, off)) { 2250 return; 2251 } 2252 2253 if ((meoi->meoi_flags & MEOI_L2INFO_SET) == 0) { 2254 uint32_t vlan_tci; 2255 uint16_t l2_sz, ethertype; 2256 if (!mac_mmc_parse_ether(&cursor, NULL, &vlan_tci, ðertype, 2257 &l2_sz)) { 2258 return; 2259 } 2260 2261 meoi->meoi_flags |= MEOI_L2INFO_SET; 2262 meoi->meoi_l2hlen = l2_sz; 2263 meoi->meoi_l3proto = ethertype; 2264 if (vlan_tci != MEOI_VLAN_TCI_INVALID) { 2265 ASSERT3U(meoi->meoi_l2hlen, ==, 2266 sizeof (struct ether_vlan_header)); 2267 meoi->meoi_flags |= MEOI_VLAN_TAGGED; 2268 } 2269 } 2270 const size_t l2_end = off + (size_t)meoi->meoi_l2hlen; 2271 if (!mac_mmc_seek(&cursor, l2_end)) { 2272 meoi->meoi_flags &= ~MEOI_L2INFO_SET; 2273 return; 2274 } 2275 2276 if ((meoi->meoi_flags & MEOI_L3INFO_SET) == 0) { 2277 uint8_t ipproto; 2278 uint16_t l3_sz; 2279 mac_ether_offload_flags_t frag_flags; 2280 2281 if (!mac_mmc_parse_l3(&cursor, meoi->meoi_l3proto, &ipproto, 2282 &frag_flags, &l3_sz)) { 2283 return; 2284 } 2285 2286 /* Only the fragment-related flags should be emitted */ 2287 ASSERT3U(frag_flags & 2288 ~(MEOI_L3_FRAG_MORE | MEOI_L3_FRAG_OFFSET), ==, 0); 2289 2290 meoi->meoi_l3hlen = l3_sz; 2291 meoi->meoi_l4proto = ipproto; 2292 meoi->meoi_flags |= MEOI_L3INFO_SET | frag_flags; 2293 } 2294 const size_t l3_end = l2_end + (size_t)meoi->meoi_l3hlen; 2295 if (!mac_mmc_seek(&cursor, l3_end)) { 2296 meoi->meoi_flags &= ~MEOI_L3INFO_SET; 2297 return; 2298 } 2299 2300 if ((meoi->meoi_flags & MEOI_L4INFO_SET) == 0) { 2301 if ((meoi->meoi_flags & MEOI_L3_FRAG_OFFSET) != 0) { 2302 /* 2303 * If this packet is a fragment, and is offset into the 2304 * data (not at the "head"), then we are past where the 2305 * L4 header would be, and should parse no further. 
2306 */ 2307 return; 2308 } 2309 2310 uint8_t l4_sz; 2311 if (!mac_mmc_parse_l4(&cursor, meoi->meoi_l4proto, &l4_sz)) { 2312 return; 2313 } 2314 2315 meoi->meoi_l4hlen = l4_sz; 2316 meoi->meoi_flags |= MEOI_L4INFO_SET; 2317 } 2318 const size_t l4_end = l3_end + (size_t)meoi->meoi_l4hlen; 2319 if (!mac_mmc_seek(&cursor, l4_end)) { 2320 meoi->meoi_flags &= ~MEOI_L4INFO_SET; 2321 } 2322 } 2323 2324 /* 2325 * Attempt to parse packet headers to extract information useful for various 2326 * offloads. This includes header protocols and lengths. 2327 * 2328 * The meoi_flags field will indicate the extent to which parsing was able to 2329 * complete. Each in turn promises that subsequent fields are populated, and 2330 * that the mblk chain is large enough to contain the parsed header(s): 2331 * 2332 * - MEOI_L2INFO_SET: meoi_l3_proto and meoi_l2hlen 2333 * - MEOI_L3INFO_SET: meoi_l4_proto and meoi_l3hlen 2334 * - MEOI_L4INFO_SET: meoi_l4hlen 2335 * 2336 * When any of those flags are absent, their corresponding data fields will be 2337 * zeroed. 2338 * 2339 * These additional flags are set when certain conditions are met during 2340 * parsing: 2341 * 2342 * - MEOI_VLAN_TAGGED: Ethernet header is tagged with a VLAN 2343 * - MEOI_L3_FRAG_MORE: L3 indicated that this packet is fragmented into one or 2344 * more packets to follow 2345 * - MEOI_L3_FRAG_OFFSET: L3 header that this packet is a fragment which is 2346 * offset (following) from the head of the data 2347 */ 2348 void 2349 mac_ether_offload_info(mblk_t *mp, mac_ether_offload_info_t *meoi) 2350 { 2351 bzero(meoi, sizeof (mac_ether_offload_info_t)); 2352 meoi->meoi_len = msgdsize(mp); 2353 2354 mac_partial_offload_info(mp, 0, meoi); 2355 } 2356