/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * IEEE 802.3ad Link Aggregation -- Link Aggregation Groups.
 *
 * An instance of the structure aggr_grp_t is allocated for each
 * link aggregation group. When created, aggr_grp_t objects are
 * entered into the aggr_grp_hash hash table maintained by the modhash
 * module. The hash key is the linkid associated with the link
 * aggregation group.
 *
 * A set of MAC ports is associated with each aggregation group.
 *
 * Aggr pseudo TX rings
 * --------------------
 * The underlying ports (NICs) in an aggregation can have TX rings. To
 * enhance aggr's performance, these TX rings are made available to the
 * aggr layer as pseudo TX rings. The concept of pseudo rings is not new:
 * it is already implemented on the RX side, where they are called
 * pseudo RX rings. The same concept is extended to the TX side where
 * each TX ring of an underlying port is reflected in aggr as a pseudo
 * TX ring. Thus each pseudo TX ring will map to a specific hardware TX
 * ring.
 * Even in the case of a NIC that does not have a TX ring, a pseudo
 * TX ring is given to the aggregation layer.
 *
 * With this change, the outgoing stack depth looks much better:
 *
 * mac_tx() -> mac_tx_aggr_mode() -> mac_tx_soft_ring_process() ->
 * mac_tx_send() -> aggr_ring_tx() -> <driver>_ring_tx()
 *
 * Two new modes are introduced to mac_tx() to handle aggr pseudo TX rings:
 * SRS_TX_AGGR and SRS_TX_BW_AGGR.
 *
 * In SRS_TX_AGGR mode, the mac_tx_aggr_mode() routine is called. This
 * routine invokes an aggr function, aggr_find_tx_ring(), to find a
 * (pseudo) TX ring belonging to a port on which the packet has to be sent.
 * aggr_find_tx_ring() first finds the outgoing port based on L2/L3/L4
 * policy and then uses the fanout_hint passed to it to pick a TX ring from
 * the selected port.
 *
 * In SRS_TX_BW_AGGR mode, the mac_tx_bw_mode() function is called, where
 * the bandwidth limit is applied first to the outgoing packet; the packets
 * that are allowed to go out then call mac_tx_aggr_mode() to be sent on a
 * particular TX ring.
67 */ 68 69 #include <sys/types.h> 70 #include <sys/sysmacros.h> 71 #include <sys/conf.h> 72 #include <sys/cmn_err.h> 73 #include <sys/disp.h> 74 #include <sys/list.h> 75 #include <sys/ksynch.h> 76 #include <sys/kmem.h> 77 #include <sys/stream.h> 78 #include <sys/modctl.h> 79 #include <sys/ddi.h> 80 #include <sys/sunddi.h> 81 #include <sys/atomic.h> 82 #include <sys/stat.h> 83 #include <sys/modhash.h> 84 #include <sys/id_space.h> 85 #include <sys/strsun.h> 86 #include <sys/cred.h> 87 #include <sys/dlpi.h> 88 #include <sys/zone.h> 89 #include <sys/mac_provider.h> 90 #include <sys/dls.h> 91 #include <sys/vlan.h> 92 #include <sys/aggr.h> 93 #include <sys/aggr_impl.h> 94 95 static int aggr_m_start(void *); 96 static void aggr_m_stop(void *); 97 static int aggr_m_promisc(void *, boolean_t); 98 static int aggr_m_multicst(void *, boolean_t, const uint8_t *); 99 static int aggr_m_unicst(void *, const uint8_t *); 100 static int aggr_m_stat(void *, uint_t, uint64_t *); 101 static void aggr_m_ioctl(void *, queue_t *, mblk_t *); 102 static boolean_t aggr_m_capab_get(void *, mac_capab_t, void *); 103 static int aggr_m_setprop(void *, const char *, mac_prop_id_t, uint_t, 104 const void *); 105 static void aggr_m_propinfo(void *, const char *, mac_prop_id_t, 106 mac_prop_info_handle_t); 107 108 static aggr_port_t *aggr_grp_port_lookup(aggr_grp_t *, datalink_id_t); 109 static int aggr_grp_rem_port(aggr_grp_t *, aggr_port_t *, boolean_t *, 110 boolean_t *); 111 112 static void aggr_grp_capab_set(aggr_grp_t *); 113 static boolean_t aggr_grp_capab_check(aggr_grp_t *, aggr_port_t *); 114 static uint_t aggr_grp_max_sdu(aggr_grp_t *); 115 static uint32_t aggr_grp_max_margin(aggr_grp_t *); 116 static boolean_t aggr_grp_sdu_check(aggr_grp_t *, aggr_port_t *); 117 static boolean_t aggr_grp_margin_check(aggr_grp_t *, aggr_port_t *); 118 119 static int aggr_add_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *); 120 static void aggr_rem_pseudo_rx_group(aggr_port_t *, 
aggr_pseudo_rx_group_t *); 121 static int aggr_pseudo_disable_intr(mac_intr_handle_t); 122 static int aggr_pseudo_enable_intr(mac_intr_handle_t); 123 static int aggr_pseudo_start_ring(mac_ring_driver_t, uint64_t); 124 static void aggr_pseudo_stop_ring(mac_ring_driver_t); 125 static int aggr_addmac(void *, const uint8_t *); 126 static int aggr_remmac(void *, const uint8_t *); 127 static mblk_t *aggr_rx_poll(void *, int); 128 static void aggr_fill_ring(void *, mac_ring_type_t, const int, 129 const int, mac_ring_info_t *, mac_ring_handle_t); 130 static void aggr_fill_group(void *, mac_ring_type_t, const int, 131 mac_group_info_t *, mac_group_handle_t); 132 133 static kmem_cache_t *aggr_grp_cache; 134 static mod_hash_t *aggr_grp_hash; 135 static krwlock_t aggr_grp_lock; 136 static uint_t aggr_grp_cnt; 137 static id_space_t *key_ids; 138 139 #define GRP_HASHSZ 64 140 #define GRP_HASH_KEY(linkid) ((mod_hash_key_t)(uintptr_t)linkid) 141 #define AGGR_PORT_NAME_DELIMIT '-' 142 143 static uchar_t aggr_zero_mac[] = {0, 0, 0, 0, 0, 0}; 144 145 #define AGGR_M_CALLBACK_FLAGS \ 146 (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_PROPINFO) 147 148 static mac_callbacks_t aggr_m_callbacks = { 149 AGGR_M_CALLBACK_FLAGS, 150 aggr_m_stat, 151 aggr_m_start, 152 aggr_m_stop, 153 aggr_m_promisc, 154 aggr_m_multicst, 155 NULL, 156 NULL, 157 NULL, 158 aggr_m_ioctl, 159 aggr_m_capab_get, 160 NULL, 161 NULL, 162 aggr_m_setprop, 163 NULL, 164 aggr_m_propinfo 165 }; 166 167 /*ARGSUSED*/ 168 static int 169 aggr_grp_constructor(void *buf, void *arg, int kmflag) 170 { 171 aggr_grp_t *grp = buf; 172 173 bzero(grp, sizeof (*grp)); 174 mutex_init(&grp->lg_lacp_lock, NULL, MUTEX_DEFAULT, NULL); 175 cv_init(&grp->lg_lacp_cv, NULL, CV_DEFAULT, NULL); 176 rw_init(&grp->lg_tx_lock, NULL, RW_DRIVER, NULL); 177 mutex_init(&grp->lg_port_lock, NULL, MUTEX_DEFAULT, NULL); 178 cv_init(&grp->lg_port_cv, NULL, CV_DEFAULT, NULL); 179 mutex_init(&grp->lg_tx_flowctl_lock, NULL, MUTEX_DEFAULT, NULL); 180 
cv_init(&grp->lg_tx_flowctl_cv, NULL, CV_DEFAULT, NULL); 181 grp->lg_link_state = LINK_STATE_UNKNOWN; 182 return (0); 183 } 184 185 /*ARGSUSED*/ 186 static void 187 aggr_grp_destructor(void *buf, void *arg) 188 { 189 aggr_grp_t *grp = buf; 190 191 if (grp->lg_tx_ports != NULL) { 192 kmem_free(grp->lg_tx_ports, 193 grp->lg_tx_ports_size * sizeof (aggr_port_t *)); 194 } 195 196 mutex_destroy(&grp->lg_lacp_lock); 197 cv_destroy(&grp->lg_lacp_cv); 198 mutex_destroy(&grp->lg_port_lock); 199 cv_destroy(&grp->lg_port_cv); 200 rw_destroy(&grp->lg_tx_lock); 201 mutex_destroy(&grp->lg_tx_flowctl_lock); 202 cv_destroy(&grp->lg_tx_flowctl_cv); 203 } 204 205 void 206 aggr_grp_init(void) 207 { 208 aggr_grp_cache = kmem_cache_create("aggr_grp_cache", 209 sizeof (aggr_grp_t), 0, aggr_grp_constructor, 210 aggr_grp_destructor, NULL, NULL, NULL, 0); 211 212 aggr_grp_hash = mod_hash_create_idhash("aggr_grp_hash", 213 GRP_HASHSZ, mod_hash_null_valdtor); 214 rw_init(&aggr_grp_lock, NULL, RW_DEFAULT, NULL); 215 aggr_grp_cnt = 0; 216 217 /* 218 * Allocate an id space to manage key values (when key is not 219 * specified). The range of the id space will be from 220 * (AGGR_MAX_KEY + 1) to UINT16_MAX, because the LACP protocol 221 * uses a 16-bit key. 222 */ 223 key_ids = id_space_create("aggr_key_ids", AGGR_MAX_KEY + 1, UINT16_MAX); 224 ASSERT(key_ids != NULL); 225 } 226 227 void 228 aggr_grp_fini(void) 229 { 230 id_space_destroy(key_ids); 231 rw_destroy(&aggr_grp_lock); 232 mod_hash_destroy_idhash(aggr_grp_hash); 233 kmem_cache_destroy(aggr_grp_cache); 234 } 235 236 uint_t 237 aggr_grp_count(void) 238 { 239 uint_t count; 240 241 rw_enter(&aggr_grp_lock, RW_READER); 242 count = aggr_grp_cnt; 243 rw_exit(&aggr_grp_lock); 244 return (count); 245 } 246 247 /* 248 * Since both aggr_port_notify_cb() and aggr_port_timer_thread() functions 249 * requires the mac perimeter, this function holds a reference of the aggr 250 * and aggr won't call mac_unregister() until this reference drops to 0. 
251 */ 252 void 253 aggr_grp_port_hold(aggr_port_t *port) 254 { 255 aggr_grp_t *grp = port->lp_grp; 256 257 AGGR_PORT_REFHOLD(port); 258 mutex_enter(&grp->lg_port_lock); 259 grp->lg_port_ref++; 260 mutex_exit(&grp->lg_port_lock); 261 } 262 263 /* 264 * Release the reference of the grp and inform aggr_grp_delete() calling 265 * mac_unregister() is now safe. 266 */ 267 void 268 aggr_grp_port_rele(aggr_port_t *port) 269 { 270 aggr_grp_t *grp = port->lp_grp; 271 272 mutex_enter(&grp->lg_port_lock); 273 if (--grp->lg_port_ref == 0) 274 cv_signal(&grp->lg_port_cv); 275 mutex_exit(&grp->lg_port_lock); 276 AGGR_PORT_REFRELE(port); 277 } 278 279 /* 280 * Wait for the port's lacp timer thread and the port's notification callback 281 * to exit. 282 */ 283 void 284 aggr_grp_port_wait(aggr_grp_t *grp) 285 { 286 mutex_enter(&grp->lg_port_lock); 287 if (grp->lg_port_ref != 0) 288 cv_wait(&grp->lg_port_cv, &grp->lg_port_lock); 289 mutex_exit(&grp->lg_port_lock); 290 } 291 292 /* 293 * Attach a port to a link aggregation group. 294 * 295 * A port is attached to a link aggregation group once its speed 296 * and link state have been verified. 297 * 298 * Returns B_TRUE if the group link state or speed has changed. If 299 * it's the case, the caller must notify the MAC layer via a call 300 * to mac_link(). 301 */ 302 boolean_t 303 aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port) 304 { 305 boolean_t link_state_changed = B_FALSE; 306 307 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 308 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 309 310 if (port->lp_state == AGGR_PORT_STATE_ATTACHED) 311 return (B_FALSE); 312 313 /* 314 * Validate the MAC port link speed and update the group 315 * link speed if needed. 316 */ 317 if (port->lp_ifspeed == 0 || 318 port->lp_link_state != LINK_STATE_UP || 319 port->lp_link_duplex != LINK_DUPLEX_FULL) { 320 /* 321 * Can't attach a MAC port with unknown link speed, 322 * down link, or not in full duplex mode. 
323 */ 324 return (B_FALSE); 325 } 326 327 if (grp->lg_ifspeed == 0) { 328 /* 329 * The group inherits the speed of the first link being 330 * attached. 331 */ 332 grp->lg_ifspeed = port->lp_ifspeed; 333 link_state_changed = B_TRUE; 334 } else if (grp->lg_ifspeed != port->lp_ifspeed) { 335 /* 336 * The link speed of the MAC port must be the same as 337 * the group link speed, as per 802.3ad. Since it is 338 * not, the attach is cancelled. 339 */ 340 return (B_FALSE); 341 } 342 343 grp->lg_nattached_ports++; 344 345 /* 346 * Update the group link state. 347 */ 348 if (grp->lg_link_state != LINK_STATE_UP) { 349 grp->lg_link_state = LINK_STATE_UP; 350 grp->lg_link_duplex = LINK_DUPLEX_FULL; 351 link_state_changed = B_TRUE; 352 } 353 354 /* 355 * Update port's state. 356 */ 357 port->lp_state = AGGR_PORT_STATE_ATTACHED; 358 359 aggr_grp_multicst_port(port, B_TRUE); 360 361 /* 362 * Set port's receive callback 363 */ 364 mac_rx_set(port->lp_mch, aggr_recv_cb, port); 365 366 /* 367 * If LACP is OFF, the port can be used to send data as soon 368 * as its link is up and verified to be compatible with the 369 * aggregation. 370 * 371 * If LACP is active or passive, notify the LACP subsystem, which 372 * will enable sending on the port following the LACP protocol. 
373 */ 374 if (grp->lg_lacp_mode == AGGR_LACP_OFF) 375 aggr_send_port_enable(port); 376 else 377 aggr_lacp_port_attached(port); 378 379 return (link_state_changed); 380 } 381 382 boolean_t 383 aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port) 384 { 385 boolean_t link_state_changed = B_FALSE; 386 387 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 388 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 389 390 /* update state */ 391 if (port->lp_state != AGGR_PORT_STATE_ATTACHED) 392 return (B_FALSE); 393 394 mac_rx_clear(port->lp_mch); 395 396 aggr_grp_multicst_port(port, B_FALSE); 397 398 if (grp->lg_lacp_mode == AGGR_LACP_OFF) 399 aggr_send_port_disable(port); 400 else 401 aggr_lacp_port_detached(port); 402 403 port->lp_state = AGGR_PORT_STATE_STANDBY; 404 405 grp->lg_nattached_ports--; 406 if (grp->lg_nattached_ports == 0) { 407 /* the last attached MAC port of the group is being detached */ 408 grp->lg_ifspeed = 0; 409 grp->lg_link_state = LINK_STATE_DOWN; 410 grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN; 411 link_state_changed = B_TRUE; 412 } 413 414 return (link_state_changed); 415 } 416 417 /* 418 * Update the MAC addresses of the constituent ports of the specified 419 * group. This function is invoked: 420 * - after creating a new aggregation group. 421 * - after adding new ports to an aggregation group. 422 * - after removing a port from a group when the MAC address of 423 * that port was used for the MAC address of the group. 424 * - after the MAC address of a port changed when the MAC address 425 * of that port was used for the MAC address of the group. 426 * 427 * Return true if the link state of the aggregation changed, for example 428 * as a result of a failure changing the MAC address of one of the 429 * constituent ports. 
430 */ 431 boolean_t 432 aggr_grp_update_ports_mac(aggr_grp_t *grp) 433 { 434 aggr_port_t *cport; 435 boolean_t link_state_changed = B_FALSE; 436 mac_perim_handle_t mph; 437 438 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 439 440 for (cport = grp->lg_ports; cport != NULL; 441 cport = cport->lp_next) { 442 mac_perim_enter_by_mh(cport->lp_mh, &mph); 443 if (aggr_port_unicst(cport) != 0) { 444 if (aggr_grp_detach_port(grp, cport)) 445 link_state_changed = B_TRUE; 446 } else { 447 /* 448 * If a port was detached because of a previous 449 * failure changing the MAC address, the port is 450 * reattached when it successfully changes the MAC 451 * address now, and this might cause the link state 452 * of the aggregation to change. 453 */ 454 if (aggr_grp_attach_port(grp, cport)) 455 link_state_changed = B_TRUE; 456 } 457 mac_perim_exit(mph); 458 } 459 return (link_state_changed); 460 } 461 462 /* 463 * Invoked when the MAC address of a port has changed. If the port's 464 * MAC address was used for the group MAC address, set mac_addr_changedp 465 * to B_TRUE to indicate to the caller that it should send a MAC_NOTE_UNICST 466 * notification. If the link state changes due to detach/attach of 467 * the constituent port, set link_state_changedp to B_TRUE to indicate 468 * to the caller that it should send a MAC_NOTE_LINK notification. In both 469 * cases, it is the responsibility of the caller to invoke notification 470 * functions after releasing the the port lock. 471 */ 472 void 473 aggr_grp_port_mac_changed(aggr_grp_t *grp, aggr_port_t *port, 474 boolean_t *mac_addr_changedp, boolean_t *link_state_changedp) 475 { 476 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 477 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 478 ASSERT(mac_addr_changedp != NULL); 479 ASSERT(link_state_changedp != NULL); 480 481 *mac_addr_changedp = B_FALSE; 482 *link_state_changedp = B_FALSE; 483 484 if (grp->lg_addr_fixed) { 485 /* 486 * The group is using a fixed MAC address or an automatic 487 * MAC address has not been set. 
488 */ 489 return; 490 } 491 492 if (grp->lg_mac_addr_port == port) { 493 /* 494 * The MAC address of the port was assigned to the group 495 * MAC address. Update the group MAC address. 496 */ 497 bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL); 498 *mac_addr_changedp = B_TRUE; 499 } else { 500 /* 501 * Update the actual port MAC address to the MAC address 502 * of the group. 503 */ 504 if (aggr_port_unicst(port) != 0) { 505 *link_state_changedp = aggr_grp_detach_port(grp, port); 506 } else { 507 /* 508 * If a port was detached because of a previous 509 * failure changing the MAC address, the port is 510 * reattached when it successfully changes the MAC 511 * address now, and this might cause the link state 512 * of the aggregation to change. 513 */ 514 *link_state_changedp = aggr_grp_attach_port(grp, port); 515 } 516 } 517 } 518 519 /* 520 * Add a port to a link aggregation group. 521 */ 522 static int 523 aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t port_linkid, boolean_t force, 524 aggr_port_t **pp) 525 { 526 aggr_port_t *port, **cport; 527 mac_perim_handle_t mph; 528 zoneid_t port_zoneid = ALL_ZONES; 529 int err; 530 531 /* The port must be int the same zone as the aggregation. */ 532 if (zone_check_datalink(&port_zoneid, port_linkid) != 0) 533 port_zoneid = GLOBAL_ZONEID; 534 if (grp->lg_zoneid != port_zoneid) 535 return (EBUSY); 536 537 /* 538 * lg_mh could be NULL when the function is called during the creation 539 * of the aggregation. 540 */ 541 ASSERT(grp->lg_mh == NULL || MAC_PERIM_HELD(grp->lg_mh)); 542 543 /* create new port */ 544 err = aggr_port_create(grp, port_linkid, force, &port); 545 if (err != 0) 546 return (err); 547 548 mac_perim_enter_by_mh(port->lp_mh, &mph); 549 550 /* add port to list of group constituent ports */ 551 cport = &grp->lg_ports; 552 while (*cport != NULL) 553 cport = &((*cport)->lp_next); 554 *cport = port; 555 556 /* 557 * Back reference to the group it is member of. 
A port always 558 * holds a reference to its group to ensure that the back 559 * reference is always valid. 560 */ 561 port->lp_grp = grp; 562 AGGR_GRP_REFHOLD(grp); 563 grp->lg_nports++; 564 565 aggr_lacp_init_port(port); 566 mac_perim_exit(mph); 567 568 if (pp != NULL) 569 *pp = port; 570 571 return (0); 572 } 573 574 /* 575 * Add a pseudo RX ring for the given HW ring handle. 576 */ 577 static int 578 aggr_add_pseudo_rx_ring(aggr_port_t *port, 579 aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh) 580 { 581 aggr_pseudo_rx_ring_t *ring; 582 int err; 583 int j; 584 585 for (j = 0; j < MAX_RINGS_PER_GROUP; j++) { 586 ring = rx_grp->arg_rings + j; 587 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE)) 588 break; 589 } 590 591 /* 592 * No slot for this new RX ring. 593 */ 594 if (j == MAX_RINGS_PER_GROUP) 595 return (EIO); 596 597 ring->arr_flags |= MAC_PSEUDO_RING_INUSE; 598 ring->arr_hw_rh = hw_rh; 599 ring->arr_port = port; 600 rx_grp->arg_ring_cnt++; 601 602 /* 603 * The group is already registered, dynamically add a new ring to the 604 * mac group. 605 */ 606 if ((err = mac_group_add_ring(rx_grp->arg_gh, j)) != 0) { 607 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE; 608 ring->arr_hw_rh = NULL; 609 ring->arr_port = NULL; 610 rx_grp->arg_ring_cnt--; 611 } else { 612 mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring, 613 mac_find_ring(rx_grp->arg_gh, j)); 614 } 615 return (err); 616 } 617 618 /* 619 * Remove the pseudo RX ring of the given HW ring handle. 
620 */ 621 static void 622 aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh) 623 { 624 aggr_pseudo_rx_ring_t *ring; 625 int j; 626 627 for (j = 0; j < MAX_RINGS_PER_GROUP; j++) { 628 ring = rx_grp->arg_rings + j; 629 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE) || 630 ring->arr_hw_rh != hw_rh) { 631 continue; 632 } 633 634 mac_group_rem_ring(rx_grp->arg_gh, ring->arr_rh); 635 636 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE; 637 ring->arr_hw_rh = NULL; 638 ring->arr_port = NULL; 639 rx_grp->arg_ring_cnt--; 640 mac_hwring_teardown(hw_rh); 641 break; 642 } 643 } 644 645 /* 646 * This function is called to create pseudo rings over the hardware rings of 647 * the underlying device. Note that there is a 1:1 mapping between the pseudo 648 * RX rings of the aggr and the hardware rings of the underlying port. 649 */ 650 static int 651 aggr_add_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp) 652 { 653 aggr_grp_t *grp = port->lp_grp; 654 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP]; 655 aggr_unicst_addr_t *addr, *a; 656 mac_perim_handle_t pmph; 657 int hw_rh_cnt, i = 0, j; 658 int err = 0; 659 660 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 661 mac_perim_enter_by_mh(port->lp_mh, &pmph); 662 663 /* 664 * This function must be called after the aggr registers its mac 665 * and its RX group has been initialized. 666 */ 667 ASSERT(rx_grp->arg_gh != NULL); 668 669 /* 670 * Get the list the the underlying HW rings. 671 */ 672 hw_rh_cnt = mac_hwrings_get(port->lp_mch, 673 &port->lp_hwgh, hw_rh, MAC_RING_TYPE_RX); 674 675 if (port->lp_hwgh != NULL) { 676 /* 677 * Quiesce the HW ring and the mac srs on the ring. Note 678 * that the HW ring will be restarted when the pseudo ring 679 * is started. At that time all the packets will be 680 * directly passed up to the pseudo RX ring and handled 681 * by mac srs created over the pseudo RX ring. 
682 */ 683 mac_rx_client_quiesce(port->lp_mch); 684 mac_srs_perm_quiesce(port->lp_mch, B_TRUE); 685 } 686 687 /* 688 * Add all the unicast addresses to the newly added port. 689 */ 690 for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) { 691 if ((err = aggr_port_addmac(port, addr->aua_addr)) != 0) 692 break; 693 } 694 695 for (i = 0; err == 0 && i < hw_rh_cnt; i++) 696 err = aggr_add_pseudo_rx_ring(port, rx_grp, hw_rh[i]); 697 698 if (err != 0) { 699 for (j = 0; j < i; j++) 700 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]); 701 702 for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next) 703 aggr_port_remmac(port, a->aua_addr); 704 705 if (port->lp_hwgh != NULL) { 706 mac_srs_perm_quiesce(port->lp_mch, B_FALSE); 707 mac_rx_client_restart(port->lp_mch); 708 port->lp_hwgh = NULL; 709 } 710 } else { 711 port->lp_rx_grp_added = B_TRUE; 712 } 713 done: 714 mac_perim_exit(pmph); 715 return (err); 716 } 717 718 /* 719 * This function is called by aggr to remove pseudo RX rings over the 720 * HW rings of the underlying port. 721 */ 722 static void 723 aggr_rem_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp) 724 { 725 aggr_grp_t *grp = port->lp_grp; 726 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP]; 727 aggr_unicst_addr_t *addr; 728 mac_group_handle_t hwgh; 729 mac_perim_handle_t pmph; 730 int hw_rh_cnt, i; 731 732 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 733 mac_perim_enter_by_mh(port->lp_mh, &pmph); 734 735 if (!port->lp_rx_grp_added) 736 goto done; 737 738 ASSERT(rx_grp->arg_gh != NULL); 739 hw_rh_cnt = mac_hwrings_get(port->lp_mch, 740 &hwgh, hw_rh, MAC_RING_TYPE_RX); 741 742 /* 743 * If hw_rh_cnt is 0, it means that the underlying port does not 744 * support RX rings. Directly return in this case. 
745 */ 746 for (i = 0; i < hw_rh_cnt; i++) 747 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[i]); 748 749 for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) 750 aggr_port_remmac(port, addr->aua_addr); 751 752 if (port->lp_hwgh != NULL) { 753 port->lp_hwgh = NULL; 754 755 /* 756 * First clear the permanent-quiesced flag of the RX srs then 757 * restart the HW ring and the mac srs on the ring. Note that 758 * the HW ring and associated SRS will soon been removed when 759 * the port is removed from the aggr. 760 */ 761 mac_srs_perm_quiesce(port->lp_mch, B_FALSE); 762 mac_rx_client_restart(port->lp_mch); 763 } 764 765 port->lp_rx_grp_added = B_FALSE; 766 done: 767 mac_perim_exit(pmph); 768 } 769 770 /* 771 * Add a pseudo TX ring for the given HW ring handle. 772 */ 773 static int 774 aggr_add_pseudo_tx_ring(aggr_port_t *port, 775 aggr_pseudo_tx_group_t *tx_grp, mac_ring_handle_t hw_rh, 776 mac_ring_handle_t *pseudo_rh) 777 { 778 aggr_pseudo_tx_ring_t *ring; 779 int err; 780 int i; 781 782 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 783 for (i = 0; i < MAX_RINGS_PER_GROUP; i++) { 784 ring = tx_grp->atg_rings + i; 785 if (!(ring->atr_flags & MAC_PSEUDO_RING_INUSE)) 786 break; 787 } 788 /* 789 * No slot for this new TX ring. 790 */ 791 if (i == MAX_RINGS_PER_GROUP) 792 return (EIO); 793 /* 794 * The following 4 statements needs to be done before 795 * calling mac_group_add_ring(). Otherwise it will 796 * result in an assertion failure in mac_init_ring(). 797 */ 798 ring->atr_flags |= MAC_PSEUDO_RING_INUSE; 799 ring->atr_hw_rh = hw_rh; 800 ring->atr_port = port; 801 tx_grp->atg_ring_cnt++; 802 803 /* 804 * The TX side has no concept of ring groups unlike RX groups. 805 * There is just a single group which stores all the TX rings. 806 * This group will be used to store aggr's pseudo TX rings. 
807 */ 808 if ((err = mac_group_add_ring(tx_grp->atg_gh, i)) != 0) { 809 ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE; 810 ring->atr_hw_rh = NULL; 811 ring->atr_port = NULL; 812 tx_grp->atg_ring_cnt--; 813 } else { 814 *pseudo_rh = mac_find_ring(tx_grp->atg_gh, i); 815 if (hw_rh != NULL) { 816 mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring, 817 mac_find_ring(tx_grp->atg_gh, i)); 818 } 819 } 820 return (err); 821 } 822 823 /* 824 * Remove the pseudo TX ring of the given HW ring handle. 825 */ 826 static void 827 aggr_rem_pseudo_tx_ring(aggr_pseudo_tx_group_t *tx_grp, 828 mac_ring_handle_t pseudo_hw_rh) 829 { 830 aggr_pseudo_tx_ring_t *ring; 831 int i; 832 833 for (i = 0; i < MAX_RINGS_PER_GROUP; i++) { 834 ring = tx_grp->atg_rings + i; 835 if (ring->atr_rh != pseudo_hw_rh) 836 continue; 837 838 ASSERT(ring->atr_flags & MAC_PSEUDO_RING_INUSE); 839 mac_group_rem_ring(tx_grp->atg_gh, pseudo_hw_rh); 840 ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE; 841 mac_hwring_teardown(ring->atr_hw_rh); 842 ring->atr_hw_rh = NULL; 843 ring->atr_port = NULL; 844 tx_grp->atg_ring_cnt--; 845 break; 846 } 847 } 848 849 /* 850 * This function is called to create pseudo rings over hardware rings of 851 * the underlying device. There is a 1:1 mapping between the pseudo TX 852 * rings of the aggr and the hardware rings of the underlying port. 853 */ 854 static int 855 aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp) 856 { 857 aggr_grp_t *grp = port->lp_grp; 858 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP], pseudo_rh; 859 mac_perim_handle_t pmph; 860 int hw_rh_cnt, i = 0, j; 861 int err = 0; 862 863 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 864 mac_perim_enter_by_mh(port->lp_mh, &pmph); 865 866 /* 867 * Get the list the the underlying HW rings. 
868 */ 869 hw_rh_cnt = mac_hwrings_get(port->lp_mch, 870 NULL, hw_rh, MAC_RING_TYPE_TX); 871 872 /* 873 * Even if the underlying NIC does not have TX rings, we 874 * still make a psuedo TX ring for that NIC with NULL as 875 * the ring handle. 876 */ 877 if (hw_rh_cnt == 0) 878 port->lp_tx_ring_cnt = 1; 879 else 880 port->lp_tx_ring_cnt = hw_rh_cnt; 881 882 port->lp_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) * 883 port->lp_tx_ring_cnt), KM_SLEEP); 884 port->lp_pseudo_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) * 885 port->lp_tx_ring_cnt), KM_SLEEP); 886 887 if (hw_rh_cnt == 0) { 888 if ((err = aggr_add_pseudo_tx_ring(port, tx_grp, 889 NULL, &pseudo_rh)) == 0) { 890 port->lp_tx_rings[0] = NULL; 891 port->lp_pseudo_tx_rings[0] = pseudo_rh; 892 } 893 } else { 894 for (i = 0; err == 0 && i < hw_rh_cnt; i++) { 895 err = aggr_add_pseudo_tx_ring(port, 896 tx_grp, hw_rh[i], &pseudo_rh); 897 if (err != 0) 898 break; 899 port->lp_tx_rings[i] = hw_rh[i]; 900 port->lp_pseudo_tx_rings[i] = pseudo_rh; 901 } 902 } 903 904 if (err != 0) { 905 if (hw_rh_cnt != 0) { 906 for (j = 0; j < i; j++) { 907 aggr_rem_pseudo_tx_ring(tx_grp, 908 port->lp_pseudo_tx_rings[j]); 909 } 910 } 911 kmem_free(port->lp_tx_rings, 912 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt)); 913 kmem_free(port->lp_pseudo_tx_rings, 914 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt)); 915 port->lp_tx_ring_cnt = 0; 916 } else { 917 port->lp_tx_grp_added = B_TRUE; 918 port->lp_tx_notify_mh = mac_client_tx_notify(port->lp_mch, 919 aggr_tx_ring_update, port); 920 } 921 mac_perim_exit(pmph); 922 return (err); 923 } 924 925 /* 926 * This function is called by aggr to remove pseudo TX rings over the 927 * HW rings of the underlying port. 
928 */ 929 static void 930 aggr_rem_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp) 931 { 932 aggr_grp_t *grp = port->lp_grp; 933 mac_perim_handle_t pmph; 934 int i; 935 936 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 937 mac_perim_enter_by_mh(port->lp_mh, &pmph); 938 939 if (!port->lp_tx_grp_added) 940 goto done; 941 942 ASSERT(tx_grp->atg_gh != NULL); 943 944 for (i = 0; i < port->lp_tx_ring_cnt; i++) 945 aggr_rem_pseudo_tx_ring(tx_grp, port->lp_pseudo_tx_rings[i]); 946 947 kmem_free(port->lp_tx_rings, 948 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt)); 949 kmem_free(port->lp_pseudo_tx_rings, 950 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt)); 951 952 port->lp_tx_ring_cnt = 0; 953 (void) mac_client_tx_notify(port->lp_mch, NULL, port->lp_tx_notify_mh); 954 port->lp_tx_grp_added = B_FALSE; 955 done: 956 mac_perim_exit(pmph); 957 } 958 959 static int 960 aggr_pseudo_disable_intr(mac_intr_handle_t ih) 961 { 962 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih; 963 return (mac_hwring_disable_intr(rr_ring->arr_hw_rh)); 964 } 965 966 static int 967 aggr_pseudo_enable_intr(mac_intr_handle_t ih) 968 { 969 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih; 970 return (mac_hwring_enable_intr(rr_ring->arr_hw_rh)); 971 } 972 973 static int 974 aggr_pseudo_start_ring(mac_ring_driver_t arg, uint64_t mr_gen) 975 { 976 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg; 977 int err; 978 979 err = mac_hwring_start(rr_ring->arr_hw_rh); 980 if (err == 0) 981 rr_ring->arr_gen = mr_gen; 982 return (err); 983 } 984 985 static void 986 aggr_pseudo_stop_ring(mac_ring_driver_t arg) 987 { 988 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg; 989 mac_hwring_stop(rr_ring->arr_hw_rh); 990 } 991 992 /* 993 * Add one or more ports to an existing link aggregation group. 
994 */ 995 int 996 aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force, 997 laioc_port_t *ports) 998 { 999 int rc, i, nadded = 0; 1000 aggr_grp_t *grp = NULL; 1001 aggr_port_t *port; 1002 boolean_t link_state_changed = B_FALSE; 1003 mac_perim_handle_t mph, pmph; 1004 1005 /* get group corresponding to linkid */ 1006 rw_enter(&aggr_grp_lock, RW_READER); 1007 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1008 (mod_hash_val_t *)&grp) != 0) { 1009 rw_exit(&aggr_grp_lock); 1010 return (ENOENT); 1011 } 1012 AGGR_GRP_REFHOLD(grp); 1013 1014 /* 1015 * Hold the perimeter so that the aggregation won't be destroyed. 1016 */ 1017 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1018 rw_exit(&aggr_grp_lock); 1019 1020 /* add the specified ports to group */ 1021 for (i = 0; i < nports; i++) { 1022 /* add port to group */ 1023 if ((rc = aggr_grp_add_port(grp, ports[i].lp_linkid, 1024 force, &port)) != 0) { 1025 goto bail; 1026 } 1027 ASSERT(port != NULL); 1028 nadded++; 1029 1030 /* check capabilities */ 1031 if (!aggr_grp_capab_check(grp, port) || 1032 !aggr_grp_sdu_check(grp, port) || 1033 !aggr_grp_margin_check(grp, port)) { 1034 rc = ENOTSUP; 1035 goto bail; 1036 } 1037 1038 /* 1039 * Create the pseudo ring for each HW ring of the underlying 1040 * port. 
1041 */ 1042 rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group); 1043 if (rc != 0) 1044 goto bail; 1045 rc = aggr_add_pseudo_rx_group(port, &grp->lg_rx_group); 1046 if (rc != 0) 1047 goto bail; 1048 1049 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1050 1051 /* set LACP mode */ 1052 aggr_port_lacp_set_mode(grp, port); 1053 1054 /* start port if group has already been started */ 1055 if (grp->lg_started) { 1056 rc = aggr_port_start(port); 1057 if (rc != 0) { 1058 mac_perim_exit(pmph); 1059 goto bail; 1060 } 1061 1062 /* 1063 * Turn on the promiscuous mode over the port when it 1064 * is requested to be turned on to receive the 1065 * non-primary address over a port, or the promiscous 1066 * mode is enabled over the aggr. 1067 */ 1068 if (grp->lg_promisc || port->lp_prom_addr != NULL) { 1069 rc = aggr_port_promisc(port, B_TRUE); 1070 if (rc != 0) { 1071 mac_perim_exit(pmph); 1072 goto bail; 1073 } 1074 } 1075 } 1076 mac_perim_exit(pmph); 1077 1078 /* 1079 * Attach each port if necessary. 1080 */ 1081 if (aggr_port_notify_link(grp, port)) 1082 link_state_changed = B_TRUE; 1083 1084 /* 1085 * Initialize the callback functions for this port. 
1086 */ 1087 aggr_port_init_callbacks(port); 1088 } 1089 1090 /* update the MAC address of the constituent ports */ 1091 if (aggr_grp_update_ports_mac(grp)) 1092 link_state_changed = B_TRUE; 1093 1094 if (link_state_changed) 1095 mac_link_update(grp->lg_mh, grp->lg_link_state); 1096 1097 bail: 1098 if (rc != 0) { 1099 /* stop and remove ports that have been added */ 1100 for (i = 0; i < nadded; i++) { 1101 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid); 1102 ASSERT(port != NULL); 1103 if (grp->lg_started) { 1104 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1105 (void) aggr_port_promisc(port, B_FALSE); 1106 aggr_port_stop(port); 1107 mac_perim_exit(pmph); 1108 } 1109 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group); 1110 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group); 1111 (void) aggr_grp_rem_port(grp, port, NULL, NULL); 1112 } 1113 } 1114 1115 mac_perim_exit(mph); 1116 AGGR_GRP_REFRELE(grp); 1117 return (rc); 1118 } 1119 1120 static int 1121 aggr_grp_modify_common(aggr_grp_t *grp, uint8_t update_mask, uint32_t policy, 1122 boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, 1123 aggr_lacp_timer_t lacp_timer) 1124 { 1125 boolean_t mac_addr_changed = B_FALSE; 1126 boolean_t link_state_changed = B_FALSE; 1127 mac_perim_handle_t pmph; 1128 1129 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 1130 1131 /* validate fixed address if specified */ 1132 if ((update_mask & AGGR_MODIFY_MAC) && mac_fixed && 1133 ((bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) || 1134 (mac_addr[0] & 0x01))) { 1135 return (EINVAL); 1136 } 1137 1138 /* update policy if requested */ 1139 if (update_mask & AGGR_MODIFY_POLICY) 1140 aggr_send_update_policy(grp, policy); 1141 1142 /* update unicast MAC address if requested */ 1143 if (update_mask & AGGR_MODIFY_MAC) { 1144 if (mac_fixed) { 1145 /* user-supplied MAC address */ 1146 grp->lg_mac_addr_port = NULL; 1147 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) != 0) { 1148 bcopy(mac_addr, grp->lg_addr, ETHERADDRL); 1149 
mac_addr_changed = B_TRUE; 1150 } 1151 } else if (grp->lg_addr_fixed) { 1152 /* switch from user-supplied to automatic */ 1153 aggr_port_t *port = grp->lg_ports; 1154 1155 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1156 bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL); 1157 grp->lg_mac_addr_port = port; 1158 mac_addr_changed = B_TRUE; 1159 mac_perim_exit(pmph); 1160 } 1161 grp->lg_addr_fixed = mac_fixed; 1162 } 1163 1164 if (mac_addr_changed) 1165 link_state_changed = aggr_grp_update_ports_mac(grp); 1166 1167 if (update_mask & AGGR_MODIFY_LACP_MODE) 1168 aggr_lacp_update_mode(grp, lacp_mode); 1169 1170 if (update_mask & AGGR_MODIFY_LACP_TIMER) 1171 aggr_lacp_update_timer(grp, lacp_timer); 1172 1173 if (link_state_changed) 1174 mac_link_update(grp->lg_mh, grp->lg_link_state); 1175 1176 if (mac_addr_changed) 1177 mac_unicst_update(grp->lg_mh, grp->lg_addr); 1178 1179 return (0); 1180 } 1181 1182 /* 1183 * Update properties of an existing link aggregation group. 1184 */ 1185 int 1186 aggr_grp_modify(datalink_id_t linkid, uint8_t update_mask, uint32_t policy, 1187 boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, 1188 aggr_lacp_timer_t lacp_timer) 1189 { 1190 aggr_grp_t *grp = NULL; 1191 mac_perim_handle_t mph; 1192 int err; 1193 1194 /* get group corresponding to linkid */ 1195 rw_enter(&aggr_grp_lock, RW_READER); 1196 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1197 (mod_hash_val_t *)&grp) != 0) { 1198 rw_exit(&aggr_grp_lock); 1199 return (ENOENT); 1200 } 1201 AGGR_GRP_REFHOLD(grp); 1202 1203 /* 1204 * Hold the perimeter so that the aggregation won't be destroyed. 1205 */ 1206 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1207 rw_exit(&aggr_grp_lock); 1208 1209 err = aggr_grp_modify_common(grp, update_mask, policy, mac_fixed, 1210 mac_addr, lacp_mode, lacp_timer); 1211 1212 mac_perim_exit(mph); 1213 AGGR_GRP_REFRELE(grp); 1214 return (err); 1215 } 1216 1217 /* 1218 * Create a new link aggregation group upon request from administrator. 
1219 * Returns 0 on success, an errno on failure. 1220 */ 1221 int 1222 aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports, 1223 laioc_port_t *ports, uint32_t policy, boolean_t mac_fixed, boolean_t force, 1224 uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, aggr_lacp_timer_t lacp_timer, 1225 cred_t *credp) 1226 { 1227 aggr_grp_t *grp = NULL; 1228 aggr_port_t *port; 1229 mac_register_t *mac; 1230 boolean_t link_state_changed; 1231 mac_perim_handle_t mph; 1232 int err; 1233 int i; 1234 kt_did_t tid = 0; 1235 1236 /* need at least one port */ 1237 if (nports == 0) 1238 return (EINVAL); 1239 1240 rw_enter(&aggr_grp_lock, RW_WRITER); 1241 1242 /* does a group with the same linkid already exist? */ 1243 err = mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1244 (mod_hash_val_t *)&grp); 1245 if (err == 0) { 1246 rw_exit(&aggr_grp_lock); 1247 return (EEXIST); 1248 } 1249 1250 grp = kmem_cache_alloc(aggr_grp_cache, KM_SLEEP); 1251 1252 grp->lg_refs = 1; 1253 grp->lg_closing = B_FALSE; 1254 grp->lg_force = force; 1255 grp->lg_linkid = linkid; 1256 grp->lg_zoneid = crgetzoneid(credp); 1257 grp->lg_ifspeed = 0; 1258 grp->lg_link_state = LINK_STATE_UNKNOWN; 1259 grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN; 1260 grp->lg_started = B_FALSE; 1261 grp->lg_promisc = B_FALSE; 1262 grp->lg_lacp_done = B_FALSE; 1263 grp->lg_tx_notify_done = B_FALSE; 1264 grp->lg_lacp_head = grp->lg_lacp_tail = NULL; 1265 grp->lg_lacp_rx_thread = thread_create(NULL, 0, 1266 aggr_lacp_rx_thread, grp, 0, &p0, TS_RUN, minclsyspri); 1267 grp->lg_tx_notify_thread = thread_create(NULL, 0, 1268 aggr_tx_notify_thread, grp, 0, &p0, TS_RUN, minclsyspri); 1269 grp->lg_tx_blocked_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) * 1270 MAX_RINGS_PER_GROUP), KM_SLEEP); 1271 grp->lg_tx_blocked_cnt = 0; 1272 bzero(&grp->lg_rx_group, sizeof (aggr_pseudo_rx_group_t)); 1273 bzero(&grp->lg_tx_group, sizeof (aggr_pseudo_tx_group_t)); 1274 aggr_lacp_init_grp(grp); 1275 1276 /* add MAC ports to group */ 1277 
grp->lg_ports = NULL; 1278 grp->lg_nports = 0; 1279 grp->lg_nattached_ports = 0; 1280 grp->lg_ntx_ports = 0; 1281 1282 /* 1283 * If key is not specified by the user, allocate the key. 1284 */ 1285 if ((key == 0) && ((key = (uint32_t)id_alloc(key_ids)) == 0)) { 1286 err = ENOMEM; 1287 goto bail; 1288 } 1289 grp->lg_key = key; 1290 1291 for (i = 0; i < nports; i++) { 1292 err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, NULL); 1293 if (err != 0) 1294 goto bail; 1295 } 1296 1297 /* 1298 * If no explicit MAC address was specified by the administrator, 1299 * set it to the MAC address of the first port. 1300 */ 1301 grp->lg_addr_fixed = mac_fixed; 1302 if (grp->lg_addr_fixed) { 1303 /* validate specified address */ 1304 if (bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) { 1305 err = EINVAL; 1306 goto bail; 1307 } 1308 bcopy(mac_addr, grp->lg_addr, ETHERADDRL); 1309 } else { 1310 bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL); 1311 grp->lg_mac_addr_port = grp->lg_ports; 1312 } 1313 1314 /* set the initial group capabilities */ 1315 aggr_grp_capab_set(grp); 1316 1317 if ((mac = mac_alloc(MAC_VERSION)) == NULL) { 1318 err = ENOMEM; 1319 goto bail; 1320 } 1321 mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER; 1322 mac->m_driver = grp; 1323 mac->m_dip = aggr_dip; 1324 mac->m_instance = grp->lg_key > AGGR_MAX_KEY ? 
(uint_t)-1 : grp->lg_key; 1325 mac->m_src_addr = grp->lg_addr; 1326 mac->m_callbacks = &aggr_m_callbacks; 1327 mac->m_min_sdu = 0; 1328 mac->m_max_sdu = grp->lg_max_sdu = aggr_grp_max_sdu(grp); 1329 mac->m_margin = aggr_grp_max_margin(grp); 1330 mac->m_v12n = MAC_VIRT_LEVEL1; 1331 err = mac_register(mac, &grp->lg_mh); 1332 mac_free(mac); 1333 if (err != 0) 1334 goto bail; 1335 1336 err = dls_devnet_create(grp->lg_mh, grp->lg_linkid, crgetzoneid(credp)); 1337 if (err != 0) { 1338 (void) mac_unregister(grp->lg_mh); 1339 grp->lg_mh = NULL; 1340 goto bail; 1341 } 1342 1343 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1344 1345 /* 1346 * Update the MAC address of the constituent ports. 1347 * None of the port is attached at this time, the link state of the 1348 * aggregation will not change. 1349 */ 1350 link_state_changed = aggr_grp_update_ports_mac(grp); 1351 ASSERT(!link_state_changed); 1352 1353 /* update outbound load balancing policy */ 1354 aggr_send_update_policy(grp, policy); 1355 1356 /* set LACP mode */ 1357 aggr_lacp_set_mode(grp, lacp_mode, lacp_timer); 1358 1359 /* 1360 * Attach each port if necessary. 1361 */ 1362 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 1363 /* 1364 * Create the pseudo ring for each HW ring of the underlying 1365 * port. Note that this is done after the aggr registers the 1366 * mac. 1367 */ 1368 VERIFY(aggr_add_pseudo_tx_group(port, &grp->lg_tx_group) == 0); 1369 VERIFY(aggr_add_pseudo_rx_group(port, &grp->lg_rx_group) == 0); 1370 if (aggr_port_notify_link(grp, port)) 1371 link_state_changed = B_TRUE; 1372 1373 /* 1374 * Initialize the callback functions for this port. 
1375 */ 1376 aggr_port_init_callbacks(port); 1377 } 1378 1379 if (link_state_changed) 1380 mac_link_update(grp->lg_mh, grp->lg_link_state); 1381 1382 /* add new group to hash table */ 1383 err = mod_hash_insert(aggr_grp_hash, GRP_HASH_KEY(linkid), 1384 (mod_hash_val_t)grp); 1385 ASSERT(err == 0); 1386 aggr_grp_cnt++; 1387 1388 mac_perim_exit(mph); 1389 rw_exit(&aggr_grp_lock); 1390 return (0); 1391 1392 bail: 1393 1394 grp->lg_closing = B_TRUE; 1395 1396 port = grp->lg_ports; 1397 while (port != NULL) { 1398 aggr_port_t *cport; 1399 1400 cport = port->lp_next; 1401 aggr_port_delete(port); 1402 port = cport; 1403 } 1404 1405 /* 1406 * Inform the lacp_rx thread to exit. 1407 */ 1408 mutex_enter(&grp->lg_lacp_lock); 1409 grp->lg_lacp_done = B_TRUE; 1410 cv_signal(&grp->lg_lacp_cv); 1411 while (grp->lg_lacp_rx_thread != NULL) 1412 cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock); 1413 mutex_exit(&grp->lg_lacp_lock); 1414 /* 1415 * Inform the tx_notify thread to exit. 1416 */ 1417 mutex_enter(&grp->lg_tx_flowctl_lock); 1418 if (grp->lg_tx_notify_thread != NULL) { 1419 tid = grp->lg_tx_notify_thread->t_did; 1420 grp->lg_tx_notify_done = B_TRUE; 1421 cv_signal(&grp->lg_tx_flowctl_cv); 1422 } 1423 mutex_exit(&grp->lg_tx_flowctl_lock); 1424 if (tid != 0) 1425 thread_join(tid); 1426 1427 kmem_free(grp->lg_tx_blocked_rings, 1428 (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP)); 1429 rw_exit(&aggr_grp_lock); 1430 AGGR_GRP_REFRELE(grp); 1431 return (err); 1432 } 1433 1434 /* 1435 * Return a pointer to the member of a group with specified linkid. 1436 */ 1437 static aggr_port_t * 1438 aggr_grp_port_lookup(aggr_grp_t *grp, datalink_id_t linkid) 1439 { 1440 aggr_port_t *port; 1441 1442 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 1443 1444 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 1445 if (port->lp_linkid == linkid) 1446 break; 1447 } 1448 1449 return (port); 1450 } 1451 1452 /* 1453 * Stop, detach and remove a port from a link aggregation group. 
1454 */ 1455 static int 1456 aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port, 1457 boolean_t *mac_addr_changedp, boolean_t *link_state_changedp) 1458 { 1459 int rc = 0; 1460 aggr_port_t **pport; 1461 boolean_t mac_addr_changed = B_FALSE; 1462 boolean_t link_state_changed = B_FALSE; 1463 mac_perim_handle_t mph; 1464 uint64_t val; 1465 uint_t i; 1466 uint_t stat; 1467 1468 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 1469 ASSERT(grp->lg_nports > 1); 1470 ASSERT(!grp->lg_closing); 1471 1472 /* unlink port */ 1473 for (pport = &grp->lg_ports; *pport != port; 1474 pport = &(*pport)->lp_next) { 1475 if (*pport == NULL) { 1476 rc = ENOENT; 1477 goto done; 1478 } 1479 } 1480 *pport = port->lp_next; 1481 1482 mac_perim_enter_by_mh(port->lp_mh, &mph); 1483 1484 /* 1485 * If the MAC address of the port being removed was assigned 1486 * to the group, update the group MAC address 1487 * using the MAC address of a different port. 1488 */ 1489 if (!grp->lg_addr_fixed && grp->lg_mac_addr_port == port) { 1490 /* 1491 * Set the MAC address of the group to the 1492 * MAC address of its first port. 1493 */ 1494 bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL); 1495 grp->lg_mac_addr_port = grp->lg_ports; 1496 mac_addr_changed = B_TRUE; 1497 } 1498 1499 link_state_changed = aggr_grp_detach_port(grp, port); 1500 1501 /* 1502 * Add the counter statistics of the ports while it was aggregated 1503 * to the group's residual statistics. This is done by obtaining 1504 * the current counter from the underlying MAC then subtracting the 1505 * value of the counter at the moment it was added to the 1506 * aggregation. 
1507 */ 1508 for (i = 0; i < MAC_NSTAT; i++) { 1509 stat = i + MAC_STAT_MIN; 1510 if (!MAC_STAT_ISACOUNTER(stat)) 1511 continue; 1512 val = aggr_port_stat(port, stat); 1513 val -= port->lp_stat[i]; 1514 grp->lg_stat[i] += val; 1515 } 1516 for (i = 0; i < ETHER_NSTAT; i++) { 1517 stat = i + MACTYPE_STAT_MIN; 1518 if (!ETHER_STAT_ISACOUNTER(stat)) 1519 continue; 1520 val = aggr_port_stat(port, stat); 1521 val -= port->lp_ether_stat[i]; 1522 grp->lg_ether_stat[i] += val; 1523 } 1524 1525 grp->lg_nports--; 1526 mac_perim_exit(mph); 1527 1528 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group); 1529 aggr_port_delete(port); 1530 1531 /* 1532 * If the group MAC address has changed, update the MAC address of 1533 * the remaining constituent ports according to the new MAC 1534 * address of the group. 1535 */ 1536 if (mac_addr_changed && aggr_grp_update_ports_mac(grp)) 1537 link_state_changed = B_TRUE; 1538 1539 done: 1540 if (mac_addr_changedp != NULL) 1541 *mac_addr_changedp = mac_addr_changed; 1542 if (link_state_changedp != NULL) 1543 *link_state_changedp = link_state_changed; 1544 1545 return (rc); 1546 } 1547 1548 /* 1549 * Remove one or more ports from an existing link aggregation group. 1550 */ 1551 int 1552 aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports) 1553 { 1554 int rc = 0, i; 1555 aggr_grp_t *grp = NULL; 1556 aggr_port_t *port; 1557 boolean_t mac_addr_update = B_FALSE, mac_addr_changed; 1558 boolean_t link_state_update = B_FALSE, link_state_changed; 1559 mac_perim_handle_t mph, pmph; 1560 1561 /* get group corresponding to linkid */ 1562 rw_enter(&aggr_grp_lock, RW_READER); 1563 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1564 (mod_hash_val_t *)&grp) != 0) { 1565 rw_exit(&aggr_grp_lock); 1566 return (ENOENT); 1567 } 1568 AGGR_GRP_REFHOLD(grp); 1569 1570 /* 1571 * Hold the perimeter so that the aggregation won't be destroyed. 
1572 */ 1573 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1574 rw_exit(&aggr_grp_lock); 1575 1576 /* we need to keep at least one port per group */ 1577 if (nports >= grp->lg_nports) { 1578 rc = EINVAL; 1579 goto bail; 1580 } 1581 1582 /* first verify that all the groups are valid */ 1583 for (i = 0; i < nports; i++) { 1584 if (aggr_grp_port_lookup(grp, ports[i].lp_linkid) == NULL) { 1585 /* port not found */ 1586 rc = ENOENT; 1587 goto bail; 1588 } 1589 } 1590 1591 /* clear the promiscous mode for the specified ports */ 1592 for (i = 0; i < nports && rc == 0; i++) { 1593 /* lookup port */ 1594 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid); 1595 ASSERT(port != NULL); 1596 1597 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1598 rc = aggr_port_promisc(port, B_FALSE); 1599 mac_perim_exit(pmph); 1600 } 1601 if (rc != 0) { 1602 for (i = 0; i < nports; i++) { 1603 port = aggr_grp_port_lookup(grp, 1604 ports[i].lp_linkid); 1605 ASSERT(port != NULL); 1606 1607 /* 1608 * Turn the promiscuous mode back on if it is required 1609 * to receive the non-primary address over a port, or 1610 * the promiscous mode is enabled over the aggr. 1611 */ 1612 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1613 if (port->lp_started && (grp->lg_promisc || 1614 port->lp_prom_addr != NULL)) { 1615 (void) aggr_port_promisc(port, B_TRUE); 1616 } 1617 mac_perim_exit(pmph); 1618 } 1619 goto bail; 1620 } 1621 1622 /* remove the specified ports from group */ 1623 for (i = 0; i < nports; i++) { 1624 /* lookup port */ 1625 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid); 1626 ASSERT(port != NULL); 1627 1628 /* stop port if group has already been started */ 1629 if (grp->lg_started) { 1630 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1631 aggr_port_stop(port); 1632 mac_perim_exit(pmph); 1633 } 1634 1635 /* 1636 * aggr_rem_pseudo_tx_group() is not called here. Instead 1637 * it is called from inside aggr_grp_rem_port() after the 1638 * port has been detached. 
The reason is that 1639 * aggr_rem_pseudo_tx_group() removes one ring at a time 1640 * and if there is still traffic going on, then there 1641 * is the possibility of aggr_find_tx_ring() returning a 1642 * removed ring for transmission. Once the port has been 1643 * detached, that port will not be used and 1644 * aggr_find_tx_ring() will not return any rings 1645 * belonging to it. 1646 */ 1647 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group); 1648 1649 /* remove port from group */ 1650 rc = aggr_grp_rem_port(grp, port, &mac_addr_changed, 1651 &link_state_changed); 1652 ASSERT(rc == 0); 1653 mac_addr_update = mac_addr_update || mac_addr_changed; 1654 link_state_update = link_state_update || link_state_changed; 1655 } 1656 1657 bail: 1658 if (mac_addr_update) 1659 mac_unicst_update(grp->lg_mh, grp->lg_addr); 1660 if (link_state_update) 1661 mac_link_update(grp->lg_mh, grp->lg_link_state); 1662 1663 mac_perim_exit(mph); 1664 AGGR_GRP_REFRELE(grp); 1665 1666 return (rc); 1667 } 1668 1669 int 1670 aggr_grp_delete(datalink_id_t linkid, cred_t *cred) 1671 { 1672 aggr_grp_t *grp = NULL; 1673 aggr_port_t *port, *cport; 1674 datalink_id_t tmpid; 1675 mod_hash_val_t val; 1676 mac_perim_handle_t mph, pmph; 1677 int err; 1678 kt_did_t tid = 0; 1679 1680 rw_enter(&aggr_grp_lock, RW_WRITER); 1681 1682 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1683 (mod_hash_val_t *)&grp) != 0) { 1684 rw_exit(&aggr_grp_lock); 1685 return (ENOENT); 1686 } 1687 1688 /* 1689 * Note that dls_devnet_destroy() must be called before lg_lock is 1690 * held. Otherwise, it will deadlock if another thread is in 1691 * aggr_m_stat() and thus has a kstat_hold() on the kstats that 1692 * dls_devnet_destroy() needs to delete. 1693 */ 1694 if ((err = dls_devnet_destroy(grp->lg_mh, &tmpid, B_TRUE)) != 0) { 1695 rw_exit(&aggr_grp_lock); 1696 return (err); 1697 } 1698 ASSERT(linkid == tmpid); 1699 1700 /* 1701 * Unregister from the MAC service module. 
Since this can 1702 * fail if a client hasn't closed the MAC port, we gracefully 1703 * fail the operation. 1704 */ 1705 if ((err = mac_disable(grp->lg_mh)) != 0) { 1706 (void) dls_devnet_create(grp->lg_mh, linkid, crgetzoneid(cred)); 1707 rw_exit(&aggr_grp_lock); 1708 return (err); 1709 } 1710 (void) mod_hash_remove(aggr_grp_hash, GRP_HASH_KEY(linkid), &val); 1711 ASSERT(grp == (aggr_grp_t *)val); 1712 1713 ASSERT(aggr_grp_cnt > 0); 1714 aggr_grp_cnt--; 1715 rw_exit(&aggr_grp_lock); 1716 1717 /* 1718 * Inform the lacp_rx thread to exit. 1719 */ 1720 mutex_enter(&grp->lg_lacp_lock); 1721 grp->lg_lacp_done = B_TRUE; 1722 cv_signal(&grp->lg_lacp_cv); 1723 while (grp->lg_lacp_rx_thread != NULL) 1724 cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock); 1725 mutex_exit(&grp->lg_lacp_lock); 1726 /* 1727 * Inform the tx_notify_thread to exit. 1728 */ 1729 mutex_enter(&grp->lg_tx_flowctl_lock); 1730 if (grp->lg_tx_notify_thread != NULL) { 1731 tid = grp->lg_tx_notify_thread->t_did; 1732 grp->lg_tx_notify_done = B_TRUE; 1733 cv_signal(&grp->lg_tx_flowctl_cv); 1734 } 1735 mutex_exit(&grp->lg_tx_flowctl_lock); 1736 if (tid != 0) 1737 thread_join(tid); 1738 1739 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1740 1741 grp->lg_closing = B_TRUE; 1742 /* detach and free MAC ports associated with group */ 1743 port = grp->lg_ports; 1744 while (port != NULL) { 1745 cport = port->lp_next; 1746 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1747 if (grp->lg_started) 1748 aggr_port_stop(port); 1749 (void) aggr_grp_detach_port(grp, port); 1750 mac_perim_exit(pmph); 1751 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group); 1752 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group); 1753 aggr_port_delete(port); 1754 port = cport; 1755 } 1756 1757 mac_perim_exit(mph); 1758 1759 kmem_free(grp->lg_tx_blocked_rings, 1760 (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP)); 1761 /* 1762 * Wait for the port's lacp timer thread and its notification callback 1763 * to exit before calling mac_unregister() since both 
needs to access 1764 * the mac perimeter of the grp. 1765 */ 1766 aggr_grp_port_wait(grp); 1767 1768 VERIFY(mac_unregister(grp->lg_mh) == 0); 1769 grp->lg_mh = NULL; 1770 1771 AGGR_GRP_REFRELE(grp); 1772 return (0); 1773 } 1774 1775 void 1776 aggr_grp_free(aggr_grp_t *grp) 1777 { 1778 ASSERT(grp->lg_refs == 0); 1779 ASSERT(grp->lg_port_ref == 0); 1780 if (grp->lg_key > AGGR_MAX_KEY) { 1781 id_free(key_ids, grp->lg_key); 1782 grp->lg_key = 0; 1783 } 1784 kmem_cache_free(aggr_grp_cache, grp); 1785 } 1786 1787 int 1788 aggr_grp_info(datalink_id_t linkid, void *fn_arg, 1789 aggr_grp_info_new_grp_fn_t new_grp_fn, 1790 aggr_grp_info_new_port_fn_t new_port_fn, cred_t *cred) 1791 { 1792 aggr_grp_t *grp; 1793 aggr_port_t *port; 1794 mac_perim_handle_t mph, pmph; 1795 int rc = 0; 1796 1797 /* 1798 * Make sure that the aggregation link is visible from the caller's 1799 * zone. 1800 */ 1801 if (!dls_devnet_islinkvisible(linkid, crgetzoneid(cred))) 1802 return (ENOENT); 1803 1804 rw_enter(&aggr_grp_lock, RW_READER); 1805 1806 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1807 (mod_hash_val_t *)&grp) != 0) { 1808 rw_exit(&aggr_grp_lock); 1809 return (ENOENT); 1810 } 1811 AGGR_GRP_REFHOLD(grp); 1812 1813 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1814 rw_exit(&aggr_grp_lock); 1815 1816 rc = new_grp_fn(fn_arg, grp->lg_linkid, 1817 (grp->lg_key > AGGR_MAX_KEY) ? 
0 : grp->lg_key, grp->lg_addr, 1818 grp->lg_addr_fixed, grp->lg_force, grp->lg_tx_policy, 1819 grp->lg_nports, grp->lg_lacp_mode, grp->aggr.PeriodicTimer); 1820 1821 if (rc != 0) 1822 goto bail; 1823 1824 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 1825 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1826 rc = new_port_fn(fn_arg, port->lp_linkid, port->lp_addr, 1827 port->lp_state, &port->lp_lacp.ActorOperPortState); 1828 mac_perim_exit(pmph); 1829 1830 if (rc != 0) 1831 goto bail; 1832 } 1833 1834 bail: 1835 mac_perim_exit(mph); 1836 AGGR_GRP_REFRELE(grp); 1837 return (rc); 1838 } 1839 1840 /*ARGSUSED*/ 1841 static void 1842 aggr_m_ioctl(void *arg, queue_t *q, mblk_t *mp) 1843 { 1844 miocnak(q, mp, 0, ENOTSUP); 1845 } 1846 1847 static int 1848 aggr_grp_stat(aggr_grp_t *grp, uint_t stat, uint64_t *val) 1849 { 1850 aggr_port_t *port; 1851 uint_t stat_index; 1852 1853 /* We only aggregate counter statistics. */ 1854 if (IS_MAC_STAT(stat) && !MAC_STAT_ISACOUNTER(stat) || 1855 IS_MACTYPE_STAT(stat) && !ETHER_STAT_ISACOUNTER(stat)) { 1856 return (ENOTSUP); 1857 } 1858 1859 /* 1860 * Counter statistics for a group are computed by aggregating the 1861 * counters of the members MACs while they were aggregated, plus 1862 * the residual counter of the group itself, which is updated each 1863 * time a MAC is removed from the group. 1864 */ 1865 *val = 0; 1866 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 1867 /* actual port statistic */ 1868 *val += aggr_port_stat(port, stat); 1869 /* 1870 * minus the port stat when it was added, plus any residual 1871 * amount for the group. 
1872 */ 1873 if (IS_MAC_STAT(stat)) { 1874 stat_index = stat - MAC_STAT_MIN; 1875 *val -= port->lp_stat[stat_index]; 1876 *val += grp->lg_stat[stat_index]; 1877 } else if (IS_MACTYPE_STAT(stat)) { 1878 stat_index = stat - MACTYPE_STAT_MIN; 1879 *val -= port->lp_ether_stat[stat_index]; 1880 *val += grp->lg_ether_stat[stat_index]; 1881 } 1882 } 1883 return (0); 1884 } 1885 1886 int 1887 aggr_rx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val) 1888 { 1889 aggr_pseudo_rx_ring_t *rx_ring = (aggr_pseudo_rx_ring_t *)rdriver; 1890 1891 if (rx_ring->arr_hw_rh != NULL) { 1892 *val = mac_pseudo_rx_ring_stat_get(rx_ring->arr_hw_rh, stat); 1893 } else { 1894 aggr_port_t *port = rx_ring->arr_port; 1895 1896 *val = mac_stat_get(port->lp_mh, stat); 1897 1898 } 1899 return (0); 1900 } 1901 1902 int 1903 aggr_tx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val) 1904 { 1905 aggr_pseudo_tx_ring_t *tx_ring = (aggr_pseudo_tx_ring_t *)rdriver; 1906 1907 if (tx_ring->atr_hw_rh != NULL) { 1908 *val = mac_pseudo_tx_ring_stat_get(tx_ring->atr_hw_rh, stat); 1909 } else { 1910 aggr_port_t *port = tx_ring->atr_port; 1911 1912 *val = mac_stat_get(port->lp_mh, stat); 1913 } 1914 return (0); 1915 } 1916 1917 static int 1918 aggr_m_stat(void *arg, uint_t stat, uint64_t *val) 1919 { 1920 aggr_grp_t *grp = arg; 1921 mac_perim_handle_t mph; 1922 int rval = 0; 1923 1924 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1925 1926 switch (stat) { 1927 case MAC_STAT_IFSPEED: 1928 *val = grp->lg_ifspeed; 1929 break; 1930 1931 case ETHER_STAT_LINK_DUPLEX: 1932 *val = grp->lg_link_duplex; 1933 break; 1934 1935 default: 1936 /* 1937 * For all other statistics, we return the aggregated stat 1938 * from the underlying ports. aggr_grp_stat() will set 1939 * rval appropriately if the statistic isn't a counter. 
1940 */ 1941 rval = aggr_grp_stat(grp, stat, val); 1942 } 1943 1944 mac_perim_exit(mph); 1945 return (rval); 1946 } 1947 1948 static int 1949 aggr_m_start(void *arg) 1950 { 1951 aggr_grp_t *grp = arg; 1952 aggr_port_t *port; 1953 mac_perim_handle_t mph, pmph; 1954 1955 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1956 1957 /* 1958 * Attempts to start all configured members of the group. 1959 * Group members will be attached when their link-up notification 1960 * is received. 1961 */ 1962 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 1963 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1964 if (aggr_port_start(port) != 0) { 1965 mac_perim_exit(pmph); 1966 continue; 1967 } 1968 1969 /* 1970 * Turn on the promiscuous mode if it is required to receive 1971 * the non-primary address over a port, or the promiscous 1972 * mode is enabled over the aggr. 1973 */ 1974 if (grp->lg_promisc || port->lp_prom_addr != NULL) { 1975 if (aggr_port_promisc(port, B_TRUE) != 0) 1976 aggr_port_stop(port); 1977 } 1978 mac_perim_exit(pmph); 1979 } 1980 1981 grp->lg_started = B_TRUE; 1982 1983 mac_perim_exit(mph); 1984 return (0); 1985 } 1986 1987 static void 1988 aggr_m_stop(void *arg) 1989 { 1990 aggr_grp_t *grp = arg; 1991 aggr_port_t *port; 1992 mac_perim_handle_t mph, pmph; 1993 1994 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1995 1996 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 1997 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1998 1999 /* reset port promiscuous mode */ 2000 (void) aggr_port_promisc(port, B_FALSE); 2001 2002 aggr_port_stop(port); 2003 mac_perim_exit(pmph); 2004 } 2005 2006 grp->lg_started = B_FALSE; 2007 mac_perim_exit(mph); 2008 } 2009 2010 static int 2011 aggr_m_promisc(void *arg, boolean_t on) 2012 { 2013 aggr_grp_t *grp = arg; 2014 aggr_port_t *port; 2015 boolean_t link_state_changed = B_FALSE; 2016 mac_perim_handle_t mph, pmph; 2017 2018 AGGR_GRP_REFHOLD(grp); 2019 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2020 2021 
ASSERT(!grp->lg_closing); 2022 2023 if (on == grp->lg_promisc) 2024 goto bail; 2025 2026 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2027 int err = 0; 2028 2029 mac_perim_enter_by_mh(port->lp_mh, &pmph); 2030 AGGR_PORT_REFHOLD(port); 2031 if (!on && (port->lp_prom_addr == NULL)) 2032 err = aggr_port_promisc(port, B_FALSE); 2033 else if (on && port->lp_started) 2034 err = aggr_port_promisc(port, B_TRUE); 2035 2036 if (err != 0) { 2037 if (aggr_grp_detach_port(grp, port)) 2038 link_state_changed = B_TRUE; 2039 } else { 2040 /* 2041 * If a port was detached because of a previous 2042 * failure changing the promiscuity, the port 2043 * is reattached when it successfully changes 2044 * the promiscuity now, and this might cause 2045 * the link state of the aggregation to change. 2046 */ 2047 if (aggr_grp_attach_port(grp, port)) 2048 link_state_changed = B_TRUE; 2049 } 2050 mac_perim_exit(pmph); 2051 AGGR_PORT_REFRELE(port); 2052 } 2053 2054 grp->lg_promisc = on; 2055 2056 if (link_state_changed) 2057 mac_link_update(grp->lg_mh, grp->lg_link_state); 2058 2059 bail: 2060 mac_perim_exit(mph); 2061 AGGR_GRP_REFRELE(grp); 2062 2063 return (0); 2064 } 2065 2066 static void 2067 aggr_grp_port_rename(const char *new_name, void *arg) 2068 { 2069 /* 2070 * aggr port's mac client name is the format of "aggr link name" plus 2071 * AGGR_PORT_NAME_DELIMIT plus "underneath link name". 
2072 */ 2073 int aggr_len, link_len, clnt_name_len, i; 2074 char *str_end, *str_st, *str_del; 2075 char aggr_name[MAXNAMELEN]; 2076 char link_name[MAXNAMELEN]; 2077 char *clnt_name; 2078 aggr_grp_t *aggr_grp = arg; 2079 aggr_port_t *aggr_port = aggr_grp->lg_ports; 2080 2081 for (i = 0; i < aggr_grp->lg_nports; i++) { 2082 clnt_name = mac_client_name(aggr_port->lp_mch); 2083 clnt_name_len = strlen(clnt_name); 2084 str_st = clnt_name; 2085 str_end = &(clnt_name[clnt_name_len]); 2086 str_del = strchr(str_st, AGGR_PORT_NAME_DELIMIT); 2087 ASSERT(str_del != NULL); 2088 aggr_len = (intptr_t)((uintptr_t)str_del - (uintptr_t)str_st); 2089 link_len = (intptr_t)((uintptr_t)str_end - (uintptr_t)str_del); 2090 bzero(aggr_name, MAXNAMELEN); 2091 bzero(link_name, MAXNAMELEN); 2092 bcopy(clnt_name, aggr_name, aggr_len); 2093 bcopy(str_del, link_name, link_len + 1); 2094 bzero(clnt_name, MAXNAMELEN); 2095 (void) snprintf(clnt_name, MAXNAMELEN, "%s%s", new_name, 2096 link_name); 2097 2098 (void) mac_rename_primary(aggr_port->lp_mh, NULL); 2099 aggr_port = aggr_port->lp_next; 2100 } 2101 } 2102 2103 /* 2104 * Initialize the capabilities that are advertised for the group 2105 * according to the capabilities of the constituent ports. 
2106 */ 2107 static boolean_t 2108 aggr_m_capab_get(void *arg, mac_capab_t cap, void *cap_data) 2109 { 2110 aggr_grp_t *grp = arg; 2111 2112 switch (cap) { 2113 case MAC_CAPAB_HCKSUM: { 2114 uint32_t *hcksum_txflags = cap_data; 2115 *hcksum_txflags = grp->lg_hcksum_txflags; 2116 break; 2117 } 2118 case MAC_CAPAB_LSO: { 2119 mac_capab_lso_t *cap_lso = cap_data; 2120 2121 if (grp->lg_lso) { 2122 *cap_lso = grp->lg_cap_lso; 2123 break; 2124 } else { 2125 return (B_FALSE); 2126 } 2127 } 2128 case MAC_CAPAB_NO_NATIVEVLAN: 2129 return (!grp->lg_vlan); 2130 case MAC_CAPAB_NO_ZCOPY: 2131 return (!grp->lg_zcopy); 2132 case MAC_CAPAB_RINGS: { 2133 mac_capab_rings_t *cap_rings = cap_data; 2134 2135 if (cap_rings->mr_type == MAC_RING_TYPE_RX) { 2136 cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; 2137 cap_rings->mr_rnum = grp->lg_rx_group.arg_ring_cnt; 2138 2139 /* 2140 * An aggregation advertises only one (pseudo) RX 2141 * group, which virtualizes the main/primary group of 2142 * the underlying devices. 2143 */ 2144 cap_rings->mr_gnum = 1; 2145 cap_rings->mr_gaddring = NULL; 2146 cap_rings->mr_gremring = NULL; 2147 } else { 2148 cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; 2149 cap_rings->mr_rnum = grp->lg_tx_group.atg_ring_cnt; 2150 cap_rings->mr_gnum = 0; 2151 } 2152 cap_rings->mr_rget = aggr_fill_ring; 2153 cap_rings->mr_gget = aggr_fill_group; 2154 break; 2155 } 2156 case MAC_CAPAB_AGGR: 2157 { 2158 mac_capab_aggr_t *aggr_cap; 2159 2160 if (cap_data != NULL) { 2161 aggr_cap = cap_data; 2162 aggr_cap->mca_rename_fn = aggr_grp_port_rename; 2163 aggr_cap->mca_unicst = aggr_m_unicst; 2164 aggr_cap->mca_find_tx_ring_fn = aggr_find_tx_ring; 2165 aggr_cap->mca_arg = arg; 2166 } 2167 return (B_TRUE); 2168 } 2169 default: 2170 return (B_FALSE); 2171 } 2172 return (B_TRUE); 2173 } 2174 2175 /* 2176 * Callback funtion for MAC layer to register groups. 
2177 */ 2178 static void 2179 aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index, 2180 mac_group_info_t *infop, mac_group_handle_t gh) 2181 { 2182 aggr_grp_t *grp = arg; 2183 aggr_pseudo_rx_group_t *rx_group; 2184 aggr_pseudo_tx_group_t *tx_group; 2185 2186 ASSERT(index == 0); 2187 if (rtype == MAC_RING_TYPE_RX) { 2188 rx_group = &grp->lg_rx_group; 2189 rx_group->arg_gh = gh; 2190 rx_group->arg_grp = grp; 2191 2192 infop->mgi_driver = (mac_group_driver_t)rx_group; 2193 infop->mgi_start = NULL; 2194 infop->mgi_stop = NULL; 2195 infop->mgi_addmac = aggr_addmac; 2196 infop->mgi_remmac = aggr_remmac; 2197 infop->mgi_count = rx_group->arg_ring_cnt; 2198 } else { 2199 tx_group = &grp->lg_tx_group; 2200 tx_group->atg_gh = gh; 2201 } 2202 } 2203 2204 /* 2205 * Callback funtion for MAC layer to register all rings. 2206 */ 2207 static void 2208 aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index, 2209 const int index, mac_ring_info_t *infop, mac_ring_handle_t rh) 2210 { 2211 aggr_grp_t *grp = arg; 2212 2213 switch (rtype) { 2214 case MAC_RING_TYPE_RX: { 2215 aggr_pseudo_rx_group_t *rx_group = &grp->lg_rx_group; 2216 aggr_pseudo_rx_ring_t *rx_ring; 2217 mac_intr_t aggr_mac_intr; 2218 2219 ASSERT(rg_index == 0); 2220 2221 ASSERT((index >= 0) && (index < rx_group->arg_ring_cnt)); 2222 rx_ring = rx_group->arg_rings + index; 2223 rx_ring->arr_rh = rh; 2224 2225 /* 2226 * Entrypoint to enable interrupt (disable poll) and 2227 * disable interrupt (enable poll). 
2228 */ 2229 aggr_mac_intr.mi_handle = (mac_intr_handle_t)rx_ring; 2230 aggr_mac_intr.mi_enable = aggr_pseudo_enable_intr; 2231 aggr_mac_intr.mi_disable = aggr_pseudo_disable_intr; 2232 aggr_mac_intr.mi_ddi_handle = NULL; 2233 2234 infop->mri_driver = (mac_ring_driver_t)rx_ring; 2235 infop->mri_start = aggr_pseudo_start_ring; 2236 infop->mri_stop = aggr_pseudo_stop_ring; 2237 2238 infop->mri_intr = aggr_mac_intr; 2239 infop->mri_poll = aggr_rx_poll; 2240 2241 infop->mri_stat = aggr_rx_ring_stat; 2242 break; 2243 } 2244 case MAC_RING_TYPE_TX: { 2245 aggr_pseudo_tx_group_t *tx_group = &grp->lg_tx_group; 2246 aggr_pseudo_tx_ring_t *tx_ring; 2247 2248 ASSERT(rg_index == -1); 2249 ASSERT(index < tx_group->atg_ring_cnt); 2250 2251 tx_ring = &tx_group->atg_rings[index]; 2252 tx_ring->atr_rh = rh; 2253 2254 infop->mri_driver = (mac_ring_driver_t)tx_ring; 2255 infop->mri_start = NULL; 2256 infop->mri_stop = NULL; 2257 infop->mri_tx = aggr_ring_tx; 2258 infop->mri_stat = aggr_tx_ring_stat; 2259 /* 2260 * Use the hw TX ring handle to find if the ring needs 2261 * serialization or not. For NICs that do not expose 2262 * Tx rings, atr_hw_rh will be NULL. 
2263 */ 2264 if (tx_ring->atr_hw_rh != NULL) { 2265 infop->mri_flags = 2266 mac_hwring_getinfo(tx_ring->atr_hw_rh); 2267 } 2268 break; 2269 } 2270 default: 2271 break; 2272 } 2273 } 2274 2275 static mblk_t * 2276 aggr_rx_poll(void *arg, int bytes_to_pickup) 2277 { 2278 aggr_pseudo_rx_ring_t *rr_ring = arg; 2279 aggr_port_t *port = rr_ring->arr_port; 2280 aggr_grp_t *grp = port->lp_grp; 2281 mblk_t *mp_chain, *mp, **mpp; 2282 2283 mp_chain = mac_hwring_poll(rr_ring->arr_hw_rh, bytes_to_pickup); 2284 2285 if (grp->lg_lacp_mode == AGGR_LACP_OFF) 2286 return (mp_chain); 2287 2288 mpp = &mp_chain; 2289 while ((mp = *mpp) != NULL) { 2290 if (MBLKL(mp) >= sizeof (struct ether_header)) { 2291 struct ether_header *ehp; 2292 2293 ehp = (struct ether_header *)mp->b_rptr; 2294 if (ntohs(ehp->ether_type) == ETHERTYPE_SLOW) { 2295 *mpp = mp->b_next; 2296 mp->b_next = NULL; 2297 aggr_recv_lacp(port, 2298 (mac_resource_handle_t)rr_ring, mp); 2299 continue; 2300 } 2301 } 2302 2303 if (!port->lp_collector_enabled) { 2304 *mpp = mp->b_next; 2305 mp->b_next = NULL; 2306 freemsg(mp); 2307 continue; 2308 } 2309 mpp = &mp->b_next; 2310 } 2311 return (mp_chain); 2312 } 2313 2314 static int 2315 aggr_addmac(void *arg, const uint8_t *mac_addr) 2316 { 2317 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg; 2318 aggr_unicst_addr_t *addr, **pprev; 2319 aggr_grp_t *grp = rx_group->arg_grp; 2320 aggr_port_t *port, *p; 2321 mac_perim_handle_t mph; 2322 int err = 0; 2323 2324 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2325 2326 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) { 2327 mac_perim_exit(mph); 2328 return (0); 2329 } 2330 2331 /* 2332 * Insert this mac address into the list of mac addresses owned by 2333 * the aggregation pseudo group. 
2334 */ 2335 pprev = &rx_group->arg_macaddr; 2336 while ((addr = *pprev) != NULL) { 2337 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) == 0) { 2338 mac_perim_exit(mph); 2339 return (EEXIST); 2340 } 2341 pprev = &addr->aua_next; 2342 } 2343 addr = kmem_alloc(sizeof (aggr_unicst_addr_t), KM_SLEEP); 2344 bcopy(mac_addr, addr->aua_addr, ETHERADDRL); 2345 addr->aua_next = NULL; 2346 *pprev = addr; 2347 2348 for (port = grp->lg_ports; port != NULL; port = port->lp_next) 2349 if ((err = aggr_port_addmac(port, mac_addr)) != 0) 2350 break; 2351 2352 if (err != 0) { 2353 for (p = grp->lg_ports; p != port; p = p->lp_next) 2354 aggr_port_remmac(p, mac_addr); 2355 2356 *pprev = NULL; 2357 kmem_free(addr, sizeof (aggr_unicst_addr_t)); 2358 } 2359 2360 mac_perim_exit(mph); 2361 return (err); 2362 } 2363 2364 static int 2365 aggr_remmac(void *arg, const uint8_t *mac_addr) 2366 { 2367 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg; 2368 aggr_unicst_addr_t *addr, **pprev; 2369 aggr_grp_t *grp = rx_group->arg_grp; 2370 aggr_port_t *port; 2371 mac_perim_handle_t mph; 2372 int err = 0; 2373 2374 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2375 2376 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) { 2377 mac_perim_exit(mph); 2378 return (0); 2379 } 2380 2381 /* 2382 * Insert this mac address into the list of mac addresses owned by 2383 * the aggregation pseudo group. 
2384 */ 2385 pprev = &rx_group->arg_macaddr; 2386 while ((addr = *pprev) != NULL) { 2387 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) != 0) { 2388 pprev = &addr->aua_next; 2389 continue; 2390 } 2391 break; 2392 } 2393 if (addr == NULL) { 2394 mac_perim_exit(mph); 2395 return (EINVAL); 2396 } 2397 2398 for (port = grp->lg_ports; port != NULL; port = port->lp_next) 2399 aggr_port_remmac(port, mac_addr); 2400 2401 *pprev = addr->aua_next; 2402 kmem_free(addr, sizeof (aggr_unicst_addr_t)); 2403 2404 mac_perim_exit(mph); 2405 return (err); 2406 } 2407 2408 /* 2409 * Add or remove the multicast addresses that are defined for the group 2410 * to or from the specified port. 2411 * 2412 * Note that aggr_grp_multicst_port(..., B_TRUE) is called when the port 2413 * is started and attached, and aggr_grp_multicst_port(..., B_FALSE) is 2414 * called when the port is either stopped or detached. 2415 */ 2416 void 2417 aggr_grp_multicst_port(aggr_port_t *port, boolean_t add) 2418 { 2419 aggr_grp_t *grp = port->lp_grp; 2420 2421 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 2422 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 2423 2424 if (!port->lp_started || port->lp_state != AGGR_PORT_STATE_ATTACHED) 2425 return; 2426 2427 mac_multicast_refresh(grp->lg_mh, aggr_port_multicst, port, add); 2428 } 2429 2430 static int 2431 aggr_m_multicst(void *arg, boolean_t add, const uint8_t *addrp) 2432 { 2433 aggr_grp_t *grp = arg; 2434 aggr_port_t *port = NULL; 2435 mac_perim_handle_t mph; 2436 int err = 0, cerr; 2437 2438 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2439 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2440 if (port->lp_state != AGGR_PORT_STATE_ATTACHED || 2441 !port->lp_started) { 2442 continue; 2443 } 2444 cerr = aggr_port_multicst(port, add, addrp); 2445 if (cerr != 0 && err == 0) 2446 err = cerr; 2447 } 2448 mac_perim_exit(mph); 2449 return (err); 2450 } 2451 2452 static int 2453 aggr_m_unicst(void *arg, const uint8_t *macaddr) 2454 { 2455 aggr_grp_t *grp = arg; 2456 
mac_perim_handle_t mph; 2457 int err; 2458 2459 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2460 err = aggr_grp_modify_common(grp, AGGR_MODIFY_MAC, 0, B_TRUE, macaddr, 2461 0, 0); 2462 mac_perim_exit(mph); 2463 return (err); 2464 } 2465 2466 /* 2467 * Initialize the capabilities that are advertised for the group 2468 * according to the capabilities of the constituent ports. 2469 */ 2470 static void 2471 aggr_grp_capab_set(aggr_grp_t *grp) 2472 { 2473 uint32_t cksum; 2474 aggr_port_t *port; 2475 mac_capab_lso_t cap_lso; 2476 2477 ASSERT(grp->lg_mh == NULL); 2478 ASSERT(grp->lg_ports != NULL); 2479 2480 grp->lg_hcksum_txflags = (uint32_t)-1; 2481 grp->lg_zcopy = B_TRUE; 2482 grp->lg_vlan = B_TRUE; 2483 2484 grp->lg_lso = B_TRUE; 2485 grp->lg_cap_lso.lso_flags = (t_uscalar_t)-1; 2486 grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max = (t_uscalar_t)-1; 2487 2488 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2489 if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &cksum)) 2490 cksum = 0; 2491 grp->lg_hcksum_txflags &= cksum; 2492 2493 grp->lg_vlan &= 2494 !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL); 2495 2496 grp->lg_zcopy &= 2497 !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL); 2498 2499 grp->lg_lso &= 2500 mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso); 2501 if (grp->lg_lso) { 2502 grp->lg_cap_lso.lso_flags &= cap_lso.lso_flags; 2503 if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max > 2504 cap_lso.lso_basic_tcp_ipv4.lso_max) 2505 grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max = 2506 cap_lso.lso_basic_tcp_ipv4.lso_max; 2507 } 2508 } 2509 } 2510 2511 /* 2512 * Checks whether the capabilities of the port being added are compatible 2513 * with the current capabilities of the aggregation. 
2514 */ 2515 static boolean_t 2516 aggr_grp_capab_check(aggr_grp_t *grp, aggr_port_t *port) 2517 { 2518 uint32_t hcksum_txflags; 2519 2520 ASSERT(grp->lg_ports != NULL); 2521 2522 if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL)) & 2523 grp->lg_vlan) != grp->lg_vlan) { 2524 return (B_FALSE); 2525 } 2526 2527 if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL)) & 2528 grp->lg_zcopy) != grp->lg_zcopy) { 2529 return (B_FALSE); 2530 } 2531 2532 if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &hcksum_txflags)) { 2533 if (grp->lg_hcksum_txflags != 0) 2534 return (B_FALSE); 2535 } else if ((hcksum_txflags & grp->lg_hcksum_txflags) != 2536 grp->lg_hcksum_txflags) { 2537 return (B_FALSE); 2538 } 2539 2540 if (grp->lg_lso) { 2541 mac_capab_lso_t cap_lso; 2542 2543 if (mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso)) { 2544 if ((grp->lg_cap_lso.lso_flags & cap_lso.lso_flags) != 2545 grp->lg_cap_lso.lso_flags) 2546 return (B_FALSE); 2547 if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max > 2548 cap_lso.lso_basic_tcp_ipv4.lso_max) 2549 return (B_FALSE); 2550 } else { 2551 return (B_FALSE); 2552 } 2553 } 2554 2555 return (B_TRUE); 2556 } 2557 2558 /* 2559 * Returns the maximum SDU according to the SDU of the constituent ports. 2560 */ 2561 static uint_t 2562 aggr_grp_max_sdu(aggr_grp_t *grp) 2563 { 2564 uint_t max_sdu = (uint_t)-1; 2565 aggr_port_t *port; 2566 2567 ASSERT(grp->lg_ports != NULL); 2568 2569 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2570 uint_t port_sdu_max; 2571 2572 mac_sdu_get(port->lp_mh, NULL, &port_sdu_max); 2573 if (max_sdu > port_sdu_max) 2574 max_sdu = port_sdu_max; 2575 } 2576 2577 return (max_sdu); 2578 } 2579 2580 /* 2581 * Checks if the maximum SDU of the specified port is compatible 2582 * with the maximum SDU of the specified aggregation group, returns 2583 * B_TRUE if it is, B_FALSE otherwise. 
2584 */ 2585 static boolean_t 2586 aggr_grp_sdu_check(aggr_grp_t *grp, aggr_port_t *port) 2587 { 2588 uint_t port_sdu_max; 2589 2590 mac_sdu_get(port->lp_mh, NULL, &port_sdu_max); 2591 return (port_sdu_max >= grp->lg_max_sdu); 2592 } 2593 2594 /* 2595 * Returns the maximum margin according to the margin of the constituent ports. 2596 */ 2597 static uint32_t 2598 aggr_grp_max_margin(aggr_grp_t *grp) 2599 { 2600 uint32_t margin = UINT32_MAX; 2601 aggr_port_t *port; 2602 2603 ASSERT(grp->lg_mh == NULL); 2604 ASSERT(grp->lg_ports != NULL); 2605 2606 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2607 if (margin > port->lp_margin) 2608 margin = port->lp_margin; 2609 } 2610 2611 grp->lg_margin = margin; 2612 return (margin); 2613 } 2614 2615 /* 2616 * Checks if the maximum margin of the specified port is compatible 2617 * with the maximum margin of the specified aggregation group, returns 2618 * B_TRUE if it is, B_FALSE otherwise. 2619 */ 2620 static boolean_t 2621 aggr_grp_margin_check(aggr_grp_t *grp, aggr_port_t *port) 2622 { 2623 if (port->lp_margin >= grp->lg_margin) 2624 return (B_TRUE); 2625 2626 /* 2627 * See whether the current margin value is allowed to be changed to 2628 * the new value. 
2629 */ 2630 if (!mac_margin_update(grp->lg_mh, port->lp_margin)) 2631 return (B_FALSE); 2632 2633 grp->lg_margin = port->lp_margin; 2634 return (B_TRUE); 2635 } 2636 2637 /* 2638 * Set MTU on individual ports of an aggregation group 2639 */ 2640 static int 2641 aggr_set_port_sdu(aggr_grp_t *grp, aggr_port_t *port, uint32_t sdu, 2642 uint32_t *old_mtu) 2643 { 2644 boolean_t removed = B_FALSE; 2645 mac_perim_handle_t mph; 2646 mac_diag_t diag; 2647 int err, rv, retry = 0; 2648 2649 if (port->lp_mah != NULL) { 2650 (void) mac_unicast_remove(port->lp_mch, port->lp_mah); 2651 port->lp_mah = NULL; 2652 removed = B_TRUE; 2653 } 2654 err = mac_set_mtu(port->lp_mh, sdu, old_mtu); 2655 try_again: 2656 if (removed && (rv = mac_unicast_add(port->lp_mch, NULL, 2657 MAC_UNICAST_PRIMARY | MAC_UNICAST_DISABLE_TX_VID_CHECK, 2658 &port->lp_mah, 0, &diag)) != 0) { 2659 /* 2660 * following is a workaround for a bug in 'bge' driver. 2661 * See CR 6794654 for more information and this work around 2662 * will be removed once the CR is fixed. 2663 */ 2664 if (rv == EIO && retry++ < 3) { 2665 delay(2 * hz); 2666 goto try_again; 2667 } 2668 /* 2669 * if mac_unicast_add() failed while setting the MTU, 2670 * detach the port from the group. 2671 */ 2672 mac_perim_enter_by_mh(port->lp_mh, &mph); 2673 (void) aggr_grp_detach_port(grp, port); 2674 mac_perim_exit(mph); 2675 cmn_err(CE_WARN, "Unable to restart the port %s while " 2676 "setting MTU. 
Detaching the port from the aggregation.", 2677 mac_client_name(port->lp_mch)); 2678 } 2679 return (err); 2680 } 2681 2682 static int 2683 aggr_sdu_update(aggr_grp_t *grp, uint32_t sdu) 2684 { 2685 int err = 0, i, rv; 2686 aggr_port_t *port; 2687 uint32_t *mtu; 2688 2689 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 2690 2691 /* 2692 * If the MTU being set is equal to aggr group's maximum 2693 * allowable value, then there is nothing to change 2694 */ 2695 if (sdu == grp->lg_max_sdu) 2696 return (0); 2697 2698 /* 0 is aggr group's min sdu */ 2699 if (sdu == 0) 2700 return (EINVAL); 2701 2702 mtu = kmem_alloc(sizeof (uint32_t) * grp->lg_nports, KM_SLEEP); 2703 for (port = grp->lg_ports, i = 0; port != NULL && err == 0; 2704 port = port->lp_next, i++) { 2705 err = aggr_set_port_sdu(grp, port, sdu, mtu + i); 2706 } 2707 if (err != 0) { 2708 /* recover from error: reset the mtus of the ports */ 2709 aggr_port_t *tmp; 2710 2711 for (tmp = grp->lg_ports, i = 0; tmp != port; 2712 tmp = tmp->lp_next, i++) { 2713 (void) aggr_set_port_sdu(grp, tmp, *(mtu + i), NULL); 2714 } 2715 goto bail; 2716 } 2717 grp->lg_max_sdu = aggr_grp_max_sdu(grp); 2718 rv = mac_maxsdu_update(grp->lg_mh, grp->lg_max_sdu); 2719 ASSERT(rv == 0); 2720 bail: 2721 kmem_free(mtu, sizeof (uint32_t) * grp->lg_nports); 2722 return (err); 2723 } 2724 2725 /* 2726 * Callback functions for set/get of properties 2727 */ 2728 /*ARGSUSED*/ 2729 static int 2730 aggr_m_setprop(void *m_driver, const char *pr_name, mac_prop_id_t pr_num, 2731 uint_t pr_valsize, const void *pr_val) 2732 { 2733 int err = ENOTSUP; 2734 aggr_grp_t *grp = m_driver; 2735 2736 switch (pr_num) { 2737 case MAC_PROP_MTU: { 2738 uint32_t mtu; 2739 2740 if (pr_valsize < sizeof (mtu)) { 2741 err = EINVAL; 2742 break; 2743 } 2744 bcopy(pr_val, &mtu, sizeof (mtu)); 2745 err = aggr_sdu_update(grp, mtu); 2746 break; 2747 } 2748 default: 2749 break; 2750 } 2751 return (err); 2752 } 2753 2754 typedef struct rboundary { 2755 uint32_t bval; 2756 int btype; 2757 } 
rboundary_t; 2758 2759 /* 2760 * This function finds the intersection of mtu ranges stored in arrays - 2761 * mrange[0] ... mrange[mcount -1]. It returns the intersection in rval. 2762 * Individual arrays are assumed to contain non-overlapping ranges. 2763 * Algorithm: 2764 * A range has two boundaries - min and max. We scan all arrays and store 2765 * each boundary as a separate element in a temporary array. We also store 2766 * the boundary types, min or max, as +1 or -1 respectively in the temporary 2767 * array. Then we sort the temporary array in ascending order. We scan the 2768 * sorted array from lower to higher values and keep a cumulative sum of 2769 * boundary types. Element in the temporary array for which the sum reaches 2770 * mcount is a min boundary of a range in the result and next element will be 2771 * max boundary. 2772 * 2773 * Example for mcount = 3, 2774 * 2775 * ----|_________|-------|_______|----|__|------ mrange[0] 2776 * 2777 * -------|________|--|____________|-----|___|-- mrange[1] 2778 * 2779 * --------|________________|-------|____|------ mrange[2] 2780 * 2781 * 3 2 1 2782 * \|/ 2783 * 1 23 2 1 2 3 2 1 01 2 V 0 <- the sum 2784 * ----|--||-----|-|--|--|--|----|-||-|--|---|-- sorted array 2785 * 2786 * same min and max 2787 * V 2788 * --------|_____|-------|__|------------|------ intersecting ranges 2789 */ 2790 void 2791 aggr_mtu_range_intersection(mac_propval_range_t **mrange, int mcount, 2792 mac_propval_uint32_range_t **prval, int *prmaxcnt, int *prcount) 2793 { 2794 mac_propval_uint32_range_t *rval, *ur; 2795 int rmaxcnt, rcount; 2796 size_t sz_range32; 2797 rboundary_t *ta; /* temporary array */ 2798 rboundary_t temp; 2799 boolean_t range_started = B_FALSE; 2800 int i, j, m, sum; 2801 2802 sz_range32 = sizeof (mac_propval_uint32_range_t); 2803 2804 for (i = 0, rmaxcnt = 0; i < mcount; i++) 2805 rmaxcnt += mrange[i]->mpr_count; 2806 2807 /* Allocate enough space to store the results */ 2808 rval = kmem_alloc(rmaxcnt * sz_range32, 
KM_SLEEP); 2809 2810 /* Number of boundaries are twice as many as ranges */ 2811 ta = kmem_alloc(2 * rmaxcnt * sizeof (rboundary_t), KM_SLEEP); 2812 2813 for (i = 0, m = 0; i < mcount; i++) { 2814 ur = &(mrange[i]->mpr_range_uint32[0]); 2815 for (j = 0; j < mrange[i]->mpr_count; j++) { 2816 ta[m].bval = ur[j].mpur_min; 2817 ta[m++].btype = 1; 2818 ta[m].bval = ur[j].mpur_max; 2819 ta[m++].btype = -1; 2820 } 2821 } 2822 2823 /* 2824 * Sort the temporary array in ascending order of bval; 2825 * if boundary values are same then sort on btype. 2826 */ 2827 for (i = 0; i < m-1; i++) { 2828 for (j = i+1; j < m; j++) { 2829 if ((ta[i].bval > ta[j].bval) || 2830 ((ta[i].bval == ta[j].bval) && 2831 (ta[i].btype < ta[j].btype))) { 2832 temp = ta[i]; 2833 ta[i] = ta[j]; 2834 ta[j] = temp; 2835 } 2836 } 2837 } 2838 2839 /* Walk through temporary array to find all ranges in the results */ 2840 for (i = 0, sum = 0, rcount = 0; i < m; i++) { 2841 sum += ta[i].btype; 2842 if (sum == mcount) { 2843 rval[rcount].mpur_min = ta[i].bval; 2844 range_started = B_TRUE; 2845 } else if (sum < mcount && range_started) { 2846 rval[rcount++].mpur_max = ta[i].bval; 2847 range_started = B_FALSE; 2848 } 2849 } 2850 2851 *prval = rval; 2852 *prmaxcnt = rmaxcnt; 2853 *prcount = rcount; 2854 } 2855 2856 /* 2857 * Returns the mtu ranges which could be supported by aggr group. 2858 * prmaxcnt returns the size of the buffer prval, prcount returns 2859 * the number of valid entries in prval. Caller is responsible 2860 * for freeing up prval. 
2861 */ 2862 int 2863 aggr_grp_possible_mtu_range(aggr_grp_t *grp, mac_propval_uint32_range_t **prval, 2864 int *prmaxcnt, int *prcount) 2865 { 2866 mac_propval_range_t **vals; 2867 aggr_port_t *port; 2868 mac_perim_handle_t mph; 2869 uint_t i, numr; 2870 int err = 0; 2871 size_t sz_propval, sz_range32; 2872 size_t size; 2873 2874 sz_propval = sizeof (mac_propval_range_t); 2875 sz_range32 = sizeof (mac_propval_uint32_range_t); 2876 2877 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 2878 2879 vals = kmem_zalloc(sizeof (mac_propval_range_t *) * grp->lg_nports, 2880 KM_SLEEP); 2881 2882 for (port = grp->lg_ports, i = 0; port != NULL; 2883 port = port->lp_next, i++) { 2884 2885 size = sz_propval; 2886 vals[i] = kmem_alloc(size, KM_SLEEP); 2887 vals[i]->mpr_count = 1; 2888 2889 mac_perim_enter_by_mh(port->lp_mh, &mph); 2890 2891 err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL, 2892 NULL, 0, vals[i], NULL); 2893 if (err == ENOSPC) { 2894 /* 2895 * Not enough space to hold all ranges. 2896 * Allocate extra space as indicated and retry. 
2897 */ 2898 numr = vals[i]->mpr_count; 2899 kmem_free(vals[i], sz_propval); 2900 size = sz_propval + (numr - 1) * sz_range32; 2901 vals[i] = kmem_alloc(size, KM_SLEEP); 2902 vals[i]->mpr_count = numr; 2903 err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL, 2904 NULL, 0, vals[i], NULL); 2905 ASSERT(err != ENOSPC); 2906 } 2907 mac_perim_exit(mph); 2908 if (err != 0) { 2909 kmem_free(vals[i], size); 2910 vals[i] = NULL; 2911 break; 2912 } 2913 } 2914 2915 /* 2916 * if any of the underlying ports does not support changing MTU then 2917 * just return ENOTSUP 2918 */ 2919 if (port != NULL) { 2920 ASSERT(err != 0); 2921 goto done; 2922 } 2923 2924 aggr_mtu_range_intersection(vals, grp->lg_nports, prval, prmaxcnt, 2925 prcount); 2926 2927 done: 2928 for (i = 0; i < grp->lg_nports; i++) { 2929 if (vals[i] != NULL) { 2930 numr = vals[i]->mpr_count; 2931 size = sz_propval + (numr - 1) * sz_range32; 2932 kmem_free(vals[i], size); 2933 } 2934 } 2935 2936 kmem_free(vals, sizeof (mac_propval_range_t *) * grp->lg_nports); 2937 return (err); 2938 } 2939 2940 static void 2941 aggr_m_propinfo(void *m_driver, const char *pr_name, mac_prop_id_t pr_num, 2942 mac_prop_info_handle_t prh) 2943 { 2944 aggr_grp_t *grp = m_driver; 2945 mac_propval_uint32_range_t *rval = NULL; 2946 int i, rcount, rmaxcnt; 2947 int err = 0; 2948 2949 _NOTE(ARGUNUSED(pr_name)); 2950 2951 switch (pr_num) { 2952 case MAC_PROP_MTU: 2953 2954 err = aggr_grp_possible_mtu_range(grp, &rval, &rmaxcnt, 2955 &rcount); 2956 if (err != 0) { 2957 ASSERT(rval == NULL); 2958 return; 2959 } 2960 for (i = 0; i < rcount; i++) { 2961 mac_prop_info_set_range_uint32(prh, 2962 rval[i].mpur_min, rval[i].mpur_max); 2963 } 2964 kmem_free(rval, sizeof (mac_propval_uint32_range_t) * rmaxcnt); 2965 break; 2966 } 2967 } 2968