/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2020 Joyent, Inc.
 * Copyright 2020 RackTop Systems, Inc.
 * Copyright 2024 MNX Cloud, Inc.
 */

/*
 * IEEE 802.3ad Link Aggregation -- Link Aggregation Groups.
 *
 * An instance of the structure aggr_grp_t is allocated for each
 * link aggregation group. When created, aggr_grp_t objects are
 * entered into the aggr_grp_hash hash table maintained by the modhash
 * module. The hash key is the linkid associated with the link
 * aggregation group.
 *
 * Each aggregation contains a set of ports. The port is represented
 * by the aggr_port_t structure. A port consists of a single MAC
 * client which has exclusive (MCIS_EXCLUSIVE) use of the underlying
 * MAC. This client is used by the aggr to send and receive LACP
 * traffic. Each port client takes on the same MAC unicast address --
 * the address of the aggregation itself (taken from the first port by
 * default).
 *
 * The MAC client that hangs off each aggr port is not your typical
 * MAC client. Not only does it have exclusive control of the MAC, but
 * it also has no Tx or Rx SRSes. An SRS is designed to queue and
 * fanout traffic among L4 protocols; but the aggr is an intermediary,
 * not a consumer. Instead of using SRSes, the aggr puts the
 * underlying hardware rings into passthru mode and ships packets up
 * via a direct call to aggr_recv_cb(). This allows aggr to enforce
 * LACP while passing all other traffic up to clients of the aggr.
 *
 * Pseudo Rx Groups and Rings
 * --------------------------
 *
 * It is imperative for client performance that the aggr provide as
 * many MAC groups as possible. In order to use the underlying HW
 * resources, aggr creates pseudo groups to aggregate the underlying
 * HW groups. Every HW group gets mapped to a pseudo group; and every
 * HW ring in that group gets mapped to a pseudo ring. The pseudo
 * group at index 0 combines all the HW groups at index 0 from each
 * port, etc. The aggr's MAC then creates normal MAC groups and rings
 * out of these pseudo groups and rings to present to the aggr's
 * clients. To the clients, the aggr's groups and rings are absolutely
 * no different than a NIC's groups or rings.
 *
 * Pseudo Tx Rings
 * ---------------
 *
 * The underlying ports (NICs) in an aggregation can have Tx rings. To
 * enhance aggr's performance, these Tx rings are made available to
 * the aggr layer as pseudo Tx rings. The concept of pseudo rings is
 * not new. They are already present and implemented on the Rx side.
 * The same concept is extended to the Tx side where each Tx ring of
 * an underlying port is reflected in aggr as a pseudo Tx ring. Thus
 * each pseudo Tx ring will map to a specific hardware Tx ring. Even
 * in the case of a NIC that does not have a Tx ring, a pseudo Tx ring
 * is given to the aggregation layer.
80 * 81 * With this change, the outgoing stack depth looks much better: 82 * 83 * mac_tx() -> mac_tx_aggr_mode() -> mac_tx_soft_ring_process() -> 84 * mac_tx_send() -> aggr_ring_rx() -> <driver>_ring_tx() 85 * 86 * Two new modes are introduced to mac_tx() to handle aggr pseudo Tx rings: 87 * SRS_TX_AGGR and SRS_TX_BW_AGGR. 88 * 89 * In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. This routine 90 * invokes an aggr function, aggr_find_tx_ring(), to find a (pseudo) Tx 91 * ring belonging to a port on which the packet has to be sent. 92 * aggr_find_tx_ring() first finds the outgoing port based on L2/L3/L4 93 * policy and then uses the fanout_hint passed to it to pick a Tx ring from 94 * the selected port. 95 * 96 * In SRS_TX_BW_AGGR mode, mac_tx_bw_mode() function is called where 97 * bandwidth limit is applied first on the outgoing packet and the packets 98 * allowed to go out would call mac_tx_aggr_mode() to send the packet on a 99 * particular Tx ring. 100 */ 101 102 #include <sys/types.h> 103 #include <sys/sysmacros.h> 104 #include <sys/conf.h> 105 #include <sys/cmn_err.h> 106 #include <sys/disp.h> 107 #include <sys/list.h> 108 #include <sys/ksynch.h> 109 #include <sys/kmem.h> 110 #include <sys/stream.h> 111 #include <sys/modctl.h> 112 #include <sys/ddi.h> 113 #include <sys/sunddi.h> 114 #include <sys/atomic.h> 115 #include <sys/stat.h> 116 #include <sys/modhash.h> 117 #include <sys/id_space.h> 118 #include <sys/strsun.h> 119 #include <sys/cred.h> 120 #include <sys/dlpi.h> 121 #include <sys/zone.h> 122 #include <sys/mac_provider.h> 123 #include <sys/dls.h> 124 #include <sys/vlan.h> 125 #include <sys/aggr.h> 126 #include <sys/aggr_impl.h> 127 128 static int aggr_m_start(void *); 129 static void aggr_m_stop(void *); 130 static int aggr_m_promisc(void *, boolean_t); 131 static int aggr_m_multicst(void *, boolean_t, const uint8_t *); 132 static int aggr_m_unicst(void *, const uint8_t *); 133 static int aggr_m_stat(void *, uint_t, uint64_t *); 134 static void 
aggr_m_ioctl(void *, queue_t *, mblk_t *); 135 static boolean_t aggr_m_capab_get(void *, mac_capab_t, void *); 136 static int aggr_m_setprop(void *, const char *, mac_prop_id_t, uint_t, 137 const void *); 138 static void aggr_m_propinfo(void *, const char *, mac_prop_id_t, 139 mac_prop_info_handle_t); 140 141 static aggr_port_t *aggr_grp_port_lookup(aggr_grp_t *, datalink_id_t); 142 static int aggr_grp_rem_port(aggr_grp_t *, aggr_port_t *, boolean_t *, 143 boolean_t *); 144 145 static void aggr_grp_capab_set(aggr_grp_t *); 146 static boolean_t aggr_grp_capab_check(aggr_grp_t *, aggr_port_t *); 147 static uint_t aggr_grp_max_sdu(aggr_grp_t *); 148 static uint32_t aggr_grp_max_margin(aggr_grp_t *); 149 static boolean_t aggr_grp_sdu_check(aggr_grp_t *, aggr_port_t *); 150 static boolean_t aggr_grp_margin_check(aggr_grp_t *, aggr_port_t *); 151 152 static int aggr_add_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *); 153 static void aggr_rem_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *); 154 static int aggr_pseudo_disable_intr(mac_intr_handle_t); 155 static int aggr_pseudo_enable_intr(mac_intr_handle_t); 156 static int aggr_pseudo_start_rx_ring(mac_ring_driver_t, uint64_t); 157 static void aggr_pseudo_stop_rx_ring(mac_ring_driver_t); 158 static int aggr_addmac(void *, const uint8_t *); 159 static int aggr_remmac(void *, const uint8_t *); 160 static int aggr_addvlan(mac_group_driver_t, uint16_t); 161 static int aggr_remvlan(mac_group_driver_t, uint16_t); 162 static mblk_t *aggr_rx_poll(void *, int); 163 static void aggr_fill_ring(void *, mac_ring_type_t, const int, 164 const int, mac_ring_info_t *, mac_ring_handle_t); 165 static void aggr_fill_group(void *, mac_ring_type_t, const int, 166 mac_group_info_t *, mac_group_handle_t); 167 168 static kmem_cache_t *aggr_grp_cache; 169 static mod_hash_t *aggr_grp_hash; 170 static krwlock_t aggr_grp_lock; 171 static uint_t aggr_grp_cnt; 172 static id_space_t *key_ids; 173 174 #define GRP_HASHSZ 64 175 #define 
GRP_HASH_KEY(linkid) ((mod_hash_key_t)(uintptr_t)linkid) 176 #define AGGR_PORT_NAME_DELIMIT '-' 177 178 static uchar_t aggr_zero_mac[] = {0, 0, 0, 0, 0, 0}; 179 180 #define AGGR_M_CALLBACK_FLAGS \ 181 (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_PROPINFO) 182 183 static mac_callbacks_t aggr_m_callbacks = { 184 AGGR_M_CALLBACK_FLAGS, 185 aggr_m_stat, 186 aggr_m_start, 187 aggr_m_stop, 188 aggr_m_promisc, 189 aggr_m_multicst, 190 NULL, 191 NULL, 192 NULL, 193 aggr_m_ioctl, 194 aggr_m_capab_get, 195 NULL, 196 NULL, 197 aggr_m_setprop, 198 NULL, 199 aggr_m_propinfo 200 }; 201 202 /*ARGSUSED*/ 203 static int 204 aggr_grp_constructor(void *buf, void *arg, int kmflag) 205 { 206 aggr_grp_t *grp = buf; 207 208 bzero(grp, sizeof (*grp)); 209 mutex_init(&grp->lg_lacp_lock, NULL, MUTEX_DEFAULT, NULL); 210 cv_init(&grp->lg_lacp_cv, NULL, CV_DEFAULT, NULL); 211 rw_init(&grp->lg_tx_lock, NULL, RW_DRIVER, NULL); 212 mutex_init(&grp->lg_port_lock, NULL, MUTEX_DEFAULT, NULL); 213 cv_init(&grp->lg_port_cv, NULL, CV_DEFAULT, NULL); 214 mutex_init(&grp->lg_tx_flowctl_lock, NULL, MUTEX_DEFAULT, NULL); 215 cv_init(&grp->lg_tx_flowctl_cv, NULL, CV_DEFAULT, NULL); 216 grp->lg_link_state = LINK_STATE_UNKNOWN; 217 return (0); 218 } 219 220 /*ARGSUSED*/ 221 static void 222 aggr_grp_destructor(void *buf, void *arg) 223 { 224 aggr_grp_t *grp = buf; 225 226 if (grp->lg_tx_ports != NULL) { 227 kmem_free(grp->lg_tx_ports, 228 grp->lg_tx_ports_size * sizeof (aggr_port_t *)); 229 } 230 231 mutex_destroy(&grp->lg_lacp_lock); 232 cv_destroy(&grp->lg_lacp_cv); 233 mutex_destroy(&grp->lg_port_lock); 234 cv_destroy(&grp->lg_port_cv); 235 rw_destroy(&grp->lg_tx_lock); 236 mutex_destroy(&grp->lg_tx_flowctl_lock); 237 cv_destroy(&grp->lg_tx_flowctl_cv); 238 } 239 240 void 241 aggr_grp_init(void) 242 { 243 aggr_grp_cache = kmem_cache_create("aggr_grp_cache", 244 sizeof (aggr_grp_t), 0, aggr_grp_constructor, 245 aggr_grp_destructor, NULL, NULL, NULL, 0); 246 247 aggr_grp_hash = 
mod_hash_create_idhash("aggr_grp_hash", 248 GRP_HASHSZ, mod_hash_null_valdtor); 249 rw_init(&aggr_grp_lock, NULL, RW_DEFAULT, NULL); 250 aggr_grp_cnt = 0; 251 252 /* 253 * Allocate an id space to manage key values (when key is not 254 * specified). The range of the id space will be from 255 * (AGGR_MAX_KEY + 1) to UINT16_MAX, because the LACP protocol 256 * uses a 16-bit key. 257 */ 258 key_ids = id_space_create("aggr_key_ids", AGGR_MAX_KEY + 1, UINT16_MAX); 259 ASSERT(key_ids != NULL); 260 } 261 262 void 263 aggr_grp_fini(void) 264 { 265 id_space_destroy(key_ids); 266 rw_destroy(&aggr_grp_lock); 267 mod_hash_destroy_idhash(aggr_grp_hash); 268 kmem_cache_destroy(aggr_grp_cache); 269 } 270 271 uint_t 272 aggr_grp_count(void) 273 { 274 uint_t count; 275 276 rw_enter(&aggr_grp_lock, RW_READER); 277 count = aggr_grp_cnt; 278 rw_exit(&aggr_grp_lock); 279 return (count); 280 } 281 282 /* 283 * Since both aggr_port_notify_cb() and aggr_port_timer_thread() functions 284 * requires the mac perimeter, this function holds a reference of the aggr 285 * and aggr won't call mac_unregister() until this reference drops to 0. 286 */ 287 void 288 aggr_grp_port_hold(aggr_port_t *port) 289 { 290 aggr_grp_t *grp = port->lp_grp; 291 292 AGGR_PORT_REFHOLD(port); 293 mutex_enter(&grp->lg_port_lock); 294 grp->lg_port_ref++; 295 mutex_exit(&grp->lg_port_lock); 296 } 297 298 /* 299 * Release the reference of the grp and inform aggr_grp_delete() calling 300 * mac_unregister() is now safe. 301 */ 302 void 303 aggr_grp_port_rele(aggr_port_t *port) 304 { 305 aggr_grp_t *grp = port->lp_grp; 306 307 mutex_enter(&grp->lg_port_lock); 308 if (--grp->lg_port_ref == 0) 309 cv_signal(&grp->lg_port_cv); 310 mutex_exit(&grp->lg_port_lock); 311 AGGR_PORT_REFRELE(port); 312 } 313 314 /* 315 * Wait for the port's lacp timer thread and the port's notification callback 316 * to exit. 
317 */ 318 void 319 aggr_grp_port_wait(aggr_grp_t *grp) 320 { 321 mutex_enter(&grp->lg_port_lock); 322 if (grp->lg_port_ref != 0) 323 cv_wait(&grp->lg_port_cv, &grp->lg_port_lock); 324 mutex_exit(&grp->lg_port_lock); 325 } 326 327 /* 328 * Attach a port to a link aggregation group. 329 * 330 * A port is attached to a link aggregation group once its speed 331 * and link state have been verified. 332 * 333 * Returns B_TRUE if the group link state or speed has changed. If 334 * it's the case, the caller must notify the MAC layer via a call 335 * to mac_link(). 336 */ 337 boolean_t 338 aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port) 339 { 340 boolean_t link_state_changed = B_FALSE; 341 342 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 343 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 344 345 if (port->lp_state == AGGR_PORT_STATE_ATTACHED) 346 return (B_FALSE); 347 348 /* 349 * Validate the MAC port link speed and update the group 350 * link speed if needed. 351 */ 352 if (port->lp_ifspeed == 0 || 353 port->lp_link_state != LINK_STATE_UP || 354 port->lp_link_duplex != LINK_DUPLEX_FULL) { 355 /* 356 * Can't attach a MAC port with unknown link speed, 357 * down link, or not in full duplex mode. 358 */ 359 return (B_FALSE); 360 } 361 362 mutex_enter(&grp->lg_stat_lock); 363 if (grp->lg_ifspeed == 0) { 364 /* 365 * The group inherits the speed of the first link being 366 * attached. 367 */ 368 grp->lg_ifspeed = port->lp_ifspeed; 369 link_state_changed = B_TRUE; 370 } else if (grp->lg_ifspeed != port->lp_ifspeed) { 371 /* 372 * The link speed of the MAC port must be the same as 373 * the group link speed, as per 802.3ad. Since it is 374 * not, the attach is cancelled. 375 */ 376 mutex_exit(&grp->lg_stat_lock); 377 return (B_FALSE); 378 } 379 mutex_exit(&grp->lg_stat_lock); 380 381 grp->lg_nattached_ports++; 382 383 /* 384 * Update the group link state. 
385 */ 386 if (grp->lg_link_state != LINK_STATE_UP) { 387 grp->lg_link_state = LINK_STATE_UP; 388 mutex_enter(&grp->lg_stat_lock); 389 grp->lg_link_duplex = LINK_DUPLEX_FULL; 390 mutex_exit(&grp->lg_stat_lock); 391 link_state_changed = B_TRUE; 392 } 393 394 /* 395 * Update port's state. 396 */ 397 port->lp_state = AGGR_PORT_STATE_ATTACHED; 398 399 aggr_grp_multicst_port(port, B_TRUE); 400 401 /* 402 * The port client doesn't have an Rx SRS; instead of calling 403 * mac_rx_set() we set the client's flow callback directly. 404 * This datapath is used only when the port's driver doesn't 405 * support MAC_CAPAB_RINGS. Drivers with ring support will 406 * deliver traffic to the aggr via ring passthru. 407 */ 408 mac_client_set_flow_cb(port->lp_mch, aggr_recv_cb, port); 409 410 /* 411 * If LACP is OFF, the port can be used to send data as soon 412 * as its link is up and verified to be compatible with the 413 * aggregation. 414 * 415 * If LACP is active or passive, notify the LACP subsystem, which 416 * will enable sending on the port following the LACP protocol. 
417 */ 418 if (grp->lg_lacp_mode == AGGR_LACP_OFF) 419 aggr_send_port_enable(port); 420 else 421 aggr_lacp_port_attached(port); 422 423 return (link_state_changed); 424 } 425 426 boolean_t 427 aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port) 428 { 429 boolean_t link_state_changed = B_FALSE; 430 431 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 432 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 433 434 /* update state */ 435 if (port->lp_state != AGGR_PORT_STATE_ATTACHED) 436 return (B_FALSE); 437 438 mac_client_clear_flow_cb(port->lp_mch); 439 440 aggr_grp_multicst_port(port, B_FALSE); 441 442 if (grp->lg_lacp_mode == AGGR_LACP_OFF) 443 aggr_send_port_disable(port); 444 else 445 aggr_lacp_port_detached(port); 446 447 port->lp_state = AGGR_PORT_STATE_STANDBY; 448 449 grp->lg_nattached_ports--; 450 if (grp->lg_nattached_ports == 0) { 451 /* the last attached MAC port of the group is being detached */ 452 grp->lg_link_state = LINK_STATE_DOWN; 453 mutex_enter(&grp->lg_stat_lock); 454 grp->lg_ifspeed = 0; 455 grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN; 456 mutex_exit(&grp->lg_stat_lock); 457 link_state_changed = B_TRUE; 458 } 459 460 return (link_state_changed); 461 } 462 463 /* 464 * Update the MAC addresses of the constituent ports of the specified 465 * group. This function is invoked: 466 * - after creating a new aggregation group. 467 * - after adding new ports to an aggregation group. 468 * - after removing a port from a group when the MAC address of 469 * that port was used for the MAC address of the group. 470 * - after the MAC address of a port changed when the MAC address 471 * of that port was used for the MAC address of the group. 472 * 473 * Return true if the link state of the aggregation changed, for example 474 * as a result of a failure changing the MAC address of one of the 475 * constituent ports. 
476 */ 477 boolean_t 478 aggr_grp_update_ports_mac(aggr_grp_t *grp) 479 { 480 aggr_port_t *cport; 481 boolean_t link_state_changed = B_FALSE; 482 mac_perim_handle_t mph; 483 484 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 485 486 for (cport = grp->lg_ports; cport != NULL; 487 cport = cport->lp_next) { 488 mac_perim_enter_by_mh(cport->lp_mh, &mph); 489 if (aggr_port_unicst(cport) != 0) { 490 if (aggr_grp_detach_port(grp, cport)) 491 link_state_changed = B_TRUE; 492 } else { 493 /* 494 * If a port was detached because of a previous 495 * failure changing the MAC address, the port is 496 * reattached when it successfully changes the MAC 497 * address now, and this might cause the link state 498 * of the aggregation to change. 499 */ 500 if (aggr_grp_attach_port(grp, cport)) 501 link_state_changed = B_TRUE; 502 } 503 mac_perim_exit(mph); 504 } 505 return (link_state_changed); 506 } 507 508 /* 509 * Invoked when the MAC address of a port has changed. If the port's 510 * MAC address was used for the group MAC address, set mac_addr_changedp 511 * to B_TRUE to indicate to the caller that it should send a MAC_NOTE_UNICST 512 * notification. If the link state changes due to detach/attach of 513 * the constituent port, set link_state_changedp to B_TRUE to indicate 514 * to the caller that it should send a MAC_NOTE_LINK notification. In both 515 * cases, it is the responsibility of the caller to invoke notification 516 * functions after releasing the the port lock. 517 */ 518 void 519 aggr_grp_port_mac_changed(aggr_grp_t *grp, aggr_port_t *port, 520 boolean_t *mac_addr_changedp, boolean_t *link_state_changedp) 521 { 522 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 523 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 524 ASSERT(mac_addr_changedp != NULL); 525 ASSERT(link_state_changedp != NULL); 526 527 *mac_addr_changedp = B_FALSE; 528 *link_state_changedp = B_FALSE; 529 530 if (grp->lg_addr_fixed) { 531 /* 532 * The group is using a fixed MAC address or an automatic 533 * MAC address has not been set. 
534 */ 535 return; 536 } 537 538 if (grp->lg_mac_addr_port == port) { 539 /* 540 * The MAC address of the port was assigned to the group 541 * MAC address. Update the group MAC address. 542 */ 543 bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL); 544 *mac_addr_changedp = B_TRUE; 545 } else { 546 /* 547 * Update the actual port MAC address to the MAC address 548 * of the group. 549 */ 550 if (aggr_port_unicst(port) != 0) { 551 *link_state_changedp = aggr_grp_detach_port(grp, port); 552 } else { 553 /* 554 * If a port was detached because of a previous 555 * failure changing the MAC address, the port is 556 * reattached when it successfully changes the MAC 557 * address now, and this might cause the link state 558 * of the aggregation to change. 559 */ 560 *link_state_changedp = aggr_grp_attach_port(grp, port); 561 } 562 } 563 } 564 565 /* 566 * Add a port to a link aggregation group. 567 */ 568 static int 569 aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t port_linkid, boolean_t force, 570 aggr_port_t **pp) 571 { 572 aggr_port_t *port, **cport; 573 mac_perim_handle_t mph; 574 zoneid_t port_zoneid = ALL_ZONES; 575 int err; 576 577 /* The port must be in the same zone as the aggregation. */ 578 if (zone_check_datalink(&port_zoneid, port_linkid) != 0) 579 port_zoneid = GLOBAL_ZONEID; 580 if (grp->lg_zoneid != port_zoneid) 581 return (EBUSY); 582 583 /* 584 * If we are creating the aggr, then there is no MAC handle 585 * and thus no perimeter to hold. If we are adding a port to 586 * an existing aggr, then the perimiter of the aggr's MAC must 587 * be held. 588 */ 589 ASSERT(grp->lg_mh == NULL || MAC_PERIM_HELD(grp->lg_mh)); 590 591 err = aggr_port_create(grp, port_linkid, force, &port); 592 if (err != 0) 593 return (err); 594 595 mac_perim_enter_by_mh(port->lp_mh, &mph); 596 597 /* Add the new port to the end of the list. 
*/ 598 cport = &grp->lg_ports; 599 while (*cport != NULL) 600 cport = &((*cport)->lp_next); 601 *cport = port; 602 603 /* 604 * Back reference to the group it is member of. A port always 605 * holds a reference to its group to ensure that the back 606 * reference is always valid. 607 */ 608 port->lp_grp = grp; 609 AGGR_GRP_REFHOLD(grp); 610 grp->lg_nports++; 611 if (grp->lg_nports > grp->lg_nports_high) 612 grp->lg_nports_high = grp->lg_nports; 613 614 aggr_lacp_init_port(port); 615 mac_perim_exit(mph); 616 617 if (pp != NULL) 618 *pp = port; 619 620 return (0); 621 } 622 623 /* 624 * This is called when the 'lg_tx_ports' arrangement has changed and 625 * we need to update the corresponding 'mi_default_tx_ring'. This 626 * happens for several reasons. 627 * 628 * - A pseudo TX mac group was added or removed. 629 * - An LACP message has changed the port's state. 630 * - A link event has changed the port's state. 631 * 632 * In any case, we see if there is at least one port enabled (see 633 * 'aggr_send_port_enable()'), and if so we use its first ring as the 634 * mac's default TX ring. 635 * 636 * Note, because we only have a single TX group, we don't have to 637 * worry about the rings moving between groups and the chance that mac 638 * will reassign it unless someone removes a port, at which point, we 639 * play it safe and call this again. 640 */ 641 void 642 aggr_grp_update_default(aggr_grp_t *grp) 643 { 644 aggr_port_t *port; 645 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 646 647 rw_enter(&grp->lg_tx_lock, RW_WRITER); 648 649 if (grp->lg_ntx_ports == 0) { 650 rw_exit(&grp->lg_tx_lock); 651 return; 652 } 653 654 port = grp->lg_tx_ports[0]; 655 ASSERT(port->lp_tx_ring_cnt > 0); 656 mac_hwring_set_default(grp->lg_mh, port->lp_pseudo_tx_rings[0]); 657 rw_exit(&grp->lg_tx_lock); 658 } 659 660 /* 661 * Add a pseudo RX ring for the given HW ring handle. 
662 */ 663 static int 664 aggr_add_pseudo_rx_ring(aggr_port_t *port, 665 aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh) 666 { 667 aggr_pseudo_rx_ring_t *ring; 668 int err; 669 int j; 670 671 for (j = 0; j < MAX_RINGS_PER_GROUP; j++) { 672 ring = rx_grp->arg_rings + j; 673 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE)) 674 break; 675 } 676 677 /* 678 * No slot for this new RX ring. 679 */ 680 if (j == MAX_RINGS_PER_GROUP) 681 return (ENOSPC); 682 683 ring->arr_flags |= MAC_PSEUDO_RING_INUSE; 684 ring->arr_hw_rh = hw_rh; 685 ring->arr_port = port; 686 ring->arr_grp = rx_grp; 687 rx_grp->arg_ring_cnt++; 688 689 /* 690 * The group is already registered, dynamically add a new ring to the 691 * mac group. 692 */ 693 if ((err = mac_group_add_ring(rx_grp->arg_gh, j)) != 0) { 694 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE; 695 ring->arr_hw_rh = NULL; 696 ring->arr_port = NULL; 697 ring->arr_grp = NULL; 698 rx_grp->arg_ring_cnt--; 699 } else { 700 /* 701 * This must run after the MAC is registered. 702 */ 703 ASSERT3P(ring->arr_rh, !=, NULL); 704 mac_hwring_set_passthru(hw_rh, (mac_rx_t)aggr_recv_cb, 705 (void *)port, (mac_resource_handle_t)ring); 706 } 707 return (err); 708 } 709 710 /* 711 * Remove the pseudo RX ring of the given HW ring handle. 712 */ 713 static void 714 aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh) 715 { 716 for (uint_t j = 0; j < MAX_RINGS_PER_GROUP; j++) { 717 aggr_pseudo_rx_ring_t *ring = rx_grp->arg_rings + j; 718 719 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE) || 720 ring->arr_hw_rh != hw_rh) { 721 continue; 722 } 723 724 mac_group_rem_ring(rx_grp->arg_gh, ring->arr_rh); 725 726 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE; 727 ring->arr_hw_rh = NULL; 728 ring->arr_port = NULL; 729 ring->arr_grp = NULL; 730 rx_grp->arg_ring_cnt--; 731 mac_hwring_clear_passthru(hw_rh); 732 break; 733 } 734 } 735 736 /* 737 * Create pseudo rings over the HW rings of the port. 
738 * 739 * o Create a pseudo ring in rx_grp per HW ring in the port's HW group. 740 * 741 * o Program existing unicast filters on the pseudo group into the HW group. 742 * 743 * o Program existing VLAN filters on the pseudo group into the HW group. 744 */ 745 static int 746 aggr_add_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp) 747 { 748 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP]; 749 aggr_unicst_addr_t *addr, *a; 750 mac_perim_handle_t pmph; 751 aggr_vlan_t *avp; 752 uint_t hw_rh_cnt, i; 753 int err = 0; 754 uint_t g_idx = rx_grp->arg_index; 755 756 ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh)); 757 ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT); 758 mac_perim_enter_by_mh(port->lp_mh, &pmph); 759 760 i = 0; 761 addr = NULL; 762 /* 763 * This function must be called after the aggr registers its 764 * MAC and its Rx groups have been initialized. 765 */ 766 ASSERT(rx_grp->arg_gh != NULL); 767 768 /* 769 * Get the list of the underlying HW rings. 770 */ 771 hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx, 772 &port->lp_hwghs[g_idx], hw_rh, MAC_RING_TYPE_RX); 773 774 /* 775 * Add existing VLAN and unicast address filters to the port. 
776 */ 777 for (avp = list_head(&rx_grp->arg_vlans); avp != NULL; 778 avp = list_next(&rx_grp->arg_vlans, avp)) { 779 if ((err = aggr_port_addvlan(port, g_idx, avp->av_vid)) != 0) 780 goto err; 781 } 782 783 for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) { 784 if ((err = aggr_port_addmac(port, g_idx, addr->aua_addr)) != 0) 785 goto err; 786 } 787 788 for (i = 0; i < hw_rh_cnt; i++) { 789 err = aggr_add_pseudo_rx_ring(port, rx_grp, hw_rh[i]); 790 if (err != 0) 791 goto err; 792 } 793 794 mac_perim_exit(pmph); 795 return (0); 796 797 err: 798 ASSERT(err != 0); 799 800 for (uint_t j = 0; j < i; j++) 801 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]); 802 803 for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next) 804 aggr_port_remmac(port, g_idx, a->aua_addr); 805 806 if (avp != NULL) 807 avp = list_prev(&rx_grp->arg_vlans, avp); 808 809 for (; avp != NULL; avp = list_prev(&rx_grp->arg_vlans, avp)) { 810 int err2; 811 812 if ((err2 = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) { 813 cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s" 814 ": errno %d.", avp->av_vid, 815 mac_client_name(port->lp_mch), err2); 816 } 817 } 818 819 port->lp_hwghs[g_idx] = NULL; 820 mac_perim_exit(pmph); 821 return (err); 822 } 823 824 /* 825 * Destroy the pseudo rings mapping to this port and remove all VLAN 826 * and unicast filters from this port. Even if there are no underlying 827 * HW rings we must still remove the unicast filters to take the port 828 * out of promisc mode. 
829 */ 830 static void 831 aggr_rem_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp) 832 { 833 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP]; 834 aggr_unicst_addr_t *addr; 835 mac_perim_handle_t pmph; 836 uint_t hw_rh_cnt; 837 uint_t g_idx = rx_grp->arg_index; 838 839 ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh)); 840 ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT); 841 ASSERT3P(rx_grp->arg_gh, !=, NULL); 842 mac_perim_enter_by_mh(port->lp_mh, &pmph); 843 844 hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx, NULL, hw_rh, 845 MAC_RING_TYPE_RX); 846 847 for (uint_t i = 0; i < hw_rh_cnt; i++) 848 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[i]); 849 850 for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) 851 aggr_port_remmac(port, g_idx, addr->aua_addr); 852 853 for (aggr_vlan_t *avp = list_head(&rx_grp->arg_vlans); avp != NULL; 854 avp = list_next(&rx_grp->arg_vlans, avp)) { 855 int err; 856 857 if ((err = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) { 858 cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s" 859 ": errno %d.", avp->av_vid, 860 mac_client_name(port->lp_mch), err); 861 } 862 } 863 864 port->lp_hwghs[g_idx] = NULL; 865 mac_perim_exit(pmph); 866 } 867 868 /* 869 * Add a pseudo TX ring for the given HW ring handle. 870 */ 871 static int 872 aggr_add_pseudo_tx_ring(aggr_port_t *port, 873 aggr_pseudo_tx_group_t *tx_grp, mac_ring_handle_t hw_rh, 874 mac_ring_handle_t *pseudo_rh) 875 { 876 aggr_pseudo_tx_ring_t *ring; 877 int err; 878 int i; 879 880 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 881 for (i = 0; i < MAX_RINGS_PER_GROUP; i++) { 882 ring = tx_grp->atg_rings + i; 883 if (!(ring->atr_flags & MAC_PSEUDO_RING_INUSE)) 884 break; 885 } 886 /* 887 * No slot for this new TX ring. 888 */ 889 if (i == MAX_RINGS_PER_GROUP) 890 return (ENOSPC); 891 /* 892 * The following 4 statements needs to be done before 893 * calling mac_group_add_ring(). Otherwise it will 894 * result in an assertion failure in mac_init_ring(). 
895 */ 896 ring->atr_flags |= MAC_PSEUDO_RING_INUSE; 897 ring->atr_hw_rh = hw_rh; 898 ring->atr_port = port; 899 tx_grp->atg_ring_cnt++; 900 901 /* 902 * The TX side has no concept of ring groups unlike RX groups. 903 * There is just a single group which stores all the TX rings. 904 * This group will be used to store aggr's pseudo TX rings. 905 */ 906 if ((err = mac_group_add_ring(tx_grp->atg_gh, i)) != 0) { 907 ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE; 908 ring->atr_hw_rh = NULL; 909 ring->atr_port = NULL; 910 tx_grp->atg_ring_cnt--; 911 } else { 912 *pseudo_rh = mac_find_ring(tx_grp->atg_gh, i); 913 if (hw_rh != NULL) { 914 mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring, 915 mac_find_ring(tx_grp->atg_gh, i)); 916 } 917 } 918 919 return (err); 920 } 921 922 /* 923 * Remove the pseudo TX ring of the given HW ring handle. 924 */ 925 static void 926 aggr_rem_pseudo_tx_ring(aggr_pseudo_tx_group_t *tx_grp, 927 mac_ring_handle_t pseudo_hw_rh) 928 { 929 aggr_pseudo_tx_ring_t *ring; 930 int i; 931 932 for (i = 0; i < MAX_RINGS_PER_GROUP; i++) { 933 ring = tx_grp->atg_rings + i; 934 if (ring->atr_rh != pseudo_hw_rh) 935 continue; 936 937 ASSERT(ring->atr_flags & MAC_PSEUDO_RING_INUSE); 938 mac_group_rem_ring(tx_grp->atg_gh, pseudo_hw_rh); 939 ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE; 940 mac_hwring_teardown(ring->atr_hw_rh); 941 ring->atr_hw_rh = NULL; 942 ring->atr_port = NULL; 943 tx_grp->atg_ring_cnt--; 944 break; 945 } 946 } 947 948 /* 949 * This function is called to create pseudo rings over hardware rings of 950 * the underlying device. There is a 1:1 mapping between the pseudo TX 951 * rings of the aggr and the hardware rings of the underlying port. 
952 */ 953 static int 954 aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp, 955 uint_t limit) 956 { 957 aggr_grp_t *grp = port->lp_grp; 958 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP], pseudo_rh; 959 mac_perim_handle_t pmph; 960 int hw_rh_cnt, i = 0, j; 961 int err = 0; 962 963 if (limit == 0) 964 return (ENOSPC); 965 966 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 967 mac_perim_enter_by_mh(port->lp_mh, &pmph); 968 969 /* 970 * Get the list the the underlying HW rings. 971 */ 972 hw_rh_cnt = mac_hwrings_get(port->lp_mch, NULL, hw_rh, 973 MAC_RING_TYPE_TX); 974 975 /* 976 * Even if the underlying NIC does not have TX rings, we 977 * still make a psuedo TX ring for that NIC with NULL as 978 * the ring handle. 979 */ 980 if (hw_rh_cnt == 0) 981 port->lp_tx_ring_cnt = 1; 982 else 983 port->lp_tx_ring_cnt = MIN(hw_rh_cnt, limit); 984 985 port->lp_tx_ring_alloc = port->lp_tx_ring_cnt; 986 port->lp_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) * 987 port->lp_tx_ring_alloc), KM_SLEEP); 988 port->lp_pseudo_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) * 989 port->lp_tx_ring_alloc), KM_SLEEP); 990 991 if (hw_rh_cnt == 0) { 992 if ((err = aggr_add_pseudo_tx_ring(port, tx_grp, 993 NULL, &pseudo_rh)) == 0) { 994 port->lp_tx_rings[0] = NULL; 995 port->lp_pseudo_tx_rings[0] = pseudo_rh; 996 } 997 } else { 998 for (i = 0; err == 0 && i < port->lp_tx_ring_cnt; i++) { 999 err = aggr_add_pseudo_tx_ring(port, 1000 tx_grp, hw_rh[i], &pseudo_rh); 1001 if (err != 0) 1002 break; 1003 port->lp_tx_rings[i] = hw_rh[i]; 1004 port->lp_pseudo_tx_rings[i] = pseudo_rh; 1005 } 1006 } 1007 1008 if (err != 0) { 1009 if (hw_rh_cnt != 0) { 1010 for (j = 0; j < i; j++) { 1011 aggr_rem_pseudo_tx_ring(tx_grp, 1012 port->lp_pseudo_tx_rings[j]); 1013 } 1014 } 1015 kmem_free(port->lp_tx_rings, 1016 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_alloc)); 1017 kmem_free(port->lp_pseudo_tx_rings, 1018 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_alloc)); 1019 
port->lp_tx_ring_cnt = 0; 1020 port->lp_tx_ring_alloc = 0; 1021 } else { 1022 port->lp_tx_grp_added = B_TRUE; 1023 port->lp_tx_notify_mh = mac_client_tx_notify(port->lp_mch, 1024 aggr_tx_ring_update, port); 1025 } 1026 mac_perim_exit(pmph); 1027 aggr_grp_update_default(grp); 1028 return (err); 1029 } 1030 1031 /* 1032 * This function is called by aggr to remove pseudo TX rings over the 1033 * HW rings of the underlying port. 1034 */ 1035 static void 1036 aggr_rem_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp) 1037 { 1038 aggr_grp_t *grp = port->lp_grp; 1039 mac_perim_handle_t pmph; 1040 int i; 1041 1042 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 1043 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1044 1045 if (!port->lp_tx_grp_added) 1046 goto done; 1047 1048 ASSERT(tx_grp->atg_gh != NULL); 1049 1050 for (i = 0; i < port->lp_tx_ring_cnt; i++) 1051 aggr_rem_pseudo_tx_ring(tx_grp, port->lp_pseudo_tx_rings[i]); 1052 1053 kmem_free(port->lp_tx_rings, 1054 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_alloc)); 1055 kmem_free(port->lp_pseudo_tx_rings, 1056 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_alloc)); 1057 1058 port->lp_tx_ring_cnt = 0; 1059 (void) mac_client_tx_notify(port->lp_mch, NULL, port->lp_tx_notify_mh); 1060 port->lp_tx_grp_added = B_FALSE; 1061 aggr_grp_update_default(grp); 1062 done: 1063 mac_perim_exit(pmph); 1064 } 1065 1066 static int 1067 aggr_pseudo_disable_intr(mac_intr_handle_t ih) 1068 { 1069 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih; 1070 return (mac_hwring_disable_intr(rr_ring->arr_hw_rh)); 1071 } 1072 1073 static int 1074 aggr_pseudo_enable_intr(mac_intr_handle_t ih) 1075 { 1076 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih; 1077 return (mac_hwring_enable_intr(rr_ring->arr_hw_rh)); 1078 } 1079 1080 /* 1081 * Start the pseudo ring. Since the pseudo ring is just an abstraction 1082 * over an actual HW ring, the real task is to start the underlying HW 1083 * ring. 
1084 */ 1085 static int 1086 aggr_pseudo_start_rx_ring(mac_ring_driver_t arg, uint64_t mr_gen) 1087 { 1088 int err; 1089 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg; 1090 1091 err = mac_hwring_start(rr_ring->arr_hw_rh); 1092 1093 if (err != 0) 1094 return (err); 1095 1096 rr_ring->arr_gen = mr_gen; 1097 return (err); 1098 } 1099 1100 /* 1101 * Stop the pseudo ring. Since the pseudo ring is just an abstraction 1102 * over an actual HW ring, the real task is to stop the underlying HW 1103 * ring. 1104 */ 1105 static void 1106 aggr_pseudo_stop_rx_ring(mac_ring_driver_t arg) 1107 { 1108 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg; 1109 1110 /* 1111 * The rings underlying the default group must stay up to 1112 * continue receiving LACP traffic. We would normally never 1113 * stop the default Rx rings because of the primary MAC 1114 * client; but aggr's primary MAC client doesn't call 1115 * mac_unicast_add() and thus mi_active is 0 when the last 1116 * non-primary client is deleted. 1117 */ 1118 if (rr_ring->arr_grp->arg_index != 0) 1119 mac_hwring_stop(rr_ring->arr_hw_rh); 1120 } 1121 1122 /* 1123 * Trim each port in a group to ensure it uses no more than tx_ring_limit 1124 * rings. 1125 */ 1126 static void 1127 aggr_grp_balance_tx(aggr_grp_t *grp, uint_t tx_ring_limit) 1128 { 1129 aggr_port_t *port; 1130 mac_perim_handle_t mph; 1131 uint_t i, tx_ring_cnt; 1132 1133 ASSERT(tx_ring_limit > 0); 1134 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 1135 1136 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 1137 mac_perim_enter_by_mh(port->lp_mh, &mph); 1138 1139 /* 1140 * Reduce the Tx ring count first to prevent rings being 1141 * used as they are removed. 
1142 */ 1143 rw_enter(&grp->lg_tx_lock, RW_WRITER); 1144 if (port->lp_tx_ring_cnt <= tx_ring_limit) { 1145 rw_exit(&grp->lg_tx_lock); 1146 mac_perim_exit(mph); 1147 continue; 1148 } 1149 1150 tx_ring_cnt = port->lp_tx_ring_cnt; 1151 port->lp_tx_ring_cnt = tx_ring_limit; 1152 rw_exit(&grp->lg_tx_lock); 1153 1154 for (i = tx_ring_cnt - 1; i >= tx_ring_limit; i--) { 1155 aggr_rem_pseudo_tx_ring(&grp->lg_tx_group, 1156 port->lp_pseudo_tx_rings[i]); 1157 1158 } 1159 1160 mac_perim_exit(mph); 1161 } 1162 } 1163 1164 /* 1165 * Add one or more ports to an existing link aggregation group. 1166 */ 1167 int 1168 aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force, 1169 laioc_port_t *ports) 1170 { 1171 int rc; 1172 uint_t port_added = 0; 1173 uint_t grp_added; 1174 uint_t nports_high, tx_ring_limit; 1175 aggr_grp_t *grp = NULL; 1176 aggr_port_t *port; 1177 boolean_t link_state_changed = B_FALSE; 1178 mac_perim_handle_t mph, pmph; 1179 1180 /* Get the aggr corresponding to linkid. */ 1181 rw_enter(&aggr_grp_lock, RW_READER); 1182 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1183 (mod_hash_val_t *)&grp) != 0) { 1184 rw_exit(&aggr_grp_lock); 1185 return (ENOENT); 1186 } 1187 AGGR_GRP_REFHOLD(grp); 1188 1189 /* 1190 * Hold the perimeter so that the aggregation can't be destroyed. 1191 */ 1192 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1193 rw_exit(&aggr_grp_lock); 1194 1195 /* 1196 * Limit the number of Tx rings per port. When determining the 1197 * number of ports take into consideration the existing high 1198 * value, and what the new high value may be after this request. 1199 */ 1200 nports_high = MAX(grp->lg_nports_high, grp->lg_nports + nports); 1201 tx_ring_limit = MAX_RINGS_PER_GROUP / nports_high; 1202 1203 if (tx_ring_limit == 0) { 1204 rc = ENOSPC; 1205 goto bail; 1206 } 1207 1208 /* 1209 * Balance the Tx rings so each port has a fair share of rings. 
1210 */ 1211 aggr_grp_balance_tx(grp, tx_ring_limit); 1212 1213 /* Add the specified ports to the aggr. */ 1214 for (uint_t i = 0; i < nports; i++) { 1215 grp_added = 0; 1216 1217 if ((rc = aggr_grp_add_port(grp, ports[i].lp_linkid, 1218 force, &port)) != 0) { 1219 goto bail; 1220 } 1221 1222 ASSERT(port != NULL); 1223 port_added++; 1224 1225 /* check capabilities */ 1226 if (!aggr_grp_capab_check(grp, port) || 1227 !aggr_grp_sdu_check(grp, port) || 1228 !aggr_grp_margin_check(grp, port)) { 1229 rc = ENOTSUP; 1230 goto bail; 1231 } 1232 1233 /* 1234 * Create the pseudo ring for each HW ring of the underlying 1235 * port. 1236 */ 1237 rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group, 1238 tx_ring_limit); 1239 if (rc != 0) 1240 goto bail; 1241 1242 for (uint_t j = 0; j < grp->lg_rx_group_count; j++) { 1243 rc = aggr_add_pseudo_rx_group(port, 1244 &grp->lg_rx_groups[j]); 1245 1246 if (rc != 0) 1247 goto bail; 1248 1249 grp_added++; 1250 } 1251 1252 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1253 1254 /* set LACP mode */ 1255 aggr_port_lacp_set_mode(grp, port); 1256 1257 /* start port if group has already been started */ 1258 if (grp->lg_started) { 1259 rc = aggr_port_start(port); 1260 if (rc != 0) { 1261 mac_perim_exit(pmph); 1262 goto bail; 1263 } 1264 1265 /* 1266 * Turn on the promiscuous mode over the port when it 1267 * is requested to be turned on to receive the 1268 * non-primary address over a port, or the promiscuous 1269 * mode is enabled over the aggr. 1270 */ 1271 if (grp->lg_promisc || port->lp_prom_addr != NULL) { 1272 rc = aggr_port_promisc(port, B_TRUE); 1273 if (rc != 0) { 1274 mac_perim_exit(pmph); 1275 goto bail; 1276 } 1277 } 1278 } 1279 mac_perim_exit(pmph); 1280 1281 /* 1282 * Attach each port if necessary. 1283 */ 1284 if (aggr_port_notify_link(grp, port)) 1285 link_state_changed = B_TRUE; 1286 1287 /* 1288 * Initialize the callback functions for this port. 
1289 */ 1290 aggr_port_init_callbacks(port); 1291 } 1292 1293 /* update the MAC address of the constituent ports */ 1294 if (aggr_grp_update_ports_mac(grp)) 1295 link_state_changed = B_TRUE; 1296 1297 if (link_state_changed) 1298 mac_link_update(grp->lg_mh, grp->lg_link_state); 1299 1300 bail: 1301 if (rc != 0) { 1302 /* stop and remove ports that have been added */ 1303 for (uint_t i = 0; i < port_added; i++) { 1304 uint_t grp_remove; 1305 1306 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid); 1307 ASSERT(port != NULL); 1308 1309 if (grp->lg_started) { 1310 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1311 (void) aggr_port_promisc(port, B_FALSE); 1312 aggr_port_stop(port); 1313 mac_perim_exit(pmph); 1314 } 1315 1316 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group); 1317 1318 /* 1319 * Only the last port could have a partial set 1320 * of groups added. 1321 */ 1322 grp_remove = (i + 1 == port_added) ? grp_added : 1323 grp->lg_rx_group_count; 1324 1325 for (uint_t j = 0; j < grp_remove; j++) { 1326 aggr_rem_pseudo_rx_group(port, 1327 &grp->lg_rx_groups[j]); 1328 } 1329 1330 (void) aggr_grp_rem_port(grp, port, NULL, NULL); 1331 } 1332 } 1333 1334 mac_perim_exit(mph); 1335 AGGR_GRP_REFRELE(grp); 1336 return (rc); 1337 } 1338 1339 static int 1340 aggr_grp_modify_common(aggr_grp_t *grp, uint8_t update_mask, uint32_t policy, 1341 boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, 1342 aggr_lacp_timer_t lacp_timer) 1343 { 1344 boolean_t mac_addr_changed = B_FALSE; 1345 boolean_t link_state_changed = B_FALSE; 1346 mac_perim_handle_t pmph; 1347 1348 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 1349 1350 /* validate fixed address if specified */ 1351 if ((update_mask & AGGR_MODIFY_MAC) && mac_fixed && 1352 ((bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) || 1353 (mac_addr[0] & 0x01))) { 1354 return (EINVAL); 1355 } 1356 1357 /* update policy if requested */ 1358 if (update_mask & AGGR_MODIFY_POLICY) 1359 aggr_send_update_policy(grp, policy); 1360 1361 /* 
update unicast MAC address if requested */ 1362 if (update_mask & AGGR_MODIFY_MAC) { 1363 if (mac_fixed) { 1364 /* user-supplied MAC address */ 1365 grp->lg_mac_addr_port = NULL; 1366 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) != 0) { 1367 bcopy(mac_addr, grp->lg_addr, ETHERADDRL); 1368 mac_addr_changed = B_TRUE; 1369 } 1370 } else if (grp->lg_addr_fixed) { 1371 /* switch from user-supplied to automatic */ 1372 aggr_port_t *port = grp->lg_ports; 1373 1374 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1375 bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL); 1376 grp->lg_mac_addr_port = port; 1377 mac_addr_changed = B_TRUE; 1378 mac_perim_exit(pmph); 1379 } 1380 grp->lg_addr_fixed = mac_fixed; 1381 } 1382 1383 if (mac_addr_changed) 1384 link_state_changed = aggr_grp_update_ports_mac(grp); 1385 1386 if (update_mask & AGGR_MODIFY_LACP_MODE) 1387 aggr_lacp_update_mode(grp, lacp_mode); 1388 1389 if (update_mask & AGGR_MODIFY_LACP_TIMER) 1390 aggr_lacp_update_timer(grp, lacp_timer); 1391 1392 if (link_state_changed) 1393 mac_link_update(grp->lg_mh, grp->lg_link_state); 1394 1395 if (mac_addr_changed) 1396 mac_unicst_update(grp->lg_mh, grp->lg_addr); 1397 1398 return (0); 1399 } 1400 1401 /* 1402 * Update properties of an existing link aggregation group. 1403 */ 1404 int 1405 aggr_grp_modify(datalink_id_t linkid, uint8_t update_mask, uint32_t policy, 1406 boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, 1407 aggr_lacp_timer_t lacp_timer) 1408 { 1409 aggr_grp_t *grp = NULL; 1410 mac_perim_handle_t mph; 1411 int err; 1412 1413 /* get group corresponding to linkid */ 1414 rw_enter(&aggr_grp_lock, RW_READER); 1415 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1416 (mod_hash_val_t *)&grp) != 0) { 1417 rw_exit(&aggr_grp_lock); 1418 return (ENOENT); 1419 } 1420 AGGR_GRP_REFHOLD(grp); 1421 1422 /* 1423 * Hold the perimeter so that the aggregation won't be destroyed. 
1424 */ 1425 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1426 rw_exit(&aggr_grp_lock); 1427 1428 err = aggr_grp_modify_common(grp, update_mask, policy, mac_fixed, 1429 mac_addr, lacp_mode, lacp_timer); 1430 1431 mac_perim_exit(mph); 1432 AGGR_GRP_REFRELE(grp); 1433 return (err); 1434 } 1435 1436 /* 1437 * Create a new link aggregation group upon request from administrator. 1438 * Returns 0 on success, an errno on failure. 1439 */ 1440 int 1441 aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports, 1442 laioc_port_t *ports, uint32_t policy, boolean_t mac_fixed, boolean_t force, 1443 uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, aggr_lacp_timer_t lacp_timer, 1444 cred_t *credp) 1445 { 1446 aggr_grp_t *grp = NULL; 1447 aggr_port_t *port; 1448 aggr_port_t *last_attached = NULL; 1449 mac_register_t *mac; 1450 boolean_t link_state_changed; 1451 mac_perim_handle_t mph, pmph; 1452 datalink_id_t tempid; 1453 boolean_t mac_registered = B_FALSE; 1454 uint_t tx_ring_limit; 1455 int err; 1456 int i, j; 1457 kt_did_t tid = 0; 1458 1459 /* need at least one port */ 1460 if (nports == 0) 1461 return (EINVAL); 1462 1463 rw_enter(&aggr_grp_lock, RW_WRITER); 1464 1465 /* does a group with the same linkid already exist? 
*/ 1466 err = mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1467 (mod_hash_val_t *)&grp); 1468 if (err == 0) { 1469 rw_exit(&aggr_grp_lock); 1470 return (EEXIST); 1471 } 1472 1473 grp = kmem_cache_alloc(aggr_grp_cache, KM_SLEEP); 1474 1475 grp->lg_refs = 1; 1476 grp->lg_closing = B_FALSE; 1477 grp->lg_force = force; 1478 grp->lg_linkid = linkid; 1479 grp->lg_zoneid = crgetzoneid(credp); 1480 grp->lg_ifspeed = 0; 1481 grp->lg_link_state = LINK_STATE_UNKNOWN; 1482 grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN; 1483 grp->lg_started = B_FALSE; 1484 grp->lg_promisc = B_FALSE; 1485 grp->lg_lacp_done = B_FALSE; 1486 grp->lg_tx_notify_done = B_FALSE; 1487 grp->lg_lacp_head = grp->lg_lacp_tail = NULL; 1488 grp->lg_lacp_rx_thread = thread_create(NULL, 0, 1489 aggr_lacp_rx_thread, grp, 0, &p0, TS_RUN, minclsyspri); 1490 grp->lg_tx_notify_thread = thread_create(NULL, 0, 1491 aggr_tx_notify_thread, grp, 0, &p0, TS_RUN, minclsyspri); 1492 grp->lg_tx_blocked_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) * 1493 MAX_RINGS_PER_GROUP), KM_SLEEP); 1494 grp->lg_tx_blocked_cnt = 0; 1495 bzero(&grp->lg_rx_groups, 1496 sizeof (aggr_pseudo_rx_group_t) * MAX_GROUPS_PER_PORT); 1497 bzero(&grp->lg_tx_group, sizeof (aggr_pseudo_tx_group_t)); 1498 aggr_lacp_init_grp(grp); 1499 1500 /* add MAC ports to group */ 1501 grp->lg_ports = NULL; 1502 grp->lg_nports = 0; 1503 grp->lg_nattached_ports = 0; 1504 grp->lg_ntx_ports = 0; 1505 1506 /* 1507 * If key is not specified by the user, allocate the key. 
1508 */ 1509 if ((key == 0) && ((key = (uint32_t)id_alloc(key_ids)) == 0)) { 1510 err = ENOMEM; 1511 goto bail; 1512 } 1513 grp->lg_key = key; 1514 1515 for (i = 0; i < nports; i++) { 1516 err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, &port); 1517 if (err != 0) 1518 goto bail; 1519 } 1520 1521 grp->lg_rx_group_count = 1; 1522 1523 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 1524 uint_t num_rgroups; 1525 1526 mac_perim_enter_by_mh(port->lp_mh, &mph); 1527 num_rgroups = mac_get_num_rx_groups(port->lp_mh); 1528 mac_perim_exit(mph); 1529 1530 /* 1531 * Utilize all the groups in a port. If some ports 1532 * have less groups than others, then traffic destined 1533 * for the same unicast address may be HW classified 1534 * on some ports but SW classified by aggr when 1535 * arriving on other ports. 1536 */ 1537 grp->lg_rx_group_count = MAX(grp->lg_rx_group_count, 1538 num_rgroups); 1539 } 1540 1541 /* 1542 * There could be cases where the hardware provides more 1543 * groups than aggr can support. Make sure we never go above 1544 * the max aggr can support. 1545 */ 1546 grp->lg_rx_group_count = MIN(grp->lg_rx_group_count, 1547 MAX_GROUPS_PER_PORT); 1548 1549 ASSERT3U(grp->lg_rx_group_count, >, 0); 1550 for (i = 0; i < MAX_GROUPS_PER_PORT; i++) { 1551 grp->lg_rx_groups[i].arg_index = i; 1552 grp->lg_rx_groups[i].arg_untagged = 0; 1553 list_create(&(grp->lg_rx_groups[i].arg_vlans), 1554 sizeof (aggr_vlan_t), offsetof(aggr_vlan_t, av_link)); 1555 } 1556 1557 /* 1558 * If no explicit MAC address was specified by the administrator, 1559 * set it to the MAC address of the first port. 
1560 */ 1561 grp->lg_addr_fixed = mac_fixed; 1562 if (grp->lg_addr_fixed) { 1563 /* validate specified address */ 1564 if (bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) { 1565 err = EINVAL; 1566 goto bail; 1567 } 1568 bcopy(mac_addr, grp->lg_addr, ETHERADDRL); 1569 } else { 1570 bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL); 1571 grp->lg_mac_addr_port = grp->lg_ports; 1572 } 1573 1574 /* Set the initial group capabilities. */ 1575 aggr_grp_capab_set(grp); 1576 1577 if ((mac = mac_alloc(MAC_VERSION)) == NULL) { 1578 err = ENOMEM; 1579 goto bail; 1580 } 1581 mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER; 1582 mac->m_driver = grp; 1583 mac->m_dip = aggr_dip; 1584 mac->m_instance = grp->lg_key > AGGR_MAX_KEY ? (uint_t)-1 : grp->lg_key; 1585 mac->m_src_addr = grp->lg_addr; 1586 mac->m_callbacks = &aggr_m_callbacks; 1587 mac->m_min_sdu = 0; 1588 mac->m_max_sdu = grp->lg_max_sdu = aggr_grp_max_sdu(grp); 1589 mac->m_margin = aggr_grp_max_margin(grp); 1590 mac->m_v12n = MAC_VIRT_LEVEL1; 1591 err = mac_register(mac, &grp->lg_mh); 1592 mac_free(mac); 1593 if (err != 0) 1594 goto bail; 1595 1596 err = dls_devnet_create(grp->lg_mh, grp->lg_linkid, crgetzoneid(credp)); 1597 if (err != 0) { 1598 (void) mac_unregister(grp->lg_mh); 1599 grp->lg_mh = NULL; 1600 goto bail; 1601 } 1602 1603 mac_registered = B_TRUE; 1604 1605 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1606 1607 /* 1608 * Update the MAC address of the constituent ports. 1609 * None of the port is attached at this time, the link state of the 1610 * aggregation will not change. 1611 * 1612 * All ports take on the primary MAC address of the aggr 1613 * (lg_aggr). At this point, none of the ports are attached; 1614 * thus the link state of the aggregation will not change. 1615 */ 1616 link_state_changed = aggr_grp_update_ports_mac(grp); 1617 ASSERT(!link_state_changed); 1618 1619 /* Update outbound load balancing policy. */ 1620 aggr_send_update_policy(grp, policy); 1621 1622 /* Set LACP mode. 
*/ 1623 aggr_lacp_set_mode(grp, lacp_mode, lacp_timer); 1624 1625 /* 1626 * The pseudo Tx group holds a maximum of MAX_RINGS_PER_GROUP 1627 * rings, when all the Tx rings of all the ports are accumulated 1628 * it is conceivable this limit is exceeded. We try and prevent 1629 * this by limiting the number of rings an individual port will use. 1630 * 1631 * - When an aggr is first created, we will not let an 1632 * individual port use more than MAX_RINGS_PER_GROUP/nports 1633 * rings. 1634 * - As ports are added to an existing aggr, each of the 1635 * ports will not use more than MAX_RINGS_PER_GROUP/nports_high. 1636 * Where nports_high is the highest number of ports the aggr has 1637 * held (including any ports being added). This may involve 1638 * trimming rings from existing ports. 1639 */ 1640 1641 /* Leave room for 4 ports */ 1642 tx_ring_limit = MAX_RINGS_PER_GROUP / MAX(4, nports); 1643 1644 /* 1645 * Attach each port if necessary. 1646 */ 1647 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 1648 /* 1649 * Create the pseudo ring for each HW ring of the 1650 * underlying port. Note that this is done after the 1651 * aggr registers its MAC. 1652 */ 1653 err = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group, 1654 tx_ring_limit); 1655 1656 if (err != 0) { 1657 mac_perim_exit(mph); 1658 goto bail; 1659 } 1660 1661 for (i = 0; i < grp->lg_rx_group_count; i++) { 1662 err = aggr_add_pseudo_rx_group(port, 1663 &grp->lg_rx_groups[i]); 1664 1665 if (err != 0) { 1666 /* 1667 * Undo what we have added for the current 1668 * port. 1669 */ 1670 aggr_rem_pseudo_tx_group(port, 1671 &grp->lg_tx_group); 1672 1673 for (j = 0; j < i; j++) { 1674 aggr_rem_pseudo_rx_group(port, 1675 &grp->lg_rx_groups[j]); 1676 } 1677 1678 mac_perim_exit(mph); 1679 goto bail; 1680 } 1681 } 1682 1683 if (aggr_port_notify_link(grp, port)) 1684 link_state_changed = B_TRUE; 1685 1686 /* 1687 * Initialize the callback functions for this port. 
1688 */ 1689 aggr_port_init_callbacks(port); 1690 1691 last_attached = port; 1692 } 1693 1694 if (link_state_changed) 1695 mac_link_update(grp->lg_mh, grp->lg_link_state); 1696 1697 /* add new group to hash table */ 1698 err = mod_hash_insert(aggr_grp_hash, GRP_HASH_KEY(linkid), 1699 (mod_hash_val_t)grp); 1700 ASSERT(err == 0); 1701 aggr_grp_cnt++; 1702 1703 mac_perim_exit(mph); 1704 rw_exit(&aggr_grp_lock); 1705 return (0); 1706 1707 bail: 1708 grp->lg_closing = B_TRUE; 1709 1710 /* 1711 * Inform the lacp_rx thread to exit. 1712 */ 1713 mutex_enter(&grp->lg_lacp_lock); 1714 grp->lg_lacp_done = B_TRUE; 1715 cv_signal(&grp->lg_lacp_cv); 1716 while (grp->lg_lacp_rx_thread != NULL) 1717 cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock); 1718 mutex_exit(&grp->lg_lacp_lock); 1719 /* 1720 * Inform the tx_notify thread to exit. 1721 */ 1722 mutex_enter(&grp->lg_tx_flowctl_lock); 1723 if (grp->lg_tx_notify_thread != NULL) { 1724 tid = grp->lg_tx_notify_thread->t_did; 1725 grp->lg_tx_notify_done = B_TRUE; 1726 cv_signal(&grp->lg_tx_flowctl_cv); 1727 } 1728 mutex_exit(&grp->lg_tx_flowctl_lock); 1729 if (tid != 0) 1730 thread_join(tid); 1731 1732 if (mac_registered) { 1733 (void) dls_devnet_destroy(grp->lg_mh, &tempid, B_TRUE); 1734 (void) mac_disable(grp->lg_mh); 1735 1736 if (last_attached != NULL) { 1737 /* 1738 * Detach and clean up ports added. 
1739 */ 1740 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1741 1742 for (port = grp->lg_ports; ; port = port->lp_next) { 1743 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1744 (void) aggr_grp_detach_port(grp, port); 1745 mac_perim_exit(pmph); 1746 1747 aggr_rem_pseudo_tx_group(port, 1748 &grp->lg_tx_group); 1749 1750 for (i = 0; i < grp->lg_rx_group_count; i++) { 1751 aggr_rem_pseudo_rx_group(port, 1752 &grp->lg_rx_groups[i]); 1753 } 1754 if (port == last_attached) 1755 break; 1756 } 1757 1758 mac_perim_exit(mph); 1759 } 1760 1761 (void) mac_unregister(grp->lg_mh); 1762 } 1763 1764 port = grp->lg_ports; 1765 while (port != NULL) { 1766 aggr_port_t *cport; 1767 1768 cport = port->lp_next; 1769 aggr_port_delete(port); 1770 port = cport; 1771 } 1772 1773 kmem_free(grp->lg_tx_blocked_rings, 1774 (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP)); 1775 rw_exit(&aggr_grp_lock); 1776 AGGR_GRP_REFRELE(grp); 1777 return (err); 1778 } 1779 1780 /* 1781 * Return a pointer to the member of a group with specified linkid. 1782 */ 1783 static aggr_port_t * 1784 aggr_grp_port_lookup(aggr_grp_t *grp, datalink_id_t linkid) 1785 { 1786 aggr_port_t *port; 1787 1788 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 1789 1790 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 1791 if (port->lp_linkid == linkid) 1792 break; 1793 } 1794 1795 return (port); 1796 } 1797 1798 /* 1799 * Stop, detach and remove a port from a link aggregation group. 
1800 */ 1801 static int 1802 aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port, 1803 boolean_t *mac_addr_changedp, boolean_t *link_state_changedp) 1804 { 1805 int rc = 0; 1806 aggr_port_t **pport; 1807 boolean_t mac_addr_changed = B_FALSE; 1808 boolean_t link_state_changed = B_FALSE; 1809 mac_perim_handle_t mph; 1810 uint64_t val; 1811 uint_t i; 1812 uint_t stat; 1813 1814 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 1815 ASSERT(grp->lg_nports > 1); 1816 ASSERT(!grp->lg_closing); 1817 1818 /* unlink port */ 1819 for (pport = &grp->lg_ports; *pport != port; 1820 pport = &(*pport)->lp_next) { 1821 if (*pport == NULL) { 1822 rc = ENOENT; 1823 goto done; 1824 } 1825 } 1826 *pport = port->lp_next; 1827 1828 mac_perim_enter_by_mh(port->lp_mh, &mph); 1829 1830 /* 1831 * If the MAC address of the port being removed was assigned 1832 * to the group, update the group MAC address 1833 * using the MAC address of a different port. 1834 */ 1835 if (!grp->lg_addr_fixed && grp->lg_mac_addr_port == port) { 1836 /* 1837 * Set the MAC address of the group to the 1838 * MAC address of its first port. 1839 */ 1840 bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL); 1841 grp->lg_mac_addr_port = grp->lg_ports; 1842 mac_addr_changed = B_TRUE; 1843 } 1844 1845 link_state_changed = aggr_grp_detach_port(grp, port); 1846 1847 /* 1848 * Add the counter statistics of the ports while it was aggregated 1849 * to the group's residual statistics. This is done by obtaining 1850 * the current counter from the underlying MAC then subtracting the 1851 * value of the counter at the moment it was added to the 1852 * aggregation. 
1853 */ 1854 for (i = 0; i < MAC_NSTAT; i++) { 1855 stat = i + MAC_STAT_MIN; 1856 if (!MAC_STAT_ISACOUNTER(stat)) 1857 continue; 1858 val = aggr_port_stat(port, stat); 1859 val -= port->lp_stat[i]; 1860 mutex_enter(&grp->lg_stat_lock); 1861 grp->lg_stat[i] += val; 1862 mutex_exit(&grp->lg_stat_lock); 1863 } 1864 for (i = 0; i < ETHER_NSTAT; i++) { 1865 stat = i + MACTYPE_STAT_MIN; 1866 if (!ETHER_STAT_ISACOUNTER(stat)) 1867 continue; 1868 val = aggr_port_stat(port, stat); 1869 val -= port->lp_ether_stat[i]; 1870 mutex_enter(&grp->lg_stat_lock); 1871 grp->lg_ether_stat[i] += val; 1872 mutex_exit(&grp->lg_stat_lock); 1873 } 1874 1875 grp->lg_nports--; 1876 mac_perim_exit(mph); 1877 1878 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group); 1879 aggr_port_delete(port); 1880 1881 /* 1882 * If the group MAC address has changed, update the MAC address of 1883 * the remaining constituent ports according to the new MAC 1884 * address of the group. 1885 */ 1886 if (mac_addr_changed && aggr_grp_update_ports_mac(grp)) 1887 link_state_changed = B_TRUE; 1888 1889 done: 1890 if (mac_addr_changedp != NULL) 1891 *mac_addr_changedp = mac_addr_changed; 1892 if (link_state_changedp != NULL) 1893 *link_state_changedp = link_state_changed; 1894 1895 return (rc); 1896 } 1897 1898 /* 1899 * Remove one or more ports from an existing link aggregation group. 
1900 */ 1901 int 1902 aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports) 1903 { 1904 int rc = 0; 1905 uint_t i; 1906 aggr_grp_t *grp = NULL; 1907 aggr_port_t *port; 1908 boolean_t mac_addr_update = B_FALSE, mac_addr_changed; 1909 boolean_t link_state_update = B_FALSE, link_state_changed; 1910 mac_perim_handle_t mph, pmph; 1911 1912 /* get group corresponding to linkid */ 1913 rw_enter(&aggr_grp_lock, RW_READER); 1914 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1915 (mod_hash_val_t *)&grp) != 0) { 1916 rw_exit(&aggr_grp_lock); 1917 return (ENOENT); 1918 } 1919 AGGR_GRP_REFHOLD(grp); 1920 1921 /* 1922 * Hold the perimeter so that the aggregation won't be destroyed. 1923 */ 1924 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1925 rw_exit(&aggr_grp_lock); 1926 1927 /* we need to keep at least one port per group */ 1928 if (nports >= grp->lg_nports) { 1929 rc = EINVAL; 1930 goto bail; 1931 } 1932 1933 /* first verify that all the groups are valid */ 1934 for (i = 0; i < nports; i++) { 1935 if (aggr_grp_port_lookup(grp, ports[i].lp_linkid) == NULL) { 1936 /* port not found */ 1937 rc = ENOENT; 1938 goto bail; 1939 } 1940 } 1941 1942 /* clear the promiscous mode for the specified ports */ 1943 for (i = 0; i < nports && rc == 0; i++) { 1944 /* lookup port */ 1945 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid); 1946 ASSERT(port != NULL); 1947 1948 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1949 rc = aggr_port_promisc(port, B_FALSE); 1950 mac_perim_exit(pmph); 1951 } 1952 if (rc != 0) { 1953 for (i = 0; i < nports; i++) { 1954 port = aggr_grp_port_lookup(grp, 1955 ports[i].lp_linkid); 1956 ASSERT(port != NULL); 1957 1958 /* 1959 * Turn the promiscuous mode back on if it is required 1960 * to receive the non-primary address over a port, or 1961 * the promiscous mode is enabled over the aggr. 
1962 */ 1963 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1964 if (port->lp_started && (grp->lg_promisc || 1965 port->lp_prom_addr != NULL)) { 1966 (void) aggr_port_promisc(port, B_TRUE); 1967 } 1968 mac_perim_exit(pmph); 1969 } 1970 goto bail; 1971 } 1972 1973 /* remove the specified ports from group */ 1974 for (i = 0; i < nports; i++) { 1975 /* lookup port */ 1976 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid); 1977 ASSERT(port != NULL); 1978 1979 /* stop port if group has already been started */ 1980 if (grp->lg_started) { 1981 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1982 aggr_port_stop(port); 1983 mac_perim_exit(pmph); 1984 } 1985 1986 /* 1987 * aggr_rem_pseudo_tx_group() is not called here. Instead 1988 * it is called from inside aggr_grp_rem_port() after the 1989 * port has been detached. The reason is that 1990 * aggr_rem_pseudo_tx_group() removes one ring at a time 1991 * and if there is still traffic going on, then there 1992 * is the possibility of aggr_find_tx_ring() returning a 1993 * removed ring for transmission. Once the port has been 1994 * detached, that port will not be used and 1995 * aggr_find_tx_ring() will not return any rings 1996 * belonging to it. 
1997 */ 1998 for (uint_t j = 0; j < grp->lg_rx_group_count; j++) 1999 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[j]); 2000 2001 /* remove port from group */ 2002 rc = aggr_grp_rem_port(grp, port, &mac_addr_changed, 2003 &link_state_changed); 2004 ASSERT(rc == 0); 2005 mac_addr_update = mac_addr_update || mac_addr_changed; 2006 link_state_update = link_state_update || link_state_changed; 2007 } 2008 2009 bail: 2010 if (mac_addr_update) 2011 mac_unicst_update(grp->lg_mh, grp->lg_addr); 2012 if (link_state_update) 2013 mac_link_update(grp->lg_mh, grp->lg_link_state); 2014 2015 mac_perim_exit(mph); 2016 AGGR_GRP_REFRELE(grp); 2017 2018 return (rc); 2019 } 2020 2021 int 2022 aggr_grp_delete(datalink_id_t linkid, cred_t *cred) 2023 { 2024 aggr_grp_t *grp = NULL; 2025 aggr_port_t *port, *cport; 2026 datalink_id_t tmpid; 2027 mod_hash_val_t val; 2028 mac_perim_handle_t mph, pmph; 2029 int err; 2030 kt_did_t tid = 0; 2031 2032 rw_enter(&aggr_grp_lock, RW_WRITER); 2033 2034 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 2035 (mod_hash_val_t *)&grp) != 0) { 2036 rw_exit(&aggr_grp_lock); 2037 return (ENOENT); 2038 } 2039 2040 /* 2041 * Note that dls_devnet_destroy() must be called before lg_lock is 2042 * held. Otherwise, it will deadlock if another thread is in 2043 * aggr_m_stat() and thus has a kstat_hold() on the kstats that 2044 * dls_devnet_destroy() needs to delete. 2045 */ 2046 if ((err = dls_devnet_destroy(grp->lg_mh, &tmpid, B_TRUE)) != 0) { 2047 rw_exit(&aggr_grp_lock); 2048 return (err); 2049 } 2050 ASSERT(linkid == tmpid); 2051 2052 /* 2053 * Unregister from the MAC service module. Since this can 2054 * fail if a client hasn't closed the MAC port, we gracefully 2055 * fail the operation. 
2056 */ 2057 if ((err = mac_disable(grp->lg_mh)) != 0) { 2058 (void) dls_devnet_create(grp->lg_mh, linkid, crgetzoneid(cred)); 2059 rw_exit(&aggr_grp_lock); 2060 return (err); 2061 } 2062 (void) mod_hash_remove(aggr_grp_hash, GRP_HASH_KEY(linkid), &val); 2063 ASSERT(grp == (aggr_grp_t *)val); 2064 2065 ASSERT(aggr_grp_cnt > 0); 2066 aggr_grp_cnt--; 2067 rw_exit(&aggr_grp_lock); 2068 2069 /* 2070 * Inform the lacp_rx thread to exit. 2071 */ 2072 mutex_enter(&grp->lg_lacp_lock); 2073 grp->lg_lacp_done = B_TRUE; 2074 cv_signal(&grp->lg_lacp_cv); 2075 while (grp->lg_lacp_rx_thread != NULL) 2076 cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock); 2077 mutex_exit(&grp->lg_lacp_lock); 2078 /* 2079 * Inform the tx_notify_thread to exit. 2080 */ 2081 mutex_enter(&grp->lg_tx_flowctl_lock); 2082 if (grp->lg_tx_notify_thread != NULL) { 2083 tid = grp->lg_tx_notify_thread->t_did; 2084 grp->lg_tx_notify_done = B_TRUE; 2085 cv_signal(&grp->lg_tx_flowctl_cv); 2086 } 2087 mutex_exit(&grp->lg_tx_flowctl_lock); 2088 if (tid != 0) 2089 thread_join(tid); 2090 2091 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2092 2093 grp->lg_closing = B_TRUE; 2094 /* detach and free MAC ports associated with group */ 2095 port = grp->lg_ports; 2096 while (port != NULL) { 2097 cport = port->lp_next; 2098 mac_perim_enter_by_mh(port->lp_mh, &pmph); 2099 if (grp->lg_started) 2100 aggr_port_stop(port); 2101 (void) aggr_grp_detach_port(grp, port); 2102 mac_perim_exit(pmph); 2103 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group); 2104 for (uint_t i = 0; i < grp->lg_rx_group_count; i++) 2105 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[i]); 2106 aggr_port_delete(port); 2107 port = cport; 2108 } 2109 2110 mac_perim_exit(mph); 2111 2112 kmem_free(grp->lg_tx_blocked_rings, 2113 (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP)); 2114 /* 2115 * Wait for the port's lacp timer thread and its notification callback 2116 * to exit before calling mac_unregister() since both needs to access 2117 * the mac perimeter of 
the grp. 2118 */ 2119 aggr_grp_port_wait(grp); 2120 2121 VERIFY(mac_unregister(grp->lg_mh) == 0); 2122 grp->lg_mh = NULL; 2123 2124 for (uint_t i = 0; i < MAX_GROUPS_PER_PORT; i++) { 2125 list_destroy(&(grp->lg_rx_groups[i].arg_vlans)); 2126 } 2127 2128 AGGR_GRP_REFRELE(grp); 2129 return (0); 2130 } 2131 2132 void 2133 aggr_grp_free(aggr_grp_t *grp) 2134 { 2135 ASSERT(grp->lg_refs == 0); 2136 ASSERT(grp->lg_port_ref == 0); 2137 if (grp->lg_key > AGGR_MAX_KEY) { 2138 id_free(key_ids, grp->lg_key); 2139 grp->lg_key = 0; 2140 } 2141 kmem_cache_free(aggr_grp_cache, grp); 2142 } 2143 2144 int 2145 aggr_grp_info(datalink_id_t linkid, void *fn_arg, 2146 aggr_grp_info_new_grp_fn_t new_grp_fn, 2147 aggr_grp_info_new_port_fn_t new_port_fn, cred_t *cred) 2148 { 2149 aggr_grp_t *grp; 2150 aggr_port_t *port; 2151 mac_perim_handle_t mph, pmph; 2152 int rc = 0; 2153 2154 /* 2155 * Make sure that the aggregation link is visible from the caller's 2156 * zone. 2157 */ 2158 if (!dls_devnet_islinkvisible(linkid, crgetzoneid(cred))) 2159 return (ENOENT); 2160 2161 rw_enter(&aggr_grp_lock, RW_READER); 2162 2163 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 2164 (mod_hash_val_t *)&grp) != 0) { 2165 rw_exit(&aggr_grp_lock); 2166 return (ENOENT); 2167 } 2168 AGGR_GRP_REFHOLD(grp); 2169 2170 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2171 rw_exit(&aggr_grp_lock); 2172 2173 rc = new_grp_fn(fn_arg, grp->lg_linkid, 2174 (grp->lg_key > AGGR_MAX_KEY) ? 
0 : grp->lg_key, grp->lg_addr, 2175 grp->lg_addr_fixed, grp->lg_force, grp->lg_tx_policy, 2176 grp->lg_nports, grp->lg_lacp_mode, grp->aggr.PeriodicTimer); 2177 2178 if (rc != 0) 2179 goto bail; 2180 2181 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2182 mac_perim_enter_by_mh(port->lp_mh, &pmph); 2183 rc = new_port_fn(fn_arg, port->lp_linkid, port->lp_addr, 2184 port->lp_state, &port->lp_lacp.ActorOperPortState); 2185 mac_perim_exit(pmph); 2186 2187 if (rc != 0) 2188 goto bail; 2189 } 2190 2191 bail: 2192 mac_perim_exit(mph); 2193 AGGR_GRP_REFRELE(grp); 2194 return (rc); 2195 } 2196 2197 /*ARGSUSED*/ 2198 static void 2199 aggr_m_ioctl(void *arg, queue_t *q, mblk_t *mp) 2200 { 2201 miocnak(q, mp, 0, ENOTSUP); 2202 } 2203 2204 static int 2205 aggr_grp_stat(aggr_grp_t *grp, uint_t stat, uint64_t *val) 2206 { 2207 aggr_port_t *port; 2208 uint_t stat_index; 2209 2210 ASSERT(MUTEX_HELD(&grp->lg_stat_lock)); 2211 2212 /* We only aggregate counter statistics. */ 2213 if ((IS_MAC_STAT(stat) && !MAC_STAT_ISACOUNTER(stat)) || 2214 (IS_MACTYPE_STAT(stat) && !ETHER_STAT_ISACOUNTER(stat))) { 2215 return (ENOTSUP); 2216 } 2217 2218 /* 2219 * Counter statistics for a group are computed by aggregating the 2220 * counters of the members MACs while they were aggregated, plus 2221 * the residual counter of the group itself, which is updated each 2222 * time a MAC is removed from the group. 2223 */ 2224 *val = 0; 2225 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2226 /* actual port statistic */ 2227 *val += aggr_port_stat(port, stat); 2228 /* 2229 * minus the port stat when it was added, plus any residual 2230 * amount for the group. 
2231 */ 2232 if (IS_MAC_STAT(stat)) { 2233 stat_index = stat - MAC_STAT_MIN; 2234 *val -= port->lp_stat[stat_index]; 2235 *val += grp->lg_stat[stat_index]; 2236 } else if (IS_MACTYPE_STAT(stat)) { 2237 stat_index = stat - MACTYPE_STAT_MIN; 2238 *val -= port->lp_ether_stat[stat_index]; 2239 *val += grp->lg_ether_stat[stat_index]; 2240 } 2241 } 2242 return (0); 2243 } 2244 2245 int 2246 aggr_rx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val) 2247 { 2248 aggr_pseudo_rx_ring_t *rx_ring = (aggr_pseudo_rx_ring_t *)rdriver; 2249 2250 if (rx_ring->arr_hw_rh != NULL) { 2251 *val = mac_pseudo_rx_ring_stat_get(rx_ring->arr_hw_rh, stat); 2252 } else { 2253 aggr_port_t *port = rx_ring->arr_port; 2254 2255 *val = mac_stat_get(port->lp_mh, stat); 2256 2257 } 2258 return (0); 2259 } 2260 2261 int 2262 aggr_tx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val) 2263 { 2264 aggr_pseudo_tx_ring_t *tx_ring = (aggr_pseudo_tx_ring_t *)rdriver; 2265 2266 if (tx_ring->atr_hw_rh != NULL) { 2267 *val = mac_pseudo_tx_ring_stat_get(tx_ring->atr_hw_rh, stat); 2268 } else { 2269 aggr_port_t *port = tx_ring->atr_port; 2270 2271 *val = mac_stat_get(port->lp_mh, stat); 2272 } 2273 return (0); 2274 } 2275 2276 static int 2277 aggr_m_stat(void *arg, uint_t stat, uint64_t *val) 2278 { 2279 aggr_grp_t *grp = arg; 2280 int rval = 0; 2281 2282 mutex_enter(&grp->lg_stat_lock); 2283 2284 switch (stat) { 2285 case MAC_STAT_IFSPEED: 2286 *val = grp->lg_ifspeed; 2287 break; 2288 2289 case ETHER_STAT_LINK_DUPLEX: 2290 *val = grp->lg_link_duplex; 2291 break; 2292 2293 default: 2294 /* 2295 * For all other statistics, we return the aggregated stat 2296 * from the underlying ports. aggr_grp_stat() will set 2297 * rval appropriately if the statistic isn't a counter. 
2298 */ 2299 rval = aggr_grp_stat(grp, stat, val); 2300 } 2301 2302 mutex_exit(&grp->lg_stat_lock); 2303 return (rval); 2304 } 2305 2306 static int 2307 aggr_m_start(void *arg) 2308 { 2309 aggr_grp_t *grp = arg; 2310 aggr_port_t *port; 2311 mac_perim_handle_t mph, pmph; 2312 2313 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2314 2315 /* 2316 * Attempts to start all configured members of the group. 2317 * Group members will be attached when their link-up notification 2318 * is received. 2319 */ 2320 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2321 mac_perim_enter_by_mh(port->lp_mh, &pmph); 2322 if (aggr_port_start(port) != 0) { 2323 mac_perim_exit(pmph); 2324 continue; 2325 } 2326 2327 /* 2328 * Turn on the promiscuous mode if it is required to receive 2329 * the non-primary address over a port, or the promiscous 2330 * mode is enabled over the aggr. 2331 */ 2332 if (grp->lg_promisc || port->lp_prom_addr != NULL) { 2333 if (aggr_port_promisc(port, B_TRUE) != 0) 2334 aggr_port_stop(port); 2335 } 2336 mac_perim_exit(pmph); 2337 } 2338 2339 grp->lg_started = B_TRUE; 2340 2341 mac_perim_exit(mph); 2342 return (0); 2343 } 2344 2345 static void 2346 aggr_m_stop(void *arg) 2347 { 2348 aggr_grp_t *grp = arg; 2349 aggr_port_t *port; 2350 mac_perim_handle_t mph, pmph; 2351 2352 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2353 2354 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2355 mac_perim_enter_by_mh(port->lp_mh, &pmph); 2356 2357 /* reset port promiscuous mode */ 2358 (void) aggr_port_promisc(port, B_FALSE); 2359 2360 aggr_port_stop(port); 2361 mac_perim_exit(pmph); 2362 } 2363 2364 grp->lg_started = B_FALSE; 2365 mac_perim_exit(mph); 2366 } 2367 2368 static int 2369 aggr_m_promisc(void *arg, boolean_t on) 2370 { 2371 aggr_grp_t *grp = arg; 2372 aggr_port_t *port; 2373 boolean_t link_state_changed = B_FALSE; 2374 mac_perim_handle_t mph, pmph; 2375 2376 AGGR_GRP_REFHOLD(grp); 2377 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2378 2379 
ASSERT(!grp->lg_closing); 2380 2381 if (on == grp->lg_promisc) 2382 goto bail; 2383 2384 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2385 int err = 0; 2386 2387 mac_perim_enter_by_mh(port->lp_mh, &pmph); 2388 AGGR_PORT_REFHOLD(port); 2389 if (!on && (port->lp_prom_addr == NULL)) 2390 err = aggr_port_promisc(port, B_FALSE); 2391 else if (on && port->lp_started) 2392 err = aggr_port_promisc(port, B_TRUE); 2393 2394 if (err != 0) { 2395 if (aggr_grp_detach_port(grp, port)) 2396 link_state_changed = B_TRUE; 2397 } else { 2398 /* 2399 * If a port was detached because of a previous 2400 * failure changing the promiscuity, the port 2401 * is reattached when it successfully changes 2402 * the promiscuity now, and this might cause 2403 * the link state of the aggregation to change. 2404 */ 2405 if (aggr_grp_attach_port(grp, port)) 2406 link_state_changed = B_TRUE; 2407 } 2408 mac_perim_exit(pmph); 2409 AGGR_PORT_REFRELE(port); 2410 } 2411 2412 grp->lg_promisc = on; 2413 2414 if (link_state_changed) 2415 mac_link_update(grp->lg_mh, grp->lg_link_state); 2416 2417 bail: 2418 mac_perim_exit(mph); 2419 AGGR_GRP_REFRELE(grp); 2420 2421 return (0); 2422 } 2423 2424 static void 2425 aggr_grp_port_rename(const char *new_name, void *arg) 2426 { 2427 /* 2428 * aggr port's mac client name is the format of "aggr link name" plus 2429 * AGGR_PORT_NAME_DELIMIT plus "underneath link name". 
2430 */ 2431 int aggr_len, link_len, clnt_name_len, i; 2432 char *str_end, *str_st, *str_del; 2433 char aggr_name[MAXNAMELEN]; 2434 char link_name[MAXNAMELEN]; 2435 char *clnt_name; 2436 aggr_grp_t *aggr_grp = arg; 2437 aggr_port_t *aggr_port = aggr_grp->lg_ports; 2438 2439 for (i = 0; i < aggr_grp->lg_nports; i++) { 2440 clnt_name = mac_client_name(aggr_port->lp_mch); 2441 clnt_name_len = strlen(clnt_name); 2442 str_st = clnt_name; 2443 str_end = &(clnt_name[clnt_name_len]); 2444 str_del = strchr(str_st, AGGR_PORT_NAME_DELIMIT); 2445 ASSERT(str_del != NULL); 2446 aggr_len = (intptr_t)((uintptr_t)str_del - (uintptr_t)str_st); 2447 link_len = (intptr_t)((uintptr_t)str_end - (uintptr_t)str_del); 2448 bzero(aggr_name, MAXNAMELEN); 2449 bzero(link_name, MAXNAMELEN); 2450 bcopy(clnt_name, aggr_name, aggr_len); 2451 bcopy(str_del, link_name, link_len + 1); 2452 bzero(clnt_name, MAXNAMELEN); 2453 (void) snprintf(clnt_name, MAXNAMELEN, "%s%s", new_name, 2454 link_name); 2455 2456 (void) mac_rename_primary(aggr_port->lp_mh, NULL); 2457 aggr_port = aggr_port->lp_next; 2458 } 2459 } 2460 2461 /* 2462 * Initialize the capabilities that are advertised for the group 2463 * according to the capabilities of the constituent ports. 
2464 */ 2465 static boolean_t 2466 aggr_m_capab_get(void *arg, mac_capab_t cap, void *cap_data) 2467 { 2468 aggr_grp_t *grp = arg; 2469 2470 switch (cap) { 2471 case MAC_CAPAB_HCKSUM: { 2472 uint32_t *hcksum_txflags = cap_data; 2473 *hcksum_txflags = grp->lg_hcksum_txflags; 2474 break; 2475 } 2476 case MAC_CAPAB_LSO: { 2477 mac_capab_lso_t *cap_lso = cap_data; 2478 2479 if (grp->lg_lso) { 2480 *cap_lso = grp->lg_cap_lso; 2481 break; 2482 } else { 2483 return (B_FALSE); 2484 } 2485 } 2486 case MAC_CAPAB_NO_NATIVEVLAN: 2487 return (!grp->lg_vlan); 2488 case MAC_CAPAB_NO_ZCOPY: 2489 return (!grp->lg_zcopy); 2490 case MAC_CAPAB_RINGS: { 2491 mac_capab_rings_t *cap_rings = cap_data; 2492 uint_t ring_cnt = 0; 2493 2494 for (uint_t i = 0; i < grp->lg_rx_group_count; i++) 2495 ring_cnt += grp->lg_rx_groups[i].arg_ring_cnt; 2496 2497 if (cap_rings->mr_type == MAC_RING_TYPE_RX) { 2498 cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; 2499 cap_rings->mr_rnum = ring_cnt; 2500 cap_rings->mr_gnum = grp->lg_rx_group_count; 2501 cap_rings->mr_gaddring = NULL; 2502 cap_rings->mr_gremring = NULL; 2503 } else { 2504 cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; 2505 cap_rings->mr_rnum = grp->lg_tx_group.atg_ring_cnt; 2506 cap_rings->mr_gnum = 0; 2507 } 2508 cap_rings->mr_rget = aggr_fill_ring; 2509 cap_rings->mr_gget = aggr_fill_group; 2510 break; 2511 } 2512 case MAC_CAPAB_AGGR: 2513 { 2514 mac_capab_aggr_t *aggr_cap; 2515 2516 if (cap_data != NULL) { 2517 aggr_cap = cap_data; 2518 aggr_cap->mca_rename_fn = aggr_grp_port_rename; 2519 aggr_cap->mca_unicst = aggr_m_unicst; 2520 aggr_cap->mca_find_tx_ring_fn = aggr_find_tx_ring; 2521 aggr_cap->mca_arg = arg; 2522 } 2523 return (B_TRUE); 2524 } 2525 default: 2526 return (B_FALSE); 2527 } 2528 return (B_TRUE); 2529 } 2530 2531 /* 2532 * Callback function for MAC layer to register groups. 
2533 */ 2534 static void 2535 aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index, 2536 mac_group_info_t *infop, mac_group_handle_t gh) 2537 { 2538 aggr_grp_t *grp = arg; 2539 2540 if (rtype == MAC_RING_TYPE_RX) { 2541 aggr_pseudo_rx_group_t *rx_group = &grp->lg_rx_groups[index]; 2542 2543 rx_group->arg_gh = gh; 2544 rx_group->arg_grp = grp; 2545 2546 infop->mgi_driver = (mac_group_driver_t)rx_group; 2547 infop->mgi_start = NULL; 2548 infop->mgi_stop = NULL; 2549 infop->mgi_addmac = aggr_addmac; 2550 infop->mgi_remmac = aggr_remmac; 2551 infop->mgi_count = rx_group->arg_ring_cnt; 2552 2553 /* 2554 * Always set the HW VLAN callbacks. They are smart 2555 * enough to know when a port has HW VLAN filters to 2556 * program and when it doesn't. 2557 */ 2558 infop->mgi_addvlan = aggr_addvlan; 2559 infop->mgi_remvlan = aggr_remvlan; 2560 } else { 2561 aggr_pseudo_tx_group_t *tx_group = &grp->lg_tx_group; 2562 2563 ASSERT3S(index, ==, 0); 2564 tx_group->atg_gh = gh; 2565 } 2566 } 2567 2568 /* 2569 * Callback funtion for MAC layer to register all rings. 2570 */ 2571 static void 2572 aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index, 2573 const int index, mac_ring_info_t *infop, mac_ring_handle_t rh) 2574 { 2575 aggr_grp_t *grp = arg; 2576 2577 switch (rtype) { 2578 case MAC_RING_TYPE_RX: { 2579 aggr_pseudo_rx_group_t *rx_group; 2580 aggr_pseudo_rx_ring_t *rx_ring; 2581 mac_intr_t aggr_mac_intr; 2582 2583 rx_group = &grp->lg_rx_groups[rg_index]; 2584 ASSERT3S(index, >=, 0); 2585 ASSERT3S(index, <, rx_group->arg_ring_cnt); 2586 rx_ring = rx_group->arg_rings + index; 2587 rx_ring->arr_rh = rh; 2588 2589 /* 2590 * Entrypoint to enable interrupt (disable poll) and 2591 * disable interrupt (enable poll). 
2592 */ 2593 aggr_mac_intr.mi_handle = (mac_intr_handle_t)rx_ring; 2594 aggr_mac_intr.mi_enable = aggr_pseudo_enable_intr; 2595 aggr_mac_intr.mi_disable = aggr_pseudo_disable_intr; 2596 aggr_mac_intr.mi_ddi_handle = NULL; 2597 2598 infop->mri_driver = (mac_ring_driver_t)rx_ring; 2599 infop->mri_start = aggr_pseudo_start_rx_ring; 2600 infop->mri_stop = aggr_pseudo_stop_rx_ring; 2601 2602 infop->mri_intr = aggr_mac_intr; 2603 infop->mri_poll = aggr_rx_poll; 2604 2605 infop->mri_stat = aggr_rx_ring_stat; 2606 break; 2607 } 2608 case MAC_RING_TYPE_TX: { 2609 aggr_pseudo_tx_group_t *tx_group = &grp->lg_tx_group; 2610 aggr_pseudo_tx_ring_t *tx_ring; 2611 2612 ASSERT(rg_index == -1); 2613 ASSERT(index < tx_group->atg_ring_cnt); 2614 2615 tx_ring = &tx_group->atg_rings[index]; 2616 tx_ring->atr_rh = rh; 2617 2618 infop->mri_driver = (mac_ring_driver_t)tx_ring; 2619 infop->mri_start = NULL; 2620 infop->mri_stop = NULL; 2621 infop->mri_tx = aggr_ring_tx; 2622 infop->mri_stat = aggr_tx_ring_stat; 2623 /* 2624 * Use the hw TX ring handle to find if the ring needs 2625 * serialization or not. For NICs that do not expose 2626 * Tx rings, atr_hw_rh will be NULL. 
2627 */ 2628 if (tx_ring->atr_hw_rh != NULL) { 2629 infop->mri_flags = 2630 mac_hwring_getinfo(tx_ring->atr_hw_rh); 2631 } 2632 break; 2633 } 2634 default: 2635 break; 2636 } 2637 } 2638 2639 static mblk_t * 2640 aggr_rx_poll(void *arg, int bytes_to_pickup) 2641 { 2642 aggr_pseudo_rx_ring_t *rr_ring = arg; 2643 aggr_port_t *port = rr_ring->arr_port; 2644 aggr_grp_t *grp = port->lp_grp; 2645 mblk_t *mp_chain, *mp, **mpp; 2646 2647 mp_chain = mac_hwring_poll(rr_ring->arr_hw_rh, bytes_to_pickup); 2648 2649 if (grp->lg_lacp_mode == AGGR_LACP_OFF) 2650 return (mp_chain); 2651 2652 mpp = &mp_chain; 2653 while ((mp = *mpp) != NULL) { 2654 if (MBLKL(mp) >= sizeof (struct ether_header)) { 2655 struct ether_header *ehp; 2656 2657 ehp = (struct ether_header *)mp->b_rptr; 2658 if (ntohs(ehp->ether_type) == ETHERTYPE_SLOW) { 2659 *mpp = mp->b_next; 2660 mp->b_next = NULL; 2661 aggr_recv_lacp(port, 2662 (mac_resource_handle_t)rr_ring, mp); 2663 continue; 2664 } 2665 } 2666 2667 if (!port->lp_collector_enabled) { 2668 *mpp = mp->b_next; 2669 mp->b_next = NULL; 2670 freemsg(mp); 2671 continue; 2672 } 2673 mpp = &mp->b_next; 2674 } 2675 return (mp_chain); 2676 } 2677 2678 static int 2679 aggr_addmac(void *arg, const uint8_t *mac_addr) 2680 { 2681 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg; 2682 aggr_unicst_addr_t *addr, **pprev; 2683 aggr_grp_t *grp = rx_group->arg_grp; 2684 aggr_port_t *port, *p; 2685 mac_perim_handle_t mph; 2686 int err = 0; 2687 uint_t idx = rx_group->arg_index; 2688 2689 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2690 2691 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) { 2692 mac_perim_exit(mph); 2693 return (0); 2694 } 2695 2696 /* 2697 * Insert this mac address into the list of mac addresses owned by 2698 * the aggregation pseudo group. 
2699 */ 2700 pprev = &rx_group->arg_macaddr; 2701 while ((addr = *pprev) != NULL) { 2702 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) == 0) { 2703 mac_perim_exit(mph); 2704 return (EEXIST); 2705 } 2706 pprev = &addr->aua_next; 2707 } 2708 addr = kmem_alloc(sizeof (aggr_unicst_addr_t), KM_SLEEP); 2709 bcopy(mac_addr, addr->aua_addr, ETHERADDRL); 2710 addr->aua_next = NULL; 2711 *pprev = addr; 2712 2713 for (port = grp->lg_ports; port != NULL; port = port->lp_next) 2714 if ((err = aggr_port_addmac(port, idx, mac_addr)) != 0) 2715 break; 2716 2717 if (err != 0) { 2718 for (p = grp->lg_ports; p != port; p = p->lp_next) 2719 aggr_port_remmac(p, idx, mac_addr); 2720 2721 *pprev = NULL; 2722 kmem_free(addr, sizeof (aggr_unicst_addr_t)); 2723 } 2724 2725 mac_perim_exit(mph); 2726 return (err); 2727 } 2728 2729 static int 2730 aggr_remmac(void *arg, const uint8_t *mac_addr) 2731 { 2732 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg; 2733 aggr_unicst_addr_t *addr, **pprev; 2734 aggr_grp_t *grp = rx_group->arg_grp; 2735 aggr_port_t *port; 2736 mac_perim_handle_t mph; 2737 int err = 0; 2738 2739 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2740 2741 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) { 2742 mac_perim_exit(mph); 2743 return (0); 2744 } 2745 2746 /* 2747 * Insert this mac address into the list of mac addresses owned by 2748 * the aggregation pseudo group. 
2749 */ 2750 pprev = &rx_group->arg_macaddr; 2751 while ((addr = *pprev) != NULL) { 2752 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) != 0) { 2753 pprev = &addr->aua_next; 2754 continue; 2755 } 2756 break; 2757 } 2758 if (addr == NULL) { 2759 mac_perim_exit(mph); 2760 return (EINVAL); 2761 } 2762 2763 for (port = grp->lg_ports; port != NULL; port = port->lp_next) 2764 aggr_port_remmac(port, rx_group->arg_index, mac_addr); 2765 2766 *pprev = addr->aua_next; 2767 kmem_free(addr, sizeof (aggr_unicst_addr_t)); 2768 2769 mac_perim_exit(mph); 2770 return (err); 2771 } 2772 2773 /* 2774 * Search for VID in the Rx group's list and return a pointer if 2775 * found. Otherwise return NULL. 2776 */ 2777 static aggr_vlan_t * 2778 aggr_find_vlan(aggr_pseudo_rx_group_t *rx_group, uint16_t vid) 2779 { 2780 ASSERT(MAC_PERIM_HELD(rx_group->arg_grp->lg_mh)); 2781 for (aggr_vlan_t *avp = list_head(&rx_group->arg_vlans); avp != NULL; 2782 avp = list_next(&rx_group->arg_vlans, avp)) { 2783 if (avp->av_vid == vid) 2784 return (avp); 2785 } 2786 2787 return (NULL); 2788 } 2789 2790 /* 2791 * Accept traffic on the specified VID. 2792 * 2793 * Persist VLAN state in the aggr so that ports added later will 2794 * receive the correct filters. In the future it would be nice to 2795 * allow aggr to iterate its clients instead of duplicating state. 2796 */ 2797 static int 2798 aggr_addvlan(mac_group_driver_t gdriver, uint16_t vid) 2799 { 2800 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)gdriver; 2801 aggr_grp_t *aggr = rx_group->arg_grp; 2802 aggr_port_t *port, *p; 2803 mac_perim_handle_t mph; 2804 int err = 0; 2805 aggr_vlan_t *avp = NULL; 2806 uint_t idx = rx_group->arg_index; 2807 2808 mac_perim_enter_by_mh(aggr->lg_mh, &mph); 2809 2810 if (vid == MAC_VLAN_UNTAGGED) { 2811 /* 2812 * Aggr is both a MAC provider and MAC client. As a 2813 * MAC provider it is passed MAC_VLAN_UNTAGGED by its 2814 * client. As a client itself, it should pass 2815 * VLAN_ID_NONE to its ports. 
2816 */ 2817 vid = VLAN_ID_NONE; 2818 rx_group->arg_untagged++; 2819 goto update_ports; 2820 } 2821 2822 avp = aggr_find_vlan(rx_group, vid); 2823 2824 if (avp != NULL) { 2825 avp->av_refs++; 2826 mac_perim_exit(mph); 2827 return (0); 2828 } 2829 2830 avp = kmem_zalloc(sizeof (aggr_vlan_t), KM_SLEEP); 2831 avp->av_vid = vid; 2832 avp->av_refs = 1; 2833 2834 update_ports: 2835 for (port = aggr->lg_ports; port != NULL; port = port->lp_next) 2836 if ((err = aggr_port_addvlan(port, idx, vid)) != 0) 2837 break; 2838 2839 if (err != 0) { 2840 /* 2841 * If any of these calls fail then we are in a 2842 * situation where the ports have different HW state. 2843 * There's no reasonable action the MAC client can 2844 * take in this scenario to rectify the situation. 2845 */ 2846 for (p = aggr->lg_ports; p != port; p = p->lp_next) { 2847 int err2; 2848 2849 if ((err2 = aggr_port_remvlan(p, idx, vid)) != 0) { 2850 cmn_err(CE_WARN, "Failed to remove VLAN %u" 2851 " from port %s: errno %d.", vid, 2852 mac_client_name(p->lp_mch), err2); 2853 } 2854 2855 } 2856 2857 if (vid == VLAN_ID_NONE) 2858 rx_group->arg_untagged--; 2859 2860 if (avp != NULL) { 2861 kmem_free(avp, sizeof (aggr_vlan_t)); 2862 avp = NULL; 2863 } 2864 } 2865 2866 if (avp != NULL) 2867 list_insert_tail(&rx_group->arg_vlans, avp); 2868 2869 mac_perim_exit(mph); 2870 return (err); 2871 } 2872 2873 /* 2874 * Stop accepting traffic on this VLAN if it's the last use of this VLAN. 2875 */ 2876 static int 2877 aggr_remvlan(mac_group_driver_t gdriver, uint16_t vid) 2878 { 2879 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)gdriver; 2880 aggr_grp_t *aggr = rx_group->arg_grp; 2881 aggr_port_t *port, *p; 2882 mac_perim_handle_t mph; 2883 int err = 0; 2884 aggr_vlan_t *avp = NULL; 2885 uint_t idx = rx_group->arg_index; 2886 2887 mac_perim_enter_by_mh(aggr->lg_mh, &mph); 2888 2889 /* 2890 * See the comment in aggr_addvlan(). 
2891 */ 2892 if (vid == MAC_VLAN_UNTAGGED) { 2893 vid = VLAN_ID_NONE; 2894 rx_group->arg_untagged--; 2895 2896 if (rx_group->arg_untagged > 0) 2897 goto done; 2898 2899 goto update_ports; 2900 } 2901 2902 avp = aggr_find_vlan(rx_group, vid); 2903 2904 if (avp == NULL) { 2905 err = ENOENT; 2906 goto done; 2907 } 2908 2909 avp->av_refs--; 2910 2911 if (avp->av_refs > 0) 2912 goto done; 2913 2914 update_ports: 2915 for (port = aggr->lg_ports; port != NULL; port = port->lp_next) 2916 if ((err = aggr_port_remvlan(port, idx, vid)) != 0) 2917 break; 2918 2919 /* 2920 * See the comment in aggr_addvlan() for justification of the 2921 * use of VERIFY here. 2922 */ 2923 if (err != 0) { 2924 for (p = aggr->lg_ports; p != port; p = p->lp_next) { 2925 int err2; 2926 2927 if ((err2 = aggr_port_addvlan(p, idx, vid)) != 0) { 2928 cmn_err(CE_WARN, "Failed to add VLAN %u" 2929 " to port %s: errno %d.", vid, 2930 mac_client_name(p->lp_mch), err2); 2931 } 2932 } 2933 2934 if (avp != NULL) 2935 avp->av_refs++; 2936 2937 if (vid == VLAN_ID_NONE) 2938 rx_group->arg_untagged++; 2939 2940 goto done; 2941 } 2942 2943 if (err == 0 && avp != NULL) { 2944 VERIFY3U(avp->av_refs, ==, 0); 2945 list_remove(&rx_group->arg_vlans, avp); 2946 kmem_free(avp, sizeof (aggr_vlan_t)); 2947 } 2948 2949 done: 2950 mac_perim_exit(mph); 2951 return (err); 2952 } 2953 2954 /* 2955 * Add or remove the multicast addresses that are defined for the group 2956 * to or from the specified port. 2957 * 2958 * Note that aggr_grp_multicst_port(..., B_TRUE) is called when the port 2959 * is started and attached, and aggr_grp_multicst_port(..., B_FALSE) is 2960 * called when the port is either stopped or detached. 
2961 */ 2962 void 2963 aggr_grp_multicst_port(aggr_port_t *port, boolean_t add) 2964 { 2965 aggr_grp_t *grp = port->lp_grp; 2966 2967 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 2968 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 2969 2970 if (!port->lp_started || port->lp_state != AGGR_PORT_STATE_ATTACHED) 2971 return; 2972 2973 mac_multicast_refresh(grp->lg_mh, aggr_port_multicst, port, add); 2974 } 2975 2976 static int 2977 aggr_m_multicst(void *arg, boolean_t add, const uint8_t *addrp) 2978 { 2979 aggr_grp_t *grp = arg; 2980 aggr_port_t *port = NULL, *errport = NULL; 2981 mac_perim_handle_t mph; 2982 int err = 0; 2983 2984 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2985 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2986 if (port->lp_state != AGGR_PORT_STATE_ATTACHED || 2987 !port->lp_started) { 2988 continue; 2989 } 2990 err = aggr_port_multicst(port, add, addrp); 2991 if (err != 0) { 2992 errport = port; 2993 break; 2994 } 2995 } 2996 2997 /* 2998 * At least one port caused error return and this error is returned to 2999 * mac, eventually a NAK would be sent upwards. 3000 * Some ports have this multicast address listed now, and some don't. 3001 * Treat this error as a whole aggr failure not individual port failure. 3002 * Therefore remove this multicast address from other ports. 
3003 */ 3004 if ((err != 0) && add) { 3005 for (port = grp->lg_ports; port != errport; 3006 port = port->lp_next) { 3007 if (port->lp_state != AGGR_PORT_STATE_ATTACHED || 3008 !port->lp_started) { 3009 continue; 3010 } 3011 (void) aggr_port_multicst(port, B_FALSE, addrp); 3012 } 3013 } 3014 mac_perim_exit(mph); 3015 return (err); 3016 } 3017 3018 static int 3019 aggr_m_unicst(void *arg, const uint8_t *macaddr) 3020 { 3021 aggr_grp_t *grp = arg; 3022 mac_perim_handle_t mph; 3023 int err; 3024 3025 mac_perim_enter_by_mh(grp->lg_mh, &mph); 3026 err = aggr_grp_modify_common(grp, AGGR_MODIFY_MAC, 0, B_TRUE, macaddr, 3027 0, 0); 3028 mac_perim_exit(mph); 3029 return (err); 3030 } 3031 3032 /* 3033 * Initialize the capabilities that are advertised for the group 3034 * according to the capabilities of the constituent ports. 3035 */ 3036 static void 3037 aggr_grp_capab_set(aggr_grp_t *grp) 3038 { 3039 uint32_t cksum; 3040 aggr_port_t *port; 3041 mac_capab_lso_t cap_lso; 3042 3043 ASSERT(grp->lg_mh == NULL); 3044 ASSERT(grp->lg_ports != NULL); 3045 3046 grp->lg_hcksum_txflags = (uint32_t)-1; 3047 grp->lg_zcopy = B_TRUE; 3048 grp->lg_vlan = B_TRUE; 3049 3050 grp->lg_lso = B_TRUE; 3051 grp->lg_cap_lso.lso_flags = (t_uscalar_t)-1; 3052 grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max = (t_uscalar_t)-1; 3053 3054 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 3055 if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &cksum)) 3056 cksum = 0; 3057 grp->lg_hcksum_txflags &= cksum; 3058 3059 grp->lg_vlan &= 3060 !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL); 3061 3062 grp->lg_zcopy &= 3063 !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL); 3064 3065 grp->lg_lso &= 3066 mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso); 3067 if (grp->lg_lso) { 3068 grp->lg_cap_lso.lso_flags &= cap_lso.lso_flags; 3069 if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max > 3070 cap_lso.lso_basic_tcp_ipv4.lso_max) 3071 grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max = 3072 
cap_lso.lso_basic_tcp_ipv4.lso_max; 3073 } 3074 } 3075 } 3076 3077 /* 3078 * Checks whether the capabilities of the port being added are compatible 3079 * with the current capabilities of the aggregation. 3080 */ 3081 static boolean_t 3082 aggr_grp_capab_check(aggr_grp_t *grp, aggr_port_t *port) 3083 { 3084 uint32_t hcksum_txflags; 3085 3086 ASSERT(grp->lg_ports != NULL); 3087 3088 if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL)) & 3089 grp->lg_vlan) != grp->lg_vlan) { 3090 return (B_FALSE); 3091 } 3092 3093 if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL)) & 3094 grp->lg_zcopy) != grp->lg_zcopy) { 3095 return (B_FALSE); 3096 } 3097 3098 if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &hcksum_txflags)) { 3099 if (grp->lg_hcksum_txflags != 0) 3100 return (B_FALSE); 3101 } else if ((hcksum_txflags & grp->lg_hcksum_txflags) != 3102 grp->lg_hcksum_txflags) { 3103 return (B_FALSE); 3104 } 3105 3106 if (grp->lg_lso) { 3107 mac_capab_lso_t cap_lso; 3108 3109 if (mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso)) { 3110 if ((grp->lg_cap_lso.lso_flags & cap_lso.lso_flags) != 3111 grp->lg_cap_lso.lso_flags) 3112 return (B_FALSE); 3113 if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max > 3114 cap_lso.lso_basic_tcp_ipv4.lso_max) 3115 return (B_FALSE); 3116 } else { 3117 return (B_FALSE); 3118 } 3119 } 3120 3121 return (B_TRUE); 3122 } 3123 3124 /* 3125 * Returns the maximum SDU according to the SDU of the constituent ports. 
3126 */ 3127 static uint_t 3128 aggr_grp_max_sdu(aggr_grp_t *grp) 3129 { 3130 uint_t max_sdu = (uint_t)-1; 3131 aggr_port_t *port; 3132 3133 ASSERT(grp->lg_ports != NULL); 3134 3135 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 3136 uint_t port_sdu_max; 3137 3138 mac_sdu_get(port->lp_mh, NULL, &port_sdu_max); 3139 if (max_sdu > port_sdu_max) 3140 max_sdu = port_sdu_max; 3141 } 3142 3143 return (max_sdu); 3144 } 3145 3146 /* 3147 * Checks if the maximum SDU of the specified port is compatible 3148 * with the maximum SDU of the specified aggregation group, returns 3149 * B_TRUE if it is, B_FALSE otherwise. 3150 */ 3151 static boolean_t 3152 aggr_grp_sdu_check(aggr_grp_t *grp, aggr_port_t *port) 3153 { 3154 uint_t port_sdu_max; 3155 3156 mac_sdu_get(port->lp_mh, NULL, &port_sdu_max); 3157 return (port_sdu_max >= grp->lg_max_sdu); 3158 } 3159 3160 /* 3161 * Returns the maximum margin according to the margin of the constituent ports. 3162 */ 3163 static uint32_t 3164 aggr_grp_max_margin(aggr_grp_t *grp) 3165 { 3166 uint32_t margin = UINT32_MAX; 3167 aggr_port_t *port; 3168 3169 ASSERT(grp->lg_mh == NULL); 3170 ASSERT(grp->lg_ports != NULL); 3171 3172 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 3173 if (margin > port->lp_margin) 3174 margin = port->lp_margin; 3175 } 3176 3177 grp->lg_margin = margin; 3178 return (margin); 3179 } 3180 3181 /* 3182 * Checks if the maximum margin of the specified port is compatible 3183 * with the maximum margin of the specified aggregation group, returns 3184 * B_TRUE if it is, B_FALSE otherwise. 3185 */ 3186 static boolean_t 3187 aggr_grp_margin_check(aggr_grp_t *grp, aggr_port_t *port) 3188 { 3189 if (port->lp_margin >= grp->lg_margin) 3190 return (B_TRUE); 3191 3192 /* 3193 * See whether the current margin value is allowed to be changed to 3194 * the new value. 
3195 */ 3196 if (!mac_margin_update(grp->lg_mh, port->lp_margin)) 3197 return (B_FALSE); 3198 3199 grp->lg_margin = port->lp_margin; 3200 return (B_TRUE); 3201 } 3202 3203 /* 3204 * Set MTU on individual ports of an aggregation group 3205 */ 3206 static int 3207 aggr_set_port_sdu(aggr_grp_t *grp, aggr_port_t *port, uint32_t sdu, 3208 uint32_t *old_mtu) 3209 { 3210 boolean_t removed = B_FALSE; 3211 mac_perim_handle_t mph; 3212 mac_diag_t diag; 3213 int err, rv, retry = 0; 3214 3215 if (port->lp_mah != NULL) { 3216 (void) mac_unicast_remove(port->lp_mch, port->lp_mah); 3217 port->lp_mah = NULL; 3218 removed = B_TRUE; 3219 } 3220 err = mac_set_mtu(port->lp_mh, sdu, old_mtu); 3221 try_again: 3222 if (removed && (rv = mac_unicast_add(port->lp_mch, NULL, 3223 MAC_UNICAST_PRIMARY | MAC_UNICAST_DISABLE_TX_VID_CHECK, 3224 &port->lp_mah, 0, &diag)) != 0) { 3225 /* 3226 * following is a workaround for a bug in 'bge' driver. 3227 * See CR 6794654 for more information and this work around 3228 * will be removed once the CR is fixed. 3229 */ 3230 if (rv == EIO && retry++ < 3) { 3231 delay(2 * hz); 3232 goto try_again; 3233 } 3234 /* 3235 * if mac_unicast_add() failed while setting the MTU, 3236 * detach the port from the group. 3237 */ 3238 mac_perim_enter_by_mh(port->lp_mh, &mph); 3239 (void) aggr_grp_detach_port(grp, port); 3240 mac_perim_exit(mph); 3241 cmn_err(CE_WARN, "Unable to restart the port %s while " 3242 "setting MTU. 
Detaching the port from the aggregation.", 3243 mac_client_name(port->lp_mch)); 3244 } 3245 return (err); 3246 } 3247 3248 static int 3249 aggr_sdu_update(aggr_grp_t *grp, uint32_t sdu) 3250 { 3251 int err = 0, i, rv; 3252 aggr_port_t *port; 3253 uint32_t *mtu; 3254 3255 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 3256 3257 /* 3258 * If the MTU being set is equal to aggr group's maximum 3259 * allowable value, then there is nothing to change 3260 */ 3261 if (sdu == grp->lg_max_sdu) 3262 return (0); 3263 3264 /* 0 is aggr group's min sdu */ 3265 if (sdu == 0) 3266 return (EINVAL); 3267 3268 mtu = kmem_alloc(sizeof (uint32_t) * grp->lg_nports, KM_SLEEP); 3269 for (port = grp->lg_ports, i = 0; port != NULL && err == 0; 3270 port = port->lp_next, i++) { 3271 err = aggr_set_port_sdu(grp, port, sdu, mtu + i); 3272 } 3273 if (err != 0) { 3274 /* recover from error: reset the mtus of the ports */ 3275 aggr_port_t *tmp; 3276 3277 for (tmp = grp->lg_ports, i = 0; tmp != port; 3278 tmp = tmp->lp_next, i++) { 3279 (void) aggr_set_port_sdu(grp, tmp, *(mtu + i), NULL); 3280 } 3281 goto bail; 3282 } 3283 grp->lg_max_sdu = aggr_grp_max_sdu(grp); 3284 rv = mac_maxsdu_update(grp->lg_mh, grp->lg_max_sdu); 3285 ASSERT(rv == 0); 3286 bail: 3287 kmem_free(mtu, sizeof (uint32_t) * grp->lg_nports); 3288 return (err); 3289 } 3290 3291 /* 3292 * Callback functions for set/get of properties 3293 */ 3294 /*ARGSUSED*/ 3295 static int 3296 aggr_m_setprop(void *m_driver, const char *pr_name, mac_prop_id_t pr_num, 3297 uint_t pr_valsize, const void *pr_val) 3298 { 3299 int err = ENOTSUP; 3300 aggr_grp_t *grp = m_driver; 3301 3302 switch (pr_num) { 3303 case MAC_PROP_MTU: { 3304 uint32_t mtu; 3305 3306 if (pr_valsize < sizeof (mtu)) { 3307 err = EINVAL; 3308 break; 3309 } 3310 bcopy(pr_val, &mtu, sizeof (mtu)); 3311 err = aggr_sdu_update(grp, mtu); 3312 break; 3313 } 3314 default: 3315 break; 3316 } 3317 return (err); 3318 } 3319 3320 typedef struct rboundary { 3321 uint32_t bval; 3322 int btype; 3323 } 
rboundary_t; 3324 3325 /* 3326 * This function finds the intersection of mtu ranges stored in arrays - 3327 * mrange[0] ... mrange[mcount -1]. It returns the intersection in rval. 3328 * Individual arrays are assumed to contain non-overlapping ranges. 3329 * Algorithm: 3330 * A range has two boundaries - min and max. We scan all arrays and store 3331 * each boundary as a separate element in a temporary array. We also store 3332 * the boundary types, min or max, as +1 or -1 respectively in the temporary 3333 * array. Then we sort the temporary array in ascending order. We scan the 3334 * sorted array from lower to higher values and keep a cumulative sum of 3335 * boundary types. Element in the temporary array for which the sum reaches 3336 * mcount is a min boundary of a range in the result and next element will be 3337 * max boundary. 3338 * 3339 * Example for mcount = 3, 3340 * 3341 * ----|_________|-------|_______|----|__|------ mrange[0] 3342 * 3343 * -------|________|--|____________|-----|___|-- mrange[1] 3344 * 3345 * --------|________________|-------|____|------ mrange[2] 3346 * 3347 * 3 2 1 3348 * \|/ 3349 * 1 23 2 1 2 3 2 1 01 2 V 0 <- the sum 3350 * ----|--||-----|-|--|--|--|----|-||-|--|---|-- sorted array 3351 * 3352 * same min and max 3353 * V 3354 * --------|_____|-------|__|------------|------ intersecting ranges 3355 */ 3356 void 3357 aggr_mtu_range_intersection(mac_propval_range_t **mrange, int mcount, 3358 mac_propval_uint32_range_t **prval, int *prmaxcnt, int *prcount) 3359 { 3360 mac_propval_uint32_range_t *rval, *ur; 3361 int rmaxcnt, rcount; 3362 size_t sz_range32; 3363 rboundary_t *ta; /* temporary array */ 3364 rboundary_t temp; 3365 boolean_t range_started = B_FALSE; 3366 int i, j, m, sum; 3367 3368 sz_range32 = sizeof (mac_propval_uint32_range_t); 3369 3370 for (i = 0, rmaxcnt = 0; i < mcount; i++) 3371 rmaxcnt += mrange[i]->mpr_count; 3372 3373 /* Allocate enough space to store the results */ 3374 rval = kmem_alloc(rmaxcnt * sz_range32, 
KM_SLEEP); 3375 3376 /* Number of boundaries are twice as many as ranges */ 3377 ta = kmem_alloc(2 * rmaxcnt * sizeof (rboundary_t), KM_SLEEP); 3378 3379 for (i = 0, m = 0; i < mcount; i++) { 3380 ur = &(mrange[i]->mpr_range_uint32[0]); 3381 for (j = 0; j < mrange[i]->mpr_count; j++) { 3382 ta[m].bval = ur[j].mpur_min; 3383 ta[m++].btype = 1; 3384 ta[m].bval = ur[j].mpur_max; 3385 ta[m++].btype = -1; 3386 } 3387 } 3388 3389 /* 3390 * Sort the temporary array in ascending order of bval; 3391 * if boundary values are same then sort on btype. 3392 */ 3393 for (i = 0; i < m-1; i++) { 3394 for (j = i+1; j < m; j++) { 3395 if ((ta[i].bval > ta[j].bval) || 3396 ((ta[i].bval == ta[j].bval) && 3397 (ta[i].btype < ta[j].btype))) { 3398 temp = ta[i]; 3399 ta[i] = ta[j]; 3400 ta[j] = temp; 3401 } 3402 } 3403 } 3404 3405 /* Walk through temporary array to find all ranges in the results */ 3406 for (i = 0, sum = 0, rcount = 0; i < m; i++) { 3407 sum += ta[i].btype; 3408 if (sum == mcount) { 3409 rval[rcount].mpur_min = ta[i].bval; 3410 range_started = B_TRUE; 3411 } else if (sum < mcount && range_started) { 3412 rval[rcount++].mpur_max = ta[i].bval; 3413 range_started = B_FALSE; 3414 } 3415 } 3416 3417 *prval = rval; 3418 *prmaxcnt = rmaxcnt; 3419 *prcount = rcount; 3420 3421 kmem_free(ta, 2 * rmaxcnt * sizeof (rboundary_t)); 3422 } 3423 3424 /* 3425 * Returns the mtu ranges which could be supported by aggr group. 3426 * prmaxcnt returns the size of the buffer prval, prcount returns 3427 * the number of valid entries in prval. Caller is responsible 3428 * for freeing up prval. 
3429 */ 3430 int 3431 aggr_grp_possible_mtu_range(aggr_grp_t *grp, mac_propval_uint32_range_t **prval, 3432 int *prmaxcnt, int *prcount) 3433 { 3434 mac_propval_range_t **vals; 3435 aggr_port_t *port; 3436 mac_perim_handle_t mph; 3437 uint_t i, numr; 3438 int err = 0; 3439 size_t sz_propval, sz_range32; 3440 size_t size; 3441 3442 sz_propval = sizeof (mac_propval_range_t); 3443 sz_range32 = sizeof (mac_propval_uint32_range_t); 3444 3445 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 3446 3447 vals = kmem_zalloc(sizeof (mac_propval_range_t *) * grp->lg_nports, 3448 KM_SLEEP); 3449 3450 for (port = grp->lg_ports, i = 0; port != NULL; 3451 port = port->lp_next, i++) { 3452 3453 size = sz_propval; 3454 vals[i] = kmem_alloc(size, KM_SLEEP); 3455 vals[i]->mpr_count = 1; 3456 3457 mac_perim_enter_by_mh(port->lp_mh, &mph); 3458 3459 err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL, 3460 NULL, 0, vals[i], NULL); 3461 if (err == ENOSPC) { 3462 /* 3463 * Not enough space to hold all ranges. 3464 * Allocate extra space as indicated and retry. 
3465 */ 3466 numr = vals[i]->mpr_count; 3467 kmem_free(vals[i], sz_propval); 3468 size = sz_propval + (numr - 1) * sz_range32; 3469 vals[i] = kmem_alloc(size, KM_SLEEP); 3470 vals[i]->mpr_count = numr; 3471 err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL, 3472 NULL, 0, vals[i], NULL); 3473 ASSERT(err != ENOSPC); 3474 } 3475 mac_perim_exit(mph); 3476 if (err != 0) { 3477 kmem_free(vals[i], size); 3478 vals[i] = NULL; 3479 break; 3480 } 3481 } 3482 3483 /* 3484 * if any of the underlying ports does not support changing MTU then 3485 * just return ENOTSUP 3486 */ 3487 if (port != NULL) { 3488 ASSERT(err != 0); 3489 goto done; 3490 } 3491 3492 aggr_mtu_range_intersection(vals, grp->lg_nports, prval, prmaxcnt, 3493 prcount); 3494 3495 done: 3496 for (i = 0; i < grp->lg_nports; i++) { 3497 if (vals[i] != NULL) { 3498 numr = vals[i]->mpr_count; 3499 size = sz_propval + (numr - 1) * sz_range32; 3500 kmem_free(vals[i], size); 3501 } 3502 } 3503 3504 kmem_free(vals, sizeof (mac_propval_range_t *) * grp->lg_nports); 3505 return (err); 3506 } 3507 3508 static void 3509 aggr_m_propinfo(void *m_driver, const char *pr_name, mac_prop_id_t pr_num, 3510 mac_prop_info_handle_t prh) 3511 { 3512 aggr_grp_t *grp = m_driver; 3513 mac_propval_uint32_range_t *rval = NULL; 3514 int i, rcount, rmaxcnt; 3515 int err = 0; 3516 3517 _NOTE(ARGUNUSED(pr_name)); 3518 3519 if (pr_num != MAC_PROP_MTU) 3520 return; 3521 3522 err = aggr_grp_possible_mtu_range(grp, &rval, &rmaxcnt, &rcount); 3523 if (err != 0) { 3524 ASSERT(rval == NULL); 3525 return; 3526 } 3527 for (i = 0; i < rcount; i++) { 3528 mac_prop_info_set_range_uint32(prh, 3529 rval[i].mpur_min, rval[i].mpur_max); 3530 } 3531 kmem_free(rval, sizeof (mac_propval_uint32_range_t) * rmaxcnt); 3532 } 3533