1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright 2020 Joyent, Inc. 24 * Copyright 2020 RackTop Systems, Inc. 25 */ 26 27 /* 28 * IEEE 802.3ad Link Aggregation -- Link Aggregation Groups. 29 * 30 * An instance of the structure aggr_grp_t is allocated for each 31 * link aggregation group. When created, aggr_grp_t objects are 32 * entered into the aggr_grp_hash hash table maintained by the modhash 33 * module. The hash key is the linkid associated with the link 34 * aggregation group. 35 * 36 * Each aggregation contains a set of ports. The port is represented 37 * by the aggr_port_t structure. A port consists of a single MAC 38 * client which has exclusive (MCIS_EXCLUSIVE) use of the underlying 39 * MAC. This client is used by the aggr to send and receive LACP 40 * traffic. Each port client takes on the same MAC unicast address -- 41 * the address of the aggregation itself (taken from the first port by 42 * default). 43 * 44 * The MAC client that hangs off each aggr port is not your typical 45 * MAC client. 
Not only does it have exclusive control of the MAC, but
 * it also has no Tx or Rx SRSes. An SRS is designed to queue and
 * fanout traffic among L4 protocols; but the aggr is an intermediary,
 * not a consumer. Instead of using SRSes, the aggr puts the
 * underlying hardware rings into passthru mode and ships packets up
 * via a direct call to aggr_recv_cb(). This allows aggr to enforce
 * LACP while passing all other traffic up to clients of the aggr.
 *
 * Pseudo Rx Groups and Rings
 * --------------------------
 *
 * It is imperative for client performance that the aggr provide as
 * many MAC groups as possible. In order to use the underlying HW
 * resources, aggr creates pseudo groups to aggregate the underlying
 * HW groups. Every HW group gets mapped to a pseudo group; and every
 * HW ring in that group gets mapped to a pseudo ring. The pseudo
 * group at index 0 combines all the HW groups at index 0 from each
 * port, etc. The aggr's MAC then creates normal MAC groups and rings
 * out of these pseudo groups and rings to present to the aggr's
 * clients. To the clients, the aggr's groups and rings are absolutely
 * no different than a NIC's groups or rings.
 *
 * Pseudo Tx Rings
 * ---------------
 *
 * The underlying ports (NICs) in an aggregation can have Tx rings. To
 * enhance aggr's performance, these Tx rings are made available to
 * the aggr layer as pseudo Tx rings. The concept of pseudo rings is
 * not new; it is already present and implemented on the Rx side.
 * The same concept is extended to the Tx side where each Tx ring of
 * an underlying port is reflected in aggr as a pseudo Tx ring. Thus
 * each pseudo Tx ring will map to a specific hardware Tx ring. Even
 * in the case of a NIC that does not have a Tx ring, a pseudo Tx ring
 * is given to the aggregation layer.
79 * 80 * With this change, the outgoing stack depth looks much better: 81 * 82 * mac_tx() -> mac_tx_aggr_mode() -> mac_tx_soft_ring_process() -> 83 * mac_tx_send() -> aggr_ring_rx() -> <driver>_ring_tx() 84 * 85 * Two new modes are introduced to mac_tx() to handle aggr pseudo Tx rings: 86 * SRS_TX_AGGR and SRS_TX_BW_AGGR. 87 * 88 * In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. This routine 89 * invokes an aggr function, aggr_find_tx_ring(), to find a (pseudo) Tx 90 * ring belonging to a port on which the packet has to be sent. 91 * aggr_find_tx_ring() first finds the outgoing port based on L2/L3/L4 92 * policy and then uses the fanout_hint passed to it to pick a Tx ring from 93 * the selected port. 94 * 95 * In SRS_TX_BW_AGGR mode, mac_tx_bw_mode() function is called where 96 * bandwidth limit is applied first on the outgoing packet and the packets 97 * allowed to go out would call mac_tx_aggr_mode() to send the packet on a 98 * particular Tx ring. 99 */ 100 101 #include <sys/types.h> 102 #include <sys/sysmacros.h> 103 #include <sys/conf.h> 104 #include <sys/cmn_err.h> 105 #include <sys/disp.h> 106 #include <sys/list.h> 107 #include <sys/ksynch.h> 108 #include <sys/kmem.h> 109 #include <sys/stream.h> 110 #include <sys/modctl.h> 111 #include <sys/ddi.h> 112 #include <sys/sunddi.h> 113 #include <sys/atomic.h> 114 #include <sys/stat.h> 115 #include <sys/modhash.h> 116 #include <sys/id_space.h> 117 #include <sys/strsun.h> 118 #include <sys/cred.h> 119 #include <sys/dlpi.h> 120 #include <sys/zone.h> 121 #include <sys/mac_provider.h> 122 #include <sys/dls.h> 123 #include <sys/vlan.h> 124 #include <sys/aggr.h> 125 #include <sys/aggr_impl.h> 126 127 static int aggr_m_start(void *); 128 static void aggr_m_stop(void *); 129 static int aggr_m_promisc(void *, boolean_t); 130 static int aggr_m_multicst(void *, boolean_t, const uint8_t *); 131 static int aggr_m_unicst(void *, const uint8_t *); 132 static int aggr_m_stat(void *, uint_t, uint64_t *); 133 static void 
aggr_m_ioctl(void *, queue_t *, mblk_t *); 134 static boolean_t aggr_m_capab_get(void *, mac_capab_t, void *); 135 static int aggr_m_setprop(void *, const char *, mac_prop_id_t, uint_t, 136 const void *); 137 static void aggr_m_propinfo(void *, const char *, mac_prop_id_t, 138 mac_prop_info_handle_t); 139 140 static aggr_port_t *aggr_grp_port_lookup(aggr_grp_t *, datalink_id_t); 141 static int aggr_grp_rem_port(aggr_grp_t *, aggr_port_t *, boolean_t *, 142 boolean_t *); 143 144 static void aggr_grp_capab_set(aggr_grp_t *); 145 static boolean_t aggr_grp_capab_check(aggr_grp_t *, aggr_port_t *); 146 static uint_t aggr_grp_max_sdu(aggr_grp_t *); 147 static uint32_t aggr_grp_max_margin(aggr_grp_t *); 148 static boolean_t aggr_grp_sdu_check(aggr_grp_t *, aggr_port_t *); 149 static boolean_t aggr_grp_margin_check(aggr_grp_t *, aggr_port_t *); 150 151 static int aggr_add_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *); 152 static void aggr_rem_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *); 153 static int aggr_pseudo_disable_intr(mac_intr_handle_t); 154 static int aggr_pseudo_enable_intr(mac_intr_handle_t); 155 static int aggr_pseudo_start_rx_ring(mac_ring_driver_t, uint64_t); 156 static void aggr_pseudo_stop_rx_ring(mac_ring_driver_t); 157 static int aggr_addmac(void *, const uint8_t *); 158 static int aggr_remmac(void *, const uint8_t *); 159 static int aggr_addvlan(mac_group_driver_t, uint16_t); 160 static int aggr_remvlan(mac_group_driver_t, uint16_t); 161 static mblk_t *aggr_rx_poll(void *, int); 162 static void aggr_fill_ring(void *, mac_ring_type_t, const int, 163 const int, mac_ring_info_t *, mac_ring_handle_t); 164 static void aggr_fill_group(void *, mac_ring_type_t, const int, 165 mac_group_info_t *, mac_group_handle_t); 166 167 static kmem_cache_t *aggr_grp_cache; 168 static mod_hash_t *aggr_grp_hash; 169 static krwlock_t aggr_grp_lock; 170 static uint_t aggr_grp_cnt; 171 static id_space_t *key_ids; 172 173 #define GRP_HASHSZ 64 174 #define 
GRP_HASH_KEY(linkid) ((mod_hash_key_t)(uintptr_t)linkid) 175 #define AGGR_PORT_NAME_DELIMIT '-' 176 177 static uchar_t aggr_zero_mac[] = {0, 0, 0, 0, 0, 0}; 178 179 #define AGGR_M_CALLBACK_FLAGS \ 180 (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_PROPINFO) 181 182 static mac_callbacks_t aggr_m_callbacks = { 183 AGGR_M_CALLBACK_FLAGS, 184 aggr_m_stat, 185 aggr_m_start, 186 aggr_m_stop, 187 aggr_m_promisc, 188 aggr_m_multicst, 189 NULL, 190 NULL, 191 NULL, 192 aggr_m_ioctl, 193 aggr_m_capab_get, 194 NULL, 195 NULL, 196 aggr_m_setprop, 197 NULL, 198 aggr_m_propinfo 199 }; 200 201 /*ARGSUSED*/ 202 static int 203 aggr_grp_constructor(void *buf, void *arg, int kmflag) 204 { 205 aggr_grp_t *grp = buf; 206 207 bzero(grp, sizeof (*grp)); 208 mutex_init(&grp->lg_lacp_lock, NULL, MUTEX_DEFAULT, NULL); 209 cv_init(&grp->lg_lacp_cv, NULL, CV_DEFAULT, NULL); 210 rw_init(&grp->lg_tx_lock, NULL, RW_DRIVER, NULL); 211 mutex_init(&grp->lg_port_lock, NULL, MUTEX_DEFAULT, NULL); 212 cv_init(&grp->lg_port_cv, NULL, CV_DEFAULT, NULL); 213 mutex_init(&grp->lg_tx_flowctl_lock, NULL, MUTEX_DEFAULT, NULL); 214 cv_init(&grp->lg_tx_flowctl_cv, NULL, CV_DEFAULT, NULL); 215 grp->lg_link_state = LINK_STATE_UNKNOWN; 216 return (0); 217 } 218 219 /*ARGSUSED*/ 220 static void 221 aggr_grp_destructor(void *buf, void *arg) 222 { 223 aggr_grp_t *grp = buf; 224 225 if (grp->lg_tx_ports != NULL) { 226 kmem_free(grp->lg_tx_ports, 227 grp->lg_tx_ports_size * sizeof (aggr_port_t *)); 228 } 229 230 mutex_destroy(&grp->lg_lacp_lock); 231 cv_destroy(&grp->lg_lacp_cv); 232 mutex_destroy(&grp->lg_port_lock); 233 cv_destroy(&grp->lg_port_cv); 234 rw_destroy(&grp->lg_tx_lock); 235 mutex_destroy(&grp->lg_tx_flowctl_lock); 236 cv_destroy(&grp->lg_tx_flowctl_cv); 237 } 238 239 void 240 aggr_grp_init(void) 241 { 242 aggr_grp_cache = kmem_cache_create("aggr_grp_cache", 243 sizeof (aggr_grp_t), 0, aggr_grp_constructor, 244 aggr_grp_destructor, NULL, NULL, NULL, 0); 245 246 aggr_grp_hash = 
mod_hash_create_idhash("aggr_grp_hash", 247 GRP_HASHSZ, mod_hash_null_valdtor); 248 rw_init(&aggr_grp_lock, NULL, RW_DEFAULT, NULL); 249 aggr_grp_cnt = 0; 250 251 /* 252 * Allocate an id space to manage key values (when key is not 253 * specified). The range of the id space will be from 254 * (AGGR_MAX_KEY + 1) to UINT16_MAX, because the LACP protocol 255 * uses a 16-bit key. 256 */ 257 key_ids = id_space_create("aggr_key_ids", AGGR_MAX_KEY + 1, UINT16_MAX); 258 ASSERT(key_ids != NULL); 259 } 260 261 void 262 aggr_grp_fini(void) 263 { 264 id_space_destroy(key_ids); 265 rw_destroy(&aggr_grp_lock); 266 mod_hash_destroy_idhash(aggr_grp_hash); 267 kmem_cache_destroy(aggr_grp_cache); 268 } 269 270 uint_t 271 aggr_grp_count(void) 272 { 273 uint_t count; 274 275 rw_enter(&aggr_grp_lock, RW_READER); 276 count = aggr_grp_cnt; 277 rw_exit(&aggr_grp_lock); 278 return (count); 279 } 280 281 /* 282 * Since both aggr_port_notify_cb() and aggr_port_timer_thread() functions 283 * requires the mac perimeter, this function holds a reference of the aggr 284 * and aggr won't call mac_unregister() until this reference drops to 0. 285 */ 286 void 287 aggr_grp_port_hold(aggr_port_t *port) 288 { 289 aggr_grp_t *grp = port->lp_grp; 290 291 AGGR_PORT_REFHOLD(port); 292 mutex_enter(&grp->lg_port_lock); 293 grp->lg_port_ref++; 294 mutex_exit(&grp->lg_port_lock); 295 } 296 297 /* 298 * Release the reference of the grp and inform aggr_grp_delete() calling 299 * mac_unregister() is now safe. 300 */ 301 void 302 aggr_grp_port_rele(aggr_port_t *port) 303 { 304 aggr_grp_t *grp = port->lp_grp; 305 306 mutex_enter(&grp->lg_port_lock); 307 if (--grp->lg_port_ref == 0) 308 cv_signal(&grp->lg_port_cv); 309 mutex_exit(&grp->lg_port_lock); 310 AGGR_PORT_REFRELE(port); 311 } 312 313 /* 314 * Wait for the port's lacp timer thread and the port's notification callback 315 * to exit. 
316 */ 317 void 318 aggr_grp_port_wait(aggr_grp_t *grp) 319 { 320 mutex_enter(&grp->lg_port_lock); 321 if (grp->lg_port_ref != 0) 322 cv_wait(&grp->lg_port_cv, &grp->lg_port_lock); 323 mutex_exit(&grp->lg_port_lock); 324 } 325 326 /* 327 * Attach a port to a link aggregation group. 328 * 329 * A port is attached to a link aggregation group once its speed 330 * and link state have been verified. 331 * 332 * Returns B_TRUE if the group link state or speed has changed. If 333 * it's the case, the caller must notify the MAC layer via a call 334 * to mac_link(). 335 */ 336 boolean_t 337 aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port) 338 { 339 boolean_t link_state_changed = B_FALSE; 340 341 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 342 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 343 344 if (port->lp_state == AGGR_PORT_STATE_ATTACHED) 345 return (B_FALSE); 346 347 /* 348 * Validate the MAC port link speed and update the group 349 * link speed if needed. 350 */ 351 if (port->lp_ifspeed == 0 || 352 port->lp_link_state != LINK_STATE_UP || 353 port->lp_link_duplex != LINK_DUPLEX_FULL) { 354 /* 355 * Can't attach a MAC port with unknown link speed, 356 * down link, or not in full duplex mode. 357 */ 358 return (B_FALSE); 359 } 360 361 mutex_enter(&grp->lg_stat_lock); 362 if (grp->lg_ifspeed == 0) { 363 /* 364 * The group inherits the speed of the first link being 365 * attached. 366 */ 367 grp->lg_ifspeed = port->lp_ifspeed; 368 link_state_changed = B_TRUE; 369 } else if (grp->lg_ifspeed != port->lp_ifspeed) { 370 /* 371 * The link speed of the MAC port must be the same as 372 * the group link speed, as per 802.3ad. Since it is 373 * not, the attach is cancelled. 374 */ 375 mutex_exit(&grp->lg_stat_lock); 376 return (B_FALSE); 377 } 378 mutex_exit(&grp->lg_stat_lock); 379 380 grp->lg_nattached_ports++; 381 382 /* 383 * Update the group link state. 
384 */ 385 if (grp->lg_link_state != LINK_STATE_UP) { 386 grp->lg_link_state = LINK_STATE_UP; 387 mutex_enter(&grp->lg_stat_lock); 388 grp->lg_link_duplex = LINK_DUPLEX_FULL; 389 mutex_exit(&grp->lg_stat_lock); 390 link_state_changed = B_TRUE; 391 } 392 393 /* 394 * Update port's state. 395 */ 396 port->lp_state = AGGR_PORT_STATE_ATTACHED; 397 398 aggr_grp_multicst_port(port, B_TRUE); 399 400 /* 401 * The port client doesn't have an Rx SRS; instead of calling 402 * mac_rx_set() we set the client's flow callback directly. 403 * This datapath is used only when the port's driver doesn't 404 * support MAC_CAPAB_RINGS. Drivers with ring support will 405 * deliver traffic to the aggr via ring passthru. 406 */ 407 mac_client_set_flow_cb(port->lp_mch, aggr_recv_cb, port); 408 409 /* 410 * If LACP is OFF, the port can be used to send data as soon 411 * as its link is up and verified to be compatible with the 412 * aggregation. 413 * 414 * If LACP is active or passive, notify the LACP subsystem, which 415 * will enable sending on the port following the LACP protocol. 
416 */ 417 if (grp->lg_lacp_mode == AGGR_LACP_OFF) 418 aggr_send_port_enable(port); 419 else 420 aggr_lacp_port_attached(port); 421 422 return (link_state_changed); 423 } 424 425 boolean_t 426 aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port) 427 { 428 boolean_t link_state_changed = B_FALSE; 429 430 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 431 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 432 433 /* update state */ 434 if (port->lp_state != AGGR_PORT_STATE_ATTACHED) 435 return (B_FALSE); 436 437 mac_client_clear_flow_cb(port->lp_mch); 438 439 aggr_grp_multicst_port(port, B_FALSE); 440 441 if (grp->lg_lacp_mode == AGGR_LACP_OFF) 442 aggr_send_port_disable(port); 443 else 444 aggr_lacp_port_detached(port); 445 446 port->lp_state = AGGR_PORT_STATE_STANDBY; 447 448 grp->lg_nattached_ports--; 449 if (grp->lg_nattached_ports == 0) { 450 /* the last attached MAC port of the group is being detached */ 451 grp->lg_link_state = LINK_STATE_DOWN; 452 mutex_enter(&grp->lg_stat_lock); 453 grp->lg_ifspeed = 0; 454 grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN; 455 mutex_exit(&grp->lg_stat_lock); 456 link_state_changed = B_TRUE; 457 } 458 459 return (link_state_changed); 460 } 461 462 /* 463 * Update the MAC addresses of the constituent ports of the specified 464 * group. This function is invoked: 465 * - after creating a new aggregation group. 466 * - after adding new ports to an aggregation group. 467 * - after removing a port from a group when the MAC address of 468 * that port was used for the MAC address of the group. 469 * - after the MAC address of a port changed when the MAC address 470 * of that port was used for the MAC address of the group. 471 * 472 * Return true if the link state of the aggregation changed, for example 473 * as a result of a failure changing the MAC address of one of the 474 * constituent ports. 
475 */ 476 boolean_t 477 aggr_grp_update_ports_mac(aggr_grp_t *grp) 478 { 479 aggr_port_t *cport; 480 boolean_t link_state_changed = B_FALSE; 481 mac_perim_handle_t mph; 482 483 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 484 485 for (cport = grp->lg_ports; cport != NULL; 486 cport = cport->lp_next) { 487 mac_perim_enter_by_mh(cport->lp_mh, &mph); 488 if (aggr_port_unicst(cport) != 0) { 489 if (aggr_grp_detach_port(grp, cport)) 490 link_state_changed = B_TRUE; 491 } else { 492 /* 493 * If a port was detached because of a previous 494 * failure changing the MAC address, the port is 495 * reattached when it successfully changes the MAC 496 * address now, and this might cause the link state 497 * of the aggregation to change. 498 */ 499 if (aggr_grp_attach_port(grp, cport)) 500 link_state_changed = B_TRUE; 501 } 502 mac_perim_exit(mph); 503 } 504 return (link_state_changed); 505 } 506 507 /* 508 * Invoked when the MAC address of a port has changed. If the port's 509 * MAC address was used for the group MAC address, set mac_addr_changedp 510 * to B_TRUE to indicate to the caller that it should send a MAC_NOTE_UNICST 511 * notification. If the link state changes due to detach/attach of 512 * the constituent port, set link_state_changedp to B_TRUE to indicate 513 * to the caller that it should send a MAC_NOTE_LINK notification. In both 514 * cases, it is the responsibility of the caller to invoke notification 515 * functions after releasing the the port lock. 516 */ 517 void 518 aggr_grp_port_mac_changed(aggr_grp_t *grp, aggr_port_t *port, 519 boolean_t *mac_addr_changedp, boolean_t *link_state_changedp) 520 { 521 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 522 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 523 ASSERT(mac_addr_changedp != NULL); 524 ASSERT(link_state_changedp != NULL); 525 526 *mac_addr_changedp = B_FALSE; 527 *link_state_changedp = B_FALSE; 528 529 if (grp->lg_addr_fixed) { 530 /* 531 * The group is using a fixed MAC address or an automatic 532 * MAC address has not been set. 
533 */ 534 return; 535 } 536 537 if (grp->lg_mac_addr_port == port) { 538 /* 539 * The MAC address of the port was assigned to the group 540 * MAC address. Update the group MAC address. 541 */ 542 bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL); 543 *mac_addr_changedp = B_TRUE; 544 } else { 545 /* 546 * Update the actual port MAC address to the MAC address 547 * of the group. 548 */ 549 if (aggr_port_unicst(port) != 0) { 550 *link_state_changedp = aggr_grp_detach_port(grp, port); 551 } else { 552 /* 553 * If a port was detached because of a previous 554 * failure changing the MAC address, the port is 555 * reattached when it successfully changes the MAC 556 * address now, and this might cause the link state 557 * of the aggregation to change. 558 */ 559 *link_state_changedp = aggr_grp_attach_port(grp, port); 560 } 561 } 562 } 563 564 /* 565 * Add a port to a link aggregation group. 566 */ 567 static int 568 aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t port_linkid, boolean_t force, 569 aggr_port_t **pp) 570 { 571 aggr_port_t *port, **cport; 572 mac_perim_handle_t mph; 573 zoneid_t port_zoneid = ALL_ZONES; 574 int err; 575 576 /* The port must be in the same zone as the aggregation. */ 577 if (zone_check_datalink(&port_zoneid, port_linkid) != 0) 578 port_zoneid = GLOBAL_ZONEID; 579 if (grp->lg_zoneid != port_zoneid) 580 return (EBUSY); 581 582 /* 583 * If we are creating the aggr, then there is no MAC handle 584 * and thus no perimeter to hold. If we are adding a port to 585 * an existing aggr, then the perimiter of the aggr's MAC must 586 * be held. 587 */ 588 ASSERT(grp->lg_mh == NULL || MAC_PERIM_HELD(grp->lg_mh)); 589 590 err = aggr_port_create(grp, port_linkid, force, &port); 591 if (err != 0) 592 return (err); 593 594 mac_perim_enter_by_mh(port->lp_mh, &mph); 595 596 /* Add the new port to the end of the list. 
*/ 597 cport = &grp->lg_ports; 598 while (*cport != NULL) 599 cport = &((*cport)->lp_next); 600 *cport = port; 601 602 /* 603 * Back reference to the group it is member of. A port always 604 * holds a reference to its group to ensure that the back 605 * reference is always valid. 606 */ 607 port->lp_grp = grp; 608 AGGR_GRP_REFHOLD(grp); 609 grp->lg_nports++; 610 if (grp->lg_nports > grp->lg_nports_high) 611 grp->lg_nports_high = grp->lg_nports; 612 613 aggr_lacp_init_port(port); 614 mac_perim_exit(mph); 615 616 if (pp != NULL) 617 *pp = port; 618 619 return (0); 620 } 621 622 /* 623 * This is called when the 'lg_tx_ports' arrangement has changed and 624 * we need to update the corresponding 'mi_default_tx_ring'. This 625 * happens for several reasons. 626 * 627 * - A pseudo TX mac group was added or removed. 628 * - An LACP message has changed the port's state. 629 * - A link event has changed the port's state. 630 * 631 * In any case, we see if there is at least one port enabled (see 632 * 'aggr_send_port_enable()'), and if so we use its first ring as the 633 * mac's default TX ring. 634 * 635 * Note, because we only have a single TX group, we don't have to 636 * worry about the rings moving between groups and the chance that mac 637 * will reassign it unless someone removes a port, at which point, we 638 * play it safe and call this again. 639 */ 640 void 641 aggr_grp_update_default(aggr_grp_t *grp) 642 { 643 aggr_port_t *port; 644 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 645 646 rw_enter(&grp->lg_tx_lock, RW_WRITER); 647 648 if (grp->lg_ntx_ports == 0) { 649 rw_exit(&grp->lg_tx_lock); 650 return; 651 } 652 653 port = grp->lg_tx_ports[0]; 654 ASSERT(port->lp_tx_ring_cnt > 0); 655 mac_hwring_set_default(grp->lg_mh, port->lp_pseudo_tx_rings[0]); 656 rw_exit(&grp->lg_tx_lock); 657 } 658 659 /* 660 * Add a pseudo RX ring for the given HW ring handle. 
661 */ 662 static int 663 aggr_add_pseudo_rx_ring(aggr_port_t *port, 664 aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh) 665 { 666 aggr_pseudo_rx_ring_t *ring; 667 int err; 668 int j; 669 670 for (j = 0; j < MAX_RINGS_PER_GROUP; j++) { 671 ring = rx_grp->arg_rings + j; 672 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE)) 673 break; 674 } 675 676 /* 677 * No slot for this new RX ring. 678 */ 679 if (j == MAX_RINGS_PER_GROUP) 680 return (ENOSPC); 681 682 ring->arr_flags |= MAC_PSEUDO_RING_INUSE; 683 ring->arr_hw_rh = hw_rh; 684 ring->arr_port = port; 685 ring->arr_grp = rx_grp; 686 rx_grp->arg_ring_cnt++; 687 688 /* 689 * The group is already registered, dynamically add a new ring to the 690 * mac group. 691 */ 692 if ((err = mac_group_add_ring(rx_grp->arg_gh, j)) != 0) { 693 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE; 694 ring->arr_hw_rh = NULL; 695 ring->arr_port = NULL; 696 ring->arr_grp = NULL; 697 rx_grp->arg_ring_cnt--; 698 } else { 699 /* 700 * This must run after the MAC is registered. 701 */ 702 ASSERT3P(ring->arr_rh, !=, NULL); 703 mac_hwring_set_passthru(hw_rh, (mac_rx_t)aggr_recv_cb, 704 (void *)port, (mac_resource_handle_t)ring); 705 } 706 return (err); 707 } 708 709 /* 710 * Remove the pseudo RX ring of the given HW ring handle. 711 */ 712 static void 713 aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh) 714 { 715 for (uint_t j = 0; j < MAX_RINGS_PER_GROUP; j++) { 716 aggr_pseudo_rx_ring_t *ring = rx_grp->arg_rings + j; 717 718 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE) || 719 ring->arr_hw_rh != hw_rh) { 720 continue; 721 } 722 723 mac_group_rem_ring(rx_grp->arg_gh, ring->arr_rh); 724 725 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE; 726 ring->arr_hw_rh = NULL; 727 ring->arr_port = NULL; 728 ring->arr_grp = NULL; 729 rx_grp->arg_ring_cnt--; 730 mac_hwring_clear_passthru(hw_rh); 731 break; 732 } 733 } 734 735 /* 736 * Create pseudo rings over the HW rings of the port. 
737 * 738 * o Create a pseudo ring in rx_grp per HW ring in the port's HW group. 739 * 740 * o Program existing unicast filters on the pseudo group into the HW group. 741 * 742 * o Program existing VLAN filters on the pseudo group into the HW group. 743 */ 744 static int 745 aggr_add_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp) 746 { 747 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP]; 748 aggr_unicst_addr_t *addr, *a; 749 mac_perim_handle_t pmph; 750 aggr_vlan_t *avp; 751 uint_t hw_rh_cnt, i; 752 int err = 0; 753 uint_t g_idx = rx_grp->arg_index; 754 755 ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh)); 756 ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT); 757 mac_perim_enter_by_mh(port->lp_mh, &pmph); 758 759 i = 0; 760 addr = NULL; 761 /* 762 * This function must be called after the aggr registers its 763 * MAC and its Rx groups have been initialized. 764 */ 765 ASSERT(rx_grp->arg_gh != NULL); 766 767 /* 768 * Get the list of the underlying HW rings. 769 */ 770 hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx, 771 &port->lp_hwghs[g_idx], hw_rh, MAC_RING_TYPE_RX); 772 773 /* 774 * Add existing VLAN and unicast address filters to the port. 
775 */ 776 for (avp = list_head(&rx_grp->arg_vlans); avp != NULL; 777 avp = list_next(&rx_grp->arg_vlans, avp)) { 778 if ((err = aggr_port_addvlan(port, g_idx, avp->av_vid)) != 0) 779 goto err; 780 } 781 782 for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) { 783 if ((err = aggr_port_addmac(port, g_idx, addr->aua_addr)) != 0) 784 goto err; 785 } 786 787 for (i = 0; i < hw_rh_cnt; i++) { 788 err = aggr_add_pseudo_rx_ring(port, rx_grp, hw_rh[i]); 789 if (err != 0) 790 goto err; 791 } 792 793 mac_perim_exit(pmph); 794 return (0); 795 796 err: 797 ASSERT(err != 0); 798 799 for (uint_t j = 0; j < i; j++) 800 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]); 801 802 for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next) 803 aggr_port_remmac(port, g_idx, a->aua_addr); 804 805 if (avp != NULL) 806 avp = list_prev(&rx_grp->arg_vlans, avp); 807 808 for (; avp != NULL; avp = list_prev(&rx_grp->arg_vlans, avp)) { 809 int err2; 810 811 if ((err2 = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) { 812 cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s" 813 ": errno %d.", avp->av_vid, 814 mac_client_name(port->lp_mch), err2); 815 } 816 } 817 818 port->lp_hwghs[g_idx] = NULL; 819 mac_perim_exit(pmph); 820 return (err); 821 } 822 823 /* 824 * Destroy the pseudo rings mapping to this port and remove all VLAN 825 * and unicast filters from this port. Even if there are no underlying 826 * HW rings we must still remove the unicast filters to take the port 827 * out of promisc mode. 
828 */ 829 static void 830 aggr_rem_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp) 831 { 832 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP]; 833 aggr_unicst_addr_t *addr; 834 mac_perim_handle_t pmph; 835 uint_t hw_rh_cnt; 836 uint_t g_idx = rx_grp->arg_index; 837 838 ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh)); 839 ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT); 840 ASSERT3P(rx_grp->arg_gh, !=, NULL); 841 mac_perim_enter_by_mh(port->lp_mh, &pmph); 842 843 hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx, NULL, hw_rh, 844 MAC_RING_TYPE_RX); 845 846 for (uint_t i = 0; i < hw_rh_cnt; i++) 847 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[i]); 848 849 for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) 850 aggr_port_remmac(port, g_idx, addr->aua_addr); 851 852 for (aggr_vlan_t *avp = list_head(&rx_grp->arg_vlans); avp != NULL; 853 avp = list_next(&rx_grp->arg_vlans, avp)) { 854 int err; 855 856 if ((err = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) { 857 cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s" 858 ": errno %d.", avp->av_vid, 859 mac_client_name(port->lp_mch), err); 860 } 861 } 862 863 port->lp_hwghs[g_idx] = NULL; 864 mac_perim_exit(pmph); 865 } 866 867 /* 868 * Add a pseudo TX ring for the given HW ring handle. 869 */ 870 static int 871 aggr_add_pseudo_tx_ring(aggr_port_t *port, 872 aggr_pseudo_tx_group_t *tx_grp, mac_ring_handle_t hw_rh, 873 mac_ring_handle_t *pseudo_rh) 874 { 875 aggr_pseudo_tx_ring_t *ring; 876 int err; 877 int i; 878 879 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 880 for (i = 0; i < MAX_RINGS_PER_GROUP; i++) { 881 ring = tx_grp->atg_rings + i; 882 if (!(ring->atr_flags & MAC_PSEUDO_RING_INUSE)) 883 break; 884 } 885 /* 886 * No slot for this new TX ring. 887 */ 888 if (i == MAX_RINGS_PER_GROUP) 889 return (ENOSPC); 890 /* 891 * The following 4 statements needs to be done before 892 * calling mac_group_add_ring(). Otherwise it will 893 * result in an assertion failure in mac_init_ring(). 
894 */ 895 ring->atr_flags |= MAC_PSEUDO_RING_INUSE; 896 ring->atr_hw_rh = hw_rh; 897 ring->atr_port = port; 898 tx_grp->atg_ring_cnt++; 899 900 /* 901 * The TX side has no concept of ring groups unlike RX groups. 902 * There is just a single group which stores all the TX rings. 903 * This group will be used to store aggr's pseudo TX rings. 904 */ 905 if ((err = mac_group_add_ring(tx_grp->atg_gh, i)) != 0) { 906 ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE; 907 ring->atr_hw_rh = NULL; 908 ring->atr_port = NULL; 909 tx_grp->atg_ring_cnt--; 910 } else { 911 *pseudo_rh = mac_find_ring(tx_grp->atg_gh, i); 912 if (hw_rh != NULL) { 913 mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring, 914 mac_find_ring(tx_grp->atg_gh, i)); 915 } 916 } 917 918 return (err); 919 } 920 921 /* 922 * Remove the pseudo TX ring of the given HW ring handle. 923 */ 924 static void 925 aggr_rem_pseudo_tx_ring(aggr_pseudo_tx_group_t *tx_grp, 926 mac_ring_handle_t pseudo_hw_rh) 927 { 928 aggr_pseudo_tx_ring_t *ring; 929 int i; 930 931 for (i = 0; i < MAX_RINGS_PER_GROUP; i++) { 932 ring = tx_grp->atg_rings + i; 933 if (ring->atr_rh != pseudo_hw_rh) 934 continue; 935 936 ASSERT(ring->atr_flags & MAC_PSEUDO_RING_INUSE); 937 mac_group_rem_ring(tx_grp->atg_gh, pseudo_hw_rh); 938 ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE; 939 mac_hwring_teardown(ring->atr_hw_rh); 940 ring->atr_hw_rh = NULL; 941 ring->atr_port = NULL; 942 tx_grp->atg_ring_cnt--; 943 break; 944 } 945 } 946 947 /* 948 * This function is called to create pseudo rings over hardware rings of 949 * the underlying device. There is a 1:1 mapping between the pseudo TX 950 * rings of the aggr and the hardware rings of the underlying port. 
951 */ 952 static int 953 aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp, 954 uint_t limit) 955 { 956 aggr_grp_t *grp = port->lp_grp; 957 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP], pseudo_rh; 958 mac_perim_handle_t pmph; 959 int hw_rh_cnt, i = 0, j; 960 int err = 0; 961 962 if (limit == 0) 963 return (ENOSPC); 964 965 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 966 mac_perim_enter_by_mh(port->lp_mh, &pmph); 967 968 /* 969 * Get the list the the underlying HW rings. 970 */ 971 hw_rh_cnt = mac_hwrings_get(port->lp_mch, NULL, hw_rh, 972 MAC_RING_TYPE_TX); 973 974 /* 975 * Even if the underlying NIC does not have TX rings, we 976 * still make a psuedo TX ring for that NIC with NULL as 977 * the ring handle. 978 */ 979 if (hw_rh_cnt == 0) 980 port->lp_tx_ring_cnt = 1; 981 else 982 port->lp_tx_ring_cnt = MIN(hw_rh_cnt, limit); 983 984 port->lp_tx_ring_alloc = port->lp_tx_ring_cnt; 985 port->lp_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) * 986 port->lp_tx_ring_alloc), KM_SLEEP); 987 port->lp_pseudo_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) * 988 port->lp_tx_ring_alloc), KM_SLEEP); 989 990 if (hw_rh_cnt == 0) { 991 if ((err = aggr_add_pseudo_tx_ring(port, tx_grp, 992 NULL, &pseudo_rh)) == 0) { 993 port->lp_tx_rings[0] = NULL; 994 port->lp_pseudo_tx_rings[0] = pseudo_rh; 995 } 996 } else { 997 for (i = 0; err == 0 && i < port->lp_tx_ring_cnt; i++) { 998 err = aggr_add_pseudo_tx_ring(port, 999 tx_grp, hw_rh[i], &pseudo_rh); 1000 if (err != 0) 1001 break; 1002 port->lp_tx_rings[i] = hw_rh[i]; 1003 port->lp_pseudo_tx_rings[i] = pseudo_rh; 1004 } 1005 } 1006 1007 if (err != 0) { 1008 if (hw_rh_cnt != 0) { 1009 for (j = 0; j < i; j++) { 1010 aggr_rem_pseudo_tx_ring(tx_grp, 1011 port->lp_pseudo_tx_rings[j]); 1012 } 1013 } 1014 kmem_free(port->lp_tx_rings, 1015 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_alloc)); 1016 kmem_free(port->lp_pseudo_tx_rings, 1017 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_alloc)); 1018 
port->lp_tx_ring_cnt = 0; 1019 port->lp_tx_ring_alloc = 0; 1020 } else { 1021 port->lp_tx_grp_added = B_TRUE; 1022 port->lp_tx_notify_mh = mac_client_tx_notify(port->lp_mch, 1023 aggr_tx_ring_update, port); 1024 } 1025 mac_perim_exit(pmph); 1026 aggr_grp_update_default(grp); 1027 return (err); 1028 } 1029 1030 /* 1031 * This function is called by aggr to remove pseudo TX rings over the 1032 * HW rings of the underlying port. 1033 */ 1034 static void 1035 aggr_rem_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp) 1036 { 1037 aggr_grp_t *grp = port->lp_grp; 1038 mac_perim_handle_t pmph; 1039 int i; 1040 1041 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 1042 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1043 1044 if (!port->lp_tx_grp_added) 1045 goto done; 1046 1047 ASSERT(tx_grp->atg_gh != NULL); 1048 1049 for (i = 0; i < port->lp_tx_ring_cnt; i++) 1050 aggr_rem_pseudo_tx_ring(tx_grp, port->lp_pseudo_tx_rings[i]); 1051 1052 kmem_free(port->lp_tx_rings, 1053 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_alloc)); 1054 kmem_free(port->lp_pseudo_tx_rings, 1055 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_alloc)); 1056 1057 port->lp_tx_ring_cnt = 0; 1058 (void) mac_client_tx_notify(port->lp_mch, NULL, port->lp_tx_notify_mh); 1059 port->lp_tx_grp_added = B_FALSE; 1060 aggr_grp_update_default(grp); 1061 done: 1062 mac_perim_exit(pmph); 1063 } 1064 1065 static int 1066 aggr_pseudo_disable_intr(mac_intr_handle_t ih) 1067 { 1068 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih; 1069 return (mac_hwring_disable_intr(rr_ring->arr_hw_rh)); 1070 } 1071 1072 static int 1073 aggr_pseudo_enable_intr(mac_intr_handle_t ih) 1074 { 1075 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih; 1076 return (mac_hwring_enable_intr(rr_ring->arr_hw_rh)); 1077 } 1078 1079 /* 1080 * Start the pseudo ring. Since the pseudo ring is just an abstraction 1081 * over an actual HW ring, the real task is to start the underlying HW 1082 * ring. 
1083 */ 1084 static int 1085 aggr_pseudo_start_rx_ring(mac_ring_driver_t arg, uint64_t mr_gen) 1086 { 1087 int err; 1088 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg; 1089 1090 err = mac_hwring_start(rr_ring->arr_hw_rh); 1091 1092 if (err != 0) 1093 return (err); 1094 1095 rr_ring->arr_gen = mr_gen; 1096 return (err); 1097 } 1098 1099 /* 1100 * Stop the pseudo ring. Since the pseudo ring is just an abstraction 1101 * over an actual HW ring, the real task is to stop the underlying HW 1102 * ring. 1103 */ 1104 static void 1105 aggr_pseudo_stop_rx_ring(mac_ring_driver_t arg) 1106 { 1107 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg; 1108 1109 /* 1110 * The rings underlying the default group must stay up to 1111 * continue receiving LACP traffic. We would normally never 1112 * stop the default Rx rings because of the primary MAC 1113 * client; but aggr's primary MAC client doesn't call 1114 * mac_unicast_add() and thus mi_active is 0 when the last 1115 * non-primary client is deleted. 1116 */ 1117 if (rr_ring->arr_grp->arg_index != 0) 1118 mac_hwring_stop(rr_ring->arr_hw_rh); 1119 } 1120 1121 /* 1122 * Trim each port in a group to ensure it uses no more than tx_ring_limit 1123 * rings. 1124 */ 1125 static void 1126 aggr_grp_balance_tx(aggr_grp_t *grp, uint_t tx_ring_limit) 1127 { 1128 aggr_port_t *port; 1129 mac_perim_handle_t mph; 1130 uint_t i, tx_ring_cnt; 1131 1132 ASSERT(tx_ring_limit > 0); 1133 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 1134 1135 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 1136 mac_perim_enter_by_mh(port->lp_mh, &mph); 1137 1138 /* 1139 * Reduce the Tx ring count first to prevent rings being 1140 * used as they are removed. 
1141 */ 1142 rw_enter(&grp->lg_tx_lock, RW_WRITER); 1143 if (port->lp_tx_ring_cnt <= tx_ring_limit) { 1144 rw_exit(&grp->lg_tx_lock); 1145 mac_perim_exit(mph); 1146 continue; 1147 } 1148 1149 tx_ring_cnt = port->lp_tx_ring_cnt; 1150 port->lp_tx_ring_cnt = tx_ring_limit; 1151 rw_exit(&grp->lg_tx_lock); 1152 1153 for (i = tx_ring_cnt - 1; i >= tx_ring_limit; i--) { 1154 aggr_rem_pseudo_tx_ring(&grp->lg_tx_group, 1155 port->lp_pseudo_tx_rings[i]); 1156 1157 } 1158 1159 mac_perim_exit(mph); 1160 } 1161 } 1162 1163 /* 1164 * Add one or more ports to an existing link aggregation group. 1165 */ 1166 int 1167 aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force, 1168 laioc_port_t *ports) 1169 { 1170 int rc; 1171 uint_t port_added = 0; 1172 uint_t grp_added; 1173 uint_t nports_high, tx_ring_limit; 1174 aggr_grp_t *grp = NULL; 1175 aggr_port_t *port; 1176 boolean_t link_state_changed = B_FALSE; 1177 mac_perim_handle_t mph, pmph; 1178 1179 /* Get the aggr corresponding to linkid. */ 1180 rw_enter(&aggr_grp_lock, RW_READER); 1181 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1182 (mod_hash_val_t *)&grp) != 0) { 1183 rw_exit(&aggr_grp_lock); 1184 return (ENOENT); 1185 } 1186 AGGR_GRP_REFHOLD(grp); 1187 1188 /* 1189 * Hold the perimeter so that the aggregation can't be destroyed. 1190 */ 1191 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1192 rw_exit(&aggr_grp_lock); 1193 1194 /* 1195 * Limit the number of Tx rings per port. When determining the 1196 * number of ports take into consideration the existing high 1197 * value, and what the new high value may be after this request. 1198 */ 1199 nports_high = MAX(grp->lg_nports_high, grp->lg_nports + nports); 1200 tx_ring_limit = MAX_RINGS_PER_GROUP / nports_high; 1201 1202 if (tx_ring_limit == 0) { 1203 rc = ENOSPC; 1204 goto bail; 1205 } 1206 1207 /* 1208 * Balance the Tx rings so each port has a fair share of rings. 
1209 */ 1210 aggr_grp_balance_tx(grp, tx_ring_limit); 1211 1212 /* Add the specified ports to the aggr. */ 1213 for (uint_t i = 0; i < nports; i++) { 1214 grp_added = 0; 1215 1216 if ((rc = aggr_grp_add_port(grp, ports[i].lp_linkid, 1217 force, &port)) != 0) { 1218 goto bail; 1219 } 1220 1221 ASSERT(port != NULL); 1222 port_added++; 1223 1224 /* check capabilities */ 1225 if (!aggr_grp_capab_check(grp, port) || 1226 !aggr_grp_sdu_check(grp, port) || 1227 !aggr_grp_margin_check(grp, port)) { 1228 rc = ENOTSUP; 1229 goto bail; 1230 } 1231 1232 /* 1233 * Create the pseudo ring for each HW ring of the underlying 1234 * port. 1235 */ 1236 rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group, 1237 tx_ring_limit); 1238 if (rc != 0) 1239 goto bail; 1240 1241 for (uint_t j = 0; j < grp->lg_rx_group_count; j++) { 1242 rc = aggr_add_pseudo_rx_group(port, 1243 &grp->lg_rx_groups[j]); 1244 1245 if (rc != 0) 1246 goto bail; 1247 1248 grp_added++; 1249 } 1250 1251 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1252 1253 /* set LACP mode */ 1254 aggr_port_lacp_set_mode(grp, port); 1255 1256 /* start port if group has already been started */ 1257 if (grp->lg_started) { 1258 rc = aggr_port_start(port); 1259 if (rc != 0) { 1260 mac_perim_exit(pmph); 1261 goto bail; 1262 } 1263 1264 /* 1265 * Turn on the promiscuous mode over the port when it 1266 * is requested to be turned on to receive the 1267 * non-primary address over a port, or the promiscuous 1268 * mode is enabled over the aggr. 1269 */ 1270 if (grp->lg_promisc || port->lp_prom_addr != NULL) { 1271 rc = aggr_port_promisc(port, B_TRUE); 1272 if (rc != 0) { 1273 mac_perim_exit(pmph); 1274 goto bail; 1275 } 1276 } 1277 } 1278 mac_perim_exit(pmph); 1279 1280 /* 1281 * Attach each port if necessary. 1282 */ 1283 if (aggr_port_notify_link(grp, port)) 1284 link_state_changed = B_TRUE; 1285 1286 /* 1287 * Initialize the callback functions for this port. 
1288 */ 1289 aggr_port_init_callbacks(port); 1290 } 1291 1292 /* update the MAC address of the constituent ports */ 1293 if (aggr_grp_update_ports_mac(grp)) 1294 link_state_changed = B_TRUE; 1295 1296 if (link_state_changed) 1297 mac_link_update(grp->lg_mh, grp->lg_link_state); 1298 1299 bail: 1300 if (rc != 0) { 1301 /* stop and remove ports that have been added */ 1302 for (uint_t i = 0; i < port_added; i++) { 1303 uint_t grp_remove; 1304 1305 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid); 1306 ASSERT(port != NULL); 1307 1308 if (grp->lg_started) { 1309 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1310 (void) aggr_port_promisc(port, B_FALSE); 1311 aggr_port_stop(port); 1312 mac_perim_exit(pmph); 1313 } 1314 1315 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group); 1316 1317 /* 1318 * Only the last port could have a partial set 1319 * of groups added. 1320 */ 1321 grp_remove = (i + 1 == port_added) ? grp_added : 1322 grp->lg_rx_group_count; 1323 1324 for (uint_t j = 0; j < grp_remove; j++) { 1325 aggr_rem_pseudo_rx_group(port, 1326 &grp->lg_rx_groups[j]); 1327 } 1328 1329 (void) aggr_grp_rem_port(grp, port, NULL, NULL); 1330 } 1331 } 1332 1333 mac_perim_exit(mph); 1334 AGGR_GRP_REFRELE(grp); 1335 return (rc); 1336 } 1337 1338 static int 1339 aggr_grp_modify_common(aggr_grp_t *grp, uint8_t update_mask, uint32_t policy, 1340 boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, 1341 aggr_lacp_timer_t lacp_timer) 1342 { 1343 boolean_t mac_addr_changed = B_FALSE; 1344 boolean_t link_state_changed = B_FALSE; 1345 mac_perim_handle_t pmph; 1346 1347 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 1348 1349 /* validate fixed address if specified */ 1350 if ((update_mask & AGGR_MODIFY_MAC) && mac_fixed && 1351 ((bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) || 1352 (mac_addr[0] & 0x01))) { 1353 return (EINVAL); 1354 } 1355 1356 /* update policy if requested */ 1357 if (update_mask & AGGR_MODIFY_POLICY) 1358 aggr_send_update_policy(grp, policy); 1359 1360 /* 
update unicast MAC address if requested */ 1361 if (update_mask & AGGR_MODIFY_MAC) { 1362 if (mac_fixed) { 1363 /* user-supplied MAC address */ 1364 grp->lg_mac_addr_port = NULL; 1365 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) != 0) { 1366 bcopy(mac_addr, grp->lg_addr, ETHERADDRL); 1367 mac_addr_changed = B_TRUE; 1368 } 1369 } else if (grp->lg_addr_fixed) { 1370 /* switch from user-supplied to automatic */ 1371 aggr_port_t *port = grp->lg_ports; 1372 1373 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1374 bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL); 1375 grp->lg_mac_addr_port = port; 1376 mac_addr_changed = B_TRUE; 1377 mac_perim_exit(pmph); 1378 } 1379 grp->lg_addr_fixed = mac_fixed; 1380 } 1381 1382 if (mac_addr_changed) 1383 link_state_changed = aggr_grp_update_ports_mac(grp); 1384 1385 if (update_mask & AGGR_MODIFY_LACP_MODE) 1386 aggr_lacp_update_mode(grp, lacp_mode); 1387 1388 if (update_mask & AGGR_MODIFY_LACP_TIMER) 1389 aggr_lacp_update_timer(grp, lacp_timer); 1390 1391 if (link_state_changed) 1392 mac_link_update(grp->lg_mh, grp->lg_link_state); 1393 1394 if (mac_addr_changed) 1395 mac_unicst_update(grp->lg_mh, grp->lg_addr); 1396 1397 return (0); 1398 } 1399 1400 /* 1401 * Update properties of an existing link aggregation group. 1402 */ 1403 int 1404 aggr_grp_modify(datalink_id_t linkid, uint8_t update_mask, uint32_t policy, 1405 boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, 1406 aggr_lacp_timer_t lacp_timer) 1407 { 1408 aggr_grp_t *grp = NULL; 1409 mac_perim_handle_t mph; 1410 int err; 1411 1412 /* get group corresponding to linkid */ 1413 rw_enter(&aggr_grp_lock, RW_READER); 1414 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1415 (mod_hash_val_t *)&grp) != 0) { 1416 rw_exit(&aggr_grp_lock); 1417 return (ENOENT); 1418 } 1419 AGGR_GRP_REFHOLD(grp); 1420 1421 /* 1422 * Hold the perimeter so that the aggregation won't be destroyed. 
1423 */ 1424 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1425 rw_exit(&aggr_grp_lock); 1426 1427 err = aggr_grp_modify_common(grp, update_mask, policy, mac_fixed, 1428 mac_addr, lacp_mode, lacp_timer); 1429 1430 mac_perim_exit(mph); 1431 AGGR_GRP_REFRELE(grp); 1432 return (err); 1433 } 1434 1435 /* 1436 * Create a new link aggregation group upon request from administrator. 1437 * Returns 0 on success, an errno on failure. 1438 */ 1439 int 1440 aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports, 1441 laioc_port_t *ports, uint32_t policy, boolean_t mac_fixed, boolean_t force, 1442 uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, aggr_lacp_timer_t lacp_timer, 1443 cred_t *credp) 1444 { 1445 aggr_grp_t *grp = NULL; 1446 aggr_port_t *port; 1447 aggr_port_t *last_attached = NULL; 1448 mac_register_t *mac; 1449 boolean_t link_state_changed; 1450 mac_perim_handle_t mph, pmph; 1451 datalink_id_t tempid; 1452 boolean_t mac_registered = B_FALSE; 1453 uint_t tx_ring_limit; 1454 int err; 1455 int i, j; 1456 kt_did_t tid = 0; 1457 1458 /* need at least one port */ 1459 if (nports == 0) 1460 return (EINVAL); 1461 1462 rw_enter(&aggr_grp_lock, RW_WRITER); 1463 1464 /* does a group with the same linkid already exist? 
*/ 1465 err = mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1466 (mod_hash_val_t *)&grp); 1467 if (err == 0) { 1468 rw_exit(&aggr_grp_lock); 1469 return (EEXIST); 1470 } 1471 1472 grp = kmem_cache_alloc(aggr_grp_cache, KM_SLEEP); 1473 1474 grp->lg_refs = 1; 1475 grp->lg_closing = B_FALSE; 1476 grp->lg_force = force; 1477 grp->lg_linkid = linkid; 1478 grp->lg_zoneid = crgetzoneid(credp); 1479 grp->lg_ifspeed = 0; 1480 grp->lg_link_state = LINK_STATE_UNKNOWN; 1481 grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN; 1482 grp->lg_started = B_FALSE; 1483 grp->lg_promisc = B_FALSE; 1484 grp->lg_lacp_done = B_FALSE; 1485 grp->lg_tx_notify_done = B_FALSE; 1486 grp->lg_lacp_head = grp->lg_lacp_tail = NULL; 1487 grp->lg_lacp_rx_thread = thread_create(NULL, 0, 1488 aggr_lacp_rx_thread, grp, 0, &p0, TS_RUN, minclsyspri); 1489 grp->lg_tx_notify_thread = thread_create(NULL, 0, 1490 aggr_tx_notify_thread, grp, 0, &p0, TS_RUN, minclsyspri); 1491 grp->lg_tx_blocked_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) * 1492 MAX_RINGS_PER_GROUP), KM_SLEEP); 1493 grp->lg_tx_blocked_cnt = 0; 1494 bzero(&grp->lg_rx_groups, 1495 sizeof (aggr_pseudo_rx_group_t) * MAX_GROUPS_PER_PORT); 1496 bzero(&grp->lg_tx_group, sizeof (aggr_pseudo_tx_group_t)); 1497 aggr_lacp_init_grp(grp); 1498 1499 /* add MAC ports to group */ 1500 grp->lg_ports = NULL; 1501 grp->lg_nports = 0; 1502 grp->lg_nattached_ports = 0; 1503 grp->lg_ntx_ports = 0; 1504 1505 /* 1506 * If key is not specified by the user, allocate the key. 
1507 */ 1508 if ((key == 0) && ((key = (uint32_t)id_alloc(key_ids)) == 0)) { 1509 err = ENOMEM; 1510 goto bail; 1511 } 1512 grp->lg_key = key; 1513 1514 for (i = 0; i < nports; i++) { 1515 err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, &port); 1516 if (err != 0) 1517 goto bail; 1518 } 1519 1520 grp->lg_rx_group_count = 1; 1521 1522 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 1523 uint_t num_rgroups; 1524 1525 mac_perim_enter_by_mh(port->lp_mh, &mph); 1526 num_rgroups = mac_get_num_rx_groups(port->lp_mh); 1527 mac_perim_exit(mph); 1528 1529 /* 1530 * Utilize all the groups in a port. If some ports 1531 * have less groups than others, then traffic destined 1532 * for the same unicast address may be HW classified 1533 * on some ports but SW classified by aggr when 1534 * arriving on other ports. 1535 */ 1536 grp->lg_rx_group_count = MAX(grp->lg_rx_group_count, 1537 num_rgroups); 1538 } 1539 1540 /* 1541 * There could be cases where the hardware provides more 1542 * groups than aggr can support. Make sure we never go above 1543 * the max aggr can support. 1544 */ 1545 grp->lg_rx_group_count = MIN(grp->lg_rx_group_count, 1546 MAX_GROUPS_PER_PORT); 1547 1548 ASSERT3U(grp->lg_rx_group_count, >, 0); 1549 for (i = 0; i < MAX_GROUPS_PER_PORT; i++) { 1550 grp->lg_rx_groups[i].arg_index = i; 1551 grp->lg_rx_groups[i].arg_untagged = 0; 1552 list_create(&(grp->lg_rx_groups[i].arg_vlans), 1553 sizeof (aggr_vlan_t), offsetof(aggr_vlan_t, av_link)); 1554 } 1555 1556 /* 1557 * If no explicit MAC address was specified by the administrator, 1558 * set it to the MAC address of the first port. 
1559 */ 1560 grp->lg_addr_fixed = mac_fixed; 1561 if (grp->lg_addr_fixed) { 1562 /* validate specified address */ 1563 if (bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) { 1564 err = EINVAL; 1565 goto bail; 1566 } 1567 bcopy(mac_addr, grp->lg_addr, ETHERADDRL); 1568 } else { 1569 bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL); 1570 grp->lg_mac_addr_port = grp->lg_ports; 1571 } 1572 1573 /* Set the initial group capabilities. */ 1574 aggr_grp_capab_set(grp); 1575 1576 if ((mac = mac_alloc(MAC_VERSION)) == NULL) { 1577 err = ENOMEM; 1578 goto bail; 1579 } 1580 mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER; 1581 mac->m_driver = grp; 1582 mac->m_dip = aggr_dip; 1583 mac->m_instance = grp->lg_key > AGGR_MAX_KEY ? (uint_t)-1 : grp->lg_key; 1584 mac->m_src_addr = grp->lg_addr; 1585 mac->m_callbacks = &aggr_m_callbacks; 1586 mac->m_min_sdu = 0; 1587 mac->m_max_sdu = grp->lg_max_sdu = aggr_grp_max_sdu(grp); 1588 mac->m_margin = aggr_grp_max_margin(grp); 1589 mac->m_v12n = MAC_VIRT_LEVEL1; 1590 err = mac_register(mac, &grp->lg_mh); 1591 mac_free(mac); 1592 if (err != 0) 1593 goto bail; 1594 1595 err = dls_devnet_create(grp->lg_mh, grp->lg_linkid, crgetzoneid(credp)); 1596 if (err != 0) { 1597 (void) mac_unregister(grp->lg_mh); 1598 grp->lg_mh = NULL; 1599 goto bail; 1600 } 1601 1602 mac_registered = B_TRUE; 1603 1604 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1605 1606 /* 1607 * Update the MAC address of the constituent ports. 1608 * None of the port is attached at this time, the link state of the 1609 * aggregation will not change. 1610 * 1611 * All ports take on the primary MAC address of the aggr 1612 * (lg_aggr). At this point, none of the ports are attached; 1613 * thus the link state of the aggregation will not change. 1614 */ 1615 link_state_changed = aggr_grp_update_ports_mac(grp); 1616 ASSERT(!link_state_changed); 1617 1618 /* Update outbound load balancing policy. */ 1619 aggr_send_update_policy(grp, policy); 1620 1621 /* Set LACP mode. 
*/ 1622 aggr_lacp_set_mode(grp, lacp_mode, lacp_timer); 1623 1624 /* 1625 * The pseudo Tx group holds a maximum of MAX_RINGS_PER_GROUP 1626 * rings, when all the Tx rings of all the ports are accumulated 1627 * it is conceivable this limit is exceeded. We try and prevent 1628 * this by limiting the number of rings an individual port will use. 1629 * 1630 * - When an aggr is first created, we will not let an 1631 * individual port use more than MAX_RINGS_PER_GROUP/nports 1632 * rings. 1633 * - As ports are added to an existing aggr, each of the 1634 * ports will not use more than MAX_RINGS_PER_GROUP/nports_high. 1635 * Where nports_high is the highest number of ports the aggr has 1636 * held (including any ports being added). This may involve 1637 * trimming rings from existing ports. 1638 */ 1639 1640 /* Leave room for 4 ports */ 1641 tx_ring_limit = MAX_RINGS_PER_GROUP / MAX(4, nports); 1642 1643 /* 1644 * Attach each port if necessary. 1645 */ 1646 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 1647 /* 1648 * Create the pseudo ring for each HW ring of the 1649 * underlying port. Note that this is done after the 1650 * aggr registers its MAC. 1651 */ 1652 err = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group, 1653 tx_ring_limit); 1654 1655 if (err != 0) { 1656 mac_perim_exit(mph); 1657 goto bail; 1658 } 1659 1660 for (i = 0; i < grp->lg_rx_group_count; i++) { 1661 err = aggr_add_pseudo_rx_group(port, 1662 &grp->lg_rx_groups[i]); 1663 1664 if (err != 0) { 1665 /* 1666 * Undo what we have added for the current 1667 * port. 1668 */ 1669 aggr_rem_pseudo_tx_group(port, 1670 &grp->lg_tx_group); 1671 1672 for (j = 0; j < i; j++) { 1673 aggr_rem_pseudo_rx_group(port, 1674 &grp->lg_rx_groups[j]); 1675 } 1676 1677 mac_perim_exit(mph); 1678 goto bail; 1679 } 1680 } 1681 1682 if (aggr_port_notify_link(grp, port)) 1683 link_state_changed = B_TRUE; 1684 1685 /* 1686 * Initialize the callback functions for this port. 
1687 */ 1688 aggr_port_init_callbacks(port); 1689 1690 last_attached = port; 1691 } 1692 1693 if (link_state_changed) 1694 mac_link_update(grp->lg_mh, grp->lg_link_state); 1695 1696 /* add new group to hash table */ 1697 err = mod_hash_insert(aggr_grp_hash, GRP_HASH_KEY(linkid), 1698 (mod_hash_val_t)grp); 1699 ASSERT(err == 0); 1700 aggr_grp_cnt++; 1701 1702 mac_perim_exit(mph); 1703 rw_exit(&aggr_grp_lock); 1704 return (0); 1705 1706 bail: 1707 grp->lg_closing = B_TRUE; 1708 1709 /* 1710 * Inform the lacp_rx thread to exit. 1711 */ 1712 mutex_enter(&grp->lg_lacp_lock); 1713 grp->lg_lacp_done = B_TRUE; 1714 cv_signal(&grp->lg_lacp_cv); 1715 while (grp->lg_lacp_rx_thread != NULL) 1716 cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock); 1717 mutex_exit(&grp->lg_lacp_lock); 1718 /* 1719 * Inform the tx_notify thread to exit. 1720 */ 1721 mutex_enter(&grp->lg_tx_flowctl_lock); 1722 if (grp->lg_tx_notify_thread != NULL) { 1723 tid = grp->lg_tx_notify_thread->t_did; 1724 grp->lg_tx_notify_done = B_TRUE; 1725 cv_signal(&grp->lg_tx_flowctl_cv); 1726 } 1727 mutex_exit(&grp->lg_tx_flowctl_lock); 1728 if (tid != 0) 1729 thread_join(tid); 1730 1731 if (mac_registered) { 1732 (void) dls_devnet_destroy(grp->lg_mh, &tempid, B_TRUE); 1733 (void) mac_disable(grp->lg_mh); 1734 1735 if (last_attached != NULL) { 1736 /* 1737 * Detach and clean up ports added. 
1738 */ 1739 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1740 1741 for (port = grp->lg_ports; ; port = port->lp_next) { 1742 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1743 (void) aggr_grp_detach_port(grp, port); 1744 mac_perim_exit(pmph); 1745 1746 aggr_rem_pseudo_tx_group(port, 1747 &grp->lg_tx_group); 1748 1749 for (i = 0; i < grp->lg_rx_group_count; i++) { 1750 aggr_rem_pseudo_rx_group(port, 1751 &grp->lg_rx_groups[i]); 1752 } 1753 if (port == last_attached) 1754 break; 1755 } 1756 1757 mac_perim_exit(mph); 1758 } 1759 1760 (void) mac_unregister(grp->lg_mh); 1761 } 1762 1763 port = grp->lg_ports; 1764 while (port != NULL) { 1765 aggr_port_t *cport; 1766 1767 cport = port->lp_next; 1768 aggr_port_delete(port); 1769 port = cport; 1770 } 1771 1772 kmem_free(grp->lg_tx_blocked_rings, 1773 (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP)); 1774 rw_exit(&aggr_grp_lock); 1775 AGGR_GRP_REFRELE(grp); 1776 return (err); 1777 } 1778 1779 /* 1780 * Return a pointer to the member of a group with specified linkid. 1781 */ 1782 static aggr_port_t * 1783 aggr_grp_port_lookup(aggr_grp_t *grp, datalink_id_t linkid) 1784 { 1785 aggr_port_t *port; 1786 1787 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 1788 1789 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 1790 if (port->lp_linkid == linkid) 1791 break; 1792 } 1793 1794 return (port); 1795 } 1796 1797 /* 1798 * Stop, detach and remove a port from a link aggregation group. 
1799 */ 1800 static int 1801 aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port, 1802 boolean_t *mac_addr_changedp, boolean_t *link_state_changedp) 1803 { 1804 int rc = 0; 1805 aggr_port_t **pport; 1806 boolean_t mac_addr_changed = B_FALSE; 1807 boolean_t link_state_changed = B_FALSE; 1808 mac_perim_handle_t mph; 1809 uint64_t val; 1810 uint_t i; 1811 uint_t stat; 1812 1813 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 1814 ASSERT(grp->lg_nports > 1); 1815 ASSERT(!grp->lg_closing); 1816 1817 /* unlink port */ 1818 for (pport = &grp->lg_ports; *pport != port; 1819 pport = &(*pport)->lp_next) { 1820 if (*pport == NULL) { 1821 rc = ENOENT; 1822 goto done; 1823 } 1824 } 1825 *pport = port->lp_next; 1826 1827 mac_perim_enter_by_mh(port->lp_mh, &mph); 1828 1829 /* 1830 * If the MAC address of the port being removed was assigned 1831 * to the group, update the group MAC address 1832 * using the MAC address of a different port. 1833 */ 1834 if (!grp->lg_addr_fixed && grp->lg_mac_addr_port == port) { 1835 /* 1836 * Set the MAC address of the group to the 1837 * MAC address of its first port. 1838 */ 1839 bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL); 1840 grp->lg_mac_addr_port = grp->lg_ports; 1841 mac_addr_changed = B_TRUE; 1842 } 1843 1844 link_state_changed = aggr_grp_detach_port(grp, port); 1845 1846 /* 1847 * Add the counter statistics of the ports while it was aggregated 1848 * to the group's residual statistics. This is done by obtaining 1849 * the current counter from the underlying MAC then subtracting the 1850 * value of the counter at the moment it was added to the 1851 * aggregation. 
1852 */ 1853 for (i = 0; i < MAC_NSTAT; i++) { 1854 stat = i + MAC_STAT_MIN; 1855 if (!MAC_STAT_ISACOUNTER(stat)) 1856 continue; 1857 val = aggr_port_stat(port, stat); 1858 val -= port->lp_stat[i]; 1859 mutex_enter(&grp->lg_stat_lock); 1860 grp->lg_stat[i] += val; 1861 mutex_exit(&grp->lg_stat_lock); 1862 } 1863 for (i = 0; i < ETHER_NSTAT; i++) { 1864 stat = i + MACTYPE_STAT_MIN; 1865 if (!ETHER_STAT_ISACOUNTER(stat)) 1866 continue; 1867 val = aggr_port_stat(port, stat); 1868 val -= port->lp_ether_stat[i]; 1869 mutex_enter(&grp->lg_stat_lock); 1870 grp->lg_ether_stat[i] += val; 1871 mutex_exit(&grp->lg_stat_lock); 1872 } 1873 1874 grp->lg_nports--; 1875 mac_perim_exit(mph); 1876 1877 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group); 1878 aggr_port_delete(port); 1879 1880 /* 1881 * If the group MAC address has changed, update the MAC address of 1882 * the remaining constituent ports according to the new MAC 1883 * address of the group. 1884 */ 1885 if (mac_addr_changed && aggr_grp_update_ports_mac(grp)) 1886 link_state_changed = B_TRUE; 1887 1888 done: 1889 if (mac_addr_changedp != NULL) 1890 *mac_addr_changedp = mac_addr_changed; 1891 if (link_state_changedp != NULL) 1892 *link_state_changedp = link_state_changed; 1893 1894 return (rc); 1895 } 1896 1897 /* 1898 * Remove one or more ports from an existing link aggregation group. 
1899 */ 1900 int 1901 aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports) 1902 { 1903 int rc = 0; 1904 uint_t i; 1905 aggr_grp_t *grp = NULL; 1906 aggr_port_t *port; 1907 boolean_t mac_addr_update = B_FALSE, mac_addr_changed; 1908 boolean_t link_state_update = B_FALSE, link_state_changed; 1909 mac_perim_handle_t mph, pmph; 1910 1911 /* get group corresponding to linkid */ 1912 rw_enter(&aggr_grp_lock, RW_READER); 1913 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1914 (mod_hash_val_t *)&grp) != 0) { 1915 rw_exit(&aggr_grp_lock); 1916 return (ENOENT); 1917 } 1918 AGGR_GRP_REFHOLD(grp); 1919 1920 /* 1921 * Hold the perimeter so that the aggregation won't be destroyed. 1922 */ 1923 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1924 rw_exit(&aggr_grp_lock); 1925 1926 /* we need to keep at least one port per group */ 1927 if (nports >= grp->lg_nports) { 1928 rc = EINVAL; 1929 goto bail; 1930 } 1931 1932 /* first verify that all the groups are valid */ 1933 for (i = 0; i < nports; i++) { 1934 if (aggr_grp_port_lookup(grp, ports[i].lp_linkid) == NULL) { 1935 /* port not found */ 1936 rc = ENOENT; 1937 goto bail; 1938 } 1939 } 1940 1941 /* clear the promiscous mode for the specified ports */ 1942 for (i = 0; i < nports && rc == 0; i++) { 1943 /* lookup port */ 1944 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid); 1945 ASSERT(port != NULL); 1946 1947 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1948 rc = aggr_port_promisc(port, B_FALSE); 1949 mac_perim_exit(pmph); 1950 } 1951 if (rc != 0) { 1952 for (i = 0; i < nports; i++) { 1953 port = aggr_grp_port_lookup(grp, 1954 ports[i].lp_linkid); 1955 ASSERT(port != NULL); 1956 1957 /* 1958 * Turn the promiscuous mode back on if it is required 1959 * to receive the non-primary address over a port, or 1960 * the promiscous mode is enabled over the aggr. 
1961 */ 1962 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1963 if (port->lp_started && (grp->lg_promisc || 1964 port->lp_prom_addr != NULL)) { 1965 (void) aggr_port_promisc(port, B_TRUE); 1966 } 1967 mac_perim_exit(pmph); 1968 } 1969 goto bail; 1970 } 1971 1972 /* remove the specified ports from group */ 1973 for (i = 0; i < nports; i++) { 1974 /* lookup port */ 1975 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid); 1976 ASSERT(port != NULL); 1977 1978 /* stop port if group has already been started */ 1979 if (grp->lg_started) { 1980 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1981 aggr_port_stop(port); 1982 mac_perim_exit(pmph); 1983 } 1984 1985 /* 1986 * aggr_rem_pseudo_tx_group() is not called here. Instead 1987 * it is called from inside aggr_grp_rem_port() after the 1988 * port has been detached. The reason is that 1989 * aggr_rem_pseudo_tx_group() removes one ring at a time 1990 * and if there is still traffic going on, then there 1991 * is the possibility of aggr_find_tx_ring() returning a 1992 * removed ring for transmission. Once the port has been 1993 * detached, that port will not be used and 1994 * aggr_find_tx_ring() will not return any rings 1995 * belonging to it. 
1996 */ 1997 for (uint_t j = 0; j < grp->lg_rx_group_count; j++) 1998 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[j]); 1999 2000 /* remove port from group */ 2001 rc = aggr_grp_rem_port(grp, port, &mac_addr_changed, 2002 &link_state_changed); 2003 ASSERT(rc == 0); 2004 mac_addr_update = mac_addr_update || mac_addr_changed; 2005 link_state_update = link_state_update || link_state_changed; 2006 } 2007 2008 bail: 2009 if (mac_addr_update) 2010 mac_unicst_update(grp->lg_mh, grp->lg_addr); 2011 if (link_state_update) 2012 mac_link_update(grp->lg_mh, grp->lg_link_state); 2013 2014 mac_perim_exit(mph); 2015 AGGR_GRP_REFRELE(grp); 2016 2017 return (rc); 2018 } 2019 2020 int 2021 aggr_grp_delete(datalink_id_t linkid, cred_t *cred) 2022 { 2023 aggr_grp_t *grp = NULL; 2024 aggr_port_t *port, *cport; 2025 datalink_id_t tmpid; 2026 mod_hash_val_t val; 2027 mac_perim_handle_t mph, pmph; 2028 int err; 2029 kt_did_t tid = 0; 2030 2031 rw_enter(&aggr_grp_lock, RW_WRITER); 2032 2033 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 2034 (mod_hash_val_t *)&grp) != 0) { 2035 rw_exit(&aggr_grp_lock); 2036 return (ENOENT); 2037 } 2038 2039 /* 2040 * Note that dls_devnet_destroy() must be called before lg_lock is 2041 * held. Otherwise, it will deadlock if another thread is in 2042 * aggr_m_stat() and thus has a kstat_hold() on the kstats that 2043 * dls_devnet_destroy() needs to delete. 2044 */ 2045 if ((err = dls_devnet_destroy(grp->lg_mh, &tmpid, B_TRUE)) != 0) { 2046 rw_exit(&aggr_grp_lock); 2047 return (err); 2048 } 2049 ASSERT(linkid == tmpid); 2050 2051 /* 2052 * Unregister from the MAC service module. Since this can 2053 * fail if a client hasn't closed the MAC port, we gracefully 2054 * fail the operation. 
2055 */ 2056 if ((err = mac_disable(grp->lg_mh)) != 0) { 2057 (void) dls_devnet_create(grp->lg_mh, linkid, crgetzoneid(cred)); 2058 rw_exit(&aggr_grp_lock); 2059 return (err); 2060 } 2061 (void) mod_hash_remove(aggr_grp_hash, GRP_HASH_KEY(linkid), &val); 2062 ASSERT(grp == (aggr_grp_t *)val); 2063 2064 ASSERT(aggr_grp_cnt > 0); 2065 aggr_grp_cnt--; 2066 rw_exit(&aggr_grp_lock); 2067 2068 /* 2069 * Inform the lacp_rx thread to exit. 2070 */ 2071 mutex_enter(&grp->lg_lacp_lock); 2072 grp->lg_lacp_done = B_TRUE; 2073 cv_signal(&grp->lg_lacp_cv); 2074 while (grp->lg_lacp_rx_thread != NULL) 2075 cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock); 2076 mutex_exit(&grp->lg_lacp_lock); 2077 /* 2078 * Inform the tx_notify_thread to exit. 2079 */ 2080 mutex_enter(&grp->lg_tx_flowctl_lock); 2081 if (grp->lg_tx_notify_thread != NULL) { 2082 tid = grp->lg_tx_notify_thread->t_did; 2083 grp->lg_tx_notify_done = B_TRUE; 2084 cv_signal(&grp->lg_tx_flowctl_cv); 2085 } 2086 mutex_exit(&grp->lg_tx_flowctl_lock); 2087 if (tid != 0) 2088 thread_join(tid); 2089 2090 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2091 2092 grp->lg_closing = B_TRUE; 2093 /* detach and free MAC ports associated with group */ 2094 port = grp->lg_ports; 2095 while (port != NULL) { 2096 cport = port->lp_next; 2097 mac_perim_enter_by_mh(port->lp_mh, &pmph); 2098 if (grp->lg_started) 2099 aggr_port_stop(port); 2100 (void) aggr_grp_detach_port(grp, port); 2101 mac_perim_exit(pmph); 2102 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group); 2103 for (uint_t i = 0; i < grp->lg_rx_group_count; i++) 2104 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[i]); 2105 aggr_port_delete(port); 2106 port = cport; 2107 } 2108 2109 mac_perim_exit(mph); 2110 2111 kmem_free(grp->lg_tx_blocked_rings, 2112 (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP)); 2113 /* 2114 * Wait for the port's lacp timer thread and its notification callback 2115 * to exit before calling mac_unregister() since both needs to access 2116 * the mac perimeter of 
the grp. 2117 */ 2118 aggr_grp_port_wait(grp); 2119 2120 VERIFY(mac_unregister(grp->lg_mh) == 0); 2121 grp->lg_mh = NULL; 2122 2123 for (uint_t i = 0; i < MAX_GROUPS_PER_PORT; i++) { 2124 list_destroy(&(grp->lg_rx_groups[i].arg_vlans)); 2125 } 2126 2127 AGGR_GRP_REFRELE(grp); 2128 return (0); 2129 } 2130 2131 void 2132 aggr_grp_free(aggr_grp_t *grp) 2133 { 2134 ASSERT(grp->lg_refs == 0); 2135 ASSERT(grp->lg_port_ref == 0); 2136 if (grp->lg_key > AGGR_MAX_KEY) { 2137 id_free(key_ids, grp->lg_key); 2138 grp->lg_key = 0; 2139 } 2140 kmem_cache_free(aggr_grp_cache, grp); 2141 } 2142 2143 int 2144 aggr_grp_info(datalink_id_t linkid, void *fn_arg, 2145 aggr_grp_info_new_grp_fn_t new_grp_fn, 2146 aggr_grp_info_new_port_fn_t new_port_fn, cred_t *cred) 2147 { 2148 aggr_grp_t *grp; 2149 aggr_port_t *port; 2150 mac_perim_handle_t mph, pmph; 2151 int rc = 0; 2152 2153 /* 2154 * Make sure that the aggregation link is visible from the caller's 2155 * zone. 2156 */ 2157 if (!dls_devnet_islinkvisible(linkid, crgetzoneid(cred))) 2158 return (ENOENT); 2159 2160 rw_enter(&aggr_grp_lock, RW_READER); 2161 2162 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 2163 (mod_hash_val_t *)&grp) != 0) { 2164 rw_exit(&aggr_grp_lock); 2165 return (ENOENT); 2166 } 2167 AGGR_GRP_REFHOLD(grp); 2168 2169 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2170 rw_exit(&aggr_grp_lock); 2171 2172 rc = new_grp_fn(fn_arg, grp->lg_linkid, 2173 (grp->lg_key > AGGR_MAX_KEY) ? 
0 : grp->lg_key, grp->lg_addr, 2174 grp->lg_addr_fixed, grp->lg_force, grp->lg_tx_policy, 2175 grp->lg_nports, grp->lg_lacp_mode, grp->aggr.PeriodicTimer); 2176 2177 if (rc != 0) 2178 goto bail; 2179 2180 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2181 mac_perim_enter_by_mh(port->lp_mh, &pmph); 2182 rc = new_port_fn(fn_arg, port->lp_linkid, port->lp_addr, 2183 port->lp_state, &port->lp_lacp.ActorOperPortState); 2184 mac_perim_exit(pmph); 2185 2186 if (rc != 0) 2187 goto bail; 2188 } 2189 2190 bail: 2191 mac_perim_exit(mph); 2192 AGGR_GRP_REFRELE(grp); 2193 return (rc); 2194 } 2195 2196 /*ARGSUSED*/ 2197 static void 2198 aggr_m_ioctl(void *arg, queue_t *q, mblk_t *mp) 2199 { 2200 miocnak(q, mp, 0, ENOTSUP); 2201 } 2202 2203 static int 2204 aggr_grp_stat(aggr_grp_t *grp, uint_t stat, uint64_t *val) 2205 { 2206 aggr_port_t *port; 2207 uint_t stat_index; 2208 2209 ASSERT(MUTEX_HELD(&grp->lg_stat_lock)); 2210 2211 /* We only aggregate counter statistics. */ 2212 if (IS_MAC_STAT(stat) && !MAC_STAT_ISACOUNTER(stat) || 2213 IS_MACTYPE_STAT(stat) && !ETHER_STAT_ISACOUNTER(stat)) { 2214 return (ENOTSUP); 2215 } 2216 2217 /* 2218 * Counter statistics for a group are computed by aggregating the 2219 * counters of the members MACs while they were aggregated, plus 2220 * the residual counter of the group itself, which is updated each 2221 * time a MAC is removed from the group. 2222 */ 2223 *val = 0; 2224 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2225 /* actual port statistic */ 2226 *val += aggr_port_stat(port, stat); 2227 /* 2228 * minus the port stat when it was added, plus any residual 2229 * amount for the group. 
2230 */ 2231 if (IS_MAC_STAT(stat)) { 2232 stat_index = stat - MAC_STAT_MIN; 2233 *val -= port->lp_stat[stat_index]; 2234 *val += grp->lg_stat[stat_index]; 2235 } else if (IS_MACTYPE_STAT(stat)) { 2236 stat_index = stat - MACTYPE_STAT_MIN; 2237 *val -= port->lp_ether_stat[stat_index]; 2238 *val += grp->lg_ether_stat[stat_index]; 2239 } 2240 } 2241 return (0); 2242 } 2243 2244 int 2245 aggr_rx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val) 2246 { 2247 aggr_pseudo_rx_ring_t *rx_ring = (aggr_pseudo_rx_ring_t *)rdriver; 2248 2249 if (rx_ring->arr_hw_rh != NULL) { 2250 *val = mac_pseudo_rx_ring_stat_get(rx_ring->arr_hw_rh, stat); 2251 } else { 2252 aggr_port_t *port = rx_ring->arr_port; 2253 2254 *val = mac_stat_get(port->lp_mh, stat); 2255 2256 } 2257 return (0); 2258 } 2259 2260 int 2261 aggr_tx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val) 2262 { 2263 aggr_pseudo_tx_ring_t *tx_ring = (aggr_pseudo_tx_ring_t *)rdriver; 2264 2265 if (tx_ring->atr_hw_rh != NULL) { 2266 *val = mac_pseudo_tx_ring_stat_get(tx_ring->atr_hw_rh, stat); 2267 } else { 2268 aggr_port_t *port = tx_ring->atr_port; 2269 2270 *val = mac_stat_get(port->lp_mh, stat); 2271 } 2272 return (0); 2273 } 2274 2275 static int 2276 aggr_m_stat(void *arg, uint_t stat, uint64_t *val) 2277 { 2278 aggr_grp_t *grp = arg; 2279 int rval = 0; 2280 2281 mutex_enter(&grp->lg_stat_lock); 2282 2283 switch (stat) { 2284 case MAC_STAT_IFSPEED: 2285 *val = grp->lg_ifspeed; 2286 break; 2287 2288 case ETHER_STAT_LINK_DUPLEX: 2289 *val = grp->lg_link_duplex; 2290 break; 2291 2292 default: 2293 /* 2294 * For all other statistics, we return the aggregated stat 2295 * from the underlying ports. aggr_grp_stat() will set 2296 * rval appropriately if the statistic isn't a counter. 
2297 */ 2298 rval = aggr_grp_stat(grp, stat, val); 2299 } 2300 2301 mutex_exit(&grp->lg_stat_lock); 2302 return (rval); 2303 } 2304 2305 static int 2306 aggr_m_start(void *arg) 2307 { 2308 aggr_grp_t *grp = arg; 2309 aggr_port_t *port; 2310 mac_perim_handle_t mph, pmph; 2311 2312 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2313 2314 /* 2315 * Attempts to start all configured members of the group. 2316 * Group members will be attached when their link-up notification 2317 * is received. 2318 */ 2319 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2320 mac_perim_enter_by_mh(port->lp_mh, &pmph); 2321 if (aggr_port_start(port) != 0) { 2322 mac_perim_exit(pmph); 2323 continue; 2324 } 2325 2326 /* 2327 * Turn on the promiscuous mode if it is required to receive 2328 * the non-primary address over a port, or the promiscous 2329 * mode is enabled over the aggr. 2330 */ 2331 if (grp->lg_promisc || port->lp_prom_addr != NULL) { 2332 if (aggr_port_promisc(port, B_TRUE) != 0) 2333 aggr_port_stop(port); 2334 } 2335 mac_perim_exit(pmph); 2336 } 2337 2338 grp->lg_started = B_TRUE; 2339 2340 mac_perim_exit(mph); 2341 return (0); 2342 } 2343 2344 static void 2345 aggr_m_stop(void *arg) 2346 { 2347 aggr_grp_t *grp = arg; 2348 aggr_port_t *port; 2349 mac_perim_handle_t mph, pmph; 2350 2351 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2352 2353 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2354 mac_perim_enter_by_mh(port->lp_mh, &pmph); 2355 2356 /* reset port promiscuous mode */ 2357 (void) aggr_port_promisc(port, B_FALSE); 2358 2359 aggr_port_stop(port); 2360 mac_perim_exit(pmph); 2361 } 2362 2363 grp->lg_started = B_FALSE; 2364 mac_perim_exit(mph); 2365 } 2366 2367 static int 2368 aggr_m_promisc(void *arg, boolean_t on) 2369 { 2370 aggr_grp_t *grp = arg; 2371 aggr_port_t *port; 2372 boolean_t link_state_changed = B_FALSE; 2373 mac_perim_handle_t mph, pmph; 2374 2375 AGGR_GRP_REFHOLD(grp); 2376 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2377 2378 
ASSERT(!grp->lg_closing); 2379 2380 if (on == grp->lg_promisc) 2381 goto bail; 2382 2383 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2384 int err = 0; 2385 2386 mac_perim_enter_by_mh(port->lp_mh, &pmph); 2387 AGGR_PORT_REFHOLD(port); 2388 if (!on && (port->lp_prom_addr == NULL)) 2389 err = aggr_port_promisc(port, B_FALSE); 2390 else if (on && port->lp_started) 2391 err = aggr_port_promisc(port, B_TRUE); 2392 2393 if (err != 0) { 2394 if (aggr_grp_detach_port(grp, port)) 2395 link_state_changed = B_TRUE; 2396 } else { 2397 /* 2398 * If a port was detached because of a previous 2399 * failure changing the promiscuity, the port 2400 * is reattached when it successfully changes 2401 * the promiscuity now, and this might cause 2402 * the link state of the aggregation to change. 2403 */ 2404 if (aggr_grp_attach_port(grp, port)) 2405 link_state_changed = B_TRUE; 2406 } 2407 mac_perim_exit(pmph); 2408 AGGR_PORT_REFRELE(port); 2409 } 2410 2411 grp->lg_promisc = on; 2412 2413 if (link_state_changed) 2414 mac_link_update(grp->lg_mh, grp->lg_link_state); 2415 2416 bail: 2417 mac_perim_exit(mph); 2418 AGGR_GRP_REFRELE(grp); 2419 2420 return (0); 2421 } 2422 2423 static void 2424 aggr_grp_port_rename(const char *new_name, void *arg) 2425 { 2426 /* 2427 * aggr port's mac client name is the format of "aggr link name" plus 2428 * AGGR_PORT_NAME_DELIMIT plus "underneath link name". 
2429 */ 2430 int aggr_len, link_len, clnt_name_len, i; 2431 char *str_end, *str_st, *str_del; 2432 char aggr_name[MAXNAMELEN]; 2433 char link_name[MAXNAMELEN]; 2434 char *clnt_name; 2435 aggr_grp_t *aggr_grp = arg; 2436 aggr_port_t *aggr_port = aggr_grp->lg_ports; 2437 2438 for (i = 0; i < aggr_grp->lg_nports; i++) { 2439 clnt_name = mac_client_name(aggr_port->lp_mch); 2440 clnt_name_len = strlen(clnt_name); 2441 str_st = clnt_name; 2442 str_end = &(clnt_name[clnt_name_len]); 2443 str_del = strchr(str_st, AGGR_PORT_NAME_DELIMIT); 2444 ASSERT(str_del != NULL); 2445 aggr_len = (intptr_t)((uintptr_t)str_del - (uintptr_t)str_st); 2446 link_len = (intptr_t)((uintptr_t)str_end - (uintptr_t)str_del); 2447 bzero(aggr_name, MAXNAMELEN); 2448 bzero(link_name, MAXNAMELEN); 2449 bcopy(clnt_name, aggr_name, aggr_len); 2450 bcopy(str_del, link_name, link_len + 1); 2451 bzero(clnt_name, MAXNAMELEN); 2452 (void) snprintf(clnt_name, MAXNAMELEN, "%s%s", new_name, 2453 link_name); 2454 2455 (void) mac_rename_primary(aggr_port->lp_mh, NULL); 2456 aggr_port = aggr_port->lp_next; 2457 } 2458 } 2459 2460 /* 2461 * Initialize the capabilities that are advertised for the group 2462 * according to the capabilities of the constituent ports. 
2463 */ 2464 static boolean_t 2465 aggr_m_capab_get(void *arg, mac_capab_t cap, void *cap_data) 2466 { 2467 aggr_grp_t *grp = arg; 2468 2469 switch (cap) { 2470 case MAC_CAPAB_HCKSUM: { 2471 uint32_t *hcksum_txflags = cap_data; 2472 *hcksum_txflags = grp->lg_hcksum_txflags; 2473 break; 2474 } 2475 case MAC_CAPAB_LSO: { 2476 mac_capab_lso_t *cap_lso = cap_data; 2477 2478 if (grp->lg_lso) { 2479 *cap_lso = grp->lg_cap_lso; 2480 break; 2481 } else { 2482 return (B_FALSE); 2483 } 2484 } 2485 case MAC_CAPAB_NO_NATIVEVLAN: 2486 return (!grp->lg_vlan); 2487 case MAC_CAPAB_NO_ZCOPY: 2488 return (!grp->lg_zcopy); 2489 case MAC_CAPAB_RINGS: { 2490 mac_capab_rings_t *cap_rings = cap_data; 2491 uint_t ring_cnt = 0; 2492 2493 for (uint_t i = 0; i < grp->lg_rx_group_count; i++) 2494 ring_cnt += grp->lg_rx_groups[i].arg_ring_cnt; 2495 2496 if (cap_rings->mr_type == MAC_RING_TYPE_RX) { 2497 cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; 2498 cap_rings->mr_rnum = ring_cnt; 2499 cap_rings->mr_gnum = grp->lg_rx_group_count; 2500 cap_rings->mr_gaddring = NULL; 2501 cap_rings->mr_gremring = NULL; 2502 } else { 2503 cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; 2504 cap_rings->mr_rnum = grp->lg_tx_group.atg_ring_cnt; 2505 cap_rings->mr_gnum = 0; 2506 } 2507 cap_rings->mr_rget = aggr_fill_ring; 2508 cap_rings->mr_gget = aggr_fill_group; 2509 break; 2510 } 2511 case MAC_CAPAB_AGGR: 2512 { 2513 mac_capab_aggr_t *aggr_cap; 2514 2515 if (cap_data != NULL) { 2516 aggr_cap = cap_data; 2517 aggr_cap->mca_rename_fn = aggr_grp_port_rename; 2518 aggr_cap->mca_unicst = aggr_m_unicst; 2519 aggr_cap->mca_find_tx_ring_fn = aggr_find_tx_ring; 2520 aggr_cap->mca_arg = arg; 2521 } 2522 return (B_TRUE); 2523 } 2524 default: 2525 return (B_FALSE); 2526 } 2527 return (B_TRUE); 2528 } 2529 2530 /* 2531 * Callback function for MAC layer to register groups. 
2532 */ 2533 static void 2534 aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index, 2535 mac_group_info_t *infop, mac_group_handle_t gh) 2536 { 2537 aggr_grp_t *grp = arg; 2538 2539 if (rtype == MAC_RING_TYPE_RX) { 2540 aggr_pseudo_rx_group_t *rx_group = &grp->lg_rx_groups[index]; 2541 2542 rx_group->arg_gh = gh; 2543 rx_group->arg_grp = grp; 2544 2545 infop->mgi_driver = (mac_group_driver_t)rx_group; 2546 infop->mgi_start = NULL; 2547 infop->mgi_stop = NULL; 2548 infop->mgi_addmac = aggr_addmac; 2549 infop->mgi_remmac = aggr_remmac; 2550 infop->mgi_count = rx_group->arg_ring_cnt; 2551 2552 /* 2553 * Always set the HW VLAN callbacks. They are smart 2554 * enough to know when a port has HW VLAN filters to 2555 * program and when it doesn't. 2556 */ 2557 infop->mgi_addvlan = aggr_addvlan; 2558 infop->mgi_remvlan = aggr_remvlan; 2559 } else { 2560 aggr_pseudo_tx_group_t *tx_group = &grp->lg_tx_group; 2561 2562 ASSERT3S(index, ==, 0); 2563 tx_group->atg_gh = gh; 2564 } 2565 } 2566 2567 /* 2568 * Callback funtion for MAC layer to register all rings. 2569 */ 2570 static void 2571 aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index, 2572 const int index, mac_ring_info_t *infop, mac_ring_handle_t rh) 2573 { 2574 aggr_grp_t *grp = arg; 2575 2576 switch (rtype) { 2577 case MAC_RING_TYPE_RX: { 2578 aggr_pseudo_rx_group_t *rx_group; 2579 aggr_pseudo_rx_ring_t *rx_ring; 2580 mac_intr_t aggr_mac_intr; 2581 2582 rx_group = &grp->lg_rx_groups[rg_index]; 2583 ASSERT3S(index, >=, 0); 2584 ASSERT3S(index, <, rx_group->arg_ring_cnt); 2585 rx_ring = rx_group->arg_rings + index; 2586 rx_ring->arr_rh = rh; 2587 2588 /* 2589 * Entrypoint to enable interrupt (disable poll) and 2590 * disable interrupt (enable poll). 
2591 */ 2592 aggr_mac_intr.mi_handle = (mac_intr_handle_t)rx_ring; 2593 aggr_mac_intr.mi_enable = aggr_pseudo_enable_intr; 2594 aggr_mac_intr.mi_disable = aggr_pseudo_disable_intr; 2595 aggr_mac_intr.mi_ddi_handle = NULL; 2596 2597 infop->mri_driver = (mac_ring_driver_t)rx_ring; 2598 infop->mri_start = aggr_pseudo_start_rx_ring; 2599 infop->mri_stop = aggr_pseudo_stop_rx_ring; 2600 2601 infop->mri_intr = aggr_mac_intr; 2602 infop->mri_poll = aggr_rx_poll; 2603 2604 infop->mri_stat = aggr_rx_ring_stat; 2605 break; 2606 } 2607 case MAC_RING_TYPE_TX: { 2608 aggr_pseudo_tx_group_t *tx_group = &grp->lg_tx_group; 2609 aggr_pseudo_tx_ring_t *tx_ring; 2610 2611 ASSERT(rg_index == -1); 2612 ASSERT(index < tx_group->atg_ring_cnt); 2613 2614 tx_ring = &tx_group->atg_rings[index]; 2615 tx_ring->atr_rh = rh; 2616 2617 infop->mri_driver = (mac_ring_driver_t)tx_ring; 2618 infop->mri_start = NULL; 2619 infop->mri_stop = NULL; 2620 infop->mri_tx = aggr_ring_tx; 2621 infop->mri_stat = aggr_tx_ring_stat; 2622 /* 2623 * Use the hw TX ring handle to find if the ring needs 2624 * serialization or not. For NICs that do not expose 2625 * Tx rings, atr_hw_rh will be NULL. 
2626 */ 2627 if (tx_ring->atr_hw_rh != NULL) { 2628 infop->mri_flags = 2629 mac_hwring_getinfo(tx_ring->atr_hw_rh); 2630 } 2631 break; 2632 } 2633 default: 2634 break; 2635 } 2636 } 2637 2638 static mblk_t * 2639 aggr_rx_poll(void *arg, int bytes_to_pickup) 2640 { 2641 aggr_pseudo_rx_ring_t *rr_ring = arg; 2642 aggr_port_t *port = rr_ring->arr_port; 2643 aggr_grp_t *grp = port->lp_grp; 2644 mblk_t *mp_chain, *mp, **mpp; 2645 2646 mp_chain = mac_hwring_poll(rr_ring->arr_hw_rh, bytes_to_pickup); 2647 2648 if (grp->lg_lacp_mode == AGGR_LACP_OFF) 2649 return (mp_chain); 2650 2651 mpp = &mp_chain; 2652 while ((mp = *mpp) != NULL) { 2653 if (MBLKL(mp) >= sizeof (struct ether_header)) { 2654 struct ether_header *ehp; 2655 2656 ehp = (struct ether_header *)mp->b_rptr; 2657 if (ntohs(ehp->ether_type) == ETHERTYPE_SLOW) { 2658 *mpp = mp->b_next; 2659 mp->b_next = NULL; 2660 aggr_recv_lacp(port, 2661 (mac_resource_handle_t)rr_ring, mp); 2662 continue; 2663 } 2664 } 2665 2666 if (!port->lp_collector_enabled) { 2667 *mpp = mp->b_next; 2668 mp->b_next = NULL; 2669 freemsg(mp); 2670 continue; 2671 } 2672 mpp = &mp->b_next; 2673 } 2674 return (mp_chain); 2675 } 2676 2677 static int 2678 aggr_addmac(void *arg, const uint8_t *mac_addr) 2679 { 2680 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg; 2681 aggr_unicst_addr_t *addr, **pprev; 2682 aggr_grp_t *grp = rx_group->arg_grp; 2683 aggr_port_t *port, *p; 2684 mac_perim_handle_t mph; 2685 int err = 0; 2686 uint_t idx = rx_group->arg_index; 2687 2688 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2689 2690 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) { 2691 mac_perim_exit(mph); 2692 return (0); 2693 } 2694 2695 /* 2696 * Insert this mac address into the list of mac addresses owned by 2697 * the aggregation pseudo group. 
2698 */ 2699 pprev = &rx_group->arg_macaddr; 2700 while ((addr = *pprev) != NULL) { 2701 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) == 0) { 2702 mac_perim_exit(mph); 2703 return (EEXIST); 2704 } 2705 pprev = &addr->aua_next; 2706 } 2707 addr = kmem_alloc(sizeof (aggr_unicst_addr_t), KM_SLEEP); 2708 bcopy(mac_addr, addr->aua_addr, ETHERADDRL); 2709 addr->aua_next = NULL; 2710 *pprev = addr; 2711 2712 for (port = grp->lg_ports; port != NULL; port = port->lp_next) 2713 if ((err = aggr_port_addmac(port, idx, mac_addr)) != 0) 2714 break; 2715 2716 if (err != 0) { 2717 for (p = grp->lg_ports; p != port; p = p->lp_next) 2718 aggr_port_remmac(p, idx, mac_addr); 2719 2720 *pprev = NULL; 2721 kmem_free(addr, sizeof (aggr_unicst_addr_t)); 2722 } 2723 2724 mac_perim_exit(mph); 2725 return (err); 2726 } 2727 2728 static int 2729 aggr_remmac(void *arg, const uint8_t *mac_addr) 2730 { 2731 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg; 2732 aggr_unicst_addr_t *addr, **pprev; 2733 aggr_grp_t *grp = rx_group->arg_grp; 2734 aggr_port_t *port; 2735 mac_perim_handle_t mph; 2736 int err = 0; 2737 2738 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2739 2740 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) { 2741 mac_perim_exit(mph); 2742 return (0); 2743 } 2744 2745 /* 2746 * Insert this mac address into the list of mac addresses owned by 2747 * the aggregation pseudo group. 
2748 */ 2749 pprev = &rx_group->arg_macaddr; 2750 while ((addr = *pprev) != NULL) { 2751 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) != 0) { 2752 pprev = &addr->aua_next; 2753 continue; 2754 } 2755 break; 2756 } 2757 if (addr == NULL) { 2758 mac_perim_exit(mph); 2759 return (EINVAL); 2760 } 2761 2762 for (port = grp->lg_ports; port != NULL; port = port->lp_next) 2763 aggr_port_remmac(port, rx_group->arg_index, mac_addr); 2764 2765 *pprev = addr->aua_next; 2766 kmem_free(addr, sizeof (aggr_unicst_addr_t)); 2767 2768 mac_perim_exit(mph); 2769 return (err); 2770 } 2771 2772 /* 2773 * Search for VID in the Rx group's list and return a pointer if 2774 * found. Otherwise return NULL. 2775 */ 2776 static aggr_vlan_t * 2777 aggr_find_vlan(aggr_pseudo_rx_group_t *rx_group, uint16_t vid) 2778 { 2779 ASSERT(MAC_PERIM_HELD(rx_group->arg_grp->lg_mh)); 2780 for (aggr_vlan_t *avp = list_head(&rx_group->arg_vlans); avp != NULL; 2781 avp = list_next(&rx_group->arg_vlans, avp)) { 2782 if (avp->av_vid == vid) 2783 return (avp); 2784 } 2785 2786 return (NULL); 2787 } 2788 2789 /* 2790 * Accept traffic on the specified VID. 2791 * 2792 * Persist VLAN state in the aggr so that ports added later will 2793 * receive the correct filters. In the future it would be nice to 2794 * allow aggr to iterate its clients instead of duplicating state. 2795 */ 2796 static int 2797 aggr_addvlan(mac_group_driver_t gdriver, uint16_t vid) 2798 { 2799 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)gdriver; 2800 aggr_grp_t *aggr = rx_group->arg_grp; 2801 aggr_port_t *port, *p; 2802 mac_perim_handle_t mph; 2803 int err = 0; 2804 aggr_vlan_t *avp = NULL; 2805 uint_t idx = rx_group->arg_index; 2806 2807 mac_perim_enter_by_mh(aggr->lg_mh, &mph); 2808 2809 if (vid == MAC_VLAN_UNTAGGED) { 2810 /* 2811 * Aggr is both a MAC provider and MAC client. As a 2812 * MAC provider it is passed MAC_VLAN_UNTAGGED by its 2813 * client. As a client itself, it should pass 2814 * VLAN_ID_NONE to its ports. 
2815 */ 2816 vid = VLAN_ID_NONE; 2817 rx_group->arg_untagged++; 2818 goto update_ports; 2819 } 2820 2821 avp = aggr_find_vlan(rx_group, vid); 2822 2823 if (avp != NULL) { 2824 avp->av_refs++; 2825 mac_perim_exit(mph); 2826 return (0); 2827 } 2828 2829 avp = kmem_zalloc(sizeof (aggr_vlan_t), KM_SLEEP); 2830 avp->av_vid = vid; 2831 avp->av_refs = 1; 2832 2833 update_ports: 2834 for (port = aggr->lg_ports; port != NULL; port = port->lp_next) 2835 if ((err = aggr_port_addvlan(port, idx, vid)) != 0) 2836 break; 2837 2838 if (err != 0) { 2839 /* 2840 * If any of these calls fail then we are in a 2841 * situation where the ports have different HW state. 2842 * There's no reasonable action the MAC client can 2843 * take in this scenario to rectify the situation. 2844 */ 2845 for (p = aggr->lg_ports; p != port; p = p->lp_next) { 2846 int err2; 2847 2848 if ((err2 = aggr_port_remvlan(p, idx, vid)) != 0) { 2849 cmn_err(CE_WARN, "Failed to remove VLAN %u" 2850 " from port %s: errno %d.", vid, 2851 mac_client_name(p->lp_mch), err2); 2852 } 2853 2854 } 2855 2856 if (vid == VLAN_ID_NONE) 2857 rx_group->arg_untagged--; 2858 2859 if (avp != NULL) { 2860 kmem_free(avp, sizeof (aggr_vlan_t)); 2861 avp = NULL; 2862 } 2863 } 2864 2865 if (avp != NULL) 2866 list_insert_tail(&rx_group->arg_vlans, avp); 2867 2868 done: 2869 mac_perim_exit(mph); 2870 return (err); 2871 } 2872 2873 /* 2874 * Stop accepting traffic on this VLAN if it's the last use of this VLAN. 2875 */ 2876 static int 2877 aggr_remvlan(mac_group_driver_t gdriver, uint16_t vid) 2878 { 2879 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)gdriver; 2880 aggr_grp_t *aggr = rx_group->arg_grp; 2881 aggr_port_t *port, *p; 2882 mac_perim_handle_t mph; 2883 int err = 0; 2884 aggr_vlan_t *avp = NULL; 2885 uint_t idx = rx_group->arg_index; 2886 2887 mac_perim_enter_by_mh(aggr->lg_mh, &mph); 2888 2889 /* 2890 * See the comment in aggr_addvlan(). 
2891 */ 2892 if (vid == MAC_VLAN_UNTAGGED) { 2893 vid = VLAN_ID_NONE; 2894 rx_group->arg_untagged--; 2895 2896 if (rx_group->arg_untagged > 0) 2897 goto done; 2898 2899 goto update_ports; 2900 } 2901 2902 avp = aggr_find_vlan(rx_group, vid); 2903 2904 if (avp == NULL) { 2905 err = ENOENT; 2906 goto done; 2907 } 2908 2909 avp->av_refs--; 2910 2911 if (avp->av_refs > 0) 2912 goto done; 2913 2914 update_ports: 2915 for (port = aggr->lg_ports; port != NULL; port = port->lp_next) 2916 if ((err = aggr_port_remvlan(port, idx, vid)) != 0) 2917 break; 2918 2919 /* 2920 * See the comment in aggr_addvlan() for justification of the 2921 * use of VERIFY here. 2922 */ 2923 if (err != 0) { 2924 for (p = aggr->lg_ports; p != port; p = p->lp_next) { 2925 int err2; 2926 2927 if ((err2 = aggr_port_addvlan(p, idx, vid)) != 0) { 2928 cmn_err(CE_WARN, "Failed to add VLAN %u" 2929 " to port %s: errno %d.", vid, 2930 mac_client_name(p->lp_mch), err2); 2931 } 2932 } 2933 2934 if (avp != NULL) 2935 avp->av_refs++; 2936 2937 if (vid == VLAN_ID_NONE) 2938 rx_group->arg_untagged++; 2939 2940 goto done; 2941 } 2942 2943 if (err == 0 && avp != NULL) { 2944 VERIFY3U(avp->av_refs, ==, 0); 2945 list_remove(&rx_group->arg_vlans, avp); 2946 kmem_free(avp, sizeof (aggr_vlan_t)); 2947 } 2948 2949 done: 2950 mac_perim_exit(mph); 2951 return (err); 2952 } 2953 2954 /* 2955 * Add or remove the multicast addresses that are defined for the group 2956 * to or from the specified port. 2957 * 2958 * Note that aggr_grp_multicst_port(..., B_TRUE) is called when the port 2959 * is started and attached, and aggr_grp_multicst_port(..., B_FALSE) is 2960 * called when the port is either stopped or detached. 
2961 */ 2962 void 2963 aggr_grp_multicst_port(aggr_port_t *port, boolean_t add) 2964 { 2965 aggr_grp_t *grp = port->lp_grp; 2966 2967 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 2968 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 2969 2970 if (!port->lp_started || port->lp_state != AGGR_PORT_STATE_ATTACHED) 2971 return; 2972 2973 mac_multicast_refresh(grp->lg_mh, aggr_port_multicst, port, add); 2974 } 2975 2976 static int 2977 aggr_m_multicst(void *arg, boolean_t add, const uint8_t *addrp) 2978 { 2979 aggr_grp_t *grp = arg; 2980 aggr_port_t *port = NULL, *errport = NULL; 2981 mac_perim_handle_t mph; 2982 int err = 0; 2983 2984 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2985 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2986 if (port->lp_state != AGGR_PORT_STATE_ATTACHED || 2987 !port->lp_started) { 2988 continue; 2989 } 2990 err = aggr_port_multicst(port, add, addrp); 2991 if (err != 0) { 2992 errport = port; 2993 break; 2994 } 2995 } 2996 2997 /* 2998 * At least one port caused error return and this error is returned to 2999 * mac, eventually a NAK would be sent upwards. 3000 * Some ports have this multicast address listed now, and some don't. 3001 * Treat this error as a whole aggr failure not individual port failure. 3002 * Therefore remove this multicast address from other ports. 
3003 */ 3004 if ((err != 0) && add) { 3005 for (port = grp->lg_ports; port != errport; 3006 port = port->lp_next) { 3007 if (port->lp_state != AGGR_PORT_STATE_ATTACHED || 3008 !port->lp_started) { 3009 continue; 3010 } 3011 (void) aggr_port_multicst(port, B_FALSE, addrp); 3012 } 3013 } 3014 mac_perim_exit(mph); 3015 return (err); 3016 } 3017 3018 static int 3019 aggr_m_unicst(void *arg, const uint8_t *macaddr) 3020 { 3021 aggr_grp_t *grp = arg; 3022 mac_perim_handle_t mph; 3023 int err; 3024 3025 mac_perim_enter_by_mh(grp->lg_mh, &mph); 3026 err = aggr_grp_modify_common(grp, AGGR_MODIFY_MAC, 0, B_TRUE, macaddr, 3027 0, 0); 3028 mac_perim_exit(mph); 3029 return (err); 3030 } 3031 3032 /* 3033 * Initialize the capabilities that are advertised for the group 3034 * according to the capabilities of the constituent ports. 3035 */ 3036 static void 3037 aggr_grp_capab_set(aggr_grp_t *grp) 3038 { 3039 uint32_t cksum; 3040 aggr_port_t *port; 3041 mac_capab_lso_t cap_lso; 3042 3043 ASSERT(grp->lg_mh == NULL); 3044 ASSERT(grp->lg_ports != NULL); 3045 3046 grp->lg_hcksum_txflags = (uint32_t)-1; 3047 grp->lg_zcopy = B_TRUE; 3048 grp->lg_vlan = B_TRUE; 3049 3050 grp->lg_lso = B_TRUE; 3051 grp->lg_cap_lso.lso_flags = (t_uscalar_t)-1; 3052 grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max = (t_uscalar_t)-1; 3053 3054 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 3055 if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &cksum)) 3056 cksum = 0; 3057 grp->lg_hcksum_txflags &= cksum; 3058 3059 grp->lg_vlan &= 3060 !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL); 3061 3062 grp->lg_zcopy &= 3063 !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL); 3064 3065 grp->lg_lso &= 3066 mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso); 3067 if (grp->lg_lso) { 3068 grp->lg_cap_lso.lso_flags &= cap_lso.lso_flags; 3069 if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max > 3070 cap_lso.lso_basic_tcp_ipv4.lso_max) 3071 grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max = 3072 
cap_lso.lso_basic_tcp_ipv4.lso_max; 3073 } 3074 } 3075 } 3076 3077 /* 3078 * Checks whether the capabilities of the port being added are compatible 3079 * with the current capabilities of the aggregation. 3080 */ 3081 static boolean_t 3082 aggr_grp_capab_check(aggr_grp_t *grp, aggr_port_t *port) 3083 { 3084 uint32_t hcksum_txflags; 3085 3086 ASSERT(grp->lg_ports != NULL); 3087 3088 if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL)) & 3089 grp->lg_vlan) != grp->lg_vlan) { 3090 return (B_FALSE); 3091 } 3092 3093 if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL)) & 3094 grp->lg_zcopy) != grp->lg_zcopy) { 3095 return (B_FALSE); 3096 } 3097 3098 if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &hcksum_txflags)) { 3099 if (grp->lg_hcksum_txflags != 0) 3100 return (B_FALSE); 3101 } else if ((hcksum_txflags & grp->lg_hcksum_txflags) != 3102 grp->lg_hcksum_txflags) { 3103 return (B_FALSE); 3104 } 3105 3106 if (grp->lg_lso) { 3107 mac_capab_lso_t cap_lso; 3108 3109 if (mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso)) { 3110 if ((grp->lg_cap_lso.lso_flags & cap_lso.lso_flags) != 3111 grp->lg_cap_lso.lso_flags) 3112 return (B_FALSE); 3113 if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max > 3114 cap_lso.lso_basic_tcp_ipv4.lso_max) 3115 return (B_FALSE); 3116 } else { 3117 return (B_FALSE); 3118 } 3119 } 3120 3121 return (B_TRUE); 3122 } 3123 3124 /* 3125 * Returns the maximum SDU according to the SDU of the constituent ports. 
3126 */ 3127 static uint_t 3128 aggr_grp_max_sdu(aggr_grp_t *grp) 3129 { 3130 uint_t max_sdu = (uint_t)-1; 3131 aggr_port_t *port; 3132 3133 ASSERT(grp->lg_ports != NULL); 3134 3135 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 3136 uint_t port_sdu_max; 3137 3138 mac_sdu_get(port->lp_mh, NULL, &port_sdu_max); 3139 if (max_sdu > port_sdu_max) 3140 max_sdu = port_sdu_max; 3141 } 3142 3143 return (max_sdu); 3144 } 3145 3146 /* 3147 * Checks if the maximum SDU of the specified port is compatible 3148 * with the maximum SDU of the specified aggregation group, returns 3149 * B_TRUE if it is, B_FALSE otherwise. 3150 */ 3151 static boolean_t 3152 aggr_grp_sdu_check(aggr_grp_t *grp, aggr_port_t *port) 3153 { 3154 uint_t port_sdu_max; 3155 3156 mac_sdu_get(port->lp_mh, NULL, &port_sdu_max); 3157 return (port_sdu_max >= grp->lg_max_sdu); 3158 } 3159 3160 /* 3161 * Returns the maximum margin according to the margin of the constituent ports. 3162 */ 3163 static uint32_t 3164 aggr_grp_max_margin(aggr_grp_t *grp) 3165 { 3166 uint32_t margin = UINT32_MAX; 3167 aggr_port_t *port; 3168 3169 ASSERT(grp->lg_mh == NULL); 3170 ASSERT(grp->lg_ports != NULL); 3171 3172 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 3173 if (margin > port->lp_margin) 3174 margin = port->lp_margin; 3175 } 3176 3177 grp->lg_margin = margin; 3178 return (margin); 3179 } 3180 3181 /* 3182 * Checks if the maximum margin of the specified port is compatible 3183 * with the maximum margin of the specified aggregation group, returns 3184 * B_TRUE if it is, B_FALSE otherwise. 3185 */ 3186 static boolean_t 3187 aggr_grp_margin_check(aggr_grp_t *grp, aggr_port_t *port) 3188 { 3189 if (port->lp_margin >= grp->lg_margin) 3190 return (B_TRUE); 3191 3192 /* 3193 * See whether the current margin value is allowed to be changed to 3194 * the new value. 
3195 */ 3196 if (!mac_margin_update(grp->lg_mh, port->lp_margin)) 3197 return (B_FALSE); 3198 3199 grp->lg_margin = port->lp_margin; 3200 return (B_TRUE); 3201 } 3202 3203 /* 3204 * Set MTU on individual ports of an aggregation group 3205 */ 3206 static int 3207 aggr_set_port_sdu(aggr_grp_t *grp, aggr_port_t *port, uint32_t sdu, 3208 uint32_t *old_mtu) 3209 { 3210 boolean_t removed = B_FALSE; 3211 mac_perim_handle_t mph; 3212 mac_diag_t diag; 3213 int err, rv, retry = 0; 3214 3215 if (port->lp_mah != NULL) { 3216 (void) mac_unicast_remove(port->lp_mch, port->lp_mah); 3217 port->lp_mah = NULL; 3218 removed = B_TRUE; 3219 } 3220 err = mac_set_mtu(port->lp_mh, sdu, old_mtu); 3221 try_again: 3222 if (removed && (rv = mac_unicast_add(port->lp_mch, NULL, 3223 MAC_UNICAST_PRIMARY | MAC_UNICAST_DISABLE_TX_VID_CHECK, 3224 &port->lp_mah, 0, &diag)) != 0) { 3225 /* 3226 * following is a workaround for a bug in 'bge' driver. 3227 * See CR 6794654 for more information and this work around 3228 * will be removed once the CR is fixed. 3229 */ 3230 if (rv == EIO && retry++ < 3) { 3231 delay(2 * hz); 3232 goto try_again; 3233 } 3234 /* 3235 * if mac_unicast_add() failed while setting the MTU, 3236 * detach the port from the group. 3237 */ 3238 mac_perim_enter_by_mh(port->lp_mh, &mph); 3239 (void) aggr_grp_detach_port(grp, port); 3240 mac_perim_exit(mph); 3241 cmn_err(CE_WARN, "Unable to restart the port %s while " 3242 "setting MTU. 
Detaching the port from the aggregation.", 3243 mac_client_name(port->lp_mch)); 3244 } 3245 return (err); 3246 } 3247 3248 static int 3249 aggr_sdu_update(aggr_grp_t *grp, uint32_t sdu) 3250 { 3251 int err = 0, i, rv; 3252 aggr_port_t *port; 3253 uint32_t *mtu; 3254 3255 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 3256 3257 /* 3258 * If the MTU being set is equal to aggr group's maximum 3259 * allowable value, then there is nothing to change 3260 */ 3261 if (sdu == grp->lg_max_sdu) 3262 return (0); 3263 3264 /* 0 is aggr group's min sdu */ 3265 if (sdu == 0) 3266 return (EINVAL); 3267 3268 mtu = kmem_alloc(sizeof (uint32_t) * grp->lg_nports, KM_SLEEP); 3269 for (port = grp->lg_ports, i = 0; port != NULL && err == 0; 3270 port = port->lp_next, i++) { 3271 err = aggr_set_port_sdu(grp, port, sdu, mtu + i); 3272 } 3273 if (err != 0) { 3274 /* recover from error: reset the mtus of the ports */ 3275 aggr_port_t *tmp; 3276 3277 for (tmp = grp->lg_ports, i = 0; tmp != port; 3278 tmp = tmp->lp_next, i++) { 3279 (void) aggr_set_port_sdu(grp, tmp, *(mtu + i), NULL); 3280 } 3281 goto bail; 3282 } 3283 grp->lg_max_sdu = aggr_grp_max_sdu(grp); 3284 rv = mac_maxsdu_update(grp->lg_mh, grp->lg_max_sdu); 3285 ASSERT(rv == 0); 3286 bail: 3287 kmem_free(mtu, sizeof (uint32_t) * grp->lg_nports); 3288 return (err); 3289 } 3290 3291 /* 3292 * Callback functions for set/get of properties 3293 */ 3294 /*ARGSUSED*/ 3295 static int 3296 aggr_m_setprop(void *m_driver, const char *pr_name, mac_prop_id_t pr_num, 3297 uint_t pr_valsize, const void *pr_val) 3298 { 3299 int err = ENOTSUP; 3300 aggr_grp_t *grp = m_driver; 3301 3302 switch (pr_num) { 3303 case MAC_PROP_MTU: { 3304 uint32_t mtu; 3305 3306 if (pr_valsize < sizeof (mtu)) { 3307 err = EINVAL; 3308 break; 3309 } 3310 bcopy(pr_val, &mtu, sizeof (mtu)); 3311 err = aggr_sdu_update(grp, mtu); 3312 break; 3313 } 3314 default: 3315 break; 3316 } 3317 return (err); 3318 } 3319 3320 typedef struct rboundary { 3321 uint32_t bval; 3322 int btype; 3323 } 
rboundary_t; 3324 3325 /* 3326 * This function finds the intersection of mtu ranges stored in arrays - 3327 * mrange[0] ... mrange[mcount -1]. It returns the intersection in rval. 3328 * Individual arrays are assumed to contain non-overlapping ranges. 3329 * Algorithm: 3330 * A range has two boundaries - min and max. We scan all arrays and store 3331 * each boundary as a separate element in a temporary array. We also store 3332 * the boundary types, min or max, as +1 or -1 respectively in the temporary 3333 * array. Then we sort the temporary array in ascending order. We scan the 3334 * sorted array from lower to higher values and keep a cumulative sum of 3335 * boundary types. Element in the temporary array for which the sum reaches 3336 * mcount is a min boundary of a range in the result and next element will be 3337 * max boundary. 3338 * 3339 * Example for mcount = 3, 3340 * 3341 * ----|_________|-------|_______|----|__|------ mrange[0] 3342 * 3343 * -------|________|--|____________|-----|___|-- mrange[1] 3344 * 3345 * --------|________________|-------|____|------ mrange[2] 3346 * 3347 * 3 2 1 3348 * \|/ 3349 * 1 23 2 1 2 3 2 1 01 2 V 0 <- the sum 3350 * ----|--||-----|-|--|--|--|----|-||-|--|---|-- sorted array 3351 * 3352 * same min and max 3353 * V 3354 * --------|_____|-------|__|------------|------ intersecting ranges 3355 */ 3356 void 3357 aggr_mtu_range_intersection(mac_propval_range_t **mrange, int mcount, 3358 mac_propval_uint32_range_t **prval, int *prmaxcnt, int *prcount) 3359 { 3360 mac_propval_uint32_range_t *rval, *ur; 3361 int rmaxcnt, rcount; 3362 size_t sz_range32; 3363 rboundary_t *ta; /* temporary array */ 3364 rboundary_t temp; 3365 boolean_t range_started = B_FALSE; 3366 int i, j, m, sum; 3367 3368 sz_range32 = sizeof (mac_propval_uint32_range_t); 3369 3370 for (i = 0, rmaxcnt = 0; i < mcount; i++) 3371 rmaxcnt += mrange[i]->mpr_count; 3372 3373 /* Allocate enough space to store the results */ 3374 rval = kmem_alloc(rmaxcnt * sz_range32, 
KM_SLEEP); 3375 3376 /* Number of boundaries are twice as many as ranges */ 3377 ta = kmem_alloc(2 * rmaxcnt * sizeof (rboundary_t), KM_SLEEP); 3378 3379 for (i = 0, m = 0; i < mcount; i++) { 3380 ur = &(mrange[i]->mpr_range_uint32[0]); 3381 for (j = 0; j < mrange[i]->mpr_count; j++) { 3382 ta[m].bval = ur[j].mpur_min; 3383 ta[m++].btype = 1; 3384 ta[m].bval = ur[j].mpur_max; 3385 ta[m++].btype = -1; 3386 } 3387 } 3388 3389 /* 3390 * Sort the temporary array in ascending order of bval; 3391 * if boundary values are same then sort on btype. 3392 */ 3393 for (i = 0; i < m-1; i++) { 3394 for (j = i+1; j < m; j++) { 3395 if ((ta[i].bval > ta[j].bval) || 3396 ((ta[i].bval == ta[j].bval) && 3397 (ta[i].btype < ta[j].btype))) { 3398 temp = ta[i]; 3399 ta[i] = ta[j]; 3400 ta[j] = temp; 3401 } 3402 } 3403 } 3404 3405 /* Walk through temporary array to find all ranges in the results */ 3406 for (i = 0, sum = 0, rcount = 0; i < m; i++) { 3407 sum += ta[i].btype; 3408 if (sum == mcount) { 3409 rval[rcount].mpur_min = ta[i].bval; 3410 range_started = B_TRUE; 3411 } else if (sum < mcount && range_started) { 3412 rval[rcount++].mpur_max = ta[i].bval; 3413 range_started = B_FALSE; 3414 } 3415 } 3416 3417 *prval = rval; 3418 *prmaxcnt = rmaxcnt; 3419 *prcount = rcount; 3420 3421 kmem_free(ta, 2 * rmaxcnt * sizeof (rboundary_t)); 3422 } 3423 3424 /* 3425 * Returns the mtu ranges which could be supported by aggr group. 3426 * prmaxcnt returns the size of the buffer prval, prcount returns 3427 * the number of valid entries in prval. Caller is responsible 3428 * for freeing up prval. 
3429 */ 3430 int 3431 aggr_grp_possible_mtu_range(aggr_grp_t *grp, mac_propval_uint32_range_t **prval, 3432 int *prmaxcnt, int *prcount) 3433 { 3434 mac_propval_range_t **vals; 3435 aggr_port_t *port; 3436 mac_perim_handle_t mph; 3437 uint_t i, numr; 3438 int err = 0; 3439 size_t sz_propval, sz_range32; 3440 size_t size; 3441 3442 sz_propval = sizeof (mac_propval_range_t); 3443 sz_range32 = sizeof (mac_propval_uint32_range_t); 3444 3445 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 3446 3447 vals = kmem_zalloc(sizeof (mac_propval_range_t *) * grp->lg_nports, 3448 KM_SLEEP); 3449 3450 for (port = grp->lg_ports, i = 0; port != NULL; 3451 port = port->lp_next, i++) { 3452 3453 size = sz_propval; 3454 vals[i] = kmem_alloc(size, KM_SLEEP); 3455 vals[i]->mpr_count = 1; 3456 3457 mac_perim_enter_by_mh(port->lp_mh, &mph); 3458 3459 err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL, 3460 NULL, 0, vals[i], NULL); 3461 if (err == ENOSPC) { 3462 /* 3463 * Not enough space to hold all ranges. 3464 * Allocate extra space as indicated and retry. 
3465 */ 3466 numr = vals[i]->mpr_count; 3467 kmem_free(vals[i], sz_propval); 3468 size = sz_propval + (numr - 1) * sz_range32; 3469 vals[i] = kmem_alloc(size, KM_SLEEP); 3470 vals[i]->mpr_count = numr; 3471 err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL, 3472 NULL, 0, vals[i], NULL); 3473 ASSERT(err != ENOSPC); 3474 } 3475 mac_perim_exit(mph); 3476 if (err != 0) { 3477 kmem_free(vals[i], size); 3478 vals[i] = NULL; 3479 break; 3480 } 3481 } 3482 3483 /* 3484 * if any of the underlying ports does not support changing MTU then 3485 * just return ENOTSUP 3486 */ 3487 if (port != NULL) { 3488 ASSERT(err != 0); 3489 goto done; 3490 } 3491 3492 aggr_mtu_range_intersection(vals, grp->lg_nports, prval, prmaxcnt, 3493 prcount); 3494 3495 done: 3496 for (i = 0; i < grp->lg_nports; i++) { 3497 if (vals[i] != NULL) { 3498 numr = vals[i]->mpr_count; 3499 size = sz_propval + (numr - 1) * sz_range32; 3500 kmem_free(vals[i], size); 3501 } 3502 } 3503 3504 kmem_free(vals, sizeof (mac_propval_range_t *) * grp->lg_nports); 3505 return (err); 3506 } 3507 3508 static void 3509 aggr_m_propinfo(void *m_driver, const char *pr_name, mac_prop_id_t pr_num, 3510 mac_prop_info_handle_t prh) 3511 { 3512 aggr_grp_t *grp = m_driver; 3513 mac_propval_uint32_range_t *rval = NULL; 3514 int i, rcount, rmaxcnt; 3515 int err = 0; 3516 3517 _NOTE(ARGUNUSED(pr_name)); 3518 3519 switch (pr_num) { 3520 case MAC_PROP_MTU: 3521 3522 err = aggr_grp_possible_mtu_range(grp, &rval, &rmaxcnt, 3523 &rcount); 3524 if (err != 0) { 3525 ASSERT(rval == NULL); 3526 return; 3527 } 3528 for (i = 0; i < rcount; i++) { 3529 mac_prop_info_set_range_uint32(prh, 3530 rval[i].mpur_min, rval[i].mpur_max); 3531 } 3532 kmem_free(rval, sizeof (mac_propval_uint32_range_t) * rmaxcnt); 3533 break; 3534 } 3535 } 3536