1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright 2020 Joyent, Inc. 24 * Copyright 2020 RackTop Systems, Inc. 25 */ 26 27 /* 28 * IEEE 802.3ad Link Aggregation -- Link Aggregation Groups. 29 * 30 * An instance of the structure aggr_grp_t is allocated for each 31 * link aggregation group. When created, aggr_grp_t objects are 32 * entered into the aggr_grp_hash hash table maintained by the modhash 33 * module. The hash key is the linkid associated with the link 34 * aggregation group. 35 * 36 * Each aggregation contains a set of ports. The port is represented 37 * by the aggr_port_t structure. A port consists of a single MAC 38 * client which has exclusive (MCIS_EXCLUSIVE) use of the underlying 39 * MAC. This client is used by the aggr to send and receive LACP 40 * traffic. Each port client takes on the same MAC unicast address -- 41 * the address of the aggregation itself (taken from the first port by 42 * default). 43 * 44 * The MAC client that hangs off each aggr port is not your typical 45 * MAC client. 
Not only does it have exclusive control of the MAC, but
 * it also has no Tx or Rx SRSes. An SRS is designed to queue and
 * fanout traffic among L4 protocols; but the aggr is an intermediary,
 * not a consumer. Instead of using SRSes, the aggr puts the
 * underlying hardware rings into passthru mode and ships packets up
 * via a direct call to aggr_recv_cb(). This allows aggr to enforce
 * LACP while passing all other traffic up to clients of the aggr.
 *
 * Pseudo Rx Groups and Rings
 * --------------------------
 *
 * It is imperative for client performance that the aggr provide as
 * many MAC groups as possible. In order to use the underlying HW
 * resources, aggr creates pseudo groups to aggregate the underlying
 * HW groups. Every HW group gets mapped to a pseudo group; and every
 * HW ring in that group gets mapped to a pseudo ring. The pseudo
 * group at index 0 combines all the HW groups at index 0 from each
 * port, etc. The aggr's MAC then creates normal MAC groups and rings
 * out of these pseudo groups and rings to present to the aggr's
 * clients. To the clients, the aggr's groups and rings are absolutely
 * no different than a NIC's groups or rings.
 *
 * Pseudo Tx Rings
 * ---------------
 *
 * The underlying ports (NICs) in an aggregation can have Tx rings. To
 * enhance aggr's performance, these Tx rings are made available to
 * the aggr layer as pseudo Tx rings. The concept of pseudo rings is
 * not new: they are already present and implemented on the Rx side.
 * The same concept is extended to the Tx side where each Tx ring of
 * an underlying port is reflected in aggr as a pseudo Tx ring. Thus
 * each pseudo Tx ring will map to a specific hardware Tx ring. Even
 * in the case of a NIC that does not have a Tx ring, a pseudo Tx ring
 * is given to the aggregation layer.
 *
 * With this change, the outgoing stack depth looks much better:
 *
 * mac_tx() -> mac_tx_aggr_mode() -> mac_tx_soft_ring_process() ->
 * mac_tx_send() -> aggr_ring_rx() -> <driver>_ring_tx()
 *
 * Two new modes are introduced to mac_tx() to handle aggr pseudo Tx rings:
 * SRS_TX_AGGR and SRS_TX_BW_AGGR.
 *
 * In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. This routine
 * invokes an aggr function, aggr_find_tx_ring(), to find a (pseudo) Tx
 * ring belonging to a port on which the packet has to be sent.
 * aggr_find_tx_ring() first finds the outgoing port based on L2/L3/L4
 * policy and then uses the fanout_hint passed to it to pick a Tx ring from
 * the selected port.
 *
 * In SRS_TX_BW_AGGR mode, mac_tx_bw_mode() function is called where
 * bandwidth limit is applied first on the outgoing packet and the packets
 * allowed to go out would call mac_tx_aggr_mode() to send the packet on a
 * particular Tx ring.
 */

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/conf.h>
#include <sys/cmn_err.h>
#include <sys/disp.h>
#include <sys/list.h>
#include <sys/ksynch.h>
#include <sys/kmem.h>
#include <sys/stream.h>
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/atomic.h>
#include <sys/stat.h>
#include <sys/modhash.h>
#include <sys/id_space.h>
#include <sys/strsun.h>
#include <sys/cred.h>
#include <sys/dlpi.h>
#include <sys/zone.h>
#include <sys/mac_provider.h>
#include <sys/dls.h>
#include <sys/vlan.h>
#include <sys/aggr.h>
#include <sys/aggr_impl.h>

/* MAC callback entry points of the aggr pseudo device. */
static int aggr_m_start(void *);
static void aggr_m_stop(void *);
static int aggr_m_promisc(void *, boolean_t);
static int aggr_m_multicst(void *, boolean_t, const uint8_t *);
static int aggr_m_unicst(void *, const uint8_t *);
static int aggr_m_stat(void *, uint_t, uint64_t *);
static void aggr_m_ioctl(void *, queue_t *, mblk_t *);
static boolean_t aggr_m_capab_get(void *, mac_capab_t, void *);
static int aggr_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
    const void *);
static void aggr_m_propinfo(void *, const char *, mac_prop_id_t,
    mac_prop_info_handle_t);

/* Port membership helpers. */
static aggr_port_t *aggr_grp_port_lookup(aggr_grp_t *, datalink_id_t);
static int aggr_grp_rem_port(aggr_grp_t *, aggr_port_t *, boolean_t *,
    boolean_t *);

/* Capability, SDU and margin helpers. */
static void aggr_grp_capab_set(aggr_grp_t *);
static boolean_t aggr_grp_capab_check(aggr_grp_t *, aggr_port_t *);
static uint_t aggr_grp_max_sdu(aggr_grp_t *);
static uint32_t aggr_grp_max_margin(aggr_grp_t *);
static boolean_t aggr_grp_sdu_check(aggr_grp_t *, aggr_port_t *);
static boolean_t aggr_grp_margin_check(aggr_grp_t *, aggr_port_t *);

/* Pseudo Rx group/ring entry points and filter management. */
static int aggr_add_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
static void aggr_rem_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
static int aggr_pseudo_disable_intr(mac_intr_handle_t);
static int aggr_pseudo_enable_intr(mac_intr_handle_t);
static int aggr_pseudo_start_rx_ring(mac_ring_driver_t, uint64_t);
static void aggr_pseudo_stop_rx_ring(mac_ring_driver_t);
static int aggr_addmac(void *, const uint8_t *);
static int aggr_remmac(void *, const uint8_t *);
static int aggr_addvlan(mac_group_driver_t, uint16_t);
static int aggr_remvlan(mac_group_driver_t, uint16_t);
static mblk_t *aggr_rx_poll(void *, int);
static void aggr_fill_ring(void *, mac_ring_type_t, const int,
    const int, mac_ring_info_t *, mac_ring_handle_t);
static void aggr_fill_group(void *, mac_ring_type_t, const int,
    mac_group_info_t *, mac_group_handle_t);

/* kmem cache from which aggr_grp_t objects are allocated. */
static kmem_cache_t *aggr_grp_cache;
/* Hash of all aggregation groups, keyed by linkid (see GRP_HASH_KEY). */
static mod_hash_t *aggr_grp_hash;
/* Reader/writer lock held while aggr_grp_hash and aggr_grp_cnt are used. */
static krwlock_t aggr_grp_lock;
/* Number of groups; read under aggr_grp_lock by aggr_grp_count(). */
static uint_t aggr_grp_cnt;
/* Id space for automatically assigned keys (see aggr_grp_init()). */
static id_space_t *key_ids;

#define	GRP_HASHSZ	64
#define	GRP_HASH_KEY(linkid)	((mod_hash_key_t)(uintptr_t)linkid)
#define	AGGR_PORT_NAME_DELIMIT '-'

static uchar_t aggr_zero_mac[] = {0, 0, 0, 0, 0, 0};

#define	AGGR_M_CALLBACK_FLAGS	\
	(MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_PROPINFO)

/*
 * MAC callback vector of the aggr. The initializer is positional, so
 * the order of the entries (including the NULL placeholders) must not
 * be changed.
 */
static mac_callbacks_t aggr_m_callbacks = {
	AGGR_M_CALLBACK_FLAGS,
	aggr_m_stat,
	aggr_m_start,
	aggr_m_stop,
	aggr_m_promisc,
	aggr_m_multicst,
	NULL,
	NULL,
	NULL,
	aggr_m_ioctl,
	aggr_m_capab_get,
	NULL,
	NULL,
	aggr_m_setprop,
	NULL,
	aggr_m_propinfo
};

/*
 * kmem cache constructor for aggr_grp_t: zero the whole object, then
 * initialize the locks and condition variables embedded in it. A new
 * group starts out with an unknown link state. Always returns 0.
 */
/*ARGSUSED*/
static int
aggr_grp_constructor(void *buf, void *arg, int kmflag)
{
	aggr_grp_t *grp = buf;

	bzero(grp, sizeof (*grp));
	mutex_init(&grp->lg_lacp_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&grp->lg_lacp_cv, NULL, CV_DEFAULT, NULL);
	rw_init(&grp->lg_tx_lock, NULL, RW_DRIVER, NULL);
	mutex_init(&grp->lg_port_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&grp->lg_port_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&grp->lg_tx_flowctl_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&grp->lg_tx_flowctl_cv, NULL, CV_DEFAULT, NULL);
	grp->lg_link_state = LINK_STATE_UNKNOWN;
	return (0);
}

/*
 * kmem cache destructor for aggr_grp_t: release the lg_tx_ports array
 * if one is still attached to the object, then destroy every
 * synchronization object set up by aggr_grp_constructor().
 */
/*ARGSUSED*/
static void
aggr_grp_destructor(void *buf, void *arg)
{
	aggr_grp_t *grp = buf;

	if (grp->lg_tx_ports != NULL) {
		/* lg_tx_ports_size counts array entries, not bytes. */
		kmem_free(grp->lg_tx_ports,
		    grp->lg_tx_ports_size * sizeof (aggr_port_t *));
	}

	mutex_destroy(&grp->lg_lacp_lock);
	cv_destroy(&grp->lg_lacp_cv);
	mutex_destroy(&grp->lg_port_lock);
	cv_destroy(&grp->lg_port_cv);
	rw_destroy(&grp->lg_tx_lock);
	mutex_destroy(&grp->lg_tx_flowctl_lock);
	cv_destroy(&grp->lg_tx_flowctl_cv);
}

/*
 * Set up the global state of this file: the aggr_grp_t cache, the
 * group hash table, the lock protecting them, the group counter and
 * the key id space.
 */
void
aggr_grp_init(void)
{
	aggr_grp_cache = kmem_cache_create("aggr_grp_cache",
	    sizeof (aggr_grp_t), 0, aggr_grp_constructor,
	    aggr_grp_destructor, NULL, NULL, NULL, 0);

	aggr_grp_hash = mod_hash_create_idhash("aggr_grp_hash",
	    GRP_HASHSZ, mod_hash_null_valdtor);
	rw_init(&aggr_grp_lock, NULL, RW_DEFAULT, NULL);
	aggr_grp_cnt = 0;

	/*
	 * Allocate an id space to manage key values (when key is not
	 * specified). The range of the id space will be from
	 * (AGGR_MAX_KEY + 1) to UINT16_MAX, because the LACP protocol
	 * uses a 16-bit key.
	 */
	key_ids = id_space_create("aggr_key_ids", AGGR_MAX_KEY + 1, UINT16_MAX);
	ASSERT(key_ids != NULL);
}

/*
 * Tear down the global state created by aggr_grp_init(), in the
 * reverse order of its creation.
 */
void
aggr_grp_fini(void)
{
	id_space_destroy(key_ids);
	rw_destroy(&aggr_grp_lock);
	mod_hash_destroy_idhash(aggr_grp_hash);
	kmem_cache_destroy(aggr_grp_cache);
}

/*
 * Return a snapshot of the number of aggregation groups, sampled while
 * holding aggr_grp_lock as reader.
 */
uint_t
aggr_grp_count(void)
{
	uint_t	count;

	rw_enter(&aggr_grp_lock, RW_READER);
	count = aggr_grp_cnt;
	rw_exit(&aggr_grp_lock);
	return (count);
}

/*
 * Since both aggr_port_notify_cb() and aggr_port_timer_thread() functions
 * requires the mac perimeter, this function holds a reference of the aggr
 * and aggr won't call mac_unregister() until this reference drops to 0.
 *
 * A reference is taken on the port itself as well; lg_port_ref is
 * protected by lg_port_lock.
 */
void
aggr_grp_port_hold(aggr_port_t *port)
{
	aggr_grp_t	*grp = port->lp_grp;

	AGGR_PORT_REFHOLD(port);
	mutex_enter(&grp->lg_port_lock);
	grp->lg_port_ref++;
	mutex_exit(&grp->lg_port_lock);
}

/*
 * Release the reference of the grp and inform aggr_grp_delete() calling
 * mac_unregister() is now safe.
 *
 * lg_port_cv is signalled only when the last reference is dropped; the
 * port reference taken by aggr_grp_port_hold() is released last.
 */
void
aggr_grp_port_rele(aggr_port_t *port)
{
	aggr_grp_t	*grp = port->lp_grp;

	mutex_enter(&grp->lg_port_lock);
	if (--grp->lg_port_ref == 0)
		cv_signal(&grp->lg_port_cv);
	mutex_exit(&grp->lg_port_lock);
	AGGR_PORT_REFRELE(port);
}

/*
 * Wait for the port's lacp timer thread and the port's notification callback
 * to exit.
316 */ 317 void 318 aggr_grp_port_wait(aggr_grp_t *grp) 319 { 320 mutex_enter(&grp->lg_port_lock); 321 if (grp->lg_port_ref != 0) 322 cv_wait(&grp->lg_port_cv, &grp->lg_port_lock); 323 mutex_exit(&grp->lg_port_lock); 324 } 325 326 /* 327 * Attach a port to a link aggregation group. 328 * 329 * A port is attached to a link aggregation group once its speed 330 * and link state have been verified. 331 * 332 * Returns B_TRUE if the group link state or speed has changed. If 333 * it's the case, the caller must notify the MAC layer via a call 334 * to mac_link(). 335 */ 336 boolean_t 337 aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port) 338 { 339 boolean_t link_state_changed = B_FALSE; 340 341 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 342 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 343 344 if (port->lp_state == AGGR_PORT_STATE_ATTACHED) 345 return (B_FALSE); 346 347 /* 348 * Validate the MAC port link speed and update the group 349 * link speed if needed. 350 */ 351 if (port->lp_ifspeed == 0 || 352 port->lp_link_state != LINK_STATE_UP || 353 port->lp_link_duplex != LINK_DUPLEX_FULL) { 354 /* 355 * Can't attach a MAC port with unknown link speed, 356 * down link, or not in full duplex mode. 357 */ 358 return (B_FALSE); 359 } 360 361 mutex_enter(&grp->lg_stat_lock); 362 if (grp->lg_ifspeed == 0) { 363 /* 364 * The group inherits the speed of the first link being 365 * attached. 366 */ 367 grp->lg_ifspeed = port->lp_ifspeed; 368 link_state_changed = B_TRUE; 369 } else if (grp->lg_ifspeed != port->lp_ifspeed) { 370 /* 371 * The link speed of the MAC port must be the same as 372 * the group link speed, as per 802.3ad. Since it is 373 * not, the attach is cancelled. 374 */ 375 mutex_exit(&grp->lg_stat_lock); 376 return (B_FALSE); 377 } 378 mutex_exit(&grp->lg_stat_lock); 379 380 grp->lg_nattached_ports++; 381 382 /* 383 * Update the group link state. 
384 */ 385 if (grp->lg_link_state != LINK_STATE_UP) { 386 grp->lg_link_state = LINK_STATE_UP; 387 mutex_enter(&grp->lg_stat_lock); 388 grp->lg_link_duplex = LINK_DUPLEX_FULL; 389 mutex_exit(&grp->lg_stat_lock); 390 link_state_changed = B_TRUE; 391 } 392 393 /* 394 * Update port's state. 395 */ 396 port->lp_state = AGGR_PORT_STATE_ATTACHED; 397 398 aggr_grp_multicst_port(port, B_TRUE); 399 400 /* 401 * The port client doesn't have an Rx SRS; instead of calling 402 * mac_rx_set() we set the client's flow callback directly. 403 * This datapath is used only when the port's driver doesn't 404 * support MAC_CAPAB_RINGS. Drivers with ring support will 405 * deliver traffic to the aggr via ring passthru. 406 */ 407 mac_client_set_flow_cb(port->lp_mch, aggr_recv_cb, port); 408 409 /* 410 * If LACP is OFF, the port can be used to send data as soon 411 * as its link is up and verified to be compatible with the 412 * aggregation. 413 * 414 * If LACP is active or passive, notify the LACP subsystem, which 415 * will enable sending on the port following the LACP protocol. 
416 */ 417 if (grp->lg_lacp_mode == AGGR_LACP_OFF) 418 aggr_send_port_enable(port); 419 else 420 aggr_lacp_port_attached(port); 421 422 return (link_state_changed); 423 } 424 425 boolean_t 426 aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port) 427 { 428 boolean_t link_state_changed = B_FALSE; 429 430 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 431 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 432 433 /* update state */ 434 if (port->lp_state != AGGR_PORT_STATE_ATTACHED) 435 return (B_FALSE); 436 437 mac_client_clear_flow_cb(port->lp_mch); 438 439 aggr_grp_multicst_port(port, B_FALSE); 440 441 if (grp->lg_lacp_mode == AGGR_LACP_OFF) 442 aggr_send_port_disable(port); 443 else 444 aggr_lacp_port_detached(port); 445 446 port->lp_state = AGGR_PORT_STATE_STANDBY; 447 448 grp->lg_nattached_ports--; 449 if (grp->lg_nattached_ports == 0) { 450 /* the last attached MAC port of the group is being detached */ 451 grp->lg_link_state = LINK_STATE_DOWN; 452 mutex_enter(&grp->lg_stat_lock); 453 grp->lg_ifspeed = 0; 454 grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN; 455 mutex_exit(&grp->lg_stat_lock); 456 link_state_changed = B_TRUE; 457 } 458 459 return (link_state_changed); 460 } 461 462 /* 463 * Update the MAC addresses of the constituent ports of the specified 464 * group. This function is invoked: 465 * - after creating a new aggregation group. 466 * - after adding new ports to an aggregation group. 467 * - after removing a port from a group when the MAC address of 468 * that port was used for the MAC address of the group. 469 * - after the MAC address of a port changed when the MAC address 470 * of that port was used for the MAC address of the group. 471 * 472 * Return true if the link state of the aggregation changed, for example 473 * as a result of a failure changing the MAC address of one of the 474 * constituent ports. 
475 */ 476 boolean_t 477 aggr_grp_update_ports_mac(aggr_grp_t *grp) 478 { 479 aggr_port_t *cport; 480 boolean_t link_state_changed = B_FALSE; 481 mac_perim_handle_t mph; 482 483 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 484 485 for (cport = grp->lg_ports; cport != NULL; 486 cport = cport->lp_next) { 487 mac_perim_enter_by_mh(cport->lp_mh, &mph); 488 if (aggr_port_unicst(cport) != 0) { 489 if (aggr_grp_detach_port(grp, cport)) 490 link_state_changed = B_TRUE; 491 } else { 492 /* 493 * If a port was detached because of a previous 494 * failure changing the MAC address, the port is 495 * reattached when it successfully changes the MAC 496 * address now, and this might cause the link state 497 * of the aggregation to change. 498 */ 499 if (aggr_grp_attach_port(grp, cport)) 500 link_state_changed = B_TRUE; 501 } 502 mac_perim_exit(mph); 503 } 504 return (link_state_changed); 505 } 506 507 /* 508 * Invoked when the MAC address of a port has changed. If the port's 509 * MAC address was used for the group MAC address, set mac_addr_changedp 510 * to B_TRUE to indicate to the caller that it should send a MAC_NOTE_UNICST 511 * notification. If the link state changes due to detach/attach of 512 * the constituent port, set link_state_changedp to B_TRUE to indicate 513 * to the caller that it should send a MAC_NOTE_LINK notification. In both 514 * cases, it is the responsibility of the caller to invoke notification 515 * functions after releasing the the port lock. 516 */ 517 void 518 aggr_grp_port_mac_changed(aggr_grp_t *grp, aggr_port_t *port, 519 boolean_t *mac_addr_changedp, boolean_t *link_state_changedp) 520 { 521 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 522 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 523 ASSERT(mac_addr_changedp != NULL); 524 ASSERT(link_state_changedp != NULL); 525 526 *mac_addr_changedp = B_FALSE; 527 *link_state_changedp = B_FALSE; 528 529 if (grp->lg_addr_fixed) { 530 /* 531 * The group is using a fixed MAC address or an automatic 532 * MAC address has not been set. 
533 */ 534 return; 535 } 536 537 if (grp->lg_mac_addr_port == port) { 538 /* 539 * The MAC address of the port was assigned to the group 540 * MAC address. Update the group MAC address. 541 */ 542 bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL); 543 *mac_addr_changedp = B_TRUE; 544 } else { 545 /* 546 * Update the actual port MAC address to the MAC address 547 * of the group. 548 */ 549 if (aggr_port_unicst(port) != 0) { 550 *link_state_changedp = aggr_grp_detach_port(grp, port); 551 } else { 552 /* 553 * If a port was detached because of a previous 554 * failure changing the MAC address, the port is 555 * reattached when it successfully changes the MAC 556 * address now, and this might cause the link state 557 * of the aggregation to change. 558 */ 559 *link_state_changedp = aggr_grp_attach_port(grp, port); 560 } 561 } 562 } 563 564 /* 565 * Add a port to a link aggregation group. 566 */ 567 static int 568 aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t port_linkid, boolean_t force, 569 aggr_port_t **pp) 570 { 571 aggr_port_t *port, **cport; 572 mac_perim_handle_t mph; 573 zoneid_t port_zoneid = ALL_ZONES; 574 int err; 575 576 /* The port must be in the same zone as the aggregation. */ 577 if (zone_check_datalink(&port_zoneid, port_linkid) != 0) 578 port_zoneid = GLOBAL_ZONEID; 579 if (grp->lg_zoneid != port_zoneid) 580 return (EBUSY); 581 582 /* 583 * If we are creating the aggr, then there is no MAC handle 584 * and thus no perimeter to hold. If we are adding a port to 585 * an existing aggr, then the perimiter of the aggr's MAC must 586 * be held. 587 */ 588 ASSERT(grp->lg_mh == NULL || MAC_PERIM_HELD(grp->lg_mh)); 589 590 err = aggr_port_create(grp, port_linkid, force, &port); 591 if (err != 0) 592 return (err); 593 594 mac_perim_enter_by_mh(port->lp_mh, &mph); 595 596 /* Add the new port to the end of the list. 
*/ 597 cport = &grp->lg_ports; 598 while (*cport != NULL) 599 cport = &((*cport)->lp_next); 600 *cport = port; 601 602 /* 603 * Back reference to the group it is member of. A port always 604 * holds a reference to its group to ensure that the back 605 * reference is always valid. 606 */ 607 port->lp_grp = grp; 608 AGGR_GRP_REFHOLD(grp); 609 grp->lg_nports++; 610 611 aggr_lacp_init_port(port); 612 mac_perim_exit(mph); 613 614 if (pp != NULL) 615 *pp = port; 616 617 return (0); 618 } 619 620 /* 621 * This is called when the 'lg_tx_ports' arrangement has changed and 622 * we need to update the corresponding 'mi_default_tx_ring'. This 623 * happens for several reasons. 624 * 625 * - A pseudo TX mac group was added or removed. 626 * - An LACP message has changed the port's state. 627 * - A link event has changed the port's state. 628 * 629 * In any case, we see if there is at least one port enabled (see 630 * 'aggr_send_port_enable()'), and if so we use its first ring as the 631 * mac's default TX ring. 632 * 633 * Note, because we only have a single TX group, we don't have to 634 * worry about the rings moving between groups and the chance that mac 635 * will reassign it unless someone removes a port, at which point, we 636 * play it safe and call this again. 637 */ 638 void 639 aggr_grp_update_default(aggr_grp_t *grp) 640 { 641 aggr_port_t *port; 642 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 643 644 rw_enter(&grp->lg_tx_lock, RW_WRITER); 645 646 if (grp->lg_ntx_ports == 0) { 647 rw_exit(&grp->lg_tx_lock); 648 return; 649 } 650 651 port = grp->lg_tx_ports[0]; 652 ASSERT(port->lp_tx_ring_cnt > 0); 653 mac_hwring_set_default(grp->lg_mh, port->lp_pseudo_tx_rings[0]); 654 rw_exit(&grp->lg_tx_lock); 655 } 656 657 /* 658 * Add a pseudo RX ring for the given HW ring handle. 
659 */ 660 static int 661 aggr_add_pseudo_rx_ring(aggr_port_t *port, 662 aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh) 663 { 664 aggr_pseudo_rx_ring_t *ring; 665 int err; 666 int j; 667 668 for (j = 0; j < MAX_RINGS_PER_GROUP; j++) { 669 ring = rx_grp->arg_rings + j; 670 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE)) 671 break; 672 } 673 674 /* 675 * No slot for this new RX ring. 676 */ 677 if (j == MAX_RINGS_PER_GROUP) 678 return (EIO); 679 680 ring->arr_flags |= MAC_PSEUDO_RING_INUSE; 681 ring->arr_hw_rh = hw_rh; 682 ring->arr_port = port; 683 ring->arr_grp = rx_grp; 684 rx_grp->arg_ring_cnt++; 685 686 /* 687 * The group is already registered, dynamically add a new ring to the 688 * mac group. 689 */ 690 if ((err = mac_group_add_ring(rx_grp->arg_gh, j)) != 0) { 691 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE; 692 ring->arr_hw_rh = NULL; 693 ring->arr_port = NULL; 694 ring->arr_grp = NULL; 695 rx_grp->arg_ring_cnt--; 696 } else { 697 /* 698 * This must run after the MAC is registered. 699 */ 700 ASSERT3P(ring->arr_rh, !=, NULL); 701 mac_hwring_set_passthru(hw_rh, (mac_rx_t)aggr_recv_cb, 702 (void *)port, (mac_resource_handle_t)ring); 703 } 704 return (err); 705 } 706 707 /* 708 * Remove the pseudo RX ring of the given HW ring handle. 709 */ 710 static void 711 aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh) 712 { 713 for (uint_t j = 0; j < MAX_RINGS_PER_GROUP; j++) { 714 aggr_pseudo_rx_ring_t *ring = rx_grp->arg_rings + j; 715 716 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE) || 717 ring->arr_hw_rh != hw_rh) { 718 continue; 719 } 720 721 mac_group_rem_ring(rx_grp->arg_gh, ring->arr_rh); 722 723 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE; 724 ring->arr_hw_rh = NULL; 725 ring->arr_port = NULL; 726 ring->arr_grp = NULL; 727 rx_grp->arg_ring_cnt--; 728 mac_hwring_clear_passthru(hw_rh); 729 break; 730 } 731 } 732 733 /* 734 * Create pseudo rings over the HW rings of the port. 
735 * 736 * o Create a pseudo ring in rx_grp per HW ring in the port's HW group. 737 * 738 * o Program existing unicast filters on the pseudo group into the HW group. 739 * 740 * o Program existing VLAN filters on the pseudo group into the HW group. 741 */ 742 static int 743 aggr_add_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp) 744 { 745 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP]; 746 aggr_unicst_addr_t *addr, *a; 747 mac_perim_handle_t pmph; 748 aggr_vlan_t *avp; 749 uint_t hw_rh_cnt, i; 750 int err = 0; 751 uint_t g_idx = rx_grp->arg_index; 752 753 ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh)); 754 ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT); 755 mac_perim_enter_by_mh(port->lp_mh, &pmph); 756 757 i = 0; 758 addr = NULL; 759 /* 760 * This function must be called after the aggr registers its 761 * MAC and its Rx groups have been initialized. 762 */ 763 ASSERT(rx_grp->arg_gh != NULL); 764 765 /* 766 * Get the list of the underlying HW rings. 767 */ 768 hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx, 769 &port->lp_hwghs[g_idx], hw_rh, MAC_RING_TYPE_RX); 770 771 /* 772 * Add existing VLAN and unicast address filters to the port. 
773 */ 774 for (avp = list_head(&rx_grp->arg_vlans); avp != NULL; 775 avp = list_next(&rx_grp->arg_vlans, avp)) { 776 if ((err = aggr_port_addvlan(port, g_idx, avp->av_vid)) != 0) 777 goto err; 778 } 779 780 for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) { 781 if ((err = aggr_port_addmac(port, g_idx, addr->aua_addr)) != 0) 782 goto err; 783 } 784 785 for (i = 0; i < hw_rh_cnt; i++) { 786 err = aggr_add_pseudo_rx_ring(port, rx_grp, hw_rh[i]); 787 if (err != 0) 788 goto err; 789 } 790 791 mac_perim_exit(pmph); 792 return (0); 793 794 err: 795 ASSERT(err != 0); 796 797 for (uint_t j = 0; j < i; j++) 798 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]); 799 800 for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next) 801 aggr_port_remmac(port, g_idx, a->aua_addr); 802 803 if (avp != NULL) 804 avp = list_prev(&rx_grp->arg_vlans, avp); 805 806 for (; avp != NULL; avp = list_prev(&rx_grp->arg_vlans, avp)) { 807 int err2; 808 809 if ((err2 = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) { 810 cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s" 811 ": errno %d.", avp->av_vid, 812 mac_client_name(port->lp_mch), err2); 813 } 814 } 815 816 port->lp_hwghs[g_idx] = NULL; 817 mac_perim_exit(pmph); 818 return (err); 819 } 820 821 /* 822 * Destroy the pseudo rings mapping to this port and remove all VLAN 823 * and unicast filters from this port. Even if there are no underlying 824 * HW rings we must still remove the unicast filters to take the port 825 * out of promisc mode. 
826 */ 827 static void 828 aggr_rem_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp) 829 { 830 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP]; 831 aggr_unicst_addr_t *addr; 832 mac_perim_handle_t pmph; 833 uint_t hw_rh_cnt; 834 uint_t g_idx = rx_grp->arg_index; 835 836 ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh)); 837 ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT); 838 ASSERT3P(rx_grp->arg_gh, !=, NULL); 839 mac_perim_enter_by_mh(port->lp_mh, &pmph); 840 841 hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx, NULL, hw_rh, 842 MAC_RING_TYPE_RX); 843 844 for (uint_t i = 0; i < hw_rh_cnt; i++) 845 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[i]); 846 847 for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) 848 aggr_port_remmac(port, g_idx, addr->aua_addr); 849 850 for (aggr_vlan_t *avp = list_head(&rx_grp->arg_vlans); avp != NULL; 851 avp = list_next(&rx_grp->arg_vlans, avp)) { 852 int err; 853 854 if ((err = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) { 855 cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s" 856 ": errno %d.", avp->av_vid, 857 mac_client_name(port->lp_mch), err); 858 } 859 } 860 861 port->lp_hwghs[g_idx] = NULL; 862 mac_perim_exit(pmph); 863 } 864 865 /* 866 * Add a pseudo TX ring for the given HW ring handle. 867 */ 868 static int 869 aggr_add_pseudo_tx_ring(aggr_port_t *port, 870 aggr_pseudo_tx_group_t *tx_grp, mac_ring_handle_t hw_rh, 871 mac_ring_handle_t *pseudo_rh) 872 { 873 aggr_pseudo_tx_ring_t *ring; 874 int err; 875 int i; 876 877 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 878 for (i = 0; i < MAX_RINGS_PER_GROUP; i++) { 879 ring = tx_grp->atg_rings + i; 880 if (!(ring->atr_flags & MAC_PSEUDO_RING_INUSE)) 881 break; 882 } 883 /* 884 * No slot for this new TX ring. 885 */ 886 if (i == MAX_RINGS_PER_GROUP) 887 return (EIO); 888 /* 889 * The following 4 statements needs to be done before 890 * calling mac_group_add_ring(). Otherwise it will 891 * result in an assertion failure in mac_init_ring(). 
892 */ 893 ring->atr_flags |= MAC_PSEUDO_RING_INUSE; 894 ring->atr_hw_rh = hw_rh; 895 ring->atr_port = port; 896 tx_grp->atg_ring_cnt++; 897 898 /* 899 * The TX side has no concept of ring groups unlike RX groups. 900 * There is just a single group which stores all the TX rings. 901 * This group will be used to store aggr's pseudo TX rings. 902 */ 903 if ((err = mac_group_add_ring(tx_grp->atg_gh, i)) != 0) { 904 ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE; 905 ring->atr_hw_rh = NULL; 906 ring->atr_port = NULL; 907 tx_grp->atg_ring_cnt--; 908 } else { 909 *pseudo_rh = mac_find_ring(tx_grp->atg_gh, i); 910 if (hw_rh != NULL) { 911 mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring, 912 mac_find_ring(tx_grp->atg_gh, i)); 913 } 914 } 915 916 return (err); 917 } 918 919 /* 920 * Remove the pseudo TX ring of the given HW ring handle. 921 */ 922 static void 923 aggr_rem_pseudo_tx_ring(aggr_pseudo_tx_group_t *tx_grp, 924 mac_ring_handle_t pseudo_hw_rh) 925 { 926 aggr_pseudo_tx_ring_t *ring; 927 int i; 928 929 for (i = 0; i < MAX_RINGS_PER_GROUP; i++) { 930 ring = tx_grp->atg_rings + i; 931 if (ring->atr_rh != pseudo_hw_rh) 932 continue; 933 934 ASSERT(ring->atr_flags & MAC_PSEUDO_RING_INUSE); 935 mac_group_rem_ring(tx_grp->atg_gh, pseudo_hw_rh); 936 ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE; 937 mac_hwring_teardown(ring->atr_hw_rh); 938 ring->atr_hw_rh = NULL; 939 ring->atr_port = NULL; 940 tx_grp->atg_ring_cnt--; 941 break; 942 } 943 } 944 945 /* 946 * This function is called to create pseudo rings over hardware rings of 947 * the underlying device. There is a 1:1 mapping between the pseudo TX 948 * rings of the aggr and the hardware rings of the underlying port. 
949 */ 950 static int 951 aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp) 952 { 953 aggr_grp_t *grp = port->lp_grp; 954 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP], pseudo_rh; 955 mac_perim_handle_t pmph; 956 int hw_rh_cnt, i = 0, j; 957 int err = 0; 958 959 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 960 mac_perim_enter_by_mh(port->lp_mh, &pmph); 961 962 /* 963 * Get the list the the underlying HW rings. 964 */ 965 hw_rh_cnt = mac_hwrings_get(port->lp_mch, NULL, hw_rh, 966 MAC_RING_TYPE_TX); 967 968 /* 969 * Even if the underlying NIC does not have TX rings, we 970 * still make a psuedo TX ring for that NIC with NULL as 971 * the ring handle. 972 */ 973 if (hw_rh_cnt == 0) 974 port->lp_tx_ring_cnt = 1; 975 else 976 port->lp_tx_ring_cnt = hw_rh_cnt; 977 978 port->lp_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) * 979 port->lp_tx_ring_cnt), KM_SLEEP); 980 port->lp_pseudo_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) * 981 port->lp_tx_ring_cnt), KM_SLEEP); 982 983 if (hw_rh_cnt == 0) { 984 if ((err = aggr_add_pseudo_tx_ring(port, tx_grp, 985 NULL, &pseudo_rh)) == 0) { 986 port->lp_tx_rings[0] = NULL; 987 port->lp_pseudo_tx_rings[0] = pseudo_rh; 988 } 989 } else { 990 for (i = 0; err == 0 && i < hw_rh_cnt; i++) { 991 err = aggr_add_pseudo_tx_ring(port, 992 tx_grp, hw_rh[i], &pseudo_rh); 993 if (err != 0) 994 break; 995 port->lp_tx_rings[i] = hw_rh[i]; 996 port->lp_pseudo_tx_rings[i] = pseudo_rh; 997 } 998 } 999 1000 if (err != 0) { 1001 if (hw_rh_cnt != 0) { 1002 for (j = 0; j < i; j++) { 1003 aggr_rem_pseudo_tx_ring(tx_grp, 1004 port->lp_pseudo_tx_rings[j]); 1005 } 1006 } 1007 kmem_free(port->lp_tx_rings, 1008 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt)); 1009 kmem_free(port->lp_pseudo_tx_rings, 1010 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt)); 1011 port->lp_tx_ring_cnt = 0; 1012 } else { 1013 port->lp_tx_grp_added = B_TRUE; 1014 port->lp_tx_notify_mh = mac_client_tx_notify(port->lp_mch, 1015 
aggr_tx_ring_update, port); 1016 } 1017 mac_perim_exit(pmph); 1018 aggr_grp_update_default(grp); 1019 return (err); 1020 } 1021 1022 /* 1023 * This function is called by aggr to remove pseudo TX rings over the 1024 * HW rings of the underlying port. 1025 */ 1026 static void 1027 aggr_rem_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp) 1028 { 1029 aggr_grp_t *grp = port->lp_grp; 1030 mac_perim_handle_t pmph; 1031 int i; 1032 1033 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 1034 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1035 1036 if (!port->lp_tx_grp_added) 1037 goto done; 1038 1039 ASSERT(tx_grp->atg_gh != NULL); 1040 1041 for (i = 0; i < port->lp_tx_ring_cnt; i++) 1042 aggr_rem_pseudo_tx_ring(tx_grp, port->lp_pseudo_tx_rings[i]); 1043 1044 kmem_free(port->lp_tx_rings, 1045 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt)); 1046 kmem_free(port->lp_pseudo_tx_rings, 1047 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt)); 1048 1049 port->lp_tx_ring_cnt = 0; 1050 (void) mac_client_tx_notify(port->lp_mch, NULL, port->lp_tx_notify_mh); 1051 port->lp_tx_grp_added = B_FALSE; 1052 aggr_grp_update_default(grp); 1053 done: 1054 mac_perim_exit(pmph); 1055 } 1056 1057 static int 1058 aggr_pseudo_disable_intr(mac_intr_handle_t ih) 1059 { 1060 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih; 1061 return (mac_hwring_disable_intr(rr_ring->arr_hw_rh)); 1062 } 1063 1064 static int 1065 aggr_pseudo_enable_intr(mac_intr_handle_t ih) 1066 { 1067 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih; 1068 return (mac_hwring_enable_intr(rr_ring->arr_hw_rh)); 1069 } 1070 1071 /* 1072 * Start the pseudo ring. Since the pseudo ring is just an abstraction 1073 * over an actual HW ring, the real task is to start the underlying HW 1074 * ring. 
1075 */ 1076 static int 1077 aggr_pseudo_start_rx_ring(mac_ring_driver_t arg, uint64_t mr_gen) 1078 { 1079 int err; 1080 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg; 1081 1082 err = mac_hwring_start(rr_ring->arr_hw_rh); 1083 1084 if (err != 0) 1085 return (err); 1086 1087 rr_ring->arr_gen = mr_gen; 1088 return (err); 1089 } 1090 1091 /* 1092 * Stop the pseudo ring. Since the pseudo ring is just an abstraction 1093 * over an actual HW ring, the real task is to stop the underlying HW 1094 * ring. 1095 */ 1096 static void 1097 aggr_pseudo_stop_rx_ring(mac_ring_driver_t arg) 1098 { 1099 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg; 1100 1101 /* 1102 * The rings underlying the default group must stay up to 1103 * continue receiving LACP traffic. We would normally never 1104 * stop the default Rx rings because of the primary MAC 1105 * client; but aggr's primary MAC client doesn't call 1106 * mac_unicast_add() and thus mi_active is 0 when the last 1107 * non-primary client is deleted. 1108 */ 1109 if (rr_ring->arr_grp->arg_index != 0) 1110 mac_hwring_stop(rr_ring->arr_hw_rh); 1111 } 1112 1113 /* 1114 * Add one or more ports to an existing link aggregation group. 1115 */ 1116 int 1117 aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force, 1118 laioc_port_t *ports) 1119 { 1120 int rc; 1121 uint_t port_added = 0; 1122 uint_t grp_added; 1123 aggr_grp_t *grp = NULL; 1124 aggr_port_t *port; 1125 boolean_t link_state_changed = B_FALSE; 1126 mac_perim_handle_t mph, pmph; 1127 1128 /* Get the aggr corresponding to linkid. */ 1129 rw_enter(&aggr_grp_lock, RW_READER); 1130 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1131 (mod_hash_val_t *)&grp) != 0) { 1132 rw_exit(&aggr_grp_lock); 1133 return (ENOENT); 1134 } 1135 AGGR_GRP_REFHOLD(grp); 1136 1137 /* 1138 * Hold the perimeter so that the aggregation can't be destroyed. 
1139 */ 1140 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1141 rw_exit(&aggr_grp_lock); 1142 1143 /* Add the specified ports to the aggr. */ 1144 for (uint_t i = 0; i < nports; i++) { 1145 grp_added = 0; 1146 1147 if ((rc = aggr_grp_add_port(grp, ports[i].lp_linkid, 1148 force, &port)) != 0) { 1149 goto bail; 1150 } 1151 1152 ASSERT(port != NULL); 1153 port_added++; 1154 1155 /* check capabilities */ 1156 if (!aggr_grp_capab_check(grp, port) || 1157 !aggr_grp_sdu_check(grp, port) || 1158 !aggr_grp_margin_check(grp, port)) { 1159 rc = ENOTSUP; 1160 goto bail; 1161 } 1162 1163 /* 1164 * Create the pseudo ring for each HW ring of the underlying 1165 * port. 1166 */ 1167 rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group); 1168 if (rc != 0) 1169 goto bail; 1170 1171 for (uint_t j = 0; j < grp->lg_rx_group_count; j++) { 1172 rc = aggr_add_pseudo_rx_group(port, 1173 &grp->lg_rx_groups[j]); 1174 1175 if (rc != 0) 1176 goto bail; 1177 1178 grp_added++; 1179 } 1180 1181 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1182 1183 /* set LACP mode */ 1184 aggr_port_lacp_set_mode(grp, port); 1185 1186 /* start port if group has already been started */ 1187 if (grp->lg_started) { 1188 rc = aggr_port_start(port); 1189 if (rc != 0) { 1190 mac_perim_exit(pmph); 1191 goto bail; 1192 } 1193 1194 /* 1195 * Turn on the promiscuous mode over the port when it 1196 * is requested to be turned on to receive the 1197 * non-primary address over a port, or the promiscuous 1198 * mode is enabled over the aggr. 1199 */ 1200 if (grp->lg_promisc || port->lp_prom_addr != NULL) { 1201 rc = aggr_port_promisc(port, B_TRUE); 1202 if (rc != 0) { 1203 mac_perim_exit(pmph); 1204 goto bail; 1205 } 1206 } 1207 } 1208 mac_perim_exit(pmph); 1209 1210 /* 1211 * Attach each port if necessary. 1212 */ 1213 if (aggr_port_notify_link(grp, port)) 1214 link_state_changed = B_TRUE; 1215 1216 /* 1217 * Initialize the callback functions for this port. 
1218 */ 1219 aggr_port_init_callbacks(port); 1220 } 1221 1222 /* update the MAC address of the constituent ports */ 1223 if (aggr_grp_update_ports_mac(grp)) 1224 link_state_changed = B_TRUE; 1225 1226 if (link_state_changed) 1227 mac_link_update(grp->lg_mh, grp->lg_link_state); 1228 1229 bail: 1230 if (rc != 0) { 1231 /* stop and remove ports that have been added */ 1232 for (uint_t i = 0; i < port_added; i++) { 1233 uint_t grp_remove; 1234 1235 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid); 1236 ASSERT(port != NULL); 1237 1238 if (grp->lg_started) { 1239 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1240 (void) aggr_port_promisc(port, B_FALSE); 1241 aggr_port_stop(port); 1242 mac_perim_exit(pmph); 1243 } 1244 1245 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group); 1246 1247 /* 1248 * Only the last port could have a partial set 1249 * of groups added. 1250 */ 1251 grp_remove = (i + 1 == port_added) ? grp_added : 1252 grp->lg_rx_group_count; 1253 1254 for (uint_t j = 0; j < grp_remove; j++) { 1255 aggr_rem_pseudo_rx_group(port, 1256 &grp->lg_rx_groups[j]); 1257 } 1258 1259 (void) aggr_grp_rem_port(grp, port, NULL, NULL); 1260 } 1261 } 1262 1263 mac_perim_exit(mph); 1264 AGGR_GRP_REFRELE(grp); 1265 return (rc); 1266 } 1267 1268 static int 1269 aggr_grp_modify_common(aggr_grp_t *grp, uint8_t update_mask, uint32_t policy, 1270 boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, 1271 aggr_lacp_timer_t lacp_timer) 1272 { 1273 boolean_t mac_addr_changed = B_FALSE; 1274 boolean_t link_state_changed = B_FALSE; 1275 mac_perim_handle_t pmph; 1276 1277 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 1278 1279 /* validate fixed address if specified */ 1280 if ((update_mask & AGGR_MODIFY_MAC) && mac_fixed && 1281 ((bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) || 1282 (mac_addr[0] & 0x01))) { 1283 return (EINVAL); 1284 } 1285 1286 /* update policy if requested */ 1287 if (update_mask & AGGR_MODIFY_POLICY) 1288 aggr_send_update_policy(grp, policy); 1289 1290 /* 
update unicast MAC address if requested */ 1291 if (update_mask & AGGR_MODIFY_MAC) { 1292 if (mac_fixed) { 1293 /* user-supplied MAC address */ 1294 grp->lg_mac_addr_port = NULL; 1295 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) != 0) { 1296 bcopy(mac_addr, grp->lg_addr, ETHERADDRL); 1297 mac_addr_changed = B_TRUE; 1298 } 1299 } else if (grp->lg_addr_fixed) { 1300 /* switch from user-supplied to automatic */ 1301 aggr_port_t *port = grp->lg_ports; 1302 1303 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1304 bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL); 1305 grp->lg_mac_addr_port = port; 1306 mac_addr_changed = B_TRUE; 1307 mac_perim_exit(pmph); 1308 } 1309 grp->lg_addr_fixed = mac_fixed; 1310 } 1311 1312 if (mac_addr_changed) 1313 link_state_changed = aggr_grp_update_ports_mac(grp); 1314 1315 if (update_mask & AGGR_MODIFY_LACP_MODE) 1316 aggr_lacp_update_mode(grp, lacp_mode); 1317 1318 if (update_mask & AGGR_MODIFY_LACP_TIMER) 1319 aggr_lacp_update_timer(grp, lacp_timer); 1320 1321 if (link_state_changed) 1322 mac_link_update(grp->lg_mh, grp->lg_link_state); 1323 1324 if (mac_addr_changed) 1325 mac_unicst_update(grp->lg_mh, grp->lg_addr); 1326 1327 return (0); 1328 } 1329 1330 /* 1331 * Update properties of an existing link aggregation group. 1332 */ 1333 int 1334 aggr_grp_modify(datalink_id_t linkid, uint8_t update_mask, uint32_t policy, 1335 boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, 1336 aggr_lacp_timer_t lacp_timer) 1337 { 1338 aggr_grp_t *grp = NULL; 1339 mac_perim_handle_t mph; 1340 int err; 1341 1342 /* get group corresponding to linkid */ 1343 rw_enter(&aggr_grp_lock, RW_READER); 1344 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1345 (mod_hash_val_t *)&grp) != 0) { 1346 rw_exit(&aggr_grp_lock); 1347 return (ENOENT); 1348 } 1349 AGGR_GRP_REFHOLD(grp); 1350 1351 /* 1352 * Hold the perimeter so that the aggregation won't be destroyed. 
1353 */ 1354 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1355 rw_exit(&aggr_grp_lock); 1356 1357 err = aggr_grp_modify_common(grp, update_mask, policy, mac_fixed, 1358 mac_addr, lacp_mode, lacp_timer); 1359 1360 mac_perim_exit(mph); 1361 AGGR_GRP_REFRELE(grp); 1362 return (err); 1363 } 1364 1365 /* 1366 * Create a new link aggregation group upon request from administrator. 1367 * Returns 0 on success, an errno on failure. 1368 */ 1369 int 1370 aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports, 1371 laioc_port_t *ports, uint32_t policy, boolean_t mac_fixed, boolean_t force, 1372 uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, aggr_lacp_timer_t lacp_timer, 1373 cred_t *credp) 1374 { 1375 aggr_grp_t *grp = NULL; 1376 aggr_port_t *port; 1377 mac_register_t *mac; 1378 boolean_t link_state_changed; 1379 mac_perim_handle_t mph; 1380 int err; 1381 int i; 1382 kt_did_t tid = 0; 1383 1384 /* need at least one port */ 1385 if (nports == 0) 1386 return (EINVAL); 1387 1388 rw_enter(&aggr_grp_lock, RW_WRITER); 1389 1390 /* does a group with the same linkid already exist? 
*/ 1391 err = mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1392 (mod_hash_val_t *)&grp); 1393 if (err == 0) { 1394 rw_exit(&aggr_grp_lock); 1395 return (EEXIST); 1396 } 1397 1398 grp = kmem_cache_alloc(aggr_grp_cache, KM_SLEEP); 1399 1400 grp->lg_refs = 1; 1401 grp->lg_closing = B_FALSE; 1402 grp->lg_force = force; 1403 grp->lg_linkid = linkid; 1404 grp->lg_zoneid = crgetzoneid(credp); 1405 grp->lg_ifspeed = 0; 1406 grp->lg_link_state = LINK_STATE_UNKNOWN; 1407 grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN; 1408 grp->lg_started = B_FALSE; 1409 grp->lg_promisc = B_FALSE; 1410 grp->lg_lacp_done = B_FALSE; 1411 grp->lg_tx_notify_done = B_FALSE; 1412 grp->lg_lacp_head = grp->lg_lacp_tail = NULL; 1413 grp->lg_lacp_rx_thread = thread_create(NULL, 0, 1414 aggr_lacp_rx_thread, grp, 0, &p0, TS_RUN, minclsyspri); 1415 grp->lg_tx_notify_thread = thread_create(NULL, 0, 1416 aggr_tx_notify_thread, grp, 0, &p0, TS_RUN, minclsyspri); 1417 grp->lg_tx_blocked_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) * 1418 MAX_RINGS_PER_GROUP), KM_SLEEP); 1419 grp->lg_tx_blocked_cnt = 0; 1420 bzero(&grp->lg_rx_groups, 1421 sizeof (aggr_pseudo_rx_group_t) * MAX_GROUPS_PER_PORT); 1422 bzero(&grp->lg_tx_group, sizeof (aggr_pseudo_tx_group_t)); 1423 aggr_lacp_init_grp(grp); 1424 1425 /* add MAC ports to group */ 1426 grp->lg_ports = NULL; 1427 grp->lg_nports = 0; 1428 grp->lg_nattached_ports = 0; 1429 grp->lg_ntx_ports = 0; 1430 1431 /* 1432 * If key is not specified by the user, allocate the key. 
1433 */ 1434 if ((key == 0) && ((key = (uint32_t)id_alloc(key_ids)) == 0)) { 1435 err = ENOMEM; 1436 goto bail; 1437 } 1438 grp->lg_key = key; 1439 1440 for (i = 0; i < nports; i++) { 1441 err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, &port); 1442 if (err != 0) 1443 goto bail; 1444 } 1445 1446 grp->lg_rx_group_count = 1; 1447 1448 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 1449 uint_t num_rgroups; 1450 1451 mac_perim_enter_by_mh(port->lp_mh, &mph); 1452 num_rgroups = mac_get_num_rx_groups(port->lp_mh); 1453 mac_perim_exit(mph); 1454 1455 /* 1456 * Utilize all the groups in a port. If some ports 1457 * have less groups than others, then traffic destined 1458 * for the same unicast address may be HW classified 1459 * on some ports but SW classified by aggr when 1460 * arriving on other ports. 1461 */ 1462 grp->lg_rx_group_count = MAX(grp->lg_rx_group_count, 1463 num_rgroups); 1464 } 1465 1466 /* 1467 * There could be cases where the hardware provides more 1468 * groups than aggr can support. Make sure we never go above 1469 * the max aggr can support. 1470 */ 1471 grp->lg_rx_group_count = MIN(grp->lg_rx_group_count, 1472 MAX_GROUPS_PER_PORT); 1473 1474 ASSERT3U(grp->lg_rx_group_count, >, 0); 1475 for (i = 0; i < MAX_GROUPS_PER_PORT; i++) { 1476 grp->lg_rx_groups[i].arg_index = i; 1477 grp->lg_rx_groups[i].arg_untagged = 0; 1478 list_create(&(grp->lg_rx_groups[i].arg_vlans), 1479 sizeof (aggr_vlan_t), offsetof(aggr_vlan_t, av_link)); 1480 } 1481 1482 /* 1483 * If no explicit MAC address was specified by the administrator, 1484 * set it to the MAC address of the first port. 
1485 */ 1486 grp->lg_addr_fixed = mac_fixed; 1487 if (grp->lg_addr_fixed) { 1488 /* validate specified address */ 1489 if (bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) { 1490 err = EINVAL; 1491 goto bail; 1492 } 1493 bcopy(mac_addr, grp->lg_addr, ETHERADDRL); 1494 } else { 1495 bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL); 1496 grp->lg_mac_addr_port = grp->lg_ports; 1497 } 1498 1499 /* Set the initial group capabilities. */ 1500 aggr_grp_capab_set(grp); 1501 1502 if ((mac = mac_alloc(MAC_VERSION)) == NULL) { 1503 err = ENOMEM; 1504 goto bail; 1505 } 1506 mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER; 1507 mac->m_driver = grp; 1508 mac->m_dip = aggr_dip; 1509 mac->m_instance = grp->lg_key > AGGR_MAX_KEY ? (uint_t)-1 : grp->lg_key; 1510 mac->m_src_addr = grp->lg_addr; 1511 mac->m_callbacks = &aggr_m_callbacks; 1512 mac->m_min_sdu = 0; 1513 mac->m_max_sdu = grp->lg_max_sdu = aggr_grp_max_sdu(grp); 1514 mac->m_margin = aggr_grp_max_margin(grp); 1515 mac->m_v12n = MAC_VIRT_LEVEL1; 1516 err = mac_register(mac, &grp->lg_mh); 1517 mac_free(mac); 1518 if (err != 0) 1519 goto bail; 1520 1521 err = dls_devnet_create(grp->lg_mh, grp->lg_linkid, crgetzoneid(credp)); 1522 if (err != 0) { 1523 (void) mac_unregister(grp->lg_mh); 1524 grp->lg_mh = NULL; 1525 goto bail; 1526 } 1527 1528 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1529 1530 /* 1531 * Update the MAC address of the constituent ports. 1532 * None of the port is attached at this time, the link state of the 1533 * aggregation will not change. 1534 * 1535 * All ports take on the primary MAC address of the aggr 1536 * (lg_aggr). At this point, none of the ports are attached; 1537 * thus the link state of the aggregation will not change. 1538 */ 1539 link_state_changed = aggr_grp_update_ports_mac(grp); 1540 ASSERT(!link_state_changed); 1541 1542 /* Update outbound load balancing policy. */ 1543 aggr_send_update_policy(grp, policy); 1544 1545 /* Set LACP mode. 
*/ 1546 aggr_lacp_set_mode(grp, lacp_mode, lacp_timer); 1547 1548 /* 1549 * Attach each port if necessary. 1550 */ 1551 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 1552 /* 1553 * Create the pseudo ring for each HW ring of the 1554 * underlying port. Note that this is done after the 1555 * aggr registers its MAC. 1556 */ 1557 VERIFY3S(aggr_add_pseudo_tx_group(port, &grp->lg_tx_group), 1558 ==, 0); 1559 1560 for (i = 0; i < grp->lg_rx_group_count; i++) { 1561 VERIFY3S(aggr_add_pseudo_rx_group(port, 1562 &grp->lg_rx_groups[i]), ==, 0); 1563 } 1564 1565 if (aggr_port_notify_link(grp, port)) 1566 link_state_changed = B_TRUE; 1567 1568 /* 1569 * Initialize the callback functions for this port. 1570 */ 1571 aggr_port_init_callbacks(port); 1572 } 1573 1574 if (link_state_changed) 1575 mac_link_update(grp->lg_mh, grp->lg_link_state); 1576 1577 /* add new group to hash table */ 1578 err = mod_hash_insert(aggr_grp_hash, GRP_HASH_KEY(linkid), 1579 (mod_hash_val_t)grp); 1580 ASSERT(err == 0); 1581 aggr_grp_cnt++; 1582 1583 mac_perim_exit(mph); 1584 rw_exit(&aggr_grp_lock); 1585 return (0); 1586 1587 bail: 1588 1589 grp->lg_closing = B_TRUE; 1590 1591 port = grp->lg_ports; 1592 while (port != NULL) { 1593 aggr_port_t *cport; 1594 1595 cport = port->lp_next; 1596 aggr_port_delete(port); 1597 port = cport; 1598 } 1599 1600 /* 1601 * Inform the lacp_rx thread to exit. 1602 */ 1603 mutex_enter(&grp->lg_lacp_lock); 1604 grp->lg_lacp_done = B_TRUE; 1605 cv_signal(&grp->lg_lacp_cv); 1606 while (grp->lg_lacp_rx_thread != NULL) 1607 cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock); 1608 mutex_exit(&grp->lg_lacp_lock); 1609 /* 1610 * Inform the tx_notify thread to exit. 
1611 */ 1612 mutex_enter(&grp->lg_tx_flowctl_lock); 1613 if (grp->lg_tx_notify_thread != NULL) { 1614 tid = grp->lg_tx_notify_thread->t_did; 1615 grp->lg_tx_notify_done = B_TRUE; 1616 cv_signal(&grp->lg_tx_flowctl_cv); 1617 } 1618 mutex_exit(&grp->lg_tx_flowctl_lock); 1619 if (tid != 0) 1620 thread_join(tid); 1621 1622 kmem_free(grp->lg_tx_blocked_rings, 1623 (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP)); 1624 rw_exit(&aggr_grp_lock); 1625 AGGR_GRP_REFRELE(grp); 1626 return (err); 1627 } 1628 1629 /* 1630 * Return a pointer to the member of a group with specified linkid. 1631 */ 1632 static aggr_port_t * 1633 aggr_grp_port_lookup(aggr_grp_t *grp, datalink_id_t linkid) 1634 { 1635 aggr_port_t *port; 1636 1637 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 1638 1639 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 1640 if (port->lp_linkid == linkid) 1641 break; 1642 } 1643 1644 return (port); 1645 } 1646 1647 /* 1648 * Stop, detach and remove a port from a link aggregation group. 1649 */ 1650 static int 1651 aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port, 1652 boolean_t *mac_addr_changedp, boolean_t *link_state_changedp) 1653 { 1654 int rc = 0; 1655 aggr_port_t **pport; 1656 boolean_t mac_addr_changed = B_FALSE; 1657 boolean_t link_state_changed = B_FALSE; 1658 mac_perim_handle_t mph; 1659 uint64_t val; 1660 uint_t i; 1661 uint_t stat; 1662 1663 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 1664 ASSERT(grp->lg_nports > 1); 1665 ASSERT(!grp->lg_closing); 1666 1667 /* unlink port */ 1668 for (pport = &grp->lg_ports; *pport != port; 1669 pport = &(*pport)->lp_next) { 1670 if (*pport == NULL) { 1671 rc = ENOENT; 1672 goto done; 1673 } 1674 } 1675 *pport = port->lp_next; 1676 1677 mac_perim_enter_by_mh(port->lp_mh, &mph); 1678 1679 /* 1680 * If the MAC address of the port being removed was assigned 1681 * to the group, update the group MAC address 1682 * using the MAC address of a different port. 
1683 */ 1684 if (!grp->lg_addr_fixed && grp->lg_mac_addr_port == port) { 1685 /* 1686 * Set the MAC address of the group to the 1687 * MAC address of its first port. 1688 */ 1689 bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL); 1690 grp->lg_mac_addr_port = grp->lg_ports; 1691 mac_addr_changed = B_TRUE; 1692 } 1693 1694 link_state_changed = aggr_grp_detach_port(grp, port); 1695 1696 /* 1697 * Add the counter statistics of the ports while it was aggregated 1698 * to the group's residual statistics. This is done by obtaining 1699 * the current counter from the underlying MAC then subtracting the 1700 * value of the counter at the moment it was added to the 1701 * aggregation. 1702 */ 1703 for (i = 0; i < MAC_NSTAT; i++) { 1704 stat = i + MAC_STAT_MIN; 1705 if (!MAC_STAT_ISACOUNTER(stat)) 1706 continue; 1707 val = aggr_port_stat(port, stat); 1708 val -= port->lp_stat[i]; 1709 mutex_enter(&grp->lg_stat_lock); 1710 grp->lg_stat[i] += val; 1711 mutex_exit(&grp->lg_stat_lock); 1712 } 1713 for (i = 0; i < ETHER_NSTAT; i++) { 1714 stat = i + MACTYPE_STAT_MIN; 1715 if (!ETHER_STAT_ISACOUNTER(stat)) 1716 continue; 1717 val = aggr_port_stat(port, stat); 1718 val -= port->lp_ether_stat[i]; 1719 mutex_enter(&grp->lg_stat_lock); 1720 grp->lg_ether_stat[i] += val; 1721 mutex_exit(&grp->lg_stat_lock); 1722 } 1723 1724 grp->lg_nports--; 1725 mac_perim_exit(mph); 1726 1727 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group); 1728 aggr_port_delete(port); 1729 1730 /* 1731 * If the group MAC address has changed, update the MAC address of 1732 * the remaining constituent ports according to the new MAC 1733 * address of the group. 
1734 */ 1735 if (mac_addr_changed && aggr_grp_update_ports_mac(grp)) 1736 link_state_changed = B_TRUE; 1737 1738 done: 1739 if (mac_addr_changedp != NULL) 1740 *mac_addr_changedp = mac_addr_changed; 1741 if (link_state_changedp != NULL) 1742 *link_state_changedp = link_state_changed; 1743 1744 return (rc); 1745 } 1746 1747 /* 1748 * Remove one or more ports from an existing link aggregation group. 1749 */ 1750 int 1751 aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports) 1752 { 1753 int rc = 0; 1754 uint_t i; 1755 aggr_grp_t *grp = NULL; 1756 aggr_port_t *port; 1757 boolean_t mac_addr_update = B_FALSE, mac_addr_changed; 1758 boolean_t link_state_update = B_FALSE, link_state_changed; 1759 mac_perim_handle_t mph, pmph; 1760 1761 /* get group corresponding to linkid */ 1762 rw_enter(&aggr_grp_lock, RW_READER); 1763 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1764 (mod_hash_val_t *)&grp) != 0) { 1765 rw_exit(&aggr_grp_lock); 1766 return (ENOENT); 1767 } 1768 AGGR_GRP_REFHOLD(grp); 1769 1770 /* 1771 * Hold the perimeter so that the aggregation won't be destroyed. 
1772 */ 1773 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1774 rw_exit(&aggr_grp_lock); 1775 1776 /* we need to keep at least one port per group */ 1777 if (nports >= grp->lg_nports) { 1778 rc = EINVAL; 1779 goto bail; 1780 } 1781 1782 /* first verify that all the groups are valid */ 1783 for (i = 0; i < nports; i++) { 1784 if (aggr_grp_port_lookup(grp, ports[i].lp_linkid) == NULL) { 1785 /* port not found */ 1786 rc = ENOENT; 1787 goto bail; 1788 } 1789 } 1790 1791 /* clear the promiscous mode for the specified ports */ 1792 for (i = 0; i < nports && rc == 0; i++) { 1793 /* lookup port */ 1794 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid); 1795 ASSERT(port != NULL); 1796 1797 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1798 rc = aggr_port_promisc(port, B_FALSE); 1799 mac_perim_exit(pmph); 1800 } 1801 if (rc != 0) { 1802 for (i = 0; i < nports; i++) { 1803 port = aggr_grp_port_lookup(grp, 1804 ports[i].lp_linkid); 1805 ASSERT(port != NULL); 1806 1807 /* 1808 * Turn the promiscuous mode back on if it is required 1809 * to receive the non-primary address over a port, or 1810 * the promiscous mode is enabled over the aggr. 1811 */ 1812 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1813 if (port->lp_started && (grp->lg_promisc || 1814 port->lp_prom_addr != NULL)) { 1815 (void) aggr_port_promisc(port, B_TRUE); 1816 } 1817 mac_perim_exit(pmph); 1818 } 1819 goto bail; 1820 } 1821 1822 /* remove the specified ports from group */ 1823 for (i = 0; i < nports; i++) { 1824 /* lookup port */ 1825 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid); 1826 ASSERT(port != NULL); 1827 1828 /* stop port if group has already been started */ 1829 if (grp->lg_started) { 1830 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1831 aggr_port_stop(port); 1832 mac_perim_exit(pmph); 1833 } 1834 1835 /* 1836 * aggr_rem_pseudo_tx_group() is not called here. Instead 1837 * it is called from inside aggr_grp_rem_port() after the 1838 * port has been detached. 
The reason is that 1839 * aggr_rem_pseudo_tx_group() removes one ring at a time 1840 * and if there is still traffic going on, then there 1841 * is the possibility of aggr_find_tx_ring() returning a 1842 * removed ring for transmission. Once the port has been 1843 * detached, that port will not be used and 1844 * aggr_find_tx_ring() will not return any rings 1845 * belonging to it. 1846 */ 1847 for (uint_t j = 0; j < grp->lg_rx_group_count; j++) 1848 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[j]); 1849 1850 /* remove port from group */ 1851 rc = aggr_grp_rem_port(grp, port, &mac_addr_changed, 1852 &link_state_changed); 1853 ASSERT(rc == 0); 1854 mac_addr_update = mac_addr_update || mac_addr_changed; 1855 link_state_update = link_state_update || link_state_changed; 1856 } 1857 1858 bail: 1859 if (mac_addr_update) 1860 mac_unicst_update(grp->lg_mh, grp->lg_addr); 1861 if (link_state_update) 1862 mac_link_update(grp->lg_mh, grp->lg_link_state); 1863 1864 mac_perim_exit(mph); 1865 AGGR_GRP_REFRELE(grp); 1866 1867 return (rc); 1868 } 1869 1870 int 1871 aggr_grp_delete(datalink_id_t linkid, cred_t *cred) 1872 { 1873 aggr_grp_t *grp = NULL; 1874 aggr_port_t *port, *cport; 1875 datalink_id_t tmpid; 1876 mod_hash_val_t val; 1877 mac_perim_handle_t mph, pmph; 1878 int err; 1879 kt_did_t tid = 0; 1880 1881 rw_enter(&aggr_grp_lock, RW_WRITER); 1882 1883 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1884 (mod_hash_val_t *)&grp) != 0) { 1885 rw_exit(&aggr_grp_lock); 1886 return (ENOENT); 1887 } 1888 1889 /* 1890 * Note that dls_devnet_destroy() must be called before lg_lock is 1891 * held. Otherwise, it will deadlock if another thread is in 1892 * aggr_m_stat() and thus has a kstat_hold() on the kstats that 1893 * dls_devnet_destroy() needs to delete. 
1894 */ 1895 if ((err = dls_devnet_destroy(grp->lg_mh, &tmpid, B_TRUE)) != 0) { 1896 rw_exit(&aggr_grp_lock); 1897 return (err); 1898 } 1899 ASSERT(linkid == tmpid); 1900 1901 /* 1902 * Unregister from the MAC service module. Since this can 1903 * fail if a client hasn't closed the MAC port, we gracefully 1904 * fail the operation. 1905 */ 1906 if ((err = mac_disable(grp->lg_mh)) != 0) { 1907 (void) dls_devnet_create(grp->lg_mh, linkid, crgetzoneid(cred)); 1908 rw_exit(&aggr_grp_lock); 1909 return (err); 1910 } 1911 (void) mod_hash_remove(aggr_grp_hash, GRP_HASH_KEY(linkid), &val); 1912 ASSERT(grp == (aggr_grp_t *)val); 1913 1914 ASSERT(aggr_grp_cnt > 0); 1915 aggr_grp_cnt--; 1916 rw_exit(&aggr_grp_lock); 1917 1918 /* 1919 * Inform the lacp_rx thread to exit. 1920 */ 1921 mutex_enter(&grp->lg_lacp_lock); 1922 grp->lg_lacp_done = B_TRUE; 1923 cv_signal(&grp->lg_lacp_cv); 1924 while (grp->lg_lacp_rx_thread != NULL) 1925 cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock); 1926 mutex_exit(&grp->lg_lacp_lock); 1927 /* 1928 * Inform the tx_notify_thread to exit. 
1929 */ 1930 mutex_enter(&grp->lg_tx_flowctl_lock); 1931 if (grp->lg_tx_notify_thread != NULL) { 1932 tid = grp->lg_tx_notify_thread->t_did; 1933 grp->lg_tx_notify_done = B_TRUE; 1934 cv_signal(&grp->lg_tx_flowctl_cv); 1935 } 1936 mutex_exit(&grp->lg_tx_flowctl_lock); 1937 if (tid != 0) 1938 thread_join(tid); 1939 1940 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1941 1942 grp->lg_closing = B_TRUE; 1943 /* detach and free MAC ports associated with group */ 1944 port = grp->lg_ports; 1945 while (port != NULL) { 1946 cport = port->lp_next; 1947 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1948 if (grp->lg_started) 1949 aggr_port_stop(port); 1950 (void) aggr_grp_detach_port(grp, port); 1951 mac_perim_exit(pmph); 1952 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group); 1953 for (uint_t i = 0; i < grp->lg_rx_group_count; i++) 1954 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[i]); 1955 aggr_port_delete(port); 1956 port = cport; 1957 } 1958 1959 mac_perim_exit(mph); 1960 1961 kmem_free(grp->lg_tx_blocked_rings, 1962 (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP)); 1963 /* 1964 * Wait for the port's lacp timer thread and its notification callback 1965 * to exit before calling mac_unregister() since both needs to access 1966 * the mac perimeter of the grp. 
1967 */ 1968 aggr_grp_port_wait(grp); 1969 1970 VERIFY(mac_unregister(grp->lg_mh) == 0); 1971 grp->lg_mh = NULL; 1972 1973 for (uint_t i = 0; i < MAX_GROUPS_PER_PORT; i++) { 1974 list_destroy(&(grp->lg_rx_groups[i].arg_vlans)); 1975 } 1976 1977 AGGR_GRP_REFRELE(grp); 1978 return (0); 1979 } 1980 1981 void 1982 aggr_grp_free(aggr_grp_t *grp) 1983 { 1984 ASSERT(grp->lg_refs == 0); 1985 ASSERT(grp->lg_port_ref == 0); 1986 if (grp->lg_key > AGGR_MAX_KEY) { 1987 id_free(key_ids, grp->lg_key); 1988 grp->lg_key = 0; 1989 } 1990 kmem_cache_free(aggr_grp_cache, grp); 1991 } 1992 1993 int 1994 aggr_grp_info(datalink_id_t linkid, void *fn_arg, 1995 aggr_grp_info_new_grp_fn_t new_grp_fn, 1996 aggr_grp_info_new_port_fn_t new_port_fn, cred_t *cred) 1997 { 1998 aggr_grp_t *grp; 1999 aggr_port_t *port; 2000 mac_perim_handle_t mph, pmph; 2001 int rc = 0; 2002 2003 /* 2004 * Make sure that the aggregation link is visible from the caller's 2005 * zone. 2006 */ 2007 if (!dls_devnet_islinkvisible(linkid, crgetzoneid(cred))) 2008 return (ENOENT); 2009 2010 rw_enter(&aggr_grp_lock, RW_READER); 2011 2012 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 2013 (mod_hash_val_t *)&grp) != 0) { 2014 rw_exit(&aggr_grp_lock); 2015 return (ENOENT); 2016 } 2017 AGGR_GRP_REFHOLD(grp); 2018 2019 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2020 rw_exit(&aggr_grp_lock); 2021 2022 rc = new_grp_fn(fn_arg, grp->lg_linkid, 2023 (grp->lg_key > AGGR_MAX_KEY) ? 
0 : grp->lg_key, grp->lg_addr, 2024 grp->lg_addr_fixed, grp->lg_force, grp->lg_tx_policy, 2025 grp->lg_nports, grp->lg_lacp_mode, grp->aggr.PeriodicTimer); 2026 2027 if (rc != 0) 2028 goto bail; 2029 2030 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2031 mac_perim_enter_by_mh(port->lp_mh, &pmph); 2032 rc = new_port_fn(fn_arg, port->lp_linkid, port->lp_addr, 2033 port->lp_state, &port->lp_lacp.ActorOperPortState); 2034 mac_perim_exit(pmph); 2035 2036 if (rc != 0) 2037 goto bail; 2038 } 2039 2040 bail: 2041 mac_perim_exit(mph); 2042 AGGR_GRP_REFRELE(grp); 2043 return (rc); 2044 } 2045 2046 /*ARGSUSED*/ 2047 static void 2048 aggr_m_ioctl(void *arg, queue_t *q, mblk_t *mp) 2049 { 2050 miocnak(q, mp, 0, ENOTSUP); 2051 } 2052 2053 static int 2054 aggr_grp_stat(aggr_grp_t *grp, uint_t stat, uint64_t *val) 2055 { 2056 aggr_port_t *port; 2057 uint_t stat_index; 2058 2059 ASSERT(MUTEX_HELD(&grp->lg_stat_lock)); 2060 2061 /* We only aggregate counter statistics. */ 2062 if (IS_MAC_STAT(stat) && !MAC_STAT_ISACOUNTER(stat) || 2063 IS_MACTYPE_STAT(stat) && !ETHER_STAT_ISACOUNTER(stat)) { 2064 return (ENOTSUP); 2065 } 2066 2067 /* 2068 * Counter statistics for a group are computed by aggregating the 2069 * counters of the members MACs while they were aggregated, plus 2070 * the residual counter of the group itself, which is updated each 2071 * time a MAC is removed from the group. 2072 */ 2073 *val = 0; 2074 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2075 /* actual port statistic */ 2076 *val += aggr_port_stat(port, stat); 2077 /* 2078 * minus the port stat when it was added, plus any residual 2079 * amount for the group. 
2080 */ 2081 if (IS_MAC_STAT(stat)) { 2082 stat_index = stat - MAC_STAT_MIN; 2083 *val -= port->lp_stat[stat_index]; 2084 *val += grp->lg_stat[stat_index]; 2085 } else if (IS_MACTYPE_STAT(stat)) { 2086 stat_index = stat - MACTYPE_STAT_MIN; 2087 *val -= port->lp_ether_stat[stat_index]; 2088 *val += grp->lg_ether_stat[stat_index]; 2089 } 2090 } 2091 return (0); 2092 } 2093 2094 int 2095 aggr_rx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val) 2096 { 2097 aggr_pseudo_rx_ring_t *rx_ring = (aggr_pseudo_rx_ring_t *)rdriver; 2098 2099 if (rx_ring->arr_hw_rh != NULL) { 2100 *val = mac_pseudo_rx_ring_stat_get(rx_ring->arr_hw_rh, stat); 2101 } else { 2102 aggr_port_t *port = rx_ring->arr_port; 2103 2104 *val = mac_stat_get(port->lp_mh, stat); 2105 2106 } 2107 return (0); 2108 } 2109 2110 int 2111 aggr_tx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val) 2112 { 2113 aggr_pseudo_tx_ring_t *tx_ring = (aggr_pseudo_tx_ring_t *)rdriver; 2114 2115 if (tx_ring->atr_hw_rh != NULL) { 2116 *val = mac_pseudo_tx_ring_stat_get(tx_ring->atr_hw_rh, stat); 2117 } else { 2118 aggr_port_t *port = tx_ring->atr_port; 2119 2120 *val = mac_stat_get(port->lp_mh, stat); 2121 } 2122 return (0); 2123 } 2124 2125 static int 2126 aggr_m_stat(void *arg, uint_t stat, uint64_t *val) 2127 { 2128 aggr_grp_t *grp = arg; 2129 int rval = 0; 2130 2131 mutex_enter(&grp->lg_stat_lock); 2132 2133 switch (stat) { 2134 case MAC_STAT_IFSPEED: 2135 *val = grp->lg_ifspeed; 2136 break; 2137 2138 case ETHER_STAT_LINK_DUPLEX: 2139 *val = grp->lg_link_duplex; 2140 break; 2141 2142 default: 2143 /* 2144 * For all other statistics, we return the aggregated stat 2145 * from the underlying ports. aggr_grp_stat() will set 2146 * rval appropriately if the statistic isn't a counter. 
2147 */ 2148 rval = aggr_grp_stat(grp, stat, val); 2149 } 2150 2151 mutex_exit(&grp->lg_stat_lock); 2152 return (rval); 2153 } 2154 2155 static int 2156 aggr_m_start(void *arg) 2157 { 2158 aggr_grp_t *grp = arg; 2159 aggr_port_t *port; 2160 mac_perim_handle_t mph, pmph; 2161 2162 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2163 2164 /* 2165 * Attempts to start all configured members of the group. 2166 * Group members will be attached when their link-up notification 2167 * is received. 2168 */ 2169 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2170 mac_perim_enter_by_mh(port->lp_mh, &pmph); 2171 if (aggr_port_start(port) != 0) { 2172 mac_perim_exit(pmph); 2173 continue; 2174 } 2175 2176 /* 2177 * Turn on the promiscuous mode if it is required to receive 2178 * the non-primary address over a port, or the promiscous 2179 * mode is enabled over the aggr. 2180 */ 2181 if (grp->lg_promisc || port->lp_prom_addr != NULL) { 2182 if (aggr_port_promisc(port, B_TRUE) != 0) 2183 aggr_port_stop(port); 2184 } 2185 mac_perim_exit(pmph); 2186 } 2187 2188 grp->lg_started = B_TRUE; 2189 2190 mac_perim_exit(mph); 2191 return (0); 2192 } 2193 2194 static void 2195 aggr_m_stop(void *arg) 2196 { 2197 aggr_grp_t *grp = arg; 2198 aggr_port_t *port; 2199 mac_perim_handle_t mph, pmph; 2200 2201 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2202 2203 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2204 mac_perim_enter_by_mh(port->lp_mh, &pmph); 2205 2206 /* reset port promiscuous mode */ 2207 (void) aggr_port_promisc(port, B_FALSE); 2208 2209 aggr_port_stop(port); 2210 mac_perim_exit(pmph); 2211 } 2212 2213 grp->lg_started = B_FALSE; 2214 mac_perim_exit(mph); 2215 } 2216 2217 static int 2218 aggr_m_promisc(void *arg, boolean_t on) 2219 { 2220 aggr_grp_t *grp = arg; 2221 aggr_port_t *port; 2222 boolean_t link_state_changed = B_FALSE; 2223 mac_perim_handle_t mph, pmph; 2224 2225 AGGR_GRP_REFHOLD(grp); 2226 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2227 2228 
ASSERT(!grp->lg_closing); 2229 2230 if (on == grp->lg_promisc) 2231 goto bail; 2232 2233 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2234 int err = 0; 2235 2236 mac_perim_enter_by_mh(port->lp_mh, &pmph); 2237 AGGR_PORT_REFHOLD(port); 2238 if (!on && (port->lp_prom_addr == NULL)) 2239 err = aggr_port_promisc(port, B_FALSE); 2240 else if (on && port->lp_started) 2241 err = aggr_port_promisc(port, B_TRUE); 2242 2243 if (err != 0) { 2244 if (aggr_grp_detach_port(grp, port)) 2245 link_state_changed = B_TRUE; 2246 } else { 2247 /* 2248 * If a port was detached because of a previous 2249 * failure changing the promiscuity, the port 2250 * is reattached when it successfully changes 2251 * the promiscuity now, and this might cause 2252 * the link state of the aggregation to change. 2253 */ 2254 if (aggr_grp_attach_port(grp, port)) 2255 link_state_changed = B_TRUE; 2256 } 2257 mac_perim_exit(pmph); 2258 AGGR_PORT_REFRELE(port); 2259 } 2260 2261 grp->lg_promisc = on; 2262 2263 if (link_state_changed) 2264 mac_link_update(grp->lg_mh, grp->lg_link_state); 2265 2266 bail: 2267 mac_perim_exit(mph); 2268 AGGR_GRP_REFRELE(grp); 2269 2270 return (0); 2271 } 2272 2273 static void 2274 aggr_grp_port_rename(const char *new_name, void *arg) 2275 { 2276 /* 2277 * aggr port's mac client name is the format of "aggr link name" plus 2278 * AGGR_PORT_NAME_DELIMIT plus "underneath link name". 
2279 */ 2280 int aggr_len, link_len, clnt_name_len, i; 2281 char *str_end, *str_st, *str_del; 2282 char aggr_name[MAXNAMELEN]; 2283 char link_name[MAXNAMELEN]; 2284 char *clnt_name; 2285 aggr_grp_t *aggr_grp = arg; 2286 aggr_port_t *aggr_port = aggr_grp->lg_ports; 2287 2288 for (i = 0; i < aggr_grp->lg_nports; i++) { 2289 clnt_name = mac_client_name(aggr_port->lp_mch); 2290 clnt_name_len = strlen(clnt_name); 2291 str_st = clnt_name; 2292 str_end = &(clnt_name[clnt_name_len]); 2293 str_del = strchr(str_st, AGGR_PORT_NAME_DELIMIT); 2294 ASSERT(str_del != NULL); 2295 aggr_len = (intptr_t)((uintptr_t)str_del - (uintptr_t)str_st); 2296 link_len = (intptr_t)((uintptr_t)str_end - (uintptr_t)str_del); 2297 bzero(aggr_name, MAXNAMELEN); 2298 bzero(link_name, MAXNAMELEN); 2299 bcopy(clnt_name, aggr_name, aggr_len); 2300 bcopy(str_del, link_name, link_len + 1); 2301 bzero(clnt_name, MAXNAMELEN); 2302 (void) snprintf(clnt_name, MAXNAMELEN, "%s%s", new_name, 2303 link_name); 2304 2305 (void) mac_rename_primary(aggr_port->lp_mh, NULL); 2306 aggr_port = aggr_port->lp_next; 2307 } 2308 } 2309 2310 /* 2311 * Initialize the capabilities that are advertised for the group 2312 * according to the capabilities of the constituent ports. 
2313 */ 2314 static boolean_t 2315 aggr_m_capab_get(void *arg, mac_capab_t cap, void *cap_data) 2316 { 2317 aggr_grp_t *grp = arg; 2318 2319 switch (cap) { 2320 case MAC_CAPAB_HCKSUM: { 2321 uint32_t *hcksum_txflags = cap_data; 2322 *hcksum_txflags = grp->lg_hcksum_txflags; 2323 break; 2324 } 2325 case MAC_CAPAB_LSO: { 2326 mac_capab_lso_t *cap_lso = cap_data; 2327 2328 if (grp->lg_lso) { 2329 *cap_lso = grp->lg_cap_lso; 2330 break; 2331 } else { 2332 return (B_FALSE); 2333 } 2334 } 2335 case MAC_CAPAB_NO_NATIVEVLAN: 2336 return (!grp->lg_vlan); 2337 case MAC_CAPAB_NO_ZCOPY: 2338 return (!grp->lg_zcopy); 2339 case MAC_CAPAB_RINGS: { 2340 mac_capab_rings_t *cap_rings = cap_data; 2341 uint_t ring_cnt = 0; 2342 2343 for (uint_t i = 0; i < grp->lg_rx_group_count; i++) 2344 ring_cnt += grp->lg_rx_groups[i].arg_ring_cnt; 2345 2346 if (cap_rings->mr_type == MAC_RING_TYPE_RX) { 2347 cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; 2348 cap_rings->mr_rnum = ring_cnt; 2349 cap_rings->mr_gnum = grp->lg_rx_group_count; 2350 cap_rings->mr_gaddring = NULL; 2351 cap_rings->mr_gremring = NULL; 2352 } else { 2353 cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; 2354 cap_rings->mr_rnum = grp->lg_tx_group.atg_ring_cnt; 2355 cap_rings->mr_gnum = 0; 2356 } 2357 cap_rings->mr_rget = aggr_fill_ring; 2358 cap_rings->mr_gget = aggr_fill_group; 2359 break; 2360 } 2361 case MAC_CAPAB_AGGR: 2362 { 2363 mac_capab_aggr_t *aggr_cap; 2364 2365 if (cap_data != NULL) { 2366 aggr_cap = cap_data; 2367 aggr_cap->mca_rename_fn = aggr_grp_port_rename; 2368 aggr_cap->mca_unicst = aggr_m_unicst; 2369 aggr_cap->mca_find_tx_ring_fn = aggr_find_tx_ring; 2370 aggr_cap->mca_arg = arg; 2371 } 2372 return (B_TRUE); 2373 } 2374 default: 2375 return (B_FALSE); 2376 } 2377 return (B_TRUE); 2378 } 2379 2380 /* 2381 * Callback function for MAC layer to register groups. 
2382 */ 2383 static void 2384 aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index, 2385 mac_group_info_t *infop, mac_group_handle_t gh) 2386 { 2387 aggr_grp_t *grp = arg; 2388 2389 if (rtype == MAC_RING_TYPE_RX) { 2390 aggr_pseudo_rx_group_t *rx_group = &grp->lg_rx_groups[index]; 2391 2392 rx_group->arg_gh = gh; 2393 rx_group->arg_grp = grp; 2394 2395 infop->mgi_driver = (mac_group_driver_t)rx_group; 2396 infop->mgi_start = NULL; 2397 infop->mgi_stop = NULL; 2398 infop->mgi_addmac = aggr_addmac; 2399 infop->mgi_remmac = aggr_remmac; 2400 infop->mgi_count = rx_group->arg_ring_cnt; 2401 2402 /* 2403 * Always set the HW VLAN callbacks. They are smart 2404 * enough to know when a port has HW VLAN filters to 2405 * program and when it doesn't. 2406 */ 2407 infop->mgi_addvlan = aggr_addvlan; 2408 infop->mgi_remvlan = aggr_remvlan; 2409 } else { 2410 aggr_pseudo_tx_group_t *tx_group = &grp->lg_tx_group; 2411 2412 ASSERT3S(index, ==, 0); 2413 tx_group->atg_gh = gh; 2414 } 2415 } 2416 2417 /* 2418 * Callback funtion for MAC layer to register all rings. 2419 */ 2420 static void 2421 aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index, 2422 const int index, mac_ring_info_t *infop, mac_ring_handle_t rh) 2423 { 2424 aggr_grp_t *grp = arg; 2425 2426 switch (rtype) { 2427 case MAC_RING_TYPE_RX: { 2428 aggr_pseudo_rx_group_t *rx_group; 2429 aggr_pseudo_rx_ring_t *rx_ring; 2430 mac_intr_t aggr_mac_intr; 2431 2432 rx_group = &grp->lg_rx_groups[rg_index]; 2433 ASSERT3S(index, >=, 0); 2434 ASSERT3S(index, <, rx_group->arg_ring_cnt); 2435 rx_ring = rx_group->arg_rings + index; 2436 rx_ring->arr_rh = rh; 2437 2438 /* 2439 * Entrypoint to enable interrupt (disable poll) and 2440 * disable interrupt (enable poll). 
2441 */ 2442 aggr_mac_intr.mi_handle = (mac_intr_handle_t)rx_ring; 2443 aggr_mac_intr.mi_enable = aggr_pseudo_enable_intr; 2444 aggr_mac_intr.mi_disable = aggr_pseudo_disable_intr; 2445 aggr_mac_intr.mi_ddi_handle = NULL; 2446 2447 infop->mri_driver = (mac_ring_driver_t)rx_ring; 2448 infop->mri_start = aggr_pseudo_start_rx_ring; 2449 infop->mri_stop = aggr_pseudo_stop_rx_ring; 2450 2451 infop->mri_intr = aggr_mac_intr; 2452 infop->mri_poll = aggr_rx_poll; 2453 2454 infop->mri_stat = aggr_rx_ring_stat; 2455 break; 2456 } 2457 case MAC_RING_TYPE_TX: { 2458 aggr_pseudo_tx_group_t *tx_group = &grp->lg_tx_group; 2459 aggr_pseudo_tx_ring_t *tx_ring; 2460 2461 ASSERT(rg_index == -1); 2462 ASSERT(index < tx_group->atg_ring_cnt); 2463 2464 tx_ring = &tx_group->atg_rings[index]; 2465 tx_ring->atr_rh = rh; 2466 2467 infop->mri_driver = (mac_ring_driver_t)tx_ring; 2468 infop->mri_start = NULL; 2469 infop->mri_stop = NULL; 2470 infop->mri_tx = aggr_ring_tx; 2471 infop->mri_stat = aggr_tx_ring_stat; 2472 /* 2473 * Use the hw TX ring handle to find if the ring needs 2474 * serialization or not. For NICs that do not expose 2475 * Tx rings, atr_hw_rh will be NULL. 
2476 */ 2477 if (tx_ring->atr_hw_rh != NULL) { 2478 infop->mri_flags = 2479 mac_hwring_getinfo(tx_ring->atr_hw_rh); 2480 } 2481 break; 2482 } 2483 default: 2484 break; 2485 } 2486 } 2487 2488 static mblk_t * 2489 aggr_rx_poll(void *arg, int bytes_to_pickup) 2490 { 2491 aggr_pseudo_rx_ring_t *rr_ring = arg; 2492 aggr_port_t *port = rr_ring->arr_port; 2493 aggr_grp_t *grp = port->lp_grp; 2494 mblk_t *mp_chain, *mp, **mpp; 2495 2496 mp_chain = mac_hwring_poll(rr_ring->arr_hw_rh, bytes_to_pickup); 2497 2498 if (grp->lg_lacp_mode == AGGR_LACP_OFF) 2499 return (mp_chain); 2500 2501 mpp = &mp_chain; 2502 while ((mp = *mpp) != NULL) { 2503 if (MBLKL(mp) >= sizeof (struct ether_header)) { 2504 struct ether_header *ehp; 2505 2506 ehp = (struct ether_header *)mp->b_rptr; 2507 if (ntohs(ehp->ether_type) == ETHERTYPE_SLOW) { 2508 *mpp = mp->b_next; 2509 mp->b_next = NULL; 2510 aggr_recv_lacp(port, 2511 (mac_resource_handle_t)rr_ring, mp); 2512 continue; 2513 } 2514 } 2515 2516 if (!port->lp_collector_enabled) { 2517 *mpp = mp->b_next; 2518 mp->b_next = NULL; 2519 freemsg(mp); 2520 continue; 2521 } 2522 mpp = &mp->b_next; 2523 } 2524 return (mp_chain); 2525 } 2526 2527 static int 2528 aggr_addmac(void *arg, const uint8_t *mac_addr) 2529 { 2530 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg; 2531 aggr_unicst_addr_t *addr, **pprev; 2532 aggr_grp_t *grp = rx_group->arg_grp; 2533 aggr_port_t *port, *p; 2534 mac_perim_handle_t mph; 2535 int err = 0; 2536 uint_t idx = rx_group->arg_index; 2537 2538 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2539 2540 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) { 2541 mac_perim_exit(mph); 2542 return (0); 2543 } 2544 2545 /* 2546 * Insert this mac address into the list of mac addresses owned by 2547 * the aggregation pseudo group. 
2548 */ 2549 pprev = &rx_group->arg_macaddr; 2550 while ((addr = *pprev) != NULL) { 2551 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) == 0) { 2552 mac_perim_exit(mph); 2553 return (EEXIST); 2554 } 2555 pprev = &addr->aua_next; 2556 } 2557 addr = kmem_alloc(sizeof (aggr_unicst_addr_t), KM_SLEEP); 2558 bcopy(mac_addr, addr->aua_addr, ETHERADDRL); 2559 addr->aua_next = NULL; 2560 *pprev = addr; 2561 2562 for (port = grp->lg_ports; port != NULL; port = port->lp_next) 2563 if ((err = aggr_port_addmac(port, idx, mac_addr)) != 0) 2564 break; 2565 2566 if (err != 0) { 2567 for (p = grp->lg_ports; p != port; p = p->lp_next) 2568 aggr_port_remmac(p, idx, mac_addr); 2569 2570 *pprev = NULL; 2571 kmem_free(addr, sizeof (aggr_unicst_addr_t)); 2572 } 2573 2574 mac_perim_exit(mph); 2575 return (err); 2576 } 2577 2578 static int 2579 aggr_remmac(void *arg, const uint8_t *mac_addr) 2580 { 2581 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg; 2582 aggr_unicst_addr_t *addr, **pprev; 2583 aggr_grp_t *grp = rx_group->arg_grp; 2584 aggr_port_t *port; 2585 mac_perim_handle_t mph; 2586 int err = 0; 2587 2588 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2589 2590 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) { 2591 mac_perim_exit(mph); 2592 return (0); 2593 } 2594 2595 /* 2596 * Insert this mac address into the list of mac addresses owned by 2597 * the aggregation pseudo group. 
2598 */ 2599 pprev = &rx_group->arg_macaddr; 2600 while ((addr = *pprev) != NULL) { 2601 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) != 0) { 2602 pprev = &addr->aua_next; 2603 continue; 2604 } 2605 break; 2606 } 2607 if (addr == NULL) { 2608 mac_perim_exit(mph); 2609 return (EINVAL); 2610 } 2611 2612 for (port = grp->lg_ports; port != NULL; port = port->lp_next) 2613 aggr_port_remmac(port, rx_group->arg_index, mac_addr); 2614 2615 *pprev = addr->aua_next; 2616 kmem_free(addr, sizeof (aggr_unicst_addr_t)); 2617 2618 mac_perim_exit(mph); 2619 return (err); 2620 } 2621 2622 /* 2623 * Search for VID in the Rx group's list and return a pointer if 2624 * found. Otherwise return NULL. 2625 */ 2626 static aggr_vlan_t * 2627 aggr_find_vlan(aggr_pseudo_rx_group_t *rx_group, uint16_t vid) 2628 { 2629 ASSERT(MAC_PERIM_HELD(rx_group->arg_grp->lg_mh)); 2630 for (aggr_vlan_t *avp = list_head(&rx_group->arg_vlans); avp != NULL; 2631 avp = list_next(&rx_group->arg_vlans, avp)) { 2632 if (avp->av_vid == vid) 2633 return (avp); 2634 } 2635 2636 return (NULL); 2637 } 2638 2639 /* 2640 * Accept traffic on the specified VID. 2641 * 2642 * Persist VLAN state in the aggr so that ports added later will 2643 * receive the correct filters. In the future it would be nice to 2644 * allow aggr to iterate its clients instead of duplicating state. 2645 */ 2646 static int 2647 aggr_addvlan(mac_group_driver_t gdriver, uint16_t vid) 2648 { 2649 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)gdriver; 2650 aggr_grp_t *aggr = rx_group->arg_grp; 2651 aggr_port_t *port, *p; 2652 mac_perim_handle_t mph; 2653 int err = 0; 2654 aggr_vlan_t *avp = NULL; 2655 uint_t idx = rx_group->arg_index; 2656 2657 mac_perim_enter_by_mh(aggr->lg_mh, &mph); 2658 2659 if (vid == MAC_VLAN_UNTAGGED) { 2660 /* 2661 * Aggr is both a MAC provider and MAC client. As a 2662 * MAC provider it is passed MAC_VLAN_UNTAGGED by its 2663 * client. As a client itself, it should pass 2664 * VLAN_ID_NONE to its ports. 
2665 */ 2666 vid = VLAN_ID_NONE; 2667 rx_group->arg_untagged++; 2668 goto update_ports; 2669 } 2670 2671 avp = aggr_find_vlan(rx_group, vid); 2672 2673 if (avp != NULL) { 2674 avp->av_refs++; 2675 mac_perim_exit(mph); 2676 return (0); 2677 } 2678 2679 avp = kmem_zalloc(sizeof (aggr_vlan_t), KM_SLEEP); 2680 avp->av_vid = vid; 2681 avp->av_refs = 1; 2682 2683 update_ports: 2684 for (port = aggr->lg_ports; port != NULL; port = port->lp_next) 2685 if ((err = aggr_port_addvlan(port, idx, vid)) != 0) 2686 break; 2687 2688 if (err != 0) { 2689 /* 2690 * If any of these calls fail then we are in a 2691 * situation where the ports have different HW state. 2692 * There's no reasonable action the MAC client can 2693 * take in this scenario to rectify the situation. 2694 */ 2695 for (p = aggr->lg_ports; p != port; p = p->lp_next) { 2696 int err2; 2697 2698 if ((err2 = aggr_port_remvlan(p, idx, vid)) != 0) { 2699 cmn_err(CE_WARN, "Failed to remove VLAN %u" 2700 " from port %s: errno %d.", vid, 2701 mac_client_name(p->lp_mch), err2); 2702 } 2703 2704 } 2705 2706 if (vid == VLAN_ID_NONE) 2707 rx_group->arg_untagged--; 2708 2709 if (avp != NULL) { 2710 kmem_free(avp, sizeof (aggr_vlan_t)); 2711 avp = NULL; 2712 } 2713 } 2714 2715 if (avp != NULL) 2716 list_insert_tail(&rx_group->arg_vlans, avp); 2717 2718 done: 2719 mac_perim_exit(mph); 2720 return (err); 2721 } 2722 2723 /* 2724 * Stop accepting traffic on this VLAN if it's the last use of this VLAN. 2725 */ 2726 static int 2727 aggr_remvlan(mac_group_driver_t gdriver, uint16_t vid) 2728 { 2729 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)gdriver; 2730 aggr_grp_t *aggr = rx_group->arg_grp; 2731 aggr_port_t *port, *p; 2732 mac_perim_handle_t mph; 2733 int err = 0; 2734 aggr_vlan_t *avp = NULL; 2735 uint_t idx = rx_group->arg_index; 2736 2737 mac_perim_enter_by_mh(aggr->lg_mh, &mph); 2738 2739 /* 2740 * See the comment in aggr_addvlan(). 
2741 */ 2742 if (vid == MAC_VLAN_UNTAGGED) { 2743 vid = VLAN_ID_NONE; 2744 rx_group->arg_untagged--; 2745 2746 if (rx_group->arg_untagged > 0) 2747 goto done; 2748 2749 goto update_ports; 2750 } 2751 2752 avp = aggr_find_vlan(rx_group, vid); 2753 2754 if (avp == NULL) { 2755 err = ENOENT; 2756 goto done; 2757 } 2758 2759 avp->av_refs--; 2760 2761 if (avp->av_refs > 0) 2762 goto done; 2763 2764 update_ports: 2765 for (port = aggr->lg_ports; port != NULL; port = port->lp_next) 2766 if ((err = aggr_port_remvlan(port, idx, vid)) != 0) 2767 break; 2768 2769 /* 2770 * See the comment in aggr_addvlan() for justification of the 2771 * use of VERIFY here. 2772 */ 2773 if (err != 0) { 2774 for (p = aggr->lg_ports; p != port; p = p->lp_next) { 2775 int err2; 2776 2777 if ((err2 = aggr_port_addvlan(p, idx, vid)) != 0) { 2778 cmn_err(CE_WARN, "Failed to add VLAN %u" 2779 " to port %s: errno %d.", vid, 2780 mac_client_name(p->lp_mch), err2); 2781 } 2782 } 2783 2784 if (avp != NULL) 2785 avp->av_refs++; 2786 2787 if (vid == VLAN_ID_NONE) 2788 rx_group->arg_untagged++; 2789 2790 goto done; 2791 } 2792 2793 if (err == 0 && avp != NULL) { 2794 VERIFY3U(avp->av_refs, ==, 0); 2795 list_remove(&rx_group->arg_vlans, avp); 2796 kmem_free(avp, sizeof (aggr_vlan_t)); 2797 } 2798 2799 done: 2800 mac_perim_exit(mph); 2801 return (err); 2802 } 2803 2804 /* 2805 * Add or remove the multicast addresses that are defined for the group 2806 * to or from the specified port. 2807 * 2808 * Note that aggr_grp_multicst_port(..., B_TRUE) is called when the port 2809 * is started and attached, and aggr_grp_multicst_port(..., B_FALSE) is 2810 * called when the port is either stopped or detached. 
2811 */ 2812 void 2813 aggr_grp_multicst_port(aggr_port_t *port, boolean_t add) 2814 { 2815 aggr_grp_t *grp = port->lp_grp; 2816 2817 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 2818 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 2819 2820 if (!port->lp_started || port->lp_state != AGGR_PORT_STATE_ATTACHED) 2821 return; 2822 2823 mac_multicast_refresh(grp->lg_mh, aggr_port_multicst, port, add); 2824 } 2825 2826 static int 2827 aggr_m_multicst(void *arg, boolean_t add, const uint8_t *addrp) 2828 { 2829 aggr_grp_t *grp = arg; 2830 aggr_port_t *port = NULL, *errport = NULL; 2831 mac_perim_handle_t mph; 2832 int err = 0; 2833 2834 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2835 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2836 if (port->lp_state != AGGR_PORT_STATE_ATTACHED || 2837 !port->lp_started) { 2838 continue; 2839 } 2840 err = aggr_port_multicst(port, add, addrp); 2841 if (err != 0) { 2842 errport = port; 2843 break; 2844 } 2845 } 2846 2847 /* 2848 * At least one port caused error return and this error is returned to 2849 * mac, eventually a NAK would be sent upwards. 2850 * Some ports have this multicast address listed now, and some don't. 2851 * Treat this error as a whole aggr failure not individual port failure. 2852 * Therefore remove this multicast address from other ports. 
2853 */ 2854 if ((err != 0) && add) { 2855 for (port = grp->lg_ports; port != errport; 2856 port = port->lp_next) { 2857 if (port->lp_state != AGGR_PORT_STATE_ATTACHED || 2858 !port->lp_started) { 2859 continue; 2860 } 2861 (void) aggr_port_multicst(port, B_FALSE, addrp); 2862 } 2863 } 2864 mac_perim_exit(mph); 2865 return (err); 2866 } 2867 2868 static int 2869 aggr_m_unicst(void *arg, const uint8_t *macaddr) 2870 { 2871 aggr_grp_t *grp = arg; 2872 mac_perim_handle_t mph; 2873 int err; 2874 2875 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2876 err = aggr_grp_modify_common(grp, AGGR_MODIFY_MAC, 0, B_TRUE, macaddr, 2877 0, 0); 2878 mac_perim_exit(mph); 2879 return (err); 2880 } 2881 2882 /* 2883 * Initialize the capabilities that are advertised for the group 2884 * according to the capabilities of the constituent ports. 2885 */ 2886 static void 2887 aggr_grp_capab_set(aggr_grp_t *grp) 2888 { 2889 uint32_t cksum; 2890 aggr_port_t *port; 2891 mac_capab_lso_t cap_lso; 2892 2893 ASSERT(grp->lg_mh == NULL); 2894 ASSERT(grp->lg_ports != NULL); 2895 2896 grp->lg_hcksum_txflags = (uint32_t)-1; 2897 grp->lg_zcopy = B_TRUE; 2898 grp->lg_vlan = B_TRUE; 2899 2900 grp->lg_lso = B_TRUE; 2901 grp->lg_cap_lso.lso_flags = (t_uscalar_t)-1; 2902 grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max = (t_uscalar_t)-1; 2903 2904 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2905 if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &cksum)) 2906 cksum = 0; 2907 grp->lg_hcksum_txflags &= cksum; 2908 2909 grp->lg_vlan &= 2910 !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL); 2911 2912 grp->lg_zcopy &= 2913 !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL); 2914 2915 grp->lg_lso &= 2916 mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso); 2917 if (grp->lg_lso) { 2918 grp->lg_cap_lso.lso_flags &= cap_lso.lso_flags; 2919 if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max > 2920 cap_lso.lso_basic_tcp_ipv4.lso_max) 2921 grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max = 2922 
cap_lso.lso_basic_tcp_ipv4.lso_max; 2923 } 2924 } 2925 } 2926 2927 /* 2928 * Checks whether the capabilities of the port being added are compatible 2929 * with the current capabilities of the aggregation. 2930 */ 2931 static boolean_t 2932 aggr_grp_capab_check(aggr_grp_t *grp, aggr_port_t *port) 2933 { 2934 uint32_t hcksum_txflags; 2935 2936 ASSERT(grp->lg_ports != NULL); 2937 2938 if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL)) & 2939 grp->lg_vlan) != grp->lg_vlan) { 2940 return (B_FALSE); 2941 } 2942 2943 if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL)) & 2944 grp->lg_zcopy) != grp->lg_zcopy) { 2945 return (B_FALSE); 2946 } 2947 2948 if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &hcksum_txflags)) { 2949 if (grp->lg_hcksum_txflags != 0) 2950 return (B_FALSE); 2951 } else if ((hcksum_txflags & grp->lg_hcksum_txflags) != 2952 grp->lg_hcksum_txflags) { 2953 return (B_FALSE); 2954 } 2955 2956 if (grp->lg_lso) { 2957 mac_capab_lso_t cap_lso; 2958 2959 if (mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso)) { 2960 if ((grp->lg_cap_lso.lso_flags & cap_lso.lso_flags) != 2961 grp->lg_cap_lso.lso_flags) 2962 return (B_FALSE); 2963 if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max > 2964 cap_lso.lso_basic_tcp_ipv4.lso_max) 2965 return (B_FALSE); 2966 } else { 2967 return (B_FALSE); 2968 } 2969 } 2970 2971 return (B_TRUE); 2972 } 2973 2974 /* 2975 * Returns the maximum SDU according to the SDU of the constituent ports. 
2976 */ 2977 static uint_t 2978 aggr_grp_max_sdu(aggr_grp_t *grp) 2979 { 2980 uint_t max_sdu = (uint_t)-1; 2981 aggr_port_t *port; 2982 2983 ASSERT(grp->lg_ports != NULL); 2984 2985 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2986 uint_t port_sdu_max; 2987 2988 mac_sdu_get(port->lp_mh, NULL, &port_sdu_max); 2989 if (max_sdu > port_sdu_max) 2990 max_sdu = port_sdu_max; 2991 } 2992 2993 return (max_sdu); 2994 } 2995 2996 /* 2997 * Checks if the maximum SDU of the specified port is compatible 2998 * with the maximum SDU of the specified aggregation group, returns 2999 * B_TRUE if it is, B_FALSE otherwise. 3000 */ 3001 static boolean_t 3002 aggr_grp_sdu_check(aggr_grp_t *grp, aggr_port_t *port) 3003 { 3004 uint_t port_sdu_max; 3005 3006 mac_sdu_get(port->lp_mh, NULL, &port_sdu_max); 3007 return (port_sdu_max >= grp->lg_max_sdu); 3008 } 3009 3010 /* 3011 * Returns the maximum margin according to the margin of the constituent ports. 3012 */ 3013 static uint32_t 3014 aggr_grp_max_margin(aggr_grp_t *grp) 3015 { 3016 uint32_t margin = UINT32_MAX; 3017 aggr_port_t *port; 3018 3019 ASSERT(grp->lg_mh == NULL); 3020 ASSERT(grp->lg_ports != NULL); 3021 3022 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 3023 if (margin > port->lp_margin) 3024 margin = port->lp_margin; 3025 } 3026 3027 grp->lg_margin = margin; 3028 return (margin); 3029 } 3030 3031 /* 3032 * Checks if the maximum margin of the specified port is compatible 3033 * with the maximum margin of the specified aggregation group, returns 3034 * B_TRUE if it is, B_FALSE otherwise. 3035 */ 3036 static boolean_t 3037 aggr_grp_margin_check(aggr_grp_t *grp, aggr_port_t *port) 3038 { 3039 if (port->lp_margin >= grp->lg_margin) 3040 return (B_TRUE); 3041 3042 /* 3043 * See whether the current margin value is allowed to be changed to 3044 * the new value. 
3045 */ 3046 if (!mac_margin_update(grp->lg_mh, port->lp_margin)) 3047 return (B_FALSE); 3048 3049 grp->lg_margin = port->lp_margin; 3050 return (B_TRUE); 3051 } 3052 3053 /* 3054 * Set MTU on individual ports of an aggregation group 3055 */ 3056 static int 3057 aggr_set_port_sdu(aggr_grp_t *grp, aggr_port_t *port, uint32_t sdu, 3058 uint32_t *old_mtu) 3059 { 3060 boolean_t removed = B_FALSE; 3061 mac_perim_handle_t mph; 3062 mac_diag_t diag; 3063 int err, rv, retry = 0; 3064 3065 if (port->lp_mah != NULL) { 3066 (void) mac_unicast_remove(port->lp_mch, port->lp_mah); 3067 port->lp_mah = NULL; 3068 removed = B_TRUE; 3069 } 3070 err = mac_set_mtu(port->lp_mh, sdu, old_mtu); 3071 try_again: 3072 if (removed && (rv = mac_unicast_add(port->lp_mch, NULL, 3073 MAC_UNICAST_PRIMARY | MAC_UNICAST_DISABLE_TX_VID_CHECK, 3074 &port->lp_mah, 0, &diag)) != 0) { 3075 /* 3076 * following is a workaround for a bug in 'bge' driver. 3077 * See CR 6794654 for more information and this work around 3078 * will be removed once the CR is fixed. 3079 */ 3080 if (rv == EIO && retry++ < 3) { 3081 delay(2 * hz); 3082 goto try_again; 3083 } 3084 /* 3085 * if mac_unicast_add() failed while setting the MTU, 3086 * detach the port from the group. 3087 */ 3088 mac_perim_enter_by_mh(port->lp_mh, &mph); 3089 (void) aggr_grp_detach_port(grp, port); 3090 mac_perim_exit(mph); 3091 cmn_err(CE_WARN, "Unable to restart the port %s while " 3092 "setting MTU. 
Detaching the port from the aggregation.", 3093 mac_client_name(port->lp_mch)); 3094 } 3095 return (err); 3096 } 3097 3098 static int 3099 aggr_sdu_update(aggr_grp_t *grp, uint32_t sdu) 3100 { 3101 int err = 0, i, rv; 3102 aggr_port_t *port; 3103 uint32_t *mtu; 3104 3105 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 3106 3107 /* 3108 * If the MTU being set is equal to aggr group's maximum 3109 * allowable value, then there is nothing to change 3110 */ 3111 if (sdu == grp->lg_max_sdu) 3112 return (0); 3113 3114 /* 0 is aggr group's min sdu */ 3115 if (sdu == 0) 3116 return (EINVAL); 3117 3118 mtu = kmem_alloc(sizeof (uint32_t) * grp->lg_nports, KM_SLEEP); 3119 for (port = grp->lg_ports, i = 0; port != NULL && err == 0; 3120 port = port->lp_next, i++) { 3121 err = aggr_set_port_sdu(grp, port, sdu, mtu + i); 3122 } 3123 if (err != 0) { 3124 /* recover from error: reset the mtus of the ports */ 3125 aggr_port_t *tmp; 3126 3127 for (tmp = grp->lg_ports, i = 0; tmp != port; 3128 tmp = tmp->lp_next, i++) { 3129 (void) aggr_set_port_sdu(grp, tmp, *(mtu + i), NULL); 3130 } 3131 goto bail; 3132 } 3133 grp->lg_max_sdu = aggr_grp_max_sdu(grp); 3134 rv = mac_maxsdu_update(grp->lg_mh, grp->lg_max_sdu); 3135 ASSERT(rv == 0); 3136 bail: 3137 kmem_free(mtu, sizeof (uint32_t) * grp->lg_nports); 3138 return (err); 3139 } 3140 3141 /* 3142 * Callback functions for set/get of properties 3143 */ 3144 /*ARGSUSED*/ 3145 static int 3146 aggr_m_setprop(void *m_driver, const char *pr_name, mac_prop_id_t pr_num, 3147 uint_t pr_valsize, const void *pr_val) 3148 { 3149 int err = ENOTSUP; 3150 aggr_grp_t *grp = m_driver; 3151 3152 switch (pr_num) { 3153 case MAC_PROP_MTU: { 3154 uint32_t mtu; 3155 3156 if (pr_valsize < sizeof (mtu)) { 3157 err = EINVAL; 3158 break; 3159 } 3160 bcopy(pr_val, &mtu, sizeof (mtu)); 3161 err = aggr_sdu_update(grp, mtu); 3162 break; 3163 } 3164 default: 3165 break; 3166 } 3167 return (err); 3168 } 3169 3170 typedef struct rboundary { 3171 uint32_t bval; 3172 int btype; 3173 } 
rboundary_t; 3174 3175 /* 3176 * This function finds the intersection of mtu ranges stored in arrays - 3177 * mrange[0] ... mrange[mcount -1]. It returns the intersection in rval. 3178 * Individual arrays are assumed to contain non-overlapping ranges. 3179 * Algorithm: 3180 * A range has two boundaries - min and max. We scan all arrays and store 3181 * each boundary as a separate element in a temporary array. We also store 3182 * the boundary types, min or max, as +1 or -1 respectively in the temporary 3183 * array. Then we sort the temporary array in ascending order. We scan the 3184 * sorted array from lower to higher values and keep a cumulative sum of 3185 * boundary types. Element in the temporary array for which the sum reaches 3186 * mcount is a min boundary of a range in the result and next element will be 3187 * max boundary. 3188 * 3189 * Example for mcount = 3, 3190 * 3191 * ----|_________|-------|_______|----|__|------ mrange[0] 3192 * 3193 * -------|________|--|____________|-----|___|-- mrange[1] 3194 * 3195 * --------|________________|-------|____|------ mrange[2] 3196 * 3197 * 3 2 1 3198 * \|/ 3199 * 1 23 2 1 2 3 2 1 01 2 V 0 <- the sum 3200 * ----|--||-----|-|--|--|--|----|-||-|--|---|-- sorted array 3201 * 3202 * same min and max 3203 * V 3204 * --------|_____|-------|__|------------|------ intersecting ranges 3205 */ 3206 void 3207 aggr_mtu_range_intersection(mac_propval_range_t **mrange, int mcount, 3208 mac_propval_uint32_range_t **prval, int *prmaxcnt, int *prcount) 3209 { 3210 mac_propval_uint32_range_t *rval, *ur; 3211 int rmaxcnt, rcount; 3212 size_t sz_range32; 3213 rboundary_t *ta; /* temporary array */ 3214 rboundary_t temp; 3215 boolean_t range_started = B_FALSE; 3216 int i, j, m, sum; 3217 3218 sz_range32 = sizeof (mac_propval_uint32_range_t); 3219 3220 for (i = 0, rmaxcnt = 0; i < mcount; i++) 3221 rmaxcnt += mrange[i]->mpr_count; 3222 3223 /* Allocate enough space to store the results */ 3224 rval = kmem_alloc(rmaxcnt * sz_range32, 
KM_SLEEP); 3225 3226 /* Number of boundaries are twice as many as ranges */ 3227 ta = kmem_alloc(2 * rmaxcnt * sizeof (rboundary_t), KM_SLEEP); 3228 3229 for (i = 0, m = 0; i < mcount; i++) { 3230 ur = &(mrange[i]->mpr_range_uint32[0]); 3231 for (j = 0; j < mrange[i]->mpr_count; j++) { 3232 ta[m].bval = ur[j].mpur_min; 3233 ta[m++].btype = 1; 3234 ta[m].bval = ur[j].mpur_max; 3235 ta[m++].btype = -1; 3236 } 3237 } 3238 3239 /* 3240 * Sort the temporary array in ascending order of bval; 3241 * if boundary values are same then sort on btype. 3242 */ 3243 for (i = 0; i < m-1; i++) { 3244 for (j = i+1; j < m; j++) { 3245 if ((ta[i].bval > ta[j].bval) || 3246 ((ta[i].bval == ta[j].bval) && 3247 (ta[i].btype < ta[j].btype))) { 3248 temp = ta[i]; 3249 ta[i] = ta[j]; 3250 ta[j] = temp; 3251 } 3252 } 3253 } 3254 3255 /* Walk through temporary array to find all ranges in the results */ 3256 for (i = 0, sum = 0, rcount = 0; i < m; i++) { 3257 sum += ta[i].btype; 3258 if (sum == mcount) { 3259 rval[rcount].mpur_min = ta[i].bval; 3260 range_started = B_TRUE; 3261 } else if (sum < mcount && range_started) { 3262 rval[rcount++].mpur_max = ta[i].bval; 3263 range_started = B_FALSE; 3264 } 3265 } 3266 3267 *prval = rval; 3268 *prmaxcnt = rmaxcnt; 3269 *prcount = rcount; 3270 3271 kmem_free(ta, 2 * rmaxcnt * sizeof (rboundary_t)); 3272 } 3273 3274 /* 3275 * Returns the mtu ranges which could be supported by aggr group. 3276 * prmaxcnt returns the size of the buffer prval, prcount returns 3277 * the number of valid entries in prval. Caller is responsible 3278 * for freeing up prval. 
3279 */ 3280 int 3281 aggr_grp_possible_mtu_range(aggr_grp_t *grp, mac_propval_uint32_range_t **prval, 3282 int *prmaxcnt, int *prcount) 3283 { 3284 mac_propval_range_t **vals; 3285 aggr_port_t *port; 3286 mac_perim_handle_t mph; 3287 uint_t i, numr; 3288 int err = 0; 3289 size_t sz_propval, sz_range32; 3290 size_t size; 3291 3292 sz_propval = sizeof (mac_propval_range_t); 3293 sz_range32 = sizeof (mac_propval_uint32_range_t); 3294 3295 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 3296 3297 vals = kmem_zalloc(sizeof (mac_propval_range_t *) * grp->lg_nports, 3298 KM_SLEEP); 3299 3300 for (port = grp->lg_ports, i = 0; port != NULL; 3301 port = port->lp_next, i++) { 3302 3303 size = sz_propval; 3304 vals[i] = kmem_alloc(size, KM_SLEEP); 3305 vals[i]->mpr_count = 1; 3306 3307 mac_perim_enter_by_mh(port->lp_mh, &mph); 3308 3309 err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL, 3310 NULL, 0, vals[i], NULL); 3311 if (err == ENOSPC) { 3312 /* 3313 * Not enough space to hold all ranges. 3314 * Allocate extra space as indicated and retry. 
3315 */ 3316 numr = vals[i]->mpr_count; 3317 kmem_free(vals[i], sz_propval); 3318 size = sz_propval + (numr - 1) * sz_range32; 3319 vals[i] = kmem_alloc(size, KM_SLEEP); 3320 vals[i]->mpr_count = numr; 3321 err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL, 3322 NULL, 0, vals[i], NULL); 3323 ASSERT(err != ENOSPC); 3324 } 3325 mac_perim_exit(mph); 3326 if (err != 0) { 3327 kmem_free(vals[i], size); 3328 vals[i] = NULL; 3329 break; 3330 } 3331 } 3332 3333 /* 3334 * if any of the underlying ports does not support changing MTU then 3335 * just return ENOTSUP 3336 */ 3337 if (port != NULL) { 3338 ASSERT(err != 0); 3339 goto done; 3340 } 3341 3342 aggr_mtu_range_intersection(vals, grp->lg_nports, prval, prmaxcnt, 3343 prcount); 3344 3345 done: 3346 for (i = 0; i < grp->lg_nports; i++) { 3347 if (vals[i] != NULL) { 3348 numr = vals[i]->mpr_count; 3349 size = sz_propval + (numr - 1) * sz_range32; 3350 kmem_free(vals[i], size); 3351 } 3352 } 3353 3354 kmem_free(vals, sizeof (mac_propval_range_t *) * grp->lg_nports); 3355 return (err); 3356 } 3357 3358 static void 3359 aggr_m_propinfo(void *m_driver, const char *pr_name, mac_prop_id_t pr_num, 3360 mac_prop_info_handle_t prh) 3361 { 3362 aggr_grp_t *grp = m_driver; 3363 mac_propval_uint32_range_t *rval = NULL; 3364 int i, rcount, rmaxcnt; 3365 int err = 0; 3366 3367 _NOTE(ARGUNUSED(pr_name)); 3368 3369 switch (pr_num) { 3370 case MAC_PROP_MTU: 3371 3372 err = aggr_grp_possible_mtu_range(grp, &rval, &rmaxcnt, 3373 &rcount); 3374 if (err != 0) { 3375 ASSERT(rval == NULL); 3376 return; 3377 } 3378 for (i = 0; i < rcount; i++) { 3379 mac_prop_info_set_range_uint32(prh, 3380 rval[i].mpur_min, rval[i].mpur_max); 3381 } 3382 kmem_free(rval, sizeof (mac_propval_uint32_range_t) * rmaxcnt); 3383 break; 3384 } 3385 } 3386