1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright 2020 Joyent, Inc. 24 */ 25 26 /* 27 * IEEE 802.3ad Link Aggregation -- Link Aggregation Groups. 28 * 29 * An instance of the structure aggr_grp_t is allocated for each 30 * link aggregation group. When created, aggr_grp_t objects are 31 * entered into the aggr_grp_hash hash table maintained by the modhash 32 * module. The hash key is the linkid associated with the link 33 * aggregation group. 34 * 35 * Each aggregation contains a set of ports. The port is represented 36 * by the aggr_port_t structure. A port consists of a single MAC 37 * client which has exclusive (MCIS_EXCLUSIVE) use of the underlying 38 * MAC. This client is used by the aggr to send and receive LACP 39 * traffic. Each port client takes on the same MAC unicast address -- 40 * the address of the aggregation itself (taken from the first port by 41 * default). 42 * 43 * The MAC client that hangs off each aggr port is not your typical 44 * MAC client. Not only does it have exclusive control of the MAC, but 45 * it also has no Tx or Rx SRSes. 
An SRS is designed to queue and 46 * fanout traffic among L4 protocols; but the aggr is an intermediary, 47 * not a consumer. Instead of using SRSes, the aggr puts the 48 * underlying hardware rings into passthru mode and ships packets up 49 * via a direct call to aggr_recv_cb(). This allows aggr to enforce 50 * LACP while passing all other traffic up to clients of the aggr. 51 * 52 * Pseudo Rx Groups and Rings 53 * -------------------------- 54 * 55 * It is imperative for client performance that the aggr provide as 56 * many MAC groups as possible. In order to use the underlying HW 57 * resources, aggr creates pseudo groups to aggregate the underlying 58 * HW groups. Every HW group gets mapped to a pseudo group; and every 59 * HW ring in that group gets mapped to a pseudo ring. The pseudo 60 * group at index 0 combines all the HW groups at index 0 from each 61 * port, etc. The aggr's MAC then creates normal MAC groups and rings 62 * out of these pseudo groups and rings to present to the aggr's 63 * clients. To the clients, the aggr's groups and rings are absolutely 64 * no different than a NIC's groups or rings. 65 * 66 * Pseudo Tx Rings 67 * --------------- 68 * 69 * The underlying ports (NICs) in an aggregation can have Tx rings. To 70 * enhance aggr's performance, these Tx rings are made available to 71 * the aggr layer as pseudo Tx rings. The concept of pseudo rings is 72 * not new. They are already present and implemented on the Rx side. 73 * The same concept is extended to the Tx side where each Tx ring of 74 * an underlying port is reflected in aggr as a pseudo Tx ring. Thus 75 * each pseudo Tx ring will map to a specific hardware Tx ring. Even 76 * in the case of a NIC that does not have a Tx ring, a pseudo Tx ring 77 * is given to the aggregation layer. 
78 * 79 * With this change, the outgoing stack depth looks much better: 80 * 81 * mac_tx() -> mac_tx_aggr_mode() -> mac_tx_soft_ring_process() -> 82 * mac_tx_send() -> aggr_ring_tx() -> <driver>_ring_tx() 83 * 84 * Two new modes are introduced to mac_tx() to handle aggr pseudo Tx rings: 85 * SRS_TX_AGGR and SRS_TX_BW_AGGR. 86 * 87 * In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. This routine 88 * invokes an aggr function, aggr_find_tx_ring(), to find a (pseudo) Tx 89 * ring belonging to a port on which the packet has to be sent. 90 * aggr_find_tx_ring() first finds the outgoing port based on L2/L3/L4 91 * policy and then uses the fanout_hint passed to it to pick a Tx ring from 92 * the selected port. 93 * 94 * In SRS_TX_BW_AGGR mode, mac_tx_bw_mode() function is called where 95 * bandwidth limit is applied first on the outgoing packet and the packets 96 * allowed to go out would call mac_tx_aggr_mode() to send the packet on a 97 * particular Tx ring. 98 */ 99 100 #include <sys/types.h> 101 #include <sys/sysmacros.h> 102 #include <sys/conf.h> 103 #include <sys/cmn_err.h> 104 #include <sys/disp.h> 105 #include <sys/list.h> 106 #include <sys/ksynch.h> 107 #include <sys/kmem.h> 108 #include <sys/stream.h> 109 #include <sys/modctl.h> 110 #include <sys/ddi.h> 111 #include <sys/sunddi.h> 112 #include <sys/atomic.h> 113 #include <sys/stat.h> 114 #include <sys/modhash.h> 115 #include <sys/id_space.h> 116 #include <sys/strsun.h> 117 #include <sys/cred.h> 118 #include <sys/dlpi.h> 119 #include <sys/zone.h> 120 #include <sys/mac_provider.h> 121 #include <sys/dls.h> 122 #include <sys/vlan.h> 123 #include <sys/aggr.h> 124 #include <sys/aggr_impl.h> 125 126 static int aggr_m_start(void *); 127 static void aggr_m_stop(void *); 128 static int aggr_m_promisc(void *, boolean_t); 129 static int aggr_m_multicst(void *, boolean_t, const uint8_t *); 130 static int aggr_m_unicst(void *, const uint8_t *); 131 static int aggr_m_stat(void *, uint_t, uint64_t *); 132 static void 
aggr_m_ioctl(void *, queue_t *, mblk_t *); 133 static boolean_t aggr_m_capab_get(void *, mac_capab_t, void *); 134 static int aggr_m_setprop(void *, const char *, mac_prop_id_t, uint_t, 135 const void *); 136 static void aggr_m_propinfo(void *, const char *, mac_prop_id_t, 137 mac_prop_info_handle_t); 138 139 static aggr_port_t *aggr_grp_port_lookup(aggr_grp_t *, datalink_id_t); 140 static int aggr_grp_rem_port(aggr_grp_t *, aggr_port_t *, boolean_t *, 141 boolean_t *); 142 143 static void aggr_grp_capab_set(aggr_grp_t *); 144 static boolean_t aggr_grp_capab_check(aggr_grp_t *, aggr_port_t *); 145 static uint_t aggr_grp_max_sdu(aggr_grp_t *); 146 static uint32_t aggr_grp_max_margin(aggr_grp_t *); 147 static boolean_t aggr_grp_sdu_check(aggr_grp_t *, aggr_port_t *); 148 static boolean_t aggr_grp_margin_check(aggr_grp_t *, aggr_port_t *); 149 150 static int aggr_add_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *); 151 static void aggr_rem_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *); 152 static int aggr_pseudo_disable_intr(mac_intr_handle_t); 153 static int aggr_pseudo_enable_intr(mac_intr_handle_t); 154 static int aggr_pseudo_start_rx_ring(mac_ring_driver_t, uint64_t); 155 static void aggr_pseudo_stop_rx_ring(mac_ring_driver_t); 156 static int aggr_addmac(void *, const uint8_t *); 157 static int aggr_remmac(void *, const uint8_t *); 158 static int aggr_addvlan(mac_group_driver_t, uint16_t); 159 static int aggr_remvlan(mac_group_driver_t, uint16_t); 160 static mblk_t *aggr_rx_poll(void *, int); 161 static void aggr_fill_ring(void *, mac_ring_type_t, const int, 162 const int, mac_ring_info_t *, mac_ring_handle_t); 163 static void aggr_fill_group(void *, mac_ring_type_t, const int, 164 mac_group_info_t *, mac_group_handle_t); 165 166 static kmem_cache_t *aggr_grp_cache; 167 static mod_hash_t *aggr_grp_hash; 168 static krwlock_t aggr_grp_lock; 169 static uint_t aggr_grp_cnt; 170 static id_space_t *key_ids; 171 172 #define GRP_HASHSZ 64 173 #define 
GRP_HASH_KEY(linkid) ((mod_hash_key_t)(uintptr_t)linkid) 174 #define AGGR_PORT_NAME_DELIMIT '-' 175 176 static uchar_t aggr_zero_mac[] = {0, 0, 0, 0, 0, 0}; 177 178 #define AGGR_M_CALLBACK_FLAGS \ 179 (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_PROPINFO) 180 181 static mac_callbacks_t aggr_m_callbacks = { 182 AGGR_M_CALLBACK_FLAGS, 183 aggr_m_stat, 184 aggr_m_start, 185 aggr_m_stop, 186 aggr_m_promisc, 187 aggr_m_multicst, 188 NULL, 189 NULL, 190 NULL, 191 aggr_m_ioctl, 192 aggr_m_capab_get, 193 NULL, 194 NULL, 195 aggr_m_setprop, 196 NULL, 197 aggr_m_propinfo 198 }; 199 200 /*ARGSUSED*/ 201 static int 202 aggr_grp_constructor(void *buf, void *arg, int kmflag) 203 { 204 aggr_grp_t *grp = buf; 205 206 bzero(grp, sizeof (*grp)); 207 mutex_init(&grp->lg_lacp_lock, NULL, MUTEX_DEFAULT, NULL); 208 cv_init(&grp->lg_lacp_cv, NULL, CV_DEFAULT, NULL); 209 rw_init(&grp->lg_tx_lock, NULL, RW_DRIVER, NULL); 210 mutex_init(&grp->lg_port_lock, NULL, MUTEX_DEFAULT, NULL); 211 cv_init(&grp->lg_port_cv, NULL, CV_DEFAULT, NULL); 212 mutex_init(&grp->lg_tx_flowctl_lock, NULL, MUTEX_DEFAULT, NULL); 213 cv_init(&grp->lg_tx_flowctl_cv, NULL, CV_DEFAULT, NULL); 214 grp->lg_link_state = LINK_STATE_UNKNOWN; 215 return (0); 216 } 217 218 /*ARGSUSED*/ 219 static void 220 aggr_grp_destructor(void *buf, void *arg) 221 { 222 aggr_grp_t *grp = buf; 223 224 if (grp->lg_tx_ports != NULL) { 225 kmem_free(grp->lg_tx_ports, 226 grp->lg_tx_ports_size * sizeof (aggr_port_t *)); 227 } 228 229 mutex_destroy(&grp->lg_lacp_lock); 230 cv_destroy(&grp->lg_lacp_cv); 231 mutex_destroy(&grp->lg_port_lock); 232 cv_destroy(&grp->lg_port_cv); 233 rw_destroy(&grp->lg_tx_lock); 234 mutex_destroy(&grp->lg_tx_flowctl_lock); 235 cv_destroy(&grp->lg_tx_flowctl_cv); 236 } 237 238 void 239 aggr_grp_init(void) 240 { 241 aggr_grp_cache = kmem_cache_create("aggr_grp_cache", 242 sizeof (aggr_grp_t), 0, aggr_grp_constructor, 243 aggr_grp_destructor, NULL, NULL, NULL, 0); 244 245 aggr_grp_hash = 
mod_hash_create_idhash("aggr_grp_hash", 246 GRP_HASHSZ, mod_hash_null_valdtor); 247 rw_init(&aggr_grp_lock, NULL, RW_DEFAULT, NULL); 248 aggr_grp_cnt = 0; 249 250 /* 251 * Allocate an id space to manage key values (when key is not 252 * specified). The range of the id space will be from 253 * (AGGR_MAX_KEY + 1) to UINT16_MAX, because the LACP protocol 254 * uses a 16-bit key. 255 */ 256 key_ids = id_space_create("aggr_key_ids", AGGR_MAX_KEY + 1, UINT16_MAX); 257 ASSERT(key_ids != NULL); 258 } 259 260 void 261 aggr_grp_fini(void) 262 { 263 id_space_destroy(key_ids); 264 rw_destroy(&aggr_grp_lock); 265 mod_hash_destroy_idhash(aggr_grp_hash); 266 kmem_cache_destroy(aggr_grp_cache); 267 } 268 269 uint_t 270 aggr_grp_count(void) 271 { 272 uint_t count; 273 274 rw_enter(&aggr_grp_lock, RW_READER); 275 count = aggr_grp_cnt; 276 rw_exit(&aggr_grp_lock); 277 return (count); 278 } 279 280 /* 281 * Since both aggr_port_notify_cb() and aggr_port_timer_thread() functions 282 * requires the mac perimeter, this function holds a reference of the aggr 283 * and aggr won't call mac_unregister() until this reference drops to 0. 284 */ 285 void 286 aggr_grp_port_hold(aggr_port_t *port) 287 { 288 aggr_grp_t *grp = port->lp_grp; 289 290 AGGR_PORT_REFHOLD(port); 291 mutex_enter(&grp->lg_port_lock); 292 grp->lg_port_ref++; 293 mutex_exit(&grp->lg_port_lock); 294 } 295 296 /* 297 * Release the reference of the grp and inform aggr_grp_delete() calling 298 * mac_unregister() is now safe. 299 */ 300 void 301 aggr_grp_port_rele(aggr_port_t *port) 302 { 303 aggr_grp_t *grp = port->lp_grp; 304 305 mutex_enter(&grp->lg_port_lock); 306 if (--grp->lg_port_ref == 0) 307 cv_signal(&grp->lg_port_cv); 308 mutex_exit(&grp->lg_port_lock); 309 AGGR_PORT_REFRELE(port); 310 } 311 312 /* 313 * Wait for the port's lacp timer thread and the port's notification callback 314 * to exit. 
315 */ 316 void 317 aggr_grp_port_wait(aggr_grp_t *grp) 318 { 319 mutex_enter(&grp->lg_port_lock); 320 if (grp->lg_port_ref != 0) 321 cv_wait(&grp->lg_port_cv, &grp->lg_port_lock); 322 mutex_exit(&grp->lg_port_lock); 323 } 324 325 /* 326 * Attach a port to a link aggregation group. 327 * 328 * A port is attached to a link aggregation group once its speed 329 * and link state have been verified. 330 * 331 * Returns B_TRUE if the group link state or speed has changed. If 332 * it's the case, the caller must notify the MAC layer via a call 333 * to mac_link(). 334 */ 335 boolean_t 336 aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port) 337 { 338 boolean_t link_state_changed = B_FALSE; 339 340 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 341 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 342 343 if (port->lp_state == AGGR_PORT_STATE_ATTACHED) 344 return (B_FALSE); 345 346 /* 347 * Validate the MAC port link speed and update the group 348 * link speed if needed. 349 */ 350 if (port->lp_ifspeed == 0 || 351 port->lp_link_state != LINK_STATE_UP || 352 port->lp_link_duplex != LINK_DUPLEX_FULL) { 353 /* 354 * Can't attach a MAC port with unknown link speed, 355 * down link, or not in full duplex mode. 356 */ 357 return (B_FALSE); 358 } 359 360 mutex_enter(&grp->lg_stat_lock); 361 if (grp->lg_ifspeed == 0) { 362 /* 363 * The group inherits the speed of the first link being 364 * attached. 365 */ 366 grp->lg_ifspeed = port->lp_ifspeed; 367 link_state_changed = B_TRUE; 368 } else if (grp->lg_ifspeed != port->lp_ifspeed) { 369 /* 370 * The link speed of the MAC port must be the same as 371 * the group link speed, as per 802.3ad. Since it is 372 * not, the attach is cancelled. 373 */ 374 mutex_exit(&grp->lg_stat_lock); 375 return (B_FALSE); 376 } 377 mutex_exit(&grp->lg_stat_lock); 378 379 grp->lg_nattached_ports++; 380 381 /* 382 * Update the group link state. 
383 */ 384 if (grp->lg_link_state != LINK_STATE_UP) { 385 grp->lg_link_state = LINK_STATE_UP; 386 mutex_enter(&grp->lg_stat_lock); 387 grp->lg_link_duplex = LINK_DUPLEX_FULL; 388 mutex_exit(&grp->lg_stat_lock); 389 link_state_changed = B_TRUE; 390 } 391 392 /* 393 * Update port's state. 394 */ 395 port->lp_state = AGGR_PORT_STATE_ATTACHED; 396 397 aggr_grp_multicst_port(port, B_TRUE); 398 399 /* 400 * The port client doesn't have an Rx SRS; instead of calling 401 * mac_rx_set() we set the client's flow callback directly. 402 * This datapath is used only when the port's driver doesn't 403 * support MAC_CAPAB_RINGS. Drivers with ring support will 404 * deliver traffic to the aggr via ring passthru. 405 */ 406 mac_client_set_flow_cb(port->lp_mch, aggr_recv_cb, port); 407 408 /* 409 * If LACP is OFF, the port can be used to send data as soon 410 * as its link is up and verified to be compatible with the 411 * aggregation. 412 * 413 * If LACP is active or passive, notify the LACP subsystem, which 414 * will enable sending on the port following the LACP protocol. 
415 */ 416 if (grp->lg_lacp_mode == AGGR_LACP_OFF) 417 aggr_send_port_enable(port); 418 else 419 aggr_lacp_port_attached(port); 420 421 return (link_state_changed); 422 } 423 424 boolean_t 425 aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port) 426 { 427 boolean_t link_state_changed = B_FALSE; 428 429 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 430 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 431 432 /* update state */ 433 if (port->lp_state != AGGR_PORT_STATE_ATTACHED) 434 return (B_FALSE); 435 436 mac_client_clear_flow_cb(port->lp_mch); 437 438 aggr_grp_multicst_port(port, B_FALSE); 439 440 if (grp->lg_lacp_mode == AGGR_LACP_OFF) 441 aggr_send_port_disable(port); 442 else 443 aggr_lacp_port_detached(port); 444 445 port->lp_state = AGGR_PORT_STATE_STANDBY; 446 447 grp->lg_nattached_ports--; 448 if (grp->lg_nattached_ports == 0) { 449 /* the last attached MAC port of the group is being detached */ 450 grp->lg_link_state = LINK_STATE_DOWN; 451 mutex_enter(&grp->lg_stat_lock); 452 grp->lg_ifspeed = 0; 453 grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN; 454 mutex_exit(&grp->lg_stat_lock); 455 link_state_changed = B_TRUE; 456 } 457 458 return (link_state_changed); 459 } 460 461 /* 462 * Update the MAC addresses of the constituent ports of the specified 463 * group. This function is invoked: 464 * - after creating a new aggregation group. 465 * - after adding new ports to an aggregation group. 466 * - after removing a port from a group when the MAC address of 467 * that port was used for the MAC address of the group. 468 * - after the MAC address of a port changed when the MAC address 469 * of that port was used for the MAC address of the group. 470 * 471 * Return true if the link state of the aggregation changed, for example 472 * as a result of a failure changing the MAC address of one of the 473 * constituent ports. 
474 */ 475 boolean_t 476 aggr_grp_update_ports_mac(aggr_grp_t *grp) 477 { 478 aggr_port_t *cport; 479 boolean_t link_state_changed = B_FALSE; 480 mac_perim_handle_t mph; 481 482 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 483 484 for (cport = grp->lg_ports; cport != NULL; 485 cport = cport->lp_next) { 486 mac_perim_enter_by_mh(cport->lp_mh, &mph); 487 if (aggr_port_unicst(cport) != 0) { 488 if (aggr_grp_detach_port(grp, cport)) 489 link_state_changed = B_TRUE; 490 } else { 491 /* 492 * If a port was detached because of a previous 493 * failure changing the MAC address, the port is 494 * reattached when it successfully changes the MAC 495 * address now, and this might cause the link state 496 * of the aggregation to change. 497 */ 498 if (aggr_grp_attach_port(grp, cport)) 499 link_state_changed = B_TRUE; 500 } 501 mac_perim_exit(mph); 502 } 503 return (link_state_changed); 504 } 505 506 /* 507 * Invoked when the MAC address of a port has changed. If the port's 508 * MAC address was used for the group MAC address, set mac_addr_changedp 509 * to B_TRUE to indicate to the caller that it should send a MAC_NOTE_UNICST 510 * notification. If the link state changes due to detach/attach of 511 * the constituent port, set link_state_changedp to B_TRUE to indicate 512 * to the caller that it should send a MAC_NOTE_LINK notification. In both 513 * cases, it is the responsibility of the caller to invoke notification 514 * functions after releasing the the port lock. 515 */ 516 void 517 aggr_grp_port_mac_changed(aggr_grp_t *grp, aggr_port_t *port, 518 boolean_t *mac_addr_changedp, boolean_t *link_state_changedp) 519 { 520 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 521 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 522 ASSERT(mac_addr_changedp != NULL); 523 ASSERT(link_state_changedp != NULL); 524 525 *mac_addr_changedp = B_FALSE; 526 *link_state_changedp = B_FALSE; 527 528 if (grp->lg_addr_fixed) { 529 /* 530 * The group is using a fixed MAC address or an automatic 531 * MAC address has not been set. 
532 */ 533 return; 534 } 535 536 if (grp->lg_mac_addr_port == port) { 537 /* 538 * The MAC address of the port was assigned to the group 539 * MAC address. Update the group MAC address. 540 */ 541 bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL); 542 *mac_addr_changedp = B_TRUE; 543 } else { 544 /* 545 * Update the actual port MAC address to the MAC address 546 * of the group. 547 */ 548 if (aggr_port_unicst(port) != 0) { 549 *link_state_changedp = aggr_grp_detach_port(grp, port); 550 } else { 551 /* 552 * If a port was detached because of a previous 553 * failure changing the MAC address, the port is 554 * reattached when it successfully changes the MAC 555 * address now, and this might cause the link state 556 * of the aggregation to change. 557 */ 558 *link_state_changedp = aggr_grp_attach_port(grp, port); 559 } 560 } 561 } 562 563 /* 564 * Add a port to a link aggregation group. 565 */ 566 static int 567 aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t port_linkid, boolean_t force, 568 aggr_port_t **pp) 569 { 570 aggr_port_t *port, **cport; 571 mac_perim_handle_t mph; 572 zoneid_t port_zoneid = ALL_ZONES; 573 int err; 574 575 /* The port must be in the same zone as the aggregation. */ 576 if (zone_check_datalink(&port_zoneid, port_linkid) != 0) 577 port_zoneid = GLOBAL_ZONEID; 578 if (grp->lg_zoneid != port_zoneid) 579 return (EBUSY); 580 581 /* 582 * If we are creating the aggr, then there is no MAC handle 583 * and thus no perimeter to hold. If we are adding a port to 584 * an existing aggr, then the perimiter of the aggr's MAC must 585 * be held. 586 */ 587 ASSERT(grp->lg_mh == NULL || MAC_PERIM_HELD(grp->lg_mh)); 588 589 err = aggr_port_create(grp, port_linkid, force, &port); 590 if (err != 0) 591 return (err); 592 593 mac_perim_enter_by_mh(port->lp_mh, &mph); 594 595 /* Add the new port to the end of the list. 
*/ 596 cport = &grp->lg_ports; 597 while (*cport != NULL) 598 cport = &((*cport)->lp_next); 599 *cport = port; 600 601 /* 602 * Back reference to the group it is member of. A port always 603 * holds a reference to its group to ensure that the back 604 * reference is always valid. 605 */ 606 port->lp_grp = grp; 607 AGGR_GRP_REFHOLD(grp); 608 grp->lg_nports++; 609 610 aggr_lacp_init_port(port); 611 mac_perim_exit(mph); 612 613 if (pp != NULL) 614 *pp = port; 615 616 return (0); 617 } 618 619 /* 620 * This is called when the 'lg_tx_ports' arrangement has changed and 621 * we need to update the corresponding 'mi_default_tx_ring'. This 622 * happens for several reasons. 623 * 624 * - A pseudo TX mac group was added or removed. 625 * - An LACP message has changed the port's state. 626 * - A link event has changed the port's state. 627 * 628 * In any case, we see if there is at least one port enabled (see 629 * 'aggr_send_port_enable()'), and if so we use its first ring as the 630 * mac's default TX ring. 631 * 632 * Note, because we only have a single TX group, we don't have to 633 * worry about the rings moving between groups and the chance that mac 634 * will reassign it unless someone removes a port, at which point, we 635 * play it safe and call this again. 636 */ 637 void 638 aggr_grp_update_default(aggr_grp_t *grp) 639 { 640 aggr_port_t *port; 641 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 642 643 rw_enter(&grp->lg_tx_lock, RW_WRITER); 644 645 if (grp->lg_ntx_ports == 0) { 646 rw_exit(&grp->lg_tx_lock); 647 return; 648 } 649 650 port = grp->lg_tx_ports[0]; 651 ASSERT(port->lp_tx_ring_cnt > 0); 652 mac_hwring_set_default(grp->lg_mh, port->lp_pseudo_tx_rings[0]); 653 rw_exit(&grp->lg_tx_lock); 654 } 655 656 /* 657 * Add a pseudo RX ring for the given HW ring handle. 
658 */ 659 static int 660 aggr_add_pseudo_rx_ring(aggr_port_t *port, 661 aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh) 662 { 663 aggr_pseudo_rx_ring_t *ring; 664 int err; 665 int j; 666 667 for (j = 0; j < MAX_RINGS_PER_GROUP; j++) { 668 ring = rx_grp->arg_rings + j; 669 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE)) 670 break; 671 } 672 673 /* 674 * No slot for this new RX ring. 675 */ 676 if (j == MAX_RINGS_PER_GROUP) 677 return (EIO); 678 679 ring->arr_flags |= MAC_PSEUDO_RING_INUSE; 680 ring->arr_hw_rh = hw_rh; 681 ring->arr_port = port; 682 ring->arr_grp = rx_grp; 683 rx_grp->arg_ring_cnt++; 684 685 /* 686 * The group is already registered, dynamically add a new ring to the 687 * mac group. 688 */ 689 if ((err = mac_group_add_ring(rx_grp->arg_gh, j)) != 0) { 690 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE; 691 ring->arr_hw_rh = NULL; 692 ring->arr_port = NULL; 693 ring->arr_grp = NULL; 694 rx_grp->arg_ring_cnt--; 695 } else { 696 /* 697 * This must run after the MAC is registered. 698 */ 699 ASSERT3P(ring->arr_rh, !=, NULL); 700 mac_hwring_set_passthru(hw_rh, (mac_rx_t)aggr_recv_cb, 701 (void *)port, (mac_resource_handle_t)ring); 702 } 703 return (err); 704 } 705 706 /* 707 * Remove the pseudo RX ring of the given HW ring handle. 708 */ 709 static void 710 aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh) 711 { 712 for (uint_t j = 0; j < MAX_RINGS_PER_GROUP; j++) { 713 aggr_pseudo_rx_ring_t *ring = rx_grp->arg_rings + j; 714 715 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE) || 716 ring->arr_hw_rh != hw_rh) { 717 continue; 718 } 719 720 mac_group_rem_ring(rx_grp->arg_gh, ring->arr_rh); 721 722 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE; 723 ring->arr_hw_rh = NULL; 724 ring->arr_port = NULL; 725 ring->arr_grp = NULL; 726 rx_grp->arg_ring_cnt--; 727 mac_hwring_clear_passthru(hw_rh); 728 break; 729 } 730 } 731 732 /* 733 * Create pseudo rings over the HW rings of the port. 
734 * 735 * o Create a pseudo ring in rx_grp per HW ring in the port's HW group. 736 * 737 * o Program existing unicast filters on the pseudo group into the HW group. 738 * 739 * o Program existing VLAN filters on the pseudo group into the HW group. 740 */ 741 static int 742 aggr_add_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp) 743 { 744 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP]; 745 aggr_unicst_addr_t *addr, *a; 746 mac_perim_handle_t pmph; 747 aggr_vlan_t *avp; 748 uint_t hw_rh_cnt, i; 749 int err = 0; 750 uint_t g_idx = rx_grp->arg_index; 751 752 ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh)); 753 ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT); 754 mac_perim_enter_by_mh(port->lp_mh, &pmph); 755 756 i = 0; 757 addr = NULL; 758 /* 759 * This function must be called after the aggr registers its 760 * MAC and its Rx groups have been initialized. 761 */ 762 ASSERT(rx_grp->arg_gh != NULL); 763 764 /* 765 * Get the list of the underlying HW rings. 766 */ 767 hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx, 768 &port->lp_hwghs[g_idx], hw_rh, MAC_RING_TYPE_RX); 769 770 /* 771 * Add existing VLAN and unicast address filters to the port. 
772 */ 773 for (avp = list_head(&rx_grp->arg_vlans); avp != NULL; 774 avp = list_next(&rx_grp->arg_vlans, avp)) { 775 if ((err = aggr_port_addvlan(port, g_idx, avp->av_vid)) != 0) 776 goto err; 777 } 778 779 for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) { 780 if ((err = aggr_port_addmac(port, g_idx, addr->aua_addr)) != 0) 781 goto err; 782 } 783 784 for (i = 0; i < hw_rh_cnt; i++) { 785 err = aggr_add_pseudo_rx_ring(port, rx_grp, hw_rh[i]); 786 if (err != 0) 787 goto err; 788 } 789 790 mac_perim_exit(pmph); 791 return (0); 792 793 err: 794 ASSERT(err != 0); 795 796 for (uint_t j = 0; j < i; j++) 797 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]); 798 799 for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next) 800 aggr_port_remmac(port, g_idx, a->aua_addr); 801 802 if (avp != NULL) 803 avp = list_prev(&rx_grp->arg_vlans, avp); 804 805 for (; avp != NULL; avp = list_prev(&rx_grp->arg_vlans, avp)) { 806 int err2; 807 808 if ((err2 = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) { 809 cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s" 810 ": errno %d.", avp->av_vid, 811 mac_client_name(port->lp_mch), err2); 812 } 813 } 814 815 port->lp_hwghs[g_idx] = NULL; 816 mac_perim_exit(pmph); 817 return (err); 818 } 819 820 /* 821 * Destroy the pseudo rings mapping to this port and remove all VLAN 822 * and unicast filters from this port. Even if there are no underlying 823 * HW rings we must still remove the unicast filters to take the port 824 * out of promisc mode. 
825 */ 826 static void 827 aggr_rem_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp) 828 { 829 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP]; 830 aggr_unicst_addr_t *addr; 831 mac_perim_handle_t pmph; 832 uint_t hw_rh_cnt; 833 uint_t g_idx = rx_grp->arg_index; 834 835 ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh)); 836 ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT); 837 ASSERT3P(rx_grp->arg_gh, !=, NULL); 838 mac_perim_enter_by_mh(port->lp_mh, &pmph); 839 840 hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx, NULL, hw_rh, 841 MAC_RING_TYPE_RX); 842 843 for (uint_t i = 0; i < hw_rh_cnt; i++) 844 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[i]); 845 846 for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) 847 aggr_port_remmac(port, g_idx, addr->aua_addr); 848 849 for (aggr_vlan_t *avp = list_head(&rx_grp->arg_vlans); avp != NULL; 850 avp = list_next(&rx_grp->arg_vlans, avp)) { 851 int err; 852 853 if ((err = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) { 854 cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s" 855 ": errno %d.", avp->av_vid, 856 mac_client_name(port->lp_mch), err); 857 } 858 } 859 860 port->lp_hwghs[g_idx] = NULL; 861 mac_perim_exit(pmph); 862 } 863 864 /* 865 * Add a pseudo TX ring for the given HW ring handle. 866 */ 867 static int 868 aggr_add_pseudo_tx_ring(aggr_port_t *port, 869 aggr_pseudo_tx_group_t *tx_grp, mac_ring_handle_t hw_rh, 870 mac_ring_handle_t *pseudo_rh) 871 { 872 aggr_pseudo_tx_ring_t *ring; 873 int err; 874 int i; 875 876 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 877 for (i = 0; i < MAX_RINGS_PER_GROUP; i++) { 878 ring = tx_grp->atg_rings + i; 879 if (!(ring->atr_flags & MAC_PSEUDO_RING_INUSE)) 880 break; 881 } 882 /* 883 * No slot for this new TX ring. 884 */ 885 if (i == MAX_RINGS_PER_GROUP) 886 return (EIO); 887 /* 888 * The following 4 statements needs to be done before 889 * calling mac_group_add_ring(). Otherwise it will 890 * result in an assertion failure in mac_init_ring(). 
891 */ 892 ring->atr_flags |= MAC_PSEUDO_RING_INUSE; 893 ring->atr_hw_rh = hw_rh; 894 ring->atr_port = port; 895 tx_grp->atg_ring_cnt++; 896 897 /* 898 * The TX side has no concept of ring groups unlike RX groups. 899 * There is just a single group which stores all the TX rings. 900 * This group will be used to store aggr's pseudo TX rings. 901 */ 902 if ((err = mac_group_add_ring(tx_grp->atg_gh, i)) != 0) { 903 ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE; 904 ring->atr_hw_rh = NULL; 905 ring->atr_port = NULL; 906 tx_grp->atg_ring_cnt--; 907 } else { 908 *pseudo_rh = mac_find_ring(tx_grp->atg_gh, i); 909 if (hw_rh != NULL) { 910 mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring, 911 mac_find_ring(tx_grp->atg_gh, i)); 912 } 913 } 914 915 return (err); 916 } 917 918 /* 919 * Remove the pseudo TX ring of the given HW ring handle. 920 */ 921 static void 922 aggr_rem_pseudo_tx_ring(aggr_pseudo_tx_group_t *tx_grp, 923 mac_ring_handle_t pseudo_hw_rh) 924 { 925 aggr_pseudo_tx_ring_t *ring; 926 int i; 927 928 for (i = 0; i < MAX_RINGS_PER_GROUP; i++) { 929 ring = tx_grp->atg_rings + i; 930 if (ring->atr_rh != pseudo_hw_rh) 931 continue; 932 933 ASSERT(ring->atr_flags & MAC_PSEUDO_RING_INUSE); 934 mac_group_rem_ring(tx_grp->atg_gh, pseudo_hw_rh); 935 ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE; 936 mac_hwring_teardown(ring->atr_hw_rh); 937 ring->atr_hw_rh = NULL; 938 ring->atr_port = NULL; 939 tx_grp->atg_ring_cnt--; 940 break; 941 } 942 } 943 944 /* 945 * This function is called to create pseudo rings over hardware rings of 946 * the underlying device. There is a 1:1 mapping between the pseudo TX 947 * rings of the aggr and the hardware rings of the underlying port. 
948 */ 949 static int 950 aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp) 951 { 952 aggr_grp_t *grp = port->lp_grp; 953 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP], pseudo_rh; 954 mac_perim_handle_t pmph; 955 int hw_rh_cnt, i = 0, j; 956 int err = 0; 957 958 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 959 mac_perim_enter_by_mh(port->lp_mh, &pmph); 960 961 /* 962 * Get the list the the underlying HW rings. 963 */ 964 hw_rh_cnt = mac_hwrings_get(port->lp_mch, NULL, hw_rh, 965 MAC_RING_TYPE_TX); 966 967 /* 968 * Even if the underlying NIC does not have TX rings, we 969 * still make a psuedo TX ring for that NIC with NULL as 970 * the ring handle. 971 */ 972 if (hw_rh_cnt == 0) 973 port->lp_tx_ring_cnt = 1; 974 else 975 port->lp_tx_ring_cnt = hw_rh_cnt; 976 977 port->lp_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) * 978 port->lp_tx_ring_cnt), KM_SLEEP); 979 port->lp_pseudo_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) * 980 port->lp_tx_ring_cnt), KM_SLEEP); 981 982 if (hw_rh_cnt == 0) { 983 if ((err = aggr_add_pseudo_tx_ring(port, tx_grp, 984 NULL, &pseudo_rh)) == 0) { 985 port->lp_tx_rings[0] = NULL; 986 port->lp_pseudo_tx_rings[0] = pseudo_rh; 987 } 988 } else { 989 for (i = 0; err == 0 && i < hw_rh_cnt; i++) { 990 err = aggr_add_pseudo_tx_ring(port, 991 tx_grp, hw_rh[i], &pseudo_rh); 992 if (err != 0) 993 break; 994 port->lp_tx_rings[i] = hw_rh[i]; 995 port->lp_pseudo_tx_rings[i] = pseudo_rh; 996 } 997 } 998 999 if (err != 0) { 1000 if (hw_rh_cnt != 0) { 1001 for (j = 0; j < i; j++) { 1002 aggr_rem_pseudo_tx_ring(tx_grp, 1003 port->lp_pseudo_tx_rings[j]); 1004 } 1005 } 1006 kmem_free(port->lp_tx_rings, 1007 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt)); 1008 kmem_free(port->lp_pseudo_tx_rings, 1009 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt)); 1010 port->lp_tx_ring_cnt = 0; 1011 } else { 1012 port->lp_tx_grp_added = B_TRUE; 1013 port->lp_tx_notify_mh = mac_client_tx_notify(port->lp_mch, 1014 aggr_tx_ring_update, 
port); 1015 } 1016 mac_perim_exit(pmph); 1017 aggr_grp_update_default(grp); 1018 return (err); 1019 } 1020 1021 /* 1022 * This function is called by aggr to remove pseudo TX rings over the 1023 * HW rings of the underlying port. 1024 */ 1025 static void 1026 aggr_rem_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp) 1027 { 1028 aggr_grp_t *grp = port->lp_grp; 1029 mac_perim_handle_t pmph; 1030 int i; 1031 1032 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 1033 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1034 1035 if (!port->lp_tx_grp_added) 1036 goto done; 1037 1038 ASSERT(tx_grp->atg_gh != NULL); 1039 1040 for (i = 0; i < port->lp_tx_ring_cnt; i++) 1041 aggr_rem_pseudo_tx_ring(tx_grp, port->lp_pseudo_tx_rings[i]); 1042 1043 kmem_free(port->lp_tx_rings, 1044 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt)); 1045 kmem_free(port->lp_pseudo_tx_rings, 1046 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt)); 1047 1048 port->lp_tx_ring_cnt = 0; 1049 (void) mac_client_tx_notify(port->lp_mch, NULL, port->lp_tx_notify_mh); 1050 port->lp_tx_grp_added = B_FALSE; 1051 aggr_grp_update_default(grp); 1052 done: 1053 mac_perim_exit(pmph); 1054 } 1055 1056 static int 1057 aggr_pseudo_disable_intr(mac_intr_handle_t ih) 1058 { 1059 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih; 1060 return (mac_hwring_disable_intr(rr_ring->arr_hw_rh)); 1061 } 1062 1063 static int 1064 aggr_pseudo_enable_intr(mac_intr_handle_t ih) 1065 { 1066 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih; 1067 return (mac_hwring_enable_intr(rr_ring->arr_hw_rh)); 1068 } 1069 1070 /* 1071 * Start the pseudo ring. Since the pseudo ring is just an abstraction 1072 * over an actual HW ring, the real task is to start the underlying HW 1073 * ring. 
1074 */ 1075 static int 1076 aggr_pseudo_start_rx_ring(mac_ring_driver_t arg, uint64_t mr_gen) 1077 { 1078 int err; 1079 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg; 1080 1081 err = mac_hwring_start(rr_ring->arr_hw_rh); 1082 1083 if (err != 0) 1084 return (err); 1085 1086 rr_ring->arr_gen = mr_gen; 1087 return (err); 1088 } 1089 1090 /* 1091 * Stop the pseudo ring. Since the pseudo ring is just an abstraction 1092 * over an actual HW ring, the real task is to stop the underlying HW 1093 * ring. 1094 */ 1095 static void 1096 aggr_pseudo_stop_rx_ring(mac_ring_driver_t arg) 1097 { 1098 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg; 1099 1100 /* 1101 * The rings underlying the default group must stay up to 1102 * continue receiving LACP traffic. We would normally never 1103 * stop the default Rx rings because of the primary MAC 1104 * client; but aggr's primary MAC client doesn't call 1105 * mac_unicast_add() and thus mi_active is 0 when the last 1106 * non-primary client is deleted. 1107 */ 1108 if (rr_ring->arr_grp->arg_index != 0) 1109 mac_hwring_stop(rr_ring->arr_hw_rh); 1110 } 1111 1112 /* 1113 * Add one or more ports to an existing link aggregation group. 1114 */ 1115 int 1116 aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force, 1117 laioc_port_t *ports) 1118 { 1119 int rc; 1120 uint_t port_added = 0; 1121 uint_t grp_added; 1122 aggr_grp_t *grp = NULL; 1123 aggr_port_t *port; 1124 boolean_t link_state_changed = B_FALSE; 1125 mac_perim_handle_t mph, pmph; 1126 1127 /* Get the aggr corresponding to linkid. */ 1128 rw_enter(&aggr_grp_lock, RW_READER); 1129 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1130 (mod_hash_val_t *)&grp) != 0) { 1131 rw_exit(&aggr_grp_lock); 1132 return (ENOENT); 1133 } 1134 AGGR_GRP_REFHOLD(grp); 1135 1136 /* 1137 * Hold the perimeter so that the aggregation can't be destroyed. 
1138 */ 1139 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1140 rw_exit(&aggr_grp_lock); 1141 1142 /* Add the specified ports to the aggr. */ 1143 for (uint_t i = 0; i < nports; i++) { 1144 grp_added = 0; 1145 1146 if ((rc = aggr_grp_add_port(grp, ports[i].lp_linkid, 1147 force, &port)) != 0) { 1148 goto bail; 1149 } 1150 1151 ASSERT(port != NULL); 1152 port_added++; 1153 1154 /* check capabilities */ 1155 if (!aggr_grp_capab_check(grp, port) || 1156 !aggr_grp_sdu_check(grp, port) || 1157 !aggr_grp_margin_check(grp, port)) { 1158 rc = ENOTSUP; 1159 goto bail; 1160 } 1161 1162 /* 1163 * Create the pseudo ring for each HW ring of the underlying 1164 * port. 1165 */ 1166 rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group); 1167 if (rc != 0) 1168 goto bail; 1169 1170 for (uint_t j = 0; j < grp->lg_rx_group_count; j++) { 1171 rc = aggr_add_pseudo_rx_group(port, 1172 &grp->lg_rx_groups[j]); 1173 1174 if (rc != 0) 1175 goto bail; 1176 1177 grp_added++; 1178 } 1179 1180 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1181 1182 /* set LACP mode */ 1183 aggr_port_lacp_set_mode(grp, port); 1184 1185 /* start port if group has already been started */ 1186 if (grp->lg_started) { 1187 rc = aggr_port_start(port); 1188 if (rc != 0) { 1189 mac_perim_exit(pmph); 1190 goto bail; 1191 } 1192 1193 /* 1194 * Turn on the promiscuous mode over the port when it 1195 * is requested to be turned on to receive the 1196 * non-primary address over a port, or the promiscuous 1197 * mode is enabled over the aggr. 1198 */ 1199 if (grp->lg_promisc || port->lp_prom_addr != NULL) { 1200 rc = aggr_port_promisc(port, B_TRUE); 1201 if (rc != 0) { 1202 mac_perim_exit(pmph); 1203 goto bail; 1204 } 1205 } 1206 } 1207 mac_perim_exit(pmph); 1208 1209 /* 1210 * Attach each port if necessary. 1211 */ 1212 if (aggr_port_notify_link(grp, port)) 1213 link_state_changed = B_TRUE; 1214 1215 /* 1216 * Initialize the callback functions for this port. 
1217 */ 1218 aggr_port_init_callbacks(port); 1219 } 1220 1221 /* update the MAC address of the constituent ports */ 1222 if (aggr_grp_update_ports_mac(grp)) 1223 link_state_changed = B_TRUE; 1224 1225 if (link_state_changed) 1226 mac_link_update(grp->lg_mh, grp->lg_link_state); 1227 1228 bail: 1229 if (rc != 0) { 1230 /* stop and remove ports that have been added */ 1231 for (uint_t i = 0; i < port_added; i++) { 1232 uint_t grp_remove; 1233 1234 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid); 1235 ASSERT(port != NULL); 1236 1237 if (grp->lg_started) { 1238 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1239 (void) aggr_port_promisc(port, B_FALSE); 1240 aggr_port_stop(port); 1241 mac_perim_exit(pmph); 1242 } 1243 1244 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group); 1245 1246 /* 1247 * Only the last port could have a partial set 1248 * of groups added. 1249 */ 1250 grp_remove = (i + 1 == port_added) ? grp_added : 1251 grp->lg_rx_group_count; 1252 1253 for (uint_t j = 0; j < grp_remove; j++) { 1254 aggr_rem_pseudo_rx_group(port, 1255 &grp->lg_rx_groups[j]); 1256 } 1257 1258 (void) aggr_grp_rem_port(grp, port, NULL, NULL); 1259 } 1260 } 1261 1262 mac_perim_exit(mph); 1263 AGGR_GRP_REFRELE(grp); 1264 return (rc); 1265 } 1266 1267 static int 1268 aggr_grp_modify_common(aggr_grp_t *grp, uint8_t update_mask, uint32_t policy, 1269 boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, 1270 aggr_lacp_timer_t lacp_timer) 1271 { 1272 boolean_t mac_addr_changed = B_FALSE; 1273 boolean_t link_state_changed = B_FALSE; 1274 mac_perim_handle_t pmph; 1275 1276 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 1277 1278 /* validate fixed address if specified */ 1279 if ((update_mask & AGGR_MODIFY_MAC) && mac_fixed && 1280 ((bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) || 1281 (mac_addr[0] & 0x01))) { 1282 return (EINVAL); 1283 } 1284 1285 /* update policy if requested */ 1286 if (update_mask & AGGR_MODIFY_POLICY) 1287 aggr_send_update_policy(grp, policy); 1288 1289 /* 
update unicast MAC address if requested */ 1290 if (update_mask & AGGR_MODIFY_MAC) { 1291 if (mac_fixed) { 1292 /* user-supplied MAC address */ 1293 grp->lg_mac_addr_port = NULL; 1294 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) != 0) { 1295 bcopy(mac_addr, grp->lg_addr, ETHERADDRL); 1296 mac_addr_changed = B_TRUE; 1297 } 1298 } else if (grp->lg_addr_fixed) { 1299 /* switch from user-supplied to automatic */ 1300 aggr_port_t *port = grp->lg_ports; 1301 1302 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1303 bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL); 1304 grp->lg_mac_addr_port = port; 1305 mac_addr_changed = B_TRUE; 1306 mac_perim_exit(pmph); 1307 } 1308 grp->lg_addr_fixed = mac_fixed; 1309 } 1310 1311 if (mac_addr_changed) 1312 link_state_changed = aggr_grp_update_ports_mac(grp); 1313 1314 if (update_mask & AGGR_MODIFY_LACP_MODE) 1315 aggr_lacp_update_mode(grp, lacp_mode); 1316 1317 if (update_mask & AGGR_MODIFY_LACP_TIMER) 1318 aggr_lacp_update_timer(grp, lacp_timer); 1319 1320 if (link_state_changed) 1321 mac_link_update(grp->lg_mh, grp->lg_link_state); 1322 1323 if (mac_addr_changed) 1324 mac_unicst_update(grp->lg_mh, grp->lg_addr); 1325 1326 return (0); 1327 } 1328 1329 /* 1330 * Update properties of an existing link aggregation group. 1331 */ 1332 int 1333 aggr_grp_modify(datalink_id_t linkid, uint8_t update_mask, uint32_t policy, 1334 boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, 1335 aggr_lacp_timer_t lacp_timer) 1336 { 1337 aggr_grp_t *grp = NULL; 1338 mac_perim_handle_t mph; 1339 int err; 1340 1341 /* get group corresponding to linkid */ 1342 rw_enter(&aggr_grp_lock, RW_READER); 1343 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1344 (mod_hash_val_t *)&grp) != 0) { 1345 rw_exit(&aggr_grp_lock); 1346 return (ENOENT); 1347 } 1348 AGGR_GRP_REFHOLD(grp); 1349 1350 /* 1351 * Hold the perimeter so that the aggregation won't be destroyed. 
1352 */ 1353 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1354 rw_exit(&aggr_grp_lock); 1355 1356 err = aggr_grp_modify_common(grp, update_mask, policy, mac_fixed, 1357 mac_addr, lacp_mode, lacp_timer); 1358 1359 mac_perim_exit(mph); 1360 AGGR_GRP_REFRELE(grp); 1361 return (err); 1362 } 1363 1364 /* 1365 * Create a new link aggregation group upon request from administrator. 1366 * Returns 0 on success, an errno on failure. 1367 */ 1368 int 1369 aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports, 1370 laioc_port_t *ports, uint32_t policy, boolean_t mac_fixed, boolean_t force, 1371 uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, aggr_lacp_timer_t lacp_timer, 1372 cred_t *credp) 1373 { 1374 aggr_grp_t *grp = NULL; 1375 aggr_port_t *port; 1376 mac_register_t *mac; 1377 boolean_t link_state_changed; 1378 mac_perim_handle_t mph; 1379 int err; 1380 int i; 1381 kt_did_t tid = 0; 1382 1383 /* need at least one port */ 1384 if (nports == 0) 1385 return (EINVAL); 1386 1387 rw_enter(&aggr_grp_lock, RW_WRITER); 1388 1389 /* does a group with the same linkid already exist? 
*/ 1390 err = mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1391 (mod_hash_val_t *)&grp); 1392 if (err == 0) { 1393 rw_exit(&aggr_grp_lock); 1394 return (EEXIST); 1395 } 1396 1397 grp = kmem_cache_alloc(aggr_grp_cache, KM_SLEEP); 1398 1399 grp->lg_refs = 1; 1400 grp->lg_closing = B_FALSE; 1401 grp->lg_force = force; 1402 grp->lg_linkid = linkid; 1403 grp->lg_zoneid = crgetzoneid(credp); 1404 grp->lg_ifspeed = 0; 1405 grp->lg_link_state = LINK_STATE_UNKNOWN; 1406 grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN; 1407 grp->lg_started = B_FALSE; 1408 grp->lg_promisc = B_FALSE; 1409 grp->lg_lacp_done = B_FALSE; 1410 grp->lg_tx_notify_done = B_FALSE; 1411 grp->lg_lacp_head = grp->lg_lacp_tail = NULL; 1412 grp->lg_lacp_rx_thread = thread_create(NULL, 0, 1413 aggr_lacp_rx_thread, grp, 0, &p0, TS_RUN, minclsyspri); 1414 grp->lg_tx_notify_thread = thread_create(NULL, 0, 1415 aggr_tx_notify_thread, grp, 0, &p0, TS_RUN, minclsyspri); 1416 grp->lg_tx_blocked_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) * 1417 MAX_RINGS_PER_GROUP), KM_SLEEP); 1418 grp->lg_tx_blocked_cnt = 0; 1419 bzero(&grp->lg_rx_groups, 1420 sizeof (aggr_pseudo_rx_group_t) * MAX_GROUPS_PER_PORT); 1421 bzero(&grp->lg_tx_group, sizeof (aggr_pseudo_tx_group_t)); 1422 aggr_lacp_init_grp(grp); 1423 1424 /* add MAC ports to group */ 1425 grp->lg_ports = NULL; 1426 grp->lg_nports = 0; 1427 grp->lg_nattached_ports = 0; 1428 grp->lg_ntx_ports = 0; 1429 1430 /* 1431 * If key is not specified by the user, allocate the key. 
1432 */ 1433 if ((key == 0) && ((key = (uint32_t)id_alloc(key_ids)) == 0)) { 1434 err = ENOMEM; 1435 goto bail; 1436 } 1437 grp->lg_key = key; 1438 1439 for (i = 0; i < nports; i++) { 1440 err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, &port); 1441 if (err != 0) 1442 goto bail; 1443 } 1444 1445 grp->lg_rx_group_count = 1; 1446 1447 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 1448 uint_t num_rgroups; 1449 1450 mac_perim_enter_by_mh(port->lp_mh, &mph); 1451 num_rgroups = mac_get_num_rx_groups(port->lp_mh); 1452 mac_perim_exit(mph); 1453 1454 /* 1455 * Utilize all the groups in a port. If some ports 1456 * have less groups than others, then traffic destined 1457 * for the same unicast address may be HW classified 1458 * on some ports but SW classified by aggr when 1459 * arriving on other ports. 1460 */ 1461 grp->lg_rx_group_count = MAX(grp->lg_rx_group_count, 1462 num_rgroups); 1463 } 1464 1465 /* 1466 * There could be cases where the hardware provides more 1467 * groups than aggr can support. Make sure we never go above 1468 * the max aggr can support. 1469 */ 1470 grp->lg_rx_group_count = MIN(grp->lg_rx_group_count, 1471 MAX_GROUPS_PER_PORT); 1472 1473 ASSERT3U(grp->lg_rx_group_count, >, 0); 1474 for (i = 0; i < MAX_GROUPS_PER_PORT; i++) { 1475 grp->lg_rx_groups[i].arg_index = i; 1476 grp->lg_rx_groups[i].arg_untagged = 0; 1477 list_create(&(grp->lg_rx_groups[i].arg_vlans), 1478 sizeof (aggr_vlan_t), offsetof(aggr_vlan_t, av_link)); 1479 } 1480 1481 /* 1482 * If no explicit MAC address was specified by the administrator, 1483 * set it to the MAC address of the first port. 
1484 */ 1485 grp->lg_addr_fixed = mac_fixed; 1486 if (grp->lg_addr_fixed) { 1487 /* validate specified address */ 1488 if (bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) { 1489 err = EINVAL; 1490 goto bail; 1491 } 1492 bcopy(mac_addr, grp->lg_addr, ETHERADDRL); 1493 } else { 1494 bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL); 1495 grp->lg_mac_addr_port = grp->lg_ports; 1496 } 1497 1498 /* Set the initial group capabilities. */ 1499 aggr_grp_capab_set(grp); 1500 1501 if ((mac = mac_alloc(MAC_VERSION)) == NULL) { 1502 err = ENOMEM; 1503 goto bail; 1504 } 1505 mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER; 1506 mac->m_driver = grp; 1507 mac->m_dip = aggr_dip; 1508 mac->m_instance = grp->lg_key > AGGR_MAX_KEY ? (uint_t)-1 : grp->lg_key; 1509 mac->m_src_addr = grp->lg_addr; 1510 mac->m_callbacks = &aggr_m_callbacks; 1511 mac->m_min_sdu = 0; 1512 mac->m_max_sdu = grp->lg_max_sdu = aggr_grp_max_sdu(grp); 1513 mac->m_margin = aggr_grp_max_margin(grp); 1514 mac->m_v12n = MAC_VIRT_LEVEL1; 1515 err = mac_register(mac, &grp->lg_mh); 1516 mac_free(mac); 1517 if (err != 0) 1518 goto bail; 1519 1520 err = dls_devnet_create(grp->lg_mh, grp->lg_linkid, crgetzoneid(credp)); 1521 if (err != 0) { 1522 (void) mac_unregister(grp->lg_mh); 1523 grp->lg_mh = NULL; 1524 goto bail; 1525 } 1526 1527 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1528 1529 /* 1530 * Update the MAC address of the constituent ports. 1531 * None of the port is attached at this time, the link state of the 1532 * aggregation will not change. 1533 * 1534 * All ports take on the primary MAC address of the aggr 1535 * (lg_aggr). At this point, none of the ports are attached; 1536 * thus the link state of the aggregation will not change. 1537 */ 1538 link_state_changed = aggr_grp_update_ports_mac(grp); 1539 ASSERT(!link_state_changed); 1540 1541 /* Update outbound load balancing policy. */ 1542 aggr_send_update_policy(grp, policy); 1543 1544 /* Set LACP mode. 
*/ 1545 aggr_lacp_set_mode(grp, lacp_mode, lacp_timer); 1546 1547 /* 1548 * Attach each port if necessary. 1549 */ 1550 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 1551 /* 1552 * Create the pseudo ring for each HW ring of the 1553 * underlying port. Note that this is done after the 1554 * aggr registers its MAC. 1555 */ 1556 VERIFY3S(aggr_add_pseudo_tx_group(port, &grp->lg_tx_group), 1557 ==, 0); 1558 1559 for (i = 0; i < grp->lg_rx_group_count; i++) { 1560 VERIFY3S(aggr_add_pseudo_rx_group(port, 1561 &grp->lg_rx_groups[i]), ==, 0); 1562 } 1563 1564 if (aggr_port_notify_link(grp, port)) 1565 link_state_changed = B_TRUE; 1566 1567 /* 1568 * Initialize the callback functions for this port. 1569 */ 1570 aggr_port_init_callbacks(port); 1571 } 1572 1573 if (link_state_changed) 1574 mac_link_update(grp->lg_mh, grp->lg_link_state); 1575 1576 /* add new group to hash table */ 1577 err = mod_hash_insert(aggr_grp_hash, GRP_HASH_KEY(linkid), 1578 (mod_hash_val_t)grp); 1579 ASSERT(err == 0); 1580 aggr_grp_cnt++; 1581 1582 mac_perim_exit(mph); 1583 rw_exit(&aggr_grp_lock); 1584 return (0); 1585 1586 bail: 1587 1588 grp->lg_closing = B_TRUE; 1589 1590 port = grp->lg_ports; 1591 while (port != NULL) { 1592 aggr_port_t *cport; 1593 1594 cport = port->lp_next; 1595 aggr_port_delete(port); 1596 port = cport; 1597 } 1598 1599 /* 1600 * Inform the lacp_rx thread to exit. 1601 */ 1602 mutex_enter(&grp->lg_lacp_lock); 1603 grp->lg_lacp_done = B_TRUE; 1604 cv_signal(&grp->lg_lacp_cv); 1605 while (grp->lg_lacp_rx_thread != NULL) 1606 cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock); 1607 mutex_exit(&grp->lg_lacp_lock); 1608 /* 1609 * Inform the tx_notify thread to exit. 
1610 */ 1611 mutex_enter(&grp->lg_tx_flowctl_lock); 1612 if (grp->lg_tx_notify_thread != NULL) { 1613 tid = grp->lg_tx_notify_thread->t_did; 1614 grp->lg_tx_notify_done = B_TRUE; 1615 cv_signal(&grp->lg_tx_flowctl_cv); 1616 } 1617 mutex_exit(&grp->lg_tx_flowctl_lock); 1618 if (tid != 0) 1619 thread_join(tid); 1620 1621 kmem_free(grp->lg_tx_blocked_rings, 1622 (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP)); 1623 rw_exit(&aggr_grp_lock); 1624 AGGR_GRP_REFRELE(grp); 1625 return (err); 1626 } 1627 1628 /* 1629 * Return a pointer to the member of a group with specified linkid. 1630 */ 1631 static aggr_port_t * 1632 aggr_grp_port_lookup(aggr_grp_t *grp, datalink_id_t linkid) 1633 { 1634 aggr_port_t *port; 1635 1636 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 1637 1638 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 1639 if (port->lp_linkid == linkid) 1640 break; 1641 } 1642 1643 return (port); 1644 } 1645 1646 /* 1647 * Stop, detach and remove a port from a link aggregation group. 1648 */ 1649 static int 1650 aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port, 1651 boolean_t *mac_addr_changedp, boolean_t *link_state_changedp) 1652 { 1653 int rc = 0; 1654 aggr_port_t **pport; 1655 boolean_t mac_addr_changed = B_FALSE; 1656 boolean_t link_state_changed = B_FALSE; 1657 mac_perim_handle_t mph; 1658 uint64_t val; 1659 uint_t i; 1660 uint_t stat; 1661 1662 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 1663 ASSERT(grp->lg_nports > 1); 1664 ASSERT(!grp->lg_closing); 1665 1666 /* unlink port */ 1667 for (pport = &grp->lg_ports; *pport != port; 1668 pport = &(*pport)->lp_next) { 1669 if (*pport == NULL) { 1670 rc = ENOENT; 1671 goto done; 1672 } 1673 } 1674 *pport = port->lp_next; 1675 1676 mac_perim_enter_by_mh(port->lp_mh, &mph); 1677 1678 /* 1679 * If the MAC address of the port being removed was assigned 1680 * to the group, update the group MAC address 1681 * using the MAC address of a different port. 
1682 */ 1683 if (!grp->lg_addr_fixed && grp->lg_mac_addr_port == port) { 1684 /* 1685 * Set the MAC address of the group to the 1686 * MAC address of its first port. 1687 */ 1688 bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL); 1689 grp->lg_mac_addr_port = grp->lg_ports; 1690 mac_addr_changed = B_TRUE; 1691 } 1692 1693 link_state_changed = aggr_grp_detach_port(grp, port); 1694 1695 /* 1696 * Add the counter statistics of the ports while it was aggregated 1697 * to the group's residual statistics. This is done by obtaining 1698 * the current counter from the underlying MAC then subtracting the 1699 * value of the counter at the moment it was added to the 1700 * aggregation. 1701 */ 1702 for (i = 0; i < MAC_NSTAT; i++) { 1703 stat = i + MAC_STAT_MIN; 1704 if (!MAC_STAT_ISACOUNTER(stat)) 1705 continue; 1706 val = aggr_port_stat(port, stat); 1707 val -= port->lp_stat[i]; 1708 mutex_enter(&grp->lg_stat_lock); 1709 grp->lg_stat[i] += val; 1710 mutex_exit(&grp->lg_stat_lock); 1711 } 1712 for (i = 0; i < ETHER_NSTAT; i++) { 1713 stat = i + MACTYPE_STAT_MIN; 1714 if (!ETHER_STAT_ISACOUNTER(stat)) 1715 continue; 1716 val = aggr_port_stat(port, stat); 1717 val -= port->lp_ether_stat[i]; 1718 mutex_enter(&grp->lg_stat_lock); 1719 grp->lg_ether_stat[i] += val; 1720 mutex_exit(&grp->lg_stat_lock); 1721 } 1722 1723 grp->lg_nports--; 1724 mac_perim_exit(mph); 1725 1726 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group); 1727 aggr_port_delete(port); 1728 1729 /* 1730 * If the group MAC address has changed, update the MAC address of 1731 * the remaining constituent ports according to the new MAC 1732 * address of the group. 
1733 */ 1734 if (mac_addr_changed && aggr_grp_update_ports_mac(grp)) 1735 link_state_changed = B_TRUE; 1736 1737 done: 1738 if (mac_addr_changedp != NULL) 1739 *mac_addr_changedp = mac_addr_changed; 1740 if (link_state_changedp != NULL) 1741 *link_state_changedp = link_state_changed; 1742 1743 return (rc); 1744 } 1745 1746 /* 1747 * Remove one or more ports from an existing link aggregation group. 1748 */ 1749 int 1750 aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports) 1751 { 1752 int rc = 0, i; 1753 aggr_grp_t *grp = NULL; 1754 aggr_port_t *port; 1755 boolean_t mac_addr_update = B_FALSE, mac_addr_changed; 1756 boolean_t link_state_update = B_FALSE, link_state_changed; 1757 mac_perim_handle_t mph, pmph; 1758 1759 /* get group corresponding to linkid */ 1760 rw_enter(&aggr_grp_lock, RW_READER); 1761 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1762 (mod_hash_val_t *)&grp) != 0) { 1763 rw_exit(&aggr_grp_lock); 1764 return (ENOENT); 1765 } 1766 AGGR_GRP_REFHOLD(grp); 1767 1768 /* 1769 * Hold the perimeter so that the aggregation won't be destroyed. 
1770 */ 1771 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1772 rw_exit(&aggr_grp_lock); 1773 1774 /* we need to keep at least one port per group */ 1775 if (nports >= grp->lg_nports) { 1776 rc = EINVAL; 1777 goto bail; 1778 } 1779 1780 /* first verify that all the groups are valid */ 1781 for (i = 0; i < nports; i++) { 1782 if (aggr_grp_port_lookup(grp, ports[i].lp_linkid) == NULL) { 1783 /* port not found */ 1784 rc = ENOENT; 1785 goto bail; 1786 } 1787 } 1788 1789 /* clear the promiscous mode for the specified ports */ 1790 for (i = 0; i < nports && rc == 0; i++) { 1791 /* lookup port */ 1792 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid); 1793 ASSERT(port != NULL); 1794 1795 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1796 rc = aggr_port_promisc(port, B_FALSE); 1797 mac_perim_exit(pmph); 1798 } 1799 if (rc != 0) { 1800 for (i = 0; i < nports; i++) { 1801 port = aggr_grp_port_lookup(grp, 1802 ports[i].lp_linkid); 1803 ASSERT(port != NULL); 1804 1805 /* 1806 * Turn the promiscuous mode back on if it is required 1807 * to receive the non-primary address over a port, or 1808 * the promiscous mode is enabled over the aggr. 1809 */ 1810 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1811 if (port->lp_started && (grp->lg_promisc || 1812 port->lp_prom_addr != NULL)) { 1813 (void) aggr_port_promisc(port, B_TRUE); 1814 } 1815 mac_perim_exit(pmph); 1816 } 1817 goto bail; 1818 } 1819 1820 /* remove the specified ports from group */ 1821 for (i = 0; i < nports; i++) { 1822 /* lookup port */ 1823 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid); 1824 ASSERT(port != NULL); 1825 1826 /* stop port if group has already been started */ 1827 if (grp->lg_started) { 1828 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1829 aggr_port_stop(port); 1830 mac_perim_exit(pmph); 1831 } 1832 1833 /* 1834 * aggr_rem_pseudo_tx_group() is not called here. Instead 1835 * it is called from inside aggr_grp_rem_port() after the 1836 * port has been detached. 
The reason is that 1837 * aggr_rem_pseudo_tx_group() removes one ring at a time 1838 * and if there is still traffic going on, then there 1839 * is the possibility of aggr_find_tx_ring() returning a 1840 * removed ring for transmission. Once the port has been 1841 * detached, that port will not be used and 1842 * aggr_find_tx_ring() will not return any rings 1843 * belonging to it. 1844 */ 1845 for (i = 0; i < grp->lg_rx_group_count; i++) 1846 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[i]); 1847 1848 /* remove port from group */ 1849 rc = aggr_grp_rem_port(grp, port, &mac_addr_changed, 1850 &link_state_changed); 1851 ASSERT(rc == 0); 1852 mac_addr_update = mac_addr_update || mac_addr_changed; 1853 link_state_update = link_state_update || link_state_changed; 1854 } 1855 1856 bail: 1857 if (mac_addr_update) 1858 mac_unicst_update(grp->lg_mh, grp->lg_addr); 1859 if (link_state_update) 1860 mac_link_update(grp->lg_mh, grp->lg_link_state); 1861 1862 mac_perim_exit(mph); 1863 AGGR_GRP_REFRELE(grp); 1864 1865 return (rc); 1866 } 1867 1868 int 1869 aggr_grp_delete(datalink_id_t linkid, cred_t *cred) 1870 { 1871 aggr_grp_t *grp = NULL; 1872 aggr_port_t *port, *cport; 1873 datalink_id_t tmpid; 1874 mod_hash_val_t val; 1875 mac_perim_handle_t mph, pmph; 1876 int err; 1877 kt_did_t tid = 0; 1878 1879 rw_enter(&aggr_grp_lock, RW_WRITER); 1880 1881 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1882 (mod_hash_val_t *)&grp) != 0) { 1883 rw_exit(&aggr_grp_lock); 1884 return (ENOENT); 1885 } 1886 1887 /* 1888 * Note that dls_devnet_destroy() must be called before lg_lock is 1889 * held. Otherwise, it will deadlock if another thread is in 1890 * aggr_m_stat() and thus has a kstat_hold() on the kstats that 1891 * dls_devnet_destroy() needs to delete. 
1892 */ 1893 if ((err = dls_devnet_destroy(grp->lg_mh, &tmpid, B_TRUE)) != 0) { 1894 rw_exit(&aggr_grp_lock); 1895 return (err); 1896 } 1897 ASSERT(linkid == tmpid); 1898 1899 /* 1900 * Unregister from the MAC service module. Since this can 1901 * fail if a client hasn't closed the MAC port, we gracefully 1902 * fail the operation. 1903 */ 1904 if ((err = mac_disable(grp->lg_mh)) != 0) { 1905 (void) dls_devnet_create(grp->lg_mh, linkid, crgetzoneid(cred)); 1906 rw_exit(&aggr_grp_lock); 1907 return (err); 1908 } 1909 (void) mod_hash_remove(aggr_grp_hash, GRP_HASH_KEY(linkid), &val); 1910 ASSERT(grp == (aggr_grp_t *)val); 1911 1912 ASSERT(aggr_grp_cnt > 0); 1913 aggr_grp_cnt--; 1914 rw_exit(&aggr_grp_lock); 1915 1916 /* 1917 * Inform the lacp_rx thread to exit. 1918 */ 1919 mutex_enter(&grp->lg_lacp_lock); 1920 grp->lg_lacp_done = B_TRUE; 1921 cv_signal(&grp->lg_lacp_cv); 1922 while (grp->lg_lacp_rx_thread != NULL) 1923 cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock); 1924 mutex_exit(&grp->lg_lacp_lock); 1925 /* 1926 * Inform the tx_notify_thread to exit. 
1927 */ 1928 mutex_enter(&grp->lg_tx_flowctl_lock); 1929 if (grp->lg_tx_notify_thread != NULL) { 1930 tid = grp->lg_tx_notify_thread->t_did; 1931 grp->lg_tx_notify_done = B_TRUE; 1932 cv_signal(&grp->lg_tx_flowctl_cv); 1933 } 1934 mutex_exit(&grp->lg_tx_flowctl_lock); 1935 if (tid != 0) 1936 thread_join(tid); 1937 1938 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1939 1940 grp->lg_closing = B_TRUE; 1941 /* detach and free MAC ports associated with group */ 1942 port = grp->lg_ports; 1943 while (port != NULL) { 1944 cport = port->lp_next; 1945 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1946 if (grp->lg_started) 1947 aggr_port_stop(port); 1948 (void) aggr_grp_detach_port(grp, port); 1949 mac_perim_exit(pmph); 1950 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group); 1951 for (uint_t i = 0; i < grp->lg_rx_group_count; i++) 1952 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[i]); 1953 aggr_port_delete(port); 1954 port = cport; 1955 } 1956 1957 mac_perim_exit(mph); 1958 1959 kmem_free(grp->lg_tx_blocked_rings, 1960 (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP)); 1961 /* 1962 * Wait for the port's lacp timer thread and its notification callback 1963 * to exit before calling mac_unregister() since both needs to access 1964 * the mac perimeter of the grp. 
1965 */ 1966 aggr_grp_port_wait(grp); 1967 1968 VERIFY(mac_unregister(grp->lg_mh) == 0); 1969 grp->lg_mh = NULL; 1970 1971 for (uint_t i = 0; i < MAX_GROUPS_PER_PORT; i++) { 1972 list_destroy(&(grp->lg_rx_groups[i].arg_vlans)); 1973 } 1974 1975 AGGR_GRP_REFRELE(grp); 1976 return (0); 1977 } 1978 1979 void 1980 aggr_grp_free(aggr_grp_t *grp) 1981 { 1982 ASSERT(grp->lg_refs == 0); 1983 ASSERT(grp->lg_port_ref == 0); 1984 if (grp->lg_key > AGGR_MAX_KEY) { 1985 id_free(key_ids, grp->lg_key); 1986 grp->lg_key = 0; 1987 } 1988 kmem_cache_free(aggr_grp_cache, grp); 1989 } 1990 1991 int 1992 aggr_grp_info(datalink_id_t linkid, void *fn_arg, 1993 aggr_grp_info_new_grp_fn_t new_grp_fn, 1994 aggr_grp_info_new_port_fn_t new_port_fn, cred_t *cred) 1995 { 1996 aggr_grp_t *grp; 1997 aggr_port_t *port; 1998 mac_perim_handle_t mph, pmph; 1999 int rc = 0; 2000 2001 /* 2002 * Make sure that the aggregation link is visible from the caller's 2003 * zone. 2004 */ 2005 if (!dls_devnet_islinkvisible(linkid, crgetzoneid(cred))) 2006 return (ENOENT); 2007 2008 rw_enter(&aggr_grp_lock, RW_READER); 2009 2010 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 2011 (mod_hash_val_t *)&grp) != 0) { 2012 rw_exit(&aggr_grp_lock); 2013 return (ENOENT); 2014 } 2015 AGGR_GRP_REFHOLD(grp); 2016 2017 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2018 rw_exit(&aggr_grp_lock); 2019 2020 rc = new_grp_fn(fn_arg, grp->lg_linkid, 2021 (grp->lg_key > AGGR_MAX_KEY) ? 
0 : grp->lg_key, grp->lg_addr, 2022 grp->lg_addr_fixed, grp->lg_force, grp->lg_tx_policy, 2023 grp->lg_nports, grp->lg_lacp_mode, grp->aggr.PeriodicTimer); 2024 2025 if (rc != 0) 2026 goto bail; 2027 2028 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2029 mac_perim_enter_by_mh(port->lp_mh, &pmph); 2030 rc = new_port_fn(fn_arg, port->lp_linkid, port->lp_addr, 2031 port->lp_state, &port->lp_lacp.ActorOperPortState); 2032 mac_perim_exit(pmph); 2033 2034 if (rc != 0) 2035 goto bail; 2036 } 2037 2038 bail: 2039 mac_perim_exit(mph); 2040 AGGR_GRP_REFRELE(grp); 2041 return (rc); 2042 } 2043 2044 /*ARGSUSED*/ 2045 static void 2046 aggr_m_ioctl(void *arg, queue_t *q, mblk_t *mp) 2047 { 2048 miocnak(q, mp, 0, ENOTSUP); 2049 } 2050 2051 static int 2052 aggr_grp_stat(aggr_grp_t *grp, uint_t stat, uint64_t *val) 2053 { 2054 aggr_port_t *port; 2055 uint_t stat_index; 2056 2057 ASSERT(MUTEX_HELD(&grp->lg_stat_lock)); 2058 2059 /* We only aggregate counter statistics. */ 2060 if (IS_MAC_STAT(stat) && !MAC_STAT_ISACOUNTER(stat) || 2061 IS_MACTYPE_STAT(stat) && !ETHER_STAT_ISACOUNTER(stat)) { 2062 return (ENOTSUP); 2063 } 2064 2065 /* 2066 * Counter statistics for a group are computed by aggregating the 2067 * counters of the members MACs while they were aggregated, plus 2068 * the residual counter of the group itself, which is updated each 2069 * time a MAC is removed from the group. 2070 */ 2071 *val = 0; 2072 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2073 /* actual port statistic */ 2074 *val += aggr_port_stat(port, stat); 2075 /* 2076 * minus the port stat when it was added, plus any residual 2077 * amount for the group. 
2078 */ 2079 if (IS_MAC_STAT(stat)) { 2080 stat_index = stat - MAC_STAT_MIN; 2081 *val -= port->lp_stat[stat_index]; 2082 *val += grp->lg_stat[stat_index]; 2083 } else if (IS_MACTYPE_STAT(stat)) { 2084 stat_index = stat - MACTYPE_STAT_MIN; 2085 *val -= port->lp_ether_stat[stat_index]; 2086 *val += grp->lg_ether_stat[stat_index]; 2087 } 2088 } 2089 return (0); 2090 } 2091 2092 int 2093 aggr_rx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val) 2094 { 2095 aggr_pseudo_rx_ring_t *rx_ring = (aggr_pseudo_rx_ring_t *)rdriver; 2096 2097 if (rx_ring->arr_hw_rh != NULL) { 2098 *val = mac_pseudo_rx_ring_stat_get(rx_ring->arr_hw_rh, stat); 2099 } else { 2100 aggr_port_t *port = rx_ring->arr_port; 2101 2102 *val = mac_stat_get(port->lp_mh, stat); 2103 2104 } 2105 return (0); 2106 } 2107 2108 int 2109 aggr_tx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val) 2110 { 2111 aggr_pseudo_tx_ring_t *tx_ring = (aggr_pseudo_tx_ring_t *)rdriver; 2112 2113 if (tx_ring->atr_hw_rh != NULL) { 2114 *val = mac_pseudo_tx_ring_stat_get(tx_ring->atr_hw_rh, stat); 2115 } else { 2116 aggr_port_t *port = tx_ring->atr_port; 2117 2118 *val = mac_stat_get(port->lp_mh, stat); 2119 } 2120 return (0); 2121 } 2122 2123 static int 2124 aggr_m_stat(void *arg, uint_t stat, uint64_t *val) 2125 { 2126 aggr_grp_t *grp = arg; 2127 int rval = 0; 2128 2129 mutex_enter(&grp->lg_stat_lock); 2130 2131 switch (stat) { 2132 case MAC_STAT_IFSPEED: 2133 *val = grp->lg_ifspeed; 2134 break; 2135 2136 case ETHER_STAT_LINK_DUPLEX: 2137 *val = grp->lg_link_duplex; 2138 break; 2139 2140 default: 2141 /* 2142 * For all other statistics, we return the aggregated stat 2143 * from the underlying ports. aggr_grp_stat() will set 2144 * rval appropriately if the statistic isn't a counter. 
2145 */ 2146 rval = aggr_grp_stat(grp, stat, val); 2147 } 2148 2149 mutex_exit(&grp->lg_stat_lock); 2150 return (rval); 2151 } 2152 2153 static int 2154 aggr_m_start(void *arg) 2155 { 2156 aggr_grp_t *grp = arg; 2157 aggr_port_t *port; 2158 mac_perim_handle_t mph, pmph; 2159 2160 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2161 2162 /* 2163 * Attempts to start all configured members of the group. 2164 * Group members will be attached when their link-up notification 2165 * is received. 2166 */ 2167 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2168 mac_perim_enter_by_mh(port->lp_mh, &pmph); 2169 if (aggr_port_start(port) != 0) { 2170 mac_perim_exit(pmph); 2171 continue; 2172 } 2173 2174 /* 2175 * Turn on the promiscuous mode if it is required to receive 2176 * the non-primary address over a port, or the promiscous 2177 * mode is enabled over the aggr. 2178 */ 2179 if (grp->lg_promisc || port->lp_prom_addr != NULL) { 2180 if (aggr_port_promisc(port, B_TRUE) != 0) 2181 aggr_port_stop(port); 2182 } 2183 mac_perim_exit(pmph); 2184 } 2185 2186 grp->lg_started = B_TRUE; 2187 2188 mac_perim_exit(mph); 2189 return (0); 2190 } 2191 2192 static void 2193 aggr_m_stop(void *arg) 2194 { 2195 aggr_grp_t *grp = arg; 2196 aggr_port_t *port; 2197 mac_perim_handle_t mph, pmph; 2198 2199 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2200 2201 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2202 mac_perim_enter_by_mh(port->lp_mh, &pmph); 2203 2204 /* reset port promiscuous mode */ 2205 (void) aggr_port_promisc(port, B_FALSE); 2206 2207 aggr_port_stop(port); 2208 mac_perim_exit(pmph); 2209 } 2210 2211 grp->lg_started = B_FALSE; 2212 mac_perim_exit(mph); 2213 } 2214 2215 static int 2216 aggr_m_promisc(void *arg, boolean_t on) 2217 { 2218 aggr_grp_t *grp = arg; 2219 aggr_port_t *port; 2220 boolean_t link_state_changed = B_FALSE; 2221 mac_perim_handle_t mph, pmph; 2222 2223 AGGR_GRP_REFHOLD(grp); 2224 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2225 2226 
ASSERT(!grp->lg_closing); 2227 2228 if (on == grp->lg_promisc) 2229 goto bail; 2230 2231 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2232 int err = 0; 2233 2234 mac_perim_enter_by_mh(port->lp_mh, &pmph); 2235 AGGR_PORT_REFHOLD(port); 2236 if (!on && (port->lp_prom_addr == NULL)) 2237 err = aggr_port_promisc(port, B_FALSE); 2238 else if (on && port->lp_started) 2239 err = aggr_port_promisc(port, B_TRUE); 2240 2241 if (err != 0) { 2242 if (aggr_grp_detach_port(grp, port)) 2243 link_state_changed = B_TRUE; 2244 } else { 2245 /* 2246 * If a port was detached because of a previous 2247 * failure changing the promiscuity, the port 2248 * is reattached when it successfully changes 2249 * the promiscuity now, and this might cause 2250 * the link state of the aggregation to change. 2251 */ 2252 if (aggr_grp_attach_port(grp, port)) 2253 link_state_changed = B_TRUE; 2254 } 2255 mac_perim_exit(pmph); 2256 AGGR_PORT_REFRELE(port); 2257 } 2258 2259 grp->lg_promisc = on; 2260 2261 if (link_state_changed) 2262 mac_link_update(grp->lg_mh, grp->lg_link_state); 2263 2264 bail: 2265 mac_perim_exit(mph); 2266 AGGR_GRP_REFRELE(grp); 2267 2268 return (0); 2269 } 2270 2271 static void 2272 aggr_grp_port_rename(const char *new_name, void *arg) 2273 { 2274 /* 2275 * aggr port's mac client name is the format of "aggr link name" plus 2276 * AGGR_PORT_NAME_DELIMIT plus "underneath link name". 
2277 */ 2278 int aggr_len, link_len, clnt_name_len, i; 2279 char *str_end, *str_st, *str_del; 2280 char aggr_name[MAXNAMELEN]; 2281 char link_name[MAXNAMELEN]; 2282 char *clnt_name; 2283 aggr_grp_t *aggr_grp = arg; 2284 aggr_port_t *aggr_port = aggr_grp->lg_ports; 2285 2286 for (i = 0; i < aggr_grp->lg_nports; i++) { 2287 clnt_name = mac_client_name(aggr_port->lp_mch); 2288 clnt_name_len = strlen(clnt_name); 2289 str_st = clnt_name; 2290 str_end = &(clnt_name[clnt_name_len]); 2291 str_del = strchr(str_st, AGGR_PORT_NAME_DELIMIT); 2292 ASSERT(str_del != NULL); 2293 aggr_len = (intptr_t)((uintptr_t)str_del - (uintptr_t)str_st); 2294 link_len = (intptr_t)((uintptr_t)str_end - (uintptr_t)str_del); 2295 bzero(aggr_name, MAXNAMELEN); 2296 bzero(link_name, MAXNAMELEN); 2297 bcopy(clnt_name, aggr_name, aggr_len); 2298 bcopy(str_del, link_name, link_len + 1); 2299 bzero(clnt_name, MAXNAMELEN); 2300 (void) snprintf(clnt_name, MAXNAMELEN, "%s%s", new_name, 2301 link_name); 2302 2303 (void) mac_rename_primary(aggr_port->lp_mh, NULL); 2304 aggr_port = aggr_port->lp_next; 2305 } 2306 } 2307 2308 /* 2309 * Initialize the capabilities that are advertised for the group 2310 * according to the capabilities of the constituent ports. 
2311 */ 2312 static boolean_t 2313 aggr_m_capab_get(void *arg, mac_capab_t cap, void *cap_data) 2314 { 2315 aggr_grp_t *grp = arg; 2316 2317 switch (cap) { 2318 case MAC_CAPAB_HCKSUM: { 2319 uint32_t *hcksum_txflags = cap_data; 2320 *hcksum_txflags = grp->lg_hcksum_txflags; 2321 break; 2322 } 2323 case MAC_CAPAB_LSO: { 2324 mac_capab_lso_t *cap_lso = cap_data; 2325 2326 if (grp->lg_lso) { 2327 *cap_lso = grp->lg_cap_lso; 2328 break; 2329 } else { 2330 return (B_FALSE); 2331 } 2332 } 2333 case MAC_CAPAB_NO_NATIVEVLAN: 2334 return (!grp->lg_vlan); 2335 case MAC_CAPAB_NO_ZCOPY: 2336 return (!grp->lg_zcopy); 2337 case MAC_CAPAB_RINGS: { 2338 mac_capab_rings_t *cap_rings = cap_data; 2339 uint_t ring_cnt = 0; 2340 2341 for (uint_t i = 0; i < grp->lg_rx_group_count; i++) 2342 ring_cnt += grp->lg_rx_groups[i].arg_ring_cnt; 2343 2344 if (cap_rings->mr_type == MAC_RING_TYPE_RX) { 2345 cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; 2346 cap_rings->mr_rnum = ring_cnt; 2347 cap_rings->mr_gnum = grp->lg_rx_group_count; 2348 cap_rings->mr_gaddring = NULL; 2349 cap_rings->mr_gremring = NULL; 2350 } else { 2351 cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; 2352 cap_rings->mr_rnum = grp->lg_tx_group.atg_ring_cnt; 2353 cap_rings->mr_gnum = 0; 2354 } 2355 cap_rings->mr_rget = aggr_fill_ring; 2356 cap_rings->mr_gget = aggr_fill_group; 2357 break; 2358 } 2359 case MAC_CAPAB_AGGR: 2360 { 2361 mac_capab_aggr_t *aggr_cap; 2362 2363 if (cap_data != NULL) { 2364 aggr_cap = cap_data; 2365 aggr_cap->mca_rename_fn = aggr_grp_port_rename; 2366 aggr_cap->mca_unicst = aggr_m_unicst; 2367 aggr_cap->mca_find_tx_ring_fn = aggr_find_tx_ring; 2368 aggr_cap->mca_arg = arg; 2369 } 2370 return (B_TRUE); 2371 } 2372 default: 2373 return (B_FALSE); 2374 } 2375 return (B_TRUE); 2376 } 2377 2378 /* 2379 * Callback function for MAC layer to register groups. 
2380 */ 2381 static void 2382 aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index, 2383 mac_group_info_t *infop, mac_group_handle_t gh) 2384 { 2385 aggr_grp_t *grp = arg; 2386 2387 if (rtype == MAC_RING_TYPE_RX) { 2388 aggr_pseudo_rx_group_t *rx_group = &grp->lg_rx_groups[index]; 2389 2390 rx_group->arg_gh = gh; 2391 rx_group->arg_grp = grp; 2392 2393 infop->mgi_driver = (mac_group_driver_t)rx_group; 2394 infop->mgi_start = NULL; 2395 infop->mgi_stop = NULL; 2396 infop->mgi_addmac = aggr_addmac; 2397 infop->mgi_remmac = aggr_remmac; 2398 infop->mgi_count = rx_group->arg_ring_cnt; 2399 2400 /* 2401 * Always set the HW VLAN callbacks. They are smart 2402 * enough to know when a port has HW VLAN filters to 2403 * program and when it doesn't. 2404 */ 2405 infop->mgi_addvlan = aggr_addvlan; 2406 infop->mgi_remvlan = aggr_remvlan; 2407 } else { 2408 aggr_pseudo_tx_group_t *tx_group = &grp->lg_tx_group; 2409 2410 ASSERT3S(index, ==, 0); 2411 tx_group->atg_gh = gh; 2412 } 2413 } 2414 2415 /* 2416 * Callback funtion for MAC layer to register all rings. 2417 */ 2418 static void 2419 aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index, 2420 const int index, mac_ring_info_t *infop, mac_ring_handle_t rh) 2421 { 2422 aggr_grp_t *grp = arg; 2423 2424 switch (rtype) { 2425 case MAC_RING_TYPE_RX: { 2426 aggr_pseudo_rx_group_t *rx_group; 2427 aggr_pseudo_rx_ring_t *rx_ring; 2428 mac_intr_t aggr_mac_intr; 2429 2430 rx_group = &grp->lg_rx_groups[rg_index]; 2431 ASSERT3S(index, >=, 0); 2432 ASSERT3S(index, <, rx_group->arg_ring_cnt); 2433 rx_ring = rx_group->arg_rings + index; 2434 rx_ring->arr_rh = rh; 2435 2436 /* 2437 * Entrypoint to enable interrupt (disable poll) and 2438 * disable interrupt (enable poll). 
2439 */ 2440 aggr_mac_intr.mi_handle = (mac_intr_handle_t)rx_ring; 2441 aggr_mac_intr.mi_enable = aggr_pseudo_enable_intr; 2442 aggr_mac_intr.mi_disable = aggr_pseudo_disable_intr; 2443 aggr_mac_intr.mi_ddi_handle = NULL; 2444 2445 infop->mri_driver = (mac_ring_driver_t)rx_ring; 2446 infop->mri_start = aggr_pseudo_start_rx_ring; 2447 infop->mri_stop = aggr_pseudo_stop_rx_ring; 2448 2449 infop->mri_intr = aggr_mac_intr; 2450 infop->mri_poll = aggr_rx_poll; 2451 2452 infop->mri_stat = aggr_rx_ring_stat; 2453 break; 2454 } 2455 case MAC_RING_TYPE_TX: { 2456 aggr_pseudo_tx_group_t *tx_group = &grp->lg_tx_group; 2457 aggr_pseudo_tx_ring_t *tx_ring; 2458 2459 ASSERT(rg_index == -1); 2460 ASSERT(index < tx_group->atg_ring_cnt); 2461 2462 tx_ring = &tx_group->atg_rings[index]; 2463 tx_ring->atr_rh = rh; 2464 2465 infop->mri_driver = (mac_ring_driver_t)tx_ring; 2466 infop->mri_start = NULL; 2467 infop->mri_stop = NULL; 2468 infop->mri_tx = aggr_ring_tx; 2469 infop->mri_stat = aggr_tx_ring_stat; 2470 /* 2471 * Use the hw TX ring handle to find if the ring needs 2472 * serialization or not. For NICs that do not expose 2473 * Tx rings, atr_hw_rh will be NULL. 
2474 */ 2475 if (tx_ring->atr_hw_rh != NULL) { 2476 infop->mri_flags = 2477 mac_hwring_getinfo(tx_ring->atr_hw_rh); 2478 } 2479 break; 2480 } 2481 default: 2482 break; 2483 } 2484 } 2485 2486 static mblk_t * 2487 aggr_rx_poll(void *arg, int bytes_to_pickup) 2488 { 2489 aggr_pseudo_rx_ring_t *rr_ring = arg; 2490 aggr_port_t *port = rr_ring->arr_port; 2491 aggr_grp_t *grp = port->lp_grp; 2492 mblk_t *mp_chain, *mp, **mpp; 2493 2494 mp_chain = mac_hwring_poll(rr_ring->arr_hw_rh, bytes_to_pickup); 2495 2496 if (grp->lg_lacp_mode == AGGR_LACP_OFF) 2497 return (mp_chain); 2498 2499 mpp = &mp_chain; 2500 while ((mp = *mpp) != NULL) { 2501 if (MBLKL(mp) >= sizeof (struct ether_header)) { 2502 struct ether_header *ehp; 2503 2504 ehp = (struct ether_header *)mp->b_rptr; 2505 if (ntohs(ehp->ether_type) == ETHERTYPE_SLOW) { 2506 *mpp = mp->b_next; 2507 mp->b_next = NULL; 2508 aggr_recv_lacp(port, 2509 (mac_resource_handle_t)rr_ring, mp); 2510 continue; 2511 } 2512 } 2513 2514 if (!port->lp_collector_enabled) { 2515 *mpp = mp->b_next; 2516 mp->b_next = NULL; 2517 freemsg(mp); 2518 continue; 2519 } 2520 mpp = &mp->b_next; 2521 } 2522 return (mp_chain); 2523 } 2524 2525 static int 2526 aggr_addmac(void *arg, const uint8_t *mac_addr) 2527 { 2528 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg; 2529 aggr_unicst_addr_t *addr, **pprev; 2530 aggr_grp_t *grp = rx_group->arg_grp; 2531 aggr_port_t *port, *p; 2532 mac_perim_handle_t mph; 2533 int err = 0; 2534 uint_t idx = rx_group->arg_index; 2535 2536 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2537 2538 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) { 2539 mac_perim_exit(mph); 2540 return (0); 2541 } 2542 2543 /* 2544 * Insert this mac address into the list of mac addresses owned by 2545 * the aggregation pseudo group. 
2546 */ 2547 pprev = &rx_group->arg_macaddr; 2548 while ((addr = *pprev) != NULL) { 2549 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) == 0) { 2550 mac_perim_exit(mph); 2551 return (EEXIST); 2552 } 2553 pprev = &addr->aua_next; 2554 } 2555 addr = kmem_alloc(sizeof (aggr_unicst_addr_t), KM_SLEEP); 2556 bcopy(mac_addr, addr->aua_addr, ETHERADDRL); 2557 addr->aua_next = NULL; 2558 *pprev = addr; 2559 2560 for (port = grp->lg_ports; port != NULL; port = port->lp_next) 2561 if ((err = aggr_port_addmac(port, idx, mac_addr)) != 0) 2562 break; 2563 2564 if (err != 0) { 2565 for (p = grp->lg_ports; p != port; p = p->lp_next) 2566 aggr_port_remmac(p, idx, mac_addr); 2567 2568 *pprev = NULL; 2569 kmem_free(addr, sizeof (aggr_unicst_addr_t)); 2570 } 2571 2572 mac_perim_exit(mph); 2573 return (err); 2574 } 2575 2576 static int 2577 aggr_remmac(void *arg, const uint8_t *mac_addr) 2578 { 2579 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg; 2580 aggr_unicst_addr_t *addr, **pprev; 2581 aggr_grp_t *grp = rx_group->arg_grp; 2582 aggr_port_t *port; 2583 mac_perim_handle_t mph; 2584 int err = 0; 2585 2586 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2587 2588 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) { 2589 mac_perim_exit(mph); 2590 return (0); 2591 } 2592 2593 /* 2594 * Insert this mac address into the list of mac addresses owned by 2595 * the aggregation pseudo group. 
2596 */ 2597 pprev = &rx_group->arg_macaddr; 2598 while ((addr = *pprev) != NULL) { 2599 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) != 0) { 2600 pprev = &addr->aua_next; 2601 continue; 2602 } 2603 break; 2604 } 2605 if (addr == NULL) { 2606 mac_perim_exit(mph); 2607 return (EINVAL); 2608 } 2609 2610 for (port = grp->lg_ports; port != NULL; port = port->lp_next) 2611 aggr_port_remmac(port, rx_group->arg_index, mac_addr); 2612 2613 *pprev = addr->aua_next; 2614 kmem_free(addr, sizeof (aggr_unicst_addr_t)); 2615 2616 mac_perim_exit(mph); 2617 return (err); 2618 } 2619 2620 /* 2621 * Search for VID in the Rx group's list and return a pointer if 2622 * found. Otherwise return NULL. 2623 */ 2624 static aggr_vlan_t * 2625 aggr_find_vlan(aggr_pseudo_rx_group_t *rx_group, uint16_t vid) 2626 { 2627 ASSERT(MAC_PERIM_HELD(rx_group->arg_grp->lg_mh)); 2628 for (aggr_vlan_t *avp = list_head(&rx_group->arg_vlans); avp != NULL; 2629 avp = list_next(&rx_group->arg_vlans, avp)) { 2630 if (avp->av_vid == vid) 2631 return (avp); 2632 } 2633 2634 return (NULL); 2635 } 2636 2637 /* 2638 * Accept traffic on the specified VID. 2639 * 2640 * Persist VLAN state in the aggr so that ports added later will 2641 * receive the correct filters. In the future it would be nice to 2642 * allow aggr to iterate its clients instead of duplicating state. 2643 */ 2644 static int 2645 aggr_addvlan(mac_group_driver_t gdriver, uint16_t vid) 2646 { 2647 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)gdriver; 2648 aggr_grp_t *aggr = rx_group->arg_grp; 2649 aggr_port_t *port, *p; 2650 mac_perim_handle_t mph; 2651 int err = 0; 2652 aggr_vlan_t *avp = NULL; 2653 uint_t idx = rx_group->arg_index; 2654 2655 mac_perim_enter_by_mh(aggr->lg_mh, &mph); 2656 2657 if (vid == MAC_VLAN_UNTAGGED) { 2658 /* 2659 * Aggr is both a MAC provider and MAC client. As a 2660 * MAC provider it is passed MAC_VLAN_UNTAGGED by its 2661 * client. As a client itself, it should pass 2662 * VLAN_ID_NONE to its ports. 
2663 */ 2664 vid = VLAN_ID_NONE; 2665 rx_group->arg_untagged++; 2666 goto update_ports; 2667 } 2668 2669 avp = aggr_find_vlan(rx_group, vid); 2670 2671 if (avp != NULL) { 2672 avp->av_refs++; 2673 mac_perim_exit(mph); 2674 return (0); 2675 } 2676 2677 avp = kmem_zalloc(sizeof (aggr_vlan_t), KM_SLEEP); 2678 avp->av_vid = vid; 2679 avp->av_refs = 1; 2680 2681 update_ports: 2682 for (port = aggr->lg_ports; port != NULL; port = port->lp_next) 2683 if ((err = aggr_port_addvlan(port, idx, vid)) != 0) 2684 break; 2685 2686 if (err != 0) { 2687 /* 2688 * If any of these calls fail then we are in a 2689 * situation where the ports have different HW state. 2690 * There's no reasonable action the MAC client can 2691 * take in this scenario to rectify the situation. 2692 */ 2693 for (p = aggr->lg_ports; p != port; p = p->lp_next) { 2694 int err2; 2695 2696 if ((err2 = aggr_port_remvlan(p, idx, vid)) != 0) { 2697 cmn_err(CE_WARN, "Failed to remove VLAN %u" 2698 " from port %s: errno %d.", vid, 2699 mac_client_name(p->lp_mch), err2); 2700 } 2701 2702 } 2703 2704 if (vid == VLAN_ID_NONE) 2705 rx_group->arg_untagged--; 2706 2707 if (avp != NULL) { 2708 kmem_free(avp, sizeof (aggr_vlan_t)); 2709 avp = NULL; 2710 } 2711 } 2712 2713 if (avp != NULL) 2714 list_insert_tail(&rx_group->arg_vlans, avp); 2715 2716 done: 2717 mac_perim_exit(mph); 2718 return (err); 2719 } 2720 2721 /* 2722 * Stop accepting traffic on this VLAN if it's the last use of this VLAN. 2723 */ 2724 static int 2725 aggr_remvlan(mac_group_driver_t gdriver, uint16_t vid) 2726 { 2727 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)gdriver; 2728 aggr_grp_t *aggr = rx_group->arg_grp; 2729 aggr_port_t *port, *p; 2730 mac_perim_handle_t mph; 2731 int err = 0; 2732 aggr_vlan_t *avp = NULL; 2733 uint_t idx = rx_group->arg_index; 2734 2735 mac_perim_enter_by_mh(aggr->lg_mh, &mph); 2736 2737 /* 2738 * See the comment in aggr_addvlan(). 
2739 */ 2740 if (vid == MAC_VLAN_UNTAGGED) { 2741 vid = VLAN_ID_NONE; 2742 rx_group->arg_untagged--; 2743 2744 if (rx_group->arg_untagged > 0) 2745 goto done; 2746 2747 goto update_ports; 2748 } 2749 2750 avp = aggr_find_vlan(rx_group, vid); 2751 2752 if (avp == NULL) { 2753 err = ENOENT; 2754 goto done; 2755 } 2756 2757 avp->av_refs--; 2758 2759 if (avp->av_refs > 0) 2760 goto done; 2761 2762 update_ports: 2763 for (port = aggr->lg_ports; port != NULL; port = port->lp_next) 2764 if ((err = aggr_port_remvlan(port, idx, vid)) != 0) 2765 break; 2766 2767 /* 2768 * See the comment in aggr_addvlan() for justification of the 2769 * use of VERIFY here. 2770 */ 2771 if (err != 0) { 2772 for (p = aggr->lg_ports; p != port; p = p->lp_next) { 2773 int err2; 2774 2775 if ((err2 = aggr_port_addvlan(p, idx, vid)) != 0) { 2776 cmn_err(CE_WARN, "Failed to add VLAN %u" 2777 " to port %s: errno %d.", vid, 2778 mac_client_name(p->lp_mch), err2); 2779 } 2780 } 2781 2782 if (avp != NULL) 2783 avp->av_refs++; 2784 2785 if (vid == VLAN_ID_NONE) 2786 rx_group->arg_untagged++; 2787 2788 goto done; 2789 } 2790 2791 if (err == 0 && avp != NULL) { 2792 VERIFY3U(avp->av_refs, ==, 0); 2793 list_remove(&rx_group->arg_vlans, avp); 2794 kmem_free(avp, sizeof (aggr_vlan_t)); 2795 } 2796 2797 done: 2798 mac_perim_exit(mph); 2799 return (err); 2800 } 2801 2802 /* 2803 * Add or remove the multicast addresses that are defined for the group 2804 * to or from the specified port. 2805 * 2806 * Note that aggr_grp_multicst_port(..., B_TRUE) is called when the port 2807 * is started and attached, and aggr_grp_multicst_port(..., B_FALSE) is 2808 * called when the port is either stopped or detached. 
2809 */ 2810 void 2811 aggr_grp_multicst_port(aggr_port_t *port, boolean_t add) 2812 { 2813 aggr_grp_t *grp = port->lp_grp; 2814 2815 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 2816 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 2817 2818 if (!port->lp_started || port->lp_state != AGGR_PORT_STATE_ATTACHED) 2819 return; 2820 2821 mac_multicast_refresh(grp->lg_mh, aggr_port_multicst, port, add); 2822 } 2823 2824 static int 2825 aggr_m_multicst(void *arg, boolean_t add, const uint8_t *addrp) 2826 { 2827 aggr_grp_t *grp = arg; 2828 aggr_port_t *port = NULL, *errport = NULL; 2829 mac_perim_handle_t mph; 2830 int err = 0; 2831 2832 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2833 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2834 if (port->lp_state != AGGR_PORT_STATE_ATTACHED || 2835 !port->lp_started) { 2836 continue; 2837 } 2838 err = aggr_port_multicst(port, add, addrp); 2839 if (err != 0) { 2840 errport = port; 2841 break; 2842 } 2843 } 2844 2845 /* 2846 * At least one port caused error return and this error is returned to 2847 * mac, eventually a NAK would be sent upwards. 2848 * Some ports have this multicast address listed now, and some don't. 2849 * Treat this error as a whole aggr failure not individual port failure. 2850 * Therefore remove this multicast address from other ports. 
2851 */ 2852 if ((err != 0) && add) { 2853 for (port = grp->lg_ports; port != errport; 2854 port = port->lp_next) { 2855 if (port->lp_state != AGGR_PORT_STATE_ATTACHED || 2856 !port->lp_started) { 2857 continue; 2858 } 2859 (void) aggr_port_multicst(port, B_FALSE, addrp); 2860 } 2861 } 2862 mac_perim_exit(mph); 2863 return (err); 2864 } 2865 2866 static int 2867 aggr_m_unicst(void *arg, const uint8_t *macaddr) 2868 { 2869 aggr_grp_t *grp = arg; 2870 mac_perim_handle_t mph; 2871 int err; 2872 2873 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2874 err = aggr_grp_modify_common(grp, AGGR_MODIFY_MAC, 0, B_TRUE, macaddr, 2875 0, 0); 2876 mac_perim_exit(mph); 2877 return (err); 2878 } 2879 2880 /* 2881 * Initialize the capabilities that are advertised for the group 2882 * according to the capabilities of the constituent ports. 2883 */ 2884 static void 2885 aggr_grp_capab_set(aggr_grp_t *grp) 2886 { 2887 uint32_t cksum; 2888 aggr_port_t *port; 2889 mac_capab_lso_t cap_lso; 2890 2891 ASSERT(grp->lg_mh == NULL); 2892 ASSERT(grp->lg_ports != NULL); 2893 2894 grp->lg_hcksum_txflags = (uint32_t)-1; 2895 grp->lg_zcopy = B_TRUE; 2896 grp->lg_vlan = B_TRUE; 2897 2898 grp->lg_lso = B_TRUE; 2899 grp->lg_cap_lso.lso_flags = (t_uscalar_t)-1; 2900 grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max = (t_uscalar_t)-1; 2901 2902 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2903 if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &cksum)) 2904 cksum = 0; 2905 grp->lg_hcksum_txflags &= cksum; 2906 2907 grp->lg_vlan &= 2908 !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL); 2909 2910 grp->lg_zcopy &= 2911 !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL); 2912 2913 grp->lg_lso &= 2914 mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso); 2915 if (grp->lg_lso) { 2916 grp->lg_cap_lso.lso_flags &= cap_lso.lso_flags; 2917 if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max > 2918 cap_lso.lso_basic_tcp_ipv4.lso_max) 2919 grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max = 2920 
cap_lso.lso_basic_tcp_ipv4.lso_max; 2921 } 2922 } 2923 } 2924 2925 /* 2926 * Checks whether the capabilities of the port being added are compatible 2927 * with the current capabilities of the aggregation. 2928 */ 2929 static boolean_t 2930 aggr_grp_capab_check(aggr_grp_t *grp, aggr_port_t *port) 2931 { 2932 uint32_t hcksum_txflags; 2933 2934 ASSERT(grp->lg_ports != NULL); 2935 2936 if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL)) & 2937 grp->lg_vlan) != grp->lg_vlan) { 2938 return (B_FALSE); 2939 } 2940 2941 if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL)) & 2942 grp->lg_zcopy) != grp->lg_zcopy) { 2943 return (B_FALSE); 2944 } 2945 2946 if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &hcksum_txflags)) { 2947 if (grp->lg_hcksum_txflags != 0) 2948 return (B_FALSE); 2949 } else if ((hcksum_txflags & grp->lg_hcksum_txflags) != 2950 grp->lg_hcksum_txflags) { 2951 return (B_FALSE); 2952 } 2953 2954 if (grp->lg_lso) { 2955 mac_capab_lso_t cap_lso; 2956 2957 if (mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso)) { 2958 if ((grp->lg_cap_lso.lso_flags & cap_lso.lso_flags) != 2959 grp->lg_cap_lso.lso_flags) 2960 return (B_FALSE); 2961 if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max > 2962 cap_lso.lso_basic_tcp_ipv4.lso_max) 2963 return (B_FALSE); 2964 } else { 2965 return (B_FALSE); 2966 } 2967 } 2968 2969 return (B_TRUE); 2970 } 2971 2972 /* 2973 * Returns the maximum SDU according to the SDU of the constituent ports. 
2974 */ 2975 static uint_t 2976 aggr_grp_max_sdu(aggr_grp_t *grp) 2977 { 2978 uint_t max_sdu = (uint_t)-1; 2979 aggr_port_t *port; 2980 2981 ASSERT(grp->lg_ports != NULL); 2982 2983 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2984 uint_t port_sdu_max; 2985 2986 mac_sdu_get(port->lp_mh, NULL, &port_sdu_max); 2987 if (max_sdu > port_sdu_max) 2988 max_sdu = port_sdu_max; 2989 } 2990 2991 return (max_sdu); 2992 } 2993 2994 /* 2995 * Checks if the maximum SDU of the specified port is compatible 2996 * with the maximum SDU of the specified aggregation group, returns 2997 * B_TRUE if it is, B_FALSE otherwise. 2998 */ 2999 static boolean_t 3000 aggr_grp_sdu_check(aggr_grp_t *grp, aggr_port_t *port) 3001 { 3002 uint_t port_sdu_max; 3003 3004 mac_sdu_get(port->lp_mh, NULL, &port_sdu_max); 3005 return (port_sdu_max >= grp->lg_max_sdu); 3006 } 3007 3008 /* 3009 * Returns the maximum margin according to the margin of the constituent ports. 3010 */ 3011 static uint32_t 3012 aggr_grp_max_margin(aggr_grp_t *grp) 3013 { 3014 uint32_t margin = UINT32_MAX; 3015 aggr_port_t *port; 3016 3017 ASSERT(grp->lg_mh == NULL); 3018 ASSERT(grp->lg_ports != NULL); 3019 3020 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 3021 if (margin > port->lp_margin) 3022 margin = port->lp_margin; 3023 } 3024 3025 grp->lg_margin = margin; 3026 return (margin); 3027 } 3028 3029 /* 3030 * Checks if the maximum margin of the specified port is compatible 3031 * with the maximum margin of the specified aggregation group, returns 3032 * B_TRUE if it is, B_FALSE otherwise. 3033 */ 3034 static boolean_t 3035 aggr_grp_margin_check(aggr_grp_t *grp, aggr_port_t *port) 3036 { 3037 if (port->lp_margin >= grp->lg_margin) 3038 return (B_TRUE); 3039 3040 /* 3041 * See whether the current margin value is allowed to be changed to 3042 * the new value. 
3043 */ 3044 if (!mac_margin_update(grp->lg_mh, port->lp_margin)) 3045 return (B_FALSE); 3046 3047 grp->lg_margin = port->lp_margin; 3048 return (B_TRUE); 3049 } 3050 3051 /* 3052 * Set MTU on individual ports of an aggregation group 3053 */ 3054 static int 3055 aggr_set_port_sdu(aggr_grp_t *grp, aggr_port_t *port, uint32_t sdu, 3056 uint32_t *old_mtu) 3057 { 3058 boolean_t removed = B_FALSE; 3059 mac_perim_handle_t mph; 3060 mac_diag_t diag; 3061 int err, rv, retry = 0; 3062 3063 if (port->lp_mah != NULL) { 3064 (void) mac_unicast_remove(port->lp_mch, port->lp_mah); 3065 port->lp_mah = NULL; 3066 removed = B_TRUE; 3067 } 3068 err = mac_set_mtu(port->lp_mh, sdu, old_mtu); 3069 try_again: 3070 if (removed && (rv = mac_unicast_add(port->lp_mch, NULL, 3071 MAC_UNICAST_PRIMARY | MAC_UNICAST_DISABLE_TX_VID_CHECK, 3072 &port->lp_mah, 0, &diag)) != 0) { 3073 /* 3074 * following is a workaround for a bug in 'bge' driver. 3075 * See CR 6794654 for more information and this work around 3076 * will be removed once the CR is fixed. 3077 */ 3078 if (rv == EIO && retry++ < 3) { 3079 delay(2 * hz); 3080 goto try_again; 3081 } 3082 /* 3083 * if mac_unicast_add() failed while setting the MTU, 3084 * detach the port from the group. 3085 */ 3086 mac_perim_enter_by_mh(port->lp_mh, &mph); 3087 (void) aggr_grp_detach_port(grp, port); 3088 mac_perim_exit(mph); 3089 cmn_err(CE_WARN, "Unable to restart the port %s while " 3090 "setting MTU. 
Detaching the port from the aggregation.", 3091 mac_client_name(port->lp_mch)); 3092 } 3093 return (err); 3094 } 3095 3096 static int 3097 aggr_sdu_update(aggr_grp_t *grp, uint32_t sdu) 3098 { 3099 int err = 0, i, rv; 3100 aggr_port_t *port; 3101 uint32_t *mtu; 3102 3103 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 3104 3105 /* 3106 * If the MTU being set is equal to aggr group's maximum 3107 * allowable value, then there is nothing to change 3108 */ 3109 if (sdu == grp->lg_max_sdu) 3110 return (0); 3111 3112 /* 0 is aggr group's min sdu */ 3113 if (sdu == 0) 3114 return (EINVAL); 3115 3116 mtu = kmem_alloc(sizeof (uint32_t) * grp->lg_nports, KM_SLEEP); 3117 for (port = grp->lg_ports, i = 0; port != NULL && err == 0; 3118 port = port->lp_next, i++) { 3119 err = aggr_set_port_sdu(grp, port, sdu, mtu + i); 3120 } 3121 if (err != 0) { 3122 /* recover from error: reset the mtus of the ports */ 3123 aggr_port_t *tmp; 3124 3125 for (tmp = grp->lg_ports, i = 0; tmp != port; 3126 tmp = tmp->lp_next, i++) { 3127 (void) aggr_set_port_sdu(grp, tmp, *(mtu + i), NULL); 3128 } 3129 goto bail; 3130 } 3131 grp->lg_max_sdu = aggr_grp_max_sdu(grp); 3132 rv = mac_maxsdu_update(grp->lg_mh, grp->lg_max_sdu); 3133 ASSERT(rv == 0); 3134 bail: 3135 kmem_free(mtu, sizeof (uint32_t) * grp->lg_nports); 3136 return (err); 3137 } 3138 3139 /* 3140 * Callback functions for set/get of properties 3141 */ 3142 /*ARGSUSED*/ 3143 static int 3144 aggr_m_setprop(void *m_driver, const char *pr_name, mac_prop_id_t pr_num, 3145 uint_t pr_valsize, const void *pr_val) 3146 { 3147 int err = ENOTSUP; 3148 aggr_grp_t *grp = m_driver; 3149 3150 switch (pr_num) { 3151 case MAC_PROP_MTU: { 3152 uint32_t mtu; 3153 3154 if (pr_valsize < sizeof (mtu)) { 3155 err = EINVAL; 3156 break; 3157 } 3158 bcopy(pr_val, &mtu, sizeof (mtu)); 3159 err = aggr_sdu_update(grp, mtu); 3160 break; 3161 } 3162 default: 3163 break; 3164 } 3165 return (err); 3166 } 3167 3168 typedef struct rboundary { 3169 uint32_t bval; 3170 int btype; 3171 } 
rboundary_t; 3172 3173 /* 3174 * This function finds the intersection of mtu ranges stored in arrays - 3175 * mrange[0] ... mrange[mcount -1]. It returns the intersection in rval. 3176 * Individual arrays are assumed to contain non-overlapping ranges. 3177 * Algorithm: 3178 * A range has two boundaries - min and max. We scan all arrays and store 3179 * each boundary as a separate element in a temporary array. We also store 3180 * the boundary types, min or max, as +1 or -1 respectively in the temporary 3181 * array. Then we sort the temporary array in ascending order. We scan the 3182 * sorted array from lower to higher values and keep a cumulative sum of 3183 * boundary types. Element in the temporary array for which the sum reaches 3184 * mcount is a min boundary of a range in the result and next element will be 3185 * max boundary. 3186 * 3187 * Example for mcount = 3, 3188 * 3189 * ----|_________|-------|_______|----|__|------ mrange[0] 3190 * 3191 * -------|________|--|____________|-----|___|-- mrange[1] 3192 * 3193 * --------|________________|-------|____|------ mrange[2] 3194 * 3195 * 3 2 1 3196 * \|/ 3197 * 1 23 2 1 2 3 2 1 01 2 V 0 <- the sum 3198 * ----|--||-----|-|--|--|--|----|-||-|--|---|-- sorted array 3199 * 3200 * same min and max 3201 * V 3202 * --------|_____|-------|__|------------|------ intersecting ranges 3203 */ 3204 void 3205 aggr_mtu_range_intersection(mac_propval_range_t **mrange, int mcount, 3206 mac_propval_uint32_range_t **prval, int *prmaxcnt, int *prcount) 3207 { 3208 mac_propval_uint32_range_t *rval, *ur; 3209 int rmaxcnt, rcount; 3210 size_t sz_range32; 3211 rboundary_t *ta; /* temporary array */ 3212 rboundary_t temp; 3213 boolean_t range_started = B_FALSE; 3214 int i, j, m, sum; 3215 3216 sz_range32 = sizeof (mac_propval_uint32_range_t); 3217 3218 for (i = 0, rmaxcnt = 0; i < mcount; i++) 3219 rmaxcnt += mrange[i]->mpr_count; 3220 3221 /* Allocate enough space to store the results */ 3222 rval = kmem_alloc(rmaxcnt * sz_range32, 
KM_SLEEP); 3223 3224 /* Number of boundaries are twice as many as ranges */ 3225 ta = kmem_alloc(2 * rmaxcnt * sizeof (rboundary_t), KM_SLEEP); 3226 3227 for (i = 0, m = 0; i < mcount; i++) { 3228 ur = &(mrange[i]->mpr_range_uint32[0]); 3229 for (j = 0; j < mrange[i]->mpr_count; j++) { 3230 ta[m].bval = ur[j].mpur_min; 3231 ta[m++].btype = 1; 3232 ta[m].bval = ur[j].mpur_max; 3233 ta[m++].btype = -1; 3234 } 3235 } 3236 3237 /* 3238 * Sort the temporary array in ascending order of bval; 3239 * if boundary values are same then sort on btype. 3240 */ 3241 for (i = 0; i < m-1; i++) { 3242 for (j = i+1; j < m; j++) { 3243 if ((ta[i].bval > ta[j].bval) || 3244 ((ta[i].bval == ta[j].bval) && 3245 (ta[i].btype < ta[j].btype))) { 3246 temp = ta[i]; 3247 ta[i] = ta[j]; 3248 ta[j] = temp; 3249 } 3250 } 3251 } 3252 3253 /* Walk through temporary array to find all ranges in the results */ 3254 for (i = 0, sum = 0, rcount = 0; i < m; i++) { 3255 sum += ta[i].btype; 3256 if (sum == mcount) { 3257 rval[rcount].mpur_min = ta[i].bval; 3258 range_started = B_TRUE; 3259 } else if (sum < mcount && range_started) { 3260 rval[rcount++].mpur_max = ta[i].bval; 3261 range_started = B_FALSE; 3262 } 3263 } 3264 3265 *prval = rval; 3266 *prmaxcnt = rmaxcnt; 3267 *prcount = rcount; 3268 3269 kmem_free(ta, 2 * rmaxcnt * sizeof (rboundary_t)); 3270 } 3271 3272 /* 3273 * Returns the mtu ranges which could be supported by aggr group. 3274 * prmaxcnt returns the size of the buffer prval, prcount returns 3275 * the number of valid entries in prval. Caller is responsible 3276 * for freeing up prval. 
3277 */ 3278 int 3279 aggr_grp_possible_mtu_range(aggr_grp_t *grp, mac_propval_uint32_range_t **prval, 3280 int *prmaxcnt, int *prcount) 3281 { 3282 mac_propval_range_t **vals; 3283 aggr_port_t *port; 3284 mac_perim_handle_t mph; 3285 uint_t i, numr; 3286 int err = 0; 3287 size_t sz_propval, sz_range32; 3288 size_t size; 3289 3290 sz_propval = sizeof (mac_propval_range_t); 3291 sz_range32 = sizeof (mac_propval_uint32_range_t); 3292 3293 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 3294 3295 vals = kmem_zalloc(sizeof (mac_propval_range_t *) * grp->lg_nports, 3296 KM_SLEEP); 3297 3298 for (port = grp->lg_ports, i = 0; port != NULL; 3299 port = port->lp_next, i++) { 3300 3301 size = sz_propval; 3302 vals[i] = kmem_alloc(size, KM_SLEEP); 3303 vals[i]->mpr_count = 1; 3304 3305 mac_perim_enter_by_mh(port->lp_mh, &mph); 3306 3307 err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL, 3308 NULL, 0, vals[i], NULL); 3309 if (err == ENOSPC) { 3310 /* 3311 * Not enough space to hold all ranges. 3312 * Allocate extra space as indicated and retry. 
3313 */ 3314 numr = vals[i]->mpr_count; 3315 kmem_free(vals[i], sz_propval); 3316 size = sz_propval + (numr - 1) * sz_range32; 3317 vals[i] = kmem_alloc(size, KM_SLEEP); 3318 vals[i]->mpr_count = numr; 3319 err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL, 3320 NULL, 0, vals[i], NULL); 3321 ASSERT(err != ENOSPC); 3322 } 3323 mac_perim_exit(mph); 3324 if (err != 0) { 3325 kmem_free(vals[i], size); 3326 vals[i] = NULL; 3327 break; 3328 } 3329 } 3330 3331 /* 3332 * if any of the underlying ports does not support changing MTU then 3333 * just return ENOTSUP 3334 */ 3335 if (port != NULL) { 3336 ASSERT(err != 0); 3337 goto done; 3338 } 3339 3340 aggr_mtu_range_intersection(vals, grp->lg_nports, prval, prmaxcnt, 3341 prcount); 3342 3343 done: 3344 for (i = 0; i < grp->lg_nports; i++) { 3345 if (vals[i] != NULL) { 3346 numr = vals[i]->mpr_count; 3347 size = sz_propval + (numr - 1) * sz_range32; 3348 kmem_free(vals[i], size); 3349 } 3350 } 3351 3352 kmem_free(vals, sizeof (mac_propval_range_t *) * grp->lg_nports); 3353 return (err); 3354 } 3355 3356 static void 3357 aggr_m_propinfo(void *m_driver, const char *pr_name, mac_prop_id_t pr_num, 3358 mac_prop_info_handle_t prh) 3359 { 3360 aggr_grp_t *grp = m_driver; 3361 mac_propval_uint32_range_t *rval = NULL; 3362 int i, rcount, rmaxcnt; 3363 int err = 0; 3364 3365 _NOTE(ARGUNUSED(pr_name)); 3366 3367 switch (pr_num) { 3368 case MAC_PROP_MTU: 3369 3370 err = aggr_grp_possible_mtu_range(grp, &rval, &rmaxcnt, 3371 &rcount); 3372 if (err != 0) { 3373 ASSERT(rval == NULL); 3374 return; 3375 } 3376 for (i = 0; i < rcount; i++) { 3377 mac_prop_info_set_range_uint32(prh, 3378 rval[i].mpur_min, rval[i].mpur_max); 3379 } 3380 kmem_free(rval, sizeof (mac_propval_uint32_range_t) * rmaxcnt); 3381 break; 3382 } 3383 } 3384