1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright 2015 Joyent, Inc. 24 */ 25 26 /* 27 * IEEE 802.3ad Link Aggregation -- Link Aggregation Groups. 28 * 29 * An instance of the structure aggr_grp_t is allocated for each 30 * link aggregation group. When created, aggr_grp_t objects are 31 * entered into the aggr_grp_hash hash table maintained by the modhash 32 * module. The hash key is the linkid associated with the link 33 * aggregation group. 34 * 35 * A set of MAC ports are associated with each association group. 36 * 37 * Aggr pseudo TX rings 38 * -------------------- 39 * The underlying ports (NICs) in an aggregation can have TX rings. To 40 * enhance aggr's performance, these TX rings are made available to the 41 * aggr layer as pseudo TX rings. The concept of pseudo rings are not new. 42 * They are already present and implemented on the RX side. It is called 43 * as pseudo RX rings. The same concept is extended to the TX side where 44 * each TX ring of an underlying port is reflected in aggr as a pseudo 45 * TX ring. Thus each pseudo TX ring will map to a specific hardware TX 46 * ring. Even in the case of a NIC that does not have a TX ring, a pseudo 47 * TX ring is given to the aggregation layer. 48 * 49 * With this change, the outgoing stack depth looks much better: 50 * 51 * mac_tx() -> mac_tx_aggr_mode() -> mac_tx_soft_ring_process() -> 52 * mac_tx_send() -> aggr_ring_rx() -> <driver>_ring_tx() 53 * 54 * Two new modes are introduced to mac_tx() to handle aggr pseudo TX rings: 55 * SRS_TX_AGGR and SRS_TX_BW_AGGR. 56 * 57 * In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. This routine 58 * invokes an aggr function, aggr_find_tx_ring(), to find a (pseudo) TX 59 * ring belonging to a port on which the packet has to be sent. 60 * aggr_find_tx_ring() first finds the outgoing port based on L2/L3/L4 61 * policy and then uses the fanout_hint passed to it to pick a TX ring from 62 * the selected port. 63 * 64 * In SRS_TX_BW_AGGR mode, mac_tx_bw_mode() function is called where 65 * bandwidth limit is applied first on the outgoing packet and the packets 66 * allowed to go out would call mac_tx_aggr_mode() to send the packet on a 67 * particular TX ring. 68 */ 69 70 #include <sys/types.h> 71 #include <sys/sysmacros.h> 72 #include <sys/conf.h> 73 #include <sys/cmn_err.h> 74 #include <sys/disp.h> 75 #include <sys/list.h> 76 #include <sys/ksynch.h> 77 #include <sys/kmem.h> 78 #include <sys/stream.h> 79 #include <sys/modctl.h> 80 #include <sys/ddi.h> 81 #include <sys/sunddi.h> 82 #include <sys/atomic.h> 83 #include <sys/stat.h> 84 #include <sys/modhash.h> 85 #include <sys/id_space.h> 86 #include <sys/strsun.h> 87 #include <sys/cred.h> 88 #include <sys/dlpi.h> 89 #include <sys/zone.h> 90 #include <sys/mac_provider.h> 91 #include <sys/dls.h> 92 #include <sys/vlan.h> 93 #include <sys/aggr.h> 94 #include <sys/aggr_impl.h> 95 96 static int aggr_m_start(void *); 97 static void aggr_m_stop(void *); 98 static int aggr_m_promisc(void *, boolean_t); 99 static int aggr_m_multicst(void *, boolean_t, const uint8_t *); 100 static int aggr_m_unicst(void *, const uint8_t *); 101 static int aggr_m_stat(void *, uint_t, uint64_t *); 102 static void aggr_m_ioctl(void *, queue_t *, mblk_t *); 103 static boolean_t aggr_m_capab_get(void *, mac_capab_t, void *); 104 static int aggr_m_setprop(void *, const char *, mac_prop_id_t, uint_t, 105 const void *); 106 static void aggr_m_propinfo(void *, const char *, mac_prop_id_t, 107 mac_prop_info_handle_t); 108 109 static aggr_port_t *aggr_grp_port_lookup(aggr_grp_t *, datalink_id_t); 110 static int aggr_grp_rem_port(aggr_grp_t *, aggr_port_t *, boolean_t *, 111 boolean_t *); 112 113 static void aggr_grp_capab_set(aggr_grp_t *); 114 static boolean_t aggr_grp_capab_check(aggr_grp_t *, aggr_port_t *); 115 static uint_t aggr_grp_max_sdu(aggr_grp_t *); 116 static uint32_t aggr_grp_max_margin(aggr_grp_t *); 117 static boolean_t aggr_grp_sdu_check(aggr_grp_t *, aggr_port_t *); 118 static boolean_t aggr_grp_margin_check(aggr_grp_t *, aggr_port_t *); 119 120 static int aggr_add_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *); 121 static void aggr_rem_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *); 122 static int aggr_pseudo_disable_intr(mac_intr_handle_t); 123 static int aggr_pseudo_enable_intr(mac_intr_handle_t); 124 static int aggr_pseudo_start_ring(mac_ring_driver_t, uint64_t); 125 static void aggr_pseudo_stop_ring(mac_ring_driver_t); 126 static int aggr_addmac(void *, const uint8_t *); 127 static int aggr_remmac(void *, const uint8_t *); 128 static mblk_t *aggr_rx_poll(void *, int); 129 static void aggr_fill_ring(void *, mac_ring_type_t, const int, 130 const int, mac_ring_info_t *, mac_ring_handle_t); 131 static void aggr_fill_group(void *, mac_ring_type_t, const int, 132 mac_group_info_t *, mac_group_handle_t); 133 134 static kmem_cache_t *aggr_grp_cache; 135 static mod_hash_t *aggr_grp_hash; 136 static krwlock_t aggr_grp_lock; 137 static uint_t aggr_grp_cnt; 138 static id_space_t *key_ids; 139 140 #define GRP_HASHSZ 64 141 #define GRP_HASH_KEY(linkid) ((mod_hash_key_t)(uintptr_t)linkid) 142 #define AGGR_PORT_NAME_DELIMIT '-' 143 144 static uchar_t aggr_zero_mac[] = {0, 0, 0, 0, 0, 0}; 145 146 #define AGGR_M_CALLBACK_FLAGS \ 147 (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_PROPINFO) 148 149 static mac_callbacks_t aggr_m_callbacks = { 150 AGGR_M_CALLBACK_FLAGS, 151 aggr_m_stat, 152 aggr_m_start, 153 aggr_m_stop, 154 aggr_m_promisc, 155 aggr_m_multicst, 156 NULL, 157 NULL, 158 NULL, 159 aggr_m_ioctl, 160 aggr_m_capab_get, 161 NULL, 162 NULL, 163 aggr_m_setprop, 164 NULL, 165 aggr_m_propinfo 166 }; 167 168 /*ARGSUSED*/ 169 static int 170 aggr_grp_constructor(void *buf, void *arg, int kmflag) 171 { 172 aggr_grp_t *grp = buf; 173 174 bzero(grp, sizeof (*grp)); 175 mutex_init(&grp->lg_lacp_lock, NULL, MUTEX_DEFAULT, NULL); 176 cv_init(&grp->lg_lacp_cv, NULL, CV_DEFAULT, NULL); 177 rw_init(&grp->lg_tx_lock, NULL, RW_DRIVER, NULL); 178 mutex_init(&grp->lg_port_lock, NULL, MUTEX_DEFAULT, NULL); 179 cv_init(&grp->lg_port_cv, NULL, CV_DEFAULT, NULL); 180 mutex_init(&grp->lg_tx_flowctl_lock, NULL, MUTEX_DEFAULT, NULL); 181 cv_init(&grp->lg_tx_flowctl_cv, NULL, CV_DEFAULT, NULL); 182 grp->lg_link_state = LINK_STATE_UNKNOWN; 183 return (0); 184 } 185 186 /*ARGSUSED*/ 187 static void 188 aggr_grp_destructor(void *buf, void *arg) 189 { 190 aggr_grp_t *grp = buf; 191 192 if (grp->lg_tx_ports != NULL) { 193 kmem_free(grp->lg_tx_ports, 194 grp->lg_tx_ports_size * sizeof (aggr_port_t *)); 195 } 196 197 mutex_destroy(&grp->lg_lacp_lock); 198 cv_destroy(&grp->lg_lacp_cv); 199 mutex_destroy(&grp->lg_port_lock); 200 cv_destroy(&grp->lg_port_cv); 201 rw_destroy(&grp->lg_tx_lock); 202 mutex_destroy(&grp->lg_tx_flowctl_lock); 203 cv_destroy(&grp->lg_tx_flowctl_cv); 204 } 205 206 void 207 aggr_grp_init(void) 208 { 209 aggr_grp_cache = kmem_cache_create("aggr_grp_cache", 210 sizeof (aggr_grp_t), 0, aggr_grp_constructor, 211 aggr_grp_destructor, NULL, NULL, NULL, 0); 212 213 aggr_grp_hash = mod_hash_create_idhash("aggr_grp_hash", 214 GRP_HASHSZ, mod_hash_null_valdtor); 215 rw_init(&aggr_grp_lock, NULL, RW_DEFAULT, NULL); 216 aggr_grp_cnt = 0; 217 218 /* 219 * Allocate an id space to manage key values (when key is not 220 * specified). The range of the id space will be from 221 * (AGGR_MAX_KEY + 1) to UINT16_MAX, because the LACP protocol 222 * uses a 16-bit key. 223 */ 224 key_ids = id_space_create("aggr_key_ids", AGGR_MAX_KEY + 1, UINT16_MAX); 225 ASSERT(key_ids != NULL); 226 } 227 228 void 229 aggr_grp_fini(void) 230 { 231 id_space_destroy(key_ids); 232 rw_destroy(&aggr_grp_lock); 233 mod_hash_destroy_idhash(aggr_grp_hash); 234 kmem_cache_destroy(aggr_grp_cache); 235 } 236 237 uint_t 238 aggr_grp_count(void) 239 { 240 uint_t count; 241 242 rw_enter(&aggr_grp_lock, RW_READER); 243 count = aggr_grp_cnt; 244 rw_exit(&aggr_grp_lock); 245 return (count); 246 } 247 248 /* 249 * Since both aggr_port_notify_cb() and aggr_port_timer_thread() functions 250 * requires the mac perimeter, this function holds a reference of the aggr 251 * and aggr won't call mac_unregister() until this reference drops to 0. 252 */ 253 void 254 aggr_grp_port_hold(aggr_port_t *port) 255 { 256 aggr_grp_t *grp = port->lp_grp; 257 258 AGGR_PORT_REFHOLD(port); 259 mutex_enter(&grp->lg_port_lock); 260 grp->lg_port_ref++; 261 mutex_exit(&grp->lg_port_lock); 262 } 263 264 /* 265 * Release the reference of the grp and inform aggr_grp_delete() calling 266 * mac_unregister() is now safe. 267 */ 268 void 269 aggr_grp_port_rele(aggr_port_t *port) 270 { 271 aggr_grp_t *grp = port->lp_grp; 272 273 mutex_enter(&grp->lg_port_lock); 274 if (--grp->lg_port_ref == 0) 275 cv_signal(&grp->lg_port_cv); 276 mutex_exit(&grp->lg_port_lock); 277 AGGR_PORT_REFRELE(port); 278 } 279 280 /* 281 * Wait for the port's lacp timer thread and the port's notification callback 282 * to exit. 283 */ 284 void 285 aggr_grp_port_wait(aggr_grp_t *grp) 286 { 287 mutex_enter(&grp->lg_port_lock); 288 if (grp->lg_port_ref != 0) 289 cv_wait(&grp->lg_port_cv, &grp->lg_port_lock); 290 mutex_exit(&grp->lg_port_lock); 291 } 292 293 /* 294 * Attach a port to a link aggregation group. 295 * 296 * A port is attached to a link aggregation group once its speed 297 * and link state have been verified. 298 * 299 * Returns B_TRUE if the group link state or speed has changed. If 300 * it's the case, the caller must notify the MAC layer via a call 301 * to mac_link(). 302 */ 303 boolean_t 304 aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port) 305 { 306 boolean_t link_state_changed = B_FALSE; 307 308 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 309 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 310 311 if (port->lp_state == AGGR_PORT_STATE_ATTACHED) 312 return (B_FALSE); 313 314 /* 315 * Validate the MAC port link speed and update the group 316 * link speed if needed. 317 */ 318 if (port->lp_ifspeed == 0 || 319 port->lp_link_state != LINK_STATE_UP || 320 port->lp_link_duplex != LINK_DUPLEX_FULL) { 321 /* 322 * Can't attach a MAC port with unknown link speed, 323 * down link, or not in full duplex mode. 324 */ 325 return (B_FALSE); 326 } 327 328 if (grp->lg_ifspeed == 0) { 329 /* 330 * The group inherits the speed of the first link being 331 * attached. 332 */ 333 grp->lg_ifspeed = port->lp_ifspeed; 334 link_state_changed = B_TRUE; 335 } else if (grp->lg_ifspeed != port->lp_ifspeed) { 336 /* 337 * The link speed of the MAC port must be the same as 338 * the group link speed, as per 802.3ad. Since it is 339 * not, the attach is cancelled. 340 */ 341 return (B_FALSE); 342 } 343 344 grp->lg_nattached_ports++; 345 346 /* 347 * Update the group link state. 348 */ 349 if (grp->lg_link_state != LINK_STATE_UP) { 350 grp->lg_link_state = LINK_STATE_UP; 351 grp->lg_link_duplex = LINK_DUPLEX_FULL; 352 link_state_changed = B_TRUE; 353 } 354 355 /* 356 * Update port's state. 357 */ 358 port->lp_state = AGGR_PORT_STATE_ATTACHED; 359 360 aggr_grp_multicst_port(port, B_TRUE); 361 362 /* 363 * Set port's receive callback 364 */ 365 mac_rx_set(port->lp_mch, aggr_recv_cb, port); 366 367 /* 368 * If LACP is OFF, the port can be used to send data as soon 369 * as its link is up and verified to be compatible with the 370 * aggregation. 371 * 372 * If LACP is active or passive, notify the LACP subsystem, which 373 * will enable sending on the port following the LACP protocol. 374 */ 375 if (grp->lg_lacp_mode == AGGR_LACP_OFF) 376 aggr_send_port_enable(port); 377 else 378 aggr_lacp_port_attached(port); 379 380 return (link_state_changed); 381 } 382 383 boolean_t 384 aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port) 385 { 386 boolean_t link_state_changed = B_FALSE; 387 388 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 389 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 390 391 /* update state */ 392 if (port->lp_state != AGGR_PORT_STATE_ATTACHED) 393 return (B_FALSE); 394 395 mac_rx_clear(port->lp_mch); 396 397 aggr_grp_multicst_port(port, B_FALSE); 398 399 if (grp->lg_lacp_mode == AGGR_LACP_OFF) 400 aggr_send_port_disable(port); 401 else 402 aggr_lacp_port_detached(port); 403 404 port->lp_state = AGGR_PORT_STATE_STANDBY; 405 406 grp->lg_nattached_ports--; 407 if (grp->lg_nattached_ports == 0) { 408 /* the last attached MAC port of the group is being detached */ 409 grp->lg_ifspeed = 0; 410 grp->lg_link_state = LINK_STATE_DOWN; 411 grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN; 412 link_state_changed = B_TRUE; 413 } 414 415 return (link_state_changed); 416 } 417 418 /* 419 * Update the MAC addresses of the constituent ports of the specified 420 * group. This function is invoked: 421 * - after creating a new aggregation group. 422 * - after adding new ports to an aggregation group. 423 * - after removing a port from a group when the MAC address of 424 * that port was used for the MAC address of the group. 425 * - after the MAC address of a port changed when the MAC address 426 * of that port was used for the MAC address of the group. 427 * 428 * Return true if the link state of the aggregation changed, for example 429 * as a result of a failure changing the MAC address of one of the 430 * constituent ports. 431 */ 432 boolean_t 433 aggr_grp_update_ports_mac(aggr_grp_t *grp) 434 { 435 aggr_port_t *cport; 436 boolean_t link_state_changed = B_FALSE; 437 mac_perim_handle_t mph; 438 439 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 440 441 for (cport = grp->lg_ports; cport != NULL; 442 cport = cport->lp_next) { 443 mac_perim_enter_by_mh(cport->lp_mh, &mph); 444 if (aggr_port_unicst(cport) != 0) { 445 if (aggr_grp_detach_port(grp, cport)) 446 link_state_changed = B_TRUE; 447 } else { 448 /* 449 * If a port was detached because of a previous 450 * failure changing the MAC address, the port is 451 * reattached when it successfully changes the MAC 452 * address now, and this might cause the link state 453 * of the aggregation to change. 454 */ 455 if (aggr_grp_attach_port(grp, cport)) 456 link_state_changed = B_TRUE; 457 } 458 mac_perim_exit(mph); 459 } 460 return (link_state_changed); 461 } 462 463 /* 464 * Invoked when the MAC address of a port has changed. If the port's 465 * MAC address was used for the group MAC address, set mac_addr_changedp 466 * to B_TRUE to indicate to the caller that it should send a MAC_NOTE_UNICST 467 * notification. If the link state changes due to detach/attach of 468 * the constituent port, set link_state_changedp to B_TRUE to indicate 469 * to the caller that it should send a MAC_NOTE_LINK notification. In both 470 * cases, it is the responsibility of the caller to invoke notification 471 * functions after releasing the the port lock. 472 */ 473 void 474 aggr_grp_port_mac_changed(aggr_grp_t *grp, aggr_port_t *port, 475 boolean_t *mac_addr_changedp, boolean_t *link_state_changedp) 476 { 477 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 478 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 479 ASSERT(mac_addr_changedp != NULL); 480 ASSERT(link_state_changedp != NULL); 481 482 *mac_addr_changedp = B_FALSE; 483 *link_state_changedp = B_FALSE; 484 485 if (grp->lg_addr_fixed) { 486 /* 487 * The group is using a fixed MAC address or an automatic 488 * MAC address has not been set. 489 */ 490 return; 491 } 492 493 if (grp->lg_mac_addr_port == port) { 494 /* 495 * The MAC address of the port was assigned to the group 496 * MAC address. Update the group MAC address. 497 */ 498 bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL); 499 *mac_addr_changedp = B_TRUE; 500 } else { 501 /* 502 * Update the actual port MAC address to the MAC address 503 * of the group. 504 */ 505 if (aggr_port_unicst(port) != 0) { 506 *link_state_changedp = aggr_grp_detach_port(grp, port); 507 } else { 508 /* 509 * If a port was detached because of a previous 510 * failure changing the MAC address, the port is 511 * reattached when it successfully changes the MAC 512 * address now, and this might cause the link state 513 * of the aggregation to change. 514 */ 515 *link_state_changedp = aggr_grp_attach_port(grp, port); 516 } 517 } 518 } 519 520 /* 521 * Add a port to a link aggregation group. 522 */ 523 static int 524 aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t port_linkid, boolean_t force, 525 aggr_port_t **pp) 526 { 527 aggr_port_t *port, **cport; 528 mac_perim_handle_t mph; 529 zoneid_t port_zoneid = ALL_ZONES; 530 int err; 531 532 /* The port must be int the same zone as the aggregation. */ 533 if (zone_check_datalink(&port_zoneid, port_linkid) != 0) 534 port_zoneid = GLOBAL_ZONEID; 535 if (grp->lg_zoneid != port_zoneid) 536 return (EBUSY); 537 538 /* 539 * lg_mh could be NULL when the function is called during the creation 540 * of the aggregation. 541 */ 542 ASSERT(grp->lg_mh == NULL || MAC_PERIM_HELD(grp->lg_mh)); 543 544 /* create new port */ 545 err = aggr_port_create(grp, port_linkid, force, &port); 546 if (err != 0) 547 return (err); 548 549 mac_perim_enter_by_mh(port->lp_mh, &mph); 550 551 /* add port to list of group constituent ports */ 552 cport = &grp->lg_ports; 553 while (*cport != NULL) 554 cport = &((*cport)->lp_next); 555 *cport = port; 556 557 /* 558 * Back reference to the group it is member of. A port always 559 * holds a reference to its group to ensure that the back 560 * reference is always valid. 561 */ 562 port->lp_grp = grp; 563 AGGR_GRP_REFHOLD(grp); 564 grp->lg_nports++; 565 566 aggr_lacp_init_port(port); 567 mac_perim_exit(mph); 568 569 if (pp != NULL) 570 *pp = port; 571 572 return (0); 573 } 574 575 /* 576 * This is called in response to either our LACP state machine or a MAC 577 * notification that the link has gone down via aggr_send_port_disable(). At 578 * this point, we may need to update our default ring. To that end, we go 579 * through the set of ports (underlying datalinks in an aggregation) that are 580 * currently enabled to transmit data. If all our links have been disabled for 581 * transmit, then we don't do anything. 582 * 583 * Note, because we only have a single TX group, we don't have to worry about 584 * the rings moving between groups and the chance that mac will reassign it 585 * unless someone removes a port, at which point, we play it safe and call this 586 * again. 587 */ 588 void 589 aggr_grp_update_default(aggr_grp_t *grp) 590 { 591 aggr_port_t *port; 592 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 593 594 rw_enter(&grp->lg_tx_lock, RW_WRITER); 595 596 if (grp->lg_ntx_ports == 0) { 597 rw_exit(&grp->lg_tx_lock); 598 return; 599 } 600 601 port = grp->lg_tx_ports[0]; 602 ASSERT(port->lp_tx_ring_cnt > 0); 603 mac_hwring_set_default(grp->lg_mh, port->lp_pseudo_tx_rings[0]); 604 rw_exit(&grp->lg_tx_lock); 605 } 606 607 /* 608 * Add a pseudo RX ring for the given HW ring handle. 609 */ 610 static int 611 aggr_add_pseudo_rx_ring(aggr_port_t *port, 612 aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh) 613 { 614 aggr_pseudo_rx_ring_t *ring; 615 int err; 616 int j; 617 618 for (j = 0; j < MAX_RINGS_PER_GROUP; j++) { 619 ring = rx_grp->arg_rings + j; 620 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE)) 621 break; 622 } 623 624 /* 625 * No slot for this new RX ring. 626 */ 627 if (j == MAX_RINGS_PER_GROUP) 628 return (EIO); 629 630 ring->arr_flags |= MAC_PSEUDO_RING_INUSE; 631 ring->arr_hw_rh = hw_rh; 632 ring->arr_port = port; 633 rx_grp->arg_ring_cnt++; 634 635 /* 636 * The group is already registered, dynamically add a new ring to the 637 * mac group. 638 */ 639 if ((err = mac_group_add_ring(rx_grp->arg_gh, j)) != 0) { 640 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE; 641 ring->arr_hw_rh = NULL; 642 ring->arr_port = NULL; 643 rx_grp->arg_ring_cnt--; 644 } else { 645 mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring, 646 mac_find_ring(rx_grp->arg_gh, j)); 647 } 648 return (err); 649 } 650 651 /* 652 * Remove the pseudo RX ring of the given HW ring handle. 653 */ 654 static void 655 aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh) 656 { 657 aggr_pseudo_rx_ring_t *ring; 658 int j; 659 660 for (j = 0; j < MAX_RINGS_PER_GROUP; j++) { 661 ring = rx_grp->arg_rings + j; 662 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE) || 663 ring->arr_hw_rh != hw_rh) { 664 continue; 665 } 666 667 mac_group_rem_ring(rx_grp->arg_gh, ring->arr_rh); 668 669 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE; 670 ring->arr_hw_rh = NULL; 671 ring->arr_port = NULL; 672 rx_grp->arg_ring_cnt--; 673 mac_hwring_teardown(hw_rh); 674 break; 675 } 676 } 677 678 /* 679 * This function is called to create pseudo rings over the hardware rings of 680 * the underlying device. Note that there is a 1:1 mapping between the pseudo 681 * RX rings of the aggr and the hardware rings of the underlying port. 682 */ 683 static int 684 aggr_add_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp) 685 { 686 aggr_grp_t *grp = port->lp_grp; 687 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP]; 688 aggr_unicst_addr_t *addr, *a; 689 mac_perim_handle_t pmph; 690 int hw_rh_cnt, i = 0, j; 691 int err = 0; 692 693 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 694 mac_perim_enter_by_mh(port->lp_mh, &pmph); 695 696 /* 697 * This function must be called after the aggr registers its mac 698 * and its RX group has been initialized. 699 */ 700 ASSERT(rx_grp->arg_gh != NULL); 701 702 /* 703 * Get the list the the underlying HW rings. 704 */ 705 hw_rh_cnt = mac_hwrings_get(port->lp_mch, 706 &port->lp_hwgh, hw_rh, MAC_RING_TYPE_RX); 707 708 if (port->lp_hwgh != NULL) { 709 /* 710 * Quiesce the HW ring and the mac srs on the ring. Note 711 * that the HW ring will be restarted when the pseudo ring 712 * is started. At that time all the packets will be 713 * directly passed up to the pseudo RX ring and handled 714 * by mac srs created over the pseudo RX ring. 715 */ 716 mac_rx_client_quiesce(port->lp_mch); 717 mac_srs_perm_quiesce(port->lp_mch, B_TRUE); 718 } 719 720 /* 721 * Add all the unicast addresses to the newly added port. 722 */ 723 for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) { 724 if ((err = aggr_port_addmac(port, addr->aua_addr)) != 0) 725 break; 726 } 727 728 for (i = 0; err == 0 && i < hw_rh_cnt; i++) 729 err = aggr_add_pseudo_rx_ring(port, rx_grp, hw_rh[i]); 730 731 if (err != 0) { 732 for (j = 0; j < i; j++) 733 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]); 734 735 for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next) 736 aggr_port_remmac(port, a->aua_addr); 737 738 if (port->lp_hwgh != NULL) { 739 mac_srs_perm_quiesce(port->lp_mch, B_FALSE); 740 mac_rx_client_restart(port->lp_mch); 741 port->lp_hwgh = NULL; 742 } 743 } else { 744 port->lp_rx_grp_added = B_TRUE; 745 } 746 done: 747 mac_perim_exit(pmph); 748 return (err); 749 } 750 751 /* 752 * This function is called by aggr to remove pseudo RX rings over the 753 * HW rings of the underlying port. 754 */ 755 static void 756 aggr_rem_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp) 757 { 758 aggr_grp_t *grp = port->lp_grp; 759 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP]; 760 aggr_unicst_addr_t *addr; 761 mac_group_handle_t hwgh; 762 mac_perim_handle_t pmph; 763 int hw_rh_cnt, i; 764 765 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 766 mac_perim_enter_by_mh(port->lp_mh, &pmph); 767 768 if (!port->lp_rx_grp_added) 769 goto done; 770 771 ASSERT(rx_grp->arg_gh != NULL); 772 hw_rh_cnt = mac_hwrings_get(port->lp_mch, 773 &hwgh, hw_rh, MAC_RING_TYPE_RX); 774 775 /* 776 * If hw_rh_cnt is 0, it means that the underlying port does not 777 * support RX rings. Directly return in this case. 778 */ 779 for (i = 0; i < hw_rh_cnt; i++) 780 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[i]); 781 782 for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) 783 aggr_port_remmac(port, addr->aua_addr); 784 785 if (port->lp_hwgh != NULL) { 786 port->lp_hwgh = NULL; 787 788 /* 789 * First clear the permanent-quiesced flag of the RX srs then 790 * restart the HW ring and the mac srs on the ring. Note that 791 * the HW ring and associated SRS will soon been removed when 792 * the port is removed from the aggr. 793 */ 794 mac_srs_perm_quiesce(port->lp_mch, B_FALSE); 795 mac_rx_client_restart(port->lp_mch); 796 } 797 798 port->lp_rx_grp_added = B_FALSE; 799 done: 800 mac_perim_exit(pmph); 801 } 802 803 /* 804 * Add a pseudo TX ring for the given HW ring handle. 805 */ 806 static int 807 aggr_add_pseudo_tx_ring(aggr_port_t *port, 808 aggr_pseudo_tx_group_t *tx_grp, mac_ring_handle_t hw_rh, 809 mac_ring_handle_t *pseudo_rh) 810 { 811 aggr_pseudo_tx_ring_t *ring; 812 int err; 813 int i; 814 815 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 816 for (i = 0; i < MAX_RINGS_PER_GROUP; i++) { 817 ring = tx_grp->atg_rings + i; 818 if (!(ring->atr_flags & MAC_PSEUDO_RING_INUSE)) 819 break; 820 } 821 /* 822 * No slot for this new TX ring. 823 */ 824 if (i == MAX_RINGS_PER_GROUP) 825 return (EIO); 826 /* 827 * The following 4 statements needs to be done before 828 * calling mac_group_add_ring(). Otherwise it will 829 * result in an assertion failure in mac_init_ring(). 830 */ 831 ring->atr_flags |= MAC_PSEUDO_RING_INUSE; 832 ring->atr_hw_rh = hw_rh; 833 ring->atr_port = port; 834 tx_grp->atg_ring_cnt++; 835 836 /* 837 * The TX side has no concept of ring groups unlike RX groups. 838 * There is just a single group which stores all the TX rings. 839 * This group will be used to store aggr's pseudo TX rings. 840 */ 841 if ((err = mac_group_add_ring(tx_grp->atg_gh, i)) != 0) { 842 ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE; 843 ring->atr_hw_rh = NULL; 844 ring->atr_port = NULL; 845 tx_grp->atg_ring_cnt--; 846 } else { 847 *pseudo_rh = mac_find_ring(tx_grp->atg_gh, i); 848 if (hw_rh != NULL) { 849 mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring, 850 mac_find_ring(tx_grp->atg_gh, i)); 851 } 852 } 853 854 return (err); 855 } 856 857 /* 858 * Remove the pseudo TX ring of the given HW ring handle. 859 */ 860 static void 861 aggr_rem_pseudo_tx_ring(aggr_pseudo_tx_group_t *tx_grp, 862 mac_ring_handle_t pseudo_hw_rh) 863 { 864 aggr_pseudo_tx_ring_t *ring; 865 int i; 866 867 for (i = 0; i < MAX_RINGS_PER_GROUP; i++) { 868 ring = tx_grp->atg_rings + i; 869 if (ring->atr_rh != pseudo_hw_rh) 870 continue; 871 872 ASSERT(ring->atr_flags & MAC_PSEUDO_RING_INUSE); 873 mac_group_rem_ring(tx_grp->atg_gh, pseudo_hw_rh); 874 ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE; 875 mac_hwring_teardown(ring->atr_hw_rh); 876 ring->atr_hw_rh = NULL; 877 ring->atr_port = NULL; 878 tx_grp->atg_ring_cnt--; 879 break; 880 } 881 } 882 883 /* 884 * This function is called to create pseudo rings over hardware rings of 885 * the underlying device. There is a 1:1 mapping between the pseudo TX 886 * rings of the aggr and the hardware rings of the underlying port. 887 */ 888 static int 889 aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp) 890 { 891 aggr_grp_t *grp = port->lp_grp; 892 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP], pseudo_rh; 893 mac_perim_handle_t pmph; 894 int hw_rh_cnt, i = 0, j; 895 int err = 0; 896 897 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 898 mac_perim_enter_by_mh(port->lp_mh, &pmph); 899 900 /* 901 * Get the list the the underlying HW rings. 902 */ 903 hw_rh_cnt = mac_hwrings_get(port->lp_mch, 904 NULL, hw_rh, MAC_RING_TYPE_TX); 905 906 /* 907 * Even if the underlying NIC does not have TX rings, we 908 * still make a psuedo TX ring for that NIC with NULL as 909 * the ring handle. 910 */ 911 if (hw_rh_cnt == 0) 912 port->lp_tx_ring_cnt = 1; 913 else 914 port->lp_tx_ring_cnt = hw_rh_cnt; 915 916 port->lp_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) * 917 port->lp_tx_ring_cnt), KM_SLEEP); 918 port->lp_pseudo_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) * 919 port->lp_tx_ring_cnt), KM_SLEEP); 920 921 if (hw_rh_cnt == 0) { 922 if ((err = aggr_add_pseudo_tx_ring(port, tx_grp, 923 NULL, &pseudo_rh)) == 0) { 924 port->lp_tx_rings[0] = NULL; 925 port->lp_pseudo_tx_rings[0] = pseudo_rh; 926 } 927 } else { 928 for (i = 0; err == 0 && i < hw_rh_cnt; i++) { 929 err = aggr_add_pseudo_tx_ring(port, 930 tx_grp, hw_rh[i], &pseudo_rh); 931 if (err != 0) 932 break; 933 port->lp_tx_rings[i] = hw_rh[i]; 934 port->lp_pseudo_tx_rings[i] = pseudo_rh; 935 } 936 } 937 938 if (err != 0) { 939 if (hw_rh_cnt != 0) { 940 for (j = 0; j < i; j++) { 941 aggr_rem_pseudo_tx_ring(tx_grp, 942 port->lp_pseudo_tx_rings[j]); 943 } 944 } 945 kmem_free(port->lp_tx_rings, 946 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt)); 947 kmem_free(port->lp_pseudo_tx_rings, 948 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt)); 949 port->lp_tx_ring_cnt = 0; 950 } else { 951 port->lp_tx_grp_added = B_TRUE; 952 port->lp_tx_notify_mh = mac_client_tx_notify(port->lp_mch, 953 aggr_tx_ring_update, port); 954 } 955 mac_perim_exit(pmph); 956 aggr_grp_update_default(grp); 957 return (err); 958 } 959 960 /* 961 * This function is called by aggr to remove pseudo TX rings over the 962 * HW rings of the underlying port. 963 */ 964 static void 965 aggr_rem_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp) 966 { 967 aggr_grp_t *grp = port->lp_grp; 968 mac_perim_handle_t pmph; 969 int i; 970 971 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 972 mac_perim_enter_by_mh(port->lp_mh, &pmph); 973 974 if (!port->lp_tx_grp_added) 975 goto done; 976 977 ASSERT(tx_grp->atg_gh != NULL); 978 979 for (i = 0; i < port->lp_tx_ring_cnt; i++) 980 aggr_rem_pseudo_tx_ring(tx_grp, port->lp_pseudo_tx_rings[i]); 981 982 kmem_free(port->lp_tx_rings, 983 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt)); 984 kmem_free(port->lp_pseudo_tx_rings, 985 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt)); 986 987 port->lp_tx_ring_cnt = 0; 988 (void) mac_client_tx_notify(port->lp_mch, NULL, port->lp_tx_notify_mh); 989 port->lp_tx_grp_added = B_FALSE; 990 aggr_grp_update_default(grp); 991 done: 992 mac_perim_exit(pmph); 993 } 994 995 static int 996 aggr_pseudo_disable_intr(mac_intr_handle_t ih) 997 { 998 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih; 999 return (mac_hwring_disable_intr(rr_ring->arr_hw_rh)); 1000 } 1001 1002 static int 1003 aggr_pseudo_enable_intr(mac_intr_handle_t ih) 1004 { 1005 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih; 1006 return (mac_hwring_enable_intr(rr_ring->arr_hw_rh)); 1007 } 1008 1009 static int 1010 aggr_pseudo_start_ring(mac_ring_driver_t arg, uint64_t mr_gen) 1011 { 1012 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg; 1013 int err; 1014 1015 err = mac_hwring_start(rr_ring->arr_hw_rh); 1016 if (err == 0) 1017 rr_ring->arr_gen = mr_gen; 1018 return (err); 1019 } 1020 1021 static void 1022 aggr_pseudo_stop_ring(mac_ring_driver_t arg) 1023 { 1024 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg; 1025 mac_hwring_stop(rr_ring->arr_hw_rh); 1026 } 1027 1028 /* 1029 * Add one or more ports to an existing link aggregation group. 1030 */ 1031 int 1032 aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force, 1033 laioc_port_t *ports) 1034 { 1035 int rc, i, nadded = 0; 1036 aggr_grp_t *grp = NULL; 1037 aggr_port_t *port; 1038 boolean_t link_state_changed = B_FALSE; 1039 mac_perim_handle_t mph, pmph; 1040 1041 /* get group corresponding to linkid */ 1042 rw_enter(&aggr_grp_lock, RW_READER); 1043 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1044 (mod_hash_val_t *)&grp) != 0) { 1045 rw_exit(&aggr_grp_lock); 1046 return (ENOENT); 1047 } 1048 AGGR_GRP_REFHOLD(grp); 1049 1050 /* 1051 * Hold the perimeter so that the aggregation won't be destroyed. 1052 */ 1053 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1054 rw_exit(&aggr_grp_lock); 1055 1056 /* add the specified ports to group */ 1057 for (i = 0; i < nports; i++) { 1058 /* add port to group */ 1059 if ((rc = aggr_grp_add_port(grp, ports[i].lp_linkid, 1060 force, &port)) != 0) { 1061 goto bail; 1062 } 1063 ASSERT(port != NULL); 1064 nadded++; 1065 1066 /* check capabilities */ 1067 if (!aggr_grp_capab_check(grp, port) || 1068 !aggr_grp_sdu_check(grp, port) || 1069 !aggr_grp_margin_check(grp, port)) { 1070 rc = ENOTSUP; 1071 goto bail; 1072 } 1073 1074 /* 1075 * Create the pseudo ring for each HW ring of the underlying 1076 * port. 1077 */ 1078 rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group); 1079 if (rc != 0) 1080 goto bail; 1081 rc = aggr_add_pseudo_rx_group(port, &grp->lg_rx_group); 1082 if (rc != 0) 1083 goto bail; 1084 1085 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1086 1087 /* set LACP mode */ 1088 aggr_port_lacp_set_mode(grp, port); 1089 1090 /* start port if group has already been started */ 1091 if (grp->lg_started) { 1092 rc = aggr_port_start(port); 1093 if (rc != 0) { 1094 mac_perim_exit(pmph); 1095 goto bail; 1096 } 1097 1098 /* 1099 * Turn on the promiscuous mode over the port when it 1100 * is requested to be turned on to receive the 1101 * non-primary address over a port, or the promiscous 1102 * mode is enabled over the aggr. 1103 */ 1104 if (grp->lg_promisc || port->lp_prom_addr != NULL) { 1105 rc = aggr_port_promisc(port, B_TRUE); 1106 if (rc != 0) { 1107 mac_perim_exit(pmph); 1108 goto bail; 1109 } 1110 } 1111 } 1112 mac_perim_exit(pmph); 1113 1114 /* 1115 * Attach each port if necessary. 1116 */ 1117 if (aggr_port_notify_link(grp, port)) 1118 link_state_changed = B_TRUE; 1119 1120 /* 1121 * Initialize the callback functions for this port. 1122 */ 1123 aggr_port_init_callbacks(port); 1124 } 1125 1126 /* update the MAC address of the constituent ports */ 1127 if (aggr_grp_update_ports_mac(grp)) 1128 link_state_changed = B_TRUE; 1129 1130 if (link_state_changed) 1131 mac_link_update(grp->lg_mh, grp->lg_link_state); 1132 1133 bail: 1134 if (rc != 0) { 1135 /* stop and remove ports that have been added */ 1136 for (i = 0; i < nadded; i++) { 1137 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid); 1138 ASSERT(port != NULL); 1139 if (grp->lg_started) { 1140 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1141 (void) aggr_port_promisc(port, B_FALSE); 1142 aggr_port_stop(port); 1143 mac_perim_exit(pmph); 1144 } 1145 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group); 1146 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group); 1147 (void) aggr_grp_rem_port(grp, port, NULL, NULL); 1148 } 1149 } 1150 1151 mac_perim_exit(mph); 1152 AGGR_GRP_REFRELE(grp); 1153 return (rc); 1154 } 1155 1156 static int 1157 aggr_grp_modify_common(aggr_grp_t *grp, uint8_t update_mask, uint32_t policy, 1158 boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, 1159 aggr_lacp_timer_t lacp_timer) 1160 { 1161 boolean_t mac_addr_changed = B_FALSE; 1162 boolean_t link_state_changed = B_FALSE; 1163 mac_perim_handle_t pmph; 1164 1165 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 1166 1167 /* validate fixed address if specified */ 1168 if ((update_mask & AGGR_MODIFY_MAC) && mac_fixed && 1169 ((bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) || 1170 (mac_addr[0] & 0x01))) { 1171 return (EINVAL); 1172 } 1173 1174 /* update policy if requested */ 1175 if (update_mask & AGGR_MODIFY_POLICY) 1176 aggr_send_update_policy(grp, policy); 1177 1178 /* update unicast MAC address if requested */ 1179 if (update_mask & AGGR_MODIFY_MAC) { 1180 if (mac_fixed) { 1181 /* user-supplied MAC address */ 1182 grp->lg_mac_addr_port = NULL; 1183 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) != 0) { 1184 bcopy(mac_addr, grp->lg_addr, ETHERADDRL); 1185 mac_addr_changed = B_TRUE; 1186 } 1187 } else if (grp->lg_addr_fixed) { 1188 /* switch from user-supplied to automatic */ 1189 aggr_port_t *port = grp->lg_ports; 1190 1191 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1192 bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL); 1193 grp->lg_mac_addr_port = port; 1194 mac_addr_changed = B_TRUE; 1195 mac_perim_exit(pmph); 1196 } 1197 grp->lg_addr_fixed = mac_fixed; 1198 } 1199 1200 if (mac_addr_changed) 1201 link_state_changed = aggr_grp_update_ports_mac(grp); 1202 1203 if (update_mask & AGGR_MODIFY_LACP_MODE) 1204 aggr_lacp_update_mode(grp, lacp_mode); 1205 1206 if (update_mask & AGGR_MODIFY_LACP_TIMER) 1207 aggr_lacp_update_timer(grp, lacp_timer); 1208 1209 if (link_state_changed) 1210 mac_link_update(grp->lg_mh, grp->lg_link_state); 1211 1212 if (mac_addr_changed) 1213 mac_unicst_update(grp->lg_mh, grp->lg_addr); 1214 1215 return (0); 1216 } 1217 1218 /* 1219 * Update properties of an existing link aggregation group. 1220 */ 1221 int 1222 aggr_grp_modify(datalink_id_t linkid, uint8_t update_mask, uint32_t policy, 1223 boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, 1224 aggr_lacp_timer_t lacp_timer) 1225 { 1226 aggr_grp_t *grp = NULL; 1227 mac_perim_handle_t mph; 1228 int err; 1229 1230 /* get group corresponding to linkid */ 1231 rw_enter(&aggr_grp_lock, RW_READER); 1232 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1233 (mod_hash_val_t *)&grp) != 0) { 1234 rw_exit(&aggr_grp_lock); 1235 return (ENOENT); 1236 } 1237 AGGR_GRP_REFHOLD(grp); 1238 1239 /* 1240 * Hold the perimeter so that the aggregation won't be destroyed. 1241 */ 1242 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1243 rw_exit(&aggr_grp_lock); 1244 1245 err = aggr_grp_modify_common(grp, update_mask, policy, mac_fixed, 1246 mac_addr, lacp_mode, lacp_timer); 1247 1248 mac_perim_exit(mph); 1249 AGGR_GRP_REFRELE(grp); 1250 return (err); 1251 } 1252 1253 /* 1254 * Create a new link aggregation group upon request from administrator. 1255 * Returns 0 on success, an errno on failure. 1256 */ 1257 int 1258 aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports, 1259 laioc_port_t *ports, uint32_t policy, boolean_t mac_fixed, boolean_t force, 1260 uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, aggr_lacp_timer_t lacp_timer, 1261 cred_t *credp) 1262 { 1263 aggr_grp_t *grp = NULL; 1264 aggr_port_t *port; 1265 mac_register_t *mac; 1266 boolean_t link_state_changed; 1267 mac_perim_handle_t mph; 1268 int err; 1269 int i; 1270 kt_did_t tid = 0; 1271 1272 /* need at least one port */ 1273 if (nports == 0) 1274 return (EINVAL); 1275 1276 rw_enter(&aggr_grp_lock, RW_WRITER); 1277 1278 /* does a group with the same linkid already exist? */ 1279 err = mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1280 (mod_hash_val_t *)&grp); 1281 if (err == 0) { 1282 rw_exit(&aggr_grp_lock); 1283 return (EEXIST); 1284 } 1285 1286 grp = kmem_cache_alloc(aggr_grp_cache, KM_SLEEP); 1287 1288 grp->lg_refs = 1; 1289 grp->lg_closing = B_FALSE; 1290 grp->lg_force = force; 1291 grp->lg_linkid = linkid; 1292 grp->lg_zoneid = crgetzoneid(credp); 1293 grp->lg_ifspeed = 0; 1294 grp->lg_link_state = LINK_STATE_UNKNOWN; 1295 grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN; 1296 grp->lg_started = B_FALSE; 1297 grp->lg_promisc = B_FALSE; 1298 grp->lg_lacp_done = B_FALSE; 1299 grp->lg_tx_notify_done = B_FALSE; 1300 grp->lg_lacp_head = grp->lg_lacp_tail = NULL; 1301 grp->lg_lacp_rx_thread = thread_create(NULL, 0, 1302 aggr_lacp_rx_thread, grp, 0, &p0, TS_RUN, minclsyspri); 1303 grp->lg_tx_notify_thread = thread_create(NULL, 0, 1304 aggr_tx_notify_thread, grp, 0, &p0, TS_RUN, minclsyspri); 1305 grp->lg_tx_blocked_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) * 1306 MAX_RINGS_PER_GROUP), KM_SLEEP); 1307 grp->lg_tx_blocked_cnt = 0; 1308 bzero(&grp->lg_rx_group, sizeof (aggr_pseudo_rx_group_t)); 1309 bzero(&grp->lg_tx_group, sizeof (aggr_pseudo_tx_group_t)); 1310 aggr_lacp_init_grp(grp); 1311 1312 /* add MAC ports to group */ 1313 grp->lg_ports = NULL; 1314 grp->lg_nports = 0; 1315 grp->lg_nattached_ports = 0; 1316 grp->lg_ntx_ports = 0; 1317 1318 /* 1319 * If key is not specified by the user, allocate the key. 1320 */ 1321 if ((key == 0) && ((key = (uint32_t)id_alloc(key_ids)) == 0)) { 1322 err = ENOMEM; 1323 goto bail; 1324 } 1325 grp->lg_key = key; 1326 1327 for (i = 0; i < nports; i++) { 1328 err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, NULL); 1329 if (err != 0) 1330 goto bail; 1331 } 1332 1333 /* 1334 * If no explicit MAC address was specified by the administrator, 1335 * set it to the MAC address of the first port. 1336 */ 1337 grp->lg_addr_fixed = mac_fixed; 1338 if (grp->lg_addr_fixed) { 1339 /* validate specified address */ 1340 if (bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) { 1341 err = EINVAL; 1342 goto bail; 1343 } 1344 bcopy(mac_addr, grp->lg_addr, ETHERADDRL); 1345 } else { 1346 bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL); 1347 grp->lg_mac_addr_port = grp->lg_ports; 1348 } 1349 1350 /* set the initial group capabilities */ 1351 aggr_grp_capab_set(grp); 1352 1353 if ((mac = mac_alloc(MAC_VERSION)) == NULL) { 1354 err = ENOMEM; 1355 goto bail; 1356 } 1357 mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER; 1358 mac->m_driver = grp; 1359 mac->m_dip = aggr_dip; 1360 mac->m_instance = grp->lg_key > AGGR_MAX_KEY ? (uint_t)-1 : grp->lg_key; 1361 mac->m_src_addr = grp->lg_addr; 1362 mac->m_callbacks = &aggr_m_callbacks; 1363 mac->m_min_sdu = 0; 1364 mac->m_max_sdu = grp->lg_max_sdu = aggr_grp_max_sdu(grp); 1365 mac->m_margin = aggr_grp_max_margin(grp); 1366 mac->m_v12n = MAC_VIRT_LEVEL1; 1367 err = mac_register(mac, &grp->lg_mh); 1368 mac_free(mac); 1369 if (err != 0) 1370 goto bail; 1371 1372 err = dls_devnet_create(grp->lg_mh, grp->lg_linkid, crgetzoneid(credp)); 1373 if (err != 0) { 1374 (void) mac_unregister(grp->lg_mh); 1375 grp->lg_mh = NULL; 1376 goto bail; 1377 } 1378 1379 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1380 1381 /* 1382 * Update the MAC address of the constituent ports. 1383 * None of the port is attached at this time, the link state of the 1384 * aggregation will not change. 1385 */ 1386 link_state_changed = aggr_grp_update_ports_mac(grp); 1387 ASSERT(!link_state_changed); 1388 1389 /* update outbound load balancing policy */ 1390 aggr_send_update_policy(grp, policy); 1391 1392 /* set LACP mode */ 1393 aggr_lacp_set_mode(grp, lacp_mode, lacp_timer); 1394 1395 /* 1396 * Attach each port if necessary. 1397 */ 1398 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 1399 /* 1400 * Create the pseudo ring for each HW ring of the underlying 1401 * port. Note that this is done after the aggr registers the 1402 * mac. 1403 */ 1404 VERIFY(aggr_add_pseudo_tx_group(port, &grp->lg_tx_group) == 0); 1405 VERIFY(aggr_add_pseudo_rx_group(port, &grp->lg_rx_group) == 0); 1406 if (aggr_port_notify_link(grp, port)) 1407 link_state_changed = B_TRUE; 1408 1409 /* 1410 * Initialize the callback functions for this port. 1411 */ 1412 aggr_port_init_callbacks(port); 1413 } 1414 1415 if (link_state_changed) 1416 mac_link_update(grp->lg_mh, grp->lg_link_state); 1417 1418 /* add new group to hash table */ 1419 err = mod_hash_insert(aggr_grp_hash, GRP_HASH_KEY(linkid), 1420 (mod_hash_val_t)grp); 1421 ASSERT(err == 0); 1422 aggr_grp_cnt++; 1423 1424 mac_perim_exit(mph); 1425 rw_exit(&aggr_grp_lock); 1426 return (0); 1427 1428 bail: 1429 1430 grp->lg_closing = B_TRUE; 1431 1432 port = grp->lg_ports; 1433 while (port != NULL) { 1434 aggr_port_t *cport; 1435 1436 cport = port->lp_next; 1437 aggr_port_delete(port); 1438 port = cport; 1439 } 1440 1441 /* 1442 * Inform the lacp_rx thread to exit. 1443 */ 1444 mutex_enter(&grp->lg_lacp_lock); 1445 grp->lg_lacp_done = B_TRUE; 1446 cv_signal(&grp->lg_lacp_cv); 1447 while (grp->lg_lacp_rx_thread != NULL) 1448 cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock); 1449 mutex_exit(&grp->lg_lacp_lock); 1450 /* 1451 * Inform the tx_notify thread to exit. 1452 */ 1453 mutex_enter(&grp->lg_tx_flowctl_lock); 1454 if (grp->lg_tx_notify_thread != NULL) { 1455 tid = grp->lg_tx_notify_thread->t_did; 1456 grp->lg_tx_notify_done = B_TRUE; 1457 cv_signal(&grp->lg_tx_flowctl_cv); 1458 } 1459 mutex_exit(&grp->lg_tx_flowctl_lock); 1460 if (tid != 0) 1461 thread_join(tid); 1462 1463 kmem_free(grp->lg_tx_blocked_rings, 1464 (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP)); 1465 rw_exit(&aggr_grp_lock); 1466 AGGR_GRP_REFRELE(grp); 1467 return (err); 1468 } 1469 1470 /* 1471 * Return a pointer to the member of a group with specified linkid. 1472 */ 1473 static aggr_port_t * 1474 aggr_grp_port_lookup(aggr_grp_t *grp, datalink_id_t linkid) 1475 { 1476 aggr_port_t *port; 1477 1478 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 1479 1480 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 1481 if (port->lp_linkid == linkid) 1482 break; 1483 } 1484 1485 return (port); 1486 } 1487 1488 /* 1489 * Stop, detach and remove a port from a link aggregation group. 1490 */ 1491 static int 1492 aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port, 1493 boolean_t *mac_addr_changedp, boolean_t *link_state_changedp) 1494 { 1495 int rc = 0; 1496 aggr_port_t **pport; 1497 boolean_t mac_addr_changed = B_FALSE; 1498 boolean_t link_state_changed = B_FALSE; 1499 mac_perim_handle_t mph; 1500 uint64_t val; 1501 uint_t i; 1502 uint_t stat; 1503 1504 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 1505 ASSERT(grp->lg_nports > 1); 1506 ASSERT(!grp->lg_closing); 1507 1508 /* unlink port */ 1509 for (pport = &grp->lg_ports; *pport != port; 1510 pport = &(*pport)->lp_next) { 1511 if (*pport == NULL) { 1512 rc = ENOENT; 1513 goto done; 1514 } 1515 } 1516 *pport = port->lp_next; 1517 1518 mac_perim_enter_by_mh(port->lp_mh, &mph); 1519 1520 /* 1521 * If the MAC address of the port being removed was assigned 1522 * to the group, update the group MAC address 1523 * using the MAC address of a different port. 1524 */ 1525 if (!grp->lg_addr_fixed && grp->lg_mac_addr_port == port) { 1526 /* 1527 * Set the MAC address of the group to the 1528 * MAC address of its first port. 1529 */ 1530 bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL); 1531 grp->lg_mac_addr_port = grp->lg_ports; 1532 mac_addr_changed = B_TRUE; 1533 } 1534 1535 link_state_changed = aggr_grp_detach_port(grp, port); 1536 1537 /* 1538 * Add the counter statistics of the ports while it was aggregated 1539 * to the group's residual statistics. This is done by obtaining 1540 * the current counter from the underlying MAC then subtracting the 1541 * value of the counter at the moment it was added to the 1542 * aggregation. 1543 */ 1544 for (i = 0; i < MAC_NSTAT; i++) { 1545 stat = i + MAC_STAT_MIN; 1546 if (!MAC_STAT_ISACOUNTER(stat)) 1547 continue; 1548 val = aggr_port_stat(port, stat); 1549 val -= port->lp_stat[i]; 1550 grp->lg_stat[i] += val; 1551 } 1552 for (i = 0; i < ETHER_NSTAT; i++) { 1553 stat = i + MACTYPE_STAT_MIN; 1554 if (!ETHER_STAT_ISACOUNTER(stat)) 1555 continue; 1556 val = aggr_port_stat(port, stat); 1557 val -= port->lp_ether_stat[i]; 1558 grp->lg_ether_stat[i] += val; 1559 } 1560 1561 grp->lg_nports--; 1562 mac_perim_exit(mph); 1563 1564 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group); 1565 aggr_port_delete(port); 1566 1567 /* 1568 * If the group MAC address has changed, update the MAC address of 1569 * the remaining constituent ports according to the new MAC 1570 * address of the group. 1571 */ 1572 if (mac_addr_changed && aggr_grp_update_ports_mac(grp)) 1573 link_state_changed = B_TRUE; 1574 1575 done: 1576 if (mac_addr_changedp != NULL) 1577 *mac_addr_changedp = mac_addr_changed; 1578 if (link_state_changedp != NULL) 1579 *link_state_changedp = link_state_changed; 1580 1581 return (rc); 1582 } 1583 1584 /* 1585 * Remove one or more ports from an existing link aggregation group. 1586 */ 1587 int 1588 aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports) 1589 { 1590 int rc = 0, i; 1591 aggr_grp_t *grp = NULL; 1592 aggr_port_t *port; 1593 boolean_t mac_addr_update = B_FALSE, mac_addr_changed; 1594 boolean_t link_state_update = B_FALSE, link_state_changed; 1595 mac_perim_handle_t mph, pmph; 1596 1597 /* get group corresponding to linkid */ 1598 rw_enter(&aggr_grp_lock, RW_READER); 1599 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1600 (mod_hash_val_t *)&grp) != 0) { 1601 rw_exit(&aggr_grp_lock); 1602 return (ENOENT); 1603 } 1604 AGGR_GRP_REFHOLD(grp); 1605 1606 /* 1607 * Hold the perimeter so that the aggregation won't be destroyed. 1608 */ 1609 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1610 rw_exit(&aggr_grp_lock); 1611 1612 /* we need to keep at least one port per group */ 1613 if (nports >= grp->lg_nports) { 1614 rc = EINVAL; 1615 goto bail; 1616 } 1617 1618 /* first verify that all the groups are valid */ 1619 for (i = 0; i < nports; i++) { 1620 if (aggr_grp_port_lookup(grp, ports[i].lp_linkid) == NULL) { 1621 /* port not found */ 1622 rc = ENOENT; 1623 goto bail; 1624 } 1625 } 1626 1627 /* clear the promiscous mode for the specified ports */ 1628 for (i = 0; i < nports && rc == 0; i++) { 1629 /* lookup port */ 1630 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid); 1631 ASSERT(port != NULL); 1632 1633 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1634 rc = aggr_port_promisc(port, B_FALSE); 1635 mac_perim_exit(pmph); 1636 } 1637 if (rc != 0) { 1638 for (i = 0; i < nports; i++) { 1639 port = aggr_grp_port_lookup(grp, 1640 ports[i].lp_linkid); 1641 ASSERT(port != NULL); 1642 1643 /* 1644 * Turn the promiscuous mode back on if it is required 1645 * to receive the non-primary address over a port, or 1646 * the promiscous mode is enabled over the aggr. 1647 */ 1648 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1649 if (port->lp_started && (grp->lg_promisc || 1650 port->lp_prom_addr != NULL)) { 1651 (void) aggr_port_promisc(port, B_TRUE); 1652 } 1653 mac_perim_exit(pmph); 1654 } 1655 goto bail; 1656 } 1657 1658 /* remove the specified ports from group */ 1659 for (i = 0; i < nports; i++) { 1660 /* lookup port */ 1661 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid); 1662 ASSERT(port != NULL); 1663 1664 /* stop port if group has already been started */ 1665 if (grp->lg_started) { 1666 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1667 aggr_port_stop(port); 1668 mac_perim_exit(pmph); 1669 } 1670 1671 /* 1672 * aggr_rem_pseudo_tx_group() is not called here. Instead 1673 * it is called from inside aggr_grp_rem_port() after the 1674 * port has been detached. The reason is that 1675 * aggr_rem_pseudo_tx_group() removes one ring at a time 1676 * and if there is still traffic going on, then there 1677 * is the possibility of aggr_find_tx_ring() returning a 1678 * removed ring for transmission. Once the port has been 1679 * detached, that port will not be used and 1680 * aggr_find_tx_ring() will not return any rings 1681 * belonging to it. 1682 */ 1683 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group); 1684 1685 /* remove port from group */ 1686 rc = aggr_grp_rem_port(grp, port, &mac_addr_changed, 1687 &link_state_changed); 1688 ASSERT(rc == 0); 1689 mac_addr_update = mac_addr_update || mac_addr_changed; 1690 link_state_update = link_state_update || link_state_changed; 1691 } 1692 1693 bail: 1694 if (mac_addr_update) 1695 mac_unicst_update(grp->lg_mh, grp->lg_addr); 1696 if (link_state_update) 1697 mac_link_update(grp->lg_mh, grp->lg_link_state); 1698 1699 mac_perim_exit(mph); 1700 AGGR_GRP_REFRELE(grp); 1701 1702 return (rc); 1703 } 1704 1705 int 1706 aggr_grp_delete(datalink_id_t linkid, cred_t *cred) 1707 { 1708 aggr_grp_t *grp = NULL; 1709 aggr_port_t *port, *cport; 1710 datalink_id_t tmpid; 1711 mod_hash_val_t val; 1712 mac_perim_handle_t mph, pmph; 1713 int err; 1714 kt_did_t tid = 0; 1715 1716 rw_enter(&aggr_grp_lock, RW_WRITER); 1717 1718 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1719 (mod_hash_val_t *)&grp) != 0) { 1720 rw_exit(&aggr_grp_lock); 1721 return (ENOENT); 1722 } 1723 1724 /* 1725 * Note that dls_devnet_destroy() must be called before lg_lock is 1726 * held. Otherwise, it will deadlock if another thread is in 1727 * aggr_m_stat() and thus has a kstat_hold() on the kstats that 1728 * dls_devnet_destroy() needs to delete. 1729 */ 1730 if ((err = dls_devnet_destroy(grp->lg_mh, &tmpid, B_TRUE)) != 0) { 1731 rw_exit(&aggr_grp_lock); 1732 return (err); 1733 } 1734 ASSERT(linkid == tmpid); 1735 1736 /* 1737 * Unregister from the MAC service module. Since this can 1738 * fail if a client hasn't closed the MAC port, we gracefully 1739 * fail the operation. 1740 */ 1741 if ((err = mac_disable(grp->lg_mh)) != 0) { 1742 (void) dls_devnet_create(grp->lg_mh, linkid, crgetzoneid(cred)); 1743 rw_exit(&aggr_grp_lock); 1744 return (err); 1745 } 1746 (void) mod_hash_remove(aggr_grp_hash, GRP_HASH_KEY(linkid), &val); 1747 ASSERT(grp == (aggr_grp_t *)val); 1748 1749 ASSERT(aggr_grp_cnt > 0); 1750 aggr_grp_cnt--; 1751 rw_exit(&aggr_grp_lock); 1752 1753 /* 1754 * Inform the lacp_rx thread to exit. 1755 */ 1756 mutex_enter(&grp->lg_lacp_lock); 1757 grp->lg_lacp_done = B_TRUE; 1758 cv_signal(&grp->lg_lacp_cv); 1759 while (grp->lg_lacp_rx_thread != NULL) 1760 cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock); 1761 mutex_exit(&grp->lg_lacp_lock); 1762 /* 1763 * Inform the tx_notify_thread to exit. 1764 */ 1765 mutex_enter(&grp->lg_tx_flowctl_lock); 1766 if (grp->lg_tx_notify_thread != NULL) { 1767 tid = grp->lg_tx_notify_thread->t_did; 1768 grp->lg_tx_notify_done = B_TRUE; 1769 cv_signal(&grp->lg_tx_flowctl_cv); 1770 } 1771 mutex_exit(&grp->lg_tx_flowctl_lock); 1772 if (tid != 0) 1773 thread_join(tid); 1774 1775 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1776 1777 grp->lg_closing = B_TRUE; 1778 /* detach and free MAC ports associated with group */ 1779 port = grp->lg_ports; 1780 while (port != NULL) { 1781 cport = port->lp_next; 1782 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1783 if (grp->lg_started) 1784 aggr_port_stop(port); 1785 (void) aggr_grp_detach_port(grp, port); 1786 mac_perim_exit(pmph); 1787 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group); 1788 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group); 1789 aggr_port_delete(port); 1790 port = cport; 1791 } 1792 1793 mac_perim_exit(mph); 1794 1795 kmem_free(grp->lg_tx_blocked_rings, 1796 (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP)); 1797 /* 1798 * Wait for the port's lacp timer thread and its notification callback 1799 * to exit before calling mac_unregister() since both needs to access 1800 * the mac perimeter of the grp. 1801 */ 1802 aggr_grp_port_wait(grp); 1803 1804 VERIFY(mac_unregister(grp->lg_mh) == 0); 1805 grp->lg_mh = NULL; 1806 1807 AGGR_GRP_REFRELE(grp); 1808 return (0); 1809 } 1810 1811 void 1812 aggr_grp_free(aggr_grp_t *grp) 1813 { 1814 ASSERT(grp->lg_refs == 0); 1815 ASSERT(grp->lg_port_ref == 0); 1816 if (grp->lg_key > AGGR_MAX_KEY) { 1817 id_free(key_ids, grp->lg_key); 1818 grp->lg_key = 0; 1819 } 1820 kmem_cache_free(aggr_grp_cache, grp); 1821 } 1822 1823 int 1824 aggr_grp_info(datalink_id_t linkid, void *fn_arg, 1825 aggr_grp_info_new_grp_fn_t new_grp_fn, 1826 aggr_grp_info_new_port_fn_t new_port_fn, cred_t *cred) 1827 { 1828 aggr_grp_t *grp; 1829 aggr_port_t *port; 1830 mac_perim_handle_t mph, pmph; 1831 int rc = 0; 1832 1833 /* 1834 * Make sure that the aggregation link is visible from the caller's 1835 * zone. 1836 */ 1837 if (!dls_devnet_islinkvisible(linkid, crgetzoneid(cred))) 1838 return (ENOENT); 1839 1840 rw_enter(&aggr_grp_lock, RW_READER); 1841 1842 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1843 (mod_hash_val_t *)&grp) != 0) { 1844 rw_exit(&aggr_grp_lock); 1845 return (ENOENT); 1846 } 1847 AGGR_GRP_REFHOLD(grp); 1848 1849 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1850 rw_exit(&aggr_grp_lock); 1851 1852 rc = new_grp_fn(fn_arg, grp->lg_linkid, 1853 (grp->lg_key > AGGR_MAX_KEY) ? 0 : grp->lg_key, grp->lg_addr, 1854 grp->lg_addr_fixed, grp->lg_force, grp->lg_tx_policy, 1855 grp->lg_nports, grp->lg_lacp_mode, grp->aggr.PeriodicTimer); 1856 1857 if (rc != 0) 1858 goto bail; 1859 1860 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 1861 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1862 rc = new_port_fn(fn_arg, port->lp_linkid, port->lp_addr, 1863 port->lp_state, &port->lp_lacp.ActorOperPortState); 1864 mac_perim_exit(pmph); 1865 1866 if (rc != 0) 1867 goto bail; 1868 } 1869 1870 bail: 1871 mac_perim_exit(mph); 1872 AGGR_GRP_REFRELE(grp); 1873 return (rc); 1874 } 1875 1876 /*ARGSUSED*/ 1877 static void 1878 aggr_m_ioctl(void *arg, queue_t *q, mblk_t *mp) 1879 { 1880 miocnak(q, mp, 0, ENOTSUP); 1881 } 1882 1883 static int 1884 aggr_grp_stat(aggr_grp_t *grp, uint_t stat, uint64_t *val) 1885 { 1886 aggr_port_t *port; 1887 uint_t stat_index; 1888 1889 /* We only aggregate counter statistics. */ 1890 if (IS_MAC_STAT(stat) && !MAC_STAT_ISACOUNTER(stat) || 1891 IS_MACTYPE_STAT(stat) && !ETHER_STAT_ISACOUNTER(stat)) { 1892 return (ENOTSUP); 1893 } 1894 1895 /* 1896 * Counter statistics for a group are computed by aggregating the 1897 * counters of the members MACs while they were aggregated, plus 1898 * the residual counter of the group itself, which is updated each 1899 * time a MAC is removed from the group. 1900 */ 1901 *val = 0; 1902 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 1903 /* actual port statistic */ 1904 *val += aggr_port_stat(port, stat); 1905 /* 1906 * minus the port stat when it was added, plus any residual 1907 * amount for the group. 1908 */ 1909 if (IS_MAC_STAT(stat)) { 1910 stat_index = stat - MAC_STAT_MIN; 1911 *val -= port->lp_stat[stat_index]; 1912 *val += grp->lg_stat[stat_index]; 1913 } else if (IS_MACTYPE_STAT(stat)) { 1914 stat_index = stat - MACTYPE_STAT_MIN; 1915 *val -= port->lp_ether_stat[stat_index]; 1916 *val += grp->lg_ether_stat[stat_index]; 1917 } 1918 } 1919 return (0); 1920 } 1921 1922 int 1923 aggr_rx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val) 1924 { 1925 aggr_pseudo_rx_ring_t *rx_ring = (aggr_pseudo_rx_ring_t *)rdriver; 1926 1927 if (rx_ring->arr_hw_rh != NULL) { 1928 *val = mac_pseudo_rx_ring_stat_get(rx_ring->arr_hw_rh, stat); 1929 } else { 1930 aggr_port_t *port = rx_ring->arr_port; 1931 1932 *val = mac_stat_get(port->lp_mh, stat); 1933 1934 } 1935 return (0); 1936 } 1937 1938 int 1939 aggr_tx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val) 1940 { 1941 aggr_pseudo_tx_ring_t *tx_ring = (aggr_pseudo_tx_ring_t *)rdriver; 1942 1943 if (tx_ring->atr_hw_rh != NULL) { 1944 *val = mac_pseudo_tx_ring_stat_get(tx_ring->atr_hw_rh, stat); 1945 } else { 1946 aggr_port_t *port = tx_ring->atr_port; 1947 1948 *val = mac_stat_get(port->lp_mh, stat); 1949 } 1950 return (0); 1951 } 1952 1953 static int 1954 aggr_m_stat(void *arg, uint_t stat, uint64_t *val) 1955 { 1956 aggr_grp_t *grp = arg; 1957 mac_perim_handle_t mph; 1958 int rval = 0; 1959 1960 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1961 1962 switch (stat) { 1963 case MAC_STAT_IFSPEED: 1964 *val = grp->lg_ifspeed; 1965 break; 1966 1967 case ETHER_STAT_LINK_DUPLEX: 1968 *val = grp->lg_link_duplex; 1969 break; 1970 1971 default: 1972 /* 1973 * For all other statistics, we return the aggregated stat 1974 * from the underlying ports. aggr_grp_stat() will set 1975 * rval appropriately if the statistic isn't a counter. 1976 */ 1977 rval = aggr_grp_stat(grp, stat, val); 1978 } 1979 1980 mac_perim_exit(mph); 1981 return (rval); 1982 } 1983 1984 static int 1985 aggr_m_start(void *arg) 1986 { 1987 aggr_grp_t *grp = arg; 1988 aggr_port_t *port; 1989 mac_perim_handle_t mph, pmph; 1990 1991 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1992 1993 /* 1994 * Attempts to start all configured members of the group. 1995 * Group members will be attached when their link-up notification 1996 * is received. 1997 */ 1998 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 1999 mac_perim_enter_by_mh(port->lp_mh, &pmph); 2000 if (aggr_port_start(port) != 0) { 2001 mac_perim_exit(pmph); 2002 continue; 2003 } 2004 2005 /* 2006 * Turn on the promiscuous mode if it is required to receive 2007 * the non-primary address over a port, or the promiscous 2008 * mode is enabled over the aggr. 2009 */ 2010 if (grp->lg_promisc || port->lp_prom_addr != NULL) { 2011 if (aggr_port_promisc(port, B_TRUE) != 0) 2012 aggr_port_stop(port); 2013 } 2014 mac_perim_exit(pmph); 2015 } 2016 2017 grp->lg_started = B_TRUE; 2018 2019 mac_perim_exit(mph); 2020 return (0); 2021 } 2022 2023 static void 2024 aggr_m_stop(void *arg) 2025 { 2026 aggr_grp_t *grp = arg; 2027 aggr_port_t *port; 2028 mac_perim_handle_t mph, pmph; 2029 2030 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2031 2032 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2033 mac_perim_enter_by_mh(port->lp_mh, &pmph); 2034 2035 /* reset port promiscuous mode */ 2036 (void) aggr_port_promisc(port, B_FALSE); 2037 2038 aggr_port_stop(port); 2039 mac_perim_exit(pmph); 2040 } 2041 2042 grp->lg_started = B_FALSE; 2043 mac_perim_exit(mph); 2044 } 2045 2046 static int 2047 aggr_m_promisc(void *arg, boolean_t on) 2048 { 2049 aggr_grp_t *grp = arg; 2050 aggr_port_t *port; 2051 boolean_t link_state_changed = B_FALSE; 2052 mac_perim_handle_t mph, pmph; 2053 2054 AGGR_GRP_REFHOLD(grp); 2055 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2056 2057 ASSERT(!grp->lg_closing); 2058 2059 if (on == grp->lg_promisc) 2060 goto bail; 2061 2062 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2063 int err = 0; 2064 2065 mac_perim_enter_by_mh(port->lp_mh, &pmph); 2066 AGGR_PORT_REFHOLD(port); 2067 if (!on && (port->lp_prom_addr == NULL)) 2068 err = aggr_port_promisc(port, B_FALSE); 2069 else if (on && port->lp_started) 2070 err = aggr_port_promisc(port, B_TRUE); 2071 2072 if (err != 0) { 2073 if (aggr_grp_detach_port(grp, port)) 2074 link_state_changed = B_TRUE; 2075 } else { 2076 /* 2077 * If a port was detached because of a previous 2078 * failure changing the promiscuity, the port 2079 * is reattached when it successfully changes 2080 * the promiscuity now, and this might cause 2081 * the link state of the aggregation to change. 2082 */ 2083 if (aggr_grp_attach_port(grp, port)) 2084 link_state_changed = B_TRUE; 2085 } 2086 mac_perim_exit(pmph); 2087 AGGR_PORT_REFRELE(port); 2088 } 2089 2090 grp->lg_promisc = on; 2091 2092 if (link_state_changed) 2093 mac_link_update(grp->lg_mh, grp->lg_link_state); 2094 2095 bail: 2096 mac_perim_exit(mph); 2097 AGGR_GRP_REFRELE(grp); 2098 2099 return (0); 2100 } 2101 2102 static void 2103 aggr_grp_port_rename(const char *new_name, void *arg) 2104 { 2105 /* 2106 * aggr port's mac client name is the format of "aggr link name" plus 2107 * AGGR_PORT_NAME_DELIMIT plus "underneath link name". 2108 */ 2109 int aggr_len, link_len, clnt_name_len, i; 2110 char *str_end, *str_st, *str_del; 2111 char aggr_name[MAXNAMELEN]; 2112 char link_name[MAXNAMELEN]; 2113 char *clnt_name; 2114 aggr_grp_t *aggr_grp = arg; 2115 aggr_port_t *aggr_port = aggr_grp->lg_ports; 2116 2117 for (i = 0; i < aggr_grp->lg_nports; i++) { 2118 clnt_name = mac_client_name(aggr_port->lp_mch); 2119 clnt_name_len = strlen(clnt_name); 2120 str_st = clnt_name; 2121 str_end = &(clnt_name[clnt_name_len]); 2122 str_del = strchr(str_st, AGGR_PORT_NAME_DELIMIT); 2123 ASSERT(str_del != NULL); 2124 aggr_len = (intptr_t)((uintptr_t)str_del - (uintptr_t)str_st); 2125 link_len = (intptr_t)((uintptr_t)str_end - (uintptr_t)str_del); 2126 bzero(aggr_name, MAXNAMELEN); 2127 bzero(link_name, MAXNAMELEN); 2128 bcopy(clnt_name, aggr_name, aggr_len); 2129 bcopy(str_del, link_name, link_len + 1); 2130 bzero(clnt_name, MAXNAMELEN); 2131 (void) snprintf(clnt_name, MAXNAMELEN, "%s%s", new_name, 2132 link_name); 2133 2134 (void) mac_rename_primary(aggr_port->lp_mh, NULL); 2135 aggr_port = aggr_port->lp_next; 2136 } 2137 } 2138 2139 /* 2140 * Initialize the capabilities that are advertised for the group 2141 * according to the capabilities of the constituent ports. 2142 */ 2143 static boolean_t 2144 aggr_m_capab_get(void *arg, mac_capab_t cap, void *cap_data) 2145 { 2146 aggr_grp_t *grp = arg; 2147 2148 switch (cap) { 2149 case MAC_CAPAB_HCKSUM: { 2150 uint32_t *hcksum_txflags = cap_data; 2151 *hcksum_txflags = grp->lg_hcksum_txflags; 2152 break; 2153 } 2154 case MAC_CAPAB_LSO: { 2155 mac_capab_lso_t *cap_lso = cap_data; 2156 2157 if (grp->lg_lso) { 2158 *cap_lso = grp->lg_cap_lso; 2159 break; 2160 } else { 2161 return (B_FALSE); 2162 } 2163 } 2164 case MAC_CAPAB_NO_NATIVEVLAN: 2165 return (!grp->lg_vlan); 2166 case MAC_CAPAB_NO_ZCOPY: 2167 return (!grp->lg_zcopy); 2168 case MAC_CAPAB_RINGS: { 2169 mac_capab_rings_t *cap_rings = cap_data; 2170 2171 if (cap_rings->mr_type == MAC_RING_TYPE_RX) { 2172 cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; 2173 cap_rings->mr_rnum = grp->lg_rx_group.arg_ring_cnt; 2174 2175 /* 2176 * An aggregation advertises only one (pseudo) RX 2177 * group, which virtualizes the main/primary group of 2178 * the underlying devices. 2179 */ 2180 cap_rings->mr_gnum = 1; 2181 cap_rings->mr_gaddring = NULL; 2182 cap_rings->mr_gremring = NULL; 2183 } else { 2184 cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; 2185 cap_rings->mr_rnum = grp->lg_tx_group.atg_ring_cnt; 2186 cap_rings->mr_gnum = 0; 2187 } 2188 cap_rings->mr_rget = aggr_fill_ring; 2189 cap_rings->mr_gget = aggr_fill_group; 2190 break; 2191 } 2192 case MAC_CAPAB_AGGR: 2193 { 2194 mac_capab_aggr_t *aggr_cap; 2195 2196 if (cap_data != NULL) { 2197 aggr_cap = cap_data; 2198 aggr_cap->mca_rename_fn = aggr_grp_port_rename; 2199 aggr_cap->mca_unicst = aggr_m_unicst; 2200 aggr_cap->mca_find_tx_ring_fn = aggr_find_tx_ring; 2201 aggr_cap->mca_arg = arg; 2202 } 2203 return (B_TRUE); 2204 } 2205 default: 2206 return (B_FALSE); 2207 } 2208 return (B_TRUE); 2209 } 2210 2211 /* 2212 * Callback funtion for MAC layer to register groups. 2213 */ 2214 static void 2215 aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index, 2216 mac_group_info_t *infop, mac_group_handle_t gh) 2217 { 2218 aggr_grp_t *grp = arg; 2219 aggr_pseudo_rx_group_t *rx_group; 2220 aggr_pseudo_tx_group_t *tx_group; 2221 2222 ASSERT(index == 0); 2223 if (rtype == MAC_RING_TYPE_RX) { 2224 rx_group = &grp->lg_rx_group; 2225 rx_group->arg_gh = gh; 2226 rx_group->arg_grp = grp; 2227 2228 infop->mgi_driver = (mac_group_driver_t)rx_group; 2229 infop->mgi_start = NULL; 2230 infop->mgi_stop = NULL; 2231 infop->mgi_addmac = aggr_addmac; 2232 infop->mgi_remmac = aggr_remmac; 2233 infop->mgi_count = rx_group->arg_ring_cnt; 2234 } else { 2235 tx_group = &grp->lg_tx_group; 2236 tx_group->atg_gh = gh; 2237 } 2238 } 2239 2240 /* 2241 * Callback funtion for MAC layer to register all rings. 2242 */ 2243 static void 2244 aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index, 2245 const int index, mac_ring_info_t *infop, mac_ring_handle_t rh) 2246 { 2247 aggr_grp_t *grp = arg; 2248 2249 switch (rtype) { 2250 case MAC_RING_TYPE_RX: { 2251 aggr_pseudo_rx_group_t *rx_group = &grp->lg_rx_group; 2252 aggr_pseudo_rx_ring_t *rx_ring; 2253 mac_intr_t aggr_mac_intr; 2254 2255 ASSERT(rg_index == 0); 2256 2257 ASSERT((index >= 0) && (index < rx_group->arg_ring_cnt)); 2258 rx_ring = rx_group->arg_rings + index; 2259 rx_ring->arr_rh = rh; 2260 2261 /* 2262 * Entrypoint to enable interrupt (disable poll) and 2263 * disable interrupt (enable poll). 2264 */ 2265 aggr_mac_intr.mi_handle = (mac_intr_handle_t)rx_ring; 2266 aggr_mac_intr.mi_enable = aggr_pseudo_enable_intr; 2267 aggr_mac_intr.mi_disable = aggr_pseudo_disable_intr; 2268 aggr_mac_intr.mi_ddi_handle = NULL; 2269 2270 infop->mri_driver = (mac_ring_driver_t)rx_ring; 2271 infop->mri_start = aggr_pseudo_start_ring; 2272 infop->mri_stop = aggr_pseudo_stop_ring; 2273 2274 infop->mri_intr = aggr_mac_intr; 2275 infop->mri_poll = aggr_rx_poll; 2276 2277 infop->mri_stat = aggr_rx_ring_stat; 2278 break; 2279 } 2280 case MAC_RING_TYPE_TX: { 2281 aggr_pseudo_tx_group_t *tx_group = &grp->lg_tx_group; 2282 aggr_pseudo_tx_ring_t *tx_ring; 2283 2284 ASSERT(rg_index == -1); 2285 ASSERT(index < tx_group->atg_ring_cnt); 2286 2287 tx_ring = &tx_group->atg_rings[index]; 2288 tx_ring->atr_rh = rh; 2289 2290 infop->mri_driver = (mac_ring_driver_t)tx_ring; 2291 infop->mri_start = NULL; 2292 infop->mri_stop = NULL; 2293 infop->mri_tx = aggr_ring_tx; 2294 infop->mri_stat = aggr_tx_ring_stat; 2295 /* 2296 * Use the hw TX ring handle to find if the ring needs 2297 * serialization or not. For NICs that do not expose 2298 * Tx rings, atr_hw_rh will be NULL. 2299 */ 2300 if (tx_ring->atr_hw_rh != NULL) { 2301 infop->mri_flags = 2302 mac_hwring_getinfo(tx_ring->atr_hw_rh); 2303 } 2304 break; 2305 } 2306 default: 2307 break; 2308 } 2309 } 2310 2311 static mblk_t * 2312 aggr_rx_poll(void *arg, int bytes_to_pickup) 2313 { 2314 aggr_pseudo_rx_ring_t *rr_ring = arg; 2315 aggr_port_t *port = rr_ring->arr_port; 2316 aggr_grp_t *grp = port->lp_grp; 2317 mblk_t *mp_chain, *mp, **mpp; 2318 2319 mp_chain = mac_hwring_poll(rr_ring->arr_hw_rh, bytes_to_pickup); 2320 2321 if (grp->lg_lacp_mode == AGGR_LACP_OFF) 2322 return (mp_chain); 2323 2324 mpp = &mp_chain; 2325 while ((mp = *mpp) != NULL) { 2326 if (MBLKL(mp) >= sizeof (struct ether_header)) { 2327 struct ether_header *ehp; 2328 2329 ehp = (struct ether_header *)mp->b_rptr; 2330 if (ntohs(ehp->ether_type) == ETHERTYPE_SLOW) { 2331 *mpp = mp->b_next; 2332 mp->b_next = NULL; 2333 aggr_recv_lacp(port, 2334 (mac_resource_handle_t)rr_ring, mp); 2335 continue; 2336 } 2337 } 2338 2339 if (!port->lp_collector_enabled) { 2340 *mpp = mp->b_next; 2341 mp->b_next = NULL; 2342 freemsg(mp); 2343 continue; 2344 } 2345 mpp = &mp->b_next; 2346 } 2347 return (mp_chain); 2348 } 2349 2350 static int 2351 aggr_addmac(void *arg, const uint8_t *mac_addr) 2352 { 2353 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg; 2354 aggr_unicst_addr_t *addr, **pprev; 2355 aggr_grp_t *grp = rx_group->arg_grp; 2356 aggr_port_t *port, *p; 2357 mac_perim_handle_t mph; 2358 int err = 0; 2359 2360 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2361 2362 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) { 2363 mac_perim_exit(mph); 2364 return (0); 2365 } 2366 2367 /* 2368 * Insert this mac address into the list of mac addresses owned by 2369 * the aggregation pseudo group. 2370 */ 2371 pprev = &rx_group->arg_macaddr; 2372 while ((addr = *pprev) != NULL) { 2373 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) == 0) { 2374 mac_perim_exit(mph); 2375 return (EEXIST); 2376 } 2377 pprev = &addr->aua_next; 2378 } 2379 addr = kmem_alloc(sizeof (aggr_unicst_addr_t), KM_SLEEP); 2380 bcopy(mac_addr, addr->aua_addr, ETHERADDRL); 2381 addr->aua_next = NULL; 2382 *pprev = addr; 2383 2384 for (port = grp->lg_ports; port != NULL; port = port->lp_next) 2385 if ((err = aggr_port_addmac(port, mac_addr)) != 0) 2386 break; 2387 2388 if (err != 0) { 2389 for (p = grp->lg_ports; p != port; p = p->lp_next) 2390 aggr_port_remmac(p, mac_addr); 2391 2392 *pprev = NULL; 2393 kmem_free(addr, sizeof (aggr_unicst_addr_t)); 2394 } 2395 2396 mac_perim_exit(mph); 2397 return (err); 2398 } 2399 2400 static int 2401 aggr_remmac(void *arg, const uint8_t *mac_addr) 2402 { 2403 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg; 2404 aggr_unicst_addr_t *addr, **pprev; 2405 aggr_grp_t *grp = rx_group->arg_grp; 2406 aggr_port_t *port; 2407 mac_perim_handle_t mph; 2408 int err = 0; 2409 2410 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2411 2412 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) { 2413 mac_perim_exit(mph); 2414 return (0); 2415 } 2416 2417 /* 2418 * Insert this mac address into the list of mac addresses owned by 2419 * the aggregation pseudo group. 2420 */ 2421 pprev = &rx_group->arg_macaddr; 2422 while ((addr = *pprev) != NULL) { 2423 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) != 0) { 2424 pprev = &addr->aua_next; 2425 continue; 2426 } 2427 break; 2428 } 2429 if (addr == NULL) { 2430 mac_perim_exit(mph); 2431 return (EINVAL); 2432 } 2433 2434 for (port = grp->lg_ports; port != NULL; port = port->lp_next) 2435 aggr_port_remmac(port, mac_addr); 2436 2437 *pprev = addr->aua_next; 2438 kmem_free(addr, sizeof (aggr_unicst_addr_t)); 2439 2440 mac_perim_exit(mph); 2441 return (err); 2442 } 2443 2444 /* 2445 * Add or remove the multicast addresses that are defined for the group 2446 * to or from the specified port. 2447 * 2448 * Note that aggr_grp_multicst_port(..., B_TRUE) is called when the port 2449 * is started and attached, and aggr_grp_multicst_port(..., B_FALSE) is 2450 * called when the port is either stopped or detached. 2451 */ 2452 void 2453 aggr_grp_multicst_port(aggr_port_t *port, boolean_t add) 2454 { 2455 aggr_grp_t *grp = port->lp_grp; 2456 2457 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 2458 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 2459 2460 if (!port->lp_started || port->lp_state != AGGR_PORT_STATE_ATTACHED) 2461 return; 2462 2463 mac_multicast_refresh(grp->lg_mh, aggr_port_multicst, port, add); 2464 } 2465 2466 static int 2467 aggr_m_multicst(void *arg, boolean_t add, const uint8_t *addrp) 2468 { 2469 aggr_grp_t *grp = arg; 2470 aggr_port_t *port = NULL, *errport = NULL; 2471 mac_perim_handle_t mph; 2472 int err = 0; 2473 2474 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2475 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2476 if (port->lp_state != AGGR_PORT_STATE_ATTACHED || 2477 !port->lp_started) { 2478 continue; 2479 } 2480 err = aggr_port_multicst(port, add, addrp); 2481 if (err != 0) { 2482 errport = port; 2483 break; 2484 } 2485 } 2486 2487 /* 2488 * At least one port caused error return and this error is returned to 2489 * mac, eventually a NAK would be sent upwards. 2490 * Some ports have this multicast address listed now, and some don't. 2491 * Treat this error as a whole aggr failure not individual port failure. 2492 * Therefore remove this multicast address from other ports. 2493 */ 2494 if ((err != 0) && add) { 2495 for (port = grp->lg_ports; port != errport; 2496 port = port->lp_next) { 2497 if (port->lp_state != AGGR_PORT_STATE_ATTACHED || 2498 !port->lp_started) { 2499 continue; 2500 } 2501 (void) aggr_port_multicst(port, B_FALSE, addrp); 2502 } 2503 } 2504 mac_perim_exit(mph); 2505 return (err); 2506 } 2507 2508 static int 2509 aggr_m_unicst(void *arg, const uint8_t *macaddr) 2510 { 2511 aggr_grp_t *grp = arg; 2512 mac_perim_handle_t mph; 2513 int err; 2514 2515 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2516 err = aggr_grp_modify_common(grp, AGGR_MODIFY_MAC, 0, B_TRUE, macaddr, 2517 0, 0); 2518 mac_perim_exit(mph); 2519 return (err); 2520 } 2521 2522 /* 2523 * Initialize the capabilities that are advertised for the group 2524 * according to the capabilities of the constituent ports. 2525 */ 2526 static void 2527 aggr_grp_capab_set(aggr_grp_t *grp) 2528 { 2529 uint32_t cksum; 2530 aggr_port_t *port; 2531 mac_capab_lso_t cap_lso; 2532 2533 ASSERT(grp->lg_mh == NULL); 2534 ASSERT(grp->lg_ports != NULL); 2535 2536 grp->lg_hcksum_txflags = (uint32_t)-1; 2537 grp->lg_zcopy = B_TRUE; 2538 grp->lg_vlan = B_TRUE; 2539 2540 grp->lg_lso = B_TRUE; 2541 grp->lg_cap_lso.lso_flags = (t_uscalar_t)-1; 2542 grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max = (t_uscalar_t)-1; 2543 2544 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2545 if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &cksum)) 2546 cksum = 0; 2547 grp->lg_hcksum_txflags &= cksum; 2548 2549 grp->lg_vlan &= 2550 !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL); 2551 2552 grp->lg_zcopy &= 2553 !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL); 2554 2555 grp->lg_lso &= 2556 mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso); 2557 if (grp->lg_lso) { 2558 grp->lg_cap_lso.lso_flags &= cap_lso.lso_flags; 2559 if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max > 2560 cap_lso.lso_basic_tcp_ipv4.lso_max) 2561 grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max = 2562 cap_lso.lso_basic_tcp_ipv4.lso_max; 2563 } 2564 } 2565 } 2566 2567 /* 2568 * Checks whether the capabilities of the port being added are compatible 2569 * with the current capabilities of the aggregation. 2570 */ 2571 static boolean_t 2572 aggr_grp_capab_check(aggr_grp_t *grp, aggr_port_t *port) 2573 { 2574 uint32_t hcksum_txflags; 2575 2576 ASSERT(grp->lg_ports != NULL); 2577 2578 if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL)) & 2579 grp->lg_vlan) != grp->lg_vlan) { 2580 return (B_FALSE); 2581 } 2582 2583 if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL)) & 2584 grp->lg_zcopy) != grp->lg_zcopy) { 2585 return (B_FALSE); 2586 } 2587 2588 if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &hcksum_txflags)) { 2589 if (grp->lg_hcksum_txflags != 0) 2590 return (B_FALSE); 2591 } else if ((hcksum_txflags & grp->lg_hcksum_txflags) != 2592 grp->lg_hcksum_txflags) { 2593 return (B_FALSE); 2594 } 2595 2596 if (grp->lg_lso) { 2597 mac_capab_lso_t cap_lso; 2598 2599 if (mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso)) { 2600 if ((grp->lg_cap_lso.lso_flags & cap_lso.lso_flags) != 2601 grp->lg_cap_lso.lso_flags) 2602 return (B_FALSE); 2603 if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max > 2604 cap_lso.lso_basic_tcp_ipv4.lso_max) 2605 return (B_FALSE); 2606 } else { 2607 return (B_FALSE); 2608 } 2609 } 2610 2611 return (B_TRUE); 2612 } 2613 2614 /* 2615 * Returns the maximum SDU according to the SDU of the constituent ports. 2616 */ 2617 static uint_t 2618 aggr_grp_max_sdu(aggr_grp_t *grp) 2619 { 2620 uint_t max_sdu = (uint_t)-1; 2621 aggr_port_t *port; 2622 2623 ASSERT(grp->lg_ports != NULL); 2624 2625 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2626 uint_t port_sdu_max; 2627 2628 mac_sdu_get(port->lp_mh, NULL, &port_sdu_max); 2629 if (max_sdu > port_sdu_max) 2630 max_sdu = port_sdu_max; 2631 } 2632 2633 return (max_sdu); 2634 } 2635 2636 /* 2637 * Checks if the maximum SDU of the specified port is compatible 2638 * with the maximum SDU of the specified aggregation group, returns 2639 * B_TRUE if it is, B_FALSE otherwise. 2640 */ 2641 static boolean_t 2642 aggr_grp_sdu_check(aggr_grp_t *grp, aggr_port_t *port) 2643 { 2644 uint_t port_sdu_max; 2645 2646 mac_sdu_get(port->lp_mh, NULL, &port_sdu_max); 2647 return (port_sdu_max >= grp->lg_max_sdu); 2648 } 2649 2650 /* 2651 * Returns the maximum margin according to the margin of the constituent ports. 2652 */ 2653 static uint32_t 2654 aggr_grp_max_margin(aggr_grp_t *grp) 2655 { 2656 uint32_t margin = UINT32_MAX; 2657 aggr_port_t *port; 2658 2659 ASSERT(grp->lg_mh == NULL); 2660 ASSERT(grp->lg_ports != NULL); 2661 2662 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2663 if (margin > port->lp_margin) 2664 margin = port->lp_margin; 2665 } 2666 2667 grp->lg_margin = margin; 2668 return (margin); 2669 } 2670 2671 /* 2672 * Checks if the maximum margin of the specified port is compatible 2673 * with the maximum margin of the specified aggregation group, returns 2674 * B_TRUE if it is, B_FALSE otherwise. 2675 */ 2676 static boolean_t 2677 aggr_grp_margin_check(aggr_grp_t *grp, aggr_port_t *port) 2678 { 2679 if (port->lp_margin >= grp->lg_margin) 2680 return (B_TRUE); 2681 2682 /* 2683 * See whether the current margin value is allowed to be changed to 2684 * the new value. 2685 */ 2686 if (!mac_margin_update(grp->lg_mh, port->lp_margin)) 2687 return (B_FALSE); 2688 2689 grp->lg_margin = port->lp_margin; 2690 return (B_TRUE); 2691 } 2692 2693 /* 2694 * Set MTU on individual ports of an aggregation group 2695 */ 2696 static int 2697 aggr_set_port_sdu(aggr_grp_t *grp, aggr_port_t *port, uint32_t sdu, 2698 uint32_t *old_mtu) 2699 { 2700 boolean_t removed = B_FALSE; 2701 mac_perim_handle_t mph; 2702 mac_diag_t diag; 2703 int err, rv, retry = 0; 2704 2705 if (port->lp_mah != NULL) { 2706 (void) mac_unicast_remove(port->lp_mch, port->lp_mah); 2707 port->lp_mah = NULL; 2708 removed = B_TRUE; 2709 } 2710 err = mac_set_mtu(port->lp_mh, sdu, old_mtu); 2711 try_again: 2712 if (removed && (rv = mac_unicast_add(port->lp_mch, NULL, 2713 MAC_UNICAST_PRIMARY | MAC_UNICAST_DISABLE_TX_VID_CHECK, 2714 &port->lp_mah, 0, &diag)) != 0) { 2715 /* 2716 * following is a workaround for a bug in 'bge' driver. 2717 * See CR 6794654 for more information and this work around 2718 * will be removed once the CR is fixed. 2719 */ 2720 if (rv == EIO && retry++ < 3) { 2721 delay(2 * hz); 2722 goto try_again; 2723 } 2724 /* 2725 * if mac_unicast_add() failed while setting the MTU, 2726 * detach the port from the group. 2727 */ 2728 mac_perim_enter_by_mh(port->lp_mh, &mph); 2729 (void) aggr_grp_detach_port(grp, port); 2730 mac_perim_exit(mph); 2731 cmn_err(CE_WARN, "Unable to restart the port %s while " 2732 "setting MTU. Detaching the port from the aggregation.", 2733 mac_client_name(port->lp_mch)); 2734 } 2735 return (err); 2736 } 2737 2738 static int 2739 aggr_sdu_update(aggr_grp_t *grp, uint32_t sdu) 2740 { 2741 int err = 0, i, rv; 2742 aggr_port_t *port; 2743 uint32_t *mtu; 2744 2745 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 2746 2747 /* 2748 * If the MTU being set is equal to aggr group's maximum 2749 * allowable value, then there is nothing to change 2750 */ 2751 if (sdu == grp->lg_max_sdu) 2752 return (0); 2753 2754 /* 0 is aggr group's min sdu */ 2755 if (sdu == 0) 2756 return (EINVAL); 2757 2758 mtu = kmem_alloc(sizeof (uint32_t) * grp->lg_nports, KM_SLEEP); 2759 for (port = grp->lg_ports, i = 0; port != NULL && err == 0; 2760 port = port->lp_next, i++) { 2761 err = aggr_set_port_sdu(grp, port, sdu, mtu + i); 2762 } 2763 if (err != 0) { 2764 /* recover from error: reset the mtus of the ports */ 2765 aggr_port_t *tmp; 2766 2767 for (tmp = grp->lg_ports, i = 0; tmp != port; 2768 tmp = tmp->lp_next, i++) { 2769 (void) aggr_set_port_sdu(grp, tmp, *(mtu + i), NULL); 2770 } 2771 goto bail; 2772 } 2773 grp->lg_max_sdu = aggr_grp_max_sdu(grp); 2774 rv = mac_maxsdu_update(grp->lg_mh, grp->lg_max_sdu); 2775 ASSERT(rv == 0); 2776 bail: 2777 kmem_free(mtu, sizeof (uint32_t) * grp->lg_nports); 2778 return (err); 2779 } 2780 2781 /* 2782 * Callback functions for set/get of properties 2783 */ 2784 /*ARGSUSED*/ 2785 static int 2786 aggr_m_setprop(void *m_driver, const char *pr_name, mac_prop_id_t pr_num, 2787 uint_t pr_valsize, const void *pr_val) 2788 { 2789 int err = ENOTSUP; 2790 aggr_grp_t *grp = m_driver; 2791 2792 switch (pr_num) { 2793 case MAC_PROP_MTU: { 2794 uint32_t mtu; 2795 2796 if (pr_valsize < sizeof (mtu)) { 2797 err = EINVAL; 2798 break; 2799 } 2800 bcopy(pr_val, &mtu, sizeof (mtu)); 2801 err = aggr_sdu_update(grp, mtu); 2802 break; 2803 } 2804 default: 2805 break; 2806 } 2807 return (err); 2808 } 2809 2810 typedef struct rboundary { 2811 uint32_t bval; 2812 int btype; 2813 } rboundary_t; 2814 2815 /* 2816 * This function finds the intersection of mtu ranges stored in arrays - 2817 * mrange[0] ... mrange[mcount -1]. It returns the intersection in rval. 2818 * Individual arrays are assumed to contain non-overlapping ranges. 2819 * Algorithm: 2820 * A range has two boundaries - min and max. We scan all arrays and store 2821 * each boundary as a separate element in a temporary array. We also store 2822 * the boundary types, min or max, as +1 or -1 respectively in the temporary 2823 * array. Then we sort the temporary array in ascending order. We scan the 2824 * sorted array from lower to higher values and keep a cumulative sum of 2825 * boundary types. Element in the temporary array for which the sum reaches 2826 * mcount is a min boundary of a range in the result and next element will be 2827 * max boundary. 2828 * 2829 * Example for mcount = 3, 2830 * 2831 * ----|_________|-------|_______|----|__|------ mrange[0] 2832 * 2833 * -------|________|--|____________|-----|___|-- mrange[1] 2834 * 2835 * --------|________________|-------|____|------ mrange[2] 2836 * 2837 * 3 2 1 2838 * \|/ 2839 * 1 23 2 1 2 3 2 1 01 2 V 0 <- the sum 2840 * ----|--||-----|-|--|--|--|----|-||-|--|---|-- sorted array 2841 * 2842 * same min and max 2843 * V 2844 * --------|_____|-------|__|------------|------ intersecting ranges 2845 */ 2846 void 2847 aggr_mtu_range_intersection(mac_propval_range_t **mrange, int mcount, 2848 mac_propval_uint32_range_t **prval, int *prmaxcnt, int *prcount) 2849 { 2850 mac_propval_uint32_range_t *rval, *ur; 2851 int rmaxcnt, rcount; 2852 size_t sz_range32; 2853 rboundary_t *ta; /* temporary array */ 2854 rboundary_t temp; 2855 boolean_t range_started = B_FALSE; 2856 int i, j, m, sum; 2857 2858 sz_range32 = sizeof (mac_propval_uint32_range_t); 2859 2860 for (i = 0, rmaxcnt = 0; i < mcount; i++) 2861 rmaxcnt += mrange[i]->mpr_count; 2862 2863 /* Allocate enough space to store the results */ 2864 rval = kmem_alloc(rmaxcnt * sz_range32, KM_SLEEP); 2865 2866 /* Number of boundaries are twice as many as ranges */ 2867 ta = kmem_alloc(2 * rmaxcnt * sizeof (rboundary_t), KM_SLEEP); 2868 2869 for (i = 0, m = 0; i < mcount; i++) { 2870 ur = &(mrange[i]->mpr_range_uint32[0]); 2871 for (j = 0; j < mrange[i]->mpr_count; j++) { 2872 ta[m].bval = ur[j].mpur_min; 2873 ta[m++].btype = 1; 2874 ta[m].bval = ur[j].mpur_max; 2875 ta[m++].btype = -1; 2876 } 2877 } 2878 2879 /* 2880 * Sort the temporary array in ascending order of bval; 2881 * if boundary values are same then sort on btype. 2882 */ 2883 for (i = 0; i < m-1; i++) { 2884 for (j = i+1; j < m; j++) { 2885 if ((ta[i].bval > ta[j].bval) || 2886 ((ta[i].bval == ta[j].bval) && 2887 (ta[i].btype < ta[j].btype))) { 2888 temp = ta[i]; 2889 ta[i] = ta[j]; 2890 ta[j] = temp; 2891 } 2892 } 2893 } 2894 2895 /* Walk through temporary array to find all ranges in the results */ 2896 for (i = 0, sum = 0, rcount = 0; i < m; i++) { 2897 sum += ta[i].btype; 2898 if (sum == mcount) { 2899 rval[rcount].mpur_min = ta[i].bval; 2900 range_started = B_TRUE; 2901 } else if (sum < mcount && range_started) { 2902 rval[rcount++].mpur_max = ta[i].bval; 2903 range_started = B_FALSE; 2904 } 2905 } 2906 2907 *prval = rval; 2908 *prmaxcnt = rmaxcnt; 2909 *prcount = rcount; 2910 2911 kmem_free(ta, 2 * rmaxcnt * sizeof (rboundary_t)); 2912 } 2913 2914 /* 2915 * Returns the mtu ranges which could be supported by aggr group. 2916 * prmaxcnt returns the size of the buffer prval, prcount returns 2917 * the number of valid entries in prval. Caller is responsible 2918 * for freeing up prval. 2919 */ 2920 int 2921 aggr_grp_possible_mtu_range(aggr_grp_t *grp, mac_propval_uint32_range_t **prval, 2922 int *prmaxcnt, int *prcount) 2923 { 2924 mac_propval_range_t **vals; 2925 aggr_port_t *port; 2926 mac_perim_handle_t mph; 2927 uint_t i, numr; 2928 int err = 0; 2929 size_t sz_propval, sz_range32; 2930 size_t size; 2931 2932 sz_propval = sizeof (mac_propval_range_t); 2933 sz_range32 = sizeof (mac_propval_uint32_range_t); 2934 2935 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 2936 2937 vals = kmem_zalloc(sizeof (mac_propval_range_t *) * grp->lg_nports, 2938 KM_SLEEP); 2939 2940 for (port = grp->lg_ports, i = 0; port != NULL; 2941 port = port->lp_next, i++) { 2942 2943 size = sz_propval; 2944 vals[i] = kmem_alloc(size, KM_SLEEP); 2945 vals[i]->mpr_count = 1; 2946 2947 mac_perim_enter_by_mh(port->lp_mh, &mph); 2948 2949 err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL, 2950 NULL, 0, vals[i], NULL); 2951 if (err == ENOSPC) { 2952 /* 2953 * Not enough space to hold all ranges. 2954 * Allocate extra space as indicated and retry. 2955 */ 2956 numr = vals[i]->mpr_count; 2957 kmem_free(vals[i], sz_propval); 2958 size = sz_propval + (numr - 1) * sz_range32; 2959 vals[i] = kmem_alloc(size, KM_SLEEP); 2960 vals[i]->mpr_count = numr; 2961 err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL, 2962 NULL, 0, vals[i], NULL); 2963 ASSERT(err != ENOSPC); 2964 } 2965 mac_perim_exit(mph); 2966 if (err != 0) { 2967 kmem_free(vals[i], size); 2968 vals[i] = NULL; 2969 break; 2970 } 2971 } 2972 2973 /* 2974 * if any of the underlying ports does not support changing MTU then 2975 * just return ENOTSUP 2976 */ 2977 if (port != NULL) { 2978 ASSERT(err != 0); 2979 goto done; 2980 } 2981 2982 aggr_mtu_range_intersection(vals, grp->lg_nports, prval, prmaxcnt, 2983 prcount); 2984 2985 done: 2986 for (i = 0; i < grp->lg_nports; i++) { 2987 if (vals[i] != NULL) { 2988 numr = vals[i]->mpr_count; 2989 size = sz_propval + (numr - 1) * sz_range32; 2990 kmem_free(vals[i], size); 2991 } 2992 } 2993 2994 kmem_free(vals, sizeof (mac_propval_range_t *) * grp->lg_nports); 2995 return (err); 2996 } 2997 2998 static void 2999 aggr_m_propinfo(void *m_driver, const char *pr_name, mac_prop_id_t pr_num, 3000 mac_prop_info_handle_t prh) 3001 { 3002 aggr_grp_t *grp = m_driver; 3003 mac_propval_uint32_range_t *rval = NULL; 3004 int i, rcount, rmaxcnt; 3005 int err = 0; 3006 3007 _NOTE(ARGUNUSED(pr_name)); 3008 3009 switch (pr_num) { 3010 case MAC_PROP_MTU: 3011 3012 err = aggr_grp_possible_mtu_range(grp, &rval, &rmaxcnt, 3013 &rcount); 3014 if (err != 0) { 3015 ASSERT(rval == NULL); 3016 return; 3017 } 3018 for (i = 0; i < rcount; i++) { 3019 mac_prop_info_set_range_uint32(prh, 3020 rval[i].mpur_min, rval[i].mpur_max); 3021 } 3022 kmem_free(rval, sizeof (mac_propval_uint32_range_t) * rmaxcnt); 3023 break; 3024 } 3025 } 3026