1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright 2020 Joyent, Inc. 24 */ 25 26 /* 27 * IEEE 802.3ad Link Aggregation -- Link Aggregation Groups. 28 * 29 * An instance of the structure aggr_grp_t is allocated for each 30 * link aggregation group. When created, aggr_grp_t objects are 31 * entered into the aggr_grp_hash hash table maintained by the modhash 32 * module. The hash key is the linkid associated with the link 33 * aggregation group. 34 * 35 * Each aggregation contains a set of ports. The port is represented 36 * by the aggr_port_t structure. A port consists of a single MAC 37 * client which has exclusive (MCIS_EXCLUSIVE) use of the underlying 38 * MAC. This client is used by the aggr to send and receive LACP 39 * traffic. Each port client takes on the same MAC unicast address -- 40 * the address of the aggregation itself (taken from the first port by 41 * default). 42 * 43 * The MAC client that hangs off each aggr port is not your typical 44 * MAC client. Not only does it have exclusive control of the MAC, but 45 * it also has no Tx or Rx SRSes. 
An SRS is designed to queue and
 * fanout traffic among L4 protocols; but the aggr is an intermediary,
 * not a consumer. Instead of using SRSes, the aggr puts the
 * underlying hardware rings into passthru mode and ships packets up
 * via a direct call to aggr_recv_cb(). This allows aggr to enforce
 * LACP while passing all other traffic up to clients of the aggr.
 *
 * Pseudo Rx Groups and Rings
 * --------------------------
 *
 * It is imperative for client performance that the aggr provide as
 * many MAC groups as possible. In order to use the underlying HW
 * resources, aggr creates pseudo groups to aggregate the underlying
 * HW groups. Every HW group gets mapped to a pseudo group; and every
 * HW ring in that group gets mapped to a pseudo ring. The pseudo
 * group at index 0 combines all the HW groups at index 0 from each
 * port, etc. The aggr's MAC then creates normal MAC groups and rings
 * out of these pseudo groups and rings to present to the aggr's
 * clients. To the clients, the aggr's groups and rings are absolutely
 * no different than a NIC's groups or rings.
 *
 * Pseudo Tx Rings
 * ---------------
 *
 * The underlying ports (NICs) in an aggregation can have Tx rings. To
 * enhance aggr's performance, these Tx rings are made available to
 * the aggr layer as pseudo Tx rings. The concept of pseudo rings is
 * not new. It is already present and implemented on the Rx side.
 * The same concept is extended to the Tx side where each Tx ring of
 * an underlying port is reflected in aggr as a pseudo Tx ring. Thus
 * each pseudo Tx ring will map to a specific hardware Tx ring. Even
 * in the case of a NIC that does not have a Tx ring, a pseudo Tx ring
 * is given to the aggregation layer.
78 * 79 * With this change, the outgoing stack depth looks much better: 80 * 81 * mac_tx() -> mac_tx_aggr_mode() -> mac_tx_soft_ring_process() -> 82 * mac_tx_send() -> aggr_ring_rx() -> <driver>_ring_tx() 83 * 84 * Two new modes are introduced to mac_tx() to handle aggr pseudo Tx rings: 85 * SRS_TX_AGGR and SRS_TX_BW_AGGR. 86 * 87 * In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. This routine 88 * invokes an aggr function, aggr_find_tx_ring(), to find a (pseudo) Tx 89 * ring belonging to a port on which the packet has to be sent. 90 * aggr_find_tx_ring() first finds the outgoing port based on L2/L3/L4 91 * policy and then uses the fanout_hint passed to it to pick a Tx ring from 92 * the selected port. 93 * 94 * In SRS_TX_BW_AGGR mode, mac_tx_bw_mode() function is called where 95 * bandwidth limit is applied first on the outgoing packet and the packets 96 * allowed to go out would call mac_tx_aggr_mode() to send the packet on a 97 * particular Tx ring. 98 */ 99 100 #include <sys/types.h> 101 #include <sys/sysmacros.h> 102 #include <sys/conf.h> 103 #include <sys/cmn_err.h> 104 #include <sys/disp.h> 105 #include <sys/list.h> 106 #include <sys/ksynch.h> 107 #include <sys/kmem.h> 108 #include <sys/stream.h> 109 #include <sys/modctl.h> 110 #include <sys/ddi.h> 111 #include <sys/sunddi.h> 112 #include <sys/atomic.h> 113 #include <sys/stat.h> 114 #include <sys/modhash.h> 115 #include <sys/id_space.h> 116 #include <sys/strsun.h> 117 #include <sys/cred.h> 118 #include <sys/dlpi.h> 119 #include <sys/zone.h> 120 #include <sys/mac_provider.h> 121 #include <sys/dls.h> 122 #include <sys/vlan.h> 123 #include <sys/aggr.h> 124 #include <sys/aggr_impl.h> 125 126 static int aggr_m_start(void *); 127 static void aggr_m_stop(void *); 128 static int aggr_m_promisc(void *, boolean_t); 129 static int aggr_m_multicst(void *, boolean_t, const uint8_t *); 130 static int aggr_m_unicst(void *, const uint8_t *); 131 static int aggr_m_stat(void *, uint_t, uint64_t *); 132 static void 
aggr_m_ioctl(void *, queue_t *, mblk_t *); 133 static boolean_t aggr_m_capab_get(void *, mac_capab_t, void *); 134 static int aggr_m_setprop(void *, const char *, mac_prop_id_t, uint_t, 135 const void *); 136 static void aggr_m_propinfo(void *, const char *, mac_prop_id_t, 137 mac_prop_info_handle_t); 138 139 static aggr_port_t *aggr_grp_port_lookup(aggr_grp_t *, datalink_id_t); 140 static int aggr_grp_rem_port(aggr_grp_t *, aggr_port_t *, boolean_t *, 141 boolean_t *); 142 143 static void aggr_grp_capab_set(aggr_grp_t *); 144 static boolean_t aggr_grp_capab_check(aggr_grp_t *, aggr_port_t *); 145 static uint_t aggr_grp_max_sdu(aggr_grp_t *); 146 static uint32_t aggr_grp_max_margin(aggr_grp_t *); 147 static boolean_t aggr_grp_sdu_check(aggr_grp_t *, aggr_port_t *); 148 static boolean_t aggr_grp_margin_check(aggr_grp_t *, aggr_port_t *); 149 150 static int aggr_add_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *); 151 static void aggr_rem_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *); 152 static int aggr_pseudo_disable_intr(mac_intr_handle_t); 153 static int aggr_pseudo_enable_intr(mac_intr_handle_t); 154 static int aggr_pseudo_start_rx_ring(mac_ring_driver_t, uint64_t); 155 static void aggr_pseudo_stop_rx_ring(mac_ring_driver_t); 156 static int aggr_addmac(void *, const uint8_t *); 157 static int aggr_remmac(void *, const uint8_t *); 158 static int aggr_addvlan(mac_group_driver_t, uint16_t); 159 static int aggr_remvlan(mac_group_driver_t, uint16_t); 160 static mblk_t *aggr_rx_poll(void *, int); 161 static void aggr_fill_ring(void *, mac_ring_type_t, const int, 162 const int, mac_ring_info_t *, mac_ring_handle_t); 163 static void aggr_fill_group(void *, mac_ring_type_t, const int, 164 mac_group_info_t *, mac_group_handle_t); 165 166 static kmem_cache_t *aggr_grp_cache; 167 static mod_hash_t *aggr_grp_hash; 168 static krwlock_t aggr_grp_lock; 169 static uint_t aggr_grp_cnt; 170 static id_space_t *key_ids; 171 172 #define GRP_HASHSZ 64 173 #define 
GRP_HASH_KEY(linkid) ((mod_hash_key_t)(uintptr_t)linkid) 174 #define AGGR_PORT_NAME_DELIMIT '-' 175 176 static uchar_t aggr_zero_mac[] = {0, 0, 0, 0, 0, 0}; 177 178 #define AGGR_M_CALLBACK_FLAGS \ 179 (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_PROPINFO) 180 181 static mac_callbacks_t aggr_m_callbacks = { 182 AGGR_M_CALLBACK_FLAGS, 183 aggr_m_stat, 184 aggr_m_start, 185 aggr_m_stop, 186 aggr_m_promisc, 187 aggr_m_multicst, 188 NULL, 189 NULL, 190 NULL, 191 aggr_m_ioctl, 192 aggr_m_capab_get, 193 NULL, 194 NULL, 195 aggr_m_setprop, 196 NULL, 197 aggr_m_propinfo 198 }; 199 200 /*ARGSUSED*/ 201 static int 202 aggr_grp_constructor(void *buf, void *arg, int kmflag) 203 { 204 aggr_grp_t *grp = buf; 205 206 bzero(grp, sizeof (*grp)); 207 mutex_init(&grp->lg_lacp_lock, NULL, MUTEX_DEFAULT, NULL); 208 cv_init(&grp->lg_lacp_cv, NULL, CV_DEFAULT, NULL); 209 rw_init(&grp->lg_tx_lock, NULL, RW_DRIVER, NULL); 210 mutex_init(&grp->lg_port_lock, NULL, MUTEX_DEFAULT, NULL); 211 cv_init(&grp->lg_port_cv, NULL, CV_DEFAULT, NULL); 212 mutex_init(&grp->lg_tx_flowctl_lock, NULL, MUTEX_DEFAULT, NULL); 213 cv_init(&grp->lg_tx_flowctl_cv, NULL, CV_DEFAULT, NULL); 214 grp->lg_link_state = LINK_STATE_UNKNOWN; 215 return (0); 216 } 217 218 /*ARGSUSED*/ 219 static void 220 aggr_grp_destructor(void *buf, void *arg) 221 { 222 aggr_grp_t *grp = buf; 223 224 if (grp->lg_tx_ports != NULL) { 225 kmem_free(grp->lg_tx_ports, 226 grp->lg_tx_ports_size * sizeof (aggr_port_t *)); 227 } 228 229 mutex_destroy(&grp->lg_lacp_lock); 230 cv_destroy(&grp->lg_lacp_cv); 231 mutex_destroy(&grp->lg_port_lock); 232 cv_destroy(&grp->lg_port_cv); 233 rw_destroy(&grp->lg_tx_lock); 234 mutex_destroy(&grp->lg_tx_flowctl_lock); 235 cv_destroy(&grp->lg_tx_flowctl_cv); 236 } 237 238 void 239 aggr_grp_init(void) 240 { 241 aggr_grp_cache = kmem_cache_create("aggr_grp_cache", 242 sizeof (aggr_grp_t), 0, aggr_grp_constructor, 243 aggr_grp_destructor, NULL, NULL, NULL, 0); 244 245 aggr_grp_hash = 
mod_hash_create_idhash("aggr_grp_hash", 246 GRP_HASHSZ, mod_hash_null_valdtor); 247 rw_init(&aggr_grp_lock, NULL, RW_DEFAULT, NULL); 248 aggr_grp_cnt = 0; 249 250 /* 251 * Allocate an id space to manage key values (when key is not 252 * specified). The range of the id space will be from 253 * (AGGR_MAX_KEY + 1) to UINT16_MAX, because the LACP protocol 254 * uses a 16-bit key. 255 */ 256 key_ids = id_space_create("aggr_key_ids", AGGR_MAX_KEY + 1, UINT16_MAX); 257 ASSERT(key_ids != NULL); 258 } 259 260 void 261 aggr_grp_fini(void) 262 { 263 id_space_destroy(key_ids); 264 rw_destroy(&aggr_grp_lock); 265 mod_hash_destroy_idhash(aggr_grp_hash); 266 kmem_cache_destroy(aggr_grp_cache); 267 } 268 269 uint_t 270 aggr_grp_count(void) 271 { 272 uint_t count; 273 274 rw_enter(&aggr_grp_lock, RW_READER); 275 count = aggr_grp_cnt; 276 rw_exit(&aggr_grp_lock); 277 return (count); 278 } 279 280 /* 281 * Since both aggr_port_notify_cb() and aggr_port_timer_thread() functions 282 * requires the mac perimeter, this function holds a reference of the aggr 283 * and aggr won't call mac_unregister() until this reference drops to 0. 284 */ 285 void 286 aggr_grp_port_hold(aggr_port_t *port) 287 { 288 aggr_grp_t *grp = port->lp_grp; 289 290 AGGR_PORT_REFHOLD(port); 291 mutex_enter(&grp->lg_port_lock); 292 grp->lg_port_ref++; 293 mutex_exit(&grp->lg_port_lock); 294 } 295 296 /* 297 * Release the reference of the grp and inform aggr_grp_delete() calling 298 * mac_unregister() is now safe. 299 */ 300 void 301 aggr_grp_port_rele(aggr_port_t *port) 302 { 303 aggr_grp_t *grp = port->lp_grp; 304 305 mutex_enter(&grp->lg_port_lock); 306 if (--grp->lg_port_ref == 0) 307 cv_signal(&grp->lg_port_cv); 308 mutex_exit(&grp->lg_port_lock); 309 AGGR_PORT_REFRELE(port); 310 } 311 312 /* 313 * Wait for the port's lacp timer thread and the port's notification callback 314 * to exit. 
315 */ 316 void 317 aggr_grp_port_wait(aggr_grp_t *grp) 318 { 319 mutex_enter(&grp->lg_port_lock); 320 if (grp->lg_port_ref != 0) 321 cv_wait(&grp->lg_port_cv, &grp->lg_port_lock); 322 mutex_exit(&grp->lg_port_lock); 323 } 324 325 /* 326 * Attach a port to a link aggregation group. 327 * 328 * A port is attached to a link aggregation group once its speed 329 * and link state have been verified. 330 * 331 * Returns B_TRUE if the group link state or speed has changed. If 332 * it's the case, the caller must notify the MAC layer via a call 333 * to mac_link(). 334 */ 335 boolean_t 336 aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port) 337 { 338 boolean_t link_state_changed = B_FALSE; 339 340 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 341 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 342 343 if (port->lp_state == AGGR_PORT_STATE_ATTACHED) 344 return (B_FALSE); 345 346 /* 347 * Validate the MAC port link speed and update the group 348 * link speed if needed. 349 */ 350 if (port->lp_ifspeed == 0 || 351 port->lp_link_state != LINK_STATE_UP || 352 port->lp_link_duplex != LINK_DUPLEX_FULL) { 353 /* 354 * Can't attach a MAC port with unknown link speed, 355 * down link, or not in full duplex mode. 356 */ 357 return (B_FALSE); 358 } 359 360 mutex_enter(&grp->lg_stat_lock); 361 if (grp->lg_ifspeed == 0) { 362 /* 363 * The group inherits the speed of the first link being 364 * attached. 365 */ 366 grp->lg_ifspeed = port->lp_ifspeed; 367 link_state_changed = B_TRUE; 368 } else if (grp->lg_ifspeed != port->lp_ifspeed) { 369 /* 370 * The link speed of the MAC port must be the same as 371 * the group link speed, as per 802.3ad. Since it is 372 * not, the attach is cancelled. 373 */ 374 mutex_exit(&grp->lg_stat_lock); 375 return (B_FALSE); 376 } 377 mutex_exit(&grp->lg_stat_lock); 378 379 grp->lg_nattached_ports++; 380 381 /* 382 * Update the group link state. 
383 */ 384 if (grp->lg_link_state != LINK_STATE_UP) { 385 grp->lg_link_state = LINK_STATE_UP; 386 mutex_enter(&grp->lg_stat_lock); 387 grp->lg_link_duplex = LINK_DUPLEX_FULL; 388 mutex_exit(&grp->lg_stat_lock); 389 link_state_changed = B_TRUE; 390 } 391 392 /* 393 * Update port's state. 394 */ 395 port->lp_state = AGGR_PORT_STATE_ATTACHED; 396 397 aggr_grp_multicst_port(port, B_TRUE); 398 399 /* 400 * The port client doesn't have an Rx SRS; instead of calling 401 * mac_rx_set() we set the client's flow callback directly. 402 * This datapath is used only when the port's driver doesn't 403 * support MAC_CAPAB_RINGS. Drivers with ring support will 404 * deliver traffic to the aggr via ring passthru. 405 */ 406 mac_client_set_flow_cb(port->lp_mch, aggr_recv_cb, port); 407 408 /* 409 * If LACP is OFF, the port can be used to send data as soon 410 * as its link is up and verified to be compatible with the 411 * aggregation. 412 * 413 * If LACP is active or passive, notify the LACP subsystem, which 414 * will enable sending on the port following the LACP protocol. 
415 */ 416 if (grp->lg_lacp_mode == AGGR_LACP_OFF) 417 aggr_send_port_enable(port); 418 else 419 aggr_lacp_port_attached(port); 420 421 return (link_state_changed); 422 } 423 424 boolean_t 425 aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port) 426 { 427 boolean_t link_state_changed = B_FALSE; 428 429 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 430 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 431 432 /* update state */ 433 if (port->lp_state != AGGR_PORT_STATE_ATTACHED) 434 return (B_FALSE); 435 436 mac_client_clear_flow_cb(port->lp_mch); 437 438 aggr_grp_multicst_port(port, B_FALSE); 439 440 if (grp->lg_lacp_mode == AGGR_LACP_OFF) 441 aggr_send_port_disable(port); 442 else 443 aggr_lacp_port_detached(port); 444 445 port->lp_state = AGGR_PORT_STATE_STANDBY; 446 447 grp->lg_nattached_ports--; 448 if (grp->lg_nattached_ports == 0) { 449 /* the last attached MAC port of the group is being detached */ 450 grp->lg_link_state = LINK_STATE_DOWN; 451 mutex_enter(&grp->lg_stat_lock); 452 grp->lg_ifspeed = 0; 453 grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN; 454 mutex_exit(&grp->lg_stat_lock); 455 link_state_changed = B_TRUE; 456 } 457 458 return (link_state_changed); 459 } 460 461 /* 462 * Update the MAC addresses of the constituent ports of the specified 463 * group. This function is invoked: 464 * - after creating a new aggregation group. 465 * - after adding new ports to an aggregation group. 466 * - after removing a port from a group when the MAC address of 467 * that port was used for the MAC address of the group. 468 * - after the MAC address of a port changed when the MAC address 469 * of that port was used for the MAC address of the group. 470 * 471 * Return true if the link state of the aggregation changed, for example 472 * as a result of a failure changing the MAC address of one of the 473 * constituent ports. 
474 */ 475 boolean_t 476 aggr_grp_update_ports_mac(aggr_grp_t *grp) 477 { 478 aggr_port_t *cport; 479 boolean_t link_state_changed = B_FALSE; 480 mac_perim_handle_t mph; 481 482 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 483 484 for (cport = grp->lg_ports; cport != NULL; 485 cport = cport->lp_next) { 486 mac_perim_enter_by_mh(cport->lp_mh, &mph); 487 if (aggr_port_unicst(cport) != 0) { 488 if (aggr_grp_detach_port(grp, cport)) 489 link_state_changed = B_TRUE; 490 } else { 491 /* 492 * If a port was detached because of a previous 493 * failure changing the MAC address, the port is 494 * reattached when it successfully changes the MAC 495 * address now, and this might cause the link state 496 * of the aggregation to change. 497 */ 498 if (aggr_grp_attach_port(grp, cport)) 499 link_state_changed = B_TRUE; 500 } 501 mac_perim_exit(mph); 502 } 503 return (link_state_changed); 504 } 505 506 /* 507 * Invoked when the MAC address of a port has changed. If the port's 508 * MAC address was used for the group MAC address, set mac_addr_changedp 509 * to B_TRUE to indicate to the caller that it should send a MAC_NOTE_UNICST 510 * notification. If the link state changes due to detach/attach of 511 * the constituent port, set link_state_changedp to B_TRUE to indicate 512 * to the caller that it should send a MAC_NOTE_LINK notification. In both 513 * cases, it is the responsibility of the caller to invoke notification 514 * functions after releasing the the port lock. 515 */ 516 void 517 aggr_grp_port_mac_changed(aggr_grp_t *grp, aggr_port_t *port, 518 boolean_t *mac_addr_changedp, boolean_t *link_state_changedp) 519 { 520 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 521 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 522 ASSERT(mac_addr_changedp != NULL); 523 ASSERT(link_state_changedp != NULL); 524 525 *mac_addr_changedp = B_FALSE; 526 *link_state_changedp = B_FALSE; 527 528 if (grp->lg_addr_fixed) { 529 /* 530 * The group is using a fixed MAC address or an automatic 531 * MAC address has not been set. 
532 */ 533 return; 534 } 535 536 if (grp->lg_mac_addr_port == port) { 537 /* 538 * The MAC address of the port was assigned to the group 539 * MAC address. Update the group MAC address. 540 */ 541 bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL); 542 *mac_addr_changedp = B_TRUE; 543 } else { 544 /* 545 * Update the actual port MAC address to the MAC address 546 * of the group. 547 */ 548 if (aggr_port_unicst(port) != 0) { 549 *link_state_changedp = aggr_grp_detach_port(grp, port); 550 } else { 551 /* 552 * If a port was detached because of a previous 553 * failure changing the MAC address, the port is 554 * reattached when it successfully changes the MAC 555 * address now, and this might cause the link state 556 * of the aggregation to change. 557 */ 558 *link_state_changedp = aggr_grp_attach_port(grp, port); 559 } 560 } 561 } 562 563 /* 564 * Add a port to a link aggregation group. 565 */ 566 static int 567 aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t port_linkid, boolean_t force, 568 aggr_port_t **pp) 569 { 570 aggr_port_t *port, **cport; 571 mac_perim_handle_t mph; 572 zoneid_t port_zoneid = ALL_ZONES; 573 int err; 574 575 /* The port must be in the same zone as the aggregation. */ 576 if (zone_check_datalink(&port_zoneid, port_linkid) != 0) 577 port_zoneid = GLOBAL_ZONEID; 578 if (grp->lg_zoneid != port_zoneid) 579 return (EBUSY); 580 581 /* 582 * If we are creating the aggr, then there is no MAC handle 583 * and thus no perimeter to hold. If we are adding a port to 584 * an existing aggr, then the perimiter of the aggr's MAC must 585 * be held. 586 */ 587 ASSERT(grp->lg_mh == NULL || MAC_PERIM_HELD(grp->lg_mh)); 588 589 err = aggr_port_create(grp, port_linkid, force, &port); 590 if (err != 0) 591 return (err); 592 593 mac_perim_enter_by_mh(port->lp_mh, &mph); 594 595 /* Add the new port to the end of the list. 
*/ 596 cport = &grp->lg_ports; 597 while (*cport != NULL) 598 cport = &((*cport)->lp_next); 599 *cport = port; 600 601 /* 602 * Back reference to the group it is member of. A port always 603 * holds a reference to its group to ensure that the back 604 * reference is always valid. 605 */ 606 port->lp_grp = grp; 607 AGGR_GRP_REFHOLD(grp); 608 grp->lg_nports++; 609 610 aggr_lacp_init_port(port); 611 mac_perim_exit(mph); 612 613 if (pp != NULL) 614 *pp = port; 615 616 return (0); 617 } 618 619 /* 620 * This is called in response to either our LACP state machine or a MAC 621 * notification that the link has gone down via aggr_send_port_disable(). At 622 * this point, we may need to update our default ring. To that end, we go 623 * through the set of ports (underlying datalinks in an aggregation) that are 624 * currently enabled to transmit data. If all our links have been disabled for 625 * transmit, then we don't do anything. 626 * 627 * Note, because we only have a single TX group, we don't have to worry about 628 * the rings moving between groups and the chance that mac will reassign it 629 * unless someone removes a port, at which point, we play it safe and call this 630 * again. 631 */ 632 void 633 aggr_grp_update_default(aggr_grp_t *grp) 634 { 635 aggr_port_t *port; 636 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 637 638 rw_enter(&grp->lg_tx_lock, RW_WRITER); 639 640 if (grp->lg_ntx_ports == 0) { 641 rw_exit(&grp->lg_tx_lock); 642 return; 643 } 644 645 port = grp->lg_tx_ports[0]; 646 ASSERT(port->lp_tx_ring_cnt > 0); 647 mac_hwring_set_default(grp->lg_mh, port->lp_pseudo_tx_rings[0]); 648 rw_exit(&grp->lg_tx_lock); 649 } 650 651 /* 652 * Add a pseudo RX ring for the given HW ring handle. 
653 */ 654 static int 655 aggr_add_pseudo_rx_ring(aggr_port_t *port, 656 aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh) 657 { 658 aggr_pseudo_rx_ring_t *ring; 659 int err; 660 int j; 661 662 for (j = 0; j < MAX_RINGS_PER_GROUP; j++) { 663 ring = rx_grp->arg_rings + j; 664 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE)) 665 break; 666 } 667 668 /* 669 * No slot for this new RX ring. 670 */ 671 if (j == MAX_RINGS_PER_GROUP) 672 return (EIO); 673 674 ring->arr_flags |= MAC_PSEUDO_RING_INUSE; 675 ring->arr_hw_rh = hw_rh; 676 ring->arr_port = port; 677 ring->arr_grp = rx_grp; 678 rx_grp->arg_ring_cnt++; 679 680 /* 681 * The group is already registered, dynamically add a new ring to the 682 * mac group. 683 */ 684 if ((err = mac_group_add_ring(rx_grp->arg_gh, j)) != 0) { 685 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE; 686 ring->arr_hw_rh = NULL; 687 ring->arr_port = NULL; 688 ring->arr_grp = NULL; 689 rx_grp->arg_ring_cnt--; 690 } else { 691 /* 692 * This must run after the MAC is registered. 693 */ 694 ASSERT3P(ring->arr_rh, !=, NULL); 695 mac_hwring_set_passthru(hw_rh, (mac_rx_t)aggr_recv_cb, 696 (void *)port, (mac_resource_handle_t)ring); 697 } 698 return (err); 699 } 700 701 /* 702 * Remove the pseudo RX ring of the given HW ring handle. 703 */ 704 static void 705 aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh) 706 { 707 for (uint_t j = 0; j < MAX_RINGS_PER_GROUP; j++) { 708 aggr_pseudo_rx_ring_t *ring = rx_grp->arg_rings + j; 709 710 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE) || 711 ring->arr_hw_rh != hw_rh) { 712 continue; 713 } 714 715 mac_group_rem_ring(rx_grp->arg_gh, ring->arr_rh); 716 717 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE; 718 ring->arr_hw_rh = NULL; 719 ring->arr_port = NULL; 720 ring->arr_grp = NULL; 721 rx_grp->arg_ring_cnt--; 722 mac_hwring_clear_passthru(hw_rh); 723 break; 724 } 725 } 726 727 /* 728 * Create pseudo rings over the HW rings of the port. 
729 * 730 * o Create a pseudo ring in rx_grp per HW ring in the port's HW group. 731 * 732 * o Program existing unicast filters on the pseudo group into the HW group. 733 * 734 * o Program existing VLAN filters on the pseudo group into the HW group. 735 */ 736 static int 737 aggr_add_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp) 738 { 739 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP]; 740 aggr_unicst_addr_t *addr, *a; 741 mac_perim_handle_t pmph; 742 aggr_vlan_t *avp; 743 uint_t hw_rh_cnt, i; 744 int err = 0; 745 uint_t g_idx = rx_grp->arg_index; 746 747 ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh)); 748 ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT); 749 mac_perim_enter_by_mh(port->lp_mh, &pmph); 750 751 /* 752 * This function must be called after the aggr registers its 753 * MAC and its Rx groups have been initialized. 754 */ 755 ASSERT(rx_grp->arg_gh != NULL); 756 757 /* 758 * Get the list of the underlying HW rings. 759 */ 760 hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx, 761 &port->lp_hwghs[g_idx], hw_rh, MAC_RING_TYPE_RX); 762 763 /* 764 * Add existing VLAN and unicast address filters to the port. 
765 */ 766 for (avp = list_head(&rx_grp->arg_vlans); avp != NULL; 767 avp = list_next(&rx_grp->arg_vlans, avp)) { 768 if ((err = aggr_port_addvlan(port, g_idx, avp->av_vid)) != 0) 769 goto err; 770 } 771 772 for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) { 773 if ((err = aggr_port_addmac(port, g_idx, addr->aua_addr)) != 0) 774 goto err; 775 } 776 777 for (i = 0; i < hw_rh_cnt; i++) { 778 err = aggr_add_pseudo_rx_ring(port, rx_grp, hw_rh[i]); 779 if (err != 0) 780 goto err; 781 } 782 783 mac_perim_exit(pmph); 784 return (0); 785 786 err: 787 ASSERT(err != 0); 788 789 for (uint_t j = 0; j < i; j++) 790 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]); 791 792 for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next) 793 aggr_port_remmac(port, g_idx, a->aua_addr); 794 795 if (avp != NULL) 796 avp = list_prev(&rx_grp->arg_vlans, avp); 797 798 for (; avp != NULL; avp = list_prev(&rx_grp->arg_vlans, avp)) { 799 int err2; 800 801 if ((err2 = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) { 802 cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s" 803 ": errno %d.", avp->av_vid, 804 mac_client_name(port->lp_mch), err2); 805 } 806 } 807 808 port->lp_hwghs[g_idx] = NULL; 809 mac_perim_exit(pmph); 810 return (err); 811 } 812 813 /* 814 * Destroy the pseudo rings mapping to this port and remove all VLAN 815 * and unicast filters from this port. Even if there are no underlying 816 * HW rings we must still remove the unicast filters to take the port 817 * out of promisc mode. 
818 */ 819 static void 820 aggr_rem_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp) 821 { 822 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP]; 823 aggr_unicst_addr_t *addr; 824 mac_perim_handle_t pmph; 825 uint_t hw_rh_cnt; 826 uint_t g_idx = rx_grp->arg_index; 827 828 ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh)); 829 ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT); 830 ASSERT3P(rx_grp->arg_gh, !=, NULL); 831 mac_perim_enter_by_mh(port->lp_mh, &pmph); 832 833 hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx, NULL, hw_rh, 834 MAC_RING_TYPE_RX); 835 836 for (uint_t i = 0; i < hw_rh_cnt; i++) 837 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[i]); 838 839 for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) 840 aggr_port_remmac(port, g_idx, addr->aua_addr); 841 842 for (aggr_vlan_t *avp = list_head(&rx_grp->arg_vlans); avp != NULL; 843 avp = list_next(&rx_grp->arg_vlans, avp)) { 844 int err; 845 846 if ((err = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) { 847 cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s" 848 ": errno %d.", avp->av_vid, 849 mac_client_name(port->lp_mch), err); 850 } 851 } 852 853 port->lp_hwghs[g_idx] = NULL; 854 mac_perim_exit(pmph); 855 } 856 857 /* 858 * Add a pseudo TX ring for the given HW ring handle. 859 */ 860 static int 861 aggr_add_pseudo_tx_ring(aggr_port_t *port, 862 aggr_pseudo_tx_group_t *tx_grp, mac_ring_handle_t hw_rh, 863 mac_ring_handle_t *pseudo_rh) 864 { 865 aggr_pseudo_tx_ring_t *ring; 866 int err; 867 int i; 868 869 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 870 for (i = 0; i < MAX_RINGS_PER_GROUP; i++) { 871 ring = tx_grp->atg_rings + i; 872 if (!(ring->atr_flags & MAC_PSEUDO_RING_INUSE)) 873 break; 874 } 875 /* 876 * No slot for this new TX ring. 877 */ 878 if (i == MAX_RINGS_PER_GROUP) 879 return (EIO); 880 /* 881 * The following 4 statements needs to be done before 882 * calling mac_group_add_ring(). Otherwise it will 883 * result in an assertion failure in mac_init_ring(). 
884 */ 885 ring->atr_flags |= MAC_PSEUDO_RING_INUSE; 886 ring->atr_hw_rh = hw_rh; 887 ring->atr_port = port; 888 tx_grp->atg_ring_cnt++; 889 890 /* 891 * The TX side has no concept of ring groups unlike RX groups. 892 * There is just a single group which stores all the TX rings. 893 * This group will be used to store aggr's pseudo TX rings. 894 */ 895 if ((err = mac_group_add_ring(tx_grp->atg_gh, i)) != 0) { 896 ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE; 897 ring->atr_hw_rh = NULL; 898 ring->atr_port = NULL; 899 tx_grp->atg_ring_cnt--; 900 } else { 901 *pseudo_rh = mac_find_ring(tx_grp->atg_gh, i); 902 if (hw_rh != NULL) { 903 mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring, 904 mac_find_ring(tx_grp->atg_gh, i)); 905 } 906 } 907 908 return (err); 909 } 910 911 /* 912 * Remove the pseudo TX ring of the given HW ring handle. 913 */ 914 static void 915 aggr_rem_pseudo_tx_ring(aggr_pseudo_tx_group_t *tx_grp, 916 mac_ring_handle_t pseudo_hw_rh) 917 { 918 aggr_pseudo_tx_ring_t *ring; 919 int i; 920 921 for (i = 0; i < MAX_RINGS_PER_GROUP; i++) { 922 ring = tx_grp->atg_rings + i; 923 if (ring->atr_rh != pseudo_hw_rh) 924 continue; 925 926 ASSERT(ring->atr_flags & MAC_PSEUDO_RING_INUSE); 927 mac_group_rem_ring(tx_grp->atg_gh, pseudo_hw_rh); 928 ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE; 929 mac_hwring_teardown(ring->atr_hw_rh); 930 ring->atr_hw_rh = NULL; 931 ring->atr_port = NULL; 932 tx_grp->atg_ring_cnt--; 933 break; 934 } 935 } 936 937 /* 938 * This function is called to create pseudo rings over hardware rings of 939 * the underlying device. There is a 1:1 mapping between the pseudo TX 940 * rings of the aggr and the hardware rings of the underlying port. 
941 */ 942 static int 943 aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp) 944 { 945 aggr_grp_t *grp = port->lp_grp; 946 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP], pseudo_rh; 947 mac_perim_handle_t pmph; 948 int hw_rh_cnt, i = 0, j; 949 int err = 0; 950 951 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 952 mac_perim_enter_by_mh(port->lp_mh, &pmph); 953 954 /* 955 * Get the list the the underlying HW rings. 956 */ 957 hw_rh_cnt = mac_hwrings_get(port->lp_mch, NULL, hw_rh, 958 MAC_RING_TYPE_TX); 959 960 /* 961 * Even if the underlying NIC does not have TX rings, we 962 * still make a psuedo TX ring for that NIC with NULL as 963 * the ring handle. 964 */ 965 if (hw_rh_cnt == 0) 966 port->lp_tx_ring_cnt = 1; 967 else 968 port->lp_tx_ring_cnt = hw_rh_cnt; 969 970 port->lp_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) * 971 port->lp_tx_ring_cnt), KM_SLEEP); 972 port->lp_pseudo_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) * 973 port->lp_tx_ring_cnt), KM_SLEEP); 974 975 if (hw_rh_cnt == 0) { 976 if ((err = aggr_add_pseudo_tx_ring(port, tx_grp, 977 NULL, &pseudo_rh)) == 0) { 978 port->lp_tx_rings[0] = NULL; 979 port->lp_pseudo_tx_rings[0] = pseudo_rh; 980 } 981 } else { 982 for (i = 0; err == 0 && i < hw_rh_cnt; i++) { 983 err = aggr_add_pseudo_tx_ring(port, 984 tx_grp, hw_rh[i], &pseudo_rh); 985 if (err != 0) 986 break; 987 port->lp_tx_rings[i] = hw_rh[i]; 988 port->lp_pseudo_tx_rings[i] = pseudo_rh; 989 } 990 } 991 992 if (err != 0) { 993 if (hw_rh_cnt != 0) { 994 for (j = 0; j < i; j++) { 995 aggr_rem_pseudo_tx_ring(tx_grp, 996 port->lp_pseudo_tx_rings[j]); 997 } 998 } 999 kmem_free(port->lp_tx_rings, 1000 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt)); 1001 kmem_free(port->lp_pseudo_tx_rings, 1002 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt)); 1003 port->lp_tx_ring_cnt = 0; 1004 } else { 1005 port->lp_tx_grp_added = B_TRUE; 1006 port->lp_tx_notify_mh = mac_client_tx_notify(port->lp_mch, 1007 aggr_tx_ring_update, port); 
1008 } 1009 mac_perim_exit(pmph); 1010 aggr_grp_update_default(grp); 1011 return (err); 1012 } 1013 1014 /* 1015 * This function is called by aggr to remove pseudo TX rings over the 1016 * HW rings of the underlying port. 1017 */ 1018 static void 1019 aggr_rem_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp) 1020 { 1021 aggr_grp_t *grp = port->lp_grp; 1022 mac_perim_handle_t pmph; 1023 int i; 1024 1025 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 1026 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1027 1028 if (!port->lp_tx_grp_added) 1029 goto done; 1030 1031 ASSERT(tx_grp->atg_gh != NULL); 1032 1033 for (i = 0; i < port->lp_tx_ring_cnt; i++) 1034 aggr_rem_pseudo_tx_ring(tx_grp, port->lp_pseudo_tx_rings[i]); 1035 1036 kmem_free(port->lp_tx_rings, 1037 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt)); 1038 kmem_free(port->lp_pseudo_tx_rings, 1039 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt)); 1040 1041 port->lp_tx_ring_cnt = 0; 1042 (void) mac_client_tx_notify(port->lp_mch, NULL, port->lp_tx_notify_mh); 1043 port->lp_tx_grp_added = B_FALSE; 1044 aggr_grp_update_default(grp); 1045 done: 1046 mac_perim_exit(pmph); 1047 } 1048 1049 static int 1050 aggr_pseudo_disable_intr(mac_intr_handle_t ih) 1051 { 1052 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih; 1053 return (mac_hwring_disable_intr(rr_ring->arr_hw_rh)); 1054 } 1055 1056 static int 1057 aggr_pseudo_enable_intr(mac_intr_handle_t ih) 1058 { 1059 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih; 1060 return (mac_hwring_enable_intr(rr_ring->arr_hw_rh)); 1061 } 1062 1063 /* 1064 * Start the pseudo ring. Since the pseudo ring is just an abstraction 1065 * over an actual HW ring, the real task is to start the underlying HW 1066 * ring. 
1067 */ 1068 static int 1069 aggr_pseudo_start_rx_ring(mac_ring_driver_t arg, uint64_t mr_gen) 1070 { 1071 int err; 1072 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg; 1073 1074 err = mac_hwring_start(rr_ring->arr_hw_rh); 1075 1076 if (err != 0) 1077 return (err); 1078 1079 rr_ring->arr_gen = mr_gen; 1080 return (err); 1081 } 1082 1083 /* 1084 * Stop the pseudo ring. Since the pseudo ring is just an abstraction 1085 * over an actual HW ring, the real task is to stop the underlying HW 1086 * ring. 1087 */ 1088 static void 1089 aggr_pseudo_stop_rx_ring(mac_ring_driver_t arg) 1090 { 1091 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg; 1092 1093 /* 1094 * The rings underlying the default group must stay up to 1095 * continue receiving LACP traffic. We would normally never 1096 * stop the default Rx rings because of the primary MAC 1097 * client; but aggr's primary MAC client doesn't call 1098 * mac_unicast_add() and thus mi_active is 0 when the last 1099 * non-primary client is deleted. 1100 */ 1101 if (rr_ring->arr_grp->arg_index != 0) 1102 mac_hwring_stop(rr_ring->arr_hw_rh); 1103 } 1104 1105 /* 1106 * Add one or more ports to an existing link aggregation group. 1107 */ 1108 int 1109 aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force, 1110 laioc_port_t *ports) 1111 { 1112 int rc; 1113 uint_t port_added = 0; 1114 uint_t grp_added; 1115 aggr_grp_t *grp = NULL; 1116 aggr_port_t *port; 1117 boolean_t link_state_changed = B_FALSE; 1118 mac_perim_handle_t mph, pmph; 1119 1120 /* Get the aggr corresponding to linkid. */ 1121 rw_enter(&aggr_grp_lock, RW_READER); 1122 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1123 (mod_hash_val_t *)&grp) != 0) { 1124 rw_exit(&aggr_grp_lock); 1125 return (ENOENT); 1126 } 1127 AGGR_GRP_REFHOLD(grp); 1128 1129 /* 1130 * Hold the perimeter so that the aggregation can't be destroyed. 
1131 */ 1132 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1133 rw_exit(&aggr_grp_lock); 1134 1135 /* Add the specified ports to the aggr. */ 1136 for (uint_t i = 0; i < nports; i++) { 1137 grp_added = 0; 1138 1139 if ((rc = aggr_grp_add_port(grp, ports[i].lp_linkid, 1140 force, &port)) != 0) { 1141 goto bail; 1142 } 1143 1144 ASSERT(port != NULL); 1145 port_added++; 1146 1147 /* check capabilities */ 1148 if (!aggr_grp_capab_check(grp, port) || 1149 !aggr_grp_sdu_check(grp, port) || 1150 !aggr_grp_margin_check(grp, port)) { 1151 rc = ENOTSUP; 1152 goto bail; 1153 } 1154 1155 /* 1156 * Create the pseudo ring for each HW ring of the underlying 1157 * port. 1158 */ 1159 rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group); 1160 if (rc != 0) 1161 goto bail; 1162 1163 for (uint_t j = 0; j < grp->lg_rx_group_count; j++) { 1164 rc = aggr_add_pseudo_rx_group(port, 1165 &grp->lg_rx_groups[j]); 1166 1167 if (rc != 0) 1168 goto bail; 1169 1170 grp_added++; 1171 } 1172 1173 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1174 1175 /* set LACP mode */ 1176 aggr_port_lacp_set_mode(grp, port); 1177 1178 /* start port if group has already been started */ 1179 if (grp->lg_started) { 1180 rc = aggr_port_start(port); 1181 if (rc != 0) { 1182 mac_perim_exit(pmph); 1183 goto bail; 1184 } 1185 1186 /* 1187 * Turn on the promiscuous mode over the port when it 1188 * is requested to be turned on to receive the 1189 * non-primary address over a port, or the promiscuous 1190 * mode is enabled over the aggr. 1191 */ 1192 if (grp->lg_promisc || port->lp_prom_addr != NULL) { 1193 rc = aggr_port_promisc(port, B_TRUE); 1194 if (rc != 0) { 1195 mac_perim_exit(pmph); 1196 goto bail; 1197 } 1198 } 1199 } 1200 mac_perim_exit(pmph); 1201 1202 /* 1203 * Attach each port if necessary. 1204 */ 1205 if (aggr_port_notify_link(grp, port)) 1206 link_state_changed = B_TRUE; 1207 1208 /* 1209 * Initialize the callback functions for this port. 
1210 */ 1211 aggr_port_init_callbacks(port); 1212 } 1213 1214 /* update the MAC address of the constituent ports */ 1215 if (aggr_grp_update_ports_mac(grp)) 1216 link_state_changed = B_TRUE; 1217 1218 if (link_state_changed) 1219 mac_link_update(grp->lg_mh, grp->lg_link_state); 1220 1221 bail: 1222 if (rc != 0) { 1223 /* stop and remove ports that have been added */ 1224 for (uint_t i = 0; i < port_added; i++) { 1225 uint_t grp_remove; 1226 1227 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid); 1228 ASSERT(port != NULL); 1229 1230 if (grp->lg_started) { 1231 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1232 (void) aggr_port_promisc(port, B_FALSE); 1233 aggr_port_stop(port); 1234 mac_perim_exit(pmph); 1235 } 1236 1237 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group); 1238 1239 /* 1240 * Only the last port could have a partial set 1241 * of groups added. 1242 */ 1243 grp_remove = (i + 1 == port_added) ? grp_added : 1244 grp->lg_rx_group_count; 1245 1246 for (uint_t j = 0; j < grp_remove; j++) { 1247 aggr_rem_pseudo_rx_group(port, 1248 &grp->lg_rx_groups[j]); 1249 } 1250 1251 (void) aggr_grp_rem_port(grp, port, NULL, NULL); 1252 } 1253 } 1254 1255 mac_perim_exit(mph); 1256 AGGR_GRP_REFRELE(grp); 1257 return (rc); 1258 } 1259 1260 static int 1261 aggr_grp_modify_common(aggr_grp_t *grp, uint8_t update_mask, uint32_t policy, 1262 boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, 1263 aggr_lacp_timer_t lacp_timer) 1264 { 1265 boolean_t mac_addr_changed = B_FALSE; 1266 boolean_t link_state_changed = B_FALSE; 1267 mac_perim_handle_t pmph; 1268 1269 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 1270 1271 /* validate fixed address if specified */ 1272 if ((update_mask & AGGR_MODIFY_MAC) && mac_fixed && 1273 ((bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) || 1274 (mac_addr[0] & 0x01))) { 1275 return (EINVAL); 1276 } 1277 1278 /* update policy if requested */ 1279 if (update_mask & AGGR_MODIFY_POLICY) 1280 aggr_send_update_policy(grp, policy); 1281 1282 /* 
update unicast MAC address if requested */ 1283 if (update_mask & AGGR_MODIFY_MAC) { 1284 if (mac_fixed) { 1285 /* user-supplied MAC address */ 1286 grp->lg_mac_addr_port = NULL; 1287 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) != 0) { 1288 bcopy(mac_addr, grp->lg_addr, ETHERADDRL); 1289 mac_addr_changed = B_TRUE; 1290 } 1291 } else if (grp->lg_addr_fixed) { 1292 /* switch from user-supplied to automatic */ 1293 aggr_port_t *port = grp->lg_ports; 1294 1295 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1296 bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL); 1297 grp->lg_mac_addr_port = port; 1298 mac_addr_changed = B_TRUE; 1299 mac_perim_exit(pmph); 1300 } 1301 grp->lg_addr_fixed = mac_fixed; 1302 } 1303 1304 if (mac_addr_changed) 1305 link_state_changed = aggr_grp_update_ports_mac(grp); 1306 1307 if (update_mask & AGGR_MODIFY_LACP_MODE) 1308 aggr_lacp_update_mode(grp, lacp_mode); 1309 1310 if (update_mask & AGGR_MODIFY_LACP_TIMER) 1311 aggr_lacp_update_timer(grp, lacp_timer); 1312 1313 if (link_state_changed) 1314 mac_link_update(grp->lg_mh, grp->lg_link_state); 1315 1316 if (mac_addr_changed) 1317 mac_unicst_update(grp->lg_mh, grp->lg_addr); 1318 1319 return (0); 1320 } 1321 1322 /* 1323 * Update properties of an existing link aggregation group. 1324 */ 1325 int 1326 aggr_grp_modify(datalink_id_t linkid, uint8_t update_mask, uint32_t policy, 1327 boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, 1328 aggr_lacp_timer_t lacp_timer) 1329 { 1330 aggr_grp_t *grp = NULL; 1331 mac_perim_handle_t mph; 1332 int err; 1333 1334 /* get group corresponding to linkid */ 1335 rw_enter(&aggr_grp_lock, RW_READER); 1336 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1337 (mod_hash_val_t *)&grp) != 0) { 1338 rw_exit(&aggr_grp_lock); 1339 return (ENOENT); 1340 } 1341 AGGR_GRP_REFHOLD(grp); 1342 1343 /* 1344 * Hold the perimeter so that the aggregation won't be destroyed. 
1345 */ 1346 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1347 rw_exit(&aggr_grp_lock); 1348 1349 err = aggr_grp_modify_common(grp, update_mask, policy, mac_fixed, 1350 mac_addr, lacp_mode, lacp_timer); 1351 1352 mac_perim_exit(mph); 1353 AGGR_GRP_REFRELE(grp); 1354 return (err); 1355 } 1356 1357 /* 1358 * Create a new link aggregation group upon request from administrator. 1359 * Returns 0 on success, an errno on failure. 1360 */ 1361 int 1362 aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports, 1363 laioc_port_t *ports, uint32_t policy, boolean_t mac_fixed, boolean_t force, 1364 uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, aggr_lacp_timer_t lacp_timer, 1365 cred_t *credp) 1366 { 1367 aggr_grp_t *grp = NULL; 1368 aggr_port_t *port; 1369 mac_register_t *mac; 1370 boolean_t link_state_changed; 1371 mac_perim_handle_t mph; 1372 int err; 1373 int i; 1374 kt_did_t tid = 0; 1375 1376 /* need at least one port */ 1377 if (nports == 0) 1378 return (EINVAL); 1379 1380 rw_enter(&aggr_grp_lock, RW_WRITER); 1381 1382 /* does a group with the same linkid already exist? 
*/ 1383 err = mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1384 (mod_hash_val_t *)&grp); 1385 if (err == 0) { 1386 rw_exit(&aggr_grp_lock); 1387 return (EEXIST); 1388 } 1389 1390 grp = kmem_cache_alloc(aggr_grp_cache, KM_SLEEP); 1391 1392 grp->lg_refs = 1; 1393 grp->lg_closing = B_FALSE; 1394 grp->lg_force = force; 1395 grp->lg_linkid = linkid; 1396 grp->lg_zoneid = crgetzoneid(credp); 1397 grp->lg_ifspeed = 0; 1398 grp->lg_link_state = LINK_STATE_UNKNOWN; 1399 grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN; 1400 grp->lg_started = B_FALSE; 1401 grp->lg_promisc = B_FALSE; 1402 grp->lg_lacp_done = B_FALSE; 1403 grp->lg_tx_notify_done = B_FALSE; 1404 grp->lg_lacp_head = grp->lg_lacp_tail = NULL; 1405 grp->lg_lacp_rx_thread = thread_create(NULL, 0, 1406 aggr_lacp_rx_thread, grp, 0, &p0, TS_RUN, minclsyspri); 1407 grp->lg_tx_notify_thread = thread_create(NULL, 0, 1408 aggr_tx_notify_thread, grp, 0, &p0, TS_RUN, minclsyspri); 1409 grp->lg_tx_blocked_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) * 1410 MAX_RINGS_PER_GROUP), KM_SLEEP); 1411 grp->lg_tx_blocked_cnt = 0; 1412 bzero(&grp->lg_rx_groups, 1413 sizeof (aggr_pseudo_rx_group_t) * MAX_GROUPS_PER_PORT); 1414 bzero(&grp->lg_tx_group, sizeof (aggr_pseudo_tx_group_t)); 1415 aggr_lacp_init_grp(grp); 1416 1417 /* add MAC ports to group */ 1418 grp->lg_ports = NULL; 1419 grp->lg_nports = 0; 1420 grp->lg_nattached_ports = 0; 1421 grp->lg_ntx_ports = 0; 1422 1423 /* 1424 * If key is not specified by the user, allocate the key. 
1425 */ 1426 if ((key == 0) && ((key = (uint32_t)id_alloc(key_ids)) == 0)) { 1427 err = ENOMEM; 1428 goto bail; 1429 } 1430 grp->lg_key = key; 1431 1432 for (i = 0; i < nports; i++) { 1433 err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, &port); 1434 if (err != 0) 1435 goto bail; 1436 } 1437 1438 grp->lg_rx_group_count = 1; 1439 1440 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 1441 uint_t num_rgroups; 1442 1443 mac_perim_enter_by_mh(port->lp_mh, &mph); 1444 num_rgroups = mac_get_num_rx_groups(port->lp_mh); 1445 mac_perim_exit(mph); 1446 1447 /* 1448 * Utilize all the groups in a port. If some ports 1449 * have less groups than others, then traffic destined 1450 * for the same unicast address may be HW classified 1451 * on some ports but SW classified by aggr when 1452 * arriving on other ports. 1453 */ 1454 grp->lg_rx_group_count = MAX(grp->lg_rx_group_count, 1455 num_rgroups); 1456 } 1457 1458 /* 1459 * There could be cases where the hardware provides more 1460 * groups than aggr can support. Make sure we never go above 1461 * the max aggr can support. 1462 */ 1463 grp->lg_rx_group_count = MIN(grp->lg_rx_group_count, 1464 MAX_GROUPS_PER_PORT); 1465 1466 ASSERT3U(grp->lg_rx_group_count, >, 0); 1467 for (i = 0; i < MAX_GROUPS_PER_PORT; i++) { 1468 grp->lg_rx_groups[i].arg_index = i; 1469 grp->lg_rx_groups[i].arg_untagged = 0; 1470 list_create(&(grp->lg_rx_groups[i].arg_vlans), 1471 sizeof (aggr_vlan_t), offsetof(aggr_vlan_t, av_link)); 1472 } 1473 1474 /* 1475 * If no explicit MAC address was specified by the administrator, 1476 * set it to the MAC address of the first port. 
1477 */ 1478 grp->lg_addr_fixed = mac_fixed; 1479 if (grp->lg_addr_fixed) { 1480 /* validate specified address */ 1481 if (bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) { 1482 err = EINVAL; 1483 goto bail; 1484 } 1485 bcopy(mac_addr, grp->lg_addr, ETHERADDRL); 1486 } else { 1487 bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL); 1488 grp->lg_mac_addr_port = grp->lg_ports; 1489 } 1490 1491 /* Set the initial group capabilities. */ 1492 aggr_grp_capab_set(grp); 1493 1494 if ((mac = mac_alloc(MAC_VERSION)) == NULL) { 1495 err = ENOMEM; 1496 goto bail; 1497 } 1498 mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER; 1499 mac->m_driver = grp; 1500 mac->m_dip = aggr_dip; 1501 mac->m_instance = grp->lg_key > AGGR_MAX_KEY ? (uint_t)-1 : grp->lg_key; 1502 mac->m_src_addr = grp->lg_addr; 1503 mac->m_callbacks = &aggr_m_callbacks; 1504 mac->m_min_sdu = 0; 1505 mac->m_max_sdu = grp->lg_max_sdu = aggr_grp_max_sdu(grp); 1506 mac->m_margin = aggr_grp_max_margin(grp); 1507 mac->m_v12n = MAC_VIRT_LEVEL1; 1508 err = mac_register(mac, &grp->lg_mh); 1509 mac_free(mac); 1510 if (err != 0) 1511 goto bail; 1512 1513 err = dls_devnet_create(grp->lg_mh, grp->lg_linkid, crgetzoneid(credp)); 1514 if (err != 0) { 1515 (void) mac_unregister(grp->lg_mh); 1516 grp->lg_mh = NULL; 1517 goto bail; 1518 } 1519 1520 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1521 1522 /* 1523 * Update the MAC address of the constituent ports. 1524 * None of the port is attached at this time, the link state of the 1525 * aggregation will not change. 1526 * 1527 * All ports take on the primary MAC address of the aggr 1528 * (lg_aggr). At this point, none of the ports are attached; 1529 * thus the link state of the aggregation will not change. 1530 */ 1531 link_state_changed = aggr_grp_update_ports_mac(grp); 1532 ASSERT(!link_state_changed); 1533 1534 /* Update outbound load balancing policy. */ 1535 aggr_send_update_policy(grp, policy); 1536 1537 /* Set LACP mode. 
*/ 1538 aggr_lacp_set_mode(grp, lacp_mode, lacp_timer); 1539 1540 /* 1541 * Attach each port if necessary. 1542 */ 1543 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 1544 /* 1545 * Create the pseudo ring for each HW ring of the 1546 * underlying port. Note that this is done after the 1547 * aggr registers its MAC. 1548 */ 1549 VERIFY3S(aggr_add_pseudo_tx_group(port, &grp->lg_tx_group), 1550 ==, 0); 1551 1552 for (i = 0; i < grp->lg_rx_group_count; i++) { 1553 VERIFY3S(aggr_add_pseudo_rx_group(port, 1554 &grp->lg_rx_groups[i]), ==, 0); 1555 } 1556 1557 if (aggr_port_notify_link(grp, port)) 1558 link_state_changed = B_TRUE; 1559 1560 /* 1561 * Initialize the callback functions for this port. 1562 */ 1563 aggr_port_init_callbacks(port); 1564 } 1565 1566 if (link_state_changed) 1567 mac_link_update(grp->lg_mh, grp->lg_link_state); 1568 1569 /* add new group to hash table */ 1570 err = mod_hash_insert(aggr_grp_hash, GRP_HASH_KEY(linkid), 1571 (mod_hash_val_t)grp); 1572 ASSERT(err == 0); 1573 aggr_grp_cnt++; 1574 1575 mac_perim_exit(mph); 1576 rw_exit(&aggr_grp_lock); 1577 return (0); 1578 1579 bail: 1580 1581 grp->lg_closing = B_TRUE; 1582 1583 port = grp->lg_ports; 1584 while (port != NULL) { 1585 aggr_port_t *cport; 1586 1587 cport = port->lp_next; 1588 aggr_port_delete(port); 1589 port = cport; 1590 } 1591 1592 /* 1593 * Inform the lacp_rx thread to exit. 1594 */ 1595 mutex_enter(&grp->lg_lacp_lock); 1596 grp->lg_lacp_done = B_TRUE; 1597 cv_signal(&grp->lg_lacp_cv); 1598 while (grp->lg_lacp_rx_thread != NULL) 1599 cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock); 1600 mutex_exit(&grp->lg_lacp_lock); 1601 /* 1602 * Inform the tx_notify thread to exit. 
1603 */ 1604 mutex_enter(&grp->lg_tx_flowctl_lock); 1605 if (grp->lg_tx_notify_thread != NULL) { 1606 tid = grp->lg_tx_notify_thread->t_did; 1607 grp->lg_tx_notify_done = B_TRUE; 1608 cv_signal(&grp->lg_tx_flowctl_cv); 1609 } 1610 mutex_exit(&grp->lg_tx_flowctl_lock); 1611 if (tid != 0) 1612 thread_join(tid); 1613 1614 kmem_free(grp->lg_tx_blocked_rings, 1615 (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP)); 1616 rw_exit(&aggr_grp_lock); 1617 AGGR_GRP_REFRELE(grp); 1618 return (err); 1619 } 1620 1621 /* 1622 * Return a pointer to the member of a group with specified linkid. 1623 */ 1624 static aggr_port_t * 1625 aggr_grp_port_lookup(aggr_grp_t *grp, datalink_id_t linkid) 1626 { 1627 aggr_port_t *port; 1628 1629 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 1630 1631 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 1632 if (port->lp_linkid == linkid) 1633 break; 1634 } 1635 1636 return (port); 1637 } 1638 1639 /* 1640 * Stop, detach and remove a port from a link aggregation group. 1641 */ 1642 static int 1643 aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port, 1644 boolean_t *mac_addr_changedp, boolean_t *link_state_changedp) 1645 { 1646 int rc = 0; 1647 aggr_port_t **pport; 1648 boolean_t mac_addr_changed = B_FALSE; 1649 boolean_t link_state_changed = B_FALSE; 1650 mac_perim_handle_t mph; 1651 uint64_t val; 1652 uint_t i; 1653 uint_t stat; 1654 1655 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 1656 ASSERT(grp->lg_nports > 1); 1657 ASSERT(!grp->lg_closing); 1658 1659 /* unlink port */ 1660 for (pport = &grp->lg_ports; *pport != port; 1661 pport = &(*pport)->lp_next) { 1662 if (*pport == NULL) { 1663 rc = ENOENT; 1664 goto done; 1665 } 1666 } 1667 *pport = port->lp_next; 1668 1669 mac_perim_enter_by_mh(port->lp_mh, &mph); 1670 1671 /* 1672 * If the MAC address of the port being removed was assigned 1673 * to the group, update the group MAC address 1674 * using the MAC address of a different port. 
1675 */ 1676 if (!grp->lg_addr_fixed && grp->lg_mac_addr_port == port) { 1677 /* 1678 * Set the MAC address of the group to the 1679 * MAC address of its first port. 1680 */ 1681 bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL); 1682 grp->lg_mac_addr_port = grp->lg_ports; 1683 mac_addr_changed = B_TRUE; 1684 } 1685 1686 link_state_changed = aggr_grp_detach_port(grp, port); 1687 1688 /* 1689 * Add the counter statistics of the ports while it was aggregated 1690 * to the group's residual statistics. This is done by obtaining 1691 * the current counter from the underlying MAC then subtracting the 1692 * value of the counter at the moment it was added to the 1693 * aggregation. 1694 */ 1695 for (i = 0; i < MAC_NSTAT; i++) { 1696 stat = i + MAC_STAT_MIN; 1697 if (!MAC_STAT_ISACOUNTER(stat)) 1698 continue; 1699 val = aggr_port_stat(port, stat); 1700 val -= port->lp_stat[i]; 1701 mutex_enter(&grp->lg_stat_lock); 1702 grp->lg_stat[i] += val; 1703 mutex_exit(&grp->lg_stat_lock); 1704 } 1705 for (i = 0; i < ETHER_NSTAT; i++) { 1706 stat = i + MACTYPE_STAT_MIN; 1707 if (!ETHER_STAT_ISACOUNTER(stat)) 1708 continue; 1709 val = aggr_port_stat(port, stat); 1710 val -= port->lp_ether_stat[i]; 1711 mutex_enter(&grp->lg_stat_lock); 1712 grp->lg_ether_stat[i] += val; 1713 mutex_exit(&grp->lg_stat_lock); 1714 } 1715 1716 grp->lg_nports--; 1717 mac_perim_exit(mph); 1718 1719 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group); 1720 aggr_port_delete(port); 1721 1722 /* 1723 * If the group MAC address has changed, update the MAC address of 1724 * the remaining constituent ports according to the new MAC 1725 * address of the group. 
1726 */ 1727 if (mac_addr_changed && aggr_grp_update_ports_mac(grp)) 1728 link_state_changed = B_TRUE; 1729 1730 done: 1731 if (mac_addr_changedp != NULL) 1732 *mac_addr_changedp = mac_addr_changed; 1733 if (link_state_changedp != NULL) 1734 *link_state_changedp = link_state_changed; 1735 1736 return (rc); 1737 } 1738 1739 /* 1740 * Remove one or more ports from an existing link aggregation group. 1741 */ 1742 int 1743 aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports) 1744 { 1745 int rc = 0, i; 1746 aggr_grp_t *grp = NULL; 1747 aggr_port_t *port; 1748 boolean_t mac_addr_update = B_FALSE, mac_addr_changed; 1749 boolean_t link_state_update = B_FALSE, link_state_changed; 1750 mac_perim_handle_t mph, pmph; 1751 1752 /* get group corresponding to linkid */ 1753 rw_enter(&aggr_grp_lock, RW_READER); 1754 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1755 (mod_hash_val_t *)&grp) != 0) { 1756 rw_exit(&aggr_grp_lock); 1757 return (ENOENT); 1758 } 1759 AGGR_GRP_REFHOLD(grp); 1760 1761 /* 1762 * Hold the perimeter so that the aggregation won't be destroyed. 
1763 */ 1764 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1765 rw_exit(&aggr_grp_lock); 1766 1767 /* we need to keep at least one port per group */ 1768 if (nports >= grp->lg_nports) { 1769 rc = EINVAL; 1770 goto bail; 1771 } 1772 1773 /* first verify that all the groups are valid */ 1774 for (i = 0; i < nports; i++) { 1775 if (aggr_grp_port_lookup(grp, ports[i].lp_linkid) == NULL) { 1776 /* port not found */ 1777 rc = ENOENT; 1778 goto bail; 1779 } 1780 } 1781 1782 /* clear the promiscous mode for the specified ports */ 1783 for (i = 0; i < nports && rc == 0; i++) { 1784 /* lookup port */ 1785 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid); 1786 ASSERT(port != NULL); 1787 1788 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1789 rc = aggr_port_promisc(port, B_FALSE); 1790 mac_perim_exit(pmph); 1791 } 1792 if (rc != 0) { 1793 for (i = 0; i < nports; i++) { 1794 port = aggr_grp_port_lookup(grp, 1795 ports[i].lp_linkid); 1796 ASSERT(port != NULL); 1797 1798 /* 1799 * Turn the promiscuous mode back on if it is required 1800 * to receive the non-primary address over a port, or 1801 * the promiscous mode is enabled over the aggr. 1802 */ 1803 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1804 if (port->lp_started && (grp->lg_promisc || 1805 port->lp_prom_addr != NULL)) { 1806 (void) aggr_port_promisc(port, B_TRUE); 1807 } 1808 mac_perim_exit(pmph); 1809 } 1810 goto bail; 1811 } 1812 1813 /* remove the specified ports from group */ 1814 for (i = 0; i < nports; i++) { 1815 /* lookup port */ 1816 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid); 1817 ASSERT(port != NULL); 1818 1819 /* stop port if group has already been started */ 1820 if (grp->lg_started) { 1821 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1822 aggr_port_stop(port); 1823 mac_perim_exit(pmph); 1824 } 1825 1826 /* 1827 * aggr_rem_pseudo_tx_group() is not called here. Instead 1828 * it is called from inside aggr_grp_rem_port() after the 1829 * port has been detached. 
The reason is that 1830 * aggr_rem_pseudo_tx_group() removes one ring at a time 1831 * and if there is still traffic going on, then there 1832 * is the possibility of aggr_find_tx_ring() returning a 1833 * removed ring for transmission. Once the port has been 1834 * detached, that port will not be used and 1835 * aggr_find_tx_ring() will not return any rings 1836 * belonging to it. 1837 */ 1838 for (i = 0; i < grp->lg_rx_group_count; i++) 1839 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[i]); 1840 1841 /* remove port from group */ 1842 rc = aggr_grp_rem_port(grp, port, &mac_addr_changed, 1843 &link_state_changed); 1844 ASSERT(rc == 0); 1845 mac_addr_update = mac_addr_update || mac_addr_changed; 1846 link_state_update = link_state_update || link_state_changed; 1847 } 1848 1849 bail: 1850 if (mac_addr_update) 1851 mac_unicst_update(grp->lg_mh, grp->lg_addr); 1852 if (link_state_update) 1853 mac_link_update(grp->lg_mh, grp->lg_link_state); 1854 1855 mac_perim_exit(mph); 1856 AGGR_GRP_REFRELE(grp); 1857 1858 return (rc); 1859 } 1860 1861 int 1862 aggr_grp_delete(datalink_id_t linkid, cred_t *cred) 1863 { 1864 aggr_grp_t *grp = NULL; 1865 aggr_port_t *port, *cport; 1866 datalink_id_t tmpid; 1867 mod_hash_val_t val; 1868 mac_perim_handle_t mph, pmph; 1869 int err; 1870 kt_did_t tid = 0; 1871 1872 rw_enter(&aggr_grp_lock, RW_WRITER); 1873 1874 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1875 (mod_hash_val_t *)&grp) != 0) { 1876 rw_exit(&aggr_grp_lock); 1877 return (ENOENT); 1878 } 1879 1880 /* 1881 * Note that dls_devnet_destroy() must be called before lg_lock is 1882 * held. Otherwise, it will deadlock if another thread is in 1883 * aggr_m_stat() and thus has a kstat_hold() on the kstats that 1884 * dls_devnet_destroy() needs to delete. 
1885 */ 1886 if ((err = dls_devnet_destroy(grp->lg_mh, &tmpid, B_TRUE)) != 0) { 1887 rw_exit(&aggr_grp_lock); 1888 return (err); 1889 } 1890 ASSERT(linkid == tmpid); 1891 1892 /* 1893 * Unregister from the MAC service module. Since this can 1894 * fail if a client hasn't closed the MAC port, we gracefully 1895 * fail the operation. 1896 */ 1897 if ((err = mac_disable(grp->lg_mh)) != 0) { 1898 (void) dls_devnet_create(grp->lg_mh, linkid, crgetzoneid(cred)); 1899 rw_exit(&aggr_grp_lock); 1900 return (err); 1901 } 1902 (void) mod_hash_remove(aggr_grp_hash, GRP_HASH_KEY(linkid), &val); 1903 ASSERT(grp == (aggr_grp_t *)val); 1904 1905 ASSERT(aggr_grp_cnt > 0); 1906 aggr_grp_cnt--; 1907 rw_exit(&aggr_grp_lock); 1908 1909 /* 1910 * Inform the lacp_rx thread to exit. 1911 */ 1912 mutex_enter(&grp->lg_lacp_lock); 1913 grp->lg_lacp_done = B_TRUE; 1914 cv_signal(&grp->lg_lacp_cv); 1915 while (grp->lg_lacp_rx_thread != NULL) 1916 cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock); 1917 mutex_exit(&grp->lg_lacp_lock); 1918 /* 1919 * Inform the tx_notify_thread to exit. 
1920 */ 1921 mutex_enter(&grp->lg_tx_flowctl_lock); 1922 if (grp->lg_tx_notify_thread != NULL) { 1923 tid = grp->lg_tx_notify_thread->t_did; 1924 grp->lg_tx_notify_done = B_TRUE; 1925 cv_signal(&grp->lg_tx_flowctl_cv); 1926 } 1927 mutex_exit(&grp->lg_tx_flowctl_lock); 1928 if (tid != 0) 1929 thread_join(tid); 1930 1931 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1932 1933 grp->lg_closing = B_TRUE; 1934 /* detach and free MAC ports associated with group */ 1935 port = grp->lg_ports; 1936 while (port != NULL) { 1937 cport = port->lp_next; 1938 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1939 if (grp->lg_started) 1940 aggr_port_stop(port); 1941 (void) aggr_grp_detach_port(grp, port); 1942 mac_perim_exit(pmph); 1943 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group); 1944 for (uint_t i = 0; i < grp->lg_rx_group_count; i++) 1945 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[i]); 1946 aggr_port_delete(port); 1947 port = cport; 1948 } 1949 1950 mac_perim_exit(mph); 1951 1952 kmem_free(grp->lg_tx_blocked_rings, 1953 (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP)); 1954 /* 1955 * Wait for the port's lacp timer thread and its notification callback 1956 * to exit before calling mac_unregister() since both needs to access 1957 * the mac perimeter of the grp. 
1958 */ 1959 aggr_grp_port_wait(grp); 1960 1961 VERIFY(mac_unregister(grp->lg_mh) == 0); 1962 grp->lg_mh = NULL; 1963 1964 for (uint_t i = 0; i < MAX_GROUPS_PER_PORT; i++) { 1965 list_destroy(&(grp->lg_rx_groups[i].arg_vlans)); 1966 } 1967 1968 AGGR_GRP_REFRELE(grp); 1969 return (0); 1970 } 1971 1972 void 1973 aggr_grp_free(aggr_grp_t *grp) 1974 { 1975 ASSERT(grp->lg_refs == 0); 1976 ASSERT(grp->lg_port_ref == 0); 1977 if (grp->lg_key > AGGR_MAX_KEY) { 1978 id_free(key_ids, grp->lg_key); 1979 grp->lg_key = 0; 1980 } 1981 kmem_cache_free(aggr_grp_cache, grp); 1982 } 1983 1984 int 1985 aggr_grp_info(datalink_id_t linkid, void *fn_arg, 1986 aggr_grp_info_new_grp_fn_t new_grp_fn, 1987 aggr_grp_info_new_port_fn_t new_port_fn, cred_t *cred) 1988 { 1989 aggr_grp_t *grp; 1990 aggr_port_t *port; 1991 mac_perim_handle_t mph, pmph; 1992 int rc = 0; 1993 1994 /* 1995 * Make sure that the aggregation link is visible from the caller's 1996 * zone. 1997 */ 1998 if (!dls_devnet_islinkvisible(linkid, crgetzoneid(cred))) 1999 return (ENOENT); 2000 2001 rw_enter(&aggr_grp_lock, RW_READER); 2002 2003 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 2004 (mod_hash_val_t *)&grp) != 0) { 2005 rw_exit(&aggr_grp_lock); 2006 return (ENOENT); 2007 } 2008 AGGR_GRP_REFHOLD(grp); 2009 2010 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2011 rw_exit(&aggr_grp_lock); 2012 2013 rc = new_grp_fn(fn_arg, grp->lg_linkid, 2014 (grp->lg_key > AGGR_MAX_KEY) ? 
0 : grp->lg_key, grp->lg_addr, 2015 grp->lg_addr_fixed, grp->lg_force, grp->lg_tx_policy, 2016 grp->lg_nports, grp->lg_lacp_mode, grp->aggr.PeriodicTimer); 2017 2018 if (rc != 0) 2019 goto bail; 2020 2021 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2022 mac_perim_enter_by_mh(port->lp_mh, &pmph); 2023 rc = new_port_fn(fn_arg, port->lp_linkid, port->lp_addr, 2024 port->lp_state, &port->lp_lacp.ActorOperPortState); 2025 mac_perim_exit(pmph); 2026 2027 if (rc != 0) 2028 goto bail; 2029 } 2030 2031 bail: 2032 mac_perim_exit(mph); 2033 AGGR_GRP_REFRELE(grp); 2034 return (rc); 2035 } 2036 2037 /*ARGSUSED*/ 2038 static void 2039 aggr_m_ioctl(void *arg, queue_t *q, mblk_t *mp) 2040 { 2041 miocnak(q, mp, 0, ENOTSUP); 2042 } 2043 2044 static int 2045 aggr_grp_stat(aggr_grp_t *grp, uint_t stat, uint64_t *val) 2046 { 2047 aggr_port_t *port; 2048 uint_t stat_index; 2049 2050 ASSERT(MUTEX_HELD(&grp->lg_stat_lock)); 2051 2052 /* We only aggregate counter statistics. */ 2053 if (IS_MAC_STAT(stat) && !MAC_STAT_ISACOUNTER(stat) || 2054 IS_MACTYPE_STAT(stat) && !ETHER_STAT_ISACOUNTER(stat)) { 2055 return (ENOTSUP); 2056 } 2057 2058 /* 2059 * Counter statistics for a group are computed by aggregating the 2060 * counters of the members MACs while they were aggregated, plus 2061 * the residual counter of the group itself, which is updated each 2062 * time a MAC is removed from the group. 2063 */ 2064 *val = 0; 2065 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2066 /* actual port statistic */ 2067 *val += aggr_port_stat(port, stat); 2068 /* 2069 * minus the port stat when it was added, plus any residual 2070 * amount for the group. 
2071 */ 2072 if (IS_MAC_STAT(stat)) { 2073 stat_index = stat - MAC_STAT_MIN; 2074 *val -= port->lp_stat[stat_index]; 2075 *val += grp->lg_stat[stat_index]; 2076 } else if (IS_MACTYPE_STAT(stat)) { 2077 stat_index = stat - MACTYPE_STAT_MIN; 2078 *val -= port->lp_ether_stat[stat_index]; 2079 *val += grp->lg_ether_stat[stat_index]; 2080 } 2081 } 2082 return (0); 2083 } 2084 2085 int 2086 aggr_rx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val) 2087 { 2088 aggr_pseudo_rx_ring_t *rx_ring = (aggr_pseudo_rx_ring_t *)rdriver; 2089 2090 if (rx_ring->arr_hw_rh != NULL) { 2091 *val = mac_pseudo_rx_ring_stat_get(rx_ring->arr_hw_rh, stat); 2092 } else { 2093 aggr_port_t *port = rx_ring->arr_port; 2094 2095 *val = mac_stat_get(port->lp_mh, stat); 2096 2097 } 2098 return (0); 2099 } 2100 2101 int 2102 aggr_tx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val) 2103 { 2104 aggr_pseudo_tx_ring_t *tx_ring = (aggr_pseudo_tx_ring_t *)rdriver; 2105 2106 if (tx_ring->atr_hw_rh != NULL) { 2107 *val = mac_pseudo_tx_ring_stat_get(tx_ring->atr_hw_rh, stat); 2108 } else { 2109 aggr_port_t *port = tx_ring->atr_port; 2110 2111 *val = mac_stat_get(port->lp_mh, stat); 2112 } 2113 return (0); 2114 } 2115 2116 static int 2117 aggr_m_stat(void *arg, uint_t stat, uint64_t *val) 2118 { 2119 aggr_grp_t *grp = arg; 2120 int rval = 0; 2121 2122 mutex_enter(&grp->lg_stat_lock); 2123 2124 switch (stat) { 2125 case MAC_STAT_IFSPEED: 2126 *val = grp->lg_ifspeed; 2127 break; 2128 2129 case ETHER_STAT_LINK_DUPLEX: 2130 *val = grp->lg_link_duplex; 2131 break; 2132 2133 default: 2134 /* 2135 * For all other statistics, we return the aggregated stat 2136 * from the underlying ports. aggr_grp_stat() will set 2137 * rval appropriately if the statistic isn't a counter. 
2138 */ 2139 rval = aggr_grp_stat(grp, stat, val); 2140 } 2141 2142 mutex_exit(&grp->lg_stat_lock); 2143 return (rval); 2144 } 2145 2146 static int 2147 aggr_m_start(void *arg) 2148 { 2149 aggr_grp_t *grp = arg; 2150 aggr_port_t *port; 2151 mac_perim_handle_t mph, pmph; 2152 2153 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2154 2155 /* 2156 * Attempts to start all configured members of the group. 2157 * Group members will be attached when their link-up notification 2158 * is received. 2159 */ 2160 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2161 mac_perim_enter_by_mh(port->lp_mh, &pmph); 2162 if (aggr_port_start(port) != 0) { 2163 mac_perim_exit(pmph); 2164 continue; 2165 } 2166 2167 /* 2168 * Turn on the promiscuous mode if it is required to receive 2169 * the non-primary address over a port, or the promiscous 2170 * mode is enabled over the aggr. 2171 */ 2172 if (grp->lg_promisc || port->lp_prom_addr != NULL) { 2173 if (aggr_port_promisc(port, B_TRUE) != 0) 2174 aggr_port_stop(port); 2175 } 2176 mac_perim_exit(pmph); 2177 } 2178 2179 grp->lg_started = B_TRUE; 2180 2181 mac_perim_exit(mph); 2182 return (0); 2183 } 2184 2185 static void 2186 aggr_m_stop(void *arg) 2187 { 2188 aggr_grp_t *grp = arg; 2189 aggr_port_t *port; 2190 mac_perim_handle_t mph, pmph; 2191 2192 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2193 2194 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2195 mac_perim_enter_by_mh(port->lp_mh, &pmph); 2196 2197 /* reset port promiscuous mode */ 2198 (void) aggr_port_promisc(port, B_FALSE); 2199 2200 aggr_port_stop(port); 2201 mac_perim_exit(pmph); 2202 } 2203 2204 grp->lg_started = B_FALSE; 2205 mac_perim_exit(mph); 2206 } 2207 2208 static int 2209 aggr_m_promisc(void *arg, boolean_t on) 2210 { 2211 aggr_grp_t *grp = arg; 2212 aggr_port_t *port; 2213 boolean_t link_state_changed = B_FALSE; 2214 mac_perim_handle_t mph, pmph; 2215 2216 AGGR_GRP_REFHOLD(grp); 2217 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2218 2219 
ASSERT(!grp->lg_closing); 2220 2221 if (on == grp->lg_promisc) 2222 goto bail; 2223 2224 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2225 int err = 0; 2226 2227 mac_perim_enter_by_mh(port->lp_mh, &pmph); 2228 AGGR_PORT_REFHOLD(port); 2229 if (!on && (port->lp_prom_addr == NULL)) 2230 err = aggr_port_promisc(port, B_FALSE); 2231 else if (on && port->lp_started) 2232 err = aggr_port_promisc(port, B_TRUE); 2233 2234 if (err != 0) { 2235 if (aggr_grp_detach_port(grp, port)) 2236 link_state_changed = B_TRUE; 2237 } else { 2238 /* 2239 * If a port was detached because of a previous 2240 * failure changing the promiscuity, the port 2241 * is reattached when it successfully changes 2242 * the promiscuity now, and this might cause 2243 * the link state of the aggregation to change. 2244 */ 2245 if (aggr_grp_attach_port(grp, port)) 2246 link_state_changed = B_TRUE; 2247 } 2248 mac_perim_exit(pmph); 2249 AGGR_PORT_REFRELE(port); 2250 } 2251 2252 grp->lg_promisc = on; 2253 2254 if (link_state_changed) 2255 mac_link_update(grp->lg_mh, grp->lg_link_state); 2256 2257 bail: 2258 mac_perim_exit(mph); 2259 AGGR_GRP_REFRELE(grp); 2260 2261 return (0); 2262 } 2263 2264 static void 2265 aggr_grp_port_rename(const char *new_name, void *arg) 2266 { 2267 /* 2268 * aggr port's mac client name is the format of "aggr link name" plus 2269 * AGGR_PORT_NAME_DELIMIT plus "underneath link name". 
2270 */ 2271 int aggr_len, link_len, clnt_name_len, i; 2272 char *str_end, *str_st, *str_del; 2273 char aggr_name[MAXNAMELEN]; 2274 char link_name[MAXNAMELEN]; 2275 char *clnt_name; 2276 aggr_grp_t *aggr_grp = arg; 2277 aggr_port_t *aggr_port = aggr_grp->lg_ports; 2278 2279 for (i = 0; i < aggr_grp->lg_nports; i++) { 2280 clnt_name = mac_client_name(aggr_port->lp_mch); 2281 clnt_name_len = strlen(clnt_name); 2282 str_st = clnt_name; 2283 str_end = &(clnt_name[clnt_name_len]); 2284 str_del = strchr(str_st, AGGR_PORT_NAME_DELIMIT); 2285 ASSERT(str_del != NULL); 2286 aggr_len = (intptr_t)((uintptr_t)str_del - (uintptr_t)str_st); 2287 link_len = (intptr_t)((uintptr_t)str_end - (uintptr_t)str_del); 2288 bzero(aggr_name, MAXNAMELEN); 2289 bzero(link_name, MAXNAMELEN); 2290 bcopy(clnt_name, aggr_name, aggr_len); 2291 bcopy(str_del, link_name, link_len + 1); 2292 bzero(clnt_name, MAXNAMELEN); 2293 (void) snprintf(clnt_name, MAXNAMELEN, "%s%s", new_name, 2294 link_name); 2295 2296 (void) mac_rename_primary(aggr_port->lp_mh, NULL); 2297 aggr_port = aggr_port->lp_next; 2298 } 2299 } 2300 2301 /* 2302 * Initialize the capabilities that are advertised for the group 2303 * according to the capabilities of the constituent ports. 
2304 */ 2305 static boolean_t 2306 aggr_m_capab_get(void *arg, mac_capab_t cap, void *cap_data) 2307 { 2308 aggr_grp_t *grp = arg; 2309 2310 switch (cap) { 2311 case MAC_CAPAB_HCKSUM: { 2312 uint32_t *hcksum_txflags = cap_data; 2313 *hcksum_txflags = grp->lg_hcksum_txflags; 2314 break; 2315 } 2316 case MAC_CAPAB_LSO: { 2317 mac_capab_lso_t *cap_lso = cap_data; 2318 2319 if (grp->lg_lso) { 2320 *cap_lso = grp->lg_cap_lso; 2321 break; 2322 } else { 2323 return (B_FALSE); 2324 } 2325 } 2326 case MAC_CAPAB_NO_NATIVEVLAN: 2327 return (!grp->lg_vlan); 2328 case MAC_CAPAB_NO_ZCOPY: 2329 return (!grp->lg_zcopy); 2330 case MAC_CAPAB_RINGS: { 2331 mac_capab_rings_t *cap_rings = cap_data; 2332 uint_t ring_cnt = 0; 2333 2334 for (uint_t i = 0; i < grp->lg_rx_group_count; i++) 2335 ring_cnt += grp->lg_rx_groups[i].arg_ring_cnt; 2336 2337 if (cap_rings->mr_type == MAC_RING_TYPE_RX) { 2338 cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; 2339 cap_rings->mr_rnum = ring_cnt; 2340 cap_rings->mr_gnum = grp->lg_rx_group_count; 2341 cap_rings->mr_gaddring = NULL; 2342 cap_rings->mr_gremring = NULL; 2343 } else { 2344 cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; 2345 cap_rings->mr_rnum = grp->lg_tx_group.atg_ring_cnt; 2346 cap_rings->mr_gnum = 0; 2347 } 2348 cap_rings->mr_rget = aggr_fill_ring; 2349 cap_rings->mr_gget = aggr_fill_group; 2350 break; 2351 } 2352 case MAC_CAPAB_AGGR: 2353 { 2354 mac_capab_aggr_t *aggr_cap; 2355 2356 if (cap_data != NULL) { 2357 aggr_cap = cap_data; 2358 aggr_cap->mca_rename_fn = aggr_grp_port_rename; 2359 aggr_cap->mca_unicst = aggr_m_unicst; 2360 aggr_cap->mca_find_tx_ring_fn = aggr_find_tx_ring; 2361 aggr_cap->mca_arg = arg; 2362 } 2363 return (B_TRUE); 2364 } 2365 default: 2366 return (B_FALSE); 2367 } 2368 return (B_TRUE); 2369 } 2370 2371 /* 2372 * Callback function for MAC layer to register groups. 
2373 */ 2374 static void 2375 aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index, 2376 mac_group_info_t *infop, mac_group_handle_t gh) 2377 { 2378 aggr_grp_t *grp = arg; 2379 2380 if (rtype == MAC_RING_TYPE_RX) { 2381 aggr_pseudo_rx_group_t *rx_group = &grp->lg_rx_groups[index]; 2382 2383 rx_group->arg_gh = gh; 2384 rx_group->arg_grp = grp; 2385 2386 infop->mgi_driver = (mac_group_driver_t)rx_group; 2387 infop->mgi_start = NULL; 2388 infop->mgi_stop = NULL; 2389 infop->mgi_addmac = aggr_addmac; 2390 infop->mgi_remmac = aggr_remmac; 2391 infop->mgi_count = rx_group->arg_ring_cnt; 2392 2393 /* 2394 * Always set the HW VLAN callbacks. They are smart 2395 * enough to know when a port has HW VLAN filters to 2396 * program and when it doesn't. 2397 */ 2398 infop->mgi_addvlan = aggr_addvlan; 2399 infop->mgi_remvlan = aggr_remvlan; 2400 } else { 2401 aggr_pseudo_tx_group_t *tx_group = &grp->lg_tx_group; 2402 2403 ASSERT3S(index, ==, 0); 2404 tx_group->atg_gh = gh; 2405 } 2406 } 2407 2408 /* 2409 * Callback funtion for MAC layer to register all rings. 2410 */ 2411 static void 2412 aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index, 2413 const int index, mac_ring_info_t *infop, mac_ring_handle_t rh) 2414 { 2415 aggr_grp_t *grp = arg; 2416 2417 switch (rtype) { 2418 case MAC_RING_TYPE_RX: { 2419 aggr_pseudo_rx_group_t *rx_group; 2420 aggr_pseudo_rx_ring_t *rx_ring; 2421 mac_intr_t aggr_mac_intr; 2422 2423 rx_group = &grp->lg_rx_groups[rg_index]; 2424 ASSERT3S(index, >=, 0); 2425 ASSERT3S(index, <, rx_group->arg_ring_cnt); 2426 rx_ring = rx_group->arg_rings + index; 2427 rx_ring->arr_rh = rh; 2428 2429 /* 2430 * Entrypoint to enable interrupt (disable poll) and 2431 * disable interrupt (enable poll). 
2432 */ 2433 aggr_mac_intr.mi_handle = (mac_intr_handle_t)rx_ring; 2434 aggr_mac_intr.mi_enable = aggr_pseudo_enable_intr; 2435 aggr_mac_intr.mi_disable = aggr_pseudo_disable_intr; 2436 aggr_mac_intr.mi_ddi_handle = NULL; 2437 2438 infop->mri_driver = (mac_ring_driver_t)rx_ring; 2439 infop->mri_start = aggr_pseudo_start_rx_ring; 2440 infop->mri_stop = aggr_pseudo_stop_rx_ring; 2441 2442 infop->mri_intr = aggr_mac_intr; 2443 infop->mri_poll = aggr_rx_poll; 2444 2445 infop->mri_stat = aggr_rx_ring_stat; 2446 break; 2447 } 2448 case MAC_RING_TYPE_TX: { 2449 aggr_pseudo_tx_group_t *tx_group = &grp->lg_tx_group; 2450 aggr_pseudo_tx_ring_t *tx_ring; 2451 2452 ASSERT(rg_index == -1); 2453 ASSERT(index < tx_group->atg_ring_cnt); 2454 2455 tx_ring = &tx_group->atg_rings[index]; 2456 tx_ring->atr_rh = rh; 2457 2458 infop->mri_driver = (mac_ring_driver_t)tx_ring; 2459 infop->mri_start = NULL; 2460 infop->mri_stop = NULL; 2461 infop->mri_tx = aggr_ring_tx; 2462 infop->mri_stat = aggr_tx_ring_stat; 2463 /* 2464 * Use the hw TX ring handle to find if the ring needs 2465 * serialization or not. For NICs that do not expose 2466 * Tx rings, atr_hw_rh will be NULL. 
2467 */ 2468 if (tx_ring->atr_hw_rh != NULL) { 2469 infop->mri_flags = 2470 mac_hwring_getinfo(tx_ring->atr_hw_rh); 2471 } 2472 break; 2473 } 2474 default: 2475 break; 2476 } 2477 } 2478 2479 static mblk_t * 2480 aggr_rx_poll(void *arg, int bytes_to_pickup) 2481 { 2482 aggr_pseudo_rx_ring_t *rr_ring = arg; 2483 aggr_port_t *port = rr_ring->arr_port; 2484 aggr_grp_t *grp = port->lp_grp; 2485 mblk_t *mp_chain, *mp, **mpp; 2486 2487 mp_chain = mac_hwring_poll(rr_ring->arr_hw_rh, bytes_to_pickup); 2488 2489 if (grp->lg_lacp_mode == AGGR_LACP_OFF) 2490 return (mp_chain); 2491 2492 mpp = &mp_chain; 2493 while ((mp = *mpp) != NULL) { 2494 if (MBLKL(mp) >= sizeof (struct ether_header)) { 2495 struct ether_header *ehp; 2496 2497 ehp = (struct ether_header *)mp->b_rptr; 2498 if (ntohs(ehp->ether_type) == ETHERTYPE_SLOW) { 2499 *mpp = mp->b_next; 2500 mp->b_next = NULL; 2501 aggr_recv_lacp(port, 2502 (mac_resource_handle_t)rr_ring, mp); 2503 continue; 2504 } 2505 } 2506 2507 if (!port->lp_collector_enabled) { 2508 *mpp = mp->b_next; 2509 mp->b_next = NULL; 2510 freemsg(mp); 2511 continue; 2512 } 2513 mpp = &mp->b_next; 2514 } 2515 return (mp_chain); 2516 } 2517 2518 static int 2519 aggr_addmac(void *arg, const uint8_t *mac_addr) 2520 { 2521 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg; 2522 aggr_unicst_addr_t *addr, **pprev; 2523 aggr_grp_t *grp = rx_group->arg_grp; 2524 aggr_port_t *port, *p; 2525 mac_perim_handle_t mph; 2526 int err = 0; 2527 uint_t idx = rx_group->arg_index; 2528 2529 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2530 2531 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) { 2532 mac_perim_exit(mph); 2533 return (0); 2534 } 2535 2536 /* 2537 * Insert this mac address into the list of mac addresses owned by 2538 * the aggregation pseudo group. 
2539 */ 2540 pprev = &rx_group->arg_macaddr; 2541 while ((addr = *pprev) != NULL) { 2542 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) == 0) { 2543 mac_perim_exit(mph); 2544 return (EEXIST); 2545 } 2546 pprev = &addr->aua_next; 2547 } 2548 addr = kmem_alloc(sizeof (aggr_unicst_addr_t), KM_SLEEP); 2549 bcopy(mac_addr, addr->aua_addr, ETHERADDRL); 2550 addr->aua_next = NULL; 2551 *pprev = addr; 2552 2553 for (port = grp->lg_ports; port != NULL; port = port->lp_next) 2554 if ((err = aggr_port_addmac(port, idx, mac_addr)) != 0) 2555 break; 2556 2557 if (err != 0) { 2558 for (p = grp->lg_ports; p != port; p = p->lp_next) 2559 aggr_port_remmac(p, idx, mac_addr); 2560 2561 *pprev = NULL; 2562 kmem_free(addr, sizeof (aggr_unicst_addr_t)); 2563 } 2564 2565 mac_perim_exit(mph); 2566 return (err); 2567 } 2568 2569 static int 2570 aggr_remmac(void *arg, const uint8_t *mac_addr) 2571 { 2572 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg; 2573 aggr_unicst_addr_t *addr, **pprev; 2574 aggr_grp_t *grp = rx_group->arg_grp; 2575 aggr_port_t *port; 2576 mac_perim_handle_t mph; 2577 int err = 0; 2578 2579 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2580 2581 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) { 2582 mac_perim_exit(mph); 2583 return (0); 2584 } 2585 2586 /* 2587 * Insert this mac address into the list of mac addresses owned by 2588 * the aggregation pseudo group. 
2589 */ 2590 pprev = &rx_group->arg_macaddr; 2591 while ((addr = *pprev) != NULL) { 2592 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) != 0) { 2593 pprev = &addr->aua_next; 2594 continue; 2595 } 2596 break; 2597 } 2598 if (addr == NULL) { 2599 mac_perim_exit(mph); 2600 return (EINVAL); 2601 } 2602 2603 for (port = grp->lg_ports; port != NULL; port = port->lp_next) 2604 aggr_port_remmac(port, rx_group->arg_index, mac_addr); 2605 2606 *pprev = addr->aua_next; 2607 kmem_free(addr, sizeof (aggr_unicst_addr_t)); 2608 2609 mac_perim_exit(mph); 2610 return (err); 2611 } 2612 2613 /* 2614 * Search for VID in the Rx group's list and return a pointer if 2615 * found. Otherwise return NULL. 2616 */ 2617 static aggr_vlan_t * 2618 aggr_find_vlan(aggr_pseudo_rx_group_t *rx_group, uint16_t vid) 2619 { 2620 ASSERT(MAC_PERIM_HELD(rx_group->arg_grp->lg_mh)); 2621 for (aggr_vlan_t *avp = list_head(&rx_group->arg_vlans); avp != NULL; 2622 avp = list_next(&rx_group->arg_vlans, avp)) { 2623 if (avp->av_vid == vid) 2624 return (avp); 2625 } 2626 2627 return (NULL); 2628 } 2629 2630 /* 2631 * Accept traffic on the specified VID. 2632 * 2633 * Persist VLAN state in the aggr so that ports added later will 2634 * receive the correct filters. In the future it would be nice to 2635 * allow aggr to iterate its clients instead of duplicating state. 2636 */ 2637 static int 2638 aggr_addvlan(mac_group_driver_t gdriver, uint16_t vid) 2639 { 2640 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)gdriver; 2641 aggr_grp_t *aggr = rx_group->arg_grp; 2642 aggr_port_t *port, *p; 2643 mac_perim_handle_t mph; 2644 int err = 0; 2645 aggr_vlan_t *avp = NULL; 2646 uint_t idx = rx_group->arg_index; 2647 2648 mac_perim_enter_by_mh(aggr->lg_mh, &mph); 2649 2650 if (vid == MAC_VLAN_UNTAGGED) { 2651 /* 2652 * Aggr is both a MAC provider and MAC client. As a 2653 * MAC provider it is passed MAC_VLAN_UNTAGGED by its 2654 * client. As a client itself, it should pass 2655 * VLAN_ID_NONE to its ports. 
2656 */ 2657 vid = VLAN_ID_NONE; 2658 rx_group->arg_untagged++; 2659 goto update_ports; 2660 } 2661 2662 avp = aggr_find_vlan(rx_group, vid); 2663 2664 if (avp != NULL) { 2665 avp->av_refs++; 2666 mac_perim_exit(mph); 2667 return (0); 2668 } 2669 2670 avp = kmem_zalloc(sizeof (aggr_vlan_t), KM_SLEEP); 2671 avp->av_vid = vid; 2672 avp->av_refs = 1; 2673 2674 update_ports: 2675 for (port = aggr->lg_ports; port != NULL; port = port->lp_next) 2676 if ((err = aggr_port_addvlan(port, idx, vid)) != 0) 2677 break; 2678 2679 if (err != 0) { 2680 /* 2681 * If any of these calls fail then we are in a 2682 * situation where the ports have different HW state. 2683 * There's no reasonable action the MAC client can 2684 * take in this scenario to rectify the situation. 2685 */ 2686 for (p = aggr->lg_ports; p != port; p = p->lp_next) { 2687 int err2; 2688 2689 if ((err2 = aggr_port_remvlan(p, idx, vid)) != 0) { 2690 cmn_err(CE_WARN, "Failed to remove VLAN %u" 2691 " from port %s: errno %d.", vid, 2692 mac_client_name(p->lp_mch), err2); 2693 } 2694 2695 } 2696 2697 if (vid == VLAN_ID_NONE) 2698 rx_group->arg_untagged--; 2699 2700 if (avp != NULL) { 2701 kmem_free(avp, sizeof (aggr_vlan_t)); 2702 avp = NULL; 2703 } 2704 } 2705 2706 if (avp != NULL) 2707 list_insert_tail(&rx_group->arg_vlans, avp); 2708 2709 done: 2710 mac_perim_exit(mph); 2711 return (err); 2712 } 2713 2714 /* 2715 * Stop accepting traffic on this VLAN if it's the last use of this VLAN. 2716 */ 2717 static int 2718 aggr_remvlan(mac_group_driver_t gdriver, uint16_t vid) 2719 { 2720 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)gdriver; 2721 aggr_grp_t *aggr = rx_group->arg_grp; 2722 aggr_port_t *port, *p; 2723 mac_perim_handle_t mph; 2724 int err = 0; 2725 aggr_vlan_t *avp = NULL; 2726 uint_t idx = rx_group->arg_index; 2727 2728 mac_perim_enter_by_mh(aggr->lg_mh, &mph); 2729 2730 /* 2731 * See the comment in aggr_addvlan(). 
2732 */ 2733 if (vid == MAC_VLAN_UNTAGGED) { 2734 vid = VLAN_ID_NONE; 2735 rx_group->arg_untagged--; 2736 2737 if (rx_group->arg_untagged > 0) 2738 goto done; 2739 2740 goto update_ports; 2741 } 2742 2743 avp = aggr_find_vlan(rx_group, vid); 2744 2745 if (avp == NULL) { 2746 err = ENOENT; 2747 goto done; 2748 } 2749 2750 avp->av_refs--; 2751 2752 if (avp->av_refs > 0) 2753 goto done; 2754 2755 update_ports: 2756 for (port = aggr->lg_ports; port != NULL; port = port->lp_next) 2757 if ((err = aggr_port_remvlan(port, idx, vid)) != 0) 2758 break; 2759 2760 /* 2761 * See the comment in aggr_addvlan() for justification of the 2762 * use of VERIFY here. 2763 */ 2764 if (err != 0) { 2765 for (p = aggr->lg_ports; p != port; p = p->lp_next) { 2766 int err2; 2767 2768 if ((err2 = aggr_port_addvlan(p, idx, vid)) != 0) { 2769 cmn_err(CE_WARN, "Failed to add VLAN %u" 2770 " to port %s: errno %d.", vid, 2771 mac_client_name(p->lp_mch), err2); 2772 } 2773 } 2774 2775 if (avp != NULL) 2776 avp->av_refs++; 2777 2778 if (vid == VLAN_ID_NONE) 2779 rx_group->arg_untagged++; 2780 2781 goto done; 2782 } 2783 2784 if (err == 0 && avp != NULL) { 2785 VERIFY3U(avp->av_refs, ==, 0); 2786 list_remove(&rx_group->arg_vlans, avp); 2787 kmem_free(avp, sizeof (aggr_vlan_t)); 2788 } 2789 2790 done: 2791 mac_perim_exit(mph); 2792 return (err); 2793 } 2794 2795 /* 2796 * Add or remove the multicast addresses that are defined for the group 2797 * to or from the specified port. 2798 * 2799 * Note that aggr_grp_multicst_port(..., B_TRUE) is called when the port 2800 * is started and attached, and aggr_grp_multicst_port(..., B_FALSE) is 2801 * called when the port is either stopped or detached. 
2802 */ 2803 void 2804 aggr_grp_multicst_port(aggr_port_t *port, boolean_t add) 2805 { 2806 aggr_grp_t *grp = port->lp_grp; 2807 2808 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 2809 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 2810 2811 if (!port->lp_started || port->lp_state != AGGR_PORT_STATE_ATTACHED) 2812 return; 2813 2814 mac_multicast_refresh(grp->lg_mh, aggr_port_multicst, port, add); 2815 } 2816 2817 static int 2818 aggr_m_multicst(void *arg, boolean_t add, const uint8_t *addrp) 2819 { 2820 aggr_grp_t *grp = arg; 2821 aggr_port_t *port = NULL, *errport = NULL; 2822 mac_perim_handle_t mph; 2823 int err = 0; 2824 2825 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2826 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2827 if (port->lp_state != AGGR_PORT_STATE_ATTACHED || 2828 !port->lp_started) { 2829 continue; 2830 } 2831 err = aggr_port_multicst(port, add, addrp); 2832 if (err != 0) { 2833 errport = port; 2834 break; 2835 } 2836 } 2837 2838 /* 2839 * At least one port caused error return and this error is returned to 2840 * mac, eventually a NAK would be sent upwards. 2841 * Some ports have this multicast address listed now, and some don't. 2842 * Treat this error as a whole aggr failure not individual port failure. 2843 * Therefore remove this multicast address from other ports. 
2844 */ 2845 if ((err != 0) && add) { 2846 for (port = grp->lg_ports; port != errport; 2847 port = port->lp_next) { 2848 if (port->lp_state != AGGR_PORT_STATE_ATTACHED || 2849 !port->lp_started) { 2850 continue; 2851 } 2852 (void) aggr_port_multicst(port, B_FALSE, addrp); 2853 } 2854 } 2855 mac_perim_exit(mph); 2856 return (err); 2857 } 2858 2859 static int 2860 aggr_m_unicst(void *arg, const uint8_t *macaddr) 2861 { 2862 aggr_grp_t *grp = arg; 2863 mac_perim_handle_t mph; 2864 int err; 2865 2866 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2867 err = aggr_grp_modify_common(grp, AGGR_MODIFY_MAC, 0, B_TRUE, macaddr, 2868 0, 0); 2869 mac_perim_exit(mph); 2870 return (err); 2871 } 2872 2873 /* 2874 * Initialize the capabilities that are advertised for the group 2875 * according to the capabilities of the constituent ports. 2876 */ 2877 static void 2878 aggr_grp_capab_set(aggr_grp_t *grp) 2879 { 2880 uint32_t cksum; 2881 aggr_port_t *port; 2882 mac_capab_lso_t cap_lso; 2883 2884 ASSERT(grp->lg_mh == NULL); 2885 ASSERT(grp->lg_ports != NULL); 2886 2887 grp->lg_hcksum_txflags = (uint32_t)-1; 2888 grp->lg_zcopy = B_TRUE; 2889 grp->lg_vlan = B_TRUE; 2890 2891 grp->lg_lso = B_TRUE; 2892 grp->lg_cap_lso.lso_flags = (t_uscalar_t)-1; 2893 grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max = (t_uscalar_t)-1; 2894 2895 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2896 if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &cksum)) 2897 cksum = 0; 2898 grp->lg_hcksum_txflags &= cksum; 2899 2900 grp->lg_vlan &= 2901 !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL); 2902 2903 grp->lg_zcopy &= 2904 !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL); 2905 2906 grp->lg_lso &= 2907 mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso); 2908 if (grp->lg_lso) { 2909 grp->lg_cap_lso.lso_flags &= cap_lso.lso_flags; 2910 if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max > 2911 cap_lso.lso_basic_tcp_ipv4.lso_max) 2912 grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max = 2913 
cap_lso.lso_basic_tcp_ipv4.lso_max; 2914 } 2915 } 2916 } 2917 2918 /* 2919 * Checks whether the capabilities of the port being added are compatible 2920 * with the current capabilities of the aggregation. 2921 */ 2922 static boolean_t 2923 aggr_grp_capab_check(aggr_grp_t *grp, aggr_port_t *port) 2924 { 2925 uint32_t hcksum_txflags; 2926 2927 ASSERT(grp->lg_ports != NULL); 2928 2929 if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL)) & 2930 grp->lg_vlan) != grp->lg_vlan) { 2931 return (B_FALSE); 2932 } 2933 2934 if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL)) & 2935 grp->lg_zcopy) != grp->lg_zcopy) { 2936 return (B_FALSE); 2937 } 2938 2939 if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &hcksum_txflags)) { 2940 if (grp->lg_hcksum_txflags != 0) 2941 return (B_FALSE); 2942 } else if ((hcksum_txflags & grp->lg_hcksum_txflags) != 2943 grp->lg_hcksum_txflags) { 2944 return (B_FALSE); 2945 } 2946 2947 if (grp->lg_lso) { 2948 mac_capab_lso_t cap_lso; 2949 2950 if (mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso)) { 2951 if ((grp->lg_cap_lso.lso_flags & cap_lso.lso_flags) != 2952 grp->lg_cap_lso.lso_flags) 2953 return (B_FALSE); 2954 if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max > 2955 cap_lso.lso_basic_tcp_ipv4.lso_max) 2956 return (B_FALSE); 2957 } else { 2958 return (B_FALSE); 2959 } 2960 } 2961 2962 return (B_TRUE); 2963 } 2964 2965 /* 2966 * Returns the maximum SDU according to the SDU of the constituent ports. 
2967 */ 2968 static uint_t 2969 aggr_grp_max_sdu(aggr_grp_t *grp) 2970 { 2971 uint_t max_sdu = (uint_t)-1; 2972 aggr_port_t *port; 2973 2974 ASSERT(grp->lg_ports != NULL); 2975 2976 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2977 uint_t port_sdu_max; 2978 2979 mac_sdu_get(port->lp_mh, NULL, &port_sdu_max); 2980 if (max_sdu > port_sdu_max) 2981 max_sdu = port_sdu_max; 2982 } 2983 2984 return (max_sdu); 2985 } 2986 2987 /* 2988 * Checks if the maximum SDU of the specified port is compatible 2989 * with the maximum SDU of the specified aggregation group, returns 2990 * B_TRUE if it is, B_FALSE otherwise. 2991 */ 2992 static boolean_t 2993 aggr_grp_sdu_check(aggr_grp_t *grp, aggr_port_t *port) 2994 { 2995 uint_t port_sdu_max; 2996 2997 mac_sdu_get(port->lp_mh, NULL, &port_sdu_max); 2998 return (port_sdu_max >= grp->lg_max_sdu); 2999 } 3000 3001 /* 3002 * Returns the maximum margin according to the margin of the constituent ports. 3003 */ 3004 static uint32_t 3005 aggr_grp_max_margin(aggr_grp_t *grp) 3006 { 3007 uint32_t margin = UINT32_MAX; 3008 aggr_port_t *port; 3009 3010 ASSERT(grp->lg_mh == NULL); 3011 ASSERT(grp->lg_ports != NULL); 3012 3013 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 3014 if (margin > port->lp_margin) 3015 margin = port->lp_margin; 3016 } 3017 3018 grp->lg_margin = margin; 3019 return (margin); 3020 } 3021 3022 /* 3023 * Checks if the maximum margin of the specified port is compatible 3024 * with the maximum margin of the specified aggregation group, returns 3025 * B_TRUE if it is, B_FALSE otherwise. 3026 */ 3027 static boolean_t 3028 aggr_grp_margin_check(aggr_grp_t *grp, aggr_port_t *port) 3029 { 3030 if (port->lp_margin >= grp->lg_margin) 3031 return (B_TRUE); 3032 3033 /* 3034 * See whether the current margin value is allowed to be changed to 3035 * the new value. 
3036 */ 3037 if (!mac_margin_update(grp->lg_mh, port->lp_margin)) 3038 return (B_FALSE); 3039 3040 grp->lg_margin = port->lp_margin; 3041 return (B_TRUE); 3042 } 3043 3044 /* 3045 * Set MTU on individual ports of an aggregation group 3046 */ 3047 static int 3048 aggr_set_port_sdu(aggr_grp_t *grp, aggr_port_t *port, uint32_t sdu, 3049 uint32_t *old_mtu) 3050 { 3051 boolean_t removed = B_FALSE; 3052 mac_perim_handle_t mph; 3053 mac_diag_t diag; 3054 int err, rv, retry = 0; 3055 3056 if (port->lp_mah != NULL) { 3057 (void) mac_unicast_remove(port->lp_mch, port->lp_mah); 3058 port->lp_mah = NULL; 3059 removed = B_TRUE; 3060 } 3061 err = mac_set_mtu(port->lp_mh, sdu, old_mtu); 3062 try_again: 3063 if (removed && (rv = mac_unicast_add(port->lp_mch, NULL, 3064 MAC_UNICAST_PRIMARY | MAC_UNICAST_DISABLE_TX_VID_CHECK, 3065 &port->lp_mah, 0, &diag)) != 0) { 3066 /* 3067 * following is a workaround for a bug in 'bge' driver. 3068 * See CR 6794654 for more information and this work around 3069 * will be removed once the CR is fixed. 3070 */ 3071 if (rv == EIO && retry++ < 3) { 3072 delay(2 * hz); 3073 goto try_again; 3074 } 3075 /* 3076 * if mac_unicast_add() failed while setting the MTU, 3077 * detach the port from the group. 3078 */ 3079 mac_perim_enter_by_mh(port->lp_mh, &mph); 3080 (void) aggr_grp_detach_port(grp, port); 3081 mac_perim_exit(mph); 3082 cmn_err(CE_WARN, "Unable to restart the port %s while " 3083 "setting MTU. 
Detaching the port from the aggregation.", 3084 mac_client_name(port->lp_mch)); 3085 } 3086 return (err); 3087 } 3088 3089 static int 3090 aggr_sdu_update(aggr_grp_t *grp, uint32_t sdu) 3091 { 3092 int err = 0, i, rv; 3093 aggr_port_t *port; 3094 uint32_t *mtu; 3095 3096 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 3097 3098 /* 3099 * If the MTU being set is equal to aggr group's maximum 3100 * allowable value, then there is nothing to change 3101 */ 3102 if (sdu == grp->lg_max_sdu) 3103 return (0); 3104 3105 /* 0 is aggr group's min sdu */ 3106 if (sdu == 0) 3107 return (EINVAL); 3108 3109 mtu = kmem_alloc(sizeof (uint32_t) * grp->lg_nports, KM_SLEEP); 3110 for (port = grp->lg_ports, i = 0; port != NULL && err == 0; 3111 port = port->lp_next, i++) { 3112 err = aggr_set_port_sdu(grp, port, sdu, mtu + i); 3113 } 3114 if (err != 0) { 3115 /* recover from error: reset the mtus of the ports */ 3116 aggr_port_t *tmp; 3117 3118 for (tmp = grp->lg_ports, i = 0; tmp != port; 3119 tmp = tmp->lp_next, i++) { 3120 (void) aggr_set_port_sdu(grp, tmp, *(mtu + i), NULL); 3121 } 3122 goto bail; 3123 } 3124 grp->lg_max_sdu = aggr_grp_max_sdu(grp); 3125 rv = mac_maxsdu_update(grp->lg_mh, grp->lg_max_sdu); 3126 ASSERT(rv == 0); 3127 bail: 3128 kmem_free(mtu, sizeof (uint32_t) * grp->lg_nports); 3129 return (err); 3130 } 3131 3132 /* 3133 * Callback functions for set/get of properties 3134 */ 3135 /*ARGSUSED*/ 3136 static int 3137 aggr_m_setprop(void *m_driver, const char *pr_name, mac_prop_id_t pr_num, 3138 uint_t pr_valsize, const void *pr_val) 3139 { 3140 int err = ENOTSUP; 3141 aggr_grp_t *grp = m_driver; 3142 3143 switch (pr_num) { 3144 case MAC_PROP_MTU: { 3145 uint32_t mtu; 3146 3147 if (pr_valsize < sizeof (mtu)) { 3148 err = EINVAL; 3149 break; 3150 } 3151 bcopy(pr_val, &mtu, sizeof (mtu)); 3152 err = aggr_sdu_update(grp, mtu); 3153 break; 3154 } 3155 default: 3156 break; 3157 } 3158 return (err); 3159 } 3160 3161 typedef struct rboundary { 3162 uint32_t bval; 3163 int btype; 3164 } 
rboundary_t; 3165 3166 /* 3167 * This function finds the intersection of mtu ranges stored in arrays - 3168 * mrange[0] ... mrange[mcount -1]. It returns the intersection in rval. 3169 * Individual arrays are assumed to contain non-overlapping ranges. 3170 * Algorithm: 3171 * A range has two boundaries - min and max. We scan all arrays and store 3172 * each boundary as a separate element in a temporary array. We also store 3173 * the boundary types, min or max, as +1 or -1 respectively in the temporary 3174 * array. Then we sort the temporary array in ascending order. We scan the 3175 * sorted array from lower to higher values and keep a cumulative sum of 3176 * boundary types. Element in the temporary array for which the sum reaches 3177 * mcount is a min boundary of a range in the result and next element will be 3178 * max boundary. 3179 * 3180 * Example for mcount = 3, 3181 * 3182 * ----|_________|-------|_______|----|__|------ mrange[0] 3183 * 3184 * -------|________|--|____________|-----|___|-- mrange[1] 3185 * 3186 * --------|________________|-------|____|------ mrange[2] 3187 * 3188 * 3 2 1 3189 * \|/ 3190 * 1 23 2 1 2 3 2 1 01 2 V 0 <- the sum 3191 * ----|--||-----|-|--|--|--|----|-||-|--|---|-- sorted array 3192 * 3193 * same min and max 3194 * V 3195 * --------|_____|-------|__|------------|------ intersecting ranges 3196 */ 3197 void 3198 aggr_mtu_range_intersection(mac_propval_range_t **mrange, int mcount, 3199 mac_propval_uint32_range_t **prval, int *prmaxcnt, int *prcount) 3200 { 3201 mac_propval_uint32_range_t *rval, *ur; 3202 int rmaxcnt, rcount; 3203 size_t sz_range32; 3204 rboundary_t *ta; /* temporary array */ 3205 rboundary_t temp; 3206 boolean_t range_started = B_FALSE; 3207 int i, j, m, sum; 3208 3209 sz_range32 = sizeof (mac_propval_uint32_range_t); 3210 3211 for (i = 0, rmaxcnt = 0; i < mcount; i++) 3212 rmaxcnt += mrange[i]->mpr_count; 3213 3214 /* Allocate enough space to store the results */ 3215 rval = kmem_alloc(rmaxcnt * sz_range32, 
KM_SLEEP); 3216 3217 /* Number of boundaries are twice as many as ranges */ 3218 ta = kmem_alloc(2 * rmaxcnt * sizeof (rboundary_t), KM_SLEEP); 3219 3220 for (i = 0, m = 0; i < mcount; i++) { 3221 ur = &(mrange[i]->mpr_range_uint32[0]); 3222 for (j = 0; j < mrange[i]->mpr_count; j++) { 3223 ta[m].bval = ur[j].mpur_min; 3224 ta[m++].btype = 1; 3225 ta[m].bval = ur[j].mpur_max; 3226 ta[m++].btype = -1; 3227 } 3228 } 3229 3230 /* 3231 * Sort the temporary array in ascending order of bval; 3232 * if boundary values are same then sort on btype. 3233 */ 3234 for (i = 0; i < m-1; i++) { 3235 for (j = i+1; j < m; j++) { 3236 if ((ta[i].bval > ta[j].bval) || 3237 ((ta[i].bval == ta[j].bval) && 3238 (ta[i].btype < ta[j].btype))) { 3239 temp = ta[i]; 3240 ta[i] = ta[j]; 3241 ta[j] = temp; 3242 } 3243 } 3244 } 3245 3246 /* Walk through temporary array to find all ranges in the results */ 3247 for (i = 0, sum = 0, rcount = 0; i < m; i++) { 3248 sum += ta[i].btype; 3249 if (sum == mcount) { 3250 rval[rcount].mpur_min = ta[i].bval; 3251 range_started = B_TRUE; 3252 } else if (sum < mcount && range_started) { 3253 rval[rcount++].mpur_max = ta[i].bval; 3254 range_started = B_FALSE; 3255 } 3256 } 3257 3258 *prval = rval; 3259 *prmaxcnt = rmaxcnt; 3260 *prcount = rcount; 3261 3262 kmem_free(ta, 2 * rmaxcnt * sizeof (rboundary_t)); 3263 } 3264 3265 /* 3266 * Returns the mtu ranges which could be supported by aggr group. 3267 * prmaxcnt returns the size of the buffer prval, prcount returns 3268 * the number of valid entries in prval. Caller is responsible 3269 * for freeing up prval. 
3270 */ 3271 int 3272 aggr_grp_possible_mtu_range(aggr_grp_t *grp, mac_propval_uint32_range_t **prval, 3273 int *prmaxcnt, int *prcount) 3274 { 3275 mac_propval_range_t **vals; 3276 aggr_port_t *port; 3277 mac_perim_handle_t mph; 3278 uint_t i, numr; 3279 int err = 0; 3280 size_t sz_propval, sz_range32; 3281 size_t size; 3282 3283 sz_propval = sizeof (mac_propval_range_t); 3284 sz_range32 = sizeof (mac_propval_uint32_range_t); 3285 3286 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 3287 3288 vals = kmem_zalloc(sizeof (mac_propval_range_t *) * grp->lg_nports, 3289 KM_SLEEP); 3290 3291 for (port = grp->lg_ports, i = 0; port != NULL; 3292 port = port->lp_next, i++) { 3293 3294 size = sz_propval; 3295 vals[i] = kmem_alloc(size, KM_SLEEP); 3296 vals[i]->mpr_count = 1; 3297 3298 mac_perim_enter_by_mh(port->lp_mh, &mph); 3299 3300 err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL, 3301 NULL, 0, vals[i], NULL); 3302 if (err == ENOSPC) { 3303 /* 3304 * Not enough space to hold all ranges. 3305 * Allocate extra space as indicated and retry. 
3306 */ 3307 numr = vals[i]->mpr_count; 3308 kmem_free(vals[i], sz_propval); 3309 size = sz_propval + (numr - 1) * sz_range32; 3310 vals[i] = kmem_alloc(size, KM_SLEEP); 3311 vals[i]->mpr_count = numr; 3312 err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL, 3313 NULL, 0, vals[i], NULL); 3314 ASSERT(err != ENOSPC); 3315 } 3316 mac_perim_exit(mph); 3317 if (err != 0) { 3318 kmem_free(vals[i], size); 3319 vals[i] = NULL; 3320 break; 3321 } 3322 } 3323 3324 /* 3325 * if any of the underlying ports does not support changing MTU then 3326 * just return ENOTSUP 3327 */ 3328 if (port != NULL) { 3329 ASSERT(err != 0); 3330 goto done; 3331 } 3332 3333 aggr_mtu_range_intersection(vals, grp->lg_nports, prval, prmaxcnt, 3334 prcount); 3335 3336 done: 3337 for (i = 0; i < grp->lg_nports; i++) { 3338 if (vals[i] != NULL) { 3339 numr = vals[i]->mpr_count; 3340 size = sz_propval + (numr - 1) * sz_range32; 3341 kmem_free(vals[i], size); 3342 } 3343 } 3344 3345 kmem_free(vals, sizeof (mac_propval_range_t *) * grp->lg_nports); 3346 return (err); 3347 } 3348 3349 static void 3350 aggr_m_propinfo(void *m_driver, const char *pr_name, mac_prop_id_t pr_num, 3351 mac_prop_info_handle_t prh) 3352 { 3353 aggr_grp_t *grp = m_driver; 3354 mac_propval_uint32_range_t *rval = NULL; 3355 int i, rcount, rmaxcnt; 3356 int err = 0; 3357 3358 _NOTE(ARGUNUSED(pr_name)); 3359 3360 switch (pr_num) { 3361 case MAC_PROP_MTU: 3362 3363 err = aggr_grp_possible_mtu_range(grp, &rval, &rmaxcnt, 3364 &rcount); 3365 if (err != 0) { 3366 ASSERT(rval == NULL); 3367 return; 3368 } 3369 for (i = 0; i < rcount; i++) { 3370 mac_prop_info_set_range_uint32(prh, 3371 rval[i].mpur_min, rval[i].mpur_max); 3372 } 3373 kmem_free(rval, sizeof (mac_propval_uint32_range_t) * rmaxcnt); 3374 break; 3375 } 3376 } 3377