1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright 2020 Joyent, Inc. 24 */ 25 26 /* 27 * IEEE 802.3ad Link Aggregation -- Link Aggregation Groups. 28 * 29 * An instance of the structure aggr_grp_t is allocated for each 30 * link aggregation group. When created, aggr_grp_t objects are 31 * entered into the aggr_grp_hash hash table maintained by the modhash 32 * module. The hash key is the linkid associated with the link 33 * aggregation group. 34 * 35 * Each aggregation contains a set of ports. The port is represented 36 * by the aggr_port_t structure. A port consists of a single MAC 37 * client which has exclusive (MCIS_EXCLUSIVE) use of the underlying 38 * MAC. This client is used by the aggr to send and receive LACP 39 * traffic. Each port client takes on the same MAC unicast address -- 40 * the address of the aggregation itself (taken from the first port by 41 * default). 42 * 43 * The MAC client that hangs off each aggr port is not your typical 44 * MAC client. Not only does it have exclusive control of the MAC, but 45 * it also has no Tx or Rx SRSes. 
 * An SRS is designed to queue and
 * fanout traffic among L4 protocols; but the aggr is an intermediary,
 * not a consumer. Instead of using SRSes, the aggr puts the
 * underlying hardware rings into passthru mode and ships packets up
 * via a direct call to aggr_recv_cb(). This allows aggr to enforce
 * LACP while passing all other traffic up to clients of the aggr.
 *
 * Pseudo Rx Groups and Rings
 * --------------------------
 *
 * It is imperative for client performance that the aggr provide as
 * many MAC groups as possible. In order to use the underlying HW
 * resources, aggr creates pseudo groups to aggregate the underlying
 * HW groups. Every HW group gets mapped to a pseudo group; and every
 * HW ring in that group gets mapped to a pseudo ring. The pseudo
 * group at index 0 combines all the HW groups at index 0 from each
 * port, etc. The aggr's MAC then creates normal MAC groups and rings
 * out of these pseudo groups and rings to present to the aggr's
 * clients. To the clients, the aggr's groups and rings are absolutely
 * no different than a NIC's groups or rings.
 *
 * Pseudo Tx Rings
 * ---------------
 *
 * The underlying ports (NICs) in an aggregation can have Tx rings. To
 * enhance aggr's performance, these Tx rings are made available to
 * the aggr layer as pseudo Tx rings. The concept of pseudo rings is
 * not new. They are already present and implemented on the Rx side.
 * The same concept is extended to the Tx side where each Tx ring of
 * an underlying port is reflected in aggr as a pseudo Tx ring. Thus
 * each pseudo Tx ring will map to a specific hardware Tx ring. Even
 * in the case of a NIC that does not have a Tx ring, a pseudo Tx ring
 * is given to the aggregation layer.
78 * 79 * With this change, the outgoing stack depth looks much better: 80 * 81 * mac_tx() -> mac_tx_aggr_mode() -> mac_tx_soft_ring_process() -> 82 * mac_tx_send() -> aggr_ring_rx() -> <driver>_ring_tx() 83 * 84 * Two new modes are introduced to mac_tx() to handle aggr pseudo Tx rings: 85 * SRS_TX_AGGR and SRS_TX_BW_AGGR. 86 * 87 * In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. This routine 88 * invokes an aggr function, aggr_find_tx_ring(), to find a (pseudo) Tx 89 * ring belonging to a port on which the packet has to be sent. 90 * aggr_find_tx_ring() first finds the outgoing port based on L2/L3/L4 91 * policy and then uses the fanout_hint passed to it to pick a Tx ring from 92 * the selected port. 93 * 94 * In SRS_TX_BW_AGGR mode, mac_tx_bw_mode() function is called where 95 * bandwidth limit is applied first on the outgoing packet and the packets 96 * allowed to go out would call mac_tx_aggr_mode() to send the packet on a 97 * particular Tx ring. 98 */ 99 100 #include <sys/types.h> 101 #include <sys/sysmacros.h> 102 #include <sys/conf.h> 103 #include <sys/cmn_err.h> 104 #include <sys/disp.h> 105 #include <sys/list.h> 106 #include <sys/ksynch.h> 107 #include <sys/kmem.h> 108 #include <sys/stream.h> 109 #include <sys/modctl.h> 110 #include <sys/ddi.h> 111 #include <sys/sunddi.h> 112 #include <sys/atomic.h> 113 #include <sys/stat.h> 114 #include <sys/modhash.h> 115 #include <sys/id_space.h> 116 #include <sys/strsun.h> 117 #include <sys/cred.h> 118 #include <sys/dlpi.h> 119 #include <sys/zone.h> 120 #include <sys/mac_provider.h> 121 #include <sys/dls.h> 122 #include <sys/vlan.h> 123 #include <sys/aggr.h> 124 #include <sys/aggr_impl.h> 125 126 static int aggr_m_start(void *); 127 static void aggr_m_stop(void *); 128 static int aggr_m_promisc(void *, boolean_t); 129 static int aggr_m_multicst(void *, boolean_t, const uint8_t *); 130 static int aggr_m_unicst(void *, const uint8_t *); 131 static int aggr_m_stat(void *, uint_t, uint64_t *); 132 static void 
aggr_m_ioctl(void *, queue_t *, mblk_t *); 133 static boolean_t aggr_m_capab_get(void *, mac_capab_t, void *); 134 static int aggr_m_setprop(void *, const char *, mac_prop_id_t, uint_t, 135 const void *); 136 static void aggr_m_propinfo(void *, const char *, mac_prop_id_t, 137 mac_prop_info_handle_t); 138 139 static aggr_port_t *aggr_grp_port_lookup(aggr_grp_t *, datalink_id_t); 140 static int aggr_grp_rem_port(aggr_grp_t *, aggr_port_t *, boolean_t *, 141 boolean_t *); 142 143 static void aggr_grp_capab_set(aggr_grp_t *); 144 static boolean_t aggr_grp_capab_check(aggr_grp_t *, aggr_port_t *); 145 static uint_t aggr_grp_max_sdu(aggr_grp_t *); 146 static uint32_t aggr_grp_max_margin(aggr_grp_t *); 147 static boolean_t aggr_grp_sdu_check(aggr_grp_t *, aggr_port_t *); 148 static boolean_t aggr_grp_margin_check(aggr_grp_t *, aggr_port_t *); 149 150 static int aggr_add_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *); 151 static void aggr_rem_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *); 152 static int aggr_pseudo_disable_intr(mac_intr_handle_t); 153 static int aggr_pseudo_enable_intr(mac_intr_handle_t); 154 static int aggr_pseudo_start_rx_ring(mac_ring_driver_t, uint64_t); 155 static void aggr_pseudo_stop_rx_ring(mac_ring_driver_t); 156 static int aggr_addmac(void *, const uint8_t *); 157 static int aggr_remmac(void *, const uint8_t *); 158 static int aggr_addvlan(mac_group_driver_t, uint16_t); 159 static int aggr_remvlan(mac_group_driver_t, uint16_t); 160 static mblk_t *aggr_rx_poll(void *, int); 161 static void aggr_fill_ring(void *, mac_ring_type_t, const int, 162 const int, mac_ring_info_t *, mac_ring_handle_t); 163 static void aggr_fill_group(void *, mac_ring_type_t, const int, 164 mac_group_info_t *, mac_group_handle_t); 165 166 static kmem_cache_t *aggr_grp_cache; 167 static mod_hash_t *aggr_grp_hash; 168 static krwlock_t aggr_grp_lock; 169 static uint_t aggr_grp_cnt; 170 static id_space_t *key_ids; 171 172 #define GRP_HASHSZ 64 173 #define 
GRP_HASH_KEY(linkid) ((mod_hash_key_t)(uintptr_t)linkid) 174 #define AGGR_PORT_NAME_DELIMIT '-' 175 176 static uchar_t aggr_zero_mac[] = {0, 0, 0, 0, 0, 0}; 177 178 #define AGGR_M_CALLBACK_FLAGS \ 179 (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_PROPINFO) 180 181 static mac_callbacks_t aggr_m_callbacks = { 182 AGGR_M_CALLBACK_FLAGS, 183 aggr_m_stat, 184 aggr_m_start, 185 aggr_m_stop, 186 aggr_m_promisc, 187 aggr_m_multicst, 188 NULL, 189 NULL, 190 NULL, 191 aggr_m_ioctl, 192 aggr_m_capab_get, 193 NULL, 194 NULL, 195 aggr_m_setprop, 196 NULL, 197 aggr_m_propinfo 198 }; 199 200 /*ARGSUSED*/ 201 static int 202 aggr_grp_constructor(void *buf, void *arg, int kmflag) 203 { 204 aggr_grp_t *grp = buf; 205 206 bzero(grp, sizeof (*grp)); 207 mutex_init(&grp->lg_lacp_lock, NULL, MUTEX_DEFAULT, NULL); 208 cv_init(&grp->lg_lacp_cv, NULL, CV_DEFAULT, NULL); 209 rw_init(&grp->lg_tx_lock, NULL, RW_DRIVER, NULL); 210 mutex_init(&grp->lg_port_lock, NULL, MUTEX_DEFAULT, NULL); 211 cv_init(&grp->lg_port_cv, NULL, CV_DEFAULT, NULL); 212 mutex_init(&grp->lg_tx_flowctl_lock, NULL, MUTEX_DEFAULT, NULL); 213 cv_init(&grp->lg_tx_flowctl_cv, NULL, CV_DEFAULT, NULL); 214 grp->lg_link_state = LINK_STATE_UNKNOWN; 215 return (0); 216 } 217 218 /*ARGSUSED*/ 219 static void 220 aggr_grp_destructor(void *buf, void *arg) 221 { 222 aggr_grp_t *grp = buf; 223 224 if (grp->lg_tx_ports != NULL) { 225 kmem_free(grp->lg_tx_ports, 226 grp->lg_tx_ports_size * sizeof (aggr_port_t *)); 227 } 228 229 mutex_destroy(&grp->lg_lacp_lock); 230 cv_destroy(&grp->lg_lacp_cv); 231 mutex_destroy(&grp->lg_port_lock); 232 cv_destroy(&grp->lg_port_cv); 233 rw_destroy(&grp->lg_tx_lock); 234 mutex_destroy(&grp->lg_tx_flowctl_lock); 235 cv_destroy(&grp->lg_tx_flowctl_cv); 236 } 237 238 void 239 aggr_grp_init(void) 240 { 241 aggr_grp_cache = kmem_cache_create("aggr_grp_cache", 242 sizeof (aggr_grp_t), 0, aggr_grp_constructor, 243 aggr_grp_destructor, NULL, NULL, NULL, 0); 244 245 aggr_grp_hash = 
mod_hash_create_idhash("aggr_grp_hash", 246 GRP_HASHSZ, mod_hash_null_valdtor); 247 rw_init(&aggr_grp_lock, NULL, RW_DEFAULT, NULL); 248 aggr_grp_cnt = 0; 249 250 /* 251 * Allocate an id space to manage key values (when key is not 252 * specified). The range of the id space will be from 253 * (AGGR_MAX_KEY + 1) to UINT16_MAX, because the LACP protocol 254 * uses a 16-bit key. 255 */ 256 key_ids = id_space_create("aggr_key_ids", AGGR_MAX_KEY + 1, UINT16_MAX); 257 ASSERT(key_ids != NULL); 258 } 259 260 void 261 aggr_grp_fini(void) 262 { 263 id_space_destroy(key_ids); 264 rw_destroy(&aggr_grp_lock); 265 mod_hash_destroy_idhash(aggr_grp_hash); 266 kmem_cache_destroy(aggr_grp_cache); 267 } 268 269 uint_t 270 aggr_grp_count(void) 271 { 272 uint_t count; 273 274 rw_enter(&aggr_grp_lock, RW_READER); 275 count = aggr_grp_cnt; 276 rw_exit(&aggr_grp_lock); 277 return (count); 278 } 279 280 /* 281 * Since both aggr_port_notify_cb() and aggr_port_timer_thread() functions 282 * requires the mac perimeter, this function holds a reference of the aggr 283 * and aggr won't call mac_unregister() until this reference drops to 0. 284 */ 285 void 286 aggr_grp_port_hold(aggr_port_t *port) 287 { 288 aggr_grp_t *grp = port->lp_grp; 289 290 AGGR_PORT_REFHOLD(port); 291 mutex_enter(&grp->lg_port_lock); 292 grp->lg_port_ref++; 293 mutex_exit(&grp->lg_port_lock); 294 } 295 296 /* 297 * Release the reference of the grp and inform aggr_grp_delete() calling 298 * mac_unregister() is now safe. 299 */ 300 void 301 aggr_grp_port_rele(aggr_port_t *port) 302 { 303 aggr_grp_t *grp = port->lp_grp; 304 305 mutex_enter(&grp->lg_port_lock); 306 if (--grp->lg_port_ref == 0) 307 cv_signal(&grp->lg_port_cv); 308 mutex_exit(&grp->lg_port_lock); 309 AGGR_PORT_REFRELE(port); 310 } 311 312 /* 313 * Wait for the port's lacp timer thread and the port's notification callback 314 * to exit. 
315 */ 316 void 317 aggr_grp_port_wait(aggr_grp_t *grp) 318 { 319 mutex_enter(&grp->lg_port_lock); 320 if (grp->lg_port_ref != 0) 321 cv_wait(&grp->lg_port_cv, &grp->lg_port_lock); 322 mutex_exit(&grp->lg_port_lock); 323 } 324 325 /* 326 * Attach a port to a link aggregation group. 327 * 328 * A port is attached to a link aggregation group once its speed 329 * and link state have been verified. 330 * 331 * Returns B_TRUE if the group link state or speed has changed. If 332 * it's the case, the caller must notify the MAC layer via a call 333 * to mac_link(). 334 */ 335 boolean_t 336 aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port) 337 { 338 boolean_t link_state_changed = B_FALSE; 339 340 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 341 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 342 343 if (port->lp_state == AGGR_PORT_STATE_ATTACHED) 344 return (B_FALSE); 345 346 /* 347 * Validate the MAC port link speed and update the group 348 * link speed if needed. 349 */ 350 if (port->lp_ifspeed == 0 || 351 port->lp_link_state != LINK_STATE_UP || 352 port->lp_link_duplex != LINK_DUPLEX_FULL) { 353 /* 354 * Can't attach a MAC port with unknown link speed, 355 * down link, or not in full duplex mode. 356 */ 357 return (B_FALSE); 358 } 359 360 mutex_enter(&grp->lg_stat_lock); 361 if (grp->lg_ifspeed == 0) { 362 /* 363 * The group inherits the speed of the first link being 364 * attached. 365 */ 366 grp->lg_ifspeed = port->lp_ifspeed; 367 link_state_changed = B_TRUE; 368 } else if (grp->lg_ifspeed != port->lp_ifspeed) { 369 /* 370 * The link speed of the MAC port must be the same as 371 * the group link speed, as per 802.3ad. Since it is 372 * not, the attach is cancelled. 373 */ 374 mutex_exit(&grp->lg_stat_lock); 375 return (B_FALSE); 376 } 377 mutex_exit(&grp->lg_stat_lock); 378 379 grp->lg_nattached_ports++; 380 381 /* 382 * Update the group link state. 
383 */ 384 if (grp->lg_link_state != LINK_STATE_UP) { 385 grp->lg_link_state = LINK_STATE_UP; 386 mutex_enter(&grp->lg_stat_lock); 387 grp->lg_link_duplex = LINK_DUPLEX_FULL; 388 mutex_exit(&grp->lg_stat_lock); 389 link_state_changed = B_TRUE; 390 } 391 392 /* 393 * Update port's state. 394 */ 395 port->lp_state = AGGR_PORT_STATE_ATTACHED; 396 397 aggr_grp_multicst_port(port, B_TRUE); 398 399 /* 400 * The port client doesn't have an Rx SRS; instead of calling 401 * mac_rx_set() we set the client's flow callback directly. 402 * This datapath is used only when the port's driver doesn't 403 * support MAC_CAPAB_RINGS. Drivers with ring support will 404 * deliver traffic to the aggr via ring passthru. 405 */ 406 mac_client_set_flow_cb(port->lp_mch, aggr_recv_cb, port); 407 408 /* 409 * If LACP is OFF, the port can be used to send data as soon 410 * as its link is up and verified to be compatible with the 411 * aggregation. 412 * 413 * If LACP is active or passive, notify the LACP subsystem, which 414 * will enable sending on the port following the LACP protocol. 
415 */ 416 if (grp->lg_lacp_mode == AGGR_LACP_OFF) 417 aggr_send_port_enable(port); 418 else 419 aggr_lacp_port_attached(port); 420 421 return (link_state_changed); 422 } 423 424 boolean_t 425 aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port) 426 { 427 boolean_t link_state_changed = B_FALSE; 428 429 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 430 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 431 432 /* update state */ 433 if (port->lp_state != AGGR_PORT_STATE_ATTACHED) 434 return (B_FALSE); 435 436 mac_client_clear_flow_cb(port->lp_mch); 437 438 aggr_grp_multicst_port(port, B_FALSE); 439 440 if (grp->lg_lacp_mode == AGGR_LACP_OFF) 441 aggr_send_port_disable(port); 442 else 443 aggr_lacp_port_detached(port); 444 445 port->lp_state = AGGR_PORT_STATE_STANDBY; 446 447 grp->lg_nattached_ports--; 448 if (grp->lg_nattached_ports == 0) { 449 /* the last attached MAC port of the group is being detached */ 450 grp->lg_link_state = LINK_STATE_DOWN; 451 mutex_enter(&grp->lg_stat_lock); 452 grp->lg_ifspeed = 0; 453 grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN; 454 mutex_exit(&grp->lg_stat_lock); 455 link_state_changed = B_TRUE; 456 } 457 458 return (link_state_changed); 459 } 460 461 /* 462 * Update the MAC addresses of the constituent ports of the specified 463 * group. This function is invoked: 464 * - after creating a new aggregation group. 465 * - after adding new ports to an aggregation group. 466 * - after removing a port from a group when the MAC address of 467 * that port was used for the MAC address of the group. 468 * - after the MAC address of a port changed when the MAC address 469 * of that port was used for the MAC address of the group. 470 * 471 * Return true if the link state of the aggregation changed, for example 472 * as a result of a failure changing the MAC address of one of the 473 * constituent ports. 
474 */ 475 boolean_t 476 aggr_grp_update_ports_mac(aggr_grp_t *grp) 477 { 478 aggr_port_t *cport; 479 boolean_t link_state_changed = B_FALSE; 480 mac_perim_handle_t mph; 481 482 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 483 484 for (cport = grp->lg_ports; cport != NULL; 485 cport = cport->lp_next) { 486 mac_perim_enter_by_mh(cport->lp_mh, &mph); 487 if (aggr_port_unicst(cport) != 0) { 488 if (aggr_grp_detach_port(grp, cport)) 489 link_state_changed = B_TRUE; 490 } else { 491 /* 492 * If a port was detached because of a previous 493 * failure changing the MAC address, the port is 494 * reattached when it successfully changes the MAC 495 * address now, and this might cause the link state 496 * of the aggregation to change. 497 */ 498 if (aggr_grp_attach_port(grp, cport)) 499 link_state_changed = B_TRUE; 500 } 501 mac_perim_exit(mph); 502 } 503 return (link_state_changed); 504 } 505 506 /* 507 * Invoked when the MAC address of a port has changed. If the port's 508 * MAC address was used for the group MAC address, set mac_addr_changedp 509 * to B_TRUE to indicate to the caller that it should send a MAC_NOTE_UNICST 510 * notification. If the link state changes due to detach/attach of 511 * the constituent port, set link_state_changedp to B_TRUE to indicate 512 * to the caller that it should send a MAC_NOTE_LINK notification. In both 513 * cases, it is the responsibility of the caller to invoke notification 514 * functions after releasing the the port lock. 515 */ 516 void 517 aggr_grp_port_mac_changed(aggr_grp_t *grp, aggr_port_t *port, 518 boolean_t *mac_addr_changedp, boolean_t *link_state_changedp) 519 { 520 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 521 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 522 ASSERT(mac_addr_changedp != NULL); 523 ASSERT(link_state_changedp != NULL); 524 525 *mac_addr_changedp = B_FALSE; 526 *link_state_changedp = B_FALSE; 527 528 if (grp->lg_addr_fixed) { 529 /* 530 * The group is using a fixed MAC address or an automatic 531 * MAC address has not been set. 
532 */ 533 return; 534 } 535 536 if (grp->lg_mac_addr_port == port) { 537 /* 538 * The MAC address of the port was assigned to the group 539 * MAC address. Update the group MAC address. 540 */ 541 bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL); 542 *mac_addr_changedp = B_TRUE; 543 } else { 544 /* 545 * Update the actual port MAC address to the MAC address 546 * of the group. 547 */ 548 if (aggr_port_unicst(port) != 0) { 549 *link_state_changedp = aggr_grp_detach_port(grp, port); 550 } else { 551 /* 552 * If a port was detached because of a previous 553 * failure changing the MAC address, the port is 554 * reattached when it successfully changes the MAC 555 * address now, and this might cause the link state 556 * of the aggregation to change. 557 */ 558 *link_state_changedp = aggr_grp_attach_port(grp, port); 559 } 560 } 561 } 562 563 /* 564 * Add a port to a link aggregation group. 565 */ 566 static int 567 aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t port_linkid, boolean_t force, 568 aggr_port_t **pp) 569 { 570 aggr_port_t *port, **cport; 571 mac_perim_handle_t mph; 572 zoneid_t port_zoneid = ALL_ZONES; 573 int err; 574 575 /* The port must be in the same zone as the aggregation. */ 576 if (zone_check_datalink(&port_zoneid, port_linkid) != 0) 577 port_zoneid = GLOBAL_ZONEID; 578 if (grp->lg_zoneid != port_zoneid) 579 return (EBUSY); 580 581 /* 582 * If we are creating the aggr, then there is no MAC handle 583 * and thus no perimeter to hold. If we are adding a port to 584 * an existing aggr, then the perimiter of the aggr's MAC must 585 * be held. 586 */ 587 ASSERT(grp->lg_mh == NULL || MAC_PERIM_HELD(grp->lg_mh)); 588 589 err = aggr_port_create(grp, port_linkid, force, &port); 590 if (err != 0) 591 return (err); 592 593 mac_perim_enter_by_mh(port->lp_mh, &mph); 594 595 /* Add the new port to the end of the list. 
*/ 596 cport = &grp->lg_ports; 597 while (*cport != NULL) 598 cport = &((*cport)->lp_next); 599 *cport = port; 600 601 /* 602 * Back reference to the group it is member of. A port always 603 * holds a reference to its group to ensure that the back 604 * reference is always valid. 605 */ 606 port->lp_grp = grp; 607 AGGR_GRP_REFHOLD(grp); 608 grp->lg_nports++; 609 610 aggr_lacp_init_port(port); 611 mac_perim_exit(mph); 612 613 if (pp != NULL) 614 *pp = port; 615 616 return (0); 617 } 618 619 /* 620 * This is called in response to either our LACP state machine or a MAC 621 * notification that the link has gone down via aggr_send_port_disable(). At 622 * this point, we may need to update our default ring. To that end, we go 623 * through the set of ports (underlying datalinks in an aggregation) that are 624 * currently enabled to transmit data. If all our links have been disabled for 625 * transmit, then we don't do anything. 626 * 627 * Note, because we only have a single TX group, we don't have to worry about 628 * the rings moving between groups and the chance that mac will reassign it 629 * unless someone removes a port, at which point, we play it safe and call this 630 * again. 631 */ 632 void 633 aggr_grp_update_default(aggr_grp_t *grp) 634 { 635 aggr_port_t *port; 636 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 637 638 rw_enter(&grp->lg_tx_lock, RW_WRITER); 639 640 if (grp->lg_ntx_ports == 0) { 641 rw_exit(&grp->lg_tx_lock); 642 return; 643 } 644 645 port = grp->lg_tx_ports[0]; 646 ASSERT(port->lp_tx_ring_cnt > 0); 647 mac_hwring_set_default(grp->lg_mh, port->lp_pseudo_tx_rings[0]); 648 rw_exit(&grp->lg_tx_lock); 649 } 650 651 /* 652 * Add a pseudo RX ring for the given HW ring handle. 
653 */ 654 static int 655 aggr_add_pseudo_rx_ring(aggr_port_t *port, 656 aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh) 657 { 658 aggr_pseudo_rx_ring_t *ring; 659 int err; 660 int j; 661 662 for (j = 0; j < MAX_RINGS_PER_GROUP; j++) { 663 ring = rx_grp->arg_rings + j; 664 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE)) 665 break; 666 } 667 668 /* 669 * No slot for this new RX ring. 670 */ 671 if (j == MAX_RINGS_PER_GROUP) 672 return (EIO); 673 674 ring->arr_flags |= MAC_PSEUDO_RING_INUSE; 675 ring->arr_hw_rh = hw_rh; 676 ring->arr_port = port; 677 ring->arr_grp = rx_grp; 678 rx_grp->arg_ring_cnt++; 679 680 /* 681 * The group is already registered, dynamically add a new ring to the 682 * mac group. 683 */ 684 if ((err = mac_group_add_ring(rx_grp->arg_gh, j)) != 0) { 685 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE; 686 ring->arr_hw_rh = NULL; 687 ring->arr_port = NULL; 688 ring->arr_grp = NULL; 689 rx_grp->arg_ring_cnt--; 690 } else { 691 /* 692 * This must run after the MAC is registered. 693 */ 694 ASSERT3P(ring->arr_rh, !=, NULL); 695 mac_hwring_set_passthru(hw_rh, (mac_rx_t)aggr_recv_cb, 696 (void *)port, (mac_resource_handle_t)ring); 697 } 698 return (err); 699 } 700 701 /* 702 * Remove the pseudo RX ring of the given HW ring handle. 703 */ 704 static void 705 aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh) 706 { 707 for (uint_t j = 0; j < MAX_RINGS_PER_GROUP; j++) { 708 aggr_pseudo_rx_ring_t *ring = rx_grp->arg_rings + j; 709 710 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE) || 711 ring->arr_hw_rh != hw_rh) { 712 continue; 713 } 714 715 mac_group_rem_ring(rx_grp->arg_gh, ring->arr_rh); 716 717 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE; 718 ring->arr_hw_rh = NULL; 719 ring->arr_port = NULL; 720 ring->arr_grp = NULL; 721 rx_grp->arg_ring_cnt--; 722 mac_hwring_clear_passthru(hw_rh); 723 break; 724 } 725 } 726 727 /* 728 * Create pseudo rings over the HW rings of the port. 
729 * 730 * o Create a pseudo ring in rx_grp per HW ring in the port's HW group. 731 * 732 * o Program existing unicast filters on the pseudo group into the HW group. 733 * 734 * o Program existing VLAN filters on the pseudo group into the HW group. 735 */ 736 static int 737 aggr_add_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp) 738 { 739 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP]; 740 aggr_unicst_addr_t *addr, *a; 741 mac_perim_handle_t pmph; 742 aggr_vlan_t *avp; 743 uint_t hw_rh_cnt, i; 744 int err = 0; 745 uint_t g_idx = rx_grp->arg_index; 746 747 ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh)); 748 ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT); 749 mac_perim_enter_by_mh(port->lp_mh, &pmph); 750 751 i = 0; 752 addr = NULL; 753 /* 754 * This function must be called after the aggr registers its 755 * MAC and its Rx groups have been initialized. 756 */ 757 ASSERT(rx_grp->arg_gh != NULL); 758 759 /* 760 * Get the list of the underlying HW rings. 761 */ 762 hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx, 763 &port->lp_hwghs[g_idx], hw_rh, MAC_RING_TYPE_RX); 764 765 /* 766 * Add existing VLAN and unicast address filters to the port. 
767 */ 768 for (avp = list_head(&rx_grp->arg_vlans); avp != NULL; 769 avp = list_next(&rx_grp->arg_vlans, avp)) { 770 if ((err = aggr_port_addvlan(port, g_idx, avp->av_vid)) != 0) 771 goto err; 772 } 773 774 for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) { 775 if ((err = aggr_port_addmac(port, g_idx, addr->aua_addr)) != 0) 776 goto err; 777 } 778 779 for (i = 0; i < hw_rh_cnt; i++) { 780 err = aggr_add_pseudo_rx_ring(port, rx_grp, hw_rh[i]); 781 if (err != 0) 782 goto err; 783 } 784 785 mac_perim_exit(pmph); 786 return (0); 787 788 err: 789 ASSERT(err != 0); 790 791 for (uint_t j = 0; j < i; j++) 792 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]); 793 794 for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next) 795 aggr_port_remmac(port, g_idx, a->aua_addr); 796 797 if (avp != NULL) 798 avp = list_prev(&rx_grp->arg_vlans, avp); 799 800 for (; avp != NULL; avp = list_prev(&rx_grp->arg_vlans, avp)) { 801 int err2; 802 803 if ((err2 = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) { 804 cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s" 805 ": errno %d.", avp->av_vid, 806 mac_client_name(port->lp_mch), err2); 807 } 808 } 809 810 port->lp_hwghs[g_idx] = NULL; 811 mac_perim_exit(pmph); 812 return (err); 813 } 814 815 /* 816 * Destroy the pseudo rings mapping to this port and remove all VLAN 817 * and unicast filters from this port. Even if there are no underlying 818 * HW rings we must still remove the unicast filters to take the port 819 * out of promisc mode. 
820 */ 821 static void 822 aggr_rem_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp) 823 { 824 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP]; 825 aggr_unicst_addr_t *addr; 826 mac_perim_handle_t pmph; 827 uint_t hw_rh_cnt; 828 uint_t g_idx = rx_grp->arg_index; 829 830 ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh)); 831 ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT); 832 ASSERT3P(rx_grp->arg_gh, !=, NULL); 833 mac_perim_enter_by_mh(port->lp_mh, &pmph); 834 835 hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx, NULL, hw_rh, 836 MAC_RING_TYPE_RX); 837 838 for (uint_t i = 0; i < hw_rh_cnt; i++) 839 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[i]); 840 841 for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) 842 aggr_port_remmac(port, g_idx, addr->aua_addr); 843 844 for (aggr_vlan_t *avp = list_head(&rx_grp->arg_vlans); avp != NULL; 845 avp = list_next(&rx_grp->arg_vlans, avp)) { 846 int err; 847 848 if ((err = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) { 849 cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s" 850 ": errno %d.", avp->av_vid, 851 mac_client_name(port->lp_mch), err); 852 } 853 } 854 855 port->lp_hwghs[g_idx] = NULL; 856 mac_perim_exit(pmph); 857 } 858 859 /* 860 * Add a pseudo TX ring for the given HW ring handle. 861 */ 862 static int 863 aggr_add_pseudo_tx_ring(aggr_port_t *port, 864 aggr_pseudo_tx_group_t *tx_grp, mac_ring_handle_t hw_rh, 865 mac_ring_handle_t *pseudo_rh) 866 { 867 aggr_pseudo_tx_ring_t *ring; 868 int err; 869 int i; 870 871 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 872 for (i = 0; i < MAX_RINGS_PER_GROUP; i++) { 873 ring = tx_grp->atg_rings + i; 874 if (!(ring->atr_flags & MAC_PSEUDO_RING_INUSE)) 875 break; 876 } 877 /* 878 * No slot for this new TX ring. 879 */ 880 if (i == MAX_RINGS_PER_GROUP) 881 return (EIO); 882 /* 883 * The following 4 statements needs to be done before 884 * calling mac_group_add_ring(). Otherwise it will 885 * result in an assertion failure in mac_init_ring(). 
886 */ 887 ring->atr_flags |= MAC_PSEUDO_RING_INUSE; 888 ring->atr_hw_rh = hw_rh; 889 ring->atr_port = port; 890 tx_grp->atg_ring_cnt++; 891 892 /* 893 * The TX side has no concept of ring groups unlike RX groups. 894 * There is just a single group which stores all the TX rings. 895 * This group will be used to store aggr's pseudo TX rings. 896 */ 897 if ((err = mac_group_add_ring(tx_grp->atg_gh, i)) != 0) { 898 ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE; 899 ring->atr_hw_rh = NULL; 900 ring->atr_port = NULL; 901 tx_grp->atg_ring_cnt--; 902 } else { 903 *pseudo_rh = mac_find_ring(tx_grp->atg_gh, i); 904 if (hw_rh != NULL) { 905 mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring, 906 mac_find_ring(tx_grp->atg_gh, i)); 907 } 908 } 909 910 return (err); 911 } 912 913 /* 914 * Remove the pseudo TX ring of the given HW ring handle. 915 */ 916 static void 917 aggr_rem_pseudo_tx_ring(aggr_pseudo_tx_group_t *tx_grp, 918 mac_ring_handle_t pseudo_hw_rh) 919 { 920 aggr_pseudo_tx_ring_t *ring; 921 int i; 922 923 for (i = 0; i < MAX_RINGS_PER_GROUP; i++) { 924 ring = tx_grp->atg_rings + i; 925 if (ring->atr_rh != pseudo_hw_rh) 926 continue; 927 928 ASSERT(ring->atr_flags & MAC_PSEUDO_RING_INUSE); 929 mac_group_rem_ring(tx_grp->atg_gh, pseudo_hw_rh); 930 ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE; 931 mac_hwring_teardown(ring->atr_hw_rh); 932 ring->atr_hw_rh = NULL; 933 ring->atr_port = NULL; 934 tx_grp->atg_ring_cnt--; 935 break; 936 } 937 } 938 939 /* 940 * This function is called to create pseudo rings over hardware rings of 941 * the underlying device. There is a 1:1 mapping between the pseudo TX 942 * rings of the aggr and the hardware rings of the underlying port. 
943 */ 944 static int 945 aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp) 946 { 947 aggr_grp_t *grp = port->lp_grp; 948 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP], pseudo_rh; 949 mac_perim_handle_t pmph; 950 int hw_rh_cnt, i = 0, j; 951 int err = 0; 952 953 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 954 mac_perim_enter_by_mh(port->lp_mh, &pmph); 955 956 /* 957 * Get the list the the underlying HW rings. 958 */ 959 hw_rh_cnt = mac_hwrings_get(port->lp_mch, NULL, hw_rh, 960 MAC_RING_TYPE_TX); 961 962 /* 963 * Even if the underlying NIC does not have TX rings, we 964 * still make a psuedo TX ring for that NIC with NULL as 965 * the ring handle. 966 */ 967 if (hw_rh_cnt == 0) 968 port->lp_tx_ring_cnt = 1; 969 else 970 port->lp_tx_ring_cnt = hw_rh_cnt; 971 972 port->lp_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) * 973 port->lp_tx_ring_cnt), KM_SLEEP); 974 port->lp_pseudo_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) * 975 port->lp_tx_ring_cnt), KM_SLEEP); 976 977 if (hw_rh_cnt == 0) { 978 if ((err = aggr_add_pseudo_tx_ring(port, tx_grp, 979 NULL, &pseudo_rh)) == 0) { 980 port->lp_tx_rings[0] = NULL; 981 port->lp_pseudo_tx_rings[0] = pseudo_rh; 982 } 983 } else { 984 for (i = 0; err == 0 && i < hw_rh_cnt; i++) { 985 err = aggr_add_pseudo_tx_ring(port, 986 tx_grp, hw_rh[i], &pseudo_rh); 987 if (err != 0) 988 break; 989 port->lp_tx_rings[i] = hw_rh[i]; 990 port->lp_pseudo_tx_rings[i] = pseudo_rh; 991 } 992 } 993 994 if (err != 0) { 995 if (hw_rh_cnt != 0) { 996 for (j = 0; j < i; j++) { 997 aggr_rem_pseudo_tx_ring(tx_grp, 998 port->lp_pseudo_tx_rings[j]); 999 } 1000 } 1001 kmem_free(port->lp_tx_rings, 1002 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt)); 1003 kmem_free(port->lp_pseudo_tx_rings, 1004 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt)); 1005 port->lp_tx_ring_cnt = 0; 1006 } else { 1007 port->lp_tx_grp_added = B_TRUE; 1008 port->lp_tx_notify_mh = mac_client_tx_notify(port->lp_mch, 1009 aggr_tx_ring_update, 
port); 1010 } 1011 mac_perim_exit(pmph); 1012 aggr_grp_update_default(grp); 1013 return (err); 1014 } 1015 1016 /* 1017 * This function is called by aggr to remove pseudo TX rings over the 1018 * HW rings of the underlying port. 1019 */ 1020 static void 1021 aggr_rem_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp) 1022 { 1023 aggr_grp_t *grp = port->lp_grp; 1024 mac_perim_handle_t pmph; 1025 int i; 1026 1027 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 1028 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1029 1030 if (!port->lp_tx_grp_added) 1031 goto done; 1032 1033 ASSERT(tx_grp->atg_gh != NULL); 1034 1035 for (i = 0; i < port->lp_tx_ring_cnt; i++) 1036 aggr_rem_pseudo_tx_ring(tx_grp, port->lp_pseudo_tx_rings[i]); 1037 1038 kmem_free(port->lp_tx_rings, 1039 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt)); 1040 kmem_free(port->lp_pseudo_tx_rings, 1041 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt)); 1042 1043 port->lp_tx_ring_cnt = 0; 1044 (void) mac_client_tx_notify(port->lp_mch, NULL, port->lp_tx_notify_mh); 1045 port->lp_tx_grp_added = B_FALSE; 1046 aggr_grp_update_default(grp); 1047 done: 1048 mac_perim_exit(pmph); 1049 } 1050 1051 static int 1052 aggr_pseudo_disable_intr(mac_intr_handle_t ih) 1053 { 1054 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih; 1055 return (mac_hwring_disable_intr(rr_ring->arr_hw_rh)); 1056 } 1057 1058 static int 1059 aggr_pseudo_enable_intr(mac_intr_handle_t ih) 1060 { 1061 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih; 1062 return (mac_hwring_enable_intr(rr_ring->arr_hw_rh)); 1063 } 1064 1065 /* 1066 * Start the pseudo ring. Since the pseudo ring is just an abstraction 1067 * over an actual HW ring, the real task is to start the underlying HW 1068 * ring. 
1069 */ 1070 static int 1071 aggr_pseudo_start_rx_ring(mac_ring_driver_t arg, uint64_t mr_gen) 1072 { 1073 int err; 1074 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg; 1075 1076 err = mac_hwring_start(rr_ring->arr_hw_rh); 1077 1078 if (err != 0) 1079 return (err); 1080 1081 rr_ring->arr_gen = mr_gen; 1082 return (err); 1083 } 1084 1085 /* 1086 * Stop the pseudo ring. Since the pseudo ring is just an abstraction 1087 * over an actual HW ring, the real task is to stop the underlying HW 1088 * ring. 1089 */ 1090 static void 1091 aggr_pseudo_stop_rx_ring(mac_ring_driver_t arg) 1092 { 1093 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg; 1094 1095 /* 1096 * The rings underlying the default group must stay up to 1097 * continue receiving LACP traffic. We would normally never 1098 * stop the default Rx rings because of the primary MAC 1099 * client; but aggr's primary MAC client doesn't call 1100 * mac_unicast_add() and thus mi_active is 0 when the last 1101 * non-primary client is deleted. 1102 */ 1103 if (rr_ring->arr_grp->arg_index != 0) 1104 mac_hwring_stop(rr_ring->arr_hw_rh); 1105 } 1106 1107 /* 1108 * Add one or more ports to an existing link aggregation group. 1109 */ 1110 int 1111 aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force, 1112 laioc_port_t *ports) 1113 { 1114 int rc; 1115 uint_t port_added = 0; 1116 uint_t grp_added; 1117 aggr_grp_t *grp = NULL; 1118 aggr_port_t *port; 1119 boolean_t link_state_changed = B_FALSE; 1120 mac_perim_handle_t mph, pmph; 1121 1122 /* Get the aggr corresponding to linkid. */ 1123 rw_enter(&aggr_grp_lock, RW_READER); 1124 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1125 (mod_hash_val_t *)&grp) != 0) { 1126 rw_exit(&aggr_grp_lock); 1127 return (ENOENT); 1128 } 1129 AGGR_GRP_REFHOLD(grp); 1130 1131 /* 1132 * Hold the perimeter so that the aggregation can't be destroyed. 
1133 */ 1134 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1135 rw_exit(&aggr_grp_lock); 1136 1137 /* Add the specified ports to the aggr. */ 1138 for (uint_t i = 0; i < nports; i++) { 1139 grp_added = 0; 1140 1141 if ((rc = aggr_grp_add_port(grp, ports[i].lp_linkid, 1142 force, &port)) != 0) { 1143 goto bail; 1144 } 1145 1146 ASSERT(port != NULL); 1147 port_added++; 1148 1149 /* check capabilities */ 1150 if (!aggr_grp_capab_check(grp, port) || 1151 !aggr_grp_sdu_check(grp, port) || 1152 !aggr_grp_margin_check(grp, port)) { 1153 rc = ENOTSUP; 1154 goto bail; 1155 } 1156 1157 /* 1158 * Create the pseudo ring for each HW ring of the underlying 1159 * port. 1160 */ 1161 rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group); 1162 if (rc != 0) 1163 goto bail; 1164 1165 for (uint_t j = 0; j < grp->lg_rx_group_count; j++) { 1166 rc = aggr_add_pseudo_rx_group(port, 1167 &grp->lg_rx_groups[j]); 1168 1169 if (rc != 0) 1170 goto bail; 1171 1172 grp_added++; 1173 } 1174 1175 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1176 1177 /* set LACP mode */ 1178 aggr_port_lacp_set_mode(grp, port); 1179 1180 /* start port if group has already been started */ 1181 if (grp->lg_started) { 1182 rc = aggr_port_start(port); 1183 if (rc != 0) { 1184 mac_perim_exit(pmph); 1185 goto bail; 1186 } 1187 1188 /* 1189 * Turn on the promiscuous mode over the port when it 1190 * is requested to be turned on to receive the 1191 * non-primary address over a port, or the promiscuous 1192 * mode is enabled over the aggr. 1193 */ 1194 if (grp->lg_promisc || port->lp_prom_addr != NULL) { 1195 rc = aggr_port_promisc(port, B_TRUE); 1196 if (rc != 0) { 1197 mac_perim_exit(pmph); 1198 goto bail; 1199 } 1200 } 1201 } 1202 mac_perim_exit(pmph); 1203 1204 /* 1205 * Attach each port if necessary. 1206 */ 1207 if (aggr_port_notify_link(grp, port)) 1208 link_state_changed = B_TRUE; 1209 1210 /* 1211 * Initialize the callback functions for this port. 
1212 */ 1213 aggr_port_init_callbacks(port); 1214 } 1215 1216 /* update the MAC address of the constituent ports */ 1217 if (aggr_grp_update_ports_mac(grp)) 1218 link_state_changed = B_TRUE; 1219 1220 if (link_state_changed) 1221 mac_link_update(grp->lg_mh, grp->lg_link_state); 1222 1223 bail: 1224 if (rc != 0) { 1225 /* stop and remove ports that have been added */ 1226 for (uint_t i = 0; i < port_added; i++) { 1227 uint_t grp_remove; 1228 1229 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid); 1230 ASSERT(port != NULL); 1231 1232 if (grp->lg_started) { 1233 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1234 (void) aggr_port_promisc(port, B_FALSE); 1235 aggr_port_stop(port); 1236 mac_perim_exit(pmph); 1237 } 1238 1239 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group); 1240 1241 /* 1242 * Only the last port could have a partial set 1243 * of groups added. 1244 */ 1245 grp_remove = (i + 1 == port_added) ? grp_added : 1246 grp->lg_rx_group_count; 1247 1248 for (uint_t j = 0; j < grp_remove; j++) { 1249 aggr_rem_pseudo_rx_group(port, 1250 &grp->lg_rx_groups[j]); 1251 } 1252 1253 (void) aggr_grp_rem_port(grp, port, NULL, NULL); 1254 } 1255 } 1256 1257 mac_perim_exit(mph); 1258 AGGR_GRP_REFRELE(grp); 1259 return (rc); 1260 } 1261 1262 static int 1263 aggr_grp_modify_common(aggr_grp_t *grp, uint8_t update_mask, uint32_t policy, 1264 boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, 1265 aggr_lacp_timer_t lacp_timer) 1266 { 1267 boolean_t mac_addr_changed = B_FALSE; 1268 boolean_t link_state_changed = B_FALSE; 1269 mac_perim_handle_t pmph; 1270 1271 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 1272 1273 /* validate fixed address if specified */ 1274 if ((update_mask & AGGR_MODIFY_MAC) && mac_fixed && 1275 ((bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) || 1276 (mac_addr[0] & 0x01))) { 1277 return (EINVAL); 1278 } 1279 1280 /* update policy if requested */ 1281 if (update_mask & AGGR_MODIFY_POLICY) 1282 aggr_send_update_policy(grp, policy); 1283 1284 /* 
update unicast MAC address if requested */ 1285 if (update_mask & AGGR_MODIFY_MAC) { 1286 if (mac_fixed) { 1287 /* user-supplied MAC address */ 1288 grp->lg_mac_addr_port = NULL; 1289 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) != 0) { 1290 bcopy(mac_addr, grp->lg_addr, ETHERADDRL); 1291 mac_addr_changed = B_TRUE; 1292 } 1293 } else if (grp->lg_addr_fixed) { 1294 /* switch from user-supplied to automatic */ 1295 aggr_port_t *port = grp->lg_ports; 1296 1297 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1298 bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL); 1299 grp->lg_mac_addr_port = port; 1300 mac_addr_changed = B_TRUE; 1301 mac_perim_exit(pmph); 1302 } 1303 grp->lg_addr_fixed = mac_fixed; 1304 } 1305 1306 if (mac_addr_changed) 1307 link_state_changed = aggr_grp_update_ports_mac(grp); 1308 1309 if (update_mask & AGGR_MODIFY_LACP_MODE) 1310 aggr_lacp_update_mode(grp, lacp_mode); 1311 1312 if (update_mask & AGGR_MODIFY_LACP_TIMER) 1313 aggr_lacp_update_timer(grp, lacp_timer); 1314 1315 if (link_state_changed) 1316 mac_link_update(grp->lg_mh, grp->lg_link_state); 1317 1318 if (mac_addr_changed) 1319 mac_unicst_update(grp->lg_mh, grp->lg_addr); 1320 1321 return (0); 1322 } 1323 1324 /* 1325 * Update properties of an existing link aggregation group. 1326 */ 1327 int 1328 aggr_grp_modify(datalink_id_t linkid, uint8_t update_mask, uint32_t policy, 1329 boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, 1330 aggr_lacp_timer_t lacp_timer) 1331 { 1332 aggr_grp_t *grp = NULL; 1333 mac_perim_handle_t mph; 1334 int err; 1335 1336 /* get group corresponding to linkid */ 1337 rw_enter(&aggr_grp_lock, RW_READER); 1338 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1339 (mod_hash_val_t *)&grp) != 0) { 1340 rw_exit(&aggr_grp_lock); 1341 return (ENOENT); 1342 } 1343 AGGR_GRP_REFHOLD(grp); 1344 1345 /* 1346 * Hold the perimeter so that the aggregation won't be destroyed. 
*/
	mac_perim_enter_by_mh(grp->lg_mh, &mph);
	rw_exit(&aggr_grp_lock);

	err = aggr_grp_modify_common(grp, update_mask, policy, mac_fixed,
	    mac_addr, lacp_mode, lacp_timer);

	mac_perim_exit(mph);
	AGGR_GRP_REFRELE(grp);
	return (err);
}

/*
 * Create a new link aggregation group upon request from administrator.
 * Returns 0 on success, an errno on failure.
 *
 * Errors visible in this function: EINVAL (no ports given, or a fixed
 * MAC address that is all-zero), EEXIST (a group with this linkid is
 * already in the hash), ENOMEM (key or mac_register_t allocation
 * failed), or whatever aggr_grp_add_port(), mac_register() or
 * dls_devnet_create() returned.  aggr_grp_lock is held as writer for
 * the whole operation.
 */
int
aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports,
    laioc_port_t *ports, uint32_t policy, boolean_t mac_fixed, boolean_t force,
    uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, aggr_lacp_timer_t lacp_timer,
    cred_t *credp)
{
	aggr_grp_t *grp = NULL;
	aggr_port_t *port;
	mac_register_t *mac;
	boolean_t link_state_changed;
	mac_perim_handle_t mph;
	int err;
	int i;
	kt_did_t tid = 0;

	/* need at least one port */
	if (nports == 0)
		return (EINVAL);

	rw_enter(&aggr_grp_lock, RW_WRITER);

	/* does a group with the same linkid already exist? */
	err = mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
	    (mod_hash_val_t *)&grp);
	if (err == 0) {
		rw_exit(&aggr_grp_lock);
		return (EEXIST);
	}

	grp = kmem_cache_alloc(aggr_grp_cache, KM_SLEEP);

	/* The creator's reference; dropped on the bail path below. */
	grp->lg_refs = 1;
	grp->lg_closing = B_FALSE;
	grp->lg_force = force;
	grp->lg_linkid = linkid;
	grp->lg_zoneid = crgetzoneid(credp);
	grp->lg_ifspeed = 0;
	grp->lg_link_state = LINK_STATE_UNKNOWN;
	grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
	grp->lg_started = B_FALSE;
	grp->lg_promisc = B_FALSE;
	grp->lg_lacp_done = B_FALSE;
	grp->lg_tx_notify_done = B_FALSE;
	grp->lg_lacp_head = grp->lg_lacp_tail = NULL;
	/*
	 * Both worker threads are created here, before anything that can
	 * fail; the bail path is responsible for making them exit.
	 */
	grp->lg_lacp_rx_thread = thread_create(NULL, 0,
	    aggr_lacp_rx_thread, grp, 0, &p0, TS_RUN, minclsyspri);
	grp->lg_tx_notify_thread = thread_create(NULL, 0,
	    aggr_tx_notify_thread, grp, 0, &p0, TS_RUN, minclsyspri);
	grp->lg_tx_blocked_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
	    MAX_RINGS_PER_GROUP), KM_SLEEP);
	grp->lg_tx_blocked_cnt = 0;
	bzero(&grp->lg_rx_groups,
	    sizeof (aggr_pseudo_rx_group_t) * MAX_GROUPS_PER_PORT);
	bzero(&grp->lg_tx_group, sizeof (aggr_pseudo_tx_group_t));
	aggr_lacp_init_grp(grp);

	/* add MAC ports to group */
	grp->lg_ports = NULL;
	grp->lg_nports = 0;
	grp->lg_nattached_ports = 0;
	grp->lg_ntx_ports = 0;

	/*
	 * If key is not specified by the user, allocate the key.
	 */
	if ((key == 0) && ((key = (uint32_t)id_alloc(key_ids)) == 0)) {
		err = ENOMEM;
		goto bail;
	}
	grp->lg_key = key;

	for (i = 0; i < nports; i++) {
		err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, &port);
		if (err != 0)
			goto bail;
	}

	grp->lg_rx_group_count = 1;

	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
		uint_t num_rgroups;

		/* mph is used for the port's perimeter here. */
		mac_perim_enter_by_mh(port->lp_mh, &mph);
		num_rgroups = mac_get_num_rx_groups(port->lp_mh);
		mac_perim_exit(mph);

		/*
		 * Utilize all the groups in a port. If some ports
		 * have less groups than others, then traffic destined
		 * for the same unicast address may be HW classified
		 * on some ports but SW classified by aggr when
		 * arriving on other ports.
		 */
		grp->lg_rx_group_count = MAX(grp->lg_rx_group_count,
		    num_rgroups);
	}

	/*
	 * There could be cases where the hardware provides more
	 * groups than aggr can support. Make sure we never go above
	 * the max aggr can support.
	 */
	grp->lg_rx_group_count = MIN(grp->lg_rx_group_count,
	    MAX_GROUPS_PER_PORT);

	ASSERT3U(grp->lg_rx_group_count, >, 0);
	/*
	 * Every slot up to MAX_GROUPS_PER_PORT is initialized, not just
	 * the first lg_rx_group_count of them.
	 */
	for (i = 0; i < MAX_GROUPS_PER_PORT; i++) {
		grp->lg_rx_groups[i].arg_index = i;
		grp->lg_rx_groups[i].arg_untagged = 0;
		list_create(&(grp->lg_rx_groups[i].arg_vlans),
		    sizeof (aggr_vlan_t), offsetof(aggr_vlan_t, av_link));
	}

	/*
	 * If no explicit MAC address was specified by the administrator,
	 * set it to the MAC address of the first port.
	 */
	grp->lg_addr_fixed = mac_fixed;
	if (grp->lg_addr_fixed) {
		/* validate specified address */
		if (bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) {
			err = EINVAL;
			goto bail;
		}
		bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
	} else {
		bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
		grp->lg_mac_addr_port = grp->lg_ports;
	}

	/* Set the initial group capabilities. */
	aggr_grp_capab_set(grp);

	if ((mac = mac_alloc(MAC_VERSION)) == NULL) {
		err = ENOMEM;
		goto bail;
	}
	mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
	mac->m_driver = grp;
	mac->m_dip = aggr_dip;
	/* Keys above AGGR_MAX_KEY are not used as the MAC instance number. */
	mac->m_instance = grp->lg_key > AGGR_MAX_KEY ? (uint_t)-1 : grp->lg_key;
	mac->m_src_addr = grp->lg_addr;
	mac->m_callbacks = &aggr_m_callbacks;
	mac->m_min_sdu = 0;
	mac->m_max_sdu = grp->lg_max_sdu = aggr_grp_max_sdu(grp);
	mac->m_margin = aggr_grp_max_margin(grp);
	mac->m_v12n = MAC_VIRT_LEVEL1;
	err = mac_register(mac, &grp->lg_mh);
	/* The registration descriptor is freed whether or not it succeeded. */
	mac_free(mac);
	if (err != 0)
		goto bail;

	err = dls_devnet_create(grp->lg_mh, grp->lg_linkid, crgetzoneid(credp));
	if (err != 0) {
		(void) mac_unregister(grp->lg_mh);
		grp->lg_mh = NULL;
		goto bail;
	}

	/* From here on mph holds the group's own perimeter. */
	mac_perim_enter_by_mh(grp->lg_mh, &mph);

	/*
	 * Update the MAC address of the constituent ports.
	 *
	 * All ports take on the primary MAC address of the aggr
	 * (lg_addr). At this point, none of the ports are attached;
	 * thus the link state of the aggregation will not change.
	 */
	link_state_changed = aggr_grp_update_ports_mac(grp);
	ASSERT(!link_state_changed);

	/* Update outbound load balancing policy. */
	aggr_send_update_policy(grp, policy);

	/* Set LACP mode. */
	aggr_lacp_set_mode(grp, lacp_mode, lacp_timer);

	/*
	 * Attach each port if necessary.
	 */
	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
		/*
		 * Create the pseudo ring for each HW ring of the
		 * underlying port. Note that this is done after the
		 * aggr registers its MAC.  A failure here is fatal
		 * (VERIFY), it is not unwound.
		 */
		VERIFY3S(aggr_add_pseudo_tx_group(port, &grp->lg_tx_group),
		    ==, 0);

		for (i = 0; i < grp->lg_rx_group_count; i++) {
			VERIFY3S(aggr_add_pseudo_rx_group(port,
			    &grp->lg_rx_groups[i]), ==, 0);
		}

		if (aggr_port_notify_link(grp, port))
			link_state_changed = B_TRUE;

		/*
		 * Initialize the callback functions for this port.
		 */
		aggr_port_init_callbacks(port);
	}

	if (link_state_changed)
		mac_link_update(grp->lg_mh, grp->lg_link_state);

	/* add new group to hash table */
	err = mod_hash_insert(aggr_grp_hash, GRP_HASH_KEY(linkid),
	    (mod_hash_val_t)grp);
	ASSERT(err == 0);
	aggr_grp_cnt++;

	mac_perim_exit(mph);
	rw_exit(&aggr_grp_lock);
	return (0);

bail:

	/*
	 * Failure path: every jump to this label happens before the
	 * group perimeter is entered, so only aggr_grp_lock is held.
	 */
	grp->lg_closing = B_TRUE;

	/* Delete whatever ports were added before the failure. */
	port = grp->lg_ports;
	while (port != NULL) {
		aggr_port_t *cport;

		cport = port->lp_next;
		aggr_port_delete(port);
		port = cport;
	}

	/*
	 * Inform the lacp_rx thread to exit, and wait until
	 * lg_lacp_rx_thread has been cleared.
	 */
	mutex_enter(&grp->lg_lacp_lock);
	grp->lg_lacp_done = B_TRUE;
	cv_signal(&grp->lg_lacp_cv);
	while (grp->lg_lacp_rx_thread != NULL)
		cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock);
	mutex_exit(&grp->lg_lacp_lock);
	/*
	 * Inform the tx_notify thread to exit.  Its thread id is sampled
	 * under the lock and joined after the lock is dropped.
	 */
	mutex_enter(&grp->lg_tx_flowctl_lock);
	if (grp->lg_tx_notify_thread != NULL) {
		tid = grp->lg_tx_notify_thread->t_did;
		grp->lg_tx_notify_done = B_TRUE;
		cv_signal(&grp->lg_tx_flowctl_cv);
	}
	mutex_exit(&grp->lg_tx_flowctl_lock);
	if (tid != 0)
		thread_join(tid);

	kmem_free(grp->lg_tx_blocked_rings,
	    (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP));
	rw_exit(&aggr_grp_lock);
	/* Drop the reference taken at allocation time. */
	AGGR_GRP_REFRELE(grp);
	return (err);
}

/*
 * Return a pointer to the member of a group with specified linkid, or
 * NULL if the group has no such port.  The caller must hold the
 * group's MAC perimeter.
 */
static aggr_port_t *
aggr_grp_port_lookup(aggr_grp_t *grp, datalink_id_t linkid)
{
	aggr_port_t *port;

	ASSERT(MAC_PERIM_HELD(grp->lg_mh));

	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
		if (port->lp_linkid == linkid)
			break;
	}

	/* NULL here means the end of the list was reached without a match. */
	return (port);
}

/*
 * Stop, detach and remove a port from a link aggregation group.
 *
 * The caller must hold the group's MAC perimeter, the group must have
 * more than one port and must not be closing.  When non-NULL,
 * *mac_addr_changedp and *link_state_changedp report whether the
 * group's MAC address or link state changed.  Returns ENOENT if the
 * port is not on the group's port list, 0 otherwise.
 */
static int
aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port,
    boolean_t *mac_addr_changedp, boolean_t *link_state_changedp)
{
	int rc = 0;
	aggr_port_t **pport;
	boolean_t mac_addr_changed = B_FALSE;
	boolean_t link_state_changed = B_FALSE;
	mac_perim_handle_t mph;
	uint64_t val;
	uint_t i;
	uint_t stat;

	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
	ASSERT(grp->lg_nports > 1);
	ASSERT(!grp->lg_closing);

	/* unlink port */
	for (pport = &grp->lg_ports; *pport != port;
	    pport = &(*pport)->lp_next) {
		if (*pport == NULL) {
			rc = ENOENT;
			goto done;
		}
	}
	*pport = port->lp_next;

	mac_perim_enter_by_mh(port->lp_mh, &mph);

	/*
	 * If the MAC address of the port being removed was assigned
	 * to the group, update the group MAC address
	 * using the MAC address of a different port.
1677 */ 1678 if (!grp->lg_addr_fixed && grp->lg_mac_addr_port == port) { 1679 /* 1680 * Set the MAC address of the group to the 1681 * MAC address of its first port. 1682 */ 1683 bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL); 1684 grp->lg_mac_addr_port = grp->lg_ports; 1685 mac_addr_changed = B_TRUE; 1686 } 1687 1688 link_state_changed = aggr_grp_detach_port(grp, port); 1689 1690 /* 1691 * Add the counter statistics of the ports while it was aggregated 1692 * to the group's residual statistics. This is done by obtaining 1693 * the current counter from the underlying MAC then subtracting the 1694 * value of the counter at the moment it was added to the 1695 * aggregation. 1696 */ 1697 for (i = 0; i < MAC_NSTAT; i++) { 1698 stat = i + MAC_STAT_MIN; 1699 if (!MAC_STAT_ISACOUNTER(stat)) 1700 continue; 1701 val = aggr_port_stat(port, stat); 1702 val -= port->lp_stat[i]; 1703 mutex_enter(&grp->lg_stat_lock); 1704 grp->lg_stat[i] += val; 1705 mutex_exit(&grp->lg_stat_lock); 1706 } 1707 for (i = 0; i < ETHER_NSTAT; i++) { 1708 stat = i + MACTYPE_STAT_MIN; 1709 if (!ETHER_STAT_ISACOUNTER(stat)) 1710 continue; 1711 val = aggr_port_stat(port, stat); 1712 val -= port->lp_ether_stat[i]; 1713 mutex_enter(&grp->lg_stat_lock); 1714 grp->lg_ether_stat[i] += val; 1715 mutex_exit(&grp->lg_stat_lock); 1716 } 1717 1718 grp->lg_nports--; 1719 mac_perim_exit(mph); 1720 1721 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group); 1722 aggr_port_delete(port); 1723 1724 /* 1725 * If the group MAC address has changed, update the MAC address of 1726 * the remaining constituent ports according to the new MAC 1727 * address of the group. 
1728 */ 1729 if (mac_addr_changed && aggr_grp_update_ports_mac(grp)) 1730 link_state_changed = B_TRUE; 1731 1732 done: 1733 if (mac_addr_changedp != NULL) 1734 *mac_addr_changedp = mac_addr_changed; 1735 if (link_state_changedp != NULL) 1736 *link_state_changedp = link_state_changed; 1737 1738 return (rc); 1739 } 1740 1741 /* 1742 * Remove one or more ports from an existing link aggregation group. 1743 */ 1744 int 1745 aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports) 1746 { 1747 int rc = 0, i; 1748 aggr_grp_t *grp = NULL; 1749 aggr_port_t *port; 1750 boolean_t mac_addr_update = B_FALSE, mac_addr_changed; 1751 boolean_t link_state_update = B_FALSE, link_state_changed; 1752 mac_perim_handle_t mph, pmph; 1753 1754 /* get group corresponding to linkid */ 1755 rw_enter(&aggr_grp_lock, RW_READER); 1756 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1757 (mod_hash_val_t *)&grp) != 0) { 1758 rw_exit(&aggr_grp_lock); 1759 return (ENOENT); 1760 } 1761 AGGR_GRP_REFHOLD(grp); 1762 1763 /* 1764 * Hold the perimeter so that the aggregation won't be destroyed. 
1765 */ 1766 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1767 rw_exit(&aggr_grp_lock); 1768 1769 /* we need to keep at least one port per group */ 1770 if (nports >= grp->lg_nports) { 1771 rc = EINVAL; 1772 goto bail; 1773 } 1774 1775 /* first verify that all the groups are valid */ 1776 for (i = 0; i < nports; i++) { 1777 if (aggr_grp_port_lookup(grp, ports[i].lp_linkid) == NULL) { 1778 /* port not found */ 1779 rc = ENOENT; 1780 goto bail; 1781 } 1782 } 1783 1784 /* clear the promiscous mode for the specified ports */ 1785 for (i = 0; i < nports && rc == 0; i++) { 1786 /* lookup port */ 1787 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid); 1788 ASSERT(port != NULL); 1789 1790 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1791 rc = aggr_port_promisc(port, B_FALSE); 1792 mac_perim_exit(pmph); 1793 } 1794 if (rc != 0) { 1795 for (i = 0; i < nports; i++) { 1796 port = aggr_grp_port_lookup(grp, 1797 ports[i].lp_linkid); 1798 ASSERT(port != NULL); 1799 1800 /* 1801 * Turn the promiscuous mode back on if it is required 1802 * to receive the non-primary address over a port, or 1803 * the promiscous mode is enabled over the aggr. 1804 */ 1805 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1806 if (port->lp_started && (grp->lg_promisc || 1807 port->lp_prom_addr != NULL)) { 1808 (void) aggr_port_promisc(port, B_TRUE); 1809 } 1810 mac_perim_exit(pmph); 1811 } 1812 goto bail; 1813 } 1814 1815 /* remove the specified ports from group */ 1816 for (i = 0; i < nports; i++) { 1817 /* lookup port */ 1818 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid); 1819 ASSERT(port != NULL); 1820 1821 /* stop port if group has already been started */ 1822 if (grp->lg_started) { 1823 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1824 aggr_port_stop(port); 1825 mac_perim_exit(pmph); 1826 } 1827 1828 /* 1829 * aggr_rem_pseudo_tx_group() is not called here. Instead 1830 * it is called from inside aggr_grp_rem_port() after the 1831 * port has been detached. 
The reason is that 1832 * aggr_rem_pseudo_tx_group() removes one ring at a time 1833 * and if there is still traffic going on, then there 1834 * is the possibility of aggr_find_tx_ring() returning a 1835 * removed ring for transmission. Once the port has been 1836 * detached, that port will not be used and 1837 * aggr_find_tx_ring() will not return any rings 1838 * belonging to it. 1839 */ 1840 for (i = 0; i < grp->lg_rx_group_count; i++) 1841 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[i]); 1842 1843 /* remove port from group */ 1844 rc = aggr_grp_rem_port(grp, port, &mac_addr_changed, 1845 &link_state_changed); 1846 ASSERT(rc == 0); 1847 mac_addr_update = mac_addr_update || mac_addr_changed; 1848 link_state_update = link_state_update || link_state_changed; 1849 } 1850 1851 bail: 1852 if (mac_addr_update) 1853 mac_unicst_update(grp->lg_mh, grp->lg_addr); 1854 if (link_state_update) 1855 mac_link_update(grp->lg_mh, grp->lg_link_state); 1856 1857 mac_perim_exit(mph); 1858 AGGR_GRP_REFRELE(grp); 1859 1860 return (rc); 1861 } 1862 1863 int 1864 aggr_grp_delete(datalink_id_t linkid, cred_t *cred) 1865 { 1866 aggr_grp_t *grp = NULL; 1867 aggr_port_t *port, *cport; 1868 datalink_id_t tmpid; 1869 mod_hash_val_t val; 1870 mac_perim_handle_t mph, pmph; 1871 int err; 1872 kt_did_t tid = 0; 1873 1874 rw_enter(&aggr_grp_lock, RW_WRITER); 1875 1876 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 1877 (mod_hash_val_t *)&grp) != 0) { 1878 rw_exit(&aggr_grp_lock); 1879 return (ENOENT); 1880 } 1881 1882 /* 1883 * Note that dls_devnet_destroy() must be called before lg_lock is 1884 * held. Otherwise, it will deadlock if another thread is in 1885 * aggr_m_stat() and thus has a kstat_hold() on the kstats that 1886 * dls_devnet_destroy() needs to delete. 
1887 */ 1888 if ((err = dls_devnet_destroy(grp->lg_mh, &tmpid, B_TRUE)) != 0) { 1889 rw_exit(&aggr_grp_lock); 1890 return (err); 1891 } 1892 ASSERT(linkid == tmpid); 1893 1894 /* 1895 * Unregister from the MAC service module. Since this can 1896 * fail if a client hasn't closed the MAC port, we gracefully 1897 * fail the operation. 1898 */ 1899 if ((err = mac_disable(grp->lg_mh)) != 0) { 1900 (void) dls_devnet_create(grp->lg_mh, linkid, crgetzoneid(cred)); 1901 rw_exit(&aggr_grp_lock); 1902 return (err); 1903 } 1904 (void) mod_hash_remove(aggr_grp_hash, GRP_HASH_KEY(linkid), &val); 1905 ASSERT(grp == (aggr_grp_t *)val); 1906 1907 ASSERT(aggr_grp_cnt > 0); 1908 aggr_grp_cnt--; 1909 rw_exit(&aggr_grp_lock); 1910 1911 /* 1912 * Inform the lacp_rx thread to exit. 1913 */ 1914 mutex_enter(&grp->lg_lacp_lock); 1915 grp->lg_lacp_done = B_TRUE; 1916 cv_signal(&grp->lg_lacp_cv); 1917 while (grp->lg_lacp_rx_thread != NULL) 1918 cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock); 1919 mutex_exit(&grp->lg_lacp_lock); 1920 /* 1921 * Inform the tx_notify_thread to exit. 
1922 */ 1923 mutex_enter(&grp->lg_tx_flowctl_lock); 1924 if (grp->lg_tx_notify_thread != NULL) { 1925 tid = grp->lg_tx_notify_thread->t_did; 1926 grp->lg_tx_notify_done = B_TRUE; 1927 cv_signal(&grp->lg_tx_flowctl_cv); 1928 } 1929 mutex_exit(&grp->lg_tx_flowctl_lock); 1930 if (tid != 0) 1931 thread_join(tid); 1932 1933 mac_perim_enter_by_mh(grp->lg_mh, &mph); 1934 1935 grp->lg_closing = B_TRUE; 1936 /* detach and free MAC ports associated with group */ 1937 port = grp->lg_ports; 1938 while (port != NULL) { 1939 cport = port->lp_next; 1940 mac_perim_enter_by_mh(port->lp_mh, &pmph); 1941 if (grp->lg_started) 1942 aggr_port_stop(port); 1943 (void) aggr_grp_detach_port(grp, port); 1944 mac_perim_exit(pmph); 1945 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group); 1946 for (uint_t i = 0; i < grp->lg_rx_group_count; i++) 1947 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[i]); 1948 aggr_port_delete(port); 1949 port = cport; 1950 } 1951 1952 mac_perim_exit(mph); 1953 1954 kmem_free(grp->lg_tx_blocked_rings, 1955 (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP)); 1956 /* 1957 * Wait for the port's lacp timer thread and its notification callback 1958 * to exit before calling mac_unregister() since both needs to access 1959 * the mac perimeter of the grp. 
1960 */ 1961 aggr_grp_port_wait(grp); 1962 1963 VERIFY(mac_unregister(grp->lg_mh) == 0); 1964 grp->lg_mh = NULL; 1965 1966 for (uint_t i = 0; i < MAX_GROUPS_PER_PORT; i++) { 1967 list_destroy(&(grp->lg_rx_groups[i].arg_vlans)); 1968 } 1969 1970 AGGR_GRP_REFRELE(grp); 1971 return (0); 1972 } 1973 1974 void 1975 aggr_grp_free(aggr_grp_t *grp) 1976 { 1977 ASSERT(grp->lg_refs == 0); 1978 ASSERT(grp->lg_port_ref == 0); 1979 if (grp->lg_key > AGGR_MAX_KEY) { 1980 id_free(key_ids, grp->lg_key); 1981 grp->lg_key = 0; 1982 } 1983 kmem_cache_free(aggr_grp_cache, grp); 1984 } 1985 1986 int 1987 aggr_grp_info(datalink_id_t linkid, void *fn_arg, 1988 aggr_grp_info_new_grp_fn_t new_grp_fn, 1989 aggr_grp_info_new_port_fn_t new_port_fn, cred_t *cred) 1990 { 1991 aggr_grp_t *grp; 1992 aggr_port_t *port; 1993 mac_perim_handle_t mph, pmph; 1994 int rc = 0; 1995 1996 /* 1997 * Make sure that the aggregation link is visible from the caller's 1998 * zone. 1999 */ 2000 if (!dls_devnet_islinkvisible(linkid, crgetzoneid(cred))) 2001 return (ENOENT); 2002 2003 rw_enter(&aggr_grp_lock, RW_READER); 2004 2005 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid), 2006 (mod_hash_val_t *)&grp) != 0) { 2007 rw_exit(&aggr_grp_lock); 2008 return (ENOENT); 2009 } 2010 AGGR_GRP_REFHOLD(grp); 2011 2012 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2013 rw_exit(&aggr_grp_lock); 2014 2015 rc = new_grp_fn(fn_arg, grp->lg_linkid, 2016 (grp->lg_key > AGGR_MAX_KEY) ? 
0 : grp->lg_key, grp->lg_addr, 2017 grp->lg_addr_fixed, grp->lg_force, grp->lg_tx_policy, 2018 grp->lg_nports, grp->lg_lacp_mode, grp->aggr.PeriodicTimer); 2019 2020 if (rc != 0) 2021 goto bail; 2022 2023 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2024 mac_perim_enter_by_mh(port->lp_mh, &pmph); 2025 rc = new_port_fn(fn_arg, port->lp_linkid, port->lp_addr, 2026 port->lp_state, &port->lp_lacp.ActorOperPortState); 2027 mac_perim_exit(pmph); 2028 2029 if (rc != 0) 2030 goto bail; 2031 } 2032 2033 bail: 2034 mac_perim_exit(mph); 2035 AGGR_GRP_REFRELE(grp); 2036 return (rc); 2037 } 2038 2039 /*ARGSUSED*/ 2040 static void 2041 aggr_m_ioctl(void *arg, queue_t *q, mblk_t *mp) 2042 { 2043 miocnak(q, mp, 0, ENOTSUP); 2044 } 2045 2046 static int 2047 aggr_grp_stat(aggr_grp_t *grp, uint_t stat, uint64_t *val) 2048 { 2049 aggr_port_t *port; 2050 uint_t stat_index; 2051 2052 ASSERT(MUTEX_HELD(&grp->lg_stat_lock)); 2053 2054 /* We only aggregate counter statistics. */ 2055 if (IS_MAC_STAT(stat) && !MAC_STAT_ISACOUNTER(stat) || 2056 IS_MACTYPE_STAT(stat) && !ETHER_STAT_ISACOUNTER(stat)) { 2057 return (ENOTSUP); 2058 } 2059 2060 /* 2061 * Counter statistics for a group are computed by aggregating the 2062 * counters of the members MACs while they were aggregated, plus 2063 * the residual counter of the group itself, which is updated each 2064 * time a MAC is removed from the group. 2065 */ 2066 *val = 0; 2067 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2068 /* actual port statistic */ 2069 *val += aggr_port_stat(port, stat); 2070 /* 2071 * minus the port stat when it was added, plus any residual 2072 * amount for the group. 
2073 */ 2074 if (IS_MAC_STAT(stat)) { 2075 stat_index = stat - MAC_STAT_MIN; 2076 *val -= port->lp_stat[stat_index]; 2077 *val += grp->lg_stat[stat_index]; 2078 } else if (IS_MACTYPE_STAT(stat)) { 2079 stat_index = stat - MACTYPE_STAT_MIN; 2080 *val -= port->lp_ether_stat[stat_index]; 2081 *val += grp->lg_ether_stat[stat_index]; 2082 } 2083 } 2084 return (0); 2085 } 2086 2087 int 2088 aggr_rx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val) 2089 { 2090 aggr_pseudo_rx_ring_t *rx_ring = (aggr_pseudo_rx_ring_t *)rdriver; 2091 2092 if (rx_ring->arr_hw_rh != NULL) { 2093 *val = mac_pseudo_rx_ring_stat_get(rx_ring->arr_hw_rh, stat); 2094 } else { 2095 aggr_port_t *port = rx_ring->arr_port; 2096 2097 *val = mac_stat_get(port->lp_mh, stat); 2098 2099 } 2100 return (0); 2101 } 2102 2103 int 2104 aggr_tx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val) 2105 { 2106 aggr_pseudo_tx_ring_t *tx_ring = (aggr_pseudo_tx_ring_t *)rdriver; 2107 2108 if (tx_ring->atr_hw_rh != NULL) { 2109 *val = mac_pseudo_tx_ring_stat_get(tx_ring->atr_hw_rh, stat); 2110 } else { 2111 aggr_port_t *port = tx_ring->atr_port; 2112 2113 *val = mac_stat_get(port->lp_mh, stat); 2114 } 2115 return (0); 2116 } 2117 2118 static int 2119 aggr_m_stat(void *arg, uint_t stat, uint64_t *val) 2120 { 2121 aggr_grp_t *grp = arg; 2122 int rval = 0; 2123 2124 mutex_enter(&grp->lg_stat_lock); 2125 2126 switch (stat) { 2127 case MAC_STAT_IFSPEED: 2128 *val = grp->lg_ifspeed; 2129 break; 2130 2131 case ETHER_STAT_LINK_DUPLEX: 2132 *val = grp->lg_link_duplex; 2133 break; 2134 2135 default: 2136 /* 2137 * For all other statistics, we return the aggregated stat 2138 * from the underlying ports. aggr_grp_stat() will set 2139 * rval appropriately if the statistic isn't a counter. 
2140 */ 2141 rval = aggr_grp_stat(grp, stat, val); 2142 } 2143 2144 mutex_exit(&grp->lg_stat_lock); 2145 return (rval); 2146 } 2147 2148 static int 2149 aggr_m_start(void *arg) 2150 { 2151 aggr_grp_t *grp = arg; 2152 aggr_port_t *port; 2153 mac_perim_handle_t mph, pmph; 2154 2155 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2156 2157 /* 2158 * Attempts to start all configured members of the group. 2159 * Group members will be attached when their link-up notification 2160 * is received. 2161 */ 2162 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2163 mac_perim_enter_by_mh(port->lp_mh, &pmph); 2164 if (aggr_port_start(port) != 0) { 2165 mac_perim_exit(pmph); 2166 continue; 2167 } 2168 2169 /* 2170 * Turn on the promiscuous mode if it is required to receive 2171 * the non-primary address over a port, or the promiscous 2172 * mode is enabled over the aggr. 2173 */ 2174 if (grp->lg_promisc || port->lp_prom_addr != NULL) { 2175 if (aggr_port_promisc(port, B_TRUE) != 0) 2176 aggr_port_stop(port); 2177 } 2178 mac_perim_exit(pmph); 2179 } 2180 2181 grp->lg_started = B_TRUE; 2182 2183 mac_perim_exit(mph); 2184 return (0); 2185 } 2186 2187 static void 2188 aggr_m_stop(void *arg) 2189 { 2190 aggr_grp_t *grp = arg; 2191 aggr_port_t *port; 2192 mac_perim_handle_t mph, pmph; 2193 2194 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2195 2196 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2197 mac_perim_enter_by_mh(port->lp_mh, &pmph); 2198 2199 /* reset port promiscuous mode */ 2200 (void) aggr_port_promisc(port, B_FALSE); 2201 2202 aggr_port_stop(port); 2203 mac_perim_exit(pmph); 2204 } 2205 2206 grp->lg_started = B_FALSE; 2207 mac_perim_exit(mph); 2208 } 2209 2210 static int 2211 aggr_m_promisc(void *arg, boolean_t on) 2212 { 2213 aggr_grp_t *grp = arg; 2214 aggr_port_t *port; 2215 boolean_t link_state_changed = B_FALSE; 2216 mac_perim_handle_t mph, pmph; 2217 2218 AGGR_GRP_REFHOLD(grp); 2219 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2220 2221 
ASSERT(!grp->lg_closing); 2222 2223 if (on == grp->lg_promisc) 2224 goto bail; 2225 2226 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2227 int err = 0; 2228 2229 mac_perim_enter_by_mh(port->lp_mh, &pmph); 2230 AGGR_PORT_REFHOLD(port); 2231 if (!on && (port->lp_prom_addr == NULL)) 2232 err = aggr_port_promisc(port, B_FALSE); 2233 else if (on && port->lp_started) 2234 err = aggr_port_promisc(port, B_TRUE); 2235 2236 if (err != 0) { 2237 if (aggr_grp_detach_port(grp, port)) 2238 link_state_changed = B_TRUE; 2239 } else { 2240 /* 2241 * If a port was detached because of a previous 2242 * failure changing the promiscuity, the port 2243 * is reattached when it successfully changes 2244 * the promiscuity now, and this might cause 2245 * the link state of the aggregation to change. 2246 */ 2247 if (aggr_grp_attach_port(grp, port)) 2248 link_state_changed = B_TRUE; 2249 } 2250 mac_perim_exit(pmph); 2251 AGGR_PORT_REFRELE(port); 2252 } 2253 2254 grp->lg_promisc = on; 2255 2256 if (link_state_changed) 2257 mac_link_update(grp->lg_mh, grp->lg_link_state); 2258 2259 bail: 2260 mac_perim_exit(mph); 2261 AGGR_GRP_REFRELE(grp); 2262 2263 return (0); 2264 } 2265 2266 static void 2267 aggr_grp_port_rename(const char *new_name, void *arg) 2268 { 2269 /* 2270 * aggr port's mac client name is the format of "aggr link name" plus 2271 * AGGR_PORT_NAME_DELIMIT plus "underneath link name". 
2272 */ 2273 int aggr_len, link_len, clnt_name_len, i; 2274 char *str_end, *str_st, *str_del; 2275 char aggr_name[MAXNAMELEN]; 2276 char link_name[MAXNAMELEN]; 2277 char *clnt_name; 2278 aggr_grp_t *aggr_grp = arg; 2279 aggr_port_t *aggr_port = aggr_grp->lg_ports; 2280 2281 for (i = 0; i < aggr_grp->lg_nports; i++) { 2282 clnt_name = mac_client_name(aggr_port->lp_mch); 2283 clnt_name_len = strlen(clnt_name); 2284 str_st = clnt_name; 2285 str_end = &(clnt_name[clnt_name_len]); 2286 str_del = strchr(str_st, AGGR_PORT_NAME_DELIMIT); 2287 ASSERT(str_del != NULL); 2288 aggr_len = (intptr_t)((uintptr_t)str_del - (uintptr_t)str_st); 2289 link_len = (intptr_t)((uintptr_t)str_end - (uintptr_t)str_del); 2290 bzero(aggr_name, MAXNAMELEN); 2291 bzero(link_name, MAXNAMELEN); 2292 bcopy(clnt_name, aggr_name, aggr_len); 2293 bcopy(str_del, link_name, link_len + 1); 2294 bzero(clnt_name, MAXNAMELEN); 2295 (void) snprintf(clnt_name, MAXNAMELEN, "%s%s", new_name, 2296 link_name); 2297 2298 (void) mac_rename_primary(aggr_port->lp_mh, NULL); 2299 aggr_port = aggr_port->lp_next; 2300 } 2301 } 2302 2303 /* 2304 * Initialize the capabilities that are advertised for the group 2305 * according to the capabilities of the constituent ports. 
2306 */ 2307 static boolean_t 2308 aggr_m_capab_get(void *arg, mac_capab_t cap, void *cap_data) 2309 { 2310 aggr_grp_t *grp = arg; 2311 2312 switch (cap) { 2313 case MAC_CAPAB_HCKSUM: { 2314 uint32_t *hcksum_txflags = cap_data; 2315 *hcksum_txflags = grp->lg_hcksum_txflags; 2316 break; 2317 } 2318 case MAC_CAPAB_LSO: { 2319 mac_capab_lso_t *cap_lso = cap_data; 2320 2321 if (grp->lg_lso) { 2322 *cap_lso = grp->lg_cap_lso; 2323 break; 2324 } else { 2325 return (B_FALSE); 2326 } 2327 } 2328 case MAC_CAPAB_NO_NATIVEVLAN: 2329 return (!grp->lg_vlan); 2330 case MAC_CAPAB_NO_ZCOPY: 2331 return (!grp->lg_zcopy); 2332 case MAC_CAPAB_RINGS: { 2333 mac_capab_rings_t *cap_rings = cap_data; 2334 uint_t ring_cnt = 0; 2335 2336 for (uint_t i = 0; i < grp->lg_rx_group_count; i++) 2337 ring_cnt += grp->lg_rx_groups[i].arg_ring_cnt; 2338 2339 if (cap_rings->mr_type == MAC_RING_TYPE_RX) { 2340 cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; 2341 cap_rings->mr_rnum = ring_cnt; 2342 cap_rings->mr_gnum = grp->lg_rx_group_count; 2343 cap_rings->mr_gaddring = NULL; 2344 cap_rings->mr_gremring = NULL; 2345 } else { 2346 cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC; 2347 cap_rings->mr_rnum = grp->lg_tx_group.atg_ring_cnt; 2348 cap_rings->mr_gnum = 0; 2349 } 2350 cap_rings->mr_rget = aggr_fill_ring; 2351 cap_rings->mr_gget = aggr_fill_group; 2352 break; 2353 } 2354 case MAC_CAPAB_AGGR: 2355 { 2356 mac_capab_aggr_t *aggr_cap; 2357 2358 if (cap_data != NULL) { 2359 aggr_cap = cap_data; 2360 aggr_cap->mca_rename_fn = aggr_grp_port_rename; 2361 aggr_cap->mca_unicst = aggr_m_unicst; 2362 aggr_cap->mca_find_tx_ring_fn = aggr_find_tx_ring; 2363 aggr_cap->mca_arg = arg; 2364 } 2365 return (B_TRUE); 2366 } 2367 default: 2368 return (B_FALSE); 2369 } 2370 return (B_TRUE); 2371 } 2372 2373 /* 2374 * Callback function for MAC layer to register groups. 
2375 */ 2376 static void 2377 aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index, 2378 mac_group_info_t *infop, mac_group_handle_t gh) 2379 { 2380 aggr_grp_t *grp = arg; 2381 2382 if (rtype == MAC_RING_TYPE_RX) { 2383 aggr_pseudo_rx_group_t *rx_group = &grp->lg_rx_groups[index]; 2384 2385 rx_group->arg_gh = gh; 2386 rx_group->arg_grp = grp; 2387 2388 infop->mgi_driver = (mac_group_driver_t)rx_group; 2389 infop->mgi_start = NULL; 2390 infop->mgi_stop = NULL; 2391 infop->mgi_addmac = aggr_addmac; 2392 infop->mgi_remmac = aggr_remmac; 2393 infop->mgi_count = rx_group->arg_ring_cnt; 2394 2395 /* 2396 * Always set the HW VLAN callbacks. They are smart 2397 * enough to know when a port has HW VLAN filters to 2398 * program and when it doesn't. 2399 */ 2400 infop->mgi_addvlan = aggr_addvlan; 2401 infop->mgi_remvlan = aggr_remvlan; 2402 } else { 2403 aggr_pseudo_tx_group_t *tx_group = &grp->lg_tx_group; 2404 2405 ASSERT3S(index, ==, 0); 2406 tx_group->atg_gh = gh; 2407 } 2408 } 2409 2410 /* 2411 * Callback funtion for MAC layer to register all rings. 2412 */ 2413 static void 2414 aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index, 2415 const int index, mac_ring_info_t *infop, mac_ring_handle_t rh) 2416 { 2417 aggr_grp_t *grp = arg; 2418 2419 switch (rtype) { 2420 case MAC_RING_TYPE_RX: { 2421 aggr_pseudo_rx_group_t *rx_group; 2422 aggr_pseudo_rx_ring_t *rx_ring; 2423 mac_intr_t aggr_mac_intr; 2424 2425 rx_group = &grp->lg_rx_groups[rg_index]; 2426 ASSERT3S(index, >=, 0); 2427 ASSERT3S(index, <, rx_group->arg_ring_cnt); 2428 rx_ring = rx_group->arg_rings + index; 2429 rx_ring->arr_rh = rh; 2430 2431 /* 2432 * Entrypoint to enable interrupt (disable poll) and 2433 * disable interrupt (enable poll). 
2434 */ 2435 aggr_mac_intr.mi_handle = (mac_intr_handle_t)rx_ring; 2436 aggr_mac_intr.mi_enable = aggr_pseudo_enable_intr; 2437 aggr_mac_intr.mi_disable = aggr_pseudo_disable_intr; 2438 aggr_mac_intr.mi_ddi_handle = NULL; 2439 2440 infop->mri_driver = (mac_ring_driver_t)rx_ring; 2441 infop->mri_start = aggr_pseudo_start_rx_ring; 2442 infop->mri_stop = aggr_pseudo_stop_rx_ring; 2443 2444 infop->mri_intr = aggr_mac_intr; 2445 infop->mri_poll = aggr_rx_poll; 2446 2447 infop->mri_stat = aggr_rx_ring_stat; 2448 break; 2449 } 2450 case MAC_RING_TYPE_TX: { 2451 aggr_pseudo_tx_group_t *tx_group = &grp->lg_tx_group; 2452 aggr_pseudo_tx_ring_t *tx_ring; 2453 2454 ASSERT(rg_index == -1); 2455 ASSERT(index < tx_group->atg_ring_cnt); 2456 2457 tx_ring = &tx_group->atg_rings[index]; 2458 tx_ring->atr_rh = rh; 2459 2460 infop->mri_driver = (mac_ring_driver_t)tx_ring; 2461 infop->mri_start = NULL; 2462 infop->mri_stop = NULL; 2463 infop->mri_tx = aggr_ring_tx; 2464 infop->mri_stat = aggr_tx_ring_stat; 2465 /* 2466 * Use the hw TX ring handle to find if the ring needs 2467 * serialization or not. For NICs that do not expose 2468 * Tx rings, atr_hw_rh will be NULL. 
2469 */ 2470 if (tx_ring->atr_hw_rh != NULL) { 2471 infop->mri_flags = 2472 mac_hwring_getinfo(tx_ring->atr_hw_rh); 2473 } 2474 break; 2475 } 2476 default: 2477 break; 2478 } 2479 } 2480 2481 static mblk_t * 2482 aggr_rx_poll(void *arg, int bytes_to_pickup) 2483 { 2484 aggr_pseudo_rx_ring_t *rr_ring = arg; 2485 aggr_port_t *port = rr_ring->arr_port; 2486 aggr_grp_t *grp = port->lp_grp; 2487 mblk_t *mp_chain, *mp, **mpp; 2488 2489 mp_chain = mac_hwring_poll(rr_ring->arr_hw_rh, bytes_to_pickup); 2490 2491 if (grp->lg_lacp_mode == AGGR_LACP_OFF) 2492 return (mp_chain); 2493 2494 mpp = &mp_chain; 2495 while ((mp = *mpp) != NULL) { 2496 if (MBLKL(mp) >= sizeof (struct ether_header)) { 2497 struct ether_header *ehp; 2498 2499 ehp = (struct ether_header *)mp->b_rptr; 2500 if (ntohs(ehp->ether_type) == ETHERTYPE_SLOW) { 2501 *mpp = mp->b_next; 2502 mp->b_next = NULL; 2503 aggr_recv_lacp(port, 2504 (mac_resource_handle_t)rr_ring, mp); 2505 continue; 2506 } 2507 } 2508 2509 if (!port->lp_collector_enabled) { 2510 *mpp = mp->b_next; 2511 mp->b_next = NULL; 2512 freemsg(mp); 2513 continue; 2514 } 2515 mpp = &mp->b_next; 2516 } 2517 return (mp_chain); 2518 } 2519 2520 static int 2521 aggr_addmac(void *arg, const uint8_t *mac_addr) 2522 { 2523 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg; 2524 aggr_unicst_addr_t *addr, **pprev; 2525 aggr_grp_t *grp = rx_group->arg_grp; 2526 aggr_port_t *port, *p; 2527 mac_perim_handle_t mph; 2528 int err = 0; 2529 uint_t idx = rx_group->arg_index; 2530 2531 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2532 2533 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) { 2534 mac_perim_exit(mph); 2535 return (0); 2536 } 2537 2538 /* 2539 * Insert this mac address into the list of mac addresses owned by 2540 * the aggregation pseudo group. 
2541 */ 2542 pprev = &rx_group->arg_macaddr; 2543 while ((addr = *pprev) != NULL) { 2544 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) == 0) { 2545 mac_perim_exit(mph); 2546 return (EEXIST); 2547 } 2548 pprev = &addr->aua_next; 2549 } 2550 addr = kmem_alloc(sizeof (aggr_unicst_addr_t), KM_SLEEP); 2551 bcopy(mac_addr, addr->aua_addr, ETHERADDRL); 2552 addr->aua_next = NULL; 2553 *pprev = addr; 2554 2555 for (port = grp->lg_ports; port != NULL; port = port->lp_next) 2556 if ((err = aggr_port_addmac(port, idx, mac_addr)) != 0) 2557 break; 2558 2559 if (err != 0) { 2560 for (p = grp->lg_ports; p != port; p = p->lp_next) 2561 aggr_port_remmac(p, idx, mac_addr); 2562 2563 *pprev = NULL; 2564 kmem_free(addr, sizeof (aggr_unicst_addr_t)); 2565 } 2566 2567 mac_perim_exit(mph); 2568 return (err); 2569 } 2570 2571 static int 2572 aggr_remmac(void *arg, const uint8_t *mac_addr) 2573 { 2574 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg; 2575 aggr_unicst_addr_t *addr, **pprev; 2576 aggr_grp_t *grp = rx_group->arg_grp; 2577 aggr_port_t *port; 2578 mac_perim_handle_t mph; 2579 int err = 0; 2580 2581 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2582 2583 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) { 2584 mac_perim_exit(mph); 2585 return (0); 2586 } 2587 2588 /* 2589 * Insert this mac address into the list of mac addresses owned by 2590 * the aggregation pseudo group. 
2591 */ 2592 pprev = &rx_group->arg_macaddr; 2593 while ((addr = *pprev) != NULL) { 2594 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) != 0) { 2595 pprev = &addr->aua_next; 2596 continue; 2597 } 2598 break; 2599 } 2600 if (addr == NULL) { 2601 mac_perim_exit(mph); 2602 return (EINVAL); 2603 } 2604 2605 for (port = grp->lg_ports; port != NULL; port = port->lp_next) 2606 aggr_port_remmac(port, rx_group->arg_index, mac_addr); 2607 2608 *pprev = addr->aua_next; 2609 kmem_free(addr, sizeof (aggr_unicst_addr_t)); 2610 2611 mac_perim_exit(mph); 2612 return (err); 2613 } 2614 2615 /* 2616 * Search for VID in the Rx group's list and return a pointer if 2617 * found. Otherwise return NULL. 2618 */ 2619 static aggr_vlan_t * 2620 aggr_find_vlan(aggr_pseudo_rx_group_t *rx_group, uint16_t vid) 2621 { 2622 ASSERT(MAC_PERIM_HELD(rx_group->arg_grp->lg_mh)); 2623 for (aggr_vlan_t *avp = list_head(&rx_group->arg_vlans); avp != NULL; 2624 avp = list_next(&rx_group->arg_vlans, avp)) { 2625 if (avp->av_vid == vid) 2626 return (avp); 2627 } 2628 2629 return (NULL); 2630 } 2631 2632 /* 2633 * Accept traffic on the specified VID. 2634 * 2635 * Persist VLAN state in the aggr so that ports added later will 2636 * receive the correct filters. In the future it would be nice to 2637 * allow aggr to iterate its clients instead of duplicating state. 2638 */ 2639 static int 2640 aggr_addvlan(mac_group_driver_t gdriver, uint16_t vid) 2641 { 2642 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)gdriver; 2643 aggr_grp_t *aggr = rx_group->arg_grp; 2644 aggr_port_t *port, *p; 2645 mac_perim_handle_t mph; 2646 int err = 0; 2647 aggr_vlan_t *avp = NULL; 2648 uint_t idx = rx_group->arg_index; 2649 2650 mac_perim_enter_by_mh(aggr->lg_mh, &mph); 2651 2652 if (vid == MAC_VLAN_UNTAGGED) { 2653 /* 2654 * Aggr is both a MAC provider and MAC client. As a 2655 * MAC provider it is passed MAC_VLAN_UNTAGGED by its 2656 * client. As a client itself, it should pass 2657 * VLAN_ID_NONE to its ports. 
2658 */ 2659 vid = VLAN_ID_NONE; 2660 rx_group->arg_untagged++; 2661 goto update_ports; 2662 } 2663 2664 avp = aggr_find_vlan(rx_group, vid); 2665 2666 if (avp != NULL) { 2667 avp->av_refs++; 2668 mac_perim_exit(mph); 2669 return (0); 2670 } 2671 2672 avp = kmem_zalloc(sizeof (aggr_vlan_t), KM_SLEEP); 2673 avp->av_vid = vid; 2674 avp->av_refs = 1; 2675 2676 update_ports: 2677 for (port = aggr->lg_ports; port != NULL; port = port->lp_next) 2678 if ((err = aggr_port_addvlan(port, idx, vid)) != 0) 2679 break; 2680 2681 if (err != 0) { 2682 /* 2683 * If any of these calls fail then we are in a 2684 * situation where the ports have different HW state. 2685 * There's no reasonable action the MAC client can 2686 * take in this scenario to rectify the situation. 2687 */ 2688 for (p = aggr->lg_ports; p != port; p = p->lp_next) { 2689 int err2; 2690 2691 if ((err2 = aggr_port_remvlan(p, idx, vid)) != 0) { 2692 cmn_err(CE_WARN, "Failed to remove VLAN %u" 2693 " from port %s: errno %d.", vid, 2694 mac_client_name(p->lp_mch), err2); 2695 } 2696 2697 } 2698 2699 if (vid == VLAN_ID_NONE) 2700 rx_group->arg_untagged--; 2701 2702 if (avp != NULL) { 2703 kmem_free(avp, sizeof (aggr_vlan_t)); 2704 avp = NULL; 2705 } 2706 } 2707 2708 if (avp != NULL) 2709 list_insert_tail(&rx_group->arg_vlans, avp); 2710 2711 done: 2712 mac_perim_exit(mph); 2713 return (err); 2714 } 2715 2716 /* 2717 * Stop accepting traffic on this VLAN if it's the last use of this VLAN. 2718 */ 2719 static int 2720 aggr_remvlan(mac_group_driver_t gdriver, uint16_t vid) 2721 { 2722 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)gdriver; 2723 aggr_grp_t *aggr = rx_group->arg_grp; 2724 aggr_port_t *port, *p; 2725 mac_perim_handle_t mph; 2726 int err = 0; 2727 aggr_vlan_t *avp = NULL; 2728 uint_t idx = rx_group->arg_index; 2729 2730 mac_perim_enter_by_mh(aggr->lg_mh, &mph); 2731 2732 /* 2733 * See the comment in aggr_addvlan(). 
2734 */ 2735 if (vid == MAC_VLAN_UNTAGGED) { 2736 vid = VLAN_ID_NONE; 2737 rx_group->arg_untagged--; 2738 2739 if (rx_group->arg_untagged > 0) 2740 goto done; 2741 2742 goto update_ports; 2743 } 2744 2745 avp = aggr_find_vlan(rx_group, vid); 2746 2747 if (avp == NULL) { 2748 err = ENOENT; 2749 goto done; 2750 } 2751 2752 avp->av_refs--; 2753 2754 if (avp->av_refs > 0) 2755 goto done; 2756 2757 update_ports: 2758 for (port = aggr->lg_ports; port != NULL; port = port->lp_next) 2759 if ((err = aggr_port_remvlan(port, idx, vid)) != 0) 2760 break; 2761 2762 /* 2763 * See the comment in aggr_addvlan() for justification of the 2764 * use of VERIFY here. 2765 */ 2766 if (err != 0) { 2767 for (p = aggr->lg_ports; p != port; p = p->lp_next) { 2768 int err2; 2769 2770 if ((err2 = aggr_port_addvlan(p, idx, vid)) != 0) { 2771 cmn_err(CE_WARN, "Failed to add VLAN %u" 2772 " to port %s: errno %d.", vid, 2773 mac_client_name(p->lp_mch), err2); 2774 } 2775 } 2776 2777 if (avp != NULL) 2778 avp->av_refs++; 2779 2780 if (vid == VLAN_ID_NONE) 2781 rx_group->arg_untagged++; 2782 2783 goto done; 2784 } 2785 2786 if (err == 0 && avp != NULL) { 2787 VERIFY3U(avp->av_refs, ==, 0); 2788 list_remove(&rx_group->arg_vlans, avp); 2789 kmem_free(avp, sizeof (aggr_vlan_t)); 2790 } 2791 2792 done: 2793 mac_perim_exit(mph); 2794 return (err); 2795 } 2796 2797 /* 2798 * Add or remove the multicast addresses that are defined for the group 2799 * to or from the specified port. 2800 * 2801 * Note that aggr_grp_multicst_port(..., B_TRUE) is called when the port 2802 * is started and attached, and aggr_grp_multicst_port(..., B_FALSE) is 2803 * called when the port is either stopped or detached. 
2804 */ 2805 void 2806 aggr_grp_multicst_port(aggr_port_t *port, boolean_t add) 2807 { 2808 aggr_grp_t *grp = port->lp_grp; 2809 2810 ASSERT(MAC_PERIM_HELD(port->lp_mh)); 2811 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 2812 2813 if (!port->lp_started || port->lp_state != AGGR_PORT_STATE_ATTACHED) 2814 return; 2815 2816 mac_multicast_refresh(grp->lg_mh, aggr_port_multicst, port, add); 2817 } 2818 2819 static int 2820 aggr_m_multicst(void *arg, boolean_t add, const uint8_t *addrp) 2821 { 2822 aggr_grp_t *grp = arg; 2823 aggr_port_t *port = NULL, *errport = NULL; 2824 mac_perim_handle_t mph; 2825 int err = 0; 2826 2827 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2828 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2829 if (port->lp_state != AGGR_PORT_STATE_ATTACHED || 2830 !port->lp_started) { 2831 continue; 2832 } 2833 err = aggr_port_multicst(port, add, addrp); 2834 if (err != 0) { 2835 errport = port; 2836 break; 2837 } 2838 } 2839 2840 /* 2841 * At least one port caused error return and this error is returned to 2842 * mac, eventually a NAK would be sent upwards. 2843 * Some ports have this multicast address listed now, and some don't. 2844 * Treat this error as a whole aggr failure not individual port failure. 2845 * Therefore remove this multicast address from other ports. 
2846 */ 2847 if ((err != 0) && add) { 2848 for (port = grp->lg_ports; port != errport; 2849 port = port->lp_next) { 2850 if (port->lp_state != AGGR_PORT_STATE_ATTACHED || 2851 !port->lp_started) { 2852 continue; 2853 } 2854 (void) aggr_port_multicst(port, B_FALSE, addrp); 2855 } 2856 } 2857 mac_perim_exit(mph); 2858 return (err); 2859 } 2860 2861 static int 2862 aggr_m_unicst(void *arg, const uint8_t *macaddr) 2863 { 2864 aggr_grp_t *grp = arg; 2865 mac_perim_handle_t mph; 2866 int err; 2867 2868 mac_perim_enter_by_mh(grp->lg_mh, &mph); 2869 err = aggr_grp_modify_common(grp, AGGR_MODIFY_MAC, 0, B_TRUE, macaddr, 2870 0, 0); 2871 mac_perim_exit(mph); 2872 return (err); 2873 } 2874 2875 /* 2876 * Initialize the capabilities that are advertised for the group 2877 * according to the capabilities of the constituent ports. 2878 */ 2879 static void 2880 aggr_grp_capab_set(aggr_grp_t *grp) 2881 { 2882 uint32_t cksum; 2883 aggr_port_t *port; 2884 mac_capab_lso_t cap_lso; 2885 2886 ASSERT(grp->lg_mh == NULL); 2887 ASSERT(grp->lg_ports != NULL); 2888 2889 grp->lg_hcksum_txflags = (uint32_t)-1; 2890 grp->lg_zcopy = B_TRUE; 2891 grp->lg_vlan = B_TRUE; 2892 2893 grp->lg_lso = B_TRUE; 2894 grp->lg_cap_lso.lso_flags = (t_uscalar_t)-1; 2895 grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max = (t_uscalar_t)-1; 2896 2897 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2898 if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &cksum)) 2899 cksum = 0; 2900 grp->lg_hcksum_txflags &= cksum; 2901 2902 grp->lg_vlan &= 2903 !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL); 2904 2905 grp->lg_zcopy &= 2906 !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL); 2907 2908 grp->lg_lso &= 2909 mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso); 2910 if (grp->lg_lso) { 2911 grp->lg_cap_lso.lso_flags &= cap_lso.lso_flags; 2912 if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max > 2913 cap_lso.lso_basic_tcp_ipv4.lso_max) 2914 grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max = 2915 
cap_lso.lso_basic_tcp_ipv4.lso_max; 2916 } 2917 } 2918 } 2919 2920 /* 2921 * Checks whether the capabilities of the port being added are compatible 2922 * with the current capabilities of the aggregation. 2923 */ 2924 static boolean_t 2925 aggr_grp_capab_check(aggr_grp_t *grp, aggr_port_t *port) 2926 { 2927 uint32_t hcksum_txflags; 2928 2929 ASSERT(grp->lg_ports != NULL); 2930 2931 if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL)) & 2932 grp->lg_vlan) != grp->lg_vlan) { 2933 return (B_FALSE); 2934 } 2935 2936 if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL)) & 2937 grp->lg_zcopy) != grp->lg_zcopy) { 2938 return (B_FALSE); 2939 } 2940 2941 if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &hcksum_txflags)) { 2942 if (grp->lg_hcksum_txflags != 0) 2943 return (B_FALSE); 2944 } else if ((hcksum_txflags & grp->lg_hcksum_txflags) != 2945 grp->lg_hcksum_txflags) { 2946 return (B_FALSE); 2947 } 2948 2949 if (grp->lg_lso) { 2950 mac_capab_lso_t cap_lso; 2951 2952 if (mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso)) { 2953 if ((grp->lg_cap_lso.lso_flags & cap_lso.lso_flags) != 2954 grp->lg_cap_lso.lso_flags) 2955 return (B_FALSE); 2956 if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max > 2957 cap_lso.lso_basic_tcp_ipv4.lso_max) 2958 return (B_FALSE); 2959 } else { 2960 return (B_FALSE); 2961 } 2962 } 2963 2964 return (B_TRUE); 2965 } 2966 2967 /* 2968 * Returns the maximum SDU according to the SDU of the constituent ports. 
2969 */ 2970 static uint_t 2971 aggr_grp_max_sdu(aggr_grp_t *grp) 2972 { 2973 uint_t max_sdu = (uint_t)-1; 2974 aggr_port_t *port; 2975 2976 ASSERT(grp->lg_ports != NULL); 2977 2978 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 2979 uint_t port_sdu_max; 2980 2981 mac_sdu_get(port->lp_mh, NULL, &port_sdu_max); 2982 if (max_sdu > port_sdu_max) 2983 max_sdu = port_sdu_max; 2984 } 2985 2986 return (max_sdu); 2987 } 2988 2989 /* 2990 * Checks if the maximum SDU of the specified port is compatible 2991 * with the maximum SDU of the specified aggregation group, returns 2992 * B_TRUE if it is, B_FALSE otherwise. 2993 */ 2994 static boolean_t 2995 aggr_grp_sdu_check(aggr_grp_t *grp, aggr_port_t *port) 2996 { 2997 uint_t port_sdu_max; 2998 2999 mac_sdu_get(port->lp_mh, NULL, &port_sdu_max); 3000 return (port_sdu_max >= grp->lg_max_sdu); 3001 } 3002 3003 /* 3004 * Returns the maximum margin according to the margin of the constituent ports. 3005 */ 3006 static uint32_t 3007 aggr_grp_max_margin(aggr_grp_t *grp) 3008 { 3009 uint32_t margin = UINT32_MAX; 3010 aggr_port_t *port; 3011 3012 ASSERT(grp->lg_mh == NULL); 3013 ASSERT(grp->lg_ports != NULL); 3014 3015 for (port = grp->lg_ports; port != NULL; port = port->lp_next) { 3016 if (margin > port->lp_margin) 3017 margin = port->lp_margin; 3018 } 3019 3020 grp->lg_margin = margin; 3021 return (margin); 3022 } 3023 3024 /* 3025 * Checks if the maximum margin of the specified port is compatible 3026 * with the maximum margin of the specified aggregation group, returns 3027 * B_TRUE if it is, B_FALSE otherwise. 3028 */ 3029 static boolean_t 3030 aggr_grp_margin_check(aggr_grp_t *grp, aggr_port_t *port) 3031 { 3032 if (port->lp_margin >= grp->lg_margin) 3033 return (B_TRUE); 3034 3035 /* 3036 * See whether the current margin value is allowed to be changed to 3037 * the new value. 
3038 */ 3039 if (!mac_margin_update(grp->lg_mh, port->lp_margin)) 3040 return (B_FALSE); 3041 3042 grp->lg_margin = port->lp_margin; 3043 return (B_TRUE); 3044 } 3045 3046 /* 3047 * Set MTU on individual ports of an aggregation group 3048 */ 3049 static int 3050 aggr_set_port_sdu(aggr_grp_t *grp, aggr_port_t *port, uint32_t sdu, 3051 uint32_t *old_mtu) 3052 { 3053 boolean_t removed = B_FALSE; 3054 mac_perim_handle_t mph; 3055 mac_diag_t diag; 3056 int err, rv, retry = 0; 3057 3058 if (port->lp_mah != NULL) { 3059 (void) mac_unicast_remove(port->lp_mch, port->lp_mah); 3060 port->lp_mah = NULL; 3061 removed = B_TRUE; 3062 } 3063 err = mac_set_mtu(port->lp_mh, sdu, old_mtu); 3064 try_again: 3065 if (removed && (rv = mac_unicast_add(port->lp_mch, NULL, 3066 MAC_UNICAST_PRIMARY | MAC_UNICAST_DISABLE_TX_VID_CHECK, 3067 &port->lp_mah, 0, &diag)) != 0) { 3068 /* 3069 * following is a workaround for a bug in 'bge' driver. 3070 * See CR 6794654 for more information and this work around 3071 * will be removed once the CR is fixed. 3072 */ 3073 if (rv == EIO && retry++ < 3) { 3074 delay(2 * hz); 3075 goto try_again; 3076 } 3077 /* 3078 * if mac_unicast_add() failed while setting the MTU, 3079 * detach the port from the group. 3080 */ 3081 mac_perim_enter_by_mh(port->lp_mh, &mph); 3082 (void) aggr_grp_detach_port(grp, port); 3083 mac_perim_exit(mph); 3084 cmn_err(CE_WARN, "Unable to restart the port %s while " 3085 "setting MTU. 
Detaching the port from the aggregation.", 3086 mac_client_name(port->lp_mch)); 3087 } 3088 return (err); 3089 } 3090 3091 static int 3092 aggr_sdu_update(aggr_grp_t *grp, uint32_t sdu) 3093 { 3094 int err = 0, i, rv; 3095 aggr_port_t *port; 3096 uint32_t *mtu; 3097 3098 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 3099 3100 /* 3101 * If the MTU being set is equal to aggr group's maximum 3102 * allowable value, then there is nothing to change 3103 */ 3104 if (sdu == grp->lg_max_sdu) 3105 return (0); 3106 3107 /* 0 is aggr group's min sdu */ 3108 if (sdu == 0) 3109 return (EINVAL); 3110 3111 mtu = kmem_alloc(sizeof (uint32_t) * grp->lg_nports, KM_SLEEP); 3112 for (port = grp->lg_ports, i = 0; port != NULL && err == 0; 3113 port = port->lp_next, i++) { 3114 err = aggr_set_port_sdu(grp, port, sdu, mtu + i); 3115 } 3116 if (err != 0) { 3117 /* recover from error: reset the mtus of the ports */ 3118 aggr_port_t *tmp; 3119 3120 for (tmp = grp->lg_ports, i = 0; tmp != port; 3121 tmp = tmp->lp_next, i++) { 3122 (void) aggr_set_port_sdu(grp, tmp, *(mtu + i), NULL); 3123 } 3124 goto bail; 3125 } 3126 grp->lg_max_sdu = aggr_grp_max_sdu(grp); 3127 rv = mac_maxsdu_update(grp->lg_mh, grp->lg_max_sdu); 3128 ASSERT(rv == 0); 3129 bail: 3130 kmem_free(mtu, sizeof (uint32_t) * grp->lg_nports); 3131 return (err); 3132 } 3133 3134 /* 3135 * Callback functions for set/get of properties 3136 */ 3137 /*ARGSUSED*/ 3138 static int 3139 aggr_m_setprop(void *m_driver, const char *pr_name, mac_prop_id_t pr_num, 3140 uint_t pr_valsize, const void *pr_val) 3141 { 3142 int err = ENOTSUP; 3143 aggr_grp_t *grp = m_driver; 3144 3145 switch (pr_num) { 3146 case MAC_PROP_MTU: { 3147 uint32_t mtu; 3148 3149 if (pr_valsize < sizeof (mtu)) { 3150 err = EINVAL; 3151 break; 3152 } 3153 bcopy(pr_val, &mtu, sizeof (mtu)); 3154 err = aggr_sdu_update(grp, mtu); 3155 break; 3156 } 3157 default: 3158 break; 3159 } 3160 return (err); 3161 } 3162 3163 typedef struct rboundary { 3164 uint32_t bval; 3165 int btype; 3166 } 
rboundary_t; 3167 3168 /* 3169 * This function finds the intersection of mtu ranges stored in arrays - 3170 * mrange[0] ... mrange[mcount -1]. It returns the intersection in rval. 3171 * Individual arrays are assumed to contain non-overlapping ranges. 3172 * Algorithm: 3173 * A range has two boundaries - min and max. We scan all arrays and store 3174 * each boundary as a separate element in a temporary array. We also store 3175 * the boundary types, min or max, as +1 or -1 respectively in the temporary 3176 * array. Then we sort the temporary array in ascending order. We scan the 3177 * sorted array from lower to higher values and keep a cumulative sum of 3178 * boundary types. Element in the temporary array for which the sum reaches 3179 * mcount is a min boundary of a range in the result and next element will be 3180 * max boundary. 3181 * 3182 * Example for mcount = 3, 3183 * 3184 * ----|_________|-------|_______|----|__|------ mrange[0] 3185 * 3186 * -------|________|--|____________|-----|___|-- mrange[1] 3187 * 3188 * --------|________________|-------|____|------ mrange[2] 3189 * 3190 * 3 2 1 3191 * \|/ 3192 * 1 23 2 1 2 3 2 1 01 2 V 0 <- the sum 3193 * ----|--||-----|-|--|--|--|----|-||-|--|---|-- sorted array 3194 * 3195 * same min and max 3196 * V 3197 * --------|_____|-------|__|------------|------ intersecting ranges 3198 */ 3199 void 3200 aggr_mtu_range_intersection(mac_propval_range_t **mrange, int mcount, 3201 mac_propval_uint32_range_t **prval, int *prmaxcnt, int *prcount) 3202 { 3203 mac_propval_uint32_range_t *rval, *ur; 3204 int rmaxcnt, rcount; 3205 size_t sz_range32; 3206 rboundary_t *ta; /* temporary array */ 3207 rboundary_t temp; 3208 boolean_t range_started = B_FALSE; 3209 int i, j, m, sum; 3210 3211 sz_range32 = sizeof (mac_propval_uint32_range_t); 3212 3213 for (i = 0, rmaxcnt = 0; i < mcount; i++) 3214 rmaxcnt += mrange[i]->mpr_count; 3215 3216 /* Allocate enough space to store the results */ 3217 rval = kmem_alloc(rmaxcnt * sz_range32, 
KM_SLEEP); 3218 3219 /* Number of boundaries are twice as many as ranges */ 3220 ta = kmem_alloc(2 * rmaxcnt * sizeof (rboundary_t), KM_SLEEP); 3221 3222 for (i = 0, m = 0; i < mcount; i++) { 3223 ur = &(mrange[i]->mpr_range_uint32[0]); 3224 for (j = 0; j < mrange[i]->mpr_count; j++) { 3225 ta[m].bval = ur[j].mpur_min; 3226 ta[m++].btype = 1; 3227 ta[m].bval = ur[j].mpur_max; 3228 ta[m++].btype = -1; 3229 } 3230 } 3231 3232 /* 3233 * Sort the temporary array in ascending order of bval; 3234 * if boundary values are same then sort on btype. 3235 */ 3236 for (i = 0; i < m-1; i++) { 3237 for (j = i+1; j < m; j++) { 3238 if ((ta[i].bval > ta[j].bval) || 3239 ((ta[i].bval == ta[j].bval) && 3240 (ta[i].btype < ta[j].btype))) { 3241 temp = ta[i]; 3242 ta[i] = ta[j]; 3243 ta[j] = temp; 3244 } 3245 } 3246 } 3247 3248 /* Walk through temporary array to find all ranges in the results */ 3249 for (i = 0, sum = 0, rcount = 0; i < m; i++) { 3250 sum += ta[i].btype; 3251 if (sum == mcount) { 3252 rval[rcount].mpur_min = ta[i].bval; 3253 range_started = B_TRUE; 3254 } else if (sum < mcount && range_started) { 3255 rval[rcount++].mpur_max = ta[i].bval; 3256 range_started = B_FALSE; 3257 } 3258 } 3259 3260 *prval = rval; 3261 *prmaxcnt = rmaxcnt; 3262 *prcount = rcount; 3263 3264 kmem_free(ta, 2 * rmaxcnt * sizeof (rboundary_t)); 3265 } 3266 3267 /* 3268 * Returns the mtu ranges which could be supported by aggr group. 3269 * prmaxcnt returns the size of the buffer prval, prcount returns 3270 * the number of valid entries in prval. Caller is responsible 3271 * for freeing up prval. 
3272 */ 3273 int 3274 aggr_grp_possible_mtu_range(aggr_grp_t *grp, mac_propval_uint32_range_t **prval, 3275 int *prmaxcnt, int *prcount) 3276 { 3277 mac_propval_range_t **vals; 3278 aggr_port_t *port; 3279 mac_perim_handle_t mph; 3280 uint_t i, numr; 3281 int err = 0; 3282 size_t sz_propval, sz_range32; 3283 size_t size; 3284 3285 sz_propval = sizeof (mac_propval_range_t); 3286 sz_range32 = sizeof (mac_propval_uint32_range_t); 3287 3288 ASSERT(MAC_PERIM_HELD(grp->lg_mh)); 3289 3290 vals = kmem_zalloc(sizeof (mac_propval_range_t *) * grp->lg_nports, 3291 KM_SLEEP); 3292 3293 for (port = grp->lg_ports, i = 0; port != NULL; 3294 port = port->lp_next, i++) { 3295 3296 size = sz_propval; 3297 vals[i] = kmem_alloc(size, KM_SLEEP); 3298 vals[i]->mpr_count = 1; 3299 3300 mac_perim_enter_by_mh(port->lp_mh, &mph); 3301 3302 err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL, 3303 NULL, 0, vals[i], NULL); 3304 if (err == ENOSPC) { 3305 /* 3306 * Not enough space to hold all ranges. 3307 * Allocate extra space as indicated and retry. 
3308 */ 3309 numr = vals[i]->mpr_count; 3310 kmem_free(vals[i], sz_propval); 3311 size = sz_propval + (numr - 1) * sz_range32; 3312 vals[i] = kmem_alloc(size, KM_SLEEP); 3313 vals[i]->mpr_count = numr; 3314 err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL, 3315 NULL, 0, vals[i], NULL); 3316 ASSERT(err != ENOSPC); 3317 } 3318 mac_perim_exit(mph); 3319 if (err != 0) { 3320 kmem_free(vals[i], size); 3321 vals[i] = NULL; 3322 break; 3323 } 3324 } 3325 3326 /* 3327 * if any of the underlying ports does not support changing MTU then 3328 * just return ENOTSUP 3329 */ 3330 if (port != NULL) { 3331 ASSERT(err != 0); 3332 goto done; 3333 } 3334 3335 aggr_mtu_range_intersection(vals, grp->lg_nports, prval, prmaxcnt, 3336 prcount); 3337 3338 done: 3339 for (i = 0; i < grp->lg_nports; i++) { 3340 if (vals[i] != NULL) { 3341 numr = vals[i]->mpr_count; 3342 size = sz_propval + (numr - 1) * sz_range32; 3343 kmem_free(vals[i], size); 3344 } 3345 } 3346 3347 kmem_free(vals, sizeof (mac_propval_range_t *) * grp->lg_nports); 3348 return (err); 3349 } 3350 3351 static void 3352 aggr_m_propinfo(void *m_driver, const char *pr_name, mac_prop_id_t pr_num, 3353 mac_prop_info_handle_t prh) 3354 { 3355 aggr_grp_t *grp = m_driver; 3356 mac_propval_uint32_range_t *rval = NULL; 3357 int i, rcount, rmaxcnt; 3358 int err = 0; 3359 3360 _NOTE(ARGUNUSED(pr_name)); 3361 3362 switch (pr_num) { 3363 case MAC_PROP_MTU: 3364 3365 err = aggr_grp_possible_mtu_range(grp, &rval, &rmaxcnt, 3366 &rcount); 3367 if (err != 0) { 3368 ASSERT(rval == NULL); 3369 return; 3370 } 3371 for (i = 0; i < rcount; i++) { 3372 mac_prop_info_set_range_uint32(prh, 3373 rval[i].mpur_min, rval[i].mpur_max); 3374 } 3375 kmem_free(rval, sizeof (mac_propval_uint32_range_t) * rmaxcnt); 3376 break; 3377 } 3378 } 3379