xref: /illumos-gate/usr/src/uts/common/io/aggr/aggr_grp.c (revision 7b34a9a5df26271af0da06974fc361c468cd48d3)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2020 Joyent, Inc.
24  * Copyright 2020 RackTop Systems, Inc.
25  */
26 
27 /*
28  * IEEE 802.3ad Link Aggregation -- Link Aggregation Groups.
29  *
30  * An instance of the structure aggr_grp_t is allocated for each
31  * link aggregation group. When created, aggr_grp_t objects are
32  * entered into the aggr_grp_hash hash table maintained by the modhash
33  * module. The hash key is the linkid associated with the link
34  * aggregation group.
35  *
36  * Each aggregation contains a set of ports. The port is represented
37  * by the aggr_port_t structure. A port consists of a single MAC
38  * client which has exclusive (MCIS_EXCLUSIVE) use of the underlying
39  * MAC. This client is used by the aggr to send and receive LACP
40  * traffic. Each port client takes on the same MAC unicast address --
41  * the address of the aggregation itself (taken from the first port by
42  * default).
43  *
44  * The MAC client that hangs off each aggr port is not your typical
45  * MAC client. Not only does it have exclusive control of the MAC, but
46  * it also has no Tx or Rx SRSes. An SRS is designed to queue and
47  * fanout traffic among L4 protocols; but the aggr is an intermediary,
48  * not a consumer. Instead of using SRSes, the aggr puts the
49  * underlying hardware rings into passthru mode and ships packets up
50  * via a direct call to aggr_recv_cb(). This allows aggr to enforce
51  * LACP while passing all other traffic up to clients of the aggr.
52  *
53  * Pseudo Rx Groups and Rings
54  * --------------------------
55  *
56  * It is imperative for client performance that the aggr provide as
57  * many MAC groups as possible. In order to use the underlying HW
58  * resources, aggr creates pseudo groups to aggregate the underlying
59  * HW groups. Every HW group gets mapped to a pseudo group; and every
60  * HW ring in that group gets mapped to a pseudo ring. The pseudo
61  * group at index 0 combines all the HW groups at index 0 from each
62  * port, etc. The aggr's MAC then creates normal MAC groups and rings
63  * out of these pseudo groups and rings to present to the aggr's
64  * clients. To the clients, the aggr's groups and rings are absolutely
65  * no different than a NIC's groups or rings.
66  *
67  * Pseudo Tx Rings
68  * ---------------
69  *
70  * The underlying ports (NICs) in an aggregation can have Tx rings. To
71  * enhance aggr's performance, these Tx rings are made available to
72  * the aggr layer as pseudo Tx rings. The concept of pseudo rings is
73  * not new. They are already present and implemented on the Rx side.
74  * The same concept is extended to the Tx side where each Tx ring of
75  * an underlying port is reflected in aggr as a pseudo Tx ring. Thus
76  * each pseudo Tx ring will map to a specific hardware Tx ring. Even
77  * in the case of a NIC that does not have a Tx ring, a pseudo Tx ring
78  * is given to the aggregation layer.
79  *
80  * With this change, the outgoing stack depth looks much better:
81  *
82  * mac_tx() -> mac_tx_aggr_mode() -> mac_tx_soft_ring_process() ->
83  * mac_tx_send() -> aggr_ring_tx() -> <driver>_ring_tx()
84  *
85  * Two new modes are introduced to mac_tx() to handle aggr pseudo Tx rings:
86  * SRS_TX_AGGR and SRS_TX_BW_AGGR.
87  *
88  * In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. This routine
89  * invokes an aggr function, aggr_find_tx_ring(), to find a (pseudo) Tx
90  * ring belonging to a port on which the packet has to be sent.
91  * aggr_find_tx_ring() first finds the outgoing port based on L2/L3/L4
92  * policy and then uses the fanout_hint passed to it to pick a Tx ring from
93  * the selected port.
94  *
95  * In SRS_TX_BW_AGGR mode, mac_tx_bw_mode() function is called where
96  * bandwidth limit is applied first on the outgoing packet and the packets
97  * allowed to go out would call mac_tx_aggr_mode() to send the packet on a
98  * particular Tx ring.
99  */
100 
101 #include <sys/types.h>
102 #include <sys/sysmacros.h>
103 #include <sys/conf.h>
104 #include <sys/cmn_err.h>
105 #include <sys/disp.h>
106 #include <sys/list.h>
107 #include <sys/ksynch.h>
108 #include <sys/kmem.h>
109 #include <sys/stream.h>
110 #include <sys/modctl.h>
111 #include <sys/ddi.h>
112 #include <sys/sunddi.h>
113 #include <sys/atomic.h>
114 #include <sys/stat.h>
115 #include <sys/modhash.h>
116 #include <sys/id_space.h>
117 #include <sys/strsun.h>
118 #include <sys/cred.h>
119 #include <sys/dlpi.h>
120 #include <sys/zone.h>
121 #include <sys/mac_provider.h>
122 #include <sys/dls.h>
123 #include <sys/vlan.h>
124 #include <sys/aggr.h>
125 #include <sys/aggr_impl.h>
126 
/*
 * Forward declarations for the file-local aggr_m_*() handlers. All of them
 * except aggr_m_unicst() are referenced by the aggr_m_callbacks table below.
 */
static int aggr_m_start(void *);
static void aggr_m_stop(void *);
static int aggr_m_promisc(void *, boolean_t);
static int aggr_m_multicst(void *, boolean_t, const uint8_t *);
static int aggr_m_unicst(void *, const uint8_t *);
static int aggr_m_stat(void *, uint_t, uint64_t *);
static void aggr_m_ioctl(void *, queue_t *, mblk_t *);
static boolean_t aggr_m_capab_get(void *, mac_capab_t, void *);
static int aggr_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
    const void *);
static void aggr_m_propinfo(void *, const char *, mac_prop_id_t,
    mac_prop_info_handle_t);

/* Forward declarations for the file-local port lookup/removal helpers. */
static aggr_port_t *aggr_grp_port_lookup(aggr_grp_t *, datalink_id_t);
static int aggr_grp_rem_port(aggr_grp_t *, aggr_port_t *, boolean_t *,
    boolean_t *);

/* Forward declarations for the group capability, SDU and margin helpers. */
static void aggr_grp_capab_set(aggr_grp_t *);
static boolean_t aggr_grp_capab_check(aggr_grp_t *, aggr_port_t *);
static uint_t aggr_grp_max_sdu(aggr_grp_t *);
static uint32_t aggr_grp_max_margin(aggr_grp_t *);
static boolean_t aggr_grp_sdu_check(aggr_grp_t *, aggr_port_t *);
static boolean_t aggr_grp_margin_check(aggr_grp_t *, aggr_port_t *);

/*
 * Forward declarations for the pseudo Rx group support and for the
 * file-local ring and group handlers (interrupt control, ring start/stop,
 * address and VLAN filters, polling, ring/group fill).
 */
static int aggr_add_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
static void aggr_rem_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
static int aggr_pseudo_disable_intr(mac_intr_handle_t);
static int aggr_pseudo_enable_intr(mac_intr_handle_t);
static int aggr_pseudo_start_rx_ring(mac_ring_driver_t, uint64_t);
static void aggr_pseudo_stop_rx_ring(mac_ring_driver_t);
static int aggr_addmac(void *, const uint8_t *);
static int aggr_remmac(void *, const uint8_t *);
static int aggr_addvlan(mac_group_driver_t, uint16_t);
static int aggr_remvlan(mac_group_driver_t, uint16_t);
static mblk_t *aggr_rx_poll(void *, int);
static void aggr_fill_ring(void *, mac_ring_type_t, const int,
    const int, mac_ring_info_t *, mac_ring_handle_t);
static void aggr_fill_group(void *, mac_ring_type_t, const int,
    mac_group_info_t *, mac_group_handle_t);
166 
167 static kmem_cache_t	*aggr_grp_cache;
168 static mod_hash_t	*aggr_grp_hash;
169 static krwlock_t	aggr_grp_lock;
170 static uint_t		aggr_grp_cnt;
171 static id_space_t	*key_ids;
172 
173 #define	GRP_HASHSZ		64
174 #define	GRP_HASH_KEY(linkid)	((mod_hash_key_t)(uintptr_t)linkid)
175 #define	AGGR_PORT_NAME_DELIMIT '-'
176 
177 static uchar_t aggr_zero_mac[] = {0, 0, 0, 0, 0, 0};
178 
/*
 * Flags stored in the first member of aggr_m_callbacks: ioctl, capability
 * query, property set and property info.
 */
#define	AGGR_M_CALLBACK_FLAGS	\
	(MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_PROPINFO)

/*
 * Table of the aggr_m_*() handlers. The initializer is positional, so the
 * entries must stay in the member order of mac_callbacks_t (declared
 * outside this file -- check it before adding or moving an entry). A NULL
 * entry means that no handler is supplied for that slot.
 */
static mac_callbacks_t aggr_m_callbacks = {
	AGGR_M_CALLBACK_FLAGS,
	aggr_m_stat,
	aggr_m_start,
	aggr_m_stop,
	aggr_m_promisc,
	aggr_m_multicst,
	NULL,
	NULL,
	NULL,
	aggr_m_ioctl,
	aggr_m_capab_get,
	NULL,
	NULL,
	aggr_m_setprop,
	NULL,
	aggr_m_propinfo
};
200 
201 /*ARGSUSED*/
202 static int
203 aggr_grp_constructor(void *buf, void *arg, int kmflag)
204 {
205 	aggr_grp_t *grp = buf;
206 
207 	bzero(grp, sizeof (*grp));
208 	mutex_init(&grp->lg_lacp_lock, NULL, MUTEX_DEFAULT, NULL);
209 	cv_init(&grp->lg_lacp_cv, NULL, CV_DEFAULT, NULL);
210 	rw_init(&grp->lg_tx_lock, NULL, RW_DRIVER, NULL);
211 	mutex_init(&grp->lg_port_lock, NULL, MUTEX_DEFAULT, NULL);
212 	cv_init(&grp->lg_port_cv, NULL, CV_DEFAULT, NULL);
213 	mutex_init(&grp->lg_tx_flowctl_lock, NULL, MUTEX_DEFAULT, NULL);
214 	cv_init(&grp->lg_tx_flowctl_cv, NULL, CV_DEFAULT, NULL);
215 	grp->lg_link_state = LINK_STATE_UNKNOWN;
216 	return (0);
217 }
218 
219 /*ARGSUSED*/
220 static void
221 aggr_grp_destructor(void *buf, void *arg)
222 {
223 	aggr_grp_t *grp = buf;
224 
225 	if (grp->lg_tx_ports != NULL) {
226 		kmem_free(grp->lg_tx_ports,
227 		    grp->lg_tx_ports_size * sizeof (aggr_port_t *));
228 	}
229 
230 	mutex_destroy(&grp->lg_lacp_lock);
231 	cv_destroy(&grp->lg_lacp_cv);
232 	mutex_destroy(&grp->lg_port_lock);
233 	cv_destroy(&grp->lg_port_cv);
234 	rw_destroy(&grp->lg_tx_lock);
235 	mutex_destroy(&grp->lg_tx_flowctl_lock);
236 	cv_destroy(&grp->lg_tx_flowctl_cv);
237 }
238 
239 void
240 aggr_grp_init(void)
241 {
242 	aggr_grp_cache = kmem_cache_create("aggr_grp_cache",
243 	    sizeof (aggr_grp_t), 0, aggr_grp_constructor,
244 	    aggr_grp_destructor, NULL, NULL, NULL, 0);
245 
246 	aggr_grp_hash = mod_hash_create_idhash("aggr_grp_hash",
247 	    GRP_HASHSZ, mod_hash_null_valdtor);
248 	rw_init(&aggr_grp_lock, NULL, RW_DEFAULT, NULL);
249 	aggr_grp_cnt = 0;
250 
251 	/*
252 	 * Allocate an id space to manage key values (when key is not
253 	 * specified). The range of the id space will be from
254 	 * (AGGR_MAX_KEY + 1) to UINT16_MAX, because the LACP protocol
255 	 * uses a 16-bit key.
256 	 */
257 	key_ids = id_space_create("aggr_key_ids", AGGR_MAX_KEY + 1, UINT16_MAX);
258 	ASSERT(key_ids != NULL);
259 }
260 
261 void
262 aggr_grp_fini(void)
263 {
264 	id_space_destroy(key_ids);
265 	rw_destroy(&aggr_grp_lock);
266 	mod_hash_destroy_idhash(aggr_grp_hash);
267 	kmem_cache_destroy(aggr_grp_cache);
268 }
269 
270 uint_t
271 aggr_grp_count(void)
272 {
273 	uint_t	count;
274 
275 	rw_enter(&aggr_grp_lock, RW_READER);
276 	count = aggr_grp_cnt;
277 	rw_exit(&aggr_grp_lock);
278 	return (count);
279 }
280 
281 /*
282  * Since both aggr_port_notify_cb() and aggr_port_timer_thread() functions
283  * requires the mac perimeter, this function holds a reference of the aggr
284  * and aggr won't call mac_unregister() until this reference drops to 0.
285  */
286 void
287 aggr_grp_port_hold(aggr_port_t *port)
288 {
289 	aggr_grp_t	*grp = port->lp_grp;
290 
291 	AGGR_PORT_REFHOLD(port);
292 	mutex_enter(&grp->lg_port_lock);
293 	grp->lg_port_ref++;
294 	mutex_exit(&grp->lg_port_lock);
295 }
296 
297 /*
298  * Release the reference of the grp and inform aggr_grp_delete() calling
299  * mac_unregister() is now safe.
300  */
301 void
302 aggr_grp_port_rele(aggr_port_t *port)
303 {
304 	aggr_grp_t	*grp = port->lp_grp;
305 
306 	mutex_enter(&grp->lg_port_lock);
307 	if (--grp->lg_port_ref == 0)
308 		cv_signal(&grp->lg_port_cv);
309 	mutex_exit(&grp->lg_port_lock);
310 	AGGR_PORT_REFRELE(port);
311 }
312 
313 /*
314  * Wait for the port's lacp timer thread and the port's notification callback
315  * to exit.
316  */
317 void
318 aggr_grp_port_wait(aggr_grp_t *grp)
319 {
320 	mutex_enter(&grp->lg_port_lock);
321 	if (grp->lg_port_ref != 0)
322 		cv_wait(&grp->lg_port_cv, &grp->lg_port_lock);
323 	mutex_exit(&grp->lg_port_lock);
324 }
325 
326 /*
327  * Attach a port to a link aggregation group.
328  *
329  * A port is attached to a link aggregation group once its speed
330  * and link state have been verified.
331  *
332  * Returns B_TRUE if the group link state or speed has changed. If
333  * it's the case, the caller must notify the MAC layer via a call
334  * to mac_link().
335  */
336 boolean_t
337 aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port)
338 {
339 	boolean_t link_state_changed = B_FALSE;
340 
341 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
342 	ASSERT(MAC_PERIM_HELD(port->lp_mh));
343 
344 	if (port->lp_state == AGGR_PORT_STATE_ATTACHED)
345 		return (B_FALSE);
346 
347 	/*
348 	 * Validate the MAC port link speed and update the group
349 	 * link speed if needed.
350 	 */
351 	if (port->lp_ifspeed == 0 ||
352 	    port->lp_link_state != LINK_STATE_UP ||
353 	    port->lp_link_duplex != LINK_DUPLEX_FULL) {
354 		/*
355 		 * Can't attach a MAC port with unknown link speed,
356 		 * down link, or not in full duplex mode.
357 		 */
358 		return (B_FALSE);
359 	}
360 
361 	mutex_enter(&grp->lg_stat_lock);
362 	if (grp->lg_ifspeed == 0) {
363 		/*
364 		 * The group inherits the speed of the first link being
365 		 * attached.
366 		 */
367 		grp->lg_ifspeed = port->lp_ifspeed;
368 		link_state_changed = B_TRUE;
369 	} else if (grp->lg_ifspeed != port->lp_ifspeed) {
370 		/*
371 		 * The link speed of the MAC port must be the same as
372 		 * the group link speed, as per 802.3ad. Since it is
373 		 * not, the attach is cancelled.
374 		 */
375 		mutex_exit(&grp->lg_stat_lock);
376 		return (B_FALSE);
377 	}
378 	mutex_exit(&grp->lg_stat_lock);
379 
380 	grp->lg_nattached_ports++;
381 
382 	/*
383 	 * Update the group link state.
384 	 */
385 	if (grp->lg_link_state != LINK_STATE_UP) {
386 		grp->lg_link_state = LINK_STATE_UP;
387 		mutex_enter(&grp->lg_stat_lock);
388 		grp->lg_link_duplex = LINK_DUPLEX_FULL;
389 		mutex_exit(&grp->lg_stat_lock);
390 		link_state_changed = B_TRUE;
391 	}
392 
393 	/*
394 	 * Update port's state.
395 	 */
396 	port->lp_state = AGGR_PORT_STATE_ATTACHED;
397 
398 	aggr_grp_multicst_port(port, B_TRUE);
399 
400 	/*
401 	 * The port client doesn't have an Rx SRS; instead of calling
402 	 * mac_rx_set() we set the client's flow callback directly.
403 	 * This datapath is used only when the port's driver doesn't
404 	 * support MAC_CAPAB_RINGS. Drivers with ring support will
405 	 * deliver traffic to the aggr via ring passthru.
406 	 */
407 	mac_client_set_flow_cb(port->lp_mch, aggr_recv_cb, port);
408 
409 	/*
410 	 * If LACP is OFF, the port can be used to send data as soon
411 	 * as its link is up and verified to be compatible with the
412 	 * aggregation.
413 	 *
414 	 * If LACP is active or passive, notify the LACP subsystem, which
415 	 * will enable sending on the port following the LACP protocol.
416 	 */
417 	if (grp->lg_lacp_mode == AGGR_LACP_OFF)
418 		aggr_send_port_enable(port);
419 	else
420 		aggr_lacp_port_attached(port);
421 
422 	return (link_state_changed);
423 }
424 
425 boolean_t
426 aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port)
427 {
428 	boolean_t link_state_changed = B_FALSE;
429 
430 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
431 	ASSERT(MAC_PERIM_HELD(port->lp_mh));
432 
433 	/* update state */
434 	if (port->lp_state != AGGR_PORT_STATE_ATTACHED)
435 		return (B_FALSE);
436 
437 	mac_client_clear_flow_cb(port->lp_mch);
438 
439 	aggr_grp_multicst_port(port, B_FALSE);
440 
441 	if (grp->lg_lacp_mode == AGGR_LACP_OFF)
442 		aggr_send_port_disable(port);
443 	else
444 		aggr_lacp_port_detached(port);
445 
446 	port->lp_state = AGGR_PORT_STATE_STANDBY;
447 
448 	grp->lg_nattached_ports--;
449 	if (grp->lg_nattached_ports == 0) {
450 		/* the last attached MAC port of the group is being detached */
451 		grp->lg_link_state = LINK_STATE_DOWN;
452 		mutex_enter(&grp->lg_stat_lock);
453 		grp->lg_ifspeed = 0;
454 		grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
455 		mutex_exit(&grp->lg_stat_lock);
456 		link_state_changed = B_TRUE;
457 	}
458 
459 	return (link_state_changed);
460 }
461 
462 /*
463  * Update the MAC addresses of the constituent ports of the specified
464  * group. This function is invoked:
465  * - after creating a new aggregation group.
466  * - after adding new ports to an aggregation group.
467  * - after removing a port from a group when the MAC address of
468  *   that port was used for the MAC address of the group.
469  * - after the MAC address of a port changed when the MAC address
470  *   of that port was used for the MAC address of the group.
471  *
472  * Return true if the link state of the aggregation changed, for example
473  * as a result of a failure changing the MAC address of one of the
474  * constituent ports.
475  */
476 boolean_t
477 aggr_grp_update_ports_mac(aggr_grp_t *grp)
478 {
479 	aggr_port_t *cport;
480 	boolean_t link_state_changed = B_FALSE;
481 	mac_perim_handle_t mph;
482 
483 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
484 
485 	for (cport = grp->lg_ports; cport != NULL;
486 	    cport = cport->lp_next) {
487 		mac_perim_enter_by_mh(cport->lp_mh, &mph);
488 		if (aggr_port_unicst(cport) != 0) {
489 			if (aggr_grp_detach_port(grp, cport))
490 				link_state_changed = B_TRUE;
491 		} else {
492 			/*
493 			 * If a port was detached because of a previous
494 			 * failure changing the MAC address, the port is
495 			 * reattached when it successfully changes the MAC
496 			 * address now, and this might cause the link state
497 			 * of the aggregation to change.
498 			 */
499 			if (aggr_grp_attach_port(grp, cport))
500 				link_state_changed = B_TRUE;
501 		}
502 		mac_perim_exit(mph);
503 	}
504 	return (link_state_changed);
505 }
506 
507 /*
508  * Invoked when the MAC address of a port has changed. If the port's
509  * MAC address was used for the group MAC address, set mac_addr_changedp
510  * to B_TRUE to indicate to the caller that it should send a MAC_NOTE_UNICST
511  * notification. If the link state changes due to detach/attach of
512  * the constituent port, set link_state_changedp to B_TRUE to indicate
513  * to the caller that it should send a MAC_NOTE_LINK notification. In both
514  * cases, it is the responsibility of the caller to invoke notification
515  * functions after releasing the the port lock.
516  */
517 void
518 aggr_grp_port_mac_changed(aggr_grp_t *grp, aggr_port_t *port,
519     boolean_t *mac_addr_changedp, boolean_t *link_state_changedp)
520 {
521 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
522 	ASSERT(MAC_PERIM_HELD(port->lp_mh));
523 	ASSERT(mac_addr_changedp != NULL);
524 	ASSERT(link_state_changedp != NULL);
525 
526 	*mac_addr_changedp = B_FALSE;
527 	*link_state_changedp = B_FALSE;
528 
529 	if (grp->lg_addr_fixed) {
530 		/*
531 		 * The group is using a fixed MAC address or an automatic
532 		 * MAC address has not been set.
533 		 */
534 		return;
535 	}
536 
537 	if (grp->lg_mac_addr_port == port) {
538 		/*
539 		 * The MAC address of the port was assigned to the group
540 		 * MAC address. Update the group MAC address.
541 		 */
542 		bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL);
543 		*mac_addr_changedp = B_TRUE;
544 	} else {
545 		/*
546 		 * Update the actual port MAC address to the MAC address
547 		 * of the group.
548 		 */
549 		if (aggr_port_unicst(port) != 0) {
550 			*link_state_changedp = aggr_grp_detach_port(grp, port);
551 		} else {
552 			/*
553 			 * If a port was detached because of a previous
554 			 * failure changing the MAC address, the port is
555 			 * reattached when it successfully changes the MAC
556 			 * address now, and this might cause the link state
557 			 * of the aggregation to change.
558 			 */
559 			*link_state_changedp = aggr_grp_attach_port(grp, port);
560 		}
561 	}
562 }
563 
564 /*
565  * Add a port to a link aggregation group.
566  */
567 static int
568 aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t port_linkid, boolean_t force,
569     aggr_port_t **pp)
570 {
571 	aggr_port_t *port, **cport;
572 	mac_perim_handle_t mph;
573 	zoneid_t port_zoneid = ALL_ZONES;
574 	int err;
575 
576 	/* The port must be in the same zone as the aggregation. */
577 	if (zone_check_datalink(&port_zoneid, port_linkid) != 0)
578 		port_zoneid = GLOBAL_ZONEID;
579 	if (grp->lg_zoneid != port_zoneid)
580 		return (EBUSY);
581 
582 	/*
583 	 * If we are creating the aggr, then there is no MAC handle
584 	 * and thus no perimeter to hold. If we are adding a port to
585 	 * an existing aggr, then the perimiter of the aggr's MAC must
586 	 * be held.
587 	 */
588 	ASSERT(grp->lg_mh == NULL || MAC_PERIM_HELD(grp->lg_mh));
589 
590 	err = aggr_port_create(grp, port_linkid, force, &port);
591 	if (err != 0)
592 		return (err);
593 
594 	mac_perim_enter_by_mh(port->lp_mh, &mph);
595 
596 	/* Add the new port to the end of the list. */
597 	cport = &grp->lg_ports;
598 	while (*cport != NULL)
599 		cport = &((*cport)->lp_next);
600 	*cport = port;
601 
602 	/*
603 	 * Back reference to the group it is member of. A port always
604 	 * holds a reference to its group to ensure that the back
605 	 * reference is always valid.
606 	 */
607 	port->lp_grp = grp;
608 	AGGR_GRP_REFHOLD(grp);
609 	grp->lg_nports++;
610 
611 	aggr_lacp_init_port(port);
612 	mac_perim_exit(mph);
613 
614 	if (pp != NULL)
615 		*pp = port;
616 
617 	return (0);
618 }
619 
620 /*
621  * This is called when the 'lg_tx_ports' arrangement has changed and
622  * we need to update the corresponding 'mi_default_tx_ring'. This
623  * happens for several reasons.
624  *
625  *     - A pseudo TX mac group was added or removed.
626  *     - An LACP message has changed the port's state.
627  *     - A link event has changed the port's state.
628  *
629  * In any case, we see if there is at least one port enabled (see
630  * 'aggr_send_port_enable()'), and if so we use its first ring as the
631  * mac's default TX ring.
632  *
633  * Note, because we only have a single TX group, we don't have to
634  * worry about the rings moving between groups and the chance that mac
635  * will reassign it unless someone removes a port, at which point, we
636  * play it safe and call this again.
637  */
638 void
639 aggr_grp_update_default(aggr_grp_t *grp)
640 {
641 	aggr_port_t *port;
642 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
643 
644 	rw_enter(&grp->lg_tx_lock, RW_WRITER);
645 
646 	if (grp->lg_ntx_ports == 0) {
647 		rw_exit(&grp->lg_tx_lock);
648 		return;
649 	}
650 
651 	port = grp->lg_tx_ports[0];
652 	ASSERT(port->lp_tx_ring_cnt > 0);
653 	mac_hwring_set_default(grp->lg_mh, port->lp_pseudo_tx_rings[0]);
654 	rw_exit(&grp->lg_tx_lock);
655 }
656 
657 /*
658  * Add a pseudo RX ring for the given HW ring handle.
659  */
660 static int
661 aggr_add_pseudo_rx_ring(aggr_port_t *port,
662     aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
663 {
664 	aggr_pseudo_rx_ring_t	*ring;
665 	int			err;
666 	int			j;
667 
668 	for (j = 0; j < MAX_RINGS_PER_GROUP; j++) {
669 		ring = rx_grp->arg_rings + j;
670 		if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE))
671 			break;
672 	}
673 
674 	/*
675 	 * No slot for this new RX ring.
676 	 */
677 	if (j == MAX_RINGS_PER_GROUP)
678 		return (EIO);
679 
680 	ring->arr_flags |= MAC_PSEUDO_RING_INUSE;
681 	ring->arr_hw_rh = hw_rh;
682 	ring->arr_port = port;
683 	ring->arr_grp = rx_grp;
684 	rx_grp->arg_ring_cnt++;
685 
686 	/*
687 	 * The group is already registered, dynamically add a new ring to the
688 	 * mac group.
689 	 */
690 	if ((err = mac_group_add_ring(rx_grp->arg_gh, j)) != 0) {
691 		ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
692 		ring->arr_hw_rh = NULL;
693 		ring->arr_port = NULL;
694 		ring->arr_grp = NULL;
695 		rx_grp->arg_ring_cnt--;
696 	} else {
697 		/*
698 		 * This must run after the MAC is registered.
699 		 */
700 		ASSERT3P(ring->arr_rh, !=, NULL);
701 		mac_hwring_set_passthru(hw_rh, (mac_rx_t)aggr_recv_cb,
702 		    (void *)port, (mac_resource_handle_t)ring);
703 	}
704 	return (err);
705 }
706 
707 /*
708  * Remove the pseudo RX ring of the given HW ring handle.
709  */
710 static void
711 aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
712 {
713 	for (uint_t j = 0; j < MAX_RINGS_PER_GROUP; j++) {
714 		aggr_pseudo_rx_ring_t *ring = rx_grp->arg_rings + j;
715 
716 		if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE) ||
717 		    ring->arr_hw_rh != hw_rh) {
718 			continue;
719 		}
720 
721 		mac_group_rem_ring(rx_grp->arg_gh, ring->arr_rh);
722 
723 		ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
724 		ring->arr_hw_rh = NULL;
725 		ring->arr_port = NULL;
726 		ring->arr_grp = NULL;
727 		rx_grp->arg_ring_cnt--;
728 		mac_hwring_clear_passthru(hw_rh);
729 		break;
730 	}
731 }
732 
733 /*
734  * Create pseudo rings over the HW rings of the port.
735  *
736  * o Create a pseudo ring in rx_grp per HW ring in the port's HW group.
737  *
738  * o Program existing unicast filters on the pseudo group into the HW group.
739  *
740  * o Program existing VLAN filters on the pseudo group into the HW group.
741  */
742 static int
743 aggr_add_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
744 {
745 	mac_ring_handle_t	hw_rh[MAX_RINGS_PER_GROUP];
746 	aggr_unicst_addr_t	*addr, *a;
747 	mac_perim_handle_t	pmph;
748 	aggr_vlan_t		*avp;
749 	uint_t			hw_rh_cnt, i;
750 	int			err = 0;
751 	uint_t			g_idx = rx_grp->arg_index;
752 
753 	ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh));
754 	ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT);
755 	mac_perim_enter_by_mh(port->lp_mh, &pmph);
756 
757 	i = 0;
758 	addr = NULL;
759 	/*
760 	 * This function must be called after the aggr registers its
761 	 * MAC and its Rx groups have been initialized.
762 	 */
763 	ASSERT(rx_grp->arg_gh != NULL);
764 
765 	/*
766 	 * Get the list of the underlying HW rings.
767 	 */
768 	hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx,
769 	    &port->lp_hwghs[g_idx], hw_rh, MAC_RING_TYPE_RX);
770 
771 	/*
772 	 * Add existing VLAN and unicast address filters to the port.
773 	 */
774 	for (avp = list_head(&rx_grp->arg_vlans); avp != NULL;
775 	    avp = list_next(&rx_grp->arg_vlans, avp)) {
776 		if ((err = aggr_port_addvlan(port, g_idx, avp->av_vid)) != 0)
777 			goto err;
778 	}
779 
780 	for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) {
781 		if ((err = aggr_port_addmac(port, g_idx, addr->aua_addr)) != 0)
782 			goto err;
783 	}
784 
785 	for (i = 0; i < hw_rh_cnt; i++) {
786 		err = aggr_add_pseudo_rx_ring(port, rx_grp, hw_rh[i]);
787 		if (err != 0)
788 			goto err;
789 	}
790 
791 	mac_perim_exit(pmph);
792 	return (0);
793 
794 err:
795 	ASSERT(err != 0);
796 
797 	for (uint_t j = 0; j < i; j++)
798 		aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]);
799 
800 	for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next)
801 		aggr_port_remmac(port, g_idx, a->aua_addr);
802 
803 	if (avp != NULL)
804 		avp = list_prev(&rx_grp->arg_vlans, avp);
805 
806 	for (; avp != NULL; avp = list_prev(&rx_grp->arg_vlans, avp)) {
807 		int err2;
808 
809 		if ((err2 = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) {
810 			cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s"
811 			    ": errno %d.", avp->av_vid,
812 			    mac_client_name(port->lp_mch), err2);
813 		}
814 	}
815 
816 	port->lp_hwghs[g_idx] = NULL;
817 	mac_perim_exit(pmph);
818 	return (err);
819 }
820 
821 /*
822  * Destroy the pseudo rings mapping to this port and remove all VLAN
823  * and unicast filters from this port. Even if there are no underlying
824  * HW rings we must still remove the unicast filters to take the port
825  * out of promisc mode.
826  */
827 static void
828 aggr_rem_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
829 {
830 	mac_ring_handle_t	hw_rh[MAX_RINGS_PER_GROUP];
831 	aggr_unicst_addr_t	*addr;
832 	mac_perim_handle_t	pmph;
833 	uint_t			hw_rh_cnt;
834 	uint_t			g_idx = rx_grp->arg_index;
835 
836 	ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh));
837 	ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT);
838 	ASSERT3P(rx_grp->arg_gh, !=, NULL);
839 	mac_perim_enter_by_mh(port->lp_mh, &pmph);
840 
841 	hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx, NULL, hw_rh,
842 	    MAC_RING_TYPE_RX);
843 
844 	for (uint_t i = 0; i < hw_rh_cnt; i++)
845 		aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[i]);
846 
847 	for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next)
848 		aggr_port_remmac(port, g_idx, addr->aua_addr);
849 
850 	for (aggr_vlan_t *avp = list_head(&rx_grp->arg_vlans); avp != NULL;
851 	    avp = list_next(&rx_grp->arg_vlans, avp)) {
852 		int err;
853 
854 		if ((err = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) {
855 			cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s"
856 			    ": errno %d.", avp->av_vid,
857 			    mac_client_name(port->lp_mch), err);
858 		}
859 	}
860 
861 	port->lp_hwghs[g_idx] = NULL;
862 	mac_perim_exit(pmph);
863 }
864 
865 /*
866  * Add a pseudo TX ring for the given HW ring handle.
867  */
868 static int
869 aggr_add_pseudo_tx_ring(aggr_port_t *port,
870     aggr_pseudo_tx_group_t *tx_grp, mac_ring_handle_t hw_rh,
871     mac_ring_handle_t *pseudo_rh)
872 {
873 	aggr_pseudo_tx_ring_t	*ring;
874 	int			err;
875 	int			i;
876 
877 	ASSERT(MAC_PERIM_HELD(port->lp_mh));
878 	for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
879 		ring = tx_grp->atg_rings + i;
880 		if (!(ring->atr_flags & MAC_PSEUDO_RING_INUSE))
881 			break;
882 	}
883 	/*
884 	 * No slot for this new TX ring.
885 	 */
886 	if (i == MAX_RINGS_PER_GROUP)
887 		return (EIO);
888 	/*
889 	 * The following 4 statements needs to be done before
890 	 * calling mac_group_add_ring(). Otherwise it will
891 	 * result in an assertion failure in mac_init_ring().
892 	 */
893 	ring->atr_flags |= MAC_PSEUDO_RING_INUSE;
894 	ring->atr_hw_rh = hw_rh;
895 	ring->atr_port = port;
896 	tx_grp->atg_ring_cnt++;
897 
898 	/*
899 	 * The TX side has no concept of ring groups unlike RX groups.
900 	 * There is just a single group which stores all the TX rings.
901 	 * This group will be used to store aggr's pseudo TX rings.
902 	 */
903 	if ((err = mac_group_add_ring(tx_grp->atg_gh, i)) != 0) {
904 		ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE;
905 		ring->atr_hw_rh = NULL;
906 		ring->atr_port = NULL;
907 		tx_grp->atg_ring_cnt--;
908 	} else {
909 		*pseudo_rh = mac_find_ring(tx_grp->atg_gh, i);
910 		if (hw_rh != NULL) {
911 			mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring,
912 			    mac_find_ring(tx_grp->atg_gh, i));
913 		}
914 	}
915 
916 	return (err);
917 }
918 
919 /*
920  * Remove the pseudo TX ring of the given HW ring handle.
921  */
922 static void
923 aggr_rem_pseudo_tx_ring(aggr_pseudo_tx_group_t *tx_grp,
924     mac_ring_handle_t pseudo_hw_rh)
925 {
926 	aggr_pseudo_tx_ring_t	*ring;
927 	int			i;
928 
929 	for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
930 		ring = tx_grp->atg_rings + i;
931 		if (ring->atr_rh != pseudo_hw_rh)
932 			continue;
933 
934 		ASSERT(ring->atr_flags & MAC_PSEUDO_RING_INUSE);
935 		mac_group_rem_ring(tx_grp->atg_gh, pseudo_hw_rh);
936 		ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE;
937 		mac_hwring_teardown(ring->atr_hw_rh);
938 		ring->atr_hw_rh = NULL;
939 		ring->atr_port = NULL;
940 		tx_grp->atg_ring_cnt--;
941 		break;
942 	}
943 }
944 
945 /*
946  * This function is called to create pseudo rings over hardware rings of
947  * the underlying device. There is a 1:1 mapping between the pseudo TX
948  * rings of the aggr and the hardware rings of the underlying port.
949  */
950 static int
951 aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp)
952 {
953 	aggr_grp_t		*grp = port->lp_grp;
954 	mac_ring_handle_t	hw_rh[MAX_RINGS_PER_GROUP], pseudo_rh;
955 	mac_perim_handle_t	pmph;
956 	int			hw_rh_cnt, i = 0, j;
957 	int			err = 0;
958 
959 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
960 	mac_perim_enter_by_mh(port->lp_mh, &pmph);
961 
962 	/*
963 	 * Get the list the the underlying HW rings.
964 	 */
965 	hw_rh_cnt = mac_hwrings_get(port->lp_mch, NULL, hw_rh,
966 	    MAC_RING_TYPE_TX);
967 
968 	/*
969 	 * Even if the underlying NIC does not have TX rings, we
970 	 * still make a psuedo TX ring for that NIC with NULL as
971 	 * the ring handle.
972 	 */
973 	if (hw_rh_cnt == 0)
974 		port->lp_tx_ring_cnt = 1;
975 	else
976 		port->lp_tx_ring_cnt = hw_rh_cnt;
977 
978 	port->lp_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
979 	    port->lp_tx_ring_cnt), KM_SLEEP);
980 	port->lp_pseudo_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
981 	    port->lp_tx_ring_cnt), KM_SLEEP);
982 
983 	if (hw_rh_cnt == 0) {
984 		if ((err = aggr_add_pseudo_tx_ring(port, tx_grp,
985 		    NULL, &pseudo_rh)) == 0) {
986 			port->lp_tx_rings[0] = NULL;
987 			port->lp_pseudo_tx_rings[0] = pseudo_rh;
988 		}
989 	} else {
990 		for (i = 0; err == 0 && i < hw_rh_cnt; i++) {
991 			err = aggr_add_pseudo_tx_ring(port,
992 			    tx_grp, hw_rh[i], &pseudo_rh);
993 			if (err != 0)
994 				break;
995 			port->lp_tx_rings[i] = hw_rh[i];
996 			port->lp_pseudo_tx_rings[i] = pseudo_rh;
997 		}
998 	}
999 
1000 	if (err != 0) {
1001 		if (hw_rh_cnt != 0) {
1002 			for (j = 0; j < i; j++) {
1003 				aggr_rem_pseudo_tx_ring(tx_grp,
1004 				    port->lp_pseudo_tx_rings[j]);
1005 			}
1006 		}
1007 		kmem_free(port->lp_tx_rings,
1008 		    (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
1009 		kmem_free(port->lp_pseudo_tx_rings,
1010 		    (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
1011 		port->lp_tx_ring_cnt = 0;
1012 	} else {
1013 		port->lp_tx_grp_added = B_TRUE;
1014 		port->lp_tx_notify_mh = mac_client_tx_notify(port->lp_mch,
1015 		    aggr_tx_ring_update, port);
1016 	}
1017 	mac_perim_exit(pmph);
1018 	aggr_grp_update_default(grp);
1019 	return (err);
1020 }
1021 
1022 /*
1023  * This function is called by aggr to remove pseudo TX rings over the
1024  * HW rings of the underlying port.
1025  */
1026 static void
1027 aggr_rem_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp)
1028 {
1029 	aggr_grp_t		*grp = port->lp_grp;
1030 	mac_perim_handle_t	pmph;
1031 	int			i;
1032 
1033 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1034 	mac_perim_enter_by_mh(port->lp_mh, &pmph);
1035 
1036 	if (!port->lp_tx_grp_added)
1037 		goto done;
1038 
1039 	ASSERT(tx_grp->atg_gh != NULL);
1040 
1041 	for (i = 0; i < port->lp_tx_ring_cnt; i++)
1042 		aggr_rem_pseudo_tx_ring(tx_grp, port->lp_pseudo_tx_rings[i]);
1043 
1044 	kmem_free(port->lp_tx_rings,
1045 	    (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
1046 	kmem_free(port->lp_pseudo_tx_rings,
1047 	    (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
1048 
1049 	port->lp_tx_ring_cnt = 0;
1050 	(void) mac_client_tx_notify(port->lp_mch, NULL, port->lp_tx_notify_mh);
1051 	port->lp_tx_grp_added = B_FALSE;
1052 	aggr_grp_update_default(grp);
1053 done:
1054 	mac_perim_exit(pmph);
1055 }
1056 
1057 static int
1058 aggr_pseudo_disable_intr(mac_intr_handle_t ih)
1059 {
1060 	aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
1061 	return (mac_hwring_disable_intr(rr_ring->arr_hw_rh));
1062 }
1063 
1064 static int
1065 aggr_pseudo_enable_intr(mac_intr_handle_t ih)
1066 {
1067 	aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
1068 	return (mac_hwring_enable_intr(rr_ring->arr_hw_rh));
1069 }
1070 
1071 /*
1072  * Start the pseudo ring. Since the pseudo ring is just an abstraction
1073  * over an actual HW ring, the real task is to start the underlying HW
1074  * ring.
1075  */
1076 static int
1077 aggr_pseudo_start_rx_ring(mac_ring_driver_t arg, uint64_t mr_gen)
1078 {
1079 	int err;
1080 	aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
1081 
1082 	err = mac_hwring_start(rr_ring->arr_hw_rh);
1083 
1084 	if (err != 0)
1085 		return (err);
1086 
1087 	rr_ring->arr_gen = mr_gen;
1088 	return (err);
1089 }
1090 
1091 /*
1092  * Stop the pseudo ring. Since the pseudo ring is just an abstraction
1093  * over an actual HW ring, the real task is to stop the underlying HW
1094  * ring.
1095  */
1096 static void
1097 aggr_pseudo_stop_rx_ring(mac_ring_driver_t arg)
1098 {
1099 	aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
1100 
1101 	/*
1102 	 * The rings underlying the default group must stay up to
1103 	 * continue receiving LACP traffic. We would normally never
1104 	 * stop the default Rx rings because of the primary MAC
1105 	 * client; but aggr's primary MAC client doesn't call
1106 	 * mac_unicast_add() and thus mi_active is 0 when the last
1107 	 * non-primary client is deleted.
1108 	 */
1109 	if (rr_ring->arr_grp->arg_index != 0)
1110 		mac_hwring_stop(rr_ring->arr_hw_rh);
1111 }
1112 
1113 /*
1114  * Add one or more ports to an existing link aggregation group.
1115  */
1116 int
1117 aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force,
1118     laioc_port_t *ports)
1119 {
1120 	int rc;
1121 	uint_t port_added = 0;
1122 	uint_t grp_added;
1123 	aggr_grp_t *grp = NULL;
1124 	aggr_port_t *port;
1125 	boolean_t link_state_changed = B_FALSE;
1126 	mac_perim_handle_t mph, pmph;
1127 
1128 	/* Get the aggr corresponding to linkid. */
1129 	rw_enter(&aggr_grp_lock, RW_READER);
1130 	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1131 	    (mod_hash_val_t *)&grp) != 0) {
1132 		rw_exit(&aggr_grp_lock);
1133 		return (ENOENT);
1134 	}
1135 	AGGR_GRP_REFHOLD(grp);
1136 
1137 	/*
1138 	 * Hold the perimeter so that the aggregation can't be destroyed.
1139 	 */
1140 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1141 	rw_exit(&aggr_grp_lock);
1142 
1143 	/* Add the specified ports to the aggr. */
1144 	for (uint_t i = 0; i < nports; i++) {
1145 		grp_added = 0;
1146 
1147 		if ((rc = aggr_grp_add_port(grp, ports[i].lp_linkid,
1148 		    force, &port)) != 0) {
1149 			goto bail;
1150 		}
1151 
1152 		ASSERT(port != NULL);
1153 		port_added++;
1154 
1155 		/* check capabilities */
1156 		if (!aggr_grp_capab_check(grp, port) ||
1157 		    !aggr_grp_sdu_check(grp, port) ||
1158 		    !aggr_grp_margin_check(grp, port)) {
1159 			rc = ENOTSUP;
1160 			goto bail;
1161 		}
1162 
1163 		/*
1164 		 * Create the pseudo ring for each HW ring of the underlying
1165 		 * port.
1166 		 */
1167 		rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group);
1168 		if (rc != 0)
1169 			goto bail;
1170 
1171 		for (uint_t j = 0; j < grp->lg_rx_group_count; j++) {
1172 			rc = aggr_add_pseudo_rx_group(port,
1173 			    &grp->lg_rx_groups[j]);
1174 
1175 			if (rc != 0)
1176 				goto bail;
1177 
1178 			grp_added++;
1179 		}
1180 
1181 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
1182 
1183 		/* set LACP mode */
1184 		aggr_port_lacp_set_mode(grp, port);
1185 
1186 		/* start port if group has already been started */
1187 		if (grp->lg_started) {
1188 			rc = aggr_port_start(port);
1189 			if (rc != 0) {
1190 				mac_perim_exit(pmph);
1191 				goto bail;
1192 			}
1193 
1194 			/*
1195 			 * Turn on the promiscuous mode over the port when it
1196 			 * is requested to be turned on to receive the
1197 			 * non-primary address over a port, or the promiscuous
1198 			 * mode is enabled over the aggr.
1199 			 */
1200 			if (grp->lg_promisc || port->lp_prom_addr != NULL) {
1201 				rc = aggr_port_promisc(port, B_TRUE);
1202 				if (rc != 0) {
1203 					mac_perim_exit(pmph);
1204 					goto bail;
1205 				}
1206 			}
1207 		}
1208 		mac_perim_exit(pmph);
1209 
1210 		/*
1211 		 * Attach each port if necessary.
1212 		 */
1213 		if (aggr_port_notify_link(grp, port))
1214 			link_state_changed = B_TRUE;
1215 
1216 		/*
1217 		 * Initialize the callback functions for this port.
1218 		 */
1219 		aggr_port_init_callbacks(port);
1220 	}
1221 
1222 	/* update the MAC address of the constituent ports */
1223 	if (aggr_grp_update_ports_mac(grp))
1224 		link_state_changed = B_TRUE;
1225 
1226 	if (link_state_changed)
1227 		mac_link_update(grp->lg_mh, grp->lg_link_state);
1228 
1229 bail:
1230 	if (rc != 0) {
1231 		/* stop and remove ports that have been added */
1232 		for (uint_t i = 0; i < port_added; i++) {
1233 			uint_t grp_remove;
1234 
1235 			port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1236 			ASSERT(port != NULL);
1237 
1238 			if (grp->lg_started) {
1239 				mac_perim_enter_by_mh(port->lp_mh, &pmph);
1240 				(void) aggr_port_promisc(port, B_FALSE);
1241 				aggr_port_stop(port);
1242 				mac_perim_exit(pmph);
1243 			}
1244 
1245 			aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1246 
1247 			/*
1248 			 * Only the last port could have a partial set
1249 			 * of groups added.
1250 			 */
1251 			grp_remove = (i + 1 == port_added) ? grp_added :
1252 			    grp->lg_rx_group_count;
1253 
1254 			for (uint_t j = 0; j < grp_remove; j++) {
1255 				aggr_rem_pseudo_rx_group(port,
1256 				    &grp->lg_rx_groups[j]);
1257 			}
1258 
1259 			(void) aggr_grp_rem_port(grp, port, NULL, NULL);
1260 		}
1261 	}
1262 
1263 	mac_perim_exit(mph);
1264 	AGGR_GRP_REFRELE(grp);
1265 	return (rc);
1266 }
1267 
1268 static int
1269 aggr_grp_modify_common(aggr_grp_t *grp, uint8_t update_mask, uint32_t policy,
1270     boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode,
1271     aggr_lacp_timer_t lacp_timer)
1272 {
1273 	boolean_t mac_addr_changed = B_FALSE;
1274 	boolean_t link_state_changed = B_FALSE;
1275 	mac_perim_handle_t pmph;
1276 
1277 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1278 
1279 	/* validate fixed address if specified */
1280 	if ((update_mask & AGGR_MODIFY_MAC) && mac_fixed &&
1281 	    ((bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) ||
1282 	    (mac_addr[0] & 0x01))) {
1283 		return (EINVAL);
1284 	}
1285 
1286 	/* update policy if requested */
1287 	if (update_mask & AGGR_MODIFY_POLICY)
1288 		aggr_send_update_policy(grp, policy);
1289 
1290 	/* update unicast MAC address if requested */
1291 	if (update_mask & AGGR_MODIFY_MAC) {
1292 		if (mac_fixed) {
1293 			/* user-supplied MAC address */
1294 			grp->lg_mac_addr_port = NULL;
1295 			if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) != 0) {
1296 				bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
1297 				mac_addr_changed = B_TRUE;
1298 			}
1299 		} else if (grp->lg_addr_fixed) {
1300 			/* switch from user-supplied to automatic */
1301 			aggr_port_t *port = grp->lg_ports;
1302 
1303 			mac_perim_enter_by_mh(port->lp_mh, &pmph);
1304 			bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL);
1305 			grp->lg_mac_addr_port = port;
1306 			mac_addr_changed = B_TRUE;
1307 			mac_perim_exit(pmph);
1308 		}
1309 		grp->lg_addr_fixed = mac_fixed;
1310 	}
1311 
1312 	if (mac_addr_changed)
1313 		link_state_changed = aggr_grp_update_ports_mac(grp);
1314 
1315 	if (update_mask & AGGR_MODIFY_LACP_MODE)
1316 		aggr_lacp_update_mode(grp, lacp_mode);
1317 
1318 	if (update_mask & AGGR_MODIFY_LACP_TIMER)
1319 		aggr_lacp_update_timer(grp, lacp_timer);
1320 
1321 	if (link_state_changed)
1322 		mac_link_update(grp->lg_mh, grp->lg_link_state);
1323 
1324 	if (mac_addr_changed)
1325 		mac_unicst_update(grp->lg_mh, grp->lg_addr);
1326 
1327 	return (0);
1328 }
1329 
1330 /*
1331  * Update properties of an existing link aggregation group.
1332  */
1333 int
1334 aggr_grp_modify(datalink_id_t linkid, uint8_t update_mask, uint32_t policy,
1335     boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode,
1336     aggr_lacp_timer_t lacp_timer)
1337 {
1338 	aggr_grp_t *grp = NULL;
1339 	mac_perim_handle_t mph;
1340 	int err;
1341 
1342 	/* get group corresponding to linkid */
1343 	rw_enter(&aggr_grp_lock, RW_READER);
1344 	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1345 	    (mod_hash_val_t *)&grp) != 0) {
1346 		rw_exit(&aggr_grp_lock);
1347 		return (ENOENT);
1348 	}
1349 	AGGR_GRP_REFHOLD(grp);
1350 
1351 	/*
1352 	 * Hold the perimeter so that the aggregation won't be destroyed.
1353 	 */
1354 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1355 	rw_exit(&aggr_grp_lock);
1356 
1357 	err = aggr_grp_modify_common(grp, update_mask, policy, mac_fixed,
1358 	    mac_addr, lacp_mode, lacp_timer);
1359 
1360 	mac_perim_exit(mph);
1361 	AGGR_GRP_REFRELE(grp);
1362 	return (err);
1363 }
1364 
1365 /*
1366  * Create a new link aggregation group upon request from administrator.
1367  * Returns 0 on success, an errno on failure.
1368  */
1369 int
1370 aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports,
1371     laioc_port_t *ports, uint32_t policy, boolean_t mac_fixed, boolean_t force,
1372     uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, aggr_lacp_timer_t lacp_timer,
1373     cred_t *credp)
1374 {
1375 	aggr_grp_t *grp = NULL;
1376 	aggr_port_t *port;
1377 	mac_register_t *mac;
1378 	boolean_t link_state_changed;
1379 	mac_perim_handle_t mph;
1380 	int err;
1381 	int i;
1382 	kt_did_t tid = 0;
1383 
1384 	/* need at least one port */
1385 	if (nports == 0)
1386 		return (EINVAL);
1387 
1388 	rw_enter(&aggr_grp_lock, RW_WRITER);
1389 
1390 	/* does a group with the same linkid already exist? */
1391 	err = mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1392 	    (mod_hash_val_t *)&grp);
1393 	if (err == 0) {
1394 		rw_exit(&aggr_grp_lock);
1395 		return (EEXIST);
1396 	}
1397 
1398 	grp = kmem_cache_alloc(aggr_grp_cache, KM_SLEEP);
1399 
1400 	grp->lg_refs = 1;
1401 	grp->lg_closing = B_FALSE;
1402 	grp->lg_force = force;
1403 	grp->lg_linkid = linkid;
1404 	grp->lg_zoneid = crgetzoneid(credp);
1405 	grp->lg_ifspeed = 0;
1406 	grp->lg_link_state = LINK_STATE_UNKNOWN;
1407 	grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
1408 	grp->lg_started = B_FALSE;
1409 	grp->lg_promisc = B_FALSE;
1410 	grp->lg_lacp_done = B_FALSE;
1411 	grp->lg_tx_notify_done = B_FALSE;
1412 	grp->lg_lacp_head = grp->lg_lacp_tail = NULL;
1413 	grp->lg_lacp_rx_thread = thread_create(NULL, 0,
1414 	    aggr_lacp_rx_thread, grp, 0, &p0, TS_RUN, minclsyspri);
1415 	grp->lg_tx_notify_thread = thread_create(NULL, 0,
1416 	    aggr_tx_notify_thread, grp, 0, &p0, TS_RUN, minclsyspri);
1417 	grp->lg_tx_blocked_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
1418 	    MAX_RINGS_PER_GROUP), KM_SLEEP);
1419 	grp->lg_tx_blocked_cnt = 0;
1420 	bzero(&grp->lg_rx_groups,
1421 	    sizeof (aggr_pseudo_rx_group_t) * MAX_GROUPS_PER_PORT);
1422 	bzero(&grp->lg_tx_group, sizeof (aggr_pseudo_tx_group_t));
1423 	aggr_lacp_init_grp(grp);
1424 
1425 	/* add MAC ports to group */
1426 	grp->lg_ports = NULL;
1427 	grp->lg_nports = 0;
1428 	grp->lg_nattached_ports = 0;
1429 	grp->lg_ntx_ports = 0;
1430 
1431 	/*
1432 	 * If key is not specified by the user, allocate the key.
1433 	 */
1434 	if ((key == 0) && ((key = (uint32_t)id_alloc(key_ids)) == 0)) {
1435 		err = ENOMEM;
1436 		goto bail;
1437 	}
1438 	grp->lg_key = key;
1439 
1440 	for (i = 0; i < nports; i++) {
1441 		err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, &port);
1442 		if (err != 0)
1443 			goto bail;
1444 	}
1445 
1446 	grp->lg_rx_group_count = 1;
1447 
1448 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1449 		uint_t num_rgroups;
1450 
1451 		mac_perim_enter_by_mh(port->lp_mh, &mph);
1452 		num_rgroups = mac_get_num_rx_groups(port->lp_mh);
1453 		mac_perim_exit(mph);
1454 
1455 		/*
1456 		 * Utilize all the groups in a port. If some ports
1457 		 * have less groups than others, then traffic destined
1458 		 * for the same unicast address may be HW classified
1459 		 * on some ports but SW classified by aggr when
1460 		 * arriving on other ports.
1461 		 */
1462 		grp->lg_rx_group_count = MAX(grp->lg_rx_group_count,
1463 		    num_rgroups);
1464 	}
1465 
1466 	/*
1467 	 * There could be cases where the hardware provides more
1468 	 * groups than aggr can support. Make sure we never go above
1469 	 * the max aggr can support.
1470 	 */
1471 	grp->lg_rx_group_count = MIN(grp->lg_rx_group_count,
1472 	    MAX_GROUPS_PER_PORT);
1473 
1474 	ASSERT3U(grp->lg_rx_group_count, >, 0);
1475 	for (i = 0; i < MAX_GROUPS_PER_PORT; i++) {
1476 		grp->lg_rx_groups[i].arg_index = i;
1477 		grp->lg_rx_groups[i].arg_untagged = 0;
1478 		list_create(&(grp->lg_rx_groups[i].arg_vlans),
1479 		    sizeof (aggr_vlan_t), offsetof(aggr_vlan_t, av_link));
1480 	}
1481 
1482 	/*
1483 	 * If no explicit MAC address was specified by the administrator,
1484 	 * set it to the MAC address of the first port.
1485 	 */
1486 	grp->lg_addr_fixed = mac_fixed;
1487 	if (grp->lg_addr_fixed) {
1488 		/* validate specified address */
1489 		if (bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) {
1490 			err = EINVAL;
1491 			goto bail;
1492 		}
1493 		bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
1494 	} else {
1495 		bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
1496 		grp->lg_mac_addr_port = grp->lg_ports;
1497 	}
1498 
1499 	/* Set the initial group capabilities. */
1500 	aggr_grp_capab_set(grp);
1501 
1502 	if ((mac = mac_alloc(MAC_VERSION)) == NULL) {
1503 		err = ENOMEM;
1504 		goto bail;
1505 	}
1506 	mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1507 	mac->m_driver = grp;
1508 	mac->m_dip = aggr_dip;
1509 	mac->m_instance = grp->lg_key > AGGR_MAX_KEY ? (uint_t)-1 : grp->lg_key;
1510 	mac->m_src_addr = grp->lg_addr;
1511 	mac->m_callbacks = &aggr_m_callbacks;
1512 	mac->m_min_sdu = 0;
1513 	mac->m_max_sdu = grp->lg_max_sdu = aggr_grp_max_sdu(grp);
1514 	mac->m_margin = aggr_grp_max_margin(grp);
1515 	mac->m_v12n = MAC_VIRT_LEVEL1;
1516 	err = mac_register(mac, &grp->lg_mh);
1517 	mac_free(mac);
1518 	if (err != 0)
1519 		goto bail;
1520 
1521 	err = dls_devnet_create(grp->lg_mh, grp->lg_linkid, crgetzoneid(credp));
1522 	if (err != 0) {
1523 		(void) mac_unregister(grp->lg_mh);
1524 		grp->lg_mh = NULL;
1525 		goto bail;
1526 	}
1527 
1528 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1529 
1530 	/*
1531 	 * Update the MAC address of the constituent ports.
1532 	 * None of the port is attached at this time, the link state of the
1533 	 * aggregation will not change.
1534 	 *
1535 	 * All ports take on the primary MAC address of the aggr
1536 	 * (lg_aggr). At this point, none of the ports are attached;
1537 	 * thus the link state of the aggregation will not change.
1538 	 */
1539 	link_state_changed = aggr_grp_update_ports_mac(grp);
1540 	ASSERT(!link_state_changed);
1541 
1542 	/* Update outbound load balancing policy. */
1543 	aggr_send_update_policy(grp, policy);
1544 
1545 	/* Set LACP mode. */
1546 	aggr_lacp_set_mode(grp, lacp_mode, lacp_timer);
1547 
1548 	/*
1549 	 * Attach each port if necessary.
1550 	 */
1551 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1552 		/*
1553 		 * Create the pseudo ring for each HW ring of the
1554 		 * underlying port. Note that this is done after the
1555 		 * aggr registers its MAC.
1556 		 */
1557 		VERIFY3S(aggr_add_pseudo_tx_group(port, &grp->lg_tx_group),
1558 		    ==, 0);
1559 
1560 		for (i = 0; i < grp->lg_rx_group_count; i++) {
1561 			VERIFY3S(aggr_add_pseudo_rx_group(port,
1562 			    &grp->lg_rx_groups[i]), ==, 0);
1563 		}
1564 
1565 		if (aggr_port_notify_link(grp, port))
1566 			link_state_changed = B_TRUE;
1567 
1568 		/*
1569 		 * Initialize the callback functions for this port.
1570 		 */
1571 		aggr_port_init_callbacks(port);
1572 	}
1573 
1574 	if (link_state_changed)
1575 		mac_link_update(grp->lg_mh, grp->lg_link_state);
1576 
1577 	/* add new group to hash table */
1578 	err = mod_hash_insert(aggr_grp_hash, GRP_HASH_KEY(linkid),
1579 	    (mod_hash_val_t)grp);
1580 	ASSERT(err == 0);
1581 	aggr_grp_cnt++;
1582 
1583 	mac_perim_exit(mph);
1584 	rw_exit(&aggr_grp_lock);
1585 	return (0);
1586 
1587 bail:
1588 
1589 	grp->lg_closing = B_TRUE;
1590 
1591 	port = grp->lg_ports;
1592 	while (port != NULL) {
1593 		aggr_port_t *cport;
1594 
1595 		cport = port->lp_next;
1596 		aggr_port_delete(port);
1597 		port = cport;
1598 	}
1599 
1600 	/*
1601 	 * Inform the lacp_rx thread to exit.
1602 	 */
1603 	mutex_enter(&grp->lg_lacp_lock);
1604 	grp->lg_lacp_done = B_TRUE;
1605 	cv_signal(&grp->lg_lacp_cv);
1606 	while (grp->lg_lacp_rx_thread != NULL)
1607 		cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock);
1608 	mutex_exit(&grp->lg_lacp_lock);
1609 	/*
1610 	 * Inform the tx_notify thread to exit.
1611 	 */
1612 	mutex_enter(&grp->lg_tx_flowctl_lock);
1613 	if (grp->lg_tx_notify_thread != NULL) {
1614 		tid = grp->lg_tx_notify_thread->t_did;
1615 		grp->lg_tx_notify_done = B_TRUE;
1616 		cv_signal(&grp->lg_tx_flowctl_cv);
1617 	}
1618 	mutex_exit(&grp->lg_tx_flowctl_lock);
1619 	if (tid != 0)
1620 		thread_join(tid);
1621 
1622 	kmem_free(grp->lg_tx_blocked_rings,
1623 	    (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP));
1624 	rw_exit(&aggr_grp_lock);
1625 	AGGR_GRP_REFRELE(grp);
1626 	return (err);
1627 }
1628 
1629 /*
1630  * Return a pointer to the member of a group with specified linkid.
1631  */
1632 static aggr_port_t *
1633 aggr_grp_port_lookup(aggr_grp_t *grp, datalink_id_t linkid)
1634 {
1635 	aggr_port_t *port;
1636 
1637 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1638 
1639 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1640 		if (port->lp_linkid == linkid)
1641 			break;
1642 	}
1643 
1644 	return (port);
1645 }
1646 
1647 /*
1648  * Stop, detach and remove a port from a link aggregation group.
1649  */
1650 static int
1651 aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port,
1652     boolean_t *mac_addr_changedp, boolean_t *link_state_changedp)
1653 {
1654 	int rc = 0;
1655 	aggr_port_t **pport;
1656 	boolean_t mac_addr_changed = B_FALSE;
1657 	boolean_t link_state_changed = B_FALSE;
1658 	mac_perim_handle_t mph;
1659 	uint64_t val;
1660 	uint_t i;
1661 	uint_t stat;
1662 
1663 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1664 	ASSERT(grp->lg_nports > 1);
1665 	ASSERT(!grp->lg_closing);
1666 
1667 	/* unlink port */
1668 	for (pport = &grp->lg_ports; *pport != port;
1669 	    pport = &(*pport)->lp_next) {
1670 		if (*pport == NULL) {
1671 			rc = ENOENT;
1672 			goto done;
1673 		}
1674 	}
1675 	*pport = port->lp_next;
1676 
1677 	mac_perim_enter_by_mh(port->lp_mh, &mph);
1678 
1679 	/*
1680 	 * If the MAC address of the port being removed was assigned
1681 	 * to the group, update the group MAC address
1682 	 * using the MAC address of a different port.
1683 	 */
1684 	if (!grp->lg_addr_fixed && grp->lg_mac_addr_port == port) {
1685 		/*
1686 		 * Set the MAC address of the group to the
1687 		 * MAC address of its first port.
1688 		 */
1689 		bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
1690 		grp->lg_mac_addr_port = grp->lg_ports;
1691 		mac_addr_changed = B_TRUE;
1692 	}
1693 
1694 	link_state_changed = aggr_grp_detach_port(grp, port);
1695 
1696 	/*
1697 	 * Add the counter statistics of the ports while it was aggregated
1698 	 * to the group's residual statistics.  This is done by obtaining
1699 	 * the current counter from the underlying MAC then subtracting the
1700 	 * value of the counter at the moment it was added to the
1701 	 * aggregation.
1702 	 */
1703 	for (i = 0; i < MAC_NSTAT; i++) {
1704 		stat = i + MAC_STAT_MIN;
1705 		if (!MAC_STAT_ISACOUNTER(stat))
1706 			continue;
1707 		val = aggr_port_stat(port, stat);
1708 		val -= port->lp_stat[i];
1709 		mutex_enter(&grp->lg_stat_lock);
1710 		grp->lg_stat[i] += val;
1711 		mutex_exit(&grp->lg_stat_lock);
1712 	}
1713 	for (i = 0; i < ETHER_NSTAT; i++) {
1714 		stat = i + MACTYPE_STAT_MIN;
1715 		if (!ETHER_STAT_ISACOUNTER(stat))
1716 			continue;
1717 		val = aggr_port_stat(port, stat);
1718 		val -= port->lp_ether_stat[i];
1719 		mutex_enter(&grp->lg_stat_lock);
1720 		grp->lg_ether_stat[i] += val;
1721 		mutex_exit(&grp->lg_stat_lock);
1722 	}
1723 
1724 	grp->lg_nports--;
1725 	mac_perim_exit(mph);
1726 
1727 	aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1728 	aggr_port_delete(port);
1729 
1730 	/*
1731 	 * If the group MAC address has changed, update the MAC address of
1732 	 * the remaining constituent ports according to the new MAC
1733 	 * address of the group.
1734 	 */
1735 	if (mac_addr_changed && aggr_grp_update_ports_mac(grp))
1736 		link_state_changed = B_TRUE;
1737 
1738 done:
1739 	if (mac_addr_changedp != NULL)
1740 		*mac_addr_changedp = mac_addr_changed;
1741 	if (link_state_changedp != NULL)
1742 		*link_state_changedp = link_state_changed;
1743 
1744 	return (rc);
1745 }
1746 
1747 /*
1748  * Remove one or more ports from an existing link aggregation group.
1749  */
1750 int
1751 aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports)
1752 {
1753 	int rc = 0;
1754 	uint_t i;
1755 	aggr_grp_t *grp = NULL;
1756 	aggr_port_t *port;
1757 	boolean_t mac_addr_update = B_FALSE, mac_addr_changed;
1758 	boolean_t link_state_update = B_FALSE, link_state_changed;
1759 	mac_perim_handle_t mph, pmph;
1760 
1761 	/* get group corresponding to linkid */
1762 	rw_enter(&aggr_grp_lock, RW_READER);
1763 	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1764 	    (mod_hash_val_t *)&grp) != 0) {
1765 		rw_exit(&aggr_grp_lock);
1766 		return (ENOENT);
1767 	}
1768 	AGGR_GRP_REFHOLD(grp);
1769 
1770 	/*
1771 	 * Hold the perimeter so that the aggregation won't be destroyed.
1772 	 */
1773 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1774 	rw_exit(&aggr_grp_lock);
1775 
1776 	/* we need to keep at least one port per group */
1777 	if (nports >= grp->lg_nports) {
1778 		rc = EINVAL;
1779 		goto bail;
1780 	}
1781 
1782 	/* first verify that all the groups are valid */
1783 	for (i = 0; i < nports; i++) {
1784 		if (aggr_grp_port_lookup(grp, ports[i].lp_linkid) == NULL) {
1785 			/* port not found */
1786 			rc = ENOENT;
1787 			goto bail;
1788 		}
1789 	}
1790 
1791 	/* clear the promiscous mode for the specified ports */
1792 	for (i = 0; i < nports && rc == 0; i++) {
1793 		/* lookup port */
1794 		port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1795 		ASSERT(port != NULL);
1796 
1797 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
1798 		rc = aggr_port_promisc(port, B_FALSE);
1799 		mac_perim_exit(pmph);
1800 	}
1801 	if (rc != 0) {
1802 		for (i = 0; i < nports; i++) {
1803 			port = aggr_grp_port_lookup(grp,
1804 			    ports[i].lp_linkid);
1805 			ASSERT(port != NULL);
1806 
1807 			/*
1808 			 * Turn the promiscuous mode back on if it is required
1809 			 * to receive the non-primary address over a port, or
1810 			 * the promiscous mode is enabled over the aggr.
1811 			 */
1812 			mac_perim_enter_by_mh(port->lp_mh, &pmph);
1813 			if (port->lp_started && (grp->lg_promisc ||
1814 			    port->lp_prom_addr != NULL)) {
1815 				(void) aggr_port_promisc(port, B_TRUE);
1816 			}
1817 			mac_perim_exit(pmph);
1818 		}
1819 		goto bail;
1820 	}
1821 
1822 	/* remove the specified ports from group */
1823 	for (i = 0; i < nports; i++) {
1824 		/* lookup port */
1825 		port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1826 		ASSERT(port != NULL);
1827 
1828 		/* stop port if group has already been started */
1829 		if (grp->lg_started) {
1830 			mac_perim_enter_by_mh(port->lp_mh, &pmph);
1831 			aggr_port_stop(port);
1832 			mac_perim_exit(pmph);
1833 		}
1834 
1835 		/*
1836 		 * aggr_rem_pseudo_tx_group() is not called here. Instead
1837 		 * it is called from inside aggr_grp_rem_port() after the
1838 		 * port has been detached. The reason is that
1839 		 * aggr_rem_pseudo_tx_group() removes one ring at a time
1840 		 * and if there is still traffic going on, then there
1841 		 * is the possibility of aggr_find_tx_ring() returning a
1842 		 * removed ring for transmission. Once the port has been
1843 		 * detached, that port will not be used and
1844 		 * aggr_find_tx_ring() will not return any rings
1845 		 * belonging to it.
1846 		 */
1847 		for (uint_t j = 0; j < grp->lg_rx_group_count; j++)
1848 			aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[j]);
1849 
1850 		/* remove port from group */
1851 		rc = aggr_grp_rem_port(grp, port, &mac_addr_changed,
1852 		    &link_state_changed);
1853 		ASSERT(rc == 0);
1854 		mac_addr_update = mac_addr_update || mac_addr_changed;
1855 		link_state_update = link_state_update || link_state_changed;
1856 	}
1857 
1858 bail:
1859 	if (mac_addr_update)
1860 		mac_unicst_update(grp->lg_mh, grp->lg_addr);
1861 	if (link_state_update)
1862 		mac_link_update(grp->lg_mh, grp->lg_link_state);
1863 
1864 	mac_perim_exit(mph);
1865 	AGGR_GRP_REFRELE(grp);
1866 
1867 	return (rc);
1868 }
1869 
1870 int
1871 aggr_grp_delete(datalink_id_t linkid, cred_t *cred)
1872 {
1873 	aggr_grp_t *grp = NULL;
1874 	aggr_port_t *port, *cport;
1875 	datalink_id_t tmpid;
1876 	mod_hash_val_t val;
1877 	mac_perim_handle_t mph, pmph;
1878 	int err;
1879 	kt_did_t tid = 0;
1880 
1881 	rw_enter(&aggr_grp_lock, RW_WRITER);
1882 
1883 	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1884 	    (mod_hash_val_t *)&grp) != 0) {
1885 		rw_exit(&aggr_grp_lock);
1886 		return (ENOENT);
1887 	}
1888 
1889 	/*
1890 	 * Note that dls_devnet_destroy() must be called before lg_lock is
1891 	 * held. Otherwise, it will deadlock if another thread is in
1892 	 * aggr_m_stat() and thus has a kstat_hold() on the kstats that
1893 	 * dls_devnet_destroy() needs to delete.
1894 	 */
1895 	if ((err = dls_devnet_destroy(grp->lg_mh, &tmpid, B_TRUE)) != 0) {
1896 		rw_exit(&aggr_grp_lock);
1897 		return (err);
1898 	}
1899 	ASSERT(linkid == tmpid);
1900 
1901 	/*
1902 	 * Unregister from the MAC service module. Since this can
1903 	 * fail if a client hasn't closed the MAC port, we gracefully
1904 	 * fail the operation.
1905 	 */
1906 	if ((err = mac_disable(grp->lg_mh)) != 0) {
1907 		(void) dls_devnet_create(grp->lg_mh, linkid, crgetzoneid(cred));
1908 		rw_exit(&aggr_grp_lock);
1909 		return (err);
1910 	}
1911 	(void) mod_hash_remove(aggr_grp_hash, GRP_HASH_KEY(linkid), &val);
1912 	ASSERT(grp == (aggr_grp_t *)val);
1913 
1914 	ASSERT(aggr_grp_cnt > 0);
1915 	aggr_grp_cnt--;
1916 	rw_exit(&aggr_grp_lock);
1917 
1918 	/*
1919 	 * Inform the lacp_rx thread to exit.
1920 	 */
1921 	mutex_enter(&grp->lg_lacp_lock);
1922 	grp->lg_lacp_done = B_TRUE;
1923 	cv_signal(&grp->lg_lacp_cv);
1924 	while (grp->lg_lacp_rx_thread != NULL)
1925 		cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock);
1926 	mutex_exit(&grp->lg_lacp_lock);
1927 	/*
1928 	 * Inform the tx_notify_thread to exit.
1929 	 */
1930 	mutex_enter(&grp->lg_tx_flowctl_lock);
1931 	if (grp->lg_tx_notify_thread != NULL) {
1932 		tid = grp->lg_tx_notify_thread->t_did;
1933 		grp->lg_tx_notify_done = B_TRUE;
1934 		cv_signal(&grp->lg_tx_flowctl_cv);
1935 	}
1936 	mutex_exit(&grp->lg_tx_flowctl_lock);
1937 	if (tid != 0)
1938 		thread_join(tid);
1939 
1940 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1941 
1942 	grp->lg_closing = B_TRUE;
1943 	/* detach and free MAC ports associated with group */
1944 	port = grp->lg_ports;
1945 	while (port != NULL) {
1946 		cport = port->lp_next;
1947 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
1948 		if (grp->lg_started)
1949 			aggr_port_stop(port);
1950 		(void) aggr_grp_detach_port(grp, port);
1951 		mac_perim_exit(pmph);
1952 		aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1953 		for (uint_t i = 0; i < grp->lg_rx_group_count; i++)
1954 			aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[i]);
1955 		aggr_port_delete(port);
1956 		port = cport;
1957 	}
1958 
1959 	mac_perim_exit(mph);
1960 
1961 	kmem_free(grp->lg_tx_blocked_rings,
1962 	    (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP));
1963 	/*
1964 	 * Wait for the port's lacp timer thread and its notification callback
1965 	 * to exit before calling mac_unregister() since both needs to access
1966 	 * the mac perimeter of the grp.
1967 	 */
1968 	aggr_grp_port_wait(grp);
1969 
1970 	VERIFY(mac_unregister(grp->lg_mh) == 0);
1971 	grp->lg_mh = NULL;
1972 
1973 	for (uint_t i = 0; i < MAX_GROUPS_PER_PORT; i++) {
1974 		list_destroy(&(grp->lg_rx_groups[i].arg_vlans));
1975 	}
1976 
1977 	AGGR_GRP_REFRELE(grp);
1978 	return (0);
1979 }
1980 
1981 void
1982 aggr_grp_free(aggr_grp_t *grp)
1983 {
1984 	ASSERT(grp->lg_refs == 0);
1985 	ASSERT(grp->lg_port_ref == 0);
1986 	if (grp->lg_key > AGGR_MAX_KEY) {
1987 		id_free(key_ids, grp->lg_key);
1988 		grp->lg_key = 0;
1989 	}
1990 	kmem_cache_free(aggr_grp_cache, grp);
1991 }
1992 
1993 int
1994 aggr_grp_info(datalink_id_t linkid, void *fn_arg,
1995     aggr_grp_info_new_grp_fn_t new_grp_fn,
1996     aggr_grp_info_new_port_fn_t new_port_fn, cred_t *cred)
1997 {
1998 	aggr_grp_t	*grp;
1999 	aggr_port_t	*port;
2000 	mac_perim_handle_t mph, pmph;
2001 	int		rc = 0;
2002 
2003 	/*
2004 	 * Make sure that the aggregation link is visible from the caller's
2005 	 * zone.
2006 	 */
2007 	if (!dls_devnet_islinkvisible(linkid, crgetzoneid(cred)))
2008 		return (ENOENT);
2009 
2010 	rw_enter(&aggr_grp_lock, RW_READER);
2011 
2012 	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
2013 	    (mod_hash_val_t *)&grp) != 0) {
2014 		rw_exit(&aggr_grp_lock);
2015 		return (ENOENT);
2016 	}
2017 	AGGR_GRP_REFHOLD(grp);
2018 
2019 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2020 	rw_exit(&aggr_grp_lock);
2021 
2022 	rc = new_grp_fn(fn_arg, grp->lg_linkid,
2023 	    (grp->lg_key > AGGR_MAX_KEY) ? 0 : grp->lg_key, grp->lg_addr,
2024 	    grp->lg_addr_fixed, grp->lg_force, grp->lg_tx_policy,
2025 	    grp->lg_nports, grp->lg_lacp_mode, grp->aggr.PeriodicTimer);
2026 
2027 	if (rc != 0)
2028 		goto bail;
2029 
2030 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2031 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
2032 		rc = new_port_fn(fn_arg, port->lp_linkid, port->lp_addr,
2033 		    port->lp_state, &port->lp_lacp.ActorOperPortState);
2034 		mac_perim_exit(pmph);
2035 
2036 		if (rc != 0)
2037 			goto bail;
2038 	}
2039 
2040 bail:
2041 	mac_perim_exit(mph);
2042 	AGGR_GRP_REFRELE(grp);
2043 	return (rc);
2044 }
2045 
2046 /*ARGSUSED*/
2047 static void
2048 aggr_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
2049 {
2050 	miocnak(q, mp, 0, ENOTSUP);
2051 }
2052 
2053 static int
2054 aggr_grp_stat(aggr_grp_t *grp, uint_t stat, uint64_t *val)
2055 {
2056 	aggr_port_t	*port;
2057 	uint_t		stat_index;
2058 
2059 	ASSERT(MUTEX_HELD(&grp->lg_stat_lock));
2060 
2061 	/* We only aggregate counter statistics. */
2062 	if (IS_MAC_STAT(stat) && !MAC_STAT_ISACOUNTER(stat) ||
2063 	    IS_MACTYPE_STAT(stat) && !ETHER_STAT_ISACOUNTER(stat)) {
2064 		return (ENOTSUP);
2065 	}
2066 
2067 	/*
2068 	 * Counter statistics for a group are computed by aggregating the
2069 	 * counters of the members MACs while they were aggregated, plus
2070 	 * the residual counter of the group itself, which is updated each
2071 	 * time a MAC is removed from the group.
2072 	 */
2073 	*val = 0;
2074 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2075 		/* actual port statistic */
2076 		*val += aggr_port_stat(port, stat);
2077 		/*
2078 		 * minus the port stat when it was added, plus any residual
2079 		 * amount for the group.
2080 		 */
2081 		if (IS_MAC_STAT(stat)) {
2082 			stat_index = stat - MAC_STAT_MIN;
2083 			*val -= port->lp_stat[stat_index];
2084 			*val += grp->lg_stat[stat_index];
2085 		} else if (IS_MACTYPE_STAT(stat)) {
2086 			stat_index = stat - MACTYPE_STAT_MIN;
2087 			*val -= port->lp_ether_stat[stat_index];
2088 			*val += grp->lg_ether_stat[stat_index];
2089 		}
2090 	}
2091 	return (0);
2092 }
2093 
2094 int
2095 aggr_rx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
2096 {
2097 	aggr_pseudo_rx_ring_t   *rx_ring = (aggr_pseudo_rx_ring_t *)rdriver;
2098 
2099 	if (rx_ring->arr_hw_rh != NULL) {
2100 		*val = mac_pseudo_rx_ring_stat_get(rx_ring->arr_hw_rh, stat);
2101 	} else {
2102 		aggr_port_t	*port = rx_ring->arr_port;
2103 
2104 		*val = mac_stat_get(port->lp_mh, stat);
2105 
2106 	}
2107 	return (0);
2108 }
2109 
2110 int
2111 aggr_tx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
2112 {
2113 	aggr_pseudo_tx_ring_t   *tx_ring = (aggr_pseudo_tx_ring_t *)rdriver;
2114 
2115 	if (tx_ring->atr_hw_rh != NULL) {
2116 		*val = mac_pseudo_tx_ring_stat_get(tx_ring->atr_hw_rh, stat);
2117 	} else {
2118 		aggr_port_t	*port = tx_ring->atr_port;
2119 
2120 		*val = mac_stat_get(port->lp_mh, stat);
2121 	}
2122 	return (0);
2123 }
2124 
2125 static int
2126 aggr_m_stat(void *arg, uint_t stat, uint64_t *val)
2127 {
2128 	aggr_grp_t		*grp = arg;
2129 	int			rval = 0;
2130 
2131 	mutex_enter(&grp->lg_stat_lock);
2132 
2133 	switch (stat) {
2134 	case MAC_STAT_IFSPEED:
2135 		*val = grp->lg_ifspeed;
2136 		break;
2137 
2138 	case ETHER_STAT_LINK_DUPLEX:
2139 		*val = grp->lg_link_duplex;
2140 		break;
2141 
2142 	default:
2143 		/*
2144 		 * For all other statistics, we return the aggregated stat
2145 		 * from the underlying ports.  aggr_grp_stat() will set
2146 		 * rval appropriately if the statistic isn't a counter.
2147 		 */
2148 		rval = aggr_grp_stat(grp, stat, val);
2149 	}
2150 
2151 	mutex_exit(&grp->lg_stat_lock);
2152 	return (rval);
2153 }
2154 
2155 static int
2156 aggr_m_start(void *arg)
2157 {
2158 	aggr_grp_t *grp = arg;
2159 	aggr_port_t *port;
2160 	mac_perim_handle_t mph, pmph;
2161 
2162 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2163 
2164 	/*
2165 	 * Attempts to start all configured members of the group.
2166 	 * Group members will be attached when their link-up notification
2167 	 * is received.
2168 	 */
2169 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2170 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
2171 		if (aggr_port_start(port) != 0) {
2172 			mac_perim_exit(pmph);
2173 			continue;
2174 		}
2175 
2176 		/*
2177 		 * Turn on the promiscuous mode if it is required to receive
2178 		 * the non-primary address over a port, or the promiscous
2179 		 * mode is enabled over the aggr.
2180 		 */
2181 		if (grp->lg_promisc || port->lp_prom_addr != NULL) {
2182 			if (aggr_port_promisc(port, B_TRUE) != 0)
2183 				aggr_port_stop(port);
2184 		}
2185 		mac_perim_exit(pmph);
2186 	}
2187 
2188 	grp->lg_started = B_TRUE;
2189 
2190 	mac_perim_exit(mph);
2191 	return (0);
2192 }
2193 
2194 static void
2195 aggr_m_stop(void *arg)
2196 {
2197 	aggr_grp_t *grp = arg;
2198 	aggr_port_t *port;
2199 	mac_perim_handle_t mph, pmph;
2200 
2201 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2202 
2203 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2204 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
2205 
2206 		/* reset port promiscuous mode */
2207 		(void) aggr_port_promisc(port, B_FALSE);
2208 
2209 		aggr_port_stop(port);
2210 		mac_perim_exit(pmph);
2211 	}
2212 
2213 	grp->lg_started = B_FALSE;
2214 	mac_perim_exit(mph);
2215 }
2216 
2217 static int
2218 aggr_m_promisc(void *arg, boolean_t on)
2219 {
2220 	aggr_grp_t *grp = arg;
2221 	aggr_port_t *port;
2222 	boolean_t link_state_changed = B_FALSE;
2223 	mac_perim_handle_t mph, pmph;
2224 
2225 	AGGR_GRP_REFHOLD(grp);
2226 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2227 
2228 	ASSERT(!grp->lg_closing);
2229 
2230 	if (on == grp->lg_promisc)
2231 		goto bail;
2232 
2233 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2234 		int	err = 0;
2235 
2236 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
2237 		AGGR_PORT_REFHOLD(port);
2238 		if (!on && (port->lp_prom_addr == NULL))
2239 			err = aggr_port_promisc(port, B_FALSE);
2240 		else if (on && port->lp_started)
2241 			err = aggr_port_promisc(port, B_TRUE);
2242 
2243 		if (err != 0) {
2244 			if (aggr_grp_detach_port(grp, port))
2245 				link_state_changed = B_TRUE;
2246 		} else {
2247 			/*
2248 			 * If a port was detached because of a previous
2249 			 * failure changing the promiscuity, the port
2250 			 * is reattached when it successfully changes
2251 			 * the promiscuity now, and this might cause
2252 			 * the link state of the aggregation to change.
2253 			 */
2254 			if (aggr_grp_attach_port(grp, port))
2255 				link_state_changed = B_TRUE;
2256 		}
2257 		mac_perim_exit(pmph);
2258 		AGGR_PORT_REFRELE(port);
2259 	}
2260 
2261 	grp->lg_promisc = on;
2262 
2263 	if (link_state_changed)
2264 		mac_link_update(grp->lg_mh, grp->lg_link_state);
2265 
2266 bail:
2267 	mac_perim_exit(mph);
2268 	AGGR_GRP_REFRELE(grp);
2269 
2270 	return (0);
2271 }
2272 
2273 static void
2274 aggr_grp_port_rename(const char *new_name, void *arg)
2275 {
2276 	/*
2277 	 * aggr port's mac client name is the format of "aggr link name" plus
2278 	 * AGGR_PORT_NAME_DELIMIT plus "underneath link name".
2279 	 */
2280 	int aggr_len, link_len, clnt_name_len, i;
2281 	char *str_end, *str_st, *str_del;
2282 	char aggr_name[MAXNAMELEN];
2283 	char link_name[MAXNAMELEN];
2284 	char *clnt_name;
2285 	aggr_grp_t *aggr_grp = arg;
2286 	aggr_port_t *aggr_port = aggr_grp->lg_ports;
2287 
2288 	for (i = 0; i < aggr_grp->lg_nports; i++) {
2289 		clnt_name = mac_client_name(aggr_port->lp_mch);
2290 		clnt_name_len = strlen(clnt_name);
2291 		str_st = clnt_name;
2292 		str_end = &(clnt_name[clnt_name_len]);
2293 		str_del = strchr(str_st, AGGR_PORT_NAME_DELIMIT);
2294 		ASSERT(str_del != NULL);
2295 		aggr_len = (intptr_t)((uintptr_t)str_del - (uintptr_t)str_st);
2296 		link_len = (intptr_t)((uintptr_t)str_end - (uintptr_t)str_del);
2297 		bzero(aggr_name, MAXNAMELEN);
2298 		bzero(link_name, MAXNAMELEN);
2299 		bcopy(clnt_name, aggr_name, aggr_len);
2300 		bcopy(str_del, link_name, link_len + 1);
2301 		bzero(clnt_name, MAXNAMELEN);
2302 		(void) snprintf(clnt_name, MAXNAMELEN, "%s%s", new_name,
2303 		    link_name);
2304 
2305 		(void) mac_rename_primary(aggr_port->lp_mh, NULL);
2306 		aggr_port = aggr_port->lp_next;
2307 	}
2308 }
2309 
2310 /*
2311  * Initialize the capabilities that are advertised for the group
2312  * according to the capabilities of the constituent ports.
2313  */
2314 static boolean_t
2315 aggr_m_capab_get(void *arg, mac_capab_t cap, void *cap_data)
2316 {
2317 	aggr_grp_t *grp = arg;
2318 
2319 	switch (cap) {
2320 	case MAC_CAPAB_HCKSUM: {
2321 		uint32_t *hcksum_txflags = cap_data;
2322 		*hcksum_txflags = grp->lg_hcksum_txflags;
2323 		break;
2324 	}
2325 	case MAC_CAPAB_LSO: {
2326 		mac_capab_lso_t *cap_lso = cap_data;
2327 
2328 		if (grp->lg_lso) {
2329 			*cap_lso = grp->lg_cap_lso;
2330 			break;
2331 		} else {
2332 			return (B_FALSE);
2333 		}
2334 	}
2335 	case MAC_CAPAB_NO_NATIVEVLAN:
2336 		return (!grp->lg_vlan);
2337 	case MAC_CAPAB_NO_ZCOPY:
2338 		return (!grp->lg_zcopy);
2339 	case MAC_CAPAB_RINGS: {
2340 		mac_capab_rings_t *cap_rings = cap_data;
2341 		uint_t ring_cnt = 0;
2342 
2343 		for (uint_t i = 0; i < grp->lg_rx_group_count; i++)
2344 			ring_cnt += grp->lg_rx_groups[i].arg_ring_cnt;
2345 
2346 		if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
2347 			cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2348 			cap_rings->mr_rnum = ring_cnt;
2349 			cap_rings->mr_gnum = grp->lg_rx_group_count;
2350 			cap_rings->mr_gaddring = NULL;
2351 			cap_rings->mr_gremring = NULL;
2352 		} else {
2353 			cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2354 			cap_rings->mr_rnum = grp->lg_tx_group.atg_ring_cnt;
2355 			cap_rings->mr_gnum = 0;
2356 		}
2357 		cap_rings->mr_rget = aggr_fill_ring;
2358 		cap_rings->mr_gget = aggr_fill_group;
2359 		break;
2360 	}
2361 	case MAC_CAPAB_AGGR:
2362 	{
2363 		mac_capab_aggr_t *aggr_cap;
2364 
2365 		if (cap_data != NULL) {
2366 			aggr_cap = cap_data;
2367 			aggr_cap->mca_rename_fn = aggr_grp_port_rename;
2368 			aggr_cap->mca_unicst = aggr_m_unicst;
2369 			aggr_cap->mca_find_tx_ring_fn = aggr_find_tx_ring;
2370 			aggr_cap->mca_arg = arg;
2371 		}
2372 		return (B_TRUE);
2373 	}
2374 	default:
2375 		return (B_FALSE);
2376 	}
2377 	return (B_TRUE);
2378 }
2379 
2380 /*
2381  * Callback function for MAC layer to register groups.
2382  */
2383 static void
2384 aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index,
2385     mac_group_info_t *infop, mac_group_handle_t gh)
2386 {
2387 	aggr_grp_t *grp = arg;
2388 
2389 	if (rtype == MAC_RING_TYPE_RX) {
2390 		aggr_pseudo_rx_group_t *rx_group = &grp->lg_rx_groups[index];
2391 
2392 		rx_group->arg_gh = gh;
2393 		rx_group->arg_grp = grp;
2394 
2395 		infop->mgi_driver = (mac_group_driver_t)rx_group;
2396 		infop->mgi_start = NULL;
2397 		infop->mgi_stop = NULL;
2398 		infop->mgi_addmac = aggr_addmac;
2399 		infop->mgi_remmac = aggr_remmac;
2400 		infop->mgi_count = rx_group->arg_ring_cnt;
2401 
2402 		/*
2403 		 * Always set the HW VLAN callbacks. They are smart
2404 		 * enough to know when a port has HW VLAN filters to
2405 		 * program and when it doesn't.
2406 		 */
2407 		infop->mgi_addvlan = aggr_addvlan;
2408 		infop->mgi_remvlan = aggr_remvlan;
2409 	} else {
2410 		aggr_pseudo_tx_group_t *tx_group = &grp->lg_tx_group;
2411 
2412 		ASSERT3S(index, ==, 0);
2413 		tx_group->atg_gh = gh;
2414 	}
2415 }
2416 
2417 /*
2418  * Callback funtion for MAC layer to register all rings.
2419  */
2420 static void
2421 aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index,
2422     const int index, mac_ring_info_t *infop, mac_ring_handle_t rh)
2423 {
2424 	aggr_grp_t	*grp = arg;
2425 
2426 	switch (rtype) {
2427 	case MAC_RING_TYPE_RX: {
2428 		aggr_pseudo_rx_group_t	*rx_group;
2429 		aggr_pseudo_rx_ring_t	*rx_ring;
2430 		mac_intr_t		aggr_mac_intr;
2431 
2432 		rx_group = &grp->lg_rx_groups[rg_index];
2433 		ASSERT3S(index, >=, 0);
2434 		ASSERT3S(index, <, rx_group->arg_ring_cnt);
2435 		rx_ring = rx_group->arg_rings + index;
2436 		rx_ring->arr_rh = rh;
2437 
2438 		/*
2439 		 * Entrypoint to enable interrupt (disable poll) and
2440 		 * disable interrupt (enable poll).
2441 		 */
2442 		aggr_mac_intr.mi_handle = (mac_intr_handle_t)rx_ring;
2443 		aggr_mac_intr.mi_enable = aggr_pseudo_enable_intr;
2444 		aggr_mac_intr.mi_disable = aggr_pseudo_disable_intr;
2445 		aggr_mac_intr.mi_ddi_handle = NULL;
2446 
2447 		infop->mri_driver = (mac_ring_driver_t)rx_ring;
2448 		infop->mri_start = aggr_pseudo_start_rx_ring;
2449 		infop->mri_stop = aggr_pseudo_stop_rx_ring;
2450 
2451 		infop->mri_intr = aggr_mac_intr;
2452 		infop->mri_poll = aggr_rx_poll;
2453 
2454 		infop->mri_stat = aggr_rx_ring_stat;
2455 		break;
2456 	}
2457 	case MAC_RING_TYPE_TX: {
2458 		aggr_pseudo_tx_group_t	*tx_group = &grp->lg_tx_group;
2459 		aggr_pseudo_tx_ring_t	*tx_ring;
2460 
2461 		ASSERT(rg_index == -1);
2462 		ASSERT(index < tx_group->atg_ring_cnt);
2463 
2464 		tx_ring = &tx_group->atg_rings[index];
2465 		tx_ring->atr_rh = rh;
2466 
2467 		infop->mri_driver = (mac_ring_driver_t)tx_ring;
2468 		infop->mri_start = NULL;
2469 		infop->mri_stop = NULL;
2470 		infop->mri_tx = aggr_ring_tx;
2471 		infop->mri_stat = aggr_tx_ring_stat;
2472 		/*
2473 		 * Use the hw TX ring handle to find if the ring needs
2474 		 * serialization or not. For NICs that do not expose
2475 		 * Tx rings, atr_hw_rh will be NULL.
2476 		 */
2477 		if (tx_ring->atr_hw_rh != NULL) {
2478 			infop->mri_flags =
2479 			    mac_hwring_getinfo(tx_ring->atr_hw_rh);
2480 		}
2481 		break;
2482 	}
2483 	default:
2484 		break;
2485 	}
2486 }
2487 
2488 static mblk_t *
2489 aggr_rx_poll(void *arg, int bytes_to_pickup)
2490 {
2491 	aggr_pseudo_rx_ring_t *rr_ring = arg;
2492 	aggr_port_t *port = rr_ring->arr_port;
2493 	aggr_grp_t *grp = port->lp_grp;
2494 	mblk_t *mp_chain, *mp, **mpp;
2495 
2496 	mp_chain = mac_hwring_poll(rr_ring->arr_hw_rh, bytes_to_pickup);
2497 
2498 	if (grp->lg_lacp_mode == AGGR_LACP_OFF)
2499 		return (mp_chain);
2500 
2501 	mpp = &mp_chain;
2502 	while ((mp = *mpp) != NULL) {
2503 		if (MBLKL(mp) >= sizeof (struct ether_header)) {
2504 			struct ether_header *ehp;
2505 
2506 			ehp = (struct ether_header *)mp->b_rptr;
2507 			if (ntohs(ehp->ether_type) == ETHERTYPE_SLOW) {
2508 				*mpp = mp->b_next;
2509 				mp->b_next = NULL;
2510 				aggr_recv_lacp(port,
2511 				    (mac_resource_handle_t)rr_ring, mp);
2512 				continue;
2513 			}
2514 		}
2515 
2516 		if (!port->lp_collector_enabled) {
2517 			*mpp = mp->b_next;
2518 			mp->b_next = NULL;
2519 			freemsg(mp);
2520 			continue;
2521 		}
2522 		mpp = &mp->b_next;
2523 	}
2524 	return (mp_chain);
2525 }
2526 
2527 static int
2528 aggr_addmac(void *arg, const uint8_t *mac_addr)
2529 {
2530 	aggr_pseudo_rx_group_t	*rx_group = (aggr_pseudo_rx_group_t *)arg;
2531 	aggr_unicst_addr_t	*addr, **pprev;
2532 	aggr_grp_t		*grp = rx_group->arg_grp;
2533 	aggr_port_t		*port, *p;
2534 	mac_perim_handle_t	mph;
2535 	int			err = 0;
2536 	uint_t			idx = rx_group->arg_index;
2537 
2538 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2539 
2540 	if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
2541 		mac_perim_exit(mph);
2542 		return (0);
2543 	}
2544 
2545 	/*
2546 	 * Insert this mac address into the list of mac addresses owned by
2547 	 * the aggregation pseudo group.
2548 	 */
2549 	pprev = &rx_group->arg_macaddr;
2550 	while ((addr = *pprev) != NULL) {
2551 		if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) == 0) {
2552 			mac_perim_exit(mph);
2553 			return (EEXIST);
2554 		}
2555 		pprev = &addr->aua_next;
2556 	}
2557 	addr = kmem_alloc(sizeof (aggr_unicst_addr_t), KM_SLEEP);
2558 	bcopy(mac_addr, addr->aua_addr, ETHERADDRL);
2559 	addr->aua_next = NULL;
2560 	*pprev = addr;
2561 
2562 	for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2563 		if ((err = aggr_port_addmac(port, idx, mac_addr)) != 0)
2564 			break;
2565 
2566 	if (err != 0) {
2567 		for (p = grp->lg_ports; p != port; p = p->lp_next)
2568 			aggr_port_remmac(p, idx, mac_addr);
2569 
2570 		*pprev = NULL;
2571 		kmem_free(addr, sizeof (aggr_unicst_addr_t));
2572 	}
2573 
2574 	mac_perim_exit(mph);
2575 	return (err);
2576 }
2577 
2578 static int
2579 aggr_remmac(void *arg, const uint8_t *mac_addr)
2580 {
2581 	aggr_pseudo_rx_group_t	*rx_group = (aggr_pseudo_rx_group_t *)arg;
2582 	aggr_unicst_addr_t	*addr, **pprev;
2583 	aggr_grp_t		*grp = rx_group->arg_grp;
2584 	aggr_port_t		*port;
2585 	mac_perim_handle_t	mph;
2586 	int			err = 0;
2587 
2588 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2589 
2590 	if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
2591 		mac_perim_exit(mph);
2592 		return (0);
2593 	}
2594 
2595 	/*
2596 	 * Insert this mac address into the list of mac addresses owned by
2597 	 * the aggregation pseudo group.
2598 	 */
2599 	pprev = &rx_group->arg_macaddr;
2600 	while ((addr = *pprev) != NULL) {
2601 		if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) != 0) {
2602 			pprev = &addr->aua_next;
2603 			continue;
2604 		}
2605 		break;
2606 	}
2607 	if (addr == NULL) {
2608 		mac_perim_exit(mph);
2609 		return (EINVAL);
2610 	}
2611 
2612 	for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2613 		aggr_port_remmac(port, rx_group->arg_index, mac_addr);
2614 
2615 	*pprev = addr->aua_next;
2616 	kmem_free(addr, sizeof (aggr_unicst_addr_t));
2617 
2618 	mac_perim_exit(mph);
2619 	return (err);
2620 }
2621 
2622 /*
2623  * Search for VID in the Rx group's list and return a pointer if
2624  * found. Otherwise return NULL.
2625  */
2626 static aggr_vlan_t *
2627 aggr_find_vlan(aggr_pseudo_rx_group_t *rx_group, uint16_t vid)
2628 {
2629 	ASSERT(MAC_PERIM_HELD(rx_group->arg_grp->lg_mh));
2630 	for (aggr_vlan_t *avp = list_head(&rx_group->arg_vlans); avp != NULL;
2631 	    avp = list_next(&rx_group->arg_vlans, avp)) {
2632 		if (avp->av_vid == vid)
2633 			return (avp);
2634 	}
2635 
2636 	return (NULL);
2637 }
2638 
2639 /*
2640  * Accept traffic on the specified VID.
2641  *
2642  * Persist VLAN state in the aggr so that ports added later will
2643  * receive the correct filters. In the future it would be nice to
2644  * allow aggr to iterate its clients instead of duplicating state.
2645  */
2646 static int
2647 aggr_addvlan(mac_group_driver_t gdriver, uint16_t vid)
2648 {
2649 	aggr_pseudo_rx_group_t	*rx_group = (aggr_pseudo_rx_group_t *)gdriver;
2650 	aggr_grp_t		*aggr = rx_group->arg_grp;
2651 	aggr_port_t		*port, *p;
2652 	mac_perim_handle_t	mph;
2653 	int			err = 0;
2654 	aggr_vlan_t		*avp = NULL;
2655 	uint_t			idx = rx_group->arg_index;
2656 
2657 	mac_perim_enter_by_mh(aggr->lg_mh, &mph);
2658 
2659 	if (vid == MAC_VLAN_UNTAGGED) {
2660 		/*
2661 		 * Aggr is both a MAC provider and MAC client. As a
2662 		 * MAC provider it is passed MAC_VLAN_UNTAGGED by its
2663 		 * client. As a client itself, it should pass
2664 		 * VLAN_ID_NONE to its ports.
2665 		 */
2666 		vid = VLAN_ID_NONE;
2667 		rx_group->arg_untagged++;
2668 		goto update_ports;
2669 	}
2670 
2671 	avp = aggr_find_vlan(rx_group, vid);
2672 
2673 	if (avp != NULL) {
2674 		avp->av_refs++;
2675 		mac_perim_exit(mph);
2676 		return (0);
2677 	}
2678 
2679 	avp = kmem_zalloc(sizeof (aggr_vlan_t), KM_SLEEP);
2680 	avp->av_vid = vid;
2681 	avp->av_refs = 1;
2682 
2683 update_ports:
2684 	for (port = aggr->lg_ports; port != NULL; port = port->lp_next)
2685 		if ((err = aggr_port_addvlan(port, idx, vid)) != 0)
2686 			break;
2687 
2688 	if (err != 0) {
2689 		/*
2690 		 * If any of these calls fail then we are in a
2691 		 * situation where the ports have different HW state.
2692 		 * There's no reasonable action the MAC client can
2693 		 * take in this scenario to rectify the situation.
2694 		 */
2695 		for (p = aggr->lg_ports; p != port; p = p->lp_next) {
2696 			int err2;
2697 
2698 			if ((err2 = aggr_port_remvlan(p, idx, vid)) != 0) {
2699 				cmn_err(CE_WARN, "Failed to remove VLAN %u"
2700 				    " from port %s: errno %d.", vid,
2701 				    mac_client_name(p->lp_mch), err2);
2702 			}
2703 
2704 		}
2705 
2706 		if (vid == VLAN_ID_NONE)
2707 			rx_group->arg_untagged--;
2708 
2709 		if (avp != NULL) {
2710 			kmem_free(avp, sizeof (aggr_vlan_t));
2711 			avp = NULL;
2712 		}
2713 	}
2714 
2715 	if (avp != NULL)
2716 		list_insert_tail(&rx_group->arg_vlans, avp);
2717 
2718 done:
2719 	mac_perim_exit(mph);
2720 	return (err);
2721 }
2722 
2723 /*
2724  * Stop accepting traffic on this VLAN if it's the last use of this VLAN.
2725  */
2726 static int
2727 aggr_remvlan(mac_group_driver_t gdriver, uint16_t vid)
2728 {
2729 	aggr_pseudo_rx_group_t	*rx_group = (aggr_pseudo_rx_group_t *)gdriver;
2730 	aggr_grp_t		*aggr = rx_group->arg_grp;
2731 	aggr_port_t		*port, *p;
2732 	mac_perim_handle_t	mph;
2733 	int			err = 0;
2734 	aggr_vlan_t		*avp = NULL;
2735 	uint_t			idx = rx_group->arg_index;
2736 
2737 	mac_perim_enter_by_mh(aggr->lg_mh, &mph);
2738 
2739 	/*
2740 	 * See the comment in aggr_addvlan().
2741 	 */
2742 	if (vid == MAC_VLAN_UNTAGGED) {
2743 		vid = VLAN_ID_NONE;
2744 		rx_group->arg_untagged--;
2745 
2746 		if (rx_group->arg_untagged > 0)
2747 			goto done;
2748 
2749 		goto update_ports;
2750 	}
2751 
2752 	avp = aggr_find_vlan(rx_group, vid);
2753 
2754 	if (avp == NULL) {
2755 		err = ENOENT;
2756 		goto done;
2757 	}
2758 
2759 	avp->av_refs--;
2760 
2761 	if (avp->av_refs > 0)
2762 		goto done;
2763 
2764 update_ports:
2765 	for (port = aggr->lg_ports; port != NULL; port = port->lp_next)
2766 		if ((err = aggr_port_remvlan(port, idx, vid)) != 0)
2767 			break;
2768 
2769 	/*
2770 	 * See the comment in aggr_addvlan() for justification of the
2771 	 * use of VERIFY here.
2772 	 */
2773 	if (err != 0) {
2774 		for (p = aggr->lg_ports; p != port; p = p->lp_next) {
2775 			int err2;
2776 
2777 			if ((err2 = aggr_port_addvlan(p, idx, vid)) != 0) {
2778 				cmn_err(CE_WARN, "Failed to add VLAN %u"
2779 				    " to port %s: errno %d.", vid,
2780 				    mac_client_name(p->lp_mch), err2);
2781 			}
2782 		}
2783 
2784 		if (avp != NULL)
2785 			avp->av_refs++;
2786 
2787 		if (vid == VLAN_ID_NONE)
2788 			rx_group->arg_untagged++;
2789 
2790 		goto done;
2791 	}
2792 
2793 	if (err == 0 && avp != NULL) {
2794 		VERIFY3U(avp->av_refs, ==, 0);
2795 		list_remove(&rx_group->arg_vlans, avp);
2796 		kmem_free(avp, sizeof (aggr_vlan_t));
2797 	}
2798 
2799 done:
2800 	mac_perim_exit(mph);
2801 	return (err);
2802 }
2803 
2804 /*
2805  * Add or remove the multicast addresses that are defined for the group
2806  * to or from the specified port.
2807  *
2808  * Note that aggr_grp_multicst_port(..., B_TRUE) is called when the port
2809  * is started and attached, and aggr_grp_multicst_port(..., B_FALSE) is
2810  * called when the port is either stopped or detached.
2811  */
2812 void
2813 aggr_grp_multicst_port(aggr_port_t *port, boolean_t add)
2814 {
2815 	aggr_grp_t *grp = port->lp_grp;
2816 
2817 	ASSERT(MAC_PERIM_HELD(port->lp_mh));
2818 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
2819 
2820 	if (!port->lp_started || port->lp_state != AGGR_PORT_STATE_ATTACHED)
2821 		return;
2822 
2823 	mac_multicast_refresh(grp->lg_mh, aggr_port_multicst, port, add);
2824 }
2825 
2826 static int
2827 aggr_m_multicst(void *arg, boolean_t add, const uint8_t *addrp)
2828 {
2829 	aggr_grp_t *grp = arg;
2830 	aggr_port_t *port = NULL, *errport = NULL;
2831 	mac_perim_handle_t mph;
2832 	int err = 0;
2833 
2834 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2835 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2836 		if (port->lp_state != AGGR_PORT_STATE_ATTACHED ||
2837 		    !port->lp_started) {
2838 			continue;
2839 		}
2840 		err = aggr_port_multicst(port, add, addrp);
2841 		if (err != 0) {
2842 			errport = port;
2843 			break;
2844 		}
2845 	}
2846 
2847 	/*
2848 	 * At least one port caused error return and this error is returned to
2849 	 * mac, eventually a NAK would be sent upwards.
2850 	 * Some ports have this multicast address listed now, and some don't.
2851 	 * Treat this error as a whole aggr failure not individual port failure.
2852 	 * Therefore remove this multicast address from other ports.
2853 	 */
2854 	if ((err != 0) && add) {
2855 		for (port = grp->lg_ports; port != errport;
2856 		    port = port->lp_next) {
2857 			if (port->lp_state != AGGR_PORT_STATE_ATTACHED ||
2858 			    !port->lp_started) {
2859 				continue;
2860 			}
2861 			(void) aggr_port_multicst(port, B_FALSE, addrp);
2862 		}
2863 	}
2864 	mac_perim_exit(mph);
2865 	return (err);
2866 }
2867 
2868 static int
2869 aggr_m_unicst(void *arg, const uint8_t *macaddr)
2870 {
2871 	aggr_grp_t *grp = arg;
2872 	mac_perim_handle_t mph;
2873 	int err;
2874 
2875 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2876 	err = aggr_grp_modify_common(grp, AGGR_MODIFY_MAC, 0, B_TRUE, macaddr,
2877 	    0, 0);
2878 	mac_perim_exit(mph);
2879 	return (err);
2880 }
2881 
2882 /*
2883  * Initialize the capabilities that are advertised for the group
2884  * according to the capabilities of the constituent ports.
2885  */
2886 static void
2887 aggr_grp_capab_set(aggr_grp_t *grp)
2888 {
2889 	uint32_t cksum;
2890 	aggr_port_t *port;
2891 	mac_capab_lso_t cap_lso;
2892 
2893 	ASSERT(grp->lg_mh == NULL);
2894 	ASSERT(grp->lg_ports != NULL);
2895 
2896 	grp->lg_hcksum_txflags = (uint32_t)-1;
2897 	grp->lg_zcopy = B_TRUE;
2898 	grp->lg_vlan = B_TRUE;
2899 
2900 	grp->lg_lso = B_TRUE;
2901 	grp->lg_cap_lso.lso_flags = (t_uscalar_t)-1;
2902 	grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max = (t_uscalar_t)-1;
2903 
2904 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2905 		if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &cksum))
2906 			cksum = 0;
2907 		grp->lg_hcksum_txflags &= cksum;
2908 
2909 		grp->lg_vlan &=
2910 		    !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL);
2911 
2912 		grp->lg_zcopy &=
2913 		    !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL);
2914 
2915 		grp->lg_lso &=
2916 		    mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso);
2917 		if (grp->lg_lso) {
2918 			grp->lg_cap_lso.lso_flags &= cap_lso.lso_flags;
2919 			if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max >
2920 			    cap_lso.lso_basic_tcp_ipv4.lso_max)
2921 				grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max =
2922 				    cap_lso.lso_basic_tcp_ipv4.lso_max;
2923 		}
2924 	}
2925 }
2926 
2927 /*
2928  * Checks whether the capabilities of the port being added are compatible
2929  * with the current capabilities of the aggregation.
2930  */
2931 static boolean_t
2932 aggr_grp_capab_check(aggr_grp_t *grp, aggr_port_t *port)
2933 {
2934 	uint32_t hcksum_txflags;
2935 
2936 	ASSERT(grp->lg_ports != NULL);
2937 
2938 	if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL)) &
2939 	    grp->lg_vlan) != grp->lg_vlan) {
2940 		return (B_FALSE);
2941 	}
2942 
2943 	if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL)) &
2944 	    grp->lg_zcopy) != grp->lg_zcopy) {
2945 		return (B_FALSE);
2946 	}
2947 
2948 	if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &hcksum_txflags)) {
2949 		if (grp->lg_hcksum_txflags != 0)
2950 			return (B_FALSE);
2951 	} else if ((hcksum_txflags & grp->lg_hcksum_txflags) !=
2952 	    grp->lg_hcksum_txflags) {
2953 		return (B_FALSE);
2954 	}
2955 
2956 	if (grp->lg_lso) {
2957 		mac_capab_lso_t cap_lso;
2958 
2959 		if (mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso)) {
2960 			if ((grp->lg_cap_lso.lso_flags & cap_lso.lso_flags) !=
2961 			    grp->lg_cap_lso.lso_flags)
2962 				return (B_FALSE);
2963 			if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max >
2964 			    cap_lso.lso_basic_tcp_ipv4.lso_max)
2965 				return (B_FALSE);
2966 		} else {
2967 			return (B_FALSE);
2968 		}
2969 	}
2970 
2971 	return (B_TRUE);
2972 }
2973 
2974 /*
2975  * Returns the maximum SDU according to the SDU of the constituent ports.
2976  */
2977 static uint_t
2978 aggr_grp_max_sdu(aggr_grp_t *grp)
2979 {
2980 	uint_t max_sdu = (uint_t)-1;
2981 	aggr_port_t *port;
2982 
2983 	ASSERT(grp->lg_ports != NULL);
2984 
2985 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2986 		uint_t port_sdu_max;
2987 
2988 		mac_sdu_get(port->lp_mh, NULL, &port_sdu_max);
2989 		if (max_sdu > port_sdu_max)
2990 			max_sdu = port_sdu_max;
2991 	}
2992 
2993 	return (max_sdu);
2994 }
2995 
2996 /*
2997  * Checks if the maximum SDU of the specified port is compatible
2998  * with the maximum SDU of the specified aggregation group, returns
2999  * B_TRUE if it is, B_FALSE otherwise.
3000  */
3001 static boolean_t
3002 aggr_grp_sdu_check(aggr_grp_t *grp, aggr_port_t *port)
3003 {
3004 	uint_t port_sdu_max;
3005 
3006 	mac_sdu_get(port->lp_mh, NULL, &port_sdu_max);
3007 	return (port_sdu_max >= grp->lg_max_sdu);
3008 }
3009 
3010 /*
3011  * Returns the maximum margin according to the margin of the constituent ports.
3012  */
3013 static uint32_t
3014 aggr_grp_max_margin(aggr_grp_t *grp)
3015 {
3016 	uint32_t margin = UINT32_MAX;
3017 	aggr_port_t *port;
3018 
3019 	ASSERT(grp->lg_mh == NULL);
3020 	ASSERT(grp->lg_ports != NULL);
3021 
3022 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
3023 		if (margin > port->lp_margin)
3024 			margin = port->lp_margin;
3025 	}
3026 
3027 	grp->lg_margin = margin;
3028 	return (margin);
3029 }
3030 
3031 /*
3032  * Checks if the maximum margin of the specified port is compatible
3033  * with the maximum margin of the specified aggregation group, returns
3034  * B_TRUE if it is, B_FALSE otherwise.
3035  */
3036 static boolean_t
3037 aggr_grp_margin_check(aggr_grp_t *grp, aggr_port_t *port)
3038 {
3039 	if (port->lp_margin >= grp->lg_margin)
3040 		return (B_TRUE);
3041 
3042 	/*
3043 	 * See whether the current margin value is allowed to be changed to
3044 	 * the new value.
3045 	 */
3046 	if (!mac_margin_update(grp->lg_mh, port->lp_margin))
3047 		return (B_FALSE);
3048 
3049 	grp->lg_margin = port->lp_margin;
3050 	return (B_TRUE);
3051 }
3052 
3053 /*
3054  * Set MTU on individual ports of an aggregation group
3055  */
3056 static int
3057 aggr_set_port_sdu(aggr_grp_t *grp, aggr_port_t *port, uint32_t sdu,
3058     uint32_t *old_mtu)
3059 {
3060 	boolean_t		removed = B_FALSE;
3061 	mac_perim_handle_t	mph;
3062 	mac_diag_t		diag;
3063 	int			err, rv, retry = 0;
3064 
3065 	if (port->lp_mah != NULL) {
3066 		(void) mac_unicast_remove(port->lp_mch, port->lp_mah);
3067 		port->lp_mah = NULL;
3068 		removed = B_TRUE;
3069 	}
3070 	err = mac_set_mtu(port->lp_mh, sdu, old_mtu);
3071 try_again:
3072 	if (removed && (rv = mac_unicast_add(port->lp_mch, NULL,
3073 	    MAC_UNICAST_PRIMARY | MAC_UNICAST_DISABLE_TX_VID_CHECK,
3074 	    &port->lp_mah, 0, &diag)) != 0) {
3075 		/*
3076 		 * following is a workaround for a bug in 'bge' driver.
3077 		 * See CR 6794654 for more information and this work around
3078 		 * will be removed once the CR is fixed.
3079 		 */
3080 		if (rv == EIO && retry++ < 3) {
3081 			delay(2 * hz);
3082 			goto try_again;
3083 		}
3084 		/*
3085 		 * if mac_unicast_add() failed while setting the MTU,
3086 		 * detach the port from the group.
3087 		 */
3088 		mac_perim_enter_by_mh(port->lp_mh, &mph);
3089 		(void) aggr_grp_detach_port(grp, port);
3090 		mac_perim_exit(mph);
3091 		cmn_err(CE_WARN, "Unable to restart the port %s while "
3092 		    "setting MTU. Detaching the port from the aggregation.",
3093 		    mac_client_name(port->lp_mch));
3094 	}
3095 	return (err);
3096 }
3097 
3098 static int
3099 aggr_sdu_update(aggr_grp_t *grp, uint32_t sdu)
3100 {
3101 	int			err = 0, i, rv;
3102 	aggr_port_t		*port;
3103 	uint32_t		*mtu;
3104 
3105 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
3106 
3107 	/*
3108 	 * If the MTU being set is equal to aggr group's maximum
3109 	 * allowable value, then there is nothing to change
3110 	 */
3111 	if (sdu == grp->lg_max_sdu)
3112 		return (0);
3113 
3114 	/* 0 is aggr group's min sdu */
3115 	if (sdu == 0)
3116 		return (EINVAL);
3117 
3118 	mtu = kmem_alloc(sizeof (uint32_t) * grp->lg_nports, KM_SLEEP);
3119 	for (port = grp->lg_ports, i = 0; port != NULL && err == 0;
3120 	    port = port->lp_next, i++) {
3121 		err = aggr_set_port_sdu(grp, port, sdu, mtu + i);
3122 	}
3123 	if (err != 0) {
3124 		/* recover from error: reset the mtus of the ports */
3125 		aggr_port_t *tmp;
3126 
3127 		for (tmp = grp->lg_ports, i = 0; tmp != port;
3128 		    tmp = tmp->lp_next, i++) {
3129 			(void) aggr_set_port_sdu(grp, tmp, *(mtu + i), NULL);
3130 		}
3131 		goto bail;
3132 	}
3133 	grp->lg_max_sdu = aggr_grp_max_sdu(grp);
3134 	rv = mac_maxsdu_update(grp->lg_mh, grp->lg_max_sdu);
3135 	ASSERT(rv == 0);
3136 bail:
3137 	kmem_free(mtu, sizeof (uint32_t) * grp->lg_nports);
3138 	return (err);
3139 }
3140 
3141 /*
3142  * Callback functions for set/get of properties
3143  */
3144 /*ARGSUSED*/
3145 static int
3146 aggr_m_setprop(void *m_driver, const char *pr_name, mac_prop_id_t pr_num,
3147     uint_t pr_valsize, const void *pr_val)
3148 {
3149 	int		err = ENOTSUP;
3150 	aggr_grp_t	*grp = m_driver;
3151 
3152 	switch (pr_num) {
3153 	case MAC_PROP_MTU: {
3154 		uint32_t	mtu;
3155 
3156 		if (pr_valsize < sizeof (mtu)) {
3157 			err = EINVAL;
3158 			break;
3159 		}
3160 		bcopy(pr_val, &mtu, sizeof (mtu));
3161 		err = aggr_sdu_update(grp, mtu);
3162 		break;
3163 	}
3164 	default:
3165 		break;
3166 	}
3167 	return (err);
3168 }
3169 
3170 typedef struct rboundary {
3171 	uint32_t	bval;
3172 	int		btype;
3173 } rboundary_t;
3174 
3175 /*
3176  * This function finds the intersection of mtu ranges stored in arrays -
3177  * mrange[0] ... mrange[mcount -1]. It returns the intersection in rval.
3178  * Individual arrays are assumed to contain non-overlapping ranges.
3179  * Algorithm:
3180  *   A range has two boundaries - min and max. We scan all arrays and store
3181  * each boundary as a separate element in a temporary array. We also store
3182  * the boundary types, min or max, as +1 or -1 respectively in the temporary
3183  * array. Then we sort the temporary array in ascending order. We scan the
3184  * sorted array from lower to higher values and keep a cumulative sum of
3185  * boundary types. Element in the temporary array for which the sum reaches
3186  * mcount is a min boundary of a range in the result and next element will be
3187  * max boundary.
3188  *
3189  * Example for mcount = 3,
3190  *
3191  *  ----|_________|-------|_______|----|__|------ mrange[0]
3192  *
3193  *  -------|________|--|____________|-----|___|-- mrange[1]
3194  *
3195  *  --------|________________|-------|____|------ mrange[2]
3196  *
3197  *                                      3 2 1
3198  *                                       \|/
3199  *      1  23     2 1  2  3  2    1 01 2  V   0  <- the sum
3200  *  ----|--||-----|-|--|--|--|----|-||-|--|---|-- sorted array
3201  *
3202  *                                 same min and max
3203  *                                        V
3204  *  --------|_____|-------|__|------------|------ intersecting ranges
3205  */
3206 void
3207 aggr_mtu_range_intersection(mac_propval_range_t **mrange, int mcount,
3208     mac_propval_uint32_range_t **prval, int *prmaxcnt, int *prcount)
3209 {
3210 	mac_propval_uint32_range_t	*rval, *ur;
3211 	int				rmaxcnt, rcount;
3212 	size_t				sz_range32;
3213 	rboundary_t			*ta; /* temporary array */
3214 	rboundary_t			temp;
3215 	boolean_t			range_started = B_FALSE;
3216 	int				i, j, m, sum;
3217 
3218 	sz_range32 = sizeof (mac_propval_uint32_range_t);
3219 
3220 	for (i = 0, rmaxcnt = 0; i < mcount; i++)
3221 		rmaxcnt += mrange[i]->mpr_count;
3222 
3223 	/* Allocate enough space to store the results */
3224 	rval = kmem_alloc(rmaxcnt * sz_range32, KM_SLEEP);
3225 
3226 	/* Number of boundaries are twice as many as ranges */
3227 	ta = kmem_alloc(2 * rmaxcnt * sizeof (rboundary_t), KM_SLEEP);
3228 
3229 	for (i = 0, m = 0; i < mcount; i++) {
3230 		ur = &(mrange[i]->mpr_range_uint32[0]);
3231 		for (j = 0; j < mrange[i]->mpr_count; j++) {
3232 			ta[m].bval = ur[j].mpur_min;
3233 			ta[m++].btype = 1;
3234 			ta[m].bval = ur[j].mpur_max;
3235 			ta[m++].btype = -1;
3236 		}
3237 	}
3238 
3239 	/*
3240 	 * Sort the temporary array in ascending order of bval;
3241 	 * if boundary values are same then sort on btype.
3242 	 */
3243 	for (i = 0; i < m-1; i++) {
3244 		for (j = i+1; j < m; j++) {
3245 			if ((ta[i].bval > ta[j].bval) ||
3246 			    ((ta[i].bval == ta[j].bval) &&
3247 			    (ta[i].btype < ta[j].btype))) {
3248 				temp = ta[i];
3249 				ta[i] = ta[j];
3250 				ta[j] = temp;
3251 			}
3252 		}
3253 	}
3254 
3255 	/* Walk through temporary array to find all ranges in the results */
3256 	for (i = 0, sum = 0, rcount = 0; i < m; i++) {
3257 		sum += ta[i].btype;
3258 		if (sum == mcount) {
3259 			rval[rcount].mpur_min = ta[i].bval;
3260 			range_started = B_TRUE;
3261 		} else if (sum < mcount && range_started) {
3262 			rval[rcount++].mpur_max = ta[i].bval;
3263 			range_started = B_FALSE;
3264 		}
3265 	}
3266 
3267 	*prval = rval;
3268 	*prmaxcnt = rmaxcnt;
3269 	*prcount = rcount;
3270 
3271 	kmem_free(ta, 2 * rmaxcnt * sizeof (rboundary_t));
3272 }
3273 
3274 /*
3275  * Returns the mtu ranges which could be supported by aggr group.
3276  * prmaxcnt returns the size of the buffer prval, prcount returns
3277  * the number of valid entries in prval. Caller is responsible
3278  * for freeing up prval.
3279  */
3280 int
3281 aggr_grp_possible_mtu_range(aggr_grp_t *grp, mac_propval_uint32_range_t **prval,
3282     int *prmaxcnt, int *prcount)
3283 {
3284 	mac_propval_range_t		**vals;
3285 	aggr_port_t			*port;
3286 	mac_perim_handle_t		mph;
3287 	uint_t				i, numr;
3288 	int				err = 0;
3289 	size_t				sz_propval, sz_range32;
3290 	size_t				size;
3291 
3292 	sz_propval = sizeof (mac_propval_range_t);
3293 	sz_range32 = sizeof (mac_propval_uint32_range_t);
3294 
3295 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
3296 
3297 	vals = kmem_zalloc(sizeof (mac_propval_range_t *) * grp->lg_nports,
3298 	    KM_SLEEP);
3299 
3300 	for (port = grp->lg_ports, i = 0; port != NULL;
3301 	    port = port->lp_next, i++) {
3302 
3303 		size = sz_propval;
3304 		vals[i] = kmem_alloc(size, KM_SLEEP);
3305 		vals[i]->mpr_count = 1;
3306 
3307 		mac_perim_enter_by_mh(port->lp_mh, &mph);
3308 
3309 		err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL,
3310 		    NULL, 0, vals[i], NULL);
3311 		if (err == ENOSPC) {
3312 			/*
3313 			 * Not enough space to hold all ranges.
3314 			 * Allocate extra space as indicated and retry.
3315 			 */
3316 			numr = vals[i]->mpr_count;
3317 			kmem_free(vals[i], sz_propval);
3318 			size = sz_propval + (numr - 1) * sz_range32;
3319 			vals[i] = kmem_alloc(size, KM_SLEEP);
3320 			vals[i]->mpr_count = numr;
3321 			err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL,
3322 			    NULL, 0, vals[i], NULL);
3323 			ASSERT(err != ENOSPC);
3324 		}
3325 		mac_perim_exit(mph);
3326 		if (err != 0) {
3327 			kmem_free(vals[i], size);
3328 			vals[i] = NULL;
3329 			break;
3330 		}
3331 	}
3332 
3333 	/*
3334 	 * if any of the underlying ports does not support changing MTU then
3335 	 * just return ENOTSUP
3336 	 */
3337 	if (port != NULL) {
3338 		ASSERT(err != 0);
3339 		goto done;
3340 	}
3341 
3342 	aggr_mtu_range_intersection(vals, grp->lg_nports, prval, prmaxcnt,
3343 	    prcount);
3344 
3345 done:
3346 	for (i = 0; i < grp->lg_nports; i++) {
3347 		if (vals[i] != NULL) {
3348 			numr = vals[i]->mpr_count;
3349 			size = sz_propval + (numr - 1) * sz_range32;
3350 			kmem_free(vals[i], size);
3351 		}
3352 	}
3353 
3354 	kmem_free(vals, sizeof (mac_propval_range_t *) * grp->lg_nports);
3355 	return (err);
3356 }
3357 
3358 static void
3359 aggr_m_propinfo(void *m_driver, const char *pr_name, mac_prop_id_t pr_num,
3360     mac_prop_info_handle_t prh)
3361 {
3362 	aggr_grp_t			*grp = m_driver;
3363 	mac_propval_uint32_range_t	*rval = NULL;
3364 	int				i, rcount, rmaxcnt;
3365 	int				err = 0;
3366 
3367 	_NOTE(ARGUNUSED(pr_name));
3368 
3369 	switch (pr_num) {
3370 	case MAC_PROP_MTU:
3371 
3372 		err = aggr_grp_possible_mtu_range(grp, &rval, &rmaxcnt,
3373 		    &rcount);
3374 		if (err != 0) {
3375 			ASSERT(rval == NULL);
3376 			return;
3377 		}
3378 		for (i = 0; i < rcount; i++) {
3379 			mac_prop_info_set_range_uint32(prh,
3380 			    rval[i].mpur_min, rval[i].mpur_max);
3381 		}
3382 		kmem_free(rval, sizeof (mac_propval_uint32_range_t) * rmaxcnt);
3383 		break;
3384 	}
3385 }
3386