xref: /titanic_44/usr/src/uts/common/io/aggr/aggr_grp.c (revision cb8a054b1ab30d5caa746e6c44f29d4c9d3071c1)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * IEEE 802.3ad Link Aggregation -- Link Aggregation Groups.
28  *
29  * An instance of the structure aggr_grp_t is allocated for each
30  * link aggregation group. When created, aggr_grp_t objects are
31  * entered into the aggr_grp_hash hash table maintained by the modhash
32  * module. The hash key is the linkid associated with the link
33  * aggregation group.
34  *
35  * A set of MAC ports is associated with each aggregation group.
36  *
37  * Aggr pseudo TX rings
38  * --------------------
39  * The underlying ports (NICs) in an aggregation can have TX rings. To
40  * enhance aggr's performance, these TX rings are made available to the
41  * aggr layer as pseudo TX rings. The concept of pseudo rings is not new.
42  * They are already present and implemented on the RX side, where they are
43  * called pseudo RX rings. The same concept is extended to the TX side where
44  * each TX ring of an underlying port is reflected in aggr as a pseudo
45  * TX ring. Thus each pseudo TX ring will map to a specific hardware TX
46  * ring. Even in the case of a NIC that does not have a TX ring, a pseudo
47  * TX ring is given to the aggregation layer.
48  *
49  * With this change, the outgoing stack depth looks much better:
50  *
51  * mac_tx() -> mac_tx_aggr_mode() -> mac_tx_soft_ring_process() ->
52  * mac_tx_send() -> aggr_ring_tx() -> <driver>_ring_tx()
53  *
54  * Two new modes are introduced to mac_tx() to handle aggr pseudo TX rings:
55  * SRS_TX_AGGR and SRS_TX_BW_AGGR.
56  *
57  * In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. This routine
58  * invokes an aggr function, aggr_find_tx_ring(), to find a (pseudo) TX
59  * ring belonging to a port on which the packet has to be sent.
60  * aggr_find_tx_ring() first finds the outgoing port based on L2/L3/L4
61  * policy and then uses the fanout_hint passed to it to pick a TX ring from
62  * the selected port.
63  *
64  * In SRS_TX_BW_AGGR mode, mac_tx_bw_mode() function is called where
65  * bandwidth limit is applied first on the outgoing packet and the packets
66  * allowed to go out would call mac_tx_aggr_mode() to send the packet on a
67  * particular TX ring.
68  */
69 
70 #include <sys/types.h>
71 #include <sys/sysmacros.h>
72 #include <sys/conf.h>
73 #include <sys/cmn_err.h>
74 #include <sys/disp.h>
75 #include <sys/list.h>
76 #include <sys/ksynch.h>
77 #include <sys/kmem.h>
78 #include <sys/stream.h>
79 #include <sys/modctl.h>
80 #include <sys/ddi.h>
81 #include <sys/sunddi.h>
82 #include <sys/atomic.h>
83 #include <sys/stat.h>
84 #include <sys/modhash.h>
85 #include <sys/id_space.h>
86 #include <sys/strsun.h>
87 #include <sys/cred.h>
88 #include <sys/dlpi.h>
89 #include <sys/zone.h>
90 #include <sys/mac_provider.h>
91 #include <sys/dls.h>
92 #include <sys/vlan.h>
93 #include <sys/aggr.h>
94 #include <sys/aggr_impl.h>
95 
96 static int aggr_m_start(void *);
97 static void aggr_m_stop(void *);
98 static int aggr_m_promisc(void *, boolean_t);
99 static int aggr_m_multicst(void *, boolean_t, const uint8_t *);
100 static int aggr_m_unicst(void *, const uint8_t *);
101 static int aggr_m_stat(void *, uint_t, uint64_t *);
102 static void aggr_m_ioctl(void *, queue_t *, mblk_t *);
103 static boolean_t aggr_m_capab_get(void *, mac_capab_t, void *);
104 static int aggr_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
105     const void *);
106 static void aggr_m_propinfo(void *, const char *, mac_prop_id_t,
107     mac_prop_info_handle_t);
108 
109 static aggr_port_t *aggr_grp_port_lookup(aggr_grp_t *, datalink_id_t);
110 static int aggr_grp_rem_port(aggr_grp_t *, aggr_port_t *, boolean_t *,
111     boolean_t *);
112 
113 static void aggr_grp_capab_set(aggr_grp_t *);
114 static boolean_t aggr_grp_capab_check(aggr_grp_t *, aggr_port_t *);
115 static uint_t aggr_grp_max_sdu(aggr_grp_t *);
116 static uint32_t aggr_grp_max_margin(aggr_grp_t *);
117 static boolean_t aggr_grp_sdu_check(aggr_grp_t *, aggr_port_t *);
118 static boolean_t aggr_grp_margin_check(aggr_grp_t *, aggr_port_t *);
119 
120 static int aggr_add_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
121 static void aggr_rem_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
122 static int aggr_pseudo_disable_intr(mac_intr_handle_t);
123 static int aggr_pseudo_enable_intr(mac_intr_handle_t);
124 static int aggr_pseudo_start_ring(mac_ring_driver_t, uint64_t);
125 static void aggr_pseudo_stop_ring(mac_ring_driver_t);
126 static int aggr_addmac(void *, const uint8_t *);
127 static int aggr_remmac(void *, const uint8_t *);
128 static mblk_t *aggr_rx_poll(void *, int);
129 static void aggr_fill_ring(void *, mac_ring_type_t, const int,
130     const int, mac_ring_info_t *, mac_ring_handle_t);
131 static void aggr_fill_group(void *, mac_ring_type_t, const int,
132     mac_group_info_t *, mac_group_handle_t);
133 
/*
 * Module-wide state. aggr_grp_cache and aggr_grp_hash are created in
 * aggr_grp_init() and destroyed in aggr_grp_fini(). aggr_grp_cnt is read
 * under aggr_grp_lock by aggr_grp_count(). key_ids is the id space from
 * which group keys are allocated when no key is specified.
 */
134 static kmem_cache_t	*aggr_grp_cache;
135 static mod_hash_t	*aggr_grp_hash;
136 static krwlock_t	aggr_grp_lock;
137 static uint_t		aggr_grp_cnt;
138 static id_space_t	*key_ids;
139 
/*
 * GRP_HASHSZ is the size passed when creating aggr_grp_hash, and
 * GRP_HASH_KEY() converts a linkid into the key type used by that hash.
 * NOTE(review): AGGR_PORT_NAME_DELIMIT is not used in the visible part of
 * the file; its consumer should be confirmed before changing it.
 */
140 #define	GRP_HASHSZ		64
141 #define	GRP_HASH_KEY(linkid)	((mod_hash_key_t)(uintptr_t)linkid)
142 #define	AGGR_PORT_NAME_DELIMIT '-'
143 
/* An all-zero six byte MAC address. */
144 static uchar_t aggr_zero_mac[] = {0, 0, 0, 0, 0, 0};
145 
/*
 * Flags placed in the first member of aggr_m_callbacks below. The flagged
 * callbacks (ioctl, getcapab, setprop, propinfo) correspond to the
 * aggr_m_ioctl, aggr_m_capab_get, aggr_m_setprop and aggr_m_propinfo
 * entries of the table.
 */
146 #define	AGGR_M_CALLBACK_FLAGS	\
147 	(MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_PROPINFO)
148 
/*
 * MAC callback vector of the aggregation pseudo driver. The initializer is
 * positional, so the order of the entries must not be changed; NULL entries
 * are callbacks that this table does not supply.
 */
149 static mac_callbacks_t aggr_m_callbacks = {
150 	AGGR_M_CALLBACK_FLAGS,
151 	aggr_m_stat,
152 	aggr_m_start,
153 	aggr_m_stop,
154 	aggr_m_promisc,
155 	aggr_m_multicst,
156 	NULL,
157 	NULL,
158 	NULL,
159 	aggr_m_ioctl,
160 	aggr_m_capab_get,
161 	NULL,
162 	NULL,
163 	aggr_m_setprop,
164 	NULL,
165 	aggr_m_propinfo
166 };
167 
168 /*ARGSUSED*/
169 static int
170 aggr_grp_constructor(void *buf, void *arg, int kmflag)
171 {
172 	aggr_grp_t *grp = buf;
173 
174 	bzero(grp, sizeof (*grp));
175 	mutex_init(&grp->lg_lacp_lock, NULL, MUTEX_DEFAULT, NULL);
176 	cv_init(&grp->lg_lacp_cv, NULL, CV_DEFAULT, NULL);
177 	rw_init(&grp->lg_tx_lock, NULL, RW_DRIVER, NULL);
178 	mutex_init(&grp->lg_port_lock, NULL, MUTEX_DEFAULT, NULL);
179 	cv_init(&grp->lg_port_cv, NULL, CV_DEFAULT, NULL);
180 	mutex_init(&grp->lg_tx_flowctl_lock, NULL, MUTEX_DEFAULT, NULL);
181 	cv_init(&grp->lg_tx_flowctl_cv, NULL, CV_DEFAULT, NULL);
182 	grp->lg_link_state = LINK_STATE_UNKNOWN;
183 	return (0);
184 }
185 
186 /*ARGSUSED*/
187 static void
188 aggr_grp_destructor(void *buf, void *arg)
189 {
190 	aggr_grp_t *grp = buf;
191 
192 	if (grp->lg_tx_ports != NULL) {
193 		kmem_free(grp->lg_tx_ports,
194 		    grp->lg_tx_ports_size * sizeof (aggr_port_t *));
195 	}
196 
197 	mutex_destroy(&grp->lg_lacp_lock);
198 	cv_destroy(&grp->lg_lacp_cv);
199 	mutex_destroy(&grp->lg_port_lock);
200 	cv_destroy(&grp->lg_port_cv);
201 	rw_destroy(&grp->lg_tx_lock);
202 	mutex_destroy(&grp->lg_tx_flowctl_lock);
203 	cv_destroy(&grp->lg_tx_flowctl_cv);
204 }
205 
206 void
207 aggr_grp_init(void)
208 {
209 	aggr_grp_cache = kmem_cache_create("aggr_grp_cache",
210 	    sizeof (aggr_grp_t), 0, aggr_grp_constructor,
211 	    aggr_grp_destructor, NULL, NULL, NULL, 0);
212 
213 	aggr_grp_hash = mod_hash_create_idhash("aggr_grp_hash",
214 	    GRP_HASHSZ, mod_hash_null_valdtor);
215 	rw_init(&aggr_grp_lock, NULL, RW_DEFAULT, NULL);
216 	aggr_grp_cnt = 0;
217 
218 	/*
219 	 * Allocate an id space to manage key values (when key is not
220 	 * specified). The range of the id space will be from
221 	 * (AGGR_MAX_KEY + 1) to UINT16_MAX, because the LACP protocol
222 	 * uses a 16-bit key.
223 	 */
224 	key_ids = id_space_create("aggr_key_ids", AGGR_MAX_KEY + 1, UINT16_MAX);
225 	ASSERT(key_ids != NULL);
226 }
227 
228 void
229 aggr_grp_fini(void)
230 {
231 	id_space_destroy(key_ids);
232 	rw_destroy(&aggr_grp_lock);
233 	mod_hash_destroy_idhash(aggr_grp_hash);
234 	kmem_cache_destroy(aggr_grp_cache);
235 }
236 
237 uint_t
238 aggr_grp_count(void)
239 {
240 	uint_t	count;
241 
242 	rw_enter(&aggr_grp_lock, RW_READER);
243 	count = aggr_grp_cnt;
244 	rw_exit(&aggr_grp_lock);
245 	return (count);
246 }
247 
248 /*
249  * Since both aggr_port_notify_cb() and aggr_port_timer_thread() functions
250  * requires the mac perimeter, this function holds a reference of the aggr
251  * and aggr won't call mac_unregister() until this reference drops to 0.
252  */
253 void
254 aggr_grp_port_hold(aggr_port_t *port)
255 {
256 	aggr_grp_t	*grp = port->lp_grp;
257 
258 	AGGR_PORT_REFHOLD(port);
259 	mutex_enter(&grp->lg_port_lock);
260 	grp->lg_port_ref++;
261 	mutex_exit(&grp->lg_port_lock);
262 }
263 
264 /*
265  * Release the reference of the grp and inform aggr_grp_delete() calling
266  * mac_unregister() is now safe.
267  */
268 void
269 aggr_grp_port_rele(aggr_port_t *port)
270 {
271 	aggr_grp_t	*grp = port->lp_grp;
272 
273 	mutex_enter(&grp->lg_port_lock);
274 	if (--grp->lg_port_ref == 0)
275 		cv_signal(&grp->lg_port_cv);
276 	mutex_exit(&grp->lg_port_lock);
277 	AGGR_PORT_REFRELE(port);
278 }
279 
280 /*
281  * Wait for the port's lacp timer thread and the port's notification callback
282  * to exit.
283  */
284 void
285 aggr_grp_port_wait(aggr_grp_t *grp)
286 {
287 	mutex_enter(&grp->lg_port_lock);
288 	if (grp->lg_port_ref != 0)
289 		cv_wait(&grp->lg_port_cv, &grp->lg_port_lock);
290 	mutex_exit(&grp->lg_port_lock);
291 }
292 
293 /*
294  * Attach a port to a link aggregation group.
295  *
296  * A port is attached to a link aggregation group once its speed
297  * and link state have been verified.
298  *
299  * Returns B_TRUE if the group link state or speed has changed. If
300  * it's the case, the caller must notify the MAC layer via a call
301  * to mac_link().
302  */
303 boolean_t
304 aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port)
305 {
306 	boolean_t link_state_changed = B_FALSE;
307 
308 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
309 	ASSERT(MAC_PERIM_HELD(port->lp_mh));
310 
311 	if (port->lp_state == AGGR_PORT_STATE_ATTACHED)
312 		return (B_FALSE);
313 
314 	/*
315 	 * Validate the MAC port link speed and update the group
316 	 * link speed if needed.
317 	 */
318 	if (port->lp_ifspeed == 0 ||
319 	    port->lp_link_state != LINK_STATE_UP ||
320 	    port->lp_link_duplex != LINK_DUPLEX_FULL) {
321 		/*
322 		 * Can't attach a MAC port with unknown link speed,
323 		 * down link, or not in full duplex mode.
324 		 */
325 		return (B_FALSE);
326 	}
327 
328 	if (grp->lg_ifspeed == 0) {
329 		/*
330 		 * The group inherits the speed of the first link being
331 		 * attached.
332 		 */
333 		grp->lg_ifspeed = port->lp_ifspeed;
334 		link_state_changed = B_TRUE;
335 	} else if (grp->lg_ifspeed != port->lp_ifspeed) {
336 		/*
337 		 * The link speed of the MAC port must be the same as
338 		 * the group link speed, as per 802.3ad. Since it is
339 		 * not, the attach is cancelled.
340 		 */
341 		return (B_FALSE);
342 	}
343 
344 	grp->lg_nattached_ports++;
345 
346 	/*
347 	 * Update the group link state.
348 	 */
349 	if (grp->lg_link_state != LINK_STATE_UP) {
350 		grp->lg_link_state = LINK_STATE_UP;
351 		grp->lg_link_duplex = LINK_DUPLEX_FULL;
352 		link_state_changed = B_TRUE;
353 	}
354 
355 	/*
356 	 * Update port's state.
357 	 */
358 	port->lp_state = AGGR_PORT_STATE_ATTACHED;
359 
360 	aggr_grp_multicst_port(port, B_TRUE);
361 
362 	/*
363 	 * Set port's receive callback
364 	 */
365 	mac_rx_set(port->lp_mch, aggr_recv_cb, port);
366 
367 	/*
368 	 * If LACP is OFF, the port can be used to send data as soon
369 	 * as its link is up and verified to be compatible with the
370 	 * aggregation.
371 	 *
372 	 * If LACP is active or passive, notify the LACP subsystem, which
373 	 * will enable sending on the port following the LACP protocol.
374 	 */
375 	if (grp->lg_lacp_mode == AGGR_LACP_OFF)
376 		aggr_send_port_enable(port);
377 	else
378 		aggr_lacp_port_attached(port);
379 
380 	return (link_state_changed);
381 }
382 
383 boolean_t
384 aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port)
385 {
386 	boolean_t link_state_changed = B_FALSE;
387 
388 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
389 	ASSERT(MAC_PERIM_HELD(port->lp_mh));
390 
391 	/* update state */
392 	if (port->lp_state != AGGR_PORT_STATE_ATTACHED)
393 		return (B_FALSE);
394 
395 	mac_rx_clear(port->lp_mch);
396 
397 	aggr_grp_multicst_port(port, B_FALSE);
398 
399 	if (grp->lg_lacp_mode == AGGR_LACP_OFF)
400 		aggr_send_port_disable(port);
401 	else
402 		aggr_lacp_port_detached(port);
403 
404 	port->lp_state = AGGR_PORT_STATE_STANDBY;
405 
406 	grp->lg_nattached_ports--;
407 	if (grp->lg_nattached_ports == 0) {
408 		/* the last attached MAC port of the group is being detached */
409 		grp->lg_ifspeed = 0;
410 		grp->lg_link_state = LINK_STATE_DOWN;
411 		grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
412 		link_state_changed = B_TRUE;
413 	}
414 
415 	return (link_state_changed);
416 }
417 
418 /*
419  * Update the MAC addresses of the constituent ports of the specified
420  * group. This function is invoked:
421  * - after creating a new aggregation group.
422  * - after adding new ports to an aggregation group.
423  * - after removing a port from a group when the MAC address of
424  *   that port was used for the MAC address of the group.
425  * - after the MAC address of a port changed when the MAC address
426  *   of that port was used for the MAC address of the group.
427  *
428  * Return true if the link state of the aggregation changed, for example
429  * as a result of a failure changing the MAC address of one of the
430  * constituent ports.
431  */
432 boolean_t
433 aggr_grp_update_ports_mac(aggr_grp_t *grp)
434 {
435 	aggr_port_t *cport;
436 	boolean_t link_state_changed = B_FALSE;
437 	mac_perim_handle_t mph;
438 
439 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
440 
441 	for (cport = grp->lg_ports; cport != NULL;
442 	    cport = cport->lp_next) {
443 		mac_perim_enter_by_mh(cport->lp_mh, &mph);
444 		if (aggr_port_unicst(cport) != 0) {
445 			if (aggr_grp_detach_port(grp, cport))
446 				link_state_changed = B_TRUE;
447 		} else {
448 			/*
449 			 * If a port was detached because of a previous
450 			 * failure changing the MAC address, the port is
451 			 * reattached when it successfully changes the MAC
452 			 * address now, and this might cause the link state
453 			 * of the aggregation to change.
454 			 */
455 			if (aggr_grp_attach_port(grp, cport))
456 				link_state_changed = B_TRUE;
457 		}
458 		mac_perim_exit(mph);
459 	}
460 	return (link_state_changed);
461 }
462 
463 /*
464  * Invoked when the MAC address of a port has changed. If the port's
465  * MAC address was used for the group MAC address, set mac_addr_changedp
466  * to B_TRUE to indicate to the caller that it should send a MAC_NOTE_UNICST
467  * notification. If the link state changes due to detach/attach of
468  * the constituent port, set link_state_changedp to B_TRUE to indicate
469  * to the caller that it should send a MAC_NOTE_LINK notification. In both
470  * cases, it is the responsibility of the caller to invoke notification
471  * functions after releasing the the port lock.
472  */
473 void
474 aggr_grp_port_mac_changed(aggr_grp_t *grp, aggr_port_t *port,
475     boolean_t *mac_addr_changedp, boolean_t *link_state_changedp)
476 {
477 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
478 	ASSERT(MAC_PERIM_HELD(port->lp_mh));
479 	ASSERT(mac_addr_changedp != NULL);
480 	ASSERT(link_state_changedp != NULL);
481 
482 	*mac_addr_changedp = B_FALSE;
483 	*link_state_changedp = B_FALSE;
484 
485 	if (grp->lg_addr_fixed) {
486 		/*
487 		 * The group is using a fixed MAC address or an automatic
488 		 * MAC address has not been set.
489 		 */
490 		return;
491 	}
492 
493 	if (grp->lg_mac_addr_port == port) {
494 		/*
495 		 * The MAC address of the port was assigned to the group
496 		 * MAC address. Update the group MAC address.
497 		 */
498 		bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL);
499 		*mac_addr_changedp = B_TRUE;
500 	} else {
501 		/*
502 		 * Update the actual port MAC address to the MAC address
503 		 * of the group.
504 		 */
505 		if (aggr_port_unicst(port) != 0) {
506 			*link_state_changedp = aggr_grp_detach_port(grp, port);
507 		} else {
508 			/*
509 			 * If a port was detached because of a previous
510 			 * failure changing the MAC address, the port is
511 			 * reattached when it successfully changes the MAC
512 			 * address now, and this might cause the link state
513 			 * of the aggregation to change.
514 			 */
515 			*link_state_changedp = aggr_grp_attach_port(grp, port);
516 		}
517 	}
518 }
519 
520 /*
521  * Add a port to a link aggregation group.
522  */
523 static int
524 aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t port_linkid, boolean_t force,
525     aggr_port_t **pp)
526 {
527 	aggr_port_t *port, **cport;
528 	mac_perim_handle_t mph;
529 	zoneid_t port_zoneid = ALL_ZONES;
530 	int err;
531 
532 	/* The port must be int the same zone as the aggregation. */
533 	if (zone_check_datalink(&port_zoneid, port_linkid) != 0)
534 		port_zoneid = GLOBAL_ZONEID;
535 	if (grp->lg_zoneid != port_zoneid)
536 		return (EBUSY);
537 
538 	/*
539 	 * lg_mh could be NULL when the function is called during the creation
540 	 * of the aggregation.
541 	 */
542 	ASSERT(grp->lg_mh == NULL || MAC_PERIM_HELD(grp->lg_mh));
543 
544 	/* create new port */
545 	err = aggr_port_create(grp, port_linkid, force, &port);
546 	if (err != 0)
547 		return (err);
548 
549 	mac_perim_enter_by_mh(port->lp_mh, &mph);
550 
551 	/* add port to list of group constituent ports */
552 	cport = &grp->lg_ports;
553 	while (*cport != NULL)
554 		cport = &((*cport)->lp_next);
555 	*cport = port;
556 
557 	/*
558 	 * Back reference to the group it is member of. A port always
559 	 * holds a reference to its group to ensure that the back
560 	 * reference is always valid.
561 	 */
562 	port->lp_grp = grp;
563 	AGGR_GRP_REFHOLD(grp);
564 	grp->lg_nports++;
565 
566 	aggr_lacp_init_port(port);
567 	mac_perim_exit(mph);
568 
569 	if (pp != NULL)
570 		*pp = port;
571 
572 	return (0);
573 }
574 
575 /*
576  * Add a pseudo RX ring for the given HW ring handle.
577  */
578 static int
579 aggr_add_pseudo_rx_ring(aggr_port_t *port,
580     aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
581 {
582 	aggr_pseudo_rx_ring_t	*ring;
583 	int			err;
584 	int			j;
585 
586 	for (j = 0; j < MAX_RINGS_PER_GROUP; j++) {
587 		ring = rx_grp->arg_rings + j;
588 		if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE))
589 			break;
590 	}
591 
592 	/*
593 	 * No slot for this new RX ring.
594 	 */
595 	if (j == MAX_RINGS_PER_GROUP)
596 		return (EIO);
597 
598 	ring->arr_flags |= MAC_PSEUDO_RING_INUSE;
599 	ring->arr_hw_rh = hw_rh;
600 	ring->arr_port = port;
601 	rx_grp->arg_ring_cnt++;
602 
603 	/*
604 	 * The group is already registered, dynamically add a new ring to the
605 	 * mac group.
606 	 */
607 	if ((err = mac_group_add_ring(rx_grp->arg_gh, j)) != 0) {
608 		ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
609 		ring->arr_hw_rh = NULL;
610 		ring->arr_port = NULL;
611 		rx_grp->arg_ring_cnt--;
612 	} else {
613 		mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring,
614 		    mac_find_ring(rx_grp->arg_gh, j));
615 	}
616 	return (err);
617 }
618 
619 /*
620  * Remove the pseudo RX ring of the given HW ring handle.
621  */
622 static void
623 aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
624 {
625 	aggr_pseudo_rx_ring_t	*ring;
626 	int			j;
627 
628 	for (j = 0; j < MAX_RINGS_PER_GROUP; j++) {
629 		ring = rx_grp->arg_rings + j;
630 		if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE) ||
631 		    ring->arr_hw_rh != hw_rh) {
632 			continue;
633 		}
634 
635 		mac_group_rem_ring(rx_grp->arg_gh, ring->arr_rh);
636 
637 		ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
638 		ring->arr_hw_rh = NULL;
639 		ring->arr_port = NULL;
640 		rx_grp->arg_ring_cnt--;
641 		mac_hwring_teardown(hw_rh);
642 		break;
643 	}
644 }
645 
646 /*
647  * This function is called to create pseudo rings over the hardware rings of
648  * the underlying device. Note that there is a 1:1 mapping between the pseudo
649  * RX rings of the aggr and the hardware rings of the underlying port.
650  */
651 static int
652 aggr_add_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
653 {
654 	aggr_grp_t		*grp = port->lp_grp;
655 	mac_ring_handle_t	hw_rh[MAX_RINGS_PER_GROUP];
656 	aggr_unicst_addr_t	*addr, *a;
657 	mac_perim_handle_t	pmph;
658 	int			hw_rh_cnt, i = 0, j;
659 	int			err = 0;
660 
661 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
662 	mac_perim_enter_by_mh(port->lp_mh, &pmph);
663 
664 	/*
665 	 * This function must be called after the aggr registers its mac
666 	 * and its RX group has been initialized.
667 	 */
668 	ASSERT(rx_grp->arg_gh != NULL);
669 
670 	/*
671 	 * Get the list the the underlying HW rings.
672 	 */
673 	hw_rh_cnt = mac_hwrings_get(port->lp_mch,
674 	    &port->lp_hwgh, hw_rh, MAC_RING_TYPE_RX);
675 
676 	if (port->lp_hwgh != NULL) {
677 		/*
678 		 * Quiesce the HW ring and the mac srs on the ring. Note
679 		 * that the HW ring will be restarted when the pseudo ring
680 		 * is started. At that time all the packets will be
681 		 * directly passed up to the pseudo RX ring and handled
682 		 * by mac srs created over the pseudo RX ring.
683 		 */
684 		mac_rx_client_quiesce(port->lp_mch);
685 		mac_srs_perm_quiesce(port->lp_mch, B_TRUE);
686 	}
687 
688 	/*
689 	 * Add all the unicast addresses to the newly added port.
690 	 */
691 	for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) {
692 		if ((err = aggr_port_addmac(port, addr->aua_addr)) != 0)
693 			break;
694 	}
695 
696 	for (i = 0; err == 0 && i < hw_rh_cnt; i++)
697 		err = aggr_add_pseudo_rx_ring(port, rx_grp, hw_rh[i]);
698 
699 	if (err != 0) {
700 		for (j = 0; j < i; j++)
701 			aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]);
702 
703 		for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next)
704 			aggr_port_remmac(port, a->aua_addr);
705 
706 		if (port->lp_hwgh != NULL) {
707 			mac_srs_perm_quiesce(port->lp_mch, B_FALSE);
708 			mac_rx_client_restart(port->lp_mch);
709 			port->lp_hwgh = NULL;
710 		}
711 	} else {
712 		port->lp_rx_grp_added = B_TRUE;
713 	}
714 done:
715 	mac_perim_exit(pmph);
716 	return (err);
717 }
718 
719 /*
720  * This function is called by aggr to remove pseudo RX rings over the
721  * HW rings of the underlying port.
722  */
723 static void
724 aggr_rem_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
725 {
726 	aggr_grp_t		*grp = port->lp_grp;
727 	mac_ring_handle_t	hw_rh[MAX_RINGS_PER_GROUP];
728 	aggr_unicst_addr_t	*addr;
729 	mac_group_handle_t	hwgh;
730 	mac_perim_handle_t	pmph;
731 	int			hw_rh_cnt, i;
732 
733 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
734 	mac_perim_enter_by_mh(port->lp_mh, &pmph);
735 
736 	if (!port->lp_rx_grp_added)
737 		goto done;
738 
739 	ASSERT(rx_grp->arg_gh != NULL);
740 	hw_rh_cnt = mac_hwrings_get(port->lp_mch,
741 	    &hwgh, hw_rh, MAC_RING_TYPE_RX);
742 
743 	/*
744 	 * If hw_rh_cnt is 0, it means that the underlying port does not
745 	 * support RX rings. Directly return in this case.
746 	 */
747 	for (i = 0; i < hw_rh_cnt; i++)
748 		aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[i]);
749 
750 	for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next)
751 		aggr_port_remmac(port, addr->aua_addr);
752 
753 	if (port->lp_hwgh != NULL) {
754 		port->lp_hwgh = NULL;
755 
756 		/*
757 		 * First clear the permanent-quiesced flag of the RX srs then
758 		 * restart the HW ring and the mac srs on the ring. Note that
759 		 * the HW ring and associated SRS will soon been removed when
760 		 * the port is removed from the aggr.
761 		 */
762 		mac_srs_perm_quiesce(port->lp_mch, B_FALSE);
763 		mac_rx_client_restart(port->lp_mch);
764 	}
765 
766 	port->lp_rx_grp_added = B_FALSE;
767 done:
768 	mac_perim_exit(pmph);
769 }
770 
771 /*
772  * Add a pseudo TX ring for the given HW ring handle.
773  */
774 static int
775 aggr_add_pseudo_tx_ring(aggr_port_t *port,
776     aggr_pseudo_tx_group_t *tx_grp, mac_ring_handle_t hw_rh,
777     mac_ring_handle_t *pseudo_rh)
778 {
779 	aggr_pseudo_tx_ring_t	*ring;
780 	int			err;
781 	int			i;
782 
783 	ASSERT(MAC_PERIM_HELD(port->lp_mh));
784 	for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
785 		ring = tx_grp->atg_rings + i;
786 		if (!(ring->atr_flags & MAC_PSEUDO_RING_INUSE))
787 			break;
788 	}
789 	/*
790 	 * No slot for this new TX ring.
791 	 */
792 	if (i == MAX_RINGS_PER_GROUP)
793 		return (EIO);
794 	/*
795 	 * The following 4 statements needs to be done before
796 	 * calling mac_group_add_ring(). Otherwise it will
797 	 * result in an assertion failure in mac_init_ring().
798 	 */
799 	ring->atr_flags |= MAC_PSEUDO_RING_INUSE;
800 	ring->atr_hw_rh = hw_rh;
801 	ring->atr_port = port;
802 	tx_grp->atg_ring_cnt++;
803 
804 	/*
805 	 * The TX side has no concept of ring groups unlike RX groups.
806 	 * There is just a single group which stores all the TX rings.
807 	 * This group will be used to store aggr's pseudo TX rings.
808 	 */
809 	if ((err = mac_group_add_ring(tx_grp->atg_gh, i)) != 0) {
810 		ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE;
811 		ring->atr_hw_rh = NULL;
812 		ring->atr_port = NULL;
813 		tx_grp->atg_ring_cnt--;
814 	} else {
815 		*pseudo_rh = mac_find_ring(tx_grp->atg_gh, i);
816 		if (hw_rh != NULL) {
817 			mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring,
818 			    mac_find_ring(tx_grp->atg_gh, i));
819 		}
820 	}
821 	return (err);
822 }
823 
824 /*
825  * Remove the pseudo TX ring of the given HW ring handle.
826  */
827 static void
828 aggr_rem_pseudo_tx_ring(aggr_pseudo_tx_group_t *tx_grp,
829     mac_ring_handle_t pseudo_hw_rh)
830 {
831 	aggr_pseudo_tx_ring_t	*ring;
832 	int			i;
833 
834 	for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
835 		ring = tx_grp->atg_rings + i;
836 		if (ring->atr_rh != pseudo_hw_rh)
837 			continue;
838 
839 		ASSERT(ring->atr_flags & MAC_PSEUDO_RING_INUSE);
840 		mac_group_rem_ring(tx_grp->atg_gh, pseudo_hw_rh);
841 		ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE;
842 		mac_hwring_teardown(ring->atr_hw_rh);
843 		ring->atr_hw_rh = NULL;
844 		ring->atr_port = NULL;
845 		tx_grp->atg_ring_cnt--;
846 		break;
847 	}
848 }
849 
850 /*
851  * This function is called to create pseudo rings over hardware rings of
852  * the underlying device. There is a 1:1 mapping between the pseudo TX
853  * rings of the aggr and the hardware rings of the underlying port.
854  */
855 static int
856 aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp)
857 {
858 	aggr_grp_t		*grp = port->lp_grp;
859 	mac_ring_handle_t	hw_rh[MAX_RINGS_PER_GROUP], pseudo_rh;
860 	mac_perim_handle_t	pmph;
861 	int			hw_rh_cnt, i = 0, j;
862 	int			err = 0;
863 
864 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
865 	mac_perim_enter_by_mh(port->lp_mh, &pmph);
866 
867 	/*
868 	 * Get the list the the underlying HW rings.
869 	 */
870 	hw_rh_cnt = mac_hwrings_get(port->lp_mch,
871 	    NULL, hw_rh, MAC_RING_TYPE_TX);
872 
873 	/*
874 	 * Even if the underlying NIC does not have TX rings, we
875 	 * still make a psuedo TX ring for that NIC with NULL as
876 	 * the ring handle.
877 	 */
878 	if (hw_rh_cnt == 0)
879 		port->lp_tx_ring_cnt = 1;
880 	else
881 		port->lp_tx_ring_cnt = hw_rh_cnt;
882 
883 	port->lp_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
884 	    port->lp_tx_ring_cnt), KM_SLEEP);
885 	port->lp_pseudo_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
886 	    port->lp_tx_ring_cnt), KM_SLEEP);
887 
888 	if (hw_rh_cnt == 0) {
889 		if ((err = aggr_add_pseudo_tx_ring(port, tx_grp,
890 		    NULL, &pseudo_rh)) == 0) {
891 			port->lp_tx_rings[0] = NULL;
892 			port->lp_pseudo_tx_rings[0] = pseudo_rh;
893 		}
894 	} else {
895 		for (i = 0; err == 0 && i < hw_rh_cnt; i++) {
896 			err = aggr_add_pseudo_tx_ring(port,
897 			    tx_grp, hw_rh[i], &pseudo_rh);
898 			if (err != 0)
899 				break;
900 			port->lp_tx_rings[i] = hw_rh[i];
901 			port->lp_pseudo_tx_rings[i] = pseudo_rh;
902 		}
903 	}
904 
905 	if (err != 0) {
906 		if (hw_rh_cnt != 0) {
907 			for (j = 0; j < i; j++) {
908 				aggr_rem_pseudo_tx_ring(tx_grp,
909 				    port->lp_pseudo_tx_rings[j]);
910 			}
911 		}
912 		kmem_free(port->lp_tx_rings,
913 		    (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
914 		kmem_free(port->lp_pseudo_tx_rings,
915 		    (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
916 		port->lp_tx_ring_cnt = 0;
917 	} else {
918 		port->lp_tx_grp_added = B_TRUE;
919 		port->lp_tx_notify_mh = mac_client_tx_notify(port->lp_mch,
920 		    aggr_tx_ring_update, port);
921 	}
922 	mac_perim_exit(pmph);
923 	return (err);
924 }
925 
926 /*
927  * This function is called by aggr to remove pseudo TX rings over the
928  * HW rings of the underlying port.
929  */
930 static void
931 aggr_rem_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp)
932 {
933 	aggr_grp_t		*grp = port->lp_grp;
934 	mac_perim_handle_t	pmph;
935 	int			i;
936 
937 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
938 	mac_perim_enter_by_mh(port->lp_mh, &pmph);
939 
940 	if (!port->lp_tx_grp_added)
941 		goto done;
942 
943 	ASSERT(tx_grp->atg_gh != NULL);
944 
945 	for (i = 0; i < port->lp_tx_ring_cnt; i++)
946 		aggr_rem_pseudo_tx_ring(tx_grp, port->lp_pseudo_tx_rings[i]);
947 
948 	kmem_free(port->lp_tx_rings,
949 	    (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
950 	kmem_free(port->lp_pseudo_tx_rings,
951 	    (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
952 
953 	port->lp_tx_ring_cnt = 0;
954 	(void) mac_client_tx_notify(port->lp_mch, NULL, port->lp_tx_notify_mh);
955 	port->lp_tx_grp_added = B_FALSE;
956 done:
957 	mac_perim_exit(pmph);
958 }
959 
960 static int
961 aggr_pseudo_disable_intr(mac_intr_handle_t ih)
962 {
963 	aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
964 	return (mac_hwring_disable_intr(rr_ring->arr_hw_rh));
965 }
966 
967 static int
968 aggr_pseudo_enable_intr(mac_intr_handle_t ih)
969 {
970 	aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
971 	return (mac_hwring_enable_intr(rr_ring->arr_hw_rh));
972 }
973 
974 static int
975 aggr_pseudo_start_ring(mac_ring_driver_t arg, uint64_t mr_gen)
976 {
977 	aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
978 	int err;
979 
980 	err = mac_hwring_start(rr_ring->arr_hw_rh);
981 	if (err == 0)
982 		rr_ring->arr_gen = mr_gen;
983 	return (err);
984 }
985 
986 static void
987 aggr_pseudo_stop_ring(mac_ring_driver_t arg)
988 {
989 	aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
990 	mac_hwring_stop(rr_ring->arr_hw_rh);
991 }
992 
993 /*
994  * Add one or more ports to an existing link aggregation group.
995  */
996 int
997 aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force,
998     laioc_port_t *ports)
999 {
1000 	int rc, i, nadded = 0;
1001 	aggr_grp_t *grp = NULL;
1002 	aggr_port_t *port;
1003 	boolean_t link_state_changed = B_FALSE;
1004 	mac_perim_handle_t mph, pmph;
1005 
1006 	/* get group corresponding to linkid */
1007 	rw_enter(&aggr_grp_lock, RW_READER);
1008 	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1009 	    (mod_hash_val_t *)&grp) != 0) {
1010 		rw_exit(&aggr_grp_lock);
1011 		return (ENOENT);
1012 	}
1013 	AGGR_GRP_REFHOLD(grp);
1014 
1015 	/*
1016 	 * Hold the perimeter so that the aggregation won't be destroyed.
1017 	 */
1018 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1019 	rw_exit(&aggr_grp_lock);
1020 
1021 	/* add the specified ports to group */
1022 	for (i = 0; i < nports; i++) {
1023 		/* add port to group */
1024 		if ((rc = aggr_grp_add_port(grp, ports[i].lp_linkid,
1025 		    force, &port)) != 0) {
1026 			goto bail;
1027 		}
1028 		ASSERT(port != NULL);
1029 		nadded++;
1030 
1031 		/* check capabilities */
1032 		if (!aggr_grp_capab_check(grp, port) ||
1033 		    !aggr_grp_sdu_check(grp, port) ||
1034 		    !aggr_grp_margin_check(grp, port)) {
1035 			rc = ENOTSUP;
1036 			goto bail;
1037 		}
1038 
1039 		/*
1040 		 * Create the pseudo ring for each HW ring of the underlying
1041 		 * port.
1042 		 */
1043 		rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group);
1044 		if (rc != 0)
1045 			goto bail;
1046 		rc = aggr_add_pseudo_rx_group(port, &grp->lg_rx_group);
1047 		if (rc != 0)
1048 			goto bail;
1049 
1050 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
1051 
1052 		/* set LACP mode */
1053 		aggr_port_lacp_set_mode(grp, port);
1054 
1055 		/* start port if group has already been started */
1056 		if (grp->lg_started) {
1057 			rc = aggr_port_start(port);
1058 			if (rc != 0) {
1059 				mac_perim_exit(pmph);
1060 				goto bail;
1061 			}
1062 
1063 			/*
1064 			 * Turn on the promiscuous mode over the port when it
1065 			 * is requested to be turned on to receive the
1066 			 * non-primary address over a port, or the promiscous
1067 			 * mode is enabled over the aggr.
1068 			 */
1069 			if (grp->lg_promisc || port->lp_prom_addr != NULL) {
1070 				rc = aggr_port_promisc(port, B_TRUE);
1071 				if (rc != 0) {
1072 					mac_perim_exit(pmph);
1073 					goto bail;
1074 				}
1075 			}
1076 		}
1077 		mac_perim_exit(pmph);
1078 
1079 		/*
1080 		 * Attach each port if necessary.
1081 		 */
1082 		if (aggr_port_notify_link(grp, port))
1083 			link_state_changed = B_TRUE;
1084 
1085 		/*
1086 		 * Initialize the callback functions for this port.
1087 		 */
1088 		aggr_port_init_callbacks(port);
1089 	}
1090 
1091 	/* update the MAC address of the constituent ports */
1092 	if (aggr_grp_update_ports_mac(grp))
1093 		link_state_changed = B_TRUE;
1094 
1095 	if (link_state_changed)
1096 		mac_link_update(grp->lg_mh, grp->lg_link_state);
1097 
1098 bail:
1099 	if (rc != 0) {
1100 		/* stop and remove ports that have been added */
1101 		for (i = 0; i < nadded; i++) {
1102 			port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1103 			ASSERT(port != NULL);
1104 			if (grp->lg_started) {
1105 				mac_perim_enter_by_mh(port->lp_mh, &pmph);
1106 				(void) aggr_port_promisc(port, B_FALSE);
1107 				aggr_port_stop(port);
1108 				mac_perim_exit(pmph);
1109 			}
1110 			aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1111 			aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);
1112 			(void) aggr_grp_rem_port(grp, port, NULL, NULL);
1113 		}
1114 	}
1115 
1116 	mac_perim_exit(mph);
1117 	AGGR_GRP_REFRELE(grp);
1118 	return (rc);
1119 }
1120 
1121 static int
1122 aggr_grp_modify_common(aggr_grp_t *grp, uint8_t update_mask, uint32_t policy,
1123     boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode,
1124     aggr_lacp_timer_t lacp_timer)
1125 {
1126 	boolean_t mac_addr_changed = B_FALSE;
1127 	boolean_t link_state_changed = B_FALSE;
1128 	mac_perim_handle_t pmph;
1129 
1130 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1131 
1132 	/* validate fixed address if specified */
1133 	if ((update_mask & AGGR_MODIFY_MAC) && mac_fixed &&
1134 	    ((bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) ||
1135 	    (mac_addr[0] & 0x01))) {
1136 		return (EINVAL);
1137 	}
1138 
1139 	/* update policy if requested */
1140 	if (update_mask & AGGR_MODIFY_POLICY)
1141 		aggr_send_update_policy(grp, policy);
1142 
1143 	/* update unicast MAC address if requested */
1144 	if (update_mask & AGGR_MODIFY_MAC) {
1145 		if (mac_fixed) {
1146 			/* user-supplied MAC address */
1147 			grp->lg_mac_addr_port = NULL;
1148 			if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) != 0) {
1149 				bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
1150 				mac_addr_changed = B_TRUE;
1151 			}
1152 		} else if (grp->lg_addr_fixed) {
1153 			/* switch from user-supplied to automatic */
1154 			aggr_port_t *port = grp->lg_ports;
1155 
1156 			mac_perim_enter_by_mh(port->lp_mh, &pmph);
1157 			bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL);
1158 			grp->lg_mac_addr_port = port;
1159 			mac_addr_changed = B_TRUE;
1160 			mac_perim_exit(pmph);
1161 		}
1162 		grp->lg_addr_fixed = mac_fixed;
1163 	}
1164 
1165 	if (mac_addr_changed)
1166 		link_state_changed = aggr_grp_update_ports_mac(grp);
1167 
1168 	if (update_mask & AGGR_MODIFY_LACP_MODE)
1169 		aggr_lacp_update_mode(grp, lacp_mode);
1170 
1171 	if (update_mask & AGGR_MODIFY_LACP_TIMER)
1172 		aggr_lacp_update_timer(grp, lacp_timer);
1173 
1174 	if (link_state_changed)
1175 		mac_link_update(grp->lg_mh, grp->lg_link_state);
1176 
1177 	if (mac_addr_changed)
1178 		mac_unicst_update(grp->lg_mh, grp->lg_addr);
1179 
1180 	return (0);
1181 }
1182 
1183 /*
1184  * Update properties of an existing link aggregation group.
1185  */
1186 int
1187 aggr_grp_modify(datalink_id_t linkid, uint8_t update_mask, uint32_t policy,
1188     boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode,
1189     aggr_lacp_timer_t lacp_timer)
1190 {
1191 	aggr_grp_t *grp = NULL;
1192 	mac_perim_handle_t mph;
1193 	int err;
1194 
1195 	/* get group corresponding to linkid */
1196 	rw_enter(&aggr_grp_lock, RW_READER);
1197 	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1198 	    (mod_hash_val_t *)&grp) != 0) {
1199 		rw_exit(&aggr_grp_lock);
1200 		return (ENOENT);
1201 	}
1202 	AGGR_GRP_REFHOLD(grp);
1203 
1204 	/*
1205 	 * Hold the perimeter so that the aggregation won't be destroyed.
1206 	 */
1207 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1208 	rw_exit(&aggr_grp_lock);
1209 
1210 	err = aggr_grp_modify_common(grp, update_mask, policy, mac_fixed,
1211 	    mac_addr, lacp_mode, lacp_timer);
1212 
1213 	mac_perim_exit(mph);
1214 	AGGR_GRP_REFRELE(grp);
1215 	return (err);
1216 }
1217 
1218 /*
1219  * Create a new link aggregation group upon request from administrator.
1220  * Returns 0 on success, an errno on failure.
1221  */
1222 int
1223 aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports,
1224     laioc_port_t *ports, uint32_t policy, boolean_t mac_fixed, boolean_t force,
1225     uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, aggr_lacp_timer_t lacp_timer,
1226     cred_t *credp)
1227 {
1228 	aggr_grp_t *grp = NULL;
1229 	aggr_port_t *port;
1230 	mac_register_t *mac;
1231 	boolean_t link_state_changed;
1232 	mac_perim_handle_t mph;
1233 	int err;
1234 	int i;
1235 	kt_did_t tid = 0;
1236 
1237 	/* need at least one port */
1238 	if (nports == 0)
1239 		return (EINVAL);
1240 
1241 	rw_enter(&aggr_grp_lock, RW_WRITER);
1242 
1243 	/* does a group with the same linkid already exist? */
1244 	err = mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1245 	    (mod_hash_val_t *)&grp);
1246 	if (err == 0) {
1247 		rw_exit(&aggr_grp_lock);
1248 		return (EEXIST);
1249 	}
1250 
1251 	grp = kmem_cache_alloc(aggr_grp_cache, KM_SLEEP);
1252 
1253 	grp->lg_refs = 1;
1254 	grp->lg_closing = B_FALSE;
1255 	grp->lg_force = force;
1256 	grp->lg_linkid = linkid;
1257 	grp->lg_zoneid = crgetzoneid(credp);
1258 	grp->lg_ifspeed = 0;
1259 	grp->lg_link_state = LINK_STATE_UNKNOWN;
1260 	grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
1261 	grp->lg_started = B_FALSE;
1262 	grp->lg_promisc = B_FALSE;
1263 	grp->lg_lacp_done = B_FALSE;
1264 	grp->lg_tx_notify_done = B_FALSE;
1265 	grp->lg_lacp_head = grp->lg_lacp_tail = NULL;
1266 	grp->lg_lacp_rx_thread = thread_create(NULL, 0,
1267 	    aggr_lacp_rx_thread, grp, 0, &p0, TS_RUN, minclsyspri);
1268 	grp->lg_tx_notify_thread = thread_create(NULL, 0,
1269 	    aggr_tx_notify_thread, grp, 0, &p0, TS_RUN, minclsyspri);
1270 	grp->lg_tx_blocked_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
1271 	    MAX_RINGS_PER_GROUP), KM_SLEEP);
1272 	grp->lg_tx_blocked_cnt = 0;
1273 	bzero(&grp->lg_rx_group, sizeof (aggr_pseudo_rx_group_t));
1274 	bzero(&grp->lg_tx_group, sizeof (aggr_pseudo_tx_group_t));
1275 	aggr_lacp_init_grp(grp);
1276 
1277 	/* add MAC ports to group */
1278 	grp->lg_ports = NULL;
1279 	grp->lg_nports = 0;
1280 	grp->lg_nattached_ports = 0;
1281 	grp->lg_ntx_ports = 0;
1282 
1283 	/*
1284 	 * If key is not specified by the user, allocate the key.
1285 	 */
1286 	if ((key == 0) && ((key = (uint32_t)id_alloc(key_ids)) == 0)) {
1287 		err = ENOMEM;
1288 		goto bail;
1289 	}
1290 	grp->lg_key = key;
1291 
1292 	for (i = 0; i < nports; i++) {
1293 		err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, NULL);
1294 		if (err != 0)
1295 			goto bail;
1296 	}
1297 
1298 	/*
1299 	 * If no explicit MAC address was specified by the administrator,
1300 	 * set it to the MAC address of the first port.
1301 	 */
1302 	grp->lg_addr_fixed = mac_fixed;
1303 	if (grp->lg_addr_fixed) {
1304 		/* validate specified address */
1305 		if (bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) {
1306 			err = EINVAL;
1307 			goto bail;
1308 		}
1309 		bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
1310 	} else {
1311 		bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
1312 		grp->lg_mac_addr_port = grp->lg_ports;
1313 	}
1314 
1315 	/* set the initial group capabilities */
1316 	aggr_grp_capab_set(grp);
1317 
1318 	if ((mac = mac_alloc(MAC_VERSION)) == NULL) {
1319 		err = ENOMEM;
1320 		goto bail;
1321 	}
1322 	mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1323 	mac->m_driver = grp;
1324 	mac->m_dip = aggr_dip;
1325 	mac->m_instance = grp->lg_key > AGGR_MAX_KEY ? (uint_t)-1 : grp->lg_key;
1326 	mac->m_src_addr = grp->lg_addr;
1327 	mac->m_callbacks = &aggr_m_callbacks;
1328 	mac->m_min_sdu = 0;
1329 	mac->m_max_sdu = grp->lg_max_sdu = aggr_grp_max_sdu(grp);
1330 	mac->m_margin = aggr_grp_max_margin(grp);
1331 	mac->m_v12n = MAC_VIRT_LEVEL1;
1332 	err = mac_register(mac, &grp->lg_mh);
1333 	mac_free(mac);
1334 	if (err != 0)
1335 		goto bail;
1336 
1337 	err = dls_devnet_create(grp->lg_mh, grp->lg_linkid, crgetzoneid(credp));
1338 	if (err != 0) {
1339 		(void) mac_unregister(grp->lg_mh);
1340 		grp->lg_mh = NULL;
1341 		goto bail;
1342 	}
1343 
1344 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1345 
1346 	/*
1347 	 * Update the MAC address of the constituent ports.
1348 	 * None of the port is attached at this time, the link state of the
1349 	 * aggregation will not change.
1350 	 */
1351 	link_state_changed = aggr_grp_update_ports_mac(grp);
1352 	ASSERT(!link_state_changed);
1353 
1354 	/* update outbound load balancing policy */
1355 	aggr_send_update_policy(grp, policy);
1356 
1357 	/* set LACP mode */
1358 	aggr_lacp_set_mode(grp, lacp_mode, lacp_timer);
1359 
1360 	/*
1361 	 * Attach each port if necessary.
1362 	 */
1363 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1364 		/*
1365 		 * Create the pseudo ring for each HW ring of the underlying
1366 		 * port. Note that this is done after the aggr registers the
1367 		 * mac.
1368 		 */
1369 		VERIFY(aggr_add_pseudo_tx_group(port, &grp->lg_tx_group) == 0);
1370 		VERIFY(aggr_add_pseudo_rx_group(port, &grp->lg_rx_group) == 0);
1371 		if (aggr_port_notify_link(grp, port))
1372 			link_state_changed = B_TRUE;
1373 
1374 		/*
1375 		 * Initialize the callback functions for this port.
1376 		 */
1377 		aggr_port_init_callbacks(port);
1378 	}
1379 
1380 	if (link_state_changed)
1381 		mac_link_update(grp->lg_mh, grp->lg_link_state);
1382 
1383 	/* add new group to hash table */
1384 	err = mod_hash_insert(aggr_grp_hash, GRP_HASH_KEY(linkid),
1385 	    (mod_hash_val_t)grp);
1386 	ASSERT(err == 0);
1387 	aggr_grp_cnt++;
1388 
1389 	mac_perim_exit(mph);
1390 	rw_exit(&aggr_grp_lock);
1391 	return (0);
1392 
1393 bail:
1394 
1395 	grp->lg_closing = B_TRUE;
1396 
1397 	port = grp->lg_ports;
1398 	while (port != NULL) {
1399 		aggr_port_t *cport;
1400 
1401 		cport = port->lp_next;
1402 		aggr_port_delete(port);
1403 		port = cport;
1404 	}
1405 
1406 	/*
1407 	 * Inform the lacp_rx thread to exit.
1408 	 */
1409 	mutex_enter(&grp->lg_lacp_lock);
1410 	grp->lg_lacp_done = B_TRUE;
1411 	cv_signal(&grp->lg_lacp_cv);
1412 	while (grp->lg_lacp_rx_thread != NULL)
1413 		cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock);
1414 	mutex_exit(&grp->lg_lacp_lock);
1415 	/*
1416 	 * Inform the tx_notify thread to exit.
1417 	 */
1418 	mutex_enter(&grp->lg_tx_flowctl_lock);
1419 	if (grp->lg_tx_notify_thread != NULL) {
1420 		tid = grp->lg_tx_notify_thread->t_did;
1421 		grp->lg_tx_notify_done = B_TRUE;
1422 		cv_signal(&grp->lg_tx_flowctl_cv);
1423 	}
1424 	mutex_exit(&grp->lg_tx_flowctl_lock);
1425 	if (tid != 0)
1426 		thread_join(tid);
1427 
1428 	kmem_free(grp->lg_tx_blocked_rings,
1429 	    (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP));
1430 	rw_exit(&aggr_grp_lock);
1431 	AGGR_GRP_REFRELE(grp);
1432 	return (err);
1433 }
1434 
1435 /*
1436  * Return a pointer to the member of a group with specified linkid.
1437  */
1438 static aggr_port_t *
1439 aggr_grp_port_lookup(aggr_grp_t *grp, datalink_id_t linkid)
1440 {
1441 	aggr_port_t *port;
1442 
1443 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1444 
1445 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1446 		if (port->lp_linkid == linkid)
1447 			break;
1448 	}
1449 
1450 	return (port);
1451 }
1452 
1453 /*
1454  * Stop, detach and remove a port from a link aggregation group.
1455  */
1456 static int
1457 aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port,
1458     boolean_t *mac_addr_changedp, boolean_t *link_state_changedp)
1459 {
1460 	int rc = 0;
1461 	aggr_port_t **pport;
1462 	boolean_t mac_addr_changed = B_FALSE;
1463 	boolean_t link_state_changed = B_FALSE;
1464 	mac_perim_handle_t mph;
1465 	uint64_t val;
1466 	uint_t i;
1467 	uint_t stat;
1468 
1469 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1470 	ASSERT(grp->lg_nports > 1);
1471 	ASSERT(!grp->lg_closing);
1472 
1473 	/* unlink port */
1474 	for (pport = &grp->lg_ports; *pport != port;
1475 	    pport = &(*pport)->lp_next) {
1476 		if (*pport == NULL) {
1477 			rc = ENOENT;
1478 			goto done;
1479 		}
1480 	}
1481 	*pport = port->lp_next;
1482 
1483 	mac_perim_enter_by_mh(port->lp_mh, &mph);
1484 
1485 	/*
1486 	 * If the MAC address of the port being removed was assigned
1487 	 * to the group, update the group MAC address
1488 	 * using the MAC address of a different port.
1489 	 */
1490 	if (!grp->lg_addr_fixed && grp->lg_mac_addr_port == port) {
1491 		/*
1492 		 * Set the MAC address of the group to the
1493 		 * MAC address of its first port.
1494 		 */
1495 		bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
1496 		grp->lg_mac_addr_port = grp->lg_ports;
1497 		mac_addr_changed = B_TRUE;
1498 	}
1499 
1500 	link_state_changed = aggr_grp_detach_port(grp, port);
1501 
1502 	/*
1503 	 * Add the counter statistics of the ports while it was aggregated
1504 	 * to the group's residual statistics.  This is done by obtaining
1505 	 * the current counter from the underlying MAC then subtracting the
1506 	 * value of the counter at the moment it was added to the
1507 	 * aggregation.
1508 	 */
1509 	for (i = 0; i < MAC_NSTAT; i++) {
1510 		stat = i + MAC_STAT_MIN;
1511 		if (!MAC_STAT_ISACOUNTER(stat))
1512 			continue;
1513 		val = aggr_port_stat(port, stat);
1514 		val -= port->lp_stat[i];
1515 		grp->lg_stat[i] += val;
1516 	}
1517 	for (i = 0; i < ETHER_NSTAT; i++) {
1518 		stat = i + MACTYPE_STAT_MIN;
1519 		if (!ETHER_STAT_ISACOUNTER(stat))
1520 			continue;
1521 		val = aggr_port_stat(port, stat);
1522 		val -= port->lp_ether_stat[i];
1523 		grp->lg_ether_stat[i] += val;
1524 	}
1525 
1526 	grp->lg_nports--;
1527 	mac_perim_exit(mph);
1528 
1529 	aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1530 	aggr_port_delete(port);
1531 
1532 	/*
1533 	 * If the group MAC address has changed, update the MAC address of
1534 	 * the remaining constituent ports according to the new MAC
1535 	 * address of the group.
1536 	 */
1537 	if (mac_addr_changed && aggr_grp_update_ports_mac(grp))
1538 		link_state_changed = B_TRUE;
1539 
1540 done:
1541 	if (mac_addr_changedp != NULL)
1542 		*mac_addr_changedp = mac_addr_changed;
1543 	if (link_state_changedp != NULL)
1544 		*link_state_changedp = link_state_changed;
1545 
1546 	return (rc);
1547 }
1548 
1549 /*
1550  * Remove one or more ports from an existing link aggregation group.
1551  */
1552 int
1553 aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports)
1554 {
1555 	int rc = 0, i;
1556 	aggr_grp_t *grp = NULL;
1557 	aggr_port_t *port;
1558 	boolean_t mac_addr_update = B_FALSE, mac_addr_changed;
1559 	boolean_t link_state_update = B_FALSE, link_state_changed;
1560 	mac_perim_handle_t mph, pmph;
1561 
1562 	/* get group corresponding to linkid */
1563 	rw_enter(&aggr_grp_lock, RW_READER);
1564 	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1565 	    (mod_hash_val_t *)&grp) != 0) {
1566 		rw_exit(&aggr_grp_lock);
1567 		return (ENOENT);
1568 	}
1569 	AGGR_GRP_REFHOLD(grp);
1570 
1571 	/*
1572 	 * Hold the perimeter so that the aggregation won't be destroyed.
1573 	 */
1574 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1575 	rw_exit(&aggr_grp_lock);
1576 
1577 	/* we need to keep at least one port per group */
1578 	if (nports >= grp->lg_nports) {
1579 		rc = EINVAL;
1580 		goto bail;
1581 	}
1582 
1583 	/* first verify that all the groups are valid */
1584 	for (i = 0; i < nports; i++) {
1585 		if (aggr_grp_port_lookup(grp, ports[i].lp_linkid) == NULL) {
1586 			/* port not found */
1587 			rc = ENOENT;
1588 			goto bail;
1589 		}
1590 	}
1591 
1592 	/* clear the promiscous mode for the specified ports */
1593 	for (i = 0; i < nports && rc == 0; i++) {
1594 		/* lookup port */
1595 		port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1596 		ASSERT(port != NULL);
1597 
1598 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
1599 		rc = aggr_port_promisc(port, B_FALSE);
1600 		mac_perim_exit(pmph);
1601 	}
1602 	if (rc != 0) {
1603 		for (i = 0; i < nports; i++) {
1604 			port = aggr_grp_port_lookup(grp,
1605 			    ports[i].lp_linkid);
1606 			ASSERT(port != NULL);
1607 
1608 			/*
1609 			 * Turn the promiscuous mode back on if it is required
1610 			 * to receive the non-primary address over a port, or
1611 			 * the promiscous mode is enabled over the aggr.
1612 			 */
1613 			mac_perim_enter_by_mh(port->lp_mh, &pmph);
1614 			if (port->lp_started && (grp->lg_promisc ||
1615 			    port->lp_prom_addr != NULL)) {
1616 				(void) aggr_port_promisc(port, B_TRUE);
1617 			}
1618 			mac_perim_exit(pmph);
1619 		}
1620 		goto bail;
1621 	}
1622 
1623 	/* remove the specified ports from group */
1624 	for (i = 0; i < nports; i++) {
1625 		/* lookup port */
1626 		port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1627 		ASSERT(port != NULL);
1628 
1629 		/* stop port if group has already been started */
1630 		if (grp->lg_started) {
1631 			mac_perim_enter_by_mh(port->lp_mh, &pmph);
1632 			aggr_port_stop(port);
1633 			mac_perim_exit(pmph);
1634 		}
1635 
1636 		/*
1637 		 * aggr_rem_pseudo_tx_group() is not called here. Instead
1638 		 * it is called from inside aggr_grp_rem_port() after the
1639 		 * port has been detached. The reason is that
1640 		 * aggr_rem_pseudo_tx_group() removes one ring at a time
1641 		 * and if there is still traffic going on, then there
1642 		 * is the possibility of aggr_find_tx_ring() returning a
1643 		 * removed ring for transmission. Once the port has been
1644 		 * detached, that port will not be used and
1645 		 * aggr_find_tx_ring() will not return any rings
1646 		 * belonging to it.
1647 		 */
1648 		aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);
1649 
1650 		/* remove port from group */
1651 		rc = aggr_grp_rem_port(grp, port, &mac_addr_changed,
1652 		    &link_state_changed);
1653 		ASSERT(rc == 0);
1654 		mac_addr_update = mac_addr_update || mac_addr_changed;
1655 		link_state_update = link_state_update || link_state_changed;
1656 	}
1657 
1658 bail:
1659 	if (mac_addr_update)
1660 		mac_unicst_update(grp->lg_mh, grp->lg_addr);
1661 	if (link_state_update)
1662 		mac_link_update(grp->lg_mh, grp->lg_link_state);
1663 
1664 	mac_perim_exit(mph);
1665 	AGGR_GRP_REFRELE(grp);
1666 
1667 	return (rc);
1668 }
1669 
1670 int
1671 aggr_grp_delete(datalink_id_t linkid, cred_t *cred)
1672 {
1673 	aggr_grp_t *grp = NULL;
1674 	aggr_port_t *port, *cport;
1675 	datalink_id_t tmpid;
1676 	mod_hash_val_t val;
1677 	mac_perim_handle_t mph, pmph;
1678 	int err;
1679 	kt_did_t tid = 0;
1680 
1681 	rw_enter(&aggr_grp_lock, RW_WRITER);
1682 
1683 	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1684 	    (mod_hash_val_t *)&grp) != 0) {
1685 		rw_exit(&aggr_grp_lock);
1686 		return (ENOENT);
1687 	}
1688 
1689 	/*
1690 	 * Note that dls_devnet_destroy() must be called before lg_lock is
1691 	 * held. Otherwise, it will deadlock if another thread is in
1692 	 * aggr_m_stat() and thus has a kstat_hold() on the kstats that
1693 	 * dls_devnet_destroy() needs to delete.
1694 	 */
1695 	if ((err = dls_devnet_destroy(grp->lg_mh, &tmpid, B_TRUE)) != 0) {
1696 		rw_exit(&aggr_grp_lock);
1697 		return (err);
1698 	}
1699 	ASSERT(linkid == tmpid);
1700 
1701 	/*
1702 	 * Unregister from the MAC service module. Since this can
1703 	 * fail if a client hasn't closed the MAC port, we gracefully
1704 	 * fail the operation.
1705 	 */
1706 	if ((err = mac_disable(grp->lg_mh)) != 0) {
1707 		(void) dls_devnet_create(grp->lg_mh, linkid, crgetzoneid(cred));
1708 		rw_exit(&aggr_grp_lock);
1709 		return (err);
1710 	}
1711 	(void) mod_hash_remove(aggr_grp_hash, GRP_HASH_KEY(linkid), &val);
1712 	ASSERT(grp == (aggr_grp_t *)val);
1713 
1714 	ASSERT(aggr_grp_cnt > 0);
1715 	aggr_grp_cnt--;
1716 	rw_exit(&aggr_grp_lock);
1717 
1718 	/*
1719 	 * Inform the lacp_rx thread to exit.
1720 	 */
1721 	mutex_enter(&grp->lg_lacp_lock);
1722 	grp->lg_lacp_done = B_TRUE;
1723 	cv_signal(&grp->lg_lacp_cv);
1724 	while (grp->lg_lacp_rx_thread != NULL)
1725 		cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock);
1726 	mutex_exit(&grp->lg_lacp_lock);
1727 	/*
1728 	 * Inform the tx_notify_thread to exit.
1729 	 */
1730 	mutex_enter(&grp->lg_tx_flowctl_lock);
1731 	if (grp->lg_tx_notify_thread != NULL) {
1732 		tid = grp->lg_tx_notify_thread->t_did;
1733 		grp->lg_tx_notify_done = B_TRUE;
1734 		cv_signal(&grp->lg_tx_flowctl_cv);
1735 	}
1736 	mutex_exit(&grp->lg_tx_flowctl_lock);
1737 	if (tid != 0)
1738 		thread_join(tid);
1739 
1740 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1741 
1742 	grp->lg_closing = B_TRUE;
1743 	/* detach and free MAC ports associated with group */
1744 	port = grp->lg_ports;
1745 	while (port != NULL) {
1746 		cport = port->lp_next;
1747 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
1748 		if (grp->lg_started)
1749 			aggr_port_stop(port);
1750 		(void) aggr_grp_detach_port(grp, port);
1751 		mac_perim_exit(pmph);
1752 		aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1753 		aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);
1754 		aggr_port_delete(port);
1755 		port = cport;
1756 	}
1757 
1758 	mac_perim_exit(mph);
1759 
1760 	kmem_free(grp->lg_tx_blocked_rings,
1761 	    (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP));
1762 	/*
1763 	 * Wait for the port's lacp timer thread and its notification callback
1764 	 * to exit before calling mac_unregister() since both needs to access
1765 	 * the mac perimeter of the grp.
1766 	 */
1767 	aggr_grp_port_wait(grp);
1768 
1769 	VERIFY(mac_unregister(grp->lg_mh) == 0);
1770 	grp->lg_mh = NULL;
1771 
1772 	AGGR_GRP_REFRELE(grp);
1773 	return (0);
1774 }
1775 
1776 void
1777 aggr_grp_free(aggr_grp_t *grp)
1778 {
1779 	ASSERT(grp->lg_refs == 0);
1780 	ASSERT(grp->lg_port_ref == 0);
1781 	if (grp->lg_key > AGGR_MAX_KEY) {
1782 		id_free(key_ids, grp->lg_key);
1783 		grp->lg_key = 0;
1784 	}
1785 	kmem_cache_free(aggr_grp_cache, grp);
1786 }
1787 
1788 int
1789 aggr_grp_info(datalink_id_t linkid, void *fn_arg,
1790     aggr_grp_info_new_grp_fn_t new_grp_fn,
1791     aggr_grp_info_new_port_fn_t new_port_fn, cred_t *cred)
1792 {
1793 	aggr_grp_t	*grp;
1794 	aggr_port_t	*port;
1795 	mac_perim_handle_t mph, pmph;
1796 	int		rc = 0;
1797 
1798 	/*
1799 	 * Make sure that the aggregation link is visible from the caller's
1800 	 * zone.
1801 	 */
1802 	if (!dls_devnet_islinkvisible(linkid, crgetzoneid(cred)))
1803 		return (ENOENT);
1804 
1805 	rw_enter(&aggr_grp_lock, RW_READER);
1806 
1807 	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1808 	    (mod_hash_val_t *)&grp) != 0) {
1809 		rw_exit(&aggr_grp_lock);
1810 		return (ENOENT);
1811 	}
1812 	AGGR_GRP_REFHOLD(grp);
1813 
1814 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1815 	rw_exit(&aggr_grp_lock);
1816 
1817 	rc = new_grp_fn(fn_arg, grp->lg_linkid,
1818 	    (grp->lg_key > AGGR_MAX_KEY) ? 0 : grp->lg_key, grp->lg_addr,
1819 	    grp->lg_addr_fixed, grp->lg_force, grp->lg_tx_policy,
1820 	    grp->lg_nports, grp->lg_lacp_mode, grp->aggr.PeriodicTimer);
1821 
1822 	if (rc != 0)
1823 		goto bail;
1824 
1825 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1826 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
1827 		rc = new_port_fn(fn_arg, port->lp_linkid, port->lp_addr,
1828 		    port->lp_state, &port->lp_lacp.ActorOperPortState);
1829 		mac_perim_exit(pmph);
1830 
1831 		if (rc != 0)
1832 			goto bail;
1833 	}
1834 
1835 bail:
1836 	mac_perim_exit(mph);
1837 	AGGR_GRP_REFRELE(grp);
1838 	return (rc);
1839 }
1840 
1841 /*ARGSUSED*/
1842 static void
1843 aggr_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
1844 {
1845 	miocnak(q, mp, 0, ENOTSUP);
1846 }
1847 
1848 static int
1849 aggr_grp_stat(aggr_grp_t *grp, uint_t stat, uint64_t *val)
1850 {
1851 	aggr_port_t	*port;
1852 	uint_t		stat_index;
1853 
1854 	/* We only aggregate counter statistics. */
1855 	if (IS_MAC_STAT(stat) && !MAC_STAT_ISACOUNTER(stat) ||
1856 	    IS_MACTYPE_STAT(stat) && !ETHER_STAT_ISACOUNTER(stat)) {
1857 		return (ENOTSUP);
1858 	}
1859 
1860 	/*
1861 	 * Counter statistics for a group are computed by aggregating the
1862 	 * counters of the members MACs while they were aggregated, plus
1863 	 * the residual counter of the group itself, which is updated each
1864 	 * time a MAC is removed from the group.
1865 	 */
1866 	*val = 0;
1867 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1868 		/* actual port statistic */
1869 		*val += aggr_port_stat(port, stat);
1870 		/*
1871 		 * minus the port stat when it was added, plus any residual
1872 		 * amount for the group.
1873 		 */
1874 		if (IS_MAC_STAT(stat)) {
1875 			stat_index = stat - MAC_STAT_MIN;
1876 			*val -= port->lp_stat[stat_index];
1877 			*val += grp->lg_stat[stat_index];
1878 		} else if (IS_MACTYPE_STAT(stat)) {
1879 			stat_index = stat - MACTYPE_STAT_MIN;
1880 			*val -= port->lp_ether_stat[stat_index];
1881 			*val += grp->lg_ether_stat[stat_index];
1882 		}
1883 	}
1884 	return (0);
1885 }
1886 
1887 int
1888 aggr_rx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
1889 {
1890 	aggr_pseudo_rx_ring_t   *rx_ring = (aggr_pseudo_rx_ring_t *)rdriver;
1891 
1892 	if (rx_ring->arr_hw_rh != NULL) {
1893 		*val = mac_pseudo_rx_ring_stat_get(rx_ring->arr_hw_rh, stat);
1894 	} else {
1895 		aggr_port_t	*port = rx_ring->arr_port;
1896 
1897 		*val = mac_stat_get(port->lp_mh, stat);
1898 
1899 	}
1900 	return (0);
1901 }
1902 
1903 int
1904 aggr_tx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
1905 {
1906 	aggr_pseudo_tx_ring_t   *tx_ring = (aggr_pseudo_tx_ring_t *)rdriver;
1907 
1908 	if (tx_ring->atr_hw_rh != NULL) {
1909 		*val = mac_pseudo_tx_ring_stat_get(tx_ring->atr_hw_rh, stat);
1910 	} else {
1911 		aggr_port_t	*port = tx_ring->atr_port;
1912 
1913 		*val = mac_stat_get(port->lp_mh, stat);
1914 	}
1915 	return (0);
1916 }
1917 
1918 static int
1919 aggr_m_stat(void *arg, uint_t stat, uint64_t *val)
1920 {
1921 	aggr_grp_t		*grp = arg;
1922 	mac_perim_handle_t	mph;
1923 	int			rval = 0;
1924 
1925 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1926 
1927 	switch (stat) {
1928 	case MAC_STAT_IFSPEED:
1929 		*val = grp->lg_ifspeed;
1930 		break;
1931 
1932 	case ETHER_STAT_LINK_DUPLEX:
1933 		*val = grp->lg_link_duplex;
1934 		break;
1935 
1936 	default:
1937 		/*
1938 		 * For all other statistics, we return the aggregated stat
1939 		 * from the underlying ports.  aggr_grp_stat() will set
1940 		 * rval appropriately if the statistic isn't a counter.
1941 		 */
1942 		rval = aggr_grp_stat(grp, stat, val);
1943 	}
1944 
1945 	mac_perim_exit(mph);
1946 	return (rval);
1947 }
1948 
1949 static int
1950 aggr_m_start(void *arg)
1951 {
1952 	aggr_grp_t *grp = arg;
1953 	aggr_port_t *port;
1954 	mac_perim_handle_t mph, pmph;
1955 
1956 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1957 
1958 	/*
1959 	 * Attempts to start all configured members of the group.
1960 	 * Group members will be attached when their link-up notification
1961 	 * is received.
1962 	 */
1963 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1964 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
1965 		if (aggr_port_start(port) != 0) {
1966 			mac_perim_exit(pmph);
1967 			continue;
1968 		}
1969 
1970 		/*
1971 		 * Turn on the promiscuous mode if it is required to receive
1972 		 * the non-primary address over a port, or the promiscous
1973 		 * mode is enabled over the aggr.
1974 		 */
1975 		if (grp->lg_promisc || port->lp_prom_addr != NULL) {
1976 			if (aggr_port_promisc(port, B_TRUE) != 0)
1977 				aggr_port_stop(port);
1978 		}
1979 		mac_perim_exit(pmph);
1980 	}
1981 
1982 	grp->lg_started = B_TRUE;
1983 
1984 	mac_perim_exit(mph);
1985 	return (0);
1986 }
1987 
1988 static void
1989 aggr_m_stop(void *arg)
1990 {
1991 	aggr_grp_t *grp = arg;
1992 	aggr_port_t *port;
1993 	mac_perim_handle_t mph, pmph;
1994 
1995 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1996 
1997 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1998 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
1999 
2000 		/* reset port promiscuous mode */
2001 		(void) aggr_port_promisc(port, B_FALSE);
2002 
2003 		aggr_port_stop(port);
2004 		mac_perim_exit(pmph);
2005 	}
2006 
2007 	grp->lg_started = B_FALSE;
2008 	mac_perim_exit(mph);
2009 }
2010 
2011 static int
2012 aggr_m_promisc(void *arg, boolean_t on)
2013 {
2014 	aggr_grp_t *grp = arg;
2015 	aggr_port_t *port;
2016 	boolean_t link_state_changed = B_FALSE;
2017 	mac_perim_handle_t mph, pmph;
2018 
2019 	AGGR_GRP_REFHOLD(grp);
2020 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2021 
2022 	ASSERT(!grp->lg_closing);
2023 
2024 	if (on == grp->lg_promisc)
2025 		goto bail;
2026 
2027 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2028 		int	err = 0;
2029 
2030 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
2031 		AGGR_PORT_REFHOLD(port);
2032 		if (!on && (port->lp_prom_addr == NULL))
2033 			err = aggr_port_promisc(port, B_FALSE);
2034 		else if (on && port->lp_started)
2035 			err = aggr_port_promisc(port, B_TRUE);
2036 
2037 		if (err != 0) {
2038 			if (aggr_grp_detach_port(grp, port))
2039 				link_state_changed = B_TRUE;
2040 		} else {
2041 			/*
2042 			 * If a port was detached because of a previous
2043 			 * failure changing the promiscuity, the port
2044 			 * is reattached when it successfully changes
2045 			 * the promiscuity now, and this might cause
2046 			 * the link state of the aggregation to change.
2047 			 */
2048 			if (aggr_grp_attach_port(grp, port))
2049 				link_state_changed = B_TRUE;
2050 		}
2051 		mac_perim_exit(pmph);
2052 		AGGR_PORT_REFRELE(port);
2053 	}
2054 
2055 	grp->lg_promisc = on;
2056 
2057 	if (link_state_changed)
2058 		mac_link_update(grp->lg_mh, grp->lg_link_state);
2059 
2060 bail:
2061 	mac_perim_exit(mph);
2062 	AGGR_GRP_REFRELE(grp);
2063 
2064 	return (0);
2065 }
2066 
2067 static void
2068 aggr_grp_port_rename(const char *new_name, void *arg)
2069 {
2070 	/*
2071 	 * aggr port's mac client name is the format of "aggr link name" plus
2072 	 * AGGR_PORT_NAME_DELIMIT plus "underneath link name".
2073 	 */
2074 	int aggr_len, link_len, clnt_name_len, i;
2075 	char *str_end, *str_st, *str_del;
2076 	char aggr_name[MAXNAMELEN];
2077 	char link_name[MAXNAMELEN];
2078 	char *clnt_name;
2079 	aggr_grp_t *aggr_grp = arg;
2080 	aggr_port_t *aggr_port = aggr_grp->lg_ports;
2081 
2082 	for (i = 0; i < aggr_grp->lg_nports; i++) {
2083 		clnt_name = mac_client_name(aggr_port->lp_mch);
2084 		clnt_name_len = strlen(clnt_name);
2085 		str_st = clnt_name;
2086 		str_end = &(clnt_name[clnt_name_len]);
2087 		str_del = strchr(str_st, AGGR_PORT_NAME_DELIMIT);
2088 		ASSERT(str_del != NULL);
2089 		aggr_len = (intptr_t)((uintptr_t)str_del - (uintptr_t)str_st);
2090 		link_len = (intptr_t)((uintptr_t)str_end - (uintptr_t)str_del);
2091 		bzero(aggr_name, MAXNAMELEN);
2092 		bzero(link_name, MAXNAMELEN);
2093 		bcopy(clnt_name, aggr_name, aggr_len);
2094 		bcopy(str_del, link_name, link_len + 1);
2095 		bzero(clnt_name, MAXNAMELEN);
2096 		(void) snprintf(clnt_name, MAXNAMELEN, "%s%s", new_name,
2097 		    link_name);
2098 
2099 		(void) mac_rename_primary(aggr_port->lp_mh, NULL);
2100 		aggr_port = aggr_port->lp_next;
2101 	}
2102 }
2103 
2104 /*
2105  * Initialize the capabilities that are advertised for the group
2106  * according to the capabilities of the constituent ports.
2107  */
2108 static boolean_t
2109 aggr_m_capab_get(void *arg, mac_capab_t cap, void *cap_data)
2110 {
2111 	aggr_grp_t *grp = arg;
2112 
2113 	switch (cap) {
2114 	case MAC_CAPAB_HCKSUM: {
2115 		uint32_t *hcksum_txflags = cap_data;
2116 		*hcksum_txflags = grp->lg_hcksum_txflags;
2117 		break;
2118 	}
2119 	case MAC_CAPAB_LSO: {
2120 		mac_capab_lso_t *cap_lso = cap_data;
2121 
2122 		if (grp->lg_lso) {
2123 			*cap_lso = grp->lg_cap_lso;
2124 			break;
2125 		} else {
2126 			return (B_FALSE);
2127 		}
2128 	}
2129 	case MAC_CAPAB_NO_NATIVEVLAN:
2130 		return (!grp->lg_vlan);
2131 	case MAC_CAPAB_NO_ZCOPY:
2132 		return (!grp->lg_zcopy);
2133 	case MAC_CAPAB_RINGS: {
2134 		mac_capab_rings_t *cap_rings = cap_data;
2135 
2136 		if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
2137 			cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2138 			cap_rings->mr_rnum = grp->lg_rx_group.arg_ring_cnt;
2139 
2140 			/*
2141 			 * An aggregation advertises only one (pseudo) RX
2142 			 * group, which virtualizes the main/primary group of
2143 			 * the underlying devices.
2144 			 */
2145 			cap_rings->mr_gnum = 1;
2146 			cap_rings->mr_gaddring = NULL;
2147 			cap_rings->mr_gremring = NULL;
2148 		} else {
2149 			cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2150 			cap_rings->mr_rnum = grp->lg_tx_group.atg_ring_cnt;
2151 			cap_rings->mr_gnum = 0;
2152 		}
2153 		cap_rings->mr_rget = aggr_fill_ring;
2154 		cap_rings->mr_gget = aggr_fill_group;
2155 		break;
2156 	}
2157 	case MAC_CAPAB_AGGR:
2158 	{
2159 		mac_capab_aggr_t *aggr_cap;
2160 
2161 		if (cap_data != NULL) {
2162 			aggr_cap = cap_data;
2163 			aggr_cap->mca_rename_fn = aggr_grp_port_rename;
2164 			aggr_cap->mca_unicst = aggr_m_unicst;
2165 			aggr_cap->mca_find_tx_ring_fn = aggr_find_tx_ring;
2166 			aggr_cap->mca_arg = arg;
2167 		}
2168 		return (B_TRUE);
2169 	}
2170 	default:
2171 		return (B_FALSE);
2172 	}
2173 	return (B_TRUE);
2174 }
2175 
2176 /*
2177  * Callback funtion for MAC layer to register groups.
2178  */
2179 static void
2180 aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index,
2181     mac_group_info_t *infop, mac_group_handle_t gh)
2182 {
2183 	aggr_grp_t *grp = arg;
2184 	aggr_pseudo_rx_group_t *rx_group;
2185 	aggr_pseudo_tx_group_t *tx_group;
2186 
2187 	ASSERT(index == 0);
2188 	if (rtype == MAC_RING_TYPE_RX) {
2189 		rx_group = &grp->lg_rx_group;
2190 		rx_group->arg_gh = gh;
2191 		rx_group->arg_grp = grp;
2192 
2193 		infop->mgi_driver = (mac_group_driver_t)rx_group;
2194 		infop->mgi_start = NULL;
2195 		infop->mgi_stop = NULL;
2196 		infop->mgi_addmac = aggr_addmac;
2197 		infop->mgi_remmac = aggr_remmac;
2198 		infop->mgi_count = rx_group->arg_ring_cnt;
2199 	} else {
2200 		tx_group = &grp->lg_tx_group;
2201 		tx_group->atg_gh = gh;
2202 	}
2203 }
2204 
2205 /*
2206  * Callback funtion for MAC layer to register all rings.
2207  */
2208 static void
2209 aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index,
2210     const int index, mac_ring_info_t *infop, mac_ring_handle_t rh)
2211 {
2212 	aggr_grp_t	*grp = arg;
2213 
2214 	switch (rtype) {
2215 	case MAC_RING_TYPE_RX: {
2216 		aggr_pseudo_rx_group_t	*rx_group = &grp->lg_rx_group;
2217 		aggr_pseudo_rx_ring_t	*rx_ring;
2218 		mac_intr_t		aggr_mac_intr;
2219 
2220 		ASSERT(rg_index == 0);
2221 
2222 		ASSERT((index >= 0) && (index < rx_group->arg_ring_cnt));
2223 		rx_ring = rx_group->arg_rings + index;
2224 		rx_ring->arr_rh = rh;
2225 
2226 		/*
2227 		 * Entrypoint to enable interrupt (disable poll) and
2228 		 * disable interrupt (enable poll).
2229 		 */
2230 		aggr_mac_intr.mi_handle = (mac_intr_handle_t)rx_ring;
2231 		aggr_mac_intr.mi_enable = aggr_pseudo_enable_intr;
2232 		aggr_mac_intr.mi_disable = aggr_pseudo_disable_intr;
2233 		aggr_mac_intr.mi_ddi_handle = NULL;
2234 
2235 		infop->mri_driver = (mac_ring_driver_t)rx_ring;
2236 		infop->mri_start = aggr_pseudo_start_ring;
2237 		infop->mri_stop = aggr_pseudo_stop_ring;
2238 
2239 		infop->mri_intr = aggr_mac_intr;
2240 		infop->mri_poll = aggr_rx_poll;
2241 
2242 		infop->mri_stat = aggr_rx_ring_stat;
2243 		break;
2244 	}
2245 	case MAC_RING_TYPE_TX: {
2246 		aggr_pseudo_tx_group_t	*tx_group = &grp->lg_tx_group;
2247 		aggr_pseudo_tx_ring_t	*tx_ring;
2248 
2249 		ASSERT(rg_index == -1);
2250 		ASSERT(index < tx_group->atg_ring_cnt);
2251 
2252 		tx_ring = &tx_group->atg_rings[index];
2253 		tx_ring->atr_rh = rh;
2254 
2255 		infop->mri_driver = (mac_ring_driver_t)tx_ring;
2256 		infop->mri_start = NULL;
2257 		infop->mri_stop = NULL;
2258 		infop->mri_tx = aggr_ring_tx;
2259 		infop->mri_stat = aggr_tx_ring_stat;
2260 		/*
2261 		 * Use the hw TX ring handle to find if the ring needs
2262 		 * serialization or not. For NICs that do not expose
2263 		 * Tx rings, atr_hw_rh will be NULL.
2264 		 */
2265 		if (tx_ring->atr_hw_rh != NULL) {
2266 			infop->mri_flags =
2267 			    mac_hwring_getinfo(tx_ring->atr_hw_rh);
2268 		}
2269 		break;
2270 	}
2271 	default:
2272 		break;
2273 	}
2274 }
2275 
2276 static mblk_t *
2277 aggr_rx_poll(void *arg, int bytes_to_pickup)
2278 {
2279 	aggr_pseudo_rx_ring_t *rr_ring = arg;
2280 	aggr_port_t *port = rr_ring->arr_port;
2281 	aggr_grp_t *grp = port->lp_grp;
2282 	mblk_t *mp_chain, *mp, **mpp;
2283 
2284 	mp_chain = mac_hwring_poll(rr_ring->arr_hw_rh, bytes_to_pickup);
2285 
2286 	if (grp->lg_lacp_mode == AGGR_LACP_OFF)
2287 		return (mp_chain);
2288 
2289 	mpp = &mp_chain;
2290 	while ((mp = *mpp) != NULL) {
2291 		if (MBLKL(mp) >= sizeof (struct ether_header)) {
2292 			struct ether_header *ehp;
2293 
2294 			ehp = (struct ether_header *)mp->b_rptr;
2295 			if (ntohs(ehp->ether_type) == ETHERTYPE_SLOW) {
2296 				*mpp = mp->b_next;
2297 				mp->b_next = NULL;
2298 				aggr_recv_lacp(port,
2299 				    (mac_resource_handle_t)rr_ring, mp);
2300 				continue;
2301 			}
2302 		}
2303 
2304 		if (!port->lp_collector_enabled) {
2305 			*mpp = mp->b_next;
2306 			mp->b_next = NULL;
2307 			freemsg(mp);
2308 			continue;
2309 		}
2310 		mpp = &mp->b_next;
2311 	}
2312 	return (mp_chain);
2313 }
2314 
2315 static int
2316 aggr_addmac(void *arg, const uint8_t *mac_addr)
2317 {
2318 	aggr_pseudo_rx_group_t	*rx_group = (aggr_pseudo_rx_group_t *)arg;
2319 	aggr_unicst_addr_t	*addr, **pprev;
2320 	aggr_grp_t		*grp = rx_group->arg_grp;
2321 	aggr_port_t		*port, *p;
2322 	mac_perim_handle_t	mph;
2323 	int			err = 0;
2324 
2325 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2326 
2327 	if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
2328 		mac_perim_exit(mph);
2329 		return (0);
2330 	}
2331 
2332 	/*
2333 	 * Insert this mac address into the list of mac addresses owned by
2334 	 * the aggregation pseudo group.
2335 	 */
2336 	pprev = &rx_group->arg_macaddr;
2337 	while ((addr = *pprev) != NULL) {
2338 		if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) == 0) {
2339 			mac_perim_exit(mph);
2340 			return (EEXIST);
2341 		}
2342 		pprev = &addr->aua_next;
2343 	}
2344 	addr = kmem_alloc(sizeof (aggr_unicst_addr_t), KM_SLEEP);
2345 	bcopy(mac_addr, addr->aua_addr, ETHERADDRL);
2346 	addr->aua_next = NULL;
2347 	*pprev = addr;
2348 
2349 	for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2350 		if ((err = aggr_port_addmac(port, mac_addr)) != 0)
2351 			break;
2352 
2353 	if (err != 0) {
2354 		for (p = grp->lg_ports; p != port; p = p->lp_next)
2355 			aggr_port_remmac(p, mac_addr);
2356 
2357 		*pprev = NULL;
2358 		kmem_free(addr, sizeof (aggr_unicst_addr_t));
2359 	}
2360 
2361 	mac_perim_exit(mph);
2362 	return (err);
2363 }
2364 
2365 static int
2366 aggr_remmac(void *arg, const uint8_t *mac_addr)
2367 {
2368 	aggr_pseudo_rx_group_t	*rx_group = (aggr_pseudo_rx_group_t *)arg;
2369 	aggr_unicst_addr_t	*addr, **pprev;
2370 	aggr_grp_t		*grp = rx_group->arg_grp;
2371 	aggr_port_t		*port;
2372 	mac_perim_handle_t	mph;
2373 	int			err = 0;
2374 
2375 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2376 
2377 	if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
2378 		mac_perim_exit(mph);
2379 		return (0);
2380 	}
2381 
2382 	/*
2383 	 * Insert this mac address into the list of mac addresses owned by
2384 	 * the aggregation pseudo group.
2385 	 */
2386 	pprev = &rx_group->arg_macaddr;
2387 	while ((addr = *pprev) != NULL) {
2388 		if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) != 0) {
2389 			pprev = &addr->aua_next;
2390 			continue;
2391 		}
2392 		break;
2393 	}
2394 	if (addr == NULL) {
2395 		mac_perim_exit(mph);
2396 		return (EINVAL);
2397 	}
2398 
2399 	for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2400 		aggr_port_remmac(port, mac_addr);
2401 
2402 	*pprev = addr->aua_next;
2403 	kmem_free(addr, sizeof (aggr_unicst_addr_t));
2404 
2405 	mac_perim_exit(mph);
2406 	return (err);
2407 }
2408 
2409 /*
2410  * Add or remove the multicast addresses that are defined for the group
2411  * to or from the specified port.
2412  *
2413  * Note that aggr_grp_multicst_port(..., B_TRUE) is called when the port
2414  * is started and attached, and aggr_grp_multicst_port(..., B_FALSE) is
2415  * called when the port is either stopped or detached.
2416  */
2417 void
2418 aggr_grp_multicst_port(aggr_port_t *port, boolean_t add)
2419 {
2420 	aggr_grp_t *grp = port->lp_grp;
2421 
2422 	ASSERT(MAC_PERIM_HELD(port->lp_mh));
2423 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
2424 
2425 	if (!port->lp_started || port->lp_state != AGGR_PORT_STATE_ATTACHED)
2426 		return;
2427 
2428 	mac_multicast_refresh(grp->lg_mh, aggr_port_multicst, port, add);
2429 }
2430 
2431 static int
2432 aggr_m_multicst(void *arg, boolean_t add, const uint8_t *addrp)
2433 {
2434 	aggr_grp_t *grp = arg;
2435 	aggr_port_t *port = NULL;
2436 	mac_perim_handle_t mph;
2437 	int err = 0, cerr;
2438 
2439 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2440 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2441 		if (port->lp_state != AGGR_PORT_STATE_ATTACHED ||
2442 		    !port->lp_started) {
2443 			continue;
2444 		}
2445 		cerr = aggr_port_multicst(port, add, addrp);
2446 		if (cerr != 0 && err == 0)
2447 			err = cerr;
2448 	}
2449 	mac_perim_exit(mph);
2450 	return (err);
2451 }
2452 
2453 static int
2454 aggr_m_unicst(void *arg, const uint8_t *macaddr)
2455 {
2456 	aggr_grp_t *grp = arg;
2457 	mac_perim_handle_t mph;
2458 	int err;
2459 
2460 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2461 	err = aggr_grp_modify_common(grp, AGGR_MODIFY_MAC, 0, B_TRUE, macaddr,
2462 	    0, 0);
2463 	mac_perim_exit(mph);
2464 	return (err);
2465 }
2466 
2467 /*
2468  * Initialize the capabilities that are advertised for the group
2469  * according to the capabilities of the constituent ports.
2470  */
2471 static void
2472 aggr_grp_capab_set(aggr_grp_t *grp)
2473 {
2474 	uint32_t cksum;
2475 	aggr_port_t *port;
2476 	mac_capab_lso_t cap_lso;
2477 
2478 	ASSERT(grp->lg_mh == NULL);
2479 	ASSERT(grp->lg_ports != NULL);
2480 
2481 	grp->lg_hcksum_txflags = (uint32_t)-1;
2482 	grp->lg_zcopy = B_TRUE;
2483 	grp->lg_vlan = B_TRUE;
2484 
2485 	grp->lg_lso = B_TRUE;
2486 	grp->lg_cap_lso.lso_flags = (t_uscalar_t)-1;
2487 	grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max = (t_uscalar_t)-1;
2488 
2489 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2490 		if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &cksum))
2491 			cksum = 0;
2492 		grp->lg_hcksum_txflags &= cksum;
2493 
2494 		grp->lg_vlan &=
2495 		    !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL);
2496 
2497 		grp->lg_zcopy &=
2498 		    !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL);
2499 
2500 		grp->lg_lso &=
2501 		    mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso);
2502 		if (grp->lg_lso) {
2503 			grp->lg_cap_lso.lso_flags &= cap_lso.lso_flags;
2504 			if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max >
2505 			    cap_lso.lso_basic_tcp_ipv4.lso_max)
2506 				grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max =
2507 				    cap_lso.lso_basic_tcp_ipv4.lso_max;
2508 		}
2509 	}
2510 }
2511 
2512 /*
2513  * Checks whether the capabilities of the port being added are compatible
2514  * with the current capabilities of the aggregation.
2515  */
2516 static boolean_t
2517 aggr_grp_capab_check(aggr_grp_t *grp, aggr_port_t *port)
2518 {
2519 	uint32_t hcksum_txflags;
2520 
2521 	ASSERT(grp->lg_ports != NULL);
2522 
2523 	if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL)) &
2524 	    grp->lg_vlan) != grp->lg_vlan) {
2525 		return (B_FALSE);
2526 	}
2527 
2528 	if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL)) &
2529 	    grp->lg_zcopy) != grp->lg_zcopy) {
2530 		return (B_FALSE);
2531 	}
2532 
2533 	if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &hcksum_txflags)) {
2534 		if (grp->lg_hcksum_txflags != 0)
2535 			return (B_FALSE);
2536 	} else if ((hcksum_txflags & grp->lg_hcksum_txflags) !=
2537 	    grp->lg_hcksum_txflags) {
2538 		return (B_FALSE);
2539 	}
2540 
2541 	if (grp->lg_lso) {
2542 		mac_capab_lso_t cap_lso;
2543 
2544 		if (mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso)) {
2545 			if ((grp->lg_cap_lso.lso_flags & cap_lso.lso_flags) !=
2546 			    grp->lg_cap_lso.lso_flags)
2547 				return (B_FALSE);
2548 			if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max >
2549 			    cap_lso.lso_basic_tcp_ipv4.lso_max)
2550 				return (B_FALSE);
2551 		} else {
2552 			return (B_FALSE);
2553 		}
2554 	}
2555 
2556 	return (B_TRUE);
2557 }
2558 
2559 /*
2560  * Returns the maximum SDU according to the SDU of the constituent ports.
2561  */
2562 static uint_t
2563 aggr_grp_max_sdu(aggr_grp_t *grp)
2564 {
2565 	uint_t max_sdu = (uint_t)-1;
2566 	aggr_port_t *port;
2567 
2568 	ASSERT(grp->lg_ports != NULL);
2569 
2570 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2571 		uint_t port_sdu_max;
2572 
2573 		mac_sdu_get(port->lp_mh, NULL, &port_sdu_max);
2574 		if (max_sdu > port_sdu_max)
2575 			max_sdu = port_sdu_max;
2576 	}
2577 
2578 	return (max_sdu);
2579 }
2580 
2581 /*
2582  * Checks if the maximum SDU of the specified port is compatible
2583  * with the maximum SDU of the specified aggregation group, returns
2584  * B_TRUE if it is, B_FALSE otherwise.
2585  */
2586 static boolean_t
2587 aggr_grp_sdu_check(aggr_grp_t *grp, aggr_port_t *port)
2588 {
2589 	uint_t port_sdu_max;
2590 
2591 	mac_sdu_get(port->lp_mh, NULL, &port_sdu_max);
2592 	return (port_sdu_max >= grp->lg_max_sdu);
2593 }
2594 
2595 /*
2596  * Returns the maximum margin according to the margin of the constituent ports.
2597  */
2598 static uint32_t
2599 aggr_grp_max_margin(aggr_grp_t *grp)
2600 {
2601 	uint32_t margin = UINT32_MAX;
2602 	aggr_port_t *port;
2603 
2604 	ASSERT(grp->lg_mh == NULL);
2605 	ASSERT(grp->lg_ports != NULL);
2606 
2607 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2608 		if (margin > port->lp_margin)
2609 			margin = port->lp_margin;
2610 	}
2611 
2612 	grp->lg_margin = margin;
2613 	return (margin);
2614 }
2615 
2616 /*
2617  * Checks if the maximum margin of the specified port is compatible
2618  * with the maximum margin of the specified aggregation group, returns
2619  * B_TRUE if it is, B_FALSE otherwise.
2620  */
2621 static boolean_t
2622 aggr_grp_margin_check(aggr_grp_t *grp, aggr_port_t *port)
2623 {
2624 	if (port->lp_margin >= grp->lg_margin)
2625 		return (B_TRUE);
2626 
2627 	/*
2628 	 * See whether the current margin value is allowed to be changed to
2629 	 * the new value.
2630 	 */
2631 	if (!mac_margin_update(grp->lg_mh, port->lp_margin))
2632 		return (B_FALSE);
2633 
2634 	grp->lg_margin = port->lp_margin;
2635 	return (B_TRUE);
2636 }
2637 
2638 /*
2639  * Set MTU on individual ports of an aggregation group
2640  */
2641 static int
2642 aggr_set_port_sdu(aggr_grp_t *grp, aggr_port_t *port, uint32_t sdu,
2643     uint32_t *old_mtu)
2644 {
2645 	boolean_t 		removed = B_FALSE;
2646 	mac_perim_handle_t	mph;
2647 	mac_diag_t		diag;
2648 	int			err, rv, retry = 0;
2649 
2650 	if (port->lp_mah != NULL) {
2651 		(void) mac_unicast_remove(port->lp_mch, port->lp_mah);
2652 		port->lp_mah = NULL;
2653 		removed = B_TRUE;
2654 	}
2655 	err = mac_set_mtu(port->lp_mh, sdu, old_mtu);
2656 try_again:
2657 	if (removed && (rv = mac_unicast_add(port->lp_mch, NULL,
2658 	    MAC_UNICAST_PRIMARY | MAC_UNICAST_DISABLE_TX_VID_CHECK,
2659 	    &port->lp_mah, 0, &diag)) != 0) {
2660 		/*
2661 		 * following is a workaround for a bug in 'bge' driver.
2662 		 * See CR 6794654 for more information and this work around
2663 		 * will be removed once the CR is fixed.
2664 		 */
2665 		if (rv == EIO && retry++ < 3) {
2666 			delay(2 * hz);
2667 			goto try_again;
2668 		}
2669 		/*
2670 		 * if mac_unicast_add() failed while setting the MTU,
2671 		 * detach the port from the group.
2672 		 */
2673 		mac_perim_enter_by_mh(port->lp_mh, &mph);
2674 		(void) aggr_grp_detach_port(grp, port);
2675 		mac_perim_exit(mph);
2676 		cmn_err(CE_WARN, "Unable to restart the port %s while "
2677 		    "setting MTU. Detaching the port from the aggregation.",
2678 		    mac_client_name(port->lp_mch));
2679 	}
2680 	return (err);
2681 }
2682 
2683 static int
2684 aggr_sdu_update(aggr_grp_t *grp, uint32_t sdu)
2685 {
2686 	int			err = 0, i, rv;
2687 	aggr_port_t		*port;
2688 	uint32_t		*mtu;
2689 
2690 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
2691 
2692 	/*
2693 	 * If the MTU being set is equal to aggr group's maximum
2694 	 * allowable value, then there is nothing to change
2695 	 */
2696 	if (sdu == grp->lg_max_sdu)
2697 		return (0);
2698 
2699 	/* 0 is aggr group's min sdu */
2700 	if (sdu == 0)
2701 		return (EINVAL);
2702 
2703 	mtu = kmem_alloc(sizeof (uint32_t) * grp->lg_nports, KM_SLEEP);
2704 	for (port = grp->lg_ports, i = 0; port != NULL && err == 0;
2705 	    port = port->lp_next, i++) {
2706 		err = aggr_set_port_sdu(grp, port, sdu, mtu + i);
2707 	}
2708 	if (err != 0) {
2709 		/* recover from error: reset the mtus of the ports */
2710 		aggr_port_t *tmp;
2711 
2712 		for (tmp = grp->lg_ports, i = 0; tmp != port;
2713 		    tmp = tmp->lp_next, i++) {
2714 			(void) aggr_set_port_sdu(grp, tmp, *(mtu + i), NULL);
2715 		}
2716 		goto bail;
2717 	}
2718 	grp->lg_max_sdu = aggr_grp_max_sdu(grp);
2719 	rv = mac_maxsdu_update(grp->lg_mh, grp->lg_max_sdu);
2720 	ASSERT(rv == 0);
2721 bail:
2722 	kmem_free(mtu, sizeof (uint32_t) * grp->lg_nports);
2723 	return (err);
2724 }
2725 
2726 /*
2727  * Callback functions for set/get of properties
2728  */
2729 /*ARGSUSED*/
2730 static int
2731 aggr_m_setprop(void *m_driver, const char *pr_name, mac_prop_id_t pr_num,
2732     uint_t pr_valsize, const void *pr_val)
2733 {
2734 	int 		err = ENOTSUP;
2735 	aggr_grp_t 	*grp = m_driver;
2736 
2737 	switch (pr_num) {
2738 	case MAC_PROP_MTU: {
2739 		uint32_t 	mtu;
2740 
2741 		if (pr_valsize < sizeof (mtu)) {
2742 			err = EINVAL;
2743 			break;
2744 		}
2745 		bcopy(pr_val, &mtu, sizeof (mtu));
2746 		err = aggr_sdu_update(grp, mtu);
2747 		break;
2748 	}
2749 	default:
2750 		break;
2751 	}
2752 	return (err);
2753 }
2754 
2755 int
2756 aggr_grp_possible_mtu_range(aggr_grp_t *grp, uint32_t *min, uint32_t *max)
2757 {
2758 	mac_propval_range_t		*vals;
2759 	mac_propval_uint32_range_t	*ur;
2760 	aggr_port_t			*port;
2761 	mac_perim_handle_t		mph;
2762 	uint_t 				i;
2763 	int 				err = 0;
2764 
2765 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
2766 
2767 	*min = 0;
2768 	*max = (uint32_t)-1;
2769 
2770 	vals = kmem_alloc(sizeof (mac_propval_range_t) * grp->lg_nports,
2771 	    KM_SLEEP);
2772 
2773 	for (port = grp->lg_ports, i = 0; port != NULL;
2774 	    port = port->lp_next, i++) {
2775 		mac_perim_enter_by_mh(port->lp_mh, &mph);
2776 		err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL,
2777 		    NULL, 0, vals + i, NULL);
2778 		mac_perim_exit(mph);
2779 		if (err != 0)
2780 			break;
2781 	}
2782 
2783 	/*
2784 	 * if any of the underlying ports does not support changing MTU then
2785 	 * just return ENOTSUP
2786 	 */
2787 	if (port != NULL) {
2788 		ASSERT(err != 0);
2789 		goto done;
2790 	}
2791 
2792 	for (i = 0; i < grp->lg_nports; i++) {
2793 		ur = &((vals + i)->mpr_range_uint32[0]);
2794 		/*
2795 		 * Take max of the min, for range_min; that is the minimum
2796 		 * MTU value for an aggregation is the maximum of the
2797 		 * minimum values of all the underlying ports
2798 		 */
2799 		if (ur->mpur_min > *min)
2800 			*min = ur->mpur_min;
2801 		/* Take min of the max, for range_max */
2802 		if (ur->mpur_max < *max)
2803 			*max = ur->mpur_max;
2804 	}
2805 done:
2806 	kmem_free(vals, sizeof (mac_propval_range_t) * grp->lg_nports);
2807 
2808 	return (err);
2809 }
2810 
2811 static void
2812 aggr_m_propinfo(void *m_driver, const char *pr_name, mac_prop_id_t pr_num,
2813     mac_prop_info_handle_t prh)
2814 {
2815 	aggr_grp_t		*grp = m_driver;
2816 
2817 	_NOTE(ARGUNUSED(pr_name));
2818 
2819 	switch (pr_num) {
2820 	case MAC_PROP_MTU: {
2821 		uint32_t min, max;
2822 
2823 		if (aggr_grp_possible_mtu_range(grp, &min, &max) != 0)
2824 			return;
2825 		mac_prop_info_set_range_uint32(prh, min, max);
2826 		break;
2827 	}
2828 	}
2829 }
2830