xref: /illumos-gate/usr/src/uts/common/io/aggr/aggr_grp.c (revision 0904e7ecf266ebe6844dfc4b178441dc8d81296b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2020 Joyent, Inc.
24  */
25 
26 /*
27  * IEEE 802.3ad Link Aggregation -- Link Aggregation Groups.
28  *
29  * An instance of the structure aggr_grp_t is allocated for each
30  * link aggregation group. When created, aggr_grp_t objects are
31  * entered into the aggr_grp_hash hash table maintained by the modhash
32  * module. The hash key is the linkid associated with the link
33  * aggregation group.
34  *
35  * Each aggregation contains a set of ports. The port is represented
36  * by the aggr_port_t structure. A port consists of a single MAC
37  * client which has exclusive (MCIS_EXCLUSIVE) use of the underlying
38  * MAC. This client is used by the aggr to send and receive LACP
39  * traffic. Each port client takes on the same MAC unicast address --
40  * the address of the aggregation itself (taken from the first port by
41  * default).
42  *
43  * The MAC client that hangs off each aggr port is not your typical
44  * MAC client. Not only does it have exclusive control of the MAC, but
45  * it also has no Tx or Rx SRSes. An SRS is designed to queue and
46  * fanout traffic among L4 protocols; but the aggr is an intermediary,
47  * not a consumer. Instead of using SRSes, the aggr puts the
48  * underlying hardware rings into passthru mode and ships packets up
49  * via a direct call to aggr_recv_cb(). This allows aggr to enforce
50  * LACP while passing all other traffic up to clients of the aggr.
51  *
52  * Pseudo Rx Groups and Rings
53  * --------------------------
54  *
55  * It is imperative for client performance that the aggr provide as
56  * many MAC groups as possible. In order to use the underlying HW
57  * resources, aggr creates pseudo groups to aggregate the underlying
58  * HW groups. Every HW group gets mapped to a pseudo group; and every
59  * HW ring in that group gets mapped to a pseudo ring. The pseudo
60  * group at index 0 combines all the HW groups at index 0 from each
61  * port, etc. The aggr's MAC then creates normal MAC groups and rings
62  * out of these pseudo groups and rings to present to the aggr's
63  * clients. To the clients, the aggr's groups and rings are absolutely
64  * no different than a NIC's groups or rings.
65  *
66  * Pseudo Tx Rings
67  * ---------------
68  *
69  * The underlying ports (NICs) in an aggregation can have Tx rings. To
70  * enhance aggr's performance, these Tx rings are made available to
71  * the aggr layer as pseudo Tx rings. The concept of pseudo rings is
72  * not new. They are already present and implemented on the Rx side.
73  * The same concept is extended to the Tx side where each Tx ring of
74  * an underlying port is reflected in aggr as a pseudo Tx ring. Thus
75  * each pseudo Tx ring will map to a specific hardware Tx ring. Even
76  * in the case of a NIC that does not have a Tx ring, a pseudo Tx ring
77  * is given to the aggregation layer.
78  *
79  * With this change, the outgoing stack depth looks much better:
80  *
81  * mac_tx() -> mac_tx_aggr_mode() -> mac_tx_soft_ring_process() ->
82  * mac_tx_send() -> aggr_ring_tx() -> <driver>_ring_tx()
83  *
84  * Two new modes are introduced to mac_tx() to handle aggr pseudo Tx rings:
85  * SRS_TX_AGGR and SRS_TX_BW_AGGR.
86  *
87  * In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. This routine
88  * invokes an aggr function, aggr_find_tx_ring(), to find a (pseudo) Tx
89  * ring belonging to a port on which the packet has to be sent.
90  * aggr_find_tx_ring() first finds the outgoing port based on L2/L3/L4
91  * policy and then uses the fanout_hint passed to it to pick a Tx ring from
92  * the selected port.
93  *
94  * In SRS_TX_BW_AGGR mode, mac_tx_bw_mode() function is called where
95  * bandwidth limit is applied first on the outgoing packet and the packets
96  * allowed to go out would call mac_tx_aggr_mode() to send the packet on a
97  * particular Tx ring.
98  */
99 
100 #include <sys/types.h>
101 #include <sys/sysmacros.h>
102 #include <sys/conf.h>
103 #include <sys/cmn_err.h>
104 #include <sys/disp.h>
105 #include <sys/list.h>
106 #include <sys/ksynch.h>
107 #include <sys/kmem.h>
108 #include <sys/stream.h>
109 #include <sys/modctl.h>
110 #include <sys/ddi.h>
111 #include <sys/sunddi.h>
112 #include <sys/atomic.h>
113 #include <sys/stat.h>
114 #include <sys/modhash.h>
115 #include <sys/id_space.h>
116 #include <sys/strsun.h>
117 #include <sys/cred.h>
118 #include <sys/dlpi.h>
119 #include <sys/zone.h>
120 #include <sys/mac_provider.h>
121 #include <sys/dls.h>
122 #include <sys/vlan.h>
123 #include <sys/aggr.h>
124 #include <sys/aggr_impl.h>
125 
126 static int aggr_m_start(void *);
127 static void aggr_m_stop(void *);
128 static int aggr_m_promisc(void *, boolean_t);
129 static int aggr_m_multicst(void *, boolean_t, const uint8_t *);
130 static int aggr_m_unicst(void *, const uint8_t *);
131 static int aggr_m_stat(void *, uint_t, uint64_t *);
132 static void aggr_m_ioctl(void *, queue_t *, mblk_t *);
133 static boolean_t aggr_m_capab_get(void *, mac_capab_t, void *);
134 static int aggr_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
135     const void *);
136 static void aggr_m_propinfo(void *, const char *, mac_prop_id_t,
137     mac_prop_info_handle_t);
138 
139 static aggr_port_t *aggr_grp_port_lookup(aggr_grp_t *, datalink_id_t);
140 static int aggr_grp_rem_port(aggr_grp_t *, aggr_port_t *, boolean_t *,
141     boolean_t *);
142 
143 static void aggr_grp_capab_set(aggr_grp_t *);
144 static boolean_t aggr_grp_capab_check(aggr_grp_t *, aggr_port_t *);
145 static uint_t aggr_grp_max_sdu(aggr_grp_t *);
146 static uint32_t aggr_grp_max_margin(aggr_grp_t *);
147 static boolean_t aggr_grp_sdu_check(aggr_grp_t *, aggr_port_t *);
148 static boolean_t aggr_grp_margin_check(aggr_grp_t *, aggr_port_t *);
149 
150 static int aggr_add_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
151 static void aggr_rem_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
152 static int aggr_pseudo_disable_intr(mac_intr_handle_t);
153 static int aggr_pseudo_enable_intr(mac_intr_handle_t);
154 static int aggr_pseudo_start_rx_ring(mac_ring_driver_t, uint64_t);
155 static void aggr_pseudo_stop_rx_ring(mac_ring_driver_t);
156 static int aggr_addmac(void *, const uint8_t *);
157 static int aggr_remmac(void *, const uint8_t *);
158 static int aggr_addvlan(mac_group_driver_t, uint16_t);
159 static int aggr_remvlan(mac_group_driver_t, uint16_t);
160 static mblk_t *aggr_rx_poll(void *, int);
161 static void aggr_fill_ring(void *, mac_ring_type_t, const int,
162     const int, mac_ring_info_t *, mac_ring_handle_t);
163 static void aggr_fill_group(void *, mac_ring_type_t, const int,
164     mac_group_info_t *, mac_group_handle_t);
165 
166 static kmem_cache_t	*aggr_grp_cache;
167 static mod_hash_t	*aggr_grp_hash;
168 static krwlock_t	aggr_grp_lock;
169 static uint_t		aggr_grp_cnt;
170 static id_space_t	*key_ids;
171 
/* Number of hash chains requested when aggr_grp_hash is created. */
#define	GRP_HASHSZ		64
/*
 * Convert a linkid into a modhash key. The argument is parenthesized so
 * that an arbitrary expression (e.g. "a | b") is converted as a whole
 * rather than having the cast bind to its first operand only.
 */
#define	GRP_HASH_KEY(linkid)	((mod_hash_key_t)(uintptr_t)(linkid))
#define	AGGR_PORT_NAME_DELIMIT '-'
175 
176 static uchar_t aggr_zero_mac[] = {0, 0, 0, 0, 0, 0};
177 
178 #define	AGGR_M_CALLBACK_FLAGS	\
179 	(MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_PROPINFO)
180 
181 static mac_callbacks_t aggr_m_callbacks = {
182 	AGGR_M_CALLBACK_FLAGS,
183 	aggr_m_stat,
184 	aggr_m_start,
185 	aggr_m_stop,
186 	aggr_m_promisc,
187 	aggr_m_multicst,
188 	NULL,
189 	NULL,
190 	NULL,
191 	aggr_m_ioctl,
192 	aggr_m_capab_get,
193 	NULL,
194 	NULL,
195 	aggr_m_setprop,
196 	NULL,
197 	aggr_m_propinfo
198 };
199 
200 /*ARGSUSED*/
201 static int
202 aggr_grp_constructor(void *buf, void *arg, int kmflag)
203 {
204 	aggr_grp_t *grp = buf;
205 
206 	bzero(grp, sizeof (*grp));
207 	mutex_init(&grp->lg_lacp_lock, NULL, MUTEX_DEFAULT, NULL);
208 	cv_init(&grp->lg_lacp_cv, NULL, CV_DEFAULT, NULL);
209 	rw_init(&grp->lg_tx_lock, NULL, RW_DRIVER, NULL);
210 	mutex_init(&grp->lg_port_lock, NULL, MUTEX_DEFAULT, NULL);
211 	cv_init(&grp->lg_port_cv, NULL, CV_DEFAULT, NULL);
212 	mutex_init(&grp->lg_tx_flowctl_lock, NULL, MUTEX_DEFAULT, NULL);
213 	cv_init(&grp->lg_tx_flowctl_cv, NULL, CV_DEFAULT, NULL);
214 	grp->lg_link_state = LINK_STATE_UNKNOWN;
215 	return (0);
216 }
217 
218 /*ARGSUSED*/
219 static void
220 aggr_grp_destructor(void *buf, void *arg)
221 {
222 	aggr_grp_t *grp = buf;
223 
224 	if (grp->lg_tx_ports != NULL) {
225 		kmem_free(grp->lg_tx_ports,
226 		    grp->lg_tx_ports_size * sizeof (aggr_port_t *));
227 	}
228 
229 	mutex_destroy(&grp->lg_lacp_lock);
230 	cv_destroy(&grp->lg_lacp_cv);
231 	mutex_destroy(&grp->lg_port_lock);
232 	cv_destroy(&grp->lg_port_cv);
233 	rw_destroy(&grp->lg_tx_lock);
234 	mutex_destroy(&grp->lg_tx_flowctl_lock);
235 	cv_destroy(&grp->lg_tx_flowctl_cv);
236 }
237 
238 void
239 aggr_grp_init(void)
240 {
241 	aggr_grp_cache = kmem_cache_create("aggr_grp_cache",
242 	    sizeof (aggr_grp_t), 0, aggr_grp_constructor,
243 	    aggr_grp_destructor, NULL, NULL, NULL, 0);
244 
245 	aggr_grp_hash = mod_hash_create_idhash("aggr_grp_hash",
246 	    GRP_HASHSZ, mod_hash_null_valdtor);
247 	rw_init(&aggr_grp_lock, NULL, RW_DEFAULT, NULL);
248 	aggr_grp_cnt = 0;
249 
250 	/*
251 	 * Allocate an id space to manage key values (when key is not
252 	 * specified). The range of the id space will be from
253 	 * (AGGR_MAX_KEY + 1) to UINT16_MAX, because the LACP protocol
254 	 * uses a 16-bit key.
255 	 */
256 	key_ids = id_space_create("aggr_key_ids", AGGR_MAX_KEY + 1, UINT16_MAX);
257 	ASSERT(key_ids != NULL);
258 }
259 
260 void
261 aggr_grp_fini(void)
262 {
263 	id_space_destroy(key_ids);
264 	rw_destroy(&aggr_grp_lock);
265 	mod_hash_destroy_idhash(aggr_grp_hash);
266 	kmem_cache_destroy(aggr_grp_cache);
267 }
268 
269 uint_t
270 aggr_grp_count(void)
271 {
272 	uint_t	count;
273 
274 	rw_enter(&aggr_grp_lock, RW_READER);
275 	count = aggr_grp_cnt;
276 	rw_exit(&aggr_grp_lock);
277 	return (count);
278 }
279 
280 /*
281  * Since both aggr_port_notify_cb() and aggr_port_timer_thread() functions
282  * requires the mac perimeter, this function holds a reference of the aggr
283  * and aggr won't call mac_unregister() until this reference drops to 0.
284  */
285 void
286 aggr_grp_port_hold(aggr_port_t *port)
287 {
288 	aggr_grp_t	*grp = port->lp_grp;
289 
290 	AGGR_PORT_REFHOLD(port);
291 	mutex_enter(&grp->lg_port_lock);
292 	grp->lg_port_ref++;
293 	mutex_exit(&grp->lg_port_lock);
294 }
295 
296 /*
297  * Release the reference of the grp and inform aggr_grp_delete() calling
298  * mac_unregister() is now safe.
299  */
300 void
301 aggr_grp_port_rele(aggr_port_t *port)
302 {
303 	aggr_grp_t	*grp = port->lp_grp;
304 
305 	mutex_enter(&grp->lg_port_lock);
306 	if (--grp->lg_port_ref == 0)
307 		cv_signal(&grp->lg_port_cv);
308 	mutex_exit(&grp->lg_port_lock);
309 	AGGR_PORT_REFRELE(port);
310 }
311 
312 /*
313  * Wait for the port's lacp timer thread and the port's notification callback
314  * to exit.
315  */
316 void
317 aggr_grp_port_wait(aggr_grp_t *grp)
318 {
319 	mutex_enter(&grp->lg_port_lock);
320 	if (grp->lg_port_ref != 0)
321 		cv_wait(&grp->lg_port_cv, &grp->lg_port_lock);
322 	mutex_exit(&grp->lg_port_lock);
323 }
324 
325 /*
326  * Attach a port to a link aggregation group.
327  *
328  * A port is attached to a link aggregation group once its speed
329  * and link state have been verified.
330  *
331  * Returns B_TRUE if the group link state or speed has changed. If
332  * it's the case, the caller must notify the MAC layer via a call
333  * to mac_link().
334  */
335 boolean_t
336 aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port)
337 {
338 	boolean_t link_state_changed = B_FALSE;
339 
340 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
341 	ASSERT(MAC_PERIM_HELD(port->lp_mh));
342 
343 	if (port->lp_state == AGGR_PORT_STATE_ATTACHED)
344 		return (B_FALSE);
345 
346 	/*
347 	 * Validate the MAC port link speed and update the group
348 	 * link speed if needed.
349 	 */
350 	if (port->lp_ifspeed == 0 ||
351 	    port->lp_link_state != LINK_STATE_UP ||
352 	    port->lp_link_duplex != LINK_DUPLEX_FULL) {
353 		/*
354 		 * Can't attach a MAC port with unknown link speed,
355 		 * down link, or not in full duplex mode.
356 		 */
357 		return (B_FALSE);
358 	}
359 
360 	mutex_enter(&grp->lg_stat_lock);
361 	if (grp->lg_ifspeed == 0) {
362 		/*
363 		 * The group inherits the speed of the first link being
364 		 * attached.
365 		 */
366 		grp->lg_ifspeed = port->lp_ifspeed;
367 		link_state_changed = B_TRUE;
368 	} else if (grp->lg_ifspeed != port->lp_ifspeed) {
369 		/*
370 		 * The link speed of the MAC port must be the same as
371 		 * the group link speed, as per 802.3ad. Since it is
372 		 * not, the attach is cancelled.
373 		 */
374 		mutex_exit(&grp->lg_stat_lock);
375 		return (B_FALSE);
376 	}
377 	mutex_exit(&grp->lg_stat_lock);
378 
379 	grp->lg_nattached_ports++;
380 
381 	/*
382 	 * Update the group link state.
383 	 */
384 	if (grp->lg_link_state != LINK_STATE_UP) {
385 		grp->lg_link_state = LINK_STATE_UP;
386 		mutex_enter(&grp->lg_stat_lock);
387 		grp->lg_link_duplex = LINK_DUPLEX_FULL;
388 		mutex_exit(&grp->lg_stat_lock);
389 		link_state_changed = B_TRUE;
390 	}
391 
392 	/*
393 	 * Update port's state.
394 	 */
395 	port->lp_state = AGGR_PORT_STATE_ATTACHED;
396 
397 	aggr_grp_multicst_port(port, B_TRUE);
398 
399 	/*
400 	 * The port client doesn't have an Rx SRS; instead of calling
401 	 * mac_rx_set() we set the client's flow callback directly.
402 	 * This datapath is used only when the port's driver doesn't
403 	 * support MAC_CAPAB_RINGS. Drivers with ring support will
404 	 * deliver traffic to the aggr via ring passthru.
405 	 */
406 	mac_client_set_flow_cb(port->lp_mch, aggr_recv_cb, port);
407 
408 	/*
409 	 * If LACP is OFF, the port can be used to send data as soon
410 	 * as its link is up and verified to be compatible with the
411 	 * aggregation.
412 	 *
413 	 * If LACP is active or passive, notify the LACP subsystem, which
414 	 * will enable sending on the port following the LACP protocol.
415 	 */
416 	if (grp->lg_lacp_mode == AGGR_LACP_OFF)
417 		aggr_send_port_enable(port);
418 	else
419 		aggr_lacp_port_attached(port);
420 
421 	return (link_state_changed);
422 }
423 
424 boolean_t
425 aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port)
426 {
427 	boolean_t link_state_changed = B_FALSE;
428 
429 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
430 	ASSERT(MAC_PERIM_HELD(port->lp_mh));
431 
432 	/* update state */
433 	if (port->lp_state != AGGR_PORT_STATE_ATTACHED)
434 		return (B_FALSE);
435 
436 	mac_client_clear_flow_cb(port->lp_mch);
437 
438 	aggr_grp_multicst_port(port, B_FALSE);
439 
440 	if (grp->lg_lacp_mode == AGGR_LACP_OFF)
441 		aggr_send_port_disable(port);
442 	else
443 		aggr_lacp_port_detached(port);
444 
445 	port->lp_state = AGGR_PORT_STATE_STANDBY;
446 
447 	grp->lg_nattached_ports--;
448 	if (grp->lg_nattached_ports == 0) {
449 		/* the last attached MAC port of the group is being detached */
450 		grp->lg_link_state = LINK_STATE_DOWN;
451 		mutex_enter(&grp->lg_stat_lock);
452 		grp->lg_ifspeed = 0;
453 		grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
454 		mutex_exit(&grp->lg_stat_lock);
455 		link_state_changed = B_TRUE;
456 	}
457 
458 	return (link_state_changed);
459 }
460 
461 /*
462  * Update the MAC addresses of the constituent ports of the specified
463  * group. This function is invoked:
464  * - after creating a new aggregation group.
465  * - after adding new ports to an aggregation group.
466  * - after removing a port from a group when the MAC address of
467  *   that port was used for the MAC address of the group.
468  * - after the MAC address of a port changed when the MAC address
469  *   of that port was used for the MAC address of the group.
470  *
471  * Return true if the link state of the aggregation changed, for example
472  * as a result of a failure changing the MAC address of one of the
473  * constituent ports.
474  */
475 boolean_t
476 aggr_grp_update_ports_mac(aggr_grp_t *grp)
477 {
478 	aggr_port_t *cport;
479 	boolean_t link_state_changed = B_FALSE;
480 	mac_perim_handle_t mph;
481 
482 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
483 
484 	for (cport = grp->lg_ports; cport != NULL;
485 	    cport = cport->lp_next) {
486 		mac_perim_enter_by_mh(cport->lp_mh, &mph);
487 		if (aggr_port_unicst(cport) != 0) {
488 			if (aggr_grp_detach_port(grp, cport))
489 				link_state_changed = B_TRUE;
490 		} else {
491 			/*
492 			 * If a port was detached because of a previous
493 			 * failure changing the MAC address, the port is
494 			 * reattached when it successfully changes the MAC
495 			 * address now, and this might cause the link state
496 			 * of the aggregation to change.
497 			 */
498 			if (aggr_grp_attach_port(grp, cport))
499 				link_state_changed = B_TRUE;
500 		}
501 		mac_perim_exit(mph);
502 	}
503 	return (link_state_changed);
504 }
505 
506 /*
507  * Invoked when the MAC address of a port has changed. If the port's
508  * MAC address was used for the group MAC address, set mac_addr_changedp
509  * to B_TRUE to indicate to the caller that it should send a MAC_NOTE_UNICST
510  * notification. If the link state changes due to detach/attach of
511  * the constituent port, set link_state_changedp to B_TRUE to indicate
512  * to the caller that it should send a MAC_NOTE_LINK notification. In both
513  * cases, it is the responsibility of the caller to invoke notification
514  * functions after releasing the the port lock.
515  */
516 void
517 aggr_grp_port_mac_changed(aggr_grp_t *grp, aggr_port_t *port,
518     boolean_t *mac_addr_changedp, boolean_t *link_state_changedp)
519 {
520 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
521 	ASSERT(MAC_PERIM_HELD(port->lp_mh));
522 	ASSERT(mac_addr_changedp != NULL);
523 	ASSERT(link_state_changedp != NULL);
524 
525 	*mac_addr_changedp = B_FALSE;
526 	*link_state_changedp = B_FALSE;
527 
528 	if (grp->lg_addr_fixed) {
529 		/*
530 		 * The group is using a fixed MAC address or an automatic
531 		 * MAC address has not been set.
532 		 */
533 		return;
534 	}
535 
536 	if (grp->lg_mac_addr_port == port) {
537 		/*
538 		 * The MAC address of the port was assigned to the group
539 		 * MAC address. Update the group MAC address.
540 		 */
541 		bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL);
542 		*mac_addr_changedp = B_TRUE;
543 	} else {
544 		/*
545 		 * Update the actual port MAC address to the MAC address
546 		 * of the group.
547 		 */
548 		if (aggr_port_unicst(port) != 0) {
549 			*link_state_changedp = aggr_grp_detach_port(grp, port);
550 		} else {
551 			/*
552 			 * If a port was detached because of a previous
553 			 * failure changing the MAC address, the port is
554 			 * reattached when it successfully changes the MAC
555 			 * address now, and this might cause the link state
556 			 * of the aggregation to change.
557 			 */
558 			*link_state_changedp = aggr_grp_attach_port(grp, port);
559 		}
560 	}
561 }
562 
563 /*
564  * Add a port to a link aggregation group.
565  */
566 static int
567 aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t port_linkid, boolean_t force,
568     aggr_port_t **pp)
569 {
570 	aggr_port_t *port, **cport;
571 	mac_perim_handle_t mph;
572 	zoneid_t port_zoneid = ALL_ZONES;
573 	int err;
574 
575 	/* The port must be in the same zone as the aggregation. */
576 	if (zone_check_datalink(&port_zoneid, port_linkid) != 0)
577 		port_zoneid = GLOBAL_ZONEID;
578 	if (grp->lg_zoneid != port_zoneid)
579 		return (EBUSY);
580 
581 	/*
582 	 * If we are creating the aggr, then there is no MAC handle
583 	 * and thus no perimeter to hold. If we are adding a port to
584 	 * an existing aggr, then the perimiter of the aggr's MAC must
585 	 * be held.
586 	 */
587 	ASSERT(grp->lg_mh == NULL || MAC_PERIM_HELD(grp->lg_mh));
588 
589 	err = aggr_port_create(grp, port_linkid, force, &port);
590 	if (err != 0)
591 		return (err);
592 
593 	mac_perim_enter_by_mh(port->lp_mh, &mph);
594 
595 	/* Add the new port to the end of the list. */
596 	cport = &grp->lg_ports;
597 	while (*cport != NULL)
598 		cport = &((*cport)->lp_next);
599 	*cport = port;
600 
601 	/*
602 	 * Back reference to the group it is member of. A port always
603 	 * holds a reference to its group to ensure that the back
604 	 * reference is always valid.
605 	 */
606 	port->lp_grp = grp;
607 	AGGR_GRP_REFHOLD(grp);
608 	grp->lg_nports++;
609 
610 	aggr_lacp_init_port(port);
611 	mac_perim_exit(mph);
612 
613 	if (pp != NULL)
614 		*pp = port;
615 
616 	return (0);
617 }
618 
619 /*
620  * This is called when the 'lg_tx_ports' arrangement has changed and
621  * we need to update the corresponding 'mi_default_tx_ring'. This
622  * happens for several reasons.
623  *
624  *     - A pseudo TX mac group was added or removed.
625  *     - An LACP message has changed the port's state.
626  *     - A link event has changed the port's state.
627  *
628  * In any case, we see if there is at least one port enabled (see
629  * 'aggr_send_port_enable()'), and if so we use its first ring as the
630  * mac's default TX ring.
631  *
632  * Note, because we only have a single TX group, we don't have to
633  * worry about the rings moving between groups and the chance that mac
634  * will reassign it unless someone removes a port, at which point, we
635  * play it safe and call this again.
636  */
637 void
638 aggr_grp_update_default(aggr_grp_t *grp)
639 {
640 	aggr_port_t *port;
641 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
642 
643 	rw_enter(&grp->lg_tx_lock, RW_WRITER);
644 
645 	if (grp->lg_ntx_ports == 0) {
646 		rw_exit(&grp->lg_tx_lock);
647 		return;
648 	}
649 
650 	port = grp->lg_tx_ports[0];
651 	ASSERT(port->lp_tx_ring_cnt > 0);
652 	mac_hwring_set_default(grp->lg_mh, port->lp_pseudo_tx_rings[0]);
653 	rw_exit(&grp->lg_tx_lock);
654 }
655 
656 /*
657  * Add a pseudo RX ring for the given HW ring handle.
658  */
659 static int
660 aggr_add_pseudo_rx_ring(aggr_port_t *port,
661     aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
662 {
663 	aggr_pseudo_rx_ring_t	*ring;
664 	int			err;
665 	int			j;
666 
667 	for (j = 0; j < MAX_RINGS_PER_GROUP; j++) {
668 		ring = rx_grp->arg_rings + j;
669 		if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE))
670 			break;
671 	}
672 
673 	/*
674 	 * No slot for this new RX ring.
675 	 */
676 	if (j == MAX_RINGS_PER_GROUP)
677 		return (EIO);
678 
679 	ring->arr_flags |= MAC_PSEUDO_RING_INUSE;
680 	ring->arr_hw_rh = hw_rh;
681 	ring->arr_port = port;
682 	ring->arr_grp = rx_grp;
683 	rx_grp->arg_ring_cnt++;
684 
685 	/*
686 	 * The group is already registered, dynamically add a new ring to the
687 	 * mac group.
688 	 */
689 	if ((err = mac_group_add_ring(rx_grp->arg_gh, j)) != 0) {
690 		ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
691 		ring->arr_hw_rh = NULL;
692 		ring->arr_port = NULL;
693 		ring->arr_grp = NULL;
694 		rx_grp->arg_ring_cnt--;
695 	} else {
696 		/*
697 		 * This must run after the MAC is registered.
698 		 */
699 		ASSERT3P(ring->arr_rh, !=, NULL);
700 		mac_hwring_set_passthru(hw_rh, (mac_rx_t)aggr_recv_cb,
701 		    (void *)port, (mac_resource_handle_t)ring);
702 	}
703 	return (err);
704 }
705 
706 /*
707  * Remove the pseudo RX ring of the given HW ring handle.
708  */
709 static void
710 aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
711 {
712 	for (uint_t j = 0; j < MAX_RINGS_PER_GROUP; j++) {
713 		aggr_pseudo_rx_ring_t *ring = rx_grp->arg_rings + j;
714 
715 		if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE) ||
716 		    ring->arr_hw_rh != hw_rh) {
717 			continue;
718 		}
719 
720 		mac_group_rem_ring(rx_grp->arg_gh, ring->arr_rh);
721 
722 		ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
723 		ring->arr_hw_rh = NULL;
724 		ring->arr_port = NULL;
725 		ring->arr_grp = NULL;
726 		rx_grp->arg_ring_cnt--;
727 		mac_hwring_clear_passthru(hw_rh);
728 		break;
729 	}
730 }
731 
732 /*
733  * Create pseudo rings over the HW rings of the port.
734  *
735  * o Create a pseudo ring in rx_grp per HW ring in the port's HW group.
736  *
737  * o Program existing unicast filters on the pseudo group into the HW group.
738  *
739  * o Program existing VLAN filters on the pseudo group into the HW group.
740  */
741 static int
742 aggr_add_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
743 {
744 	mac_ring_handle_t	hw_rh[MAX_RINGS_PER_GROUP];
745 	aggr_unicst_addr_t	*addr, *a;
746 	mac_perim_handle_t	pmph;
747 	aggr_vlan_t		*avp;
748 	uint_t			hw_rh_cnt, i;
749 	int			err = 0;
750 	uint_t			g_idx = rx_grp->arg_index;
751 
752 	ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh));
753 	ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT);
754 	mac_perim_enter_by_mh(port->lp_mh, &pmph);
755 
756 	i = 0;
757 	addr = NULL;
758 	/*
759 	 * This function must be called after the aggr registers its
760 	 * MAC and its Rx groups have been initialized.
761 	 */
762 	ASSERT(rx_grp->arg_gh != NULL);
763 
764 	/*
765 	 * Get the list of the underlying HW rings.
766 	 */
767 	hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx,
768 	    &port->lp_hwghs[g_idx], hw_rh, MAC_RING_TYPE_RX);
769 
770 	/*
771 	 * Add existing VLAN and unicast address filters to the port.
772 	 */
773 	for (avp = list_head(&rx_grp->arg_vlans); avp != NULL;
774 	    avp = list_next(&rx_grp->arg_vlans, avp)) {
775 		if ((err = aggr_port_addvlan(port, g_idx, avp->av_vid)) != 0)
776 			goto err;
777 	}
778 
779 	for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) {
780 		if ((err = aggr_port_addmac(port, g_idx, addr->aua_addr)) != 0)
781 			goto err;
782 	}
783 
784 	for (i = 0; i < hw_rh_cnt; i++) {
785 		err = aggr_add_pseudo_rx_ring(port, rx_grp, hw_rh[i]);
786 		if (err != 0)
787 			goto err;
788 	}
789 
790 	mac_perim_exit(pmph);
791 	return (0);
792 
793 err:
794 	ASSERT(err != 0);
795 
796 	for (uint_t j = 0; j < i; j++)
797 		aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]);
798 
799 	for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next)
800 		aggr_port_remmac(port, g_idx, a->aua_addr);
801 
802 	if (avp != NULL)
803 		avp = list_prev(&rx_grp->arg_vlans, avp);
804 
805 	for (; avp != NULL; avp = list_prev(&rx_grp->arg_vlans, avp)) {
806 		int err2;
807 
808 		if ((err2 = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) {
809 			cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s"
810 			    ": errno %d.", avp->av_vid,
811 			    mac_client_name(port->lp_mch), err2);
812 		}
813 	}
814 
815 	port->lp_hwghs[g_idx] = NULL;
816 	mac_perim_exit(pmph);
817 	return (err);
818 }
819 
820 /*
821  * Destroy the pseudo rings mapping to this port and remove all VLAN
822  * and unicast filters from this port. Even if there are no underlying
823  * HW rings we must still remove the unicast filters to take the port
824  * out of promisc mode.
825  */
826 static void
827 aggr_rem_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
828 {
829 	mac_ring_handle_t	hw_rh[MAX_RINGS_PER_GROUP];
830 	aggr_unicst_addr_t	*addr;
831 	mac_perim_handle_t	pmph;
832 	uint_t			hw_rh_cnt;
833 	uint_t			g_idx = rx_grp->arg_index;
834 
835 	ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh));
836 	ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT);
837 	ASSERT3P(rx_grp->arg_gh, !=, NULL);
838 	mac_perim_enter_by_mh(port->lp_mh, &pmph);
839 
840 	hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx, NULL, hw_rh,
841 	    MAC_RING_TYPE_RX);
842 
843 	for (uint_t i = 0; i < hw_rh_cnt; i++)
844 		aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[i]);
845 
846 	for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next)
847 		aggr_port_remmac(port, g_idx, addr->aua_addr);
848 
849 	for (aggr_vlan_t *avp = list_head(&rx_grp->arg_vlans); avp != NULL;
850 	    avp = list_next(&rx_grp->arg_vlans, avp)) {
851 		int err;
852 
853 		if ((err = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) {
854 			cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s"
855 			    ": errno %d.", avp->av_vid,
856 			    mac_client_name(port->lp_mch), err);
857 		}
858 	}
859 
860 	port->lp_hwghs[g_idx] = NULL;
861 	mac_perim_exit(pmph);
862 }
863 
864 /*
865  * Add a pseudo TX ring for the given HW ring handle.
866  */
867 static int
868 aggr_add_pseudo_tx_ring(aggr_port_t *port,
869     aggr_pseudo_tx_group_t *tx_grp, mac_ring_handle_t hw_rh,
870     mac_ring_handle_t *pseudo_rh)
871 {
872 	aggr_pseudo_tx_ring_t	*ring;
873 	int			err;
874 	int			i;
875 
876 	ASSERT(MAC_PERIM_HELD(port->lp_mh));
877 	for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
878 		ring = tx_grp->atg_rings + i;
879 		if (!(ring->atr_flags & MAC_PSEUDO_RING_INUSE))
880 			break;
881 	}
882 	/*
883 	 * No slot for this new TX ring.
884 	 */
885 	if (i == MAX_RINGS_PER_GROUP)
886 		return (EIO);
887 	/*
888 	 * The following 4 statements needs to be done before
889 	 * calling mac_group_add_ring(). Otherwise it will
890 	 * result in an assertion failure in mac_init_ring().
891 	 */
892 	ring->atr_flags |= MAC_PSEUDO_RING_INUSE;
893 	ring->atr_hw_rh = hw_rh;
894 	ring->atr_port = port;
895 	tx_grp->atg_ring_cnt++;
896 
897 	/*
898 	 * The TX side has no concept of ring groups unlike RX groups.
899 	 * There is just a single group which stores all the TX rings.
900 	 * This group will be used to store aggr's pseudo TX rings.
901 	 */
902 	if ((err = mac_group_add_ring(tx_grp->atg_gh, i)) != 0) {
903 		ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE;
904 		ring->atr_hw_rh = NULL;
905 		ring->atr_port = NULL;
906 		tx_grp->atg_ring_cnt--;
907 	} else {
908 		*pseudo_rh = mac_find_ring(tx_grp->atg_gh, i);
909 		if (hw_rh != NULL) {
910 			mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring,
911 			    mac_find_ring(tx_grp->atg_gh, i));
912 		}
913 	}
914 
915 	return (err);
916 }
917 
918 /*
919  * Remove the pseudo TX ring of the given HW ring handle.
920  */
921 static void
922 aggr_rem_pseudo_tx_ring(aggr_pseudo_tx_group_t *tx_grp,
923     mac_ring_handle_t pseudo_hw_rh)
924 {
925 	aggr_pseudo_tx_ring_t	*ring;
926 	int			i;
927 
928 	for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
929 		ring = tx_grp->atg_rings + i;
930 		if (ring->atr_rh != pseudo_hw_rh)
931 			continue;
932 
933 		ASSERT(ring->atr_flags & MAC_PSEUDO_RING_INUSE);
934 		mac_group_rem_ring(tx_grp->atg_gh, pseudo_hw_rh);
935 		ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE;
936 		mac_hwring_teardown(ring->atr_hw_rh);
937 		ring->atr_hw_rh = NULL;
938 		ring->atr_port = NULL;
939 		tx_grp->atg_ring_cnt--;
940 		break;
941 	}
942 }
943 
944 /*
945  * This function is called to create pseudo rings over hardware rings of
946  * the underlying device. There is a 1:1 mapping between the pseudo TX
947  * rings of the aggr and the hardware rings of the underlying port.
948  */
949 static int
950 aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp)
951 {
952 	aggr_grp_t		*grp = port->lp_grp;
953 	mac_ring_handle_t	hw_rh[MAX_RINGS_PER_GROUP], pseudo_rh;
954 	mac_perim_handle_t	pmph;
955 	int			hw_rh_cnt, i = 0, j;
956 	int			err = 0;
957 
958 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
959 	mac_perim_enter_by_mh(port->lp_mh, &pmph);
960 
961 	/*
962 	 * Get the list the the underlying HW rings.
963 	 */
964 	hw_rh_cnt = mac_hwrings_get(port->lp_mch, NULL, hw_rh,
965 	    MAC_RING_TYPE_TX);
966 
967 	/*
968 	 * Even if the underlying NIC does not have TX rings, we
969 	 * still make a psuedo TX ring for that NIC with NULL as
970 	 * the ring handle.
971 	 */
972 	if (hw_rh_cnt == 0)
973 		port->lp_tx_ring_cnt = 1;
974 	else
975 		port->lp_tx_ring_cnt = hw_rh_cnt;
976 
977 	port->lp_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
978 	    port->lp_tx_ring_cnt), KM_SLEEP);
979 	port->lp_pseudo_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
980 	    port->lp_tx_ring_cnt), KM_SLEEP);
981 
982 	if (hw_rh_cnt == 0) {
983 		if ((err = aggr_add_pseudo_tx_ring(port, tx_grp,
984 		    NULL, &pseudo_rh)) == 0) {
985 			port->lp_tx_rings[0] = NULL;
986 			port->lp_pseudo_tx_rings[0] = pseudo_rh;
987 		}
988 	} else {
989 		for (i = 0; err == 0 && i < hw_rh_cnt; i++) {
990 			err = aggr_add_pseudo_tx_ring(port,
991 			    tx_grp, hw_rh[i], &pseudo_rh);
992 			if (err != 0)
993 				break;
994 			port->lp_tx_rings[i] = hw_rh[i];
995 			port->lp_pseudo_tx_rings[i] = pseudo_rh;
996 		}
997 	}
998 
999 	if (err != 0) {
1000 		if (hw_rh_cnt != 0) {
1001 			for (j = 0; j < i; j++) {
1002 				aggr_rem_pseudo_tx_ring(tx_grp,
1003 				    port->lp_pseudo_tx_rings[j]);
1004 			}
1005 		}
1006 		kmem_free(port->lp_tx_rings,
1007 		    (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
1008 		kmem_free(port->lp_pseudo_tx_rings,
1009 		    (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
1010 		port->lp_tx_ring_cnt = 0;
1011 	} else {
1012 		port->lp_tx_grp_added = B_TRUE;
1013 		port->lp_tx_notify_mh = mac_client_tx_notify(port->lp_mch,
1014 		    aggr_tx_ring_update, port);
1015 	}
1016 	mac_perim_exit(pmph);
1017 	aggr_grp_update_default(grp);
1018 	return (err);
1019 }
1020 
1021 /*
1022  * This function is called by aggr to remove pseudo TX rings over the
1023  * HW rings of the underlying port.
1024  */
1025 static void
1026 aggr_rem_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp)
1027 {
1028 	aggr_grp_t		*grp = port->lp_grp;
1029 	mac_perim_handle_t	pmph;
1030 	int			i;
1031 
1032 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1033 	mac_perim_enter_by_mh(port->lp_mh, &pmph);
1034 
1035 	if (!port->lp_tx_grp_added)
1036 		goto done;
1037 
1038 	ASSERT(tx_grp->atg_gh != NULL);
1039 
1040 	for (i = 0; i < port->lp_tx_ring_cnt; i++)
1041 		aggr_rem_pseudo_tx_ring(tx_grp, port->lp_pseudo_tx_rings[i]);
1042 
1043 	kmem_free(port->lp_tx_rings,
1044 	    (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
1045 	kmem_free(port->lp_pseudo_tx_rings,
1046 	    (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
1047 
1048 	port->lp_tx_ring_cnt = 0;
1049 	(void) mac_client_tx_notify(port->lp_mch, NULL, port->lp_tx_notify_mh);
1050 	port->lp_tx_grp_added = B_FALSE;
1051 	aggr_grp_update_default(grp);
1052 done:
1053 	mac_perim_exit(pmph);
1054 }
1055 
1056 static int
1057 aggr_pseudo_disable_intr(mac_intr_handle_t ih)
1058 {
1059 	aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
1060 	return (mac_hwring_disable_intr(rr_ring->arr_hw_rh));
1061 }
1062 
1063 static int
1064 aggr_pseudo_enable_intr(mac_intr_handle_t ih)
1065 {
1066 	aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
1067 	return (mac_hwring_enable_intr(rr_ring->arr_hw_rh));
1068 }
1069 
1070 /*
1071  * Start the pseudo ring. Since the pseudo ring is just an abstraction
1072  * over an actual HW ring, the real task is to start the underlying HW
1073  * ring.
1074  */
1075 static int
1076 aggr_pseudo_start_rx_ring(mac_ring_driver_t arg, uint64_t mr_gen)
1077 {
1078 	int err;
1079 	aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
1080 
1081 	err = mac_hwring_start(rr_ring->arr_hw_rh);
1082 
1083 	if (err != 0)
1084 		return (err);
1085 
1086 	rr_ring->arr_gen = mr_gen;
1087 	return (err);
1088 }
1089 
1090 /*
1091  * Stop the pseudo ring. Since the pseudo ring is just an abstraction
1092  * over an actual HW ring, the real task is to stop the underlying HW
1093  * ring.
1094  */
1095 static void
1096 aggr_pseudo_stop_rx_ring(mac_ring_driver_t arg)
1097 {
1098 	aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
1099 
1100 	/*
1101 	 * The rings underlying the default group must stay up to
1102 	 * continue receiving LACP traffic. We would normally never
1103 	 * stop the default Rx rings because of the primary MAC
1104 	 * client; but aggr's primary MAC client doesn't call
1105 	 * mac_unicast_add() and thus mi_active is 0 when the last
1106 	 * non-primary client is deleted.
1107 	 */
1108 	if (rr_ring->arr_grp->arg_index != 0)
1109 		mac_hwring_stop(rr_ring->arr_hw_rh);
1110 }
1111 
1112 /*
1113  * Add one or more ports to an existing link aggregation group.
1114  */
1115 int
1116 aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force,
1117     laioc_port_t *ports)
1118 {
1119 	int rc;
1120 	uint_t port_added = 0;
1121 	uint_t grp_added;
1122 	aggr_grp_t *grp = NULL;
1123 	aggr_port_t *port;
1124 	boolean_t link_state_changed = B_FALSE;
1125 	mac_perim_handle_t mph, pmph;
1126 
1127 	/* Get the aggr corresponding to linkid. */
1128 	rw_enter(&aggr_grp_lock, RW_READER);
1129 	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1130 	    (mod_hash_val_t *)&grp) != 0) {
1131 		rw_exit(&aggr_grp_lock);
1132 		return (ENOENT);
1133 	}
1134 	AGGR_GRP_REFHOLD(grp);
1135 
1136 	/*
1137 	 * Hold the perimeter so that the aggregation can't be destroyed.
1138 	 */
1139 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1140 	rw_exit(&aggr_grp_lock);
1141 
1142 	/* Add the specified ports to the aggr. */
1143 	for (uint_t i = 0; i < nports; i++) {
1144 		grp_added = 0;
1145 
1146 		if ((rc = aggr_grp_add_port(grp, ports[i].lp_linkid,
1147 		    force, &port)) != 0) {
1148 			goto bail;
1149 		}
1150 
1151 		ASSERT(port != NULL);
1152 		port_added++;
1153 
1154 		/* check capabilities */
1155 		if (!aggr_grp_capab_check(grp, port) ||
1156 		    !aggr_grp_sdu_check(grp, port) ||
1157 		    !aggr_grp_margin_check(grp, port)) {
1158 			rc = ENOTSUP;
1159 			goto bail;
1160 		}
1161 
1162 		/*
1163 		 * Create the pseudo ring for each HW ring of the underlying
1164 		 * port.
1165 		 */
1166 		rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group);
1167 		if (rc != 0)
1168 			goto bail;
1169 
1170 		for (uint_t j = 0; j < grp->lg_rx_group_count; j++) {
1171 			rc = aggr_add_pseudo_rx_group(port,
1172 			    &grp->lg_rx_groups[j]);
1173 
1174 			if (rc != 0)
1175 				goto bail;
1176 
1177 			grp_added++;
1178 		}
1179 
1180 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
1181 
1182 		/* set LACP mode */
1183 		aggr_port_lacp_set_mode(grp, port);
1184 
1185 		/* start port if group has already been started */
1186 		if (grp->lg_started) {
1187 			rc = aggr_port_start(port);
1188 			if (rc != 0) {
1189 				mac_perim_exit(pmph);
1190 				goto bail;
1191 			}
1192 
1193 			/*
1194 			 * Turn on the promiscuous mode over the port when it
1195 			 * is requested to be turned on to receive the
1196 			 * non-primary address over a port, or the promiscuous
1197 			 * mode is enabled over the aggr.
1198 			 */
1199 			if (grp->lg_promisc || port->lp_prom_addr != NULL) {
1200 				rc = aggr_port_promisc(port, B_TRUE);
1201 				if (rc != 0) {
1202 					mac_perim_exit(pmph);
1203 					goto bail;
1204 				}
1205 			}
1206 		}
1207 		mac_perim_exit(pmph);
1208 
1209 		/*
1210 		 * Attach each port if necessary.
1211 		 */
1212 		if (aggr_port_notify_link(grp, port))
1213 			link_state_changed = B_TRUE;
1214 
1215 		/*
1216 		 * Initialize the callback functions for this port.
1217 		 */
1218 		aggr_port_init_callbacks(port);
1219 	}
1220 
1221 	/* update the MAC address of the constituent ports */
1222 	if (aggr_grp_update_ports_mac(grp))
1223 		link_state_changed = B_TRUE;
1224 
1225 	if (link_state_changed)
1226 		mac_link_update(grp->lg_mh, grp->lg_link_state);
1227 
1228 bail:
1229 	if (rc != 0) {
1230 		/* stop and remove ports that have been added */
1231 		for (uint_t i = 0; i < port_added; i++) {
1232 			uint_t grp_remove;
1233 
1234 			port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1235 			ASSERT(port != NULL);
1236 
1237 			if (grp->lg_started) {
1238 				mac_perim_enter_by_mh(port->lp_mh, &pmph);
1239 				(void) aggr_port_promisc(port, B_FALSE);
1240 				aggr_port_stop(port);
1241 				mac_perim_exit(pmph);
1242 			}
1243 
1244 			aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1245 
1246 			/*
1247 			 * Only the last port could have a partial set
1248 			 * of groups added.
1249 			 */
1250 			grp_remove = (i + 1 == port_added) ? grp_added :
1251 			    grp->lg_rx_group_count;
1252 
1253 			for (uint_t j = 0; j < grp_remove; j++) {
1254 				aggr_rem_pseudo_rx_group(port,
1255 				    &grp->lg_rx_groups[j]);
1256 			}
1257 
1258 			(void) aggr_grp_rem_port(grp, port, NULL, NULL);
1259 		}
1260 	}
1261 
1262 	mac_perim_exit(mph);
1263 	AGGR_GRP_REFRELE(grp);
1264 	return (rc);
1265 }
1266 
1267 static int
1268 aggr_grp_modify_common(aggr_grp_t *grp, uint8_t update_mask, uint32_t policy,
1269     boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode,
1270     aggr_lacp_timer_t lacp_timer)
1271 {
1272 	boolean_t mac_addr_changed = B_FALSE;
1273 	boolean_t link_state_changed = B_FALSE;
1274 	mac_perim_handle_t pmph;
1275 
1276 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1277 
1278 	/* validate fixed address if specified */
1279 	if ((update_mask & AGGR_MODIFY_MAC) && mac_fixed &&
1280 	    ((bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) ||
1281 	    (mac_addr[0] & 0x01))) {
1282 		return (EINVAL);
1283 	}
1284 
1285 	/* update policy if requested */
1286 	if (update_mask & AGGR_MODIFY_POLICY)
1287 		aggr_send_update_policy(grp, policy);
1288 
1289 	/* update unicast MAC address if requested */
1290 	if (update_mask & AGGR_MODIFY_MAC) {
1291 		if (mac_fixed) {
1292 			/* user-supplied MAC address */
1293 			grp->lg_mac_addr_port = NULL;
1294 			if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) != 0) {
1295 				bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
1296 				mac_addr_changed = B_TRUE;
1297 			}
1298 		} else if (grp->lg_addr_fixed) {
1299 			/* switch from user-supplied to automatic */
1300 			aggr_port_t *port = grp->lg_ports;
1301 
1302 			mac_perim_enter_by_mh(port->lp_mh, &pmph);
1303 			bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL);
1304 			grp->lg_mac_addr_port = port;
1305 			mac_addr_changed = B_TRUE;
1306 			mac_perim_exit(pmph);
1307 		}
1308 		grp->lg_addr_fixed = mac_fixed;
1309 	}
1310 
1311 	if (mac_addr_changed)
1312 		link_state_changed = aggr_grp_update_ports_mac(grp);
1313 
1314 	if (update_mask & AGGR_MODIFY_LACP_MODE)
1315 		aggr_lacp_update_mode(grp, lacp_mode);
1316 
1317 	if (update_mask & AGGR_MODIFY_LACP_TIMER)
1318 		aggr_lacp_update_timer(grp, lacp_timer);
1319 
1320 	if (link_state_changed)
1321 		mac_link_update(grp->lg_mh, grp->lg_link_state);
1322 
1323 	if (mac_addr_changed)
1324 		mac_unicst_update(grp->lg_mh, grp->lg_addr);
1325 
1326 	return (0);
1327 }
1328 
1329 /*
1330  * Update properties of an existing link aggregation group.
1331  */
1332 int
1333 aggr_grp_modify(datalink_id_t linkid, uint8_t update_mask, uint32_t policy,
1334     boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode,
1335     aggr_lacp_timer_t lacp_timer)
1336 {
1337 	aggr_grp_t *grp = NULL;
1338 	mac_perim_handle_t mph;
1339 	int err;
1340 
1341 	/* get group corresponding to linkid */
1342 	rw_enter(&aggr_grp_lock, RW_READER);
1343 	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1344 	    (mod_hash_val_t *)&grp) != 0) {
1345 		rw_exit(&aggr_grp_lock);
1346 		return (ENOENT);
1347 	}
1348 	AGGR_GRP_REFHOLD(grp);
1349 
1350 	/*
1351 	 * Hold the perimeter so that the aggregation won't be destroyed.
1352 	 */
1353 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1354 	rw_exit(&aggr_grp_lock);
1355 
1356 	err = aggr_grp_modify_common(grp, update_mask, policy, mac_fixed,
1357 	    mac_addr, lacp_mode, lacp_timer);
1358 
1359 	mac_perim_exit(mph);
1360 	AGGR_GRP_REFRELE(grp);
1361 	return (err);
1362 }
1363 
1364 /*
1365  * Create a new link aggregation group upon request from administrator.
1366  * Returns 0 on success, an errno on failure.
1367  */
1368 int
1369 aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports,
1370     laioc_port_t *ports, uint32_t policy, boolean_t mac_fixed, boolean_t force,
1371     uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, aggr_lacp_timer_t lacp_timer,
1372     cred_t *credp)
1373 {
1374 	aggr_grp_t *grp = NULL;
1375 	aggr_port_t *port;
1376 	mac_register_t *mac;
1377 	boolean_t link_state_changed;
1378 	mac_perim_handle_t mph;
1379 	int err;
1380 	int i;
1381 	kt_did_t tid = 0;
1382 
1383 	/* need at least one port */
1384 	if (nports == 0)
1385 		return (EINVAL);
1386 
1387 	rw_enter(&aggr_grp_lock, RW_WRITER);
1388 
1389 	/* does a group with the same linkid already exist? */
1390 	err = mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1391 	    (mod_hash_val_t *)&grp);
1392 	if (err == 0) {
1393 		rw_exit(&aggr_grp_lock);
1394 		return (EEXIST);
1395 	}
1396 
1397 	grp = kmem_cache_alloc(aggr_grp_cache, KM_SLEEP);
1398 
1399 	grp->lg_refs = 1;
1400 	grp->lg_closing = B_FALSE;
1401 	grp->lg_force = force;
1402 	grp->lg_linkid = linkid;
1403 	grp->lg_zoneid = crgetzoneid(credp);
1404 	grp->lg_ifspeed = 0;
1405 	grp->lg_link_state = LINK_STATE_UNKNOWN;
1406 	grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
1407 	grp->lg_started = B_FALSE;
1408 	grp->lg_promisc = B_FALSE;
1409 	grp->lg_lacp_done = B_FALSE;
1410 	grp->lg_tx_notify_done = B_FALSE;
1411 	grp->lg_lacp_head = grp->lg_lacp_tail = NULL;
1412 	grp->lg_lacp_rx_thread = thread_create(NULL, 0,
1413 	    aggr_lacp_rx_thread, grp, 0, &p0, TS_RUN, minclsyspri);
1414 	grp->lg_tx_notify_thread = thread_create(NULL, 0,
1415 	    aggr_tx_notify_thread, grp, 0, &p0, TS_RUN, minclsyspri);
1416 	grp->lg_tx_blocked_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
1417 	    MAX_RINGS_PER_GROUP), KM_SLEEP);
1418 	grp->lg_tx_blocked_cnt = 0;
1419 	bzero(&grp->lg_rx_groups,
1420 	    sizeof (aggr_pseudo_rx_group_t) * MAX_GROUPS_PER_PORT);
1421 	bzero(&grp->lg_tx_group, sizeof (aggr_pseudo_tx_group_t));
1422 	aggr_lacp_init_grp(grp);
1423 
1424 	/* add MAC ports to group */
1425 	grp->lg_ports = NULL;
1426 	grp->lg_nports = 0;
1427 	grp->lg_nattached_ports = 0;
1428 	grp->lg_ntx_ports = 0;
1429 
1430 	/*
1431 	 * If key is not specified by the user, allocate the key.
1432 	 */
1433 	if ((key == 0) && ((key = (uint32_t)id_alloc(key_ids)) == 0)) {
1434 		err = ENOMEM;
1435 		goto bail;
1436 	}
1437 	grp->lg_key = key;
1438 
1439 	for (i = 0; i < nports; i++) {
1440 		err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, &port);
1441 		if (err != 0)
1442 			goto bail;
1443 	}
1444 
1445 	grp->lg_rx_group_count = 1;
1446 
1447 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1448 		uint_t num_rgroups;
1449 
1450 		mac_perim_enter_by_mh(port->lp_mh, &mph);
1451 		num_rgroups = mac_get_num_rx_groups(port->lp_mh);
1452 		mac_perim_exit(mph);
1453 
1454 		/*
1455 		 * Utilize all the groups in a port. If some ports
1456 		 * have less groups than others, then traffic destined
1457 		 * for the same unicast address may be HW classified
1458 		 * on some ports but SW classified by aggr when
1459 		 * arriving on other ports.
1460 		 */
1461 		grp->lg_rx_group_count = MAX(grp->lg_rx_group_count,
1462 		    num_rgroups);
1463 	}
1464 
1465 	/*
1466 	 * There could be cases where the hardware provides more
1467 	 * groups than aggr can support. Make sure we never go above
1468 	 * the max aggr can support.
1469 	 */
1470 	grp->lg_rx_group_count = MIN(grp->lg_rx_group_count,
1471 	    MAX_GROUPS_PER_PORT);
1472 
1473 	ASSERT3U(grp->lg_rx_group_count, >, 0);
1474 	for (i = 0; i < MAX_GROUPS_PER_PORT; i++) {
1475 		grp->lg_rx_groups[i].arg_index = i;
1476 		grp->lg_rx_groups[i].arg_untagged = 0;
1477 		list_create(&(grp->lg_rx_groups[i].arg_vlans),
1478 		    sizeof (aggr_vlan_t), offsetof(aggr_vlan_t, av_link));
1479 	}
1480 
1481 	/*
1482 	 * If no explicit MAC address was specified by the administrator,
1483 	 * set it to the MAC address of the first port.
1484 	 */
1485 	grp->lg_addr_fixed = mac_fixed;
1486 	if (grp->lg_addr_fixed) {
1487 		/* validate specified address */
1488 		if (bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) {
1489 			err = EINVAL;
1490 			goto bail;
1491 		}
1492 		bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
1493 	} else {
1494 		bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
1495 		grp->lg_mac_addr_port = grp->lg_ports;
1496 	}
1497 
1498 	/* Set the initial group capabilities. */
1499 	aggr_grp_capab_set(grp);
1500 
1501 	if ((mac = mac_alloc(MAC_VERSION)) == NULL) {
1502 		err = ENOMEM;
1503 		goto bail;
1504 	}
1505 	mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1506 	mac->m_driver = grp;
1507 	mac->m_dip = aggr_dip;
1508 	mac->m_instance = grp->lg_key > AGGR_MAX_KEY ? (uint_t)-1 : grp->lg_key;
1509 	mac->m_src_addr = grp->lg_addr;
1510 	mac->m_callbacks = &aggr_m_callbacks;
1511 	mac->m_min_sdu = 0;
1512 	mac->m_max_sdu = grp->lg_max_sdu = aggr_grp_max_sdu(grp);
1513 	mac->m_margin = aggr_grp_max_margin(grp);
1514 	mac->m_v12n = MAC_VIRT_LEVEL1;
1515 	err = mac_register(mac, &grp->lg_mh);
1516 	mac_free(mac);
1517 	if (err != 0)
1518 		goto bail;
1519 
1520 	err = dls_devnet_create(grp->lg_mh, grp->lg_linkid, crgetzoneid(credp));
1521 	if (err != 0) {
1522 		(void) mac_unregister(grp->lg_mh);
1523 		grp->lg_mh = NULL;
1524 		goto bail;
1525 	}
1526 
1527 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1528 
1529 	/*
1530 	 * Update the MAC address of the constituent ports.
1531 	 * None of the port is attached at this time, the link state of the
1532 	 * aggregation will not change.
1533 	 *
1534 	 * All ports take on the primary MAC address of the aggr
1535 	 * (lg_aggr). At this point, none of the ports are attached;
1536 	 * thus the link state of the aggregation will not change.
1537 	 */
1538 	link_state_changed = aggr_grp_update_ports_mac(grp);
1539 	ASSERT(!link_state_changed);
1540 
1541 	/* Update outbound load balancing policy. */
1542 	aggr_send_update_policy(grp, policy);
1543 
1544 	/* Set LACP mode. */
1545 	aggr_lacp_set_mode(grp, lacp_mode, lacp_timer);
1546 
1547 	/*
1548 	 * Attach each port if necessary.
1549 	 */
1550 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1551 		/*
1552 		 * Create the pseudo ring for each HW ring of the
1553 		 * underlying port. Note that this is done after the
1554 		 * aggr registers its MAC.
1555 		 */
1556 		VERIFY3S(aggr_add_pseudo_tx_group(port, &grp->lg_tx_group),
1557 		    ==, 0);
1558 
1559 		for (i = 0; i < grp->lg_rx_group_count; i++) {
1560 			VERIFY3S(aggr_add_pseudo_rx_group(port,
1561 			    &grp->lg_rx_groups[i]), ==, 0);
1562 		}
1563 
1564 		if (aggr_port_notify_link(grp, port))
1565 			link_state_changed = B_TRUE;
1566 
1567 		/*
1568 		 * Initialize the callback functions for this port.
1569 		 */
1570 		aggr_port_init_callbacks(port);
1571 	}
1572 
1573 	if (link_state_changed)
1574 		mac_link_update(grp->lg_mh, grp->lg_link_state);
1575 
1576 	/* add new group to hash table */
1577 	err = mod_hash_insert(aggr_grp_hash, GRP_HASH_KEY(linkid),
1578 	    (mod_hash_val_t)grp);
1579 	ASSERT(err == 0);
1580 	aggr_grp_cnt++;
1581 
1582 	mac_perim_exit(mph);
1583 	rw_exit(&aggr_grp_lock);
1584 	return (0);
1585 
1586 bail:
1587 
1588 	grp->lg_closing = B_TRUE;
1589 
1590 	port = grp->lg_ports;
1591 	while (port != NULL) {
1592 		aggr_port_t *cport;
1593 
1594 		cport = port->lp_next;
1595 		aggr_port_delete(port);
1596 		port = cport;
1597 	}
1598 
1599 	/*
1600 	 * Inform the lacp_rx thread to exit.
1601 	 */
1602 	mutex_enter(&grp->lg_lacp_lock);
1603 	grp->lg_lacp_done = B_TRUE;
1604 	cv_signal(&grp->lg_lacp_cv);
1605 	while (grp->lg_lacp_rx_thread != NULL)
1606 		cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock);
1607 	mutex_exit(&grp->lg_lacp_lock);
1608 	/*
1609 	 * Inform the tx_notify thread to exit.
1610 	 */
1611 	mutex_enter(&grp->lg_tx_flowctl_lock);
1612 	if (grp->lg_tx_notify_thread != NULL) {
1613 		tid = grp->lg_tx_notify_thread->t_did;
1614 		grp->lg_tx_notify_done = B_TRUE;
1615 		cv_signal(&grp->lg_tx_flowctl_cv);
1616 	}
1617 	mutex_exit(&grp->lg_tx_flowctl_lock);
1618 	if (tid != 0)
1619 		thread_join(tid);
1620 
1621 	kmem_free(grp->lg_tx_blocked_rings,
1622 	    (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP));
1623 	rw_exit(&aggr_grp_lock);
1624 	AGGR_GRP_REFRELE(grp);
1625 	return (err);
1626 }
1627 
1628 /*
1629  * Return a pointer to the member of a group with specified linkid.
1630  */
1631 static aggr_port_t *
1632 aggr_grp_port_lookup(aggr_grp_t *grp, datalink_id_t linkid)
1633 {
1634 	aggr_port_t *port;
1635 
1636 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1637 
1638 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1639 		if (port->lp_linkid == linkid)
1640 			break;
1641 	}
1642 
1643 	return (port);
1644 }
1645 
1646 /*
1647  * Stop, detach and remove a port from a link aggregation group.
1648  */
1649 static int
1650 aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port,
1651     boolean_t *mac_addr_changedp, boolean_t *link_state_changedp)
1652 {
1653 	int rc = 0;
1654 	aggr_port_t **pport;
1655 	boolean_t mac_addr_changed = B_FALSE;
1656 	boolean_t link_state_changed = B_FALSE;
1657 	mac_perim_handle_t mph;
1658 	uint64_t val;
1659 	uint_t i;
1660 	uint_t stat;
1661 
1662 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1663 	ASSERT(grp->lg_nports > 1);
1664 	ASSERT(!grp->lg_closing);
1665 
1666 	/* unlink port */
1667 	for (pport = &grp->lg_ports; *pport != port;
1668 	    pport = &(*pport)->lp_next) {
1669 		if (*pport == NULL) {
1670 			rc = ENOENT;
1671 			goto done;
1672 		}
1673 	}
1674 	*pport = port->lp_next;
1675 
1676 	mac_perim_enter_by_mh(port->lp_mh, &mph);
1677 
1678 	/*
1679 	 * If the MAC address of the port being removed was assigned
1680 	 * to the group, update the group MAC address
1681 	 * using the MAC address of a different port.
1682 	 */
1683 	if (!grp->lg_addr_fixed && grp->lg_mac_addr_port == port) {
1684 		/*
1685 		 * Set the MAC address of the group to the
1686 		 * MAC address of its first port.
1687 		 */
1688 		bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
1689 		grp->lg_mac_addr_port = grp->lg_ports;
1690 		mac_addr_changed = B_TRUE;
1691 	}
1692 
1693 	link_state_changed = aggr_grp_detach_port(grp, port);
1694 
1695 	/*
1696 	 * Add the counter statistics of the ports while it was aggregated
1697 	 * to the group's residual statistics.  This is done by obtaining
1698 	 * the current counter from the underlying MAC then subtracting the
1699 	 * value of the counter at the moment it was added to the
1700 	 * aggregation.
1701 	 */
1702 	for (i = 0; i < MAC_NSTAT; i++) {
1703 		stat = i + MAC_STAT_MIN;
1704 		if (!MAC_STAT_ISACOUNTER(stat))
1705 			continue;
1706 		val = aggr_port_stat(port, stat);
1707 		val -= port->lp_stat[i];
1708 		mutex_enter(&grp->lg_stat_lock);
1709 		grp->lg_stat[i] += val;
1710 		mutex_exit(&grp->lg_stat_lock);
1711 	}
1712 	for (i = 0; i < ETHER_NSTAT; i++) {
1713 		stat = i + MACTYPE_STAT_MIN;
1714 		if (!ETHER_STAT_ISACOUNTER(stat))
1715 			continue;
1716 		val = aggr_port_stat(port, stat);
1717 		val -= port->lp_ether_stat[i];
1718 		mutex_enter(&grp->lg_stat_lock);
1719 		grp->lg_ether_stat[i] += val;
1720 		mutex_exit(&grp->lg_stat_lock);
1721 	}
1722 
1723 	grp->lg_nports--;
1724 	mac_perim_exit(mph);
1725 
1726 	aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1727 	aggr_port_delete(port);
1728 
1729 	/*
1730 	 * If the group MAC address has changed, update the MAC address of
1731 	 * the remaining constituent ports according to the new MAC
1732 	 * address of the group.
1733 	 */
1734 	if (mac_addr_changed && aggr_grp_update_ports_mac(grp))
1735 		link_state_changed = B_TRUE;
1736 
1737 done:
1738 	if (mac_addr_changedp != NULL)
1739 		*mac_addr_changedp = mac_addr_changed;
1740 	if (link_state_changedp != NULL)
1741 		*link_state_changedp = link_state_changed;
1742 
1743 	return (rc);
1744 }
1745 
1746 /*
1747  * Remove one or more ports from an existing link aggregation group.
1748  */
1749 int
1750 aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports)
1751 {
1752 	int rc = 0, i;
1753 	aggr_grp_t *grp = NULL;
1754 	aggr_port_t *port;
1755 	boolean_t mac_addr_update = B_FALSE, mac_addr_changed;
1756 	boolean_t link_state_update = B_FALSE, link_state_changed;
1757 	mac_perim_handle_t mph, pmph;
1758 
1759 	/* get group corresponding to linkid */
1760 	rw_enter(&aggr_grp_lock, RW_READER);
1761 	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1762 	    (mod_hash_val_t *)&grp) != 0) {
1763 		rw_exit(&aggr_grp_lock);
1764 		return (ENOENT);
1765 	}
1766 	AGGR_GRP_REFHOLD(grp);
1767 
1768 	/*
1769 	 * Hold the perimeter so that the aggregation won't be destroyed.
1770 	 */
1771 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1772 	rw_exit(&aggr_grp_lock);
1773 
1774 	/* we need to keep at least one port per group */
1775 	if (nports >= grp->lg_nports) {
1776 		rc = EINVAL;
1777 		goto bail;
1778 	}
1779 
1780 	/* first verify that all the groups are valid */
1781 	for (i = 0; i < nports; i++) {
1782 		if (aggr_grp_port_lookup(grp, ports[i].lp_linkid) == NULL) {
1783 			/* port not found */
1784 			rc = ENOENT;
1785 			goto bail;
1786 		}
1787 	}
1788 
1789 	/* clear the promiscous mode for the specified ports */
1790 	for (i = 0; i < nports && rc == 0; i++) {
1791 		/* lookup port */
1792 		port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1793 		ASSERT(port != NULL);
1794 
1795 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
1796 		rc = aggr_port_promisc(port, B_FALSE);
1797 		mac_perim_exit(pmph);
1798 	}
1799 	if (rc != 0) {
1800 		for (i = 0; i < nports; i++) {
1801 			port = aggr_grp_port_lookup(grp,
1802 			    ports[i].lp_linkid);
1803 			ASSERT(port != NULL);
1804 
1805 			/*
1806 			 * Turn the promiscuous mode back on if it is required
1807 			 * to receive the non-primary address over a port, or
1808 			 * the promiscous mode is enabled over the aggr.
1809 			 */
1810 			mac_perim_enter_by_mh(port->lp_mh, &pmph);
1811 			if (port->lp_started && (grp->lg_promisc ||
1812 			    port->lp_prom_addr != NULL)) {
1813 				(void) aggr_port_promisc(port, B_TRUE);
1814 			}
1815 			mac_perim_exit(pmph);
1816 		}
1817 		goto bail;
1818 	}
1819 
1820 	/* remove the specified ports from group */
1821 	for (i = 0; i < nports; i++) {
1822 		/* lookup port */
1823 		port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1824 		ASSERT(port != NULL);
1825 
1826 		/* stop port if group has already been started */
1827 		if (grp->lg_started) {
1828 			mac_perim_enter_by_mh(port->lp_mh, &pmph);
1829 			aggr_port_stop(port);
1830 			mac_perim_exit(pmph);
1831 		}
1832 
1833 		/*
1834 		 * aggr_rem_pseudo_tx_group() is not called here. Instead
1835 		 * it is called from inside aggr_grp_rem_port() after the
1836 		 * port has been detached. The reason is that
1837 		 * aggr_rem_pseudo_tx_group() removes one ring at a time
1838 		 * and if there is still traffic going on, then there
1839 		 * is the possibility of aggr_find_tx_ring() returning a
1840 		 * removed ring for transmission. Once the port has been
1841 		 * detached, that port will not be used and
1842 		 * aggr_find_tx_ring() will not return any rings
1843 		 * belonging to it.
1844 		 */
1845 		for (i = 0; i < grp->lg_rx_group_count; i++)
1846 			aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[i]);
1847 
1848 		/* remove port from group */
1849 		rc = aggr_grp_rem_port(grp, port, &mac_addr_changed,
1850 		    &link_state_changed);
1851 		ASSERT(rc == 0);
1852 		mac_addr_update = mac_addr_update || mac_addr_changed;
1853 		link_state_update = link_state_update || link_state_changed;
1854 	}
1855 
1856 bail:
1857 	if (mac_addr_update)
1858 		mac_unicst_update(grp->lg_mh, grp->lg_addr);
1859 	if (link_state_update)
1860 		mac_link_update(grp->lg_mh, grp->lg_link_state);
1861 
1862 	mac_perim_exit(mph);
1863 	AGGR_GRP_REFRELE(grp);
1864 
1865 	return (rc);
1866 }
1867 
1868 int
1869 aggr_grp_delete(datalink_id_t linkid, cred_t *cred)
1870 {
1871 	aggr_grp_t *grp = NULL;
1872 	aggr_port_t *port, *cport;
1873 	datalink_id_t tmpid;
1874 	mod_hash_val_t val;
1875 	mac_perim_handle_t mph, pmph;
1876 	int err;
1877 	kt_did_t tid = 0;
1878 
1879 	rw_enter(&aggr_grp_lock, RW_WRITER);
1880 
1881 	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1882 	    (mod_hash_val_t *)&grp) != 0) {
1883 		rw_exit(&aggr_grp_lock);
1884 		return (ENOENT);
1885 	}
1886 
1887 	/*
1888 	 * Note that dls_devnet_destroy() must be called before lg_lock is
1889 	 * held. Otherwise, it will deadlock if another thread is in
1890 	 * aggr_m_stat() and thus has a kstat_hold() on the kstats that
1891 	 * dls_devnet_destroy() needs to delete.
1892 	 */
1893 	if ((err = dls_devnet_destroy(grp->lg_mh, &tmpid, B_TRUE)) != 0) {
1894 		rw_exit(&aggr_grp_lock);
1895 		return (err);
1896 	}
1897 	ASSERT(linkid == tmpid);
1898 
1899 	/*
1900 	 * Unregister from the MAC service module. Since this can
1901 	 * fail if a client hasn't closed the MAC port, we gracefully
1902 	 * fail the operation.
1903 	 */
1904 	if ((err = mac_disable(grp->lg_mh)) != 0) {
1905 		(void) dls_devnet_create(grp->lg_mh, linkid, crgetzoneid(cred));
1906 		rw_exit(&aggr_grp_lock);
1907 		return (err);
1908 	}
1909 	(void) mod_hash_remove(aggr_grp_hash, GRP_HASH_KEY(linkid), &val);
1910 	ASSERT(grp == (aggr_grp_t *)val);
1911 
1912 	ASSERT(aggr_grp_cnt > 0);
1913 	aggr_grp_cnt--;
1914 	rw_exit(&aggr_grp_lock);
1915 
1916 	/*
1917 	 * Inform the lacp_rx thread to exit.
1918 	 */
1919 	mutex_enter(&grp->lg_lacp_lock);
1920 	grp->lg_lacp_done = B_TRUE;
1921 	cv_signal(&grp->lg_lacp_cv);
1922 	while (grp->lg_lacp_rx_thread != NULL)
1923 		cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock);
1924 	mutex_exit(&grp->lg_lacp_lock);
1925 	/*
1926 	 * Inform the tx_notify_thread to exit.
1927 	 */
1928 	mutex_enter(&grp->lg_tx_flowctl_lock);
1929 	if (grp->lg_tx_notify_thread != NULL) {
1930 		tid = grp->lg_tx_notify_thread->t_did;
1931 		grp->lg_tx_notify_done = B_TRUE;
1932 		cv_signal(&grp->lg_tx_flowctl_cv);
1933 	}
1934 	mutex_exit(&grp->lg_tx_flowctl_lock);
1935 	if (tid != 0)
1936 		thread_join(tid);
1937 
1938 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1939 
1940 	grp->lg_closing = B_TRUE;
1941 	/* detach and free MAC ports associated with group */
1942 	port = grp->lg_ports;
1943 	while (port != NULL) {
1944 		cport = port->lp_next;
1945 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
1946 		if (grp->lg_started)
1947 			aggr_port_stop(port);
1948 		(void) aggr_grp_detach_port(grp, port);
1949 		mac_perim_exit(pmph);
1950 		aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1951 		for (uint_t i = 0; i < grp->lg_rx_group_count; i++)
1952 			aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[i]);
1953 		aggr_port_delete(port);
1954 		port = cport;
1955 	}
1956 
1957 	mac_perim_exit(mph);
1958 
1959 	kmem_free(grp->lg_tx_blocked_rings,
1960 	    (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP));
1961 	/*
1962 	 * Wait for the port's lacp timer thread and its notification callback
1963 	 * to exit before calling mac_unregister() since both needs to access
1964 	 * the mac perimeter of the grp.
1965 	 */
1966 	aggr_grp_port_wait(grp);
1967 
1968 	VERIFY(mac_unregister(grp->lg_mh) == 0);
1969 	grp->lg_mh = NULL;
1970 
1971 	for (uint_t i = 0; i < MAX_GROUPS_PER_PORT; i++) {
1972 		list_destroy(&(grp->lg_rx_groups[i].arg_vlans));
1973 	}
1974 
1975 	AGGR_GRP_REFRELE(grp);
1976 	return (0);
1977 }
1978 
1979 void
1980 aggr_grp_free(aggr_grp_t *grp)
1981 {
1982 	ASSERT(grp->lg_refs == 0);
1983 	ASSERT(grp->lg_port_ref == 0);
1984 	if (grp->lg_key > AGGR_MAX_KEY) {
1985 		id_free(key_ids, grp->lg_key);
1986 		grp->lg_key = 0;
1987 	}
1988 	kmem_cache_free(aggr_grp_cache, grp);
1989 }
1990 
1991 int
1992 aggr_grp_info(datalink_id_t linkid, void *fn_arg,
1993     aggr_grp_info_new_grp_fn_t new_grp_fn,
1994     aggr_grp_info_new_port_fn_t new_port_fn, cred_t *cred)
1995 {
1996 	aggr_grp_t	*grp;
1997 	aggr_port_t	*port;
1998 	mac_perim_handle_t mph, pmph;
1999 	int		rc = 0;
2000 
2001 	/*
2002 	 * Make sure that the aggregation link is visible from the caller's
2003 	 * zone.
2004 	 */
2005 	if (!dls_devnet_islinkvisible(linkid, crgetzoneid(cred)))
2006 		return (ENOENT);
2007 
2008 	rw_enter(&aggr_grp_lock, RW_READER);
2009 
2010 	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
2011 	    (mod_hash_val_t *)&grp) != 0) {
2012 		rw_exit(&aggr_grp_lock);
2013 		return (ENOENT);
2014 	}
2015 	AGGR_GRP_REFHOLD(grp);
2016 
2017 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2018 	rw_exit(&aggr_grp_lock);
2019 
2020 	rc = new_grp_fn(fn_arg, grp->lg_linkid,
2021 	    (grp->lg_key > AGGR_MAX_KEY) ? 0 : grp->lg_key, grp->lg_addr,
2022 	    grp->lg_addr_fixed, grp->lg_force, grp->lg_tx_policy,
2023 	    grp->lg_nports, grp->lg_lacp_mode, grp->aggr.PeriodicTimer);
2024 
2025 	if (rc != 0)
2026 		goto bail;
2027 
2028 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2029 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
2030 		rc = new_port_fn(fn_arg, port->lp_linkid, port->lp_addr,
2031 		    port->lp_state, &port->lp_lacp.ActorOperPortState);
2032 		mac_perim_exit(pmph);
2033 
2034 		if (rc != 0)
2035 			goto bail;
2036 	}
2037 
2038 bail:
2039 	mac_perim_exit(mph);
2040 	AGGR_GRP_REFRELE(grp);
2041 	return (rc);
2042 }
2043 
2044 /*ARGSUSED*/
2045 static void
2046 aggr_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
2047 {
2048 	miocnak(q, mp, 0, ENOTSUP);
2049 }
2050 
2051 static int
2052 aggr_grp_stat(aggr_grp_t *grp, uint_t stat, uint64_t *val)
2053 {
2054 	aggr_port_t	*port;
2055 	uint_t		stat_index;
2056 
2057 	ASSERT(MUTEX_HELD(&grp->lg_stat_lock));
2058 
2059 	/* We only aggregate counter statistics. */
2060 	if (IS_MAC_STAT(stat) && !MAC_STAT_ISACOUNTER(stat) ||
2061 	    IS_MACTYPE_STAT(stat) && !ETHER_STAT_ISACOUNTER(stat)) {
2062 		return (ENOTSUP);
2063 	}
2064 
2065 	/*
2066 	 * Counter statistics for a group are computed by aggregating the
2067 	 * counters of the members MACs while they were aggregated, plus
2068 	 * the residual counter of the group itself, which is updated each
2069 	 * time a MAC is removed from the group.
2070 	 */
2071 	*val = 0;
2072 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2073 		/* actual port statistic */
2074 		*val += aggr_port_stat(port, stat);
2075 		/*
2076 		 * minus the port stat when it was added, plus any residual
2077 		 * amount for the group.
2078 		 */
2079 		if (IS_MAC_STAT(stat)) {
2080 			stat_index = stat - MAC_STAT_MIN;
2081 			*val -= port->lp_stat[stat_index];
2082 			*val += grp->lg_stat[stat_index];
2083 		} else if (IS_MACTYPE_STAT(stat)) {
2084 			stat_index = stat - MACTYPE_STAT_MIN;
2085 			*val -= port->lp_ether_stat[stat_index];
2086 			*val += grp->lg_ether_stat[stat_index];
2087 		}
2088 	}
2089 	return (0);
2090 }
2091 
2092 int
2093 aggr_rx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
2094 {
2095 	aggr_pseudo_rx_ring_t   *rx_ring = (aggr_pseudo_rx_ring_t *)rdriver;
2096 
2097 	if (rx_ring->arr_hw_rh != NULL) {
2098 		*val = mac_pseudo_rx_ring_stat_get(rx_ring->arr_hw_rh, stat);
2099 	} else {
2100 		aggr_port_t	*port = rx_ring->arr_port;
2101 
2102 		*val = mac_stat_get(port->lp_mh, stat);
2103 
2104 	}
2105 	return (0);
2106 }
2107 
2108 int
2109 aggr_tx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
2110 {
2111 	aggr_pseudo_tx_ring_t   *tx_ring = (aggr_pseudo_tx_ring_t *)rdriver;
2112 
2113 	if (tx_ring->atr_hw_rh != NULL) {
2114 		*val = mac_pseudo_tx_ring_stat_get(tx_ring->atr_hw_rh, stat);
2115 	} else {
2116 		aggr_port_t	*port = tx_ring->atr_port;
2117 
2118 		*val = mac_stat_get(port->lp_mh, stat);
2119 	}
2120 	return (0);
2121 }
2122 
2123 static int
2124 aggr_m_stat(void *arg, uint_t stat, uint64_t *val)
2125 {
2126 	aggr_grp_t		*grp = arg;
2127 	int			rval = 0;
2128 
2129 	mutex_enter(&grp->lg_stat_lock);
2130 
2131 	switch (stat) {
2132 	case MAC_STAT_IFSPEED:
2133 		*val = grp->lg_ifspeed;
2134 		break;
2135 
2136 	case ETHER_STAT_LINK_DUPLEX:
2137 		*val = grp->lg_link_duplex;
2138 		break;
2139 
2140 	default:
2141 		/*
2142 		 * For all other statistics, we return the aggregated stat
2143 		 * from the underlying ports.  aggr_grp_stat() will set
2144 		 * rval appropriately if the statistic isn't a counter.
2145 		 */
2146 		rval = aggr_grp_stat(grp, stat, val);
2147 	}
2148 
2149 	mutex_exit(&grp->lg_stat_lock);
2150 	return (rval);
2151 }
2152 
2153 static int
2154 aggr_m_start(void *arg)
2155 {
2156 	aggr_grp_t *grp = arg;
2157 	aggr_port_t *port;
2158 	mac_perim_handle_t mph, pmph;
2159 
2160 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2161 
2162 	/*
2163 	 * Attempts to start all configured members of the group.
2164 	 * Group members will be attached when their link-up notification
2165 	 * is received.
2166 	 */
2167 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2168 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
2169 		if (aggr_port_start(port) != 0) {
2170 			mac_perim_exit(pmph);
2171 			continue;
2172 		}
2173 
2174 		/*
2175 		 * Turn on the promiscuous mode if it is required to receive
2176 		 * the non-primary address over a port, or the promiscous
2177 		 * mode is enabled over the aggr.
2178 		 */
2179 		if (grp->lg_promisc || port->lp_prom_addr != NULL) {
2180 			if (aggr_port_promisc(port, B_TRUE) != 0)
2181 				aggr_port_stop(port);
2182 		}
2183 		mac_perim_exit(pmph);
2184 	}
2185 
2186 	grp->lg_started = B_TRUE;
2187 
2188 	mac_perim_exit(mph);
2189 	return (0);
2190 }
2191 
2192 static void
2193 aggr_m_stop(void *arg)
2194 {
2195 	aggr_grp_t *grp = arg;
2196 	aggr_port_t *port;
2197 	mac_perim_handle_t mph, pmph;
2198 
2199 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2200 
2201 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2202 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
2203 
2204 		/* reset port promiscuous mode */
2205 		(void) aggr_port_promisc(port, B_FALSE);
2206 
2207 		aggr_port_stop(port);
2208 		mac_perim_exit(pmph);
2209 	}
2210 
2211 	grp->lg_started = B_FALSE;
2212 	mac_perim_exit(mph);
2213 }
2214 
2215 static int
2216 aggr_m_promisc(void *arg, boolean_t on)
2217 {
2218 	aggr_grp_t *grp = arg;
2219 	aggr_port_t *port;
2220 	boolean_t link_state_changed = B_FALSE;
2221 	mac_perim_handle_t mph, pmph;
2222 
2223 	AGGR_GRP_REFHOLD(grp);
2224 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2225 
2226 	ASSERT(!grp->lg_closing);
2227 
2228 	if (on == grp->lg_promisc)
2229 		goto bail;
2230 
2231 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2232 		int	err = 0;
2233 
2234 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
2235 		AGGR_PORT_REFHOLD(port);
2236 		if (!on && (port->lp_prom_addr == NULL))
2237 			err = aggr_port_promisc(port, B_FALSE);
2238 		else if (on && port->lp_started)
2239 			err = aggr_port_promisc(port, B_TRUE);
2240 
2241 		if (err != 0) {
2242 			if (aggr_grp_detach_port(grp, port))
2243 				link_state_changed = B_TRUE;
2244 		} else {
2245 			/*
2246 			 * If a port was detached because of a previous
2247 			 * failure changing the promiscuity, the port
2248 			 * is reattached when it successfully changes
2249 			 * the promiscuity now, and this might cause
2250 			 * the link state of the aggregation to change.
2251 			 */
2252 			if (aggr_grp_attach_port(grp, port))
2253 				link_state_changed = B_TRUE;
2254 		}
2255 		mac_perim_exit(pmph);
2256 		AGGR_PORT_REFRELE(port);
2257 	}
2258 
2259 	grp->lg_promisc = on;
2260 
2261 	if (link_state_changed)
2262 		mac_link_update(grp->lg_mh, grp->lg_link_state);
2263 
2264 bail:
2265 	mac_perim_exit(mph);
2266 	AGGR_GRP_REFRELE(grp);
2267 
2268 	return (0);
2269 }
2270 
2271 static void
2272 aggr_grp_port_rename(const char *new_name, void *arg)
2273 {
2274 	/*
2275 	 * aggr port's mac client name is the format of "aggr link name" plus
2276 	 * AGGR_PORT_NAME_DELIMIT plus "underneath link name".
2277 	 */
2278 	int aggr_len, link_len, clnt_name_len, i;
2279 	char *str_end, *str_st, *str_del;
2280 	char aggr_name[MAXNAMELEN];
2281 	char link_name[MAXNAMELEN];
2282 	char *clnt_name;
2283 	aggr_grp_t *aggr_grp = arg;
2284 	aggr_port_t *aggr_port = aggr_grp->lg_ports;
2285 
2286 	for (i = 0; i < aggr_grp->lg_nports; i++) {
2287 		clnt_name = mac_client_name(aggr_port->lp_mch);
2288 		clnt_name_len = strlen(clnt_name);
2289 		str_st = clnt_name;
2290 		str_end = &(clnt_name[clnt_name_len]);
2291 		str_del = strchr(str_st, AGGR_PORT_NAME_DELIMIT);
2292 		ASSERT(str_del != NULL);
2293 		aggr_len = (intptr_t)((uintptr_t)str_del - (uintptr_t)str_st);
2294 		link_len = (intptr_t)((uintptr_t)str_end - (uintptr_t)str_del);
2295 		bzero(aggr_name, MAXNAMELEN);
2296 		bzero(link_name, MAXNAMELEN);
2297 		bcopy(clnt_name, aggr_name, aggr_len);
2298 		bcopy(str_del, link_name, link_len + 1);
2299 		bzero(clnt_name, MAXNAMELEN);
2300 		(void) snprintf(clnt_name, MAXNAMELEN, "%s%s", new_name,
2301 		    link_name);
2302 
2303 		(void) mac_rename_primary(aggr_port->lp_mh, NULL);
2304 		aggr_port = aggr_port->lp_next;
2305 	}
2306 }
2307 
2308 /*
2309  * Initialize the capabilities that are advertised for the group
2310  * according to the capabilities of the constituent ports.
2311  */
2312 static boolean_t
2313 aggr_m_capab_get(void *arg, mac_capab_t cap, void *cap_data)
2314 {
2315 	aggr_grp_t *grp = arg;
2316 
2317 	switch (cap) {
2318 	case MAC_CAPAB_HCKSUM: {
2319 		uint32_t *hcksum_txflags = cap_data;
2320 		*hcksum_txflags = grp->lg_hcksum_txflags;
2321 		break;
2322 	}
2323 	case MAC_CAPAB_LSO: {
2324 		mac_capab_lso_t *cap_lso = cap_data;
2325 
2326 		if (grp->lg_lso) {
2327 			*cap_lso = grp->lg_cap_lso;
2328 			break;
2329 		} else {
2330 			return (B_FALSE);
2331 		}
2332 	}
2333 	case MAC_CAPAB_NO_NATIVEVLAN:
2334 		return (!grp->lg_vlan);
2335 	case MAC_CAPAB_NO_ZCOPY:
2336 		return (!grp->lg_zcopy);
2337 	case MAC_CAPAB_RINGS: {
2338 		mac_capab_rings_t *cap_rings = cap_data;
2339 		uint_t ring_cnt = 0;
2340 
2341 		for (uint_t i = 0; i < grp->lg_rx_group_count; i++)
2342 			ring_cnt += grp->lg_rx_groups[i].arg_ring_cnt;
2343 
2344 		if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
2345 			cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2346 			cap_rings->mr_rnum = ring_cnt;
2347 			cap_rings->mr_gnum = grp->lg_rx_group_count;
2348 			cap_rings->mr_gaddring = NULL;
2349 			cap_rings->mr_gremring = NULL;
2350 		} else {
2351 			cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2352 			cap_rings->mr_rnum = grp->lg_tx_group.atg_ring_cnt;
2353 			cap_rings->mr_gnum = 0;
2354 		}
2355 		cap_rings->mr_rget = aggr_fill_ring;
2356 		cap_rings->mr_gget = aggr_fill_group;
2357 		break;
2358 	}
2359 	case MAC_CAPAB_AGGR:
2360 	{
2361 		mac_capab_aggr_t *aggr_cap;
2362 
2363 		if (cap_data != NULL) {
2364 			aggr_cap = cap_data;
2365 			aggr_cap->mca_rename_fn = aggr_grp_port_rename;
2366 			aggr_cap->mca_unicst = aggr_m_unicst;
2367 			aggr_cap->mca_find_tx_ring_fn = aggr_find_tx_ring;
2368 			aggr_cap->mca_arg = arg;
2369 		}
2370 		return (B_TRUE);
2371 	}
2372 	default:
2373 		return (B_FALSE);
2374 	}
2375 	return (B_TRUE);
2376 }
2377 
2378 /*
2379  * Callback function for MAC layer to register groups.
2380  */
2381 static void
2382 aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index,
2383     mac_group_info_t *infop, mac_group_handle_t gh)
2384 {
2385 	aggr_grp_t *grp = arg;
2386 
2387 	if (rtype == MAC_RING_TYPE_RX) {
2388 		aggr_pseudo_rx_group_t *rx_group = &grp->lg_rx_groups[index];
2389 
2390 		rx_group->arg_gh = gh;
2391 		rx_group->arg_grp = grp;
2392 
2393 		infop->mgi_driver = (mac_group_driver_t)rx_group;
2394 		infop->mgi_start = NULL;
2395 		infop->mgi_stop = NULL;
2396 		infop->mgi_addmac = aggr_addmac;
2397 		infop->mgi_remmac = aggr_remmac;
2398 		infop->mgi_count = rx_group->arg_ring_cnt;
2399 
2400 		/*
2401 		 * Always set the HW VLAN callbacks. They are smart
2402 		 * enough to know when a port has HW VLAN filters to
2403 		 * program and when it doesn't.
2404 		 */
2405 		infop->mgi_addvlan = aggr_addvlan;
2406 		infop->mgi_remvlan = aggr_remvlan;
2407 	} else {
2408 		aggr_pseudo_tx_group_t *tx_group = &grp->lg_tx_group;
2409 
2410 		ASSERT3S(index, ==, 0);
2411 		tx_group->atg_gh = gh;
2412 	}
2413 }
2414 
2415 /*
2416  * Callback funtion for MAC layer to register all rings.
2417  */
2418 static void
2419 aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index,
2420     const int index, mac_ring_info_t *infop, mac_ring_handle_t rh)
2421 {
2422 	aggr_grp_t	*grp = arg;
2423 
2424 	switch (rtype) {
2425 	case MAC_RING_TYPE_RX: {
2426 		aggr_pseudo_rx_group_t	*rx_group;
2427 		aggr_pseudo_rx_ring_t	*rx_ring;
2428 		mac_intr_t		aggr_mac_intr;
2429 
2430 		rx_group = &grp->lg_rx_groups[rg_index];
2431 		ASSERT3S(index, >=, 0);
2432 		ASSERT3S(index, <, rx_group->arg_ring_cnt);
2433 		rx_ring = rx_group->arg_rings + index;
2434 		rx_ring->arr_rh = rh;
2435 
2436 		/*
2437 		 * Entrypoint to enable interrupt (disable poll) and
2438 		 * disable interrupt (enable poll).
2439 		 */
2440 		aggr_mac_intr.mi_handle = (mac_intr_handle_t)rx_ring;
2441 		aggr_mac_intr.mi_enable = aggr_pseudo_enable_intr;
2442 		aggr_mac_intr.mi_disable = aggr_pseudo_disable_intr;
2443 		aggr_mac_intr.mi_ddi_handle = NULL;
2444 
2445 		infop->mri_driver = (mac_ring_driver_t)rx_ring;
2446 		infop->mri_start = aggr_pseudo_start_rx_ring;
2447 		infop->mri_stop = aggr_pseudo_stop_rx_ring;
2448 
2449 		infop->mri_intr = aggr_mac_intr;
2450 		infop->mri_poll = aggr_rx_poll;
2451 
2452 		infop->mri_stat = aggr_rx_ring_stat;
2453 		break;
2454 	}
2455 	case MAC_RING_TYPE_TX: {
2456 		aggr_pseudo_tx_group_t	*tx_group = &grp->lg_tx_group;
2457 		aggr_pseudo_tx_ring_t	*tx_ring;
2458 
2459 		ASSERT(rg_index == -1);
2460 		ASSERT(index < tx_group->atg_ring_cnt);
2461 
2462 		tx_ring = &tx_group->atg_rings[index];
2463 		tx_ring->atr_rh = rh;
2464 
2465 		infop->mri_driver = (mac_ring_driver_t)tx_ring;
2466 		infop->mri_start = NULL;
2467 		infop->mri_stop = NULL;
2468 		infop->mri_tx = aggr_ring_tx;
2469 		infop->mri_stat = aggr_tx_ring_stat;
2470 		/*
2471 		 * Use the hw TX ring handle to find if the ring needs
2472 		 * serialization or not. For NICs that do not expose
2473 		 * Tx rings, atr_hw_rh will be NULL.
2474 		 */
2475 		if (tx_ring->atr_hw_rh != NULL) {
2476 			infop->mri_flags =
2477 			    mac_hwring_getinfo(tx_ring->atr_hw_rh);
2478 		}
2479 		break;
2480 	}
2481 	default:
2482 		break;
2483 	}
2484 }
2485 
2486 static mblk_t *
2487 aggr_rx_poll(void *arg, int bytes_to_pickup)
2488 {
2489 	aggr_pseudo_rx_ring_t *rr_ring = arg;
2490 	aggr_port_t *port = rr_ring->arr_port;
2491 	aggr_grp_t *grp = port->lp_grp;
2492 	mblk_t *mp_chain, *mp, **mpp;
2493 
2494 	mp_chain = mac_hwring_poll(rr_ring->arr_hw_rh, bytes_to_pickup);
2495 
2496 	if (grp->lg_lacp_mode == AGGR_LACP_OFF)
2497 		return (mp_chain);
2498 
2499 	mpp = &mp_chain;
2500 	while ((mp = *mpp) != NULL) {
2501 		if (MBLKL(mp) >= sizeof (struct ether_header)) {
2502 			struct ether_header *ehp;
2503 
2504 			ehp = (struct ether_header *)mp->b_rptr;
2505 			if (ntohs(ehp->ether_type) == ETHERTYPE_SLOW) {
2506 				*mpp = mp->b_next;
2507 				mp->b_next = NULL;
2508 				aggr_recv_lacp(port,
2509 				    (mac_resource_handle_t)rr_ring, mp);
2510 				continue;
2511 			}
2512 		}
2513 
2514 		if (!port->lp_collector_enabled) {
2515 			*mpp = mp->b_next;
2516 			mp->b_next = NULL;
2517 			freemsg(mp);
2518 			continue;
2519 		}
2520 		mpp = &mp->b_next;
2521 	}
2522 	return (mp_chain);
2523 }
2524 
2525 static int
2526 aggr_addmac(void *arg, const uint8_t *mac_addr)
2527 {
2528 	aggr_pseudo_rx_group_t	*rx_group = (aggr_pseudo_rx_group_t *)arg;
2529 	aggr_unicst_addr_t	*addr, **pprev;
2530 	aggr_grp_t		*grp = rx_group->arg_grp;
2531 	aggr_port_t		*port, *p;
2532 	mac_perim_handle_t	mph;
2533 	int			err = 0;
2534 	uint_t			idx = rx_group->arg_index;
2535 
2536 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2537 
2538 	if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
2539 		mac_perim_exit(mph);
2540 		return (0);
2541 	}
2542 
2543 	/*
2544 	 * Insert this mac address into the list of mac addresses owned by
2545 	 * the aggregation pseudo group.
2546 	 */
2547 	pprev = &rx_group->arg_macaddr;
2548 	while ((addr = *pprev) != NULL) {
2549 		if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) == 0) {
2550 			mac_perim_exit(mph);
2551 			return (EEXIST);
2552 		}
2553 		pprev = &addr->aua_next;
2554 	}
2555 	addr = kmem_alloc(sizeof (aggr_unicst_addr_t), KM_SLEEP);
2556 	bcopy(mac_addr, addr->aua_addr, ETHERADDRL);
2557 	addr->aua_next = NULL;
2558 	*pprev = addr;
2559 
2560 	for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2561 		if ((err = aggr_port_addmac(port, idx, mac_addr)) != 0)
2562 			break;
2563 
2564 	if (err != 0) {
2565 		for (p = grp->lg_ports; p != port; p = p->lp_next)
2566 			aggr_port_remmac(p, idx, mac_addr);
2567 
2568 		*pprev = NULL;
2569 		kmem_free(addr, sizeof (aggr_unicst_addr_t));
2570 	}
2571 
2572 	mac_perim_exit(mph);
2573 	return (err);
2574 }
2575 
2576 static int
2577 aggr_remmac(void *arg, const uint8_t *mac_addr)
2578 {
2579 	aggr_pseudo_rx_group_t	*rx_group = (aggr_pseudo_rx_group_t *)arg;
2580 	aggr_unicst_addr_t	*addr, **pprev;
2581 	aggr_grp_t		*grp = rx_group->arg_grp;
2582 	aggr_port_t		*port;
2583 	mac_perim_handle_t	mph;
2584 	int			err = 0;
2585 
2586 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2587 
2588 	if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
2589 		mac_perim_exit(mph);
2590 		return (0);
2591 	}
2592 
2593 	/*
2594 	 * Insert this mac address into the list of mac addresses owned by
2595 	 * the aggregation pseudo group.
2596 	 */
2597 	pprev = &rx_group->arg_macaddr;
2598 	while ((addr = *pprev) != NULL) {
2599 		if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) != 0) {
2600 			pprev = &addr->aua_next;
2601 			continue;
2602 		}
2603 		break;
2604 	}
2605 	if (addr == NULL) {
2606 		mac_perim_exit(mph);
2607 		return (EINVAL);
2608 	}
2609 
2610 	for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2611 		aggr_port_remmac(port, rx_group->arg_index, mac_addr);
2612 
2613 	*pprev = addr->aua_next;
2614 	kmem_free(addr, sizeof (aggr_unicst_addr_t));
2615 
2616 	mac_perim_exit(mph);
2617 	return (err);
2618 }
2619 
2620 /*
2621  * Search for VID in the Rx group's list and return a pointer if
2622  * found. Otherwise return NULL.
2623  */
2624 static aggr_vlan_t *
2625 aggr_find_vlan(aggr_pseudo_rx_group_t *rx_group, uint16_t vid)
2626 {
2627 	ASSERT(MAC_PERIM_HELD(rx_group->arg_grp->lg_mh));
2628 	for (aggr_vlan_t *avp = list_head(&rx_group->arg_vlans); avp != NULL;
2629 	    avp = list_next(&rx_group->arg_vlans, avp)) {
2630 		if (avp->av_vid == vid)
2631 			return (avp);
2632 	}
2633 
2634 	return (NULL);
2635 }
2636 
2637 /*
2638  * Accept traffic on the specified VID.
2639  *
2640  * Persist VLAN state in the aggr so that ports added later will
2641  * receive the correct filters. In the future it would be nice to
2642  * allow aggr to iterate its clients instead of duplicating state.
2643  */
2644 static int
2645 aggr_addvlan(mac_group_driver_t gdriver, uint16_t vid)
2646 {
2647 	aggr_pseudo_rx_group_t	*rx_group = (aggr_pseudo_rx_group_t *)gdriver;
2648 	aggr_grp_t		*aggr = rx_group->arg_grp;
2649 	aggr_port_t		*port, *p;
2650 	mac_perim_handle_t	mph;
2651 	int			err = 0;
2652 	aggr_vlan_t		*avp = NULL;
2653 	uint_t			idx = rx_group->arg_index;
2654 
2655 	mac_perim_enter_by_mh(aggr->lg_mh, &mph);
2656 
2657 	if (vid == MAC_VLAN_UNTAGGED) {
2658 		/*
2659 		 * Aggr is both a MAC provider and MAC client. As a
2660 		 * MAC provider it is passed MAC_VLAN_UNTAGGED by its
2661 		 * client. As a client itself, it should pass
2662 		 * VLAN_ID_NONE to its ports.
2663 		 */
2664 		vid = VLAN_ID_NONE;
2665 		rx_group->arg_untagged++;
2666 		goto update_ports;
2667 	}
2668 
2669 	avp = aggr_find_vlan(rx_group, vid);
2670 
2671 	if (avp != NULL) {
2672 		avp->av_refs++;
2673 		mac_perim_exit(mph);
2674 		return (0);
2675 	}
2676 
2677 	avp = kmem_zalloc(sizeof (aggr_vlan_t), KM_SLEEP);
2678 	avp->av_vid = vid;
2679 	avp->av_refs = 1;
2680 
2681 update_ports:
2682 	for (port = aggr->lg_ports; port != NULL; port = port->lp_next)
2683 		if ((err = aggr_port_addvlan(port, idx, vid)) != 0)
2684 			break;
2685 
2686 	if (err != 0) {
2687 		/*
2688 		 * If any of these calls fail then we are in a
2689 		 * situation where the ports have different HW state.
2690 		 * There's no reasonable action the MAC client can
2691 		 * take in this scenario to rectify the situation.
2692 		 */
2693 		for (p = aggr->lg_ports; p != port; p = p->lp_next) {
2694 			int err2;
2695 
2696 			if ((err2 = aggr_port_remvlan(p, idx, vid)) != 0) {
2697 				cmn_err(CE_WARN, "Failed to remove VLAN %u"
2698 				    " from port %s: errno %d.", vid,
2699 				    mac_client_name(p->lp_mch), err2);
2700 			}
2701 
2702 		}
2703 
2704 		if (vid == VLAN_ID_NONE)
2705 			rx_group->arg_untagged--;
2706 
2707 		if (avp != NULL) {
2708 			kmem_free(avp, sizeof (aggr_vlan_t));
2709 			avp = NULL;
2710 		}
2711 	}
2712 
2713 	if (avp != NULL)
2714 		list_insert_tail(&rx_group->arg_vlans, avp);
2715 
2716 done:
2717 	mac_perim_exit(mph);
2718 	return (err);
2719 }
2720 
2721 /*
2722  * Stop accepting traffic on this VLAN if it's the last use of this VLAN.
2723  */
2724 static int
2725 aggr_remvlan(mac_group_driver_t gdriver, uint16_t vid)
2726 {
2727 	aggr_pseudo_rx_group_t	*rx_group = (aggr_pseudo_rx_group_t *)gdriver;
2728 	aggr_grp_t		*aggr = rx_group->arg_grp;
2729 	aggr_port_t		*port, *p;
2730 	mac_perim_handle_t	mph;
2731 	int			err = 0;
2732 	aggr_vlan_t		*avp = NULL;
2733 	uint_t			idx = rx_group->arg_index;
2734 
2735 	mac_perim_enter_by_mh(aggr->lg_mh, &mph);
2736 
2737 	/*
2738 	 * See the comment in aggr_addvlan().
2739 	 */
2740 	if (vid == MAC_VLAN_UNTAGGED) {
2741 		vid = VLAN_ID_NONE;
2742 		rx_group->arg_untagged--;
2743 
2744 		if (rx_group->arg_untagged > 0)
2745 			goto done;
2746 
2747 		goto update_ports;
2748 	}
2749 
2750 	avp = aggr_find_vlan(rx_group, vid);
2751 
2752 	if (avp == NULL) {
2753 		err = ENOENT;
2754 		goto done;
2755 	}
2756 
2757 	avp->av_refs--;
2758 
2759 	if (avp->av_refs > 0)
2760 		goto done;
2761 
2762 update_ports:
2763 	for (port = aggr->lg_ports; port != NULL; port = port->lp_next)
2764 		if ((err = aggr_port_remvlan(port, idx, vid)) != 0)
2765 			break;
2766 
2767 	/*
2768 	 * See the comment in aggr_addvlan() for justification of the
2769 	 * use of VERIFY here.
2770 	 */
2771 	if (err != 0) {
2772 		for (p = aggr->lg_ports; p != port; p = p->lp_next) {
2773 			int err2;
2774 
2775 			if ((err2 = aggr_port_addvlan(p, idx, vid)) != 0) {
2776 				cmn_err(CE_WARN, "Failed to add VLAN %u"
2777 				    " to port %s: errno %d.", vid,
2778 				    mac_client_name(p->lp_mch), err2);
2779 			}
2780 		}
2781 
2782 		if (avp != NULL)
2783 			avp->av_refs++;
2784 
2785 		if (vid == VLAN_ID_NONE)
2786 			rx_group->arg_untagged++;
2787 
2788 		goto done;
2789 	}
2790 
2791 	if (err == 0 && avp != NULL) {
2792 		VERIFY3U(avp->av_refs, ==, 0);
2793 		list_remove(&rx_group->arg_vlans, avp);
2794 		kmem_free(avp, sizeof (aggr_vlan_t));
2795 	}
2796 
2797 done:
2798 	mac_perim_exit(mph);
2799 	return (err);
2800 }
2801 
2802 /*
2803  * Add or remove the multicast addresses that are defined for the group
2804  * to or from the specified port.
2805  *
2806  * Note that aggr_grp_multicst_port(..., B_TRUE) is called when the port
2807  * is started and attached, and aggr_grp_multicst_port(..., B_FALSE) is
2808  * called when the port is either stopped or detached.
2809  */
2810 void
2811 aggr_grp_multicst_port(aggr_port_t *port, boolean_t add)
2812 {
2813 	aggr_grp_t *grp = port->lp_grp;
2814 
2815 	ASSERT(MAC_PERIM_HELD(port->lp_mh));
2816 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
2817 
2818 	if (!port->lp_started || port->lp_state != AGGR_PORT_STATE_ATTACHED)
2819 		return;
2820 
2821 	mac_multicast_refresh(grp->lg_mh, aggr_port_multicst, port, add);
2822 }
2823 
2824 static int
2825 aggr_m_multicst(void *arg, boolean_t add, const uint8_t *addrp)
2826 {
2827 	aggr_grp_t *grp = arg;
2828 	aggr_port_t *port = NULL, *errport = NULL;
2829 	mac_perim_handle_t mph;
2830 	int err = 0;
2831 
2832 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2833 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2834 		if (port->lp_state != AGGR_PORT_STATE_ATTACHED ||
2835 		    !port->lp_started) {
2836 			continue;
2837 		}
2838 		err = aggr_port_multicst(port, add, addrp);
2839 		if (err != 0) {
2840 			errport = port;
2841 			break;
2842 		}
2843 	}
2844 
2845 	/*
2846 	 * At least one port caused error return and this error is returned to
2847 	 * mac, eventually a NAK would be sent upwards.
2848 	 * Some ports have this multicast address listed now, and some don't.
2849 	 * Treat this error as a whole aggr failure not individual port failure.
2850 	 * Therefore remove this multicast address from other ports.
2851 	 */
2852 	if ((err != 0) && add) {
2853 		for (port = grp->lg_ports; port != errport;
2854 		    port = port->lp_next) {
2855 			if (port->lp_state != AGGR_PORT_STATE_ATTACHED ||
2856 			    !port->lp_started) {
2857 				continue;
2858 			}
2859 			(void) aggr_port_multicst(port, B_FALSE, addrp);
2860 		}
2861 	}
2862 	mac_perim_exit(mph);
2863 	return (err);
2864 }
2865 
2866 static int
2867 aggr_m_unicst(void *arg, const uint8_t *macaddr)
2868 {
2869 	aggr_grp_t *grp = arg;
2870 	mac_perim_handle_t mph;
2871 	int err;
2872 
2873 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2874 	err = aggr_grp_modify_common(grp, AGGR_MODIFY_MAC, 0, B_TRUE, macaddr,
2875 	    0, 0);
2876 	mac_perim_exit(mph);
2877 	return (err);
2878 }
2879 
2880 /*
2881  * Initialize the capabilities that are advertised for the group
2882  * according to the capabilities of the constituent ports.
2883  */
2884 static void
2885 aggr_grp_capab_set(aggr_grp_t *grp)
2886 {
2887 	uint32_t cksum;
2888 	aggr_port_t *port;
2889 	mac_capab_lso_t cap_lso;
2890 
2891 	ASSERT(grp->lg_mh == NULL);
2892 	ASSERT(grp->lg_ports != NULL);
2893 
2894 	grp->lg_hcksum_txflags = (uint32_t)-1;
2895 	grp->lg_zcopy = B_TRUE;
2896 	grp->lg_vlan = B_TRUE;
2897 
2898 	grp->lg_lso = B_TRUE;
2899 	grp->lg_cap_lso.lso_flags = (t_uscalar_t)-1;
2900 	grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max = (t_uscalar_t)-1;
2901 
2902 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2903 		if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &cksum))
2904 			cksum = 0;
2905 		grp->lg_hcksum_txflags &= cksum;
2906 
2907 		grp->lg_vlan &=
2908 		    !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL);
2909 
2910 		grp->lg_zcopy &=
2911 		    !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL);
2912 
2913 		grp->lg_lso &=
2914 		    mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso);
2915 		if (grp->lg_lso) {
2916 			grp->lg_cap_lso.lso_flags &= cap_lso.lso_flags;
2917 			if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max >
2918 			    cap_lso.lso_basic_tcp_ipv4.lso_max)
2919 				grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max =
2920 				    cap_lso.lso_basic_tcp_ipv4.lso_max;
2921 		}
2922 	}
2923 }
2924 
2925 /*
2926  * Checks whether the capabilities of the port being added are compatible
2927  * with the current capabilities of the aggregation.
2928  */
2929 static boolean_t
2930 aggr_grp_capab_check(aggr_grp_t *grp, aggr_port_t *port)
2931 {
2932 	uint32_t hcksum_txflags;
2933 
2934 	ASSERT(grp->lg_ports != NULL);
2935 
2936 	if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL)) &
2937 	    grp->lg_vlan) != grp->lg_vlan) {
2938 		return (B_FALSE);
2939 	}
2940 
2941 	if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL)) &
2942 	    grp->lg_zcopy) != grp->lg_zcopy) {
2943 		return (B_FALSE);
2944 	}
2945 
2946 	if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &hcksum_txflags)) {
2947 		if (grp->lg_hcksum_txflags != 0)
2948 			return (B_FALSE);
2949 	} else if ((hcksum_txflags & grp->lg_hcksum_txflags) !=
2950 	    grp->lg_hcksum_txflags) {
2951 		return (B_FALSE);
2952 	}
2953 
2954 	if (grp->lg_lso) {
2955 		mac_capab_lso_t cap_lso;
2956 
2957 		if (mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso)) {
2958 			if ((grp->lg_cap_lso.lso_flags & cap_lso.lso_flags) !=
2959 			    grp->lg_cap_lso.lso_flags)
2960 				return (B_FALSE);
2961 			if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max >
2962 			    cap_lso.lso_basic_tcp_ipv4.lso_max)
2963 				return (B_FALSE);
2964 		} else {
2965 			return (B_FALSE);
2966 		}
2967 	}
2968 
2969 	return (B_TRUE);
2970 }
2971 
2972 /*
2973  * Returns the maximum SDU according to the SDU of the constituent ports.
2974  */
2975 static uint_t
2976 aggr_grp_max_sdu(aggr_grp_t *grp)
2977 {
2978 	uint_t max_sdu = (uint_t)-1;
2979 	aggr_port_t *port;
2980 
2981 	ASSERT(grp->lg_ports != NULL);
2982 
2983 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2984 		uint_t port_sdu_max;
2985 
2986 		mac_sdu_get(port->lp_mh, NULL, &port_sdu_max);
2987 		if (max_sdu > port_sdu_max)
2988 			max_sdu = port_sdu_max;
2989 	}
2990 
2991 	return (max_sdu);
2992 }
2993 
2994 /*
2995  * Checks if the maximum SDU of the specified port is compatible
2996  * with the maximum SDU of the specified aggregation group, returns
2997  * B_TRUE if it is, B_FALSE otherwise.
2998  */
2999 static boolean_t
3000 aggr_grp_sdu_check(aggr_grp_t *grp, aggr_port_t *port)
3001 {
3002 	uint_t port_sdu_max;
3003 
3004 	mac_sdu_get(port->lp_mh, NULL, &port_sdu_max);
3005 	return (port_sdu_max >= grp->lg_max_sdu);
3006 }
3007 
3008 /*
3009  * Returns the maximum margin according to the margin of the constituent ports.
3010  */
3011 static uint32_t
3012 aggr_grp_max_margin(aggr_grp_t *grp)
3013 {
3014 	uint32_t margin = UINT32_MAX;
3015 	aggr_port_t *port;
3016 
3017 	ASSERT(grp->lg_mh == NULL);
3018 	ASSERT(grp->lg_ports != NULL);
3019 
3020 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
3021 		if (margin > port->lp_margin)
3022 			margin = port->lp_margin;
3023 	}
3024 
3025 	grp->lg_margin = margin;
3026 	return (margin);
3027 }
3028 
3029 /*
3030  * Checks if the maximum margin of the specified port is compatible
3031  * with the maximum margin of the specified aggregation group, returns
3032  * B_TRUE if it is, B_FALSE otherwise.
3033  */
3034 static boolean_t
3035 aggr_grp_margin_check(aggr_grp_t *grp, aggr_port_t *port)
3036 {
3037 	if (port->lp_margin >= grp->lg_margin)
3038 		return (B_TRUE);
3039 
3040 	/*
3041 	 * See whether the current margin value is allowed to be changed to
3042 	 * the new value.
3043 	 */
3044 	if (!mac_margin_update(grp->lg_mh, port->lp_margin))
3045 		return (B_FALSE);
3046 
3047 	grp->lg_margin = port->lp_margin;
3048 	return (B_TRUE);
3049 }
3050 
3051 /*
3052  * Set MTU on individual ports of an aggregation group
3053  */
3054 static int
3055 aggr_set_port_sdu(aggr_grp_t *grp, aggr_port_t *port, uint32_t sdu,
3056     uint32_t *old_mtu)
3057 {
3058 	boolean_t		removed = B_FALSE;
3059 	mac_perim_handle_t	mph;
3060 	mac_diag_t		diag;
3061 	int			err, rv, retry = 0;
3062 
3063 	if (port->lp_mah != NULL) {
3064 		(void) mac_unicast_remove(port->lp_mch, port->lp_mah);
3065 		port->lp_mah = NULL;
3066 		removed = B_TRUE;
3067 	}
3068 	err = mac_set_mtu(port->lp_mh, sdu, old_mtu);
3069 try_again:
3070 	if (removed && (rv = mac_unicast_add(port->lp_mch, NULL,
3071 	    MAC_UNICAST_PRIMARY | MAC_UNICAST_DISABLE_TX_VID_CHECK,
3072 	    &port->lp_mah, 0, &diag)) != 0) {
3073 		/*
3074 		 * following is a workaround for a bug in 'bge' driver.
3075 		 * See CR 6794654 for more information and this work around
3076 		 * will be removed once the CR is fixed.
3077 		 */
3078 		if (rv == EIO && retry++ < 3) {
3079 			delay(2 * hz);
3080 			goto try_again;
3081 		}
3082 		/*
3083 		 * if mac_unicast_add() failed while setting the MTU,
3084 		 * detach the port from the group.
3085 		 */
3086 		mac_perim_enter_by_mh(port->lp_mh, &mph);
3087 		(void) aggr_grp_detach_port(grp, port);
3088 		mac_perim_exit(mph);
3089 		cmn_err(CE_WARN, "Unable to restart the port %s while "
3090 		    "setting MTU. Detaching the port from the aggregation.",
3091 		    mac_client_name(port->lp_mch));
3092 	}
3093 	return (err);
3094 }
3095 
3096 static int
3097 aggr_sdu_update(aggr_grp_t *grp, uint32_t sdu)
3098 {
3099 	int			err = 0, i, rv;
3100 	aggr_port_t		*port;
3101 	uint32_t		*mtu;
3102 
3103 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
3104 
3105 	/*
3106 	 * If the MTU being set is equal to aggr group's maximum
3107 	 * allowable value, then there is nothing to change
3108 	 */
3109 	if (sdu == grp->lg_max_sdu)
3110 		return (0);
3111 
3112 	/* 0 is aggr group's min sdu */
3113 	if (sdu == 0)
3114 		return (EINVAL);
3115 
3116 	mtu = kmem_alloc(sizeof (uint32_t) * grp->lg_nports, KM_SLEEP);
3117 	for (port = grp->lg_ports, i = 0; port != NULL && err == 0;
3118 	    port = port->lp_next, i++) {
3119 		err = aggr_set_port_sdu(grp, port, sdu, mtu + i);
3120 	}
3121 	if (err != 0) {
3122 		/* recover from error: reset the mtus of the ports */
3123 		aggr_port_t *tmp;
3124 
3125 		for (tmp = grp->lg_ports, i = 0; tmp != port;
3126 		    tmp = tmp->lp_next, i++) {
3127 			(void) aggr_set_port_sdu(grp, tmp, *(mtu + i), NULL);
3128 		}
3129 		goto bail;
3130 	}
3131 	grp->lg_max_sdu = aggr_grp_max_sdu(grp);
3132 	rv = mac_maxsdu_update(grp->lg_mh, grp->lg_max_sdu);
3133 	ASSERT(rv == 0);
3134 bail:
3135 	kmem_free(mtu, sizeof (uint32_t) * grp->lg_nports);
3136 	return (err);
3137 }
3138 
3139 /*
3140  * Callback functions for set/get of properties
3141  */
3142 /*ARGSUSED*/
3143 static int
3144 aggr_m_setprop(void *m_driver, const char *pr_name, mac_prop_id_t pr_num,
3145     uint_t pr_valsize, const void *pr_val)
3146 {
3147 	int		err = ENOTSUP;
3148 	aggr_grp_t	*grp = m_driver;
3149 
3150 	switch (pr_num) {
3151 	case MAC_PROP_MTU: {
3152 		uint32_t	mtu;
3153 
3154 		if (pr_valsize < sizeof (mtu)) {
3155 			err = EINVAL;
3156 			break;
3157 		}
3158 		bcopy(pr_val, &mtu, sizeof (mtu));
3159 		err = aggr_sdu_update(grp, mtu);
3160 		break;
3161 	}
3162 	default:
3163 		break;
3164 	}
3165 	return (err);
3166 }
3167 
/*
 * One boundary (end point) of an MTU range, as used by
 * aggr_mtu_range_intersection() when it flattens ranges into a sortable
 * array of end points.
 */
typedef struct rboundary {
	/* the boundary value itself (a range's min or max) */
	uint32_t	bval;
	/* boundary kind: +1 for a range minimum, -1 for a range maximum */
	int		btype;
} rboundary_t;
3172 
3173 /*
3174  * This function finds the intersection of mtu ranges stored in arrays -
3175  * mrange[0] ... mrange[mcount -1]. It returns the intersection in rval.
3176  * Individual arrays are assumed to contain non-overlapping ranges.
3177  * Algorithm:
3178  *   A range has two boundaries - min and max. We scan all arrays and store
3179  * each boundary as a separate element in a temporary array. We also store
3180  * the boundary types, min or max, as +1 or -1 respectively in the temporary
3181  * array. Then we sort the temporary array in ascending order. We scan the
3182  * sorted array from lower to higher values and keep a cumulative sum of
3183  * boundary types. Element in the temporary array for which the sum reaches
3184  * mcount is a min boundary of a range in the result and next element will be
3185  * max boundary.
3186  *
3187  * Example for mcount = 3,
3188  *
3189  *  ----|_________|-------|_______|----|__|------ mrange[0]
3190  *
3191  *  -------|________|--|____________|-----|___|-- mrange[1]
3192  *
3193  *  --------|________________|-------|____|------ mrange[2]
3194  *
3195  *                                      3 2 1
3196  *                                       \|/
3197  *      1  23     2 1  2  3  2    1 01 2  V   0  <- the sum
3198  *  ----|--||-----|-|--|--|--|----|-||-|--|---|-- sorted array
3199  *
3200  *                                 same min and max
3201  *                                        V
3202  *  --------|_____|-------|__|------------|------ intersecting ranges
3203  */
3204 void
3205 aggr_mtu_range_intersection(mac_propval_range_t **mrange, int mcount,
3206     mac_propval_uint32_range_t **prval, int *prmaxcnt, int *prcount)
3207 {
3208 	mac_propval_uint32_range_t	*rval, *ur;
3209 	int				rmaxcnt, rcount;
3210 	size_t				sz_range32;
3211 	rboundary_t			*ta; /* temporary array */
3212 	rboundary_t			temp;
3213 	boolean_t			range_started = B_FALSE;
3214 	int				i, j, m, sum;
3215 
3216 	sz_range32 = sizeof (mac_propval_uint32_range_t);
3217 
3218 	for (i = 0, rmaxcnt = 0; i < mcount; i++)
3219 		rmaxcnt += mrange[i]->mpr_count;
3220 
3221 	/* Allocate enough space to store the results */
3222 	rval = kmem_alloc(rmaxcnt * sz_range32, KM_SLEEP);
3223 
3224 	/* Number of boundaries are twice as many as ranges */
3225 	ta = kmem_alloc(2 * rmaxcnt * sizeof (rboundary_t), KM_SLEEP);
3226 
3227 	for (i = 0, m = 0; i < mcount; i++) {
3228 		ur = &(mrange[i]->mpr_range_uint32[0]);
3229 		for (j = 0; j < mrange[i]->mpr_count; j++) {
3230 			ta[m].bval = ur[j].mpur_min;
3231 			ta[m++].btype = 1;
3232 			ta[m].bval = ur[j].mpur_max;
3233 			ta[m++].btype = -1;
3234 		}
3235 	}
3236 
3237 	/*
3238 	 * Sort the temporary array in ascending order of bval;
3239 	 * if boundary values are same then sort on btype.
3240 	 */
3241 	for (i = 0; i < m-1; i++) {
3242 		for (j = i+1; j < m; j++) {
3243 			if ((ta[i].bval > ta[j].bval) ||
3244 			    ((ta[i].bval == ta[j].bval) &&
3245 			    (ta[i].btype < ta[j].btype))) {
3246 				temp = ta[i];
3247 				ta[i] = ta[j];
3248 				ta[j] = temp;
3249 			}
3250 		}
3251 	}
3252 
3253 	/* Walk through temporary array to find all ranges in the results */
3254 	for (i = 0, sum = 0, rcount = 0; i < m; i++) {
3255 		sum += ta[i].btype;
3256 		if (sum == mcount) {
3257 			rval[rcount].mpur_min = ta[i].bval;
3258 			range_started = B_TRUE;
3259 		} else if (sum < mcount && range_started) {
3260 			rval[rcount++].mpur_max = ta[i].bval;
3261 			range_started = B_FALSE;
3262 		}
3263 	}
3264 
3265 	*prval = rval;
3266 	*prmaxcnt = rmaxcnt;
3267 	*prcount = rcount;
3268 
3269 	kmem_free(ta, 2 * rmaxcnt * sizeof (rboundary_t));
3270 }
3271 
3272 /*
3273  * Returns the mtu ranges which could be supported by aggr group.
3274  * prmaxcnt returns the size of the buffer prval, prcount returns
3275  * the number of valid entries in prval. Caller is responsible
3276  * for freeing up prval.
3277  */
3278 int
3279 aggr_grp_possible_mtu_range(aggr_grp_t *grp, mac_propval_uint32_range_t **prval,
3280     int *prmaxcnt, int *prcount)
3281 {
3282 	mac_propval_range_t		**vals;
3283 	aggr_port_t			*port;
3284 	mac_perim_handle_t		mph;
3285 	uint_t				i, numr;
3286 	int				err = 0;
3287 	size_t				sz_propval, sz_range32;
3288 	size_t				size;
3289 
3290 	sz_propval = sizeof (mac_propval_range_t);
3291 	sz_range32 = sizeof (mac_propval_uint32_range_t);
3292 
3293 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
3294 
3295 	vals = kmem_zalloc(sizeof (mac_propval_range_t *) * grp->lg_nports,
3296 	    KM_SLEEP);
3297 
3298 	for (port = grp->lg_ports, i = 0; port != NULL;
3299 	    port = port->lp_next, i++) {
3300 
3301 		size = sz_propval;
3302 		vals[i] = kmem_alloc(size, KM_SLEEP);
3303 		vals[i]->mpr_count = 1;
3304 
3305 		mac_perim_enter_by_mh(port->lp_mh, &mph);
3306 
3307 		err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL,
3308 		    NULL, 0, vals[i], NULL);
3309 		if (err == ENOSPC) {
3310 			/*
3311 			 * Not enough space to hold all ranges.
3312 			 * Allocate extra space as indicated and retry.
3313 			 */
3314 			numr = vals[i]->mpr_count;
3315 			kmem_free(vals[i], sz_propval);
3316 			size = sz_propval + (numr - 1) * sz_range32;
3317 			vals[i] = kmem_alloc(size, KM_SLEEP);
3318 			vals[i]->mpr_count = numr;
3319 			err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL,
3320 			    NULL, 0, vals[i], NULL);
3321 			ASSERT(err != ENOSPC);
3322 		}
3323 		mac_perim_exit(mph);
3324 		if (err != 0) {
3325 			kmem_free(vals[i], size);
3326 			vals[i] = NULL;
3327 			break;
3328 		}
3329 	}
3330 
3331 	/*
3332 	 * if any of the underlying ports does not support changing MTU then
3333 	 * just return ENOTSUP
3334 	 */
3335 	if (port != NULL) {
3336 		ASSERT(err != 0);
3337 		goto done;
3338 	}
3339 
3340 	aggr_mtu_range_intersection(vals, grp->lg_nports, prval, prmaxcnt,
3341 	    prcount);
3342 
3343 done:
3344 	for (i = 0; i < grp->lg_nports; i++) {
3345 		if (vals[i] != NULL) {
3346 			numr = vals[i]->mpr_count;
3347 			size = sz_propval + (numr - 1) * sz_range32;
3348 			kmem_free(vals[i], size);
3349 		}
3350 	}
3351 
3352 	kmem_free(vals, sizeof (mac_propval_range_t *) * grp->lg_nports);
3353 	return (err);
3354 }
3355 
3356 static void
3357 aggr_m_propinfo(void *m_driver, const char *pr_name, mac_prop_id_t pr_num,
3358     mac_prop_info_handle_t prh)
3359 {
3360 	aggr_grp_t			*grp = m_driver;
3361 	mac_propval_uint32_range_t	*rval = NULL;
3362 	int				i, rcount, rmaxcnt;
3363 	int				err = 0;
3364 
3365 	_NOTE(ARGUNUSED(pr_name));
3366 
3367 	switch (pr_num) {
3368 	case MAC_PROP_MTU:
3369 
3370 		err = aggr_grp_possible_mtu_range(grp, &rval, &rmaxcnt,
3371 		    &rcount);
3372 		if (err != 0) {
3373 			ASSERT(rval == NULL);
3374 			return;
3375 		}
3376 		for (i = 0; i < rcount; i++) {
3377 			mac_prop_info_set_range_uint32(prh,
3378 			    rval[i].mpur_min, rval[i].mpur_max);
3379 		}
3380 		kmem_free(rval, sizeof (mac_propval_uint32_range_t) * rmaxcnt);
3381 		break;
3382 	}
3383 }
3384