xref: /illumos-gate/usr/src/uts/common/io/aggr/aggr_grp.c (revision 9164a50bf932130cbb5097a16f6986873ce0e6e5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2020 Joyent, Inc.
24  * Copyright 2020 RackTop Systems, Inc.
25  * Copyright 2024 MNX Cloud, Inc.
26  */
27 
28 /*
29  * IEEE 802.3ad Link Aggregation -- Link Aggregation Groups.
30  *
31  * An instance of the structure aggr_grp_t is allocated for each
32  * link aggregation group. When created, aggr_grp_t objects are
33  * entered into the aggr_grp_hash hash table maintained by the modhash
34  * module. The hash key is the linkid associated with the link
35  * aggregation group.
36  *
37  * Each aggregation contains a set of ports. The port is represented
38  * by the aggr_port_t structure. A port consists of a single MAC
39  * client which has exclusive (MCIS_EXCLUSIVE) use of the underlying
40  * MAC. This client is used by the aggr to send and receive LACP
41  * traffic. Each port client takes on the same MAC unicast address --
42  * the address of the aggregation itself (taken from the first port by
43  * default).
44  *
45  * The MAC client that hangs off each aggr port is not your typical
46  * MAC client. Not only does it have exclusive control of the MAC, but
47  * it also has no Tx or Rx SRSes. An SRS is designed to queue and
48  * fanout traffic among L4 protocols; but the aggr is an intermediary,
49  * not a consumer. Instead of using SRSes, the aggr puts the
50  * underlying hardware rings into passthru mode and ships packets up
51  * via a direct call to aggr_recv_cb(). This allows aggr to enforce
52  * LACP while passing all other traffic up to clients of the aggr.
53  *
54  * Pseudo Rx Groups and Rings
55  * --------------------------
56  *
57  * It is imperative for client performance that the aggr provide as
58  * many MAC groups as possible. In order to use the underlying HW
59  * resources, aggr creates pseudo groups to aggregate the underlying
60  * HW groups. Every HW group gets mapped to a pseudo group; and every
61  * HW ring in that group gets mapped to a pseudo ring. The pseudo
62  * group at index 0 combines all the HW groups at index 0 from each
63  * port, etc. The aggr's MAC then creates normal MAC groups and rings
64  * out of these pseudo groups and rings to present to the aggr's
65  * clients. To the clients, the aggr's groups and rings are absolutely
66  * no different than a NIC's groups or rings.
67  *
68  * Pseudo Tx Rings
69  * ---------------
70  *
71  * The underlying ports (NICs) in an aggregation can have Tx rings. To
72  * enhance aggr's performance, these Tx rings are made available to
73  * the aggr layer as pseudo Tx rings. The concept of pseudo rings is
74  * not new. They are already present and implemented on the Rx side.
75  * The same concept is extended to the Tx side where each Tx ring of
76  * an underlying port is reflected in aggr as a pseudo Tx ring. Thus
77  * each pseudo Tx ring will map to a specific hardware Tx ring. Even
78  * in the case of a NIC that does not have a Tx ring, a pseudo Tx ring
79  * is given to the aggregation layer.
80  *
81  * With this change, the outgoing stack depth looks much better:
82  *
83  * mac_tx() -> mac_tx_aggr_mode() -> mac_tx_soft_ring_process() ->
84  * mac_tx_send() -> aggr_ring_tx() -> <driver>_ring_tx()
85  *
86  * Two new modes are introduced to mac_tx() to handle aggr pseudo Tx rings:
87  * SRS_TX_AGGR and SRS_TX_BW_AGGR.
88  *
89  * In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. This routine
90  * invokes an aggr function, aggr_find_tx_ring(), to find a (pseudo) Tx
91  * ring belonging to a port on which the packet has to be sent.
92  * aggr_find_tx_ring() first finds the outgoing port based on L2/L3/L4
93  * policy and then uses the fanout_hint passed to it to pick a Tx ring from
94  * the selected port.
95  *
96  * In SRS_TX_BW_AGGR mode, mac_tx_bw_mode() function is called where
97  * bandwidth limit is applied first on the outgoing packet and the packets
98  * allowed to go out would call mac_tx_aggr_mode() to send the packet on a
99  * particular Tx ring.
100  */
101 
102 #include <sys/types.h>
103 #include <sys/sysmacros.h>
104 #include <sys/conf.h>
105 #include <sys/cmn_err.h>
106 #include <sys/disp.h>
107 #include <sys/list.h>
108 #include <sys/ksynch.h>
109 #include <sys/kmem.h>
110 #include <sys/stream.h>
111 #include <sys/modctl.h>
112 #include <sys/ddi.h>
113 #include <sys/sunddi.h>
114 #include <sys/atomic.h>
115 #include <sys/stat.h>
116 #include <sys/modhash.h>
117 #include <sys/id_space.h>
118 #include <sys/strsun.h>
119 #include <sys/cred.h>
120 #include <sys/dlpi.h>
121 #include <sys/zone.h>
122 #include <sys/mac_provider.h>
123 #include <sys/dls.h>
124 #include <sys/vlan.h>
125 #include <sys/aggr.h>
126 #include <sys/aggr_impl.h>
127 
128 static int aggr_m_start(void *);
129 static void aggr_m_stop(void *);
130 static int aggr_m_promisc(void *, boolean_t);
131 static int aggr_m_multicst(void *, boolean_t, const uint8_t *);
132 static int aggr_m_unicst(void *, const uint8_t *);
133 static int aggr_m_stat(void *, uint_t, uint64_t *);
134 static void aggr_m_ioctl(void *, queue_t *, mblk_t *);
135 static boolean_t aggr_m_capab_get(void *, mac_capab_t, void *);
136 static int aggr_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
137     const void *);
138 static void aggr_m_propinfo(void *, const char *, mac_prop_id_t,
139     mac_prop_info_handle_t);
140 
141 static aggr_port_t *aggr_grp_port_lookup(aggr_grp_t *, datalink_id_t);
142 static int aggr_grp_rem_port(aggr_grp_t *, aggr_port_t *, boolean_t *,
143     boolean_t *);
144 
145 static void aggr_grp_capab_set(aggr_grp_t *);
146 static boolean_t aggr_grp_capab_check(aggr_grp_t *, aggr_port_t *);
147 static uint_t aggr_grp_max_sdu(aggr_grp_t *);
148 static uint32_t aggr_grp_max_margin(aggr_grp_t *);
149 static boolean_t aggr_grp_sdu_check(aggr_grp_t *, aggr_port_t *);
150 static boolean_t aggr_grp_margin_check(aggr_grp_t *, aggr_port_t *);
151 
152 static int aggr_add_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
153 static void aggr_rem_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
154 static int aggr_pseudo_disable_intr(mac_intr_handle_t);
155 static int aggr_pseudo_enable_intr(mac_intr_handle_t);
156 static int aggr_pseudo_start_rx_ring(mac_ring_driver_t, uint64_t);
157 static void aggr_pseudo_stop_rx_ring(mac_ring_driver_t);
158 static int aggr_addmac(void *, const uint8_t *);
159 static int aggr_remmac(void *, const uint8_t *);
160 static int aggr_addvlan(mac_group_driver_t, uint16_t);
161 static int aggr_remvlan(mac_group_driver_t, uint16_t);
162 static mblk_t *aggr_rx_poll(void *, int);
163 static void aggr_fill_ring(void *, mac_ring_type_t, const int,
164     const int, mac_ring_info_t *, mac_ring_handle_t);
165 static void aggr_fill_group(void *, mac_ring_type_t, const int,
166     mac_group_info_t *, mac_group_handle_t);
167 
168 static kmem_cache_t	*aggr_grp_cache;
169 static mod_hash_t	*aggr_grp_hash;
170 static krwlock_t	aggr_grp_lock;
171 static uint_t		aggr_grp_cnt;
172 static id_space_t	*key_ids;
173 
174 #define	GRP_HASHSZ		64
175 #define	GRP_HASH_KEY(linkid)	((mod_hash_key_t)(uintptr_t)linkid)
176 #define	AGGR_PORT_NAME_DELIMIT '-'
177 
178 static uchar_t aggr_zero_mac[] = {0, 0, 0, 0, 0, 0};
179 
180 #define	AGGR_M_CALLBACK_FLAGS	\
181 	(MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_PROPINFO)
182 
183 static mac_callbacks_t aggr_m_callbacks = {
184 	AGGR_M_CALLBACK_FLAGS,
185 	aggr_m_stat,
186 	aggr_m_start,
187 	aggr_m_stop,
188 	aggr_m_promisc,
189 	aggr_m_multicst,
190 	NULL,
191 	NULL,
192 	NULL,
193 	aggr_m_ioctl,
194 	aggr_m_capab_get,
195 	NULL,
196 	NULL,
197 	aggr_m_setprop,
198 	NULL,
199 	aggr_m_propinfo
200 };
201 
202 /*ARGSUSED*/
203 static int
204 aggr_grp_constructor(void *buf, void *arg, int kmflag)
205 {
206 	aggr_grp_t *grp = buf;
207 
208 	bzero(grp, sizeof (*grp));
209 	mutex_init(&grp->lg_lacp_lock, NULL, MUTEX_DEFAULT, NULL);
210 	cv_init(&grp->lg_lacp_cv, NULL, CV_DEFAULT, NULL);
211 	rw_init(&grp->lg_tx_lock, NULL, RW_DRIVER, NULL);
212 	mutex_init(&grp->lg_port_lock, NULL, MUTEX_DEFAULT, NULL);
213 	cv_init(&grp->lg_port_cv, NULL, CV_DEFAULT, NULL);
214 	mutex_init(&grp->lg_tx_flowctl_lock, NULL, MUTEX_DEFAULT, NULL);
215 	cv_init(&grp->lg_tx_flowctl_cv, NULL, CV_DEFAULT, NULL);
216 	grp->lg_link_state = LINK_STATE_UNKNOWN;
217 	return (0);
218 }
219 
220 /*ARGSUSED*/
221 static void
222 aggr_grp_destructor(void *buf, void *arg)
223 {
224 	aggr_grp_t *grp = buf;
225 
226 	if (grp->lg_tx_ports != NULL) {
227 		kmem_free(grp->lg_tx_ports,
228 		    grp->lg_tx_ports_size * sizeof (aggr_port_t *));
229 	}
230 
231 	mutex_destroy(&grp->lg_lacp_lock);
232 	cv_destroy(&grp->lg_lacp_cv);
233 	mutex_destroy(&grp->lg_port_lock);
234 	cv_destroy(&grp->lg_port_cv);
235 	rw_destroy(&grp->lg_tx_lock);
236 	mutex_destroy(&grp->lg_tx_flowctl_lock);
237 	cv_destroy(&grp->lg_tx_flowctl_cv);
238 }
239 
240 void
241 aggr_grp_init(void)
242 {
243 	aggr_grp_cache = kmem_cache_create("aggr_grp_cache",
244 	    sizeof (aggr_grp_t), 0, aggr_grp_constructor,
245 	    aggr_grp_destructor, NULL, NULL, NULL, 0);
246 
247 	aggr_grp_hash = mod_hash_create_idhash("aggr_grp_hash",
248 	    GRP_HASHSZ, mod_hash_null_valdtor);
249 	rw_init(&aggr_grp_lock, NULL, RW_DEFAULT, NULL);
250 	aggr_grp_cnt = 0;
251 
252 	/*
253 	 * Allocate an id space to manage key values (when key is not
254 	 * specified). The range of the id space will be from
255 	 * (AGGR_MAX_KEY + 1) to UINT16_MAX, because the LACP protocol
256 	 * uses a 16-bit key.
257 	 */
258 	key_ids = id_space_create("aggr_key_ids", AGGR_MAX_KEY + 1, UINT16_MAX);
259 	ASSERT(key_ids != NULL);
260 }
261 
262 void
263 aggr_grp_fini(void)
264 {
265 	id_space_destroy(key_ids);
266 	rw_destroy(&aggr_grp_lock);
267 	mod_hash_destroy_idhash(aggr_grp_hash);
268 	kmem_cache_destroy(aggr_grp_cache);
269 }
270 
271 uint_t
272 aggr_grp_count(void)
273 {
274 	uint_t	count;
275 
276 	rw_enter(&aggr_grp_lock, RW_READER);
277 	count = aggr_grp_cnt;
278 	rw_exit(&aggr_grp_lock);
279 	return (count);
280 }
281 
282 /*
283  * Since both aggr_port_notify_cb() and aggr_port_timer_thread() functions
284  * requires the mac perimeter, this function holds a reference of the aggr
285  * and aggr won't call mac_unregister() until this reference drops to 0.
286  */
287 void
288 aggr_grp_port_hold(aggr_port_t *port)
289 {
290 	aggr_grp_t	*grp = port->lp_grp;
291 
292 	AGGR_PORT_REFHOLD(port);
293 	mutex_enter(&grp->lg_port_lock);
294 	grp->lg_port_ref++;
295 	mutex_exit(&grp->lg_port_lock);
296 }
297 
298 /*
299  * Release the reference of the grp and inform aggr_grp_delete() calling
300  * mac_unregister() is now safe.
301  */
302 void
303 aggr_grp_port_rele(aggr_port_t *port)
304 {
305 	aggr_grp_t	*grp = port->lp_grp;
306 
307 	mutex_enter(&grp->lg_port_lock);
308 	if (--grp->lg_port_ref == 0)
309 		cv_signal(&grp->lg_port_cv);
310 	mutex_exit(&grp->lg_port_lock);
311 	AGGR_PORT_REFRELE(port);
312 }
313 
314 /*
315  * Wait for the port's lacp timer thread and the port's notification callback
316  * to exit.
317  */
318 void
319 aggr_grp_port_wait(aggr_grp_t *grp)
320 {
321 	mutex_enter(&grp->lg_port_lock);
322 	if (grp->lg_port_ref != 0)
323 		cv_wait(&grp->lg_port_cv, &grp->lg_port_lock);
324 	mutex_exit(&grp->lg_port_lock);
325 }
326 
327 /*
328  * Attach a port to a link aggregation group.
329  *
330  * A port is attached to a link aggregation group once its speed
331  * and link state have been verified.
332  *
333  * Returns B_TRUE if the group link state or speed has changed. If
334  * it's the case, the caller must notify the MAC layer via a call
335  * to mac_link().
336  */
337 boolean_t
338 aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port)
339 {
340 	boolean_t link_state_changed = B_FALSE;
341 
342 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
343 	ASSERT(MAC_PERIM_HELD(port->lp_mh));
344 
345 	if (port->lp_state == AGGR_PORT_STATE_ATTACHED)
346 		return (B_FALSE);
347 
348 	/*
349 	 * Validate the MAC port link speed and update the group
350 	 * link speed if needed.
351 	 */
352 	if (port->lp_ifspeed == 0 ||
353 	    port->lp_link_state != LINK_STATE_UP ||
354 	    port->lp_link_duplex != LINK_DUPLEX_FULL) {
355 		/*
356 		 * Can't attach a MAC port with unknown link speed,
357 		 * down link, or not in full duplex mode.
358 		 */
359 		return (B_FALSE);
360 	}
361 
362 	mutex_enter(&grp->lg_stat_lock);
363 	if (grp->lg_ifspeed == 0) {
364 		/*
365 		 * The group inherits the speed of the first link being
366 		 * attached.
367 		 */
368 		grp->lg_ifspeed = port->lp_ifspeed;
369 		link_state_changed = B_TRUE;
370 	} else if (grp->lg_ifspeed != port->lp_ifspeed) {
371 		/*
372 		 * The link speed of the MAC port must be the same as
373 		 * the group link speed, as per 802.3ad. Since it is
374 		 * not, the attach is cancelled.
375 		 */
376 		mutex_exit(&grp->lg_stat_lock);
377 		return (B_FALSE);
378 	}
379 	mutex_exit(&grp->lg_stat_lock);
380 
381 	grp->lg_nattached_ports++;
382 
383 	/*
384 	 * Update the group link state.
385 	 */
386 	if (grp->lg_link_state != LINK_STATE_UP) {
387 		grp->lg_link_state = LINK_STATE_UP;
388 		mutex_enter(&grp->lg_stat_lock);
389 		grp->lg_link_duplex = LINK_DUPLEX_FULL;
390 		mutex_exit(&grp->lg_stat_lock);
391 		link_state_changed = B_TRUE;
392 	}
393 
394 	/*
395 	 * Update port's state.
396 	 */
397 	port->lp_state = AGGR_PORT_STATE_ATTACHED;
398 
399 	aggr_grp_multicst_port(port, B_TRUE);
400 
401 	/*
402 	 * The port client doesn't have an Rx SRS; instead of calling
403 	 * mac_rx_set() we set the client's flow callback directly.
404 	 * This datapath is used only when the port's driver doesn't
405 	 * support MAC_CAPAB_RINGS. Drivers with ring support will
406 	 * deliver traffic to the aggr via ring passthru.
407 	 */
408 	mac_client_set_flow_cb(port->lp_mch, aggr_recv_cb, port);
409 
410 	/*
411 	 * If LACP is OFF, the port can be used to send data as soon
412 	 * as its link is up and verified to be compatible with the
413 	 * aggregation.
414 	 *
415 	 * If LACP is active or passive, notify the LACP subsystem, which
416 	 * will enable sending on the port following the LACP protocol.
417 	 */
418 	if (grp->lg_lacp_mode == AGGR_LACP_OFF)
419 		aggr_send_port_enable(port);
420 	else
421 		aggr_lacp_port_attached(port);
422 
423 	return (link_state_changed);
424 }
425 
426 boolean_t
427 aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port)
428 {
429 	boolean_t link_state_changed = B_FALSE;
430 
431 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
432 	ASSERT(MAC_PERIM_HELD(port->lp_mh));
433 
434 	/* update state */
435 	if (port->lp_state != AGGR_PORT_STATE_ATTACHED)
436 		return (B_FALSE);
437 
438 	mac_client_clear_flow_cb(port->lp_mch);
439 
440 	aggr_grp_multicst_port(port, B_FALSE);
441 
442 	if (grp->lg_lacp_mode == AGGR_LACP_OFF)
443 		aggr_send_port_disable(port);
444 	else
445 		aggr_lacp_port_detached(port);
446 
447 	port->lp_state = AGGR_PORT_STATE_STANDBY;
448 
449 	grp->lg_nattached_ports--;
450 	if (grp->lg_nattached_ports == 0) {
451 		/* the last attached MAC port of the group is being detached */
452 		grp->lg_link_state = LINK_STATE_DOWN;
453 		mutex_enter(&grp->lg_stat_lock);
454 		grp->lg_ifspeed = 0;
455 		grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
456 		mutex_exit(&grp->lg_stat_lock);
457 		link_state_changed = B_TRUE;
458 	}
459 
460 	return (link_state_changed);
461 }
462 
463 /*
464  * Update the MAC addresses of the constituent ports of the specified
465  * group. This function is invoked:
466  * - after creating a new aggregation group.
467  * - after adding new ports to an aggregation group.
468  * - after removing a port from a group when the MAC address of
469  *   that port was used for the MAC address of the group.
470  * - after the MAC address of a port changed when the MAC address
471  *   of that port was used for the MAC address of the group.
472  *
473  * Return true if the link state of the aggregation changed, for example
474  * as a result of a failure changing the MAC address of one of the
475  * constituent ports.
476  */
477 boolean_t
478 aggr_grp_update_ports_mac(aggr_grp_t *grp)
479 {
480 	aggr_port_t *cport;
481 	boolean_t link_state_changed = B_FALSE;
482 	mac_perim_handle_t mph;
483 
484 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
485 
486 	for (cport = grp->lg_ports; cport != NULL;
487 	    cport = cport->lp_next) {
488 		mac_perim_enter_by_mh(cport->lp_mh, &mph);
489 		if (aggr_port_unicst(cport) != 0) {
490 			if (aggr_grp_detach_port(grp, cport))
491 				link_state_changed = B_TRUE;
492 		} else {
493 			/*
494 			 * If a port was detached because of a previous
495 			 * failure changing the MAC address, the port is
496 			 * reattached when it successfully changes the MAC
497 			 * address now, and this might cause the link state
498 			 * of the aggregation to change.
499 			 */
500 			if (aggr_grp_attach_port(grp, cport))
501 				link_state_changed = B_TRUE;
502 		}
503 		mac_perim_exit(mph);
504 	}
505 	return (link_state_changed);
506 }
507 
508 /*
509  * Invoked when the MAC address of a port has changed. If the port's
510  * MAC address was used for the group MAC address, set mac_addr_changedp
511  * to B_TRUE to indicate to the caller that it should send a MAC_NOTE_UNICST
512  * notification. If the link state changes due to detach/attach of
513  * the constituent port, set link_state_changedp to B_TRUE to indicate
514  * to the caller that it should send a MAC_NOTE_LINK notification. In both
515  * cases, it is the responsibility of the caller to invoke notification
516  * functions after releasing the the port lock.
517  */
518 void
519 aggr_grp_port_mac_changed(aggr_grp_t *grp, aggr_port_t *port,
520     boolean_t *mac_addr_changedp, boolean_t *link_state_changedp)
521 {
522 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
523 	ASSERT(MAC_PERIM_HELD(port->lp_mh));
524 	ASSERT(mac_addr_changedp != NULL);
525 	ASSERT(link_state_changedp != NULL);
526 
527 	*mac_addr_changedp = B_FALSE;
528 	*link_state_changedp = B_FALSE;
529 
530 	if (grp->lg_addr_fixed) {
531 		/*
532 		 * The group is using a fixed MAC address or an automatic
533 		 * MAC address has not been set.
534 		 */
535 		return;
536 	}
537 
538 	if (grp->lg_mac_addr_port == port) {
539 		/*
540 		 * The MAC address of the port was assigned to the group
541 		 * MAC address. Update the group MAC address.
542 		 */
543 		bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL);
544 		*mac_addr_changedp = B_TRUE;
545 	} else {
546 		/*
547 		 * Update the actual port MAC address to the MAC address
548 		 * of the group.
549 		 */
550 		if (aggr_port_unicst(port) != 0) {
551 			*link_state_changedp = aggr_grp_detach_port(grp, port);
552 		} else {
553 			/*
554 			 * If a port was detached because of a previous
555 			 * failure changing the MAC address, the port is
556 			 * reattached when it successfully changes the MAC
557 			 * address now, and this might cause the link state
558 			 * of the aggregation to change.
559 			 */
560 			*link_state_changedp = aggr_grp_attach_port(grp, port);
561 		}
562 	}
563 }
564 
565 /*
566  * Add a port to a link aggregation group.
567  */
568 static int
569 aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t port_linkid, boolean_t force,
570     aggr_port_t **pp)
571 {
572 	aggr_port_t *port, **cport;
573 	mac_perim_handle_t mph;
574 	zoneid_t port_zoneid = ALL_ZONES;
575 	int err;
576 
577 	/* The port must be in the same zone as the aggregation. */
578 	if (zone_check_datalink(&port_zoneid, port_linkid) != 0)
579 		port_zoneid = GLOBAL_ZONEID;
580 	if (grp->lg_zoneid != port_zoneid)
581 		return (EBUSY);
582 
583 	/*
584 	 * If we are creating the aggr, then there is no MAC handle
585 	 * and thus no perimeter to hold. If we are adding a port to
586 	 * an existing aggr, then the perimiter of the aggr's MAC must
587 	 * be held.
588 	 */
589 	ASSERT(grp->lg_mh == NULL || MAC_PERIM_HELD(grp->lg_mh));
590 
591 	err = aggr_port_create(grp, port_linkid, force, &port);
592 	if (err != 0)
593 		return (err);
594 
595 	mac_perim_enter_by_mh(port->lp_mh, &mph);
596 
597 	/* Add the new port to the end of the list. */
598 	cport = &grp->lg_ports;
599 	while (*cport != NULL)
600 		cport = &((*cport)->lp_next);
601 	*cport = port;
602 
603 	/*
604 	 * Back reference to the group it is member of. A port always
605 	 * holds a reference to its group to ensure that the back
606 	 * reference is always valid.
607 	 */
608 	port->lp_grp = grp;
609 	AGGR_GRP_REFHOLD(grp);
610 	grp->lg_nports++;
611 	if (grp->lg_nports > grp->lg_nports_high)
612 		grp->lg_nports_high = grp->lg_nports;
613 
614 	aggr_lacp_init_port(port);
615 	mac_perim_exit(mph);
616 
617 	if (pp != NULL)
618 		*pp = port;
619 
620 	return (0);
621 }
622 
623 /*
624  * This is called when the 'lg_tx_ports' arrangement has changed and
625  * we need to update the corresponding 'mi_default_tx_ring'. This
626  * happens for several reasons.
627  *
628  *     - A pseudo TX mac group was added or removed.
629  *     - An LACP message has changed the port's state.
630  *     - A link event has changed the port's state.
631  *
632  * In any case, we see if there is at least one port enabled (see
633  * 'aggr_send_port_enable()'), and if so we use its first ring as the
634  * mac's default TX ring.
635  *
636  * Note, because we only have a single TX group, we don't have to
637  * worry about the rings moving between groups and the chance that mac
638  * will reassign it unless someone removes a port, at which point, we
639  * play it safe and call this again.
640  */
641 void
642 aggr_grp_update_default(aggr_grp_t *grp)
643 {
644 	aggr_port_t *port;
645 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
646 
647 	rw_enter(&grp->lg_tx_lock, RW_WRITER);
648 
649 	if (grp->lg_ntx_ports == 0) {
650 		rw_exit(&grp->lg_tx_lock);
651 		return;
652 	}
653 
654 	port = grp->lg_tx_ports[0];
655 	ASSERT(port->lp_tx_ring_cnt > 0);
656 	mac_hwring_set_default(grp->lg_mh, port->lp_pseudo_tx_rings[0]);
657 	rw_exit(&grp->lg_tx_lock);
658 }
659 
660 /*
661  * Add a pseudo RX ring for the given HW ring handle.
662  */
663 static int
664 aggr_add_pseudo_rx_ring(aggr_port_t *port,
665     aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
666 {
667 	aggr_pseudo_rx_ring_t	*ring;
668 	int			err;
669 	int			j;
670 
671 	for (j = 0; j < MAX_RINGS_PER_GROUP; j++) {
672 		ring = rx_grp->arg_rings + j;
673 		if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE))
674 			break;
675 	}
676 
677 	/*
678 	 * No slot for this new RX ring.
679 	 */
680 	if (j == MAX_RINGS_PER_GROUP)
681 		return (ENOSPC);
682 
683 	ring->arr_flags |= MAC_PSEUDO_RING_INUSE;
684 	ring->arr_hw_rh = hw_rh;
685 	ring->arr_port = port;
686 	ring->arr_grp = rx_grp;
687 	rx_grp->arg_ring_cnt++;
688 
689 	/*
690 	 * The group is already registered, dynamically add a new ring to the
691 	 * mac group.
692 	 */
693 	if ((err = mac_group_add_ring(rx_grp->arg_gh, j)) != 0) {
694 		ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
695 		ring->arr_hw_rh = NULL;
696 		ring->arr_port = NULL;
697 		ring->arr_grp = NULL;
698 		rx_grp->arg_ring_cnt--;
699 	} else {
700 		/*
701 		 * This must run after the MAC is registered.
702 		 */
703 		ASSERT3P(ring->arr_rh, !=, NULL);
704 		mac_hwring_set_passthru(hw_rh, (mac_rx_t)aggr_recv_cb,
705 		    (void *)port, (mac_resource_handle_t)ring);
706 	}
707 	return (err);
708 }
709 
710 /*
711  * Remove the pseudo RX ring of the given HW ring handle.
712  */
713 static void
714 aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
715 {
716 	for (uint_t j = 0; j < MAX_RINGS_PER_GROUP; j++) {
717 		aggr_pseudo_rx_ring_t *ring = rx_grp->arg_rings + j;
718 
719 		if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE) ||
720 		    ring->arr_hw_rh != hw_rh) {
721 			continue;
722 		}
723 
724 		mac_group_rem_ring(rx_grp->arg_gh, ring->arr_rh);
725 
726 		ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
727 		ring->arr_hw_rh = NULL;
728 		ring->arr_port = NULL;
729 		ring->arr_grp = NULL;
730 		rx_grp->arg_ring_cnt--;
731 		mac_hwring_clear_passthru(hw_rh);
732 		break;
733 	}
734 }
735 
736 /*
737  * Create pseudo rings over the HW rings of the port.
738  *
739  * o Create a pseudo ring in rx_grp per HW ring in the port's HW group.
740  *
741  * o Program existing unicast filters on the pseudo group into the HW group.
742  *
743  * o Program existing VLAN filters on the pseudo group into the HW group.
744  */
745 static int
746 aggr_add_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
747 {
748 	mac_ring_handle_t	hw_rh[MAX_RINGS_PER_GROUP];
749 	aggr_unicst_addr_t	*addr, *a;
750 	mac_perim_handle_t	pmph;
751 	aggr_vlan_t		*avp;
752 	uint_t			hw_rh_cnt, i;
753 	int			err = 0;
754 	uint_t			g_idx = rx_grp->arg_index;
755 
756 	ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh));
757 	ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT);
758 	mac_perim_enter_by_mh(port->lp_mh, &pmph);
759 
760 	i = 0;
761 	addr = NULL;
762 	/*
763 	 * This function must be called after the aggr registers its
764 	 * MAC and its Rx groups have been initialized.
765 	 */
766 	ASSERT(rx_grp->arg_gh != NULL);
767 
768 	/*
769 	 * Get the list of the underlying HW rings.
770 	 */
771 	hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx,
772 	    &port->lp_hwghs[g_idx], hw_rh, MAC_RING_TYPE_RX);
773 
774 	/*
775 	 * Add existing VLAN and unicast address filters to the port.
776 	 */
777 	for (avp = list_head(&rx_grp->arg_vlans); avp != NULL;
778 	    avp = list_next(&rx_grp->arg_vlans, avp)) {
779 		if ((err = aggr_port_addvlan(port, g_idx, avp->av_vid)) != 0)
780 			goto err;
781 	}
782 
783 	for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) {
784 		if ((err = aggr_port_addmac(port, g_idx, addr->aua_addr)) != 0)
785 			goto err;
786 	}
787 
788 	for (i = 0; i < hw_rh_cnt; i++) {
789 		err = aggr_add_pseudo_rx_ring(port, rx_grp, hw_rh[i]);
790 		if (err != 0)
791 			goto err;
792 	}
793 
794 	mac_perim_exit(pmph);
795 	return (0);
796 
797 err:
798 	ASSERT(err != 0);
799 
800 	for (uint_t j = 0; j < i; j++)
801 		aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]);
802 
803 	for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next)
804 		aggr_port_remmac(port, g_idx, a->aua_addr);
805 
806 	if (avp != NULL)
807 		avp = list_prev(&rx_grp->arg_vlans, avp);
808 
809 	for (; avp != NULL; avp = list_prev(&rx_grp->arg_vlans, avp)) {
810 		int err2;
811 
812 		if ((err2 = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) {
813 			cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s"
814 			    ": errno %d.", avp->av_vid,
815 			    mac_client_name(port->lp_mch), err2);
816 		}
817 	}
818 
819 	port->lp_hwghs[g_idx] = NULL;
820 	mac_perim_exit(pmph);
821 	return (err);
822 }
823 
824 /*
825  * Destroy the pseudo rings mapping to this port and remove all VLAN
826  * and unicast filters from this port. Even if there are no underlying
827  * HW rings we must still remove the unicast filters to take the port
828  * out of promisc mode.
829  */
830 static void
831 aggr_rem_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
832 {
833 	mac_ring_handle_t	hw_rh[MAX_RINGS_PER_GROUP];
834 	aggr_unicst_addr_t	*addr;
835 	mac_perim_handle_t	pmph;
836 	uint_t			hw_rh_cnt;
837 	uint_t			g_idx = rx_grp->arg_index;
838 
839 	ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh));
840 	ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT);
841 	ASSERT3P(rx_grp->arg_gh, !=, NULL);
842 	mac_perim_enter_by_mh(port->lp_mh, &pmph);
843 
844 	hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx, NULL, hw_rh,
845 	    MAC_RING_TYPE_RX);
846 
847 	for (uint_t i = 0; i < hw_rh_cnt; i++)
848 		aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[i]);
849 
850 	for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next)
851 		aggr_port_remmac(port, g_idx, addr->aua_addr);
852 
853 	for (aggr_vlan_t *avp = list_head(&rx_grp->arg_vlans); avp != NULL;
854 	    avp = list_next(&rx_grp->arg_vlans, avp)) {
855 		int err;
856 
857 		if ((err = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) {
858 			cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s"
859 			    ": errno %d.", avp->av_vid,
860 			    mac_client_name(port->lp_mch), err);
861 		}
862 	}
863 
864 	port->lp_hwghs[g_idx] = NULL;
865 	mac_perim_exit(pmph);
866 }
867 
868 /*
869  * Add a pseudo TX ring for the given HW ring handle.
870  */
871 static int
872 aggr_add_pseudo_tx_ring(aggr_port_t *port,
873     aggr_pseudo_tx_group_t *tx_grp, mac_ring_handle_t hw_rh,
874     mac_ring_handle_t *pseudo_rh)
875 {
876 	aggr_pseudo_tx_ring_t	*ring;
877 	int			err;
878 	int			i;
879 
880 	ASSERT(MAC_PERIM_HELD(port->lp_mh));
881 	for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
882 		ring = tx_grp->atg_rings + i;
883 		if (!(ring->atr_flags & MAC_PSEUDO_RING_INUSE))
884 			break;
885 	}
886 	/*
887 	 * No slot for this new TX ring.
888 	 */
889 	if (i == MAX_RINGS_PER_GROUP)
890 		return (ENOSPC);
891 	/*
892 	 * The following 4 statements needs to be done before
893 	 * calling mac_group_add_ring(). Otherwise it will
894 	 * result in an assertion failure in mac_init_ring().
895 	 */
896 	ring->atr_flags |= MAC_PSEUDO_RING_INUSE;
897 	ring->atr_hw_rh = hw_rh;
898 	ring->atr_port = port;
899 	tx_grp->atg_ring_cnt++;
900 
901 	/*
902 	 * The TX side has no concept of ring groups unlike RX groups.
903 	 * There is just a single group which stores all the TX rings.
904 	 * This group will be used to store aggr's pseudo TX rings.
905 	 */
906 	if ((err = mac_group_add_ring(tx_grp->atg_gh, i)) != 0) {
907 		ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE;
908 		ring->atr_hw_rh = NULL;
909 		ring->atr_port = NULL;
910 		tx_grp->atg_ring_cnt--;
911 	} else {
912 		*pseudo_rh = mac_find_ring(tx_grp->atg_gh, i);
913 		if (hw_rh != NULL) {
914 			mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring,
915 			    mac_find_ring(tx_grp->atg_gh, i));
916 		}
917 	}
918 
919 	return (err);
920 }
921 
922 /*
923  * Remove the pseudo TX ring of the given HW ring handle.
924  */
925 static void
926 aggr_rem_pseudo_tx_ring(aggr_pseudo_tx_group_t *tx_grp,
927     mac_ring_handle_t pseudo_hw_rh)
928 {
929 	aggr_pseudo_tx_ring_t	*ring;
930 	int			i;
931 
932 	for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
933 		ring = tx_grp->atg_rings + i;
934 		if (ring->atr_rh != pseudo_hw_rh)
935 			continue;
936 
937 		ASSERT(ring->atr_flags & MAC_PSEUDO_RING_INUSE);
938 		mac_group_rem_ring(tx_grp->atg_gh, pseudo_hw_rh);
939 		ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE;
940 		mac_hwring_teardown(ring->atr_hw_rh);
941 		ring->atr_hw_rh = NULL;
942 		ring->atr_port = NULL;
943 		tx_grp->atg_ring_cnt--;
944 		break;
945 	}
946 }
947 
948 /*
949  * This function is called to create pseudo rings over hardware rings of
950  * the underlying device. There is a 1:1 mapping between the pseudo TX
951  * rings of the aggr and the hardware rings of the underlying port.
952  */
953 static int
954 aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp,
955     uint_t limit)
956 {
957 	aggr_grp_t		*grp = port->lp_grp;
958 	mac_ring_handle_t	hw_rh[MAX_RINGS_PER_GROUP], pseudo_rh;
959 	mac_perim_handle_t	pmph;
960 	int			hw_rh_cnt, i = 0, j;
961 	int			err = 0;
962 
963 	if (limit == 0)
964 		return (ENOSPC);
965 
966 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
967 	mac_perim_enter_by_mh(port->lp_mh, &pmph);
968 
969 	/*
970 	 * Get the list the the underlying HW rings.
971 	 */
972 	hw_rh_cnt = mac_hwrings_get(port->lp_mch, NULL, hw_rh,
973 	    MAC_RING_TYPE_TX);
974 
975 	/*
976 	 * Even if the underlying NIC does not have TX rings, we
977 	 * still make a psuedo TX ring for that NIC with NULL as
978 	 * the ring handle.
979 	 */
980 	if (hw_rh_cnt == 0)
981 		port->lp_tx_ring_cnt = 1;
982 	else
983 		port->lp_tx_ring_cnt = MIN(hw_rh_cnt, limit);
984 
985 	port->lp_tx_ring_alloc = port->lp_tx_ring_cnt;
986 	port->lp_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
987 	    port->lp_tx_ring_alloc), KM_SLEEP);
988 	port->lp_pseudo_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
989 	    port->lp_tx_ring_alloc), KM_SLEEP);
990 
991 	if (hw_rh_cnt == 0) {
992 		if ((err = aggr_add_pseudo_tx_ring(port, tx_grp,
993 		    NULL, &pseudo_rh)) == 0) {
994 			port->lp_tx_rings[0] = NULL;
995 			port->lp_pseudo_tx_rings[0] = pseudo_rh;
996 		}
997 	} else {
998 		for (i = 0; err == 0 && i < port->lp_tx_ring_cnt; i++) {
999 			err = aggr_add_pseudo_tx_ring(port,
1000 			    tx_grp, hw_rh[i], &pseudo_rh);
1001 			if (err != 0)
1002 				break;
1003 			port->lp_tx_rings[i] = hw_rh[i];
1004 			port->lp_pseudo_tx_rings[i] = pseudo_rh;
1005 		}
1006 	}
1007 
1008 	if (err != 0) {
1009 		if (hw_rh_cnt != 0) {
1010 			for (j = 0; j < i; j++) {
1011 				aggr_rem_pseudo_tx_ring(tx_grp,
1012 				    port->lp_pseudo_tx_rings[j]);
1013 			}
1014 		}
1015 		kmem_free(port->lp_tx_rings,
1016 		    (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_alloc));
1017 		kmem_free(port->lp_pseudo_tx_rings,
1018 		    (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_alloc));
1019 		port->lp_tx_ring_cnt = 0;
1020 		port->lp_tx_ring_alloc = 0;
1021 	} else {
1022 		port->lp_tx_grp_added = B_TRUE;
1023 		port->lp_tx_notify_mh = mac_client_tx_notify(port->lp_mch,
1024 		    aggr_tx_ring_update, port);
1025 	}
1026 	mac_perim_exit(pmph);
1027 	aggr_grp_update_default(grp);
1028 	return (err);
1029 }
1030 
1031 /*
1032  * This function is called by aggr to remove pseudo TX rings over the
1033  * HW rings of the underlying port.
1034  */
1035 static void
1036 aggr_rem_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp)
1037 {
1038 	aggr_grp_t		*grp = port->lp_grp;
1039 	mac_perim_handle_t	pmph;
1040 	int			i;
1041 
1042 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1043 	mac_perim_enter_by_mh(port->lp_mh, &pmph);
1044 
1045 	if (!port->lp_tx_grp_added)
1046 		goto done;
1047 
1048 	ASSERT(tx_grp->atg_gh != NULL);
1049 
1050 	for (i = 0; i < port->lp_tx_ring_cnt; i++)
1051 		aggr_rem_pseudo_tx_ring(tx_grp, port->lp_pseudo_tx_rings[i]);
1052 
1053 	kmem_free(port->lp_tx_rings,
1054 	    (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_alloc));
1055 	kmem_free(port->lp_pseudo_tx_rings,
1056 	    (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_alloc));
1057 
1058 	port->lp_tx_ring_cnt = 0;
1059 	(void) mac_client_tx_notify(port->lp_mch, NULL, port->lp_tx_notify_mh);
1060 	port->lp_tx_grp_added = B_FALSE;
1061 	aggr_grp_update_default(grp);
1062 done:
1063 	mac_perim_exit(pmph);
1064 }
1065 
1066 static int
1067 aggr_pseudo_disable_intr(mac_intr_handle_t ih)
1068 {
1069 	aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
1070 	return (mac_hwring_disable_intr(rr_ring->arr_hw_rh));
1071 }
1072 
1073 static int
1074 aggr_pseudo_enable_intr(mac_intr_handle_t ih)
1075 {
1076 	aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
1077 	return (mac_hwring_enable_intr(rr_ring->arr_hw_rh));
1078 }
1079 
1080 /*
1081  * Start the pseudo ring. Since the pseudo ring is just an abstraction
1082  * over an actual HW ring, the real task is to start the underlying HW
1083  * ring.
1084  */
1085 static int
1086 aggr_pseudo_start_rx_ring(mac_ring_driver_t arg, uint64_t mr_gen)
1087 {
1088 	int err;
1089 	aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
1090 
1091 	err = mac_hwring_start(rr_ring->arr_hw_rh);
1092 
1093 	if (err != 0)
1094 		return (err);
1095 
1096 	rr_ring->arr_gen = mr_gen;
1097 	return (err);
1098 }
1099 
1100 /*
1101  * Stop the pseudo ring. Since the pseudo ring is just an abstraction
1102  * over an actual HW ring, the real task is to stop the underlying HW
1103  * ring.
1104  */
1105 static void
1106 aggr_pseudo_stop_rx_ring(mac_ring_driver_t arg)
1107 {
1108 	aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
1109 
1110 	/*
1111 	 * The rings underlying the default group must stay up to
1112 	 * continue receiving LACP traffic. We would normally never
1113 	 * stop the default Rx rings because of the primary MAC
1114 	 * client; but aggr's primary MAC client doesn't call
1115 	 * mac_unicast_add() and thus mi_active is 0 when the last
1116 	 * non-primary client is deleted.
1117 	 */
1118 	if (rr_ring->arr_grp->arg_index != 0)
1119 		mac_hwring_stop(rr_ring->arr_hw_rh);
1120 }
1121 
1122 /*
1123  * Trim each port in a group to ensure it uses no more than tx_ring_limit
1124  * rings.
1125  */
1126 static void
1127 aggr_grp_balance_tx(aggr_grp_t *grp, uint_t tx_ring_limit)
1128 {
1129 	aggr_port_t *port;
1130 	mac_perim_handle_t mph;
1131 	uint_t i, tx_ring_cnt;
1132 
1133 	ASSERT(tx_ring_limit > 0);
1134 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1135 
1136 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1137 		mac_perim_enter_by_mh(port->lp_mh, &mph);
1138 
1139 		/*
1140 		 * Reduce the Tx ring count first to prevent rings being
1141 		 * used as they are removed.
1142 		 */
1143 		rw_enter(&grp->lg_tx_lock, RW_WRITER);
1144 		if (port->lp_tx_ring_cnt <= tx_ring_limit) {
1145 			rw_exit(&grp->lg_tx_lock);
1146 			mac_perim_exit(mph);
1147 			continue;
1148 		}
1149 
1150 		tx_ring_cnt = port->lp_tx_ring_cnt;
1151 		port->lp_tx_ring_cnt = tx_ring_limit;
1152 		rw_exit(&grp->lg_tx_lock);
1153 
1154 		for (i = tx_ring_cnt - 1; i >= tx_ring_limit; i--) {
1155 			aggr_rem_pseudo_tx_ring(&grp->lg_tx_group,
1156 			    port->lp_pseudo_tx_rings[i]);
1157 
1158 		}
1159 
1160 		mac_perim_exit(mph);
1161 	}
1162 }
1163 
1164 /*
1165  * Add one or more ports to an existing link aggregation group.
1166  */
1167 int
1168 aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force,
1169     laioc_port_t *ports)
1170 {
1171 	int rc;
1172 	uint_t port_added = 0;
1173 	uint_t grp_added;
1174 	uint_t nports_high, tx_ring_limit;
1175 	aggr_grp_t *grp = NULL;
1176 	aggr_port_t *port;
1177 	boolean_t link_state_changed = B_FALSE;
1178 	mac_perim_handle_t mph, pmph;
1179 
1180 	/* Get the aggr corresponding to linkid. */
1181 	rw_enter(&aggr_grp_lock, RW_READER);
1182 	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1183 	    (mod_hash_val_t *)&grp) != 0) {
1184 		rw_exit(&aggr_grp_lock);
1185 		return (ENOENT);
1186 	}
1187 	AGGR_GRP_REFHOLD(grp);
1188 
1189 	/*
1190 	 * Hold the perimeter so that the aggregation can't be destroyed.
1191 	 */
1192 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1193 	rw_exit(&aggr_grp_lock);
1194 
1195 	/*
1196 	 * Limit the number of Tx rings per port. When determining the
1197 	 * number of ports take into consideration the existing high
1198 	 * value, and what the new high value may be after this request.
1199 	 */
1200 	nports_high = MAX(grp->lg_nports_high, grp->lg_nports + nports);
1201 	tx_ring_limit = MAX_RINGS_PER_GROUP / nports_high;
1202 
1203 	if (tx_ring_limit == 0) {
1204 		rc = ENOSPC;
1205 		goto bail;
1206 	}
1207 
1208 	/*
1209 	 * Balance the Tx rings so each port has a fair share of rings.
1210 	 */
1211 	aggr_grp_balance_tx(grp, tx_ring_limit);
1212 
1213 	/* Add the specified ports to the aggr. */
1214 	for (uint_t i = 0; i < nports; i++) {
1215 		grp_added = 0;
1216 
1217 		if ((rc = aggr_grp_add_port(grp, ports[i].lp_linkid,
1218 		    force, &port)) != 0) {
1219 			goto bail;
1220 		}
1221 
1222 		ASSERT(port != NULL);
1223 		port_added++;
1224 
1225 		/* check capabilities */
1226 		if (!aggr_grp_capab_check(grp, port) ||
1227 		    !aggr_grp_sdu_check(grp, port) ||
1228 		    !aggr_grp_margin_check(grp, port)) {
1229 			rc = ENOTSUP;
1230 			goto bail;
1231 		}
1232 
1233 		/*
1234 		 * Create the pseudo ring for each HW ring of the underlying
1235 		 * port.
1236 		 */
1237 		rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group,
1238 		    tx_ring_limit);
1239 		if (rc != 0)
1240 			goto bail;
1241 
1242 		for (uint_t j = 0; j < grp->lg_rx_group_count; j++) {
1243 			rc = aggr_add_pseudo_rx_group(port,
1244 			    &grp->lg_rx_groups[j]);
1245 
1246 			if (rc != 0)
1247 				goto bail;
1248 
1249 			grp_added++;
1250 		}
1251 
1252 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
1253 
1254 		/* set LACP mode */
1255 		aggr_port_lacp_set_mode(grp, port);
1256 
1257 		/* start port if group has already been started */
1258 		if (grp->lg_started) {
1259 			rc = aggr_port_start(port);
1260 			if (rc != 0) {
1261 				mac_perim_exit(pmph);
1262 				goto bail;
1263 			}
1264 
1265 			/*
1266 			 * Turn on the promiscuous mode over the port when it
1267 			 * is requested to be turned on to receive the
1268 			 * non-primary address over a port, or the promiscuous
1269 			 * mode is enabled over the aggr.
1270 			 */
1271 			if (grp->lg_promisc || port->lp_prom_addr != NULL) {
1272 				rc = aggr_port_promisc(port, B_TRUE);
1273 				if (rc != 0) {
1274 					mac_perim_exit(pmph);
1275 					goto bail;
1276 				}
1277 			}
1278 		}
1279 		mac_perim_exit(pmph);
1280 
1281 		/*
1282 		 * Attach each port if necessary.
1283 		 */
1284 		if (aggr_port_notify_link(grp, port))
1285 			link_state_changed = B_TRUE;
1286 
1287 		/*
1288 		 * Initialize the callback functions for this port.
1289 		 */
1290 		aggr_port_init_callbacks(port);
1291 	}
1292 
1293 	/* update the MAC address of the constituent ports */
1294 	if (aggr_grp_update_ports_mac(grp))
1295 		link_state_changed = B_TRUE;
1296 
1297 	if (link_state_changed)
1298 		mac_link_update(grp->lg_mh, grp->lg_link_state);
1299 
1300 bail:
1301 	if (rc != 0) {
1302 		/* stop and remove ports that have been added */
1303 		for (uint_t i = 0; i < port_added; i++) {
1304 			uint_t grp_remove;
1305 
1306 			port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1307 			ASSERT(port != NULL);
1308 
1309 			if (grp->lg_started) {
1310 				mac_perim_enter_by_mh(port->lp_mh, &pmph);
1311 				(void) aggr_port_promisc(port, B_FALSE);
1312 				aggr_port_stop(port);
1313 				mac_perim_exit(pmph);
1314 			}
1315 
1316 			aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1317 
1318 			/*
1319 			 * Only the last port could have a partial set
1320 			 * of groups added.
1321 			 */
1322 			grp_remove = (i + 1 == port_added) ? grp_added :
1323 			    grp->lg_rx_group_count;
1324 
1325 			for (uint_t j = 0; j < grp_remove; j++) {
1326 				aggr_rem_pseudo_rx_group(port,
1327 				    &grp->lg_rx_groups[j]);
1328 			}
1329 
1330 			(void) aggr_grp_rem_port(grp, port, NULL, NULL);
1331 		}
1332 	}
1333 
1334 	mac_perim_exit(mph);
1335 	AGGR_GRP_REFRELE(grp);
1336 	return (rc);
1337 }
1338 
1339 static int
1340 aggr_grp_modify_common(aggr_grp_t *grp, uint8_t update_mask, uint32_t policy,
1341     boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode,
1342     aggr_lacp_timer_t lacp_timer)
1343 {
1344 	boolean_t mac_addr_changed = B_FALSE;
1345 	boolean_t link_state_changed = B_FALSE;
1346 	mac_perim_handle_t pmph;
1347 
1348 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1349 
1350 	/* validate fixed address if specified */
1351 	if ((update_mask & AGGR_MODIFY_MAC) && mac_fixed &&
1352 	    ((bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) ||
1353 	    (mac_addr[0] & 0x01))) {
1354 		return (EINVAL);
1355 	}
1356 
1357 	/* update policy if requested */
1358 	if (update_mask & AGGR_MODIFY_POLICY)
1359 		aggr_send_update_policy(grp, policy);
1360 
1361 	/* update unicast MAC address if requested */
1362 	if (update_mask & AGGR_MODIFY_MAC) {
1363 		if (mac_fixed) {
1364 			/* user-supplied MAC address */
1365 			grp->lg_mac_addr_port = NULL;
1366 			if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) != 0) {
1367 				bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
1368 				mac_addr_changed = B_TRUE;
1369 			}
1370 		} else if (grp->lg_addr_fixed) {
1371 			/* switch from user-supplied to automatic */
1372 			aggr_port_t *port = grp->lg_ports;
1373 
1374 			mac_perim_enter_by_mh(port->lp_mh, &pmph);
1375 			bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL);
1376 			grp->lg_mac_addr_port = port;
1377 			mac_addr_changed = B_TRUE;
1378 			mac_perim_exit(pmph);
1379 		}
1380 		grp->lg_addr_fixed = mac_fixed;
1381 	}
1382 
1383 	if (mac_addr_changed)
1384 		link_state_changed = aggr_grp_update_ports_mac(grp);
1385 
1386 	if (update_mask & AGGR_MODIFY_LACP_MODE)
1387 		aggr_lacp_update_mode(grp, lacp_mode);
1388 
1389 	if (update_mask & AGGR_MODIFY_LACP_TIMER)
1390 		aggr_lacp_update_timer(grp, lacp_timer);
1391 
1392 	if (link_state_changed)
1393 		mac_link_update(grp->lg_mh, grp->lg_link_state);
1394 
1395 	if (mac_addr_changed)
1396 		mac_unicst_update(grp->lg_mh, grp->lg_addr);
1397 
1398 	return (0);
1399 }
1400 
1401 /*
1402  * Update properties of an existing link aggregation group.
1403  */
1404 int
1405 aggr_grp_modify(datalink_id_t linkid, uint8_t update_mask, uint32_t policy,
1406     boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode,
1407     aggr_lacp_timer_t lacp_timer)
1408 {
1409 	aggr_grp_t *grp = NULL;
1410 	mac_perim_handle_t mph;
1411 	int err;
1412 
1413 	/* get group corresponding to linkid */
1414 	rw_enter(&aggr_grp_lock, RW_READER);
1415 	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1416 	    (mod_hash_val_t *)&grp) != 0) {
1417 		rw_exit(&aggr_grp_lock);
1418 		return (ENOENT);
1419 	}
1420 	AGGR_GRP_REFHOLD(grp);
1421 
1422 	/*
1423 	 * Hold the perimeter so that the aggregation won't be destroyed.
1424 	 */
1425 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1426 	rw_exit(&aggr_grp_lock);
1427 
1428 	err = aggr_grp_modify_common(grp, update_mask, policy, mac_fixed,
1429 	    mac_addr, lacp_mode, lacp_timer);
1430 
1431 	mac_perim_exit(mph);
1432 	AGGR_GRP_REFRELE(grp);
1433 	return (err);
1434 }
1435 
1436 /*
1437  * Create a new link aggregation group upon request from administrator.
1438  * Returns 0 on success, an errno on failure.
1439  */
1440 int
1441 aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports,
1442     laioc_port_t *ports, uint32_t policy, boolean_t mac_fixed, boolean_t force,
1443     uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, aggr_lacp_timer_t lacp_timer,
1444     cred_t *credp)
1445 {
1446 	aggr_grp_t *grp = NULL;
1447 	aggr_port_t *port;
1448 	aggr_port_t *last_attached = NULL;
1449 	mac_register_t *mac;
1450 	boolean_t link_state_changed;
1451 	mac_perim_handle_t mph, pmph;
1452 	datalink_id_t tempid;
1453 	boolean_t mac_registered = B_FALSE;
1454 	uint_t tx_ring_limit;
1455 	int err;
1456 	int i, j;
1457 	kt_did_t tid = 0;
1458 
1459 	/* need at least one port */
1460 	if (nports == 0)
1461 		return (EINVAL);
1462 
1463 	rw_enter(&aggr_grp_lock, RW_WRITER);
1464 
1465 	/* does a group with the same linkid already exist? */
1466 	err = mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1467 	    (mod_hash_val_t *)&grp);
1468 	if (err == 0) {
1469 		rw_exit(&aggr_grp_lock);
1470 		return (EEXIST);
1471 	}
1472 
1473 	grp = kmem_cache_alloc(aggr_grp_cache, KM_SLEEP);
1474 
1475 	grp->lg_refs = 1;
1476 	grp->lg_closing = B_FALSE;
1477 	grp->lg_force = force;
1478 	grp->lg_linkid = linkid;
1479 	grp->lg_zoneid = crgetzoneid(credp);
1480 	grp->lg_ifspeed = 0;
1481 	grp->lg_link_state = LINK_STATE_UNKNOWN;
1482 	grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
1483 	grp->lg_started = B_FALSE;
1484 	grp->lg_promisc = B_FALSE;
1485 	grp->lg_lacp_done = B_FALSE;
1486 	grp->lg_tx_notify_done = B_FALSE;
1487 	grp->lg_lacp_head = grp->lg_lacp_tail = NULL;
1488 	grp->lg_lacp_rx_thread = thread_create(NULL, 0,
1489 	    aggr_lacp_rx_thread, grp, 0, &p0, TS_RUN, minclsyspri);
1490 	grp->lg_tx_notify_thread = thread_create(NULL, 0,
1491 	    aggr_tx_notify_thread, grp, 0, &p0, TS_RUN, minclsyspri);
1492 	grp->lg_tx_blocked_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
1493 	    MAX_RINGS_PER_GROUP), KM_SLEEP);
1494 	grp->lg_tx_blocked_cnt = 0;
1495 	bzero(&grp->lg_rx_groups,
1496 	    sizeof (aggr_pseudo_rx_group_t) * MAX_GROUPS_PER_PORT);
1497 	bzero(&grp->lg_tx_group, sizeof (aggr_pseudo_tx_group_t));
1498 	aggr_lacp_init_grp(grp);
1499 
1500 	/* add MAC ports to group */
1501 	grp->lg_ports = NULL;
1502 	grp->lg_nports = 0;
1503 	grp->lg_nattached_ports = 0;
1504 	grp->lg_ntx_ports = 0;
1505 
1506 	/*
1507 	 * If key is not specified by the user, allocate the key.
1508 	 */
1509 	if ((key == 0) && ((key = (uint32_t)id_alloc(key_ids)) == 0)) {
1510 		err = ENOMEM;
1511 		goto bail;
1512 	}
1513 	grp->lg_key = key;
1514 
1515 	for (i = 0; i < nports; i++) {
1516 		err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, &port);
1517 		if (err != 0)
1518 			goto bail;
1519 	}
1520 
1521 	grp->lg_rx_group_count = 1;
1522 
1523 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1524 		uint_t num_rgroups;
1525 
1526 		mac_perim_enter_by_mh(port->lp_mh, &mph);
1527 		num_rgroups = mac_get_num_rx_groups(port->lp_mh);
1528 		mac_perim_exit(mph);
1529 
1530 		/*
1531 		 * Utilize all the groups in a port. If some ports
1532 		 * have less groups than others, then traffic destined
1533 		 * for the same unicast address may be HW classified
1534 		 * on some ports but SW classified by aggr when
1535 		 * arriving on other ports.
1536 		 */
1537 		grp->lg_rx_group_count = MAX(grp->lg_rx_group_count,
1538 		    num_rgroups);
1539 	}
1540 
1541 	/*
1542 	 * There could be cases where the hardware provides more
1543 	 * groups than aggr can support. Make sure we never go above
1544 	 * the max aggr can support.
1545 	 */
1546 	grp->lg_rx_group_count = MIN(grp->lg_rx_group_count,
1547 	    MAX_GROUPS_PER_PORT);
1548 
1549 	ASSERT3U(grp->lg_rx_group_count, >, 0);
1550 	for (i = 0; i < MAX_GROUPS_PER_PORT; i++) {
1551 		grp->lg_rx_groups[i].arg_index = i;
1552 		grp->lg_rx_groups[i].arg_untagged = 0;
1553 		list_create(&(grp->lg_rx_groups[i].arg_vlans),
1554 		    sizeof (aggr_vlan_t), offsetof(aggr_vlan_t, av_link));
1555 	}
1556 
1557 	/*
1558 	 * If no explicit MAC address was specified by the administrator,
1559 	 * set it to the MAC address of the first port.
1560 	 */
1561 	grp->lg_addr_fixed = mac_fixed;
1562 	if (grp->lg_addr_fixed) {
1563 		/* validate specified address */
1564 		if (bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) {
1565 			err = EINVAL;
1566 			goto bail;
1567 		}
1568 		bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
1569 	} else {
1570 		bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
1571 		grp->lg_mac_addr_port = grp->lg_ports;
1572 	}
1573 
1574 	/* Set the initial group capabilities. */
1575 	aggr_grp_capab_set(grp);
1576 
1577 	if ((mac = mac_alloc(MAC_VERSION)) == NULL) {
1578 		err = ENOMEM;
1579 		goto bail;
1580 	}
1581 	mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1582 	mac->m_driver = grp;
1583 	mac->m_dip = aggr_dip;
1584 	mac->m_instance = grp->lg_key > AGGR_MAX_KEY ? (uint_t)-1 : grp->lg_key;
1585 	mac->m_src_addr = grp->lg_addr;
1586 	mac->m_callbacks = &aggr_m_callbacks;
1587 	mac->m_min_sdu = 0;
1588 	mac->m_max_sdu = grp->lg_max_sdu = aggr_grp_max_sdu(grp);
1589 	mac->m_margin = aggr_grp_max_margin(grp);
1590 	mac->m_v12n = MAC_VIRT_LEVEL1;
1591 	err = mac_register(mac, &grp->lg_mh);
1592 	mac_free(mac);
1593 	if (err != 0)
1594 		goto bail;
1595 
1596 	err = dls_devnet_create(grp->lg_mh, grp->lg_linkid, crgetzoneid(credp));
1597 	if (err != 0) {
1598 		(void) mac_unregister(grp->lg_mh);
1599 		grp->lg_mh = NULL;
1600 		goto bail;
1601 	}
1602 
1603 	mac_registered = B_TRUE;
1604 
1605 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1606 
1607 	/*
1608 	 * Update the MAC address of the constituent ports.
1609 	 * None of the port is attached at this time, the link state of the
1610 	 * aggregation will not change.
1611 	 *
1612 	 * All ports take on the primary MAC address of the aggr
1613 	 * (lg_aggr). At this point, none of the ports are attached;
1614 	 * thus the link state of the aggregation will not change.
1615 	 */
1616 	link_state_changed = aggr_grp_update_ports_mac(grp);
1617 	ASSERT(!link_state_changed);
1618 
1619 	/* Update outbound load balancing policy. */
1620 	aggr_send_update_policy(grp, policy);
1621 
1622 	/* Set LACP mode. */
1623 	aggr_lacp_set_mode(grp, lacp_mode, lacp_timer);
1624 
1625 	/*
1626 	 * The pseudo Tx group holds a maximum of MAX_RINGS_PER_GROUP
1627 	 * rings, when all the Tx rings of all the ports are accumulated
1628 	 * it is conceivable this limit is exceeded. We try and prevent
1629 	 * this by limiting the number of rings an individual port will use.
1630 	 *
1631 	 * - When an aggr is first created, we will not let an
1632 	 *   individual port use more than MAX_RINGS_PER_GROUP/nports
1633 	 *   rings.
1634 	 * - As ports are added to an existing aggr, each of the
1635 	 *   ports will not use more than MAX_RINGS_PER_GROUP/nports_high.
1636 	 *   Where nports_high is the highest number of ports the aggr has
1637 	 *   held (including any ports being added). This may involve
1638 	 *   trimming rings from existing ports.
1639 	 */
1640 
1641 	/* Leave room for 4 ports */
1642 	tx_ring_limit = MAX_RINGS_PER_GROUP / MAX(4, nports);
1643 
1644 	/*
1645 	 * Attach each port if necessary.
1646 	 */
1647 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1648 		/*
1649 		 * Create the pseudo ring for each HW ring of the
1650 		 * underlying port. Note that this is done after the
1651 		 * aggr registers its MAC.
1652 		 */
1653 		err = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group,
1654 		    tx_ring_limit);
1655 
1656 		if (err != 0) {
1657 			mac_perim_exit(mph);
1658 			goto bail;
1659 		}
1660 
1661 		for (i = 0; i < grp->lg_rx_group_count; i++) {
1662 			err = aggr_add_pseudo_rx_group(port,
1663 			    &grp->lg_rx_groups[i]);
1664 
1665 			if (err != 0) {
1666 				/*
1667 				 * Undo what we have added for the current
1668 				 * port.
1669 				 */
1670 				aggr_rem_pseudo_tx_group(port,
1671 				    &grp->lg_tx_group);
1672 
1673 				for (j = 0; j < i; j++) {
1674 					aggr_rem_pseudo_rx_group(port,
1675 					    &grp->lg_rx_groups[j]);
1676 				}
1677 
1678 				mac_perim_exit(mph);
1679 				goto bail;
1680 			}
1681 		}
1682 
1683 		if (aggr_port_notify_link(grp, port))
1684 			link_state_changed = B_TRUE;
1685 
1686 		/*
1687 		 * Initialize the callback functions for this port.
1688 		 */
1689 		aggr_port_init_callbacks(port);
1690 
1691 		last_attached = port;
1692 	}
1693 
1694 	if (link_state_changed)
1695 		mac_link_update(grp->lg_mh, grp->lg_link_state);
1696 
1697 	/* add new group to hash table */
1698 	err = mod_hash_insert(aggr_grp_hash, GRP_HASH_KEY(linkid),
1699 	    (mod_hash_val_t)grp);
1700 	ASSERT(err == 0);
1701 	aggr_grp_cnt++;
1702 
1703 	mac_perim_exit(mph);
1704 	rw_exit(&aggr_grp_lock);
1705 	return (0);
1706 
1707 bail:
1708 	grp->lg_closing = B_TRUE;
1709 
1710 	/*
1711 	 * Inform the lacp_rx thread to exit.
1712 	 */
1713 	mutex_enter(&grp->lg_lacp_lock);
1714 	grp->lg_lacp_done = B_TRUE;
1715 	cv_signal(&grp->lg_lacp_cv);
1716 	while (grp->lg_lacp_rx_thread != NULL)
1717 		cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock);
1718 	mutex_exit(&grp->lg_lacp_lock);
1719 	/*
1720 	 * Inform the tx_notify thread to exit.
1721 	 */
1722 	mutex_enter(&grp->lg_tx_flowctl_lock);
1723 	if (grp->lg_tx_notify_thread != NULL) {
1724 		tid = grp->lg_tx_notify_thread->t_did;
1725 		grp->lg_tx_notify_done = B_TRUE;
1726 		cv_signal(&grp->lg_tx_flowctl_cv);
1727 	}
1728 	mutex_exit(&grp->lg_tx_flowctl_lock);
1729 	if (tid != 0)
1730 		thread_join(tid);
1731 
1732 	if (mac_registered) {
1733 		(void) dls_devnet_destroy(grp->lg_mh, &tempid, B_TRUE);
1734 		(void) mac_disable(grp->lg_mh);
1735 
1736 		if (last_attached != NULL) {
1737 			/*
1738 			 * Detach and clean up ports added.
1739 			 */
1740 			mac_perim_enter_by_mh(grp->lg_mh, &mph);
1741 
1742 			for (port = grp->lg_ports; ; port = port->lp_next) {
1743 				mac_perim_enter_by_mh(port->lp_mh, &pmph);
1744 				(void) aggr_grp_detach_port(grp, port);
1745 				mac_perim_exit(pmph);
1746 
1747 				aggr_rem_pseudo_tx_group(port,
1748 				    &grp->lg_tx_group);
1749 
1750 				for (i = 0; i < grp->lg_rx_group_count; i++) {
1751 					aggr_rem_pseudo_rx_group(port,
1752 					    &grp->lg_rx_groups[i]);
1753 				}
1754 				if (port == last_attached)
1755 					break;
1756 			}
1757 
1758 			mac_perim_exit(mph);
1759 		}
1760 
1761 		(void) mac_unregister(grp->lg_mh);
1762 	}
1763 
1764 	port = grp->lg_ports;
1765 	while (port != NULL) {
1766 		aggr_port_t *cport;
1767 
1768 		cport = port->lp_next;
1769 		aggr_port_delete(port);
1770 		port = cport;
1771 	}
1772 
1773 	kmem_free(grp->lg_tx_blocked_rings,
1774 	    (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP));
1775 	rw_exit(&aggr_grp_lock);
1776 	AGGR_GRP_REFRELE(grp);
1777 	return (err);
1778 }
1779 
1780 /*
1781  * Return a pointer to the member of a group with specified linkid.
1782  */
1783 static aggr_port_t *
1784 aggr_grp_port_lookup(aggr_grp_t *grp, datalink_id_t linkid)
1785 {
1786 	aggr_port_t *port;
1787 
1788 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1789 
1790 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1791 		if (port->lp_linkid == linkid)
1792 			break;
1793 	}
1794 
1795 	return (port);
1796 }
1797 
1798 /*
1799  * Stop, detach and remove a port from a link aggregation group.
1800  */
1801 static int
1802 aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port,
1803     boolean_t *mac_addr_changedp, boolean_t *link_state_changedp)
1804 {
1805 	int rc = 0;
1806 	aggr_port_t **pport;
1807 	boolean_t mac_addr_changed = B_FALSE;
1808 	boolean_t link_state_changed = B_FALSE;
1809 	mac_perim_handle_t mph;
1810 	uint64_t val;
1811 	uint_t i;
1812 	uint_t stat;
1813 
1814 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1815 	ASSERT(grp->lg_nports > 1);
1816 	ASSERT(!grp->lg_closing);
1817 
1818 	/* unlink port */
1819 	for (pport = &grp->lg_ports; *pport != port;
1820 	    pport = &(*pport)->lp_next) {
1821 		if (*pport == NULL) {
1822 			rc = ENOENT;
1823 			goto done;
1824 		}
1825 	}
1826 	*pport = port->lp_next;
1827 
1828 	mac_perim_enter_by_mh(port->lp_mh, &mph);
1829 
1830 	/*
1831 	 * If the MAC address of the port being removed was assigned
1832 	 * to the group, update the group MAC address
1833 	 * using the MAC address of a different port.
1834 	 */
1835 	if (!grp->lg_addr_fixed && grp->lg_mac_addr_port == port) {
1836 		/*
1837 		 * Set the MAC address of the group to the
1838 		 * MAC address of its first port.
1839 		 */
1840 		bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
1841 		grp->lg_mac_addr_port = grp->lg_ports;
1842 		mac_addr_changed = B_TRUE;
1843 	}
1844 
1845 	link_state_changed = aggr_grp_detach_port(grp, port);
1846 
1847 	/*
1848 	 * Add the counter statistics of the ports while it was aggregated
1849 	 * to the group's residual statistics.  This is done by obtaining
1850 	 * the current counter from the underlying MAC then subtracting the
1851 	 * value of the counter at the moment it was added to the
1852 	 * aggregation.
1853 	 */
1854 	for (i = 0; i < MAC_NSTAT; i++) {
1855 		stat = i + MAC_STAT_MIN;
1856 		if (!MAC_STAT_ISACOUNTER(stat))
1857 			continue;
1858 		val = aggr_port_stat(port, stat);
1859 		val -= port->lp_stat[i];
1860 		mutex_enter(&grp->lg_stat_lock);
1861 		grp->lg_stat[i] += val;
1862 		mutex_exit(&grp->lg_stat_lock);
1863 	}
1864 	for (i = 0; i < ETHER_NSTAT; i++) {
1865 		stat = i + MACTYPE_STAT_MIN;
1866 		if (!ETHER_STAT_ISACOUNTER(stat))
1867 			continue;
1868 		val = aggr_port_stat(port, stat);
1869 		val -= port->lp_ether_stat[i];
1870 		mutex_enter(&grp->lg_stat_lock);
1871 		grp->lg_ether_stat[i] += val;
1872 		mutex_exit(&grp->lg_stat_lock);
1873 	}
1874 
1875 	grp->lg_nports--;
1876 	mac_perim_exit(mph);
1877 
1878 	aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1879 	aggr_port_delete(port);
1880 
1881 	/*
1882 	 * If the group MAC address has changed, update the MAC address of
1883 	 * the remaining constituent ports according to the new MAC
1884 	 * address of the group.
1885 	 */
1886 	if (mac_addr_changed && aggr_grp_update_ports_mac(grp))
1887 		link_state_changed = B_TRUE;
1888 
1889 done:
1890 	if (mac_addr_changedp != NULL)
1891 		*mac_addr_changedp = mac_addr_changed;
1892 	if (link_state_changedp != NULL)
1893 		*link_state_changedp = link_state_changed;
1894 
1895 	return (rc);
1896 }
1897 
1898 /*
1899  * Remove one or more ports from an existing link aggregation group.
1900  */
1901 int
1902 aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports)
1903 {
1904 	int rc = 0;
1905 	uint_t i;
1906 	aggr_grp_t *grp = NULL;
1907 	aggr_port_t *port;
1908 	boolean_t mac_addr_update = B_FALSE, mac_addr_changed;
1909 	boolean_t link_state_update = B_FALSE, link_state_changed;
1910 	mac_perim_handle_t mph, pmph;
1911 
1912 	/* get group corresponding to linkid */
1913 	rw_enter(&aggr_grp_lock, RW_READER);
1914 	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1915 	    (mod_hash_val_t *)&grp) != 0) {
1916 		rw_exit(&aggr_grp_lock);
1917 		return (ENOENT);
1918 	}
1919 	AGGR_GRP_REFHOLD(grp);
1920 
1921 	/*
1922 	 * Hold the perimeter so that the aggregation won't be destroyed.
1923 	 */
1924 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1925 	rw_exit(&aggr_grp_lock);
1926 
1927 	/* we need to keep at least one port per group */
1928 	if (nports >= grp->lg_nports) {
1929 		rc = EINVAL;
1930 		goto bail;
1931 	}
1932 
1933 	/* first verify that all the groups are valid */
1934 	for (i = 0; i < nports; i++) {
1935 		if (aggr_grp_port_lookup(grp, ports[i].lp_linkid) == NULL) {
1936 			/* port not found */
1937 			rc = ENOENT;
1938 			goto bail;
1939 		}
1940 	}
1941 
1942 	/* clear the promiscous mode for the specified ports */
1943 	for (i = 0; i < nports && rc == 0; i++) {
1944 		/* lookup port */
1945 		port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1946 		ASSERT(port != NULL);
1947 
1948 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
1949 		rc = aggr_port_promisc(port, B_FALSE);
1950 		mac_perim_exit(pmph);
1951 	}
1952 	if (rc != 0) {
1953 		for (i = 0; i < nports; i++) {
1954 			port = aggr_grp_port_lookup(grp,
1955 			    ports[i].lp_linkid);
1956 			ASSERT(port != NULL);
1957 
1958 			/*
1959 			 * Turn the promiscuous mode back on if it is required
1960 			 * to receive the non-primary address over a port, or
1961 			 * the promiscous mode is enabled over the aggr.
1962 			 */
1963 			mac_perim_enter_by_mh(port->lp_mh, &pmph);
1964 			if (port->lp_started && (grp->lg_promisc ||
1965 			    port->lp_prom_addr != NULL)) {
1966 				(void) aggr_port_promisc(port, B_TRUE);
1967 			}
1968 			mac_perim_exit(pmph);
1969 		}
1970 		goto bail;
1971 	}
1972 
1973 	/* remove the specified ports from group */
1974 	for (i = 0; i < nports; i++) {
1975 		/* lookup port */
1976 		port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1977 		ASSERT(port != NULL);
1978 
1979 		/* stop port if group has already been started */
1980 		if (grp->lg_started) {
1981 			mac_perim_enter_by_mh(port->lp_mh, &pmph);
1982 			aggr_port_stop(port);
1983 			mac_perim_exit(pmph);
1984 		}
1985 
1986 		/*
1987 		 * aggr_rem_pseudo_tx_group() is not called here. Instead
1988 		 * it is called from inside aggr_grp_rem_port() after the
1989 		 * port has been detached. The reason is that
1990 		 * aggr_rem_pseudo_tx_group() removes one ring at a time
1991 		 * and if there is still traffic going on, then there
1992 		 * is the possibility of aggr_find_tx_ring() returning a
1993 		 * removed ring for transmission. Once the port has been
1994 		 * detached, that port will not be used and
1995 		 * aggr_find_tx_ring() will not return any rings
1996 		 * belonging to it.
1997 		 */
1998 		for (uint_t j = 0; j < grp->lg_rx_group_count; j++)
1999 			aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[j]);
2000 
2001 		/* remove port from group */
2002 		rc = aggr_grp_rem_port(grp, port, &mac_addr_changed,
2003 		    &link_state_changed);
2004 		ASSERT(rc == 0);
2005 		mac_addr_update = mac_addr_update || mac_addr_changed;
2006 		link_state_update = link_state_update || link_state_changed;
2007 	}
2008 
2009 bail:
2010 	if (mac_addr_update)
2011 		mac_unicst_update(grp->lg_mh, grp->lg_addr);
2012 	if (link_state_update)
2013 		mac_link_update(grp->lg_mh, grp->lg_link_state);
2014 
2015 	mac_perim_exit(mph);
2016 	AGGR_GRP_REFRELE(grp);
2017 
2018 	return (rc);
2019 }
2020 
/*
 * Delete the link aggregation group identified by linkid.
 *
 * The group is looked up in aggr_grp_hash with aggr_grp_lock held as
 * writer. Its devnet is destroyed and the MAC is disabled before the
 * group is removed from the hash; the group's helper threads are then
 * told to exit, every port is stopped/detached/deleted, and finally the
 * MAC is unregistered and the group reference is dropped.
 *
 * cred is only used to obtain the zone ID needed to re-create the devnet
 * when mac_disable() fails.
 *
 * Returns 0 on success, ENOENT if no group is hashed under linkid, or the
 * error returned by dls_devnet_destroy() or mac_disable().
 */
int
aggr_grp_delete(datalink_id_t linkid, cred_t *cred)
{
	aggr_grp_t *grp = NULL;
	aggr_port_t *port, *cport;
	datalink_id_t tmpid;
	mod_hash_val_t val;
	mac_perim_handle_t mph, pmph;
	int err;
	kt_did_t tid = 0;

	rw_enter(&aggr_grp_lock, RW_WRITER);

	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
	    (mod_hash_val_t *)&grp) != 0) {
		rw_exit(&aggr_grp_lock);
		return (ENOENT);
	}

	/*
	 * Note that dls_devnet_destroy() must be called before lg_lock is
	 * held. Otherwise, it will deadlock if another thread is in
	 * aggr_m_stat() and thus has a kstat_hold() on the kstats that
	 * dls_devnet_destroy() needs to delete.
	 */
	if ((err = dls_devnet_destroy(grp->lg_mh, &tmpid, B_TRUE)) != 0) {
		rw_exit(&aggr_grp_lock);
		return (err);
	}
	ASSERT(linkid == tmpid);

	/*
	 * Unregister from the MAC service module. Since this can
	 * fail if a client hasn't closed the MAC port, we gracefully
	 * fail the operation.
	 */
	if ((err = mac_disable(grp->lg_mh)) != 0) {
		/*
		 * Undo the devnet destruction done above; a failure of the
		 * re-creation itself is ignored.
		 */
		(void) dls_devnet_create(grp->lg_mh, linkid, crgetzoneid(cred));
		rw_exit(&aggr_grp_lock);
		return (err);
	}
	(void) mod_hash_remove(aggr_grp_hash, GRP_HASH_KEY(linkid), &val);
	ASSERT(grp == (aggr_grp_t *)val);

	ASSERT(aggr_grp_cnt > 0);
	aggr_grp_cnt--;
	rw_exit(&aggr_grp_lock);

	/*
	 * Inform the lacp_rx thread to exit, and wait until it has cleared
	 * lg_lacp_rx_thread.
	 */
	mutex_enter(&grp->lg_lacp_lock);
	grp->lg_lacp_done = B_TRUE;
	cv_signal(&grp->lg_lacp_cv);
	while (grp->lg_lacp_rx_thread != NULL)
		cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock);
	mutex_exit(&grp->lg_lacp_lock);
	/*
	 * Inform the tx_notify_thread to exit. Its thread ID is captured
	 * under lg_tx_flowctl_lock so that the join can be performed after
	 * the lock has been dropped.
	 */
	mutex_enter(&grp->lg_tx_flowctl_lock);
	if (grp->lg_tx_notify_thread != NULL) {
		tid = grp->lg_tx_notify_thread->t_did;
		grp->lg_tx_notify_done = B_TRUE;
		cv_signal(&grp->lg_tx_flowctl_cv);
	}
	mutex_exit(&grp->lg_tx_flowctl_lock);
	if (tid != 0)
		thread_join(tid);

	mac_perim_enter_by_mh(grp->lg_mh, &mph);

	grp->lg_closing = B_TRUE;
	/* detach and free MAC ports associated with group */
	port = grp->lg_ports;
	while (port != NULL) {
		/* Save the successor first: the port is deleted below. */
		cport = port->lp_next;
		mac_perim_enter_by_mh(port->lp_mh, &pmph);
		if (grp->lg_started)
			aggr_port_stop(port);
		(void) aggr_grp_detach_port(grp, port);
		mac_perim_exit(pmph);
		/* The pseudo groups are removed after the port is detached. */
		aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
		for (uint_t i = 0; i < grp->lg_rx_group_count; i++)
			aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[i]);
		aggr_port_delete(port);
		port = cport;
	}

	mac_perim_exit(mph);

	kmem_free(grp->lg_tx_blocked_rings,
	    (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP));
	/*
	 * Wait for the port's lacp timer thread and its notification callback
	 * to exit before calling mac_unregister() since both needs to access
	 * the mac perimeter of the grp.
	 */
	aggr_grp_port_wait(grp);

	VERIFY(mac_unregister(grp->lg_mh) == 0);
	grp->lg_mh = NULL;

	/* Every slot of lg_rx_groups has its VLAN list destroyed. */
	for (uint_t i = 0; i < MAX_GROUPS_PER_PORT; i++) {
		list_destroy(&(grp->lg_rx_groups[i].arg_vlans));
	}

	AGGR_GRP_REFRELE(grp);
	return (0);
}
2131 
2132 void
2133 aggr_grp_free(aggr_grp_t *grp)
2134 {
2135 	ASSERT(grp->lg_refs == 0);
2136 	ASSERT(grp->lg_port_ref == 0);
2137 	if (grp->lg_key > AGGR_MAX_KEY) {
2138 		id_free(key_ids, grp->lg_key);
2139 		grp->lg_key = 0;
2140 	}
2141 	kmem_cache_free(aggr_grp_cache, grp);
2142 }
2143 
2144 int
2145 aggr_grp_info(datalink_id_t linkid, void *fn_arg,
2146     aggr_grp_info_new_grp_fn_t new_grp_fn,
2147     aggr_grp_info_new_port_fn_t new_port_fn, cred_t *cred)
2148 {
2149 	aggr_grp_t	*grp;
2150 	aggr_port_t	*port;
2151 	mac_perim_handle_t mph, pmph;
2152 	int		rc = 0;
2153 
2154 	/*
2155 	 * Make sure that the aggregation link is visible from the caller's
2156 	 * zone.
2157 	 */
2158 	if (!dls_devnet_islinkvisible(linkid, crgetzoneid(cred)))
2159 		return (ENOENT);
2160 
2161 	rw_enter(&aggr_grp_lock, RW_READER);
2162 
2163 	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
2164 	    (mod_hash_val_t *)&grp) != 0) {
2165 		rw_exit(&aggr_grp_lock);
2166 		return (ENOENT);
2167 	}
2168 	AGGR_GRP_REFHOLD(grp);
2169 
2170 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2171 	rw_exit(&aggr_grp_lock);
2172 
2173 	rc = new_grp_fn(fn_arg, grp->lg_linkid,
2174 	    (grp->lg_key > AGGR_MAX_KEY) ? 0 : grp->lg_key, grp->lg_addr,
2175 	    grp->lg_addr_fixed, grp->lg_force, grp->lg_tx_policy,
2176 	    grp->lg_nports, grp->lg_lacp_mode, grp->aggr.PeriodicTimer);
2177 
2178 	if (rc != 0)
2179 		goto bail;
2180 
2181 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2182 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
2183 		rc = new_port_fn(fn_arg, port->lp_linkid, port->lp_addr,
2184 		    port->lp_state, &port->lp_lacp.ActorOperPortState);
2185 		mac_perim_exit(pmph);
2186 
2187 		if (rc != 0)
2188 			goto bail;
2189 	}
2190 
2191 bail:
2192 	mac_perim_exit(mph);
2193 	AGGR_GRP_REFRELE(grp);
2194 	return (rc);
2195 }
2196 
2197 /*ARGSUSED*/
2198 static void
2199 aggr_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
2200 {
2201 	miocnak(q, mp, 0, ENOTSUP);
2202 }
2203 
2204 static int
2205 aggr_grp_stat(aggr_grp_t *grp, uint_t stat, uint64_t *val)
2206 {
2207 	aggr_port_t	*port;
2208 	uint_t		stat_index;
2209 
2210 	ASSERT(MUTEX_HELD(&grp->lg_stat_lock));
2211 
2212 	/* We only aggregate counter statistics. */
2213 	if ((IS_MAC_STAT(stat) && !MAC_STAT_ISACOUNTER(stat)) ||
2214 	    (IS_MACTYPE_STAT(stat) && !ETHER_STAT_ISACOUNTER(stat))) {
2215 		return (ENOTSUP);
2216 	}
2217 
2218 	/*
2219 	 * Counter statistics for a group are computed by aggregating the
2220 	 * counters of the members MACs while they were aggregated, plus
2221 	 * the residual counter of the group itself, which is updated each
2222 	 * time a MAC is removed from the group.
2223 	 */
2224 	*val = 0;
2225 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2226 		/* actual port statistic */
2227 		*val += aggr_port_stat(port, stat);
2228 		/*
2229 		 * minus the port stat when it was added, plus any residual
2230 		 * amount for the group.
2231 		 */
2232 		if (IS_MAC_STAT(stat)) {
2233 			stat_index = stat - MAC_STAT_MIN;
2234 			*val -= port->lp_stat[stat_index];
2235 			*val += grp->lg_stat[stat_index];
2236 		} else if (IS_MACTYPE_STAT(stat)) {
2237 			stat_index = stat - MACTYPE_STAT_MIN;
2238 			*val -= port->lp_ether_stat[stat_index];
2239 			*val += grp->lg_ether_stat[stat_index];
2240 		}
2241 	}
2242 	return (0);
2243 }
2244 
2245 int
2246 aggr_rx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
2247 {
2248 	aggr_pseudo_rx_ring_t   *rx_ring = (aggr_pseudo_rx_ring_t *)rdriver;
2249 
2250 	if (rx_ring->arr_hw_rh != NULL) {
2251 		*val = mac_pseudo_rx_ring_stat_get(rx_ring->arr_hw_rh, stat);
2252 	} else {
2253 		aggr_port_t	*port = rx_ring->arr_port;
2254 
2255 		*val = mac_stat_get(port->lp_mh, stat);
2256 
2257 	}
2258 	return (0);
2259 }
2260 
2261 int
2262 aggr_tx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
2263 {
2264 	aggr_pseudo_tx_ring_t   *tx_ring = (aggr_pseudo_tx_ring_t *)rdriver;
2265 
2266 	if (tx_ring->atr_hw_rh != NULL) {
2267 		*val = mac_pseudo_tx_ring_stat_get(tx_ring->atr_hw_rh, stat);
2268 	} else {
2269 		aggr_port_t	*port = tx_ring->atr_port;
2270 
2271 		*val = mac_stat_get(port->lp_mh, stat);
2272 	}
2273 	return (0);
2274 }
2275 
2276 static int
2277 aggr_m_stat(void *arg, uint_t stat, uint64_t *val)
2278 {
2279 	aggr_grp_t		*grp = arg;
2280 	int			rval = 0;
2281 
2282 	mutex_enter(&grp->lg_stat_lock);
2283 
2284 	switch (stat) {
2285 	case MAC_STAT_IFSPEED:
2286 		*val = grp->lg_ifspeed;
2287 		break;
2288 
2289 	case ETHER_STAT_LINK_DUPLEX:
2290 		*val = grp->lg_link_duplex;
2291 		break;
2292 
2293 	default:
2294 		/*
2295 		 * For all other statistics, we return the aggregated stat
2296 		 * from the underlying ports.  aggr_grp_stat() will set
2297 		 * rval appropriately if the statistic isn't a counter.
2298 		 */
2299 		rval = aggr_grp_stat(grp, stat, val);
2300 	}
2301 
2302 	mutex_exit(&grp->lg_stat_lock);
2303 	return (rval);
2304 }
2305 
2306 static int
2307 aggr_m_start(void *arg)
2308 {
2309 	aggr_grp_t *grp = arg;
2310 	aggr_port_t *port;
2311 	mac_perim_handle_t mph, pmph;
2312 
2313 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2314 
2315 	/*
2316 	 * Attempts to start all configured members of the group.
2317 	 * Group members will be attached when their link-up notification
2318 	 * is received.
2319 	 */
2320 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2321 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
2322 		if (aggr_port_start(port) != 0) {
2323 			mac_perim_exit(pmph);
2324 			continue;
2325 		}
2326 
2327 		/*
2328 		 * Turn on the promiscuous mode if it is required to receive
2329 		 * the non-primary address over a port, or the promiscous
2330 		 * mode is enabled over the aggr.
2331 		 */
2332 		if (grp->lg_promisc || port->lp_prom_addr != NULL) {
2333 			if (aggr_port_promisc(port, B_TRUE) != 0)
2334 				aggr_port_stop(port);
2335 		}
2336 		mac_perim_exit(pmph);
2337 	}
2338 
2339 	grp->lg_started = B_TRUE;
2340 
2341 	mac_perim_exit(mph);
2342 	return (0);
2343 }
2344 
2345 static void
2346 aggr_m_stop(void *arg)
2347 {
2348 	aggr_grp_t *grp = arg;
2349 	aggr_port_t *port;
2350 	mac_perim_handle_t mph, pmph;
2351 
2352 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2353 
2354 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2355 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
2356 
2357 		/* reset port promiscuous mode */
2358 		(void) aggr_port_promisc(port, B_FALSE);
2359 
2360 		aggr_port_stop(port);
2361 		mac_perim_exit(pmph);
2362 	}
2363 
2364 	grp->lg_started = B_FALSE;
2365 	mac_perim_exit(mph);
2366 }
2367 
2368 static int
2369 aggr_m_promisc(void *arg, boolean_t on)
2370 {
2371 	aggr_grp_t *grp = arg;
2372 	aggr_port_t *port;
2373 	boolean_t link_state_changed = B_FALSE;
2374 	mac_perim_handle_t mph, pmph;
2375 
2376 	AGGR_GRP_REFHOLD(grp);
2377 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2378 
2379 	ASSERT(!grp->lg_closing);
2380 
2381 	if (on == grp->lg_promisc)
2382 		goto bail;
2383 
2384 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2385 		int	err = 0;
2386 
2387 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
2388 		AGGR_PORT_REFHOLD(port);
2389 		if (!on && (port->lp_prom_addr == NULL))
2390 			err = aggr_port_promisc(port, B_FALSE);
2391 		else if (on && port->lp_started)
2392 			err = aggr_port_promisc(port, B_TRUE);
2393 
2394 		if (err != 0) {
2395 			if (aggr_grp_detach_port(grp, port))
2396 				link_state_changed = B_TRUE;
2397 		} else {
2398 			/*
2399 			 * If a port was detached because of a previous
2400 			 * failure changing the promiscuity, the port
2401 			 * is reattached when it successfully changes
2402 			 * the promiscuity now, and this might cause
2403 			 * the link state of the aggregation to change.
2404 			 */
2405 			if (aggr_grp_attach_port(grp, port))
2406 				link_state_changed = B_TRUE;
2407 		}
2408 		mac_perim_exit(pmph);
2409 		AGGR_PORT_REFRELE(port);
2410 	}
2411 
2412 	grp->lg_promisc = on;
2413 
2414 	if (link_state_changed)
2415 		mac_link_update(grp->lg_mh, grp->lg_link_state);
2416 
2417 bail:
2418 	mac_perim_exit(mph);
2419 	AGGR_GRP_REFRELE(grp);
2420 
2421 	return (0);
2422 }
2423 
2424 static void
2425 aggr_grp_port_rename(const char *new_name, void *arg)
2426 {
2427 	/*
2428 	 * aggr port's mac client name is the format of "aggr link name" plus
2429 	 * AGGR_PORT_NAME_DELIMIT plus "underneath link name".
2430 	 */
2431 	int aggr_len, link_len, clnt_name_len, i;
2432 	char *str_end, *str_st, *str_del;
2433 	char aggr_name[MAXNAMELEN];
2434 	char link_name[MAXNAMELEN];
2435 	char *clnt_name;
2436 	aggr_grp_t *aggr_grp = arg;
2437 	aggr_port_t *aggr_port = aggr_grp->lg_ports;
2438 
2439 	for (i = 0; i < aggr_grp->lg_nports; i++) {
2440 		clnt_name = mac_client_name(aggr_port->lp_mch);
2441 		clnt_name_len = strlen(clnt_name);
2442 		str_st = clnt_name;
2443 		str_end = &(clnt_name[clnt_name_len]);
2444 		str_del = strchr(str_st, AGGR_PORT_NAME_DELIMIT);
2445 		ASSERT(str_del != NULL);
2446 		aggr_len = (intptr_t)((uintptr_t)str_del - (uintptr_t)str_st);
2447 		link_len = (intptr_t)((uintptr_t)str_end - (uintptr_t)str_del);
2448 		bzero(aggr_name, MAXNAMELEN);
2449 		bzero(link_name, MAXNAMELEN);
2450 		bcopy(clnt_name, aggr_name, aggr_len);
2451 		bcopy(str_del, link_name, link_len + 1);
2452 		bzero(clnt_name, MAXNAMELEN);
2453 		(void) snprintf(clnt_name, MAXNAMELEN, "%s%s", new_name,
2454 		    link_name);
2455 
2456 		(void) mac_rename_primary(aggr_port->lp_mh, NULL);
2457 		aggr_port = aggr_port->lp_next;
2458 	}
2459 }
2460 
2461 /*
2462  * Initialize the capabilities that are advertised for the group
2463  * according to the capabilities of the constituent ports.
2464  */
2465 static boolean_t
2466 aggr_m_capab_get(void *arg, mac_capab_t cap, void *cap_data)
2467 {
2468 	aggr_grp_t *grp = arg;
2469 
2470 	switch (cap) {
2471 	case MAC_CAPAB_HCKSUM: {
2472 		uint32_t *hcksum_txflags = cap_data;
2473 		*hcksum_txflags = grp->lg_hcksum_txflags;
2474 		break;
2475 	}
2476 	case MAC_CAPAB_LSO: {
2477 		mac_capab_lso_t *cap_lso = cap_data;
2478 
2479 		if (grp->lg_lso) {
2480 			*cap_lso = grp->lg_cap_lso;
2481 			break;
2482 		} else {
2483 			return (B_FALSE);
2484 		}
2485 	}
2486 	case MAC_CAPAB_NO_NATIVEVLAN:
2487 		return (!grp->lg_vlan);
2488 	case MAC_CAPAB_NO_ZCOPY:
2489 		return (!grp->lg_zcopy);
2490 	case MAC_CAPAB_RINGS: {
2491 		mac_capab_rings_t *cap_rings = cap_data;
2492 		uint_t ring_cnt = 0;
2493 
2494 		for (uint_t i = 0; i < grp->lg_rx_group_count; i++)
2495 			ring_cnt += grp->lg_rx_groups[i].arg_ring_cnt;
2496 
2497 		if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
2498 			cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2499 			cap_rings->mr_rnum = ring_cnt;
2500 			cap_rings->mr_gnum = grp->lg_rx_group_count;
2501 			cap_rings->mr_gaddring = NULL;
2502 			cap_rings->mr_gremring = NULL;
2503 		} else {
2504 			cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2505 			cap_rings->mr_rnum = grp->lg_tx_group.atg_ring_cnt;
2506 			cap_rings->mr_gnum = 0;
2507 		}
2508 		cap_rings->mr_rget = aggr_fill_ring;
2509 		cap_rings->mr_gget = aggr_fill_group;
2510 		break;
2511 	}
2512 	case MAC_CAPAB_AGGR:
2513 	{
2514 		mac_capab_aggr_t *aggr_cap;
2515 
2516 		if (cap_data != NULL) {
2517 			aggr_cap = cap_data;
2518 			aggr_cap->mca_rename_fn = aggr_grp_port_rename;
2519 			aggr_cap->mca_unicst = aggr_m_unicst;
2520 			aggr_cap->mca_find_tx_ring_fn = aggr_find_tx_ring;
2521 			aggr_cap->mca_arg = arg;
2522 		}
2523 		return (B_TRUE);
2524 	}
2525 	default:
2526 		return (B_FALSE);
2527 	}
2528 	return (B_TRUE);
2529 }
2530 
2531 /*
2532  * Callback function for MAC layer to register groups.
2533  */
2534 static void
2535 aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index,
2536     mac_group_info_t *infop, mac_group_handle_t gh)
2537 {
2538 	aggr_grp_t *grp = arg;
2539 
2540 	if (rtype == MAC_RING_TYPE_RX) {
2541 		aggr_pseudo_rx_group_t *rx_group = &grp->lg_rx_groups[index];
2542 
2543 		rx_group->arg_gh = gh;
2544 		rx_group->arg_grp = grp;
2545 
2546 		infop->mgi_driver = (mac_group_driver_t)rx_group;
2547 		infop->mgi_start = NULL;
2548 		infop->mgi_stop = NULL;
2549 		infop->mgi_addmac = aggr_addmac;
2550 		infop->mgi_remmac = aggr_remmac;
2551 		infop->mgi_count = rx_group->arg_ring_cnt;
2552 
2553 		/*
2554 		 * Always set the HW VLAN callbacks. They are smart
2555 		 * enough to know when a port has HW VLAN filters to
2556 		 * program and when it doesn't.
2557 		 */
2558 		infop->mgi_addvlan = aggr_addvlan;
2559 		infop->mgi_remvlan = aggr_remvlan;
2560 	} else {
2561 		aggr_pseudo_tx_group_t *tx_group = &grp->lg_tx_group;
2562 
2563 		ASSERT3S(index, ==, 0);
2564 		tx_group->atg_gh = gh;
2565 	}
2566 }
2567 
2568 /*
2569  * Callback funtion for MAC layer to register all rings.
2570  */
2571 static void
2572 aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index,
2573     const int index, mac_ring_info_t *infop, mac_ring_handle_t rh)
2574 {
2575 	aggr_grp_t	*grp = arg;
2576 
2577 	switch (rtype) {
2578 	case MAC_RING_TYPE_RX: {
2579 		aggr_pseudo_rx_group_t	*rx_group;
2580 		aggr_pseudo_rx_ring_t	*rx_ring;
2581 		mac_intr_t		aggr_mac_intr;
2582 
2583 		rx_group = &grp->lg_rx_groups[rg_index];
2584 		ASSERT3S(index, >=, 0);
2585 		ASSERT3S(index, <, rx_group->arg_ring_cnt);
2586 		rx_ring = rx_group->arg_rings + index;
2587 		rx_ring->arr_rh = rh;
2588 
2589 		/*
2590 		 * Entrypoint to enable interrupt (disable poll) and
2591 		 * disable interrupt (enable poll).
2592 		 */
2593 		aggr_mac_intr.mi_handle = (mac_intr_handle_t)rx_ring;
2594 		aggr_mac_intr.mi_enable = aggr_pseudo_enable_intr;
2595 		aggr_mac_intr.mi_disable = aggr_pseudo_disable_intr;
2596 		aggr_mac_intr.mi_ddi_handle = NULL;
2597 
2598 		infop->mri_driver = (mac_ring_driver_t)rx_ring;
2599 		infop->mri_start = aggr_pseudo_start_rx_ring;
2600 		infop->mri_stop = aggr_pseudo_stop_rx_ring;
2601 
2602 		infop->mri_intr = aggr_mac_intr;
2603 		infop->mri_poll = aggr_rx_poll;
2604 
2605 		infop->mri_stat = aggr_rx_ring_stat;
2606 		break;
2607 	}
2608 	case MAC_RING_TYPE_TX: {
2609 		aggr_pseudo_tx_group_t	*tx_group = &grp->lg_tx_group;
2610 		aggr_pseudo_tx_ring_t	*tx_ring;
2611 
2612 		ASSERT(rg_index == -1);
2613 		ASSERT(index < tx_group->atg_ring_cnt);
2614 
2615 		tx_ring = &tx_group->atg_rings[index];
2616 		tx_ring->atr_rh = rh;
2617 
2618 		infop->mri_driver = (mac_ring_driver_t)tx_ring;
2619 		infop->mri_start = NULL;
2620 		infop->mri_stop = NULL;
2621 		infop->mri_tx = aggr_ring_tx;
2622 		infop->mri_stat = aggr_tx_ring_stat;
2623 		/*
2624 		 * Use the hw TX ring handle to find if the ring needs
2625 		 * serialization or not. For NICs that do not expose
2626 		 * Tx rings, atr_hw_rh will be NULL.
2627 		 */
2628 		if (tx_ring->atr_hw_rh != NULL) {
2629 			infop->mri_flags =
2630 			    mac_hwring_getinfo(tx_ring->atr_hw_rh);
2631 		}
2632 		break;
2633 	}
2634 	default:
2635 		break;
2636 	}
2637 }
2638 
2639 static mblk_t *
2640 aggr_rx_poll(void *arg, int bytes_to_pickup)
2641 {
2642 	aggr_pseudo_rx_ring_t *rr_ring = arg;
2643 	aggr_port_t *port = rr_ring->arr_port;
2644 	aggr_grp_t *grp = port->lp_grp;
2645 	mblk_t *mp_chain, *mp, **mpp;
2646 
2647 	mp_chain = mac_hwring_poll(rr_ring->arr_hw_rh, bytes_to_pickup);
2648 
2649 	if (grp->lg_lacp_mode == AGGR_LACP_OFF)
2650 		return (mp_chain);
2651 
2652 	mpp = &mp_chain;
2653 	while ((mp = *mpp) != NULL) {
2654 		if (MBLKL(mp) >= sizeof (struct ether_header)) {
2655 			struct ether_header *ehp;
2656 
2657 			ehp = (struct ether_header *)mp->b_rptr;
2658 			if (ntohs(ehp->ether_type) == ETHERTYPE_SLOW) {
2659 				*mpp = mp->b_next;
2660 				mp->b_next = NULL;
2661 				aggr_recv_lacp(port,
2662 				    (mac_resource_handle_t)rr_ring, mp);
2663 				continue;
2664 			}
2665 		}
2666 
2667 		if (!port->lp_collector_enabled) {
2668 			*mpp = mp->b_next;
2669 			mp->b_next = NULL;
2670 			freemsg(mp);
2671 			continue;
2672 		}
2673 		mpp = &mp->b_next;
2674 	}
2675 	return (mp_chain);
2676 }
2677 
2678 static int
2679 aggr_addmac(void *arg, const uint8_t *mac_addr)
2680 {
2681 	aggr_pseudo_rx_group_t	*rx_group = (aggr_pseudo_rx_group_t *)arg;
2682 	aggr_unicst_addr_t	*addr, **pprev;
2683 	aggr_grp_t		*grp = rx_group->arg_grp;
2684 	aggr_port_t		*port, *p;
2685 	mac_perim_handle_t	mph;
2686 	int			err = 0;
2687 	uint_t			idx = rx_group->arg_index;
2688 
2689 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2690 
2691 	if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
2692 		mac_perim_exit(mph);
2693 		return (0);
2694 	}
2695 
2696 	/*
2697 	 * Insert this mac address into the list of mac addresses owned by
2698 	 * the aggregation pseudo group.
2699 	 */
2700 	pprev = &rx_group->arg_macaddr;
2701 	while ((addr = *pprev) != NULL) {
2702 		if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) == 0) {
2703 			mac_perim_exit(mph);
2704 			return (EEXIST);
2705 		}
2706 		pprev = &addr->aua_next;
2707 	}
2708 	addr = kmem_alloc(sizeof (aggr_unicst_addr_t), KM_SLEEP);
2709 	bcopy(mac_addr, addr->aua_addr, ETHERADDRL);
2710 	addr->aua_next = NULL;
2711 	*pprev = addr;
2712 
2713 	for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2714 		if ((err = aggr_port_addmac(port, idx, mac_addr)) != 0)
2715 			break;
2716 
2717 	if (err != 0) {
2718 		for (p = grp->lg_ports; p != port; p = p->lp_next)
2719 			aggr_port_remmac(p, idx, mac_addr);
2720 
2721 		*pprev = NULL;
2722 		kmem_free(addr, sizeof (aggr_unicst_addr_t));
2723 	}
2724 
2725 	mac_perim_exit(mph);
2726 	return (err);
2727 }
2728 
2729 static int
2730 aggr_remmac(void *arg, const uint8_t *mac_addr)
2731 {
2732 	aggr_pseudo_rx_group_t	*rx_group = (aggr_pseudo_rx_group_t *)arg;
2733 	aggr_unicst_addr_t	*addr, **pprev;
2734 	aggr_grp_t		*grp = rx_group->arg_grp;
2735 	aggr_port_t		*port;
2736 	mac_perim_handle_t	mph;
2737 	int			err = 0;
2738 
2739 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2740 
2741 	if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
2742 		mac_perim_exit(mph);
2743 		return (0);
2744 	}
2745 
2746 	/*
2747 	 * Insert this mac address into the list of mac addresses owned by
2748 	 * the aggregation pseudo group.
2749 	 */
2750 	pprev = &rx_group->arg_macaddr;
2751 	while ((addr = *pprev) != NULL) {
2752 		if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) != 0) {
2753 			pprev = &addr->aua_next;
2754 			continue;
2755 		}
2756 		break;
2757 	}
2758 	if (addr == NULL) {
2759 		mac_perim_exit(mph);
2760 		return (EINVAL);
2761 	}
2762 
2763 	for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2764 		aggr_port_remmac(port, rx_group->arg_index, mac_addr);
2765 
2766 	*pprev = addr->aua_next;
2767 	kmem_free(addr, sizeof (aggr_unicst_addr_t));
2768 
2769 	mac_perim_exit(mph);
2770 	return (err);
2771 }
2772 
2773 /*
2774  * Search for VID in the Rx group's list and return a pointer if
2775  * found. Otherwise return NULL.
2776  */
2777 static aggr_vlan_t *
2778 aggr_find_vlan(aggr_pseudo_rx_group_t *rx_group, uint16_t vid)
2779 {
2780 	ASSERT(MAC_PERIM_HELD(rx_group->arg_grp->lg_mh));
2781 	for (aggr_vlan_t *avp = list_head(&rx_group->arg_vlans); avp != NULL;
2782 	    avp = list_next(&rx_group->arg_vlans, avp)) {
2783 		if (avp->av_vid == vid)
2784 			return (avp);
2785 	}
2786 
2787 	return (NULL);
2788 }
2789 
/*
 * Accept traffic on the specified VID.
 *
 * Persist VLAN state in the aggr so that ports added later will
 * receive the correct filters. In the future it would be nice to
 * allow aggr to iterate its clients instead of duplicating state.
 *
 * A VID already present on the group's list only has its reference count
 * bumped. Otherwise the VID is programmed on every port and, on success,
 * recorded on the list. If a port fails, the VID is removed again from
 * the ports already programmed and the port's error is returned.
 * Returns 0 on success.
 */
static int
aggr_addvlan(mac_group_driver_t gdriver, uint16_t vid)
{
	aggr_pseudo_rx_group_t	*rx_group = (aggr_pseudo_rx_group_t *)gdriver;
	aggr_grp_t		*aggr = rx_group->arg_grp;
	aggr_port_t		*port, *p;
	mac_perim_handle_t	mph;
	int			err = 0;
	aggr_vlan_t		*avp = NULL;
	uint_t			idx = rx_group->arg_index;

	mac_perim_enter_by_mh(aggr->lg_mh, &mph);

	if (vid == MAC_VLAN_UNTAGGED) {
		/*
		 * Aggr is both a MAC provider and MAC client. As a
		 * MAC provider it is passed MAC_VLAN_UNTAGGED by its
		 * client. As a client itself, it should pass
		 * VLAN_ID_NONE to its ports.
		 *
		 * Untagged users are tracked by the arg_untagged counter
		 * rather than by a list entry, so avp stays NULL.
		 */
		vid = VLAN_ID_NONE;
		rx_group->arg_untagged++;
		goto update_ports;
	}

	avp = aggr_find_vlan(rx_group, vid);

	if (avp != NULL) {
		/* Already tracked: take another reference and return. */
		avp->av_refs++;
		mac_perim_exit(mph);
		return (0);
	}

	/* First user of this VID; the entry is listed only on success. */
	avp = kmem_zalloc(sizeof (aggr_vlan_t), KM_SLEEP);
	avp->av_vid = vid;
	avp->av_refs = 1;

update_ports:
	for (port = aggr->lg_ports; port != NULL; port = port->lp_next)
		if ((err = aggr_port_addvlan(port, idx, vid)) != 0)
			break;

	if (err != 0) {
		/*
		 * If any of these calls fail then we are in a
		 * situation where the ports have different HW state.
		 * There's no reasonable action the MAC client can
		 * take in this scenario to rectify the situation.
		 *
		 * Only the ports that precede the failing one are rolled
		 * back; a rollback failure is merely logged.
		 */
		for (p = aggr->lg_ports; p != port; p = p->lp_next) {
			int err2;

			if ((err2 = aggr_port_remvlan(p, idx, vid)) != 0) {
				cmn_err(CE_WARN, "Failed to remove VLAN %u"
				    " from port %s: errno %d.", vid,
				    mac_client_name(p->lp_mch), err2);
			}

		}

		/* Undo the bookkeeping done before update_ports. */
		if (vid == VLAN_ID_NONE)
			rx_group->arg_untagged--;

		if (avp != NULL) {
			kmem_free(avp, sizeof (aggr_vlan_t));
			avp = NULL;
		}
	}

	/* avp is non-NULL only for a newly added, successfully set VID. */
	if (avp != NULL)
		list_insert_tail(&rx_group->arg_vlans, avp);

	mac_perim_exit(mph);
	return (err);
}
2872 
/*
 * Stop accepting traffic on this VLAN if it's the last use of this VLAN.
 *
 * While other users of the VID remain, only the reference count (or the
 * untagged counter) is dropped. When the last user goes away, the VID is
 * removed from every port and its list entry is freed. If a port fails,
 * the VID is added back to the ports already processed, the count is
 * restored, and the port's error is returned. Returns ENOENT if a tagged
 * VID is not on the group's list, 0 on success.
 */
static int
aggr_remvlan(mac_group_driver_t gdriver, uint16_t vid)
{
	aggr_pseudo_rx_group_t	*rx_group = (aggr_pseudo_rx_group_t *)gdriver;
	aggr_grp_t		*aggr = rx_group->arg_grp;
	aggr_port_t		*port, *p;
	mac_perim_handle_t	mph;
	int			err = 0;
	aggr_vlan_t		*avp = NULL;
	uint_t			idx = rx_group->arg_index;

	mac_perim_enter_by_mh(aggr->lg_mh, &mph);

	/*
	 * See the comment in aggr_addvlan().
	 */
	if (vid == MAC_VLAN_UNTAGGED) {
		vid = VLAN_ID_NONE;
		rx_group->arg_untagged--;

		/* Other untagged users remain; leave the ports alone. */
		if (rx_group->arg_untagged > 0)
			goto done;

		goto update_ports;
	}

	avp = aggr_find_vlan(rx_group, vid);

	if (avp == NULL) {
		err = ENOENT;
		goto done;
	}

	avp->av_refs--;

	/* Other users of this VID remain; leave the ports alone. */
	if (avp->av_refs > 0)
		goto done;

update_ports:
	for (port = aggr->lg_ports; port != NULL; port = port->lp_next)
		if ((err = aggr_port_remvlan(port, idx, vid)) != 0)
			break;

	/*
	 * As in aggr_addvlan(), a failure here leaves the ports with
	 * different HW state. The ports that precede the failing one are
	 * rolled back on a best-effort basis (a rollback failure is merely
	 * logged) and the reference taken away above is restored.
	 */
	if (err != 0) {
		for (p = aggr->lg_ports; p != port; p = p->lp_next) {
			int err2;

			if ((err2 = aggr_port_addvlan(p, idx, vid)) != 0) {
				cmn_err(CE_WARN, "Failed to add VLAN %u"
				    " to port %s: errno %d.", vid,
				    mac_client_name(p->lp_mch), err2);
			}
		}

		if (avp != NULL)
			avp->av_refs++;

		if (vid == VLAN_ID_NONE)
			rx_group->arg_untagged++;

		goto done;
	}

	/* The last reference is gone: drop the entry from the list. */
	if (err == 0 && avp != NULL) {
		VERIFY3U(avp->av_refs, ==, 0);
		list_remove(&rx_group->arg_vlans, avp);
		kmem_free(avp, sizeof (aggr_vlan_t));
	}

done:
	mac_perim_exit(mph);
	return (err);
}
2953 
2954 /*
2955  * Add or remove the multicast addresses that are defined for the group
2956  * to or from the specified port.
2957  *
2958  * Note that aggr_grp_multicst_port(..., B_TRUE) is called when the port
2959  * is started and attached, and aggr_grp_multicst_port(..., B_FALSE) is
2960  * called when the port is either stopped or detached.
2961  */
2962 void
2963 aggr_grp_multicst_port(aggr_port_t *port, boolean_t add)
2964 {
2965 	aggr_grp_t *grp = port->lp_grp;
2966 
2967 	ASSERT(MAC_PERIM_HELD(port->lp_mh));
2968 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
2969 
2970 	if (!port->lp_started || port->lp_state != AGGR_PORT_STATE_ATTACHED)
2971 		return;
2972 
2973 	mac_multicast_refresh(grp->lg_mh, aggr_port_multicst, port, add);
2974 }
2975 
2976 static int
2977 aggr_m_multicst(void *arg, boolean_t add, const uint8_t *addrp)
2978 {
2979 	aggr_grp_t *grp = arg;
2980 	aggr_port_t *port = NULL, *errport = NULL;
2981 	mac_perim_handle_t mph;
2982 	int err = 0;
2983 
2984 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2985 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2986 		if (port->lp_state != AGGR_PORT_STATE_ATTACHED ||
2987 		    !port->lp_started) {
2988 			continue;
2989 		}
2990 		err = aggr_port_multicst(port, add, addrp);
2991 		if (err != 0) {
2992 			errport = port;
2993 			break;
2994 		}
2995 	}
2996 
2997 	/*
2998 	 * At least one port caused error return and this error is returned to
2999 	 * mac, eventually a NAK would be sent upwards.
3000 	 * Some ports have this multicast address listed now, and some don't.
3001 	 * Treat this error as a whole aggr failure not individual port failure.
3002 	 * Therefore remove this multicast address from other ports.
3003 	 */
3004 	if ((err != 0) && add) {
3005 		for (port = grp->lg_ports; port != errport;
3006 		    port = port->lp_next) {
3007 			if (port->lp_state != AGGR_PORT_STATE_ATTACHED ||
3008 			    !port->lp_started) {
3009 				continue;
3010 			}
3011 			(void) aggr_port_multicst(port, B_FALSE, addrp);
3012 		}
3013 	}
3014 	mac_perim_exit(mph);
3015 	return (err);
3016 }
3017 
3018 static int
3019 aggr_m_unicst(void *arg, const uint8_t *macaddr)
3020 {
3021 	aggr_grp_t *grp = arg;
3022 	mac_perim_handle_t mph;
3023 	int err;
3024 
3025 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
3026 	err = aggr_grp_modify_common(grp, AGGR_MODIFY_MAC, 0, B_TRUE, macaddr,
3027 	    0, 0);
3028 	mac_perim_exit(mph);
3029 	return (err);
3030 }
3031 
3032 /*
3033  * Initialize the capabilities that are advertised for the group
3034  * according to the capabilities of the constituent ports.
3035  */
3036 static void
3037 aggr_grp_capab_set(aggr_grp_t *grp)
3038 {
3039 	uint32_t cksum;
3040 	aggr_port_t *port;
3041 	mac_capab_lso_t cap_lso;
3042 
3043 	ASSERT(grp->lg_mh == NULL);
3044 	ASSERT(grp->lg_ports != NULL);
3045 
3046 	grp->lg_hcksum_txflags = (uint32_t)-1;
3047 	grp->lg_zcopy = B_TRUE;
3048 	grp->lg_vlan = B_TRUE;
3049 
3050 	grp->lg_lso = B_TRUE;
3051 	grp->lg_cap_lso.lso_flags = (t_uscalar_t)-1;
3052 	grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max = (t_uscalar_t)-1;
3053 
3054 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
3055 		if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &cksum))
3056 			cksum = 0;
3057 		grp->lg_hcksum_txflags &= cksum;
3058 
3059 		grp->lg_vlan &=
3060 		    !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL);
3061 
3062 		grp->lg_zcopy &=
3063 		    !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL);
3064 
3065 		grp->lg_lso &=
3066 		    mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso);
3067 		if (grp->lg_lso) {
3068 			grp->lg_cap_lso.lso_flags &= cap_lso.lso_flags;
3069 			if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max >
3070 			    cap_lso.lso_basic_tcp_ipv4.lso_max)
3071 				grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max =
3072 				    cap_lso.lso_basic_tcp_ipv4.lso_max;
3073 		}
3074 	}
3075 }
3076 
3077 /*
3078  * Checks whether the capabilities of the port being added are compatible
3079  * with the current capabilities of the aggregation.
3080  */
3081 static boolean_t
3082 aggr_grp_capab_check(aggr_grp_t *grp, aggr_port_t *port)
3083 {
3084 	uint32_t hcksum_txflags;
3085 
3086 	ASSERT(grp->lg_ports != NULL);
3087 
3088 	if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL)) &
3089 	    grp->lg_vlan) != grp->lg_vlan) {
3090 		return (B_FALSE);
3091 	}
3092 
3093 	if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL)) &
3094 	    grp->lg_zcopy) != grp->lg_zcopy) {
3095 		return (B_FALSE);
3096 	}
3097 
3098 	if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &hcksum_txflags)) {
3099 		if (grp->lg_hcksum_txflags != 0)
3100 			return (B_FALSE);
3101 	} else if ((hcksum_txflags & grp->lg_hcksum_txflags) !=
3102 	    grp->lg_hcksum_txflags) {
3103 		return (B_FALSE);
3104 	}
3105 
3106 	if (grp->lg_lso) {
3107 		mac_capab_lso_t cap_lso;
3108 
3109 		if (mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso)) {
3110 			if ((grp->lg_cap_lso.lso_flags & cap_lso.lso_flags) !=
3111 			    grp->lg_cap_lso.lso_flags)
3112 				return (B_FALSE);
3113 			if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max >
3114 			    cap_lso.lso_basic_tcp_ipv4.lso_max)
3115 				return (B_FALSE);
3116 		} else {
3117 			return (B_FALSE);
3118 		}
3119 	}
3120 
3121 	return (B_TRUE);
3122 }
3123 
3124 /*
3125  * Returns the maximum SDU according to the SDU of the constituent ports.
3126  */
3127 static uint_t
3128 aggr_grp_max_sdu(aggr_grp_t *grp)
3129 {
3130 	uint_t max_sdu = (uint_t)-1;
3131 	aggr_port_t *port;
3132 
3133 	ASSERT(grp->lg_ports != NULL);
3134 
3135 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
3136 		uint_t port_sdu_max;
3137 
3138 		mac_sdu_get(port->lp_mh, NULL, &port_sdu_max);
3139 		if (max_sdu > port_sdu_max)
3140 			max_sdu = port_sdu_max;
3141 	}
3142 
3143 	return (max_sdu);
3144 }
3145 
3146 /*
3147  * Checks if the maximum SDU of the specified port is compatible
3148  * with the maximum SDU of the specified aggregation group, returns
3149  * B_TRUE if it is, B_FALSE otherwise.
3150  */
3151 static boolean_t
3152 aggr_grp_sdu_check(aggr_grp_t *grp, aggr_port_t *port)
3153 {
3154 	uint_t port_sdu_max;
3155 
3156 	mac_sdu_get(port->lp_mh, NULL, &port_sdu_max);
3157 	return (port_sdu_max >= grp->lg_max_sdu);
3158 }
3159 
3160 /*
3161  * Returns the maximum margin according to the margin of the constituent ports.
3162  */
3163 static uint32_t
3164 aggr_grp_max_margin(aggr_grp_t *grp)
3165 {
3166 	uint32_t margin = UINT32_MAX;
3167 	aggr_port_t *port;
3168 
3169 	ASSERT(grp->lg_mh == NULL);
3170 	ASSERT(grp->lg_ports != NULL);
3171 
3172 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
3173 		if (margin > port->lp_margin)
3174 			margin = port->lp_margin;
3175 	}
3176 
3177 	grp->lg_margin = margin;
3178 	return (margin);
3179 }
3180 
3181 /*
3182  * Checks if the maximum margin of the specified port is compatible
3183  * with the maximum margin of the specified aggregation group, returns
3184  * B_TRUE if it is, B_FALSE otherwise.
3185  */
3186 static boolean_t
3187 aggr_grp_margin_check(aggr_grp_t *grp, aggr_port_t *port)
3188 {
3189 	if (port->lp_margin >= grp->lg_margin)
3190 		return (B_TRUE);
3191 
3192 	/*
3193 	 * See whether the current margin value is allowed to be changed to
3194 	 * the new value.
3195 	 */
3196 	if (!mac_margin_update(grp->lg_mh, port->lp_margin))
3197 		return (B_FALSE);
3198 
3199 	grp->lg_margin = port->lp_margin;
3200 	return (B_TRUE);
3201 }
3202 
3203 /*
3204  * Set MTU on individual ports of an aggregation group
3205  */
3206 static int
3207 aggr_set_port_sdu(aggr_grp_t *grp, aggr_port_t *port, uint32_t sdu,
3208     uint32_t *old_mtu)
3209 {
3210 	boolean_t		removed = B_FALSE;
3211 	mac_perim_handle_t	mph;
3212 	mac_diag_t		diag;
3213 	int			err, rv, retry = 0;
3214 
3215 	if (port->lp_mah != NULL) {
3216 		(void) mac_unicast_remove(port->lp_mch, port->lp_mah);
3217 		port->lp_mah = NULL;
3218 		removed = B_TRUE;
3219 	}
3220 	err = mac_set_mtu(port->lp_mh, sdu, old_mtu);
3221 try_again:
3222 	if (removed && (rv = mac_unicast_add(port->lp_mch, NULL,
3223 	    MAC_UNICAST_PRIMARY | MAC_UNICAST_DISABLE_TX_VID_CHECK,
3224 	    &port->lp_mah, 0, &diag)) != 0) {
3225 		/*
3226 		 * following is a workaround for a bug in 'bge' driver.
3227 		 * See CR 6794654 for more information and this work around
3228 		 * will be removed once the CR is fixed.
3229 		 */
3230 		if (rv == EIO && retry++ < 3) {
3231 			delay(2 * hz);
3232 			goto try_again;
3233 		}
3234 		/*
3235 		 * if mac_unicast_add() failed while setting the MTU,
3236 		 * detach the port from the group.
3237 		 */
3238 		mac_perim_enter_by_mh(port->lp_mh, &mph);
3239 		(void) aggr_grp_detach_port(grp, port);
3240 		mac_perim_exit(mph);
3241 		cmn_err(CE_WARN, "Unable to restart the port %s while "
3242 		    "setting MTU. Detaching the port from the aggregation.",
3243 		    mac_client_name(port->lp_mch));
3244 	}
3245 	return (err);
3246 }
3247 
3248 static int
3249 aggr_sdu_update(aggr_grp_t *grp, uint32_t sdu)
3250 {
3251 	int			err = 0, i, rv;
3252 	aggr_port_t		*port;
3253 	uint32_t		*mtu;
3254 
3255 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
3256 
3257 	/*
3258 	 * If the MTU being set is equal to aggr group's maximum
3259 	 * allowable value, then there is nothing to change
3260 	 */
3261 	if (sdu == grp->lg_max_sdu)
3262 		return (0);
3263 
3264 	/* 0 is aggr group's min sdu */
3265 	if (sdu == 0)
3266 		return (EINVAL);
3267 
3268 	mtu = kmem_alloc(sizeof (uint32_t) * grp->lg_nports, KM_SLEEP);
3269 	for (port = grp->lg_ports, i = 0; port != NULL && err == 0;
3270 	    port = port->lp_next, i++) {
3271 		err = aggr_set_port_sdu(grp, port, sdu, mtu + i);
3272 	}
3273 	if (err != 0) {
3274 		/* recover from error: reset the mtus of the ports */
3275 		aggr_port_t *tmp;
3276 
3277 		for (tmp = grp->lg_ports, i = 0; tmp != port;
3278 		    tmp = tmp->lp_next, i++) {
3279 			(void) aggr_set_port_sdu(grp, tmp, *(mtu + i), NULL);
3280 		}
3281 		goto bail;
3282 	}
3283 	grp->lg_max_sdu = aggr_grp_max_sdu(grp);
3284 	rv = mac_maxsdu_update(grp->lg_mh, grp->lg_max_sdu);
3285 	ASSERT(rv == 0);
3286 bail:
3287 	kmem_free(mtu, sizeof (uint32_t) * grp->lg_nports);
3288 	return (err);
3289 }
3290 
3291 /*
3292  * Callback functions for set/get of properties
3293  */
3294 /*ARGSUSED*/
3295 static int
3296 aggr_m_setprop(void *m_driver, const char *pr_name, mac_prop_id_t pr_num,
3297     uint_t pr_valsize, const void *pr_val)
3298 {
3299 	int		err = ENOTSUP;
3300 	aggr_grp_t	*grp = m_driver;
3301 
3302 	switch (pr_num) {
3303 	case MAC_PROP_MTU: {
3304 		uint32_t	mtu;
3305 
3306 		if (pr_valsize < sizeof (mtu)) {
3307 			err = EINVAL;
3308 			break;
3309 		}
3310 		bcopy(pr_val, &mtu, sizeof (mtu));
3311 		err = aggr_sdu_update(grp, mtu);
3312 		break;
3313 	}
3314 	default:
3315 		break;
3316 	}
3317 	return (err);
3318 }
3319 
/*
 * One endpoint of an MTU range; element type of the temporary array built
 * and sorted by aggr_mtu_range_intersection().
 */
typedef struct rboundary {
	uint32_t	bval;	/* boundary value (a range's min or max) */
	int		btype;	/* +1 for a min boundary, -1 for a max */
} rboundary_t;
3324 
3325 /*
3326  * This function finds the intersection of mtu ranges stored in arrays -
3327  * mrange[0] ... mrange[mcount -1]. It returns the intersection in rval.
3328  * Individual arrays are assumed to contain non-overlapping ranges.
3329  * Algorithm:
3330  *   A range has two boundaries - min and max. We scan all arrays and store
3331  * each boundary as a separate element in a temporary array. We also store
3332  * the boundary types, min or max, as +1 or -1 respectively in the temporary
3333  * array. Then we sort the temporary array in ascending order. We scan the
3334  * sorted array from lower to higher values and keep a cumulative sum of
3335  * boundary types. Element in the temporary array for which the sum reaches
3336  * mcount is a min boundary of a range in the result and next element will be
3337  * max boundary.
3338  *
3339  * Example for mcount = 3,
3340  *
3341  *  ----|_________|-------|_______|----|__|------ mrange[0]
3342  *
3343  *  -------|________|--|____________|-----|___|-- mrange[1]
3344  *
3345  *  --------|________________|-------|____|------ mrange[2]
3346  *
3347  *                                      3 2 1
3348  *                                       \|/
3349  *      1  23     2 1  2  3  2    1 01 2  V   0  <- the sum
3350  *  ----|--||-----|-|--|--|--|----|-||-|--|---|-- sorted array
3351  *
3352  *                                 same min and max
3353  *                                        V
3354  *  --------|_____|-------|__|------------|------ intersecting ranges
3355  */
3356 void
3357 aggr_mtu_range_intersection(mac_propval_range_t **mrange, int mcount,
3358     mac_propval_uint32_range_t **prval, int *prmaxcnt, int *prcount)
3359 {
3360 	mac_propval_uint32_range_t	*rval, *ur;
3361 	int				rmaxcnt, rcount;
3362 	size_t				sz_range32;
3363 	rboundary_t			*ta; /* temporary array */
3364 	rboundary_t			temp;
3365 	boolean_t			range_started = B_FALSE;
3366 	int				i, j, m, sum;
3367 
3368 	sz_range32 = sizeof (mac_propval_uint32_range_t);
3369 
3370 	for (i = 0, rmaxcnt = 0; i < mcount; i++)
3371 		rmaxcnt += mrange[i]->mpr_count;
3372 
3373 	/* Allocate enough space to store the results */
3374 	rval = kmem_alloc(rmaxcnt * sz_range32, KM_SLEEP);
3375 
3376 	/* Number of boundaries are twice as many as ranges */
3377 	ta = kmem_alloc(2 * rmaxcnt * sizeof (rboundary_t), KM_SLEEP);
3378 
3379 	for (i = 0, m = 0; i < mcount; i++) {
3380 		ur = &(mrange[i]->mpr_range_uint32[0]);
3381 		for (j = 0; j < mrange[i]->mpr_count; j++) {
3382 			ta[m].bval = ur[j].mpur_min;
3383 			ta[m++].btype = 1;
3384 			ta[m].bval = ur[j].mpur_max;
3385 			ta[m++].btype = -1;
3386 		}
3387 	}
3388 
3389 	/*
3390 	 * Sort the temporary array in ascending order of bval;
3391 	 * if boundary values are same then sort on btype.
3392 	 */
3393 	for (i = 0; i < m-1; i++) {
3394 		for (j = i+1; j < m; j++) {
3395 			if ((ta[i].bval > ta[j].bval) ||
3396 			    ((ta[i].bval == ta[j].bval) &&
3397 			    (ta[i].btype < ta[j].btype))) {
3398 				temp = ta[i];
3399 				ta[i] = ta[j];
3400 				ta[j] = temp;
3401 			}
3402 		}
3403 	}
3404 
3405 	/* Walk through temporary array to find all ranges in the results */
3406 	for (i = 0, sum = 0, rcount = 0; i < m; i++) {
3407 		sum += ta[i].btype;
3408 		if (sum == mcount) {
3409 			rval[rcount].mpur_min = ta[i].bval;
3410 			range_started = B_TRUE;
3411 		} else if (sum < mcount && range_started) {
3412 			rval[rcount++].mpur_max = ta[i].bval;
3413 			range_started = B_FALSE;
3414 		}
3415 	}
3416 
3417 	*prval = rval;
3418 	*prmaxcnt = rmaxcnt;
3419 	*prcount = rcount;
3420 
3421 	kmem_free(ta, 2 * rmaxcnt * sizeof (rboundary_t));
3422 }
3423 
3424 /*
3425  * Returns the mtu ranges which could be supported by aggr group.
3426  * prmaxcnt returns the size of the buffer prval, prcount returns
3427  * the number of valid entries in prval. Caller is responsible
3428  * for freeing up prval.
3429  */
3430 int
3431 aggr_grp_possible_mtu_range(aggr_grp_t *grp, mac_propval_uint32_range_t **prval,
3432     int *prmaxcnt, int *prcount)
3433 {
3434 	mac_propval_range_t		**vals;
3435 	aggr_port_t			*port;
3436 	mac_perim_handle_t		mph;
3437 	uint_t				i, numr;
3438 	int				err = 0;
3439 	size_t				sz_propval, sz_range32;
3440 	size_t				size;
3441 
3442 	sz_propval = sizeof (mac_propval_range_t);
3443 	sz_range32 = sizeof (mac_propval_uint32_range_t);
3444 
3445 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
3446 
3447 	vals = kmem_zalloc(sizeof (mac_propval_range_t *) * grp->lg_nports,
3448 	    KM_SLEEP);
3449 
3450 	for (port = grp->lg_ports, i = 0; port != NULL;
3451 	    port = port->lp_next, i++) {
3452 
3453 		size = sz_propval;
3454 		vals[i] = kmem_alloc(size, KM_SLEEP);
3455 		vals[i]->mpr_count = 1;
3456 
3457 		mac_perim_enter_by_mh(port->lp_mh, &mph);
3458 
3459 		err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL,
3460 		    NULL, 0, vals[i], NULL);
3461 		if (err == ENOSPC) {
3462 			/*
3463 			 * Not enough space to hold all ranges.
3464 			 * Allocate extra space as indicated and retry.
3465 			 */
3466 			numr = vals[i]->mpr_count;
3467 			kmem_free(vals[i], sz_propval);
3468 			size = sz_propval + (numr - 1) * sz_range32;
3469 			vals[i] = kmem_alloc(size, KM_SLEEP);
3470 			vals[i]->mpr_count = numr;
3471 			err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL,
3472 			    NULL, 0, vals[i], NULL);
3473 			ASSERT(err != ENOSPC);
3474 		}
3475 		mac_perim_exit(mph);
3476 		if (err != 0) {
3477 			kmem_free(vals[i], size);
3478 			vals[i] = NULL;
3479 			break;
3480 		}
3481 	}
3482 
3483 	/*
3484 	 * if any of the underlying ports does not support changing MTU then
3485 	 * just return ENOTSUP
3486 	 */
3487 	if (port != NULL) {
3488 		ASSERT(err != 0);
3489 		goto done;
3490 	}
3491 
3492 	aggr_mtu_range_intersection(vals, grp->lg_nports, prval, prmaxcnt,
3493 	    prcount);
3494 
3495 done:
3496 	for (i = 0; i < grp->lg_nports; i++) {
3497 		if (vals[i] != NULL) {
3498 			numr = vals[i]->mpr_count;
3499 			size = sz_propval + (numr - 1) * sz_range32;
3500 			kmem_free(vals[i], size);
3501 		}
3502 	}
3503 
3504 	kmem_free(vals, sizeof (mac_propval_range_t *) * grp->lg_nports);
3505 	return (err);
3506 }
3507 
3508 static void
3509 aggr_m_propinfo(void *m_driver, const char *pr_name, mac_prop_id_t pr_num,
3510     mac_prop_info_handle_t prh)
3511 {
3512 	aggr_grp_t			*grp = m_driver;
3513 	mac_propval_uint32_range_t	*rval = NULL;
3514 	int				i, rcount, rmaxcnt;
3515 	int				err = 0;
3516 
3517 	_NOTE(ARGUNUSED(pr_name));
3518 
3519 	if (pr_num != MAC_PROP_MTU)
3520 		return;
3521 
3522 	err = aggr_grp_possible_mtu_range(grp, &rval, &rmaxcnt, &rcount);
3523 	if (err != 0) {
3524 		ASSERT(rval == NULL);
3525 		return;
3526 	}
3527 	for (i = 0; i < rcount; i++) {
3528 		mac_prop_info_set_range_uint32(prh,
3529 		    rval[i].mpur_min, rval[i].mpur_max);
3530 	}
3531 	kmem_free(rval, sizeof (mac_propval_uint32_range_t) * rmaxcnt);
3532 }
3533