xref: /illumos-gate/usr/src/uts/common/io/aggr/aggr_grp.c (revision 956e8222f10bf55e45b41d8b56084f72ebc113c9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * IEEE 802.3ad Link Aggregation -- Link Aggregation Groups.
31  *
32  * An instance of the structure aggr_grp_t is allocated for each
33  * link aggregation group. When created, aggr_grp_t objects are
34  * entered into the aggr_grp_hash hash table maintained by the modhash
35  * module. The hash key is the port number associated with the link
36  * aggregation group. The port number associated with a group corresponds
37  * to the key associated with the group.
38  *
39  * A set of MAC ports is associated with each aggregation group.
40  */
41 
42 #include <sys/types.h>
43 #include <sys/sysmacros.h>
44 #include <sys/conf.h>
45 #include <sys/cmn_err.h>
46 #include <sys/list.h>
47 #include <sys/ksynch.h>
48 #include <sys/kmem.h>
49 #include <sys/stream.h>
50 #include <sys/modctl.h>
51 #include <sys/ddi.h>
52 #include <sys/sunddi.h>
53 #include <sys/atomic.h>
54 #include <sys/stat.h>
55 #include <sys/modhash.h>
56 #include <sys/strsun.h>
57 #include <sys/dlpi.h>
58 
59 #include <sys/aggr.h>
60 #include <sys/aggr_impl.h>
61 
62 static void aggr_m_info(void *, mac_info_t *);
63 static int aggr_m_start(void *);
64 static void aggr_m_stop(void *);
65 static int aggr_m_promisc(void *, boolean_t);
66 static int aggr_m_multicst(void *, boolean_t, const uint8_t *);
67 static int aggr_m_unicst(void *, const uint8_t *);
68 static uint64_t aggr_m_stat(void *, enum mac_stat);
69 static void aggr_m_resources(void *);
70 static void aggr_m_ioctl(void *, queue_t *, mblk_t *);
71 
72 static aggr_port_t *aggr_grp_port_lookup(aggr_grp_t *, const char *, uint32_t);
73 static int aggr_grp_rem_port(aggr_grp_t *, aggr_port_t *, boolean_t *);
74 static void aggr_stats_op(enum mac_stat, uint64_t *, uint64_t *, boolean_t);
75 static void aggr_grp_capab_set(aggr_grp_t *);
76 static boolean_t aggr_grp_capab_check(aggr_grp_t *, aggr_port_t *);
77 
78 static kmem_cache_t	*aggr_grp_cache;
79 static mod_hash_t	*aggr_grp_hash;
80 static krwlock_t	aggr_grp_lock;
81 static uint_t		aggr_grp_cnt;
82 
/* Number of buckets of the group hash table. */
#define	GRP_HASHSZ		64
/*
 * Convert a group key into a mod_hash key. The argument is parenthesized
 * so that an expression (e.g. "a & b") is evaluated before being cast.
 */
#define	GRP_HASH_KEY(key)	((mod_hash_key_t)(uintptr_t)(key))
85 
86 static uchar_t aggr_zero_mac[] = {0, 0, 0, 0, 0, 0};
87 static uchar_t aggr_brdcst_mac[] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
88 
89 /* used by grp_info_walker */
90 typedef struct aggr_grp_info_state {
91 	uint32_t	ls_group_key;
92 	boolean_t	ls_group_found;
93 	aggr_grp_info_new_grp_fn_t ls_new_grp_fn;
94 	aggr_grp_info_new_port_fn_t ls_new_port_fn;
95 	void		*ls_fn_arg;
96 	int		ls_rc;
97 } aggr_grp_info_state_t;
98 
99 /*ARGSUSED*/
100 static int
101 aggr_grp_constructor(void *buf, void *arg, int kmflag)
102 {
103 	aggr_grp_t *grp = buf;
104 
105 	bzero(grp, sizeof (*grp));
106 	rw_init(&grp->lg_lock, NULL, RW_DRIVER, NULL);
107 	mutex_init(&grp->aggr.gl_lock, NULL, MUTEX_DEFAULT, NULL);
108 
109 	grp->lg_link_state = LINK_STATE_UNKNOWN;
110 
111 	return (0);
112 }
113 
114 /*ARGSUSED*/
115 static void
116 aggr_grp_destructor(void *buf, void *arg)
117 {
118 	aggr_grp_t *grp = buf;
119 
120 	if (grp->lg_tx_ports != NULL) {
121 		kmem_free(grp->lg_tx_ports,
122 		    grp->lg_tx_ports_size * sizeof (aggr_port_t *));
123 	}
124 
125 	mutex_destroy(&grp->aggr.gl_lock);
126 	rw_destroy(&grp->lg_lock);
127 }
128 
129 void
130 aggr_grp_init(void)
131 {
132 	aggr_grp_cache = kmem_cache_create("aggr_grp_cache",
133 	    sizeof (aggr_grp_t), 0, aggr_grp_constructor,
134 	    aggr_grp_destructor, NULL, NULL, NULL, 0);
135 
136 	aggr_grp_hash = mod_hash_create_idhash("aggr_grp_hash",
137 	    GRP_HASHSZ, mod_hash_null_valdtor);
138 	rw_init(&aggr_grp_lock, NULL, RW_DEFAULT, NULL);
139 	aggr_grp_cnt = 0;
140 }
141 
142 int
143 aggr_grp_fini(void)
144 {
145 	if (aggr_grp_cnt > 0)
146 		return (EBUSY);
147 
148 	rw_destroy(&aggr_grp_lock);
149 	mod_hash_destroy_idhash(aggr_grp_hash);
150 	kmem_cache_destroy(aggr_grp_cache);
151 	return (0);
152 }
153 
154 uint_t
155 aggr_grp_count(void)
156 {
157 	uint_t	count;
158 
159 	rw_enter(&aggr_grp_lock, RW_READER);
160 	count = aggr_grp_cnt;
161 	rw_exit(&aggr_grp_lock);
162 	return (count);
163 }
164 
165 /*
166  * Attach a port to a link aggregation group.
167  *
168  * A port is attached to a link aggregation group once its speed
169  * and link state have been verified.
170  *
171  * Returns B_TRUE if the group link state or speed has changed. If
172  * it's the case, the caller must notify the MAC layer via a call
173  * to mac_link().
174  */
175 boolean_t
176 aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port)
177 {
178 	boolean_t link_changed = B_FALSE;
179 
180 	ASSERT(AGGR_LACP_LOCK_HELD(grp));
181 	ASSERT(RW_WRITE_HELD(&grp->lg_lock));
182 	ASSERT(RW_WRITE_HELD(&port->lp_lock));
183 
184 	if (port->lp_state == AGGR_PORT_STATE_ATTACHED)
185 		return (B_FALSE);
186 
187 	/*
188 	 * Validate the MAC port link speed and update the group
189 	 * link speed if needed.
190 	 */
191 	if (port->lp_ifspeed == 0 ||
192 	    port->lp_link_state != LINK_STATE_UP ||
193 	    port->lp_link_duplex != LINK_DUPLEX_FULL) {
194 		/*
195 		 * Can't attach a MAC port with unknown link speed,
196 		 * down link, or not in full duplex mode.
197 		 */
198 		return (B_FALSE);
199 	}
200 
201 	if (grp->lg_ifspeed == 0) {
202 		/*
203 		 * The group inherits the speed of the first link being
204 		 * attached.
205 		 */
206 		grp->lg_ifspeed = port->lp_ifspeed;
207 		link_changed = B_TRUE;
208 	} else if (grp->lg_ifspeed != port->lp_ifspeed) {
209 		/*
210 		 * The link speed of the MAC port must be the same as
211 		 * the group link speed, as per 802.3ad. Since it is
212 		 * not, the attach is cancelled.
213 		 */
214 		return (B_FALSE);
215 	}
216 
217 	grp->lg_nattached_ports++;
218 
219 	/*
220 	 * Update the group link state.
221 	 */
222 	if (grp->lg_link_state != LINK_STATE_UP) {
223 		grp->lg_link_state = LINK_STATE_UP;
224 		grp->lg_link_duplex = LINK_DUPLEX_FULL;
225 		link_changed = B_TRUE;
226 	}
227 
228 	aggr_grp_multicst_port(port, B_TRUE);
229 
230 	/*
231 	 * Update port's state.
232 	 */
233 	port->lp_state = AGGR_PORT_STATE_ATTACHED;
234 
235 	/*
236 	 * If LACP is OFF, the port can be used to send data as soon
237 	 * as its link is up and verified to be compatible with the
238 	 * aggregation.
239 	 *
240 	 * If LACP is active or passive, notify the LACP subsystem, which
241 	 * will enable sending on the port following the LACP protocol.
242 	 */
243 	if (grp->lg_lacp_mode == AGGR_LACP_OFF)
244 		aggr_send_port_enable(port);
245 	else
246 		aggr_lacp_port_attached(port);
247 
248 	return (link_changed);
249 }
250 
251 boolean_t
252 aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port)
253 {
254 	boolean_t link_changed = B_FALSE;
255 
256 	ASSERT(RW_WRITE_HELD(&grp->lg_lock));
257 	ASSERT(RW_WRITE_HELD(&port->lp_lock));
258 	ASSERT(AGGR_LACP_LOCK_HELD(grp));
259 
260 	/* update state */
261 	if (port->lp_state != AGGR_PORT_STATE_ATTACHED)
262 		return (B_FALSE);
263 	port->lp_state = AGGR_PORT_STATE_STANDBY;
264 
265 	aggr_grp_multicst_port(port, B_FALSE);
266 
267 	if (grp->lg_lacp_mode == AGGR_LACP_OFF)
268 		aggr_send_port_disable(port);
269 	else
270 		aggr_lacp_port_detached(port);
271 
272 	grp->lg_nattached_ports--;
273 	if (grp->lg_nattached_ports == 0) {
274 		/* the last attached MAC port of the group is being detached */
275 		grp->lg_ifspeed = 0;
276 		grp->lg_link_state = LINK_STATE_DOWN;
277 		grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
278 		link_changed = B_TRUE;
279 	}
280 
281 	return (link_changed);
282 }
283 
284 /*
285  * Update the MAC addresses of the constituent ports of the specified
286  * group. This function is invoked:
287  * - after creating a new aggregation group.
288  * - after adding new ports to an aggregation group.
289  * - after removing a port from a group when the MAC address of
290  *   that port was used for the MAC address of the group.
291  * - after the MAC address of a port changed when the MAC address
292  *   of that port was used for the MAC address of the group.
293  */
294 void
295 aggr_grp_update_ports_mac(aggr_grp_t *grp)
296 {
297 	aggr_port_t *cport;
298 
299 	ASSERT(RW_WRITE_HELD(&grp->lg_lock));
300 
301 	for (cport = grp->lg_ports; cport != NULL;
302 	    cport = cport->lp_next) {
303 		rw_enter(&cport->lp_lock, RW_WRITER);
304 		if (aggr_port_unicst(cport, grp->lg_addr) != 0)
305 			(void) aggr_grp_detach_port(grp, cport);
306 		rw_exit(&cport->lp_lock);
307 		if (grp->lg_closing)
308 			break;
309 	}
310 }
311 
312 /*
313  * Invoked when the MAC address of a port has changed. If the port's
314  * MAC address was used for the group MAC address, returns B_TRUE.
315  * In that case, it is the responsibility of the caller to
316  * invoke aggr_grp_update_ports_mac() after releasing the
317  * the port lock, and aggr_grp_notify() after releasing the
318  * group lock.
319  */
320 boolean_t
321 aggr_grp_port_mac_changed(aggr_grp_t *grp, aggr_port_t *port)
322 {
323 	boolean_t grp_addr_changed = B_FALSE;
324 
325 	ASSERT(RW_WRITE_HELD(&grp->lg_lock));
326 	ASSERT(RW_WRITE_HELD(&port->lp_lock));
327 
328 	if (grp->lg_addr_fixed) {
329 		/*
330 		 * The group is using a fixed MAC address or an automatic
331 		 * MAC address has not been set.
332 		 */
333 		return (B_FALSE);
334 	}
335 
336 	if (grp->lg_mac_addr_port == port) {
337 		/*
338 		 * The MAC address of the port was assigned to the group
339 		 * MAC address. Update the group MAC address.
340 		 */
341 		bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL);
342 		grp_addr_changed = B_TRUE;
343 	} else {
344 		/*
345 		 * Update the actual port MAC address to the MAC address
346 		 * of the group.
347 		 */
348 		if (aggr_port_unicst(port, grp->lg_addr) != 0)
349 			(void) aggr_grp_detach_port(grp, port);
350 	}
351 
352 	return (grp_addr_changed);
353 }
354 
355 /*
356  * Add a port to a link aggregation group.
357  */
358 static int
359 aggr_grp_add_port(aggr_grp_t *grp, const char *name, uint_t portnum,
360     aggr_port_t **pp)
361 {
362 	aggr_port_t *port, **cport;
363 	int err;
364 
365 	ASSERT(AGGR_LACP_LOCK_HELD(grp));
366 	ASSERT(RW_WRITE_HELD(&grp->lg_lock));
367 
368 	/* create new port */
369 	err = aggr_port_create(name, portnum, &port);
370 	if (err != 0)
371 		return (err);
372 
373 	rw_enter(&port->lp_lock, RW_WRITER);
374 
375 	/* add port to list of group constituent ports */
376 	cport = &grp->lg_ports;
377 	while (*cport != NULL)
378 		cport = &((*cport)->lp_next);
379 	*cport = port;
380 
381 	/*
382 	 * Back reference to the group it is member of. A port always
383 	 * holds a reference to its group to ensure that the back
384 	 * reference is always valid.
385 	 */
386 	port->lp_grp = grp;
387 	AGGR_GRP_REFHOLD(grp);
388 	grp->lg_nports++;
389 
390 	aggr_lacp_init_port(port);
391 
392 	rw_exit(&port->lp_lock);
393 
394 	if (pp != NULL)
395 		*pp = port;
396 
397 	return (0);
398 }
399 
400 /*
401  * Add one or more ports to an existing link aggregation group.
402  */
403 int
404 aggr_grp_add_ports(uint32_t key, uint_t nports, laioc_port_t *ports)
405 {
406 	int rc, i, nadded = 0;
407 	aggr_grp_t *grp = NULL;
408 	aggr_port_t *port;
409 
410 	/* get group corresponding to key */
411 	rw_enter(&aggr_grp_lock, RW_READER);
412 	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(key),
413 	    (mod_hash_val_t *)&grp) != 0) {
414 		rw_exit(&aggr_grp_lock);
415 		return (ENOENT);
416 	}
417 	AGGR_GRP_REFHOLD(grp);
418 	rw_exit(&aggr_grp_lock);
419 
420 	AGGR_LACP_LOCK(grp);
421 	rw_enter(&grp->lg_lock, RW_WRITER);
422 
423 	/* add the specified ports to group */
424 	for (i = 0; i < nports; i++) {
425 		/* add port to group */
426 		if ((rc = aggr_grp_add_port(grp, ports[i].lp_devname,
427 		    ports[i].lp_port, &port)) != 0)
428 			goto bail;
429 		ASSERT(port != NULL);
430 		nadded++;
431 
432 		/* check capabilities */
433 		if (!aggr_grp_capab_check(grp, port)) {
434 			rc = ENOTSUP;
435 			goto bail;
436 		}
437 
438 		/* start port if group has already been started */
439 		if (grp->lg_started) {
440 			rw_enter(&port->lp_lock, RW_WRITER);
441 			rc = aggr_port_start(port);
442 			if (rc != 0) {
443 				rw_exit(&port->lp_lock);
444 				goto bail;
445 			}
446 
447 			/* set port promiscuous mode */
448 			rc = aggr_port_promisc(port, grp->lg_promisc);
449 			if (rc != 0) {
450 				rw_exit(&port->lp_lock);
451 				goto bail;
452 			}
453 			rw_exit(&port->lp_lock);
454 		}
455 	}
456 
457 	/* update the MAC address of the constituent ports */
458 	aggr_grp_update_ports_mac(grp);
459 
460 bail:
461 	if (rc != 0) {
462 		/* stop and remove ports that have been added */
463 		for (i = 0; i < nadded && !grp->lg_closing; i++) {
464 			port = aggr_grp_port_lookup(grp, ports[i].lp_devname,
465 			    ports[i].lp_port);
466 			ASSERT(port != NULL);
467 			if (grp->lg_started) {
468 				rw_enter(&port->lp_lock, RW_WRITER);
469 				aggr_port_stop(port);
470 				rw_exit(&port->lp_lock);
471 			}
472 			(void) aggr_grp_rem_port(grp, port, NULL);
473 		}
474 	}
475 
476 	rw_exit(&grp->lg_lock);
477 	AGGR_LACP_UNLOCK(grp);
478 	if (rc == 0 && !grp->lg_closing)
479 		mac_resource_update(&grp->lg_mac);
480 	AGGR_GRP_REFRELE(grp);
481 	return (rc);
482 }
483 
484 /*
485  * Update properties of an existing link aggregation group.
486  */
487 int
488 aggr_grp_modify(uint32_t key, aggr_grp_t *grp_arg, uint8_t update_mask,
489     uint32_t policy, boolean_t mac_fixed, const uchar_t *mac_addr,
490     aggr_lacp_mode_t lacp_mode, aggr_lacp_timer_t lacp_timer)
491 {
492 	int rc = 0;
493 	aggr_grp_t *grp = NULL;
494 	boolean_t mac_addr_changed = B_FALSE;
495 
496 	if (grp_arg == NULL) {
497 		/* get group corresponding to key */
498 		rw_enter(&aggr_grp_lock, RW_READER);
499 		if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(key),
500 		    (mod_hash_val_t *)&grp) != 0) {
501 			rc = ENOENT;
502 			goto bail;
503 		}
504 		AGGR_LACP_LOCK(grp);
505 		rw_enter(&grp->lg_lock, RW_WRITER);
506 	} else {
507 		grp = grp_arg;
508 		ASSERT(AGGR_LACP_LOCK_HELD(grp));
509 		ASSERT(RW_WRITE_HELD(&grp->lg_lock));
510 	}
511 
512 	ASSERT(RW_WRITE_HELD(&grp->lg_lock) || RW_READ_HELD(&grp->lg_lock));
513 	AGGR_GRP_REFHOLD(grp);
514 
515 	/* validate fixed address if specified */
516 	if ((update_mask & AGGR_MODIFY_MAC) && mac_fixed &&
517 	    ((bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) ||
518 	    (mac_addr[0] & 0x01))) {
519 		rc = EINVAL;
520 		goto bail;
521 	}
522 
523 	/* update policy if requested */
524 	if (update_mask & AGGR_MODIFY_POLICY)
525 		aggr_send_update_policy(grp, policy);
526 
527 	/* update unicast MAC address if requested */
528 	if (update_mask & AGGR_MODIFY_MAC) {
529 		if (mac_fixed) {
530 			/* user-supplied MAC address */
531 			grp->lg_mac_addr_port = NULL;
532 			if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) != 0) {
533 				bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
534 				mac_addr_changed = B_TRUE;
535 			}
536 		} else if (grp->lg_addr_fixed) {
537 			/* switch from user-supplied to automatic */
538 			aggr_port_t *port = grp->lg_ports;
539 
540 			rw_enter(&port->lp_lock, RW_WRITER);
541 			bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL);
542 			grp->lg_mac_addr_port = port;
543 			mac_addr_changed = B_TRUE;
544 			rw_exit(&port->lp_lock);
545 		}
546 		grp->lg_addr_fixed = mac_fixed;
547 	}
548 
549 	if (mac_addr_changed)
550 		aggr_grp_update_ports_mac(grp);
551 
552 	if (update_mask & AGGR_MODIFY_LACP_MODE)
553 		aggr_lacp_update_mode(grp, lacp_mode);
554 
555 	if ((update_mask & AGGR_MODIFY_LACP_TIMER) && !grp->lg_closing)
556 		aggr_lacp_update_timer(grp, lacp_timer);
557 
558 bail:
559 	if (grp_arg == NULL) {
560 		if (grp != NULL) {
561 			rw_exit(&grp->lg_lock);
562 			AGGR_LACP_UNLOCK(grp);
563 		}
564 		rw_exit(&aggr_grp_lock);
565 		/* pass new unicast address up to MAC layer */
566 		if (grp != NULL && mac_addr_changed && !grp->lg_closing)
567 			mac_unicst_update(&grp->lg_mac, grp->lg_addr);
568 	}
569 
570 	if (grp != NULL)
571 		AGGR_GRP_REFRELE(grp);
572 
573 	return (rc);
574 }
575 
576 /*
577  * Create a new link aggregation group upon request from administrator.
578  * Returns 0 on success, an errno on failure.
579  */
580 int
581 aggr_grp_create(uint32_t key, uint_t nports, laioc_port_t *ports,
582     uint32_t policy, boolean_t mac_fixed, uchar_t *mac_addr,
583     aggr_lacp_mode_t lacp_mode, aggr_lacp_timer_t lacp_timer)
584 {
585 	aggr_grp_t *grp = NULL;
586 	aggr_port_t *port;
587 	mac_t *mac;
588 	mac_info_t *mip;
589 	int err;
590 	int i;
591 
592 	/* need at least one port */
593 	if (nports == 0)
594 		return (EINVAL);
595 
596 	rw_enter(&aggr_grp_lock, RW_WRITER);
597 
598 	/* does a group with the same key already exist? */
599 	err = mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(key),
600 	    (mod_hash_val_t *)&grp);
601 	if (err == 0) {
602 		rw_exit(&aggr_grp_lock);
603 		return (EEXIST);
604 	}
605 
606 	grp = kmem_cache_alloc(aggr_grp_cache, KM_SLEEP);
607 
608 	AGGR_LACP_LOCK(grp);
609 	rw_enter(&grp->lg_lock, RW_WRITER);
610 
611 	grp->lg_refs = 1;
612 	grp->lg_closing = B_FALSE;
613 	grp->lg_key = key;
614 
615 	grp->lg_ifspeed = 0;
616 	grp->lg_link_state = LINK_STATE_UNKNOWN;
617 	grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
618 	grp->lg_started = B_FALSE;
619 	grp->lg_promisc = B_FALSE;
620 	aggr_lacp_init_grp(grp);
621 
622 	/* add MAC ports to group */
623 	grp->lg_ports = NULL;
624 	grp->lg_nports = 0;
625 	grp->lg_nattached_ports = 0;
626 	grp->lg_ntx_ports = 0;
627 
628 	for (i = 0; i < nports; i++) {
629 		err = aggr_grp_add_port(grp, ports[i].lp_devname,
630 		    ports[i].lp_port, NULL);
631 		if (err != 0)
632 			goto bail;
633 	}
634 
635 	/*
636 	 * If no explicit MAC address was specified by the administrator,
637 	 * set it to the MAC address of the first port.
638 	 */
639 	grp->lg_addr_fixed = mac_fixed;
640 	if (grp->lg_addr_fixed) {
641 		/* validate specified address */
642 		if (bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) {
643 			err = EINVAL;
644 			goto bail;
645 		}
646 		bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
647 	} else {
648 		bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
649 		grp->lg_mac_addr_port = grp->lg_ports;
650 	}
651 
652 	/* update the MAC address of the constituent ports */
653 	aggr_grp_update_ports_mac(grp);
654 
655 	/* update outbound load balancing policy */
656 	aggr_send_update_policy(grp, policy);
657 
658 	/* register with the MAC module */
659 	mac = &grp->lg_mac;
660 	bzero(mac, sizeof (*mac));
661 
662 	mac->m_ident = MAC_IDENT;
663 
664 	mac->m_driver = grp;
665 	mac->m_dip = aggr_dip;
666 	mac->m_port = key;
667 
668 	mip = &(mac->m_info);
669 	mip->mi_media = DL_ETHER;
670 	mip->mi_sdu_min = 0;
671 	mip->mi_sdu_max = ETHERMTU;
672 
673 	MAC_STAT_MIB(mip->mi_stat);
674 	MAC_STAT_ETHER(mip->mi_stat);
675 	mip->mi_stat[MAC_STAT_LINK_DUPLEX] = B_TRUE;
676 
677 	mip->mi_addr_length = ETHERADDRL;
678 	bcopy(aggr_brdcst_mac, mip->mi_brdcst_addr, ETHERADDRL);
679 	bcopy(grp->lg_addr, mip->mi_unicst_addr, ETHERADDRL);
680 
681 	mac->m_stat = aggr_m_stat;
682 	mac->m_start = aggr_m_start;
683 	mac->m_stop = aggr_m_stop;
684 	mac->m_promisc = aggr_m_promisc;
685 	mac->m_multicst = aggr_m_multicst;
686 	mac->m_unicst = aggr_m_unicst;
687 	mac->m_tx = aggr_m_tx;
688 	mac->m_resources = aggr_m_resources;
689 	mac->m_ioctl = aggr_m_ioctl;
690 
691 	/* set the initial group capabilities */
692 	aggr_grp_capab_set(grp);
693 
694 	if ((err = mac_register(mac)) != 0)
695 		goto bail;
696 
697 	/* set LACP mode */
698 	aggr_lacp_set_mode(grp, lacp_mode, lacp_timer);
699 
700 	/* add new group to hash table */
701 	err = mod_hash_insert(aggr_grp_hash, GRP_HASH_KEY(key),
702 	    (mod_hash_val_t)grp);
703 	ASSERT(err == 0);
704 	aggr_grp_cnt++;
705 
706 	rw_exit(&grp->lg_lock);
707 	AGGR_LACP_UNLOCK(grp);
708 	rw_exit(&aggr_grp_lock);
709 	return (0);
710 
711 bail:
712 	if (grp != NULL) {
713 		aggr_port_t *cport;
714 
715 		port = grp->lg_ports;
716 		while (port != NULL) {
717 			cport = port->lp_next;
718 			aggr_port_delete(port);
719 			port = cport;
720 		}
721 
722 		rw_exit(&grp->lg_lock);
723 		AGGR_LACP_UNLOCK(grp);
724 
725 		kmem_cache_free(aggr_grp_cache, grp);
726 	}
727 
728 	rw_exit(&aggr_grp_lock);
729 	return (err);
730 }
731 
732 /*
733  * Return a pointer to the member of a group with specified device name
734  * and port number.
735  */
736 static aggr_port_t *
737 aggr_grp_port_lookup(aggr_grp_t *grp, const char *devname, uint32_t portnum)
738 {
739 	aggr_port_t *port;
740 
741 	ASSERT(RW_WRITE_HELD(&grp->lg_lock) || RW_READ_HELD(&grp->lg_lock));
742 
743 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
744 		if ((strcmp(port->lp_devname, devname) == 0) &&
745 		    (port->lp_port == portnum))
746 			break;
747 	}
748 
749 	return (port);
750 }
751 
752 /*
753  * Stop, detach and remove a port from a link aggregation group.
754  */
755 static int
756 aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port, boolean_t *do_notify)
757 {
758 	aggr_port_t **pport;
759 	boolean_t grp_mac_addr_changed = B_FALSE;
760 	uint64_t val;
761 	uint_t i;
762 
763 	ASSERT(AGGR_LACP_LOCK_HELD(grp));
764 	ASSERT(RW_WRITE_HELD(&grp->lg_lock));
765 	ASSERT(grp->lg_nports > 1);
766 
767 	if (do_notify != NULL)
768 		*do_notify = B_FALSE;
769 
770 	/* unlink port */
771 	for (pport = &grp->lg_ports; *pport != port;
772 	    pport = &(*pport)->lp_next) {
773 		if (*pport == NULL)
774 			return (ENOENT);
775 	}
776 	*pport = port->lp_next;
777 
778 	rw_enter(&port->lp_lock, RW_WRITER);
779 	port->lp_closing = B_TRUE;
780 
781 	/*
782 	 * If the MAC address of the port being removed was assigned
783 	 * to the group, update the group MAC address
784 	 * using the MAC address of a different port.
785 	 */
786 	if (!grp->lg_addr_fixed && grp->lg_mac_addr_port == port) {
787 		/*
788 		 * Set the MAC address of the group to the
789 		 * MAC address of its first port.
790 		 */
791 		bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
792 		grp->lg_mac_addr_port = grp->lg_ports;
793 		grp_mac_addr_changed = B_TRUE;
794 	}
795 
796 	(void) aggr_grp_detach_port(grp, port);
797 
798 	/*
799 	 * Add the statistics of the ports while it was aggregated
800 	 * to the group's residual statistics.
801 	 */
802 	for (i = 0; i < MAC_NSTAT && !grp->lg_closing; i++) {
803 		/* avoid stats that are not counters */
804 		if (i == MAC_STAT_IFSPEED || i == MAC_STAT_LINK_DUPLEX)
805 			continue;
806 
807 		/* get current value */
808 		val = aggr_port_stat(port, i);
809 		/* subtract value at the point of aggregation */
810 		val -= port->lp_stat[i];
811 		/* add to the residual stat */
812 		grp->lg_stat[i] += val;
813 	}
814 
815 	grp->lg_nports--;
816 
817 	rw_exit(&port->lp_lock);
818 
819 	aggr_port_delete(port);
820 
821 	/*
822 	 * If the group MAC address has changed, update the MAC address of
823 	 * the remaining consistuent ports according to the new MAC
824 	 * address of the group.
825 	 */
826 	if (grp->lg_closing) {
827 		*do_notify = B_FALSE;
828 	} else {
829 		if (grp_mac_addr_changed)
830 			aggr_grp_update_ports_mac(grp);
831 
832 		if (do_notify != NULL)
833 			*do_notify = grp_mac_addr_changed;
834 	}
835 
836 	return (0);
837 }
838 
839 /*
840  * Remove one or more ports from an existing link aggregation group.
841  */
842 int
843 aggr_grp_rem_ports(uint32_t key, uint_t nports, laioc_port_t *ports)
844 {
845 	int rc = 0, i;
846 	aggr_grp_t *grp = NULL;
847 	aggr_port_t *port;
848 	boolean_t notify = B_FALSE, grp_mac_addr_changed;
849 
850 	/* get group corresponding to key */
851 	rw_enter(&aggr_grp_lock, RW_READER);
852 	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(key),
853 	    (mod_hash_val_t *)&grp) != 0) {
854 		rw_exit(&aggr_grp_lock);
855 		return (ENOENT);
856 	}
857 	AGGR_GRP_REFHOLD(grp);
858 	rw_exit(&aggr_grp_lock);
859 
860 	AGGR_LACP_LOCK(grp);
861 	rw_enter(&grp->lg_lock, RW_WRITER);
862 
863 	/* we need to keep at least one port per group */
864 	if (nports >= grp->lg_nports) {
865 		rc = EINVAL;
866 		goto bail;
867 	}
868 
869 	/* first verify that all the groups are valid */
870 	for (i = 0; i < nports; i++) {
871 		if (aggr_grp_port_lookup(grp, ports[i].lp_devname,
872 		    ports[i].lp_port) == NULL) {
873 			/* port not found */
874 			rc = ENOENT;
875 			goto bail;
876 		}
877 	}
878 
879 	/* remove the specified ports from group */
880 	for (i = 0; i < nports && !grp->lg_closing; i++) {
881 		/* lookup port */
882 		port = aggr_grp_port_lookup(grp, ports[i].lp_devname,
883 		    ports[i].lp_port);
884 		ASSERT(port != NULL);
885 
886 		/* stop port if group has already been started */
887 		if (grp->lg_started) {
888 			rw_enter(&port->lp_lock, RW_WRITER);
889 			aggr_port_stop(port);
890 			rw_exit(&port->lp_lock);
891 		}
892 
893 		/* remove port from group */
894 		rc = aggr_grp_rem_port(grp, port, &grp_mac_addr_changed);
895 		ASSERT(rc == 0);
896 		notify = notify || grp_mac_addr_changed;
897 	}
898 
899 bail:
900 	rw_exit(&grp->lg_lock);
901 	AGGR_LACP_UNLOCK(grp);
902 	if (notify && !grp->lg_closing)
903 		mac_unicst_update(&grp->lg_mac, grp->lg_addr);
904 	if (rc == 0 && !grp->lg_closing)
905 		mac_resource_update(&grp->lg_mac);
906 	AGGR_GRP_REFRELE(grp);
907 
908 	return (rc);
909 }
910 
911 int
912 aggr_grp_delete(uint32_t key)
913 {
914 	aggr_grp_t *grp = NULL;
915 	aggr_port_t *port, *cport;
916 	mod_hash_val_t val;
917 
918 	rw_enter(&aggr_grp_lock, RW_WRITER);
919 
920 	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(key),
921 	    (mod_hash_val_t *)&grp) != 0) {
922 		rw_exit(&aggr_grp_lock);
923 		return (ENOENT);
924 	}
925 	AGGR_LACP_LOCK(grp);
926 	rw_enter(&grp->lg_lock, RW_WRITER);
927 	grp->lg_closing = B_TRUE;
928 
929 	/*
930 	 * Unregister from the MAC service module. Since this can
931 	 * fail if a client hasn't closed the MAC port, we gracefully
932 	 * fail the operation.
933 	 */
934 	if (mac_unregister(&grp->lg_mac)) {
935 		rw_exit(&grp->lg_lock);
936 		AGGR_LACP_UNLOCK(grp);
937 		rw_exit(&aggr_grp_lock);
938 		return (EBUSY);
939 	}
940 
941 	/* detach and free MAC ports associated with group */
942 	port = grp->lg_ports;
943 	while (port != NULL) {
944 		cport = port->lp_next;
945 		rw_enter(&port->lp_lock, RW_WRITER);
946 		if (grp->lg_started)
947 			aggr_port_stop(port);
948 		(void) aggr_grp_detach_port(grp, port);
949 		rw_exit(&port->lp_lock);
950 		aggr_port_delete(port);
951 		port = cport;
952 	}
953 
954 	rw_exit(&grp->lg_lock);
955 	AGGR_LACP_UNLOCK(grp);
956 
957 	(void) mod_hash_remove(aggr_grp_hash, GRP_HASH_KEY(key), &val);
958 	ASSERT(grp == (aggr_grp_t *)val);
959 
960 	ASSERT(aggr_grp_cnt > 0);
961 	aggr_grp_cnt--;
962 
963 	rw_exit(&aggr_grp_lock);
964 	AGGR_GRP_REFRELE(grp);
965 
966 	return (0);
967 }
968 
969 void
970 aggr_grp_free(aggr_grp_t *grp)
971 {
972 	ASSERT(grp->lg_refs == 0);
973 	kmem_cache_free(aggr_grp_cache, grp);
974 }
975 
976 /*
977  * Walker invoked when building the list of configured groups and
978  * their ports that must be passed up to user-space.
979  */
980 
981 /*ARGSUSED*/
982 static uint_t
983 aggr_grp_info_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
984 {
985 	aggr_grp_t *grp;
986 	aggr_port_t *port;
987 	aggr_grp_info_state_t *state = arg;
988 
989 	if (state->ls_rc != 0)
990 		return (MH_WALK_TERMINATE);	/* terminate walk */
991 
992 	grp = (aggr_grp_t *)val;
993 
994 	rw_enter(&grp->lg_lock, RW_READER);
995 
996 	if (state->ls_group_key != 0 && grp->lg_key != state->ls_group_key)
997 		goto bail;
998 
999 	state->ls_group_found = B_TRUE;
1000 
1001 	state->ls_rc = state->ls_new_grp_fn(state->ls_fn_arg, grp->lg_key,
1002 	    grp->lg_addr, grp->lg_addr_fixed, grp->lg_tx_policy,
1003 	    grp->lg_nports, grp->lg_lacp_mode, grp->aggr.PeriodicTimer);
1004 
1005 	if (state->ls_rc != 0)
1006 		goto bail;
1007 
1008 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1009 
1010 		rw_enter(&port->lp_lock, RW_READER);
1011 
1012 		state->ls_rc = state->ls_new_port_fn(state->ls_fn_arg,
1013 		    port->lp_devname, port->lp_port, port->lp_addr,
1014 		    port->lp_state, &port->lp_lacp.ActorOperPortState);
1015 
1016 		rw_exit(&port->lp_lock);
1017 
1018 		if (state->ls_rc != 0)
1019 			goto bail;
1020 	}
1021 
1022 bail:
1023 	rw_exit(&grp->lg_lock);
1024 	return ((state->ls_rc == 0) ? MH_WALK_CONTINUE : MH_WALK_TERMINATE);
1025 }
1026 
1027 int
1028 aggr_grp_info(uint_t *ngroups, uint32_t group_key, void *fn_arg,
1029     aggr_grp_info_new_grp_fn_t new_grp_fn,
1030     aggr_grp_info_new_port_fn_t new_port_fn)
1031 {
1032 	aggr_grp_info_state_t state;
1033 	int rc = 0;
1034 
1035 	rw_enter(&aggr_grp_lock, RW_READER);
1036 
1037 	*ngroups = aggr_grp_cnt;
1038 
1039 	bzero(&state, sizeof (state));
1040 	state.ls_group_key = group_key;
1041 	state.ls_new_grp_fn = new_grp_fn;
1042 	state.ls_new_port_fn = new_port_fn;
1043 	state.ls_fn_arg = fn_arg;
1044 
1045 	mod_hash_walk(aggr_grp_hash, aggr_grp_info_walker, &state);
1046 
1047 	if ((rc = state.ls_rc) == 0 && group_key != 0 &&
1048 	    !state.ls_group_found)
1049 		rc = ENOENT;
1050 
1051 	rw_exit(&aggr_grp_lock);
1052 	return (rc);
1053 }
1054 
1055 /*
1056  * Aggregation group walker.
1057  */
1058 
1059 typedef struct aggr_grp_walker_state_s {
1060 	aggr_grp_walker_fn_t ws_walker_fn;
1061 	void		*ws_arg;
1062 } aggr_grp_walker_state_t;
1063 
1064 void
1065 aggr_grp_walk(aggr_grp_walker_fn_t walker, void *arg)
1066 {
1067 	aggr_grp_walker_state_t state;
1068 
1069 	state.ws_walker_fn = walker;
1070 	state.ws_arg = arg;
1071 
1072 	rw_enter(&aggr_grp_lock, RW_READER);
1073 	mod_hash_walk(aggr_grp_hash, aggr_grp_info_walker, &state);
1074 	rw_exit(&aggr_grp_lock);
1075 }
1076 
1077 static void
1078 aggr_m_resources(void *arg)
1079 {
1080 	aggr_grp_t *grp = arg;
1081 	aggr_port_t *port;
1082 
1083 	/* Call each port's m_resources function */
1084 	for (port = grp->lg_ports; port != NULL; port = port->lp_next)
1085 		mac_resources(port->lp_mh);
1086 }
1087 
1088 /*ARGSUSED*/
1089 static void
1090 aggr_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
1091 {
1092 	miocnak(q, mp, 0, ENOTSUP);
1093 }
1094 
1095 static uint64_t
1096 aggr_m_stat(void *arg, enum mac_stat stat)
1097 {
1098 	aggr_grp_t *grp = arg;
1099 	aggr_port_t *port;
1100 	uint64_t val;
1101 
1102 	rw_enter(&grp->lg_lock, RW_READER);
1103 
1104 	switch (stat) {
1105 	case MAC_STAT_IFSPEED:
1106 		val = grp->lg_ifspeed;
1107 		break;
1108 	case MAC_STAT_LINK_DUPLEX:
1109 		val = grp->lg_link_duplex;
1110 		break;
1111 	default:
1112 		/*
1113 		 * The remaining statistics are counters. They are computed
1114 		 * by aggregating the counters of the members MACs while they
1115 		 * were aggregated, plus the residual counter of the group
1116 		 * itself, which is updated each time a MAC is removed from
1117 		 * the group.
1118 		 */
1119 		val = 0;
1120 		for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1121 			/* actual port statistic */
1122 			val += aggr_port_stat(port, stat);
1123 			/* minus the port stat when it was added */
1124 			val -= port->lp_stat[stat];
1125 			/* plus any residual amount for the group */
1126 			val += grp->lg_stat[stat];
1127 		}
1128 	}
1129 
1130 	rw_exit(&grp->lg_lock);
1131 	return (val);
1132 }
1133 
1134 static int
1135 aggr_m_start(void *arg)
1136 {
1137 	aggr_grp_t *grp = arg;
1138 	aggr_port_t *port;
1139 
1140 	AGGR_LACP_LOCK(grp);
1141 	rw_enter(&grp->lg_lock, RW_WRITER);
1142 
1143 	/*
1144 	 * Attempts to start all configured members of the group.
1145 	 * Group members will be attached when their link-up notification
1146 	 * is received.
1147 	 */
1148 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1149 		rw_enter(&port->lp_lock, RW_WRITER);
1150 		if (aggr_port_start(port) != 0) {
1151 			rw_exit(&port->lp_lock);
1152 			continue;
1153 		}
1154 
1155 		/* set port promiscuous mode */
1156 		if (aggr_port_promisc(port, grp->lg_promisc) != 0)
1157 			aggr_port_stop(port);
1158 		rw_exit(&port->lp_lock);
1159 	}
1160 
1161 	grp->lg_started = B_TRUE;
1162 
1163 	rw_exit(&grp->lg_lock);
1164 	AGGR_LACP_UNLOCK(grp);
1165 
1166 	return (0);
1167 }
1168 
1169 static void
1170 aggr_m_stop(void *arg)
1171 {
1172 	aggr_grp_t *grp = arg;
1173 	aggr_port_t *port;
1174 
1175 	rw_enter(&grp->lg_lock, RW_WRITER);
1176 
1177 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1178 		rw_enter(&port->lp_lock, RW_WRITER);
1179 		aggr_port_stop(port);
1180 		rw_exit(&port->lp_lock);
1181 	}
1182 
1183 	grp->lg_started = B_FALSE;
1184 
1185 	rw_exit(&grp->lg_lock);
1186 }
1187 
1188 static int
1189 aggr_m_promisc(void *arg, boolean_t on)
1190 {
1191 	aggr_grp_t *grp = arg;
1192 	aggr_port_t *port;
1193 
1194 	rw_enter(&grp->lg_lock, RW_WRITER);
1195 	AGGR_GRP_REFHOLD(grp);
1196 
1197 	if (on == grp->lg_promisc)
1198 		goto bail;
1199 
1200 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1201 		rw_enter(&port->lp_lock, RW_WRITER);
1202 		AGGR_PORT_REFHOLD(port);
1203 		if (port->lp_started) {
1204 			if (aggr_port_promisc(port, on) != 0)
1205 				(void) aggr_grp_detach_port(grp, port);
1206 		}
1207 		rw_exit(&port->lp_lock);
1208 		AGGR_PORT_REFRELE(port);
1209 		if (grp->lg_closing)
1210 			break;
1211 	}
1212 
1213 	grp->lg_promisc = on;
1214 
1215 bail:
1216 	rw_exit(&grp->lg_lock);
1217 	AGGR_GRP_REFRELE(grp);
1218 
1219 	return (0);
1220 }
1221 
1222 /*
1223  * Add or remove the multicast addresses that are defined for the group
1224  * to or from the specified port.
1225  * This function is called before stopping a port, before a port
1226  * is detached from a group, and when attaching a port to a group.
1227  */
1228 void
1229 aggr_grp_multicst_port(aggr_port_t *port, boolean_t add)
1230 {
1231 	aggr_grp_t *grp = port->lp_grp;
1232 
1233 	ASSERT(RW_WRITE_HELD(&port->lp_lock));
1234 	ASSERT(RW_WRITE_HELD(&grp->lg_lock) || RW_READ_HELD(&grp->lg_lock));
1235 
1236 	if (!port->lp_started)
1237 		return;
1238 
1239 	mac_multicst_refresh(&grp->lg_mac, aggr_port_multicst, port,
1240 	    add);
1241 }
1242 
1243 static int
1244 aggr_m_multicst(void *arg, boolean_t add, const uint8_t *addrp)
1245 {
1246 	aggr_grp_t *grp = arg;
1247 	aggr_port_t *port = NULL;
1248 	int err = 0, cerr;
1249 
1250 	rw_enter(&grp->lg_lock, RW_WRITER);
1251 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1252 		if (port->lp_state != AGGR_PORT_STATE_ATTACHED)
1253 			continue;
1254 		cerr = aggr_port_multicst(port, add, addrp);
1255 		if (cerr != 0 && err == 0)
1256 			err = cerr;
1257 	}
1258 	rw_exit(&grp->lg_lock);
1259 	return (err);
1260 }
1261 
1262 static int
1263 aggr_m_unicst(void *arg, const uint8_t *macaddr)
1264 {
1265 	aggr_grp_t *grp = arg;
1266 	int rc;
1267 
1268 	AGGR_LACP_LOCK(grp);
1269 	rw_enter(&grp->lg_lock, RW_WRITER);
1270 	rc = aggr_grp_modify(0, grp, AGGR_MODIFY_MAC, 0, B_TRUE, macaddr,
1271 	    0, 0);
1272 	rw_exit(&grp->lg_lock);
1273 	AGGR_LACP_UNLOCK(grp);
1274 
1275 	return (rc);
1276 }
1277 
1278 /*
1279  * Initialize the capabilities that are advertised for the group
1280  * according to the capabilities of the constituent ports.
1281  */
1282 static void
1283 aggr_grp_capab_set(aggr_grp_t *grp)
1284 {
1285 	uint32_t cksum = (uint32_t)-1;
1286 	uint32_t poll = DL_CAPAB_POLL;
1287 	aggr_port_t *port;
1288 	const mac_info_t *port_mi;
1289 
1290 	ASSERT(RW_WRITE_HELD(&grp->lg_lock));
1291 
1292 	ASSERT(grp->lg_ports != NULL);
1293 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1294 		port_mi = mac_info(port->lp_mh);
1295 		cksum &= port_mi->mi_cksum;
1296 		poll &= port_mi->mi_poll;
1297 	}
1298 
1299 	grp->lg_mac.m_info.mi_cksum = cksum;
1300 	grp->lg_mac.m_info.mi_poll = poll;
1301 }
1302 
1303 /*
1304  * Checks whether the capabilities of the ports being added are compatible
1305  * with the current capabilities of the aggregation.
1306  */
1307 static boolean_t
1308 aggr_grp_capab_check(aggr_grp_t *grp, aggr_port_t *port)
1309 {
1310 	const mac_info_t *port_mi = mac_info(port->lp_mh);
1311 	uint32_t grp_cksum = grp->lg_mac.m_info.mi_cksum;
1312 
1313 	ASSERT(grp->lg_ports != NULL);
1314 
1315 	return (((grp_cksum & port_mi->mi_cksum) == grp_cksum) &&
1316 	    (grp->lg_mac.m_info.mi_poll == port_mi->mi_poll));
1317 }
1318