xref: /titanic_50/usr/src/uts/common/io/aggr/aggr_grp.c (revision 382dbd461c555f1c7e304a961fd0d4458d958ca2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * IEEE 802.3ad Link Aggregation -- Link Aggregation Groups.
30  *
31  * An instance of the structure aggr_grp_t is allocated for each
32  * link aggregation group. When created, aggr_grp_t objects are
33  * entered into the aggr_grp_hash hash table maintained by the modhash
34  * module. The hash key is the port number associated with the link
35  * aggregation group. The port number associated with a group corresponds
36  * to the key associated with the group.
37  *
38  * A set of MAC ports is associated with each link aggregation group.
39  */
40 
41 #include <sys/types.h>
42 #include <sys/sysmacros.h>
43 #include <sys/conf.h>
44 #include <sys/cmn_err.h>
45 #include <sys/list.h>
46 #include <sys/ksynch.h>
47 #include <sys/kmem.h>
48 #include <sys/stream.h>
49 #include <sys/modctl.h>
50 #include <sys/ddi.h>
51 #include <sys/sunddi.h>
52 #include <sys/atomic.h>
53 #include <sys/stat.h>
54 #include <sys/modhash.h>
55 #include <sys/strsun.h>
56 #include <sys/dlpi.h>
57 
58 #include <sys/aggr.h>
59 #include <sys/aggr_impl.h>
60 
61 static void aggr_m_info(void *, mac_info_t *);
62 static int aggr_m_start(void *);
63 static void aggr_m_stop(void *);
64 static int aggr_m_promisc(void *, boolean_t);
65 static int aggr_m_multicst(void *, boolean_t, const uint8_t *);
66 static int aggr_m_unicst(void *, const uint8_t *);
67 static uint64_t aggr_m_stat(void *, enum mac_stat);
68 static void aggr_m_resources(void *);
69 static void aggr_m_ioctl(void *, queue_t *, mblk_t *);
70 
71 static aggr_port_t *aggr_grp_port_lookup(aggr_grp_t *, const char *, uint32_t);
72 static int aggr_grp_rem_port(aggr_grp_t *, aggr_port_t *, boolean_t *);
73 static void aggr_stats_op(enum mac_stat, uint64_t *, uint64_t *, boolean_t);
74 static void aggr_grp_capab_set(aggr_grp_t *);
75 static boolean_t aggr_grp_capab_check(aggr_grp_t *, aggr_port_t *);
76 
77 static kmem_cache_t	*aggr_grp_cache;
78 static mod_hash_t	*aggr_grp_hash;
79 static krwlock_t	aggr_grp_lock;
80 static uint_t		aggr_grp_cnt;
81 
82 #define	GRP_HASHSZ		64
83 #define	GRP_HASH_KEY(key)	((mod_hash_key_t)(uintptr_t)key)
84 
85 static uchar_t aggr_zero_mac[] = {0, 0, 0, 0, 0, 0};
86 static uchar_t aggr_brdcst_mac[] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
87 
88 /* used by grp_info_walker */
89 typedef struct aggr_grp_info_state {
90 	uint32_t	ls_group_key;
91 	boolean_t	ls_group_found;
92 	aggr_grp_info_new_grp_fn_t ls_new_grp_fn;
93 	aggr_grp_info_new_port_fn_t ls_new_port_fn;
94 	void		*ls_fn_arg;
95 	int		ls_rc;
96 } aggr_grp_info_state_t;
97 
98 /*ARGSUSED*/
99 static int
100 aggr_grp_constructor(void *buf, void *arg, int kmflag)
101 {
102 	aggr_grp_t *grp = buf;
103 
104 	bzero(grp, sizeof (*grp));
105 	rw_init(&grp->lg_lock, NULL, RW_DRIVER, NULL);
106 	mutex_init(&grp->aggr.gl_lock, NULL, MUTEX_DEFAULT, NULL);
107 
108 	grp->lg_link_state = LINK_STATE_UNKNOWN;
109 
110 	return (0);
111 }
112 
113 /*ARGSUSED*/
114 static void
115 aggr_grp_destructor(void *buf, void *arg)
116 {
117 	aggr_grp_t *grp = buf;
118 
119 	if (grp->lg_tx_ports != NULL) {
120 		kmem_free(grp->lg_tx_ports,
121 		    grp->lg_tx_ports_size * sizeof (aggr_port_t *));
122 	}
123 
124 	mutex_destroy(&grp->aggr.gl_lock);
125 	rw_destroy(&grp->lg_lock);
126 }
127 
128 void
129 aggr_grp_init(void)
130 {
131 	aggr_grp_cache = kmem_cache_create("aggr_grp_cache",
132 	    sizeof (aggr_grp_t), 0, aggr_grp_constructor,
133 	    aggr_grp_destructor, NULL, NULL, NULL, 0);
134 
135 	aggr_grp_hash = mod_hash_create_idhash("aggr_grp_hash",
136 	    GRP_HASHSZ, mod_hash_null_valdtor);
137 	rw_init(&aggr_grp_lock, NULL, RW_DEFAULT, NULL);
138 	aggr_grp_cnt = 0;
139 }
140 
141 void
142 aggr_grp_fini(void)
143 {
144 	rw_destroy(&aggr_grp_lock);
145 	mod_hash_destroy_idhash(aggr_grp_hash);
146 	kmem_cache_destroy(aggr_grp_cache);
147 }
148 
149 uint_t
150 aggr_grp_count(void)
151 {
152 	uint_t	count;
153 
154 	rw_enter(&aggr_grp_lock, RW_READER);
155 	count = aggr_grp_cnt;
156 	rw_exit(&aggr_grp_lock);
157 	return (count);
158 }
159 
160 /*
161  * Attach a port to a link aggregation group.
162  *
163  * A port is attached to a link aggregation group once its speed
164  * and link state have been verified.
165  *
166  * Returns B_TRUE if the group link state or speed has changed. If
167  * it's the case, the caller must notify the MAC layer via a call
168  * to mac_link().
169  */
170 boolean_t
171 aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port)
172 {
173 	boolean_t link_changed = B_FALSE;
174 
175 	ASSERT(AGGR_LACP_LOCK_HELD(grp));
176 	ASSERT(RW_WRITE_HELD(&grp->lg_lock));
177 	ASSERT(RW_WRITE_HELD(&port->lp_lock));
178 
179 	if (port->lp_state == AGGR_PORT_STATE_ATTACHED)
180 		return (B_FALSE);
181 
182 	/*
183 	 * Validate the MAC port link speed and update the group
184 	 * link speed if needed.
185 	 */
186 	if (port->lp_ifspeed == 0 ||
187 	    port->lp_link_state != LINK_STATE_UP ||
188 	    port->lp_link_duplex != LINK_DUPLEX_FULL) {
189 		/*
190 		 * Can't attach a MAC port with unknown link speed,
191 		 * down link, or not in full duplex mode.
192 		 */
193 		return (B_FALSE);
194 	}
195 
196 	if (grp->lg_ifspeed == 0) {
197 		/*
198 		 * The group inherits the speed of the first link being
199 		 * attached.
200 		 */
201 		grp->lg_ifspeed = port->lp_ifspeed;
202 		link_changed = B_TRUE;
203 	} else if (grp->lg_ifspeed != port->lp_ifspeed) {
204 		/*
205 		 * The link speed of the MAC port must be the same as
206 		 * the group link speed, as per 802.3ad. Since it is
207 		 * not, the attach is cancelled.
208 		 */
209 		return (B_FALSE);
210 	}
211 
212 	grp->lg_nattached_ports++;
213 
214 	/*
215 	 * Update the group link state.
216 	 */
217 	if (grp->lg_link_state != LINK_STATE_UP) {
218 		grp->lg_link_state = LINK_STATE_UP;
219 		grp->lg_link_duplex = LINK_DUPLEX_FULL;
220 		link_changed = B_TRUE;
221 	}
222 
223 	aggr_grp_multicst_port(port, B_TRUE);
224 
225 	/*
226 	 * Update port's state.
227 	 */
228 	port->lp_state = AGGR_PORT_STATE_ATTACHED;
229 
230 	/*
231 	 * Set port's receive callback
232 	 */
233 	port->lp_mrh = mac_rx_add(port->lp_mh, aggr_recv_cb, (void *)port);
234 
235 	/*
236 	 * If LACP is OFF, the port can be used to send data as soon
237 	 * as its link is up and verified to be compatible with the
238 	 * aggregation.
239 	 *
240 	 * If LACP is active or passive, notify the LACP subsystem, which
241 	 * will enable sending on the port following the LACP protocol.
242 	 */
243 	if (grp->lg_lacp_mode == AGGR_LACP_OFF)
244 		aggr_send_port_enable(port);
245 	else
246 		aggr_lacp_port_attached(port);
247 
248 	return (link_changed);
249 }
250 
251 boolean_t
252 aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port)
253 {
254 	boolean_t link_changed = B_FALSE;
255 
256 	ASSERT(RW_WRITE_HELD(&grp->lg_lock));
257 	ASSERT(RW_WRITE_HELD(&port->lp_lock));
258 	ASSERT(AGGR_LACP_LOCK_HELD(grp));
259 
260 	/* update state */
261 	if (port->lp_state != AGGR_PORT_STATE_ATTACHED)
262 		return (B_FALSE);
263 
264 	mac_rx_remove(port->lp_mh, port->lp_mrh);
265 	port->lp_state = AGGR_PORT_STATE_STANDBY;
266 
267 	aggr_grp_multicst_port(port, B_FALSE);
268 
269 	if (grp->lg_lacp_mode == AGGR_LACP_OFF)
270 		aggr_send_port_disable(port);
271 	else
272 		aggr_lacp_port_detached(port);
273 
274 	grp->lg_nattached_ports--;
275 	if (grp->lg_nattached_ports == 0) {
276 		/* the last attached MAC port of the group is being detached */
277 		grp->lg_ifspeed = 0;
278 		grp->lg_link_state = LINK_STATE_DOWN;
279 		grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
280 		link_changed = B_TRUE;
281 	}
282 
283 	return (link_changed);
284 }
285 
286 /*
287  * Update the MAC addresses of the constituent ports of the specified
288  * group. This function is invoked:
289  * - after creating a new aggregation group.
290  * - after adding new ports to an aggregation group.
291  * - after removing a port from a group when the MAC address of
292  *   that port was used for the MAC address of the group.
293  * - after the MAC address of a port changed when the MAC address
294  *   of that port was used for the MAC address of the group.
295  */
296 void
297 aggr_grp_update_ports_mac(aggr_grp_t *grp)
298 {
299 	aggr_port_t *cport;
300 
301 	ASSERT(RW_WRITE_HELD(&grp->lg_lock));
302 
303 	for (cport = grp->lg_ports; cport != NULL;
304 	    cport = cport->lp_next) {
305 		rw_enter(&cport->lp_lock, RW_WRITER);
306 		if (aggr_port_unicst(cport, grp->lg_addr) != 0)
307 			(void) aggr_grp_detach_port(grp, cport);
308 		rw_exit(&cport->lp_lock);
309 		if (grp->lg_closing)
310 			break;
311 	}
312 }
313 
314 /*
315  * Invoked when the MAC address of a port has changed. If the port's
316  * MAC address was used for the group MAC address, returns B_TRUE.
317  * In that case, it is the responsibility of the caller to
318  * invoke aggr_grp_update_ports_mac() after releasing the
319  * the port lock, and aggr_grp_notify() after releasing the
320  * group lock.
321  */
322 boolean_t
323 aggr_grp_port_mac_changed(aggr_grp_t *grp, aggr_port_t *port)
324 {
325 	boolean_t grp_addr_changed = B_FALSE;
326 
327 	ASSERT(AGGR_LACP_LOCK_HELD(grp));
328 	ASSERT(RW_WRITE_HELD(&grp->lg_lock));
329 	ASSERT(RW_WRITE_HELD(&port->lp_lock));
330 
331 	if (grp->lg_addr_fixed) {
332 		/*
333 		 * The group is using a fixed MAC address or an automatic
334 		 * MAC address has not been set.
335 		 */
336 		return (B_FALSE);
337 	}
338 
339 	if (grp->lg_mac_addr_port == port) {
340 		/*
341 		 * The MAC address of the port was assigned to the group
342 		 * MAC address. Update the group MAC address.
343 		 */
344 		bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL);
345 		grp_addr_changed = B_TRUE;
346 	} else {
347 		/*
348 		 * Update the actual port MAC address to the MAC address
349 		 * of the group.
350 		 */
351 		if (aggr_port_unicst(port, grp->lg_addr) != 0)
352 			(void) aggr_grp_detach_port(grp, port);
353 	}
354 
355 	return (grp_addr_changed);
356 }
357 
358 /*
359  * Add a port to a link aggregation group.
360  */
361 static int
362 aggr_grp_add_port(aggr_grp_t *grp, const char *name, uint_t portnum,
363     aggr_port_t **pp)
364 {
365 	aggr_port_t *port, **cport;
366 	int err;
367 
368 	ASSERT(AGGR_LACP_LOCK_HELD(grp));
369 	ASSERT(RW_WRITE_HELD(&grp->lg_lock));
370 
371 	/* create new port */
372 	err = aggr_port_create(name, portnum, &port);
373 	if (err != 0)
374 		return (err);
375 
376 	rw_enter(&port->lp_lock, RW_WRITER);
377 
378 	/* add port to list of group constituent ports */
379 	cport = &grp->lg_ports;
380 	while (*cport != NULL)
381 		cport = &((*cport)->lp_next);
382 	*cport = port;
383 
384 	/*
385 	 * Back reference to the group it is member of. A port always
386 	 * holds a reference to its group to ensure that the back
387 	 * reference is always valid.
388 	 */
389 	port->lp_grp = grp;
390 	AGGR_GRP_REFHOLD(grp);
391 	grp->lg_nports++;
392 
393 	aggr_lacp_init_port(port);
394 
395 	rw_exit(&port->lp_lock);
396 
397 	if (pp != NULL)
398 		*pp = port;
399 
400 	return (0);
401 }
402 
403 /*
404  * Add one or more ports to an existing link aggregation group.
405  */
406 int
407 aggr_grp_add_ports(uint32_t key, uint_t nports, laioc_port_t *ports)
408 {
409 	int rc, i, nadded = 0;
410 	aggr_grp_t *grp = NULL;
411 	aggr_port_t *port;
412 
413 	/* get group corresponding to key */
414 	rw_enter(&aggr_grp_lock, RW_READER);
415 	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(key),
416 	    (mod_hash_val_t *)&grp) != 0) {
417 		rw_exit(&aggr_grp_lock);
418 		return (ENOENT);
419 	}
420 	AGGR_GRP_REFHOLD(grp);
421 	rw_exit(&aggr_grp_lock);
422 
423 	AGGR_LACP_LOCK(grp);
424 	rw_enter(&grp->lg_lock, RW_WRITER);
425 
426 	/* add the specified ports to group */
427 	for (i = 0; i < nports; i++) {
428 		/* add port to group */
429 		if ((rc = aggr_grp_add_port(grp, ports[i].lp_devname,
430 		    ports[i].lp_port, &port)) != 0)
431 			goto bail;
432 		ASSERT(port != NULL);
433 		nadded++;
434 
435 		/* check capabilities */
436 		if (!aggr_grp_capab_check(grp, port)) {
437 			rc = ENOTSUP;
438 			goto bail;
439 		}
440 
441 		/* start port if group has already been started */
442 		if (grp->lg_started) {
443 			rw_enter(&port->lp_lock, RW_WRITER);
444 			rc = aggr_port_start(port);
445 			if (rc != 0) {
446 				rw_exit(&port->lp_lock);
447 				goto bail;
448 			}
449 
450 			/* set port promiscuous mode */
451 			rc = aggr_port_promisc(port, grp->lg_promisc);
452 			if (rc != 0) {
453 				rw_exit(&port->lp_lock);
454 				goto bail;
455 			}
456 			rw_exit(&port->lp_lock);
457 		}
458 	}
459 
460 	/* update the MAC address of the constituent ports */
461 	aggr_grp_update_ports_mac(grp);
462 
463 bail:
464 	if (rc != 0) {
465 		/* stop and remove ports that have been added */
466 		for (i = 0; i < nadded && !grp->lg_closing; i++) {
467 			port = aggr_grp_port_lookup(grp, ports[i].lp_devname,
468 			    ports[i].lp_port);
469 			ASSERT(port != NULL);
470 			if (grp->lg_started) {
471 				rw_enter(&port->lp_lock, RW_WRITER);
472 				aggr_port_stop(port);
473 				rw_exit(&port->lp_lock);
474 			}
475 			(void) aggr_grp_rem_port(grp, port, NULL);
476 		}
477 	}
478 
479 	rw_exit(&grp->lg_lock);
480 	AGGR_LACP_UNLOCK(grp);
481 	if (rc == 0 && !grp->lg_closing)
482 		mac_resource_update(&grp->lg_mac);
483 	AGGR_GRP_REFRELE(grp);
484 	return (rc);
485 }
486 
487 /*
488  * Update properties of an existing link aggregation group.
489  */
490 int
491 aggr_grp_modify(uint32_t key, aggr_grp_t *grp_arg, uint8_t update_mask,
492     uint32_t policy, boolean_t mac_fixed, const uchar_t *mac_addr,
493     aggr_lacp_mode_t lacp_mode, aggr_lacp_timer_t lacp_timer)
494 {
495 	int rc = 0;
496 	aggr_grp_t *grp = NULL;
497 	boolean_t mac_addr_changed = B_FALSE;
498 
499 	if (grp_arg == NULL) {
500 		/* get group corresponding to key */
501 		rw_enter(&aggr_grp_lock, RW_READER);
502 		if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(key),
503 		    (mod_hash_val_t *)&grp) != 0) {
504 			rc = ENOENT;
505 			goto bail;
506 		}
507 		AGGR_LACP_LOCK(grp);
508 		rw_enter(&grp->lg_lock, RW_WRITER);
509 	} else {
510 		grp = grp_arg;
511 		ASSERT(AGGR_LACP_LOCK_HELD(grp));
512 		ASSERT(RW_WRITE_HELD(&grp->lg_lock));
513 	}
514 
515 	ASSERT(RW_WRITE_HELD(&grp->lg_lock) || RW_READ_HELD(&grp->lg_lock));
516 	AGGR_GRP_REFHOLD(grp);
517 
518 	/* validate fixed address if specified */
519 	if ((update_mask & AGGR_MODIFY_MAC) && mac_fixed &&
520 	    ((bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) ||
521 	    (mac_addr[0] & 0x01))) {
522 		rc = EINVAL;
523 		goto bail;
524 	}
525 
526 	/* update policy if requested */
527 	if (update_mask & AGGR_MODIFY_POLICY)
528 		aggr_send_update_policy(grp, policy);
529 
530 	/* update unicast MAC address if requested */
531 	if (update_mask & AGGR_MODIFY_MAC) {
532 		if (mac_fixed) {
533 			/* user-supplied MAC address */
534 			grp->lg_mac_addr_port = NULL;
535 			if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) != 0) {
536 				bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
537 				mac_addr_changed = B_TRUE;
538 			}
539 		} else if (grp->lg_addr_fixed) {
540 			/* switch from user-supplied to automatic */
541 			aggr_port_t *port = grp->lg_ports;
542 
543 			rw_enter(&port->lp_lock, RW_WRITER);
544 			bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL);
545 			grp->lg_mac_addr_port = port;
546 			mac_addr_changed = B_TRUE;
547 			rw_exit(&port->lp_lock);
548 		}
549 		grp->lg_addr_fixed = mac_fixed;
550 	}
551 
552 	if (mac_addr_changed)
553 		aggr_grp_update_ports_mac(grp);
554 
555 	if (update_mask & AGGR_MODIFY_LACP_MODE)
556 		aggr_lacp_update_mode(grp, lacp_mode);
557 
558 	if ((update_mask & AGGR_MODIFY_LACP_TIMER) && !grp->lg_closing)
559 		aggr_lacp_update_timer(grp, lacp_timer);
560 
561 bail:
562 	if (grp_arg == NULL) {
563 		if (grp != NULL) {
564 			rw_exit(&grp->lg_lock);
565 			AGGR_LACP_UNLOCK(grp);
566 		}
567 		rw_exit(&aggr_grp_lock);
568 		/* pass new unicast address up to MAC layer */
569 		if (grp != NULL && mac_addr_changed && !grp->lg_closing)
570 			mac_unicst_update(&grp->lg_mac, grp->lg_addr);
571 	}
572 
573 	if (grp != NULL)
574 		AGGR_GRP_REFRELE(grp);
575 
576 	return (rc);
577 }
578 
579 /*
580  * Create a new link aggregation group upon request from administrator.
581  * Returns 0 on success, an errno on failure.
582  */
583 int
584 aggr_grp_create(uint32_t key, uint_t nports, laioc_port_t *ports,
585     uint32_t policy, boolean_t mac_fixed, uchar_t *mac_addr,
586     aggr_lacp_mode_t lacp_mode, aggr_lacp_timer_t lacp_timer)
587 {
588 	aggr_grp_t *grp = NULL;
589 	aggr_port_t *port;
590 	mac_t *mac;
591 	mac_info_t *mip;
592 	int err;
593 	int i;
594 
595 	/* need at least one port */
596 	if (nports == 0)
597 		return (EINVAL);
598 
599 	rw_enter(&aggr_grp_lock, RW_WRITER);
600 
601 	/* does a group with the same key already exist? */
602 	err = mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(key),
603 	    (mod_hash_val_t *)&grp);
604 	if (err == 0) {
605 		rw_exit(&aggr_grp_lock);
606 		return (EEXIST);
607 	}
608 
609 	grp = kmem_cache_alloc(aggr_grp_cache, KM_SLEEP);
610 
611 	AGGR_LACP_LOCK(grp);
612 	rw_enter(&grp->lg_lock, RW_WRITER);
613 
614 	grp->lg_refs = 1;
615 	grp->lg_closing = 0;
616 	grp->lg_key = key;
617 
618 	grp->lg_ifspeed = 0;
619 	grp->lg_link_state = LINK_STATE_UNKNOWN;
620 	grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
621 	grp->lg_started = B_FALSE;
622 	grp->lg_promisc = B_FALSE;
623 	aggr_lacp_init_grp(grp);
624 
625 	/* add MAC ports to group */
626 	grp->lg_ports = NULL;
627 	grp->lg_nports = 0;
628 	grp->lg_nattached_ports = 0;
629 	grp->lg_ntx_ports = 0;
630 
631 	for (i = 0; i < nports; i++) {
632 		err = aggr_grp_add_port(grp, ports[i].lp_devname,
633 		    ports[i].lp_port, NULL);
634 		if (err != 0)
635 			goto bail;
636 	}
637 
638 	/*
639 	 * If no explicit MAC address was specified by the administrator,
640 	 * set it to the MAC address of the first port.
641 	 */
642 	grp->lg_addr_fixed = mac_fixed;
643 	if (grp->lg_addr_fixed) {
644 		/* validate specified address */
645 		if (bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) {
646 			err = EINVAL;
647 			goto bail;
648 		}
649 		bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
650 	} else {
651 		bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
652 		grp->lg_mac_addr_port = grp->lg_ports;
653 	}
654 
655 	/* update the MAC address of the constituent ports */
656 	aggr_grp_update_ports_mac(grp);
657 
658 	/* update outbound load balancing policy */
659 	aggr_send_update_policy(grp, policy);
660 
661 	/* register with the MAC module */
662 	mac = &grp->lg_mac;
663 	bzero(mac, sizeof (*mac));
664 
665 	mac->m_ident = MAC_IDENT;
666 
667 	mac->m_driver = grp;
668 	mac->m_dip = aggr_dip;
669 	mac->m_port = key;
670 
671 	mip = &(mac->m_info);
672 	mip->mi_media = DL_ETHER;
673 	mip->mi_sdu_min = 0;
674 	mip->mi_sdu_max = ETHERMTU;
675 
676 	MAC_STAT_MIB(mip->mi_stat);
677 	MAC_STAT_ETHER(mip->mi_stat);
678 	mip->mi_stat[MAC_STAT_LINK_DUPLEX] = B_TRUE;
679 
680 	mip->mi_addr_length = ETHERADDRL;
681 	bcopy(aggr_brdcst_mac, mip->mi_brdcst_addr, ETHERADDRL);
682 	bcopy(grp->lg_addr, mip->mi_unicst_addr, ETHERADDRL);
683 
684 	mac->m_stat = aggr_m_stat;
685 	mac->m_start = aggr_m_start;
686 	mac->m_stop = aggr_m_stop;
687 	mac->m_promisc = aggr_m_promisc;
688 	mac->m_multicst = aggr_m_multicst;
689 	mac->m_unicst = aggr_m_unicst;
690 	mac->m_tx = aggr_m_tx;
691 	mac->m_resources = aggr_m_resources;
692 	mac->m_ioctl = aggr_m_ioctl;
693 
694 	/* set the initial group capabilities */
695 	aggr_grp_capab_set(grp);
696 
697 	if ((err = mac_register(mac)) != 0)
698 		goto bail;
699 
700 	/* set LACP mode */
701 	aggr_lacp_set_mode(grp, lacp_mode, lacp_timer);
702 
703 	/* add new group to hash table */
704 	err = mod_hash_insert(aggr_grp_hash, GRP_HASH_KEY(key),
705 	    (mod_hash_val_t)grp);
706 	ASSERT(err == 0);
707 	aggr_grp_cnt++;
708 
709 	rw_exit(&grp->lg_lock);
710 	AGGR_LACP_UNLOCK(grp);
711 	rw_exit(&aggr_grp_lock);
712 	return (0);
713 
714 bail:
715 	if (grp != NULL) {
716 		aggr_port_t *cport;
717 
718 		atomic_add_32(&grp->lg_closing, 1);
719 
720 		port = grp->lg_ports;
721 		while (port != NULL) {
722 			cport = port->lp_next;
723 			aggr_port_delete(port);
724 			port = cport;
725 		}
726 
727 		rw_exit(&grp->lg_lock);
728 		AGGR_LACP_UNLOCK(grp);
729 
730 		kmem_cache_free(aggr_grp_cache, grp);
731 	}
732 
733 	rw_exit(&aggr_grp_lock);
734 	return (err);
735 }
736 
737 /*
738  * Return a pointer to the member of a group with specified device name
739  * and port number.
740  */
741 static aggr_port_t *
742 aggr_grp_port_lookup(aggr_grp_t *grp, const char *devname, uint32_t portnum)
743 {
744 	aggr_port_t *port;
745 
746 	ASSERT(RW_WRITE_HELD(&grp->lg_lock) || RW_READ_HELD(&grp->lg_lock));
747 
748 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
749 		if ((strcmp(port->lp_devname, devname) == 0) &&
750 		    (port->lp_port == portnum))
751 			break;
752 	}
753 
754 	return (port);
755 }
756 
757 /*
758  * Stop, detach and remove a port from a link aggregation group.
759  */
760 static int
761 aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port, boolean_t *do_notify)
762 {
763 	aggr_port_t **pport;
764 	boolean_t grp_mac_addr_changed = B_FALSE;
765 	uint64_t val;
766 	uint_t i;
767 
768 	ASSERT(AGGR_LACP_LOCK_HELD(grp));
769 	ASSERT(RW_WRITE_HELD(&grp->lg_lock));
770 	ASSERT(grp->lg_nports > 1);
771 
772 	if (do_notify != NULL)
773 		*do_notify = B_FALSE;
774 
775 	/* unlink port */
776 	for (pport = &grp->lg_ports; *pport != port;
777 	    pport = &(*pport)->lp_next) {
778 		if (*pport == NULL)
779 			return (ENOENT);
780 	}
781 	*pport = port->lp_next;
782 
783 	atomic_add_32(&port->lp_closing, 1);
784 
785 	rw_enter(&port->lp_lock, RW_WRITER);
786 
787 	/*
788 	 * If the MAC address of the port being removed was assigned
789 	 * to the group, update the group MAC address
790 	 * using the MAC address of a different port.
791 	 */
792 	if (!grp->lg_addr_fixed && grp->lg_mac_addr_port == port) {
793 		/*
794 		 * Set the MAC address of the group to the
795 		 * MAC address of its first port.
796 		 */
797 		bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
798 		grp->lg_mac_addr_port = grp->lg_ports;
799 		grp_mac_addr_changed = B_TRUE;
800 	}
801 
802 	(void) aggr_grp_detach_port(grp, port);
803 
804 	/*
805 	 * Add the statistics of the ports while it was aggregated
806 	 * to the group's residual statistics.
807 	 */
808 	for (i = 0; i < MAC_NSTAT && !grp->lg_closing; i++) {
809 		/* avoid stats that are not counters */
810 		if (i == MAC_STAT_IFSPEED || i == MAC_STAT_LINK_DUPLEX)
811 			continue;
812 
813 		/* get current value */
814 		val = aggr_port_stat(port, i);
815 		/* subtract value at the point of aggregation */
816 		val -= port->lp_stat[i];
817 		/* add to the residual stat */
818 		grp->lg_stat[i] += val;
819 	}
820 
821 	grp->lg_nports--;
822 
823 	rw_exit(&port->lp_lock);
824 
825 	aggr_port_delete(port);
826 
827 	/*
828 	 * If the group MAC address has changed, update the MAC address of
829 	 * the remaining consistuent ports according to the new MAC
830 	 * address of the group.
831 	 */
832 	if (grp->lg_closing) {
833 		*do_notify = B_FALSE;
834 	} else {
835 		if (grp_mac_addr_changed)
836 			aggr_grp_update_ports_mac(grp);
837 
838 		if (do_notify != NULL)
839 			*do_notify = grp_mac_addr_changed;
840 	}
841 
842 	return (0);
843 }
844 
845 /*
846  * Remove one or more ports from an existing link aggregation group.
847  */
848 int
849 aggr_grp_rem_ports(uint32_t key, uint_t nports, laioc_port_t *ports)
850 {
851 	int rc = 0, i;
852 	aggr_grp_t *grp = NULL;
853 	aggr_port_t *port;
854 	boolean_t notify = B_FALSE, grp_mac_addr_changed;
855 
856 	/* get group corresponding to key */
857 	rw_enter(&aggr_grp_lock, RW_READER);
858 	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(key),
859 	    (mod_hash_val_t *)&grp) != 0) {
860 		rw_exit(&aggr_grp_lock);
861 		return (ENOENT);
862 	}
863 	AGGR_GRP_REFHOLD(grp);
864 	rw_exit(&aggr_grp_lock);
865 
866 	AGGR_LACP_LOCK(grp);
867 	rw_enter(&grp->lg_lock, RW_WRITER);
868 
869 	/* we need to keep at least one port per group */
870 	if (nports >= grp->lg_nports) {
871 		rc = EINVAL;
872 		goto bail;
873 	}
874 
875 	/* first verify that all the groups are valid */
876 	for (i = 0; i < nports; i++) {
877 		if (aggr_grp_port_lookup(grp, ports[i].lp_devname,
878 		    ports[i].lp_port) == NULL) {
879 			/* port not found */
880 			rc = ENOENT;
881 			goto bail;
882 		}
883 	}
884 
885 	/* remove the specified ports from group */
886 	for (i = 0; i < nports && !grp->lg_closing; i++) {
887 		/* lookup port */
888 		port = aggr_grp_port_lookup(grp, ports[i].lp_devname,
889 		    ports[i].lp_port);
890 		ASSERT(port != NULL);
891 
892 		/* stop port if group has already been started */
893 		if (grp->lg_started) {
894 			rw_enter(&port->lp_lock, RW_WRITER);
895 			aggr_port_stop(port);
896 			rw_exit(&port->lp_lock);
897 		}
898 
899 		/* remove port from group */
900 		rc = aggr_grp_rem_port(grp, port, &grp_mac_addr_changed);
901 		ASSERT(rc == 0);
902 		notify = notify || grp_mac_addr_changed;
903 	}
904 
905 bail:
906 	rw_exit(&grp->lg_lock);
907 	AGGR_LACP_UNLOCK(grp);
908 	if (notify && !grp->lg_closing)
909 		mac_unicst_update(&grp->lg_mac, grp->lg_addr);
910 	if (rc == 0 && !grp->lg_closing)
911 		mac_resource_update(&grp->lg_mac);
912 	AGGR_GRP_REFRELE(grp);
913 
914 	return (rc);
915 }
916 
917 int
918 aggr_grp_delete(uint32_t key)
919 {
920 	aggr_grp_t *grp = NULL;
921 	aggr_port_t *port, *cport;
922 	mod_hash_val_t val;
923 
924 	rw_enter(&aggr_grp_lock, RW_WRITER);
925 
926 	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(key),
927 	    (mod_hash_val_t *)&grp) != 0) {
928 		rw_exit(&aggr_grp_lock);
929 		return (ENOENT);
930 	}
931 
932 	atomic_add_32(&grp->lg_closing, 1);
933 
934 	AGGR_LACP_LOCK(grp);
935 	rw_enter(&grp->lg_lock, RW_WRITER);
936 
937 	/*
938 	 * Unregister from the MAC service module. Since this can
939 	 * fail if a client hasn't closed the MAC port, we gracefully
940 	 * fail the operation.
941 	 */
942 	if (mac_unregister(&grp->lg_mac)) {
943 		rw_exit(&grp->lg_lock);
944 		AGGR_LACP_UNLOCK(grp);
945 		rw_exit(&aggr_grp_lock);
946 		return (EBUSY);
947 	}
948 
949 	/* detach and free MAC ports associated with group */
950 	port = grp->lg_ports;
951 	while (port != NULL) {
952 		cport = port->lp_next;
953 		rw_enter(&port->lp_lock, RW_WRITER);
954 		if (grp->lg_started)
955 			aggr_port_stop(port);
956 		(void) aggr_grp_detach_port(grp, port);
957 		rw_exit(&port->lp_lock);
958 		aggr_port_delete(port);
959 		port = cport;
960 	}
961 
962 	rw_exit(&grp->lg_lock);
963 	AGGR_LACP_UNLOCK(grp);
964 
965 	(void) mod_hash_remove(aggr_grp_hash, GRP_HASH_KEY(key), &val);
966 	ASSERT(grp == (aggr_grp_t *)val);
967 
968 	ASSERT(aggr_grp_cnt > 0);
969 	aggr_grp_cnt--;
970 
971 	rw_exit(&aggr_grp_lock);
972 	AGGR_GRP_REFRELE(grp);
973 
974 	return (0);
975 }
976 
977 void
978 aggr_grp_free(aggr_grp_t *grp)
979 {
980 	ASSERT(grp->lg_refs == 0);
981 	kmem_cache_free(aggr_grp_cache, grp);
982 }
983 
984 /*
985  * Walker invoked when building the list of configured groups and
986  * their ports that must be passed up to user-space.
987  */
988 
989 /*ARGSUSED*/
990 static uint_t
991 aggr_grp_info_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
992 {
993 	aggr_grp_t *grp;
994 	aggr_port_t *port;
995 	aggr_grp_info_state_t *state = arg;
996 
997 	if (state->ls_rc != 0)
998 		return (MH_WALK_TERMINATE);	/* terminate walk */
999 
1000 	grp = (aggr_grp_t *)val;
1001 
1002 	rw_enter(&grp->lg_lock, RW_READER);
1003 
1004 	if (state->ls_group_key != 0 && grp->lg_key != state->ls_group_key)
1005 		goto bail;
1006 
1007 	state->ls_group_found = B_TRUE;
1008 
1009 	state->ls_rc = state->ls_new_grp_fn(state->ls_fn_arg, grp->lg_key,
1010 	    grp->lg_addr, grp->lg_addr_fixed, grp->lg_tx_policy,
1011 	    grp->lg_nports, grp->lg_lacp_mode, grp->aggr.PeriodicTimer);
1012 
1013 	if (state->ls_rc != 0)
1014 		goto bail;
1015 
1016 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1017 
1018 		rw_enter(&port->lp_lock, RW_READER);
1019 
1020 		state->ls_rc = state->ls_new_port_fn(state->ls_fn_arg,
1021 		    port->lp_devname, port->lp_port, port->lp_addr,
1022 		    port->lp_state, &port->lp_lacp.ActorOperPortState);
1023 
1024 		rw_exit(&port->lp_lock);
1025 
1026 		if (state->ls_rc != 0)
1027 			goto bail;
1028 	}
1029 
1030 bail:
1031 	rw_exit(&grp->lg_lock);
1032 	return ((state->ls_rc == 0) ? MH_WALK_CONTINUE : MH_WALK_TERMINATE);
1033 }
1034 
1035 int
1036 aggr_grp_info(uint_t *ngroups, uint32_t group_key, void *fn_arg,
1037     aggr_grp_info_new_grp_fn_t new_grp_fn,
1038     aggr_grp_info_new_port_fn_t new_port_fn)
1039 {
1040 	aggr_grp_info_state_t state;
1041 	int rc = 0;
1042 
1043 	rw_enter(&aggr_grp_lock, RW_READER);
1044 
1045 	*ngroups = aggr_grp_cnt;
1046 
1047 	bzero(&state, sizeof (state));
1048 	state.ls_group_key = group_key;
1049 	state.ls_new_grp_fn = new_grp_fn;
1050 	state.ls_new_port_fn = new_port_fn;
1051 	state.ls_fn_arg = fn_arg;
1052 
1053 	mod_hash_walk(aggr_grp_hash, aggr_grp_info_walker, &state);
1054 
1055 	if ((rc = state.ls_rc) == 0 && group_key != 0 &&
1056 	    !state.ls_group_found)
1057 		rc = ENOENT;
1058 
1059 	rw_exit(&aggr_grp_lock);
1060 	return (rc);
1061 }
1062 
1063 static void
1064 aggr_m_resources(void *arg)
1065 {
1066 	aggr_grp_t *grp = arg;
1067 	aggr_port_t *port;
1068 
1069 	/* Call each port's m_resources function */
1070 	for (port = grp->lg_ports; port != NULL; port = port->lp_next)
1071 		mac_resources(port->lp_mh);
1072 }
1073 
1074 /*ARGSUSED*/
1075 static void
1076 aggr_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
1077 {
1078 	miocnak(q, mp, 0, ENOTSUP);
1079 }
1080 
1081 static uint64_t
1082 aggr_m_stat(void *arg, enum mac_stat stat)
1083 {
1084 	aggr_grp_t *grp = arg;
1085 	aggr_port_t *port;
1086 	uint64_t val;
1087 
1088 	rw_enter(&grp->lg_lock, RW_READER);
1089 
1090 	switch (stat) {
1091 	case MAC_STAT_IFSPEED:
1092 		val = grp->lg_ifspeed;
1093 		break;
1094 	case MAC_STAT_LINK_DUPLEX:
1095 		val = grp->lg_link_duplex;
1096 		break;
1097 	default:
1098 		/*
1099 		 * The remaining statistics are counters. They are computed
1100 		 * by aggregating the counters of the members MACs while they
1101 		 * were aggregated, plus the residual counter of the group
1102 		 * itself, which is updated each time a MAC is removed from
1103 		 * the group.
1104 		 */
1105 		val = 0;
1106 		for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1107 			/* actual port statistic */
1108 			val += aggr_port_stat(port, stat);
1109 			/* minus the port stat when it was added */
1110 			val -= port->lp_stat[stat];
1111 			/* plus any residual amount for the group */
1112 			val += grp->lg_stat[stat];
1113 		}
1114 	}
1115 
1116 	rw_exit(&grp->lg_lock);
1117 	return (val);
1118 }
1119 
1120 static int
1121 aggr_m_start(void *arg)
1122 {
1123 	aggr_grp_t *grp = arg;
1124 	aggr_port_t *port;
1125 
1126 	AGGR_LACP_LOCK(grp);
1127 	rw_enter(&grp->lg_lock, RW_WRITER);
1128 
1129 	/*
1130 	 * Attempts to start all configured members of the group.
1131 	 * Group members will be attached when their link-up notification
1132 	 * is received.
1133 	 */
1134 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1135 		rw_enter(&port->lp_lock, RW_WRITER);
1136 		if (aggr_port_start(port) != 0) {
1137 			rw_exit(&port->lp_lock);
1138 			continue;
1139 		}
1140 
1141 		/* set port promiscuous mode */
1142 		if (aggr_port_promisc(port, grp->lg_promisc) != 0)
1143 			aggr_port_stop(port);
1144 		rw_exit(&port->lp_lock);
1145 	}
1146 
1147 	grp->lg_started = B_TRUE;
1148 
1149 	rw_exit(&grp->lg_lock);
1150 	AGGR_LACP_UNLOCK(grp);
1151 
1152 	return (0);
1153 }
1154 
1155 static void
1156 aggr_m_stop(void *arg)
1157 {
1158 	aggr_grp_t *grp = arg;
1159 	aggr_port_t *port;
1160 
1161 	rw_enter(&grp->lg_lock, RW_WRITER);
1162 
1163 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1164 		rw_enter(&port->lp_lock, RW_WRITER);
1165 		aggr_port_stop(port);
1166 		rw_exit(&port->lp_lock);
1167 	}
1168 
1169 	grp->lg_started = B_FALSE;
1170 
1171 	rw_exit(&grp->lg_lock);
1172 }
1173 
1174 static int
1175 aggr_m_promisc(void *arg, boolean_t on)
1176 {
1177 	aggr_grp_t *grp = arg;
1178 	aggr_port_t *port;
1179 
1180 	AGGR_LACP_LOCK(grp);
1181 	rw_enter(&grp->lg_lock, RW_WRITER);
1182 	AGGR_GRP_REFHOLD(grp);
1183 
1184 	if (on == grp->lg_promisc)
1185 		goto bail;
1186 
1187 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1188 		rw_enter(&port->lp_lock, RW_WRITER);
1189 		AGGR_PORT_REFHOLD(port);
1190 		if (port->lp_started) {
1191 			if (aggr_port_promisc(port, on) != 0)
1192 				(void) aggr_grp_detach_port(grp, port);
1193 		}
1194 		rw_exit(&port->lp_lock);
1195 		AGGR_PORT_REFRELE(port);
1196 		if (grp->lg_closing)
1197 			break;
1198 	}
1199 
1200 	grp->lg_promisc = on;
1201 
1202 bail:
1203 	rw_exit(&grp->lg_lock);
1204 	AGGR_LACP_UNLOCK(grp);
1205 	AGGR_GRP_REFRELE(grp);
1206 
1207 	return (0);
1208 }
1209 
1210 /*
1211  * Add or remove the multicast addresses that are defined for the group
1212  * to or from the specified port.
1213  * This function is called before stopping a port, before a port
1214  * is detached from a group, and when attaching a port to a group.
1215  */
1216 void
1217 aggr_grp_multicst_port(aggr_port_t *port, boolean_t add)
1218 {
1219 	aggr_grp_t *grp = port->lp_grp;
1220 
1221 	ASSERT(RW_WRITE_HELD(&port->lp_lock));
1222 	ASSERT(RW_WRITE_HELD(&grp->lg_lock) || RW_READ_HELD(&grp->lg_lock));
1223 
1224 	if (!port->lp_started)
1225 		return;
1226 
1227 	mac_multicst_refresh(&grp->lg_mac, aggr_port_multicst, port,
1228 	    add);
1229 }
1230 
1231 static int
1232 aggr_m_multicst(void *arg, boolean_t add, const uint8_t *addrp)
1233 {
1234 	aggr_grp_t *grp = arg;
1235 	aggr_port_t *port = NULL;
1236 	int err = 0, cerr;
1237 
1238 	rw_enter(&grp->lg_lock, RW_WRITER);
1239 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1240 		if (port->lp_state != AGGR_PORT_STATE_ATTACHED)
1241 			continue;
1242 		cerr = aggr_port_multicst(port, add, addrp);
1243 		if (cerr != 0 && err == 0)
1244 			err = cerr;
1245 	}
1246 	rw_exit(&grp->lg_lock);
1247 	return (err);
1248 }
1249 
1250 static int
1251 aggr_m_unicst(void *arg, const uint8_t *macaddr)
1252 {
1253 	aggr_grp_t *grp = arg;
1254 	int rc;
1255 
1256 	AGGR_LACP_LOCK(grp);
1257 	rw_enter(&grp->lg_lock, RW_WRITER);
1258 	rc = aggr_grp_modify(0, grp, AGGR_MODIFY_MAC, 0, B_TRUE, macaddr,
1259 	    0, 0);
1260 	rw_exit(&grp->lg_lock);
1261 	AGGR_LACP_UNLOCK(grp);
1262 
1263 	return (rc);
1264 }
1265 
1266 /*
1267  * Initialize the capabilities that are advertised for the group
1268  * according to the capabilities of the constituent ports.
1269  */
1270 static void
1271 aggr_grp_capab_set(aggr_grp_t *grp)
1272 {
1273 	uint32_t cksum = (uint32_t)-1;
1274 	uint32_t poll = DL_CAPAB_POLL;
1275 	aggr_port_t *port;
1276 	const mac_info_t *port_mi;
1277 
1278 	ASSERT(RW_WRITE_HELD(&grp->lg_lock));
1279 
1280 	ASSERT(grp->lg_ports != NULL);
1281 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1282 		port_mi = mac_info(port->lp_mh);
1283 		cksum &= port_mi->mi_cksum;
1284 		poll &= port_mi->mi_poll;
1285 	}
1286 
1287 	grp->lg_mac.m_info.mi_cksum = cksum;
1288 	grp->lg_mac.m_info.mi_poll = poll;
1289 }
1290 
1291 /*
1292  * Checks whether the capabilities of the ports being added are compatible
1293  * with the current capabilities of the aggregation.
1294  */
1295 static boolean_t
1296 aggr_grp_capab_check(aggr_grp_t *grp, aggr_port_t *port)
1297 {
1298 	const mac_info_t *port_mi = mac_info(port->lp_mh);
1299 	uint32_t grp_cksum = grp->lg_mac.m_info.mi_cksum;
1300 
1301 	ASSERT(grp->lg_ports != NULL);
1302 
1303 	return (((grp_cksum & port_mi->mi_cksum) == grp_cksum) &&
1304 	    (grp->lg_mac.m_info.mi_poll == port_mi->mi_poll));
1305 }
1306