xref: /titanic_50/usr/src/uts/common/io/aggr/aggr_grp.c (revision 09b7f21a0999a8ceb9f3e517fff7c39c52405ba2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright 2015 Joyent, Inc.
24  */
25 
/*
 * IEEE 802.3ad Link Aggregation -- Link Aggregation Groups.
 *
 * An instance of the structure aggr_grp_t is allocated for each
 * link aggregation group. When created, aggr_grp_t objects are
 * entered into the aggr_grp_hash hash table maintained by the modhash
 * module. The hash key is the linkid associated with the link
 * aggregation group.
 *
 * A set of MAC ports is associated with each aggregation group.
 *
 * Aggr pseudo TX rings
 * --------------------
 * The underlying ports (NICs) in an aggregation can have TX rings. To
 * enhance aggr's performance, these TX rings are made available to the
 * aggr layer as pseudo TX rings. The concept of pseudo rings is not new:
 * it is already present and implemented on the RX side, where they are
 * called pseudo RX rings. The same concept is extended to the TX side where
 * each TX ring of an underlying port is reflected in aggr as a pseudo
 * TX ring. Thus each pseudo TX ring will map to a specific hardware TX
 * ring. Even in the case of a NIC that does not have a TX ring, a pseudo
 * TX ring is given to the aggregation layer.
 *
 * With this change, the outgoing stack depth looks much better:
 *
 * mac_tx() -> mac_tx_aggr_mode() -> mac_tx_soft_ring_process() ->
 * mac_tx_send() -> aggr_ring_tx() -> <driver>_ring_tx()
 *
 * Two new modes are introduced to mac_tx() to handle aggr pseudo TX rings:
 * SRS_TX_AGGR and SRS_TX_BW_AGGR.
 *
 * In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. This routine
 * invokes an aggr function, aggr_find_tx_ring(), to find a (pseudo) TX
 * ring belonging to a port on which the packet has to be sent.
 * aggr_find_tx_ring() first finds the outgoing port based on L2/L3/L4
 * policy and then uses the fanout_hint passed to it to pick a TX ring from
 * the selected port.
 *
 * In SRS_TX_BW_AGGR mode, mac_tx_bw_mode() function is called where
 * bandwidth limit is applied first on the outgoing packet and the packets
 * allowed to go out would call mac_tx_aggr_mode() to send the packet on a
 * particular TX ring.
 */
69 
70 #include <sys/types.h>
71 #include <sys/sysmacros.h>
72 #include <sys/conf.h>
73 #include <sys/cmn_err.h>
74 #include <sys/disp.h>
75 #include <sys/list.h>
76 #include <sys/ksynch.h>
77 #include <sys/kmem.h>
78 #include <sys/stream.h>
79 #include <sys/modctl.h>
80 #include <sys/ddi.h>
81 #include <sys/sunddi.h>
82 #include <sys/atomic.h>
83 #include <sys/stat.h>
84 #include <sys/modhash.h>
85 #include <sys/id_space.h>
86 #include <sys/strsun.h>
87 #include <sys/cred.h>
88 #include <sys/dlpi.h>
89 #include <sys/zone.h>
90 #include <sys/mac_provider.h>
91 #include <sys/dls.h>
92 #include <sys/vlan.h>
93 #include <sys/aggr.h>
94 #include <sys/aggr_impl.h>
95 
96 static int aggr_m_start(void *);
97 static void aggr_m_stop(void *);
98 static int aggr_m_promisc(void *, boolean_t);
99 static int aggr_m_multicst(void *, boolean_t, const uint8_t *);
100 static int aggr_m_unicst(void *, const uint8_t *);
101 static int aggr_m_stat(void *, uint_t, uint64_t *);
102 static void aggr_m_ioctl(void *, queue_t *, mblk_t *);
103 static boolean_t aggr_m_capab_get(void *, mac_capab_t, void *);
104 static int aggr_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
105     const void *);
106 static void aggr_m_propinfo(void *, const char *, mac_prop_id_t,
107     mac_prop_info_handle_t);
108 
109 static aggr_port_t *aggr_grp_port_lookup(aggr_grp_t *, datalink_id_t);
110 static int aggr_grp_rem_port(aggr_grp_t *, aggr_port_t *, boolean_t *,
111     boolean_t *);
112 
113 static void aggr_grp_capab_set(aggr_grp_t *);
114 static boolean_t aggr_grp_capab_check(aggr_grp_t *, aggr_port_t *);
115 static uint_t aggr_grp_max_sdu(aggr_grp_t *);
116 static uint32_t aggr_grp_max_margin(aggr_grp_t *);
117 static boolean_t aggr_grp_sdu_check(aggr_grp_t *, aggr_port_t *);
118 static boolean_t aggr_grp_margin_check(aggr_grp_t *, aggr_port_t *);
119 
120 static int aggr_add_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
121 static void aggr_rem_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
122 static int aggr_pseudo_disable_intr(mac_intr_handle_t);
123 static int aggr_pseudo_enable_intr(mac_intr_handle_t);
124 static int aggr_pseudo_start_ring(mac_ring_driver_t, uint64_t);
125 static void aggr_pseudo_stop_ring(mac_ring_driver_t);
126 static int aggr_addmac(void *, const uint8_t *);
127 static int aggr_remmac(void *, const uint8_t *);
128 static mblk_t *aggr_rx_poll(void *, int);
129 static void aggr_fill_ring(void *, mac_ring_type_t, const int,
130     const int, mac_ring_info_t *, mac_ring_handle_t);
131 static void aggr_fill_group(void *, mac_ring_type_t, const int,
132     mac_group_info_t *, mac_group_handle_t);
133 
134 static kmem_cache_t	*aggr_grp_cache;
135 static mod_hash_t	*aggr_grp_hash;
136 static krwlock_t	aggr_grp_lock;
137 static uint_t		aggr_grp_cnt;
138 static id_space_t	*key_ids;
139 
140 #define	GRP_HASHSZ		64
141 #define	GRP_HASH_KEY(linkid)	((mod_hash_key_t)(uintptr_t)linkid)
142 #define	AGGR_PORT_NAME_DELIMIT '-'
143 
144 static uchar_t aggr_zero_mac[] = {0, 0, 0, 0, 0, 0};
145 
146 #define	AGGR_M_CALLBACK_FLAGS	\
147 	(MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_PROPINFO)
148 
149 static mac_callbacks_t aggr_m_callbacks = {
150 	AGGR_M_CALLBACK_FLAGS,
151 	aggr_m_stat,
152 	aggr_m_start,
153 	aggr_m_stop,
154 	aggr_m_promisc,
155 	aggr_m_multicst,
156 	NULL,
157 	NULL,
158 	NULL,
159 	aggr_m_ioctl,
160 	aggr_m_capab_get,
161 	NULL,
162 	NULL,
163 	aggr_m_setprop,
164 	NULL,
165 	aggr_m_propinfo
166 };
167 
168 /*ARGSUSED*/
169 static int
aggr_grp_constructor(void * buf,void * arg,int kmflag)170 aggr_grp_constructor(void *buf, void *arg, int kmflag)
171 {
172 	aggr_grp_t *grp = buf;
173 
174 	bzero(grp, sizeof (*grp));
175 	mutex_init(&grp->lg_lacp_lock, NULL, MUTEX_DEFAULT, NULL);
176 	cv_init(&grp->lg_lacp_cv, NULL, CV_DEFAULT, NULL);
177 	rw_init(&grp->lg_tx_lock, NULL, RW_DRIVER, NULL);
178 	mutex_init(&grp->lg_port_lock, NULL, MUTEX_DEFAULT, NULL);
179 	cv_init(&grp->lg_port_cv, NULL, CV_DEFAULT, NULL);
180 	mutex_init(&grp->lg_tx_flowctl_lock, NULL, MUTEX_DEFAULT, NULL);
181 	cv_init(&grp->lg_tx_flowctl_cv, NULL, CV_DEFAULT, NULL);
182 	grp->lg_link_state = LINK_STATE_UNKNOWN;
183 	return (0);
184 }
185 
186 /*ARGSUSED*/
187 static void
aggr_grp_destructor(void * buf,void * arg)188 aggr_grp_destructor(void *buf, void *arg)
189 {
190 	aggr_grp_t *grp = buf;
191 
192 	if (grp->lg_tx_ports != NULL) {
193 		kmem_free(grp->lg_tx_ports,
194 		    grp->lg_tx_ports_size * sizeof (aggr_port_t *));
195 	}
196 
197 	mutex_destroy(&grp->lg_lacp_lock);
198 	cv_destroy(&grp->lg_lacp_cv);
199 	mutex_destroy(&grp->lg_port_lock);
200 	cv_destroy(&grp->lg_port_cv);
201 	rw_destroy(&grp->lg_tx_lock);
202 	mutex_destroy(&grp->lg_tx_flowctl_lock);
203 	cv_destroy(&grp->lg_tx_flowctl_cv);
204 }
205 
206 void
aggr_grp_init(void)207 aggr_grp_init(void)
208 {
209 	aggr_grp_cache = kmem_cache_create("aggr_grp_cache",
210 	    sizeof (aggr_grp_t), 0, aggr_grp_constructor,
211 	    aggr_grp_destructor, NULL, NULL, NULL, 0);
212 
213 	aggr_grp_hash = mod_hash_create_idhash("aggr_grp_hash",
214 	    GRP_HASHSZ, mod_hash_null_valdtor);
215 	rw_init(&aggr_grp_lock, NULL, RW_DEFAULT, NULL);
216 	aggr_grp_cnt = 0;
217 
218 	/*
219 	 * Allocate an id space to manage key values (when key is not
220 	 * specified). The range of the id space will be from
221 	 * (AGGR_MAX_KEY + 1) to UINT16_MAX, because the LACP protocol
222 	 * uses a 16-bit key.
223 	 */
224 	key_ids = id_space_create("aggr_key_ids", AGGR_MAX_KEY + 1, UINT16_MAX);
225 	ASSERT(key_ids != NULL);
226 }
227 
228 void
aggr_grp_fini(void)229 aggr_grp_fini(void)
230 {
231 	id_space_destroy(key_ids);
232 	rw_destroy(&aggr_grp_lock);
233 	mod_hash_destroy_idhash(aggr_grp_hash);
234 	kmem_cache_destroy(aggr_grp_cache);
235 }
236 
237 uint_t
aggr_grp_count(void)238 aggr_grp_count(void)
239 {
240 	uint_t	count;
241 
242 	rw_enter(&aggr_grp_lock, RW_READER);
243 	count = aggr_grp_cnt;
244 	rw_exit(&aggr_grp_lock);
245 	return (count);
246 }
247 
248 /*
249  * Since both aggr_port_notify_cb() and aggr_port_timer_thread() functions
250  * requires the mac perimeter, this function holds a reference of the aggr
251  * and aggr won't call mac_unregister() until this reference drops to 0.
252  */
253 void
aggr_grp_port_hold(aggr_port_t * port)254 aggr_grp_port_hold(aggr_port_t *port)
255 {
256 	aggr_grp_t	*grp = port->lp_grp;
257 
258 	AGGR_PORT_REFHOLD(port);
259 	mutex_enter(&grp->lg_port_lock);
260 	grp->lg_port_ref++;
261 	mutex_exit(&grp->lg_port_lock);
262 }
263 
264 /*
265  * Release the reference of the grp and inform aggr_grp_delete() calling
266  * mac_unregister() is now safe.
267  */
268 void
aggr_grp_port_rele(aggr_port_t * port)269 aggr_grp_port_rele(aggr_port_t *port)
270 {
271 	aggr_grp_t	*grp = port->lp_grp;
272 
273 	mutex_enter(&grp->lg_port_lock);
274 	if (--grp->lg_port_ref == 0)
275 		cv_signal(&grp->lg_port_cv);
276 	mutex_exit(&grp->lg_port_lock);
277 	AGGR_PORT_REFRELE(port);
278 }
279 
280 /*
281  * Wait for the port's lacp timer thread and the port's notification callback
282  * to exit.
283  */
284 void
aggr_grp_port_wait(aggr_grp_t * grp)285 aggr_grp_port_wait(aggr_grp_t *grp)
286 {
287 	mutex_enter(&grp->lg_port_lock);
288 	if (grp->lg_port_ref != 0)
289 		cv_wait(&grp->lg_port_cv, &grp->lg_port_lock);
290 	mutex_exit(&grp->lg_port_lock);
291 }
292 
293 /*
294  * Attach a port to a link aggregation group.
295  *
296  * A port is attached to a link aggregation group once its speed
297  * and link state have been verified.
298  *
299  * Returns B_TRUE if the group link state or speed has changed. If
300  * it's the case, the caller must notify the MAC layer via a call
301  * to mac_link().
302  */
303 boolean_t
aggr_grp_attach_port(aggr_grp_t * grp,aggr_port_t * port)304 aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port)
305 {
306 	boolean_t link_state_changed = B_FALSE;
307 
308 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
309 	ASSERT(MAC_PERIM_HELD(port->lp_mh));
310 
311 	if (port->lp_state == AGGR_PORT_STATE_ATTACHED)
312 		return (B_FALSE);
313 
314 	/*
315 	 * Validate the MAC port link speed and update the group
316 	 * link speed if needed.
317 	 */
318 	if (port->lp_ifspeed == 0 ||
319 	    port->lp_link_state != LINK_STATE_UP ||
320 	    port->lp_link_duplex != LINK_DUPLEX_FULL) {
321 		/*
322 		 * Can't attach a MAC port with unknown link speed,
323 		 * down link, or not in full duplex mode.
324 		 */
325 		return (B_FALSE);
326 	}
327 
328 	if (grp->lg_ifspeed == 0) {
329 		/*
330 		 * The group inherits the speed of the first link being
331 		 * attached.
332 		 */
333 		grp->lg_ifspeed = port->lp_ifspeed;
334 		link_state_changed = B_TRUE;
335 	} else if (grp->lg_ifspeed != port->lp_ifspeed) {
336 		/*
337 		 * The link speed of the MAC port must be the same as
338 		 * the group link speed, as per 802.3ad. Since it is
339 		 * not, the attach is cancelled.
340 		 */
341 		return (B_FALSE);
342 	}
343 
344 	grp->lg_nattached_ports++;
345 
346 	/*
347 	 * Update the group link state.
348 	 */
349 	if (grp->lg_link_state != LINK_STATE_UP) {
350 		grp->lg_link_state = LINK_STATE_UP;
351 		grp->lg_link_duplex = LINK_DUPLEX_FULL;
352 		link_state_changed = B_TRUE;
353 	}
354 
355 	/*
356 	 * Update port's state.
357 	 */
358 	port->lp_state = AGGR_PORT_STATE_ATTACHED;
359 
360 	aggr_grp_multicst_port(port, B_TRUE);
361 
362 	/*
363 	 * Set port's receive callback
364 	 */
365 	mac_rx_set(port->lp_mch, aggr_recv_cb, port);
366 
367 	/*
368 	 * If LACP is OFF, the port can be used to send data as soon
369 	 * as its link is up and verified to be compatible with the
370 	 * aggregation.
371 	 *
372 	 * If LACP is active or passive, notify the LACP subsystem, which
373 	 * will enable sending on the port following the LACP protocol.
374 	 */
375 	if (grp->lg_lacp_mode == AGGR_LACP_OFF)
376 		aggr_send_port_enable(port);
377 	else
378 		aggr_lacp_port_attached(port);
379 
380 	return (link_state_changed);
381 }
382 
383 boolean_t
aggr_grp_detach_port(aggr_grp_t * grp,aggr_port_t * port)384 aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port)
385 {
386 	boolean_t link_state_changed = B_FALSE;
387 
388 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
389 	ASSERT(MAC_PERIM_HELD(port->lp_mh));
390 
391 	/* update state */
392 	if (port->lp_state != AGGR_PORT_STATE_ATTACHED)
393 		return (B_FALSE);
394 
395 	mac_rx_clear(port->lp_mch);
396 
397 	aggr_grp_multicst_port(port, B_FALSE);
398 
399 	if (grp->lg_lacp_mode == AGGR_LACP_OFF)
400 		aggr_send_port_disable(port);
401 	else
402 		aggr_lacp_port_detached(port);
403 
404 	port->lp_state = AGGR_PORT_STATE_STANDBY;
405 
406 	grp->lg_nattached_ports--;
407 	if (grp->lg_nattached_ports == 0) {
408 		/* the last attached MAC port of the group is being detached */
409 		grp->lg_ifspeed = 0;
410 		grp->lg_link_state = LINK_STATE_DOWN;
411 		grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
412 		link_state_changed = B_TRUE;
413 	}
414 
415 	return (link_state_changed);
416 }
417 
418 /*
419  * Update the MAC addresses of the constituent ports of the specified
420  * group. This function is invoked:
421  * - after creating a new aggregation group.
422  * - after adding new ports to an aggregation group.
423  * - after removing a port from a group when the MAC address of
424  *   that port was used for the MAC address of the group.
425  * - after the MAC address of a port changed when the MAC address
426  *   of that port was used for the MAC address of the group.
427  *
428  * Return true if the link state of the aggregation changed, for example
429  * as a result of a failure changing the MAC address of one of the
430  * constituent ports.
431  */
432 boolean_t
aggr_grp_update_ports_mac(aggr_grp_t * grp)433 aggr_grp_update_ports_mac(aggr_grp_t *grp)
434 {
435 	aggr_port_t *cport;
436 	boolean_t link_state_changed = B_FALSE;
437 	mac_perim_handle_t mph;
438 
439 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
440 
441 	for (cport = grp->lg_ports; cport != NULL;
442 	    cport = cport->lp_next) {
443 		mac_perim_enter_by_mh(cport->lp_mh, &mph);
444 		if (aggr_port_unicst(cport) != 0) {
445 			if (aggr_grp_detach_port(grp, cport))
446 				link_state_changed = B_TRUE;
447 		} else {
448 			/*
449 			 * If a port was detached because of a previous
450 			 * failure changing the MAC address, the port is
451 			 * reattached when it successfully changes the MAC
452 			 * address now, and this might cause the link state
453 			 * of the aggregation to change.
454 			 */
455 			if (aggr_grp_attach_port(grp, cport))
456 				link_state_changed = B_TRUE;
457 		}
458 		mac_perim_exit(mph);
459 	}
460 	return (link_state_changed);
461 }
462 
463 /*
464  * Invoked when the MAC address of a port has changed. If the port's
465  * MAC address was used for the group MAC address, set mac_addr_changedp
466  * to B_TRUE to indicate to the caller that it should send a MAC_NOTE_UNICST
467  * notification. If the link state changes due to detach/attach of
468  * the constituent port, set link_state_changedp to B_TRUE to indicate
469  * to the caller that it should send a MAC_NOTE_LINK notification. In both
470  * cases, it is the responsibility of the caller to invoke notification
471  * functions after releasing the the port lock.
472  */
473 void
aggr_grp_port_mac_changed(aggr_grp_t * grp,aggr_port_t * port,boolean_t * mac_addr_changedp,boolean_t * link_state_changedp)474 aggr_grp_port_mac_changed(aggr_grp_t *grp, aggr_port_t *port,
475     boolean_t *mac_addr_changedp, boolean_t *link_state_changedp)
476 {
477 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
478 	ASSERT(MAC_PERIM_HELD(port->lp_mh));
479 	ASSERT(mac_addr_changedp != NULL);
480 	ASSERT(link_state_changedp != NULL);
481 
482 	*mac_addr_changedp = B_FALSE;
483 	*link_state_changedp = B_FALSE;
484 
485 	if (grp->lg_addr_fixed) {
486 		/*
487 		 * The group is using a fixed MAC address or an automatic
488 		 * MAC address has not been set.
489 		 */
490 		return;
491 	}
492 
493 	if (grp->lg_mac_addr_port == port) {
494 		/*
495 		 * The MAC address of the port was assigned to the group
496 		 * MAC address. Update the group MAC address.
497 		 */
498 		bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL);
499 		*mac_addr_changedp = B_TRUE;
500 	} else {
501 		/*
502 		 * Update the actual port MAC address to the MAC address
503 		 * of the group.
504 		 */
505 		if (aggr_port_unicst(port) != 0) {
506 			*link_state_changedp = aggr_grp_detach_port(grp, port);
507 		} else {
508 			/*
509 			 * If a port was detached because of a previous
510 			 * failure changing the MAC address, the port is
511 			 * reattached when it successfully changes the MAC
512 			 * address now, and this might cause the link state
513 			 * of the aggregation to change.
514 			 */
515 			*link_state_changedp = aggr_grp_attach_port(grp, port);
516 		}
517 	}
518 }
519 
520 /*
521  * Add a port to a link aggregation group.
522  */
523 static int
aggr_grp_add_port(aggr_grp_t * grp,datalink_id_t port_linkid,boolean_t force,aggr_port_t ** pp)524 aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t port_linkid, boolean_t force,
525     aggr_port_t **pp)
526 {
527 	aggr_port_t *port, **cport;
528 	mac_perim_handle_t mph;
529 	zoneid_t port_zoneid = ALL_ZONES;
530 	int err;
531 
532 	/* The port must be int the same zone as the aggregation. */
533 	if (zone_check_datalink(&port_zoneid, port_linkid) != 0)
534 		port_zoneid = GLOBAL_ZONEID;
535 	if (grp->lg_zoneid != port_zoneid)
536 		return (EBUSY);
537 
538 	/*
539 	 * lg_mh could be NULL when the function is called during the creation
540 	 * of the aggregation.
541 	 */
542 	ASSERT(grp->lg_mh == NULL || MAC_PERIM_HELD(grp->lg_mh));
543 
544 	/* create new port */
545 	err = aggr_port_create(grp, port_linkid, force, &port);
546 	if (err != 0)
547 		return (err);
548 
549 	mac_perim_enter_by_mh(port->lp_mh, &mph);
550 
551 	/* add port to list of group constituent ports */
552 	cport = &grp->lg_ports;
553 	while (*cport != NULL)
554 		cport = &((*cport)->lp_next);
555 	*cport = port;
556 
557 	/*
558 	 * Back reference to the group it is member of. A port always
559 	 * holds a reference to its group to ensure that the back
560 	 * reference is always valid.
561 	 */
562 	port->lp_grp = grp;
563 	AGGR_GRP_REFHOLD(grp);
564 	grp->lg_nports++;
565 
566 	aggr_lacp_init_port(port);
567 	mac_perim_exit(mph);
568 
569 	if (pp != NULL)
570 		*pp = port;
571 
572 	return (0);
573 }
574 
575 /*
576  * This is called in response to either our LACP state machine or a MAC
577  * notification that the link has gone down via aggr_send_port_disable(). At
578  * this point, we may need to update our default ring. To that end, we go
579  * through the set of ports (underlying datalinks in an aggregation) that are
580  * currently enabled to transmit data. If all our links have been disabled for
581  * transmit, then we don't do anything.
582  *
583  * Note, because we only have a single TX group, we don't have to worry about
584  * the rings moving between groups and the chance that mac will reassign it
585  * unless someone removes a port, at which point, we play it safe and call this
586  * again.
587  */
588 void
aggr_grp_update_default(aggr_grp_t * grp)589 aggr_grp_update_default(aggr_grp_t *grp)
590 {
591 	aggr_port_t *port;
592 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
593 
594 	rw_enter(&grp->lg_tx_lock, RW_WRITER);
595 
596 	if (grp->lg_ntx_ports == 0) {
597 		rw_exit(&grp->lg_tx_lock);
598 		return;
599 	}
600 
601 	port = grp->lg_tx_ports[0];
602 	ASSERT(port->lp_tx_ring_cnt > 0);
603 	mac_hwring_set_default(grp->lg_mh, port->lp_pseudo_tx_rings[0]);
604 	rw_exit(&grp->lg_tx_lock);
605 }
606 
607 /*
608  * Add a pseudo RX ring for the given HW ring handle.
609  */
610 static int
aggr_add_pseudo_rx_ring(aggr_port_t * port,aggr_pseudo_rx_group_t * rx_grp,mac_ring_handle_t hw_rh)611 aggr_add_pseudo_rx_ring(aggr_port_t *port,
612     aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
613 {
614 	aggr_pseudo_rx_ring_t	*ring;
615 	int			err;
616 	int			j;
617 
618 	for (j = 0; j < MAX_RINGS_PER_GROUP; j++) {
619 		ring = rx_grp->arg_rings + j;
620 		if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE))
621 			break;
622 	}
623 
624 	/*
625 	 * No slot for this new RX ring.
626 	 */
627 	if (j == MAX_RINGS_PER_GROUP)
628 		return (EIO);
629 
630 	ring->arr_flags |= MAC_PSEUDO_RING_INUSE;
631 	ring->arr_hw_rh = hw_rh;
632 	ring->arr_port = port;
633 	rx_grp->arg_ring_cnt++;
634 
635 	/*
636 	 * The group is already registered, dynamically add a new ring to the
637 	 * mac group.
638 	 */
639 	if ((err = mac_group_add_ring(rx_grp->arg_gh, j)) != 0) {
640 		ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
641 		ring->arr_hw_rh = NULL;
642 		ring->arr_port = NULL;
643 		rx_grp->arg_ring_cnt--;
644 	} else {
645 		mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring,
646 		    mac_find_ring(rx_grp->arg_gh, j));
647 	}
648 	return (err);
649 }
650 
651 /*
652  * Remove the pseudo RX ring of the given HW ring handle.
653  */
654 static void
aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t * rx_grp,mac_ring_handle_t hw_rh)655 aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
656 {
657 	aggr_pseudo_rx_ring_t	*ring;
658 	int			j;
659 
660 	for (j = 0; j < MAX_RINGS_PER_GROUP; j++) {
661 		ring = rx_grp->arg_rings + j;
662 		if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE) ||
663 		    ring->arr_hw_rh != hw_rh) {
664 			continue;
665 		}
666 
667 		mac_group_rem_ring(rx_grp->arg_gh, ring->arr_rh);
668 
669 		ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
670 		ring->arr_hw_rh = NULL;
671 		ring->arr_port = NULL;
672 		rx_grp->arg_ring_cnt--;
673 		mac_hwring_teardown(hw_rh);
674 		break;
675 	}
676 }
677 
678 /*
679  * This function is called to create pseudo rings over the hardware rings of
680  * the underlying device. Note that there is a 1:1 mapping between the pseudo
681  * RX rings of the aggr and the hardware rings of the underlying port.
682  */
683 static int
aggr_add_pseudo_rx_group(aggr_port_t * port,aggr_pseudo_rx_group_t * rx_grp)684 aggr_add_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
685 {
686 	aggr_grp_t		*grp = port->lp_grp;
687 	mac_ring_handle_t	hw_rh[MAX_RINGS_PER_GROUP];
688 	aggr_unicst_addr_t	*addr, *a;
689 	mac_perim_handle_t	pmph;
690 	int			hw_rh_cnt, i = 0, j;
691 	int			err = 0;
692 
693 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
694 	mac_perim_enter_by_mh(port->lp_mh, &pmph);
695 
696 	/*
697 	 * This function must be called after the aggr registers its mac
698 	 * and its RX group has been initialized.
699 	 */
700 	ASSERT(rx_grp->arg_gh != NULL);
701 
702 	/*
703 	 * Get the list the the underlying HW rings.
704 	 */
705 	hw_rh_cnt = mac_hwrings_get(port->lp_mch,
706 	    &port->lp_hwgh, hw_rh, MAC_RING_TYPE_RX);
707 
708 	if (port->lp_hwgh != NULL) {
709 		/*
710 		 * Quiesce the HW ring and the mac srs on the ring. Note
711 		 * that the HW ring will be restarted when the pseudo ring
712 		 * is started. At that time all the packets will be
713 		 * directly passed up to the pseudo RX ring and handled
714 		 * by mac srs created over the pseudo RX ring.
715 		 */
716 		mac_rx_client_quiesce(port->lp_mch);
717 		mac_srs_perm_quiesce(port->lp_mch, B_TRUE);
718 	}
719 
720 	/*
721 	 * Add all the unicast addresses to the newly added port.
722 	 */
723 	for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) {
724 		if ((err = aggr_port_addmac(port, addr->aua_addr)) != 0)
725 			break;
726 	}
727 
728 	for (i = 0; err == 0 && i < hw_rh_cnt; i++)
729 		err = aggr_add_pseudo_rx_ring(port, rx_grp, hw_rh[i]);
730 
731 	if (err != 0) {
732 		for (j = 0; j < i; j++)
733 			aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]);
734 
735 		for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next)
736 			aggr_port_remmac(port, a->aua_addr);
737 
738 		if (port->lp_hwgh != NULL) {
739 			mac_srs_perm_quiesce(port->lp_mch, B_FALSE);
740 			mac_rx_client_restart(port->lp_mch);
741 			port->lp_hwgh = NULL;
742 		}
743 	} else {
744 		port->lp_rx_grp_added = B_TRUE;
745 	}
746 done:
747 	mac_perim_exit(pmph);
748 	return (err);
749 }
750 
751 /*
752  * This function is called by aggr to remove pseudo RX rings over the
753  * HW rings of the underlying port.
754  */
755 static void
aggr_rem_pseudo_rx_group(aggr_port_t * port,aggr_pseudo_rx_group_t * rx_grp)756 aggr_rem_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
757 {
758 	aggr_grp_t		*grp = port->lp_grp;
759 	mac_ring_handle_t	hw_rh[MAX_RINGS_PER_GROUP];
760 	aggr_unicst_addr_t	*addr;
761 	mac_group_handle_t	hwgh;
762 	mac_perim_handle_t	pmph;
763 	int			hw_rh_cnt, i;
764 
765 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
766 	mac_perim_enter_by_mh(port->lp_mh, &pmph);
767 
768 	if (!port->lp_rx_grp_added)
769 		goto done;
770 
771 	ASSERT(rx_grp->arg_gh != NULL);
772 	hw_rh_cnt = mac_hwrings_get(port->lp_mch,
773 	    &hwgh, hw_rh, MAC_RING_TYPE_RX);
774 
775 	/*
776 	 * If hw_rh_cnt is 0, it means that the underlying port does not
777 	 * support RX rings. Directly return in this case.
778 	 */
779 	for (i = 0; i < hw_rh_cnt; i++)
780 		aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[i]);
781 
782 	for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next)
783 		aggr_port_remmac(port, addr->aua_addr);
784 
785 	if (port->lp_hwgh != NULL) {
786 		port->lp_hwgh = NULL;
787 
788 		/*
789 		 * First clear the permanent-quiesced flag of the RX srs then
790 		 * restart the HW ring and the mac srs on the ring. Note that
791 		 * the HW ring and associated SRS will soon been removed when
792 		 * the port is removed from the aggr.
793 		 */
794 		mac_srs_perm_quiesce(port->lp_mch, B_FALSE);
795 		mac_rx_client_restart(port->lp_mch);
796 	}
797 
798 	port->lp_rx_grp_added = B_FALSE;
799 done:
800 	mac_perim_exit(pmph);
801 }
802 
803 /*
804  * Add a pseudo TX ring for the given HW ring handle.
805  */
806 static int
aggr_add_pseudo_tx_ring(aggr_port_t * port,aggr_pseudo_tx_group_t * tx_grp,mac_ring_handle_t hw_rh,mac_ring_handle_t * pseudo_rh)807 aggr_add_pseudo_tx_ring(aggr_port_t *port,
808     aggr_pseudo_tx_group_t *tx_grp, mac_ring_handle_t hw_rh,
809     mac_ring_handle_t *pseudo_rh)
810 {
811 	aggr_pseudo_tx_ring_t	*ring;
812 	int			err;
813 	int			i;
814 
815 	ASSERT(MAC_PERIM_HELD(port->lp_mh));
816 	for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
817 		ring = tx_grp->atg_rings + i;
818 		if (!(ring->atr_flags & MAC_PSEUDO_RING_INUSE))
819 			break;
820 	}
821 	/*
822 	 * No slot for this new TX ring.
823 	 */
824 	if (i == MAX_RINGS_PER_GROUP)
825 		return (EIO);
826 	/*
827 	 * The following 4 statements needs to be done before
828 	 * calling mac_group_add_ring(). Otherwise it will
829 	 * result in an assertion failure in mac_init_ring().
830 	 */
831 	ring->atr_flags |= MAC_PSEUDO_RING_INUSE;
832 	ring->atr_hw_rh = hw_rh;
833 	ring->atr_port = port;
834 	tx_grp->atg_ring_cnt++;
835 
836 	/*
837 	 * The TX side has no concept of ring groups unlike RX groups.
838 	 * There is just a single group which stores all the TX rings.
839 	 * This group will be used to store aggr's pseudo TX rings.
840 	 */
841 	if ((err = mac_group_add_ring(tx_grp->atg_gh, i)) != 0) {
842 		ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE;
843 		ring->atr_hw_rh = NULL;
844 		ring->atr_port = NULL;
845 		tx_grp->atg_ring_cnt--;
846 	} else {
847 		*pseudo_rh = mac_find_ring(tx_grp->atg_gh, i);
848 		if (hw_rh != NULL) {
849 			mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring,
850 			    mac_find_ring(tx_grp->atg_gh, i));
851 		}
852 	}
853 
854 	return (err);
855 }
856 
857 /*
858  * Remove the pseudo TX ring of the given HW ring handle.
859  */
860 static void
aggr_rem_pseudo_tx_ring(aggr_pseudo_tx_group_t * tx_grp,mac_ring_handle_t pseudo_hw_rh)861 aggr_rem_pseudo_tx_ring(aggr_pseudo_tx_group_t *tx_grp,
862     mac_ring_handle_t pseudo_hw_rh)
863 {
864 	aggr_pseudo_tx_ring_t	*ring;
865 	int			i;
866 
867 	for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
868 		ring = tx_grp->atg_rings + i;
869 		if (ring->atr_rh != pseudo_hw_rh)
870 			continue;
871 
872 		ASSERT(ring->atr_flags & MAC_PSEUDO_RING_INUSE);
873 		mac_group_rem_ring(tx_grp->atg_gh, pseudo_hw_rh);
874 		ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE;
875 		mac_hwring_teardown(ring->atr_hw_rh);
876 		ring->atr_hw_rh = NULL;
877 		ring->atr_port = NULL;
878 		tx_grp->atg_ring_cnt--;
879 		break;
880 	}
881 }
882 
883 /*
884  * This function is called to create pseudo rings over hardware rings of
885  * the underlying device. There is a 1:1 mapping between the pseudo TX
886  * rings of the aggr and the hardware rings of the underlying port.
887  */
888 static int
aggr_add_pseudo_tx_group(aggr_port_t * port,aggr_pseudo_tx_group_t * tx_grp)889 aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp)
890 {
891 	aggr_grp_t		*grp = port->lp_grp;
892 	mac_ring_handle_t	hw_rh[MAX_RINGS_PER_GROUP], pseudo_rh;
893 	mac_perim_handle_t	pmph;
894 	int			hw_rh_cnt, i = 0, j;
895 	int			err = 0;
896 
897 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
898 	mac_perim_enter_by_mh(port->lp_mh, &pmph);
899 
900 	/*
901 	 * Get the list the the underlying HW rings.
902 	 */
903 	hw_rh_cnt = mac_hwrings_get(port->lp_mch,
904 	    NULL, hw_rh, MAC_RING_TYPE_TX);
905 
906 	/*
907 	 * Even if the underlying NIC does not have TX rings, we
908 	 * still make a psuedo TX ring for that NIC with NULL as
909 	 * the ring handle.
910 	 */
911 	if (hw_rh_cnt == 0)
912 		port->lp_tx_ring_cnt = 1;
913 	else
914 		port->lp_tx_ring_cnt = hw_rh_cnt;
915 
916 	port->lp_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
917 	    port->lp_tx_ring_cnt), KM_SLEEP);
918 	port->lp_pseudo_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
919 	    port->lp_tx_ring_cnt), KM_SLEEP);
920 
921 	if (hw_rh_cnt == 0) {
922 		if ((err = aggr_add_pseudo_tx_ring(port, tx_grp,
923 		    NULL, &pseudo_rh)) == 0) {
924 			port->lp_tx_rings[0] = NULL;
925 			port->lp_pseudo_tx_rings[0] = pseudo_rh;
926 		}
927 	} else {
928 		for (i = 0; err == 0 && i < hw_rh_cnt; i++) {
929 			err = aggr_add_pseudo_tx_ring(port,
930 			    tx_grp, hw_rh[i], &pseudo_rh);
931 			if (err != 0)
932 				break;
933 			port->lp_tx_rings[i] = hw_rh[i];
934 			port->lp_pseudo_tx_rings[i] = pseudo_rh;
935 		}
936 	}
937 
938 	if (err != 0) {
939 		if (hw_rh_cnt != 0) {
940 			for (j = 0; j < i; j++) {
941 				aggr_rem_pseudo_tx_ring(tx_grp,
942 				    port->lp_pseudo_tx_rings[j]);
943 			}
944 		}
945 		kmem_free(port->lp_tx_rings,
946 		    (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
947 		kmem_free(port->lp_pseudo_tx_rings,
948 		    (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
949 		port->lp_tx_ring_cnt = 0;
950 	} else {
951 		port->lp_tx_grp_added = B_TRUE;
952 		port->lp_tx_notify_mh = mac_client_tx_notify(port->lp_mch,
953 		    aggr_tx_ring_update, port);
954 	}
955 	mac_perim_exit(pmph);
956 	aggr_grp_update_default(grp);
957 	return (err);
958 }
959 
960 /*
961  * This function is called by aggr to remove pseudo TX rings over the
962  * HW rings of the underlying port.
963  */
964 static void
aggr_rem_pseudo_tx_group(aggr_port_t * port,aggr_pseudo_tx_group_t * tx_grp)965 aggr_rem_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp)
966 {
967 	aggr_grp_t		*grp = port->lp_grp;
968 	mac_perim_handle_t	pmph;
969 	int			i;
970 
971 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
972 	mac_perim_enter_by_mh(port->lp_mh, &pmph);
973 
974 	if (!port->lp_tx_grp_added)
975 		goto done;
976 
977 	ASSERT(tx_grp->atg_gh != NULL);
978 
979 	for (i = 0; i < port->lp_tx_ring_cnt; i++)
980 		aggr_rem_pseudo_tx_ring(tx_grp, port->lp_pseudo_tx_rings[i]);
981 
982 	kmem_free(port->lp_tx_rings,
983 	    (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
984 	kmem_free(port->lp_pseudo_tx_rings,
985 	    (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
986 
987 	port->lp_tx_ring_cnt = 0;
988 	(void) mac_client_tx_notify(port->lp_mch, NULL, port->lp_tx_notify_mh);
989 	port->lp_tx_grp_added = B_FALSE;
990 	aggr_grp_update_default(grp);
991 done:
992 	mac_perim_exit(pmph);
993 }
994 
995 static int
aggr_pseudo_disable_intr(mac_intr_handle_t ih)996 aggr_pseudo_disable_intr(mac_intr_handle_t ih)
997 {
998 	aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
999 	return (mac_hwring_disable_intr(rr_ring->arr_hw_rh));
1000 }
1001 
1002 static int
aggr_pseudo_enable_intr(mac_intr_handle_t ih)1003 aggr_pseudo_enable_intr(mac_intr_handle_t ih)
1004 {
1005 	aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
1006 	return (mac_hwring_enable_intr(rr_ring->arr_hw_rh));
1007 }
1008 
1009 static int
aggr_pseudo_start_ring(mac_ring_driver_t arg,uint64_t mr_gen)1010 aggr_pseudo_start_ring(mac_ring_driver_t arg, uint64_t mr_gen)
1011 {
1012 	aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
1013 	int err;
1014 
1015 	err = mac_hwring_start(rr_ring->arr_hw_rh);
1016 	if (err == 0)
1017 		rr_ring->arr_gen = mr_gen;
1018 	return (err);
1019 }
1020 
1021 static void
aggr_pseudo_stop_ring(mac_ring_driver_t arg)1022 aggr_pseudo_stop_ring(mac_ring_driver_t arg)
1023 {
1024 	aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
1025 	mac_hwring_stop(rr_ring->arr_hw_rh);
1026 }
1027 
1028 /*
1029  * Add one or more ports to an existing link aggregation group.
1030  */
1031 int
aggr_grp_add_ports(datalink_id_t linkid,uint_t nports,boolean_t force,laioc_port_t * ports)1032 aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force,
1033     laioc_port_t *ports)
1034 {
1035 	int rc, i, nadded = 0;
1036 	aggr_grp_t *grp = NULL;
1037 	aggr_port_t *port;
1038 	boolean_t link_state_changed = B_FALSE;
1039 	mac_perim_handle_t mph, pmph;
1040 
1041 	/* get group corresponding to linkid */
1042 	rw_enter(&aggr_grp_lock, RW_READER);
1043 	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1044 	    (mod_hash_val_t *)&grp) != 0) {
1045 		rw_exit(&aggr_grp_lock);
1046 		return (ENOENT);
1047 	}
1048 	AGGR_GRP_REFHOLD(grp);
1049 
1050 	/*
1051 	 * Hold the perimeter so that the aggregation won't be destroyed.
1052 	 */
1053 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1054 	rw_exit(&aggr_grp_lock);
1055 
1056 	/* add the specified ports to group */
1057 	for (i = 0; i < nports; i++) {
1058 		/* add port to group */
1059 		if ((rc = aggr_grp_add_port(grp, ports[i].lp_linkid,
1060 		    force, &port)) != 0) {
1061 			goto bail;
1062 		}
1063 		ASSERT(port != NULL);
1064 		nadded++;
1065 
1066 		/* check capabilities */
1067 		if (!aggr_grp_capab_check(grp, port) ||
1068 		    !aggr_grp_sdu_check(grp, port) ||
1069 		    !aggr_grp_margin_check(grp, port)) {
1070 			rc = ENOTSUP;
1071 			goto bail;
1072 		}
1073 
1074 		/*
1075 		 * Create the pseudo ring for each HW ring of the underlying
1076 		 * port.
1077 		 */
1078 		rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group);
1079 		if (rc != 0)
1080 			goto bail;
1081 		rc = aggr_add_pseudo_rx_group(port, &grp->lg_rx_group);
1082 		if (rc != 0)
1083 			goto bail;
1084 
1085 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
1086 
1087 		/* set LACP mode */
1088 		aggr_port_lacp_set_mode(grp, port);
1089 
1090 		/* start port if group has already been started */
1091 		if (grp->lg_started) {
1092 			rc = aggr_port_start(port);
1093 			if (rc != 0) {
1094 				mac_perim_exit(pmph);
1095 				goto bail;
1096 			}
1097 
1098 			/*
1099 			 * Turn on the promiscuous mode over the port when it
1100 			 * is requested to be turned on to receive the
1101 			 * non-primary address over a port, or the promiscous
1102 			 * mode is enabled over the aggr.
1103 			 */
1104 			if (grp->lg_promisc || port->lp_prom_addr != NULL) {
1105 				rc = aggr_port_promisc(port, B_TRUE);
1106 				if (rc != 0) {
1107 					mac_perim_exit(pmph);
1108 					goto bail;
1109 				}
1110 			}
1111 		}
1112 		mac_perim_exit(pmph);
1113 
1114 		/*
1115 		 * Attach each port if necessary.
1116 		 */
1117 		if (aggr_port_notify_link(grp, port))
1118 			link_state_changed = B_TRUE;
1119 
1120 		/*
1121 		 * Initialize the callback functions for this port.
1122 		 */
1123 		aggr_port_init_callbacks(port);
1124 	}
1125 
1126 	/* update the MAC address of the constituent ports */
1127 	if (aggr_grp_update_ports_mac(grp))
1128 		link_state_changed = B_TRUE;
1129 
1130 	if (link_state_changed)
1131 		mac_link_update(grp->lg_mh, grp->lg_link_state);
1132 
1133 bail:
1134 	if (rc != 0) {
1135 		/* stop and remove ports that have been added */
1136 		for (i = 0; i < nadded; i++) {
1137 			port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1138 			ASSERT(port != NULL);
1139 			if (grp->lg_started) {
1140 				mac_perim_enter_by_mh(port->lp_mh, &pmph);
1141 				(void) aggr_port_promisc(port, B_FALSE);
1142 				aggr_port_stop(port);
1143 				mac_perim_exit(pmph);
1144 			}
1145 			aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1146 			aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);
1147 			(void) aggr_grp_rem_port(grp, port, NULL, NULL);
1148 		}
1149 	}
1150 
1151 	mac_perim_exit(mph);
1152 	AGGR_GRP_REFRELE(grp);
1153 	return (rc);
1154 }
1155 
1156 static int
aggr_grp_modify_common(aggr_grp_t * grp,uint8_t update_mask,uint32_t policy,boolean_t mac_fixed,const uchar_t * mac_addr,aggr_lacp_mode_t lacp_mode,aggr_lacp_timer_t lacp_timer)1157 aggr_grp_modify_common(aggr_grp_t *grp, uint8_t update_mask, uint32_t policy,
1158     boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode,
1159     aggr_lacp_timer_t lacp_timer)
1160 {
1161 	boolean_t mac_addr_changed = B_FALSE;
1162 	boolean_t link_state_changed = B_FALSE;
1163 	mac_perim_handle_t pmph;
1164 
1165 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1166 
1167 	/* validate fixed address if specified */
1168 	if ((update_mask & AGGR_MODIFY_MAC) && mac_fixed &&
1169 	    ((bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) ||
1170 	    (mac_addr[0] & 0x01))) {
1171 		return (EINVAL);
1172 	}
1173 
1174 	/* update policy if requested */
1175 	if (update_mask & AGGR_MODIFY_POLICY)
1176 		aggr_send_update_policy(grp, policy);
1177 
1178 	/* update unicast MAC address if requested */
1179 	if (update_mask & AGGR_MODIFY_MAC) {
1180 		if (mac_fixed) {
1181 			/* user-supplied MAC address */
1182 			grp->lg_mac_addr_port = NULL;
1183 			if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) != 0) {
1184 				bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
1185 				mac_addr_changed = B_TRUE;
1186 			}
1187 		} else if (grp->lg_addr_fixed) {
1188 			/* switch from user-supplied to automatic */
1189 			aggr_port_t *port = grp->lg_ports;
1190 
1191 			mac_perim_enter_by_mh(port->lp_mh, &pmph);
1192 			bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL);
1193 			grp->lg_mac_addr_port = port;
1194 			mac_addr_changed = B_TRUE;
1195 			mac_perim_exit(pmph);
1196 		}
1197 		grp->lg_addr_fixed = mac_fixed;
1198 	}
1199 
1200 	if (mac_addr_changed)
1201 		link_state_changed = aggr_grp_update_ports_mac(grp);
1202 
1203 	if (update_mask & AGGR_MODIFY_LACP_MODE)
1204 		aggr_lacp_update_mode(grp, lacp_mode);
1205 
1206 	if (update_mask & AGGR_MODIFY_LACP_TIMER)
1207 		aggr_lacp_update_timer(grp, lacp_timer);
1208 
1209 	if (link_state_changed)
1210 		mac_link_update(grp->lg_mh, grp->lg_link_state);
1211 
1212 	if (mac_addr_changed)
1213 		mac_unicst_update(grp->lg_mh, grp->lg_addr);
1214 
1215 	return (0);
1216 }
1217 
1218 /*
1219  * Update properties of an existing link aggregation group.
1220  */
1221 int
aggr_grp_modify(datalink_id_t linkid,uint8_t update_mask,uint32_t policy,boolean_t mac_fixed,const uchar_t * mac_addr,aggr_lacp_mode_t lacp_mode,aggr_lacp_timer_t lacp_timer)1222 aggr_grp_modify(datalink_id_t linkid, uint8_t update_mask, uint32_t policy,
1223     boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode,
1224     aggr_lacp_timer_t lacp_timer)
1225 {
1226 	aggr_grp_t *grp = NULL;
1227 	mac_perim_handle_t mph;
1228 	int err;
1229 
1230 	/* get group corresponding to linkid */
1231 	rw_enter(&aggr_grp_lock, RW_READER);
1232 	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1233 	    (mod_hash_val_t *)&grp) != 0) {
1234 		rw_exit(&aggr_grp_lock);
1235 		return (ENOENT);
1236 	}
1237 	AGGR_GRP_REFHOLD(grp);
1238 
1239 	/*
1240 	 * Hold the perimeter so that the aggregation won't be destroyed.
1241 	 */
1242 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1243 	rw_exit(&aggr_grp_lock);
1244 
1245 	err = aggr_grp_modify_common(grp, update_mask, policy, mac_fixed,
1246 	    mac_addr, lacp_mode, lacp_timer);
1247 
1248 	mac_perim_exit(mph);
1249 	AGGR_GRP_REFRELE(grp);
1250 	return (err);
1251 }
1252 
1253 /*
1254  * Create a new link aggregation group upon request from administrator.
1255  * Returns 0 on success, an errno on failure.
1256  */
1257 int
aggr_grp_create(datalink_id_t linkid,uint32_t key,uint_t nports,laioc_port_t * ports,uint32_t policy,boolean_t mac_fixed,boolean_t force,uchar_t * mac_addr,aggr_lacp_mode_t lacp_mode,aggr_lacp_timer_t lacp_timer,cred_t * credp)1258 aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports,
1259     laioc_port_t *ports, uint32_t policy, boolean_t mac_fixed, boolean_t force,
1260     uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, aggr_lacp_timer_t lacp_timer,
1261     cred_t *credp)
1262 {
1263 	aggr_grp_t *grp = NULL;
1264 	aggr_port_t *port;
1265 	mac_register_t *mac;
1266 	boolean_t link_state_changed;
1267 	mac_perim_handle_t mph;
1268 	int err;
1269 	int i;
1270 	kt_did_t tid = 0;
1271 
1272 	/* need at least one port */
1273 	if (nports == 0)
1274 		return (EINVAL);
1275 
1276 	rw_enter(&aggr_grp_lock, RW_WRITER);
1277 
1278 	/* does a group with the same linkid already exist? */
1279 	err = mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1280 	    (mod_hash_val_t *)&grp);
1281 	if (err == 0) {
1282 		rw_exit(&aggr_grp_lock);
1283 		return (EEXIST);
1284 	}
1285 
1286 	grp = kmem_cache_alloc(aggr_grp_cache, KM_SLEEP);
1287 
1288 	grp->lg_refs = 1;
1289 	grp->lg_closing = B_FALSE;
1290 	grp->lg_force = force;
1291 	grp->lg_linkid = linkid;
1292 	grp->lg_zoneid = crgetzoneid(credp);
1293 	grp->lg_ifspeed = 0;
1294 	grp->lg_link_state = LINK_STATE_UNKNOWN;
1295 	grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
1296 	grp->lg_started = B_FALSE;
1297 	grp->lg_promisc = B_FALSE;
1298 	grp->lg_lacp_done = B_FALSE;
1299 	grp->lg_tx_notify_done = B_FALSE;
1300 	grp->lg_lacp_head = grp->lg_lacp_tail = NULL;
1301 	grp->lg_lacp_rx_thread = thread_create(NULL, 0,
1302 	    aggr_lacp_rx_thread, grp, 0, &p0, TS_RUN, minclsyspri);
1303 	grp->lg_tx_notify_thread = thread_create(NULL, 0,
1304 	    aggr_tx_notify_thread, grp, 0, &p0, TS_RUN, minclsyspri);
1305 	grp->lg_tx_blocked_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
1306 	    MAX_RINGS_PER_GROUP), KM_SLEEP);
1307 	grp->lg_tx_blocked_cnt = 0;
1308 	bzero(&grp->lg_rx_group, sizeof (aggr_pseudo_rx_group_t));
1309 	bzero(&grp->lg_tx_group, sizeof (aggr_pseudo_tx_group_t));
1310 	aggr_lacp_init_grp(grp);
1311 
1312 	/* add MAC ports to group */
1313 	grp->lg_ports = NULL;
1314 	grp->lg_nports = 0;
1315 	grp->lg_nattached_ports = 0;
1316 	grp->lg_ntx_ports = 0;
1317 
1318 	/*
1319 	 * If key is not specified by the user, allocate the key.
1320 	 */
1321 	if ((key == 0) && ((key = (uint32_t)id_alloc(key_ids)) == 0)) {
1322 		err = ENOMEM;
1323 		goto bail;
1324 	}
1325 	grp->lg_key = key;
1326 
1327 	for (i = 0; i < nports; i++) {
1328 		err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, NULL);
1329 		if (err != 0)
1330 			goto bail;
1331 	}
1332 
1333 	/*
1334 	 * If no explicit MAC address was specified by the administrator,
1335 	 * set it to the MAC address of the first port.
1336 	 */
1337 	grp->lg_addr_fixed = mac_fixed;
1338 	if (grp->lg_addr_fixed) {
1339 		/* validate specified address */
1340 		if (bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) {
1341 			err = EINVAL;
1342 			goto bail;
1343 		}
1344 		bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
1345 	} else {
1346 		bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
1347 		grp->lg_mac_addr_port = grp->lg_ports;
1348 	}
1349 
1350 	/* set the initial group capabilities */
1351 	aggr_grp_capab_set(grp);
1352 
1353 	if ((mac = mac_alloc(MAC_VERSION)) == NULL) {
1354 		err = ENOMEM;
1355 		goto bail;
1356 	}
1357 	mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1358 	mac->m_driver = grp;
1359 	mac->m_dip = aggr_dip;
1360 	mac->m_instance = grp->lg_key > AGGR_MAX_KEY ? (uint_t)-1 : grp->lg_key;
1361 	mac->m_src_addr = grp->lg_addr;
1362 	mac->m_callbacks = &aggr_m_callbacks;
1363 	mac->m_min_sdu = 0;
1364 	mac->m_max_sdu = grp->lg_max_sdu = aggr_grp_max_sdu(grp);
1365 	mac->m_margin = aggr_grp_max_margin(grp);
1366 	mac->m_v12n = MAC_VIRT_LEVEL1;
1367 	err = mac_register(mac, &grp->lg_mh);
1368 	mac_free(mac);
1369 	if (err != 0)
1370 		goto bail;
1371 
1372 	err = dls_devnet_create(grp->lg_mh, grp->lg_linkid, crgetzoneid(credp));
1373 	if (err != 0) {
1374 		(void) mac_unregister(grp->lg_mh);
1375 		grp->lg_mh = NULL;
1376 		goto bail;
1377 	}
1378 
1379 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1380 
1381 	/*
1382 	 * Update the MAC address of the constituent ports.
1383 	 * None of the port is attached at this time, the link state of the
1384 	 * aggregation will not change.
1385 	 */
1386 	link_state_changed = aggr_grp_update_ports_mac(grp);
1387 	ASSERT(!link_state_changed);
1388 
1389 	/* update outbound load balancing policy */
1390 	aggr_send_update_policy(grp, policy);
1391 
1392 	/* set LACP mode */
1393 	aggr_lacp_set_mode(grp, lacp_mode, lacp_timer);
1394 
1395 	/*
1396 	 * Attach each port if necessary.
1397 	 */
1398 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1399 		/*
1400 		 * Create the pseudo ring for each HW ring of the underlying
1401 		 * port. Note that this is done after the aggr registers the
1402 		 * mac.
1403 		 */
1404 		VERIFY(aggr_add_pseudo_tx_group(port, &grp->lg_tx_group) == 0);
1405 		VERIFY(aggr_add_pseudo_rx_group(port, &grp->lg_rx_group) == 0);
1406 		if (aggr_port_notify_link(grp, port))
1407 			link_state_changed = B_TRUE;
1408 
1409 		/*
1410 		 * Initialize the callback functions for this port.
1411 		 */
1412 		aggr_port_init_callbacks(port);
1413 	}
1414 
1415 	if (link_state_changed)
1416 		mac_link_update(grp->lg_mh, grp->lg_link_state);
1417 
1418 	/* add new group to hash table */
1419 	err = mod_hash_insert(aggr_grp_hash, GRP_HASH_KEY(linkid),
1420 	    (mod_hash_val_t)grp);
1421 	ASSERT(err == 0);
1422 	aggr_grp_cnt++;
1423 
1424 	mac_perim_exit(mph);
1425 	rw_exit(&aggr_grp_lock);
1426 	return (0);
1427 
1428 bail:
1429 
1430 	grp->lg_closing = B_TRUE;
1431 
1432 	port = grp->lg_ports;
1433 	while (port != NULL) {
1434 		aggr_port_t *cport;
1435 
1436 		cport = port->lp_next;
1437 		aggr_port_delete(port);
1438 		port = cport;
1439 	}
1440 
1441 	/*
1442 	 * Inform the lacp_rx thread to exit.
1443 	 */
1444 	mutex_enter(&grp->lg_lacp_lock);
1445 	grp->lg_lacp_done = B_TRUE;
1446 	cv_signal(&grp->lg_lacp_cv);
1447 	while (grp->lg_lacp_rx_thread != NULL)
1448 		cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock);
1449 	mutex_exit(&grp->lg_lacp_lock);
1450 	/*
1451 	 * Inform the tx_notify thread to exit.
1452 	 */
1453 	mutex_enter(&grp->lg_tx_flowctl_lock);
1454 	if (grp->lg_tx_notify_thread != NULL) {
1455 		tid = grp->lg_tx_notify_thread->t_did;
1456 		grp->lg_tx_notify_done = B_TRUE;
1457 		cv_signal(&grp->lg_tx_flowctl_cv);
1458 	}
1459 	mutex_exit(&grp->lg_tx_flowctl_lock);
1460 	if (tid != 0)
1461 		thread_join(tid);
1462 
1463 	kmem_free(grp->lg_tx_blocked_rings,
1464 	    (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP));
1465 	rw_exit(&aggr_grp_lock);
1466 	AGGR_GRP_REFRELE(grp);
1467 	return (err);
1468 }
1469 
1470 /*
1471  * Return a pointer to the member of a group with specified linkid.
1472  */
1473 static aggr_port_t *
aggr_grp_port_lookup(aggr_grp_t * grp,datalink_id_t linkid)1474 aggr_grp_port_lookup(aggr_grp_t *grp, datalink_id_t linkid)
1475 {
1476 	aggr_port_t *port;
1477 
1478 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1479 
1480 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1481 		if (port->lp_linkid == linkid)
1482 			break;
1483 	}
1484 
1485 	return (port);
1486 }
1487 
1488 /*
1489  * Stop, detach and remove a port from a link aggregation group.
1490  */
1491 static int
aggr_grp_rem_port(aggr_grp_t * grp,aggr_port_t * port,boolean_t * mac_addr_changedp,boolean_t * link_state_changedp)1492 aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port,
1493     boolean_t *mac_addr_changedp, boolean_t *link_state_changedp)
1494 {
1495 	int rc = 0;
1496 	aggr_port_t **pport;
1497 	boolean_t mac_addr_changed = B_FALSE;
1498 	boolean_t link_state_changed = B_FALSE;
1499 	mac_perim_handle_t mph;
1500 	uint64_t val;
1501 	uint_t i;
1502 	uint_t stat;
1503 
1504 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1505 	ASSERT(grp->lg_nports > 1);
1506 	ASSERT(!grp->lg_closing);
1507 
1508 	/* unlink port */
1509 	for (pport = &grp->lg_ports; *pport != port;
1510 	    pport = &(*pport)->lp_next) {
1511 		if (*pport == NULL) {
1512 			rc = ENOENT;
1513 			goto done;
1514 		}
1515 	}
1516 	*pport = port->lp_next;
1517 
1518 	mac_perim_enter_by_mh(port->lp_mh, &mph);
1519 
1520 	/*
1521 	 * If the MAC address of the port being removed was assigned
1522 	 * to the group, update the group MAC address
1523 	 * using the MAC address of a different port.
1524 	 */
1525 	if (!grp->lg_addr_fixed && grp->lg_mac_addr_port == port) {
1526 		/*
1527 		 * Set the MAC address of the group to the
1528 		 * MAC address of its first port.
1529 		 */
1530 		bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
1531 		grp->lg_mac_addr_port = grp->lg_ports;
1532 		mac_addr_changed = B_TRUE;
1533 	}
1534 
1535 	link_state_changed = aggr_grp_detach_port(grp, port);
1536 
1537 	/*
1538 	 * Add the counter statistics of the ports while it was aggregated
1539 	 * to the group's residual statistics.  This is done by obtaining
1540 	 * the current counter from the underlying MAC then subtracting the
1541 	 * value of the counter at the moment it was added to the
1542 	 * aggregation.
1543 	 */
1544 	for (i = 0; i < MAC_NSTAT; i++) {
1545 		stat = i + MAC_STAT_MIN;
1546 		if (!MAC_STAT_ISACOUNTER(stat))
1547 			continue;
1548 		val = aggr_port_stat(port, stat);
1549 		val -= port->lp_stat[i];
1550 		grp->lg_stat[i] += val;
1551 	}
1552 	for (i = 0; i < ETHER_NSTAT; i++) {
1553 		stat = i + MACTYPE_STAT_MIN;
1554 		if (!ETHER_STAT_ISACOUNTER(stat))
1555 			continue;
1556 		val = aggr_port_stat(port, stat);
1557 		val -= port->lp_ether_stat[i];
1558 		grp->lg_ether_stat[i] += val;
1559 	}
1560 
1561 	grp->lg_nports--;
1562 	mac_perim_exit(mph);
1563 
1564 	aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1565 	aggr_port_delete(port);
1566 
1567 	/*
1568 	 * If the group MAC address has changed, update the MAC address of
1569 	 * the remaining constituent ports according to the new MAC
1570 	 * address of the group.
1571 	 */
1572 	if (mac_addr_changed && aggr_grp_update_ports_mac(grp))
1573 		link_state_changed = B_TRUE;
1574 
1575 done:
1576 	if (mac_addr_changedp != NULL)
1577 		*mac_addr_changedp = mac_addr_changed;
1578 	if (link_state_changedp != NULL)
1579 		*link_state_changedp = link_state_changed;
1580 
1581 	return (rc);
1582 }
1583 
1584 /*
1585  * Remove one or more ports from an existing link aggregation group.
1586  */
1587 int
aggr_grp_rem_ports(datalink_id_t linkid,uint_t nports,laioc_port_t * ports)1588 aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports)
1589 {
1590 	int rc = 0, i;
1591 	aggr_grp_t *grp = NULL;
1592 	aggr_port_t *port;
1593 	boolean_t mac_addr_update = B_FALSE, mac_addr_changed;
1594 	boolean_t link_state_update = B_FALSE, link_state_changed;
1595 	mac_perim_handle_t mph, pmph;
1596 
1597 	/* get group corresponding to linkid */
1598 	rw_enter(&aggr_grp_lock, RW_READER);
1599 	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1600 	    (mod_hash_val_t *)&grp) != 0) {
1601 		rw_exit(&aggr_grp_lock);
1602 		return (ENOENT);
1603 	}
1604 	AGGR_GRP_REFHOLD(grp);
1605 
1606 	/*
1607 	 * Hold the perimeter so that the aggregation won't be destroyed.
1608 	 */
1609 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1610 	rw_exit(&aggr_grp_lock);
1611 
1612 	/* we need to keep at least one port per group */
1613 	if (nports >= grp->lg_nports) {
1614 		rc = EINVAL;
1615 		goto bail;
1616 	}
1617 
1618 	/* first verify that all the groups are valid */
1619 	for (i = 0; i < nports; i++) {
1620 		if (aggr_grp_port_lookup(grp, ports[i].lp_linkid) == NULL) {
1621 			/* port not found */
1622 			rc = ENOENT;
1623 			goto bail;
1624 		}
1625 	}
1626 
1627 	/* clear the promiscous mode for the specified ports */
1628 	for (i = 0; i < nports && rc == 0; i++) {
1629 		/* lookup port */
1630 		port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1631 		ASSERT(port != NULL);
1632 
1633 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
1634 		rc = aggr_port_promisc(port, B_FALSE);
1635 		mac_perim_exit(pmph);
1636 	}
1637 	if (rc != 0) {
1638 		for (i = 0; i < nports; i++) {
1639 			port = aggr_grp_port_lookup(grp,
1640 			    ports[i].lp_linkid);
1641 			ASSERT(port != NULL);
1642 
1643 			/*
1644 			 * Turn the promiscuous mode back on if it is required
1645 			 * to receive the non-primary address over a port, or
1646 			 * the promiscous mode is enabled over the aggr.
1647 			 */
1648 			mac_perim_enter_by_mh(port->lp_mh, &pmph);
1649 			if (port->lp_started && (grp->lg_promisc ||
1650 			    port->lp_prom_addr != NULL)) {
1651 				(void) aggr_port_promisc(port, B_TRUE);
1652 			}
1653 			mac_perim_exit(pmph);
1654 		}
1655 		goto bail;
1656 	}
1657 
1658 	/* remove the specified ports from group */
1659 	for (i = 0; i < nports; i++) {
1660 		/* lookup port */
1661 		port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1662 		ASSERT(port != NULL);
1663 
1664 		/* stop port if group has already been started */
1665 		if (grp->lg_started) {
1666 			mac_perim_enter_by_mh(port->lp_mh, &pmph);
1667 			aggr_port_stop(port);
1668 			mac_perim_exit(pmph);
1669 		}
1670 
1671 		/*
1672 		 * aggr_rem_pseudo_tx_group() is not called here. Instead
1673 		 * it is called from inside aggr_grp_rem_port() after the
1674 		 * port has been detached. The reason is that
1675 		 * aggr_rem_pseudo_tx_group() removes one ring at a time
1676 		 * and if there is still traffic going on, then there
1677 		 * is the possibility of aggr_find_tx_ring() returning a
1678 		 * removed ring for transmission. Once the port has been
1679 		 * detached, that port will not be used and
1680 		 * aggr_find_tx_ring() will not return any rings
1681 		 * belonging to it.
1682 		 */
1683 		aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);
1684 
1685 		/* remove port from group */
1686 		rc = aggr_grp_rem_port(grp, port, &mac_addr_changed,
1687 		    &link_state_changed);
1688 		ASSERT(rc == 0);
1689 		mac_addr_update = mac_addr_update || mac_addr_changed;
1690 		link_state_update = link_state_update || link_state_changed;
1691 	}
1692 
1693 bail:
1694 	if (mac_addr_update)
1695 		mac_unicst_update(grp->lg_mh, grp->lg_addr);
1696 	if (link_state_update)
1697 		mac_link_update(grp->lg_mh, grp->lg_link_state);
1698 
1699 	mac_perim_exit(mph);
1700 	AGGR_GRP_REFRELE(grp);
1701 
1702 	return (rc);
1703 }
1704 
1705 int
aggr_grp_delete(datalink_id_t linkid,cred_t * cred)1706 aggr_grp_delete(datalink_id_t linkid, cred_t *cred)
1707 {
1708 	aggr_grp_t *grp = NULL;
1709 	aggr_port_t *port, *cport;
1710 	datalink_id_t tmpid;
1711 	mod_hash_val_t val;
1712 	mac_perim_handle_t mph, pmph;
1713 	int err;
1714 	kt_did_t tid = 0;
1715 
1716 	rw_enter(&aggr_grp_lock, RW_WRITER);
1717 
1718 	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1719 	    (mod_hash_val_t *)&grp) != 0) {
1720 		rw_exit(&aggr_grp_lock);
1721 		return (ENOENT);
1722 	}
1723 
1724 	/*
1725 	 * Note that dls_devnet_destroy() must be called before lg_lock is
1726 	 * held. Otherwise, it will deadlock if another thread is in
1727 	 * aggr_m_stat() and thus has a kstat_hold() on the kstats that
1728 	 * dls_devnet_destroy() needs to delete.
1729 	 */
1730 	if ((err = dls_devnet_destroy(grp->lg_mh, &tmpid, B_TRUE)) != 0) {
1731 		rw_exit(&aggr_grp_lock);
1732 		return (err);
1733 	}
1734 	ASSERT(linkid == tmpid);
1735 
1736 	/*
1737 	 * Unregister from the MAC service module. Since this can
1738 	 * fail if a client hasn't closed the MAC port, we gracefully
1739 	 * fail the operation.
1740 	 */
1741 	if ((err = mac_disable(grp->lg_mh)) != 0) {
1742 		(void) dls_devnet_create(grp->lg_mh, linkid, crgetzoneid(cred));
1743 		rw_exit(&aggr_grp_lock);
1744 		return (err);
1745 	}
1746 	(void) mod_hash_remove(aggr_grp_hash, GRP_HASH_KEY(linkid), &val);
1747 	ASSERT(grp == (aggr_grp_t *)val);
1748 
1749 	ASSERT(aggr_grp_cnt > 0);
1750 	aggr_grp_cnt--;
1751 	rw_exit(&aggr_grp_lock);
1752 
1753 	/*
1754 	 * Inform the lacp_rx thread to exit.
1755 	 */
1756 	mutex_enter(&grp->lg_lacp_lock);
1757 	grp->lg_lacp_done = B_TRUE;
1758 	cv_signal(&grp->lg_lacp_cv);
1759 	while (grp->lg_lacp_rx_thread != NULL)
1760 		cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock);
1761 	mutex_exit(&grp->lg_lacp_lock);
1762 	/*
1763 	 * Inform the tx_notify_thread to exit.
1764 	 */
1765 	mutex_enter(&grp->lg_tx_flowctl_lock);
1766 	if (grp->lg_tx_notify_thread != NULL) {
1767 		tid = grp->lg_tx_notify_thread->t_did;
1768 		grp->lg_tx_notify_done = B_TRUE;
1769 		cv_signal(&grp->lg_tx_flowctl_cv);
1770 	}
1771 	mutex_exit(&grp->lg_tx_flowctl_lock);
1772 	if (tid != 0)
1773 		thread_join(tid);
1774 
1775 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1776 
1777 	grp->lg_closing = B_TRUE;
1778 	/* detach and free MAC ports associated with group */
1779 	port = grp->lg_ports;
1780 	while (port != NULL) {
1781 		cport = port->lp_next;
1782 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
1783 		if (grp->lg_started)
1784 			aggr_port_stop(port);
1785 		(void) aggr_grp_detach_port(grp, port);
1786 		mac_perim_exit(pmph);
1787 		aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1788 		aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);
1789 		aggr_port_delete(port);
1790 		port = cport;
1791 	}
1792 
1793 	mac_perim_exit(mph);
1794 
1795 	kmem_free(grp->lg_tx_blocked_rings,
1796 	    (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP));
1797 	/*
1798 	 * Wait for the port's lacp timer thread and its notification callback
1799 	 * to exit before calling mac_unregister() since both needs to access
1800 	 * the mac perimeter of the grp.
1801 	 */
1802 	aggr_grp_port_wait(grp);
1803 
1804 	VERIFY(mac_unregister(grp->lg_mh) == 0);
1805 	grp->lg_mh = NULL;
1806 
1807 	AGGR_GRP_REFRELE(grp);
1808 	return (0);
1809 }
1810 
1811 void
aggr_grp_free(aggr_grp_t * grp)1812 aggr_grp_free(aggr_grp_t *grp)
1813 {
1814 	ASSERT(grp->lg_refs == 0);
1815 	ASSERT(grp->lg_port_ref == 0);
1816 	if (grp->lg_key > AGGR_MAX_KEY) {
1817 		id_free(key_ids, grp->lg_key);
1818 		grp->lg_key = 0;
1819 	}
1820 	kmem_cache_free(aggr_grp_cache, grp);
1821 }
1822 
1823 int
aggr_grp_info(datalink_id_t linkid,void * fn_arg,aggr_grp_info_new_grp_fn_t new_grp_fn,aggr_grp_info_new_port_fn_t new_port_fn,cred_t * cred)1824 aggr_grp_info(datalink_id_t linkid, void *fn_arg,
1825     aggr_grp_info_new_grp_fn_t new_grp_fn,
1826     aggr_grp_info_new_port_fn_t new_port_fn, cred_t *cred)
1827 {
1828 	aggr_grp_t	*grp;
1829 	aggr_port_t	*port;
1830 	mac_perim_handle_t mph, pmph;
1831 	int		rc = 0;
1832 
1833 	/*
1834 	 * Make sure that the aggregation link is visible from the caller's
1835 	 * zone.
1836 	 */
1837 	if (!dls_devnet_islinkvisible(linkid, crgetzoneid(cred)))
1838 		return (ENOENT);
1839 
1840 	rw_enter(&aggr_grp_lock, RW_READER);
1841 
1842 	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1843 	    (mod_hash_val_t *)&grp) != 0) {
1844 		rw_exit(&aggr_grp_lock);
1845 		return (ENOENT);
1846 	}
1847 	AGGR_GRP_REFHOLD(grp);
1848 
1849 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1850 	rw_exit(&aggr_grp_lock);
1851 
1852 	rc = new_grp_fn(fn_arg, grp->lg_linkid,
1853 	    (grp->lg_key > AGGR_MAX_KEY) ? 0 : grp->lg_key, grp->lg_addr,
1854 	    grp->lg_addr_fixed, grp->lg_force, grp->lg_tx_policy,
1855 	    grp->lg_nports, grp->lg_lacp_mode, grp->aggr.PeriodicTimer);
1856 
1857 	if (rc != 0)
1858 		goto bail;
1859 
1860 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1861 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
1862 		rc = new_port_fn(fn_arg, port->lp_linkid, port->lp_addr,
1863 		    port->lp_state, &port->lp_lacp.ActorOperPortState);
1864 		mac_perim_exit(pmph);
1865 
1866 		if (rc != 0)
1867 			goto bail;
1868 	}
1869 
1870 bail:
1871 	mac_perim_exit(mph);
1872 	AGGR_GRP_REFRELE(grp);
1873 	return (rc);
1874 }
1875 
1876 /*ARGSUSED*/
1877 static void
aggr_m_ioctl(void * arg,queue_t * q,mblk_t * mp)1878 aggr_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
1879 {
1880 	miocnak(q, mp, 0, ENOTSUP);
1881 }
1882 
1883 static int
aggr_grp_stat(aggr_grp_t * grp,uint_t stat,uint64_t * val)1884 aggr_grp_stat(aggr_grp_t *grp, uint_t stat, uint64_t *val)
1885 {
1886 	aggr_port_t	*port;
1887 	uint_t		stat_index;
1888 
1889 	/* We only aggregate counter statistics. */
1890 	if (IS_MAC_STAT(stat) && !MAC_STAT_ISACOUNTER(stat) ||
1891 	    IS_MACTYPE_STAT(stat) && !ETHER_STAT_ISACOUNTER(stat)) {
1892 		return (ENOTSUP);
1893 	}
1894 
1895 	/*
1896 	 * Counter statistics for a group are computed by aggregating the
1897 	 * counters of the members MACs while they were aggregated, plus
1898 	 * the residual counter of the group itself, which is updated each
1899 	 * time a MAC is removed from the group.
1900 	 */
1901 	*val = 0;
1902 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1903 		/* actual port statistic */
1904 		*val += aggr_port_stat(port, stat);
1905 		/*
1906 		 * minus the port stat when it was added, plus any residual
1907 		 * amount for the group.
1908 		 */
1909 		if (IS_MAC_STAT(stat)) {
1910 			stat_index = stat - MAC_STAT_MIN;
1911 			*val -= port->lp_stat[stat_index];
1912 			*val += grp->lg_stat[stat_index];
1913 		} else if (IS_MACTYPE_STAT(stat)) {
1914 			stat_index = stat - MACTYPE_STAT_MIN;
1915 			*val -= port->lp_ether_stat[stat_index];
1916 			*val += grp->lg_ether_stat[stat_index];
1917 		}
1918 	}
1919 	return (0);
1920 }
1921 
1922 int
aggr_rx_ring_stat(mac_ring_driver_t rdriver,uint_t stat,uint64_t * val)1923 aggr_rx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
1924 {
1925 	aggr_pseudo_rx_ring_t   *rx_ring = (aggr_pseudo_rx_ring_t *)rdriver;
1926 
1927 	if (rx_ring->arr_hw_rh != NULL) {
1928 		*val = mac_pseudo_rx_ring_stat_get(rx_ring->arr_hw_rh, stat);
1929 	} else {
1930 		aggr_port_t	*port = rx_ring->arr_port;
1931 
1932 		*val = mac_stat_get(port->lp_mh, stat);
1933 
1934 	}
1935 	return (0);
1936 }
1937 
1938 int
aggr_tx_ring_stat(mac_ring_driver_t rdriver,uint_t stat,uint64_t * val)1939 aggr_tx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
1940 {
1941 	aggr_pseudo_tx_ring_t   *tx_ring = (aggr_pseudo_tx_ring_t *)rdriver;
1942 
1943 	if (tx_ring->atr_hw_rh != NULL) {
1944 		*val = mac_pseudo_tx_ring_stat_get(tx_ring->atr_hw_rh, stat);
1945 	} else {
1946 		aggr_port_t	*port = tx_ring->atr_port;
1947 
1948 		*val = mac_stat_get(port->lp_mh, stat);
1949 	}
1950 	return (0);
1951 }
1952 
1953 static int
aggr_m_stat(void * arg,uint_t stat,uint64_t * val)1954 aggr_m_stat(void *arg, uint_t stat, uint64_t *val)
1955 {
1956 	aggr_grp_t		*grp = arg;
1957 	mac_perim_handle_t	mph;
1958 	int			rval = 0;
1959 
1960 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1961 
1962 	switch (stat) {
1963 	case MAC_STAT_IFSPEED:
1964 		*val = grp->lg_ifspeed;
1965 		break;
1966 
1967 	case ETHER_STAT_LINK_DUPLEX:
1968 		*val = grp->lg_link_duplex;
1969 		break;
1970 
1971 	default:
1972 		/*
1973 		 * For all other statistics, we return the aggregated stat
1974 		 * from the underlying ports.  aggr_grp_stat() will set
1975 		 * rval appropriately if the statistic isn't a counter.
1976 		 */
1977 		rval = aggr_grp_stat(grp, stat, val);
1978 	}
1979 
1980 	mac_perim_exit(mph);
1981 	return (rval);
1982 }
1983 
1984 static int
aggr_m_start(void * arg)1985 aggr_m_start(void *arg)
1986 {
1987 	aggr_grp_t *grp = arg;
1988 	aggr_port_t *port;
1989 	mac_perim_handle_t mph, pmph;
1990 
1991 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
1992 
1993 	/*
1994 	 * Attempts to start all configured members of the group.
1995 	 * Group members will be attached when their link-up notification
1996 	 * is received.
1997 	 */
1998 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1999 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
2000 		if (aggr_port_start(port) != 0) {
2001 			mac_perim_exit(pmph);
2002 			continue;
2003 		}
2004 
2005 		/*
2006 		 * Turn on the promiscuous mode if it is required to receive
2007 		 * the non-primary address over a port, or the promiscous
2008 		 * mode is enabled over the aggr.
2009 		 */
2010 		if (grp->lg_promisc || port->lp_prom_addr != NULL) {
2011 			if (aggr_port_promisc(port, B_TRUE) != 0)
2012 				aggr_port_stop(port);
2013 		}
2014 		mac_perim_exit(pmph);
2015 	}
2016 
2017 	grp->lg_started = B_TRUE;
2018 
2019 	mac_perim_exit(mph);
2020 	return (0);
2021 }
2022 
2023 static void
aggr_m_stop(void * arg)2024 aggr_m_stop(void *arg)
2025 {
2026 	aggr_grp_t *grp = arg;
2027 	aggr_port_t *port;
2028 	mac_perim_handle_t mph, pmph;
2029 
2030 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2031 
2032 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2033 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
2034 
2035 		/* reset port promiscuous mode */
2036 		(void) aggr_port_promisc(port, B_FALSE);
2037 
2038 		aggr_port_stop(port);
2039 		mac_perim_exit(pmph);
2040 	}
2041 
2042 	grp->lg_started = B_FALSE;
2043 	mac_perim_exit(mph);
2044 }
2045 
2046 static int
aggr_m_promisc(void * arg,boolean_t on)2047 aggr_m_promisc(void *arg, boolean_t on)
2048 {
2049 	aggr_grp_t *grp = arg;
2050 	aggr_port_t *port;
2051 	boolean_t link_state_changed = B_FALSE;
2052 	mac_perim_handle_t mph, pmph;
2053 
2054 	AGGR_GRP_REFHOLD(grp);
2055 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2056 
2057 	ASSERT(!grp->lg_closing);
2058 
2059 	if (on == grp->lg_promisc)
2060 		goto bail;
2061 
2062 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2063 		int	err = 0;
2064 
2065 		mac_perim_enter_by_mh(port->lp_mh, &pmph);
2066 		AGGR_PORT_REFHOLD(port);
2067 		if (!on && (port->lp_prom_addr == NULL))
2068 			err = aggr_port_promisc(port, B_FALSE);
2069 		else if (on && port->lp_started)
2070 			err = aggr_port_promisc(port, B_TRUE);
2071 
2072 		if (err != 0) {
2073 			if (aggr_grp_detach_port(grp, port))
2074 				link_state_changed = B_TRUE;
2075 		} else {
2076 			/*
2077 			 * If a port was detached because of a previous
2078 			 * failure changing the promiscuity, the port
2079 			 * is reattached when it successfully changes
2080 			 * the promiscuity now, and this might cause
2081 			 * the link state of the aggregation to change.
2082 			 */
2083 			if (aggr_grp_attach_port(grp, port))
2084 				link_state_changed = B_TRUE;
2085 		}
2086 		mac_perim_exit(pmph);
2087 		AGGR_PORT_REFRELE(port);
2088 	}
2089 
2090 	grp->lg_promisc = on;
2091 
2092 	if (link_state_changed)
2093 		mac_link_update(grp->lg_mh, grp->lg_link_state);
2094 
2095 bail:
2096 	mac_perim_exit(mph);
2097 	AGGR_GRP_REFRELE(grp);
2098 
2099 	return (0);
2100 }
2101 
2102 static void
aggr_grp_port_rename(const char * new_name,void * arg)2103 aggr_grp_port_rename(const char *new_name, void *arg)
2104 {
2105 	/*
2106 	 * aggr port's mac client name is the format of "aggr link name" plus
2107 	 * AGGR_PORT_NAME_DELIMIT plus "underneath link name".
2108 	 */
2109 	int aggr_len, link_len, clnt_name_len, i;
2110 	char *str_end, *str_st, *str_del;
2111 	char aggr_name[MAXNAMELEN];
2112 	char link_name[MAXNAMELEN];
2113 	char *clnt_name;
2114 	aggr_grp_t *aggr_grp = arg;
2115 	aggr_port_t *aggr_port = aggr_grp->lg_ports;
2116 
2117 	for (i = 0; i < aggr_grp->lg_nports; i++) {
2118 		clnt_name = mac_client_name(aggr_port->lp_mch);
2119 		clnt_name_len = strlen(clnt_name);
2120 		str_st = clnt_name;
2121 		str_end = &(clnt_name[clnt_name_len]);
2122 		str_del = strchr(str_st, AGGR_PORT_NAME_DELIMIT);
2123 		ASSERT(str_del != NULL);
2124 		aggr_len = (intptr_t)((uintptr_t)str_del - (uintptr_t)str_st);
2125 		link_len = (intptr_t)((uintptr_t)str_end - (uintptr_t)str_del);
2126 		bzero(aggr_name, MAXNAMELEN);
2127 		bzero(link_name, MAXNAMELEN);
2128 		bcopy(clnt_name, aggr_name, aggr_len);
2129 		bcopy(str_del, link_name, link_len + 1);
2130 		bzero(clnt_name, MAXNAMELEN);
2131 		(void) snprintf(clnt_name, MAXNAMELEN, "%s%s", new_name,
2132 		    link_name);
2133 
2134 		(void) mac_rename_primary(aggr_port->lp_mh, NULL);
2135 		aggr_port = aggr_port->lp_next;
2136 	}
2137 }
2138 
2139 /*
2140  * Initialize the capabilities that are advertised for the group
2141  * according to the capabilities of the constituent ports.
2142  */
2143 static boolean_t
aggr_m_capab_get(void * arg,mac_capab_t cap,void * cap_data)2144 aggr_m_capab_get(void *arg, mac_capab_t cap, void *cap_data)
2145 {
2146 	aggr_grp_t *grp = arg;
2147 
2148 	switch (cap) {
2149 	case MAC_CAPAB_HCKSUM: {
2150 		uint32_t *hcksum_txflags = cap_data;
2151 		*hcksum_txflags = grp->lg_hcksum_txflags;
2152 		break;
2153 	}
2154 	case MAC_CAPAB_LSO: {
2155 		mac_capab_lso_t *cap_lso = cap_data;
2156 
2157 		if (grp->lg_lso) {
2158 			*cap_lso = grp->lg_cap_lso;
2159 			break;
2160 		} else {
2161 			return (B_FALSE);
2162 		}
2163 	}
2164 	case MAC_CAPAB_NO_NATIVEVLAN:
2165 		return (!grp->lg_vlan);
2166 	case MAC_CAPAB_NO_ZCOPY:
2167 		return (!grp->lg_zcopy);
2168 	case MAC_CAPAB_RINGS: {
2169 		mac_capab_rings_t *cap_rings = cap_data;
2170 
2171 		if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
2172 			cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2173 			cap_rings->mr_rnum = grp->lg_rx_group.arg_ring_cnt;
2174 
2175 			/*
2176 			 * An aggregation advertises only one (pseudo) RX
2177 			 * group, which virtualizes the main/primary group of
2178 			 * the underlying devices.
2179 			 */
2180 			cap_rings->mr_gnum = 1;
2181 			cap_rings->mr_gaddring = NULL;
2182 			cap_rings->mr_gremring = NULL;
2183 		} else {
2184 			cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2185 			cap_rings->mr_rnum = grp->lg_tx_group.atg_ring_cnt;
2186 			cap_rings->mr_gnum = 0;
2187 		}
2188 		cap_rings->mr_rget = aggr_fill_ring;
2189 		cap_rings->mr_gget = aggr_fill_group;
2190 		break;
2191 	}
2192 	case MAC_CAPAB_AGGR:
2193 	{
2194 		mac_capab_aggr_t *aggr_cap;
2195 
2196 		if (cap_data != NULL) {
2197 			aggr_cap = cap_data;
2198 			aggr_cap->mca_rename_fn = aggr_grp_port_rename;
2199 			aggr_cap->mca_unicst = aggr_m_unicst;
2200 			aggr_cap->mca_find_tx_ring_fn = aggr_find_tx_ring;
2201 			aggr_cap->mca_arg = arg;
2202 		}
2203 		return (B_TRUE);
2204 	}
2205 	default:
2206 		return (B_FALSE);
2207 	}
2208 	return (B_TRUE);
2209 }
2210 
2211 /*
2212  * Callback funtion for MAC layer to register groups.
2213  */
2214 static void
aggr_fill_group(void * arg,mac_ring_type_t rtype,const int index,mac_group_info_t * infop,mac_group_handle_t gh)2215 aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index,
2216     mac_group_info_t *infop, mac_group_handle_t gh)
2217 {
2218 	aggr_grp_t *grp = arg;
2219 	aggr_pseudo_rx_group_t *rx_group;
2220 	aggr_pseudo_tx_group_t *tx_group;
2221 
2222 	ASSERT(index == 0);
2223 	if (rtype == MAC_RING_TYPE_RX) {
2224 		rx_group = &grp->lg_rx_group;
2225 		rx_group->arg_gh = gh;
2226 		rx_group->arg_grp = grp;
2227 
2228 		infop->mgi_driver = (mac_group_driver_t)rx_group;
2229 		infop->mgi_start = NULL;
2230 		infop->mgi_stop = NULL;
2231 		infop->mgi_addmac = aggr_addmac;
2232 		infop->mgi_remmac = aggr_remmac;
2233 		infop->mgi_count = rx_group->arg_ring_cnt;
2234 	} else {
2235 		tx_group = &grp->lg_tx_group;
2236 		tx_group->atg_gh = gh;
2237 	}
2238 }
2239 
2240 /*
2241  * Callback funtion for MAC layer to register all rings.
2242  */
2243 static void
aggr_fill_ring(void * arg,mac_ring_type_t rtype,const int rg_index,const int index,mac_ring_info_t * infop,mac_ring_handle_t rh)2244 aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index,
2245     const int index, mac_ring_info_t *infop, mac_ring_handle_t rh)
2246 {
2247 	aggr_grp_t	*grp = arg;
2248 
2249 	switch (rtype) {
2250 	case MAC_RING_TYPE_RX: {
2251 		aggr_pseudo_rx_group_t	*rx_group = &grp->lg_rx_group;
2252 		aggr_pseudo_rx_ring_t	*rx_ring;
2253 		mac_intr_t		aggr_mac_intr;
2254 
2255 		ASSERT(rg_index == 0);
2256 
2257 		ASSERT((index >= 0) && (index < rx_group->arg_ring_cnt));
2258 		rx_ring = rx_group->arg_rings + index;
2259 		rx_ring->arr_rh = rh;
2260 
2261 		/*
2262 		 * Entrypoint to enable interrupt (disable poll) and
2263 		 * disable interrupt (enable poll).
2264 		 */
2265 		aggr_mac_intr.mi_handle = (mac_intr_handle_t)rx_ring;
2266 		aggr_mac_intr.mi_enable = aggr_pseudo_enable_intr;
2267 		aggr_mac_intr.mi_disable = aggr_pseudo_disable_intr;
2268 		aggr_mac_intr.mi_ddi_handle = NULL;
2269 
2270 		infop->mri_driver = (mac_ring_driver_t)rx_ring;
2271 		infop->mri_start = aggr_pseudo_start_ring;
2272 		infop->mri_stop = aggr_pseudo_stop_ring;
2273 
2274 		infop->mri_intr = aggr_mac_intr;
2275 		infop->mri_poll = aggr_rx_poll;
2276 
2277 		infop->mri_stat = aggr_rx_ring_stat;
2278 		break;
2279 	}
2280 	case MAC_RING_TYPE_TX: {
2281 		aggr_pseudo_tx_group_t	*tx_group = &grp->lg_tx_group;
2282 		aggr_pseudo_tx_ring_t	*tx_ring;
2283 
2284 		ASSERT(rg_index == -1);
2285 		ASSERT(index < tx_group->atg_ring_cnt);
2286 
2287 		tx_ring = &tx_group->atg_rings[index];
2288 		tx_ring->atr_rh = rh;
2289 
2290 		infop->mri_driver = (mac_ring_driver_t)tx_ring;
2291 		infop->mri_start = NULL;
2292 		infop->mri_stop = NULL;
2293 		infop->mri_tx = aggr_ring_tx;
2294 		infop->mri_stat = aggr_tx_ring_stat;
2295 		/*
2296 		 * Use the hw TX ring handle to find if the ring needs
2297 		 * serialization or not. For NICs that do not expose
2298 		 * Tx rings, atr_hw_rh will be NULL.
2299 		 */
2300 		if (tx_ring->atr_hw_rh != NULL) {
2301 			infop->mri_flags =
2302 			    mac_hwring_getinfo(tx_ring->atr_hw_rh);
2303 		}
2304 		break;
2305 	}
2306 	default:
2307 		break;
2308 	}
2309 }
2310 
2311 static mblk_t *
aggr_rx_poll(void * arg,int bytes_to_pickup)2312 aggr_rx_poll(void *arg, int bytes_to_pickup)
2313 {
2314 	aggr_pseudo_rx_ring_t *rr_ring = arg;
2315 	aggr_port_t *port = rr_ring->arr_port;
2316 	aggr_grp_t *grp = port->lp_grp;
2317 	mblk_t *mp_chain, *mp, **mpp;
2318 
2319 	mp_chain = mac_hwring_poll(rr_ring->arr_hw_rh, bytes_to_pickup);
2320 
2321 	if (grp->lg_lacp_mode == AGGR_LACP_OFF)
2322 		return (mp_chain);
2323 
2324 	mpp = &mp_chain;
2325 	while ((mp = *mpp) != NULL) {
2326 		if (MBLKL(mp) >= sizeof (struct ether_header)) {
2327 			struct ether_header *ehp;
2328 
2329 			ehp = (struct ether_header *)mp->b_rptr;
2330 			if (ntohs(ehp->ether_type) == ETHERTYPE_SLOW) {
2331 				*mpp = mp->b_next;
2332 				mp->b_next = NULL;
2333 				aggr_recv_lacp(port,
2334 				    (mac_resource_handle_t)rr_ring, mp);
2335 				continue;
2336 			}
2337 		}
2338 
2339 		if (!port->lp_collector_enabled) {
2340 			*mpp = mp->b_next;
2341 			mp->b_next = NULL;
2342 			freemsg(mp);
2343 			continue;
2344 		}
2345 		mpp = &mp->b_next;
2346 	}
2347 	return (mp_chain);
2348 }
2349 
2350 static int
aggr_addmac(void * arg,const uint8_t * mac_addr)2351 aggr_addmac(void *arg, const uint8_t *mac_addr)
2352 {
2353 	aggr_pseudo_rx_group_t	*rx_group = (aggr_pseudo_rx_group_t *)arg;
2354 	aggr_unicst_addr_t	*addr, **pprev;
2355 	aggr_grp_t		*grp = rx_group->arg_grp;
2356 	aggr_port_t		*port, *p;
2357 	mac_perim_handle_t	mph;
2358 	int			err = 0;
2359 
2360 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2361 
2362 	if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
2363 		mac_perim_exit(mph);
2364 		return (0);
2365 	}
2366 
2367 	/*
2368 	 * Insert this mac address into the list of mac addresses owned by
2369 	 * the aggregation pseudo group.
2370 	 */
2371 	pprev = &rx_group->arg_macaddr;
2372 	while ((addr = *pprev) != NULL) {
2373 		if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) == 0) {
2374 			mac_perim_exit(mph);
2375 			return (EEXIST);
2376 		}
2377 		pprev = &addr->aua_next;
2378 	}
2379 	addr = kmem_alloc(sizeof (aggr_unicst_addr_t), KM_SLEEP);
2380 	bcopy(mac_addr, addr->aua_addr, ETHERADDRL);
2381 	addr->aua_next = NULL;
2382 	*pprev = addr;
2383 
2384 	for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2385 		if ((err = aggr_port_addmac(port, mac_addr)) != 0)
2386 			break;
2387 
2388 	if (err != 0) {
2389 		for (p = grp->lg_ports; p != port; p = p->lp_next)
2390 			aggr_port_remmac(p, mac_addr);
2391 
2392 		*pprev = NULL;
2393 		kmem_free(addr, sizeof (aggr_unicst_addr_t));
2394 	}
2395 
2396 	mac_perim_exit(mph);
2397 	return (err);
2398 }
2399 
2400 static int
aggr_remmac(void * arg,const uint8_t * mac_addr)2401 aggr_remmac(void *arg, const uint8_t *mac_addr)
2402 {
2403 	aggr_pseudo_rx_group_t	*rx_group = (aggr_pseudo_rx_group_t *)arg;
2404 	aggr_unicst_addr_t	*addr, **pprev;
2405 	aggr_grp_t		*grp = rx_group->arg_grp;
2406 	aggr_port_t		*port;
2407 	mac_perim_handle_t	mph;
2408 	int			err = 0;
2409 
2410 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2411 
2412 	if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
2413 		mac_perim_exit(mph);
2414 		return (0);
2415 	}
2416 
2417 	/*
2418 	 * Insert this mac address into the list of mac addresses owned by
2419 	 * the aggregation pseudo group.
2420 	 */
2421 	pprev = &rx_group->arg_macaddr;
2422 	while ((addr = *pprev) != NULL) {
2423 		if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) != 0) {
2424 			pprev = &addr->aua_next;
2425 			continue;
2426 		}
2427 		break;
2428 	}
2429 	if (addr == NULL) {
2430 		mac_perim_exit(mph);
2431 		return (EINVAL);
2432 	}
2433 
2434 	for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2435 		aggr_port_remmac(port, mac_addr);
2436 
2437 	*pprev = addr->aua_next;
2438 	kmem_free(addr, sizeof (aggr_unicst_addr_t));
2439 
2440 	mac_perim_exit(mph);
2441 	return (err);
2442 }
2443 
2444 /*
2445  * Add or remove the multicast addresses that are defined for the group
2446  * to or from the specified port.
2447  *
2448  * Note that aggr_grp_multicst_port(..., B_TRUE) is called when the port
2449  * is started and attached, and aggr_grp_multicst_port(..., B_FALSE) is
2450  * called when the port is either stopped or detached.
2451  */
2452 void
aggr_grp_multicst_port(aggr_port_t * port,boolean_t add)2453 aggr_grp_multicst_port(aggr_port_t *port, boolean_t add)
2454 {
2455 	aggr_grp_t *grp = port->lp_grp;
2456 
2457 	ASSERT(MAC_PERIM_HELD(port->lp_mh));
2458 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
2459 
2460 	if (!port->lp_started || port->lp_state != AGGR_PORT_STATE_ATTACHED)
2461 		return;
2462 
2463 	mac_multicast_refresh(grp->lg_mh, aggr_port_multicst, port, add);
2464 }
2465 
2466 static int
aggr_m_multicst(void * arg,boolean_t add,const uint8_t * addrp)2467 aggr_m_multicst(void *arg, boolean_t add, const uint8_t *addrp)
2468 {
2469 	aggr_grp_t *grp = arg;
2470 	aggr_port_t *port = NULL, *errport = NULL;
2471 	mac_perim_handle_t mph;
2472 	int err = 0;
2473 
2474 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2475 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2476 		if (port->lp_state != AGGR_PORT_STATE_ATTACHED ||
2477 		    !port->lp_started) {
2478 			continue;
2479 		}
2480 		err = aggr_port_multicst(port, add, addrp);
2481 		if (err != 0) {
2482 			errport = port;
2483 			break;
2484 		}
2485 	}
2486 
2487 	/*
2488 	 * At least one port caused error return and this error is returned to
2489 	 * mac, eventually a NAK would be sent upwards.
2490 	 * Some ports have this multicast address listed now, and some don't.
2491 	 * Treat this error as a whole aggr failure not individual port failure.
2492 	 * Therefore remove this multicast address from other ports.
2493 	 */
2494 	if ((err != 0) && add) {
2495 		for (port = grp->lg_ports; port != errport;
2496 		    port = port->lp_next) {
2497 			if (port->lp_state != AGGR_PORT_STATE_ATTACHED ||
2498 			    !port->lp_started) {
2499 				continue;
2500 			}
2501 			(void) aggr_port_multicst(port, B_FALSE, addrp);
2502 		}
2503 	}
2504 	mac_perim_exit(mph);
2505 	return (err);
2506 }
2507 
2508 static int
aggr_m_unicst(void * arg,const uint8_t * macaddr)2509 aggr_m_unicst(void *arg, const uint8_t *macaddr)
2510 {
2511 	aggr_grp_t *grp = arg;
2512 	mac_perim_handle_t mph;
2513 	int err;
2514 
2515 	mac_perim_enter_by_mh(grp->lg_mh, &mph);
2516 	err = aggr_grp_modify_common(grp, AGGR_MODIFY_MAC, 0, B_TRUE, macaddr,
2517 	    0, 0);
2518 	mac_perim_exit(mph);
2519 	return (err);
2520 }
2521 
2522 /*
2523  * Initialize the capabilities that are advertised for the group
2524  * according to the capabilities of the constituent ports.
2525  */
2526 static void
aggr_grp_capab_set(aggr_grp_t * grp)2527 aggr_grp_capab_set(aggr_grp_t *grp)
2528 {
2529 	uint32_t cksum;
2530 	aggr_port_t *port;
2531 	mac_capab_lso_t cap_lso;
2532 
2533 	ASSERT(grp->lg_mh == NULL);
2534 	ASSERT(grp->lg_ports != NULL);
2535 
2536 	grp->lg_hcksum_txflags = (uint32_t)-1;
2537 	grp->lg_zcopy = B_TRUE;
2538 	grp->lg_vlan = B_TRUE;
2539 
2540 	grp->lg_lso = B_TRUE;
2541 	grp->lg_cap_lso.lso_flags = (t_uscalar_t)-1;
2542 	grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max = (t_uscalar_t)-1;
2543 
2544 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2545 		if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &cksum))
2546 			cksum = 0;
2547 		grp->lg_hcksum_txflags &= cksum;
2548 
2549 		grp->lg_vlan &=
2550 		    !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL);
2551 
2552 		grp->lg_zcopy &=
2553 		    !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL);
2554 
2555 		grp->lg_lso &=
2556 		    mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso);
2557 		if (grp->lg_lso) {
2558 			grp->lg_cap_lso.lso_flags &= cap_lso.lso_flags;
2559 			if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max >
2560 			    cap_lso.lso_basic_tcp_ipv4.lso_max)
2561 				grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max =
2562 				    cap_lso.lso_basic_tcp_ipv4.lso_max;
2563 		}
2564 	}
2565 }
2566 
2567 /*
2568  * Checks whether the capabilities of the port being added are compatible
2569  * with the current capabilities of the aggregation.
2570  */
2571 static boolean_t
aggr_grp_capab_check(aggr_grp_t * grp,aggr_port_t * port)2572 aggr_grp_capab_check(aggr_grp_t *grp, aggr_port_t *port)
2573 {
2574 	uint32_t hcksum_txflags;
2575 
2576 	ASSERT(grp->lg_ports != NULL);
2577 
2578 	if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL)) &
2579 	    grp->lg_vlan) != grp->lg_vlan) {
2580 		return (B_FALSE);
2581 	}
2582 
2583 	if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL)) &
2584 	    grp->lg_zcopy) != grp->lg_zcopy) {
2585 		return (B_FALSE);
2586 	}
2587 
2588 	if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &hcksum_txflags)) {
2589 		if (grp->lg_hcksum_txflags != 0)
2590 			return (B_FALSE);
2591 	} else if ((hcksum_txflags & grp->lg_hcksum_txflags) !=
2592 	    grp->lg_hcksum_txflags) {
2593 		return (B_FALSE);
2594 	}
2595 
2596 	if (grp->lg_lso) {
2597 		mac_capab_lso_t cap_lso;
2598 
2599 		if (mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso)) {
2600 			if ((grp->lg_cap_lso.lso_flags & cap_lso.lso_flags) !=
2601 			    grp->lg_cap_lso.lso_flags)
2602 				return (B_FALSE);
2603 			if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max >
2604 			    cap_lso.lso_basic_tcp_ipv4.lso_max)
2605 				return (B_FALSE);
2606 		} else {
2607 			return (B_FALSE);
2608 		}
2609 	}
2610 
2611 	return (B_TRUE);
2612 }
2613 
2614 /*
2615  * Returns the maximum SDU according to the SDU of the constituent ports.
2616  */
2617 static uint_t
aggr_grp_max_sdu(aggr_grp_t * grp)2618 aggr_grp_max_sdu(aggr_grp_t *grp)
2619 {
2620 	uint_t max_sdu = (uint_t)-1;
2621 	aggr_port_t *port;
2622 
2623 	ASSERT(grp->lg_ports != NULL);
2624 
2625 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2626 		uint_t port_sdu_max;
2627 
2628 		mac_sdu_get(port->lp_mh, NULL, &port_sdu_max);
2629 		if (max_sdu > port_sdu_max)
2630 			max_sdu = port_sdu_max;
2631 	}
2632 
2633 	return (max_sdu);
2634 }
2635 
2636 /*
2637  * Checks if the maximum SDU of the specified port is compatible
2638  * with the maximum SDU of the specified aggregation group, returns
2639  * B_TRUE if it is, B_FALSE otherwise.
2640  */
2641 static boolean_t
aggr_grp_sdu_check(aggr_grp_t * grp,aggr_port_t * port)2642 aggr_grp_sdu_check(aggr_grp_t *grp, aggr_port_t *port)
2643 {
2644 	uint_t port_sdu_max;
2645 
2646 	mac_sdu_get(port->lp_mh, NULL, &port_sdu_max);
2647 	return (port_sdu_max >= grp->lg_max_sdu);
2648 }
2649 
2650 /*
2651  * Returns the maximum margin according to the margin of the constituent ports.
2652  */
2653 static uint32_t
aggr_grp_max_margin(aggr_grp_t * grp)2654 aggr_grp_max_margin(aggr_grp_t *grp)
2655 {
2656 	uint32_t margin = UINT32_MAX;
2657 	aggr_port_t *port;
2658 
2659 	ASSERT(grp->lg_mh == NULL);
2660 	ASSERT(grp->lg_ports != NULL);
2661 
2662 	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2663 		if (margin > port->lp_margin)
2664 			margin = port->lp_margin;
2665 	}
2666 
2667 	grp->lg_margin = margin;
2668 	return (margin);
2669 }
2670 
2671 /*
2672  * Checks if the maximum margin of the specified port is compatible
2673  * with the maximum margin of the specified aggregation group, returns
2674  * B_TRUE if it is, B_FALSE otherwise.
2675  */
2676 static boolean_t
aggr_grp_margin_check(aggr_grp_t * grp,aggr_port_t * port)2677 aggr_grp_margin_check(aggr_grp_t *grp, aggr_port_t *port)
2678 {
2679 	if (port->lp_margin >= grp->lg_margin)
2680 		return (B_TRUE);
2681 
2682 	/*
2683 	 * See whether the current margin value is allowed to be changed to
2684 	 * the new value.
2685 	 */
2686 	if (!mac_margin_update(grp->lg_mh, port->lp_margin))
2687 		return (B_FALSE);
2688 
2689 	grp->lg_margin = port->lp_margin;
2690 	return (B_TRUE);
2691 }
2692 
2693 /*
2694  * Set MTU on individual ports of an aggregation group
2695  */
2696 static int
aggr_set_port_sdu(aggr_grp_t * grp,aggr_port_t * port,uint32_t sdu,uint32_t * old_mtu)2697 aggr_set_port_sdu(aggr_grp_t *grp, aggr_port_t *port, uint32_t sdu,
2698     uint32_t *old_mtu)
2699 {
2700 	boolean_t 		removed = B_FALSE;
2701 	mac_perim_handle_t	mph;
2702 	mac_diag_t		diag;
2703 	int			err, rv, retry = 0;
2704 
2705 	if (port->lp_mah != NULL) {
2706 		(void) mac_unicast_remove(port->lp_mch, port->lp_mah);
2707 		port->lp_mah = NULL;
2708 		removed = B_TRUE;
2709 	}
2710 	err = mac_set_mtu(port->lp_mh, sdu, old_mtu);
2711 try_again:
2712 	if (removed && (rv = mac_unicast_add(port->lp_mch, NULL,
2713 	    MAC_UNICAST_PRIMARY | MAC_UNICAST_DISABLE_TX_VID_CHECK,
2714 	    &port->lp_mah, 0, &diag)) != 0) {
2715 		/*
2716 		 * following is a workaround for a bug in 'bge' driver.
2717 		 * See CR 6794654 for more information and this work around
2718 		 * will be removed once the CR is fixed.
2719 		 */
2720 		if (rv == EIO && retry++ < 3) {
2721 			delay(2 * hz);
2722 			goto try_again;
2723 		}
2724 		/*
2725 		 * if mac_unicast_add() failed while setting the MTU,
2726 		 * detach the port from the group.
2727 		 */
2728 		mac_perim_enter_by_mh(port->lp_mh, &mph);
2729 		(void) aggr_grp_detach_port(grp, port);
2730 		mac_perim_exit(mph);
2731 		cmn_err(CE_WARN, "Unable to restart the port %s while "
2732 		    "setting MTU. Detaching the port from the aggregation.",
2733 		    mac_client_name(port->lp_mch));
2734 	}
2735 	return (err);
2736 }
2737 
2738 static int
aggr_sdu_update(aggr_grp_t * grp,uint32_t sdu)2739 aggr_sdu_update(aggr_grp_t *grp, uint32_t sdu)
2740 {
2741 	int			err = 0, i, rv;
2742 	aggr_port_t		*port;
2743 	uint32_t		*mtu;
2744 
2745 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
2746 
2747 	/*
2748 	 * If the MTU being set is equal to aggr group's maximum
2749 	 * allowable value, then there is nothing to change
2750 	 */
2751 	if (sdu == grp->lg_max_sdu)
2752 		return (0);
2753 
2754 	/* 0 is aggr group's min sdu */
2755 	if (sdu == 0)
2756 		return (EINVAL);
2757 
2758 	mtu = kmem_alloc(sizeof (uint32_t) * grp->lg_nports, KM_SLEEP);
2759 	for (port = grp->lg_ports, i = 0; port != NULL && err == 0;
2760 	    port = port->lp_next, i++) {
2761 		err = aggr_set_port_sdu(grp, port, sdu, mtu + i);
2762 	}
2763 	if (err != 0) {
2764 		/* recover from error: reset the mtus of the ports */
2765 		aggr_port_t *tmp;
2766 
2767 		for (tmp = grp->lg_ports, i = 0; tmp != port;
2768 		    tmp = tmp->lp_next, i++) {
2769 			(void) aggr_set_port_sdu(grp, tmp, *(mtu + i), NULL);
2770 		}
2771 		goto bail;
2772 	}
2773 	grp->lg_max_sdu = aggr_grp_max_sdu(grp);
2774 	rv = mac_maxsdu_update(grp->lg_mh, grp->lg_max_sdu);
2775 	ASSERT(rv == 0);
2776 bail:
2777 	kmem_free(mtu, sizeof (uint32_t) * grp->lg_nports);
2778 	return (err);
2779 }
2780 
2781 /*
2782  * Callback functions for set/get of properties
2783  */
2784 /*ARGSUSED*/
2785 static int
aggr_m_setprop(void * m_driver,const char * pr_name,mac_prop_id_t pr_num,uint_t pr_valsize,const void * pr_val)2786 aggr_m_setprop(void *m_driver, const char *pr_name, mac_prop_id_t pr_num,
2787     uint_t pr_valsize, const void *pr_val)
2788 {
2789 	int 		err = ENOTSUP;
2790 	aggr_grp_t 	*grp = m_driver;
2791 
2792 	switch (pr_num) {
2793 	case MAC_PROP_MTU: {
2794 		uint32_t 	mtu;
2795 
2796 		if (pr_valsize < sizeof (mtu)) {
2797 			err = EINVAL;
2798 			break;
2799 		}
2800 		bcopy(pr_val, &mtu, sizeof (mtu));
2801 		err = aggr_sdu_update(grp, mtu);
2802 		break;
2803 	}
2804 	default:
2805 		break;
2806 	}
2807 	return (err);
2808 }
2809 
/*
 * One boundary (endpoint) of an MTU range, as collected and sorted by
 * aggr_mtu_range_intersection().
 */
typedef struct rboundary {
	uint32_t	bval;	/* boundary value: a range's min or max */
	int		btype;	/* +1 for a min boundary, -1 for a max */
} rboundary_t;
2814 
2815 /*
2816  * This function finds the intersection of mtu ranges stored in arrays -
2817  * mrange[0] ... mrange[mcount -1]. It returns the intersection in rval.
2818  * Individual arrays are assumed to contain non-overlapping ranges.
2819  * Algorithm:
2820  *   A range has two boundaries - min and max. We scan all arrays and store
2821  * each boundary as a separate element in a temporary array. We also store
2822  * the boundary types, min or max, as +1 or -1 respectively in the temporary
2823  * array. Then we sort the temporary array in ascending order. We scan the
2824  * sorted array from lower to higher values and keep a cumulative sum of
2825  * boundary types. Element in the temporary array for which the sum reaches
2826  * mcount is a min boundary of a range in the result and next element will be
2827  * max boundary.
2828  *
2829  * Example for mcount = 3,
2830  *
2831  *  ----|_________|-------|_______|----|__|------ mrange[0]
2832  *
2833  *  -------|________|--|____________|-----|___|-- mrange[1]
2834  *
2835  *  --------|________________|-------|____|------ mrange[2]
2836  *
2837  *                                      3 2 1
2838  *                                       \|/
2839  *      1  23     2 1  2  3  2    1 01 2  V   0  <- the sum
2840  *  ----|--||-----|-|--|--|--|----|-||-|--|---|-- sorted array
2841  *
2842  *                                 same min and max
2843  *                                        V
2844  *  --------|_____|-------|__|------------|------ intersecting ranges
2845  */
2846 void
aggr_mtu_range_intersection(mac_propval_range_t ** mrange,int mcount,mac_propval_uint32_range_t ** prval,int * prmaxcnt,int * prcount)2847 aggr_mtu_range_intersection(mac_propval_range_t **mrange, int mcount,
2848     mac_propval_uint32_range_t **prval, int *prmaxcnt, int *prcount)
2849 {
2850 	mac_propval_uint32_range_t	*rval, *ur;
2851 	int				rmaxcnt, rcount;
2852 	size_t				sz_range32;
2853 	rboundary_t			*ta; /* temporary array */
2854 	rboundary_t			temp;
2855 	boolean_t			range_started = B_FALSE;
2856 	int				i, j, m, sum;
2857 
2858 	sz_range32 = sizeof (mac_propval_uint32_range_t);
2859 
2860 	for (i = 0, rmaxcnt = 0; i < mcount; i++)
2861 		rmaxcnt += mrange[i]->mpr_count;
2862 
2863 	/* Allocate enough space to store the results */
2864 	rval = kmem_alloc(rmaxcnt * sz_range32, KM_SLEEP);
2865 
2866 	/* Number of boundaries are twice as many as ranges */
2867 	ta = kmem_alloc(2 * rmaxcnt * sizeof (rboundary_t), KM_SLEEP);
2868 
2869 	for (i = 0, m = 0; i < mcount; i++) {
2870 		ur = &(mrange[i]->mpr_range_uint32[0]);
2871 		for (j = 0; j < mrange[i]->mpr_count; j++) {
2872 			ta[m].bval = ur[j].mpur_min;
2873 			ta[m++].btype = 1;
2874 			ta[m].bval = ur[j].mpur_max;
2875 			ta[m++].btype = -1;
2876 		}
2877 	}
2878 
2879 	/*
2880 	 * Sort the temporary array in ascending order of bval;
2881 	 * if boundary values are same then sort on btype.
2882 	 */
2883 	for (i = 0; i < m-1; i++) {
2884 		for (j = i+1; j < m; j++) {
2885 			if ((ta[i].bval > ta[j].bval) ||
2886 			    ((ta[i].bval == ta[j].bval) &&
2887 			    (ta[i].btype < ta[j].btype))) {
2888 				temp = ta[i];
2889 				ta[i] = ta[j];
2890 				ta[j] = temp;
2891 			}
2892 		}
2893 	}
2894 
2895 	/* Walk through temporary array to find all ranges in the results */
2896 	for (i = 0, sum = 0, rcount = 0; i < m; i++) {
2897 		sum += ta[i].btype;
2898 		if (sum == mcount) {
2899 			rval[rcount].mpur_min = ta[i].bval;
2900 			range_started = B_TRUE;
2901 		} else if (sum < mcount && range_started) {
2902 			rval[rcount++].mpur_max = ta[i].bval;
2903 			range_started = B_FALSE;
2904 		}
2905 	}
2906 
2907 	*prval = rval;
2908 	*prmaxcnt = rmaxcnt;
2909 	*prcount = rcount;
2910 
2911 	kmem_free(ta, 2 * rmaxcnt * sizeof (rboundary_t));
2912 }
2913 
2914 /*
2915  * Returns the mtu ranges which could be supported by aggr group.
2916  * prmaxcnt returns the size of the buffer prval, prcount returns
2917  * the number of valid entries in prval. Caller is responsible
2918  * for freeing up prval.
2919  */
2920 int
aggr_grp_possible_mtu_range(aggr_grp_t * grp,mac_propval_uint32_range_t ** prval,int * prmaxcnt,int * prcount)2921 aggr_grp_possible_mtu_range(aggr_grp_t *grp, mac_propval_uint32_range_t **prval,
2922     int *prmaxcnt, int *prcount)
2923 {
2924 	mac_propval_range_t		**vals;
2925 	aggr_port_t			*port;
2926 	mac_perim_handle_t		mph;
2927 	uint_t 				i, numr;
2928 	int 				err = 0;
2929 	size_t				sz_propval, sz_range32;
2930 	size_t				size;
2931 
2932 	sz_propval = sizeof (mac_propval_range_t);
2933 	sz_range32 = sizeof (mac_propval_uint32_range_t);
2934 
2935 	ASSERT(MAC_PERIM_HELD(grp->lg_mh));
2936 
2937 	vals = kmem_zalloc(sizeof (mac_propval_range_t *) * grp->lg_nports,
2938 	    KM_SLEEP);
2939 
2940 	for (port = grp->lg_ports, i = 0; port != NULL;
2941 	    port = port->lp_next, i++) {
2942 
2943 		size = sz_propval;
2944 		vals[i] = kmem_alloc(size, KM_SLEEP);
2945 		vals[i]->mpr_count = 1;
2946 
2947 		mac_perim_enter_by_mh(port->lp_mh, &mph);
2948 
2949 		err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL,
2950 		    NULL, 0, vals[i], NULL);
2951 		if (err == ENOSPC) {
2952 			/*
2953 			 * Not enough space to hold all ranges.
2954 			 * Allocate extra space as indicated and retry.
2955 			 */
2956 			numr = vals[i]->mpr_count;
2957 			kmem_free(vals[i], sz_propval);
2958 			size = sz_propval + (numr - 1) * sz_range32;
2959 			vals[i] = kmem_alloc(size, KM_SLEEP);
2960 			vals[i]->mpr_count = numr;
2961 			err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL,
2962 			    NULL, 0, vals[i], NULL);
2963 			ASSERT(err != ENOSPC);
2964 		}
2965 		mac_perim_exit(mph);
2966 		if (err != 0) {
2967 			kmem_free(vals[i], size);
2968 			vals[i] = NULL;
2969 			break;
2970 		}
2971 	}
2972 
2973 	/*
2974 	 * if any of the underlying ports does not support changing MTU then
2975 	 * just return ENOTSUP
2976 	 */
2977 	if (port != NULL) {
2978 		ASSERT(err != 0);
2979 		goto done;
2980 	}
2981 
2982 	aggr_mtu_range_intersection(vals, grp->lg_nports, prval, prmaxcnt,
2983 	    prcount);
2984 
2985 done:
2986 	for (i = 0; i < grp->lg_nports; i++) {
2987 		if (vals[i] != NULL) {
2988 			numr = vals[i]->mpr_count;
2989 			size = sz_propval + (numr - 1) * sz_range32;
2990 			kmem_free(vals[i], size);
2991 		}
2992 	}
2993 
2994 	kmem_free(vals, sizeof (mac_propval_range_t *) * grp->lg_nports);
2995 	return (err);
2996 }
2997 
2998 static void
aggr_m_propinfo(void * m_driver,const char * pr_name,mac_prop_id_t pr_num,mac_prop_info_handle_t prh)2999 aggr_m_propinfo(void *m_driver, const char *pr_name, mac_prop_id_t pr_num,
3000     mac_prop_info_handle_t prh)
3001 {
3002 	aggr_grp_t			*grp = m_driver;
3003 	mac_propval_uint32_range_t	*rval = NULL;
3004 	int				i, rcount, rmaxcnt;
3005 	int				err = 0;
3006 
3007 	_NOTE(ARGUNUSED(pr_name));
3008 
3009 	switch (pr_num) {
3010 	case MAC_PROP_MTU:
3011 
3012 		err = aggr_grp_possible_mtu_range(grp, &rval, &rmaxcnt,
3013 		    &rcount);
3014 		if (err != 0) {
3015 			ASSERT(rval == NULL);
3016 			return;
3017 		}
3018 		for (i = 0; i < rcount; i++) {
3019 			mac_prop_info_set_range_uint32(prh,
3020 			    rval[i].mpur_min, rval[i].mpur_max);
3021 		}
3022 		kmem_free(rval, sizeof (mac_propval_uint32_range_t) * rmaxcnt);
3023 		break;
3024 	}
3025 }
3026