1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2020 Joyent, Inc.
24 * Copyright 2020 RackTop Systems, Inc.
25 * Copyright 2024 MNX Cloud, Inc.
26 */
27
28 /*
29 * IEEE 802.3ad Link Aggregation -- Link Aggregation Groups.
30 *
31 * An instance of the structure aggr_grp_t is allocated for each
32 * link aggregation group. When created, aggr_grp_t objects are
33 * entered into the aggr_grp_hash hash table maintained by the modhash
34 * module. The hash key is the linkid associated with the link
35 * aggregation group.
36 *
37 * Each aggregation contains a set of ports. The port is represented
38 * by the aggr_port_t structure. A port consists of a single MAC
39 * client which has exclusive (MCIS_EXCLUSIVE) use of the underlying
40 * MAC. This client is used by the aggr to send and receive LACP
41 * traffic. Each port client takes on the same MAC unicast address --
42 * the address of the aggregation itself (taken from the first port by
43 * default).
44 *
45 * The MAC client that hangs off each aggr port is not your typical
46 * MAC client. Not only does it have exclusive control of the MAC, but
47 * it also has no Tx or Rx SRSes. An SRS is designed to queue and
48 * fanout traffic among L4 protocols; but the aggr is an intermediary,
49 * not a consumer. Instead of using SRSes, the aggr puts the
50 * underlying hardware rings into passthru mode and ships packets up
51 * via a direct call to aggr_recv_cb(). This allows aggr to enforce
52 * LACP while passing all other traffic up to clients of the aggr.
53 *
54 * Pseudo Rx Groups and Rings
55 * --------------------------
56 *
57 * It is imperative for client performance that the aggr provide as
58 * many MAC groups as possible. In order to use the underlying HW
59 * resources, aggr creates pseudo groups to aggregate the underlying
60 * HW groups. Every HW group gets mapped to a pseudo group; and every
61 * HW ring in that group gets mapped to a pseudo ring. The pseudo
62 * group at index 0 combines all the HW groups at index 0 from each
63 * port, etc. The aggr's MAC then creates normal MAC groups and rings
64 * out of these pseudo groups and rings to present to the aggr's
65 * clients. To the clients, the aggr's groups and rings are absolutely
66 * no different than a NIC's groups or rings.
67 *
68 * Pseudo Tx Rings
69 * ---------------
70 *
71 * The underlying ports (NICs) in an aggregation can have Tx rings. To
72 * enhance aggr's performance, these Tx rings are made available to
 * the aggr layer as pseudo Tx rings. The concept of pseudo rings is
 * not new. They are already present and implemented on the Rx side.
75 * The same concept is extended to the Tx side where each Tx ring of
76 * an underlying port is reflected in aggr as a pseudo Tx ring. Thus
77 * each pseudo Tx ring will map to a specific hardware Tx ring. Even
78 * in the case of a NIC that does not have a Tx ring, a pseudo Tx ring
79 * is given to the aggregation layer.
80 *
81 * With this change, the outgoing stack depth looks much better:
82 *
83 * mac_tx() -> mac_tx_aggr_mode() -> mac_tx_soft_ring_process() ->
 * mac_tx_send() -> aggr_ring_tx() -> <driver>_ring_tx()
85 *
86 * Two new modes are introduced to mac_tx() to handle aggr pseudo Tx rings:
87 * SRS_TX_AGGR and SRS_TX_BW_AGGR.
88 *
89 * In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. This routine
90 * invokes an aggr function, aggr_find_tx_ring(), to find a (pseudo) Tx
91 * ring belonging to a port on which the packet has to be sent.
92 * aggr_find_tx_ring() first finds the outgoing port based on L2/L3/L4
93 * policy and then uses the fanout_hint passed to it to pick a Tx ring from
94 * the selected port.
95 *
96 * In SRS_TX_BW_AGGR mode, mac_tx_bw_mode() function is called where
97 * bandwidth limit is applied first on the outgoing packet and the packets
98 * allowed to go out would call mac_tx_aggr_mode() to send the packet on a
99 * particular Tx ring.
100 */
101
102 #include <sys/types.h>
103 #include <sys/sysmacros.h>
104 #include <sys/conf.h>
105 #include <sys/cmn_err.h>
106 #include <sys/disp.h>
107 #include <sys/list.h>
108 #include <sys/ksynch.h>
109 #include <sys/kmem.h>
110 #include <sys/stream.h>
111 #include <sys/modctl.h>
112 #include <sys/ddi.h>
113 #include <sys/sunddi.h>
114 #include <sys/atomic.h>
115 #include <sys/stat.h>
116 #include <sys/modhash.h>
117 #include <sys/id_space.h>
118 #include <sys/strsun.h>
119 #include <sys/cred.h>
120 #include <sys/dlpi.h>
121 #include <sys/zone.h>
122 #include <sys/mac_provider.h>
123 #include <sys/dls.h>
124 #include <sys/vlan.h>
125 #include <sys/aggr.h>
126 #include <sys/aggr_impl.h>
127
/*
 * Driver entry points (aggr_m_*). All but aggr_m_unicst() are wired
 * into the aggr_m_callbacks table defined in this file.
 */
static int aggr_m_start(void *);
static void aggr_m_stop(void *);
static int aggr_m_promisc(void *, boolean_t);
static int aggr_m_multicst(void *, boolean_t, const uint8_t *);
static int aggr_m_unicst(void *, const uint8_t *);
static int aggr_m_stat(void *, uint_t, uint64_t *);
static void aggr_m_ioctl(void *, queue_t *, mblk_t *);
static boolean_t aggr_m_capab_get(void *, mac_capab_t, void *);
static int aggr_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
    const void *);
static void aggr_m_propinfo(void *, const char *, mac_prop_id_t,
    mac_prop_info_handle_t);

/* Group port-list helpers. */
static aggr_port_t *aggr_grp_port_lookup(aggr_grp_t *, datalink_id_t);
static int aggr_grp_rem_port(aggr_grp_t *, aggr_port_t *, boolean_t *,
    boolean_t *);

/* Group capability, SDU and margin helpers. */
static void aggr_grp_capab_set(aggr_grp_t *);
static boolean_t aggr_grp_capab_check(aggr_grp_t *, aggr_port_t *);
static uint_t aggr_grp_max_sdu(aggr_grp_t *);
static uint32_t aggr_grp_max_margin(aggr_grp_t *);
static boolean_t aggr_grp_sdu_check(aggr_grp_t *, aggr_port_t *);
static boolean_t aggr_grp_margin_check(aggr_grp_t *, aggr_port_t *);

/* Pseudo group/ring helpers and the ring/group level callbacks. */
static int aggr_add_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
static void aggr_rem_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
static int aggr_pseudo_disable_intr(mac_intr_handle_t);
static int aggr_pseudo_enable_intr(mac_intr_handle_t);
static int aggr_pseudo_start_rx_ring(mac_ring_driver_t, uint64_t);
static void aggr_pseudo_stop_rx_ring(mac_ring_driver_t);
static int aggr_addmac(void *, const uint8_t *);
static int aggr_remmac(void *, const uint8_t *);
static int aggr_addvlan(mac_group_driver_t, uint16_t);
static int aggr_remvlan(mac_group_driver_t, uint16_t);
static mblk_t *aggr_rx_poll(void *, int);
static void aggr_fill_ring(void *, mac_ring_type_t, const int,
    const int, mac_ring_info_t *, mac_ring_handle_t);
static void aggr_fill_group(void *, mac_ring_type_t, const int,
    mac_group_info_t *, mac_group_handle_t);
167
/* kmem cache backing aggr_grp_t allocations; created in aggr_grp_init(). */
static kmem_cache_t *aggr_grp_cache;
/* ID hash of groups; created in aggr_grp_init(). */
static mod_hash_t *aggr_grp_hash;
/* Reader/writer lock; aggr_grp_count() takes it as reader around aggr_grp_cnt. */
static krwlock_t aggr_grp_lock;
/* Group counter, reset to 0 in aggr_grp_init(). */
static uint_t aggr_grp_cnt;
/* ID space used to hand out key values when none is specified. */
static id_space_t *key_ids;
173
/* Number of buckets requested for aggr_grp_hash. */
#define	GRP_HASHSZ		64

/*
 * Convert a linkid into a modhash key. The argument is parenthesized so
 * that an expression argument (e.g. "id - 1") is evaluated as a whole
 * before it is widened to uintptr_t, rather than having the cast bind to
 * its first operand only.
 */
#define	GRP_HASH_KEY(linkid)	((mod_hash_key_t)(uintptr_t)(linkid))

#define	AGGR_PORT_NAME_DELIMIT	'-'
177
/*
 * Six zero bytes, i.e. an all-zero Ethernet address.
 * NOTE(review): presumably compared against to detect an unset MAC
 * address; its users are outside this part of the file -- confirm.
 */
static uchar_t aggr_zero_mac[] = {0, 0, 0, 0, 0, 0};
179
180 #define AGGR_M_CALLBACK_FLAGS \
181 (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_PROPINFO)
182
183 static mac_callbacks_t aggr_m_callbacks = {
184 AGGR_M_CALLBACK_FLAGS,
185 aggr_m_stat,
186 aggr_m_start,
187 aggr_m_stop,
188 aggr_m_promisc,
189 aggr_m_multicst,
190 NULL,
191 NULL,
192 NULL,
193 aggr_m_ioctl,
194 aggr_m_capab_get,
195 NULL,
196 NULL,
197 aggr_m_setprop,
198 NULL,
199 aggr_m_propinfo
200 };
201
202 /*ARGSUSED*/
203 static int
aggr_grp_constructor(void * buf,void * arg,int kmflag)204 aggr_grp_constructor(void *buf, void *arg, int kmflag)
205 {
206 aggr_grp_t *grp = buf;
207
208 bzero(grp, sizeof (*grp));
209 mutex_init(&grp->lg_lacp_lock, NULL, MUTEX_DEFAULT, NULL);
210 cv_init(&grp->lg_lacp_cv, NULL, CV_DEFAULT, NULL);
211 rw_init(&grp->lg_tx_lock, NULL, RW_DRIVER, NULL);
212 mutex_init(&grp->lg_port_lock, NULL, MUTEX_DEFAULT, NULL);
213 cv_init(&grp->lg_port_cv, NULL, CV_DEFAULT, NULL);
214 mutex_init(&grp->lg_tx_flowctl_lock, NULL, MUTEX_DEFAULT, NULL);
215 cv_init(&grp->lg_tx_flowctl_cv, NULL, CV_DEFAULT, NULL);
216 grp->lg_link_state = LINK_STATE_UNKNOWN;
217 return (0);
218 }
219
220 /*ARGSUSED*/
221 static void
aggr_grp_destructor(void * buf,void * arg)222 aggr_grp_destructor(void *buf, void *arg)
223 {
224 aggr_grp_t *grp = buf;
225
226 if (grp->lg_tx_ports != NULL) {
227 kmem_free(grp->lg_tx_ports,
228 grp->lg_tx_ports_size * sizeof (aggr_port_t *));
229 }
230
231 mutex_destroy(&grp->lg_lacp_lock);
232 cv_destroy(&grp->lg_lacp_cv);
233 mutex_destroy(&grp->lg_port_lock);
234 cv_destroy(&grp->lg_port_cv);
235 rw_destroy(&grp->lg_tx_lock);
236 mutex_destroy(&grp->lg_tx_flowctl_lock);
237 cv_destroy(&grp->lg_tx_flowctl_cv);
238 }
239
240 void
aggr_grp_init(void)241 aggr_grp_init(void)
242 {
243 aggr_grp_cache = kmem_cache_create("aggr_grp_cache",
244 sizeof (aggr_grp_t), 0, aggr_grp_constructor,
245 aggr_grp_destructor, NULL, NULL, NULL, 0);
246
247 aggr_grp_hash = mod_hash_create_idhash("aggr_grp_hash",
248 GRP_HASHSZ, mod_hash_null_valdtor);
249 rw_init(&aggr_grp_lock, NULL, RW_DEFAULT, NULL);
250 aggr_grp_cnt = 0;
251
252 /*
253 * Allocate an id space to manage key values (when key is not
254 * specified). The range of the id space will be from
255 * (AGGR_MAX_KEY + 1) to UINT16_MAX, because the LACP protocol
256 * uses a 16-bit key.
257 */
258 key_ids = id_space_create("aggr_key_ids", AGGR_MAX_KEY + 1, UINT16_MAX);
259 ASSERT(key_ids != NULL);
260 }
261
262 void
aggr_grp_fini(void)263 aggr_grp_fini(void)
264 {
265 id_space_destroy(key_ids);
266 rw_destroy(&aggr_grp_lock);
267 mod_hash_destroy_idhash(aggr_grp_hash);
268 kmem_cache_destroy(aggr_grp_cache);
269 }
270
271 uint_t
aggr_grp_count(void)272 aggr_grp_count(void)
273 {
274 uint_t count;
275
276 rw_enter(&aggr_grp_lock, RW_READER);
277 count = aggr_grp_cnt;
278 rw_exit(&aggr_grp_lock);
279 return (count);
280 }
281
282 /*
283 * Since both aggr_port_notify_cb() and aggr_port_timer_thread() functions
284 * requires the mac perimeter, this function holds a reference of the aggr
285 * and aggr won't call mac_unregister() until this reference drops to 0.
286 */
287 void
aggr_grp_port_hold(aggr_port_t * port)288 aggr_grp_port_hold(aggr_port_t *port)
289 {
290 aggr_grp_t *grp = port->lp_grp;
291
292 AGGR_PORT_REFHOLD(port);
293 mutex_enter(&grp->lg_port_lock);
294 grp->lg_port_ref++;
295 mutex_exit(&grp->lg_port_lock);
296 }
297
298 /*
299 * Release the reference of the grp and inform aggr_grp_delete() calling
300 * mac_unregister() is now safe.
301 */
302 void
aggr_grp_port_rele(aggr_port_t * port)303 aggr_grp_port_rele(aggr_port_t *port)
304 {
305 aggr_grp_t *grp = port->lp_grp;
306
307 mutex_enter(&grp->lg_port_lock);
308 if (--grp->lg_port_ref == 0)
309 cv_signal(&grp->lg_port_cv);
310 mutex_exit(&grp->lg_port_lock);
311 AGGR_PORT_REFRELE(port);
312 }
313
314 /*
315 * Wait for the port's lacp timer thread and the port's notification callback
316 * to exit.
317 */
318 void
aggr_grp_port_wait(aggr_grp_t * grp)319 aggr_grp_port_wait(aggr_grp_t *grp)
320 {
321 mutex_enter(&grp->lg_port_lock);
322 if (grp->lg_port_ref != 0)
323 cv_wait(&grp->lg_port_cv, &grp->lg_port_lock);
324 mutex_exit(&grp->lg_port_lock);
325 }
326
327 /*
328 * Attach a port to a link aggregation group.
329 *
330 * A port is attached to a link aggregation group once its speed
331 * and link state have been verified.
332 *
333 * Returns B_TRUE if the group link state or speed has changed. If
334 * it's the case, the caller must notify the MAC layer via a call
335 * to mac_link().
336 */
337 boolean_t
aggr_grp_attach_port(aggr_grp_t * grp,aggr_port_t * port)338 aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port)
339 {
340 boolean_t link_state_changed = B_FALSE;
341
342 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
343 ASSERT(MAC_PERIM_HELD(port->lp_mh));
344
345 if (port->lp_state == AGGR_PORT_STATE_ATTACHED)
346 return (B_FALSE);
347
348 /*
349 * Validate the MAC port link speed and update the group
350 * link speed if needed.
351 */
352 if (port->lp_ifspeed == 0 ||
353 port->lp_link_state != LINK_STATE_UP ||
354 port->lp_link_duplex != LINK_DUPLEX_FULL) {
355 /*
356 * Can't attach a MAC port with unknown link speed,
357 * down link, or not in full duplex mode.
358 */
359 return (B_FALSE);
360 }
361
362 mutex_enter(&grp->lg_stat_lock);
363 if (grp->lg_ifspeed == 0) {
364 /*
365 * The group inherits the speed of the first link being
366 * attached.
367 */
368 grp->lg_ifspeed = port->lp_ifspeed;
369 link_state_changed = B_TRUE;
370 } else if (grp->lg_ifspeed != port->lp_ifspeed) {
371 /*
372 * The link speed of the MAC port must be the same as
373 * the group link speed, as per 802.3ad. Since it is
374 * not, the attach is cancelled.
375 */
376 mutex_exit(&grp->lg_stat_lock);
377 return (B_FALSE);
378 }
379 mutex_exit(&grp->lg_stat_lock);
380
381 grp->lg_nattached_ports++;
382
383 /*
384 * Update the group link state.
385 */
386 if (grp->lg_link_state != LINK_STATE_UP) {
387 grp->lg_link_state = LINK_STATE_UP;
388 mutex_enter(&grp->lg_stat_lock);
389 grp->lg_link_duplex = LINK_DUPLEX_FULL;
390 mutex_exit(&grp->lg_stat_lock);
391 link_state_changed = B_TRUE;
392 }
393
394 /*
395 * Update port's state.
396 */
397 port->lp_state = AGGR_PORT_STATE_ATTACHED;
398
399 aggr_grp_multicst_port(port, B_TRUE);
400
401 /*
402 * The port client doesn't have an Rx SRS; instead of calling
403 * mac_rx_set() we set the client's flow callback directly.
404 * This datapath is used only when the port's driver doesn't
405 * support MAC_CAPAB_RINGS. Drivers with ring support will
406 * deliver traffic to the aggr via ring passthru.
407 */
408 mac_client_set_flow_cb(port->lp_mch, aggr_recv_cb, port);
409
410 /*
411 * If LACP is OFF, the port can be used to send data as soon
412 * as its link is up and verified to be compatible with the
413 * aggregation.
414 *
415 * If LACP is active or passive, notify the LACP subsystem, which
416 * will enable sending on the port following the LACP protocol.
417 */
418 if (grp->lg_lacp_mode == AGGR_LACP_OFF)
419 aggr_send_port_enable(port);
420 else
421 aggr_lacp_port_attached(port);
422
423 return (link_state_changed);
424 }
425
426 boolean_t
aggr_grp_detach_port(aggr_grp_t * grp,aggr_port_t * port)427 aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port)
428 {
429 boolean_t link_state_changed = B_FALSE;
430
431 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
432 ASSERT(MAC_PERIM_HELD(port->lp_mh));
433
434 /* update state */
435 if (port->lp_state != AGGR_PORT_STATE_ATTACHED)
436 return (B_FALSE);
437
438 mac_client_clear_flow_cb(port->lp_mch);
439
440 aggr_grp_multicst_port(port, B_FALSE);
441
442 if (grp->lg_lacp_mode == AGGR_LACP_OFF)
443 aggr_send_port_disable(port);
444 else
445 aggr_lacp_port_detached(port);
446
447 port->lp_state = AGGR_PORT_STATE_STANDBY;
448
449 grp->lg_nattached_ports--;
450 if (grp->lg_nattached_ports == 0) {
451 /* the last attached MAC port of the group is being detached */
452 grp->lg_link_state = LINK_STATE_DOWN;
453 mutex_enter(&grp->lg_stat_lock);
454 grp->lg_ifspeed = 0;
455 grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
456 mutex_exit(&grp->lg_stat_lock);
457 link_state_changed = B_TRUE;
458 }
459
460 return (link_state_changed);
461 }
462
463 /*
464 * Update the MAC addresses of the constituent ports of the specified
465 * group. This function is invoked:
466 * - after creating a new aggregation group.
467 * - after adding new ports to an aggregation group.
468 * - after removing a port from a group when the MAC address of
469 * that port was used for the MAC address of the group.
470 * - after the MAC address of a port changed when the MAC address
471 * of that port was used for the MAC address of the group.
472 *
473 * Return true if the link state of the aggregation changed, for example
474 * as a result of a failure changing the MAC address of one of the
475 * constituent ports.
476 */
477 boolean_t
aggr_grp_update_ports_mac(aggr_grp_t * grp)478 aggr_grp_update_ports_mac(aggr_grp_t *grp)
479 {
480 aggr_port_t *cport;
481 boolean_t link_state_changed = B_FALSE;
482 mac_perim_handle_t mph;
483
484 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
485
486 for (cport = grp->lg_ports; cport != NULL;
487 cport = cport->lp_next) {
488 mac_perim_enter_by_mh(cport->lp_mh, &mph);
489 if (aggr_port_unicst(cport) != 0) {
490 if (aggr_grp_detach_port(grp, cport))
491 link_state_changed = B_TRUE;
492 } else {
493 /*
494 * If a port was detached because of a previous
495 * failure changing the MAC address, the port is
496 * reattached when it successfully changes the MAC
497 * address now, and this might cause the link state
498 * of the aggregation to change.
499 */
500 if (aggr_grp_attach_port(grp, cport))
501 link_state_changed = B_TRUE;
502 }
503 mac_perim_exit(mph);
504 }
505 return (link_state_changed);
506 }
507
508 /*
509 * Invoked when the MAC address of a port has changed. If the port's
510 * MAC address was used for the group MAC address, set mac_addr_changedp
511 * to B_TRUE to indicate to the caller that it should send a MAC_NOTE_UNICST
512 * notification. If the link state changes due to detach/attach of
513 * the constituent port, set link_state_changedp to B_TRUE to indicate
514 * to the caller that it should send a MAC_NOTE_LINK notification. In both
515 * cases, it is the responsibility of the caller to invoke notification
516 * functions after releasing the the port lock.
517 */
518 void
aggr_grp_port_mac_changed(aggr_grp_t * grp,aggr_port_t * port,boolean_t * mac_addr_changedp,boolean_t * link_state_changedp)519 aggr_grp_port_mac_changed(aggr_grp_t *grp, aggr_port_t *port,
520 boolean_t *mac_addr_changedp, boolean_t *link_state_changedp)
521 {
522 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
523 ASSERT(MAC_PERIM_HELD(port->lp_mh));
524 ASSERT(mac_addr_changedp != NULL);
525 ASSERT(link_state_changedp != NULL);
526
527 *mac_addr_changedp = B_FALSE;
528 *link_state_changedp = B_FALSE;
529
530 if (grp->lg_addr_fixed) {
531 /*
532 * The group is using a fixed MAC address or an automatic
533 * MAC address has not been set.
534 */
535 return;
536 }
537
538 if (grp->lg_mac_addr_port == port) {
539 /*
540 * The MAC address of the port was assigned to the group
541 * MAC address. Update the group MAC address.
542 */
543 bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL);
544 *mac_addr_changedp = B_TRUE;
545 } else {
546 /*
547 * Update the actual port MAC address to the MAC address
548 * of the group.
549 */
550 if (aggr_port_unicst(port) != 0) {
551 *link_state_changedp = aggr_grp_detach_port(grp, port);
552 } else {
553 /*
554 * If a port was detached because of a previous
555 * failure changing the MAC address, the port is
556 * reattached when it successfully changes the MAC
557 * address now, and this might cause the link state
558 * of the aggregation to change.
559 */
560 *link_state_changedp = aggr_grp_attach_port(grp, port);
561 }
562 }
563 }
564
565 /*
566 * Add a port to a link aggregation group.
567 */
568 static int
aggr_grp_add_port(aggr_grp_t * grp,datalink_id_t port_linkid,boolean_t force,aggr_port_t ** pp)569 aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t port_linkid, boolean_t force,
570 aggr_port_t **pp)
571 {
572 aggr_port_t *port, **cport;
573 mac_perim_handle_t mph;
574 zoneid_t port_zoneid = ALL_ZONES;
575 int err;
576
577 /* The port must be in the same zone as the aggregation. */
578 if (zone_check_datalink(&port_zoneid, port_linkid) != 0)
579 port_zoneid = GLOBAL_ZONEID;
580 if (grp->lg_zoneid != port_zoneid)
581 return (EBUSY);
582
583 /*
584 * If we are creating the aggr, then there is no MAC handle
585 * and thus no perimeter to hold. If we are adding a port to
586 * an existing aggr, then the perimiter of the aggr's MAC must
587 * be held.
588 */
589 ASSERT(grp->lg_mh == NULL || MAC_PERIM_HELD(grp->lg_mh));
590
591 err = aggr_port_create(grp, port_linkid, force, &port);
592 if (err != 0)
593 return (err);
594
595 mac_perim_enter_by_mh(port->lp_mh, &mph);
596
597 /* Add the new port to the end of the list. */
598 cport = &grp->lg_ports;
599 while (*cport != NULL)
600 cport = &((*cport)->lp_next);
601 *cport = port;
602
603 /*
604 * Back reference to the group it is member of. A port always
605 * holds a reference to its group to ensure that the back
606 * reference is always valid.
607 */
608 port->lp_grp = grp;
609 AGGR_GRP_REFHOLD(grp);
610 grp->lg_nports++;
611 if (grp->lg_nports > grp->lg_nports_high)
612 grp->lg_nports_high = grp->lg_nports;
613
614 aggr_lacp_init_port(port);
615 mac_perim_exit(mph);
616
617 if (pp != NULL)
618 *pp = port;
619
620 return (0);
621 }
622
623 /*
624 * This is called when the 'lg_tx_ports' arrangement has changed and
625 * we need to update the corresponding 'mi_default_tx_ring'. This
626 * happens for several reasons.
627 *
628 * - A pseudo TX mac group was added or removed.
629 * - An LACP message has changed the port's state.
630 * - A link event has changed the port's state.
631 *
632 * In any case, we see if there is at least one port enabled (see
633 * 'aggr_send_port_enable()'), and if so we use its first ring as the
634 * mac's default TX ring.
635 *
636 * Note, because we only have a single TX group, we don't have to
637 * worry about the rings moving between groups and the chance that mac
638 * will reassign it unless someone removes a port, at which point, we
639 * play it safe and call this again.
640 */
641 void
aggr_grp_update_default(aggr_grp_t * grp)642 aggr_grp_update_default(aggr_grp_t *grp)
643 {
644 aggr_port_t *port;
645 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
646
647 rw_enter(&grp->lg_tx_lock, RW_WRITER);
648
649 if (grp->lg_ntx_ports == 0) {
650 rw_exit(&grp->lg_tx_lock);
651 return;
652 }
653
654 port = grp->lg_tx_ports[0];
655 ASSERT(port->lp_tx_ring_cnt > 0);
656 mac_hwring_set_default(grp->lg_mh, port->lp_pseudo_tx_rings[0]);
657 rw_exit(&grp->lg_tx_lock);
658 }
659
660 /*
661 * Add a pseudo RX ring for the given HW ring handle.
662 */
663 static int
aggr_add_pseudo_rx_ring(aggr_port_t * port,aggr_pseudo_rx_group_t * rx_grp,mac_ring_handle_t hw_rh)664 aggr_add_pseudo_rx_ring(aggr_port_t *port,
665 aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
666 {
667 aggr_pseudo_rx_ring_t *ring;
668 int err;
669 int j;
670
671 for (j = 0; j < MAX_RINGS_PER_GROUP; j++) {
672 ring = rx_grp->arg_rings + j;
673 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE))
674 break;
675 }
676
677 /*
678 * No slot for this new RX ring.
679 */
680 if (j == MAX_RINGS_PER_GROUP)
681 return (ENOSPC);
682
683 ring->arr_flags |= MAC_PSEUDO_RING_INUSE;
684 ring->arr_hw_rh = hw_rh;
685 ring->arr_port = port;
686 ring->arr_grp = rx_grp;
687 rx_grp->arg_ring_cnt++;
688
689 /*
690 * The group is already registered, dynamically add a new ring to the
691 * mac group.
692 */
693 if ((err = mac_group_add_ring(rx_grp->arg_gh, j)) != 0) {
694 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
695 ring->arr_hw_rh = NULL;
696 ring->arr_port = NULL;
697 ring->arr_grp = NULL;
698 rx_grp->arg_ring_cnt--;
699 } else {
700 /*
701 * This must run after the MAC is registered.
702 */
703 ASSERT3P(ring->arr_rh, !=, NULL);
704 mac_hwring_set_passthru(hw_rh, (mac_rx_t)aggr_recv_cb,
705 (void *)port, (mac_resource_handle_t)ring);
706 }
707 return (err);
708 }
709
710 /*
711 * Remove the pseudo RX ring of the given HW ring handle.
712 */
713 static void
aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t * rx_grp,mac_ring_handle_t hw_rh)714 aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
715 {
716 for (uint_t j = 0; j < MAX_RINGS_PER_GROUP; j++) {
717 aggr_pseudo_rx_ring_t *ring = rx_grp->arg_rings + j;
718
719 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE) ||
720 ring->arr_hw_rh != hw_rh) {
721 continue;
722 }
723
724 mac_group_rem_ring(rx_grp->arg_gh, ring->arr_rh);
725
726 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
727 ring->arr_hw_rh = NULL;
728 ring->arr_port = NULL;
729 ring->arr_grp = NULL;
730 rx_grp->arg_ring_cnt--;
731 mac_hwring_clear_passthru(hw_rh);
732 break;
733 }
734 }
735
736 /*
737 * Create pseudo rings over the HW rings of the port.
738 *
739 * o Create a pseudo ring in rx_grp per HW ring in the port's HW group.
740 *
741 * o Program existing unicast filters on the pseudo group into the HW group.
742 *
743 * o Program existing VLAN filters on the pseudo group into the HW group.
744 */
745 static int
aggr_add_pseudo_rx_group(aggr_port_t * port,aggr_pseudo_rx_group_t * rx_grp)746 aggr_add_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
747 {
748 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP];
749 aggr_unicst_addr_t *addr, *a;
750 mac_perim_handle_t pmph;
751 aggr_vlan_t *avp;
752 uint_t hw_rh_cnt, i;
753 int err = 0;
754 uint_t g_idx = rx_grp->arg_index;
755
756 ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh));
757 ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT);
758 mac_perim_enter_by_mh(port->lp_mh, &pmph);
759
760 i = 0;
761 addr = NULL;
762 /*
763 * This function must be called after the aggr registers its
764 * MAC and its Rx groups have been initialized.
765 */
766 ASSERT(rx_grp->arg_gh != NULL);
767
768 /*
769 * Get the list of the underlying HW rings.
770 */
771 hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx,
772 &port->lp_hwghs[g_idx], hw_rh, MAC_RING_TYPE_RX);
773
774 /*
775 * Add existing VLAN and unicast address filters to the port.
776 */
777 for (avp = list_head(&rx_grp->arg_vlans); avp != NULL;
778 avp = list_next(&rx_grp->arg_vlans, avp)) {
779 if ((err = aggr_port_addvlan(port, g_idx, avp->av_vid)) != 0)
780 goto err;
781 }
782
783 for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) {
784 if ((err = aggr_port_addmac(port, g_idx, addr->aua_addr)) != 0)
785 goto err;
786 }
787
788 for (i = 0; i < hw_rh_cnt; i++) {
789 err = aggr_add_pseudo_rx_ring(port, rx_grp, hw_rh[i]);
790 if (err != 0)
791 goto err;
792 }
793
794 mac_perim_exit(pmph);
795 return (0);
796
797 err:
798 ASSERT(err != 0);
799
800 for (uint_t j = 0; j < i; j++)
801 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]);
802
803 for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next)
804 aggr_port_remmac(port, g_idx, a->aua_addr);
805
806 if (avp != NULL)
807 avp = list_prev(&rx_grp->arg_vlans, avp);
808
809 for (; avp != NULL; avp = list_prev(&rx_grp->arg_vlans, avp)) {
810 int err2;
811
812 if ((err2 = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) {
813 cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s"
814 ": errno %d.", avp->av_vid,
815 mac_client_name(port->lp_mch), err2);
816 }
817 }
818
819 port->lp_hwghs[g_idx] = NULL;
820 mac_perim_exit(pmph);
821 return (err);
822 }
823
824 /*
825 * Destroy the pseudo rings mapping to this port and remove all VLAN
826 * and unicast filters from this port. Even if there are no underlying
827 * HW rings we must still remove the unicast filters to take the port
828 * out of promisc mode.
829 */
830 static void
aggr_rem_pseudo_rx_group(aggr_port_t * port,aggr_pseudo_rx_group_t * rx_grp)831 aggr_rem_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
832 {
833 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP];
834 aggr_unicst_addr_t *addr;
835 mac_perim_handle_t pmph;
836 uint_t hw_rh_cnt;
837 uint_t g_idx = rx_grp->arg_index;
838
839 ASSERT(MAC_PERIM_HELD(port->lp_grp->lg_mh));
840 ASSERT3U(g_idx, <, MAX_GROUPS_PER_PORT);
841 ASSERT3P(rx_grp->arg_gh, !=, NULL);
842 mac_perim_enter_by_mh(port->lp_mh, &pmph);
843
844 hw_rh_cnt = mac_hwrings_idx_get(port->lp_mh, g_idx, NULL, hw_rh,
845 MAC_RING_TYPE_RX);
846
847 for (uint_t i = 0; i < hw_rh_cnt; i++)
848 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[i]);
849
850 for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next)
851 aggr_port_remmac(port, g_idx, addr->aua_addr);
852
853 for (aggr_vlan_t *avp = list_head(&rx_grp->arg_vlans); avp != NULL;
854 avp = list_next(&rx_grp->arg_vlans, avp)) {
855 int err;
856
857 if ((err = aggr_port_remvlan(port, g_idx, avp->av_vid)) != 0) {
858 cmn_err(CE_WARN, "Failed to remove VLAN %u from port %s"
859 ": errno %d.", avp->av_vid,
860 mac_client_name(port->lp_mch), err);
861 }
862 }
863
864 port->lp_hwghs[g_idx] = NULL;
865 mac_perim_exit(pmph);
866 }
867
868 /*
869 * Add a pseudo TX ring for the given HW ring handle.
870 */
871 static int
aggr_add_pseudo_tx_ring(aggr_port_t * port,aggr_pseudo_tx_group_t * tx_grp,mac_ring_handle_t hw_rh,mac_ring_handle_t * pseudo_rh)872 aggr_add_pseudo_tx_ring(aggr_port_t *port,
873 aggr_pseudo_tx_group_t *tx_grp, mac_ring_handle_t hw_rh,
874 mac_ring_handle_t *pseudo_rh)
875 {
876 aggr_pseudo_tx_ring_t *ring;
877 int err;
878 int i;
879
880 ASSERT(MAC_PERIM_HELD(port->lp_mh));
881 for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
882 ring = tx_grp->atg_rings + i;
883 if (!(ring->atr_flags & MAC_PSEUDO_RING_INUSE))
884 break;
885 }
886 /*
887 * No slot for this new TX ring.
888 */
889 if (i == MAX_RINGS_PER_GROUP)
890 return (ENOSPC);
891 /*
892 * The following 4 statements needs to be done before
893 * calling mac_group_add_ring(). Otherwise it will
894 * result in an assertion failure in mac_init_ring().
895 */
896 ring->atr_flags |= MAC_PSEUDO_RING_INUSE;
897 ring->atr_hw_rh = hw_rh;
898 ring->atr_port = port;
899 tx_grp->atg_ring_cnt++;
900
901 /*
902 * The TX side has no concept of ring groups unlike RX groups.
903 * There is just a single group which stores all the TX rings.
904 * This group will be used to store aggr's pseudo TX rings.
905 */
906 if ((err = mac_group_add_ring(tx_grp->atg_gh, i)) != 0) {
907 ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE;
908 ring->atr_hw_rh = NULL;
909 ring->atr_port = NULL;
910 tx_grp->atg_ring_cnt--;
911 } else {
912 *pseudo_rh = mac_find_ring(tx_grp->atg_gh, i);
913 if (hw_rh != NULL) {
914 mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring,
915 mac_find_ring(tx_grp->atg_gh, i));
916 }
917 }
918
919 return (err);
920 }
921
922 /*
923 * Remove the pseudo TX ring of the given HW ring handle.
924 */
925 static void
aggr_rem_pseudo_tx_ring(aggr_pseudo_tx_group_t * tx_grp,mac_ring_handle_t pseudo_hw_rh)926 aggr_rem_pseudo_tx_ring(aggr_pseudo_tx_group_t *tx_grp,
927 mac_ring_handle_t pseudo_hw_rh)
928 {
929 aggr_pseudo_tx_ring_t *ring;
930 int i;
931
932 for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
933 ring = tx_grp->atg_rings + i;
934 if (ring->atr_rh != pseudo_hw_rh)
935 continue;
936
937 ASSERT(ring->atr_flags & MAC_PSEUDO_RING_INUSE);
938 mac_group_rem_ring(tx_grp->atg_gh, pseudo_hw_rh);
939 ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE;
940 mac_hwring_teardown(ring->atr_hw_rh);
941 ring->atr_hw_rh = NULL;
942 ring->atr_port = NULL;
943 tx_grp->atg_ring_cnt--;
944 break;
945 }
946 }
947
948 /*
949 * This function is called to create pseudo rings over hardware rings of
950 * the underlying device. There is a 1:1 mapping between the pseudo TX
951 * rings of the aggr and the hardware rings of the underlying port.
952 */
953 static int
aggr_add_pseudo_tx_group(aggr_port_t * port,aggr_pseudo_tx_group_t * tx_grp,uint_t limit)954 aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp,
955 uint_t limit)
956 {
957 aggr_grp_t *grp = port->lp_grp;
958 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP], pseudo_rh;
959 mac_perim_handle_t pmph;
960 int hw_rh_cnt, i = 0, j;
961 int err = 0;
962
963 if (limit == 0)
964 return (ENOSPC);
965
966 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
967 mac_perim_enter_by_mh(port->lp_mh, &pmph);
968
969 /*
970 * Get the list the the underlying HW rings.
971 */
972 hw_rh_cnt = mac_hwrings_get(port->lp_mch, NULL, hw_rh,
973 MAC_RING_TYPE_TX);
974
975 /*
976 * Even if the underlying NIC does not have TX rings, we
977 * still make a psuedo TX ring for that NIC with NULL as
978 * the ring handle.
979 */
980 if (hw_rh_cnt == 0)
981 port->lp_tx_ring_cnt = 1;
982 else
983 port->lp_tx_ring_cnt = MIN(hw_rh_cnt, limit);
984
985 port->lp_tx_ring_alloc = port->lp_tx_ring_cnt;
986 port->lp_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
987 port->lp_tx_ring_alloc), KM_SLEEP);
988 port->lp_pseudo_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
989 port->lp_tx_ring_alloc), KM_SLEEP);
990
991 if (hw_rh_cnt == 0) {
992 if ((err = aggr_add_pseudo_tx_ring(port, tx_grp,
993 NULL, &pseudo_rh)) == 0) {
994 port->lp_tx_rings[0] = NULL;
995 port->lp_pseudo_tx_rings[0] = pseudo_rh;
996 }
997 } else {
998 for (i = 0; err == 0 && i < port->lp_tx_ring_cnt; i++) {
999 err = aggr_add_pseudo_tx_ring(port,
1000 tx_grp, hw_rh[i], &pseudo_rh);
1001 if (err != 0)
1002 break;
1003 port->lp_tx_rings[i] = hw_rh[i];
1004 port->lp_pseudo_tx_rings[i] = pseudo_rh;
1005 }
1006 }
1007
1008 if (err != 0) {
1009 if (hw_rh_cnt != 0) {
1010 for (j = 0; j < i; j++) {
1011 aggr_rem_pseudo_tx_ring(tx_grp,
1012 port->lp_pseudo_tx_rings[j]);
1013 }
1014 }
1015 kmem_free(port->lp_tx_rings,
1016 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_alloc));
1017 kmem_free(port->lp_pseudo_tx_rings,
1018 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_alloc));
1019 port->lp_tx_ring_cnt = 0;
1020 port->lp_tx_ring_alloc = 0;
1021 } else {
1022 port->lp_tx_grp_added = B_TRUE;
1023 port->lp_tx_notify_mh = mac_client_tx_notify(port->lp_mch,
1024 aggr_tx_ring_update, port);
1025 }
1026 mac_perim_exit(pmph);
1027 aggr_grp_update_default(grp);
1028 return (err);
1029 }
1030
1031 /*
1032 * This function is called by aggr to remove pseudo TX rings over the
1033 * HW rings of the underlying port.
1034 */
1035 static void
aggr_rem_pseudo_tx_group(aggr_port_t * port,aggr_pseudo_tx_group_t * tx_grp)1036 aggr_rem_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp)
1037 {
1038 aggr_grp_t *grp = port->lp_grp;
1039 mac_perim_handle_t pmph;
1040 int i;
1041
1042 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1043 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1044
1045 if (!port->lp_tx_grp_added)
1046 goto done;
1047
1048 ASSERT(tx_grp->atg_gh != NULL);
1049
1050 for (i = 0; i < port->lp_tx_ring_cnt; i++)
1051 aggr_rem_pseudo_tx_ring(tx_grp, port->lp_pseudo_tx_rings[i]);
1052
1053 kmem_free(port->lp_tx_rings,
1054 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_alloc));
1055 kmem_free(port->lp_pseudo_tx_rings,
1056 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_alloc));
1057
1058 port->lp_tx_ring_cnt = 0;
1059 (void) mac_client_tx_notify(port->lp_mch, NULL, port->lp_tx_notify_mh);
1060 port->lp_tx_grp_added = B_FALSE;
1061 aggr_grp_update_default(grp);
1062 done:
1063 mac_perim_exit(pmph);
1064 }
1065
1066 static int
aggr_pseudo_disable_intr(mac_intr_handle_t ih)1067 aggr_pseudo_disable_intr(mac_intr_handle_t ih)
1068 {
1069 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
1070 return (mac_hwring_disable_intr(rr_ring->arr_hw_rh));
1071 }
1072
1073 static int
aggr_pseudo_enable_intr(mac_intr_handle_t ih)1074 aggr_pseudo_enable_intr(mac_intr_handle_t ih)
1075 {
1076 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
1077 return (mac_hwring_enable_intr(rr_ring->arr_hw_rh));
1078 }
1079
1080 /*
1081 * Start the pseudo ring. Since the pseudo ring is just an abstraction
1082 * over an actual HW ring, the real task is to start the underlying HW
1083 * ring.
1084 */
1085 static int
aggr_pseudo_start_rx_ring(mac_ring_driver_t arg,uint64_t mr_gen)1086 aggr_pseudo_start_rx_ring(mac_ring_driver_t arg, uint64_t mr_gen)
1087 {
1088 int err;
1089 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
1090
1091 err = mac_hwring_start(rr_ring->arr_hw_rh);
1092
1093 if (err != 0)
1094 return (err);
1095
1096 rr_ring->arr_gen = mr_gen;
1097 return (err);
1098 }
1099
1100 /*
1101 * Stop the pseudo ring. Since the pseudo ring is just an abstraction
1102 * over an actual HW ring, the real task is to stop the underlying HW
1103 * ring.
1104 */
1105 static void
aggr_pseudo_stop_rx_ring(mac_ring_driver_t arg)1106 aggr_pseudo_stop_rx_ring(mac_ring_driver_t arg)
1107 {
1108 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
1109
1110 /*
1111 * The rings underlying the default group must stay up to
1112 * continue receiving LACP traffic. We would normally never
1113 * stop the default Rx rings because of the primary MAC
1114 * client; but aggr's primary MAC client doesn't call
1115 * mac_unicast_add() and thus mi_active is 0 when the last
1116 * non-primary client is deleted.
1117 */
1118 if (rr_ring->arr_grp->arg_index != 0)
1119 mac_hwring_stop(rr_ring->arr_hw_rh);
1120 }
1121
1122 /*
1123 * Trim each port in a group to ensure it uses no more than tx_ring_limit
1124 * rings.
1125 */
1126 static void
aggr_grp_balance_tx(aggr_grp_t * grp,uint_t tx_ring_limit)1127 aggr_grp_balance_tx(aggr_grp_t *grp, uint_t tx_ring_limit)
1128 {
1129 aggr_port_t *port;
1130 mac_perim_handle_t mph;
1131 uint_t i, tx_ring_cnt;
1132
1133 ASSERT(tx_ring_limit > 0);
1134 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1135
1136 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1137 mac_perim_enter_by_mh(port->lp_mh, &mph);
1138
1139 /*
1140 * Reduce the Tx ring count first to prevent rings being
1141 * used as they are removed.
1142 */
1143 rw_enter(&grp->lg_tx_lock, RW_WRITER);
1144 if (port->lp_tx_ring_cnt <= tx_ring_limit) {
1145 rw_exit(&grp->lg_tx_lock);
1146 mac_perim_exit(mph);
1147 continue;
1148 }
1149
1150 tx_ring_cnt = port->lp_tx_ring_cnt;
1151 port->lp_tx_ring_cnt = tx_ring_limit;
1152 rw_exit(&grp->lg_tx_lock);
1153
1154 for (i = tx_ring_cnt - 1; i >= tx_ring_limit; i--) {
1155 aggr_rem_pseudo_tx_ring(&grp->lg_tx_group,
1156 port->lp_pseudo_tx_rings[i]);
1157
1158 }
1159
1160 mac_perim_exit(mph);
1161 }
1162 }
1163
1164 /*
1165 * Add one or more ports to an existing link aggregation group.
1166 */
1167 int
aggr_grp_add_ports(datalink_id_t linkid,uint_t nports,boolean_t force,laioc_port_t * ports)1168 aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force,
1169 laioc_port_t *ports)
1170 {
1171 int rc;
1172 uint_t port_added = 0;
1173 uint_t grp_added;
1174 uint_t nports_high, tx_ring_limit;
1175 aggr_grp_t *grp = NULL;
1176 aggr_port_t *port;
1177 boolean_t link_state_changed = B_FALSE;
1178 mac_perim_handle_t mph, pmph;
1179
1180 /* Get the aggr corresponding to linkid. */
1181 rw_enter(&aggr_grp_lock, RW_READER);
1182 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1183 (mod_hash_val_t *)&grp) != 0) {
1184 rw_exit(&aggr_grp_lock);
1185 return (ENOENT);
1186 }
1187 AGGR_GRP_REFHOLD(grp);
1188
1189 /*
1190 * Hold the perimeter so that the aggregation can't be destroyed.
1191 */
1192 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1193 rw_exit(&aggr_grp_lock);
1194
1195 /*
1196 * Limit the number of Tx rings per port. When determining the
1197 * number of ports take into consideration the existing high
1198 * value, and what the new high value may be after this request.
1199 */
1200 nports_high = MAX(grp->lg_nports_high, grp->lg_nports + nports);
1201 tx_ring_limit = MAX_RINGS_PER_GROUP / nports_high;
1202
1203 if (tx_ring_limit == 0) {
1204 rc = ENOSPC;
1205 goto bail;
1206 }
1207
1208 /*
1209 * Balance the Tx rings so each port has a fair share of rings.
1210 */
1211 aggr_grp_balance_tx(grp, tx_ring_limit);
1212
1213 /* Add the specified ports to the aggr. */
1214 for (uint_t i = 0; i < nports; i++) {
1215 grp_added = 0;
1216
1217 if ((rc = aggr_grp_add_port(grp, ports[i].lp_linkid,
1218 force, &port)) != 0) {
1219 goto bail;
1220 }
1221
1222 ASSERT(port != NULL);
1223 port_added++;
1224
1225 /* check capabilities */
1226 if (!aggr_grp_capab_check(grp, port) ||
1227 !aggr_grp_sdu_check(grp, port) ||
1228 !aggr_grp_margin_check(grp, port)) {
1229 rc = ENOTSUP;
1230 goto bail;
1231 }
1232
1233 /*
1234 * Create the pseudo ring for each HW ring of the underlying
1235 * port.
1236 */
1237 rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group,
1238 tx_ring_limit);
1239 if (rc != 0)
1240 goto bail;
1241
1242 for (uint_t j = 0; j < grp->lg_rx_group_count; j++) {
1243 rc = aggr_add_pseudo_rx_group(port,
1244 &grp->lg_rx_groups[j]);
1245
1246 if (rc != 0)
1247 goto bail;
1248
1249 grp_added++;
1250 }
1251
1252 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1253
1254 /* set LACP mode */
1255 aggr_port_lacp_set_mode(grp, port);
1256
1257 /* start port if group has already been started */
1258 if (grp->lg_started) {
1259 rc = aggr_port_start(port);
1260 if (rc != 0) {
1261 mac_perim_exit(pmph);
1262 goto bail;
1263 }
1264
1265 /*
1266 * Turn on the promiscuous mode over the port when it
1267 * is requested to be turned on to receive the
1268 * non-primary address over a port, or the promiscuous
1269 * mode is enabled over the aggr.
1270 */
1271 if (grp->lg_promisc || port->lp_prom_addr != NULL) {
1272 rc = aggr_port_promisc(port, B_TRUE);
1273 if (rc != 0) {
1274 mac_perim_exit(pmph);
1275 goto bail;
1276 }
1277 }
1278 }
1279 mac_perim_exit(pmph);
1280
1281 /*
1282 * Attach each port if necessary.
1283 */
1284 if (aggr_port_notify_link(grp, port))
1285 link_state_changed = B_TRUE;
1286
1287 /*
1288 * Initialize the callback functions for this port.
1289 */
1290 aggr_port_init_callbacks(port);
1291 }
1292
1293 /* update the MAC address of the constituent ports */
1294 if (aggr_grp_update_ports_mac(grp))
1295 link_state_changed = B_TRUE;
1296
1297 if (link_state_changed)
1298 mac_link_update(grp->lg_mh, grp->lg_link_state);
1299
1300 bail:
1301 if (rc != 0) {
1302 /* stop and remove ports that have been added */
1303 for (uint_t i = 0; i < port_added; i++) {
1304 uint_t grp_remove;
1305
1306 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1307 ASSERT(port != NULL);
1308
1309 if (grp->lg_started) {
1310 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1311 (void) aggr_port_promisc(port, B_FALSE);
1312 aggr_port_stop(port);
1313 mac_perim_exit(pmph);
1314 }
1315
1316 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1317
1318 /*
1319 * Only the last port could have a partial set
1320 * of groups added.
1321 */
1322 grp_remove = (i + 1 == port_added) ? grp_added :
1323 grp->lg_rx_group_count;
1324
1325 for (uint_t j = 0; j < grp_remove; j++) {
1326 aggr_rem_pseudo_rx_group(port,
1327 &grp->lg_rx_groups[j]);
1328 }
1329
1330 (void) aggr_grp_rem_port(grp, port, NULL, NULL);
1331 }
1332 }
1333
1334 mac_perim_exit(mph);
1335 AGGR_GRP_REFRELE(grp);
1336 return (rc);
1337 }
1338
1339 static int
aggr_grp_modify_common(aggr_grp_t * grp,uint8_t update_mask,uint32_t policy,boolean_t mac_fixed,const uchar_t * mac_addr,aggr_lacp_mode_t lacp_mode,aggr_lacp_timer_t lacp_timer)1340 aggr_grp_modify_common(aggr_grp_t *grp, uint8_t update_mask, uint32_t policy,
1341 boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode,
1342 aggr_lacp_timer_t lacp_timer)
1343 {
1344 boolean_t mac_addr_changed = B_FALSE;
1345 boolean_t link_state_changed = B_FALSE;
1346 mac_perim_handle_t pmph;
1347
1348 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1349
1350 /* validate fixed address if specified */
1351 if ((update_mask & AGGR_MODIFY_MAC) && mac_fixed &&
1352 ((bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) ||
1353 (mac_addr[0] & 0x01))) {
1354 return (EINVAL);
1355 }
1356
1357 /* update policy if requested */
1358 if (update_mask & AGGR_MODIFY_POLICY)
1359 aggr_send_update_policy(grp, policy);
1360
1361 /* update unicast MAC address if requested */
1362 if (update_mask & AGGR_MODIFY_MAC) {
1363 if (mac_fixed) {
1364 /* user-supplied MAC address */
1365 grp->lg_mac_addr_port = NULL;
1366 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) != 0) {
1367 bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
1368 mac_addr_changed = B_TRUE;
1369 }
1370 } else if (grp->lg_addr_fixed) {
1371 /* switch from user-supplied to automatic */
1372 aggr_port_t *port = grp->lg_ports;
1373
1374 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1375 bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL);
1376 grp->lg_mac_addr_port = port;
1377 mac_addr_changed = B_TRUE;
1378 mac_perim_exit(pmph);
1379 }
1380 grp->lg_addr_fixed = mac_fixed;
1381 }
1382
1383 if (mac_addr_changed)
1384 link_state_changed = aggr_grp_update_ports_mac(grp);
1385
1386 if (update_mask & AGGR_MODIFY_LACP_MODE)
1387 aggr_lacp_update_mode(grp, lacp_mode);
1388
1389 if (update_mask & AGGR_MODIFY_LACP_TIMER)
1390 aggr_lacp_update_timer(grp, lacp_timer);
1391
1392 if (link_state_changed)
1393 mac_link_update(grp->lg_mh, grp->lg_link_state);
1394
1395 if (mac_addr_changed)
1396 mac_unicst_update(grp->lg_mh, grp->lg_addr);
1397
1398 return (0);
1399 }
1400
1401 /*
1402 * Update properties of an existing link aggregation group.
1403 */
1404 int
aggr_grp_modify(datalink_id_t linkid,uint8_t update_mask,uint32_t policy,boolean_t mac_fixed,const uchar_t * mac_addr,aggr_lacp_mode_t lacp_mode,aggr_lacp_timer_t lacp_timer)1405 aggr_grp_modify(datalink_id_t linkid, uint8_t update_mask, uint32_t policy,
1406 boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode,
1407 aggr_lacp_timer_t lacp_timer)
1408 {
1409 aggr_grp_t *grp = NULL;
1410 mac_perim_handle_t mph;
1411 int err;
1412
1413 /* get group corresponding to linkid */
1414 rw_enter(&aggr_grp_lock, RW_READER);
1415 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1416 (mod_hash_val_t *)&grp) != 0) {
1417 rw_exit(&aggr_grp_lock);
1418 return (ENOENT);
1419 }
1420 AGGR_GRP_REFHOLD(grp);
1421
1422 /*
1423 * Hold the perimeter so that the aggregation won't be destroyed.
1424 */
1425 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1426 rw_exit(&aggr_grp_lock);
1427
1428 err = aggr_grp_modify_common(grp, update_mask, policy, mac_fixed,
1429 mac_addr, lacp_mode, lacp_timer);
1430
1431 mac_perim_exit(mph);
1432 AGGR_GRP_REFRELE(grp);
1433 return (err);
1434 }
1435
1436 /*
1437 * Create a new link aggregation group upon request from administrator.
1438 * Returns 0 on success, an errno on failure.
1439 */
1440 int
aggr_grp_create(datalink_id_t linkid,uint32_t key,uint_t nports,laioc_port_t * ports,uint32_t policy,boolean_t mac_fixed,boolean_t force,uchar_t * mac_addr,aggr_lacp_mode_t lacp_mode,aggr_lacp_timer_t lacp_timer,cred_t * credp)1441 aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports,
1442 laioc_port_t *ports, uint32_t policy, boolean_t mac_fixed, boolean_t force,
1443 uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, aggr_lacp_timer_t lacp_timer,
1444 cred_t *credp)
1445 {
1446 aggr_grp_t *grp = NULL;
1447 aggr_port_t *port;
1448 aggr_port_t *last_attached = NULL;
1449 mac_register_t *mac;
1450 boolean_t link_state_changed;
1451 mac_perim_handle_t mph, pmph;
1452 datalink_id_t tempid;
1453 boolean_t mac_registered = B_FALSE;
1454 uint_t tx_ring_limit;
1455 int err;
1456 int i, j;
1457 kt_did_t tid = 0;
1458
1459 /* need at least one port */
1460 if (nports == 0)
1461 return (EINVAL);
1462
1463 rw_enter(&aggr_grp_lock, RW_WRITER);
1464
1465 /* does a group with the same linkid already exist? */
1466 err = mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1467 (mod_hash_val_t *)&grp);
1468 if (err == 0) {
1469 rw_exit(&aggr_grp_lock);
1470 return (EEXIST);
1471 }
1472
1473 grp = kmem_cache_alloc(aggr_grp_cache, KM_SLEEP);
1474
1475 grp->lg_refs = 1;
1476 grp->lg_closing = B_FALSE;
1477 grp->lg_force = force;
1478 grp->lg_linkid = linkid;
1479 grp->lg_zoneid = crgetzoneid(credp);
1480 grp->lg_ifspeed = 0;
1481 grp->lg_link_state = LINK_STATE_UNKNOWN;
1482 grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
1483 grp->lg_started = B_FALSE;
1484 grp->lg_promisc = B_FALSE;
1485 grp->lg_lacp_done = B_FALSE;
1486 grp->lg_tx_notify_done = B_FALSE;
1487 grp->lg_lacp_head = grp->lg_lacp_tail = NULL;
1488 grp->lg_lacp_rx_thread = thread_create(NULL, 0,
1489 aggr_lacp_rx_thread, grp, 0, &p0, TS_RUN, minclsyspri);
1490 grp->lg_tx_notify_thread = thread_create(NULL, 0,
1491 aggr_tx_notify_thread, grp, 0, &p0, TS_RUN, minclsyspri);
1492 grp->lg_tx_blocked_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
1493 MAX_RINGS_PER_GROUP), KM_SLEEP);
1494 grp->lg_tx_blocked_cnt = 0;
1495 bzero(&grp->lg_rx_groups,
1496 sizeof (aggr_pseudo_rx_group_t) * MAX_GROUPS_PER_PORT);
1497 bzero(&grp->lg_tx_group, sizeof (aggr_pseudo_tx_group_t));
1498 aggr_lacp_init_grp(grp);
1499
1500 /* add MAC ports to group */
1501 grp->lg_ports = NULL;
1502 grp->lg_nports = 0;
1503 grp->lg_nattached_ports = 0;
1504 grp->lg_ntx_ports = 0;
1505
1506 /*
1507 * If key is not specified by the user, allocate the key.
1508 */
1509 if ((key == 0) && ((key = (uint32_t)id_alloc(key_ids)) == 0)) {
1510 err = ENOMEM;
1511 goto bail;
1512 }
1513 grp->lg_key = key;
1514
1515 for (i = 0; i < nports; i++) {
1516 err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, &port);
1517 if (err != 0)
1518 goto bail;
1519 }
1520
1521 grp->lg_rx_group_count = 1;
1522
1523 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1524 uint_t num_rgroups;
1525
1526 mac_perim_enter_by_mh(port->lp_mh, &mph);
1527 num_rgroups = mac_get_num_rx_groups(port->lp_mh);
1528 mac_perim_exit(mph);
1529
1530 /*
1531 * Utilize all the groups in a port. If some ports
1532 * have less groups than others, then traffic destined
1533 * for the same unicast address may be HW classified
1534 * on some ports but SW classified by aggr when
1535 * arriving on other ports.
1536 */
1537 grp->lg_rx_group_count = MAX(grp->lg_rx_group_count,
1538 num_rgroups);
1539 }
1540
1541 /*
1542 * There could be cases where the hardware provides more
1543 * groups than aggr can support. Make sure we never go above
1544 * the max aggr can support.
1545 */
1546 grp->lg_rx_group_count = MIN(grp->lg_rx_group_count,
1547 MAX_GROUPS_PER_PORT);
1548
1549 ASSERT3U(grp->lg_rx_group_count, >, 0);
1550 for (i = 0; i < MAX_GROUPS_PER_PORT; i++) {
1551 grp->lg_rx_groups[i].arg_index = i;
1552 grp->lg_rx_groups[i].arg_untagged = 0;
1553 list_create(&(grp->lg_rx_groups[i].arg_vlans),
1554 sizeof (aggr_vlan_t), offsetof(aggr_vlan_t, av_link));
1555 }
1556
1557 /*
1558 * If no explicit MAC address was specified by the administrator,
1559 * set it to the MAC address of the first port.
1560 */
1561 grp->lg_addr_fixed = mac_fixed;
1562 if (grp->lg_addr_fixed) {
1563 /* validate specified address */
1564 if (bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) {
1565 err = EINVAL;
1566 goto bail;
1567 }
1568 bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
1569 } else {
1570 bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
1571 grp->lg_mac_addr_port = grp->lg_ports;
1572 }
1573
1574 /* Set the initial group capabilities. */
1575 aggr_grp_capab_set(grp);
1576
1577 if ((mac = mac_alloc(MAC_VERSION)) == NULL) {
1578 err = ENOMEM;
1579 goto bail;
1580 }
1581 mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1582 mac->m_driver = grp;
1583 mac->m_dip = aggr_dip;
1584 mac->m_instance = grp->lg_key > AGGR_MAX_KEY ? (uint_t)-1 : grp->lg_key;
1585 mac->m_src_addr = grp->lg_addr;
1586 mac->m_callbacks = &aggr_m_callbacks;
1587 mac->m_min_sdu = 0;
1588 mac->m_max_sdu = grp->lg_max_sdu = aggr_grp_max_sdu(grp);
1589 mac->m_margin = aggr_grp_max_margin(grp);
1590 mac->m_v12n = MAC_VIRT_LEVEL1;
1591 err = mac_register(mac, &grp->lg_mh);
1592 mac_free(mac);
1593 if (err != 0)
1594 goto bail;
1595
1596 err = dls_devnet_create(grp->lg_mh, grp->lg_linkid, crgetzoneid(credp));
1597 if (err != 0) {
1598 (void) mac_unregister(grp->lg_mh);
1599 grp->lg_mh = NULL;
1600 goto bail;
1601 }
1602
1603 mac_registered = B_TRUE;
1604
1605 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1606
1607 /*
1608 * Update the MAC address of the constituent ports.
1609 * None of the port is attached at this time, the link state of the
1610 * aggregation will not change.
1611 *
1612 * All ports take on the primary MAC address of the aggr
1613 * (lg_aggr). At this point, none of the ports are attached;
1614 * thus the link state of the aggregation will not change.
1615 */
1616 link_state_changed = aggr_grp_update_ports_mac(grp);
1617 ASSERT(!link_state_changed);
1618
1619 /* Update outbound load balancing policy. */
1620 aggr_send_update_policy(grp, policy);
1621
1622 /* Set LACP mode. */
1623 aggr_lacp_set_mode(grp, lacp_mode, lacp_timer);
1624
1625 /*
1626 * The pseudo Tx group holds a maximum of MAX_RINGS_PER_GROUP
1627 * rings, when all the Tx rings of all the ports are accumulated
1628 * it is conceivable this limit is exceeded. We try and prevent
1629 * this by limiting the number of rings an individual port will use.
1630 *
1631 * - When an aggr is first created, we will not let an
1632 * individual port use more than MAX_RINGS_PER_GROUP/nports
1633 * rings.
1634 * - As ports are added to an existing aggr, each of the
1635 * ports will not use more than MAX_RINGS_PER_GROUP/nports_high.
1636 * Where nports_high is the highest number of ports the aggr has
1637 * held (including any ports being added). This may involve
1638 * trimming rings from existing ports.
1639 */
1640
1641 /* Leave room for 4 ports */
1642 tx_ring_limit = MAX_RINGS_PER_GROUP / MAX(4, nports);
1643
1644 /*
1645 * Attach each port if necessary.
1646 */
1647 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1648 /*
1649 * Create the pseudo ring for each HW ring of the
1650 * underlying port. Note that this is done after the
1651 * aggr registers its MAC.
1652 */
1653 err = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group,
1654 tx_ring_limit);
1655
1656 if (err != 0) {
1657 mac_perim_exit(mph);
1658 goto bail;
1659 }
1660
1661 for (i = 0; i < grp->lg_rx_group_count; i++) {
1662 err = aggr_add_pseudo_rx_group(port,
1663 &grp->lg_rx_groups[i]);
1664
1665 if (err != 0) {
1666 /*
1667 * Undo what we have added for the current
1668 * port.
1669 */
1670 aggr_rem_pseudo_tx_group(port,
1671 &grp->lg_tx_group);
1672
1673 for (j = 0; j < i; j++) {
1674 aggr_rem_pseudo_rx_group(port,
1675 &grp->lg_rx_groups[j]);
1676 }
1677
1678 mac_perim_exit(mph);
1679 goto bail;
1680 }
1681 }
1682
1683 if (aggr_port_notify_link(grp, port))
1684 link_state_changed = B_TRUE;
1685
1686 /*
1687 * Initialize the callback functions for this port.
1688 */
1689 aggr_port_init_callbacks(port);
1690
1691 last_attached = port;
1692 }
1693
1694 if (link_state_changed)
1695 mac_link_update(grp->lg_mh, grp->lg_link_state);
1696
1697 /* add new group to hash table */
1698 err = mod_hash_insert(aggr_grp_hash, GRP_HASH_KEY(linkid),
1699 (mod_hash_val_t)grp);
1700 ASSERT(err == 0);
1701 aggr_grp_cnt++;
1702
1703 mac_perim_exit(mph);
1704 rw_exit(&aggr_grp_lock);
1705 return (0);
1706
1707 bail:
1708 grp->lg_closing = B_TRUE;
1709
1710 /*
1711 * Inform the lacp_rx thread to exit.
1712 */
1713 mutex_enter(&grp->lg_lacp_lock);
1714 grp->lg_lacp_done = B_TRUE;
1715 cv_signal(&grp->lg_lacp_cv);
1716 while (grp->lg_lacp_rx_thread != NULL)
1717 cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock);
1718 mutex_exit(&grp->lg_lacp_lock);
1719 /*
1720 * Inform the tx_notify thread to exit.
1721 */
1722 mutex_enter(&grp->lg_tx_flowctl_lock);
1723 if (grp->lg_tx_notify_thread != NULL) {
1724 tid = grp->lg_tx_notify_thread->t_did;
1725 grp->lg_tx_notify_done = B_TRUE;
1726 cv_signal(&grp->lg_tx_flowctl_cv);
1727 }
1728 mutex_exit(&grp->lg_tx_flowctl_lock);
1729 if (tid != 0)
1730 thread_join(tid);
1731
1732 if (mac_registered) {
1733 (void) dls_devnet_destroy(grp->lg_mh, &tempid, B_TRUE);
1734 (void) mac_disable(grp->lg_mh);
1735
1736 if (last_attached != NULL) {
1737 /*
1738 * Detach and clean up ports added.
1739 */
1740 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1741
1742 for (port = grp->lg_ports; ; port = port->lp_next) {
1743 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1744 (void) aggr_grp_detach_port(grp, port);
1745 mac_perim_exit(pmph);
1746
1747 aggr_rem_pseudo_tx_group(port,
1748 &grp->lg_tx_group);
1749
1750 for (i = 0; i < grp->lg_rx_group_count; i++) {
1751 aggr_rem_pseudo_rx_group(port,
1752 &grp->lg_rx_groups[i]);
1753 }
1754 if (port == last_attached)
1755 break;
1756 }
1757
1758 mac_perim_exit(mph);
1759 }
1760
1761 (void) mac_unregister(grp->lg_mh);
1762 }
1763
1764 port = grp->lg_ports;
1765 while (port != NULL) {
1766 aggr_port_t *cport;
1767
1768 cport = port->lp_next;
1769 aggr_port_delete(port);
1770 port = cport;
1771 }
1772
1773 kmem_free(grp->lg_tx_blocked_rings,
1774 (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP));
1775 rw_exit(&aggr_grp_lock);
1776 AGGR_GRP_REFRELE(grp);
1777 return (err);
1778 }
1779
1780 /*
1781 * Return a pointer to the member of a group with specified linkid.
1782 */
1783 static aggr_port_t *
aggr_grp_port_lookup(aggr_grp_t * grp,datalink_id_t linkid)1784 aggr_grp_port_lookup(aggr_grp_t *grp, datalink_id_t linkid)
1785 {
1786 aggr_port_t *port;
1787
1788 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1789
1790 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1791 if (port->lp_linkid == linkid)
1792 break;
1793 }
1794
1795 return (port);
1796 }
1797
1798 /*
1799 * Stop, detach and remove a port from a link aggregation group.
1800 */
1801 static int
aggr_grp_rem_port(aggr_grp_t * grp,aggr_port_t * port,boolean_t * mac_addr_changedp,boolean_t * link_state_changedp)1802 aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port,
1803 boolean_t *mac_addr_changedp, boolean_t *link_state_changedp)
1804 {
1805 int rc = 0;
1806 aggr_port_t **pport;
1807 boolean_t mac_addr_changed = B_FALSE;
1808 boolean_t link_state_changed = B_FALSE;
1809 mac_perim_handle_t mph;
1810 uint64_t val;
1811 uint_t i;
1812 uint_t stat;
1813
1814 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1815 ASSERT(grp->lg_nports > 1);
1816 ASSERT(!grp->lg_closing);
1817
1818 /* unlink port */
1819 for (pport = &grp->lg_ports; *pport != port;
1820 pport = &(*pport)->lp_next) {
1821 if (*pport == NULL) {
1822 rc = ENOENT;
1823 goto done;
1824 }
1825 }
1826 *pport = port->lp_next;
1827
1828 mac_perim_enter_by_mh(port->lp_mh, &mph);
1829
1830 /*
1831 * If the MAC address of the port being removed was assigned
1832 * to the group, update the group MAC address
1833 * using the MAC address of a different port.
1834 */
1835 if (!grp->lg_addr_fixed && grp->lg_mac_addr_port == port) {
1836 /*
1837 * Set the MAC address of the group to the
1838 * MAC address of its first port.
1839 */
1840 bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
1841 grp->lg_mac_addr_port = grp->lg_ports;
1842 mac_addr_changed = B_TRUE;
1843 }
1844
1845 link_state_changed = aggr_grp_detach_port(grp, port);
1846
1847 /*
1848 * Add the counter statistics of the ports while it was aggregated
1849 * to the group's residual statistics. This is done by obtaining
1850 * the current counter from the underlying MAC then subtracting the
1851 * value of the counter at the moment it was added to the
1852 * aggregation.
1853 */
1854 for (i = 0; i < MAC_NSTAT; i++) {
1855 stat = i + MAC_STAT_MIN;
1856 if (!MAC_STAT_ISACOUNTER(stat))
1857 continue;
1858 val = aggr_port_stat(port, stat);
1859 val -= port->lp_stat[i];
1860 mutex_enter(&grp->lg_stat_lock);
1861 grp->lg_stat[i] += val;
1862 mutex_exit(&grp->lg_stat_lock);
1863 }
1864 for (i = 0; i < ETHER_NSTAT; i++) {
1865 stat = i + MACTYPE_STAT_MIN;
1866 if (!ETHER_STAT_ISACOUNTER(stat))
1867 continue;
1868 val = aggr_port_stat(port, stat);
1869 val -= port->lp_ether_stat[i];
1870 mutex_enter(&grp->lg_stat_lock);
1871 grp->lg_ether_stat[i] += val;
1872 mutex_exit(&grp->lg_stat_lock);
1873 }
1874
1875 grp->lg_nports--;
1876 mac_perim_exit(mph);
1877
1878 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1879 aggr_port_delete(port);
1880
1881 /*
1882 * If the group MAC address has changed, update the MAC address of
1883 * the remaining constituent ports according to the new MAC
1884 * address of the group.
1885 */
1886 if (mac_addr_changed && aggr_grp_update_ports_mac(grp))
1887 link_state_changed = B_TRUE;
1888
1889 done:
1890 if (mac_addr_changedp != NULL)
1891 *mac_addr_changedp = mac_addr_changed;
1892 if (link_state_changedp != NULL)
1893 *link_state_changedp = link_state_changed;
1894
1895 return (rc);
1896 }
1897
1898 /*
1899 * Remove one or more ports from an existing link aggregation group.
1900 */
1901 int
aggr_grp_rem_ports(datalink_id_t linkid,uint_t nports,laioc_port_t * ports)1902 aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports)
1903 {
1904 int rc = 0;
1905 uint_t i;
1906 aggr_grp_t *grp = NULL;
1907 aggr_port_t *port;
1908 boolean_t mac_addr_update = B_FALSE, mac_addr_changed;
1909 boolean_t link_state_update = B_FALSE, link_state_changed;
1910 mac_perim_handle_t mph, pmph;
1911
1912 /* get group corresponding to linkid */
1913 rw_enter(&aggr_grp_lock, RW_READER);
1914 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1915 (mod_hash_val_t *)&grp) != 0) {
1916 rw_exit(&aggr_grp_lock);
1917 return (ENOENT);
1918 }
1919 AGGR_GRP_REFHOLD(grp);
1920
1921 /*
1922 * Hold the perimeter so that the aggregation won't be destroyed.
1923 */
1924 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1925 rw_exit(&aggr_grp_lock);
1926
1927 /* we need to keep at least one port per group */
1928 if (nports >= grp->lg_nports) {
1929 rc = EINVAL;
1930 goto bail;
1931 }
1932
1933 /* first verify that all the groups are valid */
1934 for (i = 0; i < nports; i++) {
1935 if (aggr_grp_port_lookup(grp, ports[i].lp_linkid) == NULL) {
1936 /* port not found */
1937 rc = ENOENT;
1938 goto bail;
1939 }
1940 }
1941
1942 /* clear the promiscous mode for the specified ports */
1943 for (i = 0; i < nports && rc == 0; i++) {
1944 /* lookup port */
1945 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1946 ASSERT(port != NULL);
1947
1948 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1949 rc = aggr_port_promisc(port, B_FALSE);
1950 mac_perim_exit(pmph);
1951 }
1952 if (rc != 0) {
1953 for (i = 0; i < nports; i++) {
1954 port = aggr_grp_port_lookup(grp,
1955 ports[i].lp_linkid);
1956 ASSERT(port != NULL);
1957
1958 /*
1959 * Turn the promiscuous mode back on if it is required
1960 * to receive the non-primary address over a port, or
1961 * the promiscous mode is enabled over the aggr.
1962 */
1963 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1964 if (port->lp_started && (grp->lg_promisc ||
1965 port->lp_prom_addr != NULL)) {
1966 (void) aggr_port_promisc(port, B_TRUE);
1967 }
1968 mac_perim_exit(pmph);
1969 }
1970 goto bail;
1971 }
1972
1973 /* remove the specified ports from group */
1974 for (i = 0; i < nports; i++) {
1975 /* lookup port */
1976 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1977 ASSERT(port != NULL);
1978
1979 /* stop port if group has already been started */
1980 if (grp->lg_started) {
1981 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1982 aggr_port_stop(port);
1983 mac_perim_exit(pmph);
1984 }
1985
1986 /*
1987 * aggr_rem_pseudo_tx_group() is not called here. Instead
1988 * it is called from inside aggr_grp_rem_port() after the
1989 * port has been detached. The reason is that
1990 * aggr_rem_pseudo_tx_group() removes one ring at a time
1991 * and if there is still traffic going on, then there
1992 * is the possibility of aggr_find_tx_ring() returning a
1993 * removed ring for transmission. Once the port has been
1994 * detached, that port will not be used and
1995 * aggr_find_tx_ring() will not return any rings
1996 * belonging to it.
1997 */
1998 for (uint_t j = 0; j < grp->lg_rx_group_count; j++)
1999 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[j]);
2000
2001 /* remove port from group */
2002 rc = aggr_grp_rem_port(grp, port, &mac_addr_changed,
2003 &link_state_changed);
2004 ASSERT(rc == 0);
2005 mac_addr_update = mac_addr_update || mac_addr_changed;
2006 link_state_update = link_state_update || link_state_changed;
2007 }
2008
2009 bail:
2010 if (mac_addr_update)
2011 mac_unicst_update(grp->lg_mh, grp->lg_addr);
2012 if (link_state_update)
2013 mac_link_update(grp->lg_mh, grp->lg_link_state);
2014
2015 mac_perim_exit(mph);
2016 AGGR_GRP_REFRELE(grp);
2017
2018 return (rc);
2019 }
2020
/*
 * Delete the aggregation group identified by linkid.
 *
 * Returns ENOENT if linkid is not present in aggr_grp_hash, the error
 * from dls_devnet_destroy() or mac_disable() if either of them fails
 * (in the mac_disable() case the devnet is re-created first, using the
 * zone of `cred'), and 0 once the group has been torn down.
 *
 * Teardown order, as implemented below:
 *   1. under aggr_grp_lock: destroy the devnet, disable the MAC,
 *      remove the group from the hash and decrement aggr_grp_cnt;
 *   2. tell the LACP rx thread and the tx notify thread to exit and
 *      wait for them;
 *   3. under the group's MAC perimeter: stop (if started), detach and
 *      delete every port, removing its pseudo Tx/Rx groups;
 *   4. free the blocked-ring array, wait via aggr_grp_port_wait(),
 *      unregister the MAC, destroy the per-Rx-group VLAN lists, and
 *      drop a group reference.
 */
int
aggr_grp_delete(datalink_id_t linkid, cred_t *cred)
{
	aggr_grp_t *grp = NULL;
	aggr_port_t *port, *cport;
	datalink_id_t tmpid;
	mod_hash_val_t val;
	mac_perim_handle_t mph, pmph;
	int err;
	kt_did_t tid = 0;

	rw_enter(&aggr_grp_lock, RW_WRITER);

	if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
	    (mod_hash_val_t *)&grp) != 0) {
		rw_exit(&aggr_grp_lock);
		return (ENOENT);
	}

	/*
	 * Note that dls_devnet_destroy() must be called before lg_lock is
	 * held. Otherwise, it will deadlock if another thread is in
	 * aggr_m_stat() and thus has a kstat_hold() on the kstats that
	 * dls_devnet_destroy() needs to delete.
	 */
	if ((err = dls_devnet_destroy(grp->lg_mh, &tmpid, B_TRUE)) != 0) {
		rw_exit(&aggr_grp_lock);
		return (err);
	}
	ASSERT(linkid == tmpid);

	/*
	 * Unregister from the MAC service module. Since this can
	 * fail if a client hasn't closed the MAC port, we gracefully
	 * fail the operation.
	 */
	if ((err = mac_disable(grp->lg_mh)) != 0) {
		/* Undo the devnet destroy above; its result is ignored. */
		(void) dls_devnet_create(grp->lg_mh, linkid, crgetzoneid(cred));
		rw_exit(&aggr_grp_lock);
		return (err);
	}
	(void) mod_hash_remove(aggr_grp_hash, GRP_HASH_KEY(linkid), &val);
	ASSERT(grp == (aggr_grp_t *)val);

	ASSERT(aggr_grp_cnt > 0);
	aggr_grp_cnt--;
	rw_exit(&aggr_grp_lock);

	/*
	 * Inform the lacp_rx thread to exit, then wait on lg_lacp_cv
	 * until lg_lacp_rx_thread has been cleared.
	 */
	mutex_enter(&grp->lg_lacp_lock);
	grp->lg_lacp_done = B_TRUE;
	cv_signal(&grp->lg_lacp_cv);
	while (grp->lg_lacp_rx_thread != NULL)
		cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock);
	mutex_exit(&grp->lg_lacp_lock);
	/*
	 * Inform the tx_notify_thread to exit.  Its thread id is captured
	 * under lg_tx_flowctl_lock so it can be joined after the lock is
	 * dropped; tid stays 0 when there is no such thread.
	 */
	mutex_enter(&grp->lg_tx_flowctl_lock);
	if (grp->lg_tx_notify_thread != NULL) {
		tid = grp->lg_tx_notify_thread->t_did;
		grp->lg_tx_notify_done = B_TRUE;
		cv_signal(&grp->lg_tx_flowctl_cv);
	}
	mutex_exit(&grp->lg_tx_flowctl_lock);
	if (tid != 0)
		thread_join(tid);

	mac_perim_enter_by_mh(grp->lg_mh, &mph);

	grp->lg_closing = B_TRUE;
	/* detach and free MAC ports associated with group */
	port = grp->lg_ports;
	while (port != NULL) {
		/* Save the successor: aggr_port_delete() is called on port. */
		cport = port->lp_next;
		mac_perim_enter_by_mh(port->lp_mh, &pmph);
		if (grp->lg_started)
			aggr_port_stop(port);
		(void) aggr_grp_detach_port(grp, port);
		mac_perim_exit(pmph);
		/* The pseudo groups are removed only after the detach. */
		aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
		for (uint_t i = 0; i < grp->lg_rx_group_count; i++)
			aggr_rem_pseudo_rx_group(port, &grp->lg_rx_groups[i]);
		aggr_port_delete(port);
		port = cport;
	}

	mac_perim_exit(mph);

	kmem_free(grp->lg_tx_blocked_rings,
	    (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP));
	/*
	 * Wait for the port's lacp timer thread and its notification callback
	 * to exit before calling mac_unregister() since both need to access
	 * the mac perimeter of the grp.
	 */
	aggr_grp_port_wait(grp);

	VERIFY(mac_unregister(grp->lg_mh) == 0);
	grp->lg_mh = NULL;

	/* Every slot is destroyed, not just the lg_rx_group_count in use. */
	for (uint_t i = 0; i < MAX_GROUPS_PER_PORT; i++) {
		list_destroy(&(grp->lg_rx_groups[i].arg_vlans));
	}

	AGGR_GRP_REFRELE(grp);
	return (0);
}
2131
/*
 * Release a group structure back to aggr_grp_cache.  The group must
 * have no remaining group or port references (asserted below).
 */
void
aggr_grp_free(aggr_grp_t *grp)
{
	ASSERT(grp->lg_refs == 0);
	ASSERT(grp->lg_port_ref == 0);
	/*
	 * A key above AGGR_MAX_KEY is handed back to the key_ids id space.
	 * NOTE(review): presumably such keys were allocated from key_ids
	 * when the group was created -- confirm against the create path.
	 */
	if (grp->lg_key > AGGR_MAX_KEY) {
		id_free(key_ids, grp->lg_key);
		grp->lg_key = 0;
	}
	kmem_cache_free(aggr_grp_cache, grp);
}
2143
2144 int
aggr_grp_info(datalink_id_t linkid,void * fn_arg,aggr_grp_info_new_grp_fn_t new_grp_fn,aggr_grp_info_new_port_fn_t new_port_fn,cred_t * cred)2145 aggr_grp_info(datalink_id_t linkid, void *fn_arg,
2146 aggr_grp_info_new_grp_fn_t new_grp_fn,
2147 aggr_grp_info_new_port_fn_t new_port_fn, cred_t *cred)
2148 {
2149 aggr_grp_t *grp;
2150 aggr_port_t *port;
2151 mac_perim_handle_t mph, pmph;
2152 int rc = 0;
2153
2154 /*
2155 * Make sure that the aggregation link is visible from the caller's
2156 * zone.
2157 */
2158 if (!dls_devnet_islinkvisible(linkid, crgetzoneid(cred)))
2159 return (ENOENT);
2160
2161 rw_enter(&aggr_grp_lock, RW_READER);
2162
2163 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
2164 (mod_hash_val_t *)&grp) != 0) {
2165 rw_exit(&aggr_grp_lock);
2166 return (ENOENT);
2167 }
2168 AGGR_GRP_REFHOLD(grp);
2169
2170 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2171 rw_exit(&aggr_grp_lock);
2172
2173 rc = new_grp_fn(fn_arg, grp->lg_linkid,
2174 (grp->lg_key > AGGR_MAX_KEY) ? 0 : grp->lg_key, grp->lg_addr,
2175 grp->lg_addr_fixed, grp->lg_force, grp->lg_tx_policy,
2176 grp->lg_nports, grp->lg_lacp_mode, grp->aggr.PeriodicTimer);
2177
2178 if (rc != 0)
2179 goto bail;
2180
2181 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2182 mac_perim_enter_by_mh(port->lp_mh, &pmph);
2183 rc = new_port_fn(fn_arg, port->lp_linkid, port->lp_addr,
2184 port->lp_state, &port->lp_lacp.ActorOperPortState);
2185 mac_perim_exit(pmph);
2186
2187 if (rc != 0)
2188 goto bail;
2189 }
2190
2191 bail:
2192 mac_perim_exit(mph);
2193 AGGR_GRP_REFRELE(grp);
2194 return (rc);
2195 }
2196
/*
 * ioctl handler for the aggregation: no ioctls are supported, so every
 * message is negatively acknowledged with ENOTSUP.  `arg' is unused.
 */
/*ARGSUSED*/
static void
aggr_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
{
	miocnak(q, mp, 0, ENOTSUP);
}
2203
2204 static int
aggr_grp_stat(aggr_grp_t * grp,uint_t stat,uint64_t * val)2205 aggr_grp_stat(aggr_grp_t *grp, uint_t stat, uint64_t *val)
2206 {
2207 aggr_port_t *port;
2208 uint_t stat_index;
2209
2210 ASSERT(MUTEX_HELD(&grp->lg_stat_lock));
2211
2212 /* We only aggregate counter statistics. */
2213 if ((IS_MAC_STAT(stat) && !MAC_STAT_ISACOUNTER(stat)) ||
2214 (IS_MACTYPE_STAT(stat) && !ETHER_STAT_ISACOUNTER(stat))) {
2215 return (ENOTSUP);
2216 }
2217
2218 /*
2219 * Counter statistics for a group are computed by aggregating the
2220 * counters of the members MACs while they were aggregated, plus
2221 * the residual counter of the group itself, which is updated each
2222 * time a MAC is removed from the group.
2223 */
2224 *val = 0;
2225 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2226 /* actual port statistic */
2227 *val += aggr_port_stat(port, stat);
2228 /*
2229 * minus the port stat when it was added, plus any residual
2230 * amount for the group.
2231 */
2232 if (IS_MAC_STAT(stat)) {
2233 stat_index = stat - MAC_STAT_MIN;
2234 *val -= port->lp_stat[stat_index];
2235 *val += grp->lg_stat[stat_index];
2236 } else if (IS_MACTYPE_STAT(stat)) {
2237 stat_index = stat - MACTYPE_STAT_MIN;
2238 *val -= port->lp_ether_stat[stat_index];
2239 *val += grp->lg_ether_stat[stat_index];
2240 }
2241 }
2242 return (0);
2243 }
2244
2245 int
aggr_rx_ring_stat(mac_ring_driver_t rdriver,uint_t stat,uint64_t * val)2246 aggr_rx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
2247 {
2248 aggr_pseudo_rx_ring_t *rx_ring = (aggr_pseudo_rx_ring_t *)rdriver;
2249
2250 if (rx_ring->arr_hw_rh != NULL) {
2251 *val = mac_pseudo_rx_ring_stat_get(rx_ring->arr_hw_rh, stat);
2252 } else {
2253 aggr_port_t *port = rx_ring->arr_port;
2254
2255 *val = mac_stat_get(port->lp_mh, stat);
2256
2257 }
2258 return (0);
2259 }
2260
2261 int
aggr_tx_ring_stat(mac_ring_driver_t rdriver,uint_t stat,uint64_t * val)2262 aggr_tx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
2263 {
2264 aggr_pseudo_tx_ring_t *tx_ring = (aggr_pseudo_tx_ring_t *)rdriver;
2265
2266 if (tx_ring->atr_hw_rh != NULL) {
2267 *val = mac_pseudo_tx_ring_stat_get(tx_ring->atr_hw_rh, stat);
2268 } else {
2269 aggr_port_t *port = tx_ring->atr_port;
2270
2271 *val = mac_stat_get(port->lp_mh, stat);
2272 }
2273 return (0);
2274 }
2275
2276 static int
aggr_m_stat(void * arg,uint_t stat,uint64_t * val)2277 aggr_m_stat(void *arg, uint_t stat, uint64_t *val)
2278 {
2279 aggr_grp_t *grp = arg;
2280 int rval = 0;
2281
2282 mutex_enter(&grp->lg_stat_lock);
2283
2284 switch (stat) {
2285 case MAC_STAT_IFSPEED:
2286 *val = grp->lg_ifspeed;
2287 break;
2288
2289 case ETHER_STAT_LINK_DUPLEX:
2290 *val = grp->lg_link_duplex;
2291 break;
2292
2293 default:
2294 /*
2295 * For all other statistics, we return the aggregated stat
2296 * from the underlying ports. aggr_grp_stat() will set
2297 * rval appropriately if the statistic isn't a counter.
2298 */
2299 rval = aggr_grp_stat(grp, stat, val);
2300 }
2301
2302 mutex_exit(&grp->lg_stat_lock);
2303 return (rval);
2304 }
2305
2306 static int
aggr_m_start(void * arg)2307 aggr_m_start(void *arg)
2308 {
2309 aggr_grp_t *grp = arg;
2310 aggr_port_t *port;
2311 mac_perim_handle_t mph, pmph;
2312
2313 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2314
2315 /*
2316 * Attempts to start all configured members of the group.
2317 * Group members will be attached when their link-up notification
2318 * is received.
2319 */
2320 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2321 mac_perim_enter_by_mh(port->lp_mh, &pmph);
2322 if (aggr_port_start(port) != 0) {
2323 mac_perim_exit(pmph);
2324 continue;
2325 }
2326
2327 /*
2328 * Turn on the promiscuous mode if it is required to receive
2329 * the non-primary address over a port, or the promiscous
2330 * mode is enabled over the aggr.
2331 */
2332 if (grp->lg_promisc || port->lp_prom_addr != NULL) {
2333 if (aggr_port_promisc(port, B_TRUE) != 0)
2334 aggr_port_stop(port);
2335 }
2336 mac_perim_exit(pmph);
2337 }
2338
2339 grp->lg_started = B_TRUE;
2340
2341 mac_perim_exit(mph);
2342 return (0);
2343 }
2344
2345 static void
aggr_m_stop(void * arg)2346 aggr_m_stop(void *arg)
2347 {
2348 aggr_grp_t *grp = arg;
2349 aggr_port_t *port;
2350 mac_perim_handle_t mph, pmph;
2351
2352 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2353
2354 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2355 mac_perim_enter_by_mh(port->lp_mh, &pmph);
2356
2357 /* reset port promiscuous mode */
2358 (void) aggr_port_promisc(port, B_FALSE);
2359
2360 aggr_port_stop(port);
2361 mac_perim_exit(pmph);
2362 }
2363
2364 grp->lg_started = B_FALSE;
2365 mac_perim_exit(mph);
2366 }
2367
/*
 * Turn promiscuous mode on or off for the aggregation (`arg' is the
 * group).  Always returns 0: a port whose promiscuity cannot be changed
 * is detached from the group instead of failing the request.
 *
 * Nothing is done when the requested state equals grp->lg_promisc.
 */
static int
aggr_m_promisc(void *arg, boolean_t on)
{
	aggr_grp_t *grp = arg;
	aggr_port_t *port;
	boolean_t link_state_changed = B_FALSE;
	mac_perim_handle_t mph, pmph;

	AGGR_GRP_REFHOLD(grp);
	mac_perim_enter_by_mh(grp->lg_mh, &mph);

	ASSERT(!grp->lg_closing);

	if (on == grp->lg_promisc)
		goto bail;

	for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
		int err = 0;

		mac_perim_enter_by_mh(port->lp_mh, &pmph);
		AGGR_PORT_REFHOLD(port);
		/*
		 * Turning off: ports with lp_prom_addr set are left alone.
		 * Turning on: only ports that have been started are
		 * changed.  In every other case err stays 0.
		 */
		if (!on && (port->lp_prom_addr == NULL))
			err = aggr_port_promisc(port, B_FALSE);
		else if (on && port->lp_started)
			err = aggr_port_promisc(port, B_TRUE);

		if (err != 0) {
			if (aggr_grp_detach_port(grp, port))
				link_state_changed = B_TRUE;
		} else {
			/*
			 * If a port was detached because of a previous
			 * failure changing the promiscuity, the port
			 * is reattached when it successfully changes
			 * the promiscuity now, and this might cause
			 * the link state of the aggregation to change.
			 */
			if (aggr_grp_attach_port(grp, port))
				link_state_changed = B_TRUE;
		}
		mac_perim_exit(pmph);
		AGGR_PORT_REFRELE(port);
	}

	grp->lg_promisc = on;

	/* Notify the MAC layer once, after all ports have been processed. */
	if (link_state_changed)
		mac_link_update(grp->lg_mh, grp->lg_link_state);

bail:
	mac_perim_exit(mph);
	AGGR_GRP_REFRELE(grp);

	return (0);
}
2423
2424 static void
aggr_grp_port_rename(const char * new_name,void * arg)2425 aggr_grp_port_rename(const char *new_name, void *arg)
2426 {
2427 /*
2428 * aggr port's mac client name is the format of "aggr link name" plus
2429 * AGGR_PORT_NAME_DELIMIT plus "underneath link name".
2430 */
2431 int aggr_len, link_len, clnt_name_len, i;
2432 char *str_end, *str_st, *str_del;
2433 char aggr_name[MAXNAMELEN];
2434 char link_name[MAXNAMELEN];
2435 char *clnt_name;
2436 aggr_grp_t *aggr_grp = arg;
2437 aggr_port_t *aggr_port = aggr_grp->lg_ports;
2438
2439 for (i = 0; i < aggr_grp->lg_nports; i++) {
2440 clnt_name = mac_client_name(aggr_port->lp_mch);
2441 clnt_name_len = strlen(clnt_name);
2442 str_st = clnt_name;
2443 str_end = &(clnt_name[clnt_name_len]);
2444 str_del = strchr(str_st, AGGR_PORT_NAME_DELIMIT);
2445 ASSERT(str_del != NULL);
2446 aggr_len = (intptr_t)((uintptr_t)str_del - (uintptr_t)str_st);
2447 link_len = (intptr_t)((uintptr_t)str_end - (uintptr_t)str_del);
2448 bzero(aggr_name, MAXNAMELEN);
2449 bzero(link_name, MAXNAMELEN);
2450 bcopy(clnt_name, aggr_name, aggr_len);
2451 bcopy(str_del, link_name, link_len + 1);
2452 bzero(clnt_name, MAXNAMELEN);
2453 (void) snprintf(clnt_name, MAXNAMELEN, "%s%s", new_name,
2454 link_name);
2455
2456 (void) mac_rename_primary(aggr_port->lp_mh, NULL);
2457 aggr_port = aggr_port->lp_next;
2458 }
2459 }
2460
2461 /*
2462 * Initialize the capabilities that are advertised for the group
2463 * according to the capabilities of the constituent ports.
2464 */
2465 static boolean_t
aggr_m_capab_get(void * arg,mac_capab_t cap,void * cap_data)2466 aggr_m_capab_get(void *arg, mac_capab_t cap, void *cap_data)
2467 {
2468 aggr_grp_t *grp = arg;
2469
2470 switch (cap) {
2471 case MAC_CAPAB_HCKSUM: {
2472 uint32_t *hcksum_txflags = cap_data;
2473 *hcksum_txflags = grp->lg_hcksum_txflags;
2474 break;
2475 }
2476 case MAC_CAPAB_LSO: {
2477 mac_capab_lso_t *cap_lso = cap_data;
2478
2479 if (grp->lg_lso) {
2480 *cap_lso = grp->lg_cap_lso;
2481 break;
2482 } else {
2483 return (B_FALSE);
2484 }
2485 }
2486 case MAC_CAPAB_NO_NATIVEVLAN:
2487 return (!grp->lg_vlan);
2488 case MAC_CAPAB_NO_ZCOPY:
2489 return (!grp->lg_zcopy);
2490 case MAC_CAPAB_RINGS: {
2491 mac_capab_rings_t *cap_rings = cap_data;
2492 uint_t ring_cnt = 0;
2493
2494 for (uint_t i = 0; i < grp->lg_rx_group_count; i++)
2495 ring_cnt += grp->lg_rx_groups[i].arg_ring_cnt;
2496
2497 if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
2498 cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2499 cap_rings->mr_rnum = ring_cnt;
2500 cap_rings->mr_gnum = grp->lg_rx_group_count;
2501 cap_rings->mr_gaddring = NULL;
2502 cap_rings->mr_gremring = NULL;
2503 } else {
2504 cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2505 cap_rings->mr_rnum = grp->lg_tx_group.atg_ring_cnt;
2506 cap_rings->mr_gnum = 0;
2507 }
2508 cap_rings->mr_rget = aggr_fill_ring;
2509 cap_rings->mr_gget = aggr_fill_group;
2510 break;
2511 }
2512 case MAC_CAPAB_AGGR:
2513 {
2514 mac_capab_aggr_t *aggr_cap;
2515
2516 if (cap_data != NULL) {
2517 aggr_cap = cap_data;
2518 aggr_cap->mca_rename_fn = aggr_grp_port_rename;
2519 aggr_cap->mca_unicst = aggr_m_unicst;
2520 aggr_cap->mca_find_tx_ring_fn = aggr_find_tx_ring;
2521 aggr_cap->mca_arg = arg;
2522 }
2523 return (B_TRUE);
2524 }
2525 default:
2526 return (B_FALSE);
2527 }
2528 return (B_TRUE);
2529 }
2530
2531 /*
2532 * Callback function for MAC layer to register groups.
2533 */
2534 static void
aggr_fill_group(void * arg,mac_ring_type_t rtype,const int index,mac_group_info_t * infop,mac_group_handle_t gh)2535 aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index,
2536 mac_group_info_t *infop, mac_group_handle_t gh)
2537 {
2538 aggr_grp_t *grp = arg;
2539
2540 if (rtype == MAC_RING_TYPE_RX) {
2541 aggr_pseudo_rx_group_t *rx_group = &grp->lg_rx_groups[index];
2542
2543 rx_group->arg_gh = gh;
2544 rx_group->arg_grp = grp;
2545
2546 infop->mgi_driver = (mac_group_driver_t)rx_group;
2547 infop->mgi_start = NULL;
2548 infop->mgi_stop = NULL;
2549 infop->mgi_addmac = aggr_addmac;
2550 infop->mgi_remmac = aggr_remmac;
2551 infop->mgi_count = rx_group->arg_ring_cnt;
2552
2553 /*
2554 * Always set the HW VLAN callbacks. They are smart
2555 * enough to know when a port has HW VLAN filters to
2556 * program and when it doesn't.
2557 */
2558 infop->mgi_addvlan = aggr_addvlan;
2559 infop->mgi_remvlan = aggr_remvlan;
2560 } else {
2561 aggr_pseudo_tx_group_t *tx_group = &grp->lg_tx_group;
2562
2563 ASSERT3S(index, ==, 0);
2564 tx_group->atg_gh = gh;
2565 }
2566 }
2567
2568 /*
2569 * Callback funtion for MAC layer to register all rings.
2570 */
2571 static void
aggr_fill_ring(void * arg,mac_ring_type_t rtype,const int rg_index,const int index,mac_ring_info_t * infop,mac_ring_handle_t rh)2572 aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index,
2573 const int index, mac_ring_info_t *infop, mac_ring_handle_t rh)
2574 {
2575 aggr_grp_t *grp = arg;
2576
2577 switch (rtype) {
2578 case MAC_RING_TYPE_RX: {
2579 aggr_pseudo_rx_group_t *rx_group;
2580 aggr_pseudo_rx_ring_t *rx_ring;
2581 mac_intr_t aggr_mac_intr;
2582
2583 rx_group = &grp->lg_rx_groups[rg_index];
2584 ASSERT3S(index, >=, 0);
2585 ASSERT3S(index, <, rx_group->arg_ring_cnt);
2586 rx_ring = rx_group->arg_rings + index;
2587 rx_ring->arr_rh = rh;
2588
2589 /*
2590 * Entrypoint to enable interrupt (disable poll) and
2591 * disable interrupt (enable poll).
2592 */
2593 aggr_mac_intr.mi_handle = (mac_intr_handle_t)rx_ring;
2594 aggr_mac_intr.mi_enable = aggr_pseudo_enable_intr;
2595 aggr_mac_intr.mi_disable = aggr_pseudo_disable_intr;
2596 aggr_mac_intr.mi_ddi_handle = NULL;
2597
2598 infop->mri_driver = (mac_ring_driver_t)rx_ring;
2599 infop->mri_start = aggr_pseudo_start_rx_ring;
2600 infop->mri_stop = aggr_pseudo_stop_rx_ring;
2601
2602 infop->mri_intr = aggr_mac_intr;
2603 infop->mri_poll = aggr_rx_poll;
2604
2605 infop->mri_stat = aggr_rx_ring_stat;
2606 break;
2607 }
2608 case MAC_RING_TYPE_TX: {
2609 aggr_pseudo_tx_group_t *tx_group = &grp->lg_tx_group;
2610 aggr_pseudo_tx_ring_t *tx_ring;
2611
2612 ASSERT(rg_index == -1);
2613 ASSERT(index < tx_group->atg_ring_cnt);
2614
2615 tx_ring = &tx_group->atg_rings[index];
2616 tx_ring->atr_rh = rh;
2617
2618 infop->mri_driver = (mac_ring_driver_t)tx_ring;
2619 infop->mri_start = NULL;
2620 infop->mri_stop = NULL;
2621 infop->mri_tx = aggr_ring_tx;
2622 infop->mri_stat = aggr_tx_ring_stat;
2623 /*
2624 * Use the hw TX ring handle to find if the ring needs
2625 * serialization or not. For NICs that do not expose
2626 * Tx rings, atr_hw_rh will be NULL.
2627 */
2628 if (tx_ring->atr_hw_rh != NULL) {
2629 infop->mri_flags =
2630 mac_hwring_getinfo(tx_ring->atr_hw_rh);
2631 }
2632 break;
2633 }
2634 default:
2635 break;
2636 }
2637 }
2638
/*
 * Poll entry point for a pseudo Rx ring (`arg').  Polls the underlying
 * hardware ring, passing bytes_to_pickup through to mac_hwring_poll(),
 * and returns the resulting chain of packets linked through b_next.
 *
 * When LACP is off the chain is returned untouched.  Otherwise the
 * chain is filtered in place:
 *   - packets whose first mblk holds at least an Ethernet header and
 *     whose ethertype is ETHERTYPE_SLOW are unlinked and handed to
 *     aggr_recv_lacp();
 *   - all remaining packets are unlinked and freed if the port's
 *     collector is not enabled.
 */
static mblk_t *
aggr_rx_poll(void *arg, int bytes_to_pickup)
{
	aggr_pseudo_rx_ring_t *rr_ring = arg;
	aggr_port_t *port = rr_ring->arr_port;
	aggr_grp_t *grp = port->lp_grp;
	mblk_t *mp_chain, *mp, **mpp;

	mp_chain = mac_hwring_poll(rr_ring->arr_hw_rh, bytes_to_pickup);

	if (grp->lg_lacp_mode == AGGR_LACP_OFF)
		return (mp_chain);

	/*
	 * mpp always points at the link (the chain head or a b_next
	 * field) that refers to the packet under inspection, so a packet
	 * is removed by overwriting *mpp with its successor.
	 */
	mpp = &mp_chain;
	while ((mp = *mpp) != NULL) {
		if (MBLKL(mp) >= sizeof (struct ether_header)) {
			struct ether_header *ehp;

			ehp = (struct ether_header *)mp->b_rptr;
			if (ntohs(ehp->ether_type) == ETHERTYPE_SLOW) {
				*mpp = mp->b_next;
				mp->b_next = NULL;
				aggr_recv_lacp(port,
				    (mac_resource_handle_t)rr_ring, mp);
				/* mpp already refers to the next packet. */
				continue;
			}
		}

		if (!port->lp_collector_enabled) {
			*mpp = mp->b_next;
			mp->b_next = NULL;
			freemsg(mp);
			continue;
		}
		/* Packet stays on the chain; advance to its b_next link. */
		mpp = &mp->b_next;
	}
	return (mp_chain);
}
2677
2678 static int
aggr_addmac(void * arg,const uint8_t * mac_addr)2679 aggr_addmac(void *arg, const uint8_t *mac_addr)
2680 {
2681 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg;
2682 aggr_unicst_addr_t *addr, **pprev;
2683 aggr_grp_t *grp = rx_group->arg_grp;
2684 aggr_port_t *port, *p;
2685 mac_perim_handle_t mph;
2686 int err = 0;
2687 uint_t idx = rx_group->arg_index;
2688
2689 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2690
2691 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
2692 mac_perim_exit(mph);
2693 return (0);
2694 }
2695
2696 /*
2697 * Insert this mac address into the list of mac addresses owned by
2698 * the aggregation pseudo group.
2699 */
2700 pprev = &rx_group->arg_macaddr;
2701 while ((addr = *pprev) != NULL) {
2702 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) == 0) {
2703 mac_perim_exit(mph);
2704 return (EEXIST);
2705 }
2706 pprev = &addr->aua_next;
2707 }
2708 addr = kmem_alloc(sizeof (aggr_unicst_addr_t), KM_SLEEP);
2709 bcopy(mac_addr, addr->aua_addr, ETHERADDRL);
2710 addr->aua_next = NULL;
2711 *pprev = addr;
2712
2713 for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2714 if ((err = aggr_port_addmac(port, idx, mac_addr)) != 0)
2715 break;
2716
2717 if (err != 0) {
2718 for (p = grp->lg_ports; p != port; p = p->lp_next)
2719 aggr_port_remmac(p, idx, mac_addr);
2720
2721 *pprev = NULL;
2722 kmem_free(addr, sizeof (aggr_unicst_addr_t));
2723 }
2724
2725 mac_perim_exit(mph);
2726 return (err);
2727 }
2728
2729 static int
aggr_remmac(void * arg,const uint8_t * mac_addr)2730 aggr_remmac(void *arg, const uint8_t *mac_addr)
2731 {
2732 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg;
2733 aggr_unicst_addr_t *addr, **pprev;
2734 aggr_grp_t *grp = rx_group->arg_grp;
2735 aggr_port_t *port;
2736 mac_perim_handle_t mph;
2737 int err = 0;
2738
2739 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2740
2741 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
2742 mac_perim_exit(mph);
2743 return (0);
2744 }
2745
2746 /*
2747 * Insert this mac address into the list of mac addresses owned by
2748 * the aggregation pseudo group.
2749 */
2750 pprev = &rx_group->arg_macaddr;
2751 while ((addr = *pprev) != NULL) {
2752 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) != 0) {
2753 pprev = &addr->aua_next;
2754 continue;
2755 }
2756 break;
2757 }
2758 if (addr == NULL) {
2759 mac_perim_exit(mph);
2760 return (EINVAL);
2761 }
2762
2763 for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2764 aggr_port_remmac(port, rx_group->arg_index, mac_addr);
2765
2766 *pprev = addr->aua_next;
2767 kmem_free(addr, sizeof (aggr_unicst_addr_t));
2768
2769 mac_perim_exit(mph);
2770 return (err);
2771 }
2772
2773 /*
2774 * Search for VID in the Rx group's list and return a pointer if
2775 * found. Otherwise return NULL.
2776 */
2777 static aggr_vlan_t *
aggr_find_vlan(aggr_pseudo_rx_group_t * rx_group,uint16_t vid)2778 aggr_find_vlan(aggr_pseudo_rx_group_t *rx_group, uint16_t vid)
2779 {
2780 ASSERT(MAC_PERIM_HELD(rx_group->arg_grp->lg_mh));
2781 for (aggr_vlan_t *avp = list_head(&rx_group->arg_vlans); avp != NULL;
2782 avp = list_next(&rx_group->arg_vlans, avp)) {
2783 if (avp->av_vid == vid)
2784 return (avp);
2785 }
2786
2787 return (NULL);
2788 }
2789
/*
 * Accept traffic on the specified VID.
 *
 * Persist VLAN state in the aggr so that ports added later will
 * receive the correct filters. In the future it would be nice to
 * allow aggr to iterate its clients instead of duplicating state.
 *
 * Returns 0 on success (including the case where the VID is already
 * present and only its reference count is bumped), or the error from
 * the first port that failed to add the VLAN.  On failure the ports
 * already updated are rolled back and the bookkeeping is undone.
 */
static int
aggr_addvlan(mac_group_driver_t gdriver, uint16_t vid)
{
	aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)gdriver;
	aggr_grp_t *aggr = rx_group->arg_grp;
	aggr_port_t *port, *p;
	mac_perim_handle_t mph;
	int err = 0;
	aggr_vlan_t *avp = NULL;
	uint_t idx = rx_group->arg_index;

	mac_perim_enter_by_mh(aggr->lg_mh, &mph);

	if (vid == MAC_VLAN_UNTAGGED) {
		/*
		 * Aggr is both a MAC provider and MAC client. As a
		 * MAC provider it is passed MAC_VLAN_UNTAGGED by its
		 * client. As a client itself, it should pass
		 * VLAN_ID_NONE to its ports.
		 *
		 * Untagged use is tracked by the arg_untagged counter
		 * rather than by an entry on arg_vlans, so avp stays NULL.
		 */
		vid = VLAN_ID_NONE;
		rx_group->arg_untagged++;
		goto update_ports;
	}

	avp = aggr_find_vlan(rx_group, vid);

	/* VID already known: take another reference, ports are untouched. */
	if (avp != NULL) {
		avp->av_refs++;
		mac_perim_exit(mph);
		return (0);
	}

	/* First use of this VID; the entry is linked in only on success. */
	avp = kmem_zalloc(sizeof (aggr_vlan_t), KM_SLEEP);
	avp->av_vid = vid;
	avp->av_refs = 1;

update_ports:
	for (port = aggr->lg_ports; port != NULL; port = port->lp_next)
		if ((err = aggr_port_addvlan(port, idx, vid)) != 0)
			break;

	if (err != 0) {
		/*
		 * If any of these calls fail then we are in a
		 * situation where the ports have different HW state.
		 * There's no reasonable action the MAC client can
		 * take in this scenario to rectify the situation.
		 *
		 * Roll back the ports that preceded the failing one; a
		 * rollback failure is only logged.
		 */
		for (p = aggr->lg_ports; p != port; p = p->lp_next) {
			int err2;

			if ((err2 = aggr_port_remvlan(p, idx, vid)) != 0) {
				cmn_err(CE_WARN, "Failed to remove VLAN %u"
				    " from port %s: errno %d.", vid,
				    mac_client_name(p->lp_mch), err2);
			}

		}

		if (vid == VLAN_ID_NONE)
			rx_group->arg_untagged--;

		if (avp != NULL) {
			kmem_free(avp, sizeof (aggr_vlan_t));
			avp = NULL;
		}
	}

	/* avp is non-NULL here only for a newly added, successful VID. */
	if (avp != NULL)
		list_insert_tail(&rx_group->arg_vlans, avp);

	mac_perim_exit(mph);
	return (err);
}
2872
/*
 * Stop accepting traffic on this VLAN if it's the last use of this VLAN.
 *
 * Returns 0 on success (including when other references to the VID
 * remain and the ports are left untouched), ENOENT when a tagged VID is
 * not on the group's VLAN list, or the error from the first port that
 * failed to remove the VLAN.  On a port failure the ports already
 * updated are rolled back and the reference counts are restored.
 */
static int
aggr_remvlan(mac_group_driver_t gdriver, uint16_t vid)
{
	aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)gdriver;
	aggr_grp_t *aggr = rx_group->arg_grp;
	aggr_port_t *port, *p;
	mac_perim_handle_t mph;
	int err = 0;
	aggr_vlan_t *avp = NULL;
	uint_t idx = rx_group->arg_index;

	mac_perim_enter_by_mh(aggr->lg_mh, &mph);

	/*
	 * See the comment in aggr_addvlan().
	 */
	if (vid == MAC_VLAN_UNTAGGED) {
		vid = VLAN_ID_NONE;
		rx_group->arg_untagged--;

		/* Other untagged users remain; leave the ports alone. */
		if (rx_group->arg_untagged > 0)
			goto done;

		goto update_ports;
	}

	avp = aggr_find_vlan(rx_group, vid);

	if (avp == NULL) {
		err = ENOENT;
		goto done;
	}

	avp->av_refs--;

	/* Other references to this VID remain; leave the ports alone. */
	if (avp->av_refs > 0)
		goto done;

update_ports:
	for (port = aggr->lg_ports; port != NULL; port = port->lp_next)
		if ((err = aggr_port_remvlan(port, idx, vid)) != 0)
			break;

	/*
	 * As in aggr_addvlan(), a failure part-way through leaves the
	 * ports with different HW state.  Re-add the VLAN on the ports
	 * that preceded the failing one (a failure there is only logged)
	 * and restore the reference counts dropped above.
	 */
	if (err != 0) {
		for (p = aggr->lg_ports; p != port; p = p->lp_next) {
			int err2;

			if ((err2 = aggr_port_addvlan(p, idx, vid)) != 0) {
				cmn_err(CE_WARN, "Failed to add VLAN %u"
				    " to port %s: errno %d.", vid,
				    mac_client_name(p->lp_mch), err2);
			}
		}

		if (avp != NULL)
			avp->av_refs++;

		if (vid == VLAN_ID_NONE)
			rx_group->arg_untagged++;

		goto done;
	}

	/* Last reference to a tagged VID: drop its list entry. */
	if (err == 0 && avp != NULL) {
		VERIFY3U(avp->av_refs, ==, 0);
		list_remove(&rx_group->arg_vlans, avp);
		kmem_free(avp, sizeof (aggr_vlan_t));
	}

done:
	mac_perim_exit(mph);
	return (err);
}
2953
2954 /*
2955 * Add or remove the multicast addresses that are defined for the group
2956 * to or from the specified port.
2957 *
2958 * Note that aggr_grp_multicst_port(..., B_TRUE) is called when the port
2959 * is started and attached, and aggr_grp_multicst_port(..., B_FALSE) is
2960 * called when the port is either stopped or detached.
2961 */
2962 void
aggr_grp_multicst_port(aggr_port_t * port,boolean_t add)2963 aggr_grp_multicst_port(aggr_port_t *port, boolean_t add)
2964 {
2965 aggr_grp_t *grp = port->lp_grp;
2966
2967 ASSERT(MAC_PERIM_HELD(port->lp_mh));
2968 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
2969
2970 if (!port->lp_started || port->lp_state != AGGR_PORT_STATE_ATTACHED)
2971 return;
2972
2973 mac_multicast_refresh(grp->lg_mh, aggr_port_multicst, port, add);
2974 }
2975
2976 static int
aggr_m_multicst(void * arg,boolean_t add,const uint8_t * addrp)2977 aggr_m_multicst(void *arg, boolean_t add, const uint8_t *addrp)
2978 {
2979 aggr_grp_t *grp = arg;
2980 aggr_port_t *port = NULL, *errport = NULL;
2981 mac_perim_handle_t mph;
2982 int err = 0;
2983
2984 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2985 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2986 if (port->lp_state != AGGR_PORT_STATE_ATTACHED ||
2987 !port->lp_started) {
2988 continue;
2989 }
2990 err = aggr_port_multicst(port, add, addrp);
2991 if (err != 0) {
2992 errport = port;
2993 break;
2994 }
2995 }
2996
2997 /*
2998 * At least one port caused error return and this error is returned to
2999 * mac, eventually a NAK would be sent upwards.
3000 * Some ports have this multicast address listed now, and some don't.
3001 * Treat this error as a whole aggr failure not individual port failure.
3002 * Therefore remove this multicast address from other ports.
3003 */
3004 if ((err != 0) && add) {
3005 for (port = grp->lg_ports; port != errport;
3006 port = port->lp_next) {
3007 if (port->lp_state != AGGR_PORT_STATE_ATTACHED ||
3008 !port->lp_started) {
3009 continue;
3010 }
3011 (void) aggr_port_multicst(port, B_FALSE, addrp);
3012 }
3013 }
3014 mac_perim_exit(mph);
3015 return (err);
3016 }
3017
/*
 * Set the unicast MAC address of the aggregation (`arg' is the group)
 * to macaddr.  The change is made by aggr_grp_modify_common() with the
 * AGGR_MODIFY_MAC mask while holding the group's MAC perimeter; its
 * return value is passed back to the caller.
 */
static int
aggr_m_unicst(void *arg, const uint8_t *macaddr)
{
	aggr_grp_t *grp = arg;
	mac_perim_handle_t mph;
	int err;

	mac_perim_enter_by_mh(grp->lg_mh, &mph);
	/*
	 * NOTE(review): the B_TRUE argument presumably marks the address
	 * as fixed -- confirm against aggr_grp_modify_common().
	 */
	err = aggr_grp_modify_common(grp, AGGR_MODIFY_MAC, 0, B_TRUE, macaddr,
	    0, 0);
	mac_perim_exit(mph);
	return (err);
}
3031
3032 /*
3033 * Initialize the capabilities that are advertised for the group
3034 * according to the capabilities of the constituent ports.
3035 */
3036 static void
aggr_grp_capab_set(aggr_grp_t * grp)3037 aggr_grp_capab_set(aggr_grp_t *grp)
3038 {
3039 uint32_t cksum;
3040 aggr_port_t *port;
3041 mac_capab_lso_t cap_lso;
3042
3043 ASSERT(grp->lg_mh == NULL);
3044 ASSERT(grp->lg_ports != NULL);
3045
3046 grp->lg_hcksum_txflags = (uint32_t)-1;
3047 grp->lg_zcopy = B_TRUE;
3048 grp->lg_vlan = B_TRUE;
3049
3050 grp->lg_lso = B_TRUE;
3051 grp->lg_cap_lso.lso_flags = (t_uscalar_t)-1;
3052 grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max = (t_uscalar_t)-1;
3053
3054 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
3055 if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &cksum))
3056 cksum = 0;
3057 grp->lg_hcksum_txflags &= cksum;
3058
3059 grp->lg_vlan &=
3060 !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL);
3061
3062 grp->lg_zcopy &=
3063 !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL);
3064
3065 grp->lg_lso &=
3066 mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso);
3067 if (grp->lg_lso) {
3068 grp->lg_cap_lso.lso_flags &= cap_lso.lso_flags;
3069 if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max >
3070 cap_lso.lso_basic_tcp_ipv4.lso_max)
3071 grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max =
3072 cap_lso.lso_basic_tcp_ipv4.lso_max;
3073 }
3074 }
3075 }
3076
3077 /*
3078 * Checks whether the capabilities of the port being added are compatible
3079 * with the current capabilities of the aggregation.
3080 */
3081 static boolean_t
aggr_grp_capab_check(aggr_grp_t * grp,aggr_port_t * port)3082 aggr_grp_capab_check(aggr_grp_t *grp, aggr_port_t *port)
3083 {
3084 uint32_t hcksum_txflags;
3085
3086 ASSERT(grp->lg_ports != NULL);
3087
3088 if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL)) &
3089 grp->lg_vlan) != grp->lg_vlan) {
3090 return (B_FALSE);
3091 }
3092
3093 if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL)) &
3094 grp->lg_zcopy) != grp->lg_zcopy) {
3095 return (B_FALSE);
3096 }
3097
3098 if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &hcksum_txflags)) {
3099 if (grp->lg_hcksum_txflags != 0)
3100 return (B_FALSE);
3101 } else if ((hcksum_txflags & grp->lg_hcksum_txflags) !=
3102 grp->lg_hcksum_txflags) {
3103 return (B_FALSE);
3104 }
3105
3106 if (grp->lg_lso) {
3107 mac_capab_lso_t cap_lso;
3108
3109 if (mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso)) {
3110 if ((grp->lg_cap_lso.lso_flags & cap_lso.lso_flags) !=
3111 grp->lg_cap_lso.lso_flags)
3112 return (B_FALSE);
3113 if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max >
3114 cap_lso.lso_basic_tcp_ipv4.lso_max)
3115 return (B_FALSE);
3116 } else {
3117 return (B_FALSE);
3118 }
3119 }
3120
3121 return (B_TRUE);
3122 }
3123
3124 /*
3125 * Returns the maximum SDU according to the SDU of the constituent ports.
3126 */
3127 static uint_t
aggr_grp_max_sdu(aggr_grp_t * grp)3128 aggr_grp_max_sdu(aggr_grp_t *grp)
3129 {
3130 uint_t max_sdu = (uint_t)-1;
3131 aggr_port_t *port;
3132
3133 ASSERT(grp->lg_ports != NULL);
3134
3135 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
3136 uint_t port_sdu_max;
3137
3138 mac_sdu_get(port->lp_mh, NULL, &port_sdu_max);
3139 if (max_sdu > port_sdu_max)
3140 max_sdu = port_sdu_max;
3141 }
3142
3143 return (max_sdu);
3144 }
3145
3146 /*
3147 * Checks if the maximum SDU of the specified port is compatible
3148 * with the maximum SDU of the specified aggregation group, returns
3149 * B_TRUE if it is, B_FALSE otherwise.
3150 */
3151 static boolean_t
aggr_grp_sdu_check(aggr_grp_t * grp,aggr_port_t * port)3152 aggr_grp_sdu_check(aggr_grp_t *grp, aggr_port_t *port)
3153 {
3154 uint_t port_sdu_max;
3155
3156 mac_sdu_get(port->lp_mh, NULL, &port_sdu_max);
3157 return (port_sdu_max >= grp->lg_max_sdu);
3158 }
3159
3160 /*
3161 * Returns the maximum margin according to the margin of the constituent ports.
3162 */
3163 static uint32_t
aggr_grp_max_margin(aggr_grp_t * grp)3164 aggr_grp_max_margin(aggr_grp_t *grp)
3165 {
3166 uint32_t margin = UINT32_MAX;
3167 aggr_port_t *port;
3168
3169 ASSERT(grp->lg_mh == NULL);
3170 ASSERT(grp->lg_ports != NULL);
3171
3172 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
3173 if (margin > port->lp_margin)
3174 margin = port->lp_margin;
3175 }
3176
3177 grp->lg_margin = margin;
3178 return (margin);
3179 }
3180
3181 /*
3182 * Checks if the maximum margin of the specified port is compatible
3183 * with the maximum margin of the specified aggregation group, returns
3184 * B_TRUE if it is, B_FALSE otherwise.
3185 */
3186 static boolean_t
aggr_grp_margin_check(aggr_grp_t * grp,aggr_port_t * port)3187 aggr_grp_margin_check(aggr_grp_t *grp, aggr_port_t *port)
3188 {
3189 if (port->lp_margin >= grp->lg_margin)
3190 return (B_TRUE);
3191
3192 /*
3193 * See whether the current margin value is allowed to be changed to
3194 * the new value.
3195 */
3196 if (!mac_margin_update(grp->lg_mh, port->lp_margin))
3197 return (B_FALSE);
3198
3199 grp->lg_margin = port->lp_margin;
3200 return (B_TRUE);
3201 }
3202
3203 /*
3204 * Set MTU on individual ports of an aggregation group
3205 */
3206 static int
aggr_set_port_sdu(aggr_grp_t * grp,aggr_port_t * port,uint32_t sdu,uint32_t * old_mtu)3207 aggr_set_port_sdu(aggr_grp_t *grp, aggr_port_t *port, uint32_t sdu,
3208 uint32_t *old_mtu)
3209 {
3210 boolean_t removed = B_FALSE;
3211 mac_perim_handle_t mph;
3212 mac_diag_t diag;
3213 int err, rv, retry = 0;
3214
3215 if (port->lp_mah != NULL) {
3216 (void) mac_unicast_remove(port->lp_mch, port->lp_mah);
3217 port->lp_mah = NULL;
3218 removed = B_TRUE;
3219 }
3220 err = mac_set_mtu(port->lp_mh, sdu, old_mtu);
3221 try_again:
3222 if (removed && (rv = mac_unicast_add(port->lp_mch, NULL,
3223 MAC_UNICAST_PRIMARY | MAC_UNICAST_DISABLE_TX_VID_CHECK,
3224 &port->lp_mah, 0, &diag)) != 0) {
3225 /*
3226 * following is a workaround for a bug in 'bge' driver.
3227 * See CR 6794654 for more information and this work around
3228 * will be removed once the CR is fixed.
3229 */
3230 if (rv == EIO && retry++ < 3) {
3231 delay(2 * hz);
3232 goto try_again;
3233 }
3234 /*
3235 * if mac_unicast_add() failed while setting the MTU,
3236 * detach the port from the group.
3237 */
3238 mac_perim_enter_by_mh(port->lp_mh, &mph);
3239 (void) aggr_grp_detach_port(grp, port);
3240 mac_perim_exit(mph);
3241 cmn_err(CE_WARN, "Unable to restart the port %s while "
3242 "setting MTU. Detaching the port from the aggregation.",
3243 mac_client_name(port->lp_mch));
3244 }
3245 return (err);
3246 }
3247
3248 static int
aggr_sdu_update(aggr_grp_t * grp,uint32_t sdu)3249 aggr_sdu_update(aggr_grp_t *grp, uint32_t sdu)
3250 {
3251 int err = 0, i, rv;
3252 aggr_port_t *port;
3253 uint32_t *mtu;
3254
3255 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
3256
3257 /*
3258 * If the MTU being set is equal to aggr group's maximum
3259 * allowable value, then there is nothing to change
3260 */
3261 if (sdu == grp->lg_max_sdu)
3262 return (0);
3263
3264 /* 0 is aggr group's min sdu */
3265 if (sdu == 0)
3266 return (EINVAL);
3267
3268 mtu = kmem_alloc(sizeof (uint32_t) * grp->lg_nports, KM_SLEEP);
3269 for (port = grp->lg_ports, i = 0; port != NULL && err == 0;
3270 port = port->lp_next, i++) {
3271 err = aggr_set_port_sdu(grp, port, sdu, mtu + i);
3272 }
3273 if (err != 0) {
3274 /* recover from error: reset the mtus of the ports */
3275 aggr_port_t *tmp;
3276
3277 for (tmp = grp->lg_ports, i = 0; tmp != port;
3278 tmp = tmp->lp_next, i++) {
3279 (void) aggr_set_port_sdu(grp, tmp, *(mtu + i), NULL);
3280 }
3281 goto bail;
3282 }
3283 grp->lg_max_sdu = aggr_grp_max_sdu(grp);
3284 rv = mac_maxsdu_update(grp->lg_mh, grp->lg_max_sdu);
3285 ASSERT(rv == 0);
3286 bail:
3287 kmem_free(mtu, sizeof (uint32_t) * grp->lg_nports);
3288 return (err);
3289 }
3290
3291 /*
3292 * Callback functions for set/get of properties
3293 */
3294 /*ARGSUSED*/
3295 static int
aggr_m_setprop(void * m_driver,const char * pr_name,mac_prop_id_t pr_num,uint_t pr_valsize,const void * pr_val)3296 aggr_m_setprop(void *m_driver, const char *pr_name, mac_prop_id_t pr_num,
3297 uint_t pr_valsize, const void *pr_val)
3298 {
3299 int err = ENOTSUP;
3300 aggr_grp_t *grp = m_driver;
3301
3302 switch (pr_num) {
3303 case MAC_PROP_MTU: {
3304 uint32_t mtu;
3305
3306 if (pr_valsize < sizeof (mtu)) {
3307 err = EINVAL;
3308 break;
3309 }
3310 bcopy(pr_val, &mtu, sizeof (mtu));
3311 err = aggr_sdu_update(grp, mtu);
3312 break;
3313 }
3314 default:
3315 break;
3316 }
3317 return (err);
3318 }
3319
3320 typedef struct rboundary {
3321 uint32_t bval;
3322 int btype;
3323 } rboundary_t;
3324
3325 /*
3326 * This function finds the intersection of mtu ranges stored in arrays -
3327 * mrange[0] ... mrange[mcount -1]. It returns the intersection in rval.
3328 * Individual arrays are assumed to contain non-overlapping ranges.
3329 * Algorithm:
3330 * A range has two boundaries - min and max. We scan all arrays and store
3331 * each boundary as a separate element in a temporary array. We also store
3332 * the boundary types, min or max, as +1 or -1 respectively in the temporary
3333 * array. Then we sort the temporary array in ascending order. We scan the
3334 * sorted array from lower to higher values and keep a cumulative sum of
3335 * boundary types. Element in the temporary array for which the sum reaches
3336 * mcount is a min boundary of a range in the result and next element will be
3337 * max boundary.
3338 *
3339 * Example for mcount = 3,
3340 *
3341 * ----|_________|-------|_______|----|__|------ mrange[0]
3342 *
3343 * -------|________|--|____________|-----|___|-- mrange[1]
3344 *
3345 * --------|________________|-------|____|------ mrange[2]
3346 *
3347 * 3 2 1
3348 * \|/
3349 * 1 23 2 1 2 3 2 1 01 2 V 0 <- the sum
3350 * ----|--||-----|-|--|--|--|----|-||-|--|---|-- sorted array
3351 *
3352 * same min and max
3353 * V
3354 * --------|_____|-------|__|------------|------ intersecting ranges
3355 */
3356 void
aggr_mtu_range_intersection(mac_propval_range_t ** mrange,int mcount,mac_propval_uint32_range_t ** prval,int * prmaxcnt,int * prcount)3357 aggr_mtu_range_intersection(mac_propval_range_t **mrange, int mcount,
3358 mac_propval_uint32_range_t **prval, int *prmaxcnt, int *prcount)
3359 {
3360 mac_propval_uint32_range_t *rval, *ur;
3361 int rmaxcnt, rcount;
3362 size_t sz_range32;
3363 rboundary_t *ta; /* temporary array */
3364 rboundary_t temp;
3365 boolean_t range_started = B_FALSE;
3366 int i, j, m, sum;
3367
3368 sz_range32 = sizeof (mac_propval_uint32_range_t);
3369
3370 for (i = 0, rmaxcnt = 0; i < mcount; i++)
3371 rmaxcnt += mrange[i]->mpr_count;
3372
3373 /* Allocate enough space to store the results */
3374 rval = kmem_alloc(rmaxcnt * sz_range32, KM_SLEEP);
3375
3376 /* Number of boundaries are twice as many as ranges */
3377 ta = kmem_alloc(2 * rmaxcnt * sizeof (rboundary_t), KM_SLEEP);
3378
3379 for (i = 0, m = 0; i < mcount; i++) {
3380 ur = &(mrange[i]->mpr_range_uint32[0]);
3381 for (j = 0; j < mrange[i]->mpr_count; j++) {
3382 ta[m].bval = ur[j].mpur_min;
3383 ta[m++].btype = 1;
3384 ta[m].bval = ur[j].mpur_max;
3385 ta[m++].btype = -1;
3386 }
3387 }
3388
3389 /*
3390 * Sort the temporary array in ascending order of bval;
3391 * if boundary values are same then sort on btype.
3392 */
3393 for (i = 0; i < m-1; i++) {
3394 for (j = i+1; j < m; j++) {
3395 if ((ta[i].bval > ta[j].bval) ||
3396 ((ta[i].bval == ta[j].bval) &&
3397 (ta[i].btype < ta[j].btype))) {
3398 temp = ta[i];
3399 ta[i] = ta[j];
3400 ta[j] = temp;
3401 }
3402 }
3403 }
3404
3405 /* Walk through temporary array to find all ranges in the results */
3406 for (i = 0, sum = 0, rcount = 0; i < m; i++) {
3407 sum += ta[i].btype;
3408 if (sum == mcount) {
3409 rval[rcount].mpur_min = ta[i].bval;
3410 range_started = B_TRUE;
3411 } else if (sum < mcount && range_started) {
3412 rval[rcount++].mpur_max = ta[i].bval;
3413 range_started = B_FALSE;
3414 }
3415 }
3416
3417 *prval = rval;
3418 *prmaxcnt = rmaxcnt;
3419 *prcount = rcount;
3420
3421 kmem_free(ta, 2 * rmaxcnt * sizeof (rboundary_t));
3422 }
3423
3424 /*
3425 * Returns the mtu ranges which could be supported by aggr group.
3426 * prmaxcnt returns the size of the buffer prval, prcount returns
3427 * the number of valid entries in prval. Caller is responsible
3428 * for freeing up prval.
3429 */
3430 int
aggr_grp_possible_mtu_range(aggr_grp_t * grp,mac_propval_uint32_range_t ** prval,int * prmaxcnt,int * prcount)3431 aggr_grp_possible_mtu_range(aggr_grp_t *grp, mac_propval_uint32_range_t **prval,
3432 int *prmaxcnt, int *prcount)
3433 {
3434 mac_propval_range_t **vals;
3435 aggr_port_t *port;
3436 mac_perim_handle_t mph;
3437 uint_t i, numr;
3438 int err = 0;
3439 size_t sz_propval, sz_range32;
3440 size_t size;
3441
3442 sz_propval = sizeof (mac_propval_range_t);
3443 sz_range32 = sizeof (mac_propval_uint32_range_t);
3444
3445 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
3446
3447 vals = kmem_zalloc(sizeof (mac_propval_range_t *) * grp->lg_nports,
3448 KM_SLEEP);
3449
3450 for (port = grp->lg_ports, i = 0; port != NULL;
3451 port = port->lp_next, i++) {
3452
3453 size = sz_propval;
3454 vals[i] = kmem_alloc(size, KM_SLEEP);
3455 vals[i]->mpr_count = 1;
3456
3457 mac_perim_enter_by_mh(port->lp_mh, &mph);
3458
3459 err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL,
3460 NULL, 0, vals[i], NULL);
3461 if (err == ENOSPC) {
3462 /*
3463 * Not enough space to hold all ranges.
3464 * Allocate extra space as indicated and retry.
3465 */
3466 numr = vals[i]->mpr_count;
3467 kmem_free(vals[i], sz_propval);
3468 size = sz_propval + (numr - 1) * sz_range32;
3469 vals[i] = kmem_alloc(size, KM_SLEEP);
3470 vals[i]->mpr_count = numr;
3471 err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL,
3472 NULL, 0, vals[i], NULL);
3473 ASSERT(err != ENOSPC);
3474 }
3475 mac_perim_exit(mph);
3476 if (err != 0) {
3477 kmem_free(vals[i], size);
3478 vals[i] = NULL;
3479 break;
3480 }
3481 }
3482
3483 /*
3484 * if any of the underlying ports does not support changing MTU then
3485 * just return ENOTSUP
3486 */
3487 if (port != NULL) {
3488 ASSERT(err != 0);
3489 goto done;
3490 }
3491
3492 aggr_mtu_range_intersection(vals, grp->lg_nports, prval, prmaxcnt,
3493 prcount);
3494
3495 done:
3496 for (i = 0; i < grp->lg_nports; i++) {
3497 if (vals[i] != NULL) {
3498 numr = vals[i]->mpr_count;
3499 size = sz_propval + (numr - 1) * sz_range32;
3500 kmem_free(vals[i], size);
3501 }
3502 }
3503
3504 kmem_free(vals, sizeof (mac_propval_range_t *) * grp->lg_nports);
3505 return (err);
3506 }
3507
3508 static void
aggr_m_propinfo(void * m_driver,const char * pr_name,mac_prop_id_t pr_num,mac_prop_info_handle_t prh)3509 aggr_m_propinfo(void *m_driver, const char *pr_name, mac_prop_id_t pr_num,
3510 mac_prop_info_handle_t prh)
3511 {
3512 aggr_grp_t *grp = m_driver;
3513 mac_propval_uint32_range_t *rval = NULL;
3514 int i, rcount, rmaxcnt;
3515 int err = 0;
3516
3517 _NOTE(ARGUNUSED(pr_name));
3518
3519 if (pr_num != MAC_PROP_MTU)
3520 return;
3521
3522 err = aggr_grp_possible_mtu_range(grp, &rval, &rmaxcnt, &rcount);
3523 if (err != 0) {
3524 ASSERT(rval == NULL);
3525 return;
3526 }
3527 for (i = 0; i < rcount; i++) {
3528 mac_prop_info_set_range_uint32(prh,
3529 rval[i].mpur_min, rval[i].mpur_max);
3530 }
3531 kmem_free(rval, sizeof (mac_propval_uint32_range_t) * rmaxcnt);
3532 }
3533