xref: /titanic_51/usr/src/uts/common/io/mac/mac.c (revision 7a286c471efbab8562f7655a82931904703fffe0)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * MAC Services Module
29  *
30  * The GLDv3 framework locking -  The MAC layer
31  * --------------------------------------------
32  *
33  * The MAC layer is central to the GLD framework and can provide the locking
34  * framework needed for itself and for the use of MAC clients. MAC end points
35  * are fairly disjoint and don't share a lot of state. So a coarse grained
36  * multi-threading scheme is to single thread all create/modify/delete or set
37  * type of control operations on a per mac end point while allowing data threads
38  * concurrently.
39  *
40  * Control operations (set) that modify a mac end point are always serialized on
41  * a per mac end point basis; we have at most 1 such thread per mac end point
42  * at a time.
43  *
44  * All other operations that are not serialized are essentially multi-threaded.
45  * For example a control operation (get) like getting statistics which may not
46  * care about reading values atomically or data threads sending or receiving
47  * data. Mostly these type of operations don't modify the control state. Any
48  * state these operations care about are protected using traditional locks.
49  *
50  * The perimeter only serializes serial operations. It does not imply there
51  * aren't any other concurrent operations. However a serialized operation may
52  * sometimes need to make sure it is the only thread. In this case it needs
53  * to use reference counting mechanisms to cv_wait until any current data
54  * threads are done.
55  *
56  * The mac layer itself does not hold any locks across a call to another layer.
57  * The perimeter is however held across a down call to the driver to make the
58  * whole control operation atomic with respect to other control operations.
59  * Also the data path and get type control operations may proceed concurrently.
60  * These operations synchronize with the single serial operation on a given mac
61  * end point using regular locks. The perimeter ensures that conflicting
62  * operations like say a mac_multicast_add and a mac_multicast_remove on the
63  * same mac end point don't interfere with each other and also ensures that the
64  * changes in the mac layer and the call to the underlying driver to say add a
65  * multicast address are done atomically without interference from a thread
66  * trying to delete the same address.
67  *
68  * For example, consider
69  * mac_multicst_add()
70  * {
71  *	mac_perimeter_enter();	serialize all control operations
72  *
73  *	grab list lock		protect against access by data threads
74  *	add to list
75  *	drop list lock
76  *
77  *	call driver's mi_multicst
78  *
79  *	mac_perimeter_exit();
80  * }
81  *
82  * To lessen the number of serialization locks and simplify the lock hierarchy,
83  * we serialize all the control operations on a per mac end point by using a
84  * single serialization lock called the perimeter. We allow recursive entry into
85  * the perimeter to facilitate use of this mechanism by both the mac client and
86  * the MAC layer itself.
87  *
88  * MAC client means an entity that does an operation on a mac handle
89  * obtained from a mac_open/mac_client_open. Similarly MAC driver means
90  * an entity that does an operation on a mac handle obtained from a
91  * mac_register. An entity could be both client and driver but on different
92  * handles eg. aggr. and should only make the corresponding mac interface calls
93  * i.e. mac driver interface or mac client interface as appropriate for that
94  * mac handle.
95  *
96  * General rules.
97  * -------------
98  *
99  * R1. The lock order of upcall threads is naturally opposite to downcall
100  * threads. Hence upcalls must not hold any locks across layers for fear of
101  * recursive lock enter and lock order violation. This applies to all layers.
102  *
103  * R2. The perimeter is just another lock. Since it is held in the down
104  * direction, acquiring the perimeter in an upcall is prohibited as it would
105  * cause a deadlock. This applies to all layers.
106  *
107  * Note that upcalls that need to grab the mac perimeter (for example
108  * mac_notify upcalls) can still achieve that by posting the request to a
109  * thread, which can then grab all the required perimeters and locks in the
110  * right global order. Note that in the above example the mac layer itself
111  * won't grab the mac perimeter in the mac_notify upcall, instead the upcall
112  * to the client must do that. Please see the aggr code for an example.
113  *
114  * MAC client rules
115  * ----------------
116  *
117  * R3. A MAC client may use the MAC provided perimeter facility to serialize
118  * control operations on a per mac end point. It does this by acquiring
119  * and holding the perimeter across a sequence of calls to the mac layer.
120  * This ensures atomicity across the entire block of mac calls. In this
121  * model the MAC client must not hold any client locks across the calls to
122  * the mac layer. This model is the preferred solution.
123  *
124  * R4. However if a MAC client has a lot of global state across all mac end
125  * points the per mac end point serialization may not be sufficient. In this
126  * case the client may choose to use global locks or use its own serialization.
127  * To avoid deadlocks, these client layer locks held across the mac calls
128  * in the control path must never be acquired by the data path for the reason
129  * mentioned below.
130  *
131  * (Assume that a control operation that holds a client lock blocks in the
132  * mac layer waiting for upcall reference counts to drop to zero. If an upcall
133  * data thread that holds this reference count, tries to acquire the same
134  * client lock subsequently it will deadlock).
135  *
136  * A MAC client may follow either the R3 model or the R4 model, but can't
137  * mix both. In the former, the hierarchy is Perim -> client locks, but in
138  * the latter it is client locks -> Perim.
139  *
140  * R5. MAC clients must make MAC calls (excluding data calls) in a cv_wait'able
141  * context since they may block while trying to acquire the perimeter.
142  * In addition some calls may block waiting for upcall refcnts to come down to
143  * zero.
144  *
145  * R6. MAC clients must make sure that they are single threaded and all threads
146  * from the top (in particular data threads) have finished before calling
147  * mac_client_close. The MAC framework does not track the number of client
148  * threads using the mac client handle. Also mac clients must make sure
149  * they have undone all the control operations before calling mac_client_close.
150  * For example mac_unicast_remove/mac_multicast_remove to undo the corresponding
151  * mac_unicast_add/mac_multicast_add.
152  *
153  * MAC framework rules
154  * -------------------
155  *
156  * R7. The mac layer itself must not hold any mac layer locks (except the mac
157  * perimeter) across a call to any other layer from the mac layer. The call to
158  * any other layer could be via mi_* entry points, classifier entry points into
159  * the driver or via upcall pointers into layers above. The mac perimeter may
160  * be acquired or held only in the down direction, for e.g. when calling into
161  * a mi_* driver entry point to provide atomicity of the operation.
162  *
163  * R8. Since it is not guaranteed (see R14) that drivers won't hold locks across
164  * mac driver interfaces, the MAC layer must provide a cut out for control
165  * interfaces like upcall notifications and start them in a separate thread.
166  *
167  * R9. Note that locking order also implies a plumbing order. For example
168  * VNICs are allowed to be created over aggrs, but not vice-versa. An attempt
169  * to plumb in any other order must be failed at mac_open time, otherwise it
170  * could lead to deadlocks due to inverse locking order.
171  *
172  * R10. MAC driver interfaces must not block since the driver could call them
173  * in interrupt context.
174  *
175  * R11. Walkers must preferably not hold any locks while calling walker
176  * callbacks. Instead these can operate on reference counts. In simple
177  * callbacks it may be ok to hold a lock and call the callbacks, but this is
178  * harder to maintain in the general case of arbitrary callbacks.
179  *
180  * R12. The MAC layer must protect upcall notification callbacks using reference
181  * counts rather than holding locks across the callbacks.
182  *
183  * R13. Given the variety of drivers, it is preferable if the MAC layer can make
184  * sure that any pointers (such as mac ring pointers) it passes to the driver
185  * remain valid until mac unregister time. Currently the mac layer achieves
186  * this by using generation numbers for rings and freeing the mac rings only
187  * at unregister time.  The MAC layer must provide a layer of indirection and
188  * must not expose underlying driver rings or driver data structures/pointers
189  * directly to MAC clients.
190  *
191  * MAC driver rules
192  * ----------------
193  *
194  * R14. It would be preferable if MAC drivers don't hold any locks across any
195  * mac call. However at a minimum they must not hold any locks across data
196  * upcalls. They must also make sure that all references to mac data structures
197  * are cleaned up and that it is single threaded at mac_unregister time.
198  *
199  * R15. MAC driver interfaces don't block and so the action may be done
200  * asynchronously in a separate thread as for example handling notifications.
201  * The driver must not assume that the action is complete when the call
202  * returns.
203  *
204  * R16. Drivers must maintain a generation number per Rx ring, and pass it
205  * back to mac_rx_ring(); They are expected to increment the generation
206  * number whenever the ring's stop routine is invoked.
207  * See comments in mac_rx_ring();
208  *
209  * R17 Similarly mi_stop is another synchronization point and the driver must
210  * ensure that all upcalls are done and there won't be any future upcall
211  * before returning from mi_stop.
212  *
213  * R18. The driver may assume that all set/modify control operations via
214  * the mi_* entry points are single threaded on a per mac end point.
215  *
216  * Lock and Perimeter hierarchy scenarios
217  * ---------------------------------------
218  *
219  * i_mac_impl_lock -> mi_rw_lock -> srs_lock -> s_ring_lock[i_mac_tx_srs_notify]
220  *
221  * ft_lock -> fe_lock [mac_flow_lookup]
222  *
223  * mi_rw_lock -> fe_lock [mac_bcast_send]
224  *
225  * srs_lock -> mac_bw_lock [mac_rx_srs_drain_bw]
226  *
227  * cpu_lock -> mac_srs_g_lock -> srs_lock -> s_ring_lock [mac_walk_srs_and_bind]
228  *
229  * i_dls_devnet_lock -> mac layer locks [dls_devnet_rename]
230  *
231  * Perimeters are ordered P1 -> P2 -> P3 from top to bottom in order of mac
232  * client to driver. In the case of clients that explicitly use the mac provided
233  * perimeter mechanism for its serialization, the hierarchy is
234  * Perimeter -> mac layer locks, since the client never holds any locks across
235  * the mac calls. In the case of clients that use its own locks the hierarchy
236  * is Client locks -> Mac Perim -> Mac layer locks. The client never explicitly
237  * calls mac_perim_enter/exit in this case.
238  *
239  * Subflow creation rules
240  * ---------------------------
241  * o In case of a user specified cpulist present on underlying link and flows,
242  * the flows cpulist must be a subset of the underlying link.
243  * o In case of a user specified fanout mode present on link and flow, the
244  * subflow fanout count has to be less than or equal to that of the
245  * underlying link. The cpu-bindings for the subflows will be a subset of
246  * the underlying link.
247  * o In case if no cpulist specified on both underlying link and flow, the
248  * underlying link relies on a  MAC tunable to provide out of box fanout.
249  * The subflow will have no cpulist (the subflow will be unbound)
250  * o In case if no cpulist is specified on the underlying link, a subflow can
251  * carry  either a user-specified cpulist or fanout count. The cpu-bindings
252  * for the subflow will not adhere to restriction that they need to be subset
253  * of the underlying link.
254  * o In case where the underlying link is carrying either a user specified
255  * cpulist or fanout mode and for a unspecified subflow, the subflow will be
256  * created unbound.
257  * o While creating unbound subflows, bandwidth mode changes attempt to
258  * figure a right fanout count. In such cases the fanout count will override
259  * the unbound cpu-binding behavior.
260  * o In addition to this, while cycling between flow and link properties, we
261  * impose a restriction that if a link property has a subflow with
262  * user-specified attributes, we will not allow changing the link property.
263  * The administrator needs to reset all the user specified properties for the
264  * subflows before attempting a link property change.
265  * Some of the above rules can be overridden by specifying additional command
266  * line options while creating or modifying link or subflow properties.
267  */
268 
269 #include <sys/types.h>
270 #include <sys/conf.h>
271 #include <sys/id_space.h>
272 #include <sys/esunddi.h>
273 #include <sys/stat.h>
274 #include <sys/mkdev.h>
275 #include <sys/stream.h>
276 #include <sys/strsun.h>
277 #include <sys/strsubr.h>
278 #include <sys/dlpi.h>
279 #include <sys/modhash.h>
280 #include <sys/mac_provider.h>
281 #include <sys/mac_client_impl.h>
282 #include <sys/mac_soft_ring.h>
283 #include <sys/mac_impl.h>
284 #include <sys/mac.h>
285 #include <sys/dls.h>
286 #include <sys/dld.h>
287 #include <sys/modctl.h>
288 #include <sys/fs/dv_node.h>
289 #include <sys/thread.h>
290 #include <sys/proc.h>
291 #include <sys/callb.h>
292 #include <sys/cpuvar.h>
293 #include <sys/atomic.h>
294 #include <sys/bitmap.h>
295 #include <sys/sdt.h>
296 #include <sys/mac_flow.h>
297 #include <sys/ddi_intr_impl.h>
298 #include <sys/disp.h>
299 #include <sys/sdt.h>
300 #include <sys/vnic.h>
301 #include <sys/vnic_impl.h>
302 #include <sys/vlan.h>
303 #include <inet/ip.h>
304 #include <inet/ip6.h>
305 #include <sys/exacct.h>
306 #include <sys/exacct_impl.h>
307 #include <inet/nd.h>
308 #include <sys/ethernet.h>
309 
310 #define	IMPL_HASHSZ	67	/* prime */
311 
/* Cache, name hash, lock and count for all registered mac_impl_t's. */
312 kmem_cache_t	*i_mac_impl_cachep;
313 mod_hash_t		*i_mac_impl_hash;
314 krwlock_t		i_mac_impl_lock;
315 uint_t			i_mac_impl_count;
316 static kmem_cache_t	*mac_ring_cache;
/* Minor number id space (created in mac_init) and outstanding count. */
317 static id_space_t	*minor_ids;
318 static uint32_t		minor_count;
319 
320 /*
321  * Logging stuff. Perhaps mac_logging_interval could be broken into
322  * mac_flow_log_interval and mac_link_log_interval if we want to be
323  * able to schedule them differently.
324  */
325 uint_t			mac_logging_interval;
326 boolean_t		mac_flow_log_enable;
327 boolean_t		mac_link_log_enable;
328 timeout_id_t		mac_logging_timer;
329 
330 /* for debugging, see MAC_DBG_PRT() in mac_impl.h */
331 int mac_dbg = 0;
332 
333 #define	MACTYPE_KMODDIR	"mac"
334 #define	MACTYPE_HASHSZ	67
335 static mod_hash_t	*i_mactype_hash;
336 /*
337  * i_mactype_lock synchronizes threads that obtain references to mactype_t
338  * structures through i_mactype_getplugin().
339  */
340 static kmutex_t		i_mactype_lock;
341 
342 /*
343  * mac_tx_percpu_cnt
344  *
345  * Number of per cpu locks per mac_client_impl_t. Used by the transmit side
346  * in mac_tx to reduce lock contention. This is sized at boot time in mac_init.
347  * mac_tx_percpu_cnt_max is settable in /etc/system and must be a power of 2.
348  * Per cpu locks may be disabled by setting mac_tx_percpu_cnt_max to 1.
349  */
350 int mac_tx_percpu_cnt;
351 int mac_tx_percpu_cnt_max = 128;
352 
353 /*
354  * Call back functions for the bridge module.  These are guaranteed to be valid
355  * when holding a reference on a link or when holding mip->mi_bridge_lock and
356  * mi_bridge_link is non-NULL.
357  */
358 mac_bridge_tx_t mac_bridge_tx_cb;
359 mac_bridge_rx_t mac_bridge_rx_cb;
360 mac_bridge_ref_t mac_bridge_ref_cb;
361 mac_bridge_ls_t mac_bridge_ls_cb;
362 
/* Forward declarations for functions defined later in this file. */
363 static int i_mac_constructor(void *, void *, int);
364 static void i_mac_destructor(void *, void *);
365 static int i_mac_ring_ctor(void *, void *, int);
366 static void i_mac_ring_dtor(void *, void *);
367 static mblk_t *mac_rx_classify(mac_impl_t *, mac_resource_handle_t, mblk_t *);
368 void mac_tx_client_flush(mac_client_impl_t *);
369 void mac_tx_client_block(mac_client_impl_t *);
370 static void mac_rx_ring_quiesce(mac_ring_t *, uint_t);
371 static int mac_start_group_and_rings(mac_group_t *);
372 static void mac_stop_group_and_rings(mac_group_t *);
373 
374 /*
375  * Module initialization functions.
376  */
377 
/*
 * One-time module initialization: size the per-cpu tx lock count, create
 * the mac_impl/mac_ring kmem caches and the name/mactype hashes, bring up
 * the flow, soft ring, broadcast and client subsystems, create the minor
 * number id space and set the logging defaults.
 */
378 void
379 mac_init(void)
380 {
381 	mac_tx_percpu_cnt = ((boot_max_ncpus == -1) ? max_ncpus :
382 	    boot_max_ncpus);
383 
384 	/* Upper bound is mac_tx_percpu_cnt_max */
385 	if (mac_tx_percpu_cnt > mac_tx_percpu_cnt_max)
386 		mac_tx_percpu_cnt = mac_tx_percpu_cnt_max;
387 
388 	if (mac_tx_percpu_cnt < 1) {
389 		/* Someone set mac_tx_percpu_cnt_max to 0 or less */
390 		mac_tx_percpu_cnt = 1;
391 	}
392 
393 	ASSERT(mac_tx_percpu_cnt >= 1);
	/* Round up to the next power of two >= mac_tx_percpu_cnt. */
394 	mac_tx_percpu_cnt = (1 << highbit(mac_tx_percpu_cnt - 1));
395 	/*
396 	 * Make it of the form 2**N - 1 in the range
397 	 * [0 .. mac_tx_percpu_cnt_max - 1]
398 	 */
399 	mac_tx_percpu_cnt--;
400 
401 	i_mac_impl_cachep = kmem_cache_create("mac_impl_cache",
402 	    sizeof (mac_impl_t), 0, i_mac_constructor, i_mac_destructor,
403 	    NULL, NULL, NULL, 0);
404 	ASSERT(i_mac_impl_cachep != NULL);
405 
406 	mac_ring_cache = kmem_cache_create("mac_ring_cache",
407 	    sizeof (mac_ring_t), 0, i_mac_ring_ctor, i_mac_ring_dtor, NULL,
408 	    NULL, NULL, 0);
409 	ASSERT(mac_ring_cache != NULL);
410 
411 	i_mac_impl_hash = mod_hash_create_extended("mac_impl_hash",
412 	    IMPL_HASHSZ, mod_hash_null_keydtor, mod_hash_null_valdtor,
413 	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
414 	rw_init(&i_mac_impl_lock, NULL, RW_DEFAULT, NULL);
415 
416 	mac_flow_init();
417 	mac_soft_ring_init();
418 	mac_bcast_init();
419 	mac_client_init();
420 
421 	i_mac_impl_count = 0;
422 
423 	i_mactype_hash = mod_hash_create_extended("mactype_hash",
424 	    MACTYPE_HASHSZ,
425 	    mod_hash_null_keydtor, mod_hash_null_valdtor,
426 	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
427 
428 	/*
429 	 * Allocate an id space to manage minor numbers. The range of the
430 	 * space will be from MAC_MAX_MINOR+1 to MAC_PRIVATE_MINOR-1.  This
431 	 * leaves half of the 32-bit minors available for driver private use.
432 	 */
433 	minor_ids = id_space_create("mac_minor_ids", MAC_MAX_MINOR+1,
434 	    MAC_PRIVATE_MINOR-1);
435 	ASSERT(minor_ids != NULL);
436 	minor_count = 0;
437 
438 	/* Let's default to 20 seconds */
439 	mac_logging_interval = 20;
440 	mac_flow_log_enable = B_FALSE;
441 	mac_link_log_enable = B_FALSE;
442 	mac_logging_timer = 0;
443 }
444 
/*
 * Module teardown; undoes mac_init(). Returns EBUSY if any mac end
 * points are still registered or any minor numbers are still held,
 * 0 on successful teardown.
 */
445 int
446 mac_fini(void)
447 {
448 	if (i_mac_impl_count > 0 || minor_count > 0)
449 		return (EBUSY);
450 
451 	id_space_destroy(minor_ids);
452 	mac_flow_fini();
453 
454 	mod_hash_destroy_hash(i_mac_impl_hash);
455 	rw_destroy(&i_mac_impl_lock);
456 
457 	mac_client_fini();
458 	kmem_cache_destroy(mac_ring_cache);
459 
460 	mod_hash_destroy_hash(i_mactype_hash);
461 	mac_soft_ring_finish();
462 	return (0);
463 }
464 
/* Driver-facing wrapper: delegate dev_ops initialization to dld. */
465 void
466 mac_init_ops(struct dev_ops *ops, const char *name)
467 {
468 	dld_init_ops(ops, name);
469 }
470 
/* Driver-facing wrapper: delegate dev_ops teardown to dld. */
471 void
472 mac_fini_ops(struct dev_ops *ops)
473 {
474 	dld_fini_ops(ops);
475 }
476 
/*
 * kmem cache constructor for mac_impl_t: zero the structure and set up
 * the locks, condition variables and callback-info back pointers that
 * persist across alloc/free cycles of the cache object.
 */
477 /*ARGSUSED*/
478 static int
479 i_mac_constructor(void *buf, void *arg, int kmflag)
480 {
481 	mac_impl_t	*mip = buf;
482 
483 	bzero(buf, sizeof (mac_impl_t));
484 
485 	mip->mi_linkstate = LINK_STATE_UNKNOWN;
486 
487 	mutex_init(&mip->mi_lock, NULL, MUTEX_DRIVER, NULL);
488 	rw_init(&mip->mi_rw_lock, NULL, RW_DRIVER, NULL);
489 	mutex_init(&mip->mi_notify_lock, NULL, MUTEX_DRIVER, NULL);
490 	mutex_init(&mip->mi_promisc_lock, NULL, MUTEX_DRIVER, NULL);
491 	mutex_init(&mip->mi_ring_lock, NULL, MUTEX_DEFAULT, NULL);
492 
	/* Each callback list's mac_cb_info_t points at its protecting lock. */
493 	mip->mi_notify_cb_info.mcbi_lockp = &mip->mi_notify_lock;
494 	cv_init(&mip->mi_notify_cb_info.mcbi_cv, NULL, CV_DRIVER, NULL);
495 	mip->mi_promisc_cb_info.mcbi_lockp = &mip->mi_promisc_lock;
496 	cv_init(&mip->mi_promisc_cb_info.mcbi_cv, NULL, CV_DRIVER, NULL);
497 
498 	mutex_init(&mip->mi_bridge_lock, NULL, MUTEX_DEFAULT, NULL);
499 
500 	return (0);
501 }
502 
503 /*ARGSUSED*/
504 static void
505 i_mac_destructor(void *buf, void *arg)
506 {
507 	mac_impl_t	*mip = buf;
508 	mac_cb_info_t	*mcbi;
509 
510 	ASSERT(mip->mi_ref == 0);
511 	ASSERT(mip->mi_active == 0);
512 	ASSERT(mip->mi_linkstate == LINK_STATE_UNKNOWN);
513 	ASSERT(mip->mi_devpromisc == 0);
514 	ASSERT(mip->mi_ksp == NULL);
515 	ASSERT(mip->mi_kstat_count == 0);
516 	ASSERT(mip->mi_nclients == 0);
517 	ASSERT(mip->mi_nactiveclients == 0);
518 	ASSERT(mip->mi_single_active_client == NULL);
519 	ASSERT(mip->mi_state_flags == 0);
520 	ASSERT(mip->mi_factory_addr == NULL);
521 	ASSERT(mip->mi_factory_addr_num == 0);
522 	ASSERT(mip->mi_default_tx_ring == NULL);
523 
524 	mcbi = &mip->mi_notify_cb_info;
525 	ASSERT(mcbi->mcbi_del_cnt == 0 && mcbi->mcbi_walker_cnt == 0);
526 	ASSERT(mip->mi_notify_bits == 0);
527 	ASSERT(mip->mi_notify_thread == NULL);
528 	ASSERT(mcbi->mcbi_lockp == &mip->mi_notify_lock);
529 	mcbi->mcbi_lockp = NULL;
530 
531 	mcbi = &mip->mi_promisc_cb_info;
532 	ASSERT(mcbi->mcbi_del_cnt == 0 && mip->mi_promisc_list == NULL);
533 	ASSERT(mip->mi_promisc_list == NULL);
534 	ASSERT(mcbi->mcbi_lockp == &mip->mi_promisc_lock);
535 	mcbi->mcbi_lockp = NULL;
536 
537 	ASSERT(mip->mi_bcast_ngrps == 0 && mip->mi_bcast_grp == NULL);
538 	ASSERT(mip->mi_perim_owner == NULL && mip->mi_perim_ocnt == 0);
539 
540 	mutex_destroy(&mip->mi_lock);
541 	rw_destroy(&mip->mi_rw_lock);
542 
543 	mutex_destroy(&mip->mi_promisc_lock);
544 	cv_destroy(&mip->mi_promisc_cb_info.mcbi_cv);
545 	mutex_destroy(&mip->mi_notify_lock);
546 	cv_destroy(&mip->mi_notify_cb_info.mcbi_cv);
547 	mutex_destroy(&mip->mi_ring_lock);
548 
549 	ASSERT(mip->mi_bridge_link == NULL);
550 }
551 
/*
 * kmem cache constructor for mac_ring_t: zero the ring and initialize
 * its cv and lock; a freshly constructed ring starts in the MR_FREE state.
 */
552 /* ARGSUSED */
553 static int
554 i_mac_ring_ctor(void *buf, void *arg, int kmflag)
555 {
556 	mac_ring_t *ring = (mac_ring_t *)buf;
557 
558 	bzero(ring, sizeof (mac_ring_t));
559 	cv_init(&ring->mr_cv, NULL, CV_DEFAULT, NULL);
560 	mutex_init(&ring->mr_lock, NULL, MUTEX_DEFAULT, NULL);
561 	ring->mr_state = MR_FREE;
562 	return (0);
563 }
564 
/* kmem cache destructor for mac_ring_t: tear down the ctor's cv and lock. */
565 /* ARGSUSED */
566 static void
567 i_mac_ring_dtor(void *buf, void *arg)
568 {
569 	mac_ring_t *ring = (mac_ring_t *)buf;
570 
571 	cv_destroy(&ring->mr_cv);
572 	mutex_destroy(&ring->mr_lock);
573 }
574 
575 /*
576  * Common functions to do mac callback addition and deletion. Currently this is
577  * used by promisc callbacks and notify callbacks. List addition and deletion
578  * need to take care of list walkers. List walkers in general, can't hold list
579  * locks and make upcall callbacks due to potential lock order and recursive
580  * reentry issues. Instead list walkers increment the list walker count to mark
581  * the presence of a walker thread. Addition can be carefully done to ensure
582  * that the list walker always sees either the old list or the new list.
583  * However the deletion can't be done while the walker is active, instead the
584  * deleting thread simply marks the entry as logically deleted. The last walker
585  * physically deletes and frees up the logically deleted entries when the walk
586  * is complete.
587  */
588 void
589 mac_callback_add(mac_cb_info_t *mcbi, mac_cb_t **mcb_head,
590     mac_cb_t *mcb_elem)
591 {
592 	mac_cb_t	*p;
593 	mac_cb_t	**pp;
594 
595 	/* Verify it is not already in the list */
596 	for (pp = mcb_head; (p = *pp) != NULL; pp = &p->mcb_nextp) {
597 		if (p == mcb_elem)
598 			break;
599 	}
600 	VERIFY(p == NULL);
601 
602 	/*
603 	 * Add it to the head of the callback list. The membar ensures that
604 	 * the following list pointer manipulations reach global visibility
605 	 * in exactly the program order below.
606 	 */
607 	ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
608 
609 	mcb_elem->mcb_nextp = *mcb_head;
610 	membar_producer();
611 	*mcb_head = mcb_elem;
612 }
613 
614 /*
615  * Mark the entry as logically deleted. If there aren't any walkers unlink
616  * from the list. In either case return the corresponding status.
617  */
618 boolean_t
619 mac_callback_remove(mac_cb_info_t *mcbi, mac_cb_t **mcb_head,
620     mac_cb_t *mcb_elem)
621 {
622 	mac_cb_t	*p;
623 	mac_cb_t	**pp;
624 
625 	ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
626 	/*
627 	 * Search the callback list for the entry to be removed
628 	 */
629 	for (pp = mcb_head; (p = *pp) != NULL; pp = &p->mcb_nextp) {
630 		if (p == mcb_elem)
631 			break;
632 	}
	/* The entry must be on the list. */
633 	VERIFY(p != NULL);
634 
635 	/*
636 	 * If there are walkers just mark it as deleted and the last walker
637 	 * will remove from the list and free it.
638 	 */
639 	if (mcbi->mcbi_walker_cnt != 0) {
640 		p->mcb_flags |= MCB_CONDEMNED;
641 		mcbi->mcbi_del_cnt++;
642 		return (B_FALSE);
643 	}
644 
	/* No walkers: unlink now; the caller may free the entry. */
645 	ASSERT(mcbi->mcbi_del_cnt == 0);
646 	*pp = p->mcb_nextp;
647 	p->mcb_nextp = NULL;
648 	return (B_TRUE);
649 }
650 
651 /*
652  * Wait for all pending callback removals to be completed
653  */
/*
 * Block on mcbi_cv until all logically deleted entries have been cleaned
 * up, i.e. mcbi_del_cnt has returned to zero. Called with mcbi_lockp held.
 */
654 void
655 mac_callback_remove_wait(mac_cb_info_t *mcbi)
656 {
657 	ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
658 	while (mcbi->mcbi_del_cnt != 0) {
659 		DTRACE_PROBE1(need_wait, mac_cb_info_t *, mcbi);
660 		cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp);
661 	}
662 }
663 
664 /*
665  * The last mac callback walker does the cleanup. Walk the list and unlink
666  * all the logically deleted entries and construct a temporary list of
667  * removed entries. Return the list of removed entries to the caller.
668  */
669 mac_cb_t *
670 mac_callback_walker_cleanup(mac_cb_info_t *mcbi, mac_cb_t **mcb_head)
671 {
672 	mac_cb_t	*p;
673 	mac_cb_t	**pp;
674 	mac_cb_t	*rmlist = NULL;		/* List of removed elements */
675 	int	cnt = 0;
676 
677 	ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
	/* Only the last walker (no others active) may run this cleanup. */
678 	ASSERT(mcbi->mcbi_del_cnt != 0 && mcbi->mcbi_walker_cnt == 0);
679 
	/* Unlink every condemned entry and chain it onto rmlist. */
680 	pp = mcb_head;
681 	while (*pp != NULL) {
682 		if ((*pp)->mcb_flags & MCB_CONDEMNED) {
683 			p = *pp;
684 			*pp = p->mcb_nextp;
685 			p->mcb_nextp = rmlist;
686 			rmlist = p;
687 			cnt++;
688 			continue;
689 		}
690 		pp = &(*pp)->mcb_nextp;
691 	}
692 
	/* Every pending deletion must have been found on the list. */
693 	ASSERT(mcbi->mcbi_del_cnt == cnt);
694 	mcbi->mcbi_del_cnt = 0;
695 	return (rmlist);
696 }
697 
698 boolean_t
699 mac_callback_lookup(mac_cb_t **mcb_headp, mac_cb_t *mcb_elem)
700 {
701 	mac_cb_t	*mcb;
702 
703 	/* Verify it is not already in the list */
704 	for (mcb = *mcb_headp; mcb != NULL; mcb = mcb->mcb_nextp) {
705 		if (mcb == mcb_elem)
706 			return (B_TRUE);
707 	}
708 
709 	return (B_FALSE);
710 }
711 
712 boolean_t
713 mac_callback_find(mac_cb_info_t *mcbi, mac_cb_t **mcb_headp, mac_cb_t *mcb_elem)
714 {
715 	boolean_t	found;
716 
717 	mutex_enter(mcbi->mcbi_lockp);
718 	found = mac_callback_lookup(mcb_headp, mcb_elem);
719 	mutex_exit(mcbi->mcbi_lockp);
720 
721 	return (found);
722 }
723 
724 /* Free the list of removed callbacks */
725 void
726 mac_callback_free(mac_cb_t *rmlist)
727 {
728 	mac_cb_t	*mcb;
729 	mac_cb_t	*mcb_next;
730 
731 	for (mcb = rmlist; mcb != NULL; mcb = mcb_next) {
732 		mcb_next = mcb->mcb_nextp;
733 		kmem_free(mcb->mcb_objp, mcb->mcb_objsize);
734 	}
735 }
736 
737 /*
738  * The promisc callbacks are in 2 lists, one off the 'mip' and another off the
739  * 'mcip' threaded by mpi_mi_link and mpi_mci_link respectively. However there
740  * is only a single shared total walker count, and an entry can't be physically
741  * unlinked if a walker is active on either list. The last walker does this
742  * cleanup of logically deleted entries.
743  */
744 void
745 i_mac_promisc_walker_cleanup(mac_impl_t *mip)
746 {
747 	mac_cb_t	*rmlist;
748 	mac_cb_t	*mcb;
749 	mac_cb_t	*mcb_next;
750 	mac_promisc_impl_t	*mpip;
751 
752 	/*
753 	 * Construct a temporary list of deleted callbacks by walking the
754 	 * mi_promisc_list. Then for each entry in the temporary list,
755 	 * remove it from the mci_promisc_list and free the entry.
756 	 */
757 	rmlist = mac_callback_walker_cleanup(&mip->mi_promisc_cb_info,
758 	    &mip->mi_promisc_list);
759 
760 	for (mcb = rmlist; mcb != NULL; mcb = mcb_next) {
761 		mcb_next = mcb->mcb_nextp;
762 		mpip = (mac_promisc_impl_t *)mcb->mcb_objp;
		/*
		 * Must succeed (return B_TRUE, i.e. immediate unlink):
		 * this is the last walker, so no walkers remain active.
		 */
763 		VERIFY(mac_callback_remove(&mip->mi_promisc_cb_info,
764 		    &mpip->mpi_mcip->mci_promisc_list, &mpip->mpi_mci_link));
765 		mcb->mcb_flags = 0;
766 		mcb->mcb_nextp = NULL;
767 		kmem_cache_free(mac_promisc_impl_cache, mpip);
768 	}
769 }
770 
/*
 * Post a notification of the given type: record the type bit in
 * mi_notify_bits and wake the notify thread via mcbi_cv. Disabled macs
 * and out-of-range types are silently ignored.
 */
771 void
772 i_mac_notify(mac_impl_t *mip, mac_notify_type_t type)
773 {
774 	mac_cb_info_t	*mcbi;
775 
776 	/*
777 	 * Signal the notify thread even after mi_ref has become zero and
778 	 * mi_disabled is set. The synchronization with the notify thread
779 	 * happens in mac_unregister and that implies the driver must make
780 	 * sure it is single-threaded (with respect to mac calls) and that
781 	 * all pending mac calls have returned before it calls mac_unregister
782 	 */
783 	rw_enter(&i_mac_impl_lock, RW_READER);
784 	if (mip->mi_state_flags & MIS_DISABLED)
785 		goto exit;
786 
787 	/*
788 	 * Guard against incorrect notifications.  (Running a newer
789 	 * mac client against an older implementation?)
790 	 */
791 	if (type >= MAC_NNOTE)
792 		goto exit;
793 
794 	mcbi = &mip->mi_notify_cb_info;
795 	mutex_enter(mcbi->mcbi_lockp);
796 	mip->mi_notify_bits |= (1 << type);
797 	cv_broadcast(&mcbi->mcbi_cv);
798 	mutex_exit(mcbi->mcbi_lockp);
799 
800 exit:
801 	rw_exit(&i_mac_impl_lock);
802 }
803 
804 /*
805  * Mac serialization primitives. Please see the block comment at the
806  * top of the file.
807  */
/*
 * Enter the per-mac serialization perimeter, blocking until it is free.
 * Entry is recursive for the owning thread (tracked via mi_perim_ocnt).
 * For VNICs the perimeter of the underlying (lower) mac is used.
 */
808 void
809 i_mac_perim_enter(mac_impl_t *mip)
810 {
811 	mac_client_impl_t	*mcip;
812 
813 	if (mip->mi_state_flags & MIS_IS_VNIC) {
814 		/*
815 		 * This is a VNIC. Return the lower mac since that is what
816 		 * we want to serialize on.
817 		 */
818 		mcip = mac_vnic_lower(mip);
819 		mip = mcip->mci_mip;
820 	}
821 
822 	mutex_enter(&mip->mi_perim_lock);
	/* Recursive entry by the current owner just bumps the count. */
823 	if (mip->mi_perim_owner == curthread) {
824 		mip->mi_perim_ocnt++;
825 		mutex_exit(&mip->mi_perim_lock);
826 		return;
827 	}
828 
829 	while (mip->mi_perim_owner != NULL)
830 		cv_wait(&mip->mi_perim_cv, &mip->mi_perim_lock);
831 
832 	mip->mi_perim_owner = curthread;
833 	ASSERT(mip->mi_perim_ocnt == 0);
834 	mip->mi_perim_ocnt++;
835 #ifdef DEBUG
	/* Record the entry stack to help debug a stuck perimeter. */
836 	mip->mi_perim_stack_depth = getpcstack(mip->mi_perim_stack,
837 	    MAC_PERIM_STACK_DEPTH);
838 #endif
839 	mutex_exit(&mip->mi_perim_lock);
840 }
841 
/*
 * Non-blocking perimeter entry: return EBUSY instead of waiting if the
 * perimeter is owned by another thread, 0 on successful entry.
 */
842 int
843 i_mac_perim_enter_nowait(mac_impl_t *mip)
844 {
845 	/*
846 	 * The vnic is a special case, since the serialization is done based
847 	 * on the lower mac. If the lower mac is busy, it does not imply the
848 	 * vnic can't be unregistered. But in the case of other drivers,
849 	 * a busy perimeter or open mac handles implies that the mac is busy
850 	 * and can't be unregistered.
851 	 */
852 	if (mip->mi_state_flags & MIS_IS_VNIC) {
		/* For a VNIC, fall back to the blocking entry. */
853 		i_mac_perim_enter(mip);
854 		return (0);
855 	}
856 
857 	mutex_enter(&mip->mi_perim_lock);
858 	if (mip->mi_perim_owner != NULL) {
859 		mutex_exit(&mip->mi_perim_lock);
860 		return (EBUSY);
861 	}
862 	ASSERT(mip->mi_perim_ocnt == 0);
863 	mip->mi_perim_owner = curthread;
864 	mip->mi_perim_ocnt++;
865 	mutex_exit(&mip->mi_perim_lock);
866 
867 	return (0);
868 }
869 
/*
 * Exit the mac perimeter, dropping one level of re-entrancy.  The
 * perimeter is released -- and one waiter signaled -- only when the
 * hold count drops to zero.
 */
void
i_mac_perim_exit(mac_impl_t *mip)
{
	mac_client_impl_t *mcip;

	if (mip->mi_state_flags & MIS_IS_VNIC) {
		/*
		 * This is a VNIC. Return the lower mac since that is what
		 * we want to serialize on.
		 */
		mcip = mac_vnic_lower(mip);
		mip = mcip->mci_mip;
	}

	/*
	 * Checking ownership outside the lock is safe: only the owning
	 * thread can observe mi_perim_owner == curthread.
	 */
	ASSERT(mip->mi_perim_owner == curthread && mip->mi_perim_ocnt != 0);

	mutex_enter(&mip->mi_perim_lock);
	if (--mip->mi_perim_ocnt == 0) {
		mip->mi_perim_owner = NULL;
		cv_signal(&mip->mi_perim_cv);
	}
	mutex_exit(&mip->mi_perim_lock);
}
893 
894 /*
895  * Returns whether the current thread holds the mac perimeter. Used in making
896  * assertions.
897  */
898 boolean_t
899 mac_perim_held(mac_handle_t mh)
900 {
901 	mac_impl_t	*mip = (mac_impl_t *)mh;
902 	mac_client_impl_t *mcip;
903 
904 	if (mip->mi_state_flags & MIS_IS_VNIC) {
905 		/*
906 		 * This is a VNIC. Return the lower mac since that is what
907 		 * we want to serialize on.
908 		 */
909 		mcip = mac_vnic_lower(mip);
910 		mip = mcip->mci_mip;
911 	}
912 	return (mip->mi_perim_owner == curthread);
913 }
914 
915 /*
916  * mac client interfaces to enter the mac perimeter of a mac end point, given
917  * its mac handle, or macname or linkid.
918  */
void
mac_perim_enter_by_mh(mac_handle_t mh, mac_perim_handle_t *mphp)
{
	mac_impl_t	*mip = (mac_impl_t *)mh;

	i_mac_perim_enter(mip);
	/*
	 * The mac_perim_handle_t returned encodes the 'mip' and whether a
	 * mac_open has been done internally while entering the perimeter
	 * (0 here: no open was done).  This information is used in
	 * mac_perim_exit to decide whether a matching mac_close is needed.
	 */
	MAC_ENCODE_MPH(*mphp, mip, 0);
}
932 
933 int
934 mac_perim_enter_by_macname(const char *name, mac_perim_handle_t *mphp)
935 {
936 	int	err;
937 	mac_handle_t	mh;
938 
939 	if ((err = mac_open(name, &mh)) != 0)
940 		return (err);
941 
942 	mac_perim_enter_by_mh(mh, mphp);
943 	MAC_ENCODE_MPH(*mphp, mh, 1);
944 	return (0);
945 }
946 
947 int
948 mac_perim_enter_by_linkid(datalink_id_t linkid, mac_perim_handle_t *mphp)
949 {
950 	int	err;
951 	mac_handle_t	mh;
952 
953 	if ((err = mac_open_by_linkid(linkid, &mh)) != 0)
954 		return (err);
955 
956 	mac_perim_enter_by_mh(mh, mphp);
957 	MAC_ENCODE_MPH(*mphp, mh, 1);
958 	return (0);
959 }
960 
961 void
962 mac_perim_exit(mac_perim_handle_t mph)
963 {
964 	mac_impl_t	*mip;
965 	boolean_t	need_close;
966 
967 	MAC_DECODE_MPH(mph, mip, need_close);
968 	i_mac_perim_exit(mip);
969 	if (need_close)
970 		mac_close((mac_handle_t)mip);
971 }
972 
973 int
974 mac_hold(const char *macname, mac_impl_t **pmip)
975 {
976 	mac_impl_t	*mip;
977 	int		err;
978 
979 	/*
980 	 * Check the device name length to make sure it won't overflow our
981 	 * buffer.
982 	 */
983 	if (strlen(macname) >= MAXNAMELEN)
984 		return (EINVAL);
985 
986 	/*
987 	 * Look up its entry in the global hash table.
988 	 */
989 	rw_enter(&i_mac_impl_lock, RW_WRITER);
990 	err = mod_hash_find(i_mac_impl_hash, (mod_hash_key_t)macname,
991 	    (mod_hash_val_t *)&mip);
992 
993 	if (err != 0) {
994 		rw_exit(&i_mac_impl_lock);
995 		return (ENOENT);
996 	}
997 
998 	if (mip->mi_state_flags & MIS_DISABLED) {
999 		rw_exit(&i_mac_impl_lock);
1000 		return (ENOENT);
1001 	}
1002 
1003 	if (mip->mi_state_flags & MIS_EXCLUSIVE_HELD) {
1004 		rw_exit(&i_mac_impl_lock);
1005 		return (EBUSY);
1006 	}
1007 
1008 	mip->mi_ref++;
1009 	rw_exit(&i_mac_impl_lock);
1010 
1011 	*pmip = mip;
1012 	return (0);
1013 }
1014 
/*
 * Drop a reference obtained via mac_hold().  When the last reference
 * goes away there must be no active clients and no exclusive hold.
 */
void
mac_rele(mac_impl_t *mip)
{
	rw_enter(&i_mac_impl_lock, RW_WRITER);
	ASSERT(mip->mi_ref != 0);
	if (--mip->mi_ref == 0) {
		ASSERT(mip->mi_nactiveclients == 0 &&
		    !(mip->mi_state_flags & MIS_EXCLUSIVE));
	}
	rw_exit(&i_mac_impl_lock);
}
1026 
1027 /*
1028  * Private GLDv3 function to start a MAC instance.
1029  */
int
mac_start(mac_handle_t mh)
{
	mac_impl_t	*mip = (mac_impl_t *)mh;
	int		err = 0;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
	ASSERT(mip->mi_start != NULL);

	/*
	 * Check whether the device is already started.  The driver, the
	 * default tx ring and the default rx group are only started on
	 * the first reference (mi_active 0 -> 1).
	 */
	if (mip->mi_active++ == 0) {
		mac_ring_t *ring = NULL;

		/*
		 * Start the device.
		 */
		err = mip->mi_start(mip->mi_driver);
		if (err != 0) {
			/* Undo the mi_active bump on failure */
			mip->mi_active--;
			return (err);
		}

		/*
		 * Start the default tx ring.
		 */
		if (mip->mi_default_tx_ring != NULL) {

			ring = (mac_ring_t *)mip->mi_default_tx_ring;
			err = mac_start_ring(ring);
			if (err != 0) {
				/*
				 * NOTE(review): the driver was started via
				 * mi_start above but is not stopped on this
				 * failure path -- confirm whether mi_stop
				 * should be called here.
				 */
				mip->mi_active--;
				return (err);
			}
			ring->mr_state = MR_INUSE;
		}

		if (mip->mi_rx_groups != NULL) {
			/*
			 * Start the default ring, since it will be needed
			 * to receive broadcast and multicast traffic for
			 * both primary and non-primary MAC clients.
			 */
			mac_group_t *grp = &mip->mi_rx_groups[0];

			ASSERT(grp->mrg_state == MAC_GROUP_STATE_REGISTERED);
			err = mac_start_group_and_rings(grp);
			if (err != 0) {
				mip->mi_active--;
				/* Unwind the default tx ring start above */
				if (ring != NULL) {
					mac_stop_ring(ring);
					ring->mr_state = MR_FREE;
				}
				return (err);
			}
			mac_set_rx_group_state(grp, MAC_GROUP_STATE_SHARED);
		}
	}

	return (err);
}
1092 
1093 /*
1094  * Private GLDv3 function to stop a MAC instance.
1095  */
void
mac_stop(mac_handle_t mh)
{
	mac_impl_t	*mip = (mac_impl_t *)mh;

	ASSERT(mip->mi_stop != NULL);
	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));

	/*
	 * Check whether the device is still needed.  Teardown happens only
	 * when the last mac_start reference is dropped (mi_active 1 -> 0),
	 * in the reverse order of mac_start.
	 */
	ASSERT(mip->mi_active != 0);
	if (--mip->mi_active == 0) {
		if (mip->mi_rx_groups != NULL) {
			/*
			 * There should be no more active clients since the
			 * MAC is being stopped. Stop the default RX group
			 * and transition it back to registered state.
			 */
			mac_group_t *grp = &mip->mi_rx_groups[0];

			/*
			 * When clients are torn down, the groups
			 * are released via mac_release_rx_group which
			 * knows that the default group is always in
			 * started mode since broadcast uses it. So
			 * we can assert that there are no clients
			 * (since mac_bcast_add doesn't register itself
			 * as a client) and the group is in SHARED state.
			 */
			ASSERT(grp->mrg_state == MAC_GROUP_STATE_SHARED);
			ASSERT(MAC_RX_GROUP_NO_CLIENT(grp) &&
			    mip->mi_nactiveclients == 0);
			mac_stop_group_and_rings(grp);
			mac_set_rx_group_state(grp, MAC_GROUP_STATE_REGISTERED);
		}

		if (mip->mi_default_tx_ring != NULL) {
			mac_ring_t *ring;

			ring = (mac_ring_t *)mip->mi_default_tx_ring;
			mac_stop_ring(ring);
			ring->mr_state = MR_FREE;
		}

		/*
		 * Stop the device.
		 */
		mip->mi_stop(mip->mi_driver);
	}
}
1147 
/*
 * Reference-counted enable/disable of device-level promiscuous mode.
 * The device is toggled only on the 0 -> 1 and 1 -> 0 transitions of
 * mi_devpromisc; a MAC_NOTE_DEVPROMISC notification is sent on each
 * actual change.  Returns 0, EPROTO for an unbalanced disable, or the
 * driver's error (with the count rolled back).
 */
int
i_mac_promisc_set(mac_impl_t *mip, boolean_t on)
{
	int		err = 0;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
	ASSERT(mip->mi_setpromisc != NULL);

	if (on) {
		/*
		 * Enable promiscuous mode on the device if not yet enabled.
		 */
		if (mip->mi_devpromisc++ == 0) {
			err = mip->mi_setpromisc(mip->mi_driver, B_TRUE);
			if (err != 0) {
				mip->mi_devpromisc--;
				return (err);
			}
			i_mac_notify(mip, MAC_NOTE_DEVPROMISC);
		}
	} else {
		if (mip->mi_devpromisc == 0)
			return (EPROTO);

		/*
		 * Disable promiscuous mode on the device if this is the last
		 * enabling.
		 */
		if (--mip->mi_devpromisc == 0) {
			err = mip->mi_setpromisc(mip->mi_driver, B_FALSE);
			if (err != 0) {
				mip->mi_devpromisc++;
				return (err);
			}
			i_mac_notify(mip, MAC_NOTE_DEVPROMISC);
		}
	}

	return (0);
}
1188 
1189 /*
1190  * The promiscuity state can change any time. If the caller needs to take
1191  * actions that are atomic with the promiscuity state, then the caller needs
1192  * to bracket the entire sequence with mac_perim_enter/exit
1193  */
1194 boolean_t
1195 mac_promisc_get(mac_handle_t mh)
1196 {
1197 	mac_impl_t		*mip = (mac_impl_t *)mh;
1198 
1199 	/*
1200 	 * Return the current promiscuity.
1201 	 */
1202 	return (mip->mi_devpromisc != 0);
1203 }
1204 
1205 /*
1206  * Invoked at MAC instance attach time to initialize the list
1207  * of factory MAC addresses supported by a MAC instance. This function
1208  * builds a local cache in the mac_impl_t for the MAC addresses
1209  * supported by the underlying hardware. The MAC clients themselves
1210  * use the mac_addr_factory*() functions to query and reserve
1211  * factory MAC addresses.
1212  */
void
mac_addr_factory_init(mac_impl_t *mip)
{
	mac_capab_multifactaddr_t capab;
	uint8_t *addr;
	int i;

	/*
	 * First round to see how many factory MAC addresses are available.
	 */
	bzero(&capab, sizeof (capab));
	if (!i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_MULTIFACTADDR,
	    &capab) || (capab.mcm_naddr == 0)) {
		/*
		 * The MAC instance doesn't support multiple factory
		 * MAC addresses, we're done here.
		 */
		return;
	}

	/*
	 * Allocate the space and get all the factory addresses.
	 * Each address occupies a fixed MAXMACADDRLEN slot in the
	 * temporary buffer filled in by the driver's mcm_getaddr.
	 */
	addr = kmem_alloc(capab.mcm_naddr * MAXMACADDRLEN, KM_SLEEP);
	capab.mcm_getaddr(mip->mi_driver, capab.mcm_naddr, addr);

	mip->mi_factory_addr_num = capab.mcm_naddr;
	mip->mi_factory_addr = kmem_zalloc(mip->mi_factory_addr_num *
	    sizeof (mac_factory_addr_t), KM_SLEEP);

	/*
	 * Copy only mt_addr_length significant bytes per address into the
	 * cache; all entries start out unreserved.
	 */
	for (i = 0; i < capab.mcm_naddr; i++) {
		bcopy(addr + i * MAXMACADDRLEN,
		    mip->mi_factory_addr[i].mfa_addr,
		    mip->mi_type->mt_addr_length);
		mip->mi_factory_addr[i].mfa_in_use = B_FALSE;
	}

	kmem_free(addr, capab.mcm_naddr * MAXMACADDRLEN);
}
1252 
1253 void
1254 mac_addr_factory_fini(mac_impl_t *mip)
1255 {
1256 	if (mip->mi_factory_addr == NULL) {
1257 		ASSERT(mip->mi_factory_addr_num == 0);
1258 		return;
1259 	}
1260 
1261 	kmem_free(mip->mi_factory_addr, mip->mi_factory_addr_num *
1262 	    sizeof (mac_factory_addr_t));
1263 
1264 	mip->mi_factory_addr = NULL;
1265 	mip->mi_factory_addr_num = 0;
1266 }
1267 
1268 /*
1269  * Reserve a factory MAC address. If *slot is set to -1, the function
1270  * attempts to reserve any of the available factory MAC addresses and
1271  * returns the reserved slot id. If no slots are available, the function
1272  * returns ENOSPC. If *slot is not set to -1, the function reserves
1273  * the specified slot if it is available, or returns EBUSY is the slot
1274  * is already used. Returns ENOTSUP if the underlying MAC does not
1275  * support multiple factory addresses. If the slot number is not -1 but
1276  * is invalid, returns EINVAL.
1277  */
int
mac_addr_factory_reserve(mac_client_handle_t mch, int *slot)
{
	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
	mac_impl_t *mip = mcip->mci_mip;
	int i, ret = 0;

	i_mac_perim_enter(mip);
	/*
	 * Protect against concurrent readers that may need a self-consistent
	 * view of the factory addresses
	 */
	rw_enter(&mip->mi_rw_lock, RW_WRITER);

	if (mip->mi_factory_addr_num == 0) {
		ret = ENOTSUP;
		goto bail;
	}

	if (*slot != -1) {
		/* check the specified slot (slot ids are 1-based) */
		if (*slot < 1 || *slot > mip->mi_factory_addr_num) {
			ret = EINVAL;
			goto bail;
		}
		if (mip->mi_factory_addr[*slot-1].mfa_in_use) {
			ret = EBUSY;
			goto bail;
		}
	} else {
		/* pick the next available slot */
		for (i = 0; i < mip->mi_factory_addr_num; i++) {
			if (!mip->mi_factory_addr[i].mfa_in_use)
				break;
		}

		if (i == mip->mi_factory_addr_num) {
			ret = ENOSPC;
			goto bail;
		}
		/* report the chosen slot back to the caller, 1-based */
		*slot = i+1;
	}

	mip->mi_factory_addr[*slot-1].mfa_in_use = B_TRUE;
	mip->mi_factory_addr[*slot-1].mfa_client = mcip;

bail:
	rw_exit(&mip->mi_rw_lock);
	i_mac_perim_exit(mip);
	return (ret);
}
1329 
1330 /*
1331  * Release the specified factory MAC address slot.
1332  */
void
mac_addr_factory_release(mac_client_handle_t mch, uint_t slot)
{
	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
	mac_impl_t *mip = mcip->mci_mip;

	i_mac_perim_enter(mip);
	/*
	 * Protect against concurrent readers that may need a self-consistent
	 * view of the factory addresses
	 */
	rw_enter(&mip->mi_rw_lock, RW_WRITER);

	/* Slot ids are 1-based and must have been reserved */
	ASSERT(slot > 0 && slot <= mip->mi_factory_addr_num);
	ASSERT(mip->mi_factory_addr[slot-1].mfa_in_use);

	mip->mi_factory_addr[slot-1].mfa_in_use = B_FALSE;

	rw_exit(&mip->mi_rw_lock);
	i_mac_perim_exit(mip);
}
1354 
1355 /*
1356  * Stores in mac_addr the value of the specified MAC address. Returns
1357  * 0 on success, or EINVAL if the slot number is not valid for the MAC.
1358  * The caller must provide a string of at least MAXNAMELEN bytes.
1359  */
void
mac_addr_factory_value(mac_handle_t mh, int slot, uchar_t *mac_addr,
    uint_t *addr_len, char *client_name, boolean_t *in_use_arg)
{
	mac_impl_t *mip = (mac_impl_t *)mh;
	boolean_t in_use;

	ASSERT(slot > 0 && slot <= mip->mi_factory_addr_num);

	/*
	 * Readers need to hold mi_rw_lock. Writers need to hold mac perimeter
	 * and mi_rw_lock
	 */
	rw_enter(&mip->mi_rw_lock, RW_READER);
	/* Full MAXMACADDRLEN bytes copied; *addr_len gives the valid length */
	bcopy(mip->mi_factory_addr[slot-1].mfa_addr, mac_addr, MAXMACADDRLEN);
	*addr_len = mip->mi_type->mt_addr_length;
	in_use = mip->mi_factory_addr[slot-1].mfa_in_use;
	if (in_use && client_name != NULL) {
		bcopy(mip->mi_factory_addr[slot-1].mfa_client->mci_name,
		    client_name, MAXNAMELEN);
	}
	if (in_use_arg != NULL)
		*in_use_arg = in_use;
	rw_exit(&mip->mi_rw_lock);
}
1385 
1386 /*
1387  * Returns the number of factory MAC addresses (in addition to the
1388  * primary MAC address), 0 if the underlying MAC doesn't support
1389  * that feature.
1390  */
1391 uint_t
1392 mac_addr_factory_num(mac_handle_t mh)
1393 {
1394 	mac_impl_t *mip = (mac_impl_t *)mh;
1395 
1396 	return (mip->mi_factory_addr_num);
1397 }
1398 
1399 
1400 void
1401 mac_rx_group_unmark(mac_group_t *grp, uint_t flag)
1402 {
1403 	mac_ring_t	*ring;
1404 
1405 	for (ring = grp->mrg_rings; ring != NULL; ring = ring->mr_next)
1406 		ring->mr_flag &= ~flag;
1407 }
1408 
1409 /*
1410  * The following mac_hwrings_xxx() functions are private mac client functions
1411  * used by the aggr driver to access and control the underlying HW Rx group
1412  * and rings. In this case, the aggr driver has exclusive control of the
1413  * underlying HW Rx group/rings, it calls the following functions to
1414  * start/stop the HW Rx rings, disable/enable polling, add/remove mac'
1415  * addresses, or set up the Rx callback.
1416  */
1417 /* ARGSUSED */
1418 static void
1419 mac_hwrings_rx_process(void *arg, mac_resource_handle_t srs,
1420     mblk_t *mp_chain, boolean_t loopback)
1421 {
1422 	mac_soft_ring_set_t	*mac_srs = (mac_soft_ring_set_t *)srs;
1423 	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;
1424 	mac_direct_rx_t		proc;
1425 	void			*arg1;
1426 	mac_resource_handle_t	arg2;
1427 
1428 	proc = srs_rx->sr_func;
1429 	arg1 = srs_rx->sr_arg1;
1430 	arg2 = mac_srs->srs_mrh;
1431 
1432 	proc(arg1, arg2, mp_chain, NULL);
1433 }
1434 
1435 /*
1436  * This function is called to get the list of HW rings that are reserved by
1437  * an exclusive mac client.
1438  *
1439  * Return value: the number of HW rings.
1440  */
int
mac_hwrings_get(mac_client_handle_t mch, mac_group_handle_t *hwgh,
    mac_ring_handle_t *hwrh, mac_ring_type_t rtype)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
	int			cnt = 0;

	switch (rtype) {
	case MAC_RING_TYPE_RX: {
		flow_entry_t	*flent = mcip->mci_flent;
		mac_group_t	*grp;
		mac_ring_t	*ring;

		grp = flent->fe_rx_ring_group;
		/*
		 * The mac client did not reserve any RX group, return directly.
		 * This is probably because the underlying MAC does not support
		 * any groups.
		 */
		*hwgh = NULL;
		if (grp == NULL)
			return (0);
		/*
		 * This group must be reserved by this mac client.
		 */
		ASSERT((grp->mrg_state == MAC_GROUP_STATE_RESERVED) &&
		    (mch == (mac_client_handle_t)
		    (MAC_RX_GROUP_ONLY_CLIENT(grp))));
		/* Hand back every ring in the group, bounded by the max */
		for (ring = grp->mrg_rings;
		    ring != NULL; ring = ring->mr_next, cnt++) {
			ASSERT(cnt < MAX_RINGS_PER_GROUP);
			hwrh[cnt] = (mac_ring_handle_t)ring;
		}
		*hwgh = (mac_group_handle_t)grp;
		return (cnt);
	}
	case MAC_RING_TYPE_TX: {
		mac_soft_ring_set_t	*tx_srs;
		mac_srs_tx_t		*tx;

		/* Tx rings come from the client's Tx SRS; *hwgh is not set */
		tx_srs = MCIP_TX_SRS(mcip);
		tx = &tx_srs->srs_tx;
		for (; cnt < tx->st_ring_count; cnt++)
			hwrh[cnt] = tx->st_rings[cnt];
		return (cnt);
	}
	default:
		ASSERT(B_FALSE);
		return (-1);
	}
}
1492 
1493 /*
1494  * Setup the RX callback of the mac client which exclusively controls HW ring.
1495  */
1496 void
1497 mac_hwring_setup(mac_ring_handle_t hwrh, mac_resource_handle_t prh)
1498 {
1499 	mac_ring_t		*hw_ring = (mac_ring_t *)hwrh;
1500 	mac_soft_ring_set_t	*mac_srs = hw_ring->mr_srs;
1501 
1502 	mac_srs->srs_mrh = prh;
1503 	mac_srs->srs_rx.sr_lower_proc = mac_hwrings_rx_process;
1504 }
1505 
1506 void
1507 mac_hwring_teardown(mac_ring_handle_t hwrh)
1508 {
1509 	mac_ring_t		*hw_ring = (mac_ring_t *)hwrh;
1510 	mac_soft_ring_set_t	*mac_srs = hw_ring->mr_srs;
1511 
1512 	mac_srs->srs_rx.sr_lower_proc = mac_rx_srs_process;
1513 	mac_srs->srs_mrh = NULL;
1514 }
1515 
1516 int
1517 mac_hwring_disable_intr(mac_ring_handle_t rh)
1518 {
1519 	mac_ring_t *rr_ring = (mac_ring_t *)rh;
1520 	mac_intr_t *intr = &rr_ring->mr_info.mri_intr;
1521 
1522 	return (intr->mi_disable(intr->mi_handle));
1523 }
1524 
1525 int
1526 mac_hwring_enable_intr(mac_ring_handle_t rh)
1527 {
1528 	mac_ring_t *rr_ring = (mac_ring_t *)rh;
1529 	mac_intr_t *intr = &rr_ring->mr_info.mri_intr;
1530 
1531 	return (intr->mi_enable(intr->mi_handle));
1532 }
1533 
1534 int
1535 mac_hwring_start(mac_ring_handle_t rh)
1536 {
1537 	mac_ring_t *rr_ring = (mac_ring_t *)rh;
1538 
1539 	MAC_RING_UNMARK(rr_ring, MR_QUIESCE);
1540 	return (0);
1541 }
1542 
1543 void
1544 mac_hwring_stop(mac_ring_handle_t rh)
1545 {
1546 	mac_ring_t *rr_ring = (mac_ring_t *)rh;
1547 
1548 	mac_rx_ring_quiesce(rr_ring, MR_QUIESCE);
1549 }
1550 
1551 mblk_t *
1552 mac_hwring_poll(mac_ring_handle_t rh, int bytes_to_pickup)
1553 {
1554 	mac_ring_t *rr_ring = (mac_ring_t *)rh;
1555 	mac_ring_info_t *info = &rr_ring->mr_info;
1556 
1557 	return (info->mri_poll(info->mri_driver, bytes_to_pickup));
1558 }
1559 
1560 /*
1561  * Send packets through the selected tx ring.
1562  */
1563 mblk_t *
1564 mac_hwring_tx(mac_ring_handle_t rh, mblk_t *mp)
1565 {
1566 	mac_ring_t *ring = (mac_ring_t *)rh;
1567 	mac_ring_info_t *info = &ring->mr_info;
1568 
1569 	ASSERT(ring->mr_type == MAC_RING_TYPE_TX &&
1570 	    ring->mr_state >= MR_INUSE);
1571 	return (info->mri_tx(info->mri_driver, mp));
1572 }
1573 
1574 int
1575 mac_hwgroup_addmac(mac_group_handle_t gh, const uint8_t *addr)
1576 {
1577 	mac_group_t *group = (mac_group_t *)gh;
1578 
1579 	return (mac_group_addmac(group, addr));
1580 }
1581 
1582 int
1583 mac_hwgroup_remmac(mac_group_handle_t gh, const uint8_t *addr)
1584 {
1585 	mac_group_t *group = (mac_group_t *)gh;
1586 
1587 	return (mac_group_remmac(group, addr));
1588 }
1589 
1590 /*
1591  * Set the RX group to be shared/reserved. Note that the group must be
1592  * started/stopped outside of this function.
1593  */
void
mac_set_rx_group_state(mac_group_t *grp, mac_group_state_t state)
{
	/*
	 * If there is no change in the group state, just return.
	 */
	if (grp->mrg_state == state)
		return;

	switch (state) {
	case MAC_GROUP_STATE_RESERVED:
		/*
		 * Successfully reserved the group.
		 *
		 * Given that there is an exclusive client controlling this
		 * group, we enable the group level polling when available,
		 * so that SRSs get to turn on/off individual rings they're
		 * assigned to.
		 */
		ASSERT(MAC_PERIM_HELD(grp->mrg_mh));

		if (GROUP_INTR_DISABLE_FUNC(grp) != NULL)
			GROUP_INTR_DISABLE_FUNC(grp)(GROUP_INTR_HANDLE(grp));

		break;

	case MAC_GROUP_STATE_SHARED:
		/*
		 * Set all rings of this group to software classified.
		 * If the group has an overriding interrupt, then re-enable it.
		 */
		ASSERT(MAC_PERIM_HELD(grp->mrg_mh));

		if (GROUP_INTR_ENABLE_FUNC(grp) != NULL)
			GROUP_INTR_ENABLE_FUNC(grp)(GROUP_INTR_HANDLE(grp));

		/* The ring is not available for reservations any more */
		break;

	case MAC_GROUP_STATE_REGISTERED:
		/* Also callable from mac_register, perim is not held */
		break;

	default:
		ASSERT(B_FALSE);
		break;
	}

	grp->mrg_state = state;
}
1644 
1645 /*
1646  * Quiesce future hardware classified packets for the specified Rx ring
1647  */
static void
mac_rx_ring_quiesce(mac_ring_t *rx_ring, uint_t ring_flag)
{
	ASSERT(rx_ring->mr_classify_type == MAC_HW_CLASSIFIER);
	ASSERT(ring_flag == MR_CONDEMNED || ring_flag  == MR_QUIESCE);

	/*
	 * Marking the ring prevents mr_refcnt from increasing; then wait
	 * for all in-flight driver upcalls on this ring to drain.
	 */
	mutex_enter(&rx_ring->mr_lock);
	rx_ring->mr_flag |= ring_flag;
	while (rx_ring->mr_refcnt != 0)
		cv_wait(&rx_ring->mr_cv, &rx_ring->mr_lock);
	mutex_exit(&rx_ring->mr_lock);
}
1660 
1661 /*
1662  * Please see mac_tx for details about the per cpu locking scheme
1663  */
static void
mac_tx_lock_all(mac_client_impl_t *mcip)
{
	int	i;

	/*
	 * Acquire all mac_tx_percpu_cnt + 1 per-cpu Tx locks, always in
	 * ascending index order to avoid deadlock with other lockers.
	 */
	for (i = 0; i <= mac_tx_percpu_cnt; i++)
		mutex_enter(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
}
1672 
static void
mac_tx_unlock_all(mac_client_impl_t *mcip)
{
	int	i;

	/* Release all per-cpu Tx locks, in reverse of the acquire order */
	for (i = mac_tx_percpu_cnt; i >= 0; i--)
		mutex_exit(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
}
1681 
static void
mac_tx_unlock_allbutzero(mac_client_impl_t *mcip)
{
	int	i;

	/*
	 * Release every per-cpu Tx lock except index 0, which the caller
	 * keeps held (e.g. to cv_wait on it in mac_tx_client_block).
	 */
	for (i = mac_tx_percpu_cnt; i > 0; i--)
		mutex_exit(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
}
1690 
1691 static int
1692 mac_tx_sum_refcnt(mac_client_impl_t *mcip)
1693 {
1694 	int	i;
1695 	int	refcnt = 0;
1696 
1697 	for (i = 0; i <= mac_tx_percpu_cnt; i++)
1698 		refcnt += mcip->mci_tx_pcpu[i].pcpu_tx_refcnt;
1699 
1700 	return (refcnt);
1701 }
1702 
1703 /*
1704  * Stop future Tx packets coming down from the client in preparation for
1705  * quiescing the Tx side. This is needed for dynamic reclaim and reassignment
1706  * of rings between clients
1707  */
void
mac_tx_client_block(mac_client_impl_t *mcip)
{
	/*
	 * Setting MCI_TX_QUIESCE under all per-cpu locks stops new Tx
	 * entries; then wait for the per-cpu refcnts to drain.  While
	 * waiting we hold only lock 0 (the cv's lock) so Tx threads can
	 * decrement their refcnts and signal mci_tx_cv.
	 */
	mac_tx_lock_all(mcip);
	mcip->mci_tx_flag |= MCI_TX_QUIESCE;
	while (mac_tx_sum_refcnt(mcip) != 0) {
		mac_tx_unlock_allbutzero(mcip);
		cv_wait(&mcip->mci_tx_cv, &mcip->mci_tx_pcpu[0].pcpu_tx_lock);
		mutex_exit(&mcip->mci_tx_pcpu[0].pcpu_tx_lock);
		/* Reacquire all locks before re-checking the total refcnt */
		mac_tx_lock_all(mcip);
	}
	mac_tx_unlock_all(mcip);
}
1721 
/*
 * Re-open the Tx path that mac_tx_client_block() closed.
 */
void
mac_tx_client_unblock(mac_client_impl_t *mcip)
{
	mac_tx_lock_all(mcip);
	mcip->mci_tx_flag &= ~MCI_TX_QUIESCE;
	mac_tx_unlock_all(mcip);
	/*
	 * We may fail to disable flow control for the last MAC_NOTE_TX
	 * notification because the MAC client is quiesced. Send the
	 * notification again.
	 */
	i_mac_notify(mcip->mci_mip, MAC_NOTE_TX);
}
1735 
1736 /*
1737  * Wait for an SRS to quiesce. The SRS worker will signal us when the
1738  * quiesce is done.
1739  */
static void
mac_srs_quiesce_wait(mac_soft_ring_set_t *srs, uint_t srs_flag)
{
	/* Block until the SRS worker sets srs_flag in srs_state */
	mutex_enter(&srs->srs_lock);
	while (!(srs->srs_state & srs_flag))
		cv_wait(&srs->srs_quiesce_done_cv, &srs->srs_lock);
	mutex_exit(&srs->srs_lock);
}
1748 
1749 /*
1750  * Quiescing an Rx SRS is achieved by the following sequence. The protocol
1751  * works bottom up by cutting off packet flow from the bottommost point in the
1752  * mac, then the SRS, and then the soft rings. There are 2 use cases of this
1753  * mechanism. One is a temporary quiesce of the SRS, such as say while changing
1754  * the Rx callbacks. Another use case is Rx SRS teardown. In the former case
1755  * the QUIESCE prefix/suffix is used and in the latter the CONDEMNED is used
1756  * for the SRS and MR flags. In the former case the threads pause waiting for
1757  * a restart, while in the latter case the threads exit. The Tx SRS teardown
1758  * is also mostly similar to the above.
1759  *
1760  * 1. Stop future hardware classified packets at the lowest level in the mac.
1761  *    Remove any hardware classification rule (CONDEMNED case) and mark the
1762  *    rings as CONDEMNED or QUIESCE as appropriate. This prevents the mr_refcnt
1763  *    from increasing. Upcalls from the driver that come through hardware
1764  *    classification will be dropped in mac_rx from now on. Then we wait for
1765  *    the mr_refcnt to drop to zero. When the mr_refcnt reaches zero we are
1766  *    sure there aren't any upcall threads from the driver through hardware
1767  *    classification. In the case of SRS teardown we also remove the
1768  *    classification rule in the driver.
1769  *
1770  * 2. Stop future software classified packets by marking the flow entry with
1771  *    FE_QUIESCE or FE_CONDEMNED as appropriate which prevents the refcnt from
1772  *    increasing. We also remove the flow entry from the table in the latter
1773  *    case. Then wait for the fe_refcnt to reach an appropriate quiescent value
1774  *    that indicates there aren't any active threads using that flow entry.
1775  *
1776  * 3. Quiesce the SRS and softrings by signaling the SRS. The SRS poll thread,
1777  *    SRS worker thread, and the soft ring threads are quiesced in sequence
1778  *    with the SRS worker thread serving as a master controller. This
1779  *    mechansim is explained in mac_srs_worker_quiesce().
1780  *
1781  * The restart mechanism to reactivate the SRS and softrings is explained
1782  * in mac_srs_worker_restart(). Here we just signal the SRS worker to start the
1783  * restart sequence.
1784  */
void
mac_rx_srs_quiesce(mac_soft_ring_set_t *srs, uint_t srs_quiesce_flag)
{
	flow_entry_t	*flent = srs->srs_flent;
	uint_t	mr_flag, srs_done_flag;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)FLENT_TO_MIP(flent)));
	ASSERT(!(srs->srs_type & SRST_TX));

	/*
	 * Pick the ring/SRS flag pair matching the request: CONDEMNED for
	 * permanent teardown, QUIESCE for a temporary pause.  Client
	 * polling is disabled (teardown) or quiesced (pause) accordingly.
	 */
	if (srs_quiesce_flag == SRS_CONDEMNED) {
		mr_flag = MR_CONDEMNED;
		srs_done_flag = SRS_CONDEMNED_DONE;
		if (srs->srs_type & SRST_CLIENT_POLL_ENABLED)
			mac_srs_client_poll_disable(srs->srs_mcip, srs);
	} else {
		ASSERT(srs_quiesce_flag == SRS_QUIESCE);
		mr_flag = MR_QUIESCE;
		srs_done_flag = SRS_QUIESCE_DONE;
		if (srs->srs_type & SRST_CLIENT_POLL_ENABLED)
			mac_srs_client_poll_quiesce(srs->srs_mcip, srs);
	}

	/* Step 1 of the protocol: cut off packets at the lowest level */
	if (srs->srs_ring != NULL) {
		mac_rx_ring_quiesce(srs->srs_ring, mr_flag);
	} else {
		/*
		 * SRS is driven by software classification. In case
		 * of CONDEMNED, the top level teardown functions will
		 * deal with flow removal.
		 */
		if (srs_quiesce_flag != SRS_CONDEMNED) {
			FLOW_MARK(flent, FE_QUIESCE);
			mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
		}
	}

	/*
	 * Signal the SRS to quiesce itself, and then cv_wait for the
	 * SRS quiesce to complete. The SRS worker thread will wake us
	 * up when the quiesce is complete
	 */
	mac_srs_signal(srs, srs_quiesce_flag);
	mac_srs_quiesce_wait(srs, srs_done_flag);
}
1829 
1830 /*
1831  * Remove an SRS.
1832  */
void
mac_rx_srs_remove(mac_soft_ring_set_t *srs)
{
	flow_entry_t *flent = srs->srs_flent;
	int i;

	/* Permanently quiesce the SRS before unlinking and freeing it */
	mac_rx_srs_quiesce(srs, SRS_CONDEMNED);
	/*
	 * Locate and remove our entry in the fe_rx_srs[] array, and
	 * adjust the fe_rx_srs array entries and array count by
	 * moving the last entry into the vacated spot.
	 */
	mutex_enter(&flent->fe_lock);
	for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
		if (flent->fe_rx_srs[i] == srs)
			break;
	}

	/*
	 * NOTE(review): i != 0 implies the removed SRS is never at slot 0
	 * -- presumably slot 0 holds the flow's default SRS; confirm
	 * against the SRS setup code.
	 */
	ASSERT(i != 0 && i < flent->fe_rx_srs_cnt);
	if (i != flent->fe_rx_srs_cnt - 1) {
		flent->fe_rx_srs[i] =
		    flent->fe_rx_srs[flent->fe_rx_srs_cnt - 1];
		i = flent->fe_rx_srs_cnt - 1;
	}

	flent->fe_rx_srs[i] = NULL;
	flent->fe_rx_srs_cnt--;
	mutex_exit(&flent->fe_lock);

	mac_srs_free(srs);
}
1864 
static void
mac_srs_clear_flag(mac_soft_ring_set_t *srs, uint_t flag)
{
	/* Clear 'flag' from srs_state under the SRS lock */
	mutex_enter(&srs->srs_lock);
	srs->srs_state &= ~flag;
	mutex_exit(&srs->srs_lock);
}
1872 
void
mac_rx_srs_restart(mac_soft_ring_set_t *srs)
{
	flow_entry_t	*flent = srs->srs_flent;
	mac_ring_t	*mr;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)FLENT_TO_MIP(flent)));
	ASSERT((srs->srs_type & SRST_TX) == 0);

	/*
	 * This handles a change in the number of SRSs between the quiesce
	 * and restart operation of a flow.
	 */
	if (!SRS_QUIESCED(srs))
		return;

	/*
	 * Signal the SRS to restart itself. Wait for the restart to complete
	 * Note that we only restart the SRS if it is not marked as
	 * permanently quiesced.
	 */
	if (!SRS_QUIESCED_PERMANENT(srs)) {
		mac_srs_signal(srs, SRS_RESTART);
		mac_srs_quiesce_wait(srs, SRS_RESTART_DONE);
		mac_srs_clear_flag(srs, SRS_RESTART_DONE);

		mac_srs_client_poll_restart(srs->srs_mcip, srs);
	}

	/* Finally clear the flags to let the packets in */
	mr = srs->srs_ring;
	if (mr != NULL) {
		/* Hardware classified path: unmark the ring */
		MAC_RING_UNMARK(mr, MR_QUIESCE);
		/* In case the ring was stopped, safely restart it */
		(void) mac_start_ring(mr);
	} else {
		/* Software classified path: unmark the flow entry */
		FLOW_UNMARK(flent, FE_QUIESCE);
	}
}
1912 
1913 /*
1914  * Temporary quiesce of a flow and associated Rx SRS.
1915  * Please see block comment above mac_rx_classify_flow_rem.
1916  */
1917 /* ARGSUSED */
1918 int
1919 mac_rx_classify_flow_quiesce(flow_entry_t *flent, void *arg)
1920 {
1921 	int		i;
1922 
1923 	for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
1924 		mac_rx_srs_quiesce((mac_soft_ring_set_t *)flent->fe_rx_srs[i],
1925 		    SRS_QUIESCE);
1926 	}
1927 	return (0);
1928 }
1929 
1930 /*
1931  * Restart a flow and associated Rx SRS that has been quiesced temporarily
1932  * Please see block comment above mac_rx_classify_flow_rem
1933  */
1934 /* ARGSUSED */
1935 int
1936 mac_rx_classify_flow_restart(flow_entry_t *flent, void *arg)
1937 {
1938 	int		i;
1939 
1940 	for (i = 0; i < flent->fe_rx_srs_cnt; i++)
1941 		mac_rx_srs_restart((mac_soft_ring_set_t *)flent->fe_rx_srs[i]);
1942 
1943 	return (0);
1944 }
1945 
/*
 * Set or clear the "permanently quiesced" mark on all Rx SRSes of a mac
 * client.  An SRS with SRS_QUIESCE_PERM set is skipped by
 * mac_rx_srs_restart(), so it stays quiesced across quiesce/restart
 * cycles until the flag is cleared.  Called with the mac perimeter held.
 */
void
mac_srs_perm_quiesce(mac_client_handle_t mch, boolean_t on)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
	flow_entry_t		*flent = mcip->mci_flent;
	mac_impl_t		*mip = mcip->mci_mip;
	mac_soft_ring_set_t	*mac_srs;
	int			i;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));

	/* Nothing to do if the client has no flow entry. */
	if (flent == NULL)
		return;

	for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
		mac_srs = flent->fe_rx_srs[i];
		mutex_enter(&mac_srs->srs_lock);
		if (on)
			mac_srs->srs_state |= SRS_QUIESCE_PERM;
		else
			mac_srs->srs_state &= ~SRS_QUIESCE_PERM;
		mutex_exit(&mac_srs->srs_lock);
	}
}
1970 
1971 void
1972 mac_rx_client_quiesce(mac_client_handle_t mch)
1973 {
1974 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1975 	mac_impl_t		*mip = mcip->mci_mip;
1976 
1977 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1978 
1979 	if (MCIP_DATAPATH_SETUP(mcip)) {
1980 		(void) mac_rx_classify_flow_quiesce(mcip->mci_flent,
1981 		    NULL);
1982 		(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
1983 		    mac_rx_classify_flow_quiesce, NULL);
1984 	}
1985 }
1986 
1987 void
1988 mac_rx_client_restart(mac_client_handle_t mch)
1989 {
1990 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1991 	mac_impl_t		*mip = mcip->mci_mip;
1992 
1993 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1994 
1995 	if (MCIP_DATAPATH_SETUP(mcip)) {
1996 		(void) mac_rx_classify_flow_restart(mcip->mci_flent, NULL);
1997 		(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
1998 		    mac_rx_classify_flow_restart, NULL);
1999 	}
2000 }
2001 
2002 /*
2003  * This function only quiesces the Tx SRS and softring worker threads. Callers
2004  * need to make sure that there aren't any mac client threads doing current or
2005  * future transmits in the mac before calling this function.
2006  */
void
mac_tx_srs_quiesce(mac_soft_ring_set_t *srs, uint_t srs_quiesce_flag)
{
	mac_client_impl_t	*mcip = srs->srs_mcip;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));

	/* Tx SRSes only; the quiesce is temporary or permanent (condemned) */
	ASSERT(srs->srs_type & SRST_TX);
	ASSERT(srs_quiesce_flag == SRS_CONDEMNED ||
	    srs_quiesce_flag == SRS_QUIESCE);

	/*
	 * Signal the SRS to quiesce itself, and then cv_wait for the
	 * SRS quiesce to complete. The SRS worker thread will wake us
	 * up when the quiesce is complete
	 */
	mac_srs_signal(srs, srs_quiesce_flag);
	mac_srs_quiesce_wait(srs, srs_quiesce_flag == SRS_QUIESCE ?
	    SRS_QUIESCE_DONE : SRS_CONDEMNED_DONE);
}
2027 
2028 void
2029 mac_tx_srs_restart(mac_soft_ring_set_t *srs)
2030 {
2031 	/*
2032 	 * Resizing the fanout could result in creation of new SRSs.
2033 	 * They may not necessarily be in the quiesced state in which
2034 	 * case it need be restarted
2035 	 */
2036 	if (!SRS_QUIESCED(srs))
2037 		return;
2038 
2039 	mac_srs_signal(srs, SRS_RESTART);
2040 	mac_srs_quiesce_wait(srs, SRS_RESTART_DONE);
2041 	mac_srs_clear_flag(srs, SRS_RESTART_DONE);
2042 }
2043 
2044 /*
 * Temporary quiesce of a flow and associated Tx SRS.
2046  * Please see block comment above mac_rx_srs_quiesce
2047  */
2048 /* ARGSUSED */
2049 int
2050 mac_tx_flow_quiesce(flow_entry_t *flent, void *arg)
2051 {
2052 	/*
2053 	 * The fe_tx_srs is null for a subflow on an interface that is
2054 	 * not plumbed
2055 	 */
2056 	if (flent->fe_tx_srs != NULL)
2057 		mac_tx_srs_quiesce(flent->fe_tx_srs, SRS_QUIESCE);
2058 	return (0);
2059 }
2060 
2061 /* ARGSUSED */
2062 int
2063 mac_tx_flow_restart(flow_entry_t *flent, void *arg)
2064 {
2065 	/*
2066 	 * The fe_tx_srs is null for a subflow on an interface that is
2067 	 * not plumbed
2068 	 */
2069 	if (flent->fe_tx_srs != NULL)
2070 		mac_tx_srs_restart(flent->fe_tx_srs);
2071 	return (0);
2072 }
2073 
2074 void
2075 mac_tx_client_quiesce(mac_client_impl_t *mcip, uint_t srs_quiesce_flag)
2076 {
2077 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2078 
2079 	mac_tx_client_block(mcip);
2080 	if (MCIP_TX_SRS(mcip) != NULL) {
2081 		mac_tx_srs_quiesce(MCIP_TX_SRS(mcip), srs_quiesce_flag);
2082 		(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
2083 		    mac_tx_flow_quiesce, NULL);
2084 	}
2085 }
2086 
2087 void
2088 mac_tx_client_restart(mac_client_impl_t *mcip)
2089 {
2090 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2091 
2092 	mac_tx_client_unblock(mcip);
2093 	if (MCIP_TX_SRS(mcip) != NULL) {
2094 		mac_tx_srs_restart(MCIP_TX_SRS(mcip));
2095 		(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
2096 		    mac_tx_flow_restart, NULL);
2097 	}
2098 }
2099 
/*
 * Flush the client's Tx path by quiescing its Tx SRSes and then
 * immediately restarting them.  Called with the mac perimeter held.
 */
void
mac_tx_client_flush(mac_client_impl_t *mcip)
{
	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));

	mac_tx_client_quiesce(mcip, SRS_QUIESCE);
	mac_tx_client_restart(mcip);
}
2108 
/* Quiesce both the Rx and Tx datapaths of a mac client. */
void
mac_client_quiesce(mac_client_impl_t *mcip)
{
	mac_rx_client_quiesce((mac_client_handle_t)mcip);
	mac_tx_client_quiesce(mcip, SRS_QUIESCE);
}
2115 
/* Restart both the Rx and Tx datapaths of a mac client. */
void
mac_client_restart(mac_client_impl_t *mcip)
{
	mac_rx_client_restart((mac_client_handle_t)mcip);
	mac_tx_client_restart(mcip);
}
2122 
2123 /*
2124  * Allocate a minor number.
2125  */
2126 minor_t
2127 mac_minor_hold(boolean_t sleep)
2128 {
2129 	minor_t	minor;
2130 
2131 	/*
2132 	 * Grab a value from the arena.
2133 	 */
2134 	atomic_add_32(&minor_count, 1);
2135 
2136 	if (sleep)
2137 		minor = (uint_t)id_alloc(minor_ids);
2138 	else
2139 		minor = (uint_t)id_alloc_nosleep(minor_ids);
2140 
2141 	if (minor == 0) {
2142 		atomic_add_32(&minor_count, -1);
2143 		return (0);
2144 	}
2145 
2146 	return (minor);
2147 }
2148 
2149 /*
2150  * Release a previously allocated minor number.
2151  */
2152 void
2153 mac_minor_rele(minor_t minor)
2154 {
2155 	/*
2156 	 * Return the value to the arena.
2157 	 */
2158 	id_free(minor_ids, minor);
2159 	atomic_add_32(&minor_count, -1);
2160 }
2161 
2162 uint32_t
2163 mac_no_notification(mac_handle_t mh)
2164 {
2165 	mac_impl_t *mip = (mac_impl_t *)mh;
2166 
2167 	return (((mip->mi_state_flags & MIS_LEGACY) != 0) ?
2168 	    mip->mi_capab_legacy.ml_unsup_note : 0);
2169 }
2170 
2171 /*
2172  * Prevent any new opens of this mac in preparation for unregister
2173  */
2174 int
2175 i_mac_disable(mac_impl_t *mip)
2176 {
2177 	mac_client_impl_t	*mcip;
2178 
2179 	rw_enter(&i_mac_impl_lock, RW_WRITER);
2180 	if (mip->mi_state_flags & MIS_DISABLED) {
2181 		/* Already disabled, return success */
2182 		rw_exit(&i_mac_impl_lock);
2183 		return (0);
2184 	}
2185 	/*
2186 	 * See if there are any other references to this mac_t (e.g., VLAN's).
2187 	 * If so return failure. If all the other checks below pass, then
2188 	 * set mi_disabled atomically under the i_mac_impl_lock to prevent
2189 	 * any new VLAN's from being created or new mac client opens of this
2190 	 * mac end point.
2191 	 */
2192 	if (mip->mi_ref > 0) {
2193 		rw_exit(&i_mac_impl_lock);
2194 		return (EBUSY);
2195 	}
2196 
2197 	/*
2198 	 * mac clients must delete all multicast groups they join before
2199 	 * closing. bcast groups are reference counted, the last client
2200 	 * to delete the group will wait till the group is physically
2201 	 * deleted. Since all clients have closed this mac end point
2202 	 * mi_bcast_ngrps must be zero at this point
2203 	 */
2204 	ASSERT(mip->mi_bcast_ngrps == 0);
2205 
2206 	/*
2207 	 * Don't let go of this if it has some flows.
2208 	 * All other code guarantees no flows are added to a disabled
2209 	 * mac, therefore it is sufficient to check for the flow table
2210 	 * only here.
2211 	 */
2212 	mcip = mac_primary_client_handle(mip);
2213 	if ((mcip != NULL) && mac_link_has_flows((mac_client_handle_t)mcip)) {
2214 		rw_exit(&i_mac_impl_lock);
2215 		return (ENOTEMPTY);
2216 	}
2217 
2218 	mip->mi_state_flags |= MIS_DISABLED;
2219 	rw_exit(&i_mac_impl_lock);
2220 	return (0);
2221 }
2222 
2223 int
2224 mac_disable_nowait(mac_handle_t mh)
2225 {
2226 	mac_impl_t	*mip = (mac_impl_t *)mh;
2227 	int err;
2228 
2229 	if ((err = i_mac_perim_enter_nowait(mip)) != 0)
2230 		return (err);
2231 	err = i_mac_disable(mip);
2232 	i_mac_perim_exit(mip);
2233 	return (err);
2234 }
2235 
2236 int
2237 mac_disable(mac_handle_t mh)
2238 {
2239 	mac_impl_t	*mip = (mac_impl_t *)mh;
2240 	int err;
2241 
2242 	i_mac_perim_enter(mip);
2243 	err = i_mac_disable(mip);
2244 	i_mac_perim_exit(mip);
2245 
2246 	/*
2247 	 * Clean up notification thread and wait for it to exit.
2248 	 */
2249 	if (err == 0)
2250 		i_mac_notify_exit(mip);
2251 
2252 	return (err);
2253 }
2254 
2255 /*
2256  * Called when the MAC instance has a non empty flow table, to de-multiplex
2257  * incoming packets to the right flow.
2258  * The MAC's rw lock is assumed held as a READER.
2259  */
/* ARGSUSED */
static mblk_t *
mac_rx_classify(mac_impl_t *mip, mac_resource_handle_t mrh, mblk_t *mp)
{
	flow_entry_t	*flent = NULL;
	uint_t		flags = FLOW_INBOUND;
	int		err;

	/*
	 * If the mac is a port of an aggregation, pass FLOW_IGNORE_VLAN
	 * to mac_flow_lookup() so that the VLAN packets can be successfully
	 * passed to the non-VLAN aggregation flows.
	 *
	 * Note that there is possibly a race between this and
	 * mac_unicast_remove/add() and VLAN packets could be incorrectly
	 * classified to non-VLAN flows of non-aggregation mac clients. These
	 * VLAN packets will be then filtered out by the mac module.
	 */
	if ((mip->mi_state_flags & MIS_EXCLUSIVE) != 0)
		flags |= FLOW_IGNORE_VLAN;

	/* On success, flent comes back held (released below). */
	err = mac_flow_lookup(mip->mi_flow_tab, mp, flags, &flent);
	if (err != 0) {
		/* no registered receive function */
		return (mp);
	} else {
		mac_client_impl_t	*mcip;

		/*
		 * This flent might just be an additional one on the MAC client,
		 * i.e. for classification purposes (different fdesc), however
		 * the resources, SRS et al., are in the mci_flent, so if
		 * this isn't the mci_flent, we need to get it.
		 */
		if ((mcip = flent->fe_mcip) != NULL &&
		    mcip->mci_flent != flent) {
			/* Swap the hold over to the client's primary flent. */
			FLOW_REFRELE(flent);
			flent = mcip->mci_flent;
			FLOW_TRY_REFHOLD(flent, err);
			if (err != 0)
				return (mp);
		}
		(flent->fe_cb_fn)(flent->fe_cb_arg1, flent->fe_cb_arg2, mp,
		    B_FALSE);
		FLOW_REFRELE(flent);
	}
	return (NULL);
}
2308 
2309 mblk_t *
2310 mac_rx_flow(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain)
2311 {
2312 	mac_impl_t	*mip = (mac_impl_t *)mh;
2313 	mblk_t		*bp, *bp1, **bpp, *list = NULL;
2314 
2315 	/*
2316 	 * We walk the chain and attempt to classify each packet.
2317 	 * The packets that couldn't be classified will be returned
2318 	 * back to the caller.
2319 	 */
2320 	bp = mp_chain;
2321 	bpp = &list;
2322 	while (bp != NULL) {
2323 		bp1 = bp;
2324 		bp = bp->b_next;
2325 		bp1->b_next = NULL;
2326 
2327 		if (mac_rx_classify(mip, mrh, bp1) != NULL) {
2328 			*bpp = bp1;
2329 			bpp = &bp1->b_next;
2330 		}
2331 	}
2332 	return (list);
2333 }
2334 
2335 static int
2336 mac_tx_flow_srs_wakeup(flow_entry_t *flent, void *arg)
2337 {
2338 	mac_ring_handle_t ring = arg;
2339 
2340 	if (flent->fe_tx_srs)
2341 		mac_tx_srs_wakeup(flent->fe_tx_srs, ring);
2342 	return (0);
2343 }
2344 
/*
 * Propagate a Tx ring wakeup to the Tx SRS of every client of this mac,
 * and to the Tx SRSes of each client's subflows.
 */
void
i_mac_tx_srs_notify(mac_impl_t *mip, mac_ring_handle_t ring)
{
	mac_client_impl_t	*cclient;
	mac_soft_ring_set_t	*mac_srs;

	/*
	 * After grabbing the mi_rw_lock, the list of clients can't change.
	 * If there are any clients mi_disabled must be B_FALSE and can't
	 * get set since there are clients. If there aren't any clients we
	 * don't do anything. In any case the mip has to be valid. The driver
	 * must make sure that it goes single threaded (with respect to mac
	 * calls) and wait for all pending mac calls to finish before calling
	 * mac_unregister.
	 */
	rw_enter(&i_mac_impl_lock, RW_READER);
	if (mip->mi_state_flags & MIS_DISABLED) {
		rw_exit(&i_mac_impl_lock);
		return;
	}

	/*
	 * Get MAC tx srs from walking mac_client_handle list.
	 */
	rw_enter(&mip->mi_rw_lock, RW_READER);
	for (cclient = mip->mi_clients_list; cclient != NULL;
	    cclient = cclient->mci_client_next) {
		if ((mac_srs = MCIP_TX_SRS(cclient)) != NULL)
			mac_tx_srs_wakeup(mac_srs, ring);
		(void) mac_flow_walk(cclient->mci_subflow_tab,
		    mac_tx_flow_srs_wakeup, ring);
	}
	rw_exit(&mip->mi_rw_lock);
	rw_exit(&i_mac_impl_lock);
}
2380 
2381 /* ARGSUSED */
2382 void
2383 mac_multicast_refresh(mac_handle_t mh, mac_multicst_t refresh, void *arg,
2384     boolean_t add)
2385 {
2386 	mac_impl_t *mip = (mac_impl_t *)mh;
2387 
2388 	i_mac_perim_enter((mac_impl_t *)mh);
2389 	/*
2390 	 * If no specific refresh function was given then default to the
2391 	 * driver's m_multicst entry point.
2392 	 */
2393 	if (refresh == NULL) {
2394 		refresh = mip->mi_multicst;
2395 		arg = mip->mi_driver;
2396 	}
2397 
2398 	mac_bcast_refresh(mip, refresh, arg, add);
2399 	i_mac_perim_exit((mac_impl_t *)mh);
2400 }
2401 
2402 void
2403 mac_promisc_refresh(mac_handle_t mh, mac_setpromisc_t refresh, void *arg)
2404 {
2405 	mac_impl_t	*mip = (mac_impl_t *)mh;
2406 
2407 	/*
2408 	 * If no specific refresh function was given then default to the
2409 	 * driver's m_promisc entry point.
2410 	 */
2411 	if (refresh == NULL) {
2412 		refresh = mip->mi_setpromisc;
2413 		arg = mip->mi_driver;
2414 	}
2415 	ASSERT(refresh != NULL);
2416 
2417 	/*
2418 	 * Call the refresh function with the current promiscuity.
2419 	 */
2420 	refresh(arg, (mip->mi_devpromisc != 0));
2421 }
2422 
2423 /*
2424  * The mac client requests that the mac not to change its margin size to
2425  * be less than the specified value.  If "current" is B_TRUE, then the client
2426  * requests the mac not to change its margin size to be smaller than the
2427  * current size. Further, return the current margin size value in this case.
2428  *
2429  * We keep every requested size in an ordered list from largest to smallest.
2430  */
int
mac_margin_add(mac_handle_t mh, uint32_t *marginp, boolean_t current)
{
	mac_impl_t		*mip = (mac_impl_t *)mh;
	mac_margin_req_t	**pp, *p;
	int			err = 0;

	rw_enter(&(mip->mi_rw_lock), RW_WRITER);
	if (current)
		*marginp = mip->mi_margin;

	/*
	 * If the current margin value cannot satisfy the margin requested,
	 * return ENOTSUP directly.
	 */
	if (*marginp > mip->mi_margin) {
		err = ENOTSUP;
		goto done;
	}

	/*
	 * Check whether the given margin is already in the list. If so,
	 * bump the reference count.
	 */
	for (pp = &mip->mi_mmrp; (p = *pp) != NULL; pp = &p->mmr_nextp) {
		if (p->mmr_margin == *marginp) {
			/*
			 * The margin requested is already in the list,
			 * so just bump the reference count.
			 */
			p->mmr_ref++;
			goto done;
		}
		/*
		 * The list is ordered largest to smallest; stop at the
		 * first entry smaller than the requested margin.
		 */
		if (p->mmr_margin < *marginp)
			break;
	}


	/* Insert a new request ahead of the first smaller entry. */
	p = kmem_zalloc(sizeof (mac_margin_req_t), KM_SLEEP);
	p->mmr_margin = *marginp;
	p->mmr_ref++;
	p->mmr_nextp = *pp;
	*pp = p;

done:
	rw_exit(&(mip->mi_rw_lock));
	return (err);
}
2479 
2480 /*
2481  * The mac client requests to cancel its previous mac_margin_add() request.
2482  * We remove the requested margin size from the list.
2483  */
int
mac_margin_remove(mac_handle_t mh, uint32_t margin)
{
	mac_impl_t		*mip = (mac_impl_t *)mh;
	mac_margin_req_t	**pp, *p;
	int			err = 0;

	rw_enter(&(mip->mi_rw_lock), RW_WRITER);
	/*
	 * Find the entry in the list for the given margin.
	 */
	for (pp = &(mip->mi_mmrp); (p = *pp) != NULL; pp = &(p->mmr_nextp)) {
		if (p->mmr_margin == margin) {
			if (--p->mmr_ref == 0)
				break;

			/*
			 * There is still a reference to this margin so
			 * there's nothing more to do.
			 */
			goto done;
		}
	}

	/*
	 * We did not find an entry for the given margin.
	 */
	if (p == NULL) {
		err = ENOENT;
		goto done;
	}

	ASSERT(p->mmr_ref == 0);

	/*
	 * Remove it from the list.
	 */
	*pp = p->mmr_nextp;
	kmem_free(p, sizeof (mac_margin_req_t));
done:
	rw_exit(&(mip->mi_rw_lock));
	return (err);
}
2527 
2528 boolean_t
2529 mac_margin_update(mac_handle_t mh, uint32_t margin)
2530 {
2531 	mac_impl_t	*mip = (mac_impl_t *)mh;
2532 	uint32_t	margin_needed = 0;
2533 
2534 	rw_enter(&(mip->mi_rw_lock), RW_WRITER);
2535 
2536 	if (mip->mi_mmrp != NULL)
2537 		margin_needed = mip->mi_mmrp->mmr_margin;
2538 
2539 	if (margin_needed <= margin)
2540 		mip->mi_margin = margin;
2541 
2542 	rw_exit(&(mip->mi_rw_lock));
2543 
2544 	if (margin_needed <= margin)
2545 		i_mac_notify(mip, MAC_NOTE_MARGIN);
2546 
2547 	return (margin_needed <= margin);
2548 }
2549 
2550 /*
2551  * MAC Type Plugin functions.
2552  */
2553 
/*
 * Look up the MAC type plugin registered under 'pname', attempting to
 * load its module on first use.  On success the plugin's reference
 * count is bumped; returns NULL if the plugin cannot be found or
 * loaded.
 */
mactype_t *
mactype_getplugin(const char *pname)
{
	mactype_t	*mtype = NULL;
	boolean_t	tried_modload = B_FALSE;

	mutex_enter(&i_mactype_lock);

find_registered_mactype:
	if (mod_hash_find(i_mactype_hash, (mod_hash_key_t)pname,
	    (mod_hash_val_t *)&mtype) != 0) {
		if (!tried_modload) {
			/*
			 * If the plugin has not yet been loaded, then
			 * attempt to load it now.  If modload() succeeds,
			 * the plugin should have registered using
			 * mactype_register(), in which case we can go back
			 * and attempt to find it again.
			 */
			if (modload(MACTYPE_KMODDIR, (char *)pname) != -1) {
				tried_modload = B_TRUE;
				goto find_registered_mactype;
			}
		}
	} else {
		/*
		 * Note that there's no danger that the plugin we've loaded
		 * could be unloaded between the modload() step and the
		 * reference count bump here, as we're holding
		 * i_mactype_lock, which mactype_unregister() also holds.
		 */
		atomic_inc_32(&mtype->mt_ref);
	}

	mutex_exit(&i_mactype_lock);
	return (mtype);
}
2591 
2592 mactype_register_t *
2593 mactype_alloc(uint_t mactype_version)
2594 {
2595 	mactype_register_t *mtrp;
2596 
2597 	/*
2598 	 * Make sure there isn't a version mismatch between the plugin and
2599 	 * the framework.  In the future, if multiple versions are
2600 	 * supported, this check could become more sophisticated.
2601 	 */
2602 	if (mactype_version != MACTYPE_VERSION)
2603 		return (NULL);
2604 
2605 	mtrp = kmem_zalloc(sizeof (mactype_register_t), KM_SLEEP);
2606 	mtrp->mtr_version = mactype_version;
2607 	return (mtrp);
2608 }
2609 
/* Free a registration structure obtained from mactype_alloc(). */
void
mactype_free(mactype_register_t *mtrp)
{
	kmem_free(mtrp, sizeof (mactype_register_t));
}
2615 
2616 int
2617 mactype_register(mactype_register_t *mtrp)
2618 {
2619 	mactype_t	*mtp;
2620 	mactype_ops_t	*ops = mtrp->mtr_ops;
2621 
2622 	/* Do some sanity checking before we register this MAC type. */
2623 	if (mtrp->mtr_ident == NULL || ops == NULL)
2624 		return (EINVAL);
2625 
2626 	/*
2627 	 * Verify that all mandatory callbacks are set in the ops
2628 	 * vector.
2629 	 */
2630 	if (ops->mtops_unicst_verify == NULL ||
2631 	    ops->mtops_multicst_verify == NULL ||
2632 	    ops->mtops_sap_verify == NULL ||
2633 	    ops->mtops_header == NULL ||
2634 	    ops->mtops_header_info == NULL) {
2635 		return (EINVAL);
2636 	}
2637 
2638 	mtp = kmem_zalloc(sizeof (*mtp), KM_SLEEP);
2639 	mtp->mt_ident = mtrp->mtr_ident;
2640 	mtp->mt_ops = *ops;
2641 	mtp->mt_type = mtrp->mtr_mactype;
2642 	mtp->mt_nativetype = mtrp->mtr_nativetype;
2643 	mtp->mt_addr_length = mtrp->mtr_addrlen;
2644 	if (mtrp->mtr_brdcst_addr != NULL) {
2645 		mtp->mt_brdcst_addr = kmem_alloc(mtrp->mtr_addrlen, KM_SLEEP);
2646 		bcopy(mtrp->mtr_brdcst_addr, mtp->mt_brdcst_addr,
2647 		    mtrp->mtr_addrlen);
2648 	}
2649 
2650 	mtp->mt_stats = mtrp->mtr_stats;
2651 	mtp->mt_statcount = mtrp->mtr_statcount;
2652 
2653 	mtp->mt_mapping = mtrp->mtr_mapping;
2654 	mtp->mt_mappingcount = mtrp->mtr_mappingcount;
2655 
2656 	if (mod_hash_insert(i_mactype_hash,
2657 	    (mod_hash_key_t)mtp->mt_ident, (mod_hash_val_t)mtp) != 0) {
2658 		kmem_free(mtp->mt_brdcst_addr, mtp->mt_addr_length);
2659 		kmem_free(mtp, sizeof (*mtp));
2660 		return (EEXIST);
2661 	}
2662 	return (0);
2663 }
2664 
/*
 * Unregister a MAC type plugin.  Returns ENXIO if no plugin is
 * registered under 'ident' and EBUSY if the plugin still has
 * references.
 */
int
mactype_unregister(const char *ident)
{
	mactype_t	*mtp;
	mod_hash_val_t	val;
	int 		err;

	/*
	 * Let's not allow MAC drivers to use this plugin while we're
	 * trying to unregister it.  Holding i_mactype_lock also prevents a
	 * plugin from unregistering while a MAC driver is attempting to
	 * hold a reference to it in i_mactype_getplugin().
	 */
	mutex_enter(&i_mactype_lock);

	if ((err = mod_hash_find(i_mactype_hash, (mod_hash_key_t)ident,
	    (mod_hash_val_t *)&mtp)) != 0) {
		/* A plugin is trying to unregister, but it never registered. */
		err = ENXIO;
		goto done;
	}

	if (mtp->mt_ref != 0) {
		err = EBUSY;
		goto done;
	}

	err = mod_hash_remove(i_mactype_hash, (mod_hash_key_t)ident, &val);
	ASSERT(err == 0);
	if (err != 0) {
		/* This should never happen, thus the ASSERT() above. */
		err = EINVAL;
		goto done;
	}
	ASSERT(mtp == (mactype_t *)val);

	/* mt_brdcst_addr is only allocated when the plugin supplied one. */
	if (mtp->mt_brdcst_addr != NULL)
		kmem_free(mtp->mt_brdcst_addr, mtp->mt_addr_length);
	kmem_free(mtp, sizeof (mactype_t));
done:
	mutex_exit(&i_mactype_lock);
	return (err);
}
2708 
2709 /*
2710  * mac_set_prop() sets mac or hardware driver properties:
2711  * 	MAC resource properties include maxbw, priority, and cpu binding list.
2712  *	Driver properties are private properties to the hardware, such as mtu
2713  *	and speed.  There's one other MAC property -- the PVID.
2714  * If the property is a driver property, mac_set_prop() calls driver's callback
2715  * function to set it.
2716  * If the property is a mac resource property, mac_set_prop() invokes
2717  * mac_set_resources() which will cache the property value in mac_impl_t and
2718  * may call mac_client_set_resource() to update property value of the primary
2719  * mac client, if it exists.
2720  */
int
mac_set_prop(mac_handle_t mh, mac_prop_t *macprop, void *val, uint_t valsize)
{
	int err = ENOTSUP;
	mac_impl_t *mip = (mac_impl_t *)mh;

	ASSERT(MAC_PERIM_HELD(mh));

	switch (macprop->mp_id) {
	case MAC_PROP_MAXBW:
	case MAC_PROP_PRIO:
	case MAC_PROP_PROTECT:
	case MAC_PROP_BIND_CPU: {
		mac_resource_props_t mrp;

		/* If it is mac property, call mac_set_resources() */
		if (valsize < sizeof (mac_resource_props_t))
			return (EINVAL);
		bcopy(val, &mrp, sizeof (mrp));
		err = mac_set_resources(mh, &mrp);
		break;
	}

	case MAC_PROP_PVID:
		/* Setting a PVID on a VNIC is rejected. */
		if (valsize < sizeof (uint16_t) ||
		    (mip->mi_state_flags & MIS_IS_VNIC))
			return (EINVAL);
		err = mac_set_pvid(mh, *(uint16_t *)val);
		break;

	case MAC_PROP_MTU: {
		uint32_t mtu;

		if (valsize < sizeof (mtu))
			return (EINVAL);
		bcopy(val, &mtu, sizeof (mtu));
		err = mac_set_mtu(mh, mtu, NULL);
		break;
	}

	case MAC_PROP_LLIMIT:
	case MAC_PROP_LDECAY: {
		uint32_t learnval;

		/* Learning limit/decay are rejected on a VNIC. */
		if (valsize < sizeof (learnval) ||
		    (mip->mi_state_flags & MIS_IS_VNIC))
			return (EINVAL);
		bcopy(val, &learnval, sizeof (learnval));
		/* LDECAY must be non-zero. */
		if (learnval == 0 && macprop->mp_id == MAC_PROP_LDECAY)
			return (EINVAL);
		if (macprop->mp_id == MAC_PROP_LLIMIT)
			mip->mi_llimit = learnval;
		else
			mip->mi_ldecay = learnval;
		err = 0;
		break;
	}

	default:
		/* For other driver properties, call driver's callback */
		if (mip->mi_callbacks->mc_callbacks & MC_SETPROP) {
			err = mip->mi_callbacks->mc_setprop(mip->mi_driver,
			    macprop->mp_name, macprop->mp_id, valsize, val);
		}
	}
	return (err);
}
2788 
2789 /*
2790  * mac_get_prop() gets mac or hardware driver properties.
2791  *
2792  * If the property is a driver property, mac_get_prop() calls driver's callback
2793  * function to get it.
2794  * If the property is a mac property, mac_get_prop() invokes mac_get_resources()
2795  * which returns the cached value in mac_impl_t.
2796  */
int
mac_get_prop(mac_handle_t mh, mac_prop_t *macprop, void *val, uint_t valsize,
    uint_t *perm)
{
	int err = ENOTSUP;
	mac_impl_t *mip = (mac_impl_t *)mh;
	link_state_t link_state;
	boolean_t is_getprop, is_setprop;

	/* Does the driver provide its own get/set property callbacks? */
	is_getprop = (mip->mi_callbacks->mc_callbacks & MC_GETPROP);
	is_setprop = (mip->mi_callbacks->mc_callbacks & MC_SETPROP);

	switch (macprop->mp_id) {
	case MAC_PROP_MAXBW:
	case MAC_PROP_PRIO:
	case MAC_PROP_PROTECT:
	case MAC_PROP_BIND_CPU: {
		mac_resource_props_t mrp;

		/* If mac property, read from cache */
		if (valsize < sizeof (mac_resource_props_t))
			return (EINVAL);
		mac_get_resources(mh, &mrp);
		bcopy(&mrp, val, sizeof (mac_resource_props_t));
		return (0);
	}

	case MAC_PROP_PVID:
		/* PVID queries are rejected on a VNIC. */
		if (valsize < sizeof (uint16_t) ||
		    (mip->mi_state_flags & MIS_IS_VNIC))
			return (EINVAL);
		*(uint16_t *)val = mac_get_pvid(mh);
		return (0);

	case MAC_PROP_LLIMIT:
	case MAC_PROP_LDECAY:
		/* Learning limit/decay queries are rejected on a VNIC. */
		if (valsize < sizeof (uint32_t) ||
		    (mip->mi_state_flags & MIS_IS_VNIC))
			return (EINVAL);
		if (macprop->mp_id == MAC_PROP_LLIMIT)
			bcopy(&mip->mi_llimit, val, sizeof (mip->mi_llimit));
		else
			bcopy(&mip->mi_ldecay, val, sizeof (mip->mi_ldecay));
		return (0);

	case MAC_PROP_MTU: {
		uint32_t sdu;
		mac_propval_range_t range;

		if ((macprop->mp_flags & MAC_PROP_POSSIBLE) != 0) {
			/* Caller wants the range of possible MTU values. */
			if (valsize < sizeof (mac_propval_range_t))
				return (EINVAL);
			if (is_getprop) {
				err = mip->mi_callbacks->mc_getprop(mip->
				    mi_driver, macprop->mp_name, macprop->mp_id,
				    macprop->mp_flags, valsize, val, perm);
			}
			/*
			 * If the driver doesn't have *_m_getprop defined or
			 * if the driver doesn't support setting MTU then
			 * return the CURRENT value as POSSIBLE value.
			 */
			if (!is_getprop || err == ENOTSUP) {
				mac_sdu_get(mh, NULL, &sdu);
				range.mpr_count = 1;
				range.mpr_type = MAC_PROPVAL_UINT32;
				range.range_uint32[0].mpur_min =
				    range.range_uint32[0].mpur_max = sdu;
				bcopy(&range, val, sizeof (range));
				err = 0;
			}
			return (err);
		}
		if (valsize < sizeof (sdu))
			return (EINVAL);
		if ((macprop->mp_flags & MAC_PROP_DEFAULT) == 0) {
			mac_sdu_get(mh, NULL, &sdu);
			bcopy(&sdu, val, sizeof (sdu));
			/*
			 * Probe writability: if the driver accepts having
			 * the current MTU set back, report the property
			 * as read-write; otherwise read-only.
			 */
			if (is_setprop && (mip->mi_callbacks->mc_setprop(mip->
			    mi_driver, macprop->mp_name, macprop->mp_id,
			    valsize, val) == 0)) {
				*perm = MAC_PROP_PERM_RW;
			} else {
				*perm = MAC_PROP_PERM_READ;
			}
			return (0);
		} else {
			if (mip->mi_info.mi_media == DL_ETHER) {
				/* Default MTU for Ethernet is ETHERMTU. */
				sdu = ETHERMTU;
				bcopy(&sdu, val, sizeof (sdu));

				return (0);
			}
			/*
			 * ask driver for its default.
			 */
			break;
		}
	}
	case MAC_PROP_STATUS:
		/* Link state is always read-only. */
		if (valsize < sizeof (link_state))
			return (EINVAL);
		*perm = MAC_PROP_PERM_READ;
		link_state = mac_link_get(mh);
		bcopy(&link_state, val, sizeof (link_state));
		return (0);
	default:
		break;

	}
	/* If driver property, request from driver */
	if (is_getprop) {
		err = mip->mi_callbacks->mc_getprop(mip->mi_driver,
		    macprop->mp_name, macprop->mp_id, macprop->mp_flags,
		    valsize, val, perm);
	}
	return (err);
}
2915 
2916 int
2917 mac_fastpath_disable(mac_handle_t mh)
2918 {
2919 	mac_impl_t	*mip = (mac_impl_t *)mh;
2920 
2921 	if ((mip->mi_state_flags & MIS_LEGACY) == 0)
2922 		return (0);
2923 
2924 	return (mip->mi_capab_legacy.ml_fastpath_disable(mip->mi_driver));
2925 }
2926 
2927 void
2928 mac_fastpath_enable(mac_handle_t mh)
2929 {
2930 	mac_impl_t	*mip = (mac_impl_t *)mh;
2931 
2932 	if ((mip->mi_state_flags & MIS_LEGACY) == 0)
2933 		return;
2934 
2935 	mip->mi_capab_legacy.ml_fastpath_enable(mip->mi_driver);
2936 }
2937 
2938 void
2939 mac_register_priv_prop(mac_impl_t *mip, mac_priv_prop_t *mpp, uint_t nprop)
2940 {
2941 	mac_priv_prop_t *mpriv;
2942 
2943 	if (mpp == NULL)
2944 		return;
2945 
2946 	mpriv = kmem_zalloc(nprop * sizeof (*mpriv), KM_SLEEP);
2947 	(void) memcpy(mpriv, mpp, nprop * sizeof (*mpriv));
2948 	mip->mi_priv_prop = mpriv;
2949 	mip->mi_priv_prop_count = nprop;
2950 }
2951 
2952 void
2953 mac_unregister_priv_prop(mac_impl_t *mip)
2954 {
2955 	mac_priv_prop_t	*mpriv;
2956 
2957 	mpriv = mip->mi_priv_prop;
2958 	if (mpriv != NULL) {
2959 		kmem_free(mpriv, mip->mi_priv_prop_count * sizeof (*mpriv));
2960 		mip->mi_priv_prop = NULL;
2961 	}
2962 	mip->mi_priv_prop_count = 0;
2963 }
2964 
2965 /*
2966  * mac_ring_t 'mr' macros. Some rogue drivers may access ring structure
2967  * (by invoking mac_rx()) even after processing mac_stop_ring(). In such
2968  * cases if MAC free's the ring structure after mac_stop_ring(), any
2969  * illegal access to the ring structure coming from the driver will panic
2970  * the system. In order to protect the system from such inadverent access,
2971  * we maintain a cache of rings in the mac_impl_t after they get free'd up.
2972  * When packets are received on free'd up rings, MAC (through the generation
2973  * count mechanism) will drop such packets.
2974  */
/*
 * Allocate a mac_ring_t.  Rx rings are drawn from the per-mac freelist
 * when possible (see the block comment above); other ring types are
 * allocated directly.
 */
static mac_ring_t *
mac_ring_alloc(mac_impl_t *mip, mac_capab_rings_t *cap_rings)
{
	mac_ring_t *ring;

	if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
		mutex_enter(&mip->mi_ring_lock);
		if (mip->mi_ring_freelist != NULL) {
			/* Reuse a cached ring, zeroed for a fresh start. */
			ring = mip->mi_ring_freelist;
			mip->mi_ring_freelist = ring->mr_next;
			bzero(ring, sizeof (mac_ring_t));
		} else {
			ring = kmem_cache_alloc(mac_ring_cache, KM_SLEEP);
		}
		mutex_exit(&mip->mi_ring_lock);
	} else {
		ring = kmem_zalloc(sizeof (mac_ring_t), KM_SLEEP);
	}
	ASSERT((ring != NULL) && (ring->mr_state == MR_FREE));
	return (ring);
}
2996 
2997 static void
2998 mac_ring_free(mac_impl_t *mip, mac_ring_t *ring)
2999 {
3000 	if (ring->mr_type == MAC_RING_TYPE_RX) {
3001 		mutex_enter(&mip->mi_ring_lock);
3002 		ring->mr_state = MR_FREE;
3003 		ring->mr_flag = 0;
3004 		ring->mr_next = mip->mi_ring_freelist;
3005 		mip->mi_ring_freelist = ring;
3006 		mutex_exit(&mip->mi_ring_lock);
3007 	} else {
3008 		kmem_free(ring, sizeof (mac_ring_t));
3009 	}
3010 }
3011 
3012 static void
3013 mac_ring_freeall(mac_impl_t *mip)
3014 {
3015 	mac_ring_t *ring_next;
3016 	mutex_enter(&mip->mi_ring_lock);
3017 	mac_ring_t *ring = mip->mi_ring_freelist;
3018 	while (ring != NULL) {
3019 		ring_next = ring->mr_next;
3020 		kmem_cache_free(mac_ring_cache, ring);
3021 		ring = ring_next;
3022 	}
3023 	mip->mi_ring_freelist = NULL;
3024 	mutex_exit(&mip->mi_ring_lock);
3025 }
3026 
3027 int
3028 mac_start_ring(mac_ring_t *ring)
3029 {
3030 	int rv = 0;
3031 
3032 	if (ring->mr_start != NULL)
3033 		rv = ring->mr_start(ring->mr_driver, ring->mr_gen_num);
3034 
3035 	return (rv);
3036 }
3037 
/* Stop a ring via the driver's entry point and retire its generation. */
void
mac_stop_ring(mac_ring_t *ring)
{
	if (ring->mr_stop != NULL)
		ring->mr_stop(ring->mr_driver);

	/*
	 * Increment the ring generation number for this ring.  Packets
	 * delivered by a driver on the old generation after the stop
	 * can then be detected and dropped (see the block comment above
	 * mac_ring_alloc()).
	 */
	ring->mr_gen_num++;
}
3049 
3050 int
3051 mac_start_group(mac_group_t *group)
3052 {
3053 	int rv = 0;
3054 
3055 	if (group->mrg_start != NULL)
3056 		rv = group->mrg_start(group->mrg_driver);
3057 
3058 	return (rv);
3059 }
3060 
3061 void
3062 mac_stop_group(mac_group_t *group)
3063 {
3064 	if (group->mrg_stop != NULL)
3065 		group->mrg_stop(group->mrg_driver);
3066 }
3067 
3068 /*
3069  * Called from mac_start() on the default Rx group. Broadcast and multicast
3070  * packets are received only on the default group. Hence the default group
3071  * needs to be up even if the primary client is not up, for the other groups
3072  * to be functional. We do this by calling this function at mac_start time
3073  * itself. However the broadcast packets that are received can't make their
3074  * way beyond mac_rx until a mac client creates a broadcast flow.
3075  */
3076 static int
3077 mac_start_group_and_rings(mac_group_t *group)
3078 {
3079 	mac_ring_t	*ring;
3080 	int		rv = 0;
3081 
3082 	ASSERT(group->mrg_state == MAC_GROUP_STATE_REGISTERED);
3083 	if ((rv = mac_start_group(group)) != 0)
3084 		return (rv);
3085 
3086 	for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
3087 		ASSERT(ring->mr_state == MR_FREE);
3088 		if ((rv = mac_start_ring(ring)) != 0)
3089 			goto error;
3090 		ring->mr_state = MR_INUSE;
3091 		ring->mr_classify_type = MAC_SW_CLASSIFIER;
3092 	}
3093 	return (0);
3094 
3095 error:
3096 	mac_stop_group_and_rings(group);
3097 	return (rv);
3098 }
3099 
3100 /* Called from mac_stop on the default Rx group */
3101 static void
3102 mac_stop_group_and_rings(mac_group_t *group)
3103 {
3104 	mac_ring_t	*ring;
3105 
3106 	for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
3107 		if (ring->mr_state != MR_FREE) {
3108 			mac_stop_ring(ring);
3109 			ring->mr_state = MR_FREE;
3110 			ring->mr_flag = 0;
3111 			ring->mr_classify_type = MAC_NO_CLASSIFIER;
3112 		}
3113 	}
3114 	mac_stop_group(group);
3115 }
3116 
3117 
/*
 * Allocate and initialize the mac_ring_t for ring 'index' of 'group':
 * link it at the head of the group's ring list, query the driver (via
 * the rings capability's mr_rget entry point) for the ring's info and
 * entry points, and return it in MR_FREE state.  Note the ring must be
 * linked into the group before mr_rget is called, since the driver is
 * handed the ring handle.
 */
static mac_ring_t *
mac_init_ring(mac_impl_t *mip, mac_group_t *group, int index,
    mac_capab_rings_t *cap_rings)
{
	mac_ring_t *ring;
	mac_ring_info_t ring_info;

	ring = mac_ring_alloc(mip, cap_rings);

	/* Prepare basic information of ring */
	ring->mr_index = index;
	ring->mr_type = group->mrg_type;
	ring->mr_gh = (mac_group_handle_t)group;

	/* Insert the new ring to the list. */
	ring->mr_next = group->mrg_rings;
	group->mrg_rings = ring;

	/* Zero to reuse the info data structure */
	bzero(&ring_info, sizeof (ring_info));

	/* Query ring information from driver */
	cap_rings->mr_rget(mip->mi_driver, group->mrg_type, group->mrg_index,
	    index, &ring_info, (mac_ring_handle_t)ring);

	ring->mr_info = ring_info;

	/* Update ring's status */
	ring->mr_state = MR_FREE;
	ring->mr_flag = 0;

	/* Update the ring count of the group */
	group->mrg_cur_count++;
	return (ring);
}
3153 
3154 /*
3155  * Rings are chained together for easy regrouping.
3156  */
3157 static void
3158 mac_init_group(mac_impl_t *mip, mac_group_t *group, int size,
3159     mac_capab_rings_t *cap_rings)
3160 {
3161 	int index;
3162 
3163 	/*
3164 	 * Initialize all ring members of this group. Size of zero will not
3165 	 * enter the loop, so it's safe for initializing an empty group.
3166 	 */
3167 	for (index = size - 1; index >= 0; index--)
3168 		(void) mac_init_ring(mip, group, index, cap_rings);
3169 }
3170 
3171 int
3172 mac_init_rings(mac_impl_t *mip, mac_ring_type_t rtype)
3173 {
3174 	mac_capab_rings_t *cap_rings;
3175 	mac_group_t *group, *groups;
3176 	mac_group_info_t group_info;
3177 	uint_t group_free = 0;
3178 	uint_t ring_left;
3179 	mac_ring_t *ring;
3180 	int g, err = 0;
3181 
3182 	switch (rtype) {
3183 	case MAC_RING_TYPE_RX:
3184 		ASSERT(mip->mi_rx_groups == NULL);
3185 
3186 		cap_rings = &mip->mi_rx_rings_cap;
3187 		cap_rings->mr_type = MAC_RING_TYPE_RX;
3188 		break;
3189 	case MAC_RING_TYPE_TX:
3190 		ASSERT(mip->mi_tx_groups == NULL);
3191 
3192 		cap_rings = &mip->mi_tx_rings_cap;
3193 		cap_rings->mr_type = MAC_RING_TYPE_TX;
3194 		break;
3195 	default:
3196 		ASSERT(B_FALSE);
3197 	}
3198 
3199 	if (!i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_RINGS,
3200 	    cap_rings))
3201 		return (0);
3202 
3203 	/*
3204 	 * Allocate a contiguous buffer for all groups.
3205 	 */
3206 	groups = kmem_zalloc(sizeof (mac_group_t) * (cap_rings->mr_gnum + 1),
3207 	    KM_SLEEP);
3208 
3209 	ring_left = cap_rings->mr_rnum;
3210 
3211 	/*
3212 	 * Get all ring groups if any, and get their ring members
3213 	 * if any.
3214 	 */
3215 	for (g = 0; g < cap_rings->mr_gnum; g++) {
3216 		group = groups + g;
3217 
3218 		/* Prepare basic information of the group */
3219 		group->mrg_index = g;
3220 		group->mrg_type = rtype;
3221 		group->mrg_state = MAC_GROUP_STATE_UNINIT;
3222 		group->mrg_mh = (mac_handle_t)mip;
3223 		group->mrg_next = group + 1;
3224 
3225 		/* Zero to reuse the info data structure */
3226 		bzero(&group_info, sizeof (group_info));
3227 
3228 		/* Query group information from driver */
3229 		cap_rings->mr_gget(mip->mi_driver, rtype, g, &group_info,
3230 		    (mac_group_handle_t)group);
3231 
3232 		switch (cap_rings->mr_group_type) {
3233 		case MAC_GROUP_TYPE_DYNAMIC:
3234 			if (cap_rings->mr_gaddring == NULL ||
3235 			    cap_rings->mr_gremring == NULL) {
3236 				DTRACE_PROBE3(
3237 				    mac__init__rings_no_addremring,
3238 				    char *, mip->mi_name,
3239 				    mac_group_add_ring_t,
3240 				    cap_rings->mr_gaddring,
3241 				    mac_group_add_ring_t,
3242 				    cap_rings->mr_gremring);
3243 				err = EINVAL;
3244 				goto bail;
3245 			}
3246 
3247 			switch (rtype) {
3248 			case MAC_RING_TYPE_RX:
3249 				/*
3250 				 * The first RX group must have non-zero
3251 				 * rings, and the following groups must
3252 				 * have zero rings.
3253 				 */
3254 				if (g == 0 && group_info.mgi_count == 0) {
3255 					DTRACE_PROBE1(
3256 					    mac__init__rings__rx__def__zero,
3257 					    char *, mip->mi_name);
3258 					err = EINVAL;
3259 					goto bail;
3260 				}
3261 				if (g > 0 && group_info.mgi_count != 0) {
3262 					DTRACE_PROBE3(
3263 					    mac__init__rings__rx__nonzero,
3264 					    char *, mip->mi_name,
3265 					    int, g, int, group_info.mgi_count);
3266 					err = EINVAL;
3267 					goto bail;
3268 				}
3269 				break;
3270 			case MAC_RING_TYPE_TX:
3271 				/*
3272 				 * All TX ring groups must have zero rings.
3273 				 */
3274 				if (group_info.mgi_count != 0) {
3275 					DTRACE_PROBE3(
3276 					    mac__init__rings__tx__nonzero,
3277 					    char *, mip->mi_name,
3278 					    int, g, int, group_info.mgi_count);
3279 					err = EINVAL;
3280 					goto bail;
3281 				}
3282 				break;
3283 			}
3284 			break;
3285 		case MAC_GROUP_TYPE_STATIC:
3286 			/*
3287 			 * Note that an empty group is allowed, e.g., an aggr
3288 			 * would start with an empty group.
3289 			 */
3290 			break;
3291 		default:
3292 			/* unknown group type */
3293 			DTRACE_PROBE2(mac__init__rings__unknown__type,
3294 			    char *, mip->mi_name,
3295 			    int, cap_rings->mr_group_type);
3296 			err = EINVAL;
3297 			goto bail;
3298 		}
3299 
3300 
3301 		/*
3302 		 * Driver must register group->mgi_addmac/remmac() for rx groups
3303 		 * to support multiple MAC addresses.
3304 		 */
3305 		if (rtype == MAC_RING_TYPE_RX) {
3306 			if ((group_info.mgi_addmac == NULL) ||
3307 			    (group_info.mgi_addmac == NULL))
3308 				goto bail;
3309 		}
3310 
3311 		/* Cache driver-supplied information */
3312 		group->mrg_info = group_info;
3313 
3314 		/* Update the group's status and group count. */
3315 		mac_set_rx_group_state(group, MAC_GROUP_STATE_REGISTERED);
3316 		group_free++;
3317 
3318 		group->mrg_rings = NULL;
3319 		group->mrg_cur_count = 0;
3320 		mac_init_group(mip, group, group_info.mgi_count, cap_rings);
3321 		ring_left -= group_info.mgi_count;
3322 
3323 		/* The current group size should be equal to default value */
3324 		ASSERT(group->mrg_cur_count == group_info.mgi_count);
3325 	}
3326 
3327 	/* Build up a dummy group for free resources as a pool */
3328 	group = groups + cap_rings->mr_gnum;
3329 
3330 	/* Prepare basic information of the group */
3331 	group->mrg_index = -1;
3332 	group->mrg_type = rtype;
3333 	group->mrg_state = MAC_GROUP_STATE_UNINIT;
3334 	group->mrg_mh = (mac_handle_t)mip;
3335 	group->mrg_next = NULL;
3336 
3337 	/*
3338 	 * If there are ungrouped rings, allocate a continuous buffer for
3339 	 * remaining resources.
3340 	 */
3341 	if (ring_left != 0) {
3342 		group->mrg_rings = NULL;
3343 		group->mrg_cur_count = 0;
3344 		mac_init_group(mip, group, ring_left, cap_rings);
3345 
3346 		/* The current group size should be equal to ring_left */
3347 		ASSERT(group->mrg_cur_count == ring_left);
3348 
3349 		ring_left = 0;
3350 
3351 		/* Update this group's status */
3352 		mac_set_rx_group_state(group, MAC_GROUP_STATE_REGISTERED);
3353 	} else
3354 		group->mrg_rings = NULL;
3355 
3356 	ASSERT(ring_left == 0);
3357 
3358 bail:
3359 	/* Cache other important information to finalize the initialization */
3360 	switch (rtype) {
3361 	case MAC_RING_TYPE_RX:
3362 		mip->mi_rx_group_type = cap_rings->mr_group_type;
3363 		mip->mi_rx_group_count = cap_rings->mr_gnum;
3364 		mip->mi_rx_groups = groups;
3365 		break;
3366 	case MAC_RING_TYPE_TX:
3367 		mip->mi_tx_group_type = cap_rings->mr_group_type;
3368 		mip->mi_tx_group_count = cap_rings->mr_gnum;
3369 		mip->mi_tx_group_free = group_free;
3370 		mip->mi_tx_groups = groups;
3371 
3372 		/*
3373 		 * Ring 0 is used as the default one and it could be assigned
3374 		 * to a client as well.
3375 		 */
3376 		group = groups + cap_rings->mr_gnum;
3377 		ring = group->mrg_rings;
3378 		while ((ring->mr_index != 0) && (ring->mr_next != NULL))
3379 			ring = ring->mr_next;
3380 		ASSERT(ring->mr_index == 0);
3381 		mip->mi_default_tx_ring = (mac_ring_handle_t)ring;
3382 		break;
3383 	default:
3384 		ASSERT(B_FALSE);
3385 	}
3386 
3387 	if (err != 0)
3388 		mac_free_rings(mip, rtype);
3389 
3390 	return (err);
3391 }
3392 
3393 /*
3394  * Called to free all ring groups with particular type. It's supposed all groups
3395  * have been released by clinet.
3396  */
3397 void
3398 mac_free_rings(mac_impl_t *mip, mac_ring_type_t rtype)
3399 {
3400 	mac_group_t *group, *groups;
3401 	uint_t group_count;
3402 
3403 	switch (rtype) {
3404 	case MAC_RING_TYPE_RX:
3405 		if (mip->mi_rx_groups == NULL)
3406 			return;
3407 
3408 		groups = mip->mi_rx_groups;
3409 		group_count = mip->mi_rx_group_count;
3410 
3411 		mip->mi_rx_groups = NULL;
3412 		mip->mi_rx_group_count = 0;
3413 		break;
3414 	case MAC_RING_TYPE_TX:
3415 		ASSERT(mip->mi_tx_group_count == mip->mi_tx_group_free);
3416 
3417 		if (mip->mi_tx_groups == NULL)
3418 			return;
3419 
3420 		groups = mip->mi_tx_groups;
3421 		group_count = mip->mi_tx_group_count;
3422 
3423 		mip->mi_tx_groups = NULL;
3424 		mip->mi_tx_group_count = 0;
3425 		mip->mi_tx_group_free = 0;
3426 		mip->mi_default_tx_ring = NULL;
3427 		break;
3428 	default:
3429 		ASSERT(B_FALSE);
3430 	}
3431 
3432 	for (group = groups; group != NULL; group = group->mrg_next) {
3433 		mac_ring_t *ring;
3434 
3435 		if (group->mrg_cur_count == 0)
3436 			continue;
3437 
3438 		ASSERT(group->mrg_rings != NULL);
3439 
3440 		while ((ring = group->mrg_rings) != NULL) {
3441 			group->mrg_rings = ring->mr_next;
3442 			mac_ring_free(mip, ring);
3443 		}
3444 	}
3445 
3446 	/* Free all the cached rings */
3447 	mac_ring_freeall(mip);
3448 	/* Free the block of group data strutures */
3449 	kmem_free(groups, sizeof (mac_group_t) * (group_count + 1));
3450 }
3451 
3452 /*
3453  * Associate a MAC address with a receive group.
3454  *
3455  * The return value of this function should always be checked properly, because
3456  * any type of failure could cause unexpected results. A group can be added
3457  * or removed with a MAC address only after it has been reserved. Ideally,
3458  * a successful reservation always leads to calling mac_group_addmac() to
3459  * steer desired traffic. Failure of adding an unicast MAC address doesn't
3460  * always imply that the group is functioning abnormally.
3461  *
3462  * Currently this function is called everywhere, and it reflects assumptions
3463  * about MAC addresses in the implementation. CR 6735196.
3464  */
3465 int
3466 mac_group_addmac(mac_group_t *group, const uint8_t *addr)
3467 {
3468 	ASSERT(group->mrg_type == MAC_RING_TYPE_RX);
3469 	ASSERT(group->mrg_info.mgi_addmac != NULL);
3470 
3471 	return (group->mrg_info.mgi_addmac(group->mrg_info.mgi_driver, addr));
3472 }
3473 
3474 /*
3475  * Remove the association between MAC address and receive group.
3476  */
3477 int
3478 mac_group_remmac(mac_group_t *group, const uint8_t *addr)
3479 {
3480 	ASSERT(group->mrg_type == MAC_RING_TYPE_RX);
3481 	ASSERT(group->mrg_info.mgi_remmac != NULL);
3482 
3483 	return (group->mrg_info.mgi_remmac(group->mrg_info.mgi_driver, addr));
3484 }
3485 
3486 /*
3487  * Release a ring in use by marking it MR_FREE.
3488  * Any other client may reserve it for its use.
3489  */
3490 void
3491 mac_release_tx_ring(mac_ring_handle_t rh)
3492 {
3493 	mac_ring_t *ring = (mac_ring_t *)rh;
3494 	mac_group_t *group = (mac_group_t *)ring->mr_gh;
3495 	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
3496 
3497 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
3498 	ASSERT(ring->mr_state != MR_FREE);
3499 
3500 	/*
3501 	 * Default tx ring will be released by mac_stop().
3502 	 */
3503 	if (rh == mip->mi_default_tx_ring)
3504 		return;
3505 
3506 	mac_stop_ring(ring);
3507 
3508 	ring->mr_state = MR_FREE;
3509 	ring->mr_flag = 0;
3510 }
3511 
3512 /*
3513  * This is the entry point for packets transmitted through the bridging code.
3514  * If no bridge is in place, MAC_RING_TX transmits using tx ring. The 'rh'
3515  * pointer may be NULL to select the default ring.
3516  */
mblk_t *
mac_bridge_tx(mac_impl_t *mip, mac_ring_handle_t rh, mblk_t *mp)
{
	mac_handle_t mh;

	/*
	 * Once we take a reference on the bridge link, the bridge
	 * module itself can't unload, so the callback pointers are
	 * stable.
	 */
	mutex_enter(&mip->mi_bridge_lock);
	if ((mh = mip->mi_bridge_link) != NULL)
		mac_bridge_ref_cb(mh, B_TRUE);
	mutex_exit(&mip->mi_bridge_lock);
	if (mh == NULL) {
		/*
		 * No bridge: transmit directly.  The last MAC_RING_TX
		 * argument (here 'mp' again) receives the resulting chain.
		 */
		MAC_RING_TX(mip, rh, mp, mp);
	} else {
		/* Hand the chain to the bridge, then drop our reference. */
		mp = mac_bridge_tx_cb(mh, rh, mp);
		mac_bridge_ref_cb(mh, B_FALSE);
	}

	return (mp);
}
3540 
3541 /*
3542  * Find a ring from its index.
3543  */
3544 mac_ring_t *
3545 mac_find_ring(mac_group_t *group, int index)
3546 {
3547 	mac_ring_t *ring = group->mrg_rings;
3548 
3549 	for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next)
3550 		if (ring->mr_index == index)
3551 			break;
3552 
3553 	return (ring);
3554 }
3555 /*
3556  * Add a ring to an existing group.
3557  *
3558  * The ring must be either passed directly (for example if the ring
3559  * movement is initiated by the framework), or specified through a driver
3560  * index (for example when the ring is added by the driver.
3561  *
3562  * The caller needs to call mac_perim_enter() before calling this function.
3563  */
3564 int
3565 i_mac_group_add_ring(mac_group_t *group, mac_ring_t *ring, int index)
3566 {
3567 	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
3568 	mac_capab_rings_t *cap_rings;
3569 	boolean_t driver_call = (ring == NULL);
3570 	mac_group_type_t group_type;
3571 	int ret = 0;
3572 
3573 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
3574 
3575 	switch (group->mrg_type) {
3576 	case MAC_RING_TYPE_RX:
3577 		cap_rings = &mip->mi_rx_rings_cap;
3578 		group_type = mip->mi_rx_group_type;
3579 		break;
3580 	case MAC_RING_TYPE_TX:
3581 		cap_rings = &mip->mi_tx_rings_cap;
3582 		group_type = mip->mi_tx_group_type;
3583 		break;
3584 	default:
3585 		ASSERT(B_FALSE);
3586 	}
3587 
3588 	/*
3589 	 * There should be no ring with the same ring index in the target
3590 	 * group.
3591 	 */
3592 	ASSERT(mac_find_ring(group, driver_call ? index : ring->mr_index) ==
3593 	    NULL);
3594 
3595 	if (driver_call) {
3596 		/*
3597 		 * The function is called as a result of a request from
3598 		 * a driver to add a ring to an existing group, for example
3599 		 * from the aggregation driver. Allocate a new mac_ring_t
3600 		 * for that ring.
3601 		 */
3602 		ring = mac_init_ring(mip, group, index, cap_rings);
3603 		ASSERT(group->mrg_state > MAC_GROUP_STATE_UNINIT);
3604 	} else {
3605 		/*
3606 		 * The function is called as a result of a MAC layer request
3607 		 * to add a ring to an existing group. In this case the
3608 		 * ring is being moved between groups, which requires
3609 		 * the underlying driver to support dynamic grouping,
3610 		 * and the mac_ring_t already exists.
3611 		 */
3612 		ASSERT(group_type == MAC_GROUP_TYPE_DYNAMIC);
3613 		ASSERT(cap_rings->mr_gaddring != NULL);
3614 		ASSERT(ring->mr_gh == NULL);
3615 	}
3616 
3617 	/*
3618 	 * At this point the ring should not be in use, and it should be
3619 	 * of the right for the target group.
3620 	 */
3621 	ASSERT(ring->mr_state < MR_INUSE);
3622 	ASSERT(ring->mr_srs == NULL);
3623 	ASSERT(ring->mr_type == group->mrg_type);
3624 
3625 	if (!driver_call) {
3626 		/*
3627 		 * Add the driver level hardware ring if the process was not
3628 		 * initiated by the driver, and the target group is not the
3629 		 * group.
3630 		 */
3631 		if (group->mrg_driver != NULL) {
3632 			cap_rings->mr_gaddring(group->mrg_driver,
3633 			    ring->mr_driver, ring->mr_type);
3634 		}
3635 
3636 		/*
3637 		 * Insert the ring ahead existing rings.
3638 		 */
3639 		ring->mr_next = group->mrg_rings;
3640 		group->mrg_rings = ring;
3641 		ring->mr_gh = (mac_group_handle_t)group;
3642 		group->mrg_cur_count++;
3643 	}
3644 
3645 	/*
3646 	 * If the group has not been actively used, we're done.
3647 	 */
3648 	if (group->mrg_index != -1 &&
3649 	    group->mrg_state < MAC_GROUP_STATE_RESERVED)
3650 		return (0);
3651 
3652 	/*
3653 	 * Set up SRS/SR according to the ring type.
3654 	 */
3655 	switch (ring->mr_type) {
3656 	case MAC_RING_TYPE_RX:
3657 		/*
3658 		 * Setup SRS on top of the new ring if the group is
3659 		 * reserved for someones exclusive use.
3660 		 */
3661 		if (group->mrg_state == MAC_GROUP_STATE_RESERVED) {
3662 			flow_entry_t *flent;
3663 			mac_client_impl_t *mcip;
3664 
3665 			mcip = MAC_RX_GROUP_ONLY_CLIENT(group);
3666 			ASSERT(mcip != NULL);
3667 			flent = mcip->mci_flent;
3668 			ASSERT(flent->fe_rx_srs_cnt > 0);
3669 			mac_srs_group_setup(mcip, flent, group, SRST_LINK);
3670 		}
3671 		break;
3672 	case MAC_RING_TYPE_TX:
3673 		/*
3674 		 * For TX this function is only invoked during the
3675 		 * initial creation of a group when a share is
3676 		 * associated with a MAC client. So the datapath is not
3677 		 * yet setup, and will be setup later after the
3678 		 * group has been reserved and populated.
3679 		 */
3680 		break;
3681 	default:
3682 		ASSERT(B_FALSE);
3683 	}
3684 
3685 	/*
3686 	 * Start the ring if needed. Failure causes to undo the grouping action.
3687 	 */
3688 	if ((ret = mac_start_ring(ring)) != 0) {
3689 		if (ring->mr_type == MAC_RING_TYPE_RX) {
3690 			if (ring->mr_srs != NULL) {
3691 				mac_rx_srs_remove(ring->mr_srs);
3692 				ring->mr_srs = NULL;
3693 			}
3694 		}
3695 		if (!driver_call) {
3696 			cap_rings->mr_gremring(group->mrg_driver,
3697 			    ring->mr_driver, ring->mr_type);
3698 		}
3699 		group->mrg_cur_count--;
3700 		group->mrg_rings = ring->mr_next;
3701 
3702 		ring->mr_gh = NULL;
3703 
3704 		if (driver_call)
3705 			mac_ring_free(mip, ring);
3706 
3707 		return (ret);
3708 	}
3709 
3710 	/*
3711 	 * Update the ring's state.
3712 	 */
3713 	ring->mr_state = MR_INUSE;
3714 	MAC_RING_UNMARK(ring, MR_INCIPIENT);
3715 	return (0);
3716 }
3717 
3718 /*
3719  * Remove a ring from it's current group. MAC internal function for dynamic
3720  * grouping.
3721  *
3722  * The caller needs to call mac_perim_enter() before calling this function.
3723  */
3724 void
3725 i_mac_group_rem_ring(mac_group_t *group, mac_ring_t *ring,
3726     boolean_t driver_call)
3727 {
3728 	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
3729 	mac_capab_rings_t *cap_rings = NULL;
3730 	mac_group_type_t group_type;
3731 
3732 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
3733 
3734 	ASSERT(mac_find_ring(group, ring->mr_index) == ring);
3735 	ASSERT((mac_group_t *)ring->mr_gh == group);
3736 	ASSERT(ring->mr_type == group->mrg_type);
3737 
3738 	switch (ring->mr_type) {
3739 	case MAC_RING_TYPE_RX:
3740 		group_type = mip->mi_rx_group_type;
3741 		cap_rings = &mip->mi_rx_rings_cap;
3742 
3743 		if (group->mrg_state >= MAC_GROUP_STATE_RESERVED)
3744 			mac_stop_ring(ring);
3745 
3746 		/*
3747 		 * Only hardware classified packets hold a reference to the
3748 		 * ring all the way up the Rx path. mac_rx_srs_remove()
3749 		 * will take care of quiescing the Rx path and removing the
3750 		 * SRS. The software classified path neither holds a reference
3751 		 * nor any association with the ring in mac_rx.
3752 		 */
3753 		if (ring->mr_srs != NULL) {
3754 			mac_rx_srs_remove(ring->mr_srs);
3755 			ring->mr_srs = NULL;
3756 		}
3757 		ring->mr_state = MR_FREE;
3758 		ring->mr_flag = 0;
3759 
3760 		break;
3761 	case MAC_RING_TYPE_TX:
3762 		/*
3763 		 * For TX this function is only invoked in two
3764 		 * cases:
3765 		 *
3766 		 * 1) In the case of a failure during the
3767 		 * initial creation of a group when a share is
3768 		 * associated with a MAC client. So the SRS is not
3769 		 * yet setup, and will be setup later after the
3770 		 * group has been reserved and populated.
3771 		 *
3772 		 * 2) From mac_release_tx_group() when freeing
3773 		 * a TX SRS.
3774 		 *
3775 		 * In both cases the SRS and its soft rings are
3776 		 * already quiesced.
3777 		 */
3778 		ASSERT(!driver_call);
3779 		group_type = mip->mi_tx_group_type;
3780 		cap_rings = &mip->mi_tx_rings_cap;
3781 		break;
3782 	default:
3783 		ASSERT(B_FALSE);
3784 	}
3785 
3786 	/*
3787 	 * Remove the ring from the group.
3788 	 */
3789 	if (ring == group->mrg_rings)
3790 		group->mrg_rings = ring->mr_next;
3791 	else {
3792 		mac_ring_t *pre;
3793 
3794 		pre = group->mrg_rings;
3795 		while (pre->mr_next != ring)
3796 			pre = pre->mr_next;
3797 		pre->mr_next = ring->mr_next;
3798 	}
3799 	group->mrg_cur_count--;
3800 
3801 	if (!driver_call) {
3802 		ASSERT(group_type == MAC_GROUP_TYPE_DYNAMIC);
3803 		ASSERT(cap_rings->mr_gremring != NULL);
3804 
3805 		/*
3806 		 * Remove the driver level hardware ring.
3807 		 */
3808 		if (group->mrg_driver != NULL) {
3809 			cap_rings->mr_gremring(group->mrg_driver,
3810 			    ring->mr_driver, ring->mr_type);
3811 		}
3812 	}
3813 
3814 	ring->mr_gh = NULL;
3815 	if (driver_call) {
3816 		mac_ring_free(mip, ring);
3817 	} else {
3818 		ring->mr_state = MR_FREE;
3819 		ring->mr_flag = 0;
3820 	}
3821 }
3822 
3823 /*
3824  * Move a ring to the target group. If needed, remove the ring from the group
3825  * that it currently belongs to.
3826  *
3827  * The caller need to enter MAC's perimeter by calling mac_perim_enter().
3828  */
3829 static int
3830 mac_group_mov_ring(mac_impl_t *mip, mac_group_t *d_group, mac_ring_t *ring)
3831 {
3832 	mac_group_t *s_group = (mac_group_t *)ring->mr_gh;
3833 	int rv;
3834 
3835 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
3836 	ASSERT(d_group != NULL);
3837 	ASSERT(s_group->mrg_mh == d_group->mrg_mh);
3838 
3839 	if (s_group == d_group)
3840 		return (0);
3841 
3842 	/*
3843 	 * Remove it from current group first.
3844 	 */
3845 	if (s_group != NULL)
3846 		i_mac_group_rem_ring(s_group, ring, B_FALSE);
3847 
3848 	/*
3849 	 * Add it to the new group.
3850 	 */
3851 	rv = i_mac_group_add_ring(d_group, ring, 0);
3852 	if (rv != 0) {
3853 		/*
3854 		 * Failed to add ring back to source group. If
3855 		 * that fails, the ring is stuck in limbo, log message.
3856 		 */
3857 		if (i_mac_group_add_ring(s_group, ring, 0)) {
3858 			cmn_err(CE_WARN, "%s: failed to move ring %p\n",
3859 			    mip->mi_name, (void *)ring);
3860 		}
3861 	}
3862 
3863 	return (rv);
3864 }
3865 
3866 /*
3867  * Find a MAC address according to its value.
3868  */
3869 mac_address_t *
3870 mac_find_macaddr(mac_impl_t *mip, uint8_t *mac_addr)
3871 {
3872 	mac_address_t *map;
3873 
3874 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
3875 
3876 	for (map = mip->mi_addresses; map != NULL; map = map->ma_next) {
3877 		if (bcmp(mac_addr, map->ma_addr, map->ma_len) == 0)
3878 			break;
3879 	}
3880 
3881 	return (map);
3882 }
3883 
3884 /*
3885  * Check whether the MAC address is shared by multiple clients.
3886  */
3887 boolean_t
3888 mac_check_macaddr_shared(mac_address_t *map)
3889 {
3890 	ASSERT(MAC_PERIM_HELD((mac_handle_t)map->ma_mip));
3891 
3892 	return (map->ma_nusers > 1);
3893 }
3894 
3895 /*
3896  * Remove the specified MAC address from the MAC address list and free it.
3897  */
3898 static void
3899 mac_free_macaddr(mac_address_t *map)
3900 {
3901 	mac_impl_t *mip = map->ma_mip;
3902 
3903 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
3904 	ASSERT(mip->mi_addresses != NULL);
3905 
3906 	map = mac_find_macaddr(mip, map->ma_addr);
3907 
3908 	ASSERT(map != NULL);
3909 	ASSERT(map->ma_nusers == 0);
3910 
3911 	if (map == mip->mi_addresses) {
3912 		mip->mi_addresses = map->ma_next;
3913 	} else {
3914 		mac_address_t *pre;
3915 
3916 		pre = mip->mi_addresses;
3917 		while (pre->ma_next != map)
3918 			pre = pre->ma_next;
3919 		pre->ma_next = map->ma_next;
3920 	}
3921 
3922 	kmem_free(map, sizeof (mac_address_t));
3923 }
3924 
3925 /*
3926  * Add a MAC address reference for a client. If the desired MAC address
3927  * exists, add a reference to it. Otherwise, add the new address by adding
3928  * it to a reserved group or setting promiscuous mode. Won't try different
3929  * group is the group is non-NULL, so the caller must explictly share
3930  * default group when needed.
3931  *
3932  * Note, the primary MAC address is initialized at registration time, so
3933  * to add it to default group only need to activate it if its reference
3934  * count is still zero. Also, some drivers may not have advertised RINGS
3935  * capability.
3936  */
3937 int
3938 mac_add_macaddr(mac_impl_t *mip, mac_group_t *group, uint8_t *mac_addr,
3939     boolean_t use_hw)
3940 {
3941 	mac_address_t *map;
3942 	int err = 0;
3943 	boolean_t allocated_map = B_FALSE;
3944 
3945 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
3946 
3947 	map = mac_find_macaddr(mip, mac_addr);
3948 
3949 	/*
3950 	 * If the new MAC address has not been added. Allocate a new one
3951 	 * and set it up.
3952 	 */
3953 	if (map == NULL) {
3954 		map = kmem_zalloc(sizeof (mac_address_t), KM_SLEEP);
3955 		map->ma_len = mip->mi_type->mt_addr_length;
3956 		bcopy(mac_addr, map->ma_addr, map->ma_len);
3957 		map->ma_nusers = 0;
3958 		map->ma_group = group;
3959 		map->ma_mip = mip;
3960 
3961 		/* add the new MAC address to the head of the address list */
3962 		map->ma_next = mip->mi_addresses;
3963 		mip->mi_addresses = map;
3964 
3965 		allocated_map = B_TRUE;
3966 	}
3967 
3968 	ASSERT(map->ma_group == group);
3969 
3970 	/*
3971 	 * If the MAC address is already in use, simply account for the
3972 	 * new client.
3973 	 */
3974 	if (map->ma_nusers++ > 0)
3975 		return (0);
3976 
3977 	/*
3978 	 * Activate this MAC address by adding it to the reserved group.
3979 	 */
3980 	if (group != NULL) {
3981 		err = mac_group_addmac(group, (const uint8_t *)mac_addr);
3982 		if (err == 0) {
3983 			map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
3984 			return (0);
3985 		}
3986 	}
3987 
3988 	/*
3989 	 * The MAC address addition failed. If the client requires a
3990 	 * hardware classified MAC address, fail the operation.
3991 	 */
3992 	if (use_hw) {
3993 		err = ENOSPC;
3994 		goto bail;
3995 	}
3996 
3997 	/*
3998 	 * Try promiscuous mode.
3999 	 *
4000 	 * For drivers that don't advertise RINGS capability, do
4001 	 * nothing for the primary address.
4002 	 */
4003 	if ((group == NULL) &&
4004 	    (bcmp(map->ma_addr, mip->mi_addr, map->ma_len) == 0)) {
4005 		map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
4006 		return (0);
4007 	}
4008 
4009 	/*
4010 	 * Enable promiscuous mode in order to receive traffic
4011 	 * to the new MAC address.
4012 	 */
4013 	if ((err = i_mac_promisc_set(mip, B_TRUE)) == 0) {
4014 		map->ma_type = MAC_ADDRESS_TYPE_UNICAST_PROMISC;
4015 		return (0);
4016 	}
4017 
4018 	/*
4019 	 * Free the MAC address that could not be added. Don't free
4020 	 * a pre-existing address, it could have been the entry
4021 	 * for the primary MAC address which was pre-allocated by
4022 	 * mac_init_macaddr(), and which must remain on the list.
4023 	 */
4024 bail:
4025 	map->ma_nusers--;
4026 	if (allocated_map)
4027 		mac_free_macaddr(map);
4028 	return (err);
4029 }
4030 
4031 /*
4032  * Remove a reference to a MAC address. This may cause to remove the MAC
4033  * address from an associated group or to turn off promiscuous mode.
4034  * The caller needs to handle the failure properly.
4035  */
4036 int
4037 mac_remove_macaddr(mac_address_t *map)
4038 {
4039 	mac_impl_t *mip = map->ma_mip;
4040 	int err = 0;
4041 
4042 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4043 
4044 	ASSERT(map == mac_find_macaddr(mip, map->ma_addr));
4045 
4046 	/*
4047 	 * If it's not the last client using this MAC address, only update
4048 	 * the MAC clients count.
4049 	 */
4050 	if (--map->ma_nusers > 0)
4051 		return (0);
4052 
4053 	/*
4054 	 * The MAC address is no longer used by any MAC client, so remove
4055 	 * it from its associated group, or turn off promiscuous mode
4056 	 * if it was enabled for the MAC address.
4057 	 */
4058 	switch (map->ma_type) {
4059 	case MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED:
4060 		/*
4061 		 * Don't free the preset primary address for drivers that
4062 		 * don't advertise RINGS capability.
4063 		 */
4064 		if (map->ma_group == NULL)
4065 			return (0);
4066 
4067 		err = mac_group_remmac(map->ma_group, map->ma_addr);
4068 		break;
4069 	case MAC_ADDRESS_TYPE_UNICAST_PROMISC:
4070 		err = i_mac_promisc_set(mip, B_FALSE);
4071 		break;
4072 	default:
4073 		ASSERT(B_FALSE);
4074 	}
4075 
4076 	if (err != 0)
4077 		return (err);
4078 
4079 	/*
4080 	 * We created MAC address for the primary one at registration, so we
4081 	 * won't free it here. mac_fini_macaddr() will take care of it.
4082 	 */
4083 	if (bcmp(map->ma_addr, mip->mi_addr, map->ma_len) != 0)
4084 		mac_free_macaddr(map);
4085 
4086 	return (0);
4087 }
4088 
4089 /*
4090  * Update an existing MAC address. The caller need to make sure that the new
4091  * value has not been used.
4092  */
int
mac_update_macaddr(mac_address_t *map, uint8_t *mac_addr)
{
	mac_impl_t *mip = map->ma_mip;
	int err = 0;

	/* Serialized under the perimeter; the new value must be unused. */
	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
	ASSERT(mac_find_macaddr(mip, mac_addr) == NULL);

	switch (map->ma_type) {
	case MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED:
		/*
		 * Update the primary address for drivers that are not
		 * RINGS capable.
		 */
		if (map->ma_group == NULL) {
			err = mip->mi_unicst(mip->mi_driver, (const uint8_t *)
			    mac_addr);
			if (err != 0)
				return (err);
			break;
		}

		/*
		 * If this MAC address is not currently in use,
		 * simply break out and update the value.
		 */
		if (map->ma_nusers == 0)
			break;

		/*
		 * Need to replace the MAC address associated with a group:
		 * remove the old value first, then program the new one.
		 */
		err = mac_group_remmac(map->ma_group, map->ma_addr);
		if (err != 0)
			return (err);

		err = mac_group_addmac(map->ma_group, mac_addr);

		/*
		 * Failure hints hardware error. The MAC layer needs to
		 * have error notification facility to handle this.
		 * Now, simply try to restore the value.
		 */
		if (err != 0)
			(void) mac_group_addmac(map->ma_group, map->ma_addr);

		break;
	case MAC_ADDRESS_TYPE_UNICAST_PROMISC:
		/*
		 * Need to do nothing more if in promiscuous mode.
		 */
		break;
	default:
		ASSERT(B_FALSE);
	}

	/*
	 * Successfully replaced the MAC address: record the new value.
	 * On the restore path above err is non-zero, so the stored
	 * value is intentionally left unchanged.
	 */
	if (err == 0)
		bcopy(mac_addr, map->ma_addr, map->ma_len);

	return (err);
}
4158 
4159 /*
 * Freshen the MAC address with a new value. The caller must have updated the
 * hardware MAC address before calling this function.
 * This function is intended to handle the MAC address change notification
 * from underlying drivers.
4164  */
4165 void
4166 mac_freshen_macaddr(mac_address_t *map, uint8_t *mac_addr)
4167 {
4168 	mac_impl_t *mip = map->ma_mip;
4169 
4170 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4171 	ASSERT(mac_find_macaddr(mip, mac_addr) == NULL);
4172 
4173 	/*
4174 	 * Freshen the MAC address with new value.
4175 	 */
4176 	bcopy(mac_addr, map->ma_addr, map->ma_len);
4177 	bcopy(mac_addr, mip->mi_addr, map->ma_len);
4178 
4179 	/*
4180 	 * Update all MAC clients that share this MAC address.
4181 	 */
4182 	mac_unicast_update_clients(mip, map);
4183 }
4184 
4185 /*
4186  * Set up the primary MAC address.
4187  */
4188 void
4189 mac_init_macaddr(mac_impl_t *mip)
4190 {
4191 	mac_address_t *map;
4192 
4193 	/*
4194 	 * The reference count is initialized to zero, until it's really
4195 	 * activated.
4196 	 */
4197 	map = kmem_zalloc(sizeof (mac_address_t), KM_SLEEP);
4198 	map->ma_len = mip->mi_type->mt_addr_length;
4199 	bcopy(mip->mi_addr, map->ma_addr, map->ma_len);
4200 
4201 	/*
4202 	 * If driver advertises RINGS capability, it shouldn't have initialized
4203 	 * its primary MAC address. For other drivers, including VNIC, the
4204 	 * primary address must work after registration.
4205 	 */
4206 	if (mip->mi_rx_groups == NULL)
4207 		map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
4208 
4209 	/*
4210 	 * The primary MAC address is reserved for default group according
4211 	 * to current design.
4212 	 */
4213 	map->ma_group = mip->mi_rx_groups;
4214 	map->ma_mip = mip;
4215 
4216 	mip->mi_addresses = map;
4217 }
4218 
4219 /*
4220  * Clean up the primary MAC address. Note, only one primary MAC address
4221  * is allowed. All other MAC addresses must have been freed appropriately.
4222  */
4223 void
4224 mac_fini_macaddr(mac_impl_t *mip)
4225 {
4226 	mac_address_t *map = mip->mi_addresses;
4227 
4228 	if (map == NULL)
4229 		return;
4230 
4231 	/*
4232 	 * If mi_addresses is initialized, there should be exactly one
4233 	 * entry left on the list with no users.
4234 	 */
4235 	ASSERT(map->ma_nusers == 0);
4236 	ASSERT(map->ma_next == NULL);
4237 
4238 	kmem_free(map, sizeof (mac_address_t));
4239 	mip->mi_addresses = NULL;
4240 }
4241 
4242 /*
4243  * Logging related functions.
4244  */
4245 
4246 /* Write the Flow description to the log file */
int
mac_write_flow_desc(flow_entry_t *flent, mac_client_impl_t *mcip)
{
	flow_desc_t		*fdesc;
	mac_resource_props_t	*mrp;
	net_desc_t		ndesc;

	bzero(&ndesc, sizeof (net_desc_t));

	/*
	 * Grab the fe_lock to see a self-consistent fe_flow_desc.
	 * Updates to the fe_flow_desc are done under the fe_lock
	 */
	mutex_enter(&flent->fe_lock);
	fdesc = &flent->fe_flow_desc;
	mrp = &flent->fe_resource_props;

	ndesc.nd_name = flent->fe_flow_name;
	ndesc.nd_devname = mcip->mci_name;
	bcopy(fdesc->fd_src_mac, ndesc.nd_ehost, ETHERADDRL);
	bcopy(fdesc->fd_dst_mac, ndesc.nd_edest, ETHERADDRL);
	ndesc.nd_sap = htonl(fdesc->fd_sap);
	/* the cast binds to fd_ipversion, which is then compared */
	ndesc.nd_isv4 = (uint8_t)fdesc->fd_ipversion == IPV4_VERSION;
	ndesc.nd_bw_limit = mrp->mrp_maxbw;
	if (ndesc.nd_isv4) {
		/* v4 addresses occupy the last word of the v6 storage */
		ndesc.nd_saddr[3] = htonl(fdesc->fd_local_addr.s6_addr32[3]);
		ndesc.nd_daddr[3] = htonl(fdesc->fd_remote_addr.s6_addr32[3]);
	} else {
		bcopy(&fdesc->fd_local_addr, ndesc.nd_saddr, IPV6_ADDR_LEN);
		bcopy(&fdesc->fd_remote_addr, ndesc.nd_daddr, IPV6_ADDR_LEN);
	}
	ndesc.nd_sport = htons(fdesc->fd_local_port);
	ndesc.nd_dport = htons(fdesc->fd_remote_port);
	ndesc.nd_protocol = (uint8_t)fdesc->fd_protocol;
	mutex_exit(&flent->fe_lock);

	return (exacct_commit_netinfo((void *)&ndesc, EX_NET_FLDESC_REC));
}
4285 
4286 /* Write the Flow statistics to the log file */
4287 int
4288 mac_write_flow_stats(flow_entry_t *flent)
4289 {
4290 	flow_stats_t	*fl_stats;
4291 	net_stat_t	nstat;
4292 
4293 	fl_stats = &flent->fe_flowstats;
4294 	nstat.ns_name = flent->fe_flow_name;
4295 	nstat.ns_ibytes = fl_stats->fs_rbytes;
4296 	nstat.ns_obytes = fl_stats->fs_obytes;
4297 	nstat.ns_ipackets = fl_stats->fs_ipackets;
4298 	nstat.ns_opackets = fl_stats->fs_opackets;
4299 	nstat.ns_ierrors = fl_stats->fs_ierrors;
4300 	nstat.ns_oerrors = fl_stats->fs_oerrors;
4301 
4302 	return (exacct_commit_netinfo((void *)&nstat, EX_NET_FLSTAT_REC));
4303 }
4304 
4305 /* Write the Link Description to the log file */
4306 int
4307 mac_write_link_desc(mac_client_impl_t *mcip)
4308 {
4309 	net_desc_t		ndesc;
4310 	flow_entry_t		*flent = mcip->mci_flent;
4311 
4312 	bzero(&ndesc, sizeof (net_desc_t));
4313 
4314 	ndesc.nd_name = mcip->mci_name;
4315 	ndesc.nd_devname = mcip->mci_name;
4316 	ndesc.nd_isv4 = B_TRUE;
4317 	/*
4318 	 * Grab the fe_lock to see a self-consistent fe_flow_desc.
4319 	 * Updates to the fe_flow_desc are done under the fe_lock
4320 	 * after removing the flent from the flow table.
4321 	 */
4322 	mutex_enter(&flent->fe_lock);
4323 	bcopy(flent->fe_flow_desc.fd_src_mac, ndesc.nd_ehost, ETHERADDRL);
4324 	mutex_exit(&flent->fe_lock);
4325 
4326 	return (exacct_commit_netinfo((void *)&ndesc, EX_NET_LNDESC_REC));
4327 }
4328 
4329 /* Write the Link statistics to the log file */
int
mac_write_link_stats(mac_client_impl_t *mcip)
{
	net_stat_t	nstat;

	/*
	 * Snapshot the client's cumulative counters into an exacct
	 * net_stat record keyed by the client's name.
	 */
	nstat.ns_name = mcip->mci_name;
	nstat.ns_ibytes = mcip->mci_stat_ibytes;
	nstat.ns_obytes = mcip->mci_stat_obytes;
	nstat.ns_ipackets = mcip->mci_stat_ipackets;
	nstat.ns_opackets = mcip->mci_stat_opackets;
	nstat.ns_ierrors = mcip->mci_stat_ierrors;
	nstat.ns_oerrors = mcip->mci_stat_oerrors;

	return (exacct_commit_netinfo((void *)&nstat, EX_NET_LNSTAT_REC));
}
4345 
4346 /*
 * For a given flow, if the description has not been logged before, do it now.
4348  * If it is a VNIC, then we have collected information about it from the MAC
4349  * table, so skip it.
4350  */
4351 /*ARGSUSED*/
4352 static int
4353 mac_log_flowinfo(flow_entry_t *flent, void *args)
4354 {
4355 	mac_client_impl_t	*mcip = flent->fe_mcip;
4356 
4357 	if (mcip == NULL)
4358 		return (0);
4359 
4360 	/*
4361 	 * If the name starts with "vnic", and fe_user_generated is true (to
4362 	 * exclude the mcast and active flow entries created implicitly for
4363 	 * a vnic, it is a VNIC flow.  i.e. vnic1 is a vnic flow,
4364 	 * vnic/bge1/mcast1 is not and neither is vnic/bge1/active.
4365 	 */
4366 	if (strncasecmp(flent->fe_flow_name, "vnic", 4) == 0 &&
4367 	    (flent->fe_type & FLOW_USER) != 0) {
4368 		return (0);
4369 	}
4370 
4371 	if (!flent->fe_desc_logged) {
4372 		/*
4373 		 * We don't return error because we want to continu the
4374 		 * walk in case this is the last walk which means we
4375 		 * need to reset fe_desc_logged in all the flows.
4376 		 */
4377 		if (mac_write_flow_desc(flent, mcip) != 0)
4378 			return (0);
4379 		flent->fe_desc_logged = B_TRUE;
4380 	}
4381 
4382 	/*
4383 	 * Regardless of the error, we want to proceed in case we have to
4384 	 * reset fe_desc_logged.
4385 	 */
4386 	(void) mac_write_flow_stats(flent);
4387 
4388 	if (mcip != NULL && !(mcip->mci_state_flags & MCIS_DESC_LOGGED))
4389 		flent->fe_desc_logged = B_FALSE;
4390 
4391 	return (0);
4392 }
4393 
/* Per-walk state passed to i_mac_log_walker() via mod_hash_walk(). */
typedef struct i_mac_log_state_s {
	boolean_t	mi_last;	/* last walk: clear "logged" flags */
	int		mi_fenable;	/* flow logging enabled */
	int		mi_lenable;	/* link logging enabled */
} i_mac_log_state_t;
4399 
4400 /*
4401  * Walk the mac_impl_ts and log the description for each mac client of this mac,
4402  * if it hasn't already been done. Additionally, log statistics for the link as
4403  * well. Walk the flow table and log information for each flow as well.
4404  * If it is the last walk (mci_last), then we turn off mci_desc_logged (and
4405  * also fe_desc_logged, if flow logging is on) since we want to log the
4406  * description if and when logging is restarted.
4407  */
/*ARGSUSED*/
static uint_t
i_mac_log_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
{
	mac_impl_t		*mip = (mac_impl_t *)val;
	i_mac_log_state_t	*lstate = (i_mac_log_state_t *)arg;
	int			ret;
	mac_client_impl_t	*mcip;

	/*
	 * Only walk the client list for NIC and etherstub: skip disabled
	 * macs and VNICs that sit on top of a lower mac.
	 */
	if ((mip->mi_state_flags & MIS_DISABLED) ||
	    ((mip->mi_state_flags & MIS_IS_VNIC) &&
	    (mac_get_lower_mac_handle((mac_handle_t)mip) != NULL)))
		return (MH_WALK_CONTINUE);

	for (mcip = mip->mi_clients_list; mcip != NULL;
	    mcip = mcip->mci_client_next) {
		/* only clients with a fully set-up datapath are logged */
		if (!MCIP_DATAPATH_SETUP(mcip))
			continue;
		if (lstate->mi_lenable) {
			/* log the link description at most once per client */
			if (!(mcip->mci_state_flags & MCIS_DESC_LOGGED)) {
				ret = mac_write_link_desc(mcip);
				if (ret != 0) {
				/*
				 * We can't terminate it if this is the last
				 * walk, else there might be some links with
				 * mi_desc_logged set to true, which means
				 * their description won't be logged the next
				 * time logging is started (similarly for the
				 * flows within such links). We can continue
				 * without walking the flow table (i.e. to
				 * set fe_desc_logged to false) because we
				 * won't have written any flow stuff for this
				 * link as we haven't logged the link itself.
				 */
					if (lstate->mi_last)
						return (MH_WALK_CONTINUE);
					else
						return (MH_WALK_TERMINATE);
				}
				mcip->mci_state_flags |= MCIS_DESC_LOGGED;
			}
		}

		/* a stats failure aborts the walk, except on the last pass */
		if (mac_write_link_stats(mcip) != 0 && !lstate->mi_last)
			return (MH_WALK_TERMINATE);

		/* last pass: re-arm description logging for the next start */
		if (lstate->mi_last)
			mcip->mci_state_flags &= ~MCIS_DESC_LOGGED;

		if (lstate->mi_fenable) {
			if (mcip->mci_subflow_tab != NULL) {
				(void) mac_flow_walk(mcip->mci_subflow_tab,
				    mac_log_flowinfo, mip);
			}
		}
	}
	return (MH_WALK_CONTINUE);
}
4469 
4470 /*
4471  * The timer thread that runs every mac_logging_interval seconds and logs
4472  * link and/or flow information.
4473  */
/* ARGSUSED */
void
mac_log_linkinfo(void *arg)
{
	i_mac_log_state_t	lstate;

	/* snapshot the logging flags under the lock */
	rw_enter(&i_mac_impl_lock, RW_READER);
	if (!mac_flow_log_enable && !mac_link_log_enable) {
		rw_exit(&i_mac_impl_lock);
		return;
	}
	lstate.mi_fenable = mac_flow_log_enable;
	lstate.mi_lenable = mac_link_log_enable;
	lstate.mi_last = B_FALSE;
	rw_exit(&i_mac_impl_lock);

	/* walk all macs without holding i_mac_impl_lock */
	mod_hash_walk(i_mac_impl_hash, i_mac_log_walker, &lstate);

	/* re-arm the timer only if logging is still enabled */
	rw_enter(&i_mac_impl_lock, RW_WRITER);
	if (mac_flow_log_enable || mac_link_log_enable) {
		mac_logging_timer = timeout(mac_log_linkinfo, NULL,
		    SEC_TO_TICK(mac_logging_interval));
	}
	rw_exit(&i_mac_impl_lock);
}
4499 
/* Walker state for i_mac_fastpath_disable_walker(). */
typedef struct i_mac_fastpath_state_s {
	boolean_t	mf_disable;	/* B_TRUE: disable, B_FALSE: enable */
	int		mf_err;		/* error from mac_fastpath_disable() */
} i_mac_fastpath_state_t;
4504 
4505 /*ARGSUSED*/
4506 static uint_t
4507 i_mac_fastpath_disable_walker(mod_hash_key_t key, mod_hash_val_t *val,
4508     void *arg)
4509 {
4510 	i_mac_fastpath_state_t	*state = arg;
4511 	mac_handle_t		mh = (mac_handle_t)val;
4512 
4513 	if (state->mf_disable)
4514 		state->mf_err = mac_fastpath_disable(mh);
4515 	else
4516 		mac_fastpath_enable(mh);
4517 
4518 	return (state->mf_err == 0 ? MH_WALK_CONTINUE : MH_WALK_TERMINATE);
4519 }
4520 
4521 /*
4522  * Start the logging timer.
4523  */
int
mac_start_logusage(mac_logtype_t type, uint_t interval)
{
	i_mac_fastpath_state_t state = {B_TRUE, 0};
	int err;

	rw_enter(&i_mac_impl_lock, RW_WRITER);
	switch (type) {
	case MAC_LOGTYPE_FLOW:
		if (mac_flow_log_enable) {
			/* flow logging already on; nothing to do */
			rw_exit(&i_mac_impl_lock);
			return (0);
		}
		/* FALLTHRU */
	case MAC_LOGTYPE_LINK:
		if (mac_link_log_enable) {
			/* link logging already on; nothing to do */
			rw_exit(&i_mac_impl_lock);
			return (0);
		}
		break;
	default:
		ASSERT(0);
	}

	/* Disable fastpath */
	mod_hash_walk(i_mac_impl_hash, i_mac_fastpath_disable_walker, &state);
	if ((err = state.mf_err) != 0) {
		/* Reenable fastpath  */
		state.mf_disable = B_FALSE;
		state.mf_err = 0;
		mod_hash_walk(i_mac_impl_hash,
		    i_mac_fastpath_disable_walker, &state);
		rw_exit(&i_mac_impl_lock);
		return (err);
	}

	/* flow logging implies link logging (note the FALLTHRU) */
	switch (type) {
	case MAC_LOGTYPE_FLOW:
		mac_flow_log_enable = B_TRUE;
		/* FALLTHRU */
	case MAC_LOGTYPE_LINK:
		mac_link_log_enable = B_TRUE;
		break;
	}

	mac_logging_interval = interval;
	rw_exit(&i_mac_impl_lock);
	/* run the first log pass now; it re-arms the timer itself */
	mac_log_linkinfo(NULL);
	return (0);
}
4574 
4575 /*
4576  * Stop the logging timer if both Link and Flow logging are turned off.
4577  */
void
mac_stop_logusage(mac_logtype_t type)
{
	i_mac_log_state_t	lstate;
	i_mac_fastpath_state_t	state = {B_FALSE, 0};

	rw_enter(&i_mac_impl_lock, RW_WRITER);
	/* capture the pre-stop flags for the final logging walk below */
	lstate.mi_fenable = mac_flow_log_enable;
	lstate.mi_lenable = mac_link_log_enable;

	/* Last walk */
	lstate.mi_last = B_TRUE;

	switch (type) {
	case MAC_LOGTYPE_FLOW:
		if (lstate.mi_fenable) {
			/* flow logging on implies link logging on */
			ASSERT(mac_link_log_enable);
			mac_flow_log_enable = B_FALSE;
			mac_link_log_enable = B_FALSE;
			break;
		}
		/* FALLTHRU */
	case MAC_LOGTYPE_LINK:
		/* nothing to stop if link logging is off or flows still log */
		if (!lstate.mi_lenable || mac_flow_log_enable) {
			rw_exit(&i_mac_impl_lock);
			return;
		}
		mac_link_log_enable = B_FALSE;
		break;
	default:
		ASSERT(0);
	}

	/* Reenable fastpath */
	mod_hash_walk(i_mac_impl_hash, i_mac_fastpath_disable_walker, &state);

	rw_exit(&i_mac_impl_lock);
	(void) untimeout(mac_logging_timer);
	mac_logging_timer = 0;

	/* Last walk */
	mod_hash_walk(i_mac_impl_hash, i_mac_log_walker, &lstate);
}
4621 
4622 /*
4623  * Walk the rx and tx SRS/SRs for a flow and update the priority value.
4624  */
4625 void
4626 mac_flow_update_priority(mac_client_impl_t *mcip, flow_entry_t *flent)
4627 {
4628 	pri_t			pri;
4629 	int			count;
4630 	mac_soft_ring_set_t	*mac_srs;
4631 
4632 	if (flent->fe_rx_srs_cnt <= 0)
4633 		return;
4634 
4635 	if (((mac_soft_ring_set_t *)flent->fe_rx_srs[0])->srs_type ==
4636 	    SRST_FLOW) {
4637 		pri = FLOW_PRIORITY(mcip->mci_min_pri,
4638 		    mcip->mci_max_pri,
4639 		    flent->fe_resource_props.mrp_priority);
4640 	} else {
4641 		pri = mcip->mci_max_pri;
4642 	}
4643 
4644 	for (count = 0; count < flent->fe_rx_srs_cnt; count++) {
4645 		mac_srs = flent->fe_rx_srs[count];
4646 		mac_update_srs_priority(mac_srs, pri);
4647 	}
4648 	/*
4649 	 * If we have a Tx SRS, we need to modify all the threads associated
4650 	 * with it.
4651 	 */
4652 	if (flent->fe_tx_srs != NULL)
4653 		mac_update_srs_priority(flent->fe_tx_srs, pri);
4654 }
4655 
4656 /*
4657  * RX and TX rings are reserved according to different semantics depending
4658  * on the requests from the MAC clients and type of rings:
4659  *
4660  * On the Tx side, by default we reserve individual rings, independently from
4661  * the groups.
4662  *
4663  * On the Rx side, the reservation is at the granularity of the group
4664  * of rings, and used for v12n level 1 only. It has a special case for the
4665  * primary client.
4666  *
4667  * If a share is allocated to a MAC client, we allocate a TX group and an
4668  * RX group to the client, and assign TX rings and RX rings to these
4669  * groups according to information gathered from the driver through
4670  * the share capability.
4671  *
4672  * The foreseable evolution of Rx rings will handle v12n level 2 and higher
4673  * to allocate individual rings out of a group and program the hw classifier
4674  * based on IP address or higher level criteria.
4675  */
4676 
4677 /*
4678  * mac_reserve_tx_ring()
4679  * Reserve a unused ring by marking it with MR_INUSE state.
4680  * As reserved, the ring is ready to function.
4681  *
4682  * Notes for Hybrid I/O:
4683  *
4684  * If a specific ring is needed, it is specified through the desired_ring
4685  * argument. Otherwise that argument is set to NULL.
4686  * If the desired ring was previous allocated to another client, this
4687  * function swaps it with a new ring from the group of unassigned rings.
4688  */
mac_ring_t *
mac_reserve_tx_ring(mac_impl_t *mip, mac_ring_t *desired_ring)
{
	mac_group_t *group;
	mac_ring_t *ring;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));

	/* nothing to reserve if the driver exposes no TX groups */
	if (mip->mi_tx_groups == NULL)
		return (NULL);

	/*
	 * Find an available ring and start it before changing its status.
	 * The unassigned rings are at the end of the mi_tx_groups
	 * array.
	 */
	group = mip->mi_tx_groups + mip->mi_tx_group_count;

	for (ring = group->mrg_rings; ring != NULL;
	    ring = ring->mr_next) {
		if (desired_ring == NULL) {
			if (ring->mr_state == MR_FREE)
				/* wanted any free ring and found one */
				break;
		} else {
			mac_ring_t *sring;
			mac_client_impl_t *client;
			mac_soft_ring_set_t *srs;

			if (ring != desired_ring)
				/* wants a desired ring but this one ain't it */
				continue;

			if (ring->mr_state == MR_FREE)
				break;

			/*
			 * Found the desired ring but it's already in use.
			 * Swap it with a new ring.
			 */

			/* find the client which owns that ring */
			for (client = mip->mi_clients_list; client != NULL;
			    client = client->mci_client_next) {
				srs = MCIP_TX_SRS(client);
				if (srs != NULL && mac_tx_srs_ring_present(srs,
				    desired_ring)) {
					/* found our ring */
					break;
				}
			}
			if (client == NULL) {
				/*
				 * The TX ring is in use, but it's not
				 * associated with any clients, so it
				 * has to be the default ring. In that
				 * case we can simply assign a new ring
				 * as the default ring, and we're done.
				 */
				ASSERT(mip->mi_default_tx_ring ==
				    (mac_ring_handle_t)desired_ring);

				/*
				 * Quiesce all clients on top of
				 * the NIC to make sure there are no
				 * pending threads still relying on
				 * that default ring, for example
				 * the multicast path.
				 */
				for (client = mip->mi_clients_list;
				    client != NULL;
				    client = client->mci_client_next) {
					mac_tx_client_quiesce(client,
					    SRS_QUIESCE);
				}

				/* recurse to pick any free ring as default */
				mip->mi_default_tx_ring = (mac_ring_handle_t)
				    mac_reserve_tx_ring(mip, NULL);

				/* resume the clients */
				for (client = mip->mi_clients_list;
				    client != NULL;
				    client = client->mci_client_next)
					mac_tx_client_restart(client);

				break;
			}

			/*
			 * Note that we cannot simply invoke the group
			 * add/rem routines since the client doesn't have a
			 * TX group. So we need to instead add/remove
			 * the rings from the SRS.
			 */
			ASSERT(client->mci_share == NULL);

			/* first quiesce the client */
			mac_tx_client_quiesce(client, SRS_QUIESCE);

			/* give a new ring to the client... */
			sring = mac_reserve_tx_ring(mip, NULL);
			if (sring != NULL) {
				/*
				 * A replacement ring was found; add it
				 * to the client's SRS. If sring were
				 * NULL, no free ring remains and the
				 * client falls back to the shared TX
				 * ring.
				 */
				mac_tx_srs_add_ring(srs, sring);
			}

			/* ... in exchange for our desired ring */
			mac_tx_srs_del_ring(srs, desired_ring);

			/* restart the client */
			mac_tx_client_restart(client);

			if (mip->mi_default_tx_ring ==
			    (mac_ring_handle_t)desired_ring) {
				/*
				 * The desired ring is the default ring,
				 * and there are one or more clients
				 * using that default ring directly.
				 */
				mip->mi_default_tx_ring =
				    (mac_ring_handle_t)sring;
				/*
				 * Find clients using default ring and
				 * swap it with the new default ring.
				 */
				for (client = mip->mi_clients_list;
				    client != NULL;
				    client = client->mci_client_next) {
					srs = MCIP_TX_SRS(client);
					if (srs != NULL &&
					    mac_tx_srs_ring_present(srs,
					    desired_ring)) {
						/* first quiesce the client */
						mac_tx_client_quiesce(client,
						    SRS_QUIESCE);

						/*
						 * Give it the new default
						 * ring, and remove the old
						 * one.
						 */
						if (sring != NULL) {
							mac_tx_srs_add_ring(srs,
							    sring);
						}
						mac_tx_srs_del_ring(srs,
						    desired_ring);

						/* restart the client */
						mac_tx_client_restart(client);
					}
				}
			}
			break;
		}
	}

	/* start the reserved ring before handing it out */
	if (ring != NULL) {
		if (mac_start_ring(ring) != 0)
			return (NULL);
		ring->mr_state = MR_INUSE;
	}

	return (ring);
}
4859 
4860 /*
 * Minimum number of rings to leave in the default RX group when allocating
 * rings to new clients.
4863  */
4864 static uint_t mac_min_rx_default_rings = 1;
4865 
4866 /*
4867  * Populate a zero-ring group with rings. If the share is non-NULL,
4868  * the rings are chosen according to that share.
4869  * Invoked after allocating a new RX or TX group through
4870  * mac_reserve_rx_group() or mac_reserve_tx_group(), respectively.
4871  * Returns zero on success, an errno otherwise.
4872  */
4873 int
4874 i_mac_group_allocate_rings(mac_impl_t *mip, mac_ring_type_t ring_type,
4875     mac_group_t *src_group, mac_group_t *new_group, mac_share_handle_t share)
4876 {
4877 	mac_ring_t **rings, *tmp_ring[1], *ring;
4878 	uint_t nrings;
4879 	int rv, i, j;
4880 
4881 	ASSERT(mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC &&
4882 	    mip->mi_tx_group_type == MAC_GROUP_TYPE_DYNAMIC);
4883 	ASSERT(new_group->mrg_cur_count == 0);
4884 
4885 	/*
4886 	 * First find the rings to allocate to the group.
4887 	 */
4888 	if (share != NULL) {
4889 		/* get rings through ms_squery() */
4890 		mip->mi_share_capab.ms_squery(share, ring_type, NULL, &nrings);
4891 		ASSERT(nrings != 0);
4892 		rings = kmem_alloc(nrings * sizeof (mac_ring_handle_t),
4893 		    KM_SLEEP);
4894 		mip->mi_share_capab.ms_squery(share, ring_type,
4895 		    (mac_ring_handle_t *)rings, &nrings);
4896 	} else {
4897 		/* this function is called for TX only with a share */
4898 		ASSERT(ring_type == MAC_RING_TYPE_RX);
4899 		/*
4900 		 * Pick one ring from default group.
4901 		 *
4902 		 * for now pick the second ring which requires the first ring
4903 		 * at index 0 to stay in the default group, since it is the
4904 		 * ring which carries the multicast traffic.
4905 		 * We need a better way for a driver to indicate this,
4906 		 * for example a per-ring flag.
4907 		 */
4908 		for (ring = src_group->mrg_rings; ring != NULL;
4909 		    ring = ring->mr_next) {
4910 			if (ring->mr_index != 0)
4911 				break;
4912 		}
4913 		ASSERT(ring != NULL);
4914 		nrings = 1;
4915 		tmp_ring[0] = ring;
4916 		rings = tmp_ring;
4917 	}
4918 
4919 	switch (ring_type) {
4920 	case MAC_RING_TYPE_RX:
4921 		if (src_group->mrg_cur_count - nrings <
4922 		    mac_min_rx_default_rings) {
4923 			/* we ran out of rings */
4924 			return (ENOSPC);
4925 		}
4926 
4927 		/* move receive rings to new group */
4928 		for (i = 0; i < nrings; i++) {
4929 			rv = mac_group_mov_ring(mip, new_group, rings[i]);
4930 			if (rv != 0) {
4931 				/* move rings back on failure */
4932 				for (j = 0; j < i; j++) {
4933 					(void) mac_group_mov_ring(mip,
4934 					    src_group, rings[j]);
4935 				}
4936 				return (rv);
4937 			}
4938 		}
4939 		break;
4940 
4941 	case MAC_RING_TYPE_TX: {
4942 		mac_ring_t *tmp_ring;
4943 
4944 		/* move the TX rings to the new group */
4945 		ASSERT(src_group == NULL);
4946 		for (i = 0; i < nrings; i++) {
4947 			/* get the desired ring */
4948 			tmp_ring = mac_reserve_tx_ring(mip, rings[i]);
4949 			ASSERT(tmp_ring == rings[i]);
4950 			rv = mac_group_mov_ring(mip, new_group, rings[i]);
4951 			if (rv != 0) {
4952 				/* cleanup on failure */
4953 				for (j = 0; j < i; j++) {
4954 					(void) mac_group_mov_ring(mip,
4955 					    mip->mi_tx_groups +
4956 					    mip->mi_tx_group_count, rings[j]);
4957 				}
4958 			}
4959 		}
4960 		break;
4961 	}
4962 	}
4963 
4964 	if (share != NULL) {
4965 		/* add group to share */
4966 		mip->mi_share_capab.ms_sadd(share, new_group->mrg_driver);
4967 		/* free temporary array of rings */
4968 		kmem_free(rings, nrings * sizeof (mac_ring_handle_t));
4969 	}
4970 
4971 	return (0);
4972 }
4973 
4974 void
4975 mac_rx_group_add_client(mac_group_t *grp, mac_client_impl_t *mcip)
4976 {
4977 	mac_grp_client_t *mgcp;
4978 
4979 	for (mgcp = grp->mrg_clients; mgcp != NULL; mgcp = mgcp->mgc_next) {
4980 		if (mgcp->mgc_client == mcip)
4981 			break;
4982 	}
4983 
4984 	VERIFY(mgcp == NULL);
4985 
4986 	mgcp = kmem_zalloc(sizeof (mac_grp_client_t), KM_SLEEP);
4987 	mgcp->mgc_client = mcip;
4988 	mgcp->mgc_next = grp->mrg_clients;
4989 	grp->mrg_clients = mgcp;
4990 
4991 }
4992 
4993 void
4994 mac_rx_group_remove_client(mac_group_t *grp, mac_client_impl_t *mcip)
4995 {
4996 	mac_grp_client_t *mgcp, **pprev;
4997 
4998 	for (pprev = &grp->mrg_clients, mgcp = *pprev; mgcp != NULL;
4999 	    pprev = &mgcp->mgc_next, mgcp = *pprev) {
5000 		if (mgcp->mgc_client == mcip)
5001 			break;
5002 	}
5003 
5004 	ASSERT(mgcp != NULL);
5005 
5006 	*pprev = mgcp->mgc_next;
5007 	kmem_free(mgcp, sizeof (mac_grp_client_t));
5008 }
5009 
5010 /*
5011  * mac_reserve_rx_group()
5012  *
5013  * Finds an available group and exclusively reserves it for a client.
5014  * The group is chosen to suit the flow's resource controls (bandwidth and
5015  * fanout requirements) and the address type.
 * If the requestor is the primary MAC then return the group with the
5017  * largest number of rings, otherwise the default ring when available.
5018  */
mac_group_t *
mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr,
    mac_rx_group_reserve_type_t rtype)
{
	mac_share_handle_t	share = mcip->mci_share;
	mac_impl_t		*mip = mcip->mci_mip;
	mac_group_t		*grp = NULL;
	int			i, start, loopcount;
	int			err;
	mac_address_t		*map;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));

	/* Check if a group already has this mac address (case of VLANs) */
	if ((map = mac_find_macaddr(mip, mac_addr)) != NULL)
		return (map->ma_group);

	if (mip->mi_rx_groups == NULL || mip->mi_rx_group_count == 0 ||
	    rtype == MAC_RX_NO_RESERVE)
		return (NULL);

	/*
	 * Try to exclusively reserve a RX group.
	 *
	 * For flows requiring SW_RING it always goes to the default group
	 * (Until we can explicitly call out default groups (CR 6695600),
	 * we assume that the default group is always at position zero);
	 *
	 * For flows requiring HW_DEFAULT_RING (unicast flow of the primary
	 * client), try to reserve the default RX group only.
	 *
	 * For flows requiring HW_RING (unicast flow of other clients), try
	 * to reserve non-default RX group then the default group.
	 */
	switch (rtype) {
	case MAC_RX_RESERVE_DEFAULT:
		/* examine only the default group, at index 0 */
		start = 0;
		loopcount = 1;
		break;
	case MAC_RX_RESERVE_NONDEFAULT:
		/*
		 * Start past the default group and wrap around to it as
		 * the last candidate. (MAC_RX_NO_RESERVE was rejected
		 * above, so this switch covers all remaining values.)
		 */
		start = 1;
		loopcount = mip->mi_rx_group_count;
	}

	for (i = start; i < start + loopcount; i++) {
		grp = &mip->mi_rx_groups[i % mip->mi_rx_group_count];

		DTRACE_PROBE3(rx__group__trying, char *, mip->mi_name,
		    int, grp->mrg_index, mac_group_state_t, grp->mrg_state);

		/*
		 * Check to see whether this mac client is the only client
		 * on this RX group. If not, we cannot exclusively reserve
		 * this RX group.
		 */
		if (!MAC_RX_GROUP_NO_CLIENT(grp) &&
		    (MAC_RX_GROUP_ONLY_CLIENT(grp) != mcip)) {
			continue;
		}

		/*
		 * This group could already be SHARED by other multicast
		 * flows on this client. In that case, the group would
		 * be shared and has already been started.
		 */
		ASSERT(grp->mrg_state != MAC_GROUP_STATE_UNINIT);

		if ((grp->mrg_state == MAC_GROUP_STATE_REGISTERED) &&
		    (mac_start_group(grp) != 0)) {
			continue;
		}

		/*
		 * The default group (index 0) and statically-grouped
		 * hardware need no ring population; we're done.
		 */
		if ((i % mip->mi_rx_group_count) == 0 ||
		    mip->mi_rx_group_type != MAC_GROUP_TYPE_DYNAMIC) {
			break;
		}

		ASSERT(grp->mrg_cur_count == 0);

		/*
		 * Populate the group. Rings should be taken
		 * from the default group at position 0 for now.
		 */

		err = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_RX,
		    &mip->mi_rx_groups[0], grp, share);
		if (err == 0)
			break;

		DTRACE_PROBE3(rx__group__reserve__alloc__rings, char *,
		    mip->mi_name, int, grp->mrg_index, int, err);

		/*
		 * It's a dynamic group but the grouping operation failed.
		 */
		mac_stop_group(grp);
	}

	/* loop exhausted without a break: nothing could be reserved */
	if (i == start + loopcount)
		return (NULL);

	ASSERT(grp != NULL);

	DTRACE_PROBE2(rx__group__reserved,
	    char *, mip->mi_name, int, grp->mrg_index);
	return (grp);
}
5126 
5127 /*
5128  * mac_rx_release_group()
5129  *
5130  * This is called when there are no clients left for the group.
5131  * The group is stopped and marked MAC_GROUP_STATE_REGISTERED,
5132  * and if it is a non default group, the shares are removed and
5133  * all rings are assigned back to default group.
5134  */
5135 void
5136 mac_release_rx_group(mac_client_impl_t *mcip, mac_group_t *group)
5137 {
5138 	mac_impl_t	*mip = mcip->mci_mip;
5139 	mac_ring_t	*ring;
5140 
5141 	ASSERT(group != &mip->mi_rx_groups[0]);
5142 
5143 	/*
5144 	 * This is the case where there are no clients left. Any
5145 	 * SRS etc on this group have also be quiesced.
5146 	 */
5147 	for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
5148 		if (ring->mr_classify_type == MAC_HW_CLASSIFIER) {
5149 			ASSERT(group->mrg_state == MAC_GROUP_STATE_RESERVED);
5150 			/*
5151 			 * Remove the SRS associated with the HW ring.
5152 			 * As a result, polling will be disabled.
5153 			 */
5154 			ring->mr_srs = NULL;
5155 		}
5156 		ASSERT(ring->mr_state == MR_INUSE);
5157 		mac_stop_ring(ring);
5158 		ring->mr_state = MR_FREE;
5159 		ring->mr_flag = 0;
5160 	}
5161 
5162 	/* remove group from share */
5163 	if (mcip->mci_share != NULL) {
5164 		mip->mi_share_capab.ms_sremove(mcip->mci_share,
5165 		    group->mrg_driver);
5166 	}
5167 
5168 	if (mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
5169 		mac_ring_t *ring;
5170 
5171 		/*
5172 		 * Rings were dynamically allocated to group.
5173 		 * Move rings back to default group.
5174 		 */
5175 		while ((ring = group->mrg_rings) != NULL) {
5176 			(void) mac_group_mov_ring(mip,
5177 			    &mip->mi_rx_groups[0], ring);
5178 		}
5179 	}
5180 	mac_stop_group(group);
5181 	/*
5182 	 * Possible improvement: See if we can assign the group just released
5183 	 * to a another client of the mip
5184 	 */
5185 }
5186 
5187 /*
5188  * Reserves a TX group for the specified share. Invoked by mac_tx_srs_setup()
5189  * when a share was allocated to the client.
5190  */
5191 mac_group_t *
5192 mac_reserve_tx_group(mac_impl_t *mip, mac_share_handle_t share)
5193 {
5194 	mac_group_t *grp;
5195 	int rv, i;
5196 
5197 	/*
5198 	 * TX groups are currently allocated only to MAC clients
5199 	 * which are associated with a share. Since we have a fixed
5200 	 * number of share and groups, and we already successfully
5201 	 * allocated a share, find an available TX group.
5202 	 */
5203 	ASSERT(share != NULL);
5204 	ASSERT(mip->mi_tx_group_free > 0);
5205 
5206 	for (i = 0; i <  mip->mi_tx_group_count; i++) {
5207 		grp = &mip->mi_tx_groups[i];
5208 
5209 		if ((grp->mrg_state == MAC_GROUP_STATE_RESERVED) ||
5210 		    (grp->mrg_state == MAC_GROUP_STATE_UNINIT))
5211 			continue;
5212 
5213 		rv = mac_start_group(grp);
5214 		ASSERT(rv == 0);
5215 
5216 		grp->mrg_state = MAC_GROUP_STATE_RESERVED;
5217 		break;
5218 	}
5219 
5220 	ASSERT(grp != NULL);
5221 
5222 	/*
5223 	 * Populate the group. Rings should be taken from the group
5224 	 * of unassigned rings, which is past the array of TX
5225 	 * groups adversized by the driver.
5226 	 */
5227 	rv = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_TX, NULL,
5228 	    grp, share);
5229 	if (rv != 0) {
5230 		DTRACE_PROBE3(tx__group__reserve__alloc__rings,
5231 		    char *, mip->mi_name, int, grp->mrg_index, int, rv);
5232 
5233 		mac_stop_group(grp);
5234 		grp->mrg_state = MAC_GROUP_STATE_UNINIT;
5235 
5236 		return (NULL);
5237 	}
5238 
5239 	mip->mi_tx_group_free--;
5240 
5241 	return (grp);
5242 }
5243 
/*
 * Release a TX group previously reserved by mac_reserve_tx_group():
 * remove the group from the client's share, return all rings to the
 * pool of unassigned rings past the advertised TX groups, stop the
 * group and mark it available again.
 */
void
mac_release_tx_group(mac_impl_t *mip, mac_group_t *grp)
{
	/* The client to which this TX group was exclusively reserved. */
	mac_client_impl_t *mcip = grp->mrg_tx_client;
	mac_share_handle_t share = mcip->mci_share;
	mac_ring_t *ring;

	/* TX groups are only handed out to clients holding a share. */
	ASSERT(mip->mi_tx_group_type == MAC_GROUP_TYPE_DYNAMIC);
	ASSERT(share != NULL);
	ASSERT(grp->mrg_state == MAC_GROUP_STATE_RESERVED);

	/* Detach the group's hardware resources from the share. */
	mip->mi_share_capab.ms_sremove(share, grp->mrg_driver);
	while ((ring = grp->mrg_rings) != NULL) {
		/* move the ring back to the pool */
		(void) mac_group_mov_ring(mip, mip->mi_tx_groups +
		    mip->mi_tx_group_count, ring);
	}
	mac_stop_group(grp);
	/*
	 * NOTE(review): mac_set_rx_group_state() is invoked on a TX
	 * group here; presumably it only updates the common mrg_state
	 * field shared by RX and TX groups -- confirm against its
	 * definition.
	 */
	mac_set_rx_group_state(grp, MAC_GROUP_STATE_REGISTERED);
	grp->mrg_tx_client = NULL;
	mip->mi_tx_group_free++;
}
5266 
5267 /*
5268  * This is a 1-time control path activity initiated by the client (IP).
5269  * The mac perimeter protects against other simultaneous control activities,
5270  * for example an ioctl that attempts to change the degree of fanout and
5271  * increase or decrease the number of softrings associated with this Tx SRS.
5272  */
5273 static mac_tx_notify_cb_t *
5274 mac_client_tx_notify_add(mac_client_impl_t *mcip,
5275     mac_tx_notify_t notify, void *arg)
5276 {
5277 	mac_cb_info_t *mcbi;
5278 	mac_tx_notify_cb_t *mtnfp;
5279 
5280 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
5281 
5282 	mtnfp = kmem_zalloc(sizeof (mac_tx_notify_cb_t), KM_SLEEP);
5283 	mtnfp->mtnf_fn = notify;
5284 	mtnfp->mtnf_arg = arg;
5285 	mtnfp->mtnf_link.mcb_objp = mtnfp;
5286 	mtnfp->mtnf_link.mcb_objsize = sizeof (mac_tx_notify_cb_t);
5287 	mtnfp->mtnf_link.mcb_flags = MCB_TX_NOTIFY_CB_T;
5288 
5289 	mcbi = &mcip->mci_tx_notify_cb_info;
5290 	mutex_enter(mcbi->mcbi_lockp);
5291 	mac_callback_add(mcbi, &mcip->mci_tx_notify_cb_list, &mtnfp->mtnf_link);
5292 	mutex_exit(mcbi->mcbi_lockp);
5293 	return (mtnfp);
5294 }
5295 
5296 static void
5297 mac_client_tx_notify_remove(mac_client_impl_t *mcip, mac_tx_notify_cb_t *mtnfp)
5298 {
5299 	mac_cb_info_t	*mcbi;
5300 	mac_cb_t	**cblist;
5301 
5302 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
5303 
5304 	if (!mac_callback_find(&mcip->mci_tx_notify_cb_info,
5305 	    &mcip->mci_tx_notify_cb_list, &mtnfp->mtnf_link)) {
5306 		cmn_err(CE_WARN,
5307 		    "mac_client_tx_notify_remove: callback not "
5308 		    "found, mcip 0x%p mtnfp 0x%p", (void *)mcip, (void *)mtnfp);
5309 		return;
5310 	}
5311 
5312 	mcbi = &mcip->mci_tx_notify_cb_info;
5313 	cblist = &mcip->mci_tx_notify_cb_list;
5314 	mutex_enter(mcbi->mcbi_lockp);
5315 	if (mac_callback_remove(mcbi, cblist, &mtnfp->mtnf_link))
5316 		kmem_free(mtnfp, sizeof (mac_tx_notify_cb_t));
5317 	else
5318 		mac_callback_remove_wait(&mcip->mci_tx_notify_cb_info);
5319 	mutex_exit(mcbi->mcbi_lockp);
5320 }
5321 
5322 /*
5323  * mac_client_tx_notify():
5324  * call to add and remove flow control callback routine.
5325  */
5326 mac_tx_notify_handle_t
5327 mac_client_tx_notify(mac_client_handle_t mch, mac_tx_notify_t callb_func,
5328     void *ptr)
5329 {
5330 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
5331 	mac_tx_notify_cb_t	*mtnfp = NULL;
5332 
5333 	i_mac_perim_enter(mcip->mci_mip);
5334 
5335 	if (callb_func != NULL) {
5336 		/* Add a notify callback */
5337 		mtnfp = mac_client_tx_notify_add(mcip, callb_func, ptr);
5338 	} else {
5339 		mac_client_tx_notify_remove(mcip, (mac_tx_notify_cb_t *)ptr);
5340 	}
5341 	i_mac_perim_exit(mcip->mci_mip);
5342 
5343 	return ((mac_tx_notify_handle_t)mtnfp);
5344 }
5345 
/*
 * Install the bridging callback vectors (TX, RX, reference and
 * link-state hooks) into the MAC module's global function pointers.
 * Passing NULLs clears them.  NOTE(review): no locking is visible
 * here; presumably the caller serializes load/unload -- confirm.
 */
void
mac_bridge_vectors(mac_bridge_tx_t txf, mac_bridge_rx_t rxf,
    mac_bridge_ref_t reff, mac_bridge_ls_t lsf)
{
	mac_bridge_tx_cb = txf;
	mac_bridge_rx_cb = rxf;
	mac_bridge_ref_cb = reff;
	mac_bridge_ls_cb = lsf;
}
5355 
5356 int
5357 mac_bridge_set(mac_handle_t mh, mac_handle_t link)
5358 {
5359 	mac_impl_t *mip = (mac_impl_t *)mh;
5360 	int retv;
5361 
5362 	mutex_enter(&mip->mi_bridge_lock);
5363 	if (mip->mi_bridge_link == NULL) {
5364 		mip->mi_bridge_link = link;
5365 		retv = 0;
5366 	} else {
5367 		retv = EBUSY;
5368 	}
5369 	mutex_exit(&mip->mi_bridge_lock);
5370 	if (retv == 0) {
5371 		mac_poll_state_change(mh, B_FALSE);
5372 		mac_capab_update(mh);
5373 	}
5374 	return (retv);
5375 }
5376 
5377 /*
5378  * Disable bridging on the indicated link.
5379  */
void
mac_bridge_clear(mac_handle_t mh, mac_handle_t link)
{
	mac_impl_t *mip = (mac_impl_t *)mh;

	/* Detach the bridge link; it must match the one currently set. */
	mutex_enter(&mip->mi_bridge_lock);
	ASSERT(mip->mi_bridge_link == link);
	mip->mi_bridge_link = NULL;
	mutex_exit(&mip->mi_bridge_lock);
	/* Refresh polling state and capabilities now that the bridge is gone. */
	mac_poll_state_change(mh, B_TRUE);
	mac_capab_update(mh);
}
5392 
/*
 * Mark the MAC as not usable for active (exclusive) use by setting
 * MIS_NO_ACTIVE in mi_state_flags, under the mac perimeter.
 */
void
mac_no_active(mac_handle_t mh)
{
	mac_impl_t *mip = (mac_impl_t *)mh;

	i_mac_perim_enter(mip);
	mip->mi_state_flags |= MIS_NO_ACTIVE;
	i_mac_perim_exit(mip);
}
5402