/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * MAC Services Module
 *
 * The GLDv3 framework locking - The MAC layer
 * --------------------------------------------
 *
 * The MAC layer is central to the GLD framework and can provide the locking
 * framework needed for itself and for the use of MAC clients. MAC end points
 * are fairly disjoint and don't share a lot of state, so a coarse-grained
 * multi-threading scheme is to single-thread all create/modify/delete (set
 * type) control operations on a per mac end point basis while allowing data
 * threads to run concurrently.
 *
 * Control operations (set) that modify a mac end point are always serialized
 * on a per mac end point basis; we have at most one such thread per mac end
 * point at a time.
 *
 * All other (non-serialized) operations are essentially multi-threaded. For
 * example, a control operation (get) such as reading statistics may not care
 * about reading values atomically, and data threads may send or receive data
 * concurrently. These types of operations mostly don't modify the control
 * state. Any state these operations care about is protected using
 * traditional locks.
 *
 * The perimeter only serializes serial operations. It does not imply there
 * aren't any other concurrent operations. However a serialized operation may
 * sometimes need to make sure it is the only thread. In this case it needs
 * to use reference counting mechanisms to cv_wait until any current data
 * threads are done.
 *
 * The mac layer itself does not hold any locks across a call to another layer.
 * The perimeter is however held across a down call to the driver to make the
 * whole control operation atomic with respect to other control operations.
 * Also the data path and get type control operations may proceed concurrently.
 * These operations synchronize with the single serial operation on a given mac
 * end point using regular locks. The perimeter ensures that conflicting
 * operations, such as a mac_multicast_add and a mac_multicast_remove on the
 * same mac end point, don't interfere with each other and also ensures that
 * the changes in the mac layer and the call to the underlying driver to, say,
 * add a multicast address are done atomically without interference from a
 * thread trying to delete the same address.
 *
 * For example, consider
 * mac_multicast_add()
 * {
 *	mac_perimeter_enter();	serialize all control operations
 *
 *	grab list lock		protect against access by data threads
 *	add to list
 *	drop list lock
 *
 *	call driver's mi_multicst
 *
 *	mac_perimeter_exit();
 * }
 *
 * To lessen the number of serialization locks and simplify the lock hierarchy,
 * we serialize all the control operations on a per mac end point basis by
 * using a single serialization lock called the perimeter. We allow recursive
 * entry into the perimeter to facilitate use of this mechanism by both the
 * mac client and the MAC layer itself.
 *
 * MAC client means an entity that does an operation on a mac handle
 * obtained from a mac_open/mac_client_open. Similarly MAC driver means
 * an entity that does an operation on a mac handle obtained from a
 * mac_register. An entity could be both client and driver but on different
 * handles (e.g. aggr) and should only make the corresponding mac interface
 * calls, i.e. mac driver interface or mac client interface, as appropriate
 * for that mac handle.
 *
 * General rules
 * -------------
 *
 * R1. The lock order of upcall threads is naturally opposite to downcall
 * threads. Hence upcalls must not hold any locks across layers for fear of
 * recursive lock enter and lock order violation. This applies to all layers.
 *
 * R2. The perimeter is just another lock. Since it is held in the down
 * direction, acquiring the perimeter in an upcall is prohibited as it would
 * cause a deadlock. This applies to all layers.
 *
 * Note that upcalls that need to grab the mac perimeter (for example
 * mac_notify upcalls) can still achieve that by posting the request to a
 * thread, which can then grab all the required perimeters and locks in the
 * right global order. Note that in the above example the mac layer itself
 * won't grab the mac perimeter in the mac_notify upcall, instead the upcall
 * to the client must do that. Please see the aggr code for an example.
 *
 * MAC client rules
 * ----------------
 *
 * R3. A MAC client may use the MAC provided perimeter facility to serialize
 * control operations on a per mac end point basis. It does this by acquiring
 * and holding the perimeter across a sequence of calls to the mac layer.
 * This ensures atomicity across the entire block of mac calls. In this
 * model the MAC client must not hold any client locks across the calls to
 * the mac layer. This model is the preferred solution.
 *
 * R4. However if a MAC client has a lot of global state across all mac end
 * points the per mac end point serialization may not be sufficient. In this
 * case the client may choose to use global locks or use its own serialization.
 * To avoid deadlocks, these client layer locks held across the mac calls
 * in the control path must never be acquired by the data path for the reason
 * mentioned below.
 *
 * (Assume that a control operation that holds a client lock blocks in the
 * mac layer waiting for upcall reference counts to drop to zero. If an upcall
 * data thread that holds this reference count tries to acquire the same
 * client lock subsequently, it will deadlock.)
 *
 * A MAC client may follow either the R3 model or the R4 model, but can't
 * mix both. In the former, the hierarchy is Perim -> client locks, but in
 * the latter it is client locks -> Perim.
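 *
 * For example, a client following the R3 model might bracket a sequence of
 * mac calls as sketched below (illustrative only; the entry points shown
 * are the existing mac client interfaces):
 *
 * client_config()
 * {
 *	mac_perim_handle_t mph;
 *
 *	mac_perim_enter_by_mh(mh, &mph);  serialize with other control ops
 *	mac_unicast_add(...);		  block of mac calls that must be
 *	mac_multicast_add(...);		  atomic as a unit
 *	mac_perim_exit(mph);
 * }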
 *
 * R5. MAC clients must make MAC calls (excluding data calls) in a cv_wait'able
 * context since they may block while trying to acquire the perimeter.
 * In addition some calls may block waiting for upcall refcnts to come down to
 * zero.
 *
 * R6. MAC clients must make sure that they are single threaded and all threads
 * from the top (in particular data threads) have finished before calling
 * mac_client_close. The MAC framework does not track the number of client
 * threads using the mac client handle. Also mac clients must make sure
 * they have undone all the control operations before calling mac_client_close.
 * For example mac_unicast_remove/mac_multicast_remove to undo the corresponding
 * mac_unicast_add/mac_multicast_add.
 *
 * MAC framework rules
 * -------------------
 *
 * R7. The mac layer itself must not hold any mac layer locks (except the mac
 * perimeter) across a call to any other layer from the mac layer. The call to
 * any other layer could be via mi_* entry points, classifier entry points into
 * the driver or via upcall pointers into layers above. The mac perimeter may
 * be acquired or held only in the down direction, e.g. when calling into
 * a mi_* driver entry point to provide atomicity of the operation.
 *
 * R8. Since it is not guaranteed (see R14) that drivers won't hold locks across
 * mac driver interfaces, the MAC layer must provide a cut out for control
 * interfaces like upcall notifications and start them in a separate thread.
 *
 * R9. Note that locking order also implies a plumbing order. For example
 * VNICs are allowed to be created over aggrs, but not vice-versa. An attempt
 * to plumb in any other order must be failed at mac_open time, otherwise it
 * could lead to deadlocks due to inverse locking order.
 *
 * R10. MAC driver interfaces must not block since the driver could call them
 * in interrupt context.
 *
 * R11. Walkers must preferably not hold any locks while calling walker
 * callbacks. Instead these can operate on reference counts. In simple
 * callbacks it may be ok to hold a lock and call the callbacks, but this is
 * harder to maintain in the general case of arbitrary callbacks.
 *
 * R12. The MAC layer must protect upcall notification callbacks using reference
 * counts rather than holding locks across the callbacks.
 *
 * R13. Given the variety of drivers, it is preferable if the MAC layer can make
 * sure that any pointers (such as mac ring pointers) it passes to the driver
 * remain valid until mac unregister time. Currently the mac layer achieves
 * this by using generation numbers for rings and freeing the mac rings only
 * at unregister time.  The MAC layer must provide a layer of indirection and
 * must not expose underlying driver rings or driver data structures/pointers
 * directly to MAC clients.
 *
 * MAC driver rules
 * ----------------
 *
 * R14. It would be preferable if MAC drivers don't hold any locks across any
 * mac call. However at a minimum they must not hold any locks across data
 * upcalls. They must also make sure that all references to mac data structures
 * are cleaned up and that it is single threaded at mac_unregister time.
 *
 * R15. MAC driver interfaces don't block and so the action may be done
 * asynchronously in a separate thread as for example handling notifications.
 * The driver must not assume that the action is complete when the call
 * returns.
 *
 * R16. Drivers must maintain a generation number per Rx ring, and pass it
 * back to mac_rx_ring(). They are expected to increment the generation
 * number whenever the ring's stop routine is invoked.
 * See comments in mac_rx_ring().
 *
 * R17. Similarly mi_stop is another synchronization point and the driver must
 * ensure that all upcalls are done and there won't be any future upcall
 * before returning from mi_stop.
 *
 * R18. The driver may assume that all set/modify control operations via
 * the mi_* entry points are single threaded on a per mac end point basis.
 *
 * Lock and Perimeter hierarchy scenarios
 * --------------------------------------
 *
 * i_mac_impl_lock -> mi_rw_lock -> srs_lock -> s_ring_lock[i_mac_tx_srs_notify]
 *
 * ft_lock -> fe_lock [mac_flow_lookup]
 *
 * mi_rw_lock -> fe_lock [mac_bcast_send]
 *
 * srs_lock -> mac_bw_lock [mac_rx_srs_drain_bw]
 *
 * cpu_lock -> mac_srs_g_lock -> srs_lock -> s_ring_lock [mac_walk_srs_and_bind]
 *
 * i_dls_devnet_lock -> mac layer locks [dls_devnet_rename]
 *
 * Perimeters are ordered P1 -> P2 -> P3 from top to bottom in order of mac
 * client to driver. In the case of clients that explicitly use the mac
 * provided perimeter mechanism for their serialization, the hierarchy is
 * Perimeter -> mac layer locks, since the client never holds any locks across
 * the mac calls. In the case of clients that use their own locks the hierarchy
 * is Client locks -> Mac Perim -> Mac layer locks. The client never explicitly
 * calls mac_perim_enter/exit in this case.
 *
 * Subflow creation rules
 * ----------------------
 * o In the case of a user specified cpulist present on both the underlying
 *   link and the flows, the flow's cpulist must be a subset of the
 *   underlying link's.
 * o In the case of a user specified fanout mode present on link and flow, the
 *   subflow fanout count has to be less than or equal to that of the
 *   underlying link. The cpu-bindings for the subflows will be a subset of
 *   the underlying link.
 * o If no cpulist is specified on either the underlying link or the flow, the
 *   underlying link relies on a MAC tunable to provide out of the box fanout.
 *   The subflow will have no cpulist (the subflow will be unbound).
 * o If no cpulist is specified on the underlying link, a subflow can carry
 *   either a user-specified cpulist or fanout count. The cpu-bindings for
 *   the subflow need not be a subset of the underlying link's.
 * o In the case where the underlying link is carrying either a user specified
 *   cpulist or fanout mode and the subflow is unspecified, the subflow will
 *   be created unbound.
 * o While creating unbound subflows, bandwidth mode changes attempt to
 *   figure out an appropriate fanout count. In such cases the fanout count
 *   will override the unbound cpu-binding behavior.
 * o In addition to this, while cycling between flow and link properties, we
 *   impose a restriction that if a link property has a subflow with
 *   user-specified attributes, we will not allow changing the link property.
 *   The administrator needs to reset all the user specified properties for
 *   the subflows before attempting a link property change.
 * Some of the above rules can be overridden by specifying additional command
 * line options while creating or modifying link or subflow properties.
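 *
 * For example (illustrative only; the exact property names depend on the
 * administrative tools in use), a cpulist can be assigned to a link and to
 * one of its flows with:
 *
 *	dladm set-linkprop -p cpus=0,1,2,3 net0
 *	flowadm set-flowprop -p cpus=0,1 flow1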
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/id_space.h>
#include <sys/esunddi.h>
#include <sys/stat.h>
#include <sys/mkdev.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/dlpi.h>
#include <sys/modhash.h>
#include <sys/mac_provider.h>
#include <sys/mac_client_impl.h>
#include <sys/mac_soft_ring.h>
#include <sys/mac_impl.h>
#include <sys/mac.h>
#include <sys/dls.h>
#include <sys/dld.h>
#include <sys/modctl.h>
#include <sys/fs/dv_node.h>
#include <sys/thread.h>
#include <sys/proc.h>
#include <sys/callb.h>
#include <sys/cpuvar.h>
#include <sys/atomic.h>
#include <sys/bitmap.h>
#include <sys/sdt.h>
#include <sys/mac_flow.h>
#include <sys/ddi_intr_impl.h>
#include <sys/disp.h>
#include <sys/vnic.h>
#include <sys/vnic_impl.h>
#include <sys/vlan.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <sys/exacct.h>
#include <sys/exacct_impl.h>
#include <inet/nd.h>
#include <sys/ethernet.h>

#define	IMPL_HASHSZ	67	/* prime */

kmem_cache_t	*i_mac_impl_cachep;
mod_hash_t		*i_mac_impl_hash;
krwlock_t		i_mac_impl_lock;
uint_t			i_mac_impl_count;
static kmem_cache_t	*mac_ring_cache;
static id_space_t	*minor_ids;
static uint32_t		minor_count;

/*
 * Logging stuff. Perhaps mac_logging_interval could be broken into
 * mac_flow_log_interval and mac_link_log_interval if we want to be
 * able to schedule them differently.
 */
uint_t			mac_logging_interval;
boolean_t		mac_flow_log_enable;
boolean_t		mac_link_log_enable;
timeout_id_t		mac_logging_timer;

/* for debugging, see MAC_DBG_PRT() in mac_impl.h */
int mac_dbg = 0;

#define	MACTYPE_KMODDIR	"mac"
#define	MACTYPE_HASHSZ	67
static mod_hash_t	*i_mactype_hash;
/*
 * i_mactype_lock synchronizes threads that obtain references to mactype_t
 * structures through i_mactype_getplugin().
 */
static kmutex_t		i_mactype_lock;

/*
 * mac_tx_percpu_cnt
 *
 * Number of per cpu locks per mac_client_impl_t. Used by the transmit side
 * in mac_tx to reduce lock contention. This is sized at boot time in mac_init.
 * mac_tx_percpu_cnt_max is settable in /etc/system and must be a power of 2.
 * Per cpu locks may be disabled by setting mac_tx_percpu_cnt_max to 1.
 */
int mac_tx_percpu_cnt;
int mac_tx_percpu_cnt_max = 128;
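
/*
 * For example, the cap can be lowered (or per cpu Tx locks effectively
 * disabled with a value of 1) from /etc/system; this is an illustrative
 * setting, applied at boot:
 *
 *	set mac:mac_tx_percpu_cnt_max = 16
 */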

static int i_mac_constructor(void *, void *, int);
static void i_mac_destructor(void *, void *);
static int i_mac_ring_ctor(void *, void *, int);
static void i_mac_ring_dtor(void *, void *);
static mblk_t *mac_rx_classify(mac_impl_t *, mac_resource_handle_t, mblk_t *);
void mac_tx_client_flush(mac_client_impl_t *);
void mac_tx_client_block(mac_client_impl_t *);
static void mac_rx_ring_quiesce(mac_ring_t *, uint_t);
static int mac_start_group_and_rings(mac_group_t *);
static void mac_stop_group_and_rings(mac_group_t *);

/*
 * Module initialization functions.
 */

void
mac_init(void)
{
	mac_tx_percpu_cnt = ((boot_max_ncpus == -1) ? max_ncpus :
	    boot_max_ncpus);

	/* Upper bound is mac_tx_percpu_cnt_max */
	if (mac_tx_percpu_cnt > mac_tx_percpu_cnt_max)
		mac_tx_percpu_cnt = mac_tx_percpu_cnt_max;

	if (mac_tx_percpu_cnt < 1) {
		/* Someone set mac_tx_percpu_cnt_max to 0 or less */
		mac_tx_percpu_cnt = 1;
	}

	ASSERT(mac_tx_percpu_cnt >= 1);
	mac_tx_percpu_cnt = (1 << highbit(mac_tx_percpu_cnt - 1));
	/*
	 * Make it of the form 2**N - 1 in the range
	 * [0 .. mac_tx_percpu_cnt_max - 1]
	 */
	mac_tx_percpu_cnt--;
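	/*
	 * For example, with boot_max_ncpus of 6, highbit(5) is 3, so the
	 * count first rounds up to 8 and the decrement leaves 7 (2**3 - 1).
	 */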

	i_mac_impl_cachep = kmem_cache_create("mac_impl_cache",
	    sizeof (mac_impl_t), 0, i_mac_constructor, i_mac_destructor,
	    NULL, NULL, NULL, 0);
	ASSERT(i_mac_impl_cachep != NULL);

	mac_ring_cache = kmem_cache_create("mac_ring_cache",
	    sizeof (mac_ring_t), 0, i_mac_ring_ctor, i_mac_ring_dtor, NULL,
	    NULL, NULL, 0);
	ASSERT(mac_ring_cache != NULL);

	i_mac_impl_hash = mod_hash_create_extended("mac_impl_hash",
	    IMPL_HASHSZ, mod_hash_null_keydtor, mod_hash_null_valdtor,
	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
	rw_init(&i_mac_impl_lock, NULL, RW_DEFAULT, NULL);

	mac_flow_init();
	mac_soft_ring_init();
	mac_bcast_init();
	mac_client_init();

	i_mac_impl_count = 0;

	i_mactype_hash = mod_hash_create_extended("mactype_hash",
	    MACTYPE_HASHSZ,
	    mod_hash_null_keydtor, mod_hash_null_valdtor,
	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);

	/*
	 * Allocate an id space to manage minor numbers. The range of the
	 * space will be from MAC_MAX_MINOR+1 to MAXMIN32 (the maximum legal
	 * minor number is MAXMIN, but id_t is an integer type and cannot
	 * represent MAXMIN).
	 */
	minor_ids = id_space_create("mac_minor_ids", MAC_MAX_MINOR+1, MAXMIN32);
	ASSERT(minor_ids != NULL);
	minor_count = 0;

	/* Let's default to 20 seconds */
	mac_logging_interval = 20;
	mac_flow_log_enable = B_FALSE;
	mac_link_log_enable = B_FALSE;
	mac_logging_timer = 0;
}

int
mac_fini(void)
{
	if (i_mac_impl_count > 0 || minor_count > 0)
		return (EBUSY);

	id_space_destroy(minor_ids);
	mac_flow_fini();

	mod_hash_destroy_hash(i_mac_impl_hash);
	rw_destroy(&i_mac_impl_lock);

	mac_client_fini();
	kmem_cache_destroy(mac_ring_cache);

	mod_hash_destroy_hash(i_mactype_hash);
	mac_soft_ring_finish();
	return (0);
}

void
mac_init_ops(struct dev_ops *ops, const char *name)
{
	dld_init_ops(ops, name);
}

void
mac_fini_ops(struct dev_ops *ops)
{
	dld_fini_ops(ops);
}

/*ARGSUSED*/
static int
i_mac_constructor(void *buf, void *arg, int kmflag)
{
	mac_impl_t	*mip = buf;

	bzero(buf, sizeof (mac_impl_t));

	mip->mi_linkstate = LINK_STATE_UNKNOWN;
	mip->mi_nclients = 0;

	mutex_init(&mip->mi_lock, NULL, MUTEX_DRIVER, NULL);
	rw_init(&mip->mi_rw_lock, NULL, RW_DRIVER, NULL);
	mutex_init(&mip->mi_notify_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&mip->mi_promisc_lock, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&mip->mi_ring_lock, NULL, MUTEX_DEFAULT, NULL);

	mip->mi_notify_cb_info.mcbi_lockp = &mip->mi_notify_lock;
	cv_init(&mip->mi_notify_cb_info.mcbi_cv, NULL, CV_DRIVER, NULL);
	mip->mi_promisc_cb_info.mcbi_lockp = &mip->mi_promisc_lock;
	cv_init(&mip->mi_promisc_cb_info.mcbi_cv, NULL, CV_DRIVER, NULL);
	return (0);
}

/*ARGSUSED*/
static void
i_mac_destructor(void *buf, void *arg)
{
	mac_impl_t	*mip = buf;
	mac_cb_info_t	*mcbi;

	ASSERT(mip->mi_ref == 0);
	ASSERT(mip->mi_active == 0);
	ASSERT(mip->mi_linkstate == LINK_STATE_UNKNOWN);
	ASSERT(mip->mi_devpromisc == 0);
	ASSERT(mip->mi_promisc == 0);
	ASSERT(mip->mi_ksp == NULL);
	ASSERT(mip->mi_kstat_count == 0);
	ASSERT(mip->mi_nclients == 0);
	ASSERT(mip->mi_nactiveclients == 0);
	ASSERT(mip->mi_single_active_client == NULL);
	ASSERT(mip->mi_state_flags == 0);
	ASSERT(mip->mi_factory_addr == NULL);
	ASSERT(mip->mi_factory_addr_num == 0);
	ASSERT(mip->mi_default_tx_ring == NULL);

	mcbi = &mip->mi_notify_cb_info;
	ASSERT(mcbi->mcbi_del_cnt == 0 && mcbi->mcbi_walker_cnt == 0);
	ASSERT(mip->mi_notify_bits == 0);
	ASSERT(mip->mi_notify_thread == NULL);
	ASSERT(mcbi->mcbi_lockp == &mip->mi_notify_lock);
	mcbi->mcbi_lockp = NULL;

	mcbi = &mip->mi_promisc_cb_info;
	ASSERT(mcbi->mcbi_del_cnt == 0 && mip->mi_promisc_list == NULL);
	ASSERT(mcbi->mcbi_lockp == &mip->mi_promisc_lock);
	mcbi->mcbi_lockp = NULL;

	ASSERT(mip->mi_bcast_ngrps == 0 && mip->mi_bcast_grp == NULL);
	ASSERT(mip->mi_perim_owner == NULL && mip->mi_perim_ocnt == 0);

	mutex_destroy(&mip->mi_lock);
	rw_destroy(&mip->mi_rw_lock);

	mutex_destroy(&mip->mi_promisc_lock);
	cv_destroy(&mip->mi_promisc_cb_info.mcbi_cv);
	mutex_destroy(&mip->mi_notify_lock);
	cv_destroy(&mip->mi_notify_cb_info.mcbi_cv);
	mutex_destroy(&mip->mi_ring_lock);
}

/* ARGSUSED */
static int
i_mac_ring_ctor(void *buf, void *arg, int kmflag)
{
	mac_ring_t *ring = (mac_ring_t *)buf;

	bzero(ring, sizeof (mac_ring_t));
	cv_init(&ring->mr_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&ring->mr_lock, NULL, MUTEX_DEFAULT, NULL);
	ring->mr_state = MR_FREE;
	return (0);
}

/* ARGSUSED */
static void
i_mac_ring_dtor(void *buf, void *arg)
{
	mac_ring_t *ring = (mac_ring_t *)buf;

	cv_destroy(&ring->mr_cv);
	mutex_destroy(&ring->mr_lock);
}

/*
 * Common functions to do mac callback addition and deletion. Currently this is
 * used by promisc callbacks and notify callbacks. List addition and deletion
 * need to take care of list walkers. List walkers in general can't hold list
 * locks and make upcall callbacks due to potential lock order and recursive
 * reentry issues. Instead list walkers increment the list walker count to mark
 * the presence of a walker thread. Addition can be carefully done to ensure
 * that the list walker always sees either the old list or the new list.
 * However the deletion can't be done while a walker is active; instead the
 * deleting thread simply marks the entry as logically deleted. The last walker
 * physically deletes and frees up the logically deleted entries when the walk
 * is complete.
 */
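
/*
 * A walker is expected to follow the pattern sketched below (illustrative
 * only; the counts are manipulated under mcbi_lockp):
 *
 *	mutex_enter(mcbi->mcbi_lockp);
 *	mcbi->mcbi_walker_cnt++;
 *	mutex_exit(mcbi->mcbi_lockp);
 *	... walk the list and make callbacks without holding the list lock ...
 *	mutex_enter(mcbi->mcbi_lockp);
 *	if (--mcbi->mcbi_walker_cnt == 0 && mcbi->mcbi_del_cnt != 0)
 *		mac_callback_free(mac_callback_walker_cleanup(mcbi, mcb_head));
 *	mutex_exit(mcbi->mcbi_lockp);
 */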
void
mac_callback_add(mac_cb_info_t *mcbi, mac_cb_t **mcb_head,
    mac_cb_t *mcb_elem)
{
	mac_cb_t	*p;
	mac_cb_t	**pp;

	/* Verify it is not already in the list */
	for (pp = mcb_head; (p = *pp) != NULL; pp = &p->mcb_nextp) {
		if (p == mcb_elem)
			break;
	}
	VERIFY(p == NULL);

	/*
	 * Add it to the head of the callback list. The membar ensures that
	 * the following list pointer manipulations reach global visibility
	 * in exactly the program order below.
	 */
	ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));

	mcb_elem->mcb_nextp = *mcb_head;
	membar_producer();
	*mcb_head = mcb_elem;
}

/*
 * Mark the entry as logically deleted. If there aren't any walkers, unlink
 * it from the list. In either case return the corresponding status.
 */
boolean_t
mac_callback_remove(mac_cb_info_t *mcbi, mac_cb_t **mcb_head,
    mac_cb_t *mcb_elem)
{
	mac_cb_t	*p;
	mac_cb_t	**pp;

	ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
	/*
	 * Search the callback list for the entry to be removed
	 */
	for (pp = mcb_head; (p = *pp) != NULL; pp = &p->mcb_nextp) {
		if (p == mcb_elem)
			break;
	}
	VERIFY(p != NULL);

	/*
	 * If there are walkers just mark it as deleted and the last walker
	 * will remove it from the list and free it.
	 */
	if (mcbi->mcbi_walker_cnt != 0) {
		p->mcb_flags |= MCB_CONDEMNED;
		mcbi->mcbi_del_cnt++;
		return (B_FALSE);
	}

	ASSERT(mcbi->mcbi_del_cnt == 0);
	*pp = p->mcb_nextp;
	p->mcb_nextp = NULL;
	return (B_TRUE);
}

/*
 * Wait for all pending callback removals to be completed
 */
void
mac_callback_remove_wait(mac_cb_info_t *mcbi)
{
	ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
	while (mcbi->mcbi_del_cnt != 0) {
		DTRACE_PROBE1(need_wait, mac_cb_info_t *, mcbi);
		cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp);
	}
}

/*
 * The last mac callback walker does the cleanup. Walk the list and unlink
 * all the logically deleted entries and construct a temporary list of
 * removed entries. Return the list of removed entries to the caller.
 */
mac_cb_t *
mac_callback_walker_cleanup(mac_cb_info_t *mcbi, mac_cb_t **mcb_head)
{
	mac_cb_t	*p;
	mac_cb_t	**pp;
	mac_cb_t	*rmlist = NULL;		/* List of removed elements */
	int	cnt = 0;

	ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
	ASSERT(mcbi->mcbi_del_cnt != 0 && mcbi->mcbi_walker_cnt == 0);

	pp = mcb_head;
	while (*pp != NULL) {
		if ((*pp)->mcb_flags & MCB_CONDEMNED) {
			p = *pp;
			*pp = p->mcb_nextp;
			p->mcb_nextp = rmlist;
			rmlist = p;
			cnt++;
			continue;
		}
		pp = &(*pp)->mcb_nextp;
	}

	ASSERT(mcbi->mcbi_del_cnt == cnt);
	mcbi->mcbi_del_cnt = 0;
	return (rmlist);
}

boolean_t
mac_callback_lookup(mac_cb_t **mcb_headp, mac_cb_t *mcb_elem)
{
	mac_cb_t	*mcb;

	/* Search for the entry in the list */
	for (mcb = *mcb_headp; mcb != NULL; mcb = mcb->mcb_nextp) {
		if (mcb == mcb_elem)
			return (B_TRUE);
	}

	return (B_FALSE);
}

boolean_t
mac_callback_find(mac_cb_info_t *mcbi, mac_cb_t **mcb_headp, mac_cb_t *mcb_elem)
{
	boolean_t	found;

	mutex_enter(mcbi->mcbi_lockp);
	found = mac_callback_lookup(mcb_headp, mcb_elem);
	mutex_exit(mcbi->mcbi_lockp);

	return (found);
}

/* Free the list of removed callbacks */
void
mac_callback_free(mac_cb_t *rmlist)
{
	mac_cb_t	*mcb;
	mac_cb_t	*mcb_next;

	for (mcb = rmlist; mcb != NULL; mcb = mcb_next) {
		mcb_next = mcb->mcb_nextp;
		kmem_free(mcb->mcb_objp, mcb->mcb_objsize);
	}
}

/*
 * The promisc callbacks are in 2 lists, one off the 'mip' and another off the
 * 'mcip' threaded by mpi_mi_link and mpi_mci_link respectively. However there
 * is only a single shared total walker count, and an entry can't be physically
 * unlinked if a walker is active on either list. The last walker does this
 * cleanup of logically deleted entries.
 */
void
i_mac_promisc_walker_cleanup(mac_impl_t *mip)
{
	mac_cb_t	*rmlist;
	mac_cb_t	*mcb;
	mac_cb_t	*mcb_next;
	mac_promisc_impl_t	*mpip;

	/*
	 * Construct a temporary list of deleted callbacks by walking the
	 * mi_promisc_list. Then for each entry in the temporary list,
	 * remove it from the mci_promisc_list and free the entry.
	 */
	rmlist = mac_callback_walker_cleanup(&mip->mi_promisc_cb_info,
	    &mip->mi_promisc_list);

	for (mcb = rmlist; mcb != NULL; mcb = mcb_next) {
		mcb_next = mcb->mcb_nextp;
		mpip = (mac_promisc_impl_t *)mcb->mcb_objp;
		VERIFY(mac_callback_remove(&mip->mi_promisc_cb_info,
		    &mpip->mpi_mcip->mci_promisc_list, &mpip->mpi_mci_link));
		mcb->mcb_flags = 0;
		mcb->mcb_nextp = NULL;
		kmem_cache_free(mac_promisc_impl_cache, mpip);
	}
}

void
i_mac_notify(mac_impl_t *mip, mac_notify_type_t type)
{
	mac_cb_info_t	*mcbi;

	/*
	 * Signal the notify thread even after mi_ref has become zero and
	 * mi_disabled is set. The synchronization with the notify thread
	 * happens in mac_unregister and that implies the driver must make
	 * sure it is single-threaded (with respect to mac calls) and that
	 * all pending mac calls have returned before it calls mac_unregister.
	 */
	rw_enter(&i_mac_impl_lock, RW_READER);
	if (mip->mi_state_flags & MIS_DISABLED)
		goto exit;

	/*
	 * Guard against incorrect notifications.  (Running a newer
	 * mac client against an older implementation?)
	 */
	if (type >= MAC_NNOTE)
		goto exit;

	mcbi = &mip->mi_notify_cb_info;
	mutex_enter(mcbi->mcbi_lockp);
	mip->mi_notify_bits |= (1 << type);
	cv_broadcast(&mcbi->mcbi_cv);
	mutex_exit(mcbi->mcbi_lockp);

exit:
	rw_exit(&i_mac_impl_lock);
}

/*
 * Mac serialization primitives. Please see the block comment at the
 * top of the file.
 */
void
i_mac_perim_enter(mac_impl_t *mip)
{
	mac_client_impl_t	*mcip;

	if (mip->mi_state_flags & MIS_IS_VNIC) {
		/*
		 * This is a VNIC. Use the lower mac since that is what
		 * we want to serialize on.
		 */
		mcip = mac_vnic_lower(mip);
		mip = mcip->mci_mip;
	}

	mutex_enter(&mip->mi_perim_lock);
	if (mip->mi_perim_owner == curthread) {
		mip->mi_perim_ocnt++;
		mutex_exit(&mip->mi_perim_lock);
		return;
	}

	while (mip->mi_perim_owner != NULL)
		cv_wait(&mip->mi_perim_cv, &mip->mi_perim_lock);

	mip->mi_perim_owner = curthread;
	ASSERT(mip->mi_perim_ocnt == 0);
	mip->mi_perim_ocnt++;
#ifdef DEBUG
	mip->mi_perim_stack_depth = getpcstack(mip->mi_perim_stack,
	    MAC_PERIM_STACK_DEPTH);
#endif
	mutex_exit(&mip->mi_perim_lock);
}

int
i_mac_perim_enter_nowait(mac_impl_t *mip)
{
	/*
	 * The vnic is a special case, since the serialization is done based
	 * on the lower mac. If the lower mac is busy, it does not imply the
	 * vnic can't be unregistered. But in the case of other drivers,
	 * a busy perimeter or open mac handles implies that the mac is busy
	 * and can't be unregistered.
	 */
	if (mip->mi_state_flags & MIS_IS_VNIC) {
		i_mac_perim_enter(mip);
		return (0);
	}

	mutex_enter(&mip->mi_perim_lock);
	if (mip->mi_perim_owner != NULL) {
		mutex_exit(&mip->mi_perim_lock);
		return (EBUSY);
	}
	ASSERT(mip->mi_perim_ocnt == 0);
	mip->mi_perim_owner = curthread;
	mip->mi_perim_ocnt++;
	mutex_exit(&mip->mi_perim_lock);

	return (0);
}

void
i_mac_perim_exit(mac_impl_t *mip)
{
	mac_client_impl_t *mcip;

	if (mip->mi_state_flags & MIS_IS_VNIC) {
		/*
		 * This is a VNIC. Use the lower mac since that is what
		 * we want to serialize on.
		 */
		mcip = mac_vnic_lower(mip);
		mip = mcip->mci_mip;
	}

	ASSERT(mip->mi_perim_owner == curthread && mip->mi_perim_ocnt != 0);

	mutex_enter(&mip->mi_perim_lock);
	if (--mip->mi_perim_ocnt == 0) {
		mip->mi_perim_owner = NULL;
		cv_signal(&mip->mi_perim_cv);
	}
	mutex_exit(&mip->mi_perim_lock);
}

/*
 * Returns whether the current thread holds the mac perimeter. Used in making
 * assertions.
 */
boolean_t
mac_perim_held(mac_handle_t mh)
{
	mac_impl_t	*mip = (mac_impl_t *)mh;
	mac_client_impl_t *mcip;

	if (mip->mi_state_flags & MIS_IS_VNIC) {
		/*
		 * This is a VNIC. Use the lower mac since that is what
		 * we want to serialize on.
		 */
		mcip = mac_vnic_lower(mip);
		mip = mcip->mci_mip;
	}
	return (mip->mi_perim_owner == curthread);
}

/*
 * MAC client interfaces to enter the mac perimeter of a mac end point, given
 * its mac handle, macname or linkid.
 */
void
mac_perim_enter_by_mh(mac_handle_t mh, mac_perim_handle_t *mphp)
{
	mac_impl_t	*mip = (mac_impl_t *)mh;

	i_mac_perim_enter(mip);
	/*
	 * The mac_perim_handle_t returned encodes the 'mip' and whether a
	 * mac_open has been done internally while entering the perimeter.
	 * This information is used in mac_perim_exit.
	 */
	MAC_ENCODE_MPH(*mphp, mip, 0);
}

int
mac_perim_enter_by_macname(const char *name, mac_perim_handle_t *mphp)
{
	int	err;
	mac_handle_t	mh;

	if ((err = mac_open(name, &mh)) != 0)
		return (err);

	mac_perim_enter_by_mh(mh, mphp);
	MAC_ENCODE_MPH(*mphp, mh, 1);
	return (0);
}

int
mac_perim_enter_by_linkid(datalink_id_t linkid, mac_perim_handle_t *mphp)
{
	int	err;
	mac_handle_t	mh;

	if ((err = mac_open_by_linkid(linkid, &mh)) != 0)
		return (err);

	mac_perim_enter_by_mh(mh, mphp);
	MAC_ENCODE_MPH(*mphp, mh, 1);
	return (0);
}

void
mac_perim_exit(mac_perim_handle_t mph)
{
	mac_impl_t	*mip;
	boolean_t	need_close;

	MAC_DECODE_MPH(mph, mip, need_close);
	i_mac_perim_exit(mip);
	if (need_close)
		mac_close((mac_handle_t)mip);
}

int
mac_hold(const char *macname, mac_impl_t **pmip)
{
	mac_impl_t	*mip;
	int		err;

	/*
	 * Check the device name length to make sure it won't overflow our
	 * buffer.
	 */
	if (strlen(macname) >= MAXNAMELEN)
		return (EINVAL);

	/*
	 * Look up its entry in the global hash table.
	 */
	rw_enter(&i_mac_impl_lock, RW_WRITER);
	err = mod_hash_find(i_mac_impl_hash, (mod_hash_key_t)macname,
	    (mod_hash_val_t *)&mip);

	if (err != 0) {
		rw_exit(&i_mac_impl_lock);
		return (ENOENT);
	}

	if (mip->mi_state_flags & MIS_DISABLED) {
		rw_exit(&i_mac_impl_lock);
		return (ENOENT);
	}

	if (mip->mi_state_flags & MIS_EXCLUSIVE_HELD) {
		rw_exit(&i_mac_impl_lock);
		return (EBUSY);
	}

	mip->mi_ref++;
	rw_exit(&i_mac_impl_lock);

	*pmip = mip;
	return (0);
}

void
mac_rele(mac_impl_t *mip)
{
	rw_enter(&i_mac_impl_lock, RW_WRITER);
	ASSERT(mip->mi_ref != 0);
	if (--mip->mi_ref == 0) {
		ASSERT(mip->mi_nactiveclients == 0 &&
		    !(mip->mi_state_flags & MIS_EXCLUSIVE));
	}
	rw_exit(&i_mac_impl_lock);
}

/*
 * This function is called only by mac_client_open.
 */
int
mac_start(mac_impl_t *mip)
{
	int		err = 0;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
	ASSERT(mip->mi_start != NULL);

	/*
	 * Check whether the device is already started.
	 */
	if (mip->mi_active++ == 0) {
		mac_ring_t *ring = NULL;

		/*
		 * Start the device.
		 */
		err = mip->mi_start(mip->mi_driver);
		if (err != 0) {
			mip->mi_active--;
			return (err);
		}

		/*
		 * Start the default tx ring.
		 */
		if (mip->mi_default_tx_ring != NULL) {
			ring = (mac_ring_t *)mip->mi_default_tx_ring;
			err = mac_start_ring(ring);
			if (err != 0) {
				mip->mi_active--;
				return (err);
			}
			ring->mr_state = MR_INUSE;
		}

		if (mip->mi_rx_groups != NULL) {
			/*
			 * Start the default ring, since it will be needed
			 * to receive broadcast and multicast traffic for
			 * both primary and non-primary MAC clients.
			 */
			mac_group_t *grp = &mip->mi_rx_groups[0];

			ASSERT(grp->mrg_state == MAC_GROUP_STATE_REGISTERED);
			err = mac_start_group_and_rings(grp);
			if (err != 0) {
				mip->mi_active--;
				if (ring != NULL) {
					mac_stop_ring(ring);
					ring->mr_state = MR_FREE;
				}
				return (err);
			}
			mac_set_rx_group_state(grp, MAC_GROUP_STATE_SHARED);
		}
	}

	return (err);
}

/*
 * This function is called only by mac_client_close.
 */
void
mac_stop(mac_impl_t *mip)
{
	ASSERT(mip->mi_stop != NULL);
	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));

	/*
	 * Check whether the device is still needed.
	 */
	ASSERT(mip->mi_active != 0);
	if (--mip->mi_active == 0) {
		if (mip->mi_rx_groups != NULL) {
			/*
			 * There should be no more active clients since the
			 * MAC is being stopped. Stop the default RX group
			 * and transition it back to registered state.
			 */
			mac_group_t *grp = &mip->mi_rx_groups[0];

			/*
			 * When clients are torn down, the groups
			 * are released via mac_release_rx_group which
			 * knows that the default group is always in
			 * started mode since broadcast uses it. So
			 * we can assert that there are no clients
			 * (since mac_bcast_add doesn't register itself
			 * as a client) and the group is in SHARED state.
			 */
			ASSERT(grp->mrg_state == MAC_GROUP_STATE_SHARED);
			ASSERT(MAC_RX_GROUP_NO_CLIENT(grp) &&
			    mip->mi_nactiveclients == 0);
			mac_stop_group_and_rings(grp);
			mac_set_rx_group_state(grp, MAC_GROUP_STATE_REGISTERED);
		}

		if (mip->mi_default_tx_ring != NULL) {
			mac_ring_t *ring;

			ring = (mac_ring_t *)mip->mi_default_tx_ring;
			mac_stop_ring(ring);
			ring->mr_state = MR_FREE;
		}

		/*
		 * Stop the device.
		 */
		mip->mi_stop(mip->mi_driver);
	}
}

int
i_mac_promisc_set(mac_impl_t *mip, boolean_t on, mac_promisc_type_t ptype)
{
	int		err = 0;

	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
	ASSERT(mip->mi_setpromisc != NULL);
	ASSERT(ptype == MAC_DEVPROMISC || ptype == MAC_PROMISC);

	/*
	 * Determine whether we should enable or disable promiscuous mode.
	 * For details on the distinction between "device promiscuous mode"
	 * and "MAC promiscuous mode", see PSARC/2005/289.
	 */
	if (on) {
		/*
		 * Enable promiscuous mode on the device if not yet enabled.
		 */
		if (mip->mi_devpromisc++ == 0) {
			err = mip->mi_setpromisc(mip->mi_driver, B_TRUE);
			if (err != 0) {
				mip->mi_devpromisc--;
				return (err);
			}
			i_mac_notify(mip, MAC_NOTE_DEVPROMISC);
		}

		/*
		 * Enable promiscuous mode on the MAC if not yet enabled.
		 */
		if (ptype == MAC_PROMISC && mip->mi_promisc++ == 0)
			i_mac_notify(mip, MAC_NOTE_PROMISC);
	} else {
		if (mip->mi_devpromisc == 0)
			return (EPROTO);

		/*
		 * Disable promiscuous mode on the device if this is the last
		 * enabling.
		 */
		if (--mip->mi_devpromisc == 0) {
			err = mip->mi_setpromisc(mip->mi_driver, B_FALSE);
			if (err != 0) {
				mip->mi_devpromisc++;
				return (err);
			}
			i_mac_notify(mip, MAC_NOTE_DEVPROMISC);
		}

		/*
		 * Disable promiscuous mode on the MAC if this is the last
		 * enabling.
		 */
		if (ptype == MAC_PROMISC && --mip->mi_promisc == 0)
			i_mac_notify(mip, MAC_NOTE_PROMISC);
	}

	return (0);
}

int
mac_promisc_set(mac_handle_t mh, boolean_t on, mac_promisc_type_t ptype)
{
	mac_impl_t	*mip = (mac_impl_t *)mh;
	int		rv;

	i_mac_perim_enter(mip);
	rv = i_mac_promisc_set(mip, on, ptype);
	i_mac_perim_exit(mip);

	return (rv);
}

/*
 * The promiscuity state can change at any time. If the caller needs to take
 * actions that are atomic with the promiscuity state, then the caller needs
 * to bracket the entire sequence with mac_perim_enter/exit.
 */
boolean_t
mac_promisc_get(mac_handle_t mh, mac_promisc_type_t ptype)
{
	mac_impl_t		*mip = (mac_impl_t *)mh;

	ASSERT(ptype == MAC_DEVPROMISC || ptype == MAC_PROMISC);

	/*
	 * Return the current promiscuity.
	 */
	if (ptype == MAC_DEVPROMISC)
		return (mip->mi_devpromisc != 0);
	else
		return (mip->mi_promisc != 0);
}
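
/*
 * For example (illustrative only), a caller that must act atomically on the
 * promiscuity state would bracket the check and the action:
 *
 *	mac_perim_handle_t mph;
 *
 *	mac_perim_enter_by_mh(mh, &mph);
 *	if (!mac_promisc_get(mh, MAC_DEVPROMISC))
 *		(void) mac_promisc_set(mh, B_TRUE, MAC_DEVPROMISC);
 *	mac_perim_exit(mph);
 */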

/*
 * Invoked at MAC instance attach time to initialize the list
 * of factory MAC addresses supported by a MAC instance. This function
 * builds a local cache in the mac_impl_t for the MAC addresses
 * supported by the underlying hardware. The MAC clients themselves
 * use the mac_addr_factory*() functions to query and reserve
 * factory MAC addresses.
 */
void
mac_addr_factory_init(mac_impl_t *mip)
{
	mac_capab_multifactaddr_t capab;
	uint8_t *addr;
	int i;

	/*
	 * First round to see how many factory MAC addresses are available.
	 */
	bzero(&capab, sizeof (capab));
	if (!i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_MULTIFACTADDR,
	    &capab) || (capab.mcm_naddr == 0)) {
		/*
		 * The MAC instance doesn't support multiple factory
		 * MAC addresses, we're done here.
		 */
		return;
	}

	/*
	 * Allocate the space and get all the factory addresses.
	 */
	addr = kmem_alloc(capab.mcm_naddr * MAXMACADDRLEN, KM_SLEEP);
	capab.mcm_getaddr(mip->mi_driver, capab.mcm_naddr, addr);

	mip->mi_factory_addr_num = capab.mcm_naddr;
	mip->mi_factory_addr = kmem_zalloc(mip->mi_factory_addr_num *
	    sizeof (mac_factory_addr_t), KM_SLEEP);

	for (i = 0; i < capab.mcm_naddr; i++) {
		bcopy(addr + i * MAXMACADDRLEN,
		    mip->mi_factory_addr[i].mfa_addr,
		    mip->mi_type->mt_addr_length);
		mip->mi_factory_addr[i].mfa_in_use = B_FALSE;
	}

	kmem_free(addr, capab.mcm_naddr * MAXMACADDRLEN);
}

void
mac_addr_factory_fini(mac_impl_t *mip)
{
	if (mip->mi_factory_addr == NULL) {
		ASSERT(mip->mi_factory_addr_num == 0);
		return;
	}

	kmem_free(mip->mi_factory_addr, mip->mi_factory_addr_num *
	    sizeof (mac_factory_addr_t));

	mip->mi_factory_addr = NULL;
	mip->mi_factory_addr_num = 0;
}

/*
 * Reserve a factory MAC address. If *slot is set to -1, the function
 * attempts to reserve any of the available factory MAC addresses and
 * returns the reserved slot id. If no slots are available, the function
 * returns ENOSPC. If *slot is not set to -1, the function reserves
 * the specified slot if it is available, or returns EBUSY if the slot
 * is already used. Returns ENOTSUP if the underlying MAC does not
 * support multiple factory addresses. If the slot number is not -1 but
 * is invalid, returns EINVAL.
 */
int
mac_addr_factory_reserve(mac_client_handle_t mch, int *slot)
{
	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
	mac_impl_t *mip = mcip->mci_mip;
	int i, ret = 0;

	i_mac_perim_enter(mip);
	/*
	 * Protect against concurrent readers that may need a self-consistent
	 * view of the factory addresses.
	 */
	rw_enter(&mip->mi_rw_lock, RW_WRITER);

	if (mip->mi_factory_addr_num == 0) {
		ret = ENOTSUP;
		goto bail;
	}

	if (*slot != -1) {
		/* check the specified slot */
		if (*slot < 1 || *slot > mip->mi_factory_addr_num) {
			ret = EINVAL;
			goto bail;
		}
		if (mip->mi_factory_addr[*slot-1].mfa_in_use) {
			ret = EBUSY;
			goto bail;
		}
	} else {
		/* pick the next available slot */
		for (i = 0; i < mip->mi_factory_addr_num; i++) {
			if (!mip->mi_factory_addr[i].mfa_in_use)
				break;
		}

		if (i == mip->mi_factory_addr_num) {
			ret = ENOSPC;
			goto bail;
		}
		*slot = i+1;
	}

	mip->mi_factory_addr[*slot-1].mfa_in_use = B_TRUE;
	mip->mi_factory_addr[*slot-1].mfa_client = mcip;

bail:
	rw_exit(&mip->mi_rw_lock);
	i_mac_perim_exit(mip);
	return (ret);
}
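
/*
 * For example (illustrative only), a client reserving any available factory
 * address and releasing it when done:
 *
 *	int slot = -1;
 *
 *	if (mac_addr_factory_reserve(mch, &slot) == 0) {
 *		... use the address in the returned slot ...
 *		mac_addr_factory_release(mch, slot);
 *	}
 */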

/*
 * Release the specified factory MAC address slot.
 */
void
mac_addr_factory_release(mac_client_handle_t mch, uint_t slot)
{
	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
	mac_impl_t *mip = mcip->mci_mip;

	i_mac_perim_enter(mip);
	/*
	 * Protect against concurrent readers that may need a self-consistent
	 * view of the factory addresses.
	 */
	rw_enter(&mip->mi_rw_lock, RW_WRITER);

	ASSERT(slot > 0 && slot <= mip->mi_factory_addr_num);
	ASSERT(mip->mi_factory_addr[slot-1].mfa_in_use);

	mip->mi_factory_addr[slot-1].mfa_in_use = B_FALSE;

	rw_exit(&mip->mi_rw_lock);
	i_mac_perim_exit(mip);
}

/*
 * Stores in mac_addr the value of the specified MAC address slot. The
 * mac_addr buffer must be at least MAXMACADDRLEN bytes, and if client_name
 * is non-NULL the caller must provide a string buffer of at least
 * MAXNAMELEN bytes for it.
 */
void
mac_addr_factory_value(mac_handle_t mh, int slot, uchar_t *mac_addr,
    uint_t *addr_len, char *client_name, boolean_t *in_use_arg)
{
	mac_impl_t *mip = (mac_impl_t *)mh;
	boolean_t in_use;

	ASSERT(slot > 0 && slot <= mip->mi_factory_addr_num);

	/*
	 * Readers need to hold mi_rw_lock. Writers need to hold the mac
	 * perimeter and mi_rw_lock.
	 */
	rw_enter(&mip->mi_rw_lock, RW_READER);
	bcopy(mip->mi_factory_addr[slot-1].mfa_addr, mac_addr, MAXMACADDRLEN);
	*addr_len = mip->mi_type->mt_addr_length;
	in_use = mip->mi_factory_addr[slot-1].mfa_in_use;
	if (in_use && client_name != NULL) {
		bcopy(mip->mi_factory_addr[slot-1].mfa_client->mci_name,
		    client_name, MAXNAMELEN);
	}
	if (in_use_arg != NULL)
		*in_use_arg = in_use;
	rw_exit(&mip->mi_rw_lock);
}

/*
 * Returns the number of factory MAC addresses (in addition to the
 * primary MAC address), or 0 if the underlying MAC doesn't support
 * that feature.
 */
uint_t
mac_addr_factory_num(mac_handle_t mh)
{
	mac_impl_t *mip = (mac_impl_t *)mh;

	return (mip->mi_factory_addr_num);
}


void
mac_rx_group_unmark(mac_group_t *grp, uint_t flag)
{
	mac_ring_t	*ring;

	for (ring = grp->mrg_rings; ring != NULL; ring = ring->mr_next)
		ring->mr_flag &= ~flag;
}

/*
 * The following mac_hwrings_xxx() functions are private mac client functions
 * used by the aggr driver to access and control the underlying HW Rx group
 * and rings. In this case, the aggr driver has exclusive control of the
 * underlying HW Rx group/rings, and it calls the following functions to
 * start/stop the HW Rx rings, disable/enable polling, add/remove mac
 * addresses, or set up the Rx callback.
 */
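
/*
 * For example (illustrative only), an exclusive client such as aggr might
 * take over a group's rings roughly as follows:
 *
 *	mac_group_handle_t gh;
 *	mac_ring_handle_t hwrh[MAX_RINGS_PER_GROUP];
 *	int i, cnt;
 *
 *	cnt = mac_hwrings_get(mch, &gh, hwrh);
 *	for (i = 0; i < cnt; i++)
 *		mac_hwring_setup(hwrh[i], prh);
 */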
/* ARGSUSED */
static void
mac_hwrings_rx_process(void *arg, mac_resource_handle_t srs,
    mblk_t *mp_chain, boolean_t loopback)
{
	mac_soft_ring_set_t	*mac_srs = (mac_soft_ring_set_t *)srs;
	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;
	mac_direct_rx_t		proc;
	void			*arg1;
	mac_resource_handle_t	arg2;

	proc = srs_rx->sr_func;
	arg1 = srs_rx->sr_arg1;
	arg2 = mac_srs->srs_mrh;

	proc(arg1, arg2, mp_chain, NULL);
}

/*
 * This function is called to get the list of HW rings that are reserved by
 * an exclusive mac client.
 *
 * Return value: the number of HW rings.
 */
int
mac_hwrings_get(mac_client_handle_t mch, mac_group_handle_t *hwgh,
    mac_ring_handle_t *hwrh)
{
	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
	flow_entry_t		*flent = mcip->mci_flent;
	mac_group_t		*grp = flent->fe_rx_ring_group;
	mac_ring_t		*ring;
	int			cnt = 0;

	/*
	 * If the mac client did not reserve any RX group, return directly.
	 * This is probably because the underlying MAC does not support
	 * any RX groups.
	 */
	*hwgh = NULL;
	if (grp == NULL)
		return (0);

	/*
	 * This RX group must be reserved by this mac client.
	 */
	ASSERT((grp->mrg_state == MAC_GROUP_STATE_RESERVED) &&
	    (mch == (mac_client_handle_t)(MAC_RX_GROUP_ONLY_CLIENT(grp))));

	for (ring = grp->mrg_rings; ring != NULL; ring = ring->mr_next) {
		ASSERT(cnt < MAX_RINGS_PER_GROUP);
		hwrh[cnt++] = (mac_ring_handle_t)ring;
	}
	*hwgh = (mac_group_handle_t)grp;
	return (cnt);
}

/*
 * Set up the RX callback of the mac client which exclusively controls the
 * HW ring.
 */
void
mac_hwring_setup(mac_ring_handle_t hwrh, mac_resource_handle_t prh)
{
	mac_ring_t		*hw_ring = (mac_ring_t *)hwrh;
	mac_soft_ring_set_t	*mac_srs = hw_ring->mr_srs;

	mac_srs->srs_mrh = prh;
	mac_srs->srs_rx.sr_lower_proc = mac_hwrings_rx_process;
}

void
mac_hwring_teardown(mac_ring_handle_t hwrh)
{
	mac_ring_t		*hw_ring = (mac_ring_t *)hwrh;
	mac_soft_ring_set_t	*mac_srs = hw_ring->mr_srs;

	mac_srs->srs_rx.sr_lower_proc = mac_rx_srs_process;
	mac_srs->srs_mrh = NULL;
}

int
mac_hwring_disable_intr(mac_ring_handle_t rh)
{
	mac_ring_t *rr_ring = (mac_ring_t *)rh;
	mac_intr_t *intr = &rr_ring->mr_info.mri_intr;

	return (intr->mi_disable(intr->mi_handle));
}

int
mac_hwring_enable_intr(mac_ring_handle_t rh)
{
	mac_ring_t *rr_ring = (mac_ring_t *)rh;
	mac_intr_t *intr = &rr_ring->mr_info.mri_intr;

	return (intr->mi_enable(intr->mi_handle));
}

int
mac_hwring_start(mac_ring_handle_t rh)
{
	mac_ring_t *rr_ring = (mac_ring_t *)rh;

	MAC_RING_UNMARK(rr_ring, MR_QUIESCE);
	return (0);
}

void
mac_hwring_stop(mac_ring_handle_t rh)
{
	mac_ring_t *rr_ring = (mac_ring_t *)rh;

	mac_rx_ring_quiesce(rr_ring, MR_QUIESCE);
}

mblk_t *
mac_hwring_poll(mac_ring_handle_t rh, int bytes_to_pickup)
{
	mac_ring_t *rr_ring = (mac_ring_t *)rh;
	mac_ring_info_t *info = &rr_ring->mr_info;

	return (info->mri_poll(info->mri_driver, bytes_to_pickup));
}

int
mac_hwgroup_addmac(mac_group_handle_t gh, const uint8_t *addr)
{
	mac_group_t *group = (mac_group_t *)gh;

	return (mac_group_addmac(group, addr));
}

int
mac_hwgroup_remmac(mac_group_handle_t gh, const uint8_t *addr)
{
	mac_group_t *group = (mac_group_t *)gh;

	return (mac_group_remmac(group, addr));
}

/*
 * Set the RX group to be shared/reserved. Note that the group must be
 * started/stopped outside of this function.
 */
void
mac_set_rx_group_state(mac_group_t *grp, mac_group_state_t state)
{
	/*
	 * If there is no change in the group state, just return.
	 */
	if (grp->mrg_state == state)
		return;

	switch (state) {
	case MAC_GROUP_STATE_RESERVED:
		/*
		 * Successfully reserved the group.
		 *
		 * Given that there is an exclusive client controlling this
		 * group, we enable the group level polling when available,
		 * so that SRSs get to turn on/off individual rings they're
		 * assigned to.
		 */
		ASSERT(MAC_PERIM_HELD(grp->mrg_mh));

		if (GROUP_INTR_DISABLE_FUNC(grp) != NULL)
			GROUP_INTR_DISABLE_FUNC(grp)(GROUP_INTR_HANDLE(grp));

		break;

	case MAC_GROUP_STATE_SHARED:
		/*
		 * Set all rings of this group to software classified.
		 * If the group has an overriding interrupt, then re-enable it.
		 */
		ASSERT(MAC_PERIM_HELD(grp->mrg_mh));

		if (GROUP_INTR_ENABLE_FUNC(grp) != NULL)
			GROUP_INTR_ENABLE_FUNC(grp)(GROUP_INTR_HANDLE(grp));

		/* The group is not available for reservations any more */
		break;

	case MAC_GROUP_STATE_REGISTERED:
		/* Also callable from mac_register, perim is not held */
		break;

	default:
		ASSERT(B_FALSE);
		break;
	}

	grp->mrg_state = state;
}

/*
 * Quiesce future hardware classified packets for the specified Rx ring
 */
static void
mac_rx_ring_quiesce(mac_ring_t *rx_ring, uint_t ring_flag)
{
	ASSERT(rx_ring->mr_classify_type == MAC_HW_CLASSIFIER);
	ASSERT(ring_flag == MR_CONDEMNED || ring_flag == MR_QUIESCE);

	mutex_enter(&rx_ring->mr_lock);
	rx_ring->mr_flag |= ring_flag;
	while (rx_ring->mr_refcnt != 0)
		cv_wait(&rx_ring->mr_cv, &rx_ring->mr_lock);
	mutex_exit(&rx_ring->mr_lock);
}

/*
 * Please see mac_tx for details about the per cpu locking scheme
 */
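
/*
 * A rough sketch of the sending side of that scheme (see mac_tx for the
 * authoritative version): a data thread grabs only its own per cpu lock and
 * bumps the per cpu refcnt, so mac_tx_client_block, which takes all the per
 * cpu locks, can reliably observe the summed refcnt. mac_tx_percpu_cnt is of
 * the form 2**N - 1, so it doubles as the index mask:
 *
 *	i = CPU->cpu_seqid & mac_tx_percpu_cnt;
 *	mutex_enter(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
 *	if (mcip->mci_tx_flag & MCI_TX_QUIESCE)
 *		bail out, the client is being quiesced
 *	mcip->mci_tx_pcpu[i].pcpu_tx_refcnt++;
 *	mutex_exit(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
 */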
1652 static void
1653 mac_tx_lock_all(mac_client_impl_t *mcip)
1654 {
1655 	int	i;
1656 
1657 	for (i = 0; i <= mac_tx_percpu_cnt; i++)
1658 		mutex_enter(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
1659 }
1660 
1661 static void
1662 mac_tx_unlock_all(mac_client_impl_t *mcip)
1663 {
1664 	int	i;
1665 
1666 	for (i = mac_tx_percpu_cnt; i >= 0; i--)
1667 		mutex_exit(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
1668 }
1669 
1670 static void
1671 mac_tx_unlock_allbutzero(mac_client_impl_t *mcip)
1672 {
1673 	int	i;
1674 
1675 	for (i = mac_tx_percpu_cnt; i > 0; i--)
1676 		mutex_exit(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
1677 }
1678 
1679 static int
1680 mac_tx_sum_refcnt(mac_client_impl_t *mcip)
1681 {
1682 	int	i;
1683 	int	refcnt = 0;
1684 
1685 	for (i = 0; i <= mac_tx_percpu_cnt; i++)
1686 		refcnt += mcip->mci_tx_pcpu[i].pcpu_tx_refcnt;
1687 
1688 	return (refcnt);
1689 }
1690 
1691 /*
1692  * Stop future Tx packets coming down from the client in preparation for
1693  * quiescing the Tx side. This is needed for dynamic reclaim and reassignment
1694  * of rings between clients
1695  */
1696 void
1697 mac_tx_client_block(mac_client_impl_t *mcip)
1698 {
1699 	mac_tx_lock_all(mcip);
1700 	mcip->mci_tx_flag |= MCI_TX_QUIESCE;
1701 	while (mac_tx_sum_refcnt(mcip) != 0) {
1702 		mac_tx_unlock_allbutzero(mcip);
1703 		cv_wait(&mcip->mci_tx_cv, &mcip->mci_tx_pcpu[0].pcpu_tx_lock);
1704 		mutex_exit(&mcip->mci_tx_pcpu[0].pcpu_tx_lock);
1705 		mac_tx_lock_all(mcip);
1706 	}
1707 	mac_tx_unlock_all(mcip);
1708 }
1709 
1710 void
1711 mac_tx_client_unblock(mac_client_impl_t *mcip)
1712 {
1713 	mac_tx_lock_all(mcip);
1714 	mcip->mci_tx_flag &= ~MCI_TX_QUIESCE;
1715 	mac_tx_unlock_all(mcip);
1716 	/*
1717 	 * We may fail to disable flow control for the last MAC_NOTE_TX
1718 	 * notification because the MAC client is quiesced. Send the
1719 	 * notification again.
1720 	 */
1721 	i_mac_notify(mcip->mci_mip, MAC_NOTE_TX);
1722 }
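
/*
 * Illustrative sketch (not a framework entry point; the details of the
 * per-CPU index selection are an assumption for illustration): a data
 * thread transmitting on behalf of this client bumps one per-CPU reference
 * count around the send, which is what mac_tx_client_block() above waits
 * to drain:
 *
 *	mutex_enter(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
 *	if (mcip->mci_tx_flag & MCI_TX_QUIESCE) {
 *		mutex_exit(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
 *		return;		client is blocked, bail
 *	}
 *	mcip->mci_tx_pcpu[i].pcpu_tx_refcnt++;
 *	mutex_exit(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
 *	... transmit ...
 *	decrement pcpu_tx_refcnt under the same lock, and cv_signal
 *	mci_tx_cv when it drops to zero while MCI_TX_QUIESCE is set
 */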
1723 
1724 /*
1725  * Wait for an SRS to quiesce. The SRS worker will signal us when the
1726  * quiesce is done.
1727  */
1728 static void
1729 mac_srs_quiesce_wait(mac_soft_ring_set_t *srs, uint_t srs_flag)
1730 {
1731 	mutex_enter(&srs->srs_lock);
1732 	while (!(srs->srs_state & srs_flag))
1733 		cv_wait(&srs->srs_quiesce_done_cv, &srs->srs_lock);
1734 	mutex_exit(&srs->srs_lock);
1735 }
1736 
1737 /*
1738  * Quiescing an Rx SRS is achieved by the following sequence. The protocol
1739  * works bottom up by cutting off packet flow from the bottommost point in the
1740  * mac, then the SRS, and then the soft rings. There are 2 use cases of this
 * mechanism. One is a temporary quiesce of the SRS, such as while changing
1742  * the Rx callbacks. Another use case is Rx SRS teardown. In the former case
1743  * the QUIESCE prefix/suffix is used and in the latter the CONDEMNED is used
1744  * for the SRS and MR flags. In the former case the threads pause waiting for
1745  * a restart, while in the latter case the threads exit. The Tx SRS teardown
1746  * is also mostly similar to the above.
1747  *
1748  * 1. Stop future hardware classified packets at the lowest level in the mac.
1749  *    Remove any hardware classification rule (CONDEMNED case) and mark the
1750  *    rings as CONDEMNED or QUIESCE as appropriate. This prevents the mr_refcnt
1751  *    from increasing. Upcalls from the driver that come through hardware
1752  *    classification will be dropped in mac_rx from now on. Then we wait for
1753  *    the mr_refcnt to drop to zero. When the mr_refcnt reaches zero we are
1754  *    sure there aren't any upcall threads from the driver through hardware
1755  *    classification. In the case of SRS teardown we also remove the
1756  *    classification rule in the driver.
1757  *
1758  * 2. Stop future software classified packets by marking the flow entry with
1759  *    FE_QUIESCE or FE_CONDEMNED as appropriate which prevents the refcnt from
1760  *    increasing. We also remove the flow entry from the table in the latter
1761  *    case. Then wait for the fe_refcnt to reach an appropriate quiescent value
1762  *    that indicates there aren't any active threads using that flow entry.
1763  *
1764  * 3. Quiesce the SRS and softrings by signaling the SRS. The SRS poll thread,
1765  *    SRS worker thread, and the soft ring threads are quiesced in sequence
1766  *    with the SRS worker thread serving as a master controller. This
 *    mechanism is explained in mac_srs_worker_quiesce().
1768  *
1769  * The restart mechanism to reactivate the SRS and softrings is explained
1770  * in mac_srs_worker_restart(). Here we just signal the SRS worker to start the
1771  * restart sequence.
1772  */
1773 void
1774 mac_rx_srs_quiesce(mac_soft_ring_set_t *srs, uint_t srs_quiesce_flag)
1775 {
1776 	flow_entry_t	*flent = srs->srs_flent;
1777 	uint_t	mr_flag, srs_done_flag;
1778 
1779 	ASSERT(MAC_PERIM_HELD((mac_handle_t)FLENT_TO_MIP(flent)));
1780 	ASSERT(!(srs->srs_type & SRST_TX));
1781 
1782 	if (srs_quiesce_flag == SRS_CONDEMNED) {
1783 		mr_flag = MR_CONDEMNED;
1784 		srs_done_flag = SRS_CONDEMNED_DONE;
1785 		if (srs->srs_type & SRST_CLIENT_POLL_ENABLED)
1786 			mac_srs_client_poll_disable(srs->srs_mcip, srs);
1787 	} else {
1788 		ASSERT(srs_quiesce_flag == SRS_QUIESCE);
1789 		mr_flag = MR_QUIESCE;
1790 		srs_done_flag = SRS_QUIESCE_DONE;
1791 		if (srs->srs_type & SRST_CLIENT_POLL_ENABLED)
1792 			mac_srs_client_poll_quiesce(srs->srs_mcip, srs);
1793 	}
1794 
1795 	if (srs->srs_ring != NULL) {
1796 		mac_rx_ring_quiesce(srs->srs_ring, mr_flag);
1797 	} else {
1798 		/*
1799 		 * SRS is driven by software classification. In case
1800 		 * of CONDEMNED, the top level teardown functions will
1801 		 * deal with flow removal.
1802 		 */
1803 		if (srs_quiesce_flag != SRS_CONDEMNED) {
1804 			FLOW_MARK(flent, FE_QUIESCE);
1805 			mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
1806 		}
1807 	}
1808 
1809 	/*
1810 	 * Signal the SRS to quiesce itself, and then cv_wait for the
1811 	 * SRS quiesce to complete. The SRS worker thread will wake us
1812 	 * up when the quiesce is complete
1813 	 */
1814 	mac_srs_signal(srs, srs_quiesce_flag);
1815 	mac_srs_quiesce_wait(srs, srs_done_flag);
1816 }
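
/*
 * Illustrative usage sketch of the protocol above (the calling context is
 * illustrative): a temporary quiesce, for example around an Rx callback
 * change, pairs the quiesce with a restart while holding the perimeter:
 *
 *	mac_rx_srs_quiesce(srs, SRS_QUIESCE);
 *	... change the Rx callbacks / SRS state ...
 *	mac_rx_srs_restart(srs);
 *
 * For teardown, SRS_CONDEMNED is used instead, and mac_rx_srs_remove()
 * below issues the quiesce itself before freeing the SRS.
 */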
1817 
1818 /*
1819  * Remove an SRS.
1820  */
1821 void
1822 mac_rx_srs_remove(mac_soft_ring_set_t *srs)
1823 {
1824 	flow_entry_t *flent = srs->srs_flent;
1825 	int i;
1826 
1827 	mac_rx_srs_quiesce(srs, SRS_CONDEMNED);
1828 	/*
1829 	 * Locate and remove our entry in the fe_rx_srs[] array, and
1830 	 * adjust the fe_rx_srs array entries and array count by
1831 	 * moving the last entry into the vacated spot.
1832 	 */
1833 	mutex_enter(&flent->fe_lock);
1834 	for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
1835 		if (flent->fe_rx_srs[i] == srs)
1836 			break;
1837 	}
1838 
1839 	ASSERT(i != 0 && i < flent->fe_rx_srs_cnt);
1840 	if (i != flent->fe_rx_srs_cnt - 1) {
1841 		flent->fe_rx_srs[i] =
1842 		    flent->fe_rx_srs[flent->fe_rx_srs_cnt - 1];
1843 		i = flent->fe_rx_srs_cnt - 1;
1844 	}
1845 
1846 	flent->fe_rx_srs[i] = NULL;
1847 	flent->fe_rx_srs_cnt--;
1848 	mutex_exit(&flent->fe_lock);
1849 
1850 	mac_srs_free(srs);
1851 }
1852 
1853 static void
1854 mac_srs_clear_flag(mac_soft_ring_set_t *srs, uint_t flag)
1855 {
1856 	mutex_enter(&srs->srs_lock);
1857 	srs->srs_state &= ~flag;
1858 	mutex_exit(&srs->srs_lock);
1859 }
1860 
1861 void
1862 mac_rx_srs_restart(mac_soft_ring_set_t *srs)
1863 {
1864 	flow_entry_t	*flent = srs->srs_flent;
1865 	mac_ring_t	*mr;
1866 
1867 	ASSERT(MAC_PERIM_HELD((mac_handle_t)FLENT_TO_MIP(flent)));
1868 	ASSERT((srs->srs_type & SRST_TX) == 0);
1869 
1870 	/*
	 * This handles a change in the number of SRSs between the quiesce
	 * and restart operations of a flow.
1873 	 */
1874 	if (!SRS_QUIESCED(srs))
1875 		return;
1876 
1877 	/*
1878 	 * Signal the SRS to restart itself. Wait for the restart to complete
1879 	 * Note that we only restart the SRS if it is not marked as
1880 	 * permanently quiesced.
1881 	 */
1882 	if (!SRS_QUIESCED_PERMANENT(srs)) {
1883 		mac_srs_signal(srs, SRS_RESTART);
1884 		mac_srs_quiesce_wait(srs, SRS_RESTART_DONE);
1885 		mac_srs_clear_flag(srs, SRS_RESTART_DONE);
1886 
1887 		mac_srs_client_poll_restart(srs->srs_mcip, srs);
1888 	}
1889 
1890 	/* Finally clear the flags to let the packets in */
1891 	mr = srs->srs_ring;
1892 	if (mr != NULL) {
1893 		MAC_RING_UNMARK(mr, MR_QUIESCE);
1894 		/* In case the ring was stopped, safely restart it */
1895 		(void) mac_start_ring(mr);
1896 	} else {
1897 		FLOW_UNMARK(flent, FE_QUIESCE);
1898 	}
1899 }
1900 
1901 /*
1902  * Temporary quiesce of a flow and associated Rx SRS.
1903  * Please see block comment above mac_rx_classify_flow_rem.
1904  */
1905 /* ARGSUSED */
1906 int
1907 mac_rx_classify_flow_quiesce(flow_entry_t *flent, void *arg)
1908 {
1909 	int		i;
1910 
1911 	for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
1912 		mac_rx_srs_quiesce((mac_soft_ring_set_t *)flent->fe_rx_srs[i],
1913 		    SRS_QUIESCE);
1914 	}
1915 	return (0);
1916 }
1917 
1918 /*
 * Restart a flow and associated Rx SRS that has been quiesced temporarily.
1920  * Please see block comment above mac_rx_classify_flow_rem
1921  */
1922 /* ARGSUSED */
1923 int
1924 mac_rx_classify_flow_restart(flow_entry_t *flent, void *arg)
1925 {
1926 	int		i;
1927 
1928 	for (i = 0; i < flent->fe_rx_srs_cnt; i++)
1929 		mac_rx_srs_restart((mac_soft_ring_set_t *)flent->fe_rx_srs[i]);
1930 
1931 	return (0);
1932 }
1933 
1934 void
1935 mac_srs_perm_quiesce(mac_client_handle_t mch, boolean_t on)
1936 {
1937 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1938 	flow_entry_t		*flent = mcip->mci_flent;
1939 	mac_impl_t		*mip = mcip->mci_mip;
1940 	mac_soft_ring_set_t	*mac_srs;
1941 	int			i;
1942 
1943 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1944 
1945 	if (flent == NULL)
1946 		return;
1947 
1948 	for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
1949 		mac_srs = flent->fe_rx_srs[i];
1950 		mutex_enter(&mac_srs->srs_lock);
1951 		if (on)
1952 			mac_srs->srs_state |= SRS_QUIESCE_PERM;
1953 		else
1954 			mac_srs->srs_state &= ~SRS_QUIESCE_PERM;
1955 		mutex_exit(&mac_srs->srs_lock);
1956 	}
1957 }
1958 
1959 void
1960 mac_rx_client_quiesce(mac_client_handle_t mch)
1961 {
1962 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1963 	mac_impl_t		*mip = mcip->mci_mip;
1964 
1965 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1966 
1967 	if (MCIP_DATAPATH_SETUP(mcip)) {
1968 		(void) mac_rx_classify_flow_quiesce(mcip->mci_flent,
1969 		    NULL);
1970 		(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
1971 		    mac_rx_classify_flow_quiesce, NULL);
1972 	}
1973 }
1974 
1975 void
1976 mac_rx_client_restart(mac_client_handle_t mch)
1977 {
1978 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1979 	mac_impl_t		*mip = mcip->mci_mip;
1980 
1981 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1982 
1983 	if (MCIP_DATAPATH_SETUP(mcip)) {
1984 		(void) mac_rx_classify_flow_restart(mcip->mci_flent, NULL);
1985 		(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
1986 		    mac_rx_classify_flow_restart, NULL);
1987 	}
1988 }
1989 
1990 /*
1991  * This function only quiesces the Tx SRS and softring worker threads. Callers
1992  * need to make sure that there aren't any mac client threads doing current or
1993  * future transmits in the mac before calling this function.
1994  */
1995 void
1996 mac_tx_srs_quiesce(mac_soft_ring_set_t *srs, uint_t srs_quiesce_flag)
1997 {
1998 	mac_client_impl_t	*mcip = srs->srs_mcip;
1999 
2000 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2001 
2002 	ASSERT(srs->srs_type & SRST_TX);
2003 	ASSERT(srs_quiesce_flag == SRS_CONDEMNED ||
2004 	    srs_quiesce_flag == SRS_QUIESCE);
2005 
2006 	/*
2007 	 * Signal the SRS to quiesce itself, and then cv_wait for the
2008 	 * SRS quiesce to complete. The SRS worker thread will wake us
2009 	 * up when the quiesce is complete
2010 	 */
2011 	mac_srs_signal(srs, srs_quiesce_flag);
2012 	mac_srs_quiesce_wait(srs, srs_quiesce_flag == SRS_QUIESCE ?
2013 	    SRS_QUIESCE_DONE : SRS_CONDEMNED_DONE);
2014 }
2015 
2016 void
2017 mac_tx_srs_restart(mac_soft_ring_set_t *srs)
2018 {
2019 	/*
	 * Resizing the fanout could result in the creation of new SRSs.
	 * They may not necessarily be in the quiesced state, in which
	 * case they need not be restarted.
2023 	 */
2024 	if (!SRS_QUIESCED(srs))
2025 		return;
2026 
2027 	mac_srs_signal(srs, SRS_RESTART);
2028 	mac_srs_quiesce_wait(srs, SRS_RESTART_DONE);
2029 	mac_srs_clear_flag(srs, SRS_RESTART_DONE);
2030 }
2031 
2032 /*
 * Temporary quiesce of a flow and associated Tx SRS.
2034  * Please see block comment above mac_rx_srs_quiesce
2035  */
2036 /* ARGSUSED */
2037 int
2038 mac_tx_flow_quiesce(flow_entry_t *flent, void *arg)
2039 {
2040 	/*
2041 	 * The fe_tx_srs is null for a subflow on an interface that is
2042 	 * not plumbed
2043 	 */
2044 	if (flent->fe_tx_srs != NULL)
2045 		mac_tx_srs_quiesce(flent->fe_tx_srs, SRS_QUIESCE);
2046 	return (0);
2047 }
2048 
2049 /* ARGSUSED */
2050 int
2051 mac_tx_flow_restart(flow_entry_t *flent, void *arg)
2052 {
2053 	/*
2054 	 * The fe_tx_srs is null for a subflow on an interface that is
2055 	 * not plumbed
2056 	 */
2057 	if (flent->fe_tx_srs != NULL)
2058 		mac_tx_srs_restart(flent->fe_tx_srs);
2059 	return (0);
2060 }
2061 
2062 void
2063 mac_tx_client_quiesce(mac_client_impl_t *mcip, uint_t srs_quiesce_flag)
2064 {
2065 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2066 
2067 	mac_tx_client_block(mcip);
2068 	if (MCIP_TX_SRS(mcip) != NULL) {
2069 		mac_tx_srs_quiesce(MCIP_TX_SRS(mcip), srs_quiesce_flag);
2070 		(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
2071 		    mac_tx_flow_quiesce, NULL);
2072 	}
2073 }
2074 
2075 void
2076 mac_tx_client_restart(mac_client_impl_t *mcip)
2077 {
2078 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2079 
2080 	mac_tx_client_unblock(mcip);
2081 	if (MCIP_TX_SRS(mcip) != NULL) {
2082 		mac_tx_srs_restart(MCIP_TX_SRS(mcip));
2083 		(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
2084 		    mac_tx_flow_restart, NULL);
2085 	}
2086 }
2087 
2088 void
2089 mac_tx_client_flush(mac_client_impl_t *mcip)
2090 {
2091 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2092 
2093 	mac_tx_client_quiesce(mcip, SRS_QUIESCE);
2094 	mac_tx_client_restart(mcip);
2095 }
2096 
2097 void
2098 mac_client_quiesce(mac_client_impl_t *mcip)
2099 {
2100 	mac_rx_client_quiesce((mac_client_handle_t)mcip);
2101 	mac_tx_client_quiesce(mcip, SRS_QUIESCE);
2102 }
2103 
2104 void
2105 mac_client_restart(mac_client_impl_t *mcip)
2106 {
2107 	mac_rx_client_restart((mac_client_handle_t)mcip);
2108 	mac_tx_client_restart(mcip);
2109 }
2110 
2111 /*
2112  * Allocate a minor number.
2113  */
2114 minor_t
2115 mac_minor_hold(boolean_t sleep)
2116 {
2117 	minor_t	minor;
2118 
2119 	/*
2120 	 * Grab a value from the arena.
2121 	 */
2122 	atomic_add_32(&minor_count, 1);
2123 
2124 	if (sleep)
2125 		minor = (uint_t)id_alloc(minor_ids);
2126 	else
2127 		minor = (uint_t)id_alloc_nosleep(minor_ids);
2128 
2129 	if (minor == 0) {
2130 		atomic_add_32(&minor_count, -1);
2131 		return (0);
2132 	}
2133 
2134 	return (minor);
2135 }
2136 
2137 /*
2138  * Release a previously allocated minor number.
2139  */
2140 void
2141 mac_minor_rele(minor_t minor)
2142 {
2143 	/*
2144 	 * Return the value to the arena.
2145 	 */
2146 	id_free(minor_ids, minor);
2147 	atomic_add_32(&minor_count, -1);
2148 }
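
/*
 * Sketch of the intended pairing (the calling context is illustrative):
 * a consumer takes a minor with mac_minor_hold() and returns it with
 * mac_minor_rele() once the corresponding minor node is torn down:
 *
 *	if ((minor = mac_minor_hold(B_FALSE)) == 0)
 *		return (ENOSPC);
 *	... create and use the minor node ...
 *	mac_minor_rele(minor);
 */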
2149 
2150 uint32_t
2151 mac_no_notification(mac_handle_t mh)
2152 {
2153 	mac_impl_t *mip = (mac_impl_t *)mh;
2154 	return (mip->mi_unsup_note);
2155 }
2156 
2157 /*
2158  * Prevent any new opens of this mac in preparation for unregister
2159  */
2160 int
2161 i_mac_disable(mac_impl_t *mip)
2162 {
2163 	mac_client_impl_t	*mcip;
2164 
2165 	rw_enter(&i_mac_impl_lock, RW_WRITER);
2166 	if (mip->mi_state_flags & MIS_DISABLED) {
2167 		/* Already disabled, return success */
2168 		rw_exit(&i_mac_impl_lock);
2169 		return (0);
2170 	}
2171 	/*
	 * See if there are any other references to this mac_t (e.g., VLANs).
	 * If so, return failure. If all the other checks below pass, then
	 * set mi_disabled atomically under the i_mac_impl_lock to prevent
	 * any new VLANs from being created or new mac client opens of this
	 * mac end point.
2177 	 */
2178 	if (mip->mi_ref > 0) {
2179 		rw_exit(&i_mac_impl_lock);
2180 		return (EBUSY);
2181 	}
2182 
2183 	/*
	 * mac clients must delete all multicast groups they join before
	 * closing. bcast groups are reference counted; the last client
	 * to delete the group will wait until the group is physically
	 * deleted. Since all clients have closed this mac end point,
	 * mi_bcast_ngrps must be zero at this point.
2189 	 */
2190 	ASSERT(mip->mi_bcast_ngrps == 0);
2191 
2192 	/*
2193 	 * Don't let go of this if it has some flows.
2194 	 * All other code guarantees no flows are added to a disabled
2195 	 * mac, therefore it is sufficient to check for the flow table
2196 	 * only here.
2197 	 */
2198 	mcip = mac_primary_client_handle(mip);
2199 	if ((mcip != NULL) && mac_link_has_flows((mac_client_handle_t)mcip)) {
2200 		rw_exit(&i_mac_impl_lock);
2201 		return (ENOTEMPTY);
2202 	}
2203 
2204 	mip->mi_state_flags |= MIS_DISABLED;
2205 	rw_exit(&i_mac_impl_lock);
2206 	return (0);
2207 }
2208 
2209 int
2210 mac_disable_nowait(mac_handle_t mh)
2211 {
2212 	mac_impl_t	*mip = (mac_impl_t *)mh;
2213 	int err;
2214 
2215 	if ((err = i_mac_perim_enter_nowait(mip)) != 0)
2216 		return (err);
2217 	err = i_mac_disable(mip);
2218 	i_mac_perim_exit(mip);
2219 	return (err);
2220 }
2221 
2222 int
2223 mac_disable(mac_handle_t mh)
2224 {
2225 	mac_impl_t	*mip = (mac_impl_t *)mh;
2226 	int err;
2227 
2228 	i_mac_perim_enter(mip);
2229 	err = i_mac_disable(mip);
2230 	i_mac_perim_exit(mip);
2231 
2232 	/*
2233 	 * Clean up notification thread and wait for it to exit.
2234 	 */
2235 	if (err == 0)
2236 		i_mac_notify_exit(mip);
2237 
2238 	return (err);
2239 }
2240 
2241 /*
 * Called when the MAC instance has a non-empty flow table, to de-multiplex
2243  * incoming packets to the right flow.
2244  * The MAC's rw lock is assumed held as a READER.
2245  */
2246 /* ARGSUSED */
2247 static mblk_t *
2248 mac_rx_classify(mac_impl_t *mip, mac_resource_handle_t mrh, mblk_t *mp)
2249 {
2250 	flow_entry_t	*flent = NULL;
2251 	uint_t		flags = FLOW_INBOUND;
2252 	int		err;
2253 
2254 	/*
2255 	 * If the mac is a port of an aggregation, pass FLOW_IGNORE_VLAN
2256 	 * to mac_flow_lookup() so that the VLAN packets can be successfully
2257 	 * passed to the non-VLAN aggregation flows.
2258 	 *
2259 	 * Note that there is possibly a race between this and
2260 	 * mac_unicast_remove/add() and VLAN packets could be incorrectly
2261 	 * classified to non-VLAN flows of non-aggregation mac clients. These
2262 	 * VLAN packets will be then filtered out by the mac module.
2263 	 */
2264 	if ((mip->mi_state_flags & MIS_EXCLUSIVE) != 0)
2265 		flags |= FLOW_IGNORE_VLAN;
2266 
2267 	err = mac_flow_lookup(mip->mi_flow_tab, mp, flags, &flent);
2268 	if (err != 0) {
2269 		/* no registered receive function */
2270 		return (mp);
2271 	} else {
2272 		mac_client_impl_t	*mcip;
2273 
2274 		/*
		 * This flent might just be an additional one on the MAC client,
		 * i.e., for classification purposes (different fdesc); however,
		 * the resources, SRS et al., are in the mci_flent, so if
		 * this isn't the mci_flent, we need to get it.
2279 		 */
2280 		if ((mcip = flent->fe_mcip) != NULL &&
2281 		    mcip->mci_flent != flent) {
2282 			FLOW_REFRELE(flent);
2283 			flent = mcip->mci_flent;
2284 			FLOW_TRY_REFHOLD(flent, err);
2285 			if (err != 0)
2286 				return (mp);
2287 		}
2288 		(flent->fe_cb_fn)(flent->fe_cb_arg1, flent->fe_cb_arg2, mp,
2289 		    B_FALSE);
2290 		FLOW_REFRELE(flent);
2291 	}
2292 	return (NULL);
2293 }
2294 
2295 mblk_t *
2296 mac_rx_flow(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain)
2297 {
2298 	mac_impl_t	*mip = (mac_impl_t *)mh;
2299 	mblk_t		*bp, *bp1, **bpp, *list = NULL;
2300 
2301 	/*
2302 	 * We walk the chain and attempt to classify each packet.
	 * The packets that couldn't be classified will be returned
	 * to the caller.
2305 	 */
2306 	bp = mp_chain;
2307 	bpp = &list;
2308 	while (bp != NULL) {
2309 		bp1 = bp;
2310 		bp = bp->b_next;
2311 		bp1->b_next = NULL;
2312 
2313 		if (mac_rx_classify(mip, mrh, bp1) != NULL) {
2314 			*bpp = bp1;
2315 			bpp = &bp1->b_next;
2316 		}
2317 	}
2318 	return (list);
2319 }
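
/*
 * For example, given a chain mp1->mp2->mp3 in which only mp2 matches a
 * flow, mp2 is delivered to its flow's callback by mac_rx_classify() and
 * mac_rx_flow() returns the list mp1->mp3, which the caller must then
 * handle through the default receive path.
 */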
2320 
2321 static int
2322 mac_tx_flow_srs_wakeup(flow_entry_t *flent, void *arg)
2323 {
2324 	mac_ring_handle_t ring = arg;
2325 
2326 	if (flent->fe_tx_srs)
2327 		mac_tx_srs_wakeup(flent->fe_tx_srs, ring);
2328 	return (0);
2329 }
2330 
2331 void
2332 i_mac_tx_srs_notify(mac_impl_t *mip, mac_ring_handle_t ring)
2333 {
2334 	mac_client_impl_t	*cclient;
2335 	mac_soft_ring_set_t	*mac_srs;
2336 
2337 	/*
2338 	 * After grabbing the mi_rw_lock, the list of clients can't change.
	 * If there are any clients, mi_disabled must be B_FALSE and can't
	 * get set since clients exist. If there aren't any clients, we
	 * don't do anything. In any case the mip has to be valid. The driver
2342 	 * must make sure that it goes single threaded (with respect to mac
2343 	 * calls) and wait for all pending mac calls to finish before calling
2344 	 * mac_unregister.
2345 	 */
2346 	rw_enter(&i_mac_impl_lock, RW_READER);
2347 	if (mip->mi_state_flags & MIS_DISABLED) {
2348 		rw_exit(&i_mac_impl_lock);
2349 		return;
2350 	}
2351 
2352 	/*
	 * Get MAC tx srs by walking the mac_client_handle list.
2354 	 */
2355 	rw_enter(&mip->mi_rw_lock, RW_READER);
2356 	for (cclient = mip->mi_clients_list; cclient != NULL;
2357 	    cclient = cclient->mci_client_next) {
2358 		if ((mac_srs = MCIP_TX_SRS(cclient)) != NULL)
2359 			mac_tx_srs_wakeup(mac_srs, ring);
2360 		(void) mac_flow_walk(cclient->mci_subflow_tab,
2361 		    mac_tx_flow_srs_wakeup, ring);
2362 	}
2363 	rw_exit(&mip->mi_rw_lock);
2364 	rw_exit(&i_mac_impl_lock);
2365 }
2366 
2367 /* ARGSUSED */
2368 void
2369 mac_multicast_refresh(mac_handle_t mh, mac_multicst_t refresh, void *arg,
2370     boolean_t add)
2371 {
2372 	mac_impl_t *mip = (mac_impl_t *)mh;
2373 
	i_mac_perim_enter(mip);
2375 	/*
2376 	 * If no specific refresh function was given then default to the
2377 	 * driver's m_multicst entry point.
2378 	 */
2379 	if (refresh == NULL) {
2380 		refresh = mip->mi_multicst;
2381 		arg = mip->mi_driver;
2382 	}
2383 
2384 	mac_bcast_refresh(mip, refresh, arg, add);
	i_mac_perim_exit(mip);
2386 }
2387 
2388 void
2389 mac_promisc_refresh(mac_handle_t mh, mac_setpromisc_t refresh, void *arg)
2390 {
2391 	mac_impl_t	*mip = (mac_impl_t *)mh;
2392 
2393 	/*
2394 	 * If no specific refresh function was given then default to the
2395 	 * driver's m_promisc entry point.
2396 	 */
2397 	if (refresh == NULL) {
2398 		refresh = mip->mi_setpromisc;
2399 		arg = mip->mi_driver;
2400 	}
2401 	ASSERT(refresh != NULL);
2402 
2403 	/*
2404 	 * Call the refresh function with the current promiscuity.
2405 	 */
2406 	refresh(arg, (mip->mi_devpromisc != 0));
2407 }
2408 
2409 /*
 * The mac client requests that the mac not change its margin size to
2411  * be less than the specified value.  If "current" is B_TRUE, then the client
2412  * requests the mac not to change its margin size to be smaller than the
2413  * current size. Further, return the current margin size value in this case.
2414  *
2415  * We keep every requested size in an ordered list from largest to smallest.
2416  */
2417 int
2418 mac_margin_add(mac_handle_t mh, uint32_t *marginp, boolean_t current)
2419 {
2420 	mac_impl_t		*mip = (mac_impl_t *)mh;
2421 	mac_margin_req_t	**pp, *p;
2422 	int			err = 0;
2423 
2424 	rw_enter(&(mip->mi_rw_lock), RW_WRITER);
2425 	if (current)
2426 		*marginp = mip->mi_margin;
2427 
2428 	/*
2429 	 * If the current margin value cannot satisfy the margin requested,
2430 	 * return ENOTSUP directly.
2431 	 */
2432 	if (*marginp > mip->mi_margin) {
2433 		err = ENOTSUP;
2434 		goto done;
2435 	}
2436 
2437 	/*
2438 	 * Check whether the given margin is already in the list. If so,
2439 	 * bump the reference count.
2440 	 */
2441 	for (pp = &mip->mi_mmrp; (p = *pp) != NULL; pp = &p->mmr_nextp) {
2442 		if (p->mmr_margin == *marginp) {
2443 			/*
2444 			 * The margin requested is already in the list,
2445 			 * so just bump the reference count.
2446 			 */
2447 			p->mmr_ref++;
2448 			goto done;
2449 		}
2450 		if (p->mmr_margin < *marginp)
2451 			break;
2452 	}
2453 
2455 	p = kmem_zalloc(sizeof (mac_margin_req_t), KM_SLEEP);
2456 	p->mmr_margin = *marginp;
2457 	p->mmr_ref++;
2458 	p->mmr_nextp = *pp;
2459 	*pp = p;
2460 
2461 done:
2462 	rw_exit(&(mip->mi_rw_lock));
2463 	return (err);
2464 }
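
/*
 * For example, after mac_margin_add() requests of 1500, 1500 and 1000,
 * the list would look like
 *
 *	mi_mmrp -> { 1500, ref 2 } -> { 1000, ref 1 }
 *
 * and mac_margin_update() below only has to check the head of the list,
 * which holds the largest (strictest) requested margin.
 */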
2465 
2466 /*
2467  * The mac client requests to cancel its previous mac_margin_add() request.
2468  * We remove the requested margin size from the list.
2469  */
2470 int
2471 mac_margin_remove(mac_handle_t mh, uint32_t margin)
2472 {
2473 	mac_impl_t		*mip = (mac_impl_t *)mh;
2474 	mac_margin_req_t	**pp, *p;
2475 	int			err = 0;
2476 
2477 	rw_enter(&(mip->mi_rw_lock), RW_WRITER);
2478 	/*
2479 	 * Find the entry in the list for the given margin.
2480 	 */
2481 	for (pp = &(mip->mi_mmrp); (p = *pp) != NULL; pp = &(p->mmr_nextp)) {
2482 		if (p->mmr_margin == margin) {
2483 			if (--p->mmr_ref == 0)
2484 				break;
2485 
2486 			/*
			 * There is still a reference to this margin, so
			 * there's nothing more to do.
2489 			 */
2490 			goto done;
2491 		}
2492 	}
2493 
2494 	/*
2495 	 * We did not find an entry for the given margin.
2496 	 */
2497 	if (p == NULL) {
2498 		err = ENOENT;
2499 		goto done;
2500 	}
2501 
2502 	ASSERT(p->mmr_ref == 0);
2503 
2504 	/*
2505 	 * Remove it from the list.
2506 	 */
2507 	*pp = p->mmr_nextp;
2508 	kmem_free(p, sizeof (mac_margin_req_t));
2509 done:
2510 	rw_exit(&(mip->mi_rw_lock));
2511 	return (err);
2512 }
2513 
2514 boolean_t
2515 mac_margin_update(mac_handle_t mh, uint32_t margin)
2516 {
2517 	mac_impl_t	*mip = (mac_impl_t *)mh;
2518 	uint32_t	margin_needed = 0;
2519 
2520 	rw_enter(&(mip->mi_rw_lock), RW_WRITER);
2521 
2522 	if (mip->mi_mmrp != NULL)
2523 		margin_needed = mip->mi_mmrp->mmr_margin;
2524 
2525 	if (margin_needed <= margin)
2526 		mip->mi_margin = margin;
2527 
2528 	rw_exit(&(mip->mi_rw_lock));
2529 
2530 	if (margin_needed <= margin)
2531 		i_mac_notify(mip, MAC_NOTE_MARGIN);
2532 
2533 	return (margin_needed <= margin);
2534 }
2535 
2536 /*
2537  * MAC Type Plugin functions.
2538  */
2539 
2540 mactype_t *
2541 mactype_getplugin(const char *pname)
2542 {
2543 	mactype_t	*mtype = NULL;
2544 	boolean_t	tried_modload = B_FALSE;
2545 
2546 	mutex_enter(&i_mactype_lock);
2547 
2548 find_registered_mactype:
2549 	if (mod_hash_find(i_mactype_hash, (mod_hash_key_t)pname,
2550 	    (mod_hash_val_t *)&mtype) != 0) {
2551 		if (!tried_modload) {
2552 			/*
2553 			 * If the plugin has not yet been loaded, then
2554 			 * attempt to load it now.  If modload() succeeds,
2555 			 * the plugin should have registered using
2556 			 * mactype_register(), in which case we can go back
2557 			 * and attempt to find it again.
2558 			 */
2559 			if (modload(MACTYPE_KMODDIR, (char *)pname) != -1) {
2560 				tried_modload = B_TRUE;
2561 				goto find_registered_mactype;
2562 			}
2563 		}
2564 	} else {
2565 		/*
2566 		 * Note that there's no danger that the plugin we've loaded
2567 		 * could be unloaded between the modload() step and the
2568 		 * reference count bump here, as we're holding
2569 		 * i_mactype_lock, which mactype_unregister() also holds.
2570 		 */
2571 		atomic_inc_32(&mtype->mt_ref);
2572 	}
2573 
2574 	mutex_exit(&i_mactype_lock);
2575 	return (mtype);
2576 }
2577 
2578 mactype_register_t *
2579 mactype_alloc(uint_t mactype_version)
2580 {
2581 	mactype_register_t *mtrp;
2582 
2583 	/*
2584 	 * Make sure there isn't a version mismatch between the plugin and
2585 	 * the framework.  In the future, if multiple versions are
2586 	 * supported, this check could become more sophisticated.
2587 	 */
2588 	if (mactype_version != MACTYPE_VERSION)
2589 		return (NULL);
2590 
2591 	mtrp = kmem_zalloc(sizeof (mactype_register_t), KM_SLEEP);
2592 	mtrp->mtr_version = mactype_version;
2593 	return (mtrp);
2594 }
2595 
2596 void
2597 mactype_free(mactype_register_t *mtrp)
2598 {
2599 	kmem_free(mtrp, sizeof (mactype_register_t));
2600 }
2601 
2602 int
2603 mactype_register(mactype_register_t *mtrp)
2604 {
2605 	mactype_t	*mtp;
2606 	mactype_ops_t	*ops = mtrp->mtr_ops;
2607 
2608 	/* Do some sanity checking before we register this MAC type. */
2609 	if (mtrp->mtr_ident == NULL || ops == NULL)
2610 		return (EINVAL);
2611 
2612 	/*
2613 	 * Verify that all mandatory callbacks are set in the ops
2614 	 * vector.
2615 	 */
2616 	if (ops->mtops_unicst_verify == NULL ||
2617 	    ops->mtops_multicst_verify == NULL ||
2618 	    ops->mtops_sap_verify == NULL ||
2619 	    ops->mtops_header == NULL ||
2620 	    ops->mtops_header_info == NULL) {
2621 		return (EINVAL);
2622 	}
2623 
2624 	mtp = kmem_zalloc(sizeof (*mtp), KM_SLEEP);
2625 	mtp->mt_ident = mtrp->mtr_ident;
2626 	mtp->mt_ops = *ops;
2627 	mtp->mt_type = mtrp->mtr_mactype;
2628 	mtp->mt_nativetype = mtrp->mtr_nativetype;
2629 	mtp->mt_addr_length = mtrp->mtr_addrlen;
2630 	if (mtrp->mtr_brdcst_addr != NULL) {
2631 		mtp->mt_brdcst_addr = kmem_alloc(mtrp->mtr_addrlen, KM_SLEEP);
2632 		bcopy(mtrp->mtr_brdcst_addr, mtp->mt_brdcst_addr,
2633 		    mtrp->mtr_addrlen);
2634 	}
2635 
2636 	mtp->mt_stats = mtrp->mtr_stats;
2637 	mtp->mt_statcount = mtrp->mtr_statcount;
2638 
2639 	mtp->mt_mapping = mtrp->mtr_mapping;
2640 	mtp->mt_mappingcount = mtrp->mtr_mappingcount;
2641 
2642 	if (mod_hash_insert(i_mactype_hash,
2643 	    (mod_hash_key_t)mtp->mt_ident, (mod_hash_val_t)mtp) != 0) {
2644 		kmem_free(mtp->mt_brdcst_addr, mtp->mt_addr_length);
2645 		kmem_free(mtp, sizeof (*mtp));
2646 		return (EEXIST);
2647 	}
2648 	return (0);
2649 }
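
/*
 * Illustrative sketch of the registration sequence a plugin's _init()
 * routine would follow (the ident string and ops vector are the plugin's
 * own; error handling is elided):
 *
 *	mactype_register_t *mtrp = mactype_alloc(MACTYPE_VERSION);
 *	mtrp->mtr_ident = "myplugin";
 *	mtrp->mtr_ops = &myplugin_type_ops;
 *	mtrp->mtr_mactype = DL_ETHER;
 *	mtrp->mtr_addrlen = ETHERADDRL;
 *	...
 *	err = mactype_register(mtrp);
 *	mactype_free(mtrp);
 */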
2650 
2651 int
2652 mactype_unregister(const char *ident)
2653 {
2654 	mactype_t	*mtp;
2655 	mod_hash_val_t	val;
2656 	int 		err;
2657 
2658 	/*
2659 	 * Let's not allow MAC drivers to use this plugin while we're
2660 	 * trying to unregister it.  Holding i_mactype_lock also prevents a
2661 	 * plugin from unregistering while a MAC driver is attempting to
	 * hold a reference to it in mactype_getplugin().
2663 	 */
2664 	mutex_enter(&i_mactype_lock);
2665 
2666 	if ((err = mod_hash_find(i_mactype_hash, (mod_hash_key_t)ident,
2667 	    (mod_hash_val_t *)&mtp)) != 0) {
2668 		/* A plugin is trying to unregister, but it never registered. */
2669 		err = ENXIO;
2670 		goto done;
2671 	}
2672 
2673 	if (mtp->mt_ref != 0) {
2674 		err = EBUSY;
2675 		goto done;
2676 	}
2677 
2678 	err = mod_hash_remove(i_mactype_hash, (mod_hash_key_t)ident, &val);
2679 	ASSERT(err == 0);
2680 	if (err != 0) {
2681 		/* This should never happen, thus the ASSERT() above. */
2682 		err = EINVAL;
2683 		goto done;
2684 	}
2685 	ASSERT(mtp == (mactype_t *)val);
2686 
2687 	kmem_free(mtp->mt_brdcst_addr, mtp->mt_addr_length);
2688 	kmem_free(mtp, sizeof (mactype_t));
2689 done:
2690 	mutex_exit(&i_mactype_lock);
2691 	return (err);
2692 }
2693 
2694 /*
2695  * Returns TRUE when the specified property is intended for the MAC framework,
 * as opposed to driver-defined properties.
2697  */
2698 static boolean_t
2699 mac_is_macprop(mac_prop_t *macprop)
2700 {
2701 	switch (macprop->mp_id) {
2702 	case MAC_PROP_MAXBW:
2703 	case MAC_PROP_PRIO:
2704 	case MAC_PROP_BIND_CPU:
2705 		return (B_TRUE);
2706 	default:
2707 		return (B_FALSE);
2708 	}
2709 }
2710 
2711 /*
2712  * mac_set_prop() sets mac or hardware driver properties:
 * 	mac properties include maxbw, priority, and the cpu binding list.
 *	Driver properties are properties private to the hardware, such as
 *	mtu, speed, etc.
2716  * If the property is a driver property, mac_set_prop() calls driver's callback
2717  * function to set it.
2718  * If the property is a mac property, mac_set_prop() invokes mac_set_resources()
2719  * which will cache the property value in mac_impl_t and may call
2720  * mac_client_set_resource() to update property value of the primary mac client,
2721  * if it exists.
2722  */
2723 int
2724 mac_set_prop(mac_handle_t mh, mac_prop_t *macprop, void *val, uint_t valsize)
2725 {
2726 	int err = ENOTSUP;
2727 	mac_impl_t *mip = (mac_impl_t *)mh;
2728 
2729 	ASSERT(MAC_PERIM_HELD(mh));
2730 
2731 	/* If it is mac property, call mac_set_resources() */
2732 	if (mac_is_macprop(macprop)) {
2733 		mac_resource_props_t mrp;
2734 
2735 		if (valsize < sizeof (mac_resource_props_t))
2736 			return (EINVAL);
2737 		bzero(&mrp, sizeof (mac_resource_props_t));
2738 		bcopy(val, &mrp, sizeof (mrp));
2739 		return (mac_set_resources(mh, &mrp));
2740 	}
2741 	switch (macprop->mp_id) {
2742 	case MAC_PROP_MTU: {
2743 		uint32_t mtu;
2744 
2745 		if (valsize < sizeof (mtu))
2746 			return (EINVAL);
2747 		bcopy(val, &mtu, sizeof (mtu));
2748 		err = mac_set_mtu(mh, mtu, NULL);
2749 		break;
2750 	}
2751 	default:
2752 		/* For other driver properties, call driver's callback */
2753 		if (mip->mi_callbacks->mc_callbacks & MC_SETPROP) {
2754 			err = mip->mi_callbacks->mc_setprop(mip->mi_driver,
2755 			    macprop->mp_name, macprop->mp_id, valsize, val);
2756 		}
2757 	}
2758 	return (err);
2759 }
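
/*
 * Illustrative caller-side sketch (the variable names are illustrative):
 * setting the MTU through the framework path handled above amounts to
 *
 *	uint32_t mtu = 1500;
 *	mac_prop_t prop;
 *
 *	prop.mp_id = MAC_PROP_MTU;
 *	err = mac_set_prop(mh, &prop, &mtu, sizeof (mtu));
 *
 * which lands in mac_set_mtu() rather than in the driver's mc_setprop()
 * callback.
 */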
2760 
2761 /*
2762  * mac_get_prop() gets mac or hardware driver properties.
2763  *
2764  * If the property is a driver property, mac_get_prop() calls driver's callback
2765  * function to get it.
2766  * If the property is a mac property, mac_get_prop() invokes mac_get_resources()
2767  * which returns the cached value in mac_impl_t.
2768  */
2769 int
2770 mac_get_prop(mac_handle_t mh, mac_prop_t *macprop, void *val, uint_t valsize,
2771     uint_t *perm)
2772 {
2773 	int err = ENOTSUP;
2774 	mac_impl_t *mip = (mac_impl_t *)mh;
2775 	uint32_t sdu;
2776 	link_state_t link_state;
2777 
2778 	/* If mac property, read from cache */
2779 	if (mac_is_macprop(macprop)) {
2780 		mac_resource_props_t mrp;
2781 
2782 		if (valsize < sizeof (mac_resource_props_t))
2783 			return (EINVAL);
2784 		bzero(&mrp, sizeof (mac_resource_props_t));
2785 		mac_get_resources(mh, &mrp);
2786 		bcopy(&mrp, val, sizeof (mac_resource_props_t));
2787 		return (0);
2788 	}
2789 
2790 	switch (macprop->mp_id) {
2791 	case MAC_PROP_MTU:
2792 		if (valsize < sizeof (sdu))
2793 			return (EINVAL);
2794 		if ((macprop->mp_flags & MAC_PROP_DEFAULT) == 0) {
2795 			mac_sdu_get(mh, NULL, &sdu);
2796 			bcopy(&sdu, val, sizeof (sdu));
2797 			if ((mip->mi_callbacks->mc_callbacks & MC_SETPROP) &&
2798 			    (mip->mi_callbacks->mc_setprop(mip->mi_driver,
2799 			    macprop->mp_name, macprop->mp_id, valsize,
2800 			    val) == 0)) {
2801 				*perm = MAC_PROP_PERM_RW;
2802 			} else {
2803 				*perm = MAC_PROP_PERM_READ;
2804 			}
2805 			return (0);
2806 		} else {
2807 			if (mip->mi_info.mi_media == DL_ETHER) {
2808 				sdu = ETHERMTU;
2809 				bcopy(&sdu, val, sizeof (sdu));
2810 
2811 				return (0);
2812 			}
2813 			/*
2814 			 * ask driver for its default.
2815 			 */
2816 			break;
2817 		}
2818 	case MAC_PROP_STATUS:
2819 		if (valsize < sizeof (link_state))
2820 			return (EINVAL);
2821 		*perm = MAC_PROP_PERM_READ;
2822 		link_state = mac_link_get(mh);
2823 		bcopy(&link_state, val, sizeof (link_state));
2824 		return (0);
2825 	default:
2826 		break;
2827 
2828 	}
2829 	/* If driver property, request from driver */
2830 	if (mip->mi_callbacks->mc_callbacks & MC_GETPROP) {
2831 		err = mip->mi_callbacks->mc_getprop(mip->mi_driver,
2832 		    macprop->mp_name, macprop->mp_id, macprop->mp_flags,
2833 		    valsize, val, perm);
2834 	}
2835 	return (err);
2836 }
2837 
2838 void
2839 mac_register_priv_prop(mac_impl_t *mip, mac_priv_prop_t *mpp, uint_t nprop)
2840 {
2841 	mac_priv_prop_t *mpriv;
2842 
2843 	if (mpp == NULL)
2844 		return;
2845 
2846 	mpriv = kmem_zalloc(nprop * sizeof (*mpriv), KM_SLEEP);
2847 	(void) memcpy(mpriv, mpp, nprop * sizeof (*mpriv));
2848 	mip->mi_priv_prop = mpriv;
2849 	mip->mi_priv_prop_count = nprop;
2850 }
2851 
2852 void
2853 mac_unregister_priv_prop(mac_impl_t *mip)
2854 {
2855 	mac_priv_prop_t	*mpriv;
2856 
2857 	mpriv = mip->mi_priv_prop;
2858 	if (mpriv != NULL) {
2859 		kmem_free(mpriv, mip->mi_priv_prop_count * sizeof (*mpriv));
2860 		mip->mi_priv_prop = NULL;
2861 	}
2862 	mip->mi_priv_prop_count = 0;
2863 }
2864 
2865 /*
 * mac_ring_t 'mr' macros. Some rogue drivers may access the ring structure
 * (by invoking mac_rx()) even after processing mac_stop_ring(). In such
 * cases, if MAC frees the ring structure after mac_stop_ring(), any
 * illegal access to the ring structure coming from the driver will panic
 * the system. In order to protect the system from such inadvertent access,
 * we maintain a cache of rings in the mac_impl_t after they get freed up.
 * When packets are received on freed-up rings, MAC (through the generation
 * count mechanism) will drop such packets.
2874  */
2875 static mac_ring_t *
2876 mac_ring_alloc(mac_impl_t *mip, mac_capab_rings_t *cap_rings)
2877 {
2878 	mac_ring_t *ring;
2879 
2880 	if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
2881 		mutex_enter(&mip->mi_ring_lock);
2882 		if (mip->mi_ring_freelist != NULL) {
2883 			ring = mip->mi_ring_freelist;
2884 			mip->mi_ring_freelist = ring->mr_next;
2885 			bzero(ring, sizeof (mac_ring_t));
2886 		} else {
2887 			ring = kmem_cache_alloc(mac_ring_cache, KM_SLEEP);
2888 		}
2889 		mutex_exit(&mip->mi_ring_lock);
2890 	} else {
2891 		ring = kmem_zalloc(sizeof (mac_ring_t), KM_SLEEP);
2892 	}
2893 	ASSERT((ring != NULL) && (ring->mr_state == MR_FREE));
2894 	return (ring);
2895 }
2896 
2897 static void
2898 mac_ring_free(mac_impl_t *mip, mac_ring_t *ring)
2899 {
2900 	if (ring->mr_type == MAC_RING_TYPE_RX) {
2901 		mutex_enter(&mip->mi_ring_lock);
2902 		ring->mr_state = MR_FREE;
2903 		ring->mr_flag = 0;
2904 		ring->mr_next = mip->mi_ring_freelist;
2905 		mip->mi_ring_freelist = ring;
2906 		mutex_exit(&mip->mi_ring_lock);
2907 	} else {
2908 		kmem_free(ring, sizeof (mac_ring_t));
2909 	}
2910 }
2911 
2912 static void
2913 mac_ring_freeall(mac_impl_t *mip)
2914 {
	mac_ring_t *ring, *ring_next;

	mutex_enter(&mip->mi_ring_lock);
	ring = mip->mi_ring_freelist;
2918 	while (ring != NULL) {
2919 		ring_next = ring->mr_next;
2920 		kmem_cache_free(mac_ring_cache, ring);
2921 		ring = ring_next;
2922 	}
2923 	mip->mi_ring_freelist = NULL;
2924 	mutex_exit(&mip->mi_ring_lock);
2925 }
2926 
2927 int
2928 mac_start_ring(mac_ring_t *ring)
2929 {
2930 	int rv = 0;
2931 
2932 	if (ring->mr_start != NULL)
2933 		rv = ring->mr_start(ring->mr_driver, ring->mr_gen_num);
2934 
2935 	return (rv);
2936 }
2937 
2938 void
2939 mac_stop_ring(mac_ring_t *ring)
2940 {
2941 	if (ring->mr_stop != NULL)
2942 		ring->mr_stop(ring->mr_driver);
2943 
2944 	/*
2945 	 * Increment the ring generation number for this ring.
2946 	 */
2947 	ring->mr_gen_num++;
2948 }
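
/*
 * Conceptual sketch of the generation count mechanism mentioned above:
 * the current mr_gen_num is handed to the driver through mr_start(), and
 * receive upcalls carrying a stale generation number (i.e. one issued
 * before the last mac_stop_ring()) are dropped in the Rx path, roughly:
 *
 *	if (gen_num_from_driver != ring->mr_gen_num)
 *		freemsgchain(mp_chain);		stale ring, drop
 */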
2949 
2950 int
2951 mac_start_group(mac_group_t *group)
2952 {
2953 	int rv = 0;
2954 
2955 	if (group->mrg_start != NULL)
2956 		rv = group->mrg_start(group->mrg_driver);
2957 
2958 	return (rv);
2959 }
2960 
2961 void
2962 mac_stop_group(mac_group_t *group)
2963 {
2964 	if (group->mrg_stop != NULL)
2965 		group->mrg_stop(group->mrg_driver);
2966 }
2967 
2968 /*
2969  * Called from mac_start() on the default Rx group. Broadcast and multicast
2970  * packets are received only on the default group. Hence the default group
2971  * needs to be up even if the primary client is not up, for the other groups
2972  * to be functional. We do this by calling this function at mac_start time
2973  * itself. However the broadcast packets that are received can't make their
2974  * way beyond mac_rx until a mac client creates a broadcast flow.
2975  */
2976 static int
2977 mac_start_group_and_rings(mac_group_t *group)
2978 {
2979 	mac_ring_t	*ring;
2980 	int		rv = 0;
2981 
2982 	ASSERT(group->mrg_state == MAC_GROUP_STATE_REGISTERED);
2983 	if ((rv = mac_start_group(group)) != 0)
2984 		return (rv);
2985 
2986 	for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
2987 		ASSERT(ring->mr_state == MR_FREE);
2988 		if ((rv = mac_start_ring(ring)) != 0)
2989 			goto error;
2990 		ring->mr_state = MR_INUSE;
2991 		ring->mr_classify_type = MAC_SW_CLASSIFIER;
2992 	}
2993 	return (0);
2994 
2995 error:
2996 	mac_stop_group_and_rings(group);
2997 	return (rv);
2998 }
2999 
3000 /* Called from mac_stop on the default Rx group */
3001 static void
3002 mac_stop_group_and_rings(mac_group_t *group)
3003 {
3004 	mac_ring_t	*ring;
3005 
3006 	for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
3007 		if (ring->mr_state != MR_FREE) {
3008 			mac_stop_ring(ring);
3009 			ring->mr_state = MR_FREE;
3010 			ring->mr_flag = 0;
3011 			ring->mr_classify_type = MAC_NO_CLASSIFIER;
3012 		}
3013 	}
3014 	mac_stop_group(group);
3015 }
3016 
3018 static mac_ring_t *
3019 mac_init_ring(mac_impl_t *mip, mac_group_t *group, int index,
3020     mac_capab_rings_t *cap_rings)
3021 {
3022 	mac_ring_t *ring;
3023 	mac_ring_info_t ring_info;
3024 
3025 	ring = mac_ring_alloc(mip, cap_rings);
3026 
3027 	/* Prepare basic information of ring */
3028 	ring->mr_index = index;
3029 	ring->mr_type = group->mrg_type;
3030 	ring->mr_gh = (mac_group_handle_t)group;
3031 
3032 	/* Insert the new ring to the list. */
3033 	ring->mr_next = group->mrg_rings;
3034 	group->mrg_rings = ring;
3035 
3036 	/* Zero to reuse the info data structure */
3037 	bzero(&ring_info, sizeof (ring_info));
3038 
3039 	/* Query ring information from driver */
3040 	cap_rings->mr_rget(mip->mi_driver, group->mrg_type, group->mrg_index,
3041 	    index, &ring_info, (mac_ring_handle_t)ring);
3042 
3043 	ring->mr_info = ring_info;
3044 
3045 	/* Update ring's status */
3046 	ring->mr_state = MR_FREE;
3047 	ring->mr_flag = 0;
3048 
3049 	/* Update the ring count of the group */
3050 	group->mrg_cur_count++;
3051 	return (ring);
3052 }
3053 
3054 /*
3055  * Rings are chained together for easy regrouping.
3056  */
3057 static void
3058 mac_init_group(mac_impl_t *mip, mac_group_t *group, int size,
3059     mac_capab_rings_t *cap_rings)
3060 {
3061 	int index;
3062 
3063 	/*
	 * Initialize all ring members of this group. A size of zero will not
	 * enter the loop, so it's safe to initialize an empty group.
3066 	 */
3067 	for (index = size - 1; index >= 0; index--)
3068 		(void) mac_init_ring(mip, group, index, cap_rings);
3069 }
3070 
3071 int
3072 mac_init_rings(mac_impl_t *mip, mac_ring_type_t rtype)
3073 {
3074 	mac_capab_rings_t *cap_rings;
3075 	mac_group_t *group, *groups;
3076 	mac_group_info_t group_info;
3077 	uint_t group_free = 0;
3078 	uint_t ring_left;
3079 	mac_ring_t *ring;
3080 	int g, err = 0;
3081 
3082 	switch (rtype) {
3083 	case MAC_RING_TYPE_RX:
3084 		ASSERT(mip->mi_rx_groups == NULL);
3085 
3086 		cap_rings = &mip->mi_rx_rings_cap;
3087 		cap_rings->mr_type = MAC_RING_TYPE_RX;
3088 		break;
3089 	case MAC_RING_TYPE_TX:
3090 		ASSERT(mip->mi_tx_groups == NULL);
3091 
3092 		cap_rings = &mip->mi_tx_rings_cap;
3093 		cap_rings->mr_type = MAC_RING_TYPE_TX;
3094 		break;
3095 	default:
3096 		ASSERT(B_FALSE);
3097 	}
3098 
3099 	if (!i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_RINGS,
3100 	    cap_rings))
3101 		return (0);
3102 
3103 	/*
3104 	 * Allocate a contiguous buffer for all groups.
3105 	 */
3106 	groups = kmem_zalloc(sizeof (mac_group_t) * (cap_rings->mr_gnum + 1),
3107 	    KM_SLEEP);
3108 
3109 	ring_left = cap_rings->mr_rnum;
3110 
3111 	/*
3112 	 * Get all ring groups if any, and get their ring members
3113 	 * if any.
3114 	 */
3115 	for (g = 0; g < cap_rings->mr_gnum; g++) {
3116 		group = groups + g;
3117 
3118 		/* Prepare basic information of the group */
3119 		group->mrg_index = g;
3120 		group->mrg_type = rtype;
3121 		group->mrg_state = MAC_GROUP_STATE_UNINIT;
3122 		group->mrg_mh = (mac_handle_t)mip;
3123 		group->mrg_next = group + 1;
3124 
3125 		/* Zero to reuse the info data structure */
3126 		bzero(&group_info, sizeof (group_info));
3127 
3128 		/* Query group information from driver */
3129 		cap_rings->mr_gget(mip->mi_driver, rtype, g, &group_info,
3130 		    (mac_group_handle_t)group);
3131 
3132 		switch (cap_rings->mr_group_type) {
3133 		case MAC_GROUP_TYPE_DYNAMIC:
3134 			if (cap_rings->mr_gaddring == NULL ||
3135 			    cap_rings->mr_gremring == NULL) {
3136 				DTRACE_PROBE3(
3137 				    mac__init__rings_no_addremring,
3138 				    char *, mip->mi_name,
3139 				    mac_group_add_ring_t,
3140 				    cap_rings->mr_gaddring,
3141 				    mac_group_add_ring_t,
3142 				    cap_rings->mr_gremring);
3143 				err = EINVAL;
3144 				goto bail;
3145 			}
3146 
3147 			switch (rtype) {
3148 			case MAC_RING_TYPE_RX:
3149 				/*
				 * The first RX group must have at least
				 * one ring, and the following groups must
				 * have no rings.
3153 				 */
3154 				if (g == 0 && group_info.mgi_count == 0) {
3155 					DTRACE_PROBE1(
3156 					    mac__init__rings__rx__def__zero,
3157 					    char *, mip->mi_name);
3158 					err = EINVAL;
3159 					goto bail;
3160 				}
3161 				if (g > 0 && group_info.mgi_count != 0) {
3162 					DTRACE_PROBE3(
3163 					    mac__init__rings__rx__nonzero,
3164 					    char *, mip->mi_name,
3165 					    int, g, int, group_info.mgi_count);
3166 					err = EINVAL;
3167 					goto bail;
3168 				}
3169 				break;
3170 			case MAC_RING_TYPE_TX:
3171 				/*
				 * All TX ring groups must have no rings.
3173 				 */
3174 				if (group_info.mgi_count != 0) {
3175 					DTRACE_PROBE3(
3176 					    mac__init__rings__tx__nonzero,
3177 					    char *, mip->mi_name,
3178 					    int, g, int, group_info.mgi_count);
3179 					err = EINVAL;
3180 					goto bail;
3181 				}
3182 				break;
3183 			}
3184 			break;
3185 		case MAC_GROUP_TYPE_STATIC:
3186 			/*
3187 			 * Note that an empty group is allowed, e.g., an aggr
3188 			 * would start with an empty group.
3189 			 */
3190 			break;
3191 		default:
3192 			/* unknown group type */
3193 			DTRACE_PROBE2(mac__init__rings__unknown__type,
3194 			    char *, mip->mi_name,
3195 			    int, cap_rings->mr_group_type);
3196 			err = EINVAL;
3197 			goto bail;
3198 		}
3199 
3201 		/*
3202 		 * Driver must register group->mgi_addmac/remmac() for rx groups
3203 		 * to support multiple MAC addresses.
3204 		 */
3205 		if (rtype == MAC_RING_TYPE_RX) {
			if ((group_info.mgi_addmac == NULL) ||
			    (group_info.mgi_remmac == NULL)) {
				err = EINVAL;
				goto bail;
			}
		}
3210 
3211 		/* Cache driver-supplied information */
3212 		group->mrg_info = group_info;
3213 
3214 		/* Update the group's status and group count. */
3215 		mac_set_rx_group_state(group, MAC_GROUP_STATE_REGISTERED);
3216 		group_free++;
3217 
3218 		group->mrg_rings = NULL;
3219 		group->mrg_cur_count = 0;
3220 		mac_init_group(mip, group, group_info.mgi_count, cap_rings);
3221 		ring_left -= group_info.mgi_count;
3222 
3223 		/* The current group size should be equal to default value */
3224 		ASSERT(group->mrg_cur_count == group_info.mgi_count);
3225 	}
3226 
3227 	/* Build up a dummy group for free resources as a pool */
3228 	group = groups + cap_rings->mr_gnum;
3229 
3230 	/* Prepare basic information of the group */
3231 	group->mrg_index = -1;
3232 	group->mrg_type = rtype;
3233 	group->mrg_state = MAC_GROUP_STATE_UNINIT;
3234 	group->mrg_mh = (mac_handle_t)mip;
3235 	group->mrg_next = NULL;
3236 
3237 	/*
	 * If there are ungrouped rings, initialize them and chain them on
	 * this dummy group.
3240 	 */
3241 	if (ring_left != 0) {
3242 		group->mrg_rings = NULL;
3243 		group->mrg_cur_count = 0;
3244 		mac_init_group(mip, group, ring_left, cap_rings);
3245 
3246 		/* The current group size should be equal to ring_left */
3247 		ASSERT(group->mrg_cur_count == ring_left);
3248 
3249 		ring_left = 0;
3250 
3251 		/* Update this group's status */
3252 		mac_set_rx_group_state(group, MAC_GROUP_STATE_REGISTERED);
3253 	} else
3254 		group->mrg_rings = NULL;
3255 
3256 	ASSERT(ring_left == 0);
3257 
3258 bail:
3259 	/* Cache other important information to finalize the initialization */
3260 	switch (rtype) {
3261 	case MAC_RING_TYPE_RX:
3262 		mip->mi_rx_group_type = cap_rings->mr_group_type;
3263 		mip->mi_rx_group_count = cap_rings->mr_gnum;
3264 		mip->mi_rx_groups = groups;
3265 		break;
3266 	case MAC_RING_TYPE_TX:
3267 		mip->mi_tx_group_type = cap_rings->mr_group_type;
3268 		mip->mi_tx_group_count = cap_rings->mr_gnum;
3269 		mip->mi_tx_group_free = group_free;
3270 		mip->mi_tx_groups = groups;
3271 
3272 		/*
3273 		 * Ring 0 is used as the default one and it could be assigned
3274 		 * to a client as well.
3275 		 */
3276 		group = groups + cap_rings->mr_gnum;
3277 		ring = group->mrg_rings;
3278 		while ((ring->mr_index != 0) && (ring->mr_next != NULL))
3279 			ring = ring->mr_next;
3280 		ASSERT(ring->mr_index == 0);
3281 		mip->mi_default_tx_ring = (mac_ring_handle_t)ring;
3282 		break;
3283 	default:
3284 		ASSERT(B_FALSE);
3285 	}
3286 
3287 	if (err != 0)
3288 		mac_free_rings(mip, rtype);
3289 
3290 	return (err);
3291 }
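
/*
 * For example, a driver advertising 2 Rx groups (mr_gnum) and 8 rings
 * (mr_rnum), 4 per group, ends up with groups[0] and groups[1] each
 * chaining 4 mac_ring_t's, and the dummy groups[2] (mrg_index == -1)
 * left empty. Had the driver grouped only 6 of the rings, the remaining
 * 2 would have been initialized onto the dummy group as the free pool.
 */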
3292 
3293 /*
 * Called to free all ring groups of a particular type. It is assumed that
 * all groups have been released by their clients.
3296  */
3297 void
3298 mac_free_rings(mac_impl_t *mip, mac_ring_type_t rtype)
3299 {
3300 	mac_group_t *group, *groups;
3301 	uint_t group_count;
3302 
3303 	switch (rtype) {
3304 	case MAC_RING_TYPE_RX:
3305 		if (mip->mi_rx_groups == NULL)
3306 			return;
3307 
3308 		groups = mip->mi_rx_groups;
3309 		group_count = mip->mi_rx_group_count;
3310 
3311 		mip->mi_rx_groups = NULL;
3312 		mip->mi_rx_group_count = 0;
3313 		break;
3314 	case MAC_RING_TYPE_TX:
3315 		ASSERT(mip->mi_tx_group_count == mip->mi_tx_group_free);
3316 
3317 		if (mip->mi_tx_groups == NULL)
3318 			return;
3319 
3320 		groups = mip->mi_tx_groups;
3321 		group_count = mip->mi_tx_group_count;
3322 
3323 		mip->mi_tx_groups = NULL;
3324 		mip->mi_tx_group_count = 0;
3325 		mip->mi_tx_group_free = 0;
3326 		mip->mi_default_tx_ring = NULL;
3327 		break;
3328 	default:
3329 		ASSERT(B_FALSE);
3330 	}
3331 
3332 	for (group = groups; group != NULL; group = group->mrg_next) {
3333 		mac_ring_t *ring;
3334 
3335 		if (group->mrg_cur_count == 0)
3336 			continue;
3337 
3338 		ASSERT(group->mrg_rings != NULL);
3339 
3340 		while ((ring = group->mrg_rings) != NULL) {
3341 			group->mrg_rings = ring->mr_next;
3342 			mac_ring_free(mip, ring);
3343 		}
3344 	}
3345 
3346 	/* Free all the cached rings */
3347 	mac_ring_freeall(mip);
	/* Free the block of group data structures */
3349 	kmem_free(groups, sizeof (mac_group_t) * (group_count + 1));
3350 }
3351 
3352 /*
3353  * Associate a MAC address with a receive group.
3354  *
3355  * The return value of this function should always be checked properly, because
3356  * any type of failure could cause unexpected results. A group can be added
 * any type of failure could cause unexpected results. A MAC address can be
 * added to or removed from a group only after the group has been reserved.
 * Ideally, a successful reservation always leads to calling
 * mac_group_addmac() to steer desired traffic. Failure to add a unicast
 * MAC address doesn't always imply that the group is functioning
 * abnormally.
3361  *
3362  * Currently this function is called everywhere, and it reflects assumptions
3363  * about MAC addresses in the implementation. CR 6735196.
3364  */
3365 int
3366 mac_group_addmac(mac_group_t *group, const uint8_t *addr)
3367 {
3368 	ASSERT(group->mrg_type == MAC_RING_TYPE_RX);
3369 	ASSERT(group->mrg_info.mgi_addmac != NULL);
3370 
3371 	return (group->mrg_info.mgi_addmac(group->mrg_info.mgi_driver, addr));
3372 }
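
/*
 * Sketch of the checking discipline described above (the caller and its
 * recovery action are illustrative):
 *
 *	if ((err = mac_group_addmac(group, mac_addr)) != 0) {
 *		release the group reservation and fall back to
 *		software classification on the default group;
 *	}
 */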
3373 
3374 /*
3375  * Remove the association between MAC address and receive group.
3376  */
3377 int
3378 mac_group_remmac(mac_group_t *group, const uint8_t *addr)
3379 {
3380 	ASSERT(group->mrg_type == MAC_RING_TYPE_RX);
3381 	ASSERT(group->mrg_info.mgi_remmac != NULL);
3382 
3383 	return (group->mrg_info.mgi_remmac(group->mrg_info.mgi_driver, addr));
3384 }
3385 
3386 /*
3387  * Release a ring in use by marking it MR_FREE.
3388  * Any other client may reserve it for its use.
3389  */
3390 void
3391 mac_release_tx_ring(mac_ring_handle_t rh)
3392 {
3393 	mac_ring_t *ring = (mac_ring_t *)rh;
3394 	mac_group_t *group = (mac_group_t *)ring->mr_gh;
3395 	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
3396 
3397 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
3398 	ASSERT(ring->mr_state != MR_FREE);
3399 
3400 	/*
3401 	 * Default tx ring will be released by mac_stop().
3402 	 */
3403 	if (rh == mip->mi_default_tx_ring)
3404 		return;
3405 
3406 	mac_stop_ring(ring);
3407 
3408 	ring->mr_state = MR_FREE;
3409 	ring->mr_flag = 0;
3410 }
3411 
3412 /*
3413  * Send packets through a selected tx ring.
3414  */
3415 mblk_t *
3416 mac_ring_tx(mac_ring_handle_t rh, mblk_t *mp)
3417 {
3418 	mac_ring_t *ring = (mac_ring_t *)rh;
3419 	mac_ring_info_t *info = &ring->mr_info;
3420 
3421 	ASSERT(ring->mr_type == MAC_RING_TYPE_TX);
3422 	ASSERT(ring->mr_state >= MR_INUSE);
3423 	ASSERT(info->mri_tx != NULL);
3424 
3425 	return (info->mri_tx(info->mri_driver, mp));
3426 }
3427 
3428 /*
3429  * Find a ring from its index.
3430  */
3431 mac_ring_t *
3432 mac_find_ring(mac_group_t *group, int index)
3433 {
	mac_ring_t *ring;
3435 
3436 	for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next)
3437 		if (ring->mr_index == index)
3438 			break;
3439 
3440 	return (ring);
3441 }
3442 /*
3443  * Add a ring to an existing group.
3444  *
3445  * The ring must be either passed directly (for example if the ring
3446  * movement is initiated by the framework), or specified through a driver
 * index (for example when the ring is added by the driver).
3448  *
3449  * The caller needs to call mac_perim_enter() before calling this function.
3450  */
3451 int
3452 i_mac_group_add_ring(mac_group_t *group, mac_ring_t *ring, int index)
3453 {
3454 	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
3455 	mac_capab_rings_t *cap_rings;
3456 	boolean_t driver_call = (ring == NULL);
3457 	mac_group_type_t group_type;
3458 	int ret = 0;
3459 
3460 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
3461 
3462 	switch (group->mrg_type) {
3463 	case MAC_RING_TYPE_RX:
3464 		cap_rings = &mip->mi_rx_rings_cap;
3465 		group_type = mip->mi_rx_group_type;
3466 		break;
3467 	case MAC_RING_TYPE_TX:
3468 		cap_rings = &mip->mi_tx_rings_cap;
3469 		group_type = mip->mi_tx_group_type;
3470 		break;
3471 	default:
3472 		ASSERT(B_FALSE);
3473 	}
3474 
3475 	/*
3476 	 * There should be no ring with the same ring index in the target
3477 	 * group.
3478 	 */
3479 	ASSERT(mac_find_ring(group, driver_call ? index : ring->mr_index) ==
3480 	    NULL);
3481 
3482 	if (driver_call) {
3483 		/*
3484 		 * The function is called as a result of a request from
3485 		 * a driver to add a ring to an existing group, for example
3486 		 * from the aggregation driver. Allocate a new mac_ring_t
3487 		 * for that ring.
3488 		 */
3489 		ring = mac_init_ring(mip, group, index, cap_rings);
3490 		ASSERT(group->mrg_state > MAC_GROUP_STATE_UNINIT);
3491 	} else {
3492 		/*
3493 		 * The function is called as a result of a MAC layer request
3494 		 * to add a ring to an existing group. In this case the
3495 		 * ring is being moved between groups, which requires
3496 		 * the underlying driver to support dynamic grouping,
3497 		 * and the mac_ring_t already exists.
3498 		 */
3499 		ASSERT(group_type == MAC_GROUP_TYPE_DYNAMIC);
3500 		ASSERT(cap_rings->mr_gaddring != NULL);
3501 		ASSERT(ring->mr_gh == NULL);
3502 	}
3503 
3504 	/*
3505 	 * At this point the ring should not be in use, and it should be
	 * of the right type for the target group.
3507 	 */
3508 	ASSERT(ring->mr_state < MR_INUSE);
3509 	ASSERT(ring->mr_srs == NULL);
3510 	ASSERT(ring->mr_type == group->mrg_type);
3511 
3512 	if (!driver_call) {
3513 		/*
3514 		 * Add the driver level hardware ring if the process was not
		 * initiated by the driver, and the target group is not the
		 * default group.
3517 		 */
3518 		if (group->mrg_driver != NULL) {
3519 			cap_rings->mr_gaddring(group->mrg_driver,
3520 			    ring->mr_driver, ring->mr_type);
3521 		}
3522 
3523 		/*
		 * Insert the ring ahead of the existing rings.
3525 		 */
3526 		ring->mr_next = group->mrg_rings;
3527 		group->mrg_rings = ring;
3528 		ring->mr_gh = (mac_group_handle_t)group;
3529 		group->mrg_cur_count++;
3530 	}
3531 
3532 	/*
3533 	 * If the group has not been actively used, we're done.
3534 	 */
3535 	if (group->mrg_index != -1 &&
3536 	    group->mrg_state < MAC_GROUP_STATE_RESERVED)
3537 		return (0);
3538 
3539 	/*
3540 	 * Set up SRS/SR according to the ring type.
3541 	 */
3542 	switch (ring->mr_type) {
3543 	case MAC_RING_TYPE_RX:
3544 		/*
		 * Set up an SRS on top of the new ring if the group is
		 * reserved for someone's exclusive use.
3547 		 */
3548 		if (group->mrg_state == MAC_GROUP_STATE_RESERVED) {
3549 			flow_entry_t *flent;
3550 			mac_client_impl_t *mcip;
3551 
3552 			mcip = MAC_RX_GROUP_ONLY_CLIENT(group);
3553 			ASSERT(mcip != NULL);
3554 			flent = mcip->mci_flent;
3555 			ASSERT(flent->fe_rx_srs_cnt > 0);
3556 			mac_srs_group_setup(mcip, flent, group, SRST_LINK);
3557 		}
3558 		break;
3559 	case MAC_RING_TYPE_TX:
3560 		/*
3561 		 * For TX this function is only invoked during the
3562 		 * initial creation of a group when a share is
3563 		 * associated with a MAC client. So the datapath is not
3564 		 * yet setup, and will be setup later after the
3565 		 * group has been reserved and populated.
3566 		 */
3567 		break;
3568 	default:
3569 		ASSERT(B_FALSE);
3570 	}
3571 
3572 	/*
	 * Start the ring if needed. On failure, undo the grouping action.
3574 	 */
3575 	if ((ret = mac_start_ring(ring)) != 0) {
3576 		if (ring->mr_type == MAC_RING_TYPE_RX) {
3577 			if (ring->mr_srs != NULL) {
3578 				mac_rx_srs_remove(ring->mr_srs);
3579 				ring->mr_srs = NULL;
3580 			}
3581 		}
3582 		if (!driver_call) {
3583 			cap_rings->mr_gremring(group->mrg_driver,
3584 			    ring->mr_driver, ring->mr_type);
3585 		}
3586 		group->mrg_cur_count--;
3587 		group->mrg_rings = ring->mr_next;
3588 
3589 		ring->mr_gh = NULL;
3590 
3591 		if (driver_call)
3592 			mac_ring_free(mip, ring);
3593 
3594 		return (ret);
3595 	}
3596 
3597 	/*
3598 	 * Update the ring's state.
3599 	 */
3600 	ring->mr_state = MR_INUSE;
3601 	MAC_RING_UNMARK(ring, MR_INCIPIENT);
3602 	return (0);
3603 }
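
/*
 * Illustrative sketch (not code in this file) of the two ways
 * i_mac_group_add_ring() is reached, inferred from the driver_call
 * flag above; the aggregation driver is one known driver-initiated
 * caller.
 *
 * Driver-initiated addition of a brand new ring:
 *	i_mac_group_add_ring(group, NULL, index);
 *		mac_init_ring()			allocate the mac_ring_t
 *
 * MAC-initiated move of an existing ring (dynamic grouping):
 *	i_mac_group_add_ring(group, ring, 0);
 *		cap_rings->mr_gaddring()	add the ring at the
 *						driver level
 */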
3604 
3605 /*
 * Remove a ring from its current group. MAC internal function for dynamic
3607  * grouping.
3608  *
3609  * The caller needs to call mac_perim_enter() before calling this function.
3610  */
3611 void
3612 i_mac_group_rem_ring(mac_group_t *group, mac_ring_t *ring,
3613     boolean_t driver_call)
3614 {
3615 	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
3616 	mac_capab_rings_t *cap_rings = NULL;
3617 	mac_group_type_t group_type;
3618 
3619 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
3620 
3621 	ASSERT(mac_find_ring(group, ring->mr_index) == ring);
3622 	ASSERT((mac_group_t *)ring->mr_gh == group);
3623 	ASSERT(ring->mr_type == group->mrg_type);
3624 
3625 	switch (ring->mr_type) {
3626 	case MAC_RING_TYPE_RX:
3627 		group_type = mip->mi_rx_group_type;
3628 		cap_rings = &mip->mi_rx_rings_cap;
3629 
3630 		if (group->mrg_state >= MAC_GROUP_STATE_RESERVED)
3631 			mac_stop_ring(ring);
3632 
3633 		/*
3634 		 * Only hardware classified packets hold a reference to the
3635 		 * ring all the way up the Rx path. mac_rx_srs_remove()
3636 		 * will take care of quiescing the Rx path and removing the
3637 		 * SRS. The software classified path neither holds a reference
3638 		 * nor any association with the ring in mac_rx.
3639 		 */
3640 		if (ring->mr_srs != NULL) {
3641 			mac_rx_srs_remove(ring->mr_srs);
3642 			ring->mr_srs = NULL;
3643 		}
3644 		ring->mr_state = MR_FREE;
3645 		ring->mr_flag = 0;
3646 
3647 		break;
3648 	case MAC_RING_TYPE_TX:
3649 		/*
3650 		 * For TX this function is only invoked in two
3651 		 * cases:
3652 		 *
3653 		 * 1) In the case of a failure during the
3654 		 * initial creation of a group when a share is
3655 		 * associated with a MAC client. So the SRS is not
3656 		 * yet setup, and will be setup later after the
3657 		 * group has been reserved and populated.
3658 		 *
3659 		 * 2) From mac_release_tx_group() when freeing
3660 		 * a TX SRS.
3661 		 *
3662 		 * In both cases the SRS and its soft rings are
3663 		 * already quiesced.
3664 		 */
3665 		ASSERT(!driver_call);
3666 		group_type = mip->mi_tx_group_type;
3667 		cap_rings = &mip->mi_tx_rings_cap;
3668 		break;
3669 	default:
3670 		ASSERT(B_FALSE);
3671 	}
3672 
3673 	/*
3674 	 * Remove the ring from the group.
3675 	 */
3676 	if (ring == group->mrg_rings)
3677 		group->mrg_rings = ring->mr_next;
3678 	else {
3679 		mac_ring_t *pre;
3680 
3681 		pre = group->mrg_rings;
3682 		while (pre->mr_next != ring)
3683 			pre = pre->mr_next;
3684 		pre->mr_next = ring->mr_next;
3685 	}
3686 	group->mrg_cur_count--;
3687 
3688 	if (!driver_call) {
3689 		ASSERT(group_type == MAC_GROUP_TYPE_DYNAMIC);
3690 		ASSERT(cap_rings->mr_gremring != NULL);
3691 
3692 		/*
3693 		 * Remove the driver level hardware ring.
3694 		 */
3695 		if (group->mrg_driver != NULL) {
3696 			cap_rings->mr_gremring(group->mrg_driver,
3697 			    ring->mr_driver, ring->mr_type);
3698 		}
3699 	}
3700 
3701 	ring->mr_gh = NULL;
3702 	if (driver_call) {
3703 		mac_ring_free(mip, ring);
3704 	} else {
3705 		ring->mr_state = MR_FREE;
3706 		ring->mr_flag = 0;
3707 	}
3708 }
3709 
3710 /*
3711  * Move a ring to the target group. If needed, remove the ring from the group
3712  * that it currently belongs to.
3713  *
 * The caller needs to enter MAC's perimeter by calling mac_perim_enter().
3715  */
3716 static int
3717 mac_group_mov_ring(mac_impl_t *mip, mac_group_t *d_group, mac_ring_t *ring)
3718 {
3719 	mac_group_t *s_group = (mac_group_t *)ring->mr_gh;
3720 	int rv;
3721 
3722 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
3723 	ASSERT(d_group != NULL);
3724 	ASSERT(s_group->mrg_mh == d_group->mrg_mh);
3725 
3726 	if (s_group == d_group)
3727 		return (0);
3728 
3729 	/*
3730 	 * Remove it from current group first.
3731 	 */
3732 	if (s_group != NULL)
3733 		i_mac_group_rem_ring(s_group, ring, B_FALSE);
3734 
3735 	/*
3736 	 * Add it to the new group.
3737 	 */
3738 	rv = i_mac_group_add_ring(d_group, ring, 0);
3739 	if (rv != 0) {
3740 		/*
		 * Failed to add the ring to the destination group;
		 * try to put it back in the source group. If that
		 * also fails, the ring is stuck in limbo; log a message.
3743 		 */
3744 		if (i_mac_group_add_ring(s_group, ring, 0)) {
3745 			cmn_err(CE_WARN, "%s: failed to move ring %p\n",
3746 			    mip->mi_name, (void *)ring);
3747 		}
3748 	}
3749 
3750 	return (rv);
3751 }
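
/*
 * Minimal usage sketch for mac_group_mov_ring(), assuming the caller
 * holds the perimeter (d_group is a hypothetical destination group):
 *
 *	if ((rv = mac_group_mov_ring(mip, d_group, ring)) != 0) {
 *		ring is back in its source group, unless the
 *		restore also failed (logged above)
 *	}
 */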
3752 
3753 /*
3754  * Find a MAC address according to its value.
3755  */
3756 mac_address_t *
3757 mac_find_macaddr(mac_impl_t *mip, uint8_t *mac_addr)
3758 {
3759 	mac_address_t *map;
3760 
3761 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
3762 
3763 	for (map = mip->mi_addresses; map != NULL; map = map->ma_next) {
3764 		if (bcmp(mac_addr, map->ma_addr, map->ma_len) == 0)
3765 			break;
3766 	}
3767 
3768 	return (map);
3769 }
3770 
3771 /*
3772  * Check whether the MAC address is shared by multiple clients.
3773  */
3774 boolean_t
3775 mac_check_macaddr_shared(mac_address_t *map)
3776 {
3777 	ASSERT(MAC_PERIM_HELD((mac_handle_t)map->ma_mip));
3778 
3779 	return (map->ma_nusers > 1);
3780 }
3781 
3782 /*
3783  * Remove the specified MAC address from the MAC address list and free it.
3784  */
3785 static void
3786 mac_free_macaddr(mac_address_t *map)
3787 {
3788 	mac_impl_t *mip = map->ma_mip;
3789 
3790 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
3791 	ASSERT(mip->mi_addresses != NULL);
3792 
3793 	map = mac_find_macaddr(mip, map->ma_addr);
3794 
3795 	ASSERT(map != NULL);
3796 	ASSERT(map->ma_nusers == 0);
3797 
3798 	if (map == mip->mi_addresses) {
3799 		mip->mi_addresses = map->ma_next;
3800 	} else {
3801 		mac_address_t *pre;
3802 
3803 		pre = mip->mi_addresses;
3804 		while (pre->ma_next != map)
3805 			pre = pre->ma_next;
3806 		pre->ma_next = map->ma_next;
3807 	}
3808 
3809 	kmem_free(map, sizeof (mac_address_t));
3810 }
3811 
3812 /*
3813  * Add a MAC address reference for a client. If the desired MAC address
3814  * exists, add a reference to it. Otherwise, add the new address by adding
 * it to a reserved group or setting promiscuous mode. Won't try a
 * different group if the given group is non-NULL, so the caller must
 * explicitly share the default group when needed.
 *
 * Note, the primary MAC address is initialized at registration time, so
 * adding it to the default group only requires activating it if its
 * reference count is still zero. Also, some drivers may not have
 * advertised the RINGS capability.
3823  */
3824 int
3825 mac_add_macaddr(mac_impl_t *mip, mac_group_t *group, uint8_t *mac_addr,
3826     boolean_t use_hw)
3827 {
3828 	mac_address_t *map;
3829 	int err = 0;
3830 	boolean_t allocated_map = B_FALSE;
3831 
3832 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
3833 
3834 	map = mac_find_macaddr(mip, mac_addr);
3835 
3836 	/*
	 * If the new MAC address has not been added yet, allocate a new
	 * entry and set it up.
3839 	 */
3840 	if (map == NULL) {
3841 		map = kmem_zalloc(sizeof (mac_address_t), KM_SLEEP);
3842 		map->ma_len = mip->mi_type->mt_addr_length;
3843 		bcopy(mac_addr, map->ma_addr, map->ma_len);
3844 		map->ma_nusers = 0;
3845 		map->ma_group = group;
3846 		map->ma_mip = mip;
3847 
3848 		/* add the new MAC address to the head of the address list */
3849 		map->ma_next = mip->mi_addresses;
3850 		mip->mi_addresses = map;
3851 
3852 		allocated_map = B_TRUE;
3853 	}
3854 
3855 	ASSERT(map->ma_group == group);
3856 
3857 	/*
3858 	 * If the MAC address is already in use, simply account for the
3859 	 * new client.
3860 	 */
3861 	if (map->ma_nusers++ > 0)
3862 		return (0);
3863 
3864 	/*
3865 	 * Activate this MAC address by adding it to the reserved group.
3866 	 */
3867 	if (group != NULL) {
3868 		err = mac_group_addmac(group, (const uint8_t *)mac_addr);
3869 		if (err == 0) {
3870 			map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
3871 			return (0);
3872 		}
3873 	}
3874 
3875 	/*
3876 	 * The MAC address addition failed. If the client requires a
3877 	 * hardware classified MAC address, fail the operation.
3878 	 */
3879 	if (use_hw) {
3880 		err = ENOSPC;
3881 		goto bail;
3882 	}
3883 
3884 	/*
3885 	 * Try promiscuous mode.
3886 	 *
3887 	 * For drivers that don't advertise RINGS capability, do
3888 	 * nothing for the primary address.
3889 	 */
3890 	if ((group == NULL) &&
3891 	    (bcmp(map->ma_addr, mip->mi_addr, map->ma_len) == 0)) {
3892 		map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
3893 		return (0);
3894 	}
3895 
3896 	/*
3897 	 * Enable promiscuous mode in order to receive traffic
3898 	 * to the new MAC address.
3899 	 */
3900 	if ((err = i_mac_promisc_set(mip, B_TRUE, MAC_DEVPROMISC)) == 0) {
3901 		map->ma_type = MAC_ADDRESS_TYPE_UNICAST_PROMISC;
3902 		return (0);
3903 	}
3904 
3905 	/*
3906 	 * Free the MAC address that could not be added. Don't free
3907 	 * a pre-existing address, it could have been the entry
3908 	 * for the primary MAC address which was pre-allocated by
3909 	 * mac_init_macaddr(), and which must remain on the list.
3910 	 */
3911 bail:
3912 	map->ma_nusers--;
3913 	if (allocated_map)
3914 		mac_free_macaddr(map);
3915 	return (err);
3916 }
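
/*
 * Sketch of the activation ladder implemented above for a non-primary
 * address; the two calls below illustrate the reference counting:
 *
 *	mac_add_macaddr(mip, grp, addr, B_FALSE)	first client
 *		ma_nusers 0 -> 1
 *		mac_group_addmac() -> UNICAST_CLASSIFIED on success,
 *		else fall back to DEVPROMISC -> UNICAST_PROMISC
 *	mac_add_macaddr(mip, grp, addr, B_FALSE)	second client
 *		ma_nusers 1 -> 2, nothing else to do
 */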
3917 
3918 /*
 * Remove a reference to a MAC address. This may cause the MAC address to
 * be removed from its associated group, or promiscuous mode to be turned off.
3921  * The caller needs to handle the failure properly.
3922  */
3923 int
3924 mac_remove_macaddr(mac_address_t *map)
3925 {
3926 	mac_impl_t *mip = map->ma_mip;
3927 	int err = 0;
3928 
3929 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
3930 
3931 	ASSERT(map == mac_find_macaddr(mip, map->ma_addr));
3932 
3933 	/*
3934 	 * If it's not the last client using this MAC address, only update
3935 	 * the MAC clients count.
3936 	 */
3937 	if (--map->ma_nusers > 0)
3938 		return (0);
3939 
3940 	/*
3941 	 * The MAC address is no longer used by any MAC client, so remove
3942 	 * it from its associated group, or turn off promiscuous mode
3943 	 * if it was enabled for the MAC address.
3944 	 */
3945 	switch (map->ma_type) {
3946 	case MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED:
3947 		/*
3948 		 * Don't free the preset primary address for drivers that
3949 		 * don't advertise RINGS capability.
3950 		 */
3951 		if (map->ma_group == NULL)
3952 			return (0);
3953 
3954 		err = mac_group_remmac(map->ma_group, map->ma_addr);
3955 		break;
3956 	case MAC_ADDRESS_TYPE_UNICAST_PROMISC:
3957 		err = i_mac_promisc_set(mip, B_FALSE, MAC_DEVPROMISC);
3958 		break;
3959 	default:
3960 		ASSERT(B_FALSE);
3961 	}
3962 
3963 	if (err != 0)
3964 		return (err);
3965 
3966 	/*
3967 	 * We created MAC address for the primary one at registration, so we
3968 	 * won't free it here. mac_fini_macaddr() will take care of it.
3969 	 */
3970 	if (bcmp(map->ma_addr, mip->mi_addr, map->ma_len) != 0)
3971 		mac_free_macaddr(map);
3972 
3973 	return (0);
3974 }
3975 
3976 /*
 * Update an existing MAC address. The caller needs to make sure that the
 * new value is not already in use.
3979  */
3980 int
3981 mac_update_macaddr(mac_address_t *map, uint8_t *mac_addr)
3982 {
3983 	mac_impl_t *mip = map->ma_mip;
3984 	int err = 0;
3985 
3986 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
3987 	ASSERT(mac_find_macaddr(mip, mac_addr) == NULL);
3988 
3989 	switch (map->ma_type) {
3990 	case MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED:
3991 		/*
3992 		 * Update the primary address for drivers that are not
3993 		 * RINGS capable.
3994 		 */
3995 		if (map->ma_group == NULL) {
3996 			err = mip->mi_unicst(mip->mi_driver, (const uint8_t *)
3997 			    mac_addr);
3998 			if (err != 0)
3999 				return (err);
4000 			break;
4001 		}
4002 
4003 		/*
4004 		 * If this MAC address is not currently in use,
4005 		 * simply break out and update the value.
4006 		 */
4007 		if (map->ma_nusers == 0)
4008 			break;
4009 
4010 		/*
4011 		 * Need to replace the MAC address associated with a group.
4012 		 */
4013 		err = mac_group_remmac(map->ma_group, map->ma_addr);
4014 		if (err != 0)
4015 			return (err);
4016 
4017 		err = mac_group_addmac(map->ma_group, mac_addr);
4018 
4019 		/*
		 * Failure hints at a hardware error. The MAC layer needs
		 * an error notification facility to handle this properly.
		 * For now, simply try to restore the old value.
4023 		 */
4024 		if (err != 0)
4025 			(void) mac_group_addmac(map->ma_group, map->ma_addr);
4026 
4027 		break;
4028 	case MAC_ADDRESS_TYPE_UNICAST_PROMISC:
4029 		/*
		 * Nothing more needs to be done if in promiscuous mode.
4031 		 */
4032 		break;
4033 	default:
4034 		ASSERT(B_FALSE);
4035 	}
4036 
4037 	/*
4038 	 * Successfully replaced the MAC address.
4039 	 */
4040 	if (err == 0)
4041 		bcopy(mac_addr, map->ma_addr, map->ma_len);
4042 
4043 	return (err);
4044 }
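
/*
 * The hardware-classified branch above is a remove/add swap with a
 * best-effort rollback; schematically:
 *
 *	mac_group_remmac(group, old_addr);
 *	if (mac_group_addmac(group, new_addr) != 0)
 *		(void) mac_group_addmac(group, old_addr);	restore
 */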
4045 
4046 /*
 * Freshen the MAC address with a new value. The caller must have updated
 * the hardware MAC address before calling this function.
 * This function is meant to handle MAC address change notifications
 * from underlying drivers.
4051  */
4052 void
4053 mac_freshen_macaddr(mac_address_t *map, uint8_t *mac_addr)
4054 {
4055 	mac_impl_t *mip = map->ma_mip;
4056 
4057 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4058 	ASSERT(mac_find_macaddr(mip, mac_addr) == NULL);
4059 
4060 	/*
4061 	 * Freshen the MAC address with new value.
4062 	 */
4063 	bcopy(mac_addr, map->ma_addr, map->ma_len);
4064 	bcopy(mac_addr, mip->mi_addr, map->ma_len);
4065 
4066 	/*
4067 	 * Update all MAC clients that share this MAC address.
4068 	 */
4069 	mac_unicast_update_clients(mip, map);
4070 }
4071 
4072 /*
4073  * Set up the primary MAC address.
4074  */
4075 void
4076 mac_init_macaddr(mac_impl_t *mip)
4077 {
4078 	mac_address_t *map;
4079 
4080 	/*
4081 	 * The reference count is initialized to zero, until it's really
4082 	 * activated.
4083 	 */
4084 	map = kmem_zalloc(sizeof (mac_address_t), KM_SLEEP);
4085 	map->ma_len = mip->mi_type->mt_addr_length;
4086 	bcopy(mip->mi_addr, map->ma_addr, map->ma_len);
4087 
4088 	/*
4089 	 * If driver advertises RINGS capability, it shouldn't have initialized
4090 	 * its primary MAC address. For other drivers, including VNIC, the
4091 	 * primary address must work after registration.
4092 	 */
4093 	if (mip->mi_rx_groups == NULL)
4094 		map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
4095 
4096 	/*
4097 	 * The primary MAC address is reserved for default group according
4098 	 * to current design.
4099 	 */
4100 	map->ma_group = mip->mi_rx_groups;
4101 	map->ma_mip = mip;
4102 
4103 	mip->mi_addresses = map;
4104 }
4105 
4106 /*
4107  * Clean up the primary MAC address. Note, only one primary MAC address
4108  * is allowed. All other MAC addresses must have been freed appropriately.
4109  */
4110 void
4111 mac_fini_macaddr(mac_impl_t *mip)
4112 {
4113 	mac_address_t *map = mip->mi_addresses;
4114 
4115 	if (map == NULL)
4116 		return;
4117 
4118 	/*
4119 	 * If mi_addresses is initialized, there should be exactly one
4120 	 * entry left on the list with no users.
4121 	 */
4122 	ASSERT(map->ma_nusers == 0);
4123 	ASSERT(map->ma_next == NULL);
4124 
4125 	kmem_free(map, sizeof (mac_address_t));
4126 	mip->mi_addresses = NULL;
4127 }
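
/*
 * Lifetime of the primary MAC address entry, per the functions above
 * (the registration entry points live elsewhere in this file):
 *
 *	registration	mac_init_macaddr()	pre-allocate, ma_nusers 0
 *	client up	mac_add_macaddr()	ma_nusers 0 -> 1, activate
 *	client down	mac_remove_macaddr()	ma_nusers 1 -> 0, deactivate
 *						(entry stays on the list)
 *	unregistration	mac_fini_macaddr()	free the entry
 */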
4128 
4129 /*
4130  * Logging related functions.
4131  */
4132 
4133 /* Write the Flow description to the log file */
4134 int
4135 mac_write_flow_desc(flow_entry_t *flent, mac_client_impl_t *mcip)
4136 {
4137 	flow_desc_t		*fdesc;
4138 	mac_resource_props_t	*mrp;
4139 	net_desc_t		ndesc;
4140 
4141 	bzero(&ndesc, sizeof (net_desc_t));
4142 
4143 	/*
4144 	 * Grab the fe_lock to see a self-consistent fe_flow_desc.
	 * Updates to the fe_flow_desc are done under the fe_lock.
4146 	 */
4147 	mutex_enter(&flent->fe_lock);
4148 	fdesc = &flent->fe_flow_desc;
4149 	mrp = &flent->fe_resource_props;
4150 
4151 	ndesc.nd_name = flent->fe_flow_name;
4152 	ndesc.nd_devname = mcip->mci_name;
4153 	bcopy(fdesc->fd_src_mac, ndesc.nd_ehost, ETHERADDRL);
4154 	bcopy(fdesc->fd_dst_mac, ndesc.nd_edest, ETHERADDRL);
4155 	ndesc.nd_sap = htonl(fdesc->fd_sap);
4156 	ndesc.nd_isv4 = (uint8_t)fdesc->fd_ipversion == IPV4_VERSION;
4157 	ndesc.nd_bw_limit = mrp->mrp_maxbw;
4158 	if (ndesc.nd_isv4) {
4159 		ndesc.nd_saddr[3] = htonl(fdesc->fd_local_addr.s6_addr32[3]);
4160 		ndesc.nd_daddr[3] = htonl(fdesc->fd_remote_addr.s6_addr32[3]);
4161 	} else {
4162 		bcopy(&fdesc->fd_local_addr, ndesc.nd_saddr, IPV6_ADDR_LEN);
4163 		bcopy(&fdesc->fd_remote_addr, ndesc.nd_daddr, IPV6_ADDR_LEN);
4164 	}
4165 	ndesc.nd_sport = htons(fdesc->fd_local_port);
4166 	ndesc.nd_dport = htons(fdesc->fd_remote_port);
4167 	ndesc.nd_protocol = (uint8_t)fdesc->fd_protocol;
4168 	mutex_exit(&flent->fe_lock);
4169 
4170 	return (exacct_commit_netinfo((void *)&ndesc, EX_NET_FLDESC_REC));
4171 }
4172 
4173 /* Write the Flow statistics to the log file */
4174 int
4175 mac_write_flow_stats(flow_entry_t *flent)
4176 {
4177 	flow_stats_t	*fl_stats;
4178 	net_stat_t	nstat;
4179 
4180 	fl_stats = &flent->fe_flowstats;
4181 	nstat.ns_name = flent->fe_flow_name;
4182 	nstat.ns_ibytes = fl_stats->fs_rbytes;
4183 	nstat.ns_obytes = fl_stats->fs_obytes;
4184 	nstat.ns_ipackets = fl_stats->fs_ipackets;
4185 	nstat.ns_opackets = fl_stats->fs_opackets;
4186 	nstat.ns_ierrors = fl_stats->fs_ierrors;
4187 	nstat.ns_oerrors = fl_stats->fs_oerrors;
4188 
4189 	return (exacct_commit_netinfo((void *)&nstat, EX_NET_FLSTAT_REC));
4190 }
4191 
4192 /* Write the Link Description to the log file */
4193 int
4194 mac_write_link_desc(mac_client_impl_t *mcip)
4195 {
4196 	net_desc_t		ndesc;
4197 	flow_entry_t		*flent = mcip->mci_flent;
4198 
4199 	bzero(&ndesc, sizeof (net_desc_t));
4200 
4201 	ndesc.nd_name = mcip->mci_name;
4202 	ndesc.nd_devname = mcip->mci_name;
4203 	ndesc.nd_isv4 = B_TRUE;
4204 	/*
4205 	 * Grab the fe_lock to see a self-consistent fe_flow_desc.
4206 	 * Updates to the fe_flow_desc are done under the fe_lock
4207 	 * after removing the flent from the flow table.
4208 	 */
4209 	mutex_enter(&flent->fe_lock);
4210 	bcopy(flent->fe_flow_desc.fd_src_mac, ndesc.nd_ehost, ETHERADDRL);
4211 	mutex_exit(&flent->fe_lock);
4212 
4213 	return (exacct_commit_netinfo((void *)&ndesc, EX_NET_LNDESC_REC));
4214 }
4215 
4216 /* Write the Link statistics to the log file */
4217 int
4218 mac_write_link_stats(mac_client_impl_t *mcip)
4219 {
4220 	net_stat_t	nstat;
4221 
4222 	nstat.ns_name = mcip->mci_name;
4223 	nstat.ns_ibytes = mcip->mci_stat_ibytes;
4224 	nstat.ns_obytes = mcip->mci_stat_obytes;
4225 	nstat.ns_ipackets = mcip->mci_stat_ipackets;
4226 	nstat.ns_opackets = mcip->mci_stat_opackets;
4227 	nstat.ns_ierrors = mcip->mci_stat_ierrors;
4228 	nstat.ns_oerrors = mcip->mci_stat_oerrors;
4229 
4230 	return (exacct_commit_netinfo((void *)&nstat, EX_NET_LNSTAT_REC));
4231 }
4232 
4233 /*
 * For a given flow, if the description has not been logged before, do it now.
4235  * If it is a VNIC, then we have collected information about it from the MAC
4236  * table, so skip it.
4237  */
4238 /*ARGSUSED*/
4239 static int
4240 mac_log_flowinfo(flow_entry_t *flent, void *args)
4241 {
4242 	mac_client_impl_t	*mcip = flent->fe_mcip;
4243 
4244 	if (mcip == NULL)
4245 		return (0);
4246 
4247 	/*
	 * If the name starts with "vnic", and the FLOW_USER bit is set (to
	 * exclude the mcast and active flow entries created implicitly for
	 * a vnic), it is a VNIC flow.  i.e. vnic1 is a VNIC flow,
4251 	 * vnic/bge1/mcast1 is not and neither is vnic/bge1/active.
4252 	 */
4253 	if (strncasecmp(flent->fe_flow_name, "vnic", 4) == 0 &&
4254 	    (flent->fe_type & FLOW_USER) != 0) {
4255 		return (0);
4256 	}
4257 
4258 	if (!flent->fe_desc_logged) {
4259 		/*
		 * We don't return an error because we want to continue the
		 * walk in case this is the last walk, which means we
4262 		 * need to reset fe_desc_logged in all the flows.
4263 		 */
4264 		if (mac_write_flow_desc(flent, mcip) != 0)
4265 			return (0);
4266 		flent->fe_desc_logged = B_TRUE;
4267 	}
4268 
4269 	/*
4270 	 * Regardless of the error, we want to proceed in case we have to
4271 	 * reset fe_desc_logged.
4272 	 */
4273 	(void) mac_write_flow_stats(flent);
4274 
4275 	if (mcip != NULL && !(mcip->mci_state_flags & MCIS_DESC_LOGGED))
4276 		flent->fe_desc_logged = B_FALSE;
4277 
4278 	return (0);
4279 }
4280 
4281 typedef struct i_mac_log_state_s {
4282 	boolean_t	mi_last;
4283 	int		mi_fenable;
4284 	int		mi_lenable;
4285 } i_mac_log_state_t;
4286 
4287 /*
4288  * Walk the mac_impl_ts and log the description for each mac client of this mac,
4289  * if it hasn't already been done. Additionally, log statistics for the link as
4290  * well. Walk the flow table and log information for each flow as well.
 * If it is the last walk (mi_last), then we turn off MCIS_DESC_LOGGED (and
4292  * also fe_desc_logged, if flow logging is on) since we want to log the
4293  * description if and when logging is restarted.
4294  */
4295 /*ARGSUSED*/
4296 static uint_t
4297 i_mac_log_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
4298 {
4299 	mac_impl_t		*mip = (mac_impl_t *)val;
4300 	i_mac_log_state_t	*lstate = (i_mac_log_state_t *)arg;
4301 	int			ret;
4302 	mac_client_impl_t	*mcip;
4303 
4304 	/*
4305 	 * Only walk the client list for NIC and etherstub
4306 	 */
4307 	if ((mip->mi_state_flags & MIS_DISABLED) ||
4308 	    ((mip->mi_state_flags & MIS_IS_VNIC) &&
4309 	    (mac_get_lower_mac_handle((mac_handle_t)mip) != NULL)))
4310 		return (MH_WALK_CONTINUE);
4311 
4312 	for (mcip = mip->mi_clients_list; mcip != NULL;
4313 	    mcip = mcip->mci_client_next) {
4314 		if (!MCIP_DATAPATH_SETUP(mcip))
4315 			continue;
4316 		if (lstate->mi_lenable) {
4317 			if (!(mcip->mci_state_flags & MCIS_DESC_LOGGED)) {
4318 				ret = mac_write_link_desc(mcip);
4319 				if (ret != 0) {
4320 				/*
4321 				 * We can't terminate it if this is the last
4322 				 * walk, else there might be some links with
4323 				 * mi_desc_logged set to true, which means
4324 				 * their description won't be logged the next
4325 				 * time logging is started (similarly for the
4326 				 * flows within such links). We can continue
4327 				 * without walking the flow table (i.e. to
4328 				 * set fe_desc_logged to false) because we
4329 				 * won't have written any flow stuff for this
4330 				 * link as we haven't logged the link itself.
4331 				 */
4332 					if (lstate->mi_last)
4333 						return (MH_WALK_CONTINUE);
4334 					else
4335 						return (MH_WALK_TERMINATE);
4336 				}
4337 				mcip->mci_state_flags |= MCIS_DESC_LOGGED;
4338 			}
4339 		}
4340 
4341 		if (mac_write_link_stats(mcip) != 0 && !lstate->mi_last)
4342 			return (MH_WALK_TERMINATE);
4343 
4344 		if (lstate->mi_last)
4345 			mcip->mci_state_flags &= ~MCIS_DESC_LOGGED;
4346 
4347 		if (lstate->mi_fenable) {
4348 			if (mcip->mci_subflow_tab != NULL) {
4349 				(void) mac_flow_walk(mcip->mci_subflow_tab,
4350 				    mac_log_flowinfo, mip);
4351 			}
4352 		}
4353 	}
4354 	return (MH_WALK_CONTINUE);
4355 }
4356 
4357 /*
4358  * The timer thread that runs every mac_logging_interval seconds and logs
4359  * link and/or flow information.
4360  */
4361 /* ARGSUSED */
4362 void
4363 mac_log_linkinfo(void *arg)
4364 {
4365 	i_mac_log_state_t	lstate;
4366 
4367 	rw_enter(&i_mac_impl_lock, RW_READER);
4368 	if (!mac_flow_log_enable && !mac_link_log_enable) {
4369 		rw_exit(&i_mac_impl_lock);
4370 		return;
4371 	}
4372 	lstate.mi_fenable = mac_flow_log_enable;
4373 	lstate.mi_lenable = mac_link_log_enable;
4374 	lstate.mi_last = B_FALSE;
4375 	rw_exit(&i_mac_impl_lock);
4376 
4377 	mod_hash_walk(i_mac_impl_hash, i_mac_log_walker, &lstate);
4378 
4379 	rw_enter(&i_mac_impl_lock, RW_WRITER);
4380 	if (mac_flow_log_enable || mac_link_log_enable) {
4381 		mac_logging_timer = timeout(mac_log_linkinfo, NULL,
4382 		    SEC_TO_TICK(mac_logging_interval));
4383 	}
4384 	rw_exit(&i_mac_impl_lock);
4385 }
4386 
4387 /*
4388  * Start the logging timer.
4389  */
4390 void
4391 mac_start_logusage(mac_logtype_t type, uint_t interval)
4392 {
4393 	rw_enter(&i_mac_impl_lock, RW_WRITER);
4394 	switch (type) {
4395 	case MAC_LOGTYPE_FLOW:
4396 		if (mac_flow_log_enable) {
4397 			rw_exit(&i_mac_impl_lock);
4398 			return;
4399 		}
4400 		mac_flow_log_enable = B_TRUE;
4401 		/* FALLTHRU */
4402 	case MAC_LOGTYPE_LINK:
4403 		if (mac_link_log_enable) {
4404 			rw_exit(&i_mac_impl_lock);
4405 			return;
4406 		}
4407 		mac_link_log_enable = B_TRUE;
4408 		break;
4409 	default:
4410 		ASSERT(0);
4411 	}
4412 	mac_logging_interval = interval;
4413 	rw_exit(&i_mac_impl_lock);
4414 	mac_log_linkinfo(NULL);
4415 }
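
/*
 * A hypothetical consumer (e.g. the usage accounting path) would
 * bracket logging as below; the 20 second interval is illustrative:
 *
 *	mac_start_logusage(MAC_LOGTYPE_LINK, 20);
 *	...
 *	mac_stop_logusage(MAC_LOGTYPE_LINK);
 */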
4416 
4417 /*
4418  * Stop the logging timer if both Link and Flow logging are turned off.
4419  */
4420 void
4421 mac_stop_logusage(mac_logtype_t type)
4422 {
4423 	i_mac_log_state_t	lstate;
4424 
4425 	rw_enter(&i_mac_impl_lock, RW_WRITER);
4426 	lstate.mi_fenable = mac_flow_log_enable;
4427 	lstate.mi_lenable = mac_link_log_enable;
4428 
4429 	/* Last walk */
4430 	lstate.mi_last = B_TRUE;
4431 
4432 	switch (type) {
4433 	case MAC_LOGTYPE_FLOW:
4434 		if (lstate.mi_fenable) {
4435 			ASSERT(mac_link_log_enable);
4436 			mac_flow_log_enable = B_FALSE;
4437 			mac_link_log_enable = B_FALSE;
4438 			break;
4439 		}
4440 		/* FALLTHRU */
4441 	case MAC_LOGTYPE_LINK:
4442 		if (!lstate.mi_lenable || mac_flow_log_enable) {
4443 			rw_exit(&i_mac_impl_lock);
4444 			return;
4445 		}
4446 		mac_link_log_enable = B_FALSE;
4447 		break;
4448 	default:
4449 		ASSERT(0);
4450 	}
4451 	rw_exit(&i_mac_impl_lock);
4452 	(void) untimeout(mac_logging_timer);
4453 	mac_logging_timer = 0;
4454 
4455 	/* Last walk */
4456 	mod_hash_walk(i_mac_impl_hash, i_mac_log_walker, &lstate);
4457 }
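
/*
 * Note that the final walk above runs with mi_last set, which clears
 * MCIS_DESC_LOGGED (and fe_desc_logged) so that descriptions are
 * written again if and when logging is restarted.
 */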
4458 
4459 /*
4460  * Walk the rx and tx SRS/SRs for a flow and update the priority value.
4461  */
4462 void
4463 mac_flow_update_priority(mac_client_impl_t *mcip, flow_entry_t *flent)
4464 {
4465 	pri_t			pri;
4466 	int			count;
4467 	mac_soft_ring_set_t	*mac_srs;
4468 
4469 	if (flent->fe_rx_srs_cnt <= 0)
4470 		return;
4471 
4472 	if (((mac_soft_ring_set_t *)flent->fe_rx_srs[0])->srs_type ==
4473 	    SRST_FLOW) {
4474 		pri = FLOW_PRIORITY(mcip->mci_min_pri,
4475 		    mcip->mci_max_pri,
4476 		    flent->fe_resource_props.mrp_priority);
4477 	} else {
4478 		pri = mcip->mci_max_pri;
4479 	}
4480 
4481 	for (count = 0; count < flent->fe_rx_srs_cnt; count++) {
4482 		mac_srs = flent->fe_rx_srs[count];
4483 		mac_update_srs_priority(mac_srs, pri);
4484 	}
4485 	/*
4486 	 * If we have a Tx SRS, we need to modify all the threads associated
4487 	 * with it.
4488 	 */
4489 	if (flent->fe_tx_srs != NULL)
4490 		mac_update_srs_priority(flent->fe_tx_srs, pri);
4491 }
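
/*
 * Priority selection above, schematically, assuming FLOW_PRIORITY()
 * maps the configured priority into the client's allowed range:
 *
 *	user-defined flow (SRST_FLOW):
 *		pri = FLOW_PRIORITY(mci_min_pri, mci_max_pri, mrp_priority)
 *	link's own flow:
 *		pri = mci_max_pri
 *
 * The same pri is then applied to every Rx SRS and to the Tx SRS.
 */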
4492 
4493 /*
4494  * RX and TX rings are reserved according to different semantics depending
4495  * on the requests from the MAC clients and type of rings:
4496  *
4497  * On the Tx side, by default we reserve individual rings, independently from
4498  * the groups.
4499  *
4500  * On the Rx side, the reservation is at the granularity of the group
4501  * of rings, and used for v12n level 1 only. It has a special case for the
4502  * primary client.
4503  *
4504  * If a share is allocated to a MAC client, we allocate a TX group and an
4505  * RX group to the client, and assign TX rings and RX rings to these
4506  * groups according to information gathered from the driver through
4507  * the share capability.
4508  *
 * The foreseeable evolution of Rx rings will handle v12n level 2 and higher
4510  * to allocate individual rings out of a group and program the hw classifier
4511  * based on IP address or higher level criteria.
4512  */
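
/*
 * Share-based allocation, schematically; the entry points below
 * appear later in this file:
 *
 *	mac_reserve_tx_group(mip, share)
 *		i_mac_group_allocate_rings(..., MAC_RING_TYPE_TX, ...)
 *			ms_squery()	find the rings backing the share
 *			ms_sadd()	bind the new group to the share
 *
 *	mac_reserve_rx_group(mcip, mac_addr, rtype)
 *		i_mac_group_allocate_rings(..., MAC_RING_TYPE_RX, ...)
 */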
4513 
4514 /*
4515  * mac_reserve_tx_ring()
 * Reserve an unused ring by marking it with the MR_INUSE state.
 * Once reserved, the ring is ready to function.
4518  *
4519  * Notes for Hybrid I/O:
4520  *
4521  * If a specific ring is needed, it is specified through the desired_ring
4522  * argument. Otherwise that argument is set to NULL.
 * If the desired ring was previously allocated to another client, this
4524  * function swaps it with a new ring from the group of unassigned rings.
4525  */
4526 mac_ring_t *
4527 mac_reserve_tx_ring(mac_impl_t *mip, mac_ring_t *desired_ring)
4528 {
4529 	mac_group_t *group;
4530 	mac_ring_t *ring;
4531 
4532 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4533 
4534 	if (mip->mi_tx_groups == NULL)
4535 		return (NULL);
4536 
4537 	/*
4538 	 * Find an available ring and start it before changing its status.
4539 	 * The unassigned rings are at the end of the mi_tx_groups
4540 	 * array.
4541 	 */
4542 	group = mip->mi_tx_groups + mip->mi_tx_group_count;
4543 
4544 	for (ring = group->mrg_rings; ring != NULL;
4545 	    ring = ring->mr_next) {
4546 		if (desired_ring == NULL) {
4547 			if (ring->mr_state == MR_FREE)
4548 				/* wanted any free ring and found one */
4549 				break;
4550 		} else {
4551 			mac_ring_t *sring;
4552 			mac_client_impl_t *client;
4553 			mac_soft_ring_set_t *srs;
4554 
4555 			if (ring != desired_ring)
4556 				/* wants a desired ring but this one ain't it */
4557 				continue;
4558 
4559 			if (ring->mr_state == MR_FREE)
4560 				break;
4561 
4562 			/*
4563 			 * Found the desired ring but it's already in use.
4564 			 * Swap it with a new ring.
4565 			 */
4566 
4567 			/* find the client which owns that ring */
4568 			for (client = mip->mi_clients_list; client != NULL;
4569 			    client = client->mci_client_next) {
4570 				srs = MCIP_TX_SRS(client);
4571 				if (srs != NULL && mac_tx_srs_ring_present(srs,
4572 				    desired_ring)) {
4573 					/* found our ring */
4574 					break;
4575 				}
4576 			}
4577 			if (client == NULL) {
4578 				/*
4579 				 * The TX ring is in use, but it's not
4580 				 * associated with any clients, so it
4581 				 * has to be the default ring. In that
4582 				 * case we can simply assign a new ring
4583 				 * as the default ring, and we're done.
4584 				 */
4585 				ASSERT(mip->mi_default_tx_ring ==
4586 				    (mac_ring_handle_t)desired_ring);
4587 
4588 				/*
4589 				 * Quiesce all clients on top of
4590 				 * the NIC to make sure there are no
4591 				 * pending threads still relying on
4592 				 * that default ring, for example
4593 				 * the multicast path.
4594 				 */
4595 				for (client = mip->mi_clients_list;
4596 				    client != NULL;
4597 				    client = client->mci_client_next) {
4598 					mac_tx_client_quiesce(client,
4599 					    SRS_QUIESCE);
4600 				}
4601 
4602 				mip->mi_default_tx_ring = (mac_ring_handle_t)
4603 				    mac_reserve_tx_ring(mip, NULL);
4604 
4605 				/* resume the clients */
4606 				for (client = mip->mi_clients_list;
4607 				    client != NULL;
4608 				    client = client->mci_client_next)
4609 					mac_tx_client_restart(client);
4610 
4611 				break;
4612 			}
4613 
4614 			/*
4615 			 * Note that we cannot simply invoke the group
4616 			 * add/rem routines since the client doesn't have a
4617 			 * TX group. So we need to instead add/remove
4618 			 * the rings from the SRS.
4619 			 */
4620 			ASSERT(client->mci_share == NULL);
4621 
			/* first quiesce the client */
4623 			mac_tx_client_quiesce(client, SRS_QUIESCE);
4624 
4625 			/* give a new ring to the client... */
4626 			sring = mac_reserve_tx_ring(mip, NULL);
4627 			if (sring != NULL) {
				/*
				 * If no other ring is available on that
				 * MAC instance (sring == NULL), the client
				 * will fall back to the shared TX ring.
				 */
4634 				mac_tx_srs_add_ring(srs, sring);
4635 			}
4636 
4637 			/* ... in exchange for our desired ring */
4638 			mac_tx_srs_del_ring(srs, desired_ring);
4639 
4640 			/* restart the client */
4641 			mac_tx_client_restart(client);
4642 
4643 			if (mip->mi_default_tx_ring ==
4644 			    (mac_ring_handle_t)desired_ring) {
4645 				/*
4646 				 * The desired ring is the default ring,
4647 				 * and there are one or more clients
4648 				 * using that default ring directly.
4649 				 */
4650 				mip->mi_default_tx_ring =
4651 				    (mac_ring_handle_t)sring;
4652 				/*
4653 				 * Find clients using default ring and
4654 				 * swap it with the new default ring.
4655 				 */
4656 				for (client = mip->mi_clients_list;
4657 				    client != NULL;
4658 				    client = client->mci_client_next) {
4659 					srs = MCIP_TX_SRS(client);
4660 					if (srs != NULL &&
4661 					    mac_tx_srs_ring_present(srs,
4662 					    desired_ring)) {
						/* first quiesce the client */
4664 						mac_tx_client_quiesce(client,
4665 						    SRS_QUIESCE);
4666 
4667 						/*
4668 						 * Give it the new default
4669 						 * ring, and remove the old
4670 						 * one.
4671 						 */
4672 						if (sring != NULL) {
4673 							mac_tx_srs_add_ring(srs,
4674 							    sring);
4675 						}
4676 						mac_tx_srs_del_ring(srs,
4677 						    desired_ring);
4678 
4679 						/* restart the client */
4680 						mac_tx_client_restart(client);
4681 					}
4682 				}
4683 			}
4684 			break;
4685 		}
4686 	}
4687 
4688 	if (ring != NULL) {
4689 		if (mac_start_ring(ring) != 0)
4690 			return (NULL);
4691 		ring->mr_state = MR_INUSE;
4692 	}
4693 
4694 	return (ring);
4695 }
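
/*
 * Usage sketch: callers either take any free ring or name a specific
 * one (perimeter held in both cases):
 *
 *	ring = mac_reserve_tx_ring(mip, NULL);		any free ring
 *	ring = mac_reserve_tx_ring(mip, wanted);	that ring, swapping
 *							out its current user
 */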
4696 
4697 /*
 * Minimum number of rings to leave in the default RX group when allocating
4699  * rings to new clients.
4700  */
4701 static uint_t mac_min_rx_default_rings = 1;
4702 
4703 /*
4704  * Populate a zero-ring group with rings. If the share is non-NULL,
4705  * the rings are chosen according to that share.
4706  * Invoked after allocating a new RX or TX group through
4707  * mac_reserve_rx_group() or mac_reserve_tx_group(), respectively.
4708  * Returns zero on success, an errno otherwise.
4709  */
4710 int
4711 i_mac_group_allocate_rings(mac_impl_t *mip, mac_ring_type_t ring_type,
4712     mac_group_t *src_group, mac_group_t *new_group, mac_share_handle_t share)
4713 {
4714 	mac_ring_t **rings, *tmp_ring[1], *ring;
4715 	uint_t nrings;
4716 	int rv, i, j;
4717 
4718 	ASSERT(mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC &&
4719 	    mip->mi_tx_group_type == MAC_GROUP_TYPE_DYNAMIC);
4720 	ASSERT(new_group->mrg_cur_count == 0);
4721 
4722 	/*
4723 	 * First find the rings to allocate to the group.
4724 	 */
4725 	if (share != NULL) {
4726 		/* get rings through ms_squery() */
4727 		mip->mi_share_capab.ms_squery(share, ring_type, NULL, &nrings);
4728 		ASSERT(nrings != 0);
4729 		rings = kmem_alloc(nrings * sizeof (mac_ring_handle_t),
4730 		    KM_SLEEP);
4731 		mip->mi_share_capab.ms_squery(share, ring_type,
4732 		    (mac_ring_handle_t *)rings, &nrings);
4733 	} else {
		/* for TX this function is only called with a share */
4735 		ASSERT(ring_type == MAC_RING_TYPE_RX);
4736 		/*
4737 		 * Pick one ring from default group.
4738 		 *
4739 		 * for now pick the second ring which requires the first ring
4740 		 * at index 0 to stay in the default group, since it is the
4741 		 * ring which carries the multicast traffic.
4742 		 * We need a better way for a driver to indicate this,
4743 		 * for example a per-ring flag.
4744 		 */
4745 		for (ring = src_group->mrg_rings; ring != NULL;
4746 		    ring = ring->mr_next) {
4747 			if (ring->mr_index != 0)
4748 				break;
4749 		}
4750 		ASSERT(ring != NULL);
4751 		nrings = 1;
4752 		tmp_ring[0] = ring;
4753 		rings = tmp_ring;
4754 	}
4755 
4756 	switch (ring_type) {
4757 	case MAC_RING_TYPE_RX:
		if (src_group->mrg_cur_count - nrings <
		    mac_min_rx_default_rings) {
			/* we ran out of rings; free the temporary array */
			if (share != NULL) {
				kmem_free(rings,
				    nrings * sizeof (mac_ring_handle_t));
			}
			return (ENOSPC);
4762 		}
4763 
4764 		/* move receive rings to new group */
4765 		for (i = 0; i < nrings; i++) {
4766 			rv = mac_group_mov_ring(mip, new_group, rings[i]);
4767 			if (rv != 0) {
4768 				/* move rings back on failure */
4769 				for (j = 0; j < i; j++) {
4770 					(void) mac_group_mov_ring(mip,
4771 					    src_group, rings[j]);
				}
				if (share != NULL) {
					kmem_free(rings, nrings *
					    sizeof (mac_ring_handle_t));
				}
				return (rv);
4774 			}
4775 		}
4776 		break;
4777 
4778 	case MAC_RING_TYPE_TX: {
4779 		mac_ring_t *tmp_ring;
4780 
4781 		/* move the TX rings to the new group */
4782 		ASSERT(src_group == NULL);
4783 		for (i = 0; i < nrings; i++) {
4784 			/* get the desired ring */
4785 			tmp_ring = mac_reserve_tx_ring(mip, rings[i]);
4786 			ASSERT(tmp_ring == rings[i]);
4787 			rv = mac_group_mov_ring(mip, new_group, rings[i]);
4788 			if (rv != 0) {
4789 				/* cleanup on failure */
4790 				for (j = 0; j < i; j++) {
4791 					(void) mac_group_mov_ring(mip,
4792 					    mip->mi_tx_groups +
4793 					    mip->mi_tx_group_count, rings[j]);
				}
				if (share != NULL) {
					kmem_free(rings, nrings *
					    sizeof (mac_ring_handle_t));
				}
				return (rv);
			}
4796 		}
4797 		break;
4798 	}
4799 	}
4800 
4801 	if (share != NULL) {
4802 		/* add group to share */
4803 		mip->mi_share_capab.ms_sadd(share, new_group->mrg_driver);
4804 		/* free temporary array of rings */
4805 		kmem_free(rings, nrings * sizeof (mac_ring_handle_t));
4806 	}
4807 
4808 	return (0);
4809 }
4810 
4811 void
4812 mac_rx_group_add_client(mac_group_t *grp, mac_client_impl_t *mcip)
4813 {
4814 	mac_grp_client_t *mgcp;
4815 
4816 	for (mgcp = grp->mrg_clients; mgcp != NULL; mgcp = mgcp->mgc_next) {
4817 		if (mgcp->mgc_client == mcip)
4818 			break;
4819 	}
4820 
4821 	VERIFY(mgcp == NULL);
4822 
4823 	mgcp = kmem_zalloc(sizeof (mac_grp_client_t), KM_SLEEP);
4824 	mgcp->mgc_client = mcip;
4825 	mgcp->mgc_next = grp->mrg_clients;
4826 	grp->mrg_clients = mgcp;
4828 }
4829 
4830 void
4831 mac_rx_group_remove_client(mac_group_t *grp, mac_client_impl_t *mcip)
4832 {
4833 	mac_grp_client_t *mgcp, **pprev;
4834 
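	/*
	 * Walk the list keeping a pointer to the previous link (pprev)
	 * so the matching entry can be unlinked in place.
	 */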
4835 	for (pprev = &grp->mrg_clients, mgcp = *pprev; mgcp != NULL;
4836 	    pprev = &mgcp->mgc_next, mgcp = *pprev) {
4837 		if (mgcp->mgc_client == mcip)
4838 			break;
4839 	}
4840 
4841 	ASSERT(mgcp != NULL);
4842 
4843 	*pprev = mgcp->mgc_next;
4844 	kmem_free(mgcp, sizeof (mac_grp_client_t));
4845 }
4846 
4847 /*
4848  * mac_reserve_rx_group()
4849  *
4850  * Finds an available group and exclusively reserves it for a client.
4851  * The group is chosen to suit the flow's resource controls (bandwidth and
4852  * fanout requirements) and the address type.
 * If the requestor is the primary MAC then return the group with the
 * largest number of rings, otherwise the default group when available.
4855  */
4856 mac_group_t *
4857 mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr,
4858     mac_rx_group_reserve_type_t rtype)
4859 {
4860 	mac_share_handle_t	share = mcip->mci_share;
4861 	mac_impl_t		*mip = mcip->mci_mip;
4862 	mac_group_t		*grp = NULL;
4863 	int			i, start, loopcount;
4864 	int			err;
4865 	mac_address_t		*map;
4866 
4867 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4868 
4869 	/* Check if a group already has this mac address (case of VLANs) */
4870 	if ((map = mac_find_macaddr(mip, mac_addr)) != NULL)
4871 		return (map->ma_group);
4872 
4873 	if (mip->mi_rx_groups == NULL || mip->mi_rx_group_count == 0 ||
4874 	    rtype == MAC_RX_NO_RESERVE)
4875 		return (NULL);
4876 
4877 	/*
4878 	 * Try to exclusively reserve a RX group.
4879 	 *
	 * For flows that require SW_RING, always use the default group
	 * (until we can explicitly call out default groups (CR 6695600),
	 * we assume that the default group is always at position zero);
	 *
	 * For flows that require HW_DEFAULT_RING (unicast flow of the primary
	 * client), try to reserve the default RX group only.
	 *
	 * For flows that require HW_RING (unicast flows of other clients), try
	 * to reserve a non-default RX group first, then the default group.
4889 	 */
4890 	switch (rtype) {
4891 	case MAC_RX_RESERVE_DEFAULT:
4892 		start = 0;
4893 		loopcount = 1;
4894 		break;
4895 	case MAC_RX_RESERVE_NONDEFAULT:
4896 		start = 1;
		loopcount = mip->mi_rx_group_count;
		break;
4898 	}
4899 
4900 	for (i = start; i < start + loopcount; i++) {
4901 		grp = &mip->mi_rx_groups[i % mip->mi_rx_group_count];
4902 
4903 		DTRACE_PROBE3(rx__group__trying, char *, mip->mi_name,
4904 		    int, grp->mrg_index, mac_group_state_t, grp->mrg_state);
4905 
4906 		/*
4907 		 * Check to see whether this mac client is the only client
4908 		 * on this RX group. If not, we cannot exclusively reserve
4909 		 * this RX group.
4910 		 */
4911 		if (!MAC_RX_GROUP_NO_CLIENT(grp) &&
4912 		    (MAC_RX_GROUP_ONLY_CLIENT(grp) != mcip)) {
4913 			continue;
4914 		}
4915 
4916 		/*
4917 		 * This group could already be SHARED by other multicast
4918 		 * flows on this client. In that case, the group would
4919 		 * be shared and has already been started.
4920 		 */
4921 		ASSERT(grp->mrg_state != MAC_GROUP_STATE_UNINIT);
4922 
4923 		if ((grp->mrg_state == MAC_GROUP_STATE_REGISTERED) &&
4924 		    (mac_start_group(grp) != 0)) {
4925 			continue;
4926 		}
4927 
4928 		if ((i % mip->mi_rx_group_count) == 0 ||
4929 		    mip->mi_rx_group_type != MAC_GROUP_TYPE_DYNAMIC) {
4930 			break;
4931 		}
4932 
4933 		ASSERT(grp->mrg_cur_count == 0);
4934 
4935 		/*
4936 		 * Populate the group. Rings should be taken
4937 		 * from the default group at position 0 for now.
4938 		 */
4939 
4940 		err = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_RX,
4941 		    &mip->mi_rx_groups[0], grp, share);
4942 		if (err == 0)
4943 			break;
4944 
4945 		DTRACE_PROBE3(rx__group__reserve__alloc__rings, char *,
4946 		    mip->mi_name, int, grp->mrg_index, int, err);
4947 
4948 		/*
4949 		 * It's a dynamic group but the grouping operation failed.
4950 		 */
4951 		mac_stop_group(grp);
4952 	}
4953 
4954 	if (i == start + loopcount)
4955 		return (NULL);
4956 
4957 	ASSERT(grp != NULL);
4958 
4959 	DTRACE_PROBE2(rx__group__reserved,
4960 	    char *, mip->mi_name, int, grp->mrg_index);
4961 	return (grp);
4962 }
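
/*
 * Scan order note: with rtype == MAC_RX_RESERVE_NONDEFAULT and N
 * groups, the modulo walk above visits indices 1, 2, ... N-1, 0, so
 * the default group at position zero is tried last.
 */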
4963 
4964 /*
 * mac_release_rx_group()
 *
 * This is called when there are no clients left for the group.
 * The group is stopped and marked MAC_GROUP_STATE_REGISTERED,
 * and if it is a non-default group, the shares are removed and
 * all rings are assigned back to the default group.
4971  */
4972 void
4973 mac_release_rx_group(mac_client_impl_t *mcip, mac_group_t *group)
4974 {
4975 	mac_impl_t	*mip = mcip->mci_mip;
4976 	mac_ring_t	*ring;
4977 
4978 	ASSERT(group != &mip->mi_rx_groups[0]);
4979 
4980 	/*
4981 	 * This is the case where there are no clients left. Any
	 * SRS etc. on this group have also been quiesced.
4983 	 */
4984 	for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
4985 		if (ring->mr_classify_type == MAC_HW_CLASSIFIER) {
4986 			ASSERT(group->mrg_state == MAC_GROUP_STATE_RESERVED);
4987 			/*
4988 			 * Remove the SRS associated with the HW ring.
4989 			 * As a result, polling will be disabled.
4990 			 */
4991 			ring->mr_srs = NULL;
4992 		}
4993 		ASSERT(ring->mr_state == MR_INUSE);
4994 		mac_stop_ring(ring);
4995 		ring->mr_state = MR_FREE;
4996 		ring->mr_flag = 0;
4997 	}
4998 
4999 	/* remove group from share */
5000 	if (mcip->mci_share != NULL) {
5001 		mip->mi_share_capab.ms_sremove(mcip->mci_share,
5002 		    group->mrg_driver);
5003 	}
5004 
5005 	if (mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
5006 		mac_ring_t *ring;
5007 
5008 		/*
5009 		 * Rings were dynamically allocated to group.
5010 		 * Move rings back to default group.
5011 		 */
5012 		while ((ring = group->mrg_rings) != NULL) {
5013 			(void) mac_group_mov_ring(mip,
5014 			    &mip->mi_rx_groups[0], ring);
5015 		}
5016 	}
5017 	mac_stop_group(group);
5018 	/*
5019 	 * Possible improvement: See if we can assign the group just released
	 * to another client of the mip.
5021 	 */
5022 }
5023 
5024 /*
5025  * Reserves a TX group for the specified share. Invoked by mac_tx_srs_setup()
5026  * when a share was allocated to the client.
5027  */
5028 mac_group_t *
5029 mac_reserve_tx_group(mac_impl_t *mip, mac_share_handle_t share)
5030 {
5031 	mac_group_t *grp;
5032 	int rv, i;
5033 
5034 	/*
5035 	 * TX groups are currently allocated only to MAC clients
5036 	 * which are associated with a share. Since we have a fixed
 * number of shares and groups, and we already successfully
5038 	 * allocated a share, find an available TX group.
5039 	 */
5040 	ASSERT(share != NULL);
5041 	ASSERT(mip->mi_tx_group_free > 0);
5042 
	for (i = 0; i < mip->mi_tx_group_count; i++) {
5044 		grp = &mip->mi_tx_groups[i];
5045 
5046 		if ((grp->mrg_state == MAC_GROUP_STATE_RESERVED) ||
5047 		    (grp->mrg_state == MAC_GROUP_STATE_UNINIT))
5048 			continue;
5049 
5050 		rv = mac_start_group(grp);
5051 		ASSERT(rv == 0);
5052 
5053 		grp->mrg_state = MAC_GROUP_STATE_RESERVED;
5054 		break;
5055 	}
5056 
5057 	ASSERT(grp != NULL);
5058 
5059 	/*
5060 	 * Populate the group. Rings should be taken from the group
5061 	 * of unassigned rings, which is past the array of TX
 * groups advertised by the driver.
5063 	 */
5064 	rv = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_TX, NULL,
5065 	    grp, share);
5066 	if (rv != 0) {
5067 		DTRACE_PROBE3(tx__group__reserve__alloc__rings,
5068 		    char *, mip->mi_name, int, grp->mrg_index, int, rv);
5069 
5070 		mac_stop_group(grp);
5071 		grp->mrg_state = MAC_GROUP_STATE_UNINIT;
5072 
5073 		return (NULL);
5074 	}
5075 
5076 	mip->mi_tx_group_free--;
5077 
5078 	return (grp);
5079 }
5080 
5081 void
5082 mac_release_tx_group(mac_impl_t *mip, mac_group_t *grp)
5083 {
5084 	mac_client_impl_t *mcip = grp->mrg_tx_client;
5085 	mac_share_handle_t share = mcip->mci_share;
5086 	mac_ring_t *ring;
5087 
5088 	ASSERT(mip->mi_tx_group_type == MAC_GROUP_TYPE_DYNAMIC);
5089 	ASSERT(share != NULL);
5090 	ASSERT(grp->mrg_state == MAC_GROUP_STATE_RESERVED);
5091 
5092 	mip->mi_share_capab.ms_sremove(share, grp->mrg_driver);
5093 	while ((ring = grp->mrg_rings) != NULL) {
5094 		/* move the ring back to the pool */
5095 		(void) mac_group_mov_ring(mip, mip->mi_tx_groups +
5096 		    mip->mi_tx_group_count, ring);
5097 	}
5098 	mac_stop_group(grp);
5099 	mac_set_rx_group_state(grp, MAC_GROUP_STATE_REGISTERED);
5100 	grp->mrg_tx_client = NULL;
5101 	mip->mi_tx_group_free++;
5102 }
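
/*
 * mac_reserve_tx_group() and mac_release_tx_group() bracket the life
 * of a share's TX group; schematically:
 *
 *	grp = mac_reserve_tx_group(mip, share);		rings moved in,
 *							ms_sadd()
 *	...
 *	mac_release_tx_group(mip, grp);			ms_sremove(), rings
 *							back to the pool
 */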
5103 
5104 /*
5105  * This is a 1-time control path activity initiated by the client (IP).
5106  * The mac perimeter protects against other simultaneous control activities,
5107  * for example an ioctl that attempts to change the degree of fanout and
5108  * increase or decrease the number of softrings associated with this Tx SRS.
5109  */
5110 static mac_tx_notify_cb_t *
5111 mac_client_tx_notify_add(mac_client_impl_t *mcip,
5112     mac_tx_notify_t notify, void *arg)
5113 {
5114 	mac_cb_info_t *mcbi;
5115 	mac_tx_notify_cb_t *mtnfp;
5116 
5117 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
5118 
5119 	mtnfp = kmem_zalloc(sizeof (mac_tx_notify_cb_t), KM_SLEEP);
5120 	mtnfp->mtnf_fn = notify;
5121 	mtnfp->mtnf_arg = arg;
5122 	mtnfp->mtnf_link.mcb_objp = mtnfp;
5123 	mtnfp->mtnf_link.mcb_objsize = sizeof (mac_tx_notify_cb_t);
5124 	mtnfp->mtnf_link.mcb_flags = MCB_TX_NOTIFY_CB_T;
5125 
5126 	mcbi = &mcip->mci_tx_notify_cb_info;
5127 	mutex_enter(mcbi->mcbi_lockp);
5128 	mac_callback_add(mcbi, &mcip->mci_tx_notify_cb_list, &mtnfp->mtnf_link);
5129 	mutex_exit(mcbi->mcbi_lockp);
5130 	return (mtnfp);
5131 }
5132 
5133 static void
5134 mac_client_tx_notify_remove(mac_client_impl_t *mcip, mac_tx_notify_cb_t *mtnfp)
5135 {
5136 	mac_cb_info_t	*mcbi;
5137 	mac_cb_t	**cblist;
5138 
5139 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
5140 
5141 	if (!mac_callback_find(&mcip->mci_tx_notify_cb_info,
5142 	    &mcip->mci_tx_notify_cb_list, &mtnfp->mtnf_link)) {
5143 		cmn_err(CE_WARN,
5144 		    "mac_client_tx_notify_remove: callback not "
5145 		    "found, mcip 0x%p mtnfp 0x%p", (void *)mcip, (void *)mtnfp);
5146 		return;
5147 	}
5148 
5149 	mcbi = &mcip->mci_tx_notify_cb_info;
5150 	cblist = &mcip->mci_tx_notify_cb_list;
5151 	mutex_enter(mcbi->mcbi_lockp);
5152 	if (mac_callback_remove(mcbi, cblist, &mtnfp->mtnf_link))
5153 		kmem_free(mtnfp, sizeof (mac_tx_notify_cb_t));
5154 	else
5155 		mac_callback_remove_wait(&mcip->mci_tx_notify_cb_info);
5156 	mutex_exit(mcbi->mcbi_lockp);
5157 }
5158 
5159 /*
5160  * mac_client_tx_notify():
 * Called to add or remove a flow control callback routine.
5162  */
5163 mac_tx_notify_handle_t
5164 mac_client_tx_notify(mac_client_handle_t mch, mac_tx_notify_t callb_func,
5165     void *ptr)
5166 {
5167 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
5168 	mac_tx_notify_cb_t	*mtnfp = NULL;
5169 
5170 	i_mac_perim_enter(mcip->mci_mip);
5171 
5172 	if (callb_func != NULL) {
5173 		/* Add a notify callback */
5174 		mtnfp = mac_client_tx_notify_add(mcip, callb_func, ptr);
5175 	} else {
5176 		mac_client_tx_notify_remove(mcip, (mac_tx_notify_cb_t *)ptr);
5177 	}
5178 	i_mac_perim_exit(mcip->mci_mip);
5179 
5180 	return ((mac_tx_notify_handle_t)mtnfp);
5181 }
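
/*
 * A hypothetical client adds and later removes a flow control callback
 * as below (my_tx_notify and my_arg are illustrative names); per the
 * code above, passing a NULL callback removes the handle given in ptr:
 *
 *	mac_tx_notify_handle_t h;
 *
 *	h = mac_client_tx_notify(mch, my_tx_notify, my_arg);
 *	...
 *	(void) mac_client_tx_notify(mch, NULL, (void *)h);
 */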
5182