xref: /titanic_51/usr/src/uts/common/io/mac/mac.c (revision e6fda97ba28fe3b5aaba49c030a73272719c7cba)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * MAC Services Module
29  *
30  * The GLDv3 framework locking -  The MAC layer
31  * --------------------------------------------
32  *
33  * The MAC layer is central to the GLD framework and can provide the locking
34  * framework needed for itself and for the use of MAC clients. MAC end points
35  * are fairly disjoint and don't share a lot of state. So a coarse grained
36  * multi-threading scheme is to single thread all create/modify/delete or set
37  * type of control operations on a per mac end point while allowing data threads
38  * concurrently.
39  *
40  * Control operations (set) that modify a mac end point are always serialized on
41  * a per mac end point basis. We have at most 1 such thread per mac end point
42  * at a time.
43  *
44  * All other operations that are not serialized are essentially multi-threaded.
45  * For example a control operation (get) like getting statistics which may not
46  * care about reading values atomically or data threads sending or receiving
47  * data. Mostly these type of operations don't modify the control state. Any
48  * state these operations care about are protected using traditional locks.
49  *
50  * The perimeter only serializes serial operations. It does not imply there
51  * aren't any other concurrent operations. However a serialized operation may
52  * sometimes need to make sure it is the only thread. In this case it needs
53  * to use reference counting mechanisms to cv_wait until any current data
54  * threads are done.
55  *
56  * The mac layer itself does not hold any locks across a call to another layer.
57  * The perimeter is however held across a down call to the driver to make the
58  * whole control operation atomic with respect to other control operations.
59  * Also the data path and get type control operations may proceed concurrently.
60  * These operations synchronize with the single serial operation on a given mac
61  * end point using regular locks. The perimeter ensures that conflicting
62  * operations like say a mac_multicast_add and a mac_multicast_remove on the
63  * same mac end point don't interfere with each other and also ensures that the
64  * changes in the mac layer and the call to the underlying driver to say add a
65  * multicast address are done atomically without interference from a thread
66  * trying to delete the same address.
67  *
68  * For example, consider
69  * mac_multicast_add()
70  * {
71  *	mac_perimeter_enter();	serialize all control operations
72  *
73  *	grab list lock		protect against access by data threads
74  *	add to list
75  *	drop list lock
76  *
77  *	call driver's mi_multicst
78  *
79  *	mac_perimeter_exit();
80  * }
81  *
82  * To lessen the number of serialization locks and simplify the lock hierarchy,
83  * we serialize all the control operations on a per mac end point by using a
84  * single serialization lock called the perimeter. We allow recursive entry into
85  * the perimeter to facilitate use of this mechanism by both the mac client and
86  * the MAC layer itself.
87  *
88  * MAC client means an entity that does an operation on a mac handle
89  * obtained from a mac_open/mac_client_open. Similarly MAC driver means
90  * an entity that does an operation on a mac handle obtained from a
91  * mac_register. An entity could be both client and driver but on different
92  * handles eg. aggr. and should only make the corresponding mac interface calls
93  * i.e. mac driver interface or mac client interface as appropriate for that
94  * mac handle.
95  *
96  * General rules.
97  * -------------
98  *
99  * R1. The lock order of upcall threads is naturally opposite to downcall
100  * threads. Hence upcalls must not hold any locks across layers for fear of
101  * recursive lock enter and lock order violation. This applies to all layers.
102  *
103  * R2. The perimeter is just another lock. Since it is held in the down
104  * direction, acquiring the perimeter in an upcall is prohibited as it would
105  * cause a deadlock. This applies to all layers.
106  *
107  * Note that upcalls that need to grab the mac perimeter (for example
108  * mac_notify upcalls) can still achieve that by posting the request to a
109  * thread, which can then grab all the required perimeters and locks in the
110  * right global order. Note that in the above example the mac layer itself
111  * won't grab the mac perimeter in the mac_notify upcall, instead the upcall
112  * to the client must do that. Please see the aggr code for an example.
113  *
114  * MAC client rules
115  * ----------------
116  *
117  * R3. A MAC client may use the MAC provided perimeter facility to serialize
118  * control operations on a per mac end point. It does this by acquiring
119  * and holding the perimeter across a sequence of calls to the mac layer.
120  * This ensures atomicity across the entire block of mac calls. In this
121  * model the MAC client must not hold any client locks across the calls to
122  * the mac layer. This model is the preferred solution.
123  *
124  * R4. However if a MAC client has a lot of global state across all mac end
125  * points the per mac end point serialization may not be sufficient. In this
126  * case the client may choose to use global locks or use its own serialization.
127  * To avoid deadlocks, these client layer locks held across the mac calls
128  * in the control path must never be acquired by the data path for the reason
129  * mentioned below.
130  *
131  * (Assume that a control operation that holds a client lock blocks in the
132  * mac layer waiting for upcall reference counts to drop to zero. If an upcall
133  * data thread that holds this reference count, tries to acquire the same
134  * client lock subsequently it will deadlock).
135  *
136  * A MAC client may follow either the R3 model or the R4 model, but can't
137  * mix both. In the former, the hierarchy is Perim -> client locks, but in
138  * the latter it is client locks -> Perim.
139  *
140  * R5. MAC clients must make MAC calls (excluding data calls) in a cv_wait'able
141  * context since they may block while trying to acquire the perimeter.
142  * In addition some calls may block waiting for upcall refcnts to come down to
143  * zero.
144  *
145  * R6. MAC clients must make sure that they are single threaded and all threads
146  * from the top (in particular data threads) have finished before calling
147  * mac_client_close. The MAC framework does not track the number of client
148  * threads using the mac client handle. Also mac clients must make sure
149  * they have undone all the control operations before calling mac_client_close.
150  * For example mac_unicast_remove/mac_multicast_remove to undo the corresponding
151  * mac_unicast_add/mac_multicast_add.
152  *
153  * MAC framework rules
154  * -------------------
155  *
156  * R7. The mac layer itself must not hold any mac layer locks (except the mac
157  * perimeter) across a call to any other layer from the mac layer. The call to
158  * any other layer could be via mi_* entry points, classifier entry points into
159  * the driver or via upcall pointers into layers above. The mac perimeter may
160  * be acquired or held only in the down direction, for e.g. when calling into
161  * a mi_* driver entry point to provide atomicity of the operation.
162  *
163  * R8. Since it is not guaranteed (see R14) that drivers won't hold locks across
164  * mac driver interfaces, the MAC layer must provide a cut out for control
165  * interfaces like upcall notifications and start them in a separate thread.
166  *
167  * R9. Note that locking order also implies a plumbing order. For example
168  * VNICs are allowed to be created over aggrs, but not vice-versa. An attempt
169  * to plumb in any other order must be failed at mac_open time, otherwise it
170  * could lead to deadlocks due to inverse locking order.
171  *
172  * R10. MAC driver interfaces must not block since the driver could call them
173  * in interrupt context.
174  *
175  * R11. Walkers must preferably not hold any locks while calling walker
176  * callbacks. Instead these can operate on reference counts. In simple
177  * callbacks it may be ok to hold a lock and call the callbacks, but this is
178  * harder to maintain in the general case of arbitrary callbacks.
179  *
180  * R12. The MAC layer must protect upcall notification callbacks using reference
181  * counts rather than holding locks across the callbacks.
182  *
183  * R13. Given the variety of drivers, it is preferable if the MAC layer can make
184  * sure that any pointers (such as mac ring pointers) it passes to the driver
185  * remain valid until mac unregister time. Currently the mac layer achieves
186  * this by using generation numbers for rings and freeing the mac rings only
187  * at unregister time.  The MAC layer must provide a layer of indirection and
188  * must not expose underlying driver rings or driver data structures/pointers
189  * directly to MAC clients.
190  *
191  * MAC driver rules
192  * ----------------
193  *
194  * R14. It would be preferable if MAC drivers don't hold any locks across any
195  * mac call. However at a minimum they must not hold any locks across data
196  * upcalls. They must also make sure that all references to mac data structures
197  * are cleaned up and that it is single threaded at mac_unregister time.
198  *
199  * R15. MAC driver interfaces don't block and so the action may be done
200  * asynchronously in a separate thread as for example handling notifications.
201  * The driver must not assume that the action is complete when the call
202  * returns.
203  *
204  * R16. Drivers must maintain a generation number per Rx ring, and pass it
205  * back to mac_rx_ring(); They are expected to increment the generation
206  * number whenever the ring's stop routine is invoked.
207  * See comments in mac_rx_ring();
208  *
209  * R17. Similarly mi_stop is another synchronization point and the driver must
210  * ensure that all upcalls are done and there won't be any future upcall
211  * before returning from mi_stop.
212  *
213  * R18. The driver may assume that all set/modify control operations via
214  * the mi_* entry points are single threaded on a per mac end point.
215  *
216  * Lock and Perimeter hierarchy scenarios
217  * ---------------------------------------
218  *
219  * i_mac_impl_lock -> mi_rw_lock -> srs_lock -> s_ring_lock[i_mac_tx_srs_notify]
220  *
221  * ft_lock -> fe_lock [mac_flow_lookup]
222  *
223  * mi_rw_lock -> fe_lock [mac_bcast_send]
224  *
225  * srs_lock -> mac_bw_lock [mac_rx_srs_drain_bw]
226  *
227  * cpu_lock -> mac_srs_g_lock -> srs_lock -> s_ring_lock [mac_walk_srs_and_bind]
228  *
229  * i_dls_devnet_lock -> mac layer locks [dls_devnet_rename]
230  *
231  * Perimeters are ordered P1 -> P2 -> P3 from top to bottom in order of mac
232  * client to driver. In the case of clients that explicitly use the mac provided
233  * perimeter mechanism for its serialization, the hierarchy is
234  * Perimeter -> mac layer locks, since the client never holds any locks across
235  * the mac calls. In the case of clients that use its own locks the hierarchy
236  * is Client locks -> Mac Perim -> Mac layer locks. The client never explicitly
237  * calls mac_perim_enter/exit in this case.
238  *
239  * Subflow creation rules
240  * ---------------------------
241  * o In case of a user specified cpulist present on underlying link and flows,
242  * the flows cpulist must be a subset of the underlying link.
243  * o In case of a user specified fanout mode present on link and flow, the
244  * subflow fanout count has to be less than or equal to that of the
245  * underlying link. The cpu-bindings for the subflows will be a subset of
246  * the underlying link.
247  * o If no cpulist is specified on either the underlying link or the flow, the
248  * underlying link relies on a  MAC tunable to provide out of box fanout.
249  * The subflow will have no cpulist (the subflow will be unbound)
250  * o If no cpulist is specified on the underlying link, a subflow can
251  * carry  either a user-specified cpulist or fanout count. The cpu-bindings
252  * for the subflow will not adhere to restriction that they need to be subset
253  * of the underlying link.
254  * o In case where the underlying link is carrying either a user specified
255  * cpulist or fanout mode and for a unspecified subflow, the subflow will be
256  * created unbound.
257  * o While creating unbound subflows, bandwidth mode changes attempt to
258  * figure a right fanout count. In such cases the fanout count will override
259  * the unbound cpu-binding behavior.
260  * o In addition to this, while cycling between flow and link properties, we
261  * impose a restriction that if a link property has a subflow with
262  * user-specified attributes, we will not allow changing the link property.
263  * The administrator needs to reset all the user specified properties for the
264  * subflows before attempting a link property change.
265  * Some of the above rules can be overridden by specifying additional command
266  * line options while creating or modifying link or subflow properties.
267  */
268 
269 #include <sys/types.h>
270 #include <sys/conf.h>
271 #include <sys/id_space.h>
272 #include <sys/esunddi.h>
273 #include <sys/stat.h>
274 #include <sys/mkdev.h>
275 #include <sys/stream.h>
276 #include <sys/strsun.h>
277 #include <sys/strsubr.h>
278 #include <sys/dlpi.h>
279 #include <sys/modhash.h>
280 #include <sys/mac_provider.h>
281 #include <sys/mac_client_impl.h>
282 #include <sys/mac_soft_ring.h>
283 #include <sys/mac_impl.h>
284 #include <sys/mac.h>
285 #include <sys/dls.h>
286 #include <sys/dld.h>
287 #include <sys/modctl.h>
288 #include <sys/fs/dv_node.h>
289 #include <sys/thread.h>
290 #include <sys/proc.h>
291 #include <sys/callb.h>
292 #include <sys/cpuvar.h>
293 #include <sys/atomic.h>
294 #include <sys/bitmap.h>
295 #include <sys/sdt.h>
296 #include <sys/mac_flow.h>
297 #include <sys/ddi_intr_impl.h>
298 #include <sys/disp.h>
299 #include <sys/sdt.h>
300 #include <sys/vnic.h>
301 #include <sys/vnic_impl.h>
302 #include <sys/vlan.h>
303 #include <inet/ip.h>
304 #include <inet/ip6.h>
305 #include <sys/exacct.h>
306 #include <sys/exacct_impl.h>
307 #include <inet/nd.h>
308 #include <sys/ethernet.h>
309 
310 #define	IMPL_HASHSZ	67	/* prime */
311 
312 kmem_cache_t	*i_mac_impl_cachep;
313 mod_hash_t		*i_mac_impl_hash;
314 krwlock_t		i_mac_impl_lock;
315 uint_t			i_mac_impl_count;
316 static kmem_cache_t	*mac_ring_cache;
317 static id_space_t	*minor_ids;
318 static uint32_t		minor_count;
319 
320 /*
321  * Logging stuff. Perhaps mac_logging_interval could be broken into
322  * mac_flow_log_interval and mac_link_log_interval if we want to be
323  * able to schedule them differently.
324  */
325 uint_t			mac_logging_interval;
326 boolean_t		mac_flow_log_enable;
327 boolean_t		mac_link_log_enable;
328 timeout_id_t		mac_logging_timer;
329 
330 /* for debugging, see MAC_DBG_PRT() in mac_impl.h */
331 int mac_dbg = 0;
332 
333 #define	MACTYPE_KMODDIR	"mac"
334 #define	MACTYPE_HASHSZ	67
335 static mod_hash_t	*i_mactype_hash;
336 /*
337  * i_mactype_lock synchronizes threads that obtain references to mactype_t
338  * structures through i_mactype_getplugin().
339  */
340 static kmutex_t		i_mactype_lock;
341 
342 /*
343  * mac_tx_percpu_cnt
344  *
345  * Number of per cpu locks per mac_client_impl_t. Used by the transmit side
346  * in mac_tx to reduce lock contention. This is sized at boot time in mac_init.
347  * mac_tx_percpu_cnt_max is settable in /etc/system and must be a power of 2.
348  * Per cpu locks may be disabled by setting mac_tx_percpu_cnt_max to 1.
349  */
350 int mac_tx_percpu_cnt;
351 int mac_tx_percpu_cnt_max = 128;
352 
353 static int i_mac_constructor(void *, void *, int);
354 static void i_mac_destructor(void *, void *);
355 static int i_mac_ring_ctor(void *, void *, int);
356 static void i_mac_ring_dtor(void *, void *);
357 static mblk_t *mac_rx_classify(mac_impl_t *, mac_resource_handle_t, mblk_t *);
358 void mac_tx_client_flush(mac_client_impl_t *);
359 void mac_tx_client_block(mac_client_impl_t *);
360 static void mac_rx_ring_quiesce(mac_ring_t *, uint_t);
361 static int mac_start_group_and_rings(mac_group_t *);
362 static void mac_stop_group_and_rings(mac_group_t *);
363 
364 /*
365  * Module initialization functions.
366  */
367 
368 void
369 mac_init(void)
370 {
371 	mac_tx_percpu_cnt = ((boot_max_ncpus == -1) ? max_ncpus :
372 	    boot_max_ncpus);
373 
374 	/* Upper bound is mac_tx_percpu_cnt_max */
375 	if (mac_tx_percpu_cnt > mac_tx_percpu_cnt_max)
376 		mac_tx_percpu_cnt = mac_tx_percpu_cnt_max;
377 
378 	if (mac_tx_percpu_cnt < 1) {
379 		/* Someone set max_tx_percpu_cnt_max to 0 or less */
380 		mac_tx_percpu_cnt = 1;
381 	}
382 
383 	ASSERT(mac_tx_percpu_cnt >= 1);
384 	mac_tx_percpu_cnt = (1 << highbit(mac_tx_percpu_cnt - 1));
385 	/*
386 	 * Make it of the form 2**N - 1 in the range
387 	 * [0 .. mac_tx_percpu_cnt_max - 1]
388 	 */
389 	mac_tx_percpu_cnt--;
390 
391 	i_mac_impl_cachep = kmem_cache_create("mac_impl_cache",
392 	    sizeof (mac_impl_t), 0, i_mac_constructor, i_mac_destructor,
393 	    NULL, NULL, NULL, 0);
394 	ASSERT(i_mac_impl_cachep != NULL);
395 
396 	mac_ring_cache = kmem_cache_create("mac_ring_cache",
397 	    sizeof (mac_ring_t), 0, i_mac_ring_ctor, i_mac_ring_dtor, NULL,
398 	    NULL, NULL, 0);
399 	ASSERT(mac_ring_cache != NULL);
400 
401 	i_mac_impl_hash = mod_hash_create_extended("mac_impl_hash",
402 	    IMPL_HASHSZ, mod_hash_null_keydtor, mod_hash_null_valdtor,
403 	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
404 	rw_init(&i_mac_impl_lock, NULL, RW_DEFAULT, NULL);
405 
406 	mac_flow_init();
407 	mac_soft_ring_init();
408 	mac_bcast_init();
409 	mac_client_init();
410 
411 	i_mac_impl_count = 0;
412 
413 	i_mactype_hash = mod_hash_create_extended("mactype_hash",
414 	    MACTYPE_HASHSZ,
415 	    mod_hash_null_keydtor, mod_hash_null_valdtor,
416 	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
417 
418 	/*
419 	 * Allocate an id space to manage minor numbers. The range of the
420 	 * space will be from MAC_MAX_MINOR+1 to MAXMIN32 (maximum legal
421 	 * minor number is MAXMIN, but id_t is type of integer and does not
422 	 * allow MAXMIN).
423 	 */
424 	minor_ids = id_space_create("mac_minor_ids", MAC_MAX_MINOR+1, MAXMIN32);
425 	ASSERT(minor_ids != NULL);
426 	minor_count = 0;
427 
428 	/* Let's default to 20 seconds */
429 	mac_logging_interval = 20;
430 	mac_flow_log_enable = B_FALSE;
431 	mac_link_log_enable = B_FALSE;
432 	mac_logging_timer = 0;
433 }
434 
435 int
436 mac_fini(void)
437 {
438 	if (i_mac_impl_count > 0 || minor_count > 0)
439 		return (EBUSY);
440 
441 	id_space_destroy(minor_ids);
442 	mac_flow_fini();
443 
444 	mod_hash_destroy_hash(i_mac_impl_hash);
445 	rw_destroy(&i_mac_impl_lock);
446 
447 	mac_client_fini();
448 	kmem_cache_destroy(mac_ring_cache);
449 
450 	mod_hash_destroy_hash(i_mactype_hash);
451 	mac_soft_ring_finish();
452 	return (0);
453 }
454 
/*
 * Set up a driver's dev_ops for use with the MAC framework. This simply
 * hands 'ops' and 'name' to dld_init_ops().
 */
void
mac_init_ops(struct dev_ops *ops, const char *name)
{
	dld_init_ops(ops, name);
}
460 
/*
 * Counterpart of mac_init_ops(). This simply hands 'ops' to
 * dld_fini_ops().
 */
void
mac_fini_ops(struct dev_ops *ops)
{
	dld_fini_ops(ops);
}
466 
467 /*ARGSUSED*/
468 static int
469 i_mac_constructor(void *buf, void *arg, int kmflag)
470 {
471 	mac_impl_t	*mip = buf;
472 
473 	bzero(buf, sizeof (mac_impl_t));
474 
475 	mip->mi_linkstate = LINK_STATE_UNKNOWN;
476 	mip->mi_nclients = 0;
477 
478 	mutex_init(&mip->mi_lock, NULL, MUTEX_DRIVER, NULL);
479 	rw_init(&mip->mi_rw_lock, NULL, RW_DRIVER, NULL);
480 	mutex_init(&mip->mi_notify_lock, NULL, MUTEX_DRIVER, NULL);
481 	mutex_init(&mip->mi_promisc_lock, NULL, MUTEX_DRIVER, NULL);
482 	mutex_init(&mip->mi_ring_lock, NULL, MUTEX_DEFAULT, NULL);
483 
484 	mip->mi_notify_cb_info.mcbi_lockp = &mip->mi_notify_lock;
485 	cv_init(&mip->mi_notify_cb_info.mcbi_cv, NULL, CV_DRIVER, NULL);
486 	mip->mi_promisc_cb_info.mcbi_lockp = &mip->mi_promisc_lock;
487 	cv_init(&mip->mi_promisc_cb_info.mcbi_cv, NULL, CV_DRIVER, NULL);
488 	return (0);
489 }
490 
491 /*ARGSUSED*/
492 static void
493 i_mac_destructor(void *buf, void *arg)
494 {
495 	mac_impl_t	*mip = buf;
496 	mac_cb_info_t	*mcbi;
497 
498 	ASSERT(mip->mi_ref == 0);
499 	ASSERT(mip->mi_active == 0);
500 	ASSERT(mip->mi_linkstate == LINK_STATE_UNKNOWN);
501 	ASSERT(mip->mi_devpromisc == 0);
502 	ASSERT(mip->mi_promisc == 0);
503 	ASSERT(mip->mi_ksp == NULL);
504 	ASSERT(mip->mi_kstat_count == 0);
505 	ASSERT(mip->mi_nclients == 0);
506 	ASSERT(mip->mi_nactiveclients == 0);
507 	ASSERT(mip->mi_single_active_client == NULL);
508 	ASSERT(mip->mi_state_flags == 0);
509 	ASSERT(mip->mi_factory_addr == NULL);
510 	ASSERT(mip->mi_factory_addr_num == 0);
511 	ASSERT(mip->mi_default_tx_ring == NULL);
512 
513 	mcbi = &mip->mi_notify_cb_info;
514 	ASSERT(mcbi->mcbi_del_cnt == 0 && mcbi->mcbi_walker_cnt == 0);
515 	ASSERT(mip->mi_notify_bits == 0);
516 	ASSERT(mip->mi_notify_thread == NULL);
517 	ASSERT(mcbi->mcbi_lockp == &mip->mi_notify_lock);
518 	mcbi->mcbi_lockp = NULL;
519 
520 	mcbi = &mip->mi_promisc_cb_info;
521 	ASSERT(mcbi->mcbi_del_cnt == 0 && mip->mi_promisc_list == NULL);
522 	ASSERT(mip->mi_promisc_list == NULL);
523 	ASSERT(mcbi->mcbi_lockp == &mip->mi_promisc_lock);
524 	mcbi->mcbi_lockp = NULL;
525 
526 	ASSERT(mip->mi_bcast_ngrps == 0 && mip->mi_bcast_grp == NULL);
527 	ASSERT(mip->mi_perim_owner == NULL && mip->mi_perim_ocnt == 0);
528 
529 	mutex_destroy(&mip->mi_lock);
530 	rw_destroy(&mip->mi_rw_lock);
531 
532 	mutex_destroy(&mip->mi_promisc_lock);
533 	cv_destroy(&mip->mi_promisc_cb_info.mcbi_cv);
534 	mutex_destroy(&mip->mi_notify_lock);
535 	cv_destroy(&mip->mi_notify_cb_info.mcbi_cv);
536 	mutex_destroy(&mip->mi_ring_lock);
537 }
538 
539 /* ARGSUSED */
540 static int
541 i_mac_ring_ctor(void *buf, void *arg, int kmflag)
542 {
543 	mac_ring_t *ring = (mac_ring_t *)buf;
544 
545 	bzero(ring, sizeof (mac_ring_t));
546 	cv_init(&ring->mr_cv, NULL, CV_DEFAULT, NULL);
547 	mutex_init(&ring->mr_lock, NULL, MUTEX_DEFAULT, NULL);
548 	ring->mr_state = MR_FREE;
549 	return (0);
550 }
551 
552 /* ARGSUSED */
553 static void
554 i_mac_ring_dtor(void *buf, void *arg)
555 {
556 	mac_ring_t *ring = (mac_ring_t *)buf;
557 
558 	cv_destroy(&ring->mr_cv);
559 	mutex_destroy(&ring->mr_lock);
560 }
561 
562 /*
563  * Common functions to do mac callback addition and deletion. Currently this is
564  * used by promisc callbacks and notify callbacks. List addition and deletion
565  * need to take care of list walkers. List walkers in general, can't hold list
566  * locks and make upcall callbacks due to potential lock order and recursive
567  * reentry issues. Instead list walkers increment the list walker count to mark
568  * the presence of a walker thread. Addition can be carefully done to ensure
569  * that the list walker always sees either the old list or the new list.
570  * However the deletion can't be done while the walker is active, instead the
571  * deleting thread simply marks the entry as logically deleted. The last walker
572  * physically deletes and frees up the logically deleted entries when the walk
573  * is complete.
574  */
575 void
576 mac_callback_add(mac_cb_info_t *mcbi, mac_cb_t **mcb_head,
577     mac_cb_t *mcb_elem)
578 {
579 	mac_cb_t	*p;
580 	mac_cb_t	**pp;
581 
582 	/* Verify it is not already in the list */
583 	for (pp = mcb_head; (p = *pp) != NULL; pp = &p->mcb_nextp) {
584 		if (p == mcb_elem)
585 			break;
586 	}
587 	VERIFY(p == NULL);
588 
589 	/*
590 	 * Add it to the head of the callback list. The membar ensures that
591 	 * the following list pointer manipulations reach global visibility
592 	 * in exactly the program order below.
593 	 */
594 	ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
595 
596 	mcb_elem->mcb_nextp = *mcb_head;
597 	membar_producer();
598 	*mcb_head = mcb_elem;
599 }
600 
601 /*
602  * Mark the entry as logically deleted. If there aren't any walkers unlink
603  * from the list. In either case return the corresponding status.
604  */
605 boolean_t
606 mac_callback_remove(mac_cb_info_t *mcbi, mac_cb_t **mcb_head,
607     mac_cb_t *mcb_elem)
608 {
609 	mac_cb_t	*p;
610 	mac_cb_t	**pp;
611 
612 	ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
613 	/*
614 	 * Search the callback list for the entry to be removed
615 	 */
616 	for (pp = mcb_head; (p = *pp) != NULL; pp = &p->mcb_nextp) {
617 		if (p == mcb_elem)
618 			break;
619 	}
620 	VERIFY(p != NULL);
621 
622 	/*
623 	 * If there are walkers just mark it as deleted and the last walker
624 	 * will remove from the list and free it.
625 	 */
626 	if (mcbi->mcbi_walker_cnt != 0) {
627 		p->mcb_flags |= MCB_CONDEMNED;
628 		mcbi->mcbi_del_cnt++;
629 		return (B_FALSE);
630 	}
631 
632 	ASSERT(mcbi->mcbi_del_cnt == 0);
633 	*pp = p->mcb_nextp;
634 	p->mcb_nextp = NULL;
635 	return (B_TRUE);
636 }
637 
638 /*
639  * Wait for all pending callback removals to be completed
640  */
641 void
642 mac_callback_remove_wait(mac_cb_info_t *mcbi)
643 {
644 	ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
645 	while (mcbi->mcbi_del_cnt != 0) {
646 		DTRACE_PROBE1(need_wait, mac_cb_info_t *, mcbi);
647 		cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp);
648 	}
649 }
650 
651 /*
652  * The last mac callback walker does the cleanup. Walk the list and unlik
653  * all the logically deleted entries and construct a temporary list of
654  * removed entries. Return the list of removed entries to the caller.
655  */
656 mac_cb_t *
657 mac_callback_walker_cleanup(mac_cb_info_t *mcbi, mac_cb_t **mcb_head)
658 {
659 	mac_cb_t	*p;
660 	mac_cb_t	**pp;
661 	mac_cb_t	*rmlist = NULL;		/* List of removed elements */
662 	int	cnt = 0;
663 
664 	ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
665 	ASSERT(mcbi->mcbi_del_cnt != 0 && mcbi->mcbi_walker_cnt == 0);
666 
667 	pp = mcb_head;
668 	while (*pp != NULL) {
669 		if ((*pp)->mcb_flags & MCB_CONDEMNED) {
670 			p = *pp;
671 			*pp = p->mcb_nextp;
672 			p->mcb_nextp = rmlist;
673 			rmlist = p;
674 			cnt++;
675 			continue;
676 		}
677 		pp = &(*pp)->mcb_nextp;
678 	}
679 
680 	ASSERT(mcbi->mcbi_del_cnt == cnt);
681 	mcbi->mcbi_del_cnt = 0;
682 	return (rmlist);
683 }
684 
685 boolean_t
686 mac_callback_lookup(mac_cb_t **mcb_headp, mac_cb_t *mcb_elem)
687 {
688 	mac_cb_t	*mcb;
689 
690 	/* Verify it is not already in the list */
691 	for (mcb = *mcb_headp; mcb != NULL; mcb = mcb->mcb_nextp) {
692 		if (mcb == mcb_elem)
693 			return (B_TRUE);
694 	}
695 
696 	return (B_FALSE);
697 }
698 
699 boolean_t
700 mac_callback_find(mac_cb_info_t *mcbi, mac_cb_t **mcb_headp, mac_cb_t *mcb_elem)
701 {
702 	boolean_t	found;
703 
704 	mutex_enter(mcbi->mcbi_lockp);
705 	found = mac_callback_lookup(mcb_headp, mcb_elem);
706 	mutex_exit(mcbi->mcbi_lockp);
707 
708 	return (found);
709 }
710 
711 /* Free the list of removed callbacks */
712 void
713 mac_callback_free(mac_cb_t *rmlist)
714 {
715 	mac_cb_t	*mcb;
716 	mac_cb_t	*mcb_next;
717 
718 	for (mcb = rmlist; mcb != NULL; mcb = mcb_next) {
719 		mcb_next = mcb->mcb_nextp;
720 		kmem_free(mcb->mcb_objp, mcb->mcb_objsize);
721 	}
722 }
723 
724 /*
725  * The promisc callbacks are in 2 lists, one off the 'mip' and another off the
726  * 'mcip' threaded by mpi_mi_link and mpi_mci_link respectively. However there
727  * is only a single shared total walker count, and an entry can't be physically
728  * unlinked if a walker is active on either list. The last walker does this
729  * cleanup of logically deleted entries.
730  */
731 void
732 i_mac_promisc_walker_cleanup(mac_impl_t *mip)
733 {
734 	mac_cb_t	*rmlist;
735 	mac_cb_t	*mcb;
736 	mac_cb_t	*mcb_next;
737 	mac_promisc_impl_t	*mpip;
738 
739 	/*
740 	 * Construct a temporary list of deleted callbacks by walking the
741 	 * the mi_promisc_list. Then for each entry in the temporary list,
742 	 * remove it from the mci_promisc_list and free the entry.
743 	 */
744 	rmlist = mac_callback_walker_cleanup(&mip->mi_promisc_cb_info,
745 	    &mip->mi_promisc_list);
746 
747 	for (mcb = rmlist; mcb != NULL; mcb = mcb_next) {
748 		mcb_next = mcb->mcb_nextp;
749 		mpip = (mac_promisc_impl_t *)mcb->mcb_objp;
750 		VERIFY(mac_callback_remove(&mip->mi_promisc_cb_info,
751 		    &mpip->mpi_mcip->mci_promisc_list, &mpip->mpi_mci_link));
752 		mcb->mcb_flags = 0;
753 		mcb->mcb_nextp = NULL;
754 		kmem_cache_free(mac_promisc_impl_cache, mpip);
755 	}
756 }
757 
758 void
759 i_mac_notify(mac_impl_t *mip, mac_notify_type_t type)
760 {
761 	mac_cb_info_t	*mcbi;
762 
763 	/*
764 	 * Signal the notify thread even after mi_ref has become zero and
765 	 * mi_disabled is set. The synchronization with the notify thread
766 	 * happens in mac_unregister and that implies the driver must make
767 	 * sure it is single-threaded (with respect to mac calls) and that
768 	 * all pending mac calls have returned before it calls mac_unregister
769 	 */
770 	rw_enter(&i_mac_impl_lock, RW_READER);
771 	if (mip->mi_state_flags & MIS_DISABLED)
772 		goto exit;
773 
774 	/*
775 	 * Guard against incorrect notifications.  (Running a newer
776 	 * mac client against an older implementation?)
777 	 */
778 	if (type >= MAC_NNOTE)
779 		goto exit;
780 
781 	mcbi = &mip->mi_notify_cb_info;
782 	mutex_enter(mcbi->mcbi_lockp);
783 	mip->mi_notify_bits |= (1 << type);
784 	cv_broadcast(&mcbi->mcbi_cv);
785 	mutex_exit(mcbi->mcbi_lockp);
786 
787 exit:
788 	rw_exit(&i_mac_impl_lock);
789 }
790 
791 /*
792  * Mac serialization primitives. Please see the block comment at the
793  * top of the file.
794  */
795 void
796 i_mac_perim_enter(mac_impl_t *mip)
797 {
798 	mac_client_impl_t	*mcip;
799 
800 	if (mip->mi_state_flags & MIS_IS_VNIC) {
801 		/*
802 		 * This is a VNIC. Return the lower mac since that is what
803 		 * we want to serialize on.
804 		 */
805 		mcip = mac_vnic_lower(mip);
806 		mip = mcip->mci_mip;
807 	}
808 
809 	mutex_enter(&mip->mi_perim_lock);
810 	if (mip->mi_perim_owner == curthread) {
811 		mip->mi_perim_ocnt++;
812 		mutex_exit(&mip->mi_perim_lock);
813 		return;
814 	}
815 
816 	while (mip->mi_perim_owner != NULL)
817 		cv_wait(&mip->mi_perim_cv, &mip->mi_perim_lock);
818 
819 	mip->mi_perim_owner = curthread;
820 	ASSERT(mip->mi_perim_ocnt == 0);
821 	mip->mi_perim_ocnt++;
822 #ifdef DEBUG
823 	mip->mi_perim_stack_depth = getpcstack(mip->mi_perim_stack,
824 	    MAC_PERIM_STACK_DEPTH);
825 #endif
826 	mutex_exit(&mip->mi_perim_lock);
827 }
828 
829 int
830 i_mac_perim_enter_nowait(mac_impl_t *mip)
831 {
832 	/*
833 	 * The vnic is a special case, since the serialization is done based
834 	 * on the lower mac. If the lower mac is busy, it does not imply the
835 	 * vnic can't be unregistered. But in the case of other drivers,
836 	 * a busy perimeter or open mac handles implies that the mac is busy
837 	 * and can't be unregistered.
838 	 */
839 	if (mip->mi_state_flags & MIS_IS_VNIC) {
840 		i_mac_perim_enter(mip);
841 		return (0);
842 	}
843 
844 	mutex_enter(&mip->mi_perim_lock);
845 	if (mip->mi_perim_owner != NULL) {
846 		mutex_exit(&mip->mi_perim_lock);
847 		return (EBUSY);
848 	}
849 	ASSERT(mip->mi_perim_ocnt == 0);
850 	mip->mi_perim_owner = curthread;
851 	mip->mi_perim_ocnt++;
852 	mutex_exit(&mip->mi_perim_lock);
853 
854 	return (0);
855 }
856 
857 void
858 i_mac_perim_exit(mac_impl_t *mip)
859 {
860 	mac_client_impl_t *mcip;
861 
862 	if (mip->mi_state_flags & MIS_IS_VNIC) {
863 		/*
864 		 * This is a VNIC. Return the lower mac since that is what
865 		 * we want to serialize on.
866 		 */
867 		mcip = mac_vnic_lower(mip);
868 		mip = mcip->mci_mip;
869 	}
870 
871 	ASSERT(mip->mi_perim_owner == curthread && mip->mi_perim_ocnt != 0);
872 
873 	mutex_enter(&mip->mi_perim_lock);
874 	if (--mip->mi_perim_ocnt == 0) {
875 		mip->mi_perim_owner = NULL;
876 		cv_signal(&mip->mi_perim_cv);
877 	}
878 	mutex_exit(&mip->mi_perim_lock);
879 }
880 
881 /*
882  * Returns whether the current thread holds the mac perimeter. Used in making
883  * assertions.
884  */
885 boolean_t
886 mac_perim_held(mac_handle_t mh)
887 {
888 	mac_impl_t	*mip = (mac_impl_t *)mh;
889 	mac_client_impl_t *mcip;
890 
891 	if (mip->mi_state_flags & MIS_IS_VNIC) {
892 		/*
893 		 * This is a VNIC. Return the lower mac since that is what
894 		 * we want to serialize on.
895 		 */
896 		mcip = mac_vnic_lower(mip);
897 		mip = mcip->mci_mip;
898 	}
899 	return (mip->mi_perim_owner == curthread);
900 }
901 
902 /*
903  * mac client interfaces to enter the mac perimeter of a mac end point, given
904  * its mac handle, or macname or linkid.
905  */
906 void
907 mac_perim_enter_by_mh(mac_handle_t mh, mac_perim_handle_t *mphp)
908 {
909 	mac_impl_t	*mip = (mac_impl_t *)mh;
910 
911 	i_mac_perim_enter(mip);
912 	/*
913 	 * The mac_perim_handle_t returned encodes the 'mip' and whether a
914 	 * mac_open has been done internally while entering the perimeter.
915 	 * This information is used in mac_perim_exit
916 	 */
917 	MAC_ENCODE_MPH(*mphp, mip, 0);
918 }
919 
920 int
921 mac_perim_enter_by_macname(const char *name, mac_perim_handle_t *mphp)
922 {
923 	int	err;
924 	mac_handle_t	mh;
925 
926 	if ((err = mac_open(name, &mh)) != 0)
927 		return (err);
928 
929 	mac_perim_enter_by_mh(mh, mphp);
930 	MAC_ENCODE_MPH(*mphp, mh, 1);
931 	return (0);
932 }
933 
934 int
935 mac_perim_enter_by_linkid(datalink_id_t linkid, mac_perim_handle_t *mphp)
936 {
937 	int	err;
938 	mac_handle_t	mh;
939 
940 	if ((err = mac_open_by_linkid(linkid, &mh)) != 0)
941 		return (err);
942 
943 	mac_perim_enter_by_mh(mh, mphp);
944 	MAC_ENCODE_MPH(*mphp, mh, 1);
945 	return (0);
946 }
947 
948 void
949 mac_perim_exit(mac_perim_handle_t mph)
950 {
951 	mac_impl_t	*mip;
952 	boolean_t	need_close;
953 
954 	MAC_DECODE_MPH(mph, mip, need_close);
955 	i_mac_perim_exit(mip);
956 	if (need_close)
957 		mac_close((mac_handle_t)mip);
958 }
959 
960 int
961 mac_hold(const char *macname, mac_impl_t **pmip)
962 {
963 	mac_impl_t	*mip;
964 	int		err;
965 
966 	/*
967 	 * Check the device name length to make sure it won't overflow our
968 	 * buffer.
969 	 */
970 	if (strlen(macname) >= MAXNAMELEN)
971 		return (EINVAL);
972 
973 	/*
974 	 * Look up its entry in the global hash table.
975 	 */
976 	rw_enter(&i_mac_impl_lock, RW_WRITER);
977 	err = mod_hash_find(i_mac_impl_hash, (mod_hash_key_t)macname,
978 	    (mod_hash_val_t *)&mip);
979 
980 	if (err != 0) {
981 		rw_exit(&i_mac_impl_lock);
982 		return (ENOENT);
983 	}
984 
985 	if (mip->mi_state_flags & MIS_DISABLED) {
986 		rw_exit(&i_mac_impl_lock);
987 		return (ENOENT);
988 	}
989 
990 	if (mip->mi_state_flags & MIS_EXCLUSIVE_HELD) {
991 		rw_exit(&i_mac_impl_lock);
992 		return (EBUSY);
993 	}
994 
995 	mip->mi_ref++;
996 	rw_exit(&i_mac_impl_lock);
997 
998 	*pmip = mip;
999 	return (0);
1000 }
1001 
1002 void
1003 mac_rele(mac_impl_t *mip)
1004 {
1005 	rw_enter(&i_mac_impl_lock, RW_WRITER);
1006 	ASSERT(mip->mi_ref != 0);
1007 	if (--mip->mi_ref == 0) {
1008 		ASSERT(mip->mi_nactiveclients == 0 &&
1009 		    !(mip->mi_state_flags & MIS_EXCLUSIVE));
1010 	}
1011 	rw_exit(&i_mac_impl_lock);
1012 }
1013 
1014 /*
1015  * Private GLDv3 function to start a MAC instance.
1016  */
1017 int
1018 mac_start(mac_handle_t mh)
1019 {
1020 	mac_impl_t	*mip = (mac_impl_t *)mh;
1021 	int		err = 0;
1022 
1023 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1024 	ASSERT(mip->mi_start != NULL);
1025 
1026 	/*
1027 	 * Check whether the device is already started.
1028 	 */
1029 	if (mip->mi_active++ == 0) {
1030 		mac_ring_t *ring = NULL;
1031 
1032 		/*
1033 		 * Start the device.
1034 		 */
1035 		err = mip->mi_start(mip->mi_driver);
1036 		if (err != 0) {
1037 			mip->mi_active--;
1038 			return (err);
1039 		}
1040 
1041 		/*
1042 		 * Start the default tx ring.
1043 		 */
1044 		if (mip->mi_default_tx_ring != NULL) {
1045 
1046 			ring = (mac_ring_t *)mip->mi_default_tx_ring;
1047 			err = mac_start_ring(ring);
1048 			if (err != 0) {
1049 				mip->mi_active--;
1050 				return (err);
1051 			}
1052 			ring->mr_state = MR_INUSE;
1053 		}
1054 
1055 		if (mip->mi_rx_groups != NULL) {
1056 			/*
1057 			 * Start the default ring, since it will be needed
1058 			 * to receive broadcast and multicast traffic for
1059 			 * both primary and non-primary MAC clients.
1060 			 */
1061 			mac_group_t *grp = &mip->mi_rx_groups[0];
1062 
1063 			ASSERT(grp->mrg_state == MAC_GROUP_STATE_REGISTERED);
1064 			err = mac_start_group_and_rings(grp);
1065 			if (err != 0) {
1066 				mip->mi_active--;
1067 				if (ring != NULL) {
1068 					mac_stop_ring(ring);
1069 					ring->mr_state = MR_FREE;
1070 				}
1071 				return (err);
1072 			}
1073 			mac_set_rx_group_state(grp, MAC_GROUP_STATE_SHARED);
1074 		}
1075 	}
1076 
1077 	return (err);
1078 }
1079 
1080 /*
1081  * Private GLDv3 function to stop a MAC instance.
1082  */
1083 void
1084 mac_stop(mac_handle_t mh)
1085 {
1086 	mac_impl_t	*mip = (mac_impl_t *)mh;
1087 
1088 	ASSERT(mip->mi_stop != NULL);
1089 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1090 
1091 	/*
1092 	 * Check whether the device is still needed.
1093 	 */
1094 	ASSERT(mip->mi_active != 0);
1095 	if (--mip->mi_active == 0) {
1096 		if (mip->mi_rx_groups != NULL) {
1097 			/*
1098 			 * There should be no more active clients since the
1099 			 * MAC is being stopped. Stop the default RX group
1100 			 * and transition it back to registered state.
1101 			 */
1102 			mac_group_t *grp = &mip->mi_rx_groups[0];
1103 
1104 			/*
1105 			 * When clients are torn down, the groups
1106 			 * are release via mac_release_rx_group which
1107 			 * knows the the default group is always in
1108 			 * started mode since broadcast uses it. So
1109 			 * we can assert that their are no clients
1110 			 * (since mac_bcast_add doesn't register itself
1111 			 * as a client) and group is in SHARED state.
1112 			 */
1113 			ASSERT(grp->mrg_state == MAC_GROUP_STATE_SHARED);
1114 			ASSERT(MAC_RX_GROUP_NO_CLIENT(grp) &&
1115 			    mip->mi_nactiveclients == 0);
1116 			mac_stop_group_and_rings(grp);
1117 			mac_set_rx_group_state(grp, MAC_GROUP_STATE_REGISTERED);
1118 		}
1119 
1120 		if (mip->mi_default_tx_ring != NULL) {
1121 			mac_ring_t *ring;
1122 
1123 			ring = (mac_ring_t *)mip->mi_default_tx_ring;
1124 			mac_stop_ring(ring);
1125 			ring->mr_state = MR_FREE;
1126 		}
1127 
1128 		/*
1129 		 * Stop the device.
1130 		 */
1131 		mip->mi_stop(mip->mi_driver);
1132 	}
1133 }
1134 
1135 int
1136 i_mac_promisc_set(mac_impl_t *mip, boolean_t on, mac_promisc_type_t ptype)
1137 {
1138 	int		err = 0;
1139 
1140 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1141 	ASSERT(mip->mi_setpromisc != NULL);
1142 	ASSERT(ptype == MAC_DEVPROMISC || ptype == MAC_PROMISC);
1143 
1144 	/*
1145 	 * Determine whether we should enable or disable promiscuous mode.
1146 	 * For details on the distinction between "device promiscuous mode"
1147 	 * and "MAC promiscuous mode", see PSARC/2005/289.
1148 	 */
1149 	if (on) {
1150 		/*
1151 		 * Enable promiscuous mode on the device if not yet enabled.
1152 		 */
1153 		if (mip->mi_devpromisc++ == 0) {
1154 			err = mip->mi_setpromisc(mip->mi_driver, B_TRUE);
1155 			if (err != 0) {
1156 				mip->mi_devpromisc--;
1157 				return (err);
1158 			}
1159 			i_mac_notify(mip, MAC_NOTE_DEVPROMISC);
1160 		}
1161 
1162 		/*
1163 		 * Enable promiscuous mode on the MAC if not yet enabled.
1164 		 */
1165 		if (ptype == MAC_PROMISC && mip->mi_promisc++ == 0)
1166 			i_mac_notify(mip, MAC_NOTE_PROMISC);
1167 	} else {
1168 		if (mip->mi_devpromisc == 0)
1169 			return (EPROTO);
1170 
1171 		/*
1172 		 * Disable promiscuous mode on the device if this is the last
1173 		 * enabling.
1174 		 */
1175 		if (--mip->mi_devpromisc == 0) {
1176 			err = mip->mi_setpromisc(mip->mi_driver, B_FALSE);
1177 			if (err != 0) {
1178 				mip->mi_devpromisc++;
1179 				return (err);
1180 			}
1181 			i_mac_notify(mip, MAC_NOTE_DEVPROMISC);
1182 		}
1183 
1184 		/*
1185 		 * Disable promiscuous mode on the MAC if this is the last
1186 		 * enabling.
1187 		 */
1188 		if (ptype == MAC_PROMISC && --mip->mi_promisc == 0)
1189 			i_mac_notify(mip, MAC_NOTE_PROMISC);
1190 	}
1191 
1192 	return (0);
1193 }
1194 
1195 int
1196 mac_promisc_set(mac_handle_t mh, boolean_t on, mac_promisc_type_t ptype)
1197 {
1198 	mac_impl_t	*mip = (mac_impl_t *)mh;
1199 	int		rv;
1200 
1201 	i_mac_perim_enter(mip);
1202 	rv = i_mac_promisc_set(mip, on, ptype);
1203 	if (rv != 0 && !on) {
1204 		cmn_err(CE_WARN, "%s: failed to switch OFF promiscuous mode "
1205 		    "because of error 0x%x", mip->mi_name, rv);
1206 		rv = 0;
1207 	}
1208 	i_mac_perim_exit(mip);
1209 	return (rv);
1210 }
1211 
1212 /*
1213  * The promiscuity state can change any time. If the caller needs to take
1214  * actions that are atomic with the promiscuity state, then the caller needs
1215  * to bracket the entire sequence with mac_perim_enter/exit
1216  */
1217 boolean_t
1218 mac_promisc_get(mac_handle_t mh, mac_promisc_type_t ptype)
1219 {
1220 	mac_impl_t		*mip = (mac_impl_t *)mh;
1221 
1222 	ASSERT(ptype == MAC_DEVPROMISC || ptype == MAC_PROMISC);
1223 
1224 	/*
1225 	 * Return the current promiscuity.
1226 	 */
1227 	if (ptype == MAC_DEVPROMISC)
1228 		return (mip->mi_devpromisc != 0);
1229 	else
1230 		return (mip->mi_promisc != 0);
1231 }
1232 
1233 /*
1234  * Invoked at MAC instance attach time to initialize the list
1235  * of factory MAC addresses supported by a MAC instance. This function
1236  * builds a local cache in the mac_impl_t for the MAC addresses
1237  * supported by the underlying hardware. The MAC clients themselves
1238  * use the mac_addr_factory*() functions to query and reserve
1239  * factory MAC addresses.
1240  */
1241 void
1242 mac_addr_factory_init(mac_impl_t *mip)
1243 {
1244 	mac_capab_multifactaddr_t capab;
1245 	uint8_t *addr;
1246 	int i;
1247 
1248 	/*
1249 	 * First round to see how many factory MAC addresses are available.
1250 	 */
1251 	bzero(&capab, sizeof (capab));
1252 	if (!i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_MULTIFACTADDR,
1253 	    &capab) || (capab.mcm_naddr == 0)) {
1254 		/*
1255 		 * The MAC instance doesn't support multiple factory
1256 		 * MAC addresses, we're done here.
1257 		 */
1258 		return;
1259 	}
1260 
1261 	/*
1262 	 * Allocate the space and get all the factory addresses.
1263 	 */
1264 	addr = kmem_alloc(capab.mcm_naddr * MAXMACADDRLEN, KM_SLEEP);
1265 	capab.mcm_getaddr(mip->mi_driver, capab.mcm_naddr, addr);
1266 
1267 	mip->mi_factory_addr_num = capab.mcm_naddr;
1268 	mip->mi_factory_addr = kmem_zalloc(mip->mi_factory_addr_num *
1269 	    sizeof (mac_factory_addr_t), KM_SLEEP);
1270 
1271 	for (i = 0; i < capab.mcm_naddr; i++) {
1272 		bcopy(addr + i * MAXMACADDRLEN,
1273 		    mip->mi_factory_addr[i].mfa_addr,
1274 		    mip->mi_type->mt_addr_length);
1275 		mip->mi_factory_addr[i].mfa_in_use = B_FALSE;
1276 	}
1277 
1278 	kmem_free(addr, capab.mcm_naddr * MAXMACADDRLEN);
1279 }
1280 
1281 void
1282 mac_addr_factory_fini(mac_impl_t *mip)
1283 {
1284 	if (mip->mi_factory_addr == NULL) {
1285 		ASSERT(mip->mi_factory_addr_num == 0);
1286 		return;
1287 	}
1288 
1289 	kmem_free(mip->mi_factory_addr, mip->mi_factory_addr_num *
1290 	    sizeof (mac_factory_addr_t));
1291 
1292 	mip->mi_factory_addr = NULL;
1293 	mip->mi_factory_addr_num = 0;
1294 }
1295 
1296 /*
1297  * Reserve a factory MAC address. If *slot is set to -1, the function
1298  * attempts to reserve any of the available factory MAC addresses and
1299  * returns the reserved slot id. If no slots are available, the function
1300  * returns ENOSPC. If *slot is not set to -1, the function reserves
1301  * the specified slot if it is available, or returns EBUSY is the slot
1302  * is already used. Returns ENOTSUP if the underlying MAC does not
1303  * support multiple factory addresses. If the slot number is not -1 but
1304  * is invalid, returns EINVAL.
1305  */
1306 int
1307 mac_addr_factory_reserve(mac_client_handle_t mch, int *slot)
1308 {
1309 	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
1310 	mac_impl_t *mip = mcip->mci_mip;
1311 	int i, ret = 0;
1312 
1313 	i_mac_perim_enter(mip);
1314 	/*
1315 	 * Protect against concurrent readers that may need a self-consistent
1316 	 * view of the factory addresses
1317 	 */
1318 	rw_enter(&mip->mi_rw_lock, RW_WRITER);
1319 
1320 	if (mip->mi_factory_addr_num == 0) {
1321 		ret = ENOTSUP;
1322 		goto bail;
1323 	}
1324 
1325 	if (*slot != -1) {
1326 		/* check the specified slot */
1327 		if (*slot < 1 || *slot > mip->mi_factory_addr_num) {
1328 			ret = EINVAL;
1329 			goto bail;
1330 		}
1331 		if (mip->mi_factory_addr[*slot-1].mfa_in_use) {
1332 			ret = EBUSY;
1333 			goto bail;
1334 		}
1335 	} else {
1336 		/* pick the next available slot */
1337 		for (i = 0; i < mip->mi_factory_addr_num; i++) {
1338 			if (!mip->mi_factory_addr[i].mfa_in_use)
1339 				break;
1340 		}
1341 
1342 		if (i == mip->mi_factory_addr_num) {
1343 			ret = ENOSPC;
1344 			goto bail;
1345 		}
1346 		*slot = i+1;
1347 	}
1348 
1349 	mip->mi_factory_addr[*slot-1].mfa_in_use = B_TRUE;
1350 	mip->mi_factory_addr[*slot-1].mfa_client = mcip;
1351 
1352 bail:
1353 	rw_exit(&mip->mi_rw_lock);
1354 	i_mac_perim_exit(mip);
1355 	return (ret);
1356 }
1357 
1358 /*
1359  * Release the specified factory MAC address slot.
1360  */
1361 void
1362 mac_addr_factory_release(mac_client_handle_t mch, uint_t slot)
1363 {
1364 	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
1365 	mac_impl_t *mip = mcip->mci_mip;
1366 
1367 	i_mac_perim_enter(mip);
1368 	/*
1369 	 * Protect against concurrent readers that may need a self-consistent
1370 	 * view of the factory addresses
1371 	 */
1372 	rw_enter(&mip->mi_rw_lock, RW_WRITER);
1373 
1374 	ASSERT(slot > 0 && slot <= mip->mi_factory_addr_num);
1375 	ASSERT(mip->mi_factory_addr[slot-1].mfa_in_use);
1376 
1377 	mip->mi_factory_addr[slot-1].mfa_in_use = B_FALSE;
1378 
1379 	rw_exit(&mip->mi_rw_lock);
1380 	i_mac_perim_exit(mip);
1381 }
1382 
1383 /*
1384  * Stores in mac_addr the value of the specified MAC address. Returns
1385  * 0 on success, or EINVAL if the slot number is not valid for the MAC.
1386  * The caller must provide a string of at least MAXNAMELEN bytes.
1387  */
1388 void
1389 mac_addr_factory_value(mac_handle_t mh, int slot, uchar_t *mac_addr,
1390     uint_t *addr_len, char *client_name, boolean_t *in_use_arg)
1391 {
1392 	mac_impl_t *mip = (mac_impl_t *)mh;
1393 	boolean_t in_use;
1394 
1395 	ASSERT(slot > 0 && slot <= mip->mi_factory_addr_num);
1396 
1397 	/*
1398 	 * Readers need to hold mi_rw_lock. Writers need to hold mac perimeter
1399 	 * and mi_rw_lock
1400 	 */
1401 	rw_enter(&mip->mi_rw_lock, RW_READER);
1402 	bcopy(mip->mi_factory_addr[slot-1].mfa_addr, mac_addr, MAXMACADDRLEN);
1403 	*addr_len = mip->mi_type->mt_addr_length;
1404 	in_use = mip->mi_factory_addr[slot-1].mfa_in_use;
1405 	if (in_use && client_name != NULL) {
1406 		bcopy(mip->mi_factory_addr[slot-1].mfa_client->mci_name,
1407 		    client_name, MAXNAMELEN);
1408 	}
1409 	if (in_use_arg != NULL)
1410 		*in_use_arg = in_use;
1411 	rw_exit(&mip->mi_rw_lock);
1412 }
1413 
1414 /*
1415  * Returns the number of factory MAC addresses (in addition to the
1416  * primary MAC address), 0 if the underlying MAC doesn't support
1417  * that feature.
1418  */
1419 uint_t
1420 mac_addr_factory_num(mac_handle_t mh)
1421 {
1422 	mac_impl_t *mip = (mac_impl_t *)mh;
1423 
1424 	return (mip->mi_factory_addr_num);
1425 }
1426 
1427 
1428 void
1429 mac_rx_group_unmark(mac_group_t *grp, uint_t flag)
1430 {
1431 	mac_ring_t	*ring;
1432 
1433 	for (ring = grp->mrg_rings; ring != NULL; ring = ring->mr_next)
1434 		ring->mr_flag &= ~flag;
1435 }
1436 
1437 /*
1438  * The following mac_hwrings_xxx() functions are private mac client functions
1439  * used by the aggr driver to access and control the underlying HW Rx group
1440  * and rings. In this case, the aggr driver has exclusive control of the
1441  * underlying HW Rx group/rings, it calls the following functions to
1442  * start/stop the HW Rx rings, disable/enable polling, add/remove mac'
1443  * addresses, or set up the Rx callback.
1444  */
1445 /* ARGSUSED */
1446 static void
1447 mac_hwrings_rx_process(void *arg, mac_resource_handle_t srs,
1448     mblk_t *mp_chain, boolean_t loopback)
1449 {
1450 	mac_soft_ring_set_t	*mac_srs = (mac_soft_ring_set_t *)srs;
1451 	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;
1452 	mac_direct_rx_t		proc;
1453 	void			*arg1;
1454 	mac_resource_handle_t	arg2;
1455 
1456 	proc = srs_rx->sr_func;
1457 	arg1 = srs_rx->sr_arg1;
1458 	arg2 = mac_srs->srs_mrh;
1459 
1460 	proc(arg1, arg2, mp_chain, NULL);
1461 }
1462 
1463 /*
1464  * This function is called to get the list of HW rings that are reserved by
1465  * an exclusive mac client.
1466  *
1467  * Return value: the number of HW rings.
1468  */
1469 int
1470 mac_hwrings_get(mac_client_handle_t mch, mac_group_handle_t *hwgh,
1471     mac_ring_handle_t *hwrh)
1472 {
1473 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1474 	flow_entry_t		*flent = mcip->mci_flent;
1475 	mac_group_t		*grp = flent->fe_rx_ring_group;
1476 	mac_ring_t		*ring;
1477 	int			cnt = 0;
1478 
1479 	/*
1480 	 * The mac client did not reserve any RX group, return directly.
1481 	 * This is probably because the underlying MAC does not support
1482 	 * any RX groups.
1483 	 */
1484 	*hwgh = NULL;
1485 	if (grp == NULL)
1486 		return (0);
1487 
1488 	/*
1489 	 * This RX group must be reserved by this mac client.
1490 	 */
1491 	ASSERT((grp->mrg_state == MAC_GROUP_STATE_RESERVED) &&
1492 	    (mch == (mac_client_handle_t)(MAC_RX_GROUP_ONLY_CLIENT(grp))));
1493 
1494 	for (ring = grp->mrg_rings; ring != NULL; ring = ring->mr_next) {
1495 		ASSERT(cnt < MAX_RINGS_PER_GROUP);
1496 		hwrh[cnt++] = (mac_ring_handle_t)ring;
1497 	}
1498 	*hwgh = (mac_group_handle_t)grp;
1499 	return (cnt);
1500 }
1501 
1502 /*
1503  * Setup the RX callback of the mac client which exclusively controls HW ring.
1504  */
1505 void
1506 mac_hwring_setup(mac_ring_handle_t hwrh, mac_resource_handle_t prh)
1507 {
1508 	mac_ring_t		*hw_ring = (mac_ring_t *)hwrh;
1509 	mac_soft_ring_set_t	*mac_srs = hw_ring->mr_srs;
1510 
1511 	mac_srs->srs_mrh = prh;
1512 	mac_srs->srs_rx.sr_lower_proc = mac_hwrings_rx_process;
1513 }
1514 
1515 void
1516 mac_hwring_teardown(mac_ring_handle_t hwrh)
1517 {
1518 	mac_ring_t		*hw_ring = (mac_ring_t *)hwrh;
1519 	mac_soft_ring_set_t	*mac_srs = hw_ring->mr_srs;
1520 
1521 	mac_srs->srs_rx.sr_lower_proc = mac_rx_srs_process;
1522 	mac_srs->srs_mrh = NULL;
1523 }
1524 
1525 int
1526 mac_hwring_disable_intr(mac_ring_handle_t rh)
1527 {
1528 	mac_ring_t *rr_ring = (mac_ring_t *)rh;
1529 	mac_intr_t *intr = &rr_ring->mr_info.mri_intr;
1530 
1531 	return (intr->mi_disable(intr->mi_handle));
1532 }
1533 
1534 int
1535 mac_hwring_enable_intr(mac_ring_handle_t rh)
1536 {
1537 	mac_ring_t *rr_ring = (mac_ring_t *)rh;
1538 	mac_intr_t *intr = &rr_ring->mr_info.mri_intr;
1539 
1540 	return (intr->mi_enable(intr->mi_handle));
1541 }
1542 
1543 int
1544 mac_hwring_start(mac_ring_handle_t rh)
1545 {
1546 	mac_ring_t *rr_ring = (mac_ring_t *)rh;
1547 
1548 	MAC_RING_UNMARK(rr_ring, MR_QUIESCE);
1549 	return (0);
1550 }
1551 
1552 void
1553 mac_hwring_stop(mac_ring_handle_t rh)
1554 {
1555 	mac_ring_t *rr_ring = (mac_ring_t *)rh;
1556 
1557 	mac_rx_ring_quiesce(rr_ring, MR_QUIESCE);
1558 }
1559 
1560 mblk_t *
1561 mac_hwring_poll(mac_ring_handle_t rh, int bytes_to_pickup)
1562 {
1563 	mac_ring_t *rr_ring = (mac_ring_t *)rh;
1564 	mac_ring_info_t *info = &rr_ring->mr_info;
1565 
1566 	return (info->mri_poll(info->mri_driver, bytes_to_pickup));
1567 }
1568 
1569 int
1570 mac_hwgroup_addmac(mac_group_handle_t gh, const uint8_t *addr)
1571 {
1572 	mac_group_t *group = (mac_group_t *)gh;
1573 
1574 	return (mac_group_addmac(group, addr));
1575 }
1576 
1577 int
1578 mac_hwgroup_remmac(mac_group_handle_t gh, const uint8_t *addr)
1579 {
1580 	mac_group_t *group = (mac_group_t *)gh;
1581 
1582 	return (mac_group_remmac(group, addr));
1583 }
1584 
1585 /*
1586  * Set the RX group to be shared/reserved. Note that the group must be
1587  * started/stopped outside of this function.
1588  */
1589 void
1590 mac_set_rx_group_state(mac_group_t *grp, mac_group_state_t state)
1591 {
1592 	/*
1593 	 * If there is no change in the group state, just return.
1594 	 */
1595 	if (grp->mrg_state == state)
1596 		return;
1597 
1598 	switch (state) {
1599 	case MAC_GROUP_STATE_RESERVED:
1600 		/*
1601 		 * Successfully reserved the group.
1602 		 *
1603 		 * Given that there is an exclusive client controlling this
1604 		 * group, we enable the group level polling when available,
1605 		 * so that SRSs get to turn on/off individual rings they's
1606 		 * assigned to.
1607 		 */
1608 		ASSERT(MAC_PERIM_HELD(grp->mrg_mh));
1609 
1610 		if (GROUP_INTR_DISABLE_FUNC(grp) != NULL)
1611 			GROUP_INTR_DISABLE_FUNC(grp)(GROUP_INTR_HANDLE(grp));
1612 
1613 		break;
1614 
1615 	case MAC_GROUP_STATE_SHARED:
1616 		/*
1617 		 * Set all rings of this group to software classified.
1618 		 * If the group has an overriding interrupt, then re-enable it.
1619 		 */
1620 		ASSERT(MAC_PERIM_HELD(grp->mrg_mh));
1621 
1622 		if (GROUP_INTR_ENABLE_FUNC(grp) != NULL)
1623 			GROUP_INTR_ENABLE_FUNC(grp)(GROUP_INTR_HANDLE(grp));
1624 
1625 		/* The ring is not available for reservations any more */
1626 		break;
1627 
1628 	case MAC_GROUP_STATE_REGISTERED:
1629 		/* Also callable from mac_register, perim is not held */
1630 		break;
1631 
1632 	default:
1633 		ASSERT(B_FALSE);
1634 		break;
1635 	}
1636 
1637 	grp->mrg_state = state;
1638 }
1639 
1640 /*
1641  * Quiesce future hardware classified packets for the specified Rx ring
1642  */
1643 static void
1644 mac_rx_ring_quiesce(mac_ring_t *rx_ring, uint_t ring_flag)
1645 {
1646 	ASSERT(rx_ring->mr_classify_type == MAC_HW_CLASSIFIER);
1647 	ASSERT(ring_flag == MR_CONDEMNED || ring_flag  == MR_QUIESCE);
1648 
1649 	mutex_enter(&rx_ring->mr_lock);
1650 	rx_ring->mr_flag |= ring_flag;
1651 	while (rx_ring->mr_refcnt != 0)
1652 		cv_wait(&rx_ring->mr_cv, &rx_ring->mr_lock);
1653 	mutex_exit(&rx_ring->mr_lock);
1654 }
1655 
1656 /*
1657  * Please see mac_tx for details about the per cpu locking scheme
1658  */
1659 static void
1660 mac_tx_lock_all(mac_client_impl_t *mcip)
1661 {
1662 	int	i;
1663 
1664 	for (i = 0; i <= mac_tx_percpu_cnt; i++)
1665 		mutex_enter(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
1666 }
1667 
1668 static void
1669 mac_tx_unlock_all(mac_client_impl_t *mcip)
1670 {
1671 	int	i;
1672 
1673 	for (i = mac_tx_percpu_cnt; i >= 0; i--)
1674 		mutex_exit(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
1675 }
1676 
1677 static void
1678 mac_tx_unlock_allbutzero(mac_client_impl_t *mcip)
1679 {
1680 	int	i;
1681 
1682 	for (i = mac_tx_percpu_cnt; i > 0; i--)
1683 		mutex_exit(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
1684 }
1685 
1686 static int
1687 mac_tx_sum_refcnt(mac_client_impl_t *mcip)
1688 {
1689 	int	i;
1690 	int	refcnt = 0;
1691 
1692 	for (i = 0; i <= mac_tx_percpu_cnt; i++)
1693 		refcnt += mcip->mci_tx_pcpu[i].pcpu_tx_refcnt;
1694 
1695 	return (refcnt);
1696 }
1697 
1698 /*
1699  * Stop future Tx packets coming down from the client in preparation for
1700  * quiescing the Tx side. This is needed for dynamic reclaim and reassignment
1701  * of rings between clients
1702  */
1703 void
1704 mac_tx_client_block(mac_client_impl_t *mcip)
1705 {
1706 	mac_tx_lock_all(mcip);
1707 	mcip->mci_tx_flag |= MCI_TX_QUIESCE;
1708 	while (mac_tx_sum_refcnt(mcip) != 0) {
1709 		mac_tx_unlock_allbutzero(mcip);
1710 		cv_wait(&mcip->mci_tx_cv, &mcip->mci_tx_pcpu[0].pcpu_tx_lock);
1711 		mutex_exit(&mcip->mci_tx_pcpu[0].pcpu_tx_lock);
1712 		mac_tx_lock_all(mcip);
1713 	}
1714 	mac_tx_unlock_all(mcip);
1715 }
1716 
1717 void
1718 mac_tx_client_unblock(mac_client_impl_t *mcip)
1719 {
1720 	mac_tx_lock_all(mcip);
1721 	mcip->mci_tx_flag &= ~MCI_TX_QUIESCE;
1722 	mac_tx_unlock_all(mcip);
1723 	/*
1724 	 * We may fail to disable flow control for the last MAC_NOTE_TX
1725 	 * notification because the MAC client is quiesced. Send the
1726 	 * notification again.
1727 	 */
1728 	i_mac_notify(mcip->mci_mip, MAC_NOTE_TX);
1729 }
1730 
1731 /*
1732  * Wait for an SRS to quiesce. The SRS worker will signal us when the
1733  * quiesce is done.
1734  */
1735 static void
1736 mac_srs_quiesce_wait(mac_soft_ring_set_t *srs, uint_t srs_flag)
1737 {
1738 	mutex_enter(&srs->srs_lock);
1739 	while (!(srs->srs_state & srs_flag))
1740 		cv_wait(&srs->srs_quiesce_done_cv, &srs->srs_lock);
1741 	mutex_exit(&srs->srs_lock);
1742 }
1743 
1744 /*
1745  * Quiescing an Rx SRS is achieved by the following sequence. The protocol
1746  * works bottom up by cutting off packet flow from the bottommost point in the
1747  * mac, then the SRS, and then the soft rings. There are 2 use cases of this
1748  * mechanism. One is a temporary quiesce of the SRS, such as say while changing
1749  * the Rx callbacks. Another use case is Rx SRS teardown. In the former case
1750  * the QUIESCE prefix/suffix is used and in the latter the CONDEMNED is used
1751  * for the SRS and MR flags. In the former case the threads pause waiting for
1752  * a restart, while in the latter case the threads exit. The Tx SRS teardown
1753  * is also mostly similar to the above.
1754  *
1755  * 1. Stop future hardware classified packets at the lowest level in the mac.
1756  *    Remove any hardware classification rule (CONDEMNED case) and mark the
1757  *    rings as CONDEMNED or QUIESCE as appropriate. This prevents the mr_refcnt
1758  *    from increasing. Upcalls from the driver that come through hardware
1759  *    classification will be dropped in mac_rx from now on. Then we wait for
1760  *    the mr_refcnt to drop to zero. When the mr_refcnt reaches zero we are
1761  *    sure there aren't any upcall threads from the driver through hardware
1762  *    classification. In the case of SRS teardown we also remove the
1763  *    classification rule in the driver.
1764  *
1765  * 2. Stop future software classified packets by marking the flow entry with
1766  *    FE_QUIESCE or FE_CONDEMNED as appropriate which prevents the refcnt from
1767  *    increasing. We also remove the flow entry from the table in the latter
1768  *    case. Then wait for the fe_refcnt to reach an appropriate quiescent value
1769  *    that indicates there aren't any active threads using that flow entry.
1770  *
1771  * 3. Quiesce the SRS and softrings by signaling the SRS. The SRS poll thread,
1772  *    SRS worker thread, and the soft ring threads are quiesced in sequence
1773  *    with the SRS worker thread serving as a master controller. This
1774  *    mechanism is explained in mac_srs_worker_quiesce().
1775  *
1776  * The restart mechanism to reactivate the SRS and softrings is explained
1777  * in mac_srs_worker_restart(). Here we just signal the SRS worker to start the
1778  * restart sequence.
1779  */
1780 void
1781 mac_rx_srs_quiesce(mac_soft_ring_set_t *srs, uint_t srs_quiesce_flag)
1782 {
1783 	flow_entry_t	*flent = srs->srs_flent;
1784 	uint_t	mr_flag, srs_done_flag;
1785 
1786 	ASSERT(MAC_PERIM_HELD((mac_handle_t)FLENT_TO_MIP(flent)));
1787 	ASSERT(!(srs->srs_type & SRST_TX));
1788 
1789 	if (srs_quiesce_flag == SRS_CONDEMNED) {
1790 		mr_flag = MR_CONDEMNED;
1791 		srs_done_flag = SRS_CONDEMNED_DONE;
1792 		if (srs->srs_type & SRST_CLIENT_POLL_ENABLED)
1793 			mac_srs_client_poll_disable(srs->srs_mcip, srs);
1794 	} else {
1795 		ASSERT(srs_quiesce_flag == SRS_QUIESCE);
1796 		mr_flag = MR_QUIESCE;
1797 		srs_done_flag = SRS_QUIESCE_DONE;
1798 		if (srs->srs_type & SRST_CLIENT_POLL_ENABLED)
1799 			mac_srs_client_poll_quiesce(srs->srs_mcip, srs);
1800 	}
1801 
1802 	if (srs->srs_ring != NULL) {
1803 		mac_rx_ring_quiesce(srs->srs_ring, mr_flag);
1804 	} else {
1805 		/*
1806 		 * SRS is driven by software classification. In case
1807 		 * of CONDEMNED, the top level teardown functions will
1808 		 * deal with flow removal.
1809 		 */
1810 		if (srs_quiesce_flag != SRS_CONDEMNED) {
1811 			FLOW_MARK(flent, FE_QUIESCE);
1812 			mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
1813 		}
1814 	}
1815 
1816 	/*
1817 	 * Signal the SRS to quiesce itself, and then cv_wait for the
1818 	 * SRS quiesce to complete. The SRS worker thread will wake us
1819 	 * up when the quiesce is complete
1820 	 */
1821 	mac_srs_signal(srs, srs_quiesce_flag);
1822 	mac_srs_quiesce_wait(srs, srs_done_flag);
1823 }
1824 
1825 /*
1826  * Remove an SRS.
1827  */
1828 void
1829 mac_rx_srs_remove(mac_soft_ring_set_t *srs)
1830 {
1831 	flow_entry_t *flent = srs->srs_flent;
1832 	int i;
1833 
1834 	mac_rx_srs_quiesce(srs, SRS_CONDEMNED);
1835 	/*
1836 	 * Locate and remove our entry in the fe_rx_srs[] array, and
1837 	 * adjust the fe_rx_srs array entries and array count by
1838 	 * moving the last entry into the vacated spot.
1839 	 */
1840 	mutex_enter(&flent->fe_lock);
1841 	for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
1842 		if (flent->fe_rx_srs[i] == srs)
1843 			break;
1844 	}
1845 
1846 	ASSERT(i != 0 && i < flent->fe_rx_srs_cnt);
1847 	if (i != flent->fe_rx_srs_cnt - 1) {
1848 		flent->fe_rx_srs[i] =
1849 		    flent->fe_rx_srs[flent->fe_rx_srs_cnt - 1];
1850 		i = flent->fe_rx_srs_cnt - 1;
1851 	}
1852 
1853 	flent->fe_rx_srs[i] = NULL;
1854 	flent->fe_rx_srs_cnt--;
1855 	mutex_exit(&flent->fe_lock);
1856 
1857 	mac_srs_free(srs);
1858 }
1859 
1860 static void
1861 mac_srs_clear_flag(mac_soft_ring_set_t *srs, uint_t flag)
1862 {
1863 	mutex_enter(&srs->srs_lock);
1864 	srs->srs_state &= ~flag;
1865 	mutex_exit(&srs->srs_lock);
1866 }
1867 
1868 void
1869 mac_rx_srs_restart(mac_soft_ring_set_t *srs)
1870 {
1871 	flow_entry_t	*flent = srs->srs_flent;
1872 	mac_ring_t	*mr;
1873 
1874 	ASSERT(MAC_PERIM_HELD((mac_handle_t)FLENT_TO_MIP(flent)));
1875 	ASSERT((srs->srs_type & SRST_TX) == 0);
1876 
1877 	/*
1878 	 * This handles a change in the number of SRSs between the quiesce and
1879 	 * and restart operation of a flow.
1880 	 */
1881 	if (!SRS_QUIESCED(srs))
1882 		return;
1883 
1884 	/*
1885 	 * Signal the SRS to restart itself. Wait for the restart to complete
1886 	 * Note that we only restart the SRS if it is not marked as
1887 	 * permanently quiesced.
1888 	 */
1889 	if (!SRS_QUIESCED_PERMANENT(srs)) {
1890 		mac_srs_signal(srs, SRS_RESTART);
1891 		mac_srs_quiesce_wait(srs, SRS_RESTART_DONE);
1892 		mac_srs_clear_flag(srs, SRS_RESTART_DONE);
1893 
1894 		mac_srs_client_poll_restart(srs->srs_mcip, srs);
1895 	}
1896 
1897 	/* Finally clear the flags to let the packets in */
1898 	mr = srs->srs_ring;
1899 	if (mr != NULL) {
1900 		MAC_RING_UNMARK(mr, MR_QUIESCE);
1901 		/* In case the ring was stopped, safely restart it */
1902 		(void) mac_start_ring(mr);
1903 	} else {
1904 		FLOW_UNMARK(flent, FE_QUIESCE);
1905 	}
1906 }
1907 
1908 /*
1909  * Temporary quiesce of a flow and associated Rx SRS.
1910  * Please see block comment above mac_rx_classify_flow_rem.
1911  */
1912 /* ARGSUSED */
1913 int
1914 mac_rx_classify_flow_quiesce(flow_entry_t *flent, void *arg)
1915 {
1916 	int		i;
1917 
1918 	for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
1919 		mac_rx_srs_quiesce((mac_soft_ring_set_t *)flent->fe_rx_srs[i],
1920 		    SRS_QUIESCE);
1921 	}
1922 	return (0);
1923 }
1924 
1925 /*
1926  * Restart a flow and associated Rx SRS that has been quiesced temporarily
1927  * Please see block comment above mac_rx_classify_flow_rem
1928  */
1929 /* ARGSUSED */
1930 int
1931 mac_rx_classify_flow_restart(flow_entry_t *flent, void *arg)
1932 {
1933 	int		i;
1934 
1935 	for (i = 0; i < flent->fe_rx_srs_cnt; i++)
1936 		mac_rx_srs_restart((mac_soft_ring_set_t *)flent->fe_rx_srs[i]);
1937 
1938 	return (0);
1939 }
1940 
1941 void
1942 mac_srs_perm_quiesce(mac_client_handle_t mch, boolean_t on)
1943 {
1944 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1945 	flow_entry_t		*flent = mcip->mci_flent;
1946 	mac_impl_t		*mip = mcip->mci_mip;
1947 	mac_soft_ring_set_t	*mac_srs;
1948 	int			i;
1949 
1950 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1951 
1952 	if (flent == NULL)
1953 		return;
1954 
1955 	for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
1956 		mac_srs = flent->fe_rx_srs[i];
1957 		mutex_enter(&mac_srs->srs_lock);
1958 		if (on)
1959 			mac_srs->srs_state |= SRS_QUIESCE_PERM;
1960 		else
1961 			mac_srs->srs_state &= ~SRS_QUIESCE_PERM;
1962 		mutex_exit(&mac_srs->srs_lock);
1963 	}
1964 }
1965 
1966 void
1967 mac_rx_client_quiesce(mac_client_handle_t mch)
1968 {
1969 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1970 	mac_impl_t		*mip = mcip->mci_mip;
1971 
1972 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1973 
1974 	if (MCIP_DATAPATH_SETUP(mcip)) {
1975 		(void) mac_rx_classify_flow_quiesce(mcip->mci_flent,
1976 		    NULL);
1977 		(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
1978 		    mac_rx_classify_flow_quiesce, NULL);
1979 	}
1980 }
1981 
1982 void
1983 mac_rx_client_restart(mac_client_handle_t mch)
1984 {
1985 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1986 	mac_impl_t		*mip = mcip->mci_mip;
1987 
1988 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1989 
1990 	if (MCIP_DATAPATH_SETUP(mcip)) {
1991 		(void) mac_rx_classify_flow_restart(mcip->mci_flent, NULL);
1992 		(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
1993 		    mac_rx_classify_flow_restart, NULL);
1994 	}
1995 }
1996 
1997 /*
1998  * This function only quiesces the Tx SRS and softring worker threads. Callers
1999  * need to make sure that there aren't any mac client threads doing current or
2000  * future transmits in the mac before calling this function.
2001  */
2002 void
2003 mac_tx_srs_quiesce(mac_soft_ring_set_t *srs, uint_t srs_quiesce_flag)
2004 {
2005 	mac_client_impl_t	*mcip = srs->srs_mcip;
2006 
2007 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2008 
2009 	ASSERT(srs->srs_type & SRST_TX);
2010 	ASSERT(srs_quiesce_flag == SRS_CONDEMNED ||
2011 	    srs_quiesce_flag == SRS_QUIESCE);
2012 
2013 	/*
2014 	 * Signal the SRS to quiesce itself, and then cv_wait for the
2015 	 * SRS quiesce to complete. The SRS worker thread will wake us
2016 	 * up when the quiesce is complete
2017 	 */
2018 	mac_srs_signal(srs, srs_quiesce_flag);
2019 	mac_srs_quiesce_wait(srs, srs_quiesce_flag == SRS_QUIESCE ?
2020 	    SRS_QUIESCE_DONE : SRS_CONDEMNED_DONE);
2021 }
2022 
2023 void
2024 mac_tx_srs_restart(mac_soft_ring_set_t *srs)
2025 {
2026 	/*
2027 	 * Resizing the fanout could result in creation of new SRSs.
2028 	 * They may not necessarily be in the quiesced state in which
2029 	 * case it need be restarted
2030 	 */
2031 	if (!SRS_QUIESCED(srs))
2032 		return;
2033 
2034 	mac_srs_signal(srs, SRS_RESTART);
2035 	mac_srs_quiesce_wait(srs, SRS_RESTART_DONE);
2036 	mac_srs_clear_flag(srs, SRS_RESTART_DONE);
2037 }
2038 
2039 /*
2040  * Temporary quiesce of a flow and associated Rx SRS.
2041  * Please see block comment above mac_rx_srs_quiesce
2042  */
2043 /* ARGSUSED */
2044 int
2045 mac_tx_flow_quiesce(flow_entry_t *flent, void *arg)
2046 {
2047 	/*
2048 	 * The fe_tx_srs is null for a subflow on an interface that is
2049 	 * not plumbed
2050 	 */
2051 	if (flent->fe_tx_srs != NULL)
2052 		mac_tx_srs_quiesce(flent->fe_tx_srs, SRS_QUIESCE);
2053 	return (0);
2054 }
2055 
2056 /* ARGSUSED */
2057 int
2058 mac_tx_flow_restart(flow_entry_t *flent, void *arg)
2059 {
2060 	/*
2061 	 * The fe_tx_srs is null for a subflow on an interface that is
2062 	 * not plumbed
2063 	 */
2064 	if (flent->fe_tx_srs != NULL)
2065 		mac_tx_srs_restart(flent->fe_tx_srs);
2066 	return (0);
2067 }
2068 
2069 void
2070 mac_tx_client_quiesce(mac_client_impl_t *mcip, uint_t srs_quiesce_flag)
2071 {
2072 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2073 
2074 	mac_tx_client_block(mcip);
2075 	if (MCIP_TX_SRS(mcip) != NULL) {
2076 		mac_tx_srs_quiesce(MCIP_TX_SRS(mcip), srs_quiesce_flag);
2077 		(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
2078 		    mac_tx_flow_quiesce, NULL);
2079 	}
2080 }
2081 
2082 void
2083 mac_tx_client_restart(mac_client_impl_t *mcip)
2084 {
2085 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2086 
2087 	mac_tx_client_unblock(mcip);
2088 	if (MCIP_TX_SRS(mcip) != NULL) {
2089 		mac_tx_srs_restart(MCIP_TX_SRS(mcip));
2090 		(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
2091 		    mac_tx_flow_restart, NULL);
2092 	}
2093 }
2094 
2095 void
2096 mac_tx_client_flush(mac_client_impl_t *mcip)
2097 {
2098 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2099 
2100 	mac_tx_client_quiesce(mcip, SRS_QUIESCE);
2101 	mac_tx_client_restart(mcip);
2102 }
2103 
2104 void
2105 mac_client_quiesce(mac_client_impl_t *mcip)
2106 {
2107 	mac_rx_client_quiesce((mac_client_handle_t)mcip);
2108 	mac_tx_client_quiesce(mcip, SRS_QUIESCE);
2109 }
2110 
2111 void
2112 mac_client_restart(mac_client_impl_t *mcip)
2113 {
2114 	mac_rx_client_restart((mac_client_handle_t)mcip);
2115 	mac_tx_client_restart(mcip);
2116 }
2117 
2118 /*
2119  * Allocate a minor number.
2120  */
2121 minor_t
2122 mac_minor_hold(boolean_t sleep)
2123 {
2124 	minor_t	minor;
2125 
2126 	/*
2127 	 * Grab a value from the arena.
2128 	 */
2129 	atomic_add_32(&minor_count, 1);
2130 
2131 	if (sleep)
2132 		minor = (uint_t)id_alloc(minor_ids);
2133 	else
2134 		minor = (uint_t)id_alloc_nosleep(minor_ids);
2135 
2136 	if (minor == 0) {
2137 		atomic_add_32(&minor_count, -1);
2138 		return (0);
2139 	}
2140 
2141 	return (minor);
2142 }
2143 
2144 /*
2145  * Release a previously allocated minor number.
2146  */
2147 void
2148 mac_minor_rele(minor_t minor)
2149 {
2150 	/*
2151 	 * Return the value to the arena.
2152 	 */
2153 	id_free(minor_ids, minor);
2154 	atomic_add_32(&minor_count, -1);
2155 }
2156 
2157 uint32_t
2158 mac_no_notification(mac_handle_t mh)
2159 {
2160 	mac_impl_t *mip = (mac_impl_t *)mh;
2161 
2162 	return (((mip->mi_state_flags & MIS_LEGACY) != 0) ?
2163 	    mip->mi_capab_legacy.ml_unsup_note : 0);
2164 }
2165 
2166 /*
2167  * Prevent any new opens of this mac in preparation for unregister
2168  */
2169 int
2170 i_mac_disable(mac_impl_t *mip)
2171 {
2172 	mac_client_impl_t	*mcip;
2173 
2174 	rw_enter(&i_mac_impl_lock, RW_WRITER);
2175 	if (mip->mi_state_flags & MIS_DISABLED) {
2176 		/* Already disabled, return success */
2177 		rw_exit(&i_mac_impl_lock);
2178 		return (0);
2179 	}
2180 	/*
2181 	 * See if there are any other references to this mac_t (e.g., VLAN's).
2182 	 * If so return failure. If all the other checks below pass, then
2183 	 * set mi_disabled atomically under the i_mac_impl_lock to prevent
2184 	 * any new VLAN's from being created or new mac client opens of this
2185 	 * mac end point.
2186 	 */
2187 	if (mip->mi_ref > 0) {
2188 		rw_exit(&i_mac_impl_lock);
2189 		return (EBUSY);
2190 	}
2191 
2192 	/*
2193 	 * mac clients must delete all multicast groups they join before
2194 	 * closing. bcast groups are reference counted, the last client
2195 	 * to delete the group will wait till the group is physically
2196 	 * deleted. Since all clients have closed this mac end point
2197 	 * mi_bcast_ngrps must be zero at this point
2198 	 */
2199 	ASSERT(mip->mi_bcast_ngrps == 0);
2200 
2201 	/*
2202 	 * Don't let go of this if it has some flows.
2203 	 * All other code guarantees no flows are added to a disabled
2204 	 * mac, therefore it is sufficient to check for the flow table
2205 	 * only here.
2206 	 */
2207 	mcip = mac_primary_client_handle(mip);
2208 	if ((mcip != NULL) && mac_link_has_flows((mac_client_handle_t)mcip)) {
2209 		rw_exit(&i_mac_impl_lock);
2210 		return (ENOTEMPTY);
2211 	}
2212 
2213 	mip->mi_state_flags |= MIS_DISABLED;
2214 	rw_exit(&i_mac_impl_lock);
2215 	return (0);
2216 }
2217 
2218 int
2219 mac_disable_nowait(mac_handle_t mh)
2220 {
2221 	mac_impl_t	*mip = (mac_impl_t *)mh;
2222 	int err;
2223 
2224 	if ((err = i_mac_perim_enter_nowait(mip)) != 0)
2225 		return (err);
2226 	err = i_mac_disable(mip);
2227 	i_mac_perim_exit(mip);
2228 	return (err);
2229 }
2230 
2231 int
2232 mac_disable(mac_handle_t mh)
2233 {
2234 	mac_impl_t	*mip = (mac_impl_t *)mh;
2235 	int err;
2236 
2237 	i_mac_perim_enter(mip);
2238 	err = i_mac_disable(mip);
2239 	i_mac_perim_exit(mip);
2240 
2241 	/*
2242 	 * Clean up notification thread and wait for it to exit.
2243 	 */
2244 	if (err == 0)
2245 		i_mac_notify_exit(mip);
2246 
2247 	return (err);
2248 }
2249 
2250 /*
2251  * Called when the MAC instance has a non empty flow table, to de-multiplex
2252  * incoming packets to the right flow.
2253  * The MAC's rw lock is assumed held as a READER.
2254  */
2255 /* ARGSUSED */
2256 static mblk_t *
2257 mac_rx_classify(mac_impl_t *mip, mac_resource_handle_t mrh, mblk_t *mp)
2258 {
2259 	flow_entry_t	*flent = NULL;
2260 	uint_t		flags = FLOW_INBOUND;
2261 	int		err;
2262 
2263 	/*
2264 	 * If the mac is a port of an aggregation, pass FLOW_IGNORE_VLAN
2265 	 * to mac_flow_lookup() so that the VLAN packets can be successfully
2266 	 * passed to the non-VLAN aggregation flows.
2267 	 *
2268 	 * Note that there is possibly a race between this and
2269 	 * mac_unicast_remove/add() and VLAN packets could be incorrectly
2270 	 * classified to non-VLAN flows of non-aggregation mac clients. These
2271 	 * VLAN packets will be then filtered out by the mac module.
2272 	 */
2273 	if ((mip->mi_state_flags & MIS_EXCLUSIVE) != 0)
2274 		flags |= FLOW_IGNORE_VLAN;
2275 
2276 	err = mac_flow_lookup(mip->mi_flow_tab, mp, flags, &flent);
2277 	if (err != 0) {
2278 		/* no registered receive function */
2279 		return (mp);
2280 	} else {
2281 		mac_client_impl_t	*mcip;
2282 
2283 		/*
2284 		 * This flent might just be an additional one on the MAC client,
2285 		 * i.e. for classification purposes (different fdesc), however
2286 		 * the resources, SRS et. al., are in the mci_flent, so if
2287 		 * this isn't the mci_flent, we need to get it.
2288 		 */
2289 		if ((mcip = flent->fe_mcip) != NULL &&
2290 		    mcip->mci_flent != flent) {
2291 			FLOW_REFRELE(flent);
2292 			flent = mcip->mci_flent;
2293 			FLOW_TRY_REFHOLD(flent, err);
2294 			if (err != 0)
2295 				return (mp);
2296 		}
2297 		(flent->fe_cb_fn)(flent->fe_cb_arg1, flent->fe_cb_arg2, mp,
2298 		    B_FALSE);
2299 		FLOW_REFRELE(flent);
2300 	}
2301 	return (NULL);
2302 }
2303 
2304 mblk_t *
2305 mac_rx_flow(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain)
2306 {
2307 	mac_impl_t	*mip = (mac_impl_t *)mh;
2308 	mblk_t		*bp, *bp1, **bpp, *list = NULL;
2309 
2310 	/*
2311 	 * We walk the chain and attempt to classify each packet.
2312 	 * The packets that couldn't be classified will be returned
2313 	 * back to the caller.
2314 	 */
2315 	bp = mp_chain;
2316 	bpp = &list;
2317 	while (bp != NULL) {
2318 		bp1 = bp;
2319 		bp = bp->b_next;
2320 		bp1->b_next = NULL;
2321 
2322 		if (mac_rx_classify(mip, mrh, bp1) != NULL) {
2323 			*bpp = bp1;
2324 			bpp = &bp1->b_next;
2325 		}
2326 	}
2327 	return (list);
2328 }
2329 
2330 static int
2331 mac_tx_flow_srs_wakeup(flow_entry_t *flent, void *arg)
2332 {
2333 	mac_ring_handle_t ring = arg;
2334 
2335 	if (flent->fe_tx_srs)
2336 		mac_tx_srs_wakeup(flent->fe_tx_srs, ring);
2337 	return (0);
2338 }
2339 
2340 void
2341 i_mac_tx_srs_notify(mac_impl_t *mip, mac_ring_handle_t ring)
2342 {
2343 	mac_client_impl_t	*cclient;
2344 	mac_soft_ring_set_t	*mac_srs;
2345 
2346 	/*
2347 	 * After grabbing the mi_rw_lock, the list of clients can't change.
2348 	 * If there are any clients mi_disabled must be B_FALSE and can't
2349 	 * get set since there are clients. If there aren't any clients we
2350 	 * don't do anything. In any case the mip has to be valid. The driver
2351 	 * must make sure that it goes single threaded (with respect to mac
2352 	 * calls) and wait for all pending mac calls to finish before calling
2353 	 * mac_unregister.
2354 	 */
2355 	rw_enter(&i_mac_impl_lock, RW_READER);
2356 	if (mip->mi_state_flags & MIS_DISABLED) {
2357 		rw_exit(&i_mac_impl_lock);
2358 		return;
2359 	}
2360 
2361 	/*
2362 	 * Get MAC tx srs from walking mac_client_handle list.
2363 	 */
2364 	rw_enter(&mip->mi_rw_lock, RW_READER);
2365 	for (cclient = mip->mi_clients_list; cclient != NULL;
2366 	    cclient = cclient->mci_client_next) {
2367 		if ((mac_srs = MCIP_TX_SRS(cclient)) != NULL)
2368 			mac_tx_srs_wakeup(mac_srs, ring);
2369 		(void) mac_flow_walk(cclient->mci_subflow_tab,
2370 		    mac_tx_flow_srs_wakeup, ring);
2371 	}
2372 	rw_exit(&mip->mi_rw_lock);
2373 	rw_exit(&i_mac_impl_lock);
2374 }
2375 
2376 /* ARGSUSED */
2377 void
2378 mac_multicast_refresh(mac_handle_t mh, mac_multicst_t refresh, void *arg,
2379     boolean_t add)
2380 {
2381 	mac_impl_t *mip = (mac_impl_t *)mh;
2382 
2383 	i_mac_perim_enter((mac_impl_t *)mh);
2384 	/*
2385 	 * If no specific refresh function was given then default to the
2386 	 * driver's m_multicst entry point.
2387 	 */
2388 	if (refresh == NULL) {
2389 		refresh = mip->mi_multicst;
2390 		arg = mip->mi_driver;
2391 	}
2392 
2393 	mac_bcast_refresh(mip, refresh, arg, add);
2394 	i_mac_perim_exit((mac_impl_t *)mh);
2395 }
2396 
2397 void
2398 mac_promisc_refresh(mac_handle_t mh, mac_setpromisc_t refresh, void *arg)
2399 {
2400 	mac_impl_t	*mip = (mac_impl_t *)mh;
2401 
2402 	/*
2403 	 * If no specific refresh function was given then default to the
2404 	 * driver's m_promisc entry point.
2405 	 */
2406 	if (refresh == NULL) {
2407 		refresh = mip->mi_setpromisc;
2408 		arg = mip->mi_driver;
2409 	}
2410 	ASSERT(refresh != NULL);
2411 
2412 	/*
2413 	 * Call the refresh function with the current promiscuity.
2414 	 */
2415 	refresh(arg, (mip->mi_devpromisc != 0));
2416 }
2417 
2418 /*
2419  * The mac client requests that the mac not to change its margin size to
2420  * be less than the specified value.  If "current" is B_TRUE, then the client
2421  * requests the mac not to change its margin size to be smaller than the
2422  * current size. Further, return the current margin size value in this case.
2423  *
2424  * We keep every requested size in an ordered list from largest to smallest.
2425  */
2426 int
2427 mac_margin_add(mac_handle_t mh, uint32_t *marginp, boolean_t current)
2428 {
2429 	mac_impl_t		*mip = (mac_impl_t *)mh;
2430 	mac_margin_req_t	**pp, *p;
2431 	int			err = 0;
2432 
2433 	rw_enter(&(mip->mi_rw_lock), RW_WRITER);
2434 	if (current)
2435 		*marginp = mip->mi_margin;
2436 
2437 	/*
2438 	 * If the current margin value cannot satisfy the margin requested,
2439 	 * return ENOTSUP directly.
2440 	 */
2441 	if (*marginp > mip->mi_margin) {
2442 		err = ENOTSUP;
2443 		goto done;
2444 	}
2445 
2446 	/*
2447 	 * Check whether the given margin is already in the list. If so,
2448 	 * bump the reference count.
2449 	 */
2450 	for (pp = &mip->mi_mmrp; (p = *pp) != NULL; pp = &p->mmr_nextp) {
2451 		if (p->mmr_margin == *marginp) {
2452 			/*
2453 			 * The margin requested is already in the list,
2454 			 * so just bump the reference count.
2455 			 */
2456 			p->mmr_ref++;
2457 			goto done;
2458 		}
2459 		if (p->mmr_margin < *marginp)
2460 			break;
2461 	}
2462 
2463 
2464 	p = kmem_zalloc(sizeof (mac_margin_req_t), KM_SLEEP);
2465 	p->mmr_margin = *marginp;
2466 	p->mmr_ref++;
2467 	p->mmr_nextp = *pp;
2468 	*pp = p;
2469 
2470 done:
2471 	rw_exit(&(mip->mi_rw_lock));
2472 	return (err);
2473 }
2474 
2475 /*
2476  * The mac client requests to cancel its previous mac_margin_add() request.
2477  * We remove the requested margin size from the list.
2478  */
2479 int
2480 mac_margin_remove(mac_handle_t mh, uint32_t margin)
2481 {
2482 	mac_impl_t		*mip = (mac_impl_t *)mh;
2483 	mac_margin_req_t	**pp, *p;
2484 	int			err = 0;
2485 
2486 	rw_enter(&(mip->mi_rw_lock), RW_WRITER);
2487 	/*
2488 	 * Find the entry in the list for the given margin.
2489 	 */
2490 	for (pp = &(mip->mi_mmrp); (p = *pp) != NULL; pp = &(p->mmr_nextp)) {
2491 		if (p->mmr_margin == margin) {
2492 			if (--p->mmr_ref == 0)
2493 				break;
2494 
2495 			/*
2496 			 * There is still a reference to this address so
2497 			 * there's nothing more to do.
2498 			 */
2499 			goto done;
2500 		}
2501 	}
2502 
2503 	/*
2504 	 * We did not find an entry for the given margin.
2505 	 */
2506 	if (p == NULL) {
2507 		err = ENOENT;
2508 		goto done;
2509 	}
2510 
2511 	ASSERT(p->mmr_ref == 0);
2512 
2513 	/*
2514 	 * Remove it from the list.
2515 	 */
2516 	*pp = p->mmr_nextp;
2517 	kmem_free(p, sizeof (mac_margin_req_t));
2518 done:
2519 	rw_exit(&(mip->mi_rw_lock));
2520 	return (err);
2521 }
2522 
2523 boolean_t
2524 mac_margin_update(mac_handle_t mh, uint32_t margin)
2525 {
2526 	mac_impl_t	*mip = (mac_impl_t *)mh;
2527 	uint32_t	margin_needed = 0;
2528 
2529 	rw_enter(&(mip->mi_rw_lock), RW_WRITER);
2530 
2531 	if (mip->mi_mmrp != NULL)
2532 		margin_needed = mip->mi_mmrp->mmr_margin;
2533 
2534 	if (margin_needed <= margin)
2535 		mip->mi_margin = margin;
2536 
2537 	rw_exit(&(mip->mi_rw_lock));
2538 
2539 	if (margin_needed <= margin)
2540 		i_mac_notify(mip, MAC_NOTE_MARGIN);
2541 
2542 	return (margin_needed <= margin);
2543 }
2544 
2545 /*
2546  * MAC Type Plugin functions.
2547  */
2548 
2549 mactype_t *
2550 mactype_getplugin(const char *pname)
2551 {
2552 	mactype_t	*mtype = NULL;
2553 	boolean_t	tried_modload = B_FALSE;
2554 
2555 	mutex_enter(&i_mactype_lock);
2556 
2557 find_registered_mactype:
2558 	if (mod_hash_find(i_mactype_hash, (mod_hash_key_t)pname,
2559 	    (mod_hash_val_t *)&mtype) != 0) {
2560 		if (!tried_modload) {
2561 			/*
2562 			 * If the plugin has not yet been loaded, then
2563 			 * attempt to load it now.  If modload() succeeds,
2564 			 * the plugin should have registered using
2565 			 * mactype_register(), in which case we can go back
2566 			 * and attempt to find it again.
2567 			 */
2568 			if (modload(MACTYPE_KMODDIR, (char *)pname) != -1) {
2569 				tried_modload = B_TRUE;
2570 				goto find_registered_mactype;
2571 			}
2572 		}
2573 	} else {
2574 		/*
2575 		 * Note that there's no danger that the plugin we've loaded
2576 		 * could be unloaded between the modload() step and the
2577 		 * reference count bump here, as we're holding
2578 		 * i_mactype_lock, which mactype_unregister() also holds.
2579 		 */
2580 		atomic_inc_32(&mtype->mt_ref);
2581 	}
2582 
2583 	mutex_exit(&i_mactype_lock);
2584 	return (mtype);
2585 }
2586 
2587 mactype_register_t *
2588 mactype_alloc(uint_t mactype_version)
2589 {
2590 	mactype_register_t *mtrp;
2591 
2592 	/*
2593 	 * Make sure there isn't a version mismatch between the plugin and
2594 	 * the framework.  In the future, if multiple versions are
2595 	 * supported, this check could become more sophisticated.
2596 	 */
2597 	if (mactype_version != MACTYPE_VERSION)
2598 		return (NULL);
2599 
2600 	mtrp = kmem_zalloc(sizeof (mactype_register_t), KM_SLEEP);
2601 	mtrp->mtr_version = mactype_version;
2602 	return (mtrp);
2603 }
2604 
2605 void
2606 mactype_free(mactype_register_t *mtrp)
2607 {
2608 	kmem_free(mtrp, sizeof (mactype_register_t));
2609 }
2610 
2611 int
2612 mactype_register(mactype_register_t *mtrp)
2613 {
2614 	mactype_t	*mtp;
2615 	mactype_ops_t	*ops = mtrp->mtr_ops;
2616 
2617 	/* Do some sanity checking before we register this MAC type. */
2618 	if (mtrp->mtr_ident == NULL || ops == NULL)
2619 		return (EINVAL);
2620 
2621 	/*
2622 	 * Verify that all mandatory callbacks are set in the ops
2623 	 * vector.
2624 	 */
2625 	if (ops->mtops_unicst_verify == NULL ||
2626 	    ops->mtops_multicst_verify == NULL ||
2627 	    ops->mtops_sap_verify == NULL ||
2628 	    ops->mtops_header == NULL ||
2629 	    ops->mtops_header_info == NULL) {
2630 		return (EINVAL);
2631 	}
2632 
2633 	mtp = kmem_zalloc(sizeof (*mtp), KM_SLEEP);
2634 	mtp->mt_ident = mtrp->mtr_ident;
2635 	mtp->mt_ops = *ops;
2636 	mtp->mt_type = mtrp->mtr_mactype;
2637 	mtp->mt_nativetype = mtrp->mtr_nativetype;
2638 	mtp->mt_addr_length = mtrp->mtr_addrlen;
2639 	if (mtrp->mtr_brdcst_addr != NULL) {
2640 		mtp->mt_brdcst_addr = kmem_alloc(mtrp->mtr_addrlen, KM_SLEEP);
2641 		bcopy(mtrp->mtr_brdcst_addr, mtp->mt_brdcst_addr,
2642 		    mtrp->mtr_addrlen);
2643 	}
2644 
2645 	mtp->mt_stats = mtrp->mtr_stats;
2646 	mtp->mt_statcount = mtrp->mtr_statcount;
2647 
2648 	mtp->mt_mapping = mtrp->mtr_mapping;
2649 	mtp->mt_mappingcount = mtrp->mtr_mappingcount;
2650 
2651 	if (mod_hash_insert(i_mactype_hash,
2652 	    (mod_hash_key_t)mtp->mt_ident, (mod_hash_val_t)mtp) != 0) {
2653 		kmem_free(mtp->mt_brdcst_addr, mtp->mt_addr_length);
2654 		kmem_free(mtp, sizeof (*mtp));
2655 		return (EEXIST);
2656 	}
2657 	return (0);
2658 }
2659 
2660 int
2661 mactype_unregister(const char *ident)
2662 {
2663 	mactype_t	*mtp;
2664 	mod_hash_val_t	val;
2665 	int 		err;
2666 
2667 	/*
2668 	 * Let's not allow MAC drivers to use this plugin while we're
2669 	 * trying to unregister it.  Holding i_mactype_lock also prevents a
2670 	 * plugin from unregistering while a MAC driver is attempting to
2671 	 * hold a reference to it in i_mactype_getplugin().
2672 	 */
2673 	mutex_enter(&i_mactype_lock);
2674 
2675 	if ((err = mod_hash_find(i_mactype_hash, (mod_hash_key_t)ident,
2676 	    (mod_hash_val_t *)&mtp)) != 0) {
2677 		/* A plugin is trying to unregister, but it never registered. */
2678 		err = ENXIO;
2679 		goto done;
2680 	}
2681 
2682 	if (mtp->mt_ref != 0) {
2683 		err = EBUSY;
2684 		goto done;
2685 	}
2686 
2687 	err = mod_hash_remove(i_mactype_hash, (mod_hash_key_t)ident, &val);
2688 	ASSERT(err == 0);
2689 	if (err != 0) {
2690 		/* This should never happen, thus the ASSERT() above. */
2691 		err = EINVAL;
2692 		goto done;
2693 	}
2694 	ASSERT(mtp == (mactype_t *)val);
2695 
2696 	kmem_free(mtp->mt_brdcst_addr, mtp->mt_addr_length);
2697 	kmem_free(mtp, sizeof (mactype_t));
2698 done:
2699 	mutex_exit(&i_mactype_lock);
2700 	return (err);
2701 }
2702 
2703 /*
2704  * Returns TRUE when the specified property is intended for the MAC framework,
2705  * as opposed to driver defined properties.
2706  */
2707 static boolean_t
2708 mac_is_macprop(mac_prop_t *macprop)
2709 {
2710 	switch (macprop->mp_id) {
2711 	case MAC_PROP_MAXBW:
2712 	case MAC_PROP_PRIO:
2713 	case MAC_PROP_BIND_CPU:
2714 		return (B_TRUE);
2715 	default:
2716 		return (B_FALSE);
2717 	}
2718 }
2719 
2720 /*
2721  * mac_set_prop() sets mac or hardware driver properties:
2722  * 	mac properties include maxbw, priority, and cpu binding list. Driver
2723  *	properties are private properties to the hardware, such as mtu, speed
2724  *	etc.
2725  * If the property is a driver property, mac_set_prop() calls driver's callback
2726  * function to set it.
2727  * If the property is a mac property, mac_set_prop() invokes mac_set_resources()
2728  * which will cache the property value in mac_impl_t and may call
2729  * mac_client_set_resource() to update property value of the primary mac client,
2730  * if it exists.
2731  */
2732 int
2733 mac_set_prop(mac_handle_t mh, mac_prop_t *macprop, void *val, uint_t valsize)
2734 {
2735 	int err = ENOTSUP;
2736 	mac_impl_t *mip = (mac_impl_t *)mh;
2737 
2738 	ASSERT(MAC_PERIM_HELD(mh));
2739 
2740 	/* If it is mac property, call mac_set_resources() */
2741 	if (mac_is_macprop(macprop)) {
2742 		mac_resource_props_t mrp;
2743 
2744 		if (valsize < sizeof (mac_resource_props_t))
2745 			return (EINVAL);
2746 		bzero(&mrp, sizeof (mac_resource_props_t));
2747 		bcopy(val, &mrp, sizeof (mrp));
2748 		return (mac_set_resources(mh, &mrp));
2749 	}
2750 	switch (macprop->mp_id) {
2751 	case MAC_PROP_MTU: {
2752 		uint32_t mtu;
2753 
2754 		if (valsize < sizeof (mtu))
2755 			return (EINVAL);
2756 		bcopy(val, &mtu, sizeof (mtu));
2757 		err = mac_set_mtu(mh, mtu, NULL);
2758 		break;
2759 	}
2760 	default:
2761 		/* For other driver properties, call driver's callback */
2762 		if (mip->mi_callbacks->mc_callbacks & MC_SETPROP) {
2763 			err = mip->mi_callbacks->mc_setprop(mip->mi_driver,
2764 			    macprop->mp_name, macprop->mp_id, valsize, val);
2765 		}
2766 	}
2767 	return (err);
2768 }
2769 
2770 /*
2771  * mac_get_prop() gets mac or hardware driver properties.
2772  *
2773  * If the property is a driver property, mac_get_prop() calls driver's callback
2774  * function to get it.
2775  * If the property is a mac property, mac_get_prop() invokes mac_get_resources()
2776  * which returns the cached value in mac_impl_t.
2777  */
2778 int
2779 mac_get_prop(mac_handle_t mh, mac_prop_t *macprop, void *val, uint_t valsize,
2780     uint_t *perm)
2781 {
2782 	int err = ENOTSUP;
2783 	mac_impl_t *mip = (mac_impl_t *)mh;
2784 	link_state_t link_state;
2785 	boolean_t is_getprop, is_setprop;
2786 
2787 	is_getprop = (mip->mi_callbacks->mc_callbacks & MC_GETPROP);
2788 	is_setprop = (mip->mi_callbacks->mc_callbacks & MC_SETPROP);
2789 
2790 	/* If mac property, read from cache */
2791 	if (mac_is_macprop(macprop)) {
2792 		mac_resource_props_t mrp;
2793 
2794 		if (valsize < sizeof (mac_resource_props_t))
2795 			return (EINVAL);
2796 		bzero(&mrp, sizeof (mac_resource_props_t));
2797 		mac_get_resources(mh, &mrp);
2798 		bcopy(&mrp, val, sizeof (mac_resource_props_t));
2799 		return (0);
2800 	}
2801 
2802 	switch (macprop->mp_id) {
2803 	case MAC_PROP_MTU: {
2804 		uint32_t sdu;
2805 		mac_propval_range_t range;
2806 
2807 		if ((macprop->mp_flags & MAC_PROP_POSSIBLE) != 0) {
2808 			if (valsize < sizeof (mac_propval_range_t))
2809 				return (EINVAL);
2810 			if (is_getprop) {
2811 				err = mip->mi_callbacks->mc_getprop(mip->
2812 				    mi_driver, macprop->mp_name, macprop->mp_id,
2813 				    macprop->mp_flags, valsize, val, perm);
2814 			}
2815 			/*
2816 			 * If the driver doesn't have *_m_getprop defined or
2817 			 * if the driver doesn't support setting MTU then
2818 			 * return the CURRENT value as POSSIBLE value.
2819 			 */
2820 			if (!is_getprop || err == ENOTSUP) {
2821 				mac_sdu_get(mh, NULL, &sdu);
2822 				range.mpr_count = 1;
2823 				range.mpr_type = MAC_PROPVAL_UINT32;
2824 				range.range_uint32[0].mpur_min =
2825 				    range.range_uint32[0].mpur_max = sdu;
2826 				bcopy(&range, val, sizeof (range));
2827 				err = 0;
2828 			}
2829 			return (err);
2830 		}
2831 		if (valsize < sizeof (sdu))
2832 			return (EINVAL);
2833 		if ((macprop->mp_flags & MAC_PROP_DEFAULT) == 0) {
2834 			mac_sdu_get(mh, NULL, &sdu);
2835 			bcopy(&sdu, val, sizeof (sdu));
2836 			if (is_setprop && (mip->mi_callbacks->mc_setprop(mip->
2837 			    mi_driver, macprop->mp_name, macprop->mp_id,
2838 			    valsize, val) == 0)) {
2839 				*perm = MAC_PROP_PERM_RW;
2840 			} else {
2841 				*perm = MAC_PROP_PERM_READ;
2842 			}
2843 			return (0);
2844 		} else {
2845 			if (mip->mi_info.mi_media == DL_ETHER) {
2846 				sdu = ETHERMTU;
2847 				bcopy(&sdu, val, sizeof (sdu));
2848 
2849 				return (0);
2850 			}
2851 			/*
2852 			 * ask driver for its default.
2853 			 */
2854 			break;
2855 		}
2856 	}
2857 	case MAC_PROP_STATUS:
2858 		if (valsize < sizeof (link_state))
2859 			return (EINVAL);
2860 		*perm = MAC_PROP_PERM_READ;
2861 		link_state = mac_link_get(mh);
2862 		bcopy(&link_state, val, sizeof (link_state));
2863 		return (0);
2864 	default:
2865 		break;
2866 
2867 	}
2868 	/* If driver property, request from driver */
2869 	if (is_getprop) {
2870 		err = mip->mi_callbacks->mc_getprop(mip->mi_driver,
2871 		    macprop->mp_name, macprop->mp_id, macprop->mp_flags,
2872 		    valsize, val, perm);
2873 	}
2874 	return (err);
2875 }
2876 
2877 int
2878 mac_fastpath_disable(mac_handle_t mh)
2879 {
2880 	mac_impl_t	*mip = (mac_impl_t *)mh;
2881 
2882 	if ((mip->mi_state_flags & MIS_LEGACY) == 0)
2883 		return (0);
2884 
2885 	return (mip->mi_capab_legacy.ml_fastpath_disable(mip->mi_driver));
2886 }
2887 
2888 void
2889 mac_fastpath_enable(mac_handle_t mh)
2890 {
2891 	mac_impl_t	*mip = (mac_impl_t *)mh;
2892 
2893 	if ((mip->mi_state_flags & MIS_LEGACY) == 0)
2894 		return;
2895 
2896 	mip->mi_capab_legacy.ml_fastpath_enable(mip->mi_driver);
2897 }
2898 
2899 void
2900 mac_register_priv_prop(mac_impl_t *mip, mac_priv_prop_t *mpp, uint_t nprop)
2901 {
2902 	mac_priv_prop_t *mpriv;
2903 
2904 	if (mpp == NULL)
2905 		return;
2906 
2907 	mpriv = kmem_zalloc(nprop * sizeof (*mpriv), KM_SLEEP);
2908 	(void) memcpy(mpriv, mpp, nprop * sizeof (*mpriv));
2909 	mip->mi_priv_prop = mpriv;
2910 	mip->mi_priv_prop_count = nprop;
2911 }
2912 
2913 void
2914 mac_unregister_priv_prop(mac_impl_t *mip)
2915 {
2916 	mac_priv_prop_t	*mpriv;
2917 
2918 	mpriv = mip->mi_priv_prop;
2919 	if (mpriv != NULL) {
2920 		kmem_free(mpriv, mip->mi_priv_prop_count * sizeof (*mpriv));
2921 		mip->mi_priv_prop = NULL;
2922 	}
2923 	mip->mi_priv_prop_count = 0;
2924 }
2925 
2926 /*
2927  * mac_ring_t 'mr' macros. Some rogue drivers may access ring structure
2928  * (by invoking mac_rx()) even after processing mac_stop_ring(). In such
2929  * cases if MAC free's the ring structure after mac_stop_ring(), any
2930  * illegal access to the ring structure coming from the driver will panic
2931  * the system. In order to protect the system from such inadverent access,
2932  * we maintain a cache of rings in the mac_impl_t after they get free'd up.
2933  * When packets are received on free'd up rings, MAC (through the generation
2934  * count mechanism) will drop such packets.
2935  */
2936 static mac_ring_t *
2937 mac_ring_alloc(mac_impl_t *mip, mac_capab_rings_t *cap_rings)
2938 {
2939 	mac_ring_t *ring;
2940 
2941 	if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
2942 		mutex_enter(&mip->mi_ring_lock);
2943 		if (mip->mi_ring_freelist != NULL) {
2944 			ring = mip->mi_ring_freelist;
2945 			mip->mi_ring_freelist = ring->mr_next;
2946 			bzero(ring, sizeof (mac_ring_t));
2947 		} else {
2948 			ring = kmem_cache_alloc(mac_ring_cache, KM_SLEEP);
2949 		}
2950 		mutex_exit(&mip->mi_ring_lock);
2951 	} else {
2952 		ring = kmem_zalloc(sizeof (mac_ring_t), KM_SLEEP);
2953 	}
2954 	ASSERT((ring != NULL) && (ring->mr_state == MR_FREE));
2955 	return (ring);
2956 }
2957 
2958 static void
2959 mac_ring_free(mac_impl_t *mip, mac_ring_t *ring)
2960 {
2961 	if (ring->mr_type == MAC_RING_TYPE_RX) {
2962 		mutex_enter(&mip->mi_ring_lock);
2963 		ring->mr_state = MR_FREE;
2964 		ring->mr_flag = 0;
2965 		ring->mr_next = mip->mi_ring_freelist;
2966 		mip->mi_ring_freelist = ring;
2967 		mutex_exit(&mip->mi_ring_lock);
2968 	} else {
2969 		kmem_free(ring, sizeof (mac_ring_t));
2970 	}
2971 }
2972 
2973 static void
2974 mac_ring_freeall(mac_impl_t *mip)
2975 {
2976 	mac_ring_t *ring_next;
2977 	mutex_enter(&mip->mi_ring_lock);
2978 	mac_ring_t *ring = mip->mi_ring_freelist;
2979 	while (ring != NULL) {
2980 		ring_next = ring->mr_next;
2981 		kmem_cache_free(mac_ring_cache, ring);
2982 		ring = ring_next;
2983 	}
2984 	mip->mi_ring_freelist = NULL;
2985 	mutex_exit(&mip->mi_ring_lock);
2986 }
2987 
2988 int
2989 mac_start_ring(mac_ring_t *ring)
2990 {
2991 	int rv = 0;
2992 
2993 	if (ring->mr_start != NULL)
2994 		rv = ring->mr_start(ring->mr_driver, ring->mr_gen_num);
2995 
2996 	return (rv);
2997 }
2998 
2999 void
3000 mac_stop_ring(mac_ring_t *ring)
3001 {
3002 	if (ring->mr_stop != NULL)
3003 		ring->mr_stop(ring->mr_driver);
3004 
3005 	/*
3006 	 * Increment the ring generation number for this ring.
3007 	 */
3008 	ring->mr_gen_num++;
3009 }
3010 
3011 int
3012 mac_start_group(mac_group_t *group)
3013 {
3014 	int rv = 0;
3015 
3016 	if (group->mrg_start != NULL)
3017 		rv = group->mrg_start(group->mrg_driver);
3018 
3019 	return (rv);
3020 }
3021 
3022 void
3023 mac_stop_group(mac_group_t *group)
3024 {
3025 	if (group->mrg_stop != NULL)
3026 		group->mrg_stop(group->mrg_driver);
3027 }
3028 
3029 /*
3030  * Called from mac_start() on the default Rx group. Broadcast and multicast
3031  * packets are received only on the default group. Hence the default group
3032  * needs to be up even if the primary client is not up, for the other groups
3033  * to be functional. We do this by calling this function at mac_start time
3034  * itself. However the broadcast packets that are received can't make their
3035  * way beyond mac_rx until a mac client creates a broadcast flow.
3036  */
3037 static int
3038 mac_start_group_and_rings(mac_group_t *group)
3039 {
3040 	mac_ring_t	*ring;
3041 	int		rv = 0;
3042 
3043 	ASSERT(group->mrg_state == MAC_GROUP_STATE_REGISTERED);
3044 	if ((rv = mac_start_group(group)) != 0)
3045 		return (rv);
3046 
3047 	for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
3048 		ASSERT(ring->mr_state == MR_FREE);
3049 		if ((rv = mac_start_ring(ring)) != 0)
3050 			goto error;
3051 		ring->mr_state = MR_INUSE;
3052 		ring->mr_classify_type = MAC_SW_CLASSIFIER;
3053 	}
3054 	return (0);
3055 
3056 error:
3057 	mac_stop_group_and_rings(group);
3058 	return (rv);
3059 }
3060 
3061 /* Called from mac_stop on the default Rx group */
3062 static void
3063 mac_stop_group_and_rings(mac_group_t *group)
3064 {
3065 	mac_ring_t	*ring;
3066 
3067 	for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
3068 		if (ring->mr_state != MR_FREE) {
3069 			mac_stop_ring(ring);
3070 			ring->mr_state = MR_FREE;
3071 			ring->mr_flag = 0;
3072 			ring->mr_classify_type = MAC_NO_CLASSIFIER;
3073 		}
3074 	}
3075 	mac_stop_group(group);
3076 }
3077 
3078 
3079 static mac_ring_t *
3080 mac_init_ring(mac_impl_t *mip, mac_group_t *group, int index,
3081     mac_capab_rings_t *cap_rings)
3082 {
3083 	mac_ring_t *ring;
3084 	mac_ring_info_t ring_info;
3085 
3086 	ring = mac_ring_alloc(mip, cap_rings);
3087 
3088 	/* Prepare basic information of ring */
3089 	ring->mr_index = index;
3090 	ring->mr_type = group->mrg_type;
3091 	ring->mr_gh = (mac_group_handle_t)group;
3092 
3093 	/* Insert the new ring to the list. */
3094 	ring->mr_next = group->mrg_rings;
3095 	group->mrg_rings = ring;
3096 
3097 	/* Zero to reuse the info data structure */
3098 	bzero(&ring_info, sizeof (ring_info));
3099 
3100 	/* Query ring information from driver */
3101 	cap_rings->mr_rget(mip->mi_driver, group->mrg_type, group->mrg_index,
3102 	    index, &ring_info, (mac_ring_handle_t)ring);
3103 
3104 	ring->mr_info = ring_info;
3105 
3106 	/* Update ring's status */
3107 	ring->mr_state = MR_FREE;
3108 	ring->mr_flag = 0;
3109 
3110 	/* Update the ring count of the group */
3111 	group->mrg_cur_count++;
3112 	return (ring);
3113 }
3114 
3115 /*
3116  * Rings are chained together for easy regrouping.
3117  */
3118 static void
3119 mac_init_group(mac_impl_t *mip, mac_group_t *group, int size,
3120     mac_capab_rings_t *cap_rings)
3121 {
3122 	int index;
3123 
3124 	/*
3125 	 * Initialize all ring members of this group. Size of zero will not
3126 	 * enter the loop, so it's safe for initializing an empty group.
3127 	 */
3128 	for (index = size - 1; index >= 0; index--)
3129 		(void) mac_init_ring(mip, group, index, cap_rings);
3130 }
3131 
3132 int
3133 mac_init_rings(mac_impl_t *mip, mac_ring_type_t rtype)
3134 {
3135 	mac_capab_rings_t *cap_rings;
3136 	mac_group_t *group, *groups;
3137 	mac_group_info_t group_info;
3138 	uint_t group_free = 0;
3139 	uint_t ring_left;
3140 	mac_ring_t *ring;
3141 	int g, err = 0;
3142 
3143 	switch (rtype) {
3144 	case MAC_RING_TYPE_RX:
3145 		ASSERT(mip->mi_rx_groups == NULL);
3146 
3147 		cap_rings = &mip->mi_rx_rings_cap;
3148 		cap_rings->mr_type = MAC_RING_TYPE_RX;
3149 		break;
3150 	case MAC_RING_TYPE_TX:
3151 		ASSERT(mip->mi_tx_groups == NULL);
3152 
3153 		cap_rings = &mip->mi_tx_rings_cap;
3154 		cap_rings->mr_type = MAC_RING_TYPE_TX;
3155 		break;
3156 	default:
3157 		ASSERT(B_FALSE);
3158 	}
3159 
3160 	if (!i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_RINGS,
3161 	    cap_rings))
3162 		return (0);
3163 
3164 	/*
3165 	 * Allocate a contiguous buffer for all groups.
3166 	 */
3167 	groups = kmem_zalloc(sizeof (mac_group_t) * (cap_rings->mr_gnum + 1),
3168 	    KM_SLEEP);
3169 
3170 	ring_left = cap_rings->mr_rnum;
3171 
3172 	/*
3173 	 * Get all ring groups if any, and get their ring members
3174 	 * if any.
3175 	 */
3176 	for (g = 0; g < cap_rings->mr_gnum; g++) {
3177 		group = groups + g;
3178 
3179 		/* Prepare basic information of the group */
3180 		group->mrg_index = g;
3181 		group->mrg_type = rtype;
3182 		group->mrg_state = MAC_GROUP_STATE_UNINIT;
3183 		group->mrg_mh = (mac_handle_t)mip;
3184 		group->mrg_next = group + 1;
3185 
3186 		/* Zero to reuse the info data structure */
3187 		bzero(&group_info, sizeof (group_info));
3188 
3189 		/* Query group information from driver */
3190 		cap_rings->mr_gget(mip->mi_driver, rtype, g, &group_info,
3191 		    (mac_group_handle_t)group);
3192 
3193 		switch (cap_rings->mr_group_type) {
3194 		case MAC_GROUP_TYPE_DYNAMIC:
3195 			if (cap_rings->mr_gaddring == NULL ||
3196 			    cap_rings->mr_gremring == NULL) {
3197 				DTRACE_PROBE3(
3198 				    mac__init__rings_no_addremring,
3199 				    char *, mip->mi_name,
3200 				    mac_group_add_ring_t,
3201 				    cap_rings->mr_gaddring,
3202 				    mac_group_add_ring_t,
3203 				    cap_rings->mr_gremring);
3204 				err = EINVAL;
3205 				goto bail;
3206 			}
3207 
3208 			switch (rtype) {
3209 			case MAC_RING_TYPE_RX:
3210 				/*
3211 				 * The first RX group must have non-zero
3212 				 * rings, and the following groups must
3213 				 * have zero rings.
3214 				 */
3215 				if (g == 0 && group_info.mgi_count == 0) {
3216 					DTRACE_PROBE1(
3217 					    mac__init__rings__rx__def__zero,
3218 					    char *, mip->mi_name);
3219 					err = EINVAL;
3220 					goto bail;
3221 				}
3222 				if (g > 0 && group_info.mgi_count != 0) {
3223 					DTRACE_PROBE3(
3224 					    mac__init__rings__rx__nonzero,
3225 					    char *, mip->mi_name,
3226 					    int, g, int, group_info.mgi_count);
3227 					err = EINVAL;
3228 					goto bail;
3229 				}
3230 				break;
3231 			case MAC_RING_TYPE_TX:
3232 				/*
3233 				 * All TX ring groups must have zero rings.
3234 				 */
3235 				if (group_info.mgi_count != 0) {
3236 					DTRACE_PROBE3(
3237 					    mac__init__rings__tx__nonzero,
3238 					    char *, mip->mi_name,
3239 					    int, g, int, group_info.mgi_count);
3240 					err = EINVAL;
3241 					goto bail;
3242 				}
3243 				break;
3244 			}
3245 			break;
3246 		case MAC_GROUP_TYPE_STATIC:
3247 			/*
3248 			 * Note that an empty group is allowed, e.g., an aggr
3249 			 * would start with an empty group.
3250 			 */
3251 			break;
3252 		default:
3253 			/* unknown group type */
3254 			DTRACE_PROBE2(mac__init__rings__unknown__type,
3255 			    char *, mip->mi_name,
3256 			    int, cap_rings->mr_group_type);
3257 			err = EINVAL;
3258 			goto bail;
3259 		}
3260 
3261 
3262 		/*
3263 		 * Driver must register group->mgi_addmac/remmac() for rx groups
3264 		 * to support multiple MAC addresses.
3265 		 */
3266 		if (rtype == MAC_RING_TYPE_RX) {
3267 			if ((group_info.mgi_addmac == NULL) ||
3268 			    (group_info.mgi_addmac == NULL))
3269 				goto bail;
3270 		}
3271 
3272 		/* Cache driver-supplied information */
3273 		group->mrg_info = group_info;
3274 
3275 		/* Update the group's status and group count. */
3276 		mac_set_rx_group_state(group, MAC_GROUP_STATE_REGISTERED);
3277 		group_free++;
3278 
3279 		group->mrg_rings = NULL;
3280 		group->mrg_cur_count = 0;
3281 		mac_init_group(mip, group, group_info.mgi_count, cap_rings);
3282 		ring_left -= group_info.mgi_count;
3283 
3284 		/* The current group size should be equal to default value */
3285 		ASSERT(group->mrg_cur_count == group_info.mgi_count);
3286 	}
3287 
3288 	/* Build up a dummy group for free resources as a pool */
3289 	group = groups + cap_rings->mr_gnum;
3290 
3291 	/* Prepare basic information of the group */
3292 	group->mrg_index = -1;
3293 	group->mrg_type = rtype;
3294 	group->mrg_state = MAC_GROUP_STATE_UNINIT;
3295 	group->mrg_mh = (mac_handle_t)mip;
3296 	group->mrg_next = NULL;
3297 
3298 	/*
3299 	 * If there are ungrouped rings, allocate a continuous buffer for
3300 	 * remaining resources.
3301 	 */
3302 	if (ring_left != 0) {
3303 		group->mrg_rings = NULL;
3304 		group->mrg_cur_count = 0;
3305 		mac_init_group(mip, group, ring_left, cap_rings);
3306 
3307 		/* The current group size should be equal to ring_left */
3308 		ASSERT(group->mrg_cur_count == ring_left);
3309 
3310 		ring_left = 0;
3311 
3312 		/* Update this group's status */
3313 		mac_set_rx_group_state(group, MAC_GROUP_STATE_REGISTERED);
3314 	} else
3315 		group->mrg_rings = NULL;
3316 
3317 	ASSERT(ring_left == 0);
3318 
3319 bail:
3320 	/* Cache other important information to finalize the initialization */
3321 	switch (rtype) {
3322 	case MAC_RING_TYPE_RX:
3323 		mip->mi_rx_group_type = cap_rings->mr_group_type;
3324 		mip->mi_rx_group_count = cap_rings->mr_gnum;
3325 		mip->mi_rx_groups = groups;
3326 		break;
3327 	case MAC_RING_TYPE_TX:
3328 		mip->mi_tx_group_type = cap_rings->mr_group_type;
3329 		mip->mi_tx_group_count = cap_rings->mr_gnum;
3330 		mip->mi_tx_group_free = group_free;
3331 		mip->mi_tx_groups = groups;
3332 
3333 		/*
3334 		 * Ring 0 is used as the default one and it could be assigned
3335 		 * to a client as well.
3336 		 */
3337 		group = groups + cap_rings->mr_gnum;
3338 		ring = group->mrg_rings;
3339 		while ((ring->mr_index != 0) && (ring->mr_next != NULL))
3340 			ring = ring->mr_next;
3341 		ASSERT(ring->mr_index == 0);
3342 		mip->mi_default_tx_ring = (mac_ring_handle_t)ring;
3343 		break;
3344 	default:
3345 		ASSERT(B_FALSE);
3346 	}
3347 
3348 	if (err != 0)
3349 		mac_free_rings(mip, rtype);
3350 
3351 	return (err);
3352 }
3353 
3354 /*
3355  * Called to free all ring groups with particular type. It's supposed all groups
3356  * have been released by clinet.
3357  */
3358 void
3359 mac_free_rings(mac_impl_t *mip, mac_ring_type_t rtype)
3360 {
3361 	mac_group_t *group, *groups;
3362 	uint_t group_count;
3363 
3364 	switch (rtype) {
3365 	case MAC_RING_TYPE_RX:
3366 		if (mip->mi_rx_groups == NULL)
3367 			return;
3368 
3369 		groups = mip->mi_rx_groups;
3370 		group_count = mip->mi_rx_group_count;
3371 
3372 		mip->mi_rx_groups = NULL;
3373 		mip->mi_rx_group_count = 0;
3374 		break;
3375 	case MAC_RING_TYPE_TX:
3376 		ASSERT(mip->mi_tx_group_count == mip->mi_tx_group_free);
3377 
3378 		if (mip->mi_tx_groups == NULL)
3379 			return;
3380 
3381 		groups = mip->mi_tx_groups;
3382 		group_count = mip->mi_tx_group_count;
3383 
3384 		mip->mi_tx_groups = NULL;
3385 		mip->mi_tx_group_count = 0;
3386 		mip->mi_tx_group_free = 0;
3387 		mip->mi_default_tx_ring = NULL;
3388 		break;
3389 	default:
3390 		ASSERT(B_FALSE);
3391 	}
3392 
3393 	for (group = groups; group != NULL; group = group->mrg_next) {
3394 		mac_ring_t *ring;
3395 
3396 		if (group->mrg_cur_count == 0)
3397 			continue;
3398 
3399 		ASSERT(group->mrg_rings != NULL);
3400 
3401 		while ((ring = group->mrg_rings) != NULL) {
3402 			group->mrg_rings = ring->mr_next;
3403 			mac_ring_free(mip, ring);
3404 		}
3405 	}
3406 
3407 	/* Free all the cached rings */
3408 	mac_ring_freeall(mip);
3409 	/* Free the block of group data strutures */
3410 	kmem_free(groups, sizeof (mac_group_t) * (group_count + 1));
3411 }
3412 
3413 /*
3414  * Associate a MAC address with a receive group.
3415  *
3416  * The return value of this function should always be checked properly, because
3417  * any type of failure could cause unexpected results. A group can be added
3418  * or removed with a MAC address only after it has been reserved. Ideally,
3419  * a successful reservation always leads to calling mac_group_addmac() to
3420  * steer desired traffic. Failure of adding an unicast MAC address doesn't
3421  * always imply that the group is functioning abnormally.
3422  *
3423  * Currently this function is called everywhere, and it reflects assumptions
3424  * about MAC addresses in the implementation. CR 6735196.
3425  */
3426 int
3427 mac_group_addmac(mac_group_t *group, const uint8_t *addr)
3428 {
3429 	ASSERT(group->mrg_type == MAC_RING_TYPE_RX);
3430 	ASSERT(group->mrg_info.mgi_addmac != NULL);
3431 
3432 	return (group->mrg_info.mgi_addmac(group->mrg_info.mgi_driver, addr));
3433 }
3434 
3435 /*
3436  * Remove the association between MAC address and receive group.
3437  */
3438 int
3439 mac_group_remmac(mac_group_t *group, const uint8_t *addr)
3440 {
3441 	ASSERT(group->mrg_type == MAC_RING_TYPE_RX);
3442 	ASSERT(group->mrg_info.mgi_remmac != NULL);
3443 
3444 	return (group->mrg_info.mgi_remmac(group->mrg_info.mgi_driver, addr));
3445 }
3446 
3447 /*
3448  * Release a ring in use by marking it MR_FREE.
3449  * Any other client may reserve it for its use.
3450  */
3451 void
3452 mac_release_tx_ring(mac_ring_handle_t rh)
3453 {
3454 	mac_ring_t *ring = (mac_ring_t *)rh;
3455 	mac_group_t *group = (mac_group_t *)ring->mr_gh;
3456 	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
3457 
3458 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
3459 	ASSERT(ring->mr_state != MR_FREE);
3460 
3461 	/*
3462 	 * Default tx ring will be released by mac_stop().
3463 	 */
3464 	if (rh == mip->mi_default_tx_ring)
3465 		return;
3466 
3467 	mac_stop_ring(ring);
3468 
3469 	ring->mr_state = MR_FREE;
3470 	ring->mr_flag = 0;
3471 }
3472 
3473 /*
3474  * Send packets through a selected tx ring.
3475  */
3476 mblk_t *
3477 mac_ring_tx(mac_ring_handle_t rh, mblk_t *mp)
3478 {
3479 	mac_ring_t *ring = (mac_ring_t *)rh;
3480 	mac_ring_info_t *info = &ring->mr_info;
3481 
3482 	ASSERT(ring->mr_type == MAC_RING_TYPE_TX);
3483 	ASSERT(ring->mr_state >= MR_INUSE);
3484 	ASSERT(info->mri_tx != NULL);
3485 
3486 	return (info->mri_tx(info->mri_driver, mp));
3487 }
3488 
3489 /*
3490  * Find a ring from its index.
3491  */
3492 mac_ring_t *
3493 mac_find_ring(mac_group_t *group, int index)
3494 {
3495 	mac_ring_t *ring = group->mrg_rings;
3496 
3497 	for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next)
3498 		if (ring->mr_index == index)
3499 			break;
3500 
3501 	return (ring);
3502 }
3503 /*
3504  * Add a ring to an existing group.
3505  *
3506  * The ring must be either passed directly (for example if the ring
3507  * movement is initiated by the framework), or specified through a driver
3508  * index (for example when the ring is added by the driver.
3509  *
3510  * The caller needs to call mac_perim_enter() before calling this function.
3511  */
3512 int
3513 i_mac_group_add_ring(mac_group_t *group, mac_ring_t *ring, int index)
3514 {
3515 	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
3516 	mac_capab_rings_t *cap_rings;
3517 	boolean_t driver_call = (ring == NULL);
3518 	mac_group_type_t group_type;
3519 	int ret = 0;
3520 
3521 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
3522 
3523 	switch (group->mrg_type) {
3524 	case MAC_RING_TYPE_RX:
3525 		cap_rings = &mip->mi_rx_rings_cap;
3526 		group_type = mip->mi_rx_group_type;
3527 		break;
3528 	case MAC_RING_TYPE_TX:
3529 		cap_rings = &mip->mi_tx_rings_cap;
3530 		group_type = mip->mi_tx_group_type;
3531 		break;
3532 	default:
3533 		ASSERT(B_FALSE);
3534 	}
3535 
3536 	/*
3537 	 * There should be no ring with the same ring index in the target
3538 	 * group.
3539 	 */
3540 	ASSERT(mac_find_ring(group, driver_call ? index : ring->mr_index) ==
3541 	    NULL);
3542 
3543 	if (driver_call) {
3544 		/*
3545 		 * The function is called as a result of a request from
3546 		 * a driver to add a ring to an existing group, for example
3547 		 * from the aggregation driver. Allocate a new mac_ring_t
3548 		 * for that ring.
3549 		 */
3550 		ring = mac_init_ring(mip, group, index, cap_rings);
3551 		ASSERT(group->mrg_state > MAC_GROUP_STATE_UNINIT);
3552 	} else {
3553 		/*
3554 		 * The function is called as a result of a MAC layer request
3555 		 * to add a ring to an existing group. In this case the
3556 		 * ring is being moved between groups, which requires
3557 		 * the underlying driver to support dynamic grouping,
3558 		 * and the mac_ring_t already exists.
3559 		 */
3560 		ASSERT(group_type == MAC_GROUP_TYPE_DYNAMIC);
3561 		ASSERT(cap_rings->mr_gaddring != NULL);
3562 		ASSERT(ring->mr_gh == NULL);
3563 	}
3564 
3565 	/*
3566 	 * At this point the ring should not be in use, and it should be
3567 	 * of the right for the target group.
3568 	 */
3569 	ASSERT(ring->mr_state < MR_INUSE);
3570 	ASSERT(ring->mr_srs == NULL);
3571 	ASSERT(ring->mr_type == group->mrg_type);
3572 
3573 	if (!driver_call) {
3574 		/*
3575 		 * Add the driver level hardware ring if the process was not
3576 		 * initiated by the driver, and the target group is not the
3577 		 * group.
3578 		 */
3579 		if (group->mrg_driver != NULL) {
3580 			cap_rings->mr_gaddring(group->mrg_driver,
3581 			    ring->mr_driver, ring->mr_type);
3582 		}
3583 
3584 		/*
3585 		 * Insert the ring ahead existing rings.
3586 		 */
3587 		ring->mr_next = group->mrg_rings;
3588 		group->mrg_rings = ring;
3589 		ring->mr_gh = (mac_group_handle_t)group;
3590 		group->mrg_cur_count++;
3591 	}
3592 
3593 	/*
3594 	 * If the group has not been actively used, we're done.
3595 	 */
3596 	if (group->mrg_index != -1 &&
3597 	    group->mrg_state < MAC_GROUP_STATE_RESERVED)
3598 		return (0);
3599 
3600 	/*
3601 	 * Set up SRS/SR according to the ring type.
3602 	 */
3603 	switch (ring->mr_type) {
3604 	case MAC_RING_TYPE_RX:
3605 		/*
3606 		 * Setup SRS on top of the new ring if the group is
3607 		 * reserved for someones exclusive use.
3608 		 */
3609 		if (group->mrg_state == MAC_GROUP_STATE_RESERVED) {
3610 			flow_entry_t *flent;
3611 			mac_client_impl_t *mcip;
3612 
3613 			mcip = MAC_RX_GROUP_ONLY_CLIENT(group);
3614 			ASSERT(mcip != NULL);
3615 			flent = mcip->mci_flent;
3616 			ASSERT(flent->fe_rx_srs_cnt > 0);
3617 			mac_srs_group_setup(mcip, flent, group, SRST_LINK);
3618 		}
3619 		break;
3620 	case MAC_RING_TYPE_TX:
3621 		/*
3622 		 * For TX this function is only invoked during the
3623 		 * initial creation of a group when a share is
3624 		 * associated with a MAC client. So the datapath is not
3625 		 * yet setup, and will be setup later after the
3626 		 * group has been reserved and populated.
3627 		 */
3628 		break;
3629 	default:
3630 		ASSERT(B_FALSE);
3631 	}
3632 
3633 	/*
3634 	 * Start the ring if needed. Failure causes to undo the grouping action.
3635 	 */
3636 	if ((ret = mac_start_ring(ring)) != 0) {
3637 		if (ring->mr_type == MAC_RING_TYPE_RX) {
3638 			if (ring->mr_srs != NULL) {
3639 				mac_rx_srs_remove(ring->mr_srs);
3640 				ring->mr_srs = NULL;
3641 			}
3642 		}
3643 		if (!driver_call) {
3644 			cap_rings->mr_gremring(group->mrg_driver,
3645 			    ring->mr_driver, ring->mr_type);
3646 		}
3647 		group->mrg_cur_count--;
3648 		group->mrg_rings = ring->mr_next;
3649 
3650 		ring->mr_gh = NULL;
3651 
3652 		if (driver_call)
3653 			mac_ring_free(mip, ring);
3654 
3655 		return (ret);
3656 	}
3657 
3658 	/*
3659 	 * Update the ring's state.
3660 	 */
3661 	ring->mr_state = MR_INUSE;
3662 	MAC_RING_UNMARK(ring, MR_INCIPIENT);
3663 	return (0);
3664 }
3665 
3666 /*
3667  * Remove a ring from it's current group. MAC internal function for dynamic
3668  * grouping.
3669  *
3670  * The caller needs to call mac_perim_enter() before calling this function.
3671  */
3672 void
3673 i_mac_group_rem_ring(mac_group_t *group, mac_ring_t *ring,
3674     boolean_t driver_call)
3675 {
3676 	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
3677 	mac_capab_rings_t *cap_rings = NULL;
3678 	mac_group_type_t group_type;
3679 
3680 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
3681 
3682 	ASSERT(mac_find_ring(group, ring->mr_index) == ring);
3683 	ASSERT((mac_group_t *)ring->mr_gh == group);
3684 	ASSERT(ring->mr_type == group->mrg_type);
3685 
3686 	switch (ring->mr_type) {
3687 	case MAC_RING_TYPE_RX:
3688 		group_type = mip->mi_rx_group_type;
3689 		cap_rings = &mip->mi_rx_rings_cap;
3690 
3691 		if (group->mrg_state >= MAC_GROUP_STATE_RESERVED)
3692 			mac_stop_ring(ring);
3693 
3694 		/*
3695 		 * Only hardware classified packets hold a reference to the
3696 		 * ring all the way up the Rx path. mac_rx_srs_remove()
3697 		 * will take care of quiescing the Rx path and removing the
3698 		 * SRS. The software classified path neither holds a reference
3699 		 * nor any association with the ring in mac_rx.
3700 		 */
3701 		if (ring->mr_srs != NULL) {
3702 			mac_rx_srs_remove(ring->mr_srs);
3703 			ring->mr_srs = NULL;
3704 		}
3705 		ring->mr_state = MR_FREE;
3706 		ring->mr_flag = 0;
3707 
3708 		break;
3709 	case MAC_RING_TYPE_TX:
3710 		/*
3711 		 * For TX this function is only invoked in two
3712 		 * cases:
3713 		 *
3714 		 * 1) In the case of a failure during the
3715 		 * initial creation of a group when a share is
3716 		 * associated with a MAC client. So the SRS is not
3717 		 * yet setup, and will be setup later after the
3718 		 * group has been reserved and populated.
3719 		 *
3720 		 * 2) From mac_release_tx_group() when freeing
3721 		 * a TX SRS.
3722 		 *
3723 		 * In both cases the SRS and its soft rings are
3724 		 * already quiesced.
3725 		 */
3726 		ASSERT(!driver_call);
3727 		group_type = mip->mi_tx_group_type;
3728 		cap_rings = &mip->mi_tx_rings_cap;
3729 		break;
3730 	default:
3731 		ASSERT(B_FALSE);
3732 	}
3733 
3734 	/*
3735 	 * Remove the ring from the group.
3736 	 */
3737 	if (ring == group->mrg_rings)
3738 		group->mrg_rings = ring->mr_next;
3739 	else {
3740 		mac_ring_t *pre;
3741 
3742 		pre = group->mrg_rings;
3743 		while (pre->mr_next != ring)
3744 			pre = pre->mr_next;
3745 		pre->mr_next = ring->mr_next;
3746 	}
3747 	group->mrg_cur_count--;
3748 
3749 	if (!driver_call) {
3750 		ASSERT(group_type == MAC_GROUP_TYPE_DYNAMIC);
3751 		ASSERT(cap_rings->mr_gremring != NULL);
3752 
3753 		/*
3754 		 * Remove the driver level hardware ring.
3755 		 */
3756 		if (group->mrg_driver != NULL) {
3757 			cap_rings->mr_gremring(group->mrg_driver,
3758 			    ring->mr_driver, ring->mr_type);
3759 		}
3760 	}
3761 
3762 	ring->mr_gh = NULL;
3763 	if (driver_call) {
3764 		mac_ring_free(mip, ring);
3765 	} else {
3766 		ring->mr_state = MR_FREE;
3767 		ring->mr_flag = 0;
3768 	}
3769 }
3770 
3771 /*
3772  * Move a ring to the target group. If needed, remove the ring from the group
3773  * that it currently belongs to.
3774  *
3775  * The caller need to enter MAC's perimeter by calling mac_perim_enter().
3776  */
3777 static int
3778 mac_group_mov_ring(mac_impl_t *mip, mac_group_t *d_group, mac_ring_t *ring)
3779 {
3780 	mac_group_t *s_group = (mac_group_t *)ring->mr_gh;
3781 	int rv;
3782 
3783 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
3784 	ASSERT(d_group != NULL);
3785 	ASSERT(s_group->mrg_mh == d_group->mrg_mh);
3786 
3787 	if (s_group == d_group)
3788 		return (0);
3789 
3790 	/*
3791 	 * Remove it from current group first.
3792 	 */
3793 	if (s_group != NULL)
3794 		i_mac_group_rem_ring(s_group, ring, B_FALSE);
3795 
3796 	/*
3797 	 * Add it to the new group.
3798 	 */
3799 	rv = i_mac_group_add_ring(d_group, ring, 0);
3800 	if (rv != 0) {
3801 		/*
3802 		 * Failed to add ring back to source group. If
3803 		 * that fails, the ring is stuck in limbo, log message.
3804 		 */
3805 		if (i_mac_group_add_ring(s_group, ring, 0)) {
3806 			cmn_err(CE_WARN, "%s: failed to move ring %p\n",
3807 			    mip->mi_name, (void *)ring);
3808 		}
3809 	}
3810 
3811 	return (rv);
3812 }
3813 
3814 /*
3815  * Find a MAC address according to its value.
3816  */
3817 mac_address_t *
3818 mac_find_macaddr(mac_impl_t *mip, uint8_t *mac_addr)
3819 {
3820 	mac_address_t *map;
3821 
3822 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
3823 
3824 	for (map = mip->mi_addresses; map != NULL; map = map->ma_next) {
3825 		if (bcmp(mac_addr, map->ma_addr, map->ma_len) == 0)
3826 			break;
3827 	}
3828 
3829 	return (map);
3830 }
3831 
3832 /*
3833  * Check whether the MAC address is shared by multiple clients.
3834  */
3835 boolean_t
3836 mac_check_macaddr_shared(mac_address_t *map)
3837 {
3838 	ASSERT(MAC_PERIM_HELD((mac_handle_t)map->ma_mip));
3839 
3840 	return (map->ma_nusers > 1);
3841 }
3842 
3843 /*
3844  * Remove the specified MAC address from the MAC address list and free it.
3845  */
3846 static void
3847 mac_free_macaddr(mac_address_t *map)
3848 {
3849 	mac_impl_t *mip = map->ma_mip;
3850 
3851 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
3852 	ASSERT(mip->mi_addresses != NULL);
3853 
3854 	map = mac_find_macaddr(mip, map->ma_addr);
3855 
3856 	ASSERT(map != NULL);
3857 	ASSERT(map->ma_nusers == 0);
3858 
3859 	if (map == mip->mi_addresses) {
3860 		mip->mi_addresses = map->ma_next;
3861 	} else {
3862 		mac_address_t *pre;
3863 
3864 		pre = mip->mi_addresses;
3865 		while (pre->ma_next != map)
3866 			pre = pre->ma_next;
3867 		pre->ma_next = map->ma_next;
3868 	}
3869 
3870 	kmem_free(map, sizeof (mac_address_t));
3871 }
3872 
3873 /*
3874  * Add a MAC address reference for a client. If the desired MAC address
3875  * exists, add a reference to it. Otherwise, add the new address by adding
3876  * it to a reserved group or setting promiscuous mode. Won't try different
3877  * group is the group is non-NULL, so the caller must explictly share
3878  * default group when needed.
3879  *
3880  * Note, the primary MAC address is initialized at registration time, so
3881  * to add it to default group only need to activate it if its reference
3882  * count is still zero. Also, some drivers may not have advertised RINGS
3883  * capability.
3884  */
3885 int
3886 mac_add_macaddr(mac_impl_t *mip, mac_group_t *group, uint8_t *mac_addr,
3887     boolean_t use_hw)
3888 {
3889 	mac_address_t *map;
3890 	int err = 0;
3891 	boolean_t allocated_map = B_FALSE;
3892 
3893 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
3894 
3895 	map = mac_find_macaddr(mip, mac_addr);
3896 
3897 	/*
3898 	 * If the new MAC address has not been added. Allocate a new one
3899 	 * and set it up.
3900 	 */
3901 	if (map == NULL) {
3902 		map = kmem_zalloc(sizeof (mac_address_t), KM_SLEEP);
3903 		map->ma_len = mip->mi_type->mt_addr_length;
3904 		bcopy(mac_addr, map->ma_addr, map->ma_len);
3905 		map->ma_nusers = 0;
3906 		map->ma_group = group;
3907 		map->ma_mip = mip;
3908 
3909 		/* add the new MAC address to the head of the address list */
3910 		map->ma_next = mip->mi_addresses;
3911 		mip->mi_addresses = map;
3912 
3913 		allocated_map = B_TRUE;
3914 	}
3915 
3916 	ASSERT(map->ma_group == group);
3917 
3918 	/*
3919 	 * If the MAC address is already in use, simply account for the
3920 	 * new client.
3921 	 */
3922 	if (map->ma_nusers++ > 0)
3923 		return (0);
3924 
3925 	/*
3926 	 * Activate this MAC address by adding it to the reserved group.
3927 	 */
3928 	if (group != NULL) {
3929 		err = mac_group_addmac(group, (const uint8_t *)mac_addr);
3930 		if (err == 0) {
3931 			map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
3932 			return (0);
3933 		}
3934 	}
3935 
3936 	/*
3937 	 * The MAC address addition failed. If the client requires a
3938 	 * hardware classified MAC address, fail the operation.
3939 	 */
3940 	if (use_hw) {
3941 		err = ENOSPC;
3942 		goto bail;
3943 	}
3944 
3945 	/*
3946 	 * Try promiscuous mode.
3947 	 *
3948 	 * For drivers that don't advertise RINGS capability, do
3949 	 * nothing for the primary address.
3950 	 */
3951 	if ((group == NULL) &&
3952 	    (bcmp(map->ma_addr, mip->mi_addr, map->ma_len) == 0)) {
3953 		map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
3954 		return (0);
3955 	}
3956 
3957 	/*
3958 	 * Enable promiscuous mode in order to receive traffic
3959 	 * to the new MAC address.
3960 	 */
3961 	if ((err = i_mac_promisc_set(mip, B_TRUE, MAC_DEVPROMISC)) == 0) {
3962 		map->ma_type = MAC_ADDRESS_TYPE_UNICAST_PROMISC;
3963 		return (0);
3964 	}
3965 
3966 	/*
3967 	 * Free the MAC address that could not be added. Don't free
3968 	 * a pre-existing address, it could have been the entry
3969 	 * for the primary MAC address which was pre-allocated by
3970 	 * mac_init_macaddr(), and which must remain on the list.
3971 	 */
3972 bail:
3973 	map->ma_nusers--;
3974 	if (allocated_map)
3975 		mac_free_macaddr(map);
3976 	return (err);
3977 }
3978 
3979 /*
3980  * Remove a reference to a MAC address. This may cause to remove the MAC
3981  * address from an associated group or to turn off promiscuous mode.
3982  * The caller needs to handle the failure properly.
3983  */
3984 int
3985 mac_remove_macaddr(mac_address_t *map)
3986 {
3987 	mac_impl_t *mip = map->ma_mip;
3988 	int err = 0;
3989 
3990 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
3991 
3992 	ASSERT(map == mac_find_macaddr(mip, map->ma_addr));
3993 
3994 	/*
3995 	 * If it's not the last client using this MAC address, only update
3996 	 * the MAC clients count.
3997 	 */
3998 	if (--map->ma_nusers > 0)
3999 		return (0);
4000 
4001 	/*
4002 	 * The MAC address is no longer used by any MAC client, so remove
4003 	 * it from its associated group, or turn off promiscuous mode
4004 	 * if it was enabled for the MAC address.
4005 	 */
4006 	switch (map->ma_type) {
4007 	case MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED:
4008 		/*
4009 		 * Don't free the preset primary address for drivers that
4010 		 * don't advertise RINGS capability.
4011 		 */
4012 		if (map->ma_group == NULL)
4013 			return (0);
4014 
4015 		err = mac_group_remmac(map->ma_group, map->ma_addr);
4016 		break;
4017 	case MAC_ADDRESS_TYPE_UNICAST_PROMISC:
4018 		err = i_mac_promisc_set(mip, B_FALSE, MAC_DEVPROMISC);
4019 		break;
4020 	default:
4021 		ASSERT(B_FALSE);
4022 	}
4023 
4024 	if (err != 0)
4025 		return (err);
4026 
4027 	/*
4028 	 * We created MAC address for the primary one at registration, so we
4029 	 * won't free it here. mac_fini_macaddr() will take care of it.
4030 	 */
4031 	if (bcmp(map->ma_addr, mip->mi_addr, map->ma_len) != 0)
4032 		mac_free_macaddr(map);
4033 
4034 	return (0);
4035 }
4036 
4037 /*
4038  * Update an existing MAC address. The caller need to make sure that the new
4039  * value has not been used.
4040  */
4041 int
4042 mac_update_macaddr(mac_address_t *map, uint8_t *mac_addr)
4043 {
4044 	mac_impl_t *mip = map->ma_mip;
4045 	int err = 0;
4046 
4047 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4048 	ASSERT(mac_find_macaddr(mip, mac_addr) == NULL);
4049 
4050 	switch (map->ma_type) {
4051 	case MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED:
4052 		/*
4053 		 * Update the primary address for drivers that are not
4054 		 * RINGS capable.
4055 		 */
4056 		if (map->ma_group == NULL) {
4057 			err = mip->mi_unicst(mip->mi_driver, (const uint8_t *)
4058 			    mac_addr);
4059 			if (err != 0)
4060 				return (err);
4061 			break;
4062 		}
4063 
4064 		/*
4065 		 * If this MAC address is not currently in use,
4066 		 * simply break out and update the value.
4067 		 */
4068 		if (map->ma_nusers == 0)
4069 			break;
4070 
4071 		/*
4072 		 * Need to replace the MAC address associated with a group.
4073 		 */
4074 		err = mac_group_remmac(map->ma_group, map->ma_addr);
4075 		if (err != 0)
4076 			return (err);
4077 
4078 		err = mac_group_addmac(map->ma_group, mac_addr);
4079 
4080 		/*
4081 		 * Failure hints hardware error. The MAC layer needs to
4082 		 * have error notification facility to handle this.
4083 		 * Now, simply try to restore the value.
4084 		 */
4085 		if (err != 0)
4086 			(void) mac_group_addmac(map->ma_group, map->ma_addr);
4087 
4088 		break;
4089 	case MAC_ADDRESS_TYPE_UNICAST_PROMISC:
4090 		/*
4091 		 * Need to do nothing more if in promiscuous mode.
4092 		 */
4093 		break;
4094 	default:
4095 		ASSERT(B_FALSE);
4096 	}
4097 
4098 	/*
4099 	 * Successfully replaced the MAC address.
4100 	 */
4101 	if (err == 0)
4102 		bcopy(mac_addr, map->ma_addr, map->ma_len);
4103 
4104 	return (err);
4105 }
4106 
4107 /*
4108  * Freshen the MAC address with new value. Its caller must have updated the
4109  * hardware MAC address before calling this function.
4110  * This funcitons is supposed to be used to handle the MAC address change
4111  * notification from underlying drivers.
4112  */
4113 void
4114 mac_freshen_macaddr(mac_address_t *map, uint8_t *mac_addr)
4115 {
4116 	mac_impl_t *mip = map->ma_mip;
4117 
4118 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4119 	ASSERT(mac_find_macaddr(mip, mac_addr) == NULL);
4120 
4121 	/*
4122 	 * Freshen the MAC address with new value.
4123 	 */
4124 	bcopy(mac_addr, map->ma_addr, map->ma_len);
4125 	bcopy(mac_addr, mip->mi_addr, map->ma_len);
4126 
4127 	/*
4128 	 * Update all MAC clients that share this MAC address.
4129 	 */
4130 	mac_unicast_update_clients(mip, map);
4131 }
4132 
4133 /*
4134  * Set up the primary MAC address.
4135  */
4136 void
4137 mac_init_macaddr(mac_impl_t *mip)
4138 {
4139 	mac_address_t *map;
4140 
4141 	/*
4142 	 * The reference count is initialized to zero, until it's really
4143 	 * activated.
4144 	 */
4145 	map = kmem_zalloc(sizeof (mac_address_t), KM_SLEEP);
4146 	map->ma_len = mip->mi_type->mt_addr_length;
4147 	bcopy(mip->mi_addr, map->ma_addr, map->ma_len);
4148 
4149 	/*
4150 	 * If driver advertises RINGS capability, it shouldn't have initialized
4151 	 * its primary MAC address. For other drivers, including VNIC, the
4152 	 * primary address must work after registration.
4153 	 */
4154 	if (mip->mi_rx_groups == NULL)
4155 		map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
4156 
4157 	/*
4158 	 * The primary MAC address is reserved for default group according
4159 	 * to current design.
4160 	 */
4161 	map->ma_group = mip->mi_rx_groups;
4162 	map->ma_mip = mip;
4163 
4164 	mip->mi_addresses = map;
4165 }
4166 
4167 /*
4168  * Clean up the primary MAC address. Note, only one primary MAC address
4169  * is allowed. All other MAC addresses must have been freed appropriately.
4170  */
4171 void
4172 mac_fini_macaddr(mac_impl_t *mip)
4173 {
4174 	mac_address_t *map = mip->mi_addresses;
4175 
4176 	if (map == NULL)
4177 		return;
4178 
4179 	/*
4180 	 * If mi_addresses is initialized, there should be exactly one
4181 	 * entry left on the list with no users.
4182 	 */
4183 	ASSERT(map->ma_nusers == 0);
4184 	ASSERT(map->ma_next == NULL);
4185 
4186 	kmem_free(map, sizeof (mac_address_t));
4187 	mip->mi_addresses = NULL;
4188 }
4189 
4190 /*
4191  * Logging related functions.
4192  */
4193 
4194 /* Write the Flow description to the log file */
4195 int
4196 mac_write_flow_desc(flow_entry_t *flent, mac_client_impl_t *mcip)
4197 {
4198 	flow_desc_t		*fdesc;
4199 	mac_resource_props_t	*mrp;
4200 	net_desc_t		ndesc;
4201 
4202 	bzero(&ndesc, sizeof (net_desc_t));
4203 
4204 	/*
4205 	 * Grab the fe_lock to see a self-consistent fe_flow_desc.
4206 	 * Updates to the fe_flow_desc are done under the fe_lock
4207 	 */
4208 	mutex_enter(&flent->fe_lock);
4209 	fdesc = &flent->fe_flow_desc;
4210 	mrp = &flent->fe_resource_props;
4211 
4212 	ndesc.nd_name = flent->fe_flow_name;
4213 	ndesc.nd_devname = mcip->mci_name;
4214 	bcopy(fdesc->fd_src_mac, ndesc.nd_ehost, ETHERADDRL);
4215 	bcopy(fdesc->fd_dst_mac, ndesc.nd_edest, ETHERADDRL);
4216 	ndesc.nd_sap = htonl(fdesc->fd_sap);
4217 	ndesc.nd_isv4 = (uint8_t)fdesc->fd_ipversion == IPV4_VERSION;
4218 	ndesc.nd_bw_limit = mrp->mrp_maxbw;
4219 	if (ndesc.nd_isv4) {
4220 		ndesc.nd_saddr[3] = htonl(fdesc->fd_local_addr.s6_addr32[3]);
4221 		ndesc.nd_daddr[3] = htonl(fdesc->fd_remote_addr.s6_addr32[3]);
4222 	} else {
4223 		bcopy(&fdesc->fd_local_addr, ndesc.nd_saddr, IPV6_ADDR_LEN);
4224 		bcopy(&fdesc->fd_remote_addr, ndesc.nd_daddr, IPV6_ADDR_LEN);
4225 	}
4226 	ndesc.nd_sport = htons(fdesc->fd_local_port);
4227 	ndesc.nd_dport = htons(fdesc->fd_remote_port);
4228 	ndesc.nd_protocol = (uint8_t)fdesc->fd_protocol;
4229 	mutex_exit(&flent->fe_lock);
4230 
4231 	return (exacct_commit_netinfo((void *)&ndesc, EX_NET_FLDESC_REC));
4232 }
4233 
4234 /* Write the Flow statistics to the log file */
4235 int
4236 mac_write_flow_stats(flow_entry_t *flent)
4237 {
4238 	flow_stats_t	*fl_stats;
4239 	net_stat_t	nstat;
4240 
4241 	fl_stats = &flent->fe_flowstats;
4242 	nstat.ns_name = flent->fe_flow_name;
4243 	nstat.ns_ibytes = fl_stats->fs_rbytes;
4244 	nstat.ns_obytes = fl_stats->fs_obytes;
4245 	nstat.ns_ipackets = fl_stats->fs_ipackets;
4246 	nstat.ns_opackets = fl_stats->fs_opackets;
4247 	nstat.ns_ierrors = fl_stats->fs_ierrors;
4248 	nstat.ns_oerrors = fl_stats->fs_oerrors;
4249 
4250 	return (exacct_commit_netinfo((void *)&nstat, EX_NET_FLSTAT_REC));
4251 }
4252 
4253 /* Write the Link Description to the log file */
4254 int
4255 mac_write_link_desc(mac_client_impl_t *mcip)
4256 {
4257 	net_desc_t		ndesc;
4258 	flow_entry_t		*flent = mcip->mci_flent;
4259 
4260 	bzero(&ndesc, sizeof (net_desc_t));
4261 
4262 	ndesc.nd_name = mcip->mci_name;
4263 	ndesc.nd_devname = mcip->mci_name;
4264 	ndesc.nd_isv4 = B_TRUE;
4265 	/*
4266 	 * Grab the fe_lock to see a self-consistent fe_flow_desc.
4267 	 * Updates to the fe_flow_desc are done under the fe_lock
4268 	 * after removing the flent from the flow table.
4269 	 */
4270 	mutex_enter(&flent->fe_lock);
4271 	bcopy(flent->fe_flow_desc.fd_src_mac, ndesc.nd_ehost, ETHERADDRL);
4272 	mutex_exit(&flent->fe_lock);
4273 
4274 	return (exacct_commit_netinfo((void *)&ndesc, EX_NET_LNDESC_REC));
4275 }
4276 
4277 /* Write the Link statistics to the log file */
4278 int
4279 mac_write_link_stats(mac_client_impl_t *mcip)
4280 {
4281 	net_stat_t	nstat;
4282 
4283 	nstat.ns_name = mcip->mci_name;
4284 	nstat.ns_ibytes = mcip->mci_stat_ibytes;
4285 	nstat.ns_obytes = mcip->mci_stat_obytes;
4286 	nstat.ns_ipackets = mcip->mci_stat_ipackets;
4287 	nstat.ns_opackets = mcip->mci_stat_opackets;
4288 	nstat.ns_ierrors = mcip->mci_stat_ierrors;
4289 	nstat.ns_oerrors = mcip->mci_stat_oerrors;
4290 
4291 	return (exacct_commit_netinfo((void *)&nstat, EX_NET_LNSTAT_REC));
4292 }
4293 
4294 /*
4295  * For a given flow, if the descrition has not been logged before, do it now.
4296  * If it is a VNIC, then we have collected information about it from the MAC
4297  * table, so skip it.
4298  */
4299 /*ARGSUSED*/
4300 static int
4301 mac_log_flowinfo(flow_entry_t *flent, void *args)
4302 {
4303 	mac_client_impl_t	*mcip = flent->fe_mcip;
4304 
4305 	if (mcip == NULL)
4306 		return (0);
4307 
4308 	/*
4309 	 * If the name starts with "vnic", and fe_user_generated is true (to
4310 	 * exclude the mcast and active flow entries created implicitly for
4311 	 * a vnic, it is a VNIC flow.  i.e. vnic1 is a vnic flow,
4312 	 * vnic/bge1/mcast1 is not and neither is vnic/bge1/active.
4313 	 */
4314 	if (strncasecmp(flent->fe_flow_name, "vnic", 4) == 0 &&
4315 	    (flent->fe_type & FLOW_USER) != 0) {
4316 		return (0);
4317 	}
4318 
4319 	if (!flent->fe_desc_logged) {
4320 		/*
4321 		 * We don't return error because we want to continu the
4322 		 * walk in case this is the last walk which means we
4323 		 * need to reset fe_desc_logged in all the flows.
4324 		 */
4325 		if (mac_write_flow_desc(flent, mcip) != 0)
4326 			return (0);
4327 		flent->fe_desc_logged = B_TRUE;
4328 	}
4329 
4330 	/*
4331 	 * Regardless of the error, we want to proceed in case we have to
4332 	 * reset fe_desc_logged.
4333 	 */
4334 	(void) mac_write_flow_stats(flent);
4335 
4336 	if (mcip != NULL && !(mcip->mci_state_flags & MCIS_DESC_LOGGED))
4337 		flent->fe_desc_logged = B_FALSE;
4338 
4339 	return (0);
4340 }
4341 
4342 typedef struct i_mac_log_state_s {
4343 	boolean_t	mi_last;
4344 	int		mi_fenable;
4345 	int		mi_lenable;
4346 } i_mac_log_state_t;
4347 
4348 /*
4349  * Walk the mac_impl_ts and log the description for each mac client of this mac,
4350  * if it hasn't already been done. Additionally, log statistics for the link as
4351  * well. Walk the flow table and log information for each flow as well.
4352  * If it is the last walk (mci_last), then we turn off mci_desc_logged (and
4353  * also fe_desc_logged, if flow logging is on) since we want to log the
4354  * description if and when logging is restarted.
4355  */
4356 /*ARGSUSED*/
4357 static uint_t
4358 i_mac_log_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
4359 {
4360 	mac_impl_t		*mip = (mac_impl_t *)val;
4361 	i_mac_log_state_t	*lstate = (i_mac_log_state_t *)arg;
4362 	int			ret;
4363 	mac_client_impl_t	*mcip;
4364 
4365 	/*
4366 	 * Only walk the client list for NIC and etherstub
4367 	 */
4368 	if ((mip->mi_state_flags & MIS_DISABLED) ||
4369 	    ((mip->mi_state_flags & MIS_IS_VNIC) &&
4370 	    (mac_get_lower_mac_handle((mac_handle_t)mip) != NULL)))
4371 		return (MH_WALK_CONTINUE);
4372 
4373 	for (mcip = mip->mi_clients_list; mcip != NULL;
4374 	    mcip = mcip->mci_client_next) {
4375 		if (!MCIP_DATAPATH_SETUP(mcip))
4376 			continue;
4377 		if (lstate->mi_lenable) {
4378 			if (!(mcip->mci_state_flags & MCIS_DESC_LOGGED)) {
4379 				ret = mac_write_link_desc(mcip);
4380 				if (ret != 0) {
4381 				/*
4382 				 * We can't terminate it if this is the last
4383 				 * walk, else there might be some links with
4384 				 * mi_desc_logged set to true, which means
4385 				 * their description won't be logged the next
4386 				 * time logging is started (similarly for the
4387 				 * flows within such links). We can continue
4388 				 * without walking the flow table (i.e. to
4389 				 * set fe_desc_logged to false) because we
4390 				 * won't have written any flow stuff for this
4391 				 * link as we haven't logged the link itself.
4392 				 */
4393 					if (lstate->mi_last)
4394 						return (MH_WALK_CONTINUE);
4395 					else
4396 						return (MH_WALK_TERMINATE);
4397 				}
4398 				mcip->mci_state_flags |= MCIS_DESC_LOGGED;
4399 			}
4400 		}
4401 
4402 		if (mac_write_link_stats(mcip) != 0 && !lstate->mi_last)
4403 			return (MH_WALK_TERMINATE);
4404 
4405 		if (lstate->mi_last)
4406 			mcip->mci_state_flags &= ~MCIS_DESC_LOGGED;
4407 
4408 		if (lstate->mi_fenable) {
4409 			if (mcip->mci_subflow_tab != NULL) {
4410 				(void) mac_flow_walk(mcip->mci_subflow_tab,
4411 				    mac_log_flowinfo, mip);
4412 			}
4413 		}
4414 	}
4415 	return (MH_WALK_CONTINUE);
4416 }
4417 
4418 /*
4419  * The timer thread that runs every mac_logging_interval seconds and logs
4420  * link and/or flow information.
4421  */
4422 /* ARGSUSED */
4423 void
4424 mac_log_linkinfo(void *arg)
4425 {
4426 	i_mac_log_state_t	lstate;
4427 
4428 	rw_enter(&i_mac_impl_lock, RW_READER);
4429 	if (!mac_flow_log_enable && !mac_link_log_enable) {
4430 		rw_exit(&i_mac_impl_lock);
4431 		return;
4432 	}
4433 	lstate.mi_fenable = mac_flow_log_enable;
4434 	lstate.mi_lenable = mac_link_log_enable;
4435 	lstate.mi_last = B_FALSE;
4436 	rw_exit(&i_mac_impl_lock);
4437 
4438 	mod_hash_walk(i_mac_impl_hash, i_mac_log_walker, &lstate);
4439 
4440 	rw_enter(&i_mac_impl_lock, RW_WRITER);
4441 	if (mac_flow_log_enable || mac_link_log_enable) {
4442 		mac_logging_timer = timeout(mac_log_linkinfo, NULL,
4443 		    SEC_TO_TICK(mac_logging_interval));
4444 	}
4445 	rw_exit(&i_mac_impl_lock);
4446 }
4447 
4448 typedef struct i_mac_fastpath_state_s {
4449 	boolean_t	mf_disable;
4450 	int		mf_err;
4451 } i_mac_fastpath_state_t;
4452 
4453 /*ARGSUSED*/
4454 static uint_t
4455 i_mac_fastpath_disable_walker(mod_hash_key_t key, mod_hash_val_t *val,
4456     void *arg)
4457 {
4458 	i_mac_fastpath_state_t	*state = arg;
4459 	mac_handle_t		mh = (mac_handle_t)val;
4460 
4461 	if (state->mf_disable)
4462 		state->mf_err = mac_fastpath_disable(mh);
4463 	else
4464 		mac_fastpath_enable(mh);
4465 
4466 	return (state->mf_err == 0 ? MH_WALK_CONTINUE : MH_WALK_TERMINATE);
4467 }
4468 
4469 /*
4470  * Start the logging timer.
4471  */
4472 int
4473 mac_start_logusage(mac_logtype_t type, uint_t interval)
4474 {
4475 	i_mac_fastpath_state_t state = {B_TRUE, 0};
4476 	int err;
4477 
4478 	rw_enter(&i_mac_impl_lock, RW_WRITER);
4479 	switch (type) {
4480 	case MAC_LOGTYPE_FLOW:
4481 		if (mac_flow_log_enable) {
4482 			rw_exit(&i_mac_impl_lock);
4483 			return (0);
4484 		}
4485 		/* FALLTHRU */
4486 	case MAC_LOGTYPE_LINK:
4487 		if (mac_link_log_enable) {
4488 			rw_exit(&i_mac_impl_lock);
4489 			return (0);
4490 		}
4491 		break;
4492 	default:
4493 		ASSERT(0);
4494 	}
4495 
4496 	/* Disable fastpath */
4497 	mod_hash_walk(i_mac_impl_hash, i_mac_fastpath_disable_walker, &state);
4498 	if ((err = state.mf_err) != 0) {
4499 		/* Reenable fastpath  */
4500 		state.mf_disable = B_FALSE;
4501 		state.mf_err = 0;
4502 		mod_hash_walk(i_mac_impl_hash,
4503 		    i_mac_fastpath_disable_walker, &state);
4504 		rw_exit(&i_mac_impl_lock);
4505 		return (err);
4506 	}
4507 
4508 	switch (type) {
4509 	case MAC_LOGTYPE_FLOW:
4510 		mac_flow_log_enable = B_TRUE;
4511 		/* FALLTHRU */
4512 	case MAC_LOGTYPE_LINK:
4513 		mac_link_log_enable = B_TRUE;
4514 		break;
4515 	}
4516 
4517 	mac_logging_interval = interval;
4518 	rw_exit(&i_mac_impl_lock);
4519 	mac_log_linkinfo(NULL);
4520 	return (0);
4521 }
4522 
4523 /*
4524  * Stop the logging timer if both Link and Flow logging are turned off.
4525  */
4526 void
4527 mac_stop_logusage(mac_logtype_t type)
4528 {
4529 	i_mac_log_state_t	lstate;
4530 	i_mac_fastpath_state_t	state = {B_FALSE, 0};
4531 
4532 	rw_enter(&i_mac_impl_lock, RW_WRITER);
4533 	lstate.mi_fenable = mac_flow_log_enable;
4534 	lstate.mi_lenable = mac_link_log_enable;
4535 
4536 	/* Last walk */
4537 	lstate.mi_last = B_TRUE;
4538 
4539 	switch (type) {
4540 	case MAC_LOGTYPE_FLOW:
4541 		if (lstate.mi_fenable) {
4542 			ASSERT(mac_link_log_enable);
4543 			mac_flow_log_enable = B_FALSE;
4544 			mac_link_log_enable = B_FALSE;
4545 			break;
4546 		}
4547 		/* FALLTHRU */
4548 	case MAC_LOGTYPE_LINK:
4549 		if (!lstate.mi_lenable || mac_flow_log_enable) {
4550 			rw_exit(&i_mac_impl_lock);
4551 			return;
4552 		}
4553 		mac_link_log_enable = B_FALSE;
4554 		break;
4555 	default:
4556 		ASSERT(0);
4557 	}
4558 
4559 	/* Reenable fastpath */
4560 	mod_hash_walk(i_mac_impl_hash, i_mac_fastpath_disable_walker, &state);
4561 
4562 	rw_exit(&i_mac_impl_lock);
4563 	(void) untimeout(mac_logging_timer);
4564 	mac_logging_timer = 0;
4565 
4566 	/* Last walk */
4567 	mod_hash_walk(i_mac_impl_hash, i_mac_log_walker, &lstate);
4568 }
4569 
4570 /*
4571  * Walk the rx and tx SRS/SRs for a flow and update the priority value.
4572  */
4573 void
4574 mac_flow_update_priority(mac_client_impl_t *mcip, flow_entry_t *flent)
4575 {
4576 	pri_t			pri;
4577 	int			count;
4578 	mac_soft_ring_set_t	*mac_srs;
4579 
4580 	if (flent->fe_rx_srs_cnt <= 0)
4581 		return;
4582 
4583 	if (((mac_soft_ring_set_t *)flent->fe_rx_srs[0])->srs_type ==
4584 	    SRST_FLOW) {
4585 		pri = FLOW_PRIORITY(mcip->mci_min_pri,
4586 		    mcip->mci_max_pri,
4587 		    flent->fe_resource_props.mrp_priority);
4588 	} else {
4589 		pri = mcip->mci_max_pri;
4590 	}
4591 
4592 	for (count = 0; count < flent->fe_rx_srs_cnt; count++) {
4593 		mac_srs = flent->fe_rx_srs[count];
4594 		mac_update_srs_priority(mac_srs, pri);
4595 	}
4596 	/*
4597 	 * If we have a Tx SRS, we need to modify all the threads associated
4598 	 * with it.
4599 	 */
4600 	if (flent->fe_tx_srs != NULL)
4601 		mac_update_srs_priority(flent->fe_tx_srs, pri);
4602 }
4603 
4604 /*
4605  * RX and TX rings are reserved according to different semantics depending
4606  * on the requests from the MAC clients and type of rings:
4607  *
4608  * On the Tx side, by default we reserve individual rings, independently from
4609  * the groups.
4610  *
4611  * On the Rx side, the reservation is at the granularity of the group
4612  * of rings, and used for v12n level 1 only. It has a special case for the
4613  * primary client.
4614  *
4615  * If a share is allocated to a MAC client, we allocate a TX group and an
4616  * RX group to the client, and assign TX rings and RX rings to these
4617  * groups according to information gathered from the driver through
4618  * the share capability.
4619  *
 * The foreseeable evolution of Rx rings will handle v12n level 2 and higher
 * to allocate individual rings out of a group and program the hw classifier
 * based on IP address or higher level criteria.
4623  */
4624 
4625 /*
4626  * mac_reserve_tx_ring()
4627  * Reserve a unused ring by marking it with MR_INUSE state.
4628  * As reserved, the ring is ready to function.
4629  *
4630  * Notes for Hybrid I/O:
4631  *
4632  * If a specific ring is needed, it is specified through the desired_ring
4633  * argument. Otherwise that argument is set to NULL.
4634  * If the desired ring was previous allocated to another client, this
4635  * function swaps it with a new ring from the group of unassigned rings.
4636  */
4637 mac_ring_t *
4638 mac_reserve_tx_ring(mac_impl_t *mip, mac_ring_t *desired_ring)
4639 {
4640 	mac_group_t *group;
4641 	mac_ring_t *ring;
4642 
4643 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4644 
4645 	if (mip->mi_tx_groups == NULL)
4646 		return (NULL);
4647 
4648 	/*
4649 	 * Find an available ring and start it before changing its status.
4650 	 * The unassigned rings are at the end of the mi_tx_groups
4651 	 * array.
4652 	 */
4653 	group = mip->mi_tx_groups + mip->mi_tx_group_count;
4654 
4655 	for (ring = group->mrg_rings; ring != NULL;
4656 	    ring = ring->mr_next) {
4657 		if (desired_ring == NULL) {
4658 			if (ring->mr_state == MR_FREE)
4659 				/* wanted any free ring and found one */
4660 				break;
4661 		} else {
4662 			mac_ring_t *sring;
4663 			mac_client_impl_t *client;
4664 			mac_soft_ring_set_t *srs;
4665 
4666 			if (ring != desired_ring)
4667 				/* wants a desired ring but this one ain't it */
4668 				continue;
4669 
4670 			if (ring->mr_state == MR_FREE)
4671 				break;
4672 
4673 			/*
4674 			 * Found the desired ring but it's already in use.
4675 			 * Swap it with a new ring.
4676 			 */
4677 
4678 			/* find the client which owns that ring */
4679 			for (client = mip->mi_clients_list; client != NULL;
4680 			    client = client->mci_client_next) {
4681 				srs = MCIP_TX_SRS(client);
4682 				if (srs != NULL && mac_tx_srs_ring_present(srs,
4683 				    desired_ring)) {
4684 					/* found our ring */
4685 					break;
4686 				}
4687 			}
4688 			if (client == NULL) {
4689 				/*
4690 				 * The TX ring is in use, but it's not
4691 				 * associated with any clients, so it
4692 				 * has to be the default ring. In that
4693 				 * case we can simply assign a new ring
4694 				 * as the default ring, and we're done.
4695 				 */
4696 				ASSERT(mip->mi_default_tx_ring ==
4697 				    (mac_ring_handle_t)desired_ring);
4698 
4699 				/*
4700 				 * Quiesce all clients on top of
4701 				 * the NIC to make sure there are no
4702 				 * pending threads still relying on
4703 				 * that default ring, for example
4704 				 * the multicast path.
4705 				 */
4706 				for (client = mip->mi_clients_list;
4707 				    client != NULL;
4708 				    client = client->mci_client_next) {
4709 					mac_tx_client_quiesce(client,
4710 					    SRS_QUIESCE);
4711 				}
4712 
4713 				mip->mi_default_tx_ring = (mac_ring_handle_t)
4714 				    mac_reserve_tx_ring(mip, NULL);
4715 
4716 				/* resume the clients */
4717 				for (client = mip->mi_clients_list;
4718 				    client != NULL;
4719 				    client = client->mci_client_next)
4720 					mac_tx_client_restart(client);
4721 
4722 				break;
4723 			}
4724 
4725 			/*
4726 			 * Note that we cannot simply invoke the group
4727 			 * add/rem routines since the client doesn't have a
4728 			 * TX group. So we need to instead add/remove
4729 			 * the rings from the SRS.
4730 			 */
4731 			ASSERT(client->mci_share == NULL);
4732 
4733 			/* first quiece the client */
4734 			mac_tx_client_quiesce(client, SRS_QUIESCE);
4735 
4736 			/* give a new ring to the client... */
4737 			sring = mac_reserve_tx_ring(mip, NULL);
4738 			if (sring != NULL) {
4739 				/*
4740 				 * There are no other available ring
4741 				 * on that MAC instance. The client
4742 				 * will fallback to the shared TX
4743 				 * ring.
4744 				 */
4745 				mac_tx_srs_add_ring(srs, sring);
4746 			}
4747 
4748 			/* ... in exchange for our desired ring */
4749 			mac_tx_srs_del_ring(srs, desired_ring);
4750 
4751 			/* restart the client */
4752 			mac_tx_client_restart(client);
4753 
4754 			if (mip->mi_default_tx_ring ==
4755 			    (mac_ring_handle_t)desired_ring) {
4756 				/*
4757 				 * The desired ring is the default ring,
4758 				 * and there are one or more clients
4759 				 * using that default ring directly.
4760 				 */
4761 				mip->mi_default_tx_ring =
4762 				    (mac_ring_handle_t)sring;
4763 				/*
4764 				 * Find clients using default ring and
4765 				 * swap it with the new default ring.
4766 				 */
4767 				for (client = mip->mi_clients_list;
4768 				    client != NULL;
4769 				    client = client->mci_client_next) {
4770 					srs = MCIP_TX_SRS(client);
4771 					if (srs != NULL &&
4772 					    mac_tx_srs_ring_present(srs,
4773 					    desired_ring)) {
4774 						/* first quiece the client */
4775 						mac_tx_client_quiesce(client,
4776 						    SRS_QUIESCE);
4777 
4778 						/*
4779 						 * Give it the new default
4780 						 * ring, and remove the old
4781 						 * one.
4782 						 */
4783 						if (sring != NULL) {
4784 							mac_tx_srs_add_ring(srs,
4785 							    sring);
4786 						}
4787 						mac_tx_srs_del_ring(srs,
4788 						    desired_ring);
4789 
4790 						/* restart the client */
4791 						mac_tx_client_restart(client);
4792 					}
4793 				}
4794 			}
4795 			break;
4796 		}
4797 	}
4798 
4799 	if (ring != NULL) {
4800 		if (mac_start_ring(ring) != 0)
4801 			return (NULL);
4802 		ring->mr_state = MR_INUSE;
4803 	}
4804 
4805 	return (ring);
4806 }
4807 
4808 /*
4809  * Minimum number of rings to leave in the default TX group when allocating
4810  * rings to new clients.
4811  */
4812 static uint_t mac_min_rx_default_rings = 1;
4813 
4814 /*
4815  * Populate a zero-ring group with rings. If the share is non-NULL,
4816  * the rings are chosen according to that share.
4817  * Invoked after allocating a new RX or TX group through
4818  * mac_reserve_rx_group() or mac_reserve_tx_group(), respectively.
4819  * Returns zero on success, an errno otherwise.
4820  */
4821 int
4822 i_mac_group_allocate_rings(mac_impl_t *mip, mac_ring_type_t ring_type,
4823     mac_group_t *src_group, mac_group_t *new_group, mac_share_handle_t share)
4824 {
4825 	mac_ring_t **rings, *tmp_ring[1], *ring;
4826 	uint_t nrings;
4827 	int rv, i, j;
4828 
4829 	ASSERT(mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC &&
4830 	    mip->mi_tx_group_type == MAC_GROUP_TYPE_DYNAMIC);
4831 	ASSERT(new_group->mrg_cur_count == 0);
4832 
4833 	/*
4834 	 * First find the rings to allocate to the group.
4835 	 */
4836 	if (share != NULL) {
4837 		/* get rings through ms_squery() */
4838 		mip->mi_share_capab.ms_squery(share, ring_type, NULL, &nrings);
4839 		ASSERT(nrings != 0);
4840 		rings = kmem_alloc(nrings * sizeof (mac_ring_handle_t),
4841 		    KM_SLEEP);
4842 		mip->mi_share_capab.ms_squery(share, ring_type,
4843 		    (mac_ring_handle_t *)rings, &nrings);
4844 	} else {
4845 		/* this function is called for TX only with a share */
4846 		ASSERT(ring_type == MAC_RING_TYPE_RX);
4847 		/*
4848 		 * Pick one ring from default group.
4849 		 *
4850 		 * for now pick the second ring which requires the first ring
4851 		 * at index 0 to stay in the default group, since it is the
4852 		 * ring which carries the multicast traffic.
4853 		 * We need a better way for a driver to indicate this,
4854 		 * for example a per-ring flag.
4855 		 */
4856 		for (ring = src_group->mrg_rings; ring != NULL;
4857 		    ring = ring->mr_next) {
4858 			if (ring->mr_index != 0)
4859 				break;
4860 		}
4861 		ASSERT(ring != NULL);
4862 		nrings = 1;
4863 		tmp_ring[0] = ring;
4864 		rings = tmp_ring;
4865 	}
4866 
4867 	switch (ring_type) {
4868 	case MAC_RING_TYPE_RX:
4869 		if (src_group->mrg_cur_count - nrings <
4870 		    mac_min_rx_default_rings) {
4871 			/* we ran out of rings */
4872 			return (ENOSPC);
4873 		}
4874 
4875 		/* move receive rings to new group */
4876 		for (i = 0; i < nrings; i++) {
4877 			rv = mac_group_mov_ring(mip, new_group, rings[i]);
4878 			if (rv != 0) {
4879 				/* move rings back on failure */
4880 				for (j = 0; j < i; j++) {
4881 					(void) mac_group_mov_ring(mip,
4882 					    src_group, rings[j]);
4883 				}
4884 				return (rv);
4885 			}
4886 		}
4887 		break;
4888 
4889 	case MAC_RING_TYPE_TX: {
4890 		mac_ring_t *tmp_ring;
4891 
4892 		/* move the TX rings to the new group */
4893 		ASSERT(src_group == NULL);
4894 		for (i = 0; i < nrings; i++) {
4895 			/* get the desired ring */
4896 			tmp_ring = mac_reserve_tx_ring(mip, rings[i]);
4897 			ASSERT(tmp_ring == rings[i]);
4898 			rv = mac_group_mov_ring(mip, new_group, rings[i]);
4899 			if (rv != 0) {
4900 				/* cleanup on failure */
4901 				for (j = 0; j < i; j++) {
4902 					(void) mac_group_mov_ring(mip,
4903 					    mip->mi_tx_groups +
4904 					    mip->mi_tx_group_count, rings[j]);
4905 				}
4906 			}
4907 		}
4908 		break;
4909 	}
4910 	}
4911 
4912 	if (share != NULL) {
4913 		/* add group to share */
4914 		mip->mi_share_capab.ms_sadd(share, new_group->mrg_driver);
4915 		/* free temporary array of rings */
4916 		kmem_free(rings, nrings * sizeof (mac_ring_handle_t));
4917 	}
4918 
4919 	return (0);
4920 }
4921 
4922 void
4923 mac_rx_group_add_client(mac_group_t *grp, mac_client_impl_t *mcip)
4924 {
4925 	mac_grp_client_t *mgcp;
4926 
4927 	for (mgcp = grp->mrg_clients; mgcp != NULL; mgcp = mgcp->mgc_next) {
4928 		if (mgcp->mgc_client == mcip)
4929 			break;
4930 	}
4931 
4932 	VERIFY(mgcp == NULL);
4933 
4934 	mgcp = kmem_zalloc(sizeof (mac_grp_client_t), KM_SLEEP);
4935 	mgcp->mgc_client = mcip;
4936 	mgcp->mgc_next = grp->mrg_clients;
4937 	grp->mrg_clients = mgcp;
4938 
4939 }
4940 
4941 void
4942 mac_rx_group_remove_client(mac_group_t *grp, mac_client_impl_t *mcip)
4943 {
4944 	mac_grp_client_t *mgcp, **pprev;
4945 
4946 	for (pprev = &grp->mrg_clients, mgcp = *pprev; mgcp != NULL;
4947 	    pprev = &mgcp->mgc_next, mgcp = *pprev) {
4948 		if (mgcp->mgc_client == mcip)
4949 			break;
4950 	}
4951 
4952 	ASSERT(mgcp != NULL);
4953 
4954 	*pprev = mgcp->mgc_next;
4955 	kmem_free(mgcp, sizeof (mac_grp_client_t));
4956 }
4957 
4958 /*
4959  * mac_reserve_rx_group()
4960  *
4961  * Finds an available group and exclusively reserves it for a client.
4962  * The group is chosen to suit the flow's resource controls (bandwidth and
4963  * fanout requirements) and the address type.
4964  * If the requestor is the pimary MAC then return the group with the
4965  * largest number of rings, otherwise the default ring when available.
4966  */
4967 mac_group_t *
4968 mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr,
4969     mac_rx_group_reserve_type_t rtype)
4970 {
4971 	mac_share_handle_t	share = mcip->mci_share;
4972 	mac_impl_t		*mip = mcip->mci_mip;
4973 	mac_group_t		*grp = NULL;
4974 	int			i, start, loopcount;
4975 	int			err;
4976 	mac_address_t		*map;
4977 
4978 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4979 
4980 	/* Check if a group already has this mac address (case of VLANs) */
4981 	if ((map = mac_find_macaddr(mip, mac_addr)) != NULL)
4982 		return (map->ma_group);
4983 
4984 	if (mip->mi_rx_groups == NULL || mip->mi_rx_group_count == 0 ||
4985 	    rtype == MAC_RX_NO_RESERVE)
4986 		return (NULL);
4987 
4988 	/*
4989 	 * Try to exclusively reserve a RX group.
4990 	 *
4991 	 * For flows requires SW_RING it always goes to the default group
4992 	 * (Until we can explicitely call out default groups (CR 6695600),
4993 	 * we assume that the default group is always at position zero);
4994 	 *
4995 	 * For flows requires HW_DEFAULT_RING (unicast flow of the primary
4996 	 * client), try to reserve the default RX group only.
4997 	 *
4998 	 * For flows requires HW_RING (unicast flow of other clients), try
4999 	 * to reserve non-default RX group then the default group.
5000 	 */
5001 	switch (rtype) {
5002 	case MAC_RX_RESERVE_DEFAULT:
5003 		start = 0;
5004 		loopcount = 1;
5005 		break;
5006 	case MAC_RX_RESERVE_NONDEFAULT:
5007 		start = 1;
5008 		loopcount = mip->mi_rx_group_count;
5009 	}
5010 
5011 	for (i = start; i < start + loopcount; i++) {
5012 		grp = &mip->mi_rx_groups[i % mip->mi_rx_group_count];
5013 
5014 		DTRACE_PROBE3(rx__group__trying, char *, mip->mi_name,
5015 		    int, grp->mrg_index, mac_group_state_t, grp->mrg_state);
5016 
5017 		/*
5018 		 * Check to see whether this mac client is the only client
5019 		 * on this RX group. If not, we cannot exclusively reserve
5020 		 * this RX group.
5021 		 */
5022 		if (!MAC_RX_GROUP_NO_CLIENT(grp) &&
5023 		    (MAC_RX_GROUP_ONLY_CLIENT(grp) != mcip)) {
5024 			continue;
5025 		}
5026 
5027 		/*
5028 		 * This group could already be SHARED by other multicast
5029 		 * flows on this client. In that case, the group would
5030 		 * be shared and has already been started.
5031 		 */
5032 		ASSERT(grp->mrg_state != MAC_GROUP_STATE_UNINIT);
5033 
5034 		if ((grp->mrg_state == MAC_GROUP_STATE_REGISTERED) &&
5035 		    (mac_start_group(grp) != 0)) {
5036 			continue;
5037 		}
5038 
5039 		if ((i % mip->mi_rx_group_count) == 0 ||
5040 		    mip->mi_rx_group_type != MAC_GROUP_TYPE_DYNAMIC) {
5041 			break;
5042 		}
5043 
5044 		ASSERT(grp->mrg_cur_count == 0);
5045 
5046 		/*
5047 		 * Populate the group. Rings should be taken
5048 		 * from the default group at position 0 for now.
5049 		 */
5050 
5051 		err = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_RX,
5052 		    &mip->mi_rx_groups[0], grp, share);
5053 		if (err == 0)
5054 			break;
5055 
5056 		DTRACE_PROBE3(rx__group__reserve__alloc__rings, char *,
5057 		    mip->mi_name, int, grp->mrg_index, int, err);
5058 
5059 		/*
5060 		 * It's a dynamic group but the grouping operation failed.
5061 		 */
5062 		mac_stop_group(grp);
5063 	}
5064 
5065 	if (i == start + loopcount)
5066 		return (NULL);
5067 
5068 	ASSERT(grp != NULL);
5069 
5070 	DTRACE_PROBE2(rx__group__reserved,
5071 	    char *, mip->mi_name, int, grp->mrg_index);
5072 	return (grp);
5073 }
5074 
5075 /*
5076  * mac_rx_release_group()
5077  *
5078  * This is called when there are no clients left for the group.
5079  * The group is stopped and marked MAC_GROUP_STATE_REGISTERED,
5080  * and if it is a non default group, the shares are removed and
5081  * all rings are assigned back to default group.
5082  */
5083 void
5084 mac_release_rx_group(mac_client_impl_t *mcip, mac_group_t *group)
5085 {
5086 	mac_impl_t	*mip = mcip->mci_mip;
5087 	mac_ring_t	*ring;
5088 
5089 	ASSERT(group != &mip->mi_rx_groups[0]);
5090 
5091 	/*
5092 	 * This is the case where there are no clients left. Any
5093 	 * SRS etc on this group have also be quiesced.
5094 	 */
5095 	for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
5096 		if (ring->mr_classify_type == MAC_HW_CLASSIFIER) {
5097 			ASSERT(group->mrg_state == MAC_GROUP_STATE_RESERVED);
5098 			/*
5099 			 * Remove the SRS associated with the HW ring.
5100 			 * As a result, polling will be disabled.
5101 			 */
5102 			ring->mr_srs = NULL;
5103 		}
5104 		ASSERT(ring->mr_state == MR_INUSE);
5105 		mac_stop_ring(ring);
5106 		ring->mr_state = MR_FREE;
5107 		ring->mr_flag = 0;
5108 	}
5109 
5110 	/* remove group from share */
5111 	if (mcip->mci_share != NULL) {
5112 		mip->mi_share_capab.ms_sremove(mcip->mci_share,
5113 		    group->mrg_driver);
5114 	}
5115 
5116 	if (mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
5117 		mac_ring_t *ring;
5118 
5119 		/*
5120 		 * Rings were dynamically allocated to group.
5121 		 * Move rings back to default group.
5122 		 */
5123 		while ((ring = group->mrg_rings) != NULL) {
5124 			(void) mac_group_mov_ring(mip,
5125 			    &mip->mi_rx_groups[0], ring);
5126 		}
5127 	}
5128 	mac_stop_group(group);
5129 	/*
5130 	 * Possible improvement: See if we can assign the group just released
5131 	 * to a another client of the mip
5132 	 */
5133 }
5134 
5135 /*
5136  * Reserves a TX group for the specified share. Invoked by mac_tx_srs_setup()
5137  * when a share was allocated to the client.
5138  */
5139 mac_group_t *
5140 mac_reserve_tx_group(mac_impl_t *mip, mac_share_handle_t share)
5141 {
5142 	mac_group_t *grp;
5143 	int rv, i;
5144 
5145 	/*
5146 	 * TX groups are currently allocated only to MAC clients
5147 	 * which are associated with a share. Since we have a fixed
5148 	 * number of share and groups, and we already successfully
5149 	 * allocated a share, find an available TX group.
5150 	 */
5151 	ASSERT(share != NULL);
5152 	ASSERT(mip->mi_tx_group_free > 0);
5153 
5154 	for (i = 0; i <  mip->mi_tx_group_count; i++) {
5155 		grp = &mip->mi_tx_groups[i];
5156 
5157 		if ((grp->mrg_state == MAC_GROUP_STATE_RESERVED) ||
5158 		    (grp->mrg_state == MAC_GROUP_STATE_UNINIT))
5159 			continue;
5160 
5161 		rv = mac_start_group(grp);
5162 		ASSERT(rv == 0);
5163 
5164 		grp->mrg_state = MAC_GROUP_STATE_RESERVED;
5165 		break;
5166 	}
5167 
5168 	ASSERT(grp != NULL);
5169 
5170 	/*
5171 	 * Populate the group. Rings should be taken from the group
5172 	 * of unassigned rings, which is past the array of TX
5173 	 * groups adversized by the driver.
5174 	 */
5175 	rv = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_TX, NULL,
5176 	    grp, share);
5177 	if (rv != 0) {
5178 		DTRACE_PROBE3(tx__group__reserve__alloc__rings,
5179 		    char *, mip->mi_name, int, grp->mrg_index, int, rv);
5180 
5181 		mac_stop_group(grp);
5182 		grp->mrg_state = MAC_GROUP_STATE_UNINIT;
5183 
5184 		return (NULL);
5185 	}
5186 
5187 	mip->mi_tx_group_free--;
5188 
5189 	return (grp);
5190 }
5191 
5192 void
5193 mac_release_tx_group(mac_impl_t *mip, mac_group_t *grp)
5194 {
5195 	mac_client_impl_t *mcip = grp->mrg_tx_client;
5196 	mac_share_handle_t share = mcip->mci_share;
5197 	mac_ring_t *ring;
5198 
5199 	ASSERT(mip->mi_tx_group_type == MAC_GROUP_TYPE_DYNAMIC);
5200 	ASSERT(share != NULL);
5201 	ASSERT(grp->mrg_state == MAC_GROUP_STATE_RESERVED);
5202 
5203 	mip->mi_share_capab.ms_sremove(share, grp->mrg_driver);
5204 	while ((ring = grp->mrg_rings) != NULL) {
5205 		/* move the ring back to the pool */
5206 		(void) mac_group_mov_ring(mip, mip->mi_tx_groups +
5207 		    mip->mi_tx_group_count, ring);
5208 	}
5209 	mac_stop_group(grp);
5210 	mac_set_rx_group_state(grp, MAC_GROUP_STATE_REGISTERED);
5211 	grp->mrg_tx_client = NULL;
5212 	mip->mi_tx_group_free++;
5213 }
5214 
5215 /*
5216  * This is a 1-time control path activity initiated by the client (IP).
5217  * The mac perimeter protects against other simultaneous control activities,
5218  * for example an ioctl that attempts to change the degree of fanout and
5219  * increase or decrease the number of softrings associated with this Tx SRS.
5220  */
5221 static mac_tx_notify_cb_t *
5222 mac_client_tx_notify_add(mac_client_impl_t *mcip,
5223     mac_tx_notify_t notify, void *arg)
5224 {
5225 	mac_cb_info_t *mcbi;
5226 	mac_tx_notify_cb_t *mtnfp;
5227 
5228 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
5229 
5230 	mtnfp = kmem_zalloc(sizeof (mac_tx_notify_cb_t), KM_SLEEP);
5231 	mtnfp->mtnf_fn = notify;
5232 	mtnfp->mtnf_arg = arg;
5233 	mtnfp->mtnf_link.mcb_objp = mtnfp;
5234 	mtnfp->mtnf_link.mcb_objsize = sizeof (mac_tx_notify_cb_t);
5235 	mtnfp->mtnf_link.mcb_flags = MCB_TX_NOTIFY_CB_T;
5236 
5237 	mcbi = &mcip->mci_tx_notify_cb_info;
5238 	mutex_enter(mcbi->mcbi_lockp);
5239 	mac_callback_add(mcbi, &mcip->mci_tx_notify_cb_list, &mtnfp->mtnf_link);
5240 	mutex_exit(mcbi->mcbi_lockp);
5241 	return (mtnfp);
5242 }
5243 
5244 static void
5245 mac_client_tx_notify_remove(mac_client_impl_t *mcip, mac_tx_notify_cb_t *mtnfp)
5246 {
5247 	mac_cb_info_t	*mcbi;
5248 	mac_cb_t	**cblist;
5249 
5250 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
5251 
5252 	if (!mac_callback_find(&mcip->mci_tx_notify_cb_info,
5253 	    &mcip->mci_tx_notify_cb_list, &mtnfp->mtnf_link)) {
5254 		cmn_err(CE_WARN,
5255 		    "mac_client_tx_notify_remove: callback not "
5256 		    "found, mcip 0x%p mtnfp 0x%p", (void *)mcip, (void *)mtnfp);
5257 		return;
5258 	}
5259 
5260 	mcbi = &mcip->mci_tx_notify_cb_info;
5261 	cblist = &mcip->mci_tx_notify_cb_list;
5262 	mutex_enter(mcbi->mcbi_lockp);
5263 	if (mac_callback_remove(mcbi, cblist, &mtnfp->mtnf_link))
5264 		kmem_free(mtnfp, sizeof (mac_tx_notify_cb_t));
5265 	else
5266 		mac_callback_remove_wait(&mcip->mci_tx_notify_cb_info);
5267 	mutex_exit(mcbi->mcbi_lockp);
5268 }
5269 
5270 /*
5271  * mac_client_tx_notify():
5272  * call to add and remove flow control callback routine.
5273  */
5274 mac_tx_notify_handle_t
5275 mac_client_tx_notify(mac_client_handle_t mch, mac_tx_notify_t callb_func,
5276     void *ptr)
5277 {
5278 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
5279 	mac_tx_notify_cb_t	*mtnfp = NULL;
5280 
5281 	i_mac_perim_enter(mcip->mci_mip);
5282 
5283 	if (callb_func != NULL) {
5284 		/* Add a notify callback */
5285 		mtnfp = mac_client_tx_notify_add(mcip, callb_func, ptr);
5286 	} else {
5287 		mac_client_tx_notify_remove(mcip, (mac_tx_notify_cb_t *)ptr);
5288 	}
5289 	i_mac_perim_exit(mcip->mci_mip);
5290 
5291 	return ((mac_tx_notify_handle_t)mtnfp);
5292 }
5293