xref: /titanic_51/usr/src/uts/common/io/mac/mac.c (revision 7b3411b6ce9d5ba373d2e36a6c8db48b2dfb74b7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * MAC Services Module
29  *
30  * The GLDv3 framework locking -  The MAC layer
31  * --------------------------------------------
32  *
33  * The MAC layer is central to the GLD framework and can provide the locking
34  * framework needed for itself and for the use of MAC clients. MAC end points
35  * are fairly disjoint and don't share a lot of state. So a coarse grained
36  * multi-threading scheme is to single thread all create/modify/delete or set
37  * type of control operations on a per mac end point while allowing data threads
38  * concurrently.
39  *
40  * Control operations (set) that modify a mac end point are always serialized on
41  * a per mac end point basis, We have at most 1 such thread per mac end point
42  * at a time.
43  *
44  * All other operations that are not serialized are essentially multi-threaded.
45  * For example a control operation (get) like getting statistics which may not
46  * care about reading values atomically or data threads sending or receiving
47  * data. Mostly these type of operations don't modify the control state. Any
48  * state these operations care about are protected using traditional locks.
49  *
50  * The perimeter only serializes serial operations. It does not imply there
51  * aren't any other concurrent operations. However a serialized operation may
52  * sometimes need to make sure it is the only thread. In this case it needs
53  * to use reference counting mechanisms to cv_wait until any current data
54  * threads are done.
55  *
56  * The mac layer itself does not hold any locks across a call to another layer.
57  * The perimeter is however held across a down call to the driver to make the
58  * whole control operation atomic with respect to other control operations.
59  * Also the data path and get type control operations may proceed concurrently.
60  * These operations synchronize with the single serial operation on a given mac
61  * end point using regular locks. The perimeter ensures that conflicting
62  * operations like say a mac_multicast_add and a mac_multicast_remove on the
63  * same mac end point don't interfere with each other and also ensures that the
64  * changes in the mac layer and the call to the underlying driver to say add a
65  * multicast address are done atomically without interference from a thread
66  * trying to delete the same address.
67  *
68  * For example, consider
69  * mac_multicst_add()
70  * {
71  *	mac_perimeter_enter();	serialize all control operations
72  *
73  *	grab list lock		protect against access by data threads
74  *	add to list
75  *	drop list lock
76  *
77  *	call driver's mi_multicst
78  *
79  *	mac_perimeter_exit();
80  * }
81  *
82  * To lessen the number of serialization locks and simplify the lock hierarchy,
83  * we serialize all the control operations on a per mac end point by using a
84  * single serialization lock called the perimeter. We allow recursive entry into
85  * the perimeter to facilitate use of this mechanism by both the mac client and
86  * the MAC layer itself.
87  *
88  * MAC client means an entity that does an operation on a mac handle
89  * obtained from a mac_open/mac_client_open. Similarly MAC driver means
90  * an entity that does an operation on a mac handle obtained from a
91  * mac_register. An entity could be both client and driver but on different
92  * handles eg. aggr. and should only make the corresponding mac interface calls
93  * i.e. mac driver interface or mac client interface as appropriate for that
94  * mac handle.
95  *
96  * General rules.
97  * -------------
98  *
99  * R1. The lock order of upcall threads is natually opposite to downcall
100  * threads. Hence upcalls must not hold any locks across layers for fear of
101  * recursive lock enter and lock order violation. This applies to all layers.
102  *
103  * R2. The perimeter is just another lock. Since it is held in the down
104  * direction, acquiring the perimeter in an upcall is prohibited as it would
105  * cause a deadlock. This applies to all layers.
106  *
107  * Note that upcalls that need to grab the mac perimeter (for example
108  * mac_notify upcalls) can still achieve that by posting the request to a
109  * thread, which can then grab all the required perimeters and locks in the
110  * right global order. Note that in the above example the mac layer iself
111  * won't grab the mac perimeter in the mac_notify upcall, instead the upcall
112  * to the client must do that. Please see the aggr code for an example.
113  *
114  * MAC client rules
115  * ----------------
116  *
117  * R3. A MAC client may use the MAC provided perimeter facility to serialize
118  * control operations on a per mac end point. It does this by by acquring
119  * and holding the perimeter across a sequence of calls to the mac layer.
120  * This ensures atomicity across the entire block of mac calls. In this
121  * model the MAC client must not hold any client locks across the calls to
122  * the mac layer. This model is the preferred solution.
123  *
124  * R4. However if a MAC client has a lot of global state across all mac end
125  * points the per mac end point serialization may not be sufficient. In this
126  * case the client may choose to use global locks or use its own serialization.
127  * To avoid deadlocks, these client layer locks held across the mac calls
128  * in the control path must never be acquired by the data path for the reason
129  * mentioned below.
130  *
131  * (Assume that a control operation that holds a client lock blocks in the
132  * mac layer waiting for upcall reference counts to drop to zero. If an upcall
133  * data thread that holds this reference count, tries to acquire the same
134  * client lock subsequently it will deadlock).
135  *
136  * A MAC client may follow either the R3 model or the R4 model, but can't
137  * mix both. In the former, the hierarchy is Perim -> client locks, but in
138  * the latter it is client locks -> Perim.
139  *
140  * R5. MAC clients must make MAC calls (excluding data calls) in a cv_wait'able
141  * context since they may block while trying to acquire the perimeter.
142  * In addition some calls may block waiting for upcall refcnts to come down to
143  * zero.
144  *
145  * R6. MAC clients must make sure that they are single threaded and all threads
146  * from the top (in particular data threads) have finished before calling
147  * mac_client_close. The MAC framework does not track the number of client
148  * threads using the mac client handle. Also mac clients must make sure
149  * they have undone all the control operations before calling mac_client_close.
150  * For example mac_unicast_remove/mac_multicast_remove to undo the corresponding
151  * mac_unicast_add/mac_multicast_add.
152  *
153  * MAC framework rules
154  * -------------------
155  *
156  * R7. The mac layer itself must not hold any mac layer locks (except the mac
157  * perimeter) across a call to any other layer from the mac layer. The call to
158  * any other layer could be via mi_* entry points, classifier entry points into
159  * the driver or via upcall pointers into layers above. The mac perimeter may
160  * be acquired or held only in the down direction, for e.g. when calling into
161  * a mi_* driver enty point to provide atomicity of the operation.
162  *
163  * R8. Since it is not guaranteed (see R14) that drivers won't hold locks across
164  * mac driver interfaces, the MAC layer must provide a cut out for control
165  * interfaces like upcall notifications and start them in a separate thread.
166  *
167  * R9. Note that locking order also implies a plumbing order. For example
168  * VNICs are allowed to be created over aggrs, but not vice-versa. An attempt
169  * to plumb in any other order must be failed at mac_open time, otherwise it
170  * could lead to deadlocks due to inverse locking order.
171  *
172  * R10. MAC driver interfaces must not block since the driver could call them
173  * in interrupt context.
174  *
175  * R11. Walkers must preferably not hold any locks while calling walker
176  * callbacks. Instead these can operate on reference counts. In simple
177  * callbacks it may be ok to hold a lock and call the callbacks, but this is
178  * harder to maintain in the general case of arbitrary callbacks.
179  *
180  * R12. The MAC layer must protect upcall notification callbacks using reference
181  * counts rather than holding locks across the callbacks.
182  *
183  * R13. Given the variety of drivers, it is preferable if the MAC layer can make
184  * sure that any pointers (such as mac ring pointers) it passes to the driver
185  * remain valid until mac unregister time. Currently the mac layer achieves
186  * this by using generation numbers for rings and freeing the mac rings only
187  * at unregister time.  The MAC layer must provide a layer of indirection and
188  * must not expose underlying driver rings or driver data structures/pointers
189  * directly to MAC clients.
190  *
191  * MAC driver rules
192  * ----------------
193  *
194  * R14. It would be preferable if MAC drivers don't hold any locks across any
195  * mac call. However at a minimum they must not hold any locks across data
196  * upcalls. They must also make sure that all references to mac data structures
197  * are cleaned up and that it is single threaded at mac_unregister time.
198  *
199  * R15. MAC driver interfaces don't block and so the action may be done
200  * asynchronously in a separate thread as for example handling notifications.
201  * The driver must not assume that the action is complete when the call
202  * returns.
203  *
204  * R16. Drivers must maintain a generation number per Rx ring, and pass it
205  * back to mac_rx_ring(); They are expected to increment the generation
206  * number whenever the ring's stop routine is invoked.
207  * See comments in mac_rx_ring();
208  *
209  * R17 Similarly mi_stop is another synchronization point and the driver must
210  * ensure that all upcalls are done and there won't be any future upcall
211  * before returning from mi_stop.
212  *
213  * R18. The driver may assume that all set/modify control operations via
214  * the mi_* entry points are single threaded on a per mac end point.
215  *
216  * Lock and Perimeter hierarchy scenarios
217  * ---------------------------------------
218  *
219  * i_mac_impl_lock -> mi_rw_lock -> srs_lock -> s_ring_lock[i_mac_tx_srs_notify]
220  *
221  * ft_lock -> fe_lock [mac_flow_lookup]
222  *
223  * mi_rw_lock -> fe_lock [mac_bcast_send]
224  *
225  * srs_lock -> mac_bw_lock [mac_rx_srs_drain_bw]
226  *
227  * cpu_lock -> mac_srs_g_lock -> srs_lock -> s_ring_lock [mac_walk_srs_and_bind]
228  *
229  * i_dls_devnet_lock -> mac layer locks [dls_devnet_rename]
230  *
231  * Perimeters are ordered P1 -> P2 -> P3 from top to bottom in order of mac
232  * client to driver. In the case of clients that explictly use the mac provided
233  * perimeter mechanism for its serialization, the hierarchy is
234  * Perimeter -> mac layer locks, since the client never holds any locks across
235  * the mac calls. In the case of clients that use its own locks the hierarchy
236  * is Client locks -> Mac Perim -> Mac layer locks. The client never explicitly
237  * calls mac_perim_enter/exit in this case.
238  *
239  * Subflow creation rules
240  * ---------------------------
241  * o In case of a user specified cpulist present on underlying link and flows,
242  * the flows cpulist must be a subset of the underlying link.
243  * o In case of a user specified fanout mode present on link and flow, the
244  * subflow fanout count has to be less than or equal to that of the
245  * underlying link. The cpu-bindings for the subflows will be a subset of
246  * the underlying link.
247  * o In case if no cpulist specified on both underlying link and flow, the
248  * underlying link relies on a  MAC tunable to provide out of box fanout.
249  * The subflow will have no cpulist (the subflow will be unbound)
250  * o In case if no cpulist is specified on the underlying link, a subflow can
251  * carry  either a user-specified cpulist or fanout count. The cpu-bindings
252  * for the subflow will not adhere to restriction that they need to be subset
253  * of the underlying link.
254  * o In case where the underlying link is carrying either a user specified
255  * cpulist or fanout mode and for a unspecified subflow, the subflow will be
256  * created unbound.
257  * o While creating unbound subflows, bandwidth mode changes attempt to
258  * figure a right fanout count. In such cases the fanout count will override
259  * the unbound cpu-binding behavior.
260  * o In addition to this, while cycling between flow and link properties, we
261  * impose a restriction that if a link property has a subflow with
262  * user-specified attributes, we will not allow changing the link property.
263  * The administrator needs to reset all the user specified properties for the
264  * subflows before attempting a link property change.
265  * Some of the above rules can be overridden by specifying additional command
266  * line options while creating or modifying link or subflow properties.
267  */
268 
269 #include <sys/types.h>
270 #include <sys/conf.h>
271 #include <sys/id_space.h>
272 #include <sys/esunddi.h>
273 #include <sys/stat.h>
274 #include <sys/mkdev.h>
275 #include <sys/stream.h>
276 #include <sys/strsun.h>
277 #include <sys/strsubr.h>
278 #include <sys/dlpi.h>
279 #include <sys/modhash.h>
280 #include <sys/mac_provider.h>
281 #include <sys/mac_client_impl.h>
282 #include <sys/mac_soft_ring.h>
283 #include <sys/mac_impl.h>
284 #include <sys/mac.h>
285 #include <sys/dls.h>
286 #include <sys/dld.h>
287 #include <sys/modctl.h>
288 #include <sys/fs/dv_node.h>
289 #include <sys/thread.h>
290 #include <sys/proc.h>
291 #include <sys/callb.h>
292 #include <sys/cpuvar.h>
293 #include <sys/atomic.h>
294 #include <sys/bitmap.h>
295 #include <sys/sdt.h>
296 #include <sys/mac_flow.h>
297 #include <sys/ddi_intr_impl.h>
298 #include <sys/disp.h>
299 #include <sys/sdt.h>
300 #include <sys/vnic.h>
301 #include <sys/vnic_impl.h>
302 #include <sys/vlan.h>
303 #include <inet/ip.h>
304 #include <inet/ip6.h>
305 #include <sys/exacct.h>
306 #include <sys/exacct_impl.h>
307 #include <inet/nd.h>
308 #include <sys/ethernet.h>
309 
310 #define	IMPL_HASHSZ	67	/* prime */
311 
312 kmem_cache_t	*i_mac_impl_cachep;
313 mod_hash_t		*i_mac_impl_hash;
314 krwlock_t		i_mac_impl_lock;
315 uint_t			i_mac_impl_count;
316 static kmem_cache_t	*mac_ring_cache;
317 static id_space_t	*minor_ids;
318 static uint32_t		minor_count;
319 
320 /*
321  * Logging stuff. Perhaps mac_logging_interval could be broken into
322  * mac_flow_log_interval and mac_link_log_interval if we want to be
323  * able to schedule them differently.
324  */
325 uint_t			mac_logging_interval;
326 boolean_t		mac_flow_log_enable;
327 boolean_t		mac_link_log_enable;
328 timeout_id_t		mac_logging_timer;
329 
330 /* for debugging, see MAC_DBG_PRT() in mac_impl.h */
331 int mac_dbg = 0;
332 
333 #define	MACTYPE_KMODDIR	"mac"
334 #define	MACTYPE_HASHSZ	67
335 static mod_hash_t	*i_mactype_hash;
336 /*
337  * i_mactype_lock synchronizes threads that obtain references to mactype_t
338  * structures through i_mactype_getplugin().
339  */
340 static kmutex_t		i_mactype_lock;
341 
342 /*
343  * mac_tx_percpu_cnt
344  *
345  * Number of per cpu locks per mac_client_impl_t. Used by the transmit side
346  * in mac_tx to reduce lock contention. This is sized at boot time in mac_init.
347  * mac_tx_percpu_cnt_max is settable in /etc/system and must be a power of 2.
348  * Per cpu locks may be disabled by setting mac_tx_percpu_cnt_max to 1.
349  */
350 int mac_tx_percpu_cnt;
351 int mac_tx_percpu_cnt_max = 128;
352 
353 static int i_mac_constructor(void *, void *, int);
354 static void i_mac_destructor(void *, void *);
355 static int i_mac_ring_ctor(void *, void *, int);
356 static void i_mac_ring_dtor(void *, void *);
357 static mblk_t *mac_rx_classify(mac_impl_t *, mac_resource_handle_t, mblk_t *);
358 void mac_tx_client_flush(mac_client_impl_t *);
359 void mac_tx_client_block(mac_client_impl_t *);
360 static void mac_rx_ring_quiesce(mac_ring_t *, uint_t);
361 static int mac_start_group_and_rings(mac_group_t *);
362 static void mac_stop_group_and_rings(mac_group_t *);
363 
364 /*
365  * Module initialization functions.
366  */
367 
368 void
369 mac_init(void)
370 {
371 	mac_tx_percpu_cnt = ((boot_max_ncpus == -1) ? max_ncpus :
372 	    boot_max_ncpus);
373 
374 	/* Upper bound is mac_tx_percpu_cnt_max */
375 	if (mac_tx_percpu_cnt > mac_tx_percpu_cnt_max)
376 		mac_tx_percpu_cnt = mac_tx_percpu_cnt_max;
377 
378 	if (mac_tx_percpu_cnt < 1) {
379 		/* Someone set max_tx_percpu_cnt_max to 0 or less */
380 		mac_tx_percpu_cnt = 1;
381 	}
382 
383 	ASSERT(mac_tx_percpu_cnt >= 1);
384 	mac_tx_percpu_cnt = (1 << highbit(mac_tx_percpu_cnt - 1));
385 	/*
386 	 * Make it of the form 2**N - 1 in the range
387 	 * [0 .. mac_tx_percpu_cnt_max - 1]
388 	 */
389 	mac_tx_percpu_cnt--;
390 
391 	i_mac_impl_cachep = kmem_cache_create("mac_impl_cache",
392 	    sizeof (mac_impl_t), 0, i_mac_constructor, i_mac_destructor,
393 	    NULL, NULL, NULL, 0);
394 	ASSERT(i_mac_impl_cachep != NULL);
395 
396 	mac_ring_cache = kmem_cache_create("mac_ring_cache",
397 	    sizeof (mac_ring_t), 0, i_mac_ring_ctor, i_mac_ring_dtor, NULL,
398 	    NULL, NULL, 0);
399 	ASSERT(mac_ring_cache != NULL);
400 
401 	i_mac_impl_hash = mod_hash_create_extended("mac_impl_hash",
402 	    IMPL_HASHSZ, mod_hash_null_keydtor, mod_hash_null_valdtor,
403 	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
404 	rw_init(&i_mac_impl_lock, NULL, RW_DEFAULT, NULL);
405 
406 	mac_flow_init();
407 	mac_soft_ring_init();
408 	mac_bcast_init();
409 	mac_client_init();
410 
411 	i_mac_impl_count = 0;
412 
413 	i_mactype_hash = mod_hash_create_extended("mactype_hash",
414 	    MACTYPE_HASHSZ,
415 	    mod_hash_null_keydtor, mod_hash_null_valdtor,
416 	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
417 
418 	/*
419 	 * Allocate an id space to manage minor numbers. The range of the
420 	 * space will be from MAC_MAX_MINOR+1 to MAC_PRIVATE_MINOR-1.  This
421 	 * leaves half of the 32-bit minors available for driver private use.
422 	 */
423 	minor_ids = id_space_create("mac_minor_ids", MAC_MAX_MINOR+1,
424 	    MAC_PRIVATE_MINOR-1);
425 	ASSERT(minor_ids != NULL);
426 	minor_count = 0;
427 
428 	/* Let's default to 20 seconds */
429 	mac_logging_interval = 20;
430 	mac_flow_log_enable = B_FALSE;
431 	mac_link_log_enable = B_FALSE;
432 	mac_logging_timer = 0;
433 }
434 
435 int
436 mac_fini(void)
437 {
438 	if (i_mac_impl_count > 0 || minor_count > 0)
439 		return (EBUSY);
440 
441 	id_space_destroy(minor_ids);
442 	mac_flow_fini();
443 
444 	mod_hash_destroy_hash(i_mac_impl_hash);
445 	rw_destroy(&i_mac_impl_lock);
446 
447 	mac_client_fini();
448 	kmem_cache_destroy(mac_ring_cache);
449 
450 	mod_hash_destroy_hash(i_mactype_hash);
451 	mac_soft_ring_finish();
452 	return (0);
453 }
454 
455 void
456 mac_init_ops(struct dev_ops *ops, const char *name)
457 {
458 	dld_init_ops(ops, name);
459 }
460 
461 void
462 mac_fini_ops(struct dev_ops *ops)
463 {
464 	dld_fini_ops(ops);
465 }
466 
467 /*ARGSUSED*/
468 static int
469 i_mac_constructor(void *buf, void *arg, int kmflag)
470 {
471 	mac_impl_t	*mip = buf;
472 
473 	bzero(buf, sizeof (mac_impl_t));
474 
475 	mip->mi_linkstate = LINK_STATE_UNKNOWN;
476 	mip->mi_nclients = 0;
477 
478 	mutex_init(&mip->mi_lock, NULL, MUTEX_DRIVER, NULL);
479 	rw_init(&mip->mi_rw_lock, NULL, RW_DRIVER, NULL);
480 	mutex_init(&mip->mi_notify_lock, NULL, MUTEX_DRIVER, NULL);
481 	mutex_init(&mip->mi_promisc_lock, NULL, MUTEX_DRIVER, NULL);
482 	mutex_init(&mip->mi_ring_lock, NULL, MUTEX_DEFAULT, NULL);
483 
484 	mip->mi_notify_cb_info.mcbi_lockp = &mip->mi_notify_lock;
485 	cv_init(&mip->mi_notify_cb_info.mcbi_cv, NULL, CV_DRIVER, NULL);
486 	mip->mi_promisc_cb_info.mcbi_lockp = &mip->mi_promisc_lock;
487 	cv_init(&mip->mi_promisc_cb_info.mcbi_cv, NULL, CV_DRIVER, NULL);
488 	return (0);
489 }
490 
491 /*ARGSUSED*/
492 static void
493 i_mac_destructor(void *buf, void *arg)
494 {
495 	mac_impl_t	*mip = buf;
496 	mac_cb_info_t	*mcbi;
497 
498 	ASSERT(mip->mi_ref == 0);
499 	ASSERT(mip->mi_active == 0);
500 	ASSERT(mip->mi_linkstate == LINK_STATE_UNKNOWN);
501 	ASSERT(mip->mi_devpromisc == 0);
502 	ASSERT(mip->mi_ksp == NULL);
503 	ASSERT(mip->mi_kstat_count == 0);
504 	ASSERT(mip->mi_nclients == 0);
505 	ASSERT(mip->mi_nactiveclients == 0);
506 	ASSERT(mip->mi_single_active_client == NULL);
507 	ASSERT(mip->mi_state_flags == 0);
508 	ASSERT(mip->mi_factory_addr == NULL);
509 	ASSERT(mip->mi_factory_addr_num == 0);
510 	ASSERT(mip->mi_default_tx_ring == NULL);
511 
512 	mcbi = &mip->mi_notify_cb_info;
513 	ASSERT(mcbi->mcbi_del_cnt == 0 && mcbi->mcbi_walker_cnt == 0);
514 	ASSERT(mip->mi_notify_bits == 0);
515 	ASSERT(mip->mi_notify_thread == NULL);
516 	ASSERT(mcbi->mcbi_lockp == &mip->mi_notify_lock);
517 	mcbi->mcbi_lockp = NULL;
518 
519 	mcbi = &mip->mi_promisc_cb_info;
520 	ASSERT(mcbi->mcbi_del_cnt == 0 && mip->mi_promisc_list == NULL);
521 	ASSERT(mip->mi_promisc_list == NULL);
522 	ASSERT(mcbi->mcbi_lockp == &mip->mi_promisc_lock);
523 	mcbi->mcbi_lockp = NULL;
524 
525 	ASSERT(mip->mi_bcast_ngrps == 0 && mip->mi_bcast_grp == NULL);
526 	ASSERT(mip->mi_perim_owner == NULL && mip->mi_perim_ocnt == 0);
527 
528 	mutex_destroy(&mip->mi_lock);
529 	rw_destroy(&mip->mi_rw_lock);
530 
531 	mutex_destroy(&mip->mi_promisc_lock);
532 	cv_destroy(&mip->mi_promisc_cb_info.mcbi_cv);
533 	mutex_destroy(&mip->mi_notify_lock);
534 	cv_destroy(&mip->mi_notify_cb_info.mcbi_cv);
535 	mutex_destroy(&mip->mi_ring_lock);
536 }
537 
538 /* ARGSUSED */
539 static int
540 i_mac_ring_ctor(void *buf, void *arg, int kmflag)
541 {
542 	mac_ring_t *ring = (mac_ring_t *)buf;
543 
544 	bzero(ring, sizeof (mac_ring_t));
545 	cv_init(&ring->mr_cv, NULL, CV_DEFAULT, NULL);
546 	mutex_init(&ring->mr_lock, NULL, MUTEX_DEFAULT, NULL);
547 	ring->mr_state = MR_FREE;
548 	return (0);
549 }
550 
551 /* ARGSUSED */
552 static void
553 i_mac_ring_dtor(void *buf, void *arg)
554 {
555 	mac_ring_t *ring = (mac_ring_t *)buf;
556 
557 	cv_destroy(&ring->mr_cv);
558 	mutex_destroy(&ring->mr_lock);
559 }
560 
561 /*
562  * Common functions to do mac callback addition and deletion. Currently this is
563  * used by promisc callbacks and notify callbacks. List addition and deletion
564  * need to take care of list walkers. List walkers in general, can't hold list
565  * locks and make upcall callbacks due to potential lock order and recursive
566  * reentry issues. Instead list walkers increment the list walker count to mark
567  * the presence of a walker thread. Addition can be carefully done to ensure
568  * that the list walker always sees either the old list or the new list.
569  * However the deletion can't be done while the walker is active, instead the
570  * deleting thread simply marks the entry as logically deleted. The last walker
571  * physically deletes and frees up the logically deleted entries when the walk
572  * is complete.
573  */
574 void
575 mac_callback_add(mac_cb_info_t *mcbi, mac_cb_t **mcb_head,
576     mac_cb_t *mcb_elem)
577 {
578 	mac_cb_t	*p;
579 	mac_cb_t	**pp;
580 
581 	/* Verify it is not already in the list */
582 	for (pp = mcb_head; (p = *pp) != NULL; pp = &p->mcb_nextp) {
583 		if (p == mcb_elem)
584 			break;
585 	}
586 	VERIFY(p == NULL);
587 
588 	/*
589 	 * Add it to the head of the callback list. The membar ensures that
590 	 * the following list pointer manipulations reach global visibility
591 	 * in exactly the program order below.
592 	 */
593 	ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
594 
595 	mcb_elem->mcb_nextp = *mcb_head;
596 	membar_producer();
597 	*mcb_head = mcb_elem;
598 }
599 
600 /*
601  * Mark the entry as logically deleted. If there aren't any walkers unlink
602  * from the list. In either case return the corresponding status.
603  */
604 boolean_t
605 mac_callback_remove(mac_cb_info_t *mcbi, mac_cb_t **mcb_head,
606     mac_cb_t *mcb_elem)
607 {
608 	mac_cb_t	*p;
609 	mac_cb_t	**pp;
610 
611 	ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
612 	/*
613 	 * Search the callback list for the entry to be removed
614 	 */
615 	for (pp = mcb_head; (p = *pp) != NULL; pp = &p->mcb_nextp) {
616 		if (p == mcb_elem)
617 			break;
618 	}
619 	VERIFY(p != NULL);
620 
621 	/*
622 	 * If there are walkers just mark it as deleted and the last walker
623 	 * will remove from the list and free it.
624 	 */
625 	if (mcbi->mcbi_walker_cnt != 0) {
626 		p->mcb_flags |= MCB_CONDEMNED;
627 		mcbi->mcbi_del_cnt++;
628 		return (B_FALSE);
629 	}
630 
631 	ASSERT(mcbi->mcbi_del_cnt == 0);
632 	*pp = p->mcb_nextp;
633 	p->mcb_nextp = NULL;
634 	return (B_TRUE);
635 }
636 
637 /*
638  * Wait for all pending callback removals to be completed
639  */
640 void
641 mac_callback_remove_wait(mac_cb_info_t *mcbi)
642 {
643 	ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
644 	while (mcbi->mcbi_del_cnt != 0) {
645 		DTRACE_PROBE1(need_wait, mac_cb_info_t *, mcbi);
646 		cv_wait(&mcbi->mcbi_cv, mcbi->mcbi_lockp);
647 	}
648 }
649 
650 /*
651  * The last mac callback walker does the cleanup. Walk the list and unlik
652  * all the logically deleted entries and construct a temporary list of
653  * removed entries. Return the list of removed entries to the caller.
654  */
655 mac_cb_t *
656 mac_callback_walker_cleanup(mac_cb_info_t *mcbi, mac_cb_t **mcb_head)
657 {
658 	mac_cb_t	*p;
659 	mac_cb_t	**pp;
660 	mac_cb_t	*rmlist = NULL;		/* List of removed elements */
661 	int	cnt = 0;
662 
663 	ASSERT(MUTEX_HELD(mcbi->mcbi_lockp));
664 	ASSERT(mcbi->mcbi_del_cnt != 0 && mcbi->mcbi_walker_cnt == 0);
665 
666 	pp = mcb_head;
667 	while (*pp != NULL) {
668 		if ((*pp)->mcb_flags & MCB_CONDEMNED) {
669 			p = *pp;
670 			*pp = p->mcb_nextp;
671 			p->mcb_nextp = rmlist;
672 			rmlist = p;
673 			cnt++;
674 			continue;
675 		}
676 		pp = &(*pp)->mcb_nextp;
677 	}
678 
679 	ASSERT(mcbi->mcbi_del_cnt == cnt);
680 	mcbi->mcbi_del_cnt = 0;
681 	return (rmlist);
682 }
683 
684 boolean_t
685 mac_callback_lookup(mac_cb_t **mcb_headp, mac_cb_t *mcb_elem)
686 {
687 	mac_cb_t	*mcb;
688 
689 	/* Verify it is not already in the list */
690 	for (mcb = *mcb_headp; mcb != NULL; mcb = mcb->mcb_nextp) {
691 		if (mcb == mcb_elem)
692 			return (B_TRUE);
693 	}
694 
695 	return (B_FALSE);
696 }
697 
698 boolean_t
699 mac_callback_find(mac_cb_info_t *mcbi, mac_cb_t **mcb_headp, mac_cb_t *mcb_elem)
700 {
701 	boolean_t	found;
702 
703 	mutex_enter(mcbi->mcbi_lockp);
704 	found = mac_callback_lookup(mcb_headp, mcb_elem);
705 	mutex_exit(mcbi->mcbi_lockp);
706 
707 	return (found);
708 }
709 
710 /* Free the list of removed callbacks */
711 void
712 mac_callback_free(mac_cb_t *rmlist)
713 {
714 	mac_cb_t	*mcb;
715 	mac_cb_t	*mcb_next;
716 
717 	for (mcb = rmlist; mcb != NULL; mcb = mcb_next) {
718 		mcb_next = mcb->mcb_nextp;
719 		kmem_free(mcb->mcb_objp, mcb->mcb_objsize);
720 	}
721 }
722 
723 /*
724  * The promisc callbacks are in 2 lists, one off the 'mip' and another off the
725  * 'mcip' threaded by mpi_mi_link and mpi_mci_link respectively. However there
726  * is only a single shared total walker count, and an entry can't be physically
727  * unlinked if a walker is active on either list. The last walker does this
728  * cleanup of logically deleted entries.
729  */
730 void
731 i_mac_promisc_walker_cleanup(mac_impl_t *mip)
732 {
733 	mac_cb_t	*rmlist;
734 	mac_cb_t	*mcb;
735 	mac_cb_t	*mcb_next;
736 	mac_promisc_impl_t	*mpip;
737 
738 	/*
739 	 * Construct a temporary list of deleted callbacks by walking the
740 	 * the mi_promisc_list. Then for each entry in the temporary list,
741 	 * remove it from the mci_promisc_list and free the entry.
742 	 */
743 	rmlist = mac_callback_walker_cleanup(&mip->mi_promisc_cb_info,
744 	    &mip->mi_promisc_list);
745 
746 	for (mcb = rmlist; mcb != NULL; mcb = mcb_next) {
747 		mcb_next = mcb->mcb_nextp;
748 		mpip = (mac_promisc_impl_t *)mcb->mcb_objp;
749 		VERIFY(mac_callback_remove(&mip->mi_promisc_cb_info,
750 		    &mpip->mpi_mcip->mci_promisc_list, &mpip->mpi_mci_link));
751 		mcb->mcb_flags = 0;
752 		mcb->mcb_nextp = NULL;
753 		kmem_cache_free(mac_promisc_impl_cache, mpip);
754 	}
755 }
756 
757 void
758 i_mac_notify(mac_impl_t *mip, mac_notify_type_t type)
759 {
760 	mac_cb_info_t	*mcbi;
761 
762 	/*
763 	 * Signal the notify thread even after mi_ref has become zero and
764 	 * mi_disabled is set. The synchronization with the notify thread
765 	 * happens in mac_unregister and that implies the driver must make
766 	 * sure it is single-threaded (with respect to mac calls) and that
767 	 * all pending mac calls have returned before it calls mac_unregister
768 	 */
769 	rw_enter(&i_mac_impl_lock, RW_READER);
770 	if (mip->mi_state_flags & MIS_DISABLED)
771 		goto exit;
772 
773 	/*
774 	 * Guard against incorrect notifications.  (Running a newer
775 	 * mac client against an older implementation?)
776 	 */
777 	if (type >= MAC_NNOTE)
778 		goto exit;
779 
780 	mcbi = &mip->mi_notify_cb_info;
781 	mutex_enter(mcbi->mcbi_lockp);
782 	mip->mi_notify_bits |= (1 << type);
783 	cv_broadcast(&mcbi->mcbi_cv);
784 	mutex_exit(mcbi->mcbi_lockp);
785 
786 exit:
787 	rw_exit(&i_mac_impl_lock);
788 }
789 
790 /*
791  * Mac serialization primitives. Please see the block comment at the
792  * top of the file.
793  */
794 void
795 i_mac_perim_enter(mac_impl_t *mip)
796 {
797 	mac_client_impl_t	*mcip;
798 
799 	if (mip->mi_state_flags & MIS_IS_VNIC) {
800 		/*
801 		 * This is a VNIC. Return the lower mac since that is what
802 		 * we want to serialize on.
803 		 */
804 		mcip = mac_vnic_lower(mip);
805 		mip = mcip->mci_mip;
806 	}
807 
808 	mutex_enter(&mip->mi_perim_lock);
809 	if (mip->mi_perim_owner == curthread) {
810 		mip->mi_perim_ocnt++;
811 		mutex_exit(&mip->mi_perim_lock);
812 		return;
813 	}
814 
815 	while (mip->mi_perim_owner != NULL)
816 		cv_wait(&mip->mi_perim_cv, &mip->mi_perim_lock);
817 
818 	mip->mi_perim_owner = curthread;
819 	ASSERT(mip->mi_perim_ocnt == 0);
820 	mip->mi_perim_ocnt++;
821 #ifdef DEBUG
822 	mip->mi_perim_stack_depth = getpcstack(mip->mi_perim_stack,
823 	    MAC_PERIM_STACK_DEPTH);
824 #endif
825 	mutex_exit(&mip->mi_perim_lock);
826 }
827 
828 int
829 i_mac_perim_enter_nowait(mac_impl_t *mip)
830 {
831 	/*
832 	 * The vnic is a special case, since the serialization is done based
833 	 * on the lower mac. If the lower mac is busy, it does not imply the
834 	 * vnic can't be unregistered. But in the case of other drivers,
835 	 * a busy perimeter or open mac handles implies that the mac is busy
836 	 * and can't be unregistered.
837 	 */
838 	if (mip->mi_state_flags & MIS_IS_VNIC) {
839 		i_mac_perim_enter(mip);
840 		return (0);
841 	}
842 
843 	mutex_enter(&mip->mi_perim_lock);
844 	if (mip->mi_perim_owner != NULL) {
845 		mutex_exit(&mip->mi_perim_lock);
846 		return (EBUSY);
847 	}
848 	ASSERT(mip->mi_perim_ocnt == 0);
849 	mip->mi_perim_owner = curthread;
850 	mip->mi_perim_ocnt++;
851 	mutex_exit(&mip->mi_perim_lock);
852 
853 	return (0);
854 }
855 
856 void
857 i_mac_perim_exit(mac_impl_t *mip)
858 {
859 	mac_client_impl_t *mcip;
860 
861 	if (mip->mi_state_flags & MIS_IS_VNIC) {
862 		/*
863 		 * This is a VNIC. Return the lower mac since that is what
864 		 * we want to serialize on.
865 		 */
866 		mcip = mac_vnic_lower(mip);
867 		mip = mcip->mci_mip;
868 	}
869 
870 	ASSERT(mip->mi_perim_owner == curthread && mip->mi_perim_ocnt != 0);
871 
872 	mutex_enter(&mip->mi_perim_lock);
873 	if (--mip->mi_perim_ocnt == 0) {
874 		mip->mi_perim_owner = NULL;
875 		cv_signal(&mip->mi_perim_cv);
876 	}
877 	mutex_exit(&mip->mi_perim_lock);
878 }
879 
880 /*
881  * Returns whether the current thread holds the mac perimeter. Used in making
882  * assertions.
883  */
884 boolean_t
885 mac_perim_held(mac_handle_t mh)
886 {
887 	mac_impl_t	*mip = (mac_impl_t *)mh;
888 	mac_client_impl_t *mcip;
889 
890 	if (mip->mi_state_flags & MIS_IS_VNIC) {
891 		/*
892 		 * This is a VNIC. Return the lower mac since that is what
893 		 * we want to serialize on.
894 		 */
895 		mcip = mac_vnic_lower(mip);
896 		mip = mcip->mci_mip;
897 	}
898 	return (mip->mi_perim_owner == curthread);
899 }
900 
901 /*
902  * mac client interfaces to enter the mac perimeter of a mac end point, given
903  * its mac handle, or macname or linkid.
904  */
905 void
906 mac_perim_enter_by_mh(mac_handle_t mh, mac_perim_handle_t *mphp)
907 {
908 	mac_impl_t	*mip = (mac_impl_t *)mh;
909 
910 	i_mac_perim_enter(mip);
911 	/*
912 	 * The mac_perim_handle_t returned encodes the 'mip' and whether a
913 	 * mac_open has been done internally while entering the perimeter.
914 	 * This information is used in mac_perim_exit
915 	 */
916 	MAC_ENCODE_MPH(*mphp, mip, 0);
917 }
918 
919 int
920 mac_perim_enter_by_macname(const char *name, mac_perim_handle_t *mphp)
921 {
922 	int	err;
923 	mac_handle_t	mh;
924 
925 	if ((err = mac_open(name, &mh)) != 0)
926 		return (err);
927 
928 	mac_perim_enter_by_mh(mh, mphp);
929 	MAC_ENCODE_MPH(*mphp, mh, 1);
930 	return (0);
931 }
932 
933 int
934 mac_perim_enter_by_linkid(datalink_id_t linkid, mac_perim_handle_t *mphp)
935 {
936 	int	err;
937 	mac_handle_t	mh;
938 
939 	if ((err = mac_open_by_linkid(linkid, &mh)) != 0)
940 		return (err);
941 
942 	mac_perim_enter_by_mh(mh, mphp);
943 	MAC_ENCODE_MPH(*mphp, mh, 1);
944 	return (0);
945 }
946 
947 void
948 mac_perim_exit(mac_perim_handle_t mph)
949 {
950 	mac_impl_t	*mip;
951 	boolean_t	need_close;
952 
953 	MAC_DECODE_MPH(mph, mip, need_close);
954 	i_mac_perim_exit(mip);
955 	if (need_close)
956 		mac_close((mac_handle_t)mip);
957 }
958 
959 int
960 mac_hold(const char *macname, mac_impl_t **pmip)
961 {
962 	mac_impl_t	*mip;
963 	int		err;
964 
965 	/*
966 	 * Check the device name length to make sure it won't overflow our
967 	 * buffer.
968 	 */
969 	if (strlen(macname) >= MAXNAMELEN)
970 		return (EINVAL);
971 
972 	/*
973 	 * Look up its entry in the global hash table.
974 	 */
975 	rw_enter(&i_mac_impl_lock, RW_WRITER);
976 	err = mod_hash_find(i_mac_impl_hash, (mod_hash_key_t)macname,
977 	    (mod_hash_val_t *)&mip);
978 
979 	if (err != 0) {
980 		rw_exit(&i_mac_impl_lock);
981 		return (ENOENT);
982 	}
983 
984 	if (mip->mi_state_flags & MIS_DISABLED) {
985 		rw_exit(&i_mac_impl_lock);
986 		return (ENOENT);
987 	}
988 
989 	if (mip->mi_state_flags & MIS_EXCLUSIVE_HELD) {
990 		rw_exit(&i_mac_impl_lock);
991 		return (EBUSY);
992 	}
993 
994 	mip->mi_ref++;
995 	rw_exit(&i_mac_impl_lock);
996 
997 	*pmip = mip;
998 	return (0);
999 }
1000 
1001 void
1002 mac_rele(mac_impl_t *mip)
1003 {
1004 	rw_enter(&i_mac_impl_lock, RW_WRITER);
1005 	ASSERT(mip->mi_ref != 0);
1006 	if (--mip->mi_ref == 0) {
1007 		ASSERT(mip->mi_nactiveclients == 0 &&
1008 		    !(mip->mi_state_flags & MIS_EXCLUSIVE));
1009 	}
1010 	rw_exit(&i_mac_impl_lock);
1011 }
1012 
1013 /*
1014  * Private GLDv3 function to start a MAC instance.
1015  */
1016 int
1017 mac_start(mac_handle_t mh)
1018 {
1019 	mac_impl_t	*mip = (mac_impl_t *)mh;
1020 	int		err = 0;
1021 
1022 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1023 	ASSERT(mip->mi_start != NULL);
1024 
1025 	/*
1026 	 * Check whether the device is already started.
1027 	 */
1028 	if (mip->mi_active++ == 0) {
1029 		mac_ring_t *ring = NULL;
1030 
1031 		/*
1032 		 * Start the device.
1033 		 */
1034 		err = mip->mi_start(mip->mi_driver);
1035 		if (err != 0) {
1036 			mip->mi_active--;
1037 			return (err);
1038 		}
1039 
1040 		/*
1041 		 * Start the default tx ring.
1042 		 */
1043 		if (mip->mi_default_tx_ring != NULL) {
1044 
1045 			ring = (mac_ring_t *)mip->mi_default_tx_ring;
1046 			err = mac_start_ring(ring);
1047 			if (err != 0) {
1048 				mip->mi_active--;
1049 				return (err);
1050 			}
1051 			ring->mr_state = MR_INUSE;
1052 		}
1053 
1054 		if (mip->mi_rx_groups != NULL) {
1055 			/*
1056 			 * Start the default ring, since it will be needed
1057 			 * to receive broadcast and multicast traffic for
1058 			 * both primary and non-primary MAC clients.
1059 			 */
1060 			mac_group_t *grp = &mip->mi_rx_groups[0];
1061 
1062 			ASSERT(grp->mrg_state == MAC_GROUP_STATE_REGISTERED);
1063 			err = mac_start_group_and_rings(grp);
1064 			if (err != 0) {
1065 				mip->mi_active--;
1066 				if (ring != NULL) {
1067 					mac_stop_ring(ring);
1068 					ring->mr_state = MR_FREE;
1069 				}
1070 				return (err);
1071 			}
1072 			mac_set_rx_group_state(grp, MAC_GROUP_STATE_SHARED);
1073 		}
1074 	}
1075 
1076 	return (err);
1077 }
1078 
1079 /*
1080  * Private GLDv3 function to stop a MAC instance.
1081  */
1082 void
1083 mac_stop(mac_handle_t mh)
1084 {
1085 	mac_impl_t	*mip = (mac_impl_t *)mh;
1086 
1087 	ASSERT(mip->mi_stop != NULL);
1088 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1089 
1090 	/*
1091 	 * Check whether the device is still needed.
1092 	 */
1093 	ASSERT(mip->mi_active != 0);
1094 	if (--mip->mi_active == 0) {
1095 		if (mip->mi_rx_groups != NULL) {
1096 			/*
1097 			 * There should be no more active clients since the
1098 			 * MAC is being stopped. Stop the default RX group
1099 			 * and transition it back to registered state.
1100 			 */
1101 			mac_group_t *grp = &mip->mi_rx_groups[0];
1102 
1103 			/*
1104 			 * When clients are torn down, the groups
1105 			 * are release via mac_release_rx_group which
1106 			 * knows the the default group is always in
1107 			 * started mode since broadcast uses it. So
1108 			 * we can assert that their are no clients
1109 			 * (since mac_bcast_add doesn't register itself
1110 			 * as a client) and group is in SHARED state.
1111 			 */
1112 			ASSERT(grp->mrg_state == MAC_GROUP_STATE_SHARED);
1113 			ASSERT(MAC_RX_GROUP_NO_CLIENT(grp) &&
1114 			    mip->mi_nactiveclients == 0);
1115 			mac_stop_group_and_rings(grp);
1116 			mac_set_rx_group_state(grp, MAC_GROUP_STATE_REGISTERED);
1117 		}
1118 
1119 		if (mip->mi_default_tx_ring != NULL) {
1120 			mac_ring_t *ring;
1121 
1122 			ring = (mac_ring_t *)mip->mi_default_tx_ring;
1123 			mac_stop_ring(ring);
1124 			ring->mr_state = MR_FREE;
1125 		}
1126 
1127 		/*
1128 		 * Stop the device.
1129 		 */
1130 		mip->mi_stop(mip->mi_driver);
1131 	}
1132 }
1133 
1134 int
1135 i_mac_promisc_set(mac_impl_t *mip, boolean_t on)
1136 {
1137 	int		err = 0;
1138 
1139 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1140 	ASSERT(mip->mi_setpromisc != NULL);
1141 
1142 	if (on) {
1143 		/*
1144 		 * Enable promiscuous mode on the device if not yet enabled.
1145 		 */
1146 		if (mip->mi_devpromisc++ == 0) {
1147 			err = mip->mi_setpromisc(mip->mi_driver, B_TRUE);
1148 			if (err != 0) {
1149 				mip->mi_devpromisc--;
1150 				return (err);
1151 			}
1152 			i_mac_notify(mip, MAC_NOTE_DEVPROMISC);
1153 		}
1154 	} else {
1155 		if (mip->mi_devpromisc == 0)
1156 			return (EPROTO);
1157 
1158 		/*
1159 		 * Disable promiscuous mode on the device if this is the last
1160 		 * enabling.
1161 		 */
1162 		if (--mip->mi_devpromisc == 0) {
1163 			err = mip->mi_setpromisc(mip->mi_driver, B_FALSE);
1164 			if (err != 0) {
1165 				mip->mi_devpromisc++;
1166 				return (err);
1167 			}
1168 			i_mac_notify(mip, MAC_NOTE_DEVPROMISC);
1169 		}
1170 	}
1171 
1172 	return (0);
1173 }
1174 
1175 /*
1176  * The promiscuity state can change any time. If the caller needs to take
1177  * actions that are atomic with the promiscuity state, then the caller needs
1178  * to bracket the entire sequence with mac_perim_enter/exit
1179  */
1180 boolean_t
1181 mac_promisc_get(mac_handle_t mh)
1182 {
1183 	mac_impl_t		*mip = (mac_impl_t *)mh;
1184 
1185 	/*
1186 	 * Return the current promiscuity.
1187 	 */
1188 	return (mip->mi_devpromisc != 0);
1189 }
1190 
1191 /*
1192  * Invoked at MAC instance attach time to initialize the list
1193  * of factory MAC addresses supported by a MAC instance. This function
1194  * builds a local cache in the mac_impl_t for the MAC addresses
1195  * supported by the underlying hardware. The MAC clients themselves
1196  * use the mac_addr_factory*() functions to query and reserve
1197  * factory MAC addresses.
1198  */
1199 void
1200 mac_addr_factory_init(mac_impl_t *mip)
1201 {
1202 	mac_capab_multifactaddr_t capab;
1203 	uint8_t *addr;
1204 	int i;
1205 
1206 	/*
1207 	 * First round to see how many factory MAC addresses are available.
1208 	 */
1209 	bzero(&capab, sizeof (capab));
1210 	if (!i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_MULTIFACTADDR,
1211 	    &capab) || (capab.mcm_naddr == 0)) {
1212 		/*
1213 		 * The MAC instance doesn't support multiple factory
1214 		 * MAC addresses, we're done here.
1215 		 */
1216 		return;
1217 	}
1218 
1219 	/*
1220 	 * Allocate the space and get all the factory addresses.
1221 	 */
1222 	addr = kmem_alloc(capab.mcm_naddr * MAXMACADDRLEN, KM_SLEEP);
1223 	capab.mcm_getaddr(mip->mi_driver, capab.mcm_naddr, addr);
1224 
1225 	mip->mi_factory_addr_num = capab.mcm_naddr;
1226 	mip->mi_factory_addr = kmem_zalloc(mip->mi_factory_addr_num *
1227 	    sizeof (mac_factory_addr_t), KM_SLEEP);
1228 
1229 	for (i = 0; i < capab.mcm_naddr; i++) {
1230 		bcopy(addr + i * MAXMACADDRLEN,
1231 		    mip->mi_factory_addr[i].mfa_addr,
1232 		    mip->mi_type->mt_addr_length);
1233 		mip->mi_factory_addr[i].mfa_in_use = B_FALSE;
1234 	}
1235 
1236 	kmem_free(addr, capab.mcm_naddr * MAXMACADDRLEN);
1237 }
1238 
1239 void
1240 mac_addr_factory_fini(mac_impl_t *mip)
1241 {
1242 	if (mip->mi_factory_addr == NULL) {
1243 		ASSERT(mip->mi_factory_addr_num == 0);
1244 		return;
1245 	}
1246 
1247 	kmem_free(mip->mi_factory_addr, mip->mi_factory_addr_num *
1248 	    sizeof (mac_factory_addr_t));
1249 
1250 	mip->mi_factory_addr = NULL;
1251 	mip->mi_factory_addr_num = 0;
1252 }
1253 
1254 /*
1255  * Reserve a factory MAC address. If *slot is set to -1, the function
1256  * attempts to reserve any of the available factory MAC addresses and
1257  * returns the reserved slot id. If no slots are available, the function
1258  * returns ENOSPC. If *slot is not set to -1, the function reserves
1259  * the specified slot if it is available, or returns EBUSY is the slot
1260  * is already used. Returns ENOTSUP if the underlying MAC does not
1261  * support multiple factory addresses. If the slot number is not -1 but
1262  * is invalid, returns EINVAL.
1263  */
1264 int
1265 mac_addr_factory_reserve(mac_client_handle_t mch, int *slot)
1266 {
1267 	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
1268 	mac_impl_t *mip = mcip->mci_mip;
1269 	int i, ret = 0;
1270 
1271 	i_mac_perim_enter(mip);
1272 	/*
1273 	 * Protect against concurrent readers that may need a self-consistent
1274 	 * view of the factory addresses
1275 	 */
1276 	rw_enter(&mip->mi_rw_lock, RW_WRITER);
1277 
1278 	if (mip->mi_factory_addr_num == 0) {
1279 		ret = ENOTSUP;
1280 		goto bail;
1281 	}
1282 
1283 	if (*slot != -1) {
1284 		/* check the specified slot */
1285 		if (*slot < 1 || *slot > mip->mi_factory_addr_num) {
1286 			ret = EINVAL;
1287 			goto bail;
1288 		}
1289 		if (mip->mi_factory_addr[*slot-1].mfa_in_use) {
1290 			ret = EBUSY;
1291 			goto bail;
1292 		}
1293 	} else {
1294 		/* pick the next available slot */
1295 		for (i = 0; i < mip->mi_factory_addr_num; i++) {
1296 			if (!mip->mi_factory_addr[i].mfa_in_use)
1297 				break;
1298 		}
1299 
1300 		if (i == mip->mi_factory_addr_num) {
1301 			ret = ENOSPC;
1302 			goto bail;
1303 		}
1304 		*slot = i+1;
1305 	}
1306 
1307 	mip->mi_factory_addr[*slot-1].mfa_in_use = B_TRUE;
1308 	mip->mi_factory_addr[*slot-1].mfa_client = mcip;
1309 
1310 bail:
1311 	rw_exit(&mip->mi_rw_lock);
1312 	i_mac_perim_exit(mip);
1313 	return (ret);
1314 }
1315 
1316 /*
1317  * Release the specified factory MAC address slot.
1318  */
1319 void
1320 mac_addr_factory_release(mac_client_handle_t mch, uint_t slot)
1321 {
1322 	mac_client_impl_t *mcip = (mac_client_impl_t *)mch;
1323 	mac_impl_t *mip = mcip->mci_mip;
1324 
1325 	i_mac_perim_enter(mip);
1326 	/*
1327 	 * Protect against concurrent readers that may need a self-consistent
1328 	 * view of the factory addresses
1329 	 */
1330 	rw_enter(&mip->mi_rw_lock, RW_WRITER);
1331 
1332 	ASSERT(slot > 0 && slot <= mip->mi_factory_addr_num);
1333 	ASSERT(mip->mi_factory_addr[slot-1].mfa_in_use);
1334 
1335 	mip->mi_factory_addr[slot-1].mfa_in_use = B_FALSE;
1336 
1337 	rw_exit(&mip->mi_rw_lock);
1338 	i_mac_perim_exit(mip);
1339 }
1340 
1341 /*
1342  * Stores in mac_addr the value of the specified MAC address. Returns
1343  * 0 on success, or EINVAL if the slot number is not valid for the MAC.
1344  * The caller must provide a string of at least MAXNAMELEN bytes.
1345  */
1346 void
1347 mac_addr_factory_value(mac_handle_t mh, int slot, uchar_t *mac_addr,
1348     uint_t *addr_len, char *client_name, boolean_t *in_use_arg)
1349 {
1350 	mac_impl_t *mip = (mac_impl_t *)mh;
1351 	boolean_t in_use;
1352 
1353 	ASSERT(slot > 0 && slot <= mip->mi_factory_addr_num);
1354 
1355 	/*
1356 	 * Readers need to hold mi_rw_lock. Writers need to hold mac perimeter
1357 	 * and mi_rw_lock
1358 	 */
1359 	rw_enter(&mip->mi_rw_lock, RW_READER);
1360 	bcopy(mip->mi_factory_addr[slot-1].mfa_addr, mac_addr, MAXMACADDRLEN);
1361 	*addr_len = mip->mi_type->mt_addr_length;
1362 	in_use = mip->mi_factory_addr[slot-1].mfa_in_use;
1363 	if (in_use && client_name != NULL) {
1364 		bcopy(mip->mi_factory_addr[slot-1].mfa_client->mci_name,
1365 		    client_name, MAXNAMELEN);
1366 	}
1367 	if (in_use_arg != NULL)
1368 		*in_use_arg = in_use;
1369 	rw_exit(&mip->mi_rw_lock);
1370 }
1371 
1372 /*
1373  * Returns the number of factory MAC addresses (in addition to the
1374  * primary MAC address), 0 if the underlying MAC doesn't support
1375  * that feature.
1376  */
1377 uint_t
1378 mac_addr_factory_num(mac_handle_t mh)
1379 {
1380 	mac_impl_t *mip = (mac_impl_t *)mh;
1381 
1382 	return (mip->mi_factory_addr_num);
1383 }
1384 
1385 
1386 void
1387 mac_rx_group_unmark(mac_group_t *grp, uint_t flag)
1388 {
1389 	mac_ring_t	*ring;
1390 
1391 	for (ring = grp->mrg_rings; ring != NULL; ring = ring->mr_next)
1392 		ring->mr_flag &= ~flag;
1393 }
1394 
1395 /*
1396  * The following mac_hwrings_xxx() functions are private mac client functions
1397  * used by the aggr driver to access and control the underlying HW Rx group
1398  * and rings. In this case, the aggr driver has exclusive control of the
1399  * underlying HW Rx group/rings, it calls the following functions to
1400  * start/stop the HW Rx rings, disable/enable polling, add/remove mac'
1401  * addresses, or set up the Rx callback.
1402  */
1403 /* ARGSUSED */
1404 static void
1405 mac_hwrings_rx_process(void *arg, mac_resource_handle_t srs,
1406     mblk_t *mp_chain, boolean_t loopback)
1407 {
1408 	mac_soft_ring_set_t	*mac_srs = (mac_soft_ring_set_t *)srs;
1409 	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;
1410 	mac_direct_rx_t		proc;
1411 	void			*arg1;
1412 	mac_resource_handle_t	arg2;
1413 
1414 	proc = srs_rx->sr_func;
1415 	arg1 = srs_rx->sr_arg1;
1416 	arg2 = mac_srs->srs_mrh;
1417 
1418 	proc(arg1, arg2, mp_chain, NULL);
1419 }
1420 
1421 /*
1422  * This function is called to get the list of HW rings that are reserved by
1423  * an exclusive mac client.
1424  *
1425  * Return value: the number of HW rings.
1426  */
1427 int
1428 mac_hwrings_get(mac_client_handle_t mch, mac_group_handle_t *hwgh,
1429     mac_ring_handle_t *hwrh, mac_ring_type_t rtype)
1430 {
1431 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1432 	int			cnt = 0;
1433 
1434 	switch (rtype) {
1435 	case MAC_RING_TYPE_RX: {
1436 		flow_entry_t	*flent = mcip->mci_flent;
1437 		mac_group_t	*grp;
1438 		mac_ring_t	*ring;
1439 
1440 		grp = flent->fe_rx_ring_group;
1441 		/*
1442 		 * The mac client did not reserve any RX group, return directly.
1443 		 * This is probably because the underlying MAC does not support
1444 		 * any groups.
1445 		 */
1446 		*hwgh = NULL;
1447 		if (grp == NULL)
1448 			return (0);
1449 		/*
1450 		 * This group must be reserved by this mac client.
1451 		 */
1452 		ASSERT((grp->mrg_state == MAC_GROUP_STATE_RESERVED) &&
1453 		    (mch == (mac_client_handle_t)
1454 		    (MAC_RX_GROUP_ONLY_CLIENT(grp))));
1455 		for (ring = grp->mrg_rings;
1456 		    ring != NULL; ring = ring->mr_next, cnt++) {
1457 			ASSERT(cnt < MAX_RINGS_PER_GROUP);
1458 			hwrh[cnt] = (mac_ring_handle_t)ring;
1459 		}
1460 		*hwgh = (mac_group_handle_t)grp;
1461 		return (cnt);
1462 	}
1463 	case MAC_RING_TYPE_TX: {
1464 		mac_soft_ring_set_t	*tx_srs;
1465 		mac_srs_tx_t		*tx;
1466 
1467 		tx_srs = MCIP_TX_SRS(mcip);
1468 		tx = &tx_srs->srs_tx;
1469 		for (; cnt < tx->st_ring_count; cnt++)
1470 			hwrh[cnt] = tx->st_rings[cnt];
1471 		return (cnt);
1472 	}
1473 	default:
1474 		ASSERT(B_FALSE);
1475 		return (-1);
1476 	}
1477 }
1478 
1479 /*
1480  * Setup the RX callback of the mac client which exclusively controls HW ring.
1481  */
1482 void
1483 mac_hwring_setup(mac_ring_handle_t hwrh, mac_resource_handle_t prh)
1484 {
1485 	mac_ring_t		*hw_ring = (mac_ring_t *)hwrh;
1486 	mac_soft_ring_set_t	*mac_srs = hw_ring->mr_srs;
1487 
1488 	mac_srs->srs_mrh = prh;
1489 	mac_srs->srs_rx.sr_lower_proc = mac_hwrings_rx_process;
1490 }
1491 
1492 void
1493 mac_hwring_teardown(mac_ring_handle_t hwrh)
1494 {
1495 	mac_ring_t		*hw_ring = (mac_ring_t *)hwrh;
1496 	mac_soft_ring_set_t	*mac_srs = hw_ring->mr_srs;
1497 
1498 	mac_srs->srs_rx.sr_lower_proc = mac_rx_srs_process;
1499 	mac_srs->srs_mrh = NULL;
1500 }
1501 
1502 int
1503 mac_hwring_disable_intr(mac_ring_handle_t rh)
1504 {
1505 	mac_ring_t *rr_ring = (mac_ring_t *)rh;
1506 	mac_intr_t *intr = &rr_ring->mr_info.mri_intr;
1507 
1508 	return (intr->mi_disable(intr->mi_handle));
1509 }
1510 
1511 int
1512 mac_hwring_enable_intr(mac_ring_handle_t rh)
1513 {
1514 	mac_ring_t *rr_ring = (mac_ring_t *)rh;
1515 	mac_intr_t *intr = &rr_ring->mr_info.mri_intr;
1516 
1517 	return (intr->mi_enable(intr->mi_handle));
1518 }
1519 
1520 int
1521 mac_hwring_start(mac_ring_handle_t rh)
1522 {
1523 	mac_ring_t *rr_ring = (mac_ring_t *)rh;
1524 
1525 	MAC_RING_UNMARK(rr_ring, MR_QUIESCE);
1526 	return (0);
1527 }
1528 
1529 void
1530 mac_hwring_stop(mac_ring_handle_t rh)
1531 {
1532 	mac_ring_t *rr_ring = (mac_ring_t *)rh;
1533 
1534 	mac_rx_ring_quiesce(rr_ring, MR_QUIESCE);
1535 }
1536 
1537 mblk_t *
1538 mac_hwring_poll(mac_ring_handle_t rh, int bytes_to_pickup)
1539 {
1540 	mac_ring_t *rr_ring = (mac_ring_t *)rh;
1541 	mac_ring_info_t *info = &rr_ring->mr_info;
1542 
1543 	return (info->mri_poll(info->mri_driver, bytes_to_pickup));
1544 }
1545 
1546 /*
1547  * Send packets through the selected tx ring.
1548  */
1549 mblk_t *
1550 mac_hwring_tx(mac_ring_handle_t rh, mblk_t *mp)
1551 {
1552 	mac_ring_t *ring = (mac_ring_t *)rh;
1553 	mac_ring_info_t *info = &ring->mr_info;
1554 
1555 	ASSERT(ring->mr_type == MAC_RING_TYPE_TX);
1556 	ASSERT(ring->mr_state >= MR_INUSE);
1557 	ASSERT(info->mri_tx != NULL);
1558 
1559 	return (info->mri_tx(info->mri_driver, mp));
1560 }
1561 
1562 int
1563 mac_hwgroup_addmac(mac_group_handle_t gh, const uint8_t *addr)
1564 {
1565 	mac_group_t *group = (mac_group_t *)gh;
1566 
1567 	return (mac_group_addmac(group, addr));
1568 }
1569 
1570 int
1571 mac_hwgroup_remmac(mac_group_handle_t gh, const uint8_t *addr)
1572 {
1573 	mac_group_t *group = (mac_group_t *)gh;
1574 
1575 	return (mac_group_remmac(group, addr));
1576 }
1577 
1578 /*
1579  * Set the RX group to be shared/reserved. Note that the group must be
1580  * started/stopped outside of this function.
1581  */
1582 void
1583 mac_set_rx_group_state(mac_group_t *grp, mac_group_state_t state)
1584 {
1585 	/*
1586 	 * If there is no change in the group state, just return.
1587 	 */
1588 	if (grp->mrg_state == state)
1589 		return;
1590 
1591 	switch (state) {
1592 	case MAC_GROUP_STATE_RESERVED:
1593 		/*
1594 		 * Successfully reserved the group.
1595 		 *
1596 		 * Given that there is an exclusive client controlling this
1597 		 * group, we enable the group level polling when available,
1598 		 * so that SRSs get to turn on/off individual rings they's
1599 		 * assigned to.
1600 		 */
1601 		ASSERT(MAC_PERIM_HELD(grp->mrg_mh));
1602 
1603 		if (GROUP_INTR_DISABLE_FUNC(grp) != NULL)
1604 			GROUP_INTR_DISABLE_FUNC(grp)(GROUP_INTR_HANDLE(grp));
1605 
1606 		break;
1607 
1608 	case MAC_GROUP_STATE_SHARED:
1609 		/*
1610 		 * Set all rings of this group to software classified.
1611 		 * If the group has an overriding interrupt, then re-enable it.
1612 		 */
1613 		ASSERT(MAC_PERIM_HELD(grp->mrg_mh));
1614 
1615 		if (GROUP_INTR_ENABLE_FUNC(grp) != NULL)
1616 			GROUP_INTR_ENABLE_FUNC(grp)(GROUP_INTR_HANDLE(grp));
1617 
1618 		/* The ring is not available for reservations any more */
1619 		break;
1620 
1621 	case MAC_GROUP_STATE_REGISTERED:
1622 		/* Also callable from mac_register, perim is not held */
1623 		break;
1624 
1625 	default:
1626 		ASSERT(B_FALSE);
1627 		break;
1628 	}
1629 
1630 	grp->mrg_state = state;
1631 }
1632 
1633 /*
1634  * Quiesce future hardware classified packets for the specified Rx ring
1635  */
1636 static void
1637 mac_rx_ring_quiesce(mac_ring_t *rx_ring, uint_t ring_flag)
1638 {
1639 	ASSERT(rx_ring->mr_classify_type == MAC_HW_CLASSIFIER);
1640 	ASSERT(ring_flag == MR_CONDEMNED || ring_flag  == MR_QUIESCE);
1641 
1642 	mutex_enter(&rx_ring->mr_lock);
1643 	rx_ring->mr_flag |= ring_flag;
1644 	while (rx_ring->mr_refcnt != 0)
1645 		cv_wait(&rx_ring->mr_cv, &rx_ring->mr_lock);
1646 	mutex_exit(&rx_ring->mr_lock);
1647 }
1648 
1649 /*
1650  * Please see mac_tx for details about the per cpu locking scheme
1651  */
1652 static void
1653 mac_tx_lock_all(mac_client_impl_t *mcip)
1654 {
1655 	int	i;
1656 
1657 	for (i = 0; i <= mac_tx_percpu_cnt; i++)
1658 		mutex_enter(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
1659 }
1660 
1661 static void
1662 mac_tx_unlock_all(mac_client_impl_t *mcip)
1663 {
1664 	int	i;
1665 
1666 	for (i = mac_tx_percpu_cnt; i >= 0; i--)
1667 		mutex_exit(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
1668 }
1669 
1670 static void
1671 mac_tx_unlock_allbutzero(mac_client_impl_t *mcip)
1672 {
1673 	int	i;
1674 
1675 	for (i = mac_tx_percpu_cnt; i > 0; i--)
1676 		mutex_exit(&mcip->mci_tx_pcpu[i].pcpu_tx_lock);
1677 }
1678 
1679 static int
1680 mac_tx_sum_refcnt(mac_client_impl_t *mcip)
1681 {
1682 	int	i;
1683 	int	refcnt = 0;
1684 
1685 	for (i = 0; i <= mac_tx_percpu_cnt; i++)
1686 		refcnt += mcip->mci_tx_pcpu[i].pcpu_tx_refcnt;
1687 
1688 	return (refcnt);
1689 }
1690 
1691 /*
1692  * Stop future Tx packets coming down from the client in preparation for
1693  * quiescing the Tx side. This is needed for dynamic reclaim and reassignment
1694  * of rings between clients
1695  */
1696 void
1697 mac_tx_client_block(mac_client_impl_t *mcip)
1698 {
1699 	mac_tx_lock_all(mcip);
1700 	mcip->mci_tx_flag |= MCI_TX_QUIESCE;
1701 	while (mac_tx_sum_refcnt(mcip) != 0) {
1702 		mac_tx_unlock_allbutzero(mcip);
1703 		cv_wait(&mcip->mci_tx_cv, &mcip->mci_tx_pcpu[0].pcpu_tx_lock);
1704 		mutex_exit(&mcip->mci_tx_pcpu[0].pcpu_tx_lock);
1705 		mac_tx_lock_all(mcip);
1706 	}
1707 	mac_tx_unlock_all(mcip);
1708 }
1709 
1710 void
1711 mac_tx_client_unblock(mac_client_impl_t *mcip)
1712 {
1713 	mac_tx_lock_all(mcip);
1714 	mcip->mci_tx_flag &= ~MCI_TX_QUIESCE;
1715 	mac_tx_unlock_all(mcip);
1716 	/*
1717 	 * We may fail to disable flow control for the last MAC_NOTE_TX
1718 	 * notification because the MAC client is quiesced. Send the
1719 	 * notification again.
1720 	 */
1721 	i_mac_notify(mcip->mci_mip, MAC_NOTE_TX);
1722 }
1723 
1724 /*
1725  * Wait for an SRS to quiesce. The SRS worker will signal us when the
1726  * quiesce is done.
1727  */
1728 static void
1729 mac_srs_quiesce_wait(mac_soft_ring_set_t *srs, uint_t srs_flag)
1730 {
1731 	mutex_enter(&srs->srs_lock);
1732 	while (!(srs->srs_state & srs_flag))
1733 		cv_wait(&srs->srs_quiesce_done_cv, &srs->srs_lock);
1734 	mutex_exit(&srs->srs_lock);
1735 }
1736 
1737 /*
1738  * Quiescing an Rx SRS is achieved by the following sequence. The protocol
1739  * works bottom up by cutting off packet flow from the bottommost point in the
1740  * mac, then the SRS, and then the soft rings. There are 2 use cases of this
1741  * mechanism. One is a temporary quiesce of the SRS, such as say while changing
1742  * the Rx callbacks. Another use case is Rx SRS teardown. In the former case
1743  * the QUIESCE prefix/suffix is used and in the latter the CONDEMNED is used
1744  * for the SRS and MR flags. In the former case the threads pause waiting for
1745  * a restart, while in the latter case the threads exit. The Tx SRS teardown
1746  * is also mostly similar to the above.
1747  *
1748  * 1. Stop future hardware classified packets at the lowest level in the mac.
1749  *    Remove any hardware classification rule (CONDEMNED case) and mark the
1750  *    rings as CONDEMNED or QUIESCE as appropriate. This prevents the mr_refcnt
1751  *    from increasing. Upcalls from the driver that come through hardware
1752  *    classification will be dropped in mac_rx from now on. Then we wait for
1753  *    the mr_refcnt to drop to zero. When the mr_refcnt reaches zero we are
1754  *    sure there aren't any upcall threads from the driver through hardware
1755  *    classification. In the case of SRS teardown we also remove the
1756  *    classification rule in the driver.
1757  *
1758  * 2. Stop future software classified packets by marking the flow entry with
1759  *    FE_QUIESCE or FE_CONDEMNED as appropriate which prevents the refcnt from
1760  *    increasing. We also remove the flow entry from the table in the latter
1761  *    case. Then wait for the fe_refcnt to reach an appropriate quiescent value
1762  *    that indicates there aren't any active threads using that flow entry.
1763  *
1764  * 3. Quiesce the SRS and softrings by signaling the SRS. The SRS poll thread,
1765  *    SRS worker thread, and the soft ring threads are quiesced in sequence
1766  *    with the SRS worker thread serving as a master controller. This
1767  *    mechansim is explained in mac_srs_worker_quiesce().
1768  *
1769  * The restart mechanism to reactivate the SRS and softrings is explained
1770  * in mac_srs_worker_restart(). Here we just signal the SRS worker to start the
1771  * restart sequence.
1772  */
1773 void
1774 mac_rx_srs_quiesce(mac_soft_ring_set_t *srs, uint_t srs_quiesce_flag)
1775 {
1776 	flow_entry_t	*flent = srs->srs_flent;
1777 	uint_t	mr_flag, srs_done_flag;
1778 
1779 	ASSERT(MAC_PERIM_HELD((mac_handle_t)FLENT_TO_MIP(flent)));
1780 	ASSERT(!(srs->srs_type & SRST_TX));
1781 
1782 	if (srs_quiesce_flag == SRS_CONDEMNED) {
1783 		mr_flag = MR_CONDEMNED;
1784 		srs_done_flag = SRS_CONDEMNED_DONE;
1785 		if (srs->srs_type & SRST_CLIENT_POLL_ENABLED)
1786 			mac_srs_client_poll_disable(srs->srs_mcip, srs);
1787 	} else {
1788 		ASSERT(srs_quiesce_flag == SRS_QUIESCE);
1789 		mr_flag = MR_QUIESCE;
1790 		srs_done_flag = SRS_QUIESCE_DONE;
1791 		if (srs->srs_type & SRST_CLIENT_POLL_ENABLED)
1792 			mac_srs_client_poll_quiesce(srs->srs_mcip, srs);
1793 	}
1794 
1795 	if (srs->srs_ring != NULL) {
1796 		mac_rx_ring_quiesce(srs->srs_ring, mr_flag);
1797 	} else {
1798 		/*
1799 		 * SRS is driven by software classification. In case
1800 		 * of CONDEMNED, the top level teardown functions will
1801 		 * deal with flow removal.
1802 		 */
1803 		if (srs_quiesce_flag != SRS_CONDEMNED) {
1804 			FLOW_MARK(flent, FE_QUIESCE);
1805 			mac_flow_wait(flent, FLOW_DRIVER_UPCALL);
1806 		}
1807 	}
1808 
1809 	/*
1810 	 * Signal the SRS to quiesce itself, and then cv_wait for the
1811 	 * SRS quiesce to complete. The SRS worker thread will wake us
1812 	 * up when the quiesce is complete
1813 	 */
1814 	mac_srs_signal(srs, srs_quiesce_flag);
1815 	mac_srs_quiesce_wait(srs, srs_done_flag);
1816 }
1817 
1818 /*
1819  * Remove an SRS.
1820  */
1821 void
1822 mac_rx_srs_remove(mac_soft_ring_set_t *srs)
1823 {
1824 	flow_entry_t *flent = srs->srs_flent;
1825 	int i;
1826 
1827 	mac_rx_srs_quiesce(srs, SRS_CONDEMNED);
1828 	/*
1829 	 * Locate and remove our entry in the fe_rx_srs[] array, and
1830 	 * adjust the fe_rx_srs array entries and array count by
1831 	 * moving the last entry into the vacated spot.
1832 	 */
1833 	mutex_enter(&flent->fe_lock);
1834 	for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
1835 		if (flent->fe_rx_srs[i] == srs)
1836 			break;
1837 	}
1838 
1839 	ASSERT(i != 0 && i < flent->fe_rx_srs_cnt);
1840 	if (i != flent->fe_rx_srs_cnt - 1) {
1841 		flent->fe_rx_srs[i] =
1842 		    flent->fe_rx_srs[flent->fe_rx_srs_cnt - 1];
1843 		i = flent->fe_rx_srs_cnt - 1;
1844 	}
1845 
1846 	flent->fe_rx_srs[i] = NULL;
1847 	flent->fe_rx_srs_cnt--;
1848 	mutex_exit(&flent->fe_lock);
1849 
1850 	mac_srs_free(srs);
1851 }
1852 
1853 static void
1854 mac_srs_clear_flag(mac_soft_ring_set_t *srs, uint_t flag)
1855 {
1856 	mutex_enter(&srs->srs_lock);
1857 	srs->srs_state &= ~flag;
1858 	mutex_exit(&srs->srs_lock);
1859 }
1860 
1861 void
1862 mac_rx_srs_restart(mac_soft_ring_set_t *srs)
1863 {
1864 	flow_entry_t	*flent = srs->srs_flent;
1865 	mac_ring_t	*mr;
1866 
1867 	ASSERT(MAC_PERIM_HELD((mac_handle_t)FLENT_TO_MIP(flent)));
1868 	ASSERT((srs->srs_type & SRST_TX) == 0);
1869 
1870 	/*
1871 	 * This handles a change in the number of SRSs between the quiesce and
1872 	 * and restart operation of a flow.
1873 	 */
1874 	if (!SRS_QUIESCED(srs))
1875 		return;
1876 
1877 	/*
1878 	 * Signal the SRS to restart itself. Wait for the restart to complete
1879 	 * Note that we only restart the SRS if it is not marked as
1880 	 * permanently quiesced.
1881 	 */
1882 	if (!SRS_QUIESCED_PERMANENT(srs)) {
1883 		mac_srs_signal(srs, SRS_RESTART);
1884 		mac_srs_quiesce_wait(srs, SRS_RESTART_DONE);
1885 		mac_srs_clear_flag(srs, SRS_RESTART_DONE);
1886 
1887 		mac_srs_client_poll_restart(srs->srs_mcip, srs);
1888 	}
1889 
1890 	/* Finally clear the flags to let the packets in */
1891 	mr = srs->srs_ring;
1892 	if (mr != NULL) {
1893 		MAC_RING_UNMARK(mr, MR_QUIESCE);
1894 		/* In case the ring was stopped, safely restart it */
1895 		(void) mac_start_ring(mr);
1896 	} else {
1897 		FLOW_UNMARK(flent, FE_QUIESCE);
1898 	}
1899 }
1900 
1901 /*
1902  * Temporary quiesce of a flow and associated Rx SRS.
1903  * Please see block comment above mac_rx_classify_flow_rem.
1904  */
1905 /* ARGSUSED */
1906 int
1907 mac_rx_classify_flow_quiesce(flow_entry_t *flent, void *arg)
1908 {
1909 	int		i;
1910 
1911 	for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
1912 		mac_rx_srs_quiesce((mac_soft_ring_set_t *)flent->fe_rx_srs[i],
1913 		    SRS_QUIESCE);
1914 	}
1915 	return (0);
1916 }
1917 
1918 /*
1919  * Restart a flow and associated Rx SRS that has been quiesced temporarily
1920  * Please see block comment above mac_rx_classify_flow_rem
1921  */
1922 /* ARGSUSED */
1923 int
1924 mac_rx_classify_flow_restart(flow_entry_t *flent, void *arg)
1925 {
1926 	int		i;
1927 
1928 	for (i = 0; i < flent->fe_rx_srs_cnt; i++)
1929 		mac_rx_srs_restart((mac_soft_ring_set_t *)flent->fe_rx_srs[i]);
1930 
1931 	return (0);
1932 }
1933 
1934 void
1935 mac_srs_perm_quiesce(mac_client_handle_t mch, boolean_t on)
1936 {
1937 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1938 	flow_entry_t		*flent = mcip->mci_flent;
1939 	mac_impl_t		*mip = mcip->mci_mip;
1940 	mac_soft_ring_set_t	*mac_srs;
1941 	int			i;
1942 
1943 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1944 
1945 	if (flent == NULL)
1946 		return;
1947 
1948 	for (i = 0; i < flent->fe_rx_srs_cnt; i++) {
1949 		mac_srs = flent->fe_rx_srs[i];
1950 		mutex_enter(&mac_srs->srs_lock);
1951 		if (on)
1952 			mac_srs->srs_state |= SRS_QUIESCE_PERM;
1953 		else
1954 			mac_srs->srs_state &= ~SRS_QUIESCE_PERM;
1955 		mutex_exit(&mac_srs->srs_lock);
1956 	}
1957 }
1958 
1959 void
1960 mac_rx_client_quiesce(mac_client_handle_t mch)
1961 {
1962 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1963 	mac_impl_t		*mip = mcip->mci_mip;
1964 
1965 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1966 
1967 	if (MCIP_DATAPATH_SETUP(mcip)) {
1968 		(void) mac_rx_classify_flow_quiesce(mcip->mci_flent,
1969 		    NULL);
1970 		(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
1971 		    mac_rx_classify_flow_quiesce, NULL);
1972 	}
1973 }
1974 
1975 void
1976 mac_rx_client_restart(mac_client_handle_t mch)
1977 {
1978 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
1979 	mac_impl_t		*mip = mcip->mci_mip;
1980 
1981 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
1982 
1983 	if (MCIP_DATAPATH_SETUP(mcip)) {
1984 		(void) mac_rx_classify_flow_restart(mcip->mci_flent, NULL);
1985 		(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
1986 		    mac_rx_classify_flow_restart, NULL);
1987 	}
1988 }
1989 
1990 /*
1991  * This function only quiesces the Tx SRS and softring worker threads. Callers
1992  * need to make sure that there aren't any mac client threads doing current or
1993  * future transmits in the mac before calling this function.
1994  */
1995 void
1996 mac_tx_srs_quiesce(mac_soft_ring_set_t *srs, uint_t srs_quiesce_flag)
1997 {
1998 	mac_client_impl_t	*mcip = srs->srs_mcip;
1999 
2000 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2001 
2002 	ASSERT(srs->srs_type & SRST_TX);
2003 	ASSERT(srs_quiesce_flag == SRS_CONDEMNED ||
2004 	    srs_quiesce_flag == SRS_QUIESCE);
2005 
2006 	/*
2007 	 * Signal the SRS to quiesce itself, and then cv_wait for the
2008 	 * SRS quiesce to complete. The SRS worker thread will wake us
2009 	 * up when the quiesce is complete
2010 	 */
2011 	mac_srs_signal(srs, srs_quiesce_flag);
2012 	mac_srs_quiesce_wait(srs, srs_quiesce_flag == SRS_QUIESCE ?
2013 	    SRS_QUIESCE_DONE : SRS_CONDEMNED_DONE);
2014 }
2015 
2016 void
2017 mac_tx_srs_restart(mac_soft_ring_set_t *srs)
2018 {
2019 	/*
2020 	 * Resizing the fanout could result in creation of new SRSs.
2021 	 * They may not necessarily be in the quiesced state in which
2022 	 * case it need be restarted
2023 	 */
2024 	if (!SRS_QUIESCED(srs))
2025 		return;
2026 
2027 	mac_srs_signal(srs, SRS_RESTART);
2028 	mac_srs_quiesce_wait(srs, SRS_RESTART_DONE);
2029 	mac_srs_clear_flag(srs, SRS_RESTART_DONE);
2030 }
2031 
2032 /*
2033  * Temporary quiesce of a flow and associated Rx SRS.
2034  * Please see block comment above mac_rx_srs_quiesce
2035  */
2036 /* ARGSUSED */
2037 int
2038 mac_tx_flow_quiesce(flow_entry_t *flent, void *arg)
2039 {
2040 	/*
2041 	 * The fe_tx_srs is null for a subflow on an interface that is
2042 	 * not plumbed
2043 	 */
2044 	if (flent->fe_tx_srs != NULL)
2045 		mac_tx_srs_quiesce(flent->fe_tx_srs, SRS_QUIESCE);
2046 	return (0);
2047 }
2048 
2049 /* ARGSUSED */
2050 int
2051 mac_tx_flow_restart(flow_entry_t *flent, void *arg)
2052 {
2053 	/*
2054 	 * The fe_tx_srs is null for a subflow on an interface that is
2055 	 * not plumbed
2056 	 */
2057 	if (flent->fe_tx_srs != NULL)
2058 		mac_tx_srs_restart(flent->fe_tx_srs);
2059 	return (0);
2060 }
2061 
2062 void
2063 mac_tx_client_quiesce(mac_client_impl_t *mcip, uint_t srs_quiesce_flag)
2064 {
2065 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2066 
2067 	mac_tx_client_block(mcip);
2068 	if (MCIP_TX_SRS(mcip) != NULL) {
2069 		mac_tx_srs_quiesce(MCIP_TX_SRS(mcip), srs_quiesce_flag);
2070 		(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
2071 		    mac_tx_flow_quiesce, NULL);
2072 	}
2073 }
2074 
2075 void
2076 mac_tx_client_restart(mac_client_impl_t *mcip)
2077 {
2078 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2079 
2080 	mac_tx_client_unblock(mcip);
2081 	if (MCIP_TX_SRS(mcip) != NULL) {
2082 		mac_tx_srs_restart(MCIP_TX_SRS(mcip));
2083 		(void) mac_flow_walk_nolock(mcip->mci_subflow_tab,
2084 		    mac_tx_flow_restart, NULL);
2085 	}
2086 }
2087 
2088 void
2089 mac_tx_client_flush(mac_client_impl_t *mcip)
2090 {
2091 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
2092 
2093 	mac_tx_client_quiesce(mcip, SRS_QUIESCE);
2094 	mac_tx_client_restart(mcip);
2095 }
2096 
2097 void
2098 mac_client_quiesce(mac_client_impl_t *mcip)
2099 {
2100 	mac_rx_client_quiesce((mac_client_handle_t)mcip);
2101 	mac_tx_client_quiesce(mcip, SRS_QUIESCE);
2102 }
2103 
2104 void
2105 mac_client_restart(mac_client_impl_t *mcip)
2106 {
2107 	mac_rx_client_restart((mac_client_handle_t)mcip);
2108 	mac_tx_client_restart(mcip);
2109 }
2110 
2111 /*
2112  * Allocate a minor number.
2113  */
2114 minor_t
2115 mac_minor_hold(boolean_t sleep)
2116 {
2117 	minor_t	minor;
2118 
2119 	/*
2120 	 * Grab a value from the arena.
2121 	 */
2122 	atomic_add_32(&minor_count, 1);
2123 
2124 	if (sleep)
2125 		minor = (uint_t)id_alloc(minor_ids);
2126 	else
2127 		minor = (uint_t)id_alloc_nosleep(minor_ids);
2128 
2129 	if (minor == 0) {
2130 		atomic_add_32(&minor_count, -1);
2131 		return (0);
2132 	}
2133 
2134 	return (minor);
2135 }
2136 
2137 /*
2138  * Release a previously allocated minor number.
2139  */
2140 void
2141 mac_minor_rele(minor_t minor)
2142 {
2143 	/*
2144 	 * Return the value to the arena.
2145 	 */
2146 	id_free(minor_ids, minor);
2147 	atomic_add_32(&minor_count, -1);
2148 }
2149 
2150 uint32_t
2151 mac_no_notification(mac_handle_t mh)
2152 {
2153 	mac_impl_t *mip = (mac_impl_t *)mh;
2154 
2155 	return (((mip->mi_state_flags & MIS_LEGACY) != 0) ?
2156 	    mip->mi_capab_legacy.ml_unsup_note : 0);
2157 }
2158 
2159 /*
2160  * Prevent any new opens of this mac in preparation for unregister
2161  */
2162 int
2163 i_mac_disable(mac_impl_t *mip)
2164 {
2165 	mac_client_impl_t	*mcip;
2166 
2167 	rw_enter(&i_mac_impl_lock, RW_WRITER);
2168 	if (mip->mi_state_flags & MIS_DISABLED) {
2169 		/* Already disabled, return success */
2170 		rw_exit(&i_mac_impl_lock);
2171 		return (0);
2172 	}
2173 	/*
2174 	 * See if there are any other references to this mac_t (e.g., VLAN's).
2175 	 * If so return failure. If all the other checks below pass, then
2176 	 * set mi_disabled atomically under the i_mac_impl_lock to prevent
2177 	 * any new VLAN's from being created or new mac client opens of this
2178 	 * mac end point.
2179 	 */
2180 	if (mip->mi_ref > 0) {
2181 		rw_exit(&i_mac_impl_lock);
2182 		return (EBUSY);
2183 	}
2184 
2185 	/*
2186 	 * mac clients must delete all multicast groups they join before
2187 	 * closing. bcast groups are reference counted, the last client
2188 	 * to delete the group will wait till the group is physically
2189 	 * deleted. Since all clients have closed this mac end point
2190 	 * mi_bcast_ngrps must be zero at this point
2191 	 */
2192 	ASSERT(mip->mi_bcast_ngrps == 0);
2193 
2194 	/*
2195 	 * Don't let go of this if it has some flows.
2196 	 * All other code guarantees no flows are added to a disabled
2197 	 * mac, therefore it is sufficient to check for the flow table
2198 	 * only here.
2199 	 */
2200 	mcip = mac_primary_client_handle(mip);
2201 	if ((mcip != NULL) && mac_link_has_flows((mac_client_handle_t)mcip)) {
2202 		rw_exit(&i_mac_impl_lock);
2203 		return (ENOTEMPTY);
2204 	}
2205 
2206 	mip->mi_state_flags |= MIS_DISABLED;
2207 	rw_exit(&i_mac_impl_lock);
2208 	return (0);
2209 }
2210 
2211 int
2212 mac_disable_nowait(mac_handle_t mh)
2213 {
2214 	mac_impl_t	*mip = (mac_impl_t *)mh;
2215 	int err;
2216 
2217 	if ((err = i_mac_perim_enter_nowait(mip)) != 0)
2218 		return (err);
2219 	err = i_mac_disable(mip);
2220 	i_mac_perim_exit(mip);
2221 	return (err);
2222 }
2223 
2224 int
2225 mac_disable(mac_handle_t mh)
2226 {
2227 	mac_impl_t	*mip = (mac_impl_t *)mh;
2228 	int err;
2229 
2230 	i_mac_perim_enter(mip);
2231 	err = i_mac_disable(mip);
2232 	i_mac_perim_exit(mip);
2233 
2234 	/*
2235 	 * Clean up notification thread and wait for it to exit.
2236 	 */
2237 	if (err == 0)
2238 		i_mac_notify_exit(mip);
2239 
2240 	return (err);
2241 }
2242 
2243 /*
2244  * Called when the MAC instance has a non empty flow table, to de-multiplex
2245  * incoming packets to the right flow.
2246  * The MAC's rw lock is assumed held as a READER.
2247  */
2248 /* ARGSUSED */
2249 static mblk_t *
2250 mac_rx_classify(mac_impl_t *mip, mac_resource_handle_t mrh, mblk_t *mp)
2251 {
2252 	flow_entry_t	*flent = NULL;
2253 	uint_t		flags = FLOW_INBOUND;
2254 	int		err;
2255 
2256 	/*
2257 	 * If the mac is a port of an aggregation, pass FLOW_IGNORE_VLAN
2258 	 * to mac_flow_lookup() so that the VLAN packets can be successfully
2259 	 * passed to the non-VLAN aggregation flows.
2260 	 *
2261 	 * Note that there is possibly a race between this and
2262 	 * mac_unicast_remove/add() and VLAN packets could be incorrectly
2263 	 * classified to non-VLAN flows of non-aggregation mac clients. These
2264 	 * VLAN packets will be then filtered out by the mac module.
2265 	 */
2266 	if ((mip->mi_state_flags & MIS_EXCLUSIVE) != 0)
2267 		flags |= FLOW_IGNORE_VLAN;
2268 
2269 	err = mac_flow_lookup(mip->mi_flow_tab, mp, flags, &flent);
2270 	if (err != 0) {
2271 		/* no registered receive function */
2272 		return (mp);
2273 	} else {
2274 		mac_client_impl_t	*mcip;
2275 
2276 		/*
2277 		 * This flent might just be an additional one on the MAC client,
2278 		 * i.e. for classification purposes (different fdesc), however
2279 		 * the resources, SRS et. al., are in the mci_flent, so if
2280 		 * this isn't the mci_flent, we need to get it.
2281 		 */
2282 		if ((mcip = flent->fe_mcip) != NULL &&
2283 		    mcip->mci_flent != flent) {
2284 			FLOW_REFRELE(flent);
2285 			flent = mcip->mci_flent;
2286 			FLOW_TRY_REFHOLD(flent, err);
2287 			if (err != 0)
2288 				return (mp);
2289 		}
2290 		(flent->fe_cb_fn)(flent->fe_cb_arg1, flent->fe_cb_arg2, mp,
2291 		    B_FALSE);
2292 		FLOW_REFRELE(flent);
2293 	}
2294 	return (NULL);
2295 }
2296 
2297 mblk_t *
2298 mac_rx_flow(mac_handle_t mh, mac_resource_handle_t mrh, mblk_t *mp_chain)
2299 {
2300 	mac_impl_t	*mip = (mac_impl_t *)mh;
2301 	mblk_t		*bp, *bp1, **bpp, *list = NULL;
2302 
2303 	/*
2304 	 * We walk the chain and attempt to classify each packet.
2305 	 * The packets that couldn't be classified will be returned
2306 	 * back to the caller.
2307 	 */
2308 	bp = mp_chain;
2309 	bpp = &list;
2310 	while (bp != NULL) {
2311 		bp1 = bp;
2312 		bp = bp->b_next;
2313 		bp1->b_next = NULL;
2314 
2315 		if (mac_rx_classify(mip, mrh, bp1) != NULL) {
2316 			*bpp = bp1;
2317 			bpp = &bp1->b_next;
2318 		}
2319 	}
2320 	return (list);
2321 }
2322 
2323 static int
2324 mac_tx_flow_srs_wakeup(flow_entry_t *flent, void *arg)
2325 {
2326 	mac_ring_handle_t ring = arg;
2327 
2328 	if (flent->fe_tx_srs)
2329 		mac_tx_srs_wakeup(flent->fe_tx_srs, ring);
2330 	return (0);
2331 }
2332 
2333 void
2334 i_mac_tx_srs_notify(mac_impl_t *mip, mac_ring_handle_t ring)
2335 {
2336 	mac_client_impl_t	*cclient;
2337 	mac_soft_ring_set_t	*mac_srs;
2338 
2339 	/*
2340 	 * After grabbing the mi_rw_lock, the list of clients can't change.
2341 	 * If there are any clients mi_disabled must be B_FALSE and can't
2342 	 * get set since there are clients. If there aren't any clients we
2343 	 * don't do anything. In any case the mip has to be valid. The driver
2344 	 * must make sure that it goes single threaded (with respect to mac
2345 	 * calls) and wait for all pending mac calls to finish before calling
2346 	 * mac_unregister.
2347 	 */
2348 	rw_enter(&i_mac_impl_lock, RW_READER);
2349 	if (mip->mi_state_flags & MIS_DISABLED) {
2350 		rw_exit(&i_mac_impl_lock);
2351 		return;
2352 	}
2353 
2354 	/*
2355 	 * Get MAC tx srs from walking mac_client_handle list.
2356 	 */
2357 	rw_enter(&mip->mi_rw_lock, RW_READER);
2358 	for (cclient = mip->mi_clients_list; cclient != NULL;
2359 	    cclient = cclient->mci_client_next) {
2360 		if ((mac_srs = MCIP_TX_SRS(cclient)) != NULL)
2361 			mac_tx_srs_wakeup(mac_srs, ring);
2362 		(void) mac_flow_walk(cclient->mci_subflow_tab,
2363 		    mac_tx_flow_srs_wakeup, ring);
2364 	}
2365 	rw_exit(&mip->mi_rw_lock);
2366 	rw_exit(&i_mac_impl_lock);
2367 }
2368 
2369 /* ARGSUSED */
2370 void
2371 mac_multicast_refresh(mac_handle_t mh, mac_multicst_t refresh, void *arg,
2372     boolean_t add)
2373 {
2374 	mac_impl_t *mip = (mac_impl_t *)mh;
2375 
2376 	i_mac_perim_enter((mac_impl_t *)mh);
2377 	/*
2378 	 * If no specific refresh function was given then default to the
2379 	 * driver's m_multicst entry point.
2380 	 */
2381 	if (refresh == NULL) {
2382 		refresh = mip->mi_multicst;
2383 		arg = mip->mi_driver;
2384 	}
2385 
2386 	mac_bcast_refresh(mip, refresh, arg, add);
2387 	i_mac_perim_exit((mac_impl_t *)mh);
2388 }
2389 
2390 void
2391 mac_promisc_refresh(mac_handle_t mh, mac_setpromisc_t refresh, void *arg)
2392 {
2393 	mac_impl_t	*mip = (mac_impl_t *)mh;
2394 
2395 	/*
2396 	 * If no specific refresh function was given then default to the
2397 	 * driver's m_promisc entry point.
2398 	 */
2399 	if (refresh == NULL) {
2400 		refresh = mip->mi_setpromisc;
2401 		arg = mip->mi_driver;
2402 	}
2403 	ASSERT(refresh != NULL);
2404 
2405 	/*
2406 	 * Call the refresh function with the current promiscuity.
2407 	 */
2408 	refresh(arg, (mip->mi_devpromisc != 0));
2409 }
2410 
2411 /*
2412  * The mac client requests that the mac not to change its margin size to
2413  * be less than the specified value.  If "current" is B_TRUE, then the client
2414  * requests the mac not to change its margin size to be smaller than the
2415  * current size. Further, return the current margin size value in this case.
2416  *
2417  * We keep every requested size in an ordered list from largest to smallest.
2418  */
2419 int
2420 mac_margin_add(mac_handle_t mh, uint32_t *marginp, boolean_t current)
2421 {
2422 	mac_impl_t		*mip = (mac_impl_t *)mh;
2423 	mac_margin_req_t	**pp, *p;
2424 	int			err = 0;
2425 
2426 	rw_enter(&(mip->mi_rw_lock), RW_WRITER);
2427 	if (current)
2428 		*marginp = mip->mi_margin;
2429 
2430 	/*
2431 	 * If the current margin value cannot satisfy the margin requested,
2432 	 * return ENOTSUP directly.
2433 	 */
2434 	if (*marginp > mip->mi_margin) {
2435 		err = ENOTSUP;
2436 		goto done;
2437 	}
2438 
2439 	/*
2440 	 * Check whether the given margin is already in the list. If so,
2441 	 * bump the reference count.
2442 	 */
2443 	for (pp = &mip->mi_mmrp; (p = *pp) != NULL; pp = &p->mmr_nextp) {
2444 		if (p->mmr_margin == *marginp) {
2445 			/*
2446 			 * The margin requested is already in the list,
2447 			 * so just bump the reference count.
2448 			 */
2449 			p->mmr_ref++;
2450 			goto done;
2451 		}
2452 		if (p->mmr_margin < *marginp)
2453 			break;
2454 	}
2455 
2456 
2457 	p = kmem_zalloc(sizeof (mac_margin_req_t), KM_SLEEP);
2458 	p->mmr_margin = *marginp;
2459 	p->mmr_ref++;
2460 	p->mmr_nextp = *pp;
2461 	*pp = p;
2462 
2463 done:
2464 	rw_exit(&(mip->mi_rw_lock));
2465 	return (err);
2466 }
2467 
2468 /*
2469  * The mac client requests to cancel its previous mac_margin_add() request.
2470  * We remove the requested margin size from the list.
2471  */
2472 int
2473 mac_margin_remove(mac_handle_t mh, uint32_t margin)
2474 {
2475 	mac_impl_t		*mip = (mac_impl_t *)mh;
2476 	mac_margin_req_t	**pp, *p;
2477 	int			err = 0;
2478 
2479 	rw_enter(&(mip->mi_rw_lock), RW_WRITER);
2480 	/*
2481 	 * Find the entry in the list for the given margin.
2482 	 */
2483 	for (pp = &(mip->mi_mmrp); (p = *pp) != NULL; pp = &(p->mmr_nextp)) {
2484 		if (p->mmr_margin == margin) {
2485 			if (--p->mmr_ref == 0)
2486 				break;
2487 
2488 			/*
2489 			 * There is still a reference to this address so
2490 			 * there's nothing more to do.
2491 			 */
2492 			goto done;
2493 		}
2494 	}
2495 
2496 	/*
2497 	 * We did not find an entry for the given margin.
2498 	 */
2499 	if (p == NULL) {
2500 		err = ENOENT;
2501 		goto done;
2502 	}
2503 
2504 	ASSERT(p->mmr_ref == 0);
2505 
2506 	/*
2507 	 * Remove it from the list.
2508 	 */
2509 	*pp = p->mmr_nextp;
2510 	kmem_free(p, sizeof (mac_margin_req_t));
2511 done:
2512 	rw_exit(&(mip->mi_rw_lock));
2513 	return (err);
2514 }
2515 
2516 boolean_t
2517 mac_margin_update(mac_handle_t mh, uint32_t margin)
2518 {
2519 	mac_impl_t	*mip = (mac_impl_t *)mh;
2520 	uint32_t	margin_needed = 0;
2521 
2522 	rw_enter(&(mip->mi_rw_lock), RW_WRITER);
2523 
2524 	if (mip->mi_mmrp != NULL)
2525 		margin_needed = mip->mi_mmrp->mmr_margin;
2526 
2527 	if (margin_needed <= margin)
2528 		mip->mi_margin = margin;
2529 
2530 	rw_exit(&(mip->mi_rw_lock));
2531 
2532 	if (margin_needed <= margin)
2533 		i_mac_notify(mip, MAC_NOTE_MARGIN);
2534 
2535 	return (margin_needed <= margin);
2536 }
2537 
2538 /*
2539  * MAC Type Plugin functions.
2540  */
2541 
2542 mactype_t *
2543 mactype_getplugin(const char *pname)
2544 {
2545 	mactype_t	*mtype = NULL;
2546 	boolean_t	tried_modload = B_FALSE;
2547 
2548 	mutex_enter(&i_mactype_lock);
2549 
2550 find_registered_mactype:
2551 	if (mod_hash_find(i_mactype_hash, (mod_hash_key_t)pname,
2552 	    (mod_hash_val_t *)&mtype) != 0) {
2553 		if (!tried_modload) {
2554 			/*
2555 			 * If the plugin has not yet been loaded, then
2556 			 * attempt to load it now.  If modload() succeeds,
2557 			 * the plugin should have registered using
2558 			 * mactype_register(), in which case we can go back
2559 			 * and attempt to find it again.
2560 			 */
2561 			if (modload(MACTYPE_KMODDIR, (char *)pname) != -1) {
2562 				tried_modload = B_TRUE;
2563 				goto find_registered_mactype;
2564 			}
2565 		}
2566 	} else {
2567 		/*
2568 		 * Note that there's no danger that the plugin we've loaded
2569 		 * could be unloaded between the modload() step and the
2570 		 * reference count bump here, as we're holding
2571 		 * i_mactype_lock, which mactype_unregister() also holds.
2572 		 */
2573 		atomic_inc_32(&mtype->mt_ref);
2574 	}
2575 
2576 	mutex_exit(&i_mactype_lock);
2577 	return (mtype);
2578 }
2579 
2580 mactype_register_t *
2581 mactype_alloc(uint_t mactype_version)
2582 {
2583 	mactype_register_t *mtrp;
2584 
2585 	/*
2586 	 * Make sure there isn't a version mismatch between the plugin and
2587 	 * the framework.  In the future, if multiple versions are
2588 	 * supported, this check could become more sophisticated.
2589 	 */
2590 	if (mactype_version != MACTYPE_VERSION)
2591 		return (NULL);
2592 
2593 	mtrp = kmem_zalloc(sizeof (mactype_register_t), KM_SLEEP);
2594 	mtrp->mtr_version = mactype_version;
2595 	return (mtrp);
2596 }
2597 
2598 void
2599 mactype_free(mactype_register_t *mtrp)
2600 {
2601 	kmem_free(mtrp, sizeof (mactype_register_t));
2602 }
2603 
2604 int
2605 mactype_register(mactype_register_t *mtrp)
2606 {
2607 	mactype_t	*mtp;
2608 	mactype_ops_t	*ops = mtrp->mtr_ops;
2609 
2610 	/* Do some sanity checking before we register this MAC type. */
2611 	if (mtrp->mtr_ident == NULL || ops == NULL)
2612 		return (EINVAL);
2613 
2614 	/*
2615 	 * Verify that all mandatory callbacks are set in the ops
2616 	 * vector.
2617 	 */
2618 	if (ops->mtops_unicst_verify == NULL ||
2619 	    ops->mtops_multicst_verify == NULL ||
2620 	    ops->mtops_sap_verify == NULL ||
2621 	    ops->mtops_header == NULL ||
2622 	    ops->mtops_header_info == NULL) {
2623 		return (EINVAL);
2624 	}
2625 
2626 	mtp = kmem_zalloc(sizeof (*mtp), KM_SLEEP);
2627 	mtp->mt_ident = mtrp->mtr_ident;
2628 	mtp->mt_ops = *ops;
2629 	mtp->mt_type = mtrp->mtr_mactype;
2630 	mtp->mt_nativetype = mtrp->mtr_nativetype;
2631 	mtp->mt_addr_length = mtrp->mtr_addrlen;
2632 	if (mtrp->mtr_brdcst_addr != NULL) {
2633 		mtp->mt_brdcst_addr = kmem_alloc(mtrp->mtr_addrlen, KM_SLEEP);
2634 		bcopy(mtrp->mtr_brdcst_addr, mtp->mt_brdcst_addr,
2635 		    mtrp->mtr_addrlen);
2636 	}
2637 
2638 	mtp->mt_stats = mtrp->mtr_stats;
2639 	mtp->mt_statcount = mtrp->mtr_statcount;
2640 
2641 	mtp->mt_mapping = mtrp->mtr_mapping;
2642 	mtp->mt_mappingcount = mtrp->mtr_mappingcount;
2643 
2644 	if (mod_hash_insert(i_mactype_hash,
2645 	    (mod_hash_key_t)mtp->mt_ident, (mod_hash_val_t)mtp) != 0) {
2646 		kmem_free(mtp->mt_brdcst_addr, mtp->mt_addr_length);
2647 		kmem_free(mtp, sizeof (*mtp));
2648 		return (EEXIST);
2649 	}
2650 	return (0);
2651 }
2652 
2653 int
2654 mactype_unregister(const char *ident)
2655 {
2656 	mactype_t	*mtp;
2657 	mod_hash_val_t	val;
2658 	int 		err;
2659 
2660 	/*
2661 	 * Let's not allow MAC drivers to use this plugin while we're
2662 	 * trying to unregister it.  Holding i_mactype_lock also prevents a
2663 	 * plugin from unregistering while a MAC driver is attempting to
2664 	 * hold a reference to it in i_mactype_getplugin().
2665 	 */
2666 	mutex_enter(&i_mactype_lock);
2667 
2668 	if ((err = mod_hash_find(i_mactype_hash, (mod_hash_key_t)ident,
2669 	    (mod_hash_val_t *)&mtp)) != 0) {
2670 		/* A plugin is trying to unregister, but it never registered. */
2671 		err = ENXIO;
2672 		goto done;
2673 	}
2674 
2675 	if (mtp->mt_ref != 0) {
2676 		err = EBUSY;
2677 		goto done;
2678 	}
2679 
2680 	err = mod_hash_remove(i_mactype_hash, (mod_hash_key_t)ident, &val);
2681 	ASSERT(err == 0);
2682 	if (err != 0) {
2683 		/* This should never happen, thus the ASSERT() above. */
2684 		err = EINVAL;
2685 		goto done;
2686 	}
2687 	ASSERT(mtp == (mactype_t *)val);
2688 
2689 	kmem_free(mtp->mt_brdcst_addr, mtp->mt_addr_length);
2690 	kmem_free(mtp, sizeof (mactype_t));
2691 done:
2692 	mutex_exit(&i_mactype_lock);
2693 	return (err);
2694 }
2695 
2696 /*
2697  * Returns TRUE when the specified property is intended for the MAC framework,
2698  * as opposed to driver defined properties.
2699  */
2700 static boolean_t
2701 mac_is_macprop(mac_prop_t *macprop)
2702 {
2703 	switch (macprop->mp_id) {
2704 	case MAC_PROP_MAXBW:
2705 	case MAC_PROP_PRIO:
2706 	case MAC_PROP_BIND_CPU:
2707 		return (B_TRUE);
2708 	default:
2709 		return (B_FALSE);
2710 	}
2711 }
2712 
2713 /*
2714  * mac_set_prop() sets mac or hardware driver properties:
2715  * 	mac properties include maxbw, priority, and cpu binding list. Driver
2716  *	properties are private properties to the hardware, such as mtu, speed
2717  *	etc.
2718  * If the property is a driver property, mac_set_prop() calls driver's callback
2719  * function to set it.
2720  * If the property is a mac property, mac_set_prop() invokes mac_set_resources()
2721  * which will cache the property value in mac_impl_t and may call
2722  * mac_client_set_resource() to update property value of the primary mac client,
2723  * if it exists.
2724  */
2725 int
2726 mac_set_prop(mac_handle_t mh, mac_prop_t *macprop, void *val, uint_t valsize)
2727 {
2728 	int err = ENOTSUP;
2729 	mac_impl_t *mip = (mac_impl_t *)mh;
2730 
2731 	ASSERT(MAC_PERIM_HELD(mh));
2732 
2733 	/* If it is mac property, call mac_set_resources() */
2734 	if (mac_is_macprop(macprop)) {
2735 		mac_resource_props_t mrp;
2736 
2737 		if (valsize < sizeof (mac_resource_props_t))
2738 			return (EINVAL);
2739 		bzero(&mrp, sizeof (mac_resource_props_t));
2740 		bcopy(val, &mrp, sizeof (mrp));
2741 		return (mac_set_resources(mh, &mrp));
2742 	}
2743 	switch (macprop->mp_id) {
2744 	case MAC_PROP_MTU: {
2745 		uint32_t mtu;
2746 
2747 		if (valsize < sizeof (mtu))
2748 			return (EINVAL);
2749 		bcopy(val, &mtu, sizeof (mtu));
2750 		err = mac_set_mtu(mh, mtu, NULL);
2751 		break;
2752 	}
2753 	default:
2754 		/* For other driver properties, call driver's callback */
2755 		if (mip->mi_callbacks->mc_callbacks & MC_SETPROP) {
2756 			err = mip->mi_callbacks->mc_setprop(mip->mi_driver,
2757 			    macprop->mp_name, macprop->mp_id, valsize, val);
2758 		}
2759 	}
2760 	return (err);
2761 }
2762 
2763 /*
2764  * mac_get_prop() gets mac or hardware driver properties.
2765  *
2766  * If the property is a driver property, mac_get_prop() calls driver's callback
2767  * function to get it.
2768  * If the property is a mac property, mac_get_prop() invokes mac_get_resources()
2769  * which returns the cached value in mac_impl_t.
2770  */
2771 int
2772 mac_get_prop(mac_handle_t mh, mac_prop_t *macprop, void *val, uint_t valsize,
2773     uint_t *perm)
2774 {
2775 	int err = ENOTSUP;
2776 	mac_impl_t *mip = (mac_impl_t *)mh;
2777 	link_state_t link_state;
2778 	boolean_t is_getprop, is_setprop;
2779 
2780 	is_getprop = (mip->mi_callbacks->mc_callbacks & MC_GETPROP);
2781 	is_setprop = (mip->mi_callbacks->mc_callbacks & MC_SETPROP);
2782 
2783 	/* If mac property, read from cache */
2784 	if (mac_is_macprop(macprop)) {
2785 		mac_resource_props_t mrp;
2786 
2787 		if (valsize < sizeof (mac_resource_props_t))
2788 			return (EINVAL);
2789 		bzero(&mrp, sizeof (mac_resource_props_t));
2790 		mac_get_resources(mh, &mrp);
2791 		bcopy(&mrp, val, sizeof (mac_resource_props_t));
2792 		return (0);
2793 	}
2794 
2795 	switch (macprop->mp_id) {
2796 	case MAC_PROP_MTU: {
2797 		uint32_t sdu;
2798 		mac_propval_range_t range;
2799 
2800 		if ((macprop->mp_flags & MAC_PROP_POSSIBLE) != 0) {
2801 			if (valsize < sizeof (mac_propval_range_t))
2802 				return (EINVAL);
2803 			if (is_getprop) {
2804 				err = mip->mi_callbacks->mc_getprop(mip->
2805 				    mi_driver, macprop->mp_name, macprop->mp_id,
2806 				    macprop->mp_flags, valsize, val, perm);
2807 			}
2808 			/*
2809 			 * If the driver doesn't have *_m_getprop defined or
2810 			 * if the driver doesn't support setting MTU then
2811 			 * return the CURRENT value as POSSIBLE value.
2812 			 */
2813 			if (!is_getprop || err == ENOTSUP) {
2814 				mac_sdu_get(mh, NULL, &sdu);
2815 				range.mpr_count = 1;
2816 				range.mpr_type = MAC_PROPVAL_UINT32;
2817 				range.range_uint32[0].mpur_min =
2818 				    range.range_uint32[0].mpur_max = sdu;
2819 				bcopy(&range, val, sizeof (range));
2820 				err = 0;
2821 			}
2822 			return (err);
2823 		}
2824 		if (valsize < sizeof (sdu))
2825 			return (EINVAL);
2826 		if ((macprop->mp_flags & MAC_PROP_DEFAULT) == 0) {
2827 			mac_sdu_get(mh, NULL, &sdu);
2828 			bcopy(&sdu, val, sizeof (sdu));
2829 			if (is_setprop && (mip->mi_callbacks->mc_setprop(mip->
2830 			    mi_driver, macprop->mp_name, macprop->mp_id,
2831 			    valsize, val) == 0)) {
2832 				*perm = MAC_PROP_PERM_RW;
2833 			} else {
2834 				*perm = MAC_PROP_PERM_READ;
2835 			}
2836 			return (0);
2837 		} else {
2838 			if (mip->mi_info.mi_media == DL_ETHER) {
2839 				sdu = ETHERMTU;
2840 				bcopy(&sdu, val, sizeof (sdu));
2841 
2842 				return (0);
2843 			}
2844 			/*
2845 			 * ask driver for its default.
2846 			 */
2847 			break;
2848 		}
2849 	}
2850 	case MAC_PROP_STATUS:
2851 		if (valsize < sizeof (link_state))
2852 			return (EINVAL);
2853 		*perm = MAC_PROP_PERM_READ;
2854 		link_state = mac_link_get(mh);
2855 		bcopy(&link_state, val, sizeof (link_state));
2856 		return (0);
2857 	default:
2858 		break;
2859 
2860 	}
2861 	/* If driver property, request from driver */
2862 	if (is_getprop) {
2863 		err = mip->mi_callbacks->mc_getprop(mip->mi_driver,
2864 		    macprop->mp_name, macprop->mp_id, macprop->mp_flags,
2865 		    valsize, val, perm);
2866 	}
2867 	return (err);
2868 }
2869 
2870 int
2871 mac_fastpath_disable(mac_handle_t mh)
2872 {
2873 	mac_impl_t	*mip = (mac_impl_t *)mh;
2874 
2875 	if ((mip->mi_state_flags & MIS_LEGACY) == 0)
2876 		return (0);
2877 
2878 	return (mip->mi_capab_legacy.ml_fastpath_disable(mip->mi_driver));
2879 }
2880 
2881 void
2882 mac_fastpath_enable(mac_handle_t mh)
2883 {
2884 	mac_impl_t	*mip = (mac_impl_t *)mh;
2885 
2886 	if ((mip->mi_state_flags & MIS_LEGACY) == 0)
2887 		return;
2888 
2889 	mip->mi_capab_legacy.ml_fastpath_enable(mip->mi_driver);
2890 }
2891 
2892 void
2893 mac_register_priv_prop(mac_impl_t *mip, mac_priv_prop_t *mpp, uint_t nprop)
2894 {
2895 	mac_priv_prop_t *mpriv;
2896 
2897 	if (mpp == NULL)
2898 		return;
2899 
2900 	mpriv = kmem_zalloc(nprop * sizeof (*mpriv), KM_SLEEP);
2901 	(void) memcpy(mpriv, mpp, nprop * sizeof (*mpriv));
2902 	mip->mi_priv_prop = mpriv;
2903 	mip->mi_priv_prop_count = nprop;
2904 }
2905 
2906 void
2907 mac_unregister_priv_prop(mac_impl_t *mip)
2908 {
2909 	mac_priv_prop_t	*mpriv;
2910 
2911 	mpriv = mip->mi_priv_prop;
2912 	if (mpriv != NULL) {
2913 		kmem_free(mpriv, mip->mi_priv_prop_count * sizeof (*mpriv));
2914 		mip->mi_priv_prop = NULL;
2915 	}
2916 	mip->mi_priv_prop_count = 0;
2917 }
2918 
2919 /*
2920  * mac_ring_t 'mr' macros. Some rogue drivers may access ring structure
2921  * (by invoking mac_rx()) even after processing mac_stop_ring(). In such
2922  * cases if MAC free's the ring structure after mac_stop_ring(), any
2923  * illegal access to the ring structure coming from the driver will panic
2924  * the system. In order to protect the system from such inadverent access,
2925  * we maintain a cache of rings in the mac_impl_t after they get free'd up.
2926  * When packets are received on free'd up rings, MAC (through the generation
2927  * count mechanism) will drop such packets.
2928  */
2929 static mac_ring_t *
2930 mac_ring_alloc(mac_impl_t *mip, mac_capab_rings_t *cap_rings)
2931 {
2932 	mac_ring_t *ring;
2933 
2934 	if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
2935 		mutex_enter(&mip->mi_ring_lock);
2936 		if (mip->mi_ring_freelist != NULL) {
2937 			ring = mip->mi_ring_freelist;
2938 			mip->mi_ring_freelist = ring->mr_next;
2939 			bzero(ring, sizeof (mac_ring_t));
2940 		} else {
2941 			ring = kmem_cache_alloc(mac_ring_cache, KM_SLEEP);
2942 		}
2943 		mutex_exit(&mip->mi_ring_lock);
2944 	} else {
2945 		ring = kmem_zalloc(sizeof (mac_ring_t), KM_SLEEP);
2946 	}
2947 	ASSERT((ring != NULL) && (ring->mr_state == MR_FREE));
2948 	return (ring);
2949 }
2950 
2951 static void
2952 mac_ring_free(mac_impl_t *mip, mac_ring_t *ring)
2953 {
2954 	if (ring->mr_type == MAC_RING_TYPE_RX) {
2955 		mutex_enter(&mip->mi_ring_lock);
2956 		ring->mr_state = MR_FREE;
2957 		ring->mr_flag = 0;
2958 		ring->mr_next = mip->mi_ring_freelist;
2959 		mip->mi_ring_freelist = ring;
2960 		mutex_exit(&mip->mi_ring_lock);
2961 	} else {
2962 		kmem_free(ring, sizeof (mac_ring_t));
2963 	}
2964 }
2965 
2966 static void
2967 mac_ring_freeall(mac_impl_t *mip)
2968 {
2969 	mac_ring_t *ring_next;
2970 	mutex_enter(&mip->mi_ring_lock);
2971 	mac_ring_t *ring = mip->mi_ring_freelist;
2972 	while (ring != NULL) {
2973 		ring_next = ring->mr_next;
2974 		kmem_cache_free(mac_ring_cache, ring);
2975 		ring = ring_next;
2976 	}
2977 	mip->mi_ring_freelist = NULL;
2978 	mutex_exit(&mip->mi_ring_lock);
2979 }
2980 
2981 int
2982 mac_start_ring(mac_ring_t *ring)
2983 {
2984 	int rv = 0;
2985 
2986 	if (ring->mr_start != NULL)
2987 		rv = ring->mr_start(ring->mr_driver, ring->mr_gen_num);
2988 
2989 	return (rv);
2990 }
2991 
2992 void
2993 mac_stop_ring(mac_ring_t *ring)
2994 {
2995 	if (ring->mr_stop != NULL)
2996 		ring->mr_stop(ring->mr_driver);
2997 
2998 	/*
2999 	 * Increment the ring generation number for this ring.
3000 	 */
3001 	ring->mr_gen_num++;
3002 }
3003 
3004 int
3005 mac_start_group(mac_group_t *group)
3006 {
3007 	int rv = 0;
3008 
3009 	if (group->mrg_start != NULL)
3010 		rv = group->mrg_start(group->mrg_driver);
3011 
3012 	return (rv);
3013 }
3014 
3015 void
3016 mac_stop_group(mac_group_t *group)
3017 {
3018 	if (group->mrg_stop != NULL)
3019 		group->mrg_stop(group->mrg_driver);
3020 }
3021 
3022 /*
3023  * Called from mac_start() on the default Rx group. Broadcast and multicast
3024  * packets are received only on the default group. Hence the default group
3025  * needs to be up even if the primary client is not up, for the other groups
3026  * to be functional. We do this by calling this function at mac_start time
3027  * itself. However the broadcast packets that are received can't make their
3028  * way beyond mac_rx until a mac client creates a broadcast flow.
3029  */
3030 static int
3031 mac_start_group_and_rings(mac_group_t *group)
3032 {
3033 	mac_ring_t	*ring;
3034 	int		rv = 0;
3035 
3036 	ASSERT(group->mrg_state == MAC_GROUP_STATE_REGISTERED);
3037 	if ((rv = mac_start_group(group)) != 0)
3038 		return (rv);
3039 
3040 	for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
3041 		ASSERT(ring->mr_state == MR_FREE);
3042 		if ((rv = mac_start_ring(ring)) != 0)
3043 			goto error;
3044 		ring->mr_state = MR_INUSE;
3045 		ring->mr_classify_type = MAC_SW_CLASSIFIER;
3046 	}
3047 	return (0);
3048 
3049 error:
3050 	mac_stop_group_and_rings(group);
3051 	return (rv);
3052 }
3053 
3054 /* Called from mac_stop on the default Rx group */
3055 static void
3056 mac_stop_group_and_rings(mac_group_t *group)
3057 {
3058 	mac_ring_t	*ring;
3059 
3060 	for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
3061 		if (ring->mr_state != MR_FREE) {
3062 			mac_stop_ring(ring);
3063 			ring->mr_state = MR_FREE;
3064 			ring->mr_flag = 0;
3065 			ring->mr_classify_type = MAC_NO_CLASSIFIER;
3066 		}
3067 	}
3068 	mac_stop_group(group);
3069 }
3070 
3071 
3072 static mac_ring_t *
3073 mac_init_ring(mac_impl_t *mip, mac_group_t *group, int index,
3074     mac_capab_rings_t *cap_rings)
3075 {
3076 	mac_ring_t *ring;
3077 	mac_ring_info_t ring_info;
3078 
3079 	ring = mac_ring_alloc(mip, cap_rings);
3080 
3081 	/* Prepare basic information of ring */
3082 	ring->mr_index = index;
3083 	ring->mr_type = group->mrg_type;
3084 	ring->mr_gh = (mac_group_handle_t)group;
3085 
3086 	/* Insert the new ring to the list. */
3087 	ring->mr_next = group->mrg_rings;
3088 	group->mrg_rings = ring;
3089 
3090 	/* Zero to reuse the info data structure */
3091 	bzero(&ring_info, sizeof (ring_info));
3092 
3093 	/* Query ring information from driver */
3094 	cap_rings->mr_rget(mip->mi_driver, group->mrg_type, group->mrg_index,
3095 	    index, &ring_info, (mac_ring_handle_t)ring);
3096 
3097 	ring->mr_info = ring_info;
3098 
3099 	/* Update ring's status */
3100 	ring->mr_state = MR_FREE;
3101 	ring->mr_flag = 0;
3102 
3103 	/* Update the ring count of the group */
3104 	group->mrg_cur_count++;
3105 	return (ring);
3106 }
3107 
3108 /*
3109  * Rings are chained together for easy regrouping.
3110  */
3111 static void
3112 mac_init_group(mac_impl_t *mip, mac_group_t *group, int size,
3113     mac_capab_rings_t *cap_rings)
3114 {
3115 	int index;
3116 
3117 	/*
3118 	 * Initialize all ring members of this group. Size of zero will not
3119 	 * enter the loop, so it's safe for initializing an empty group.
3120 	 */
3121 	for (index = size - 1; index >= 0; index--)
3122 		(void) mac_init_ring(mip, group, index, cap_rings);
3123 }
3124 
3125 int
3126 mac_init_rings(mac_impl_t *mip, mac_ring_type_t rtype)
3127 {
3128 	mac_capab_rings_t *cap_rings;
3129 	mac_group_t *group, *groups;
3130 	mac_group_info_t group_info;
3131 	uint_t group_free = 0;
3132 	uint_t ring_left;
3133 	mac_ring_t *ring;
3134 	int g, err = 0;
3135 
3136 	switch (rtype) {
3137 	case MAC_RING_TYPE_RX:
3138 		ASSERT(mip->mi_rx_groups == NULL);
3139 
3140 		cap_rings = &mip->mi_rx_rings_cap;
3141 		cap_rings->mr_type = MAC_RING_TYPE_RX;
3142 		break;
3143 	case MAC_RING_TYPE_TX:
3144 		ASSERT(mip->mi_tx_groups == NULL);
3145 
3146 		cap_rings = &mip->mi_tx_rings_cap;
3147 		cap_rings->mr_type = MAC_RING_TYPE_TX;
3148 		break;
3149 	default:
3150 		ASSERT(B_FALSE);
3151 	}
3152 
3153 	if (!i_mac_capab_get((mac_handle_t)mip, MAC_CAPAB_RINGS,
3154 	    cap_rings))
3155 		return (0);
3156 
3157 	/*
3158 	 * Allocate a contiguous buffer for all groups.
3159 	 */
3160 	groups = kmem_zalloc(sizeof (mac_group_t) * (cap_rings->mr_gnum + 1),
3161 	    KM_SLEEP);
3162 
3163 	ring_left = cap_rings->mr_rnum;
3164 
3165 	/*
3166 	 * Get all ring groups if any, and get their ring members
3167 	 * if any.
3168 	 */
3169 	for (g = 0; g < cap_rings->mr_gnum; g++) {
3170 		group = groups + g;
3171 
3172 		/* Prepare basic information of the group */
3173 		group->mrg_index = g;
3174 		group->mrg_type = rtype;
3175 		group->mrg_state = MAC_GROUP_STATE_UNINIT;
3176 		group->mrg_mh = (mac_handle_t)mip;
3177 		group->mrg_next = group + 1;
3178 
3179 		/* Zero to reuse the info data structure */
3180 		bzero(&group_info, sizeof (group_info));
3181 
3182 		/* Query group information from driver */
3183 		cap_rings->mr_gget(mip->mi_driver, rtype, g, &group_info,
3184 		    (mac_group_handle_t)group);
3185 
3186 		switch (cap_rings->mr_group_type) {
3187 		case MAC_GROUP_TYPE_DYNAMIC:
3188 			if (cap_rings->mr_gaddring == NULL ||
3189 			    cap_rings->mr_gremring == NULL) {
3190 				DTRACE_PROBE3(
3191 				    mac__init__rings_no_addremring,
3192 				    char *, mip->mi_name,
3193 				    mac_group_add_ring_t,
3194 				    cap_rings->mr_gaddring,
3195 				    mac_group_add_ring_t,
3196 				    cap_rings->mr_gremring);
3197 				err = EINVAL;
3198 				goto bail;
3199 			}
3200 
3201 			switch (rtype) {
3202 			case MAC_RING_TYPE_RX:
3203 				/*
3204 				 * The first RX group must have non-zero
3205 				 * rings, and the following groups must
3206 				 * have zero rings.
3207 				 */
3208 				if (g == 0 && group_info.mgi_count == 0) {
3209 					DTRACE_PROBE1(
3210 					    mac__init__rings__rx__def__zero,
3211 					    char *, mip->mi_name);
3212 					err = EINVAL;
3213 					goto bail;
3214 				}
3215 				if (g > 0 && group_info.mgi_count != 0) {
3216 					DTRACE_PROBE3(
3217 					    mac__init__rings__rx__nonzero,
3218 					    char *, mip->mi_name,
3219 					    int, g, int, group_info.mgi_count);
3220 					err = EINVAL;
3221 					goto bail;
3222 				}
3223 				break;
3224 			case MAC_RING_TYPE_TX:
3225 				/*
3226 				 * All TX ring groups must have zero rings.
3227 				 */
3228 				if (group_info.mgi_count != 0) {
3229 					DTRACE_PROBE3(
3230 					    mac__init__rings__tx__nonzero,
3231 					    char *, mip->mi_name,
3232 					    int, g, int, group_info.mgi_count);
3233 					err = EINVAL;
3234 					goto bail;
3235 				}
3236 				break;
3237 			}
3238 			break;
3239 		case MAC_GROUP_TYPE_STATIC:
3240 			/*
3241 			 * Note that an empty group is allowed, e.g., an aggr
3242 			 * would start with an empty group.
3243 			 */
3244 			break;
3245 		default:
3246 			/* unknown group type */
3247 			DTRACE_PROBE2(mac__init__rings__unknown__type,
3248 			    char *, mip->mi_name,
3249 			    int, cap_rings->mr_group_type);
3250 			err = EINVAL;
3251 			goto bail;
3252 		}
3253 
3254 
3255 		/*
3256 		 * Driver must register group->mgi_addmac/remmac() for rx groups
3257 		 * to support multiple MAC addresses.
3258 		 */
3259 		if (rtype == MAC_RING_TYPE_RX) {
3260 			if ((group_info.mgi_addmac == NULL) ||
3261 			    (group_info.mgi_addmac == NULL))
3262 				goto bail;
3263 		}
3264 
3265 		/* Cache driver-supplied information */
3266 		group->mrg_info = group_info;
3267 
3268 		/* Update the group's status and group count. */
3269 		mac_set_rx_group_state(group, MAC_GROUP_STATE_REGISTERED);
3270 		group_free++;
3271 
3272 		group->mrg_rings = NULL;
3273 		group->mrg_cur_count = 0;
3274 		mac_init_group(mip, group, group_info.mgi_count, cap_rings);
3275 		ring_left -= group_info.mgi_count;
3276 
3277 		/* The current group size should be equal to default value */
3278 		ASSERT(group->mrg_cur_count == group_info.mgi_count);
3279 	}
3280 
3281 	/* Build up a dummy group for free resources as a pool */
3282 	group = groups + cap_rings->mr_gnum;
3283 
3284 	/* Prepare basic information of the group */
3285 	group->mrg_index = -1;
3286 	group->mrg_type = rtype;
3287 	group->mrg_state = MAC_GROUP_STATE_UNINIT;
3288 	group->mrg_mh = (mac_handle_t)mip;
3289 	group->mrg_next = NULL;
3290 
3291 	/*
3292 	 * If there are ungrouped rings, allocate a continuous buffer for
3293 	 * remaining resources.
3294 	 */
3295 	if (ring_left != 0) {
3296 		group->mrg_rings = NULL;
3297 		group->mrg_cur_count = 0;
3298 		mac_init_group(mip, group, ring_left, cap_rings);
3299 
3300 		/* The current group size should be equal to ring_left */
3301 		ASSERT(group->mrg_cur_count == ring_left);
3302 
3303 		ring_left = 0;
3304 
3305 		/* Update this group's status */
3306 		mac_set_rx_group_state(group, MAC_GROUP_STATE_REGISTERED);
3307 	} else
3308 		group->mrg_rings = NULL;
3309 
3310 	ASSERT(ring_left == 0);
3311 
3312 bail:
3313 	/* Cache other important information to finalize the initialization */
3314 	switch (rtype) {
3315 	case MAC_RING_TYPE_RX:
3316 		mip->mi_rx_group_type = cap_rings->mr_group_type;
3317 		mip->mi_rx_group_count = cap_rings->mr_gnum;
3318 		mip->mi_rx_groups = groups;
3319 		break;
3320 	case MAC_RING_TYPE_TX:
3321 		mip->mi_tx_group_type = cap_rings->mr_group_type;
3322 		mip->mi_tx_group_count = cap_rings->mr_gnum;
3323 		mip->mi_tx_group_free = group_free;
3324 		mip->mi_tx_groups = groups;
3325 
3326 		/*
3327 		 * Ring 0 is used as the default one and it could be assigned
3328 		 * to a client as well.
3329 		 */
3330 		group = groups + cap_rings->mr_gnum;
3331 		ring = group->mrg_rings;
3332 		while ((ring->mr_index != 0) && (ring->mr_next != NULL))
3333 			ring = ring->mr_next;
3334 		ASSERT(ring->mr_index == 0);
3335 		mip->mi_default_tx_ring = (mac_ring_handle_t)ring;
3336 		break;
3337 	default:
3338 		ASSERT(B_FALSE);
3339 	}
3340 
3341 	if (err != 0)
3342 		mac_free_rings(mip, rtype);
3343 
3344 	return (err);
3345 }
3346 
3347 /*
3348  * Called to free all ring groups with particular type. It's supposed all groups
3349  * have been released by clinet.
3350  */
3351 void
3352 mac_free_rings(mac_impl_t *mip, mac_ring_type_t rtype)
3353 {
3354 	mac_group_t *group, *groups;
3355 	uint_t group_count;
3356 
3357 	switch (rtype) {
3358 	case MAC_RING_TYPE_RX:
3359 		if (mip->mi_rx_groups == NULL)
3360 			return;
3361 
3362 		groups = mip->mi_rx_groups;
3363 		group_count = mip->mi_rx_group_count;
3364 
3365 		mip->mi_rx_groups = NULL;
3366 		mip->mi_rx_group_count = 0;
3367 		break;
3368 	case MAC_RING_TYPE_TX:
3369 		ASSERT(mip->mi_tx_group_count == mip->mi_tx_group_free);
3370 
3371 		if (mip->mi_tx_groups == NULL)
3372 			return;
3373 
3374 		groups = mip->mi_tx_groups;
3375 		group_count = mip->mi_tx_group_count;
3376 
3377 		mip->mi_tx_groups = NULL;
3378 		mip->mi_tx_group_count = 0;
3379 		mip->mi_tx_group_free = 0;
3380 		mip->mi_default_tx_ring = NULL;
3381 		break;
3382 	default:
3383 		ASSERT(B_FALSE);
3384 	}
3385 
3386 	for (group = groups; group != NULL; group = group->mrg_next) {
3387 		mac_ring_t *ring;
3388 
3389 		if (group->mrg_cur_count == 0)
3390 			continue;
3391 
3392 		ASSERT(group->mrg_rings != NULL);
3393 
3394 		while ((ring = group->mrg_rings) != NULL) {
3395 			group->mrg_rings = ring->mr_next;
3396 			mac_ring_free(mip, ring);
3397 		}
3398 	}
3399 
3400 	/* Free all the cached rings */
3401 	mac_ring_freeall(mip);
3402 	/* Free the block of group data strutures */
3403 	kmem_free(groups, sizeof (mac_group_t) * (group_count + 1));
3404 }
3405 
3406 /*
3407  * Associate a MAC address with a receive group.
3408  *
3409  * The return value of this function should always be checked properly, because
3410  * any type of failure could cause unexpected results. A group can be added
3411  * or removed with a MAC address only after it has been reserved. Ideally,
3412  * a successful reservation always leads to calling mac_group_addmac() to
3413  * steer desired traffic. Failure of adding an unicast MAC address doesn't
3414  * always imply that the group is functioning abnormally.
3415  *
3416  * Currently this function is called everywhere, and it reflects assumptions
3417  * about MAC addresses in the implementation. CR 6735196.
3418  */
3419 int
3420 mac_group_addmac(mac_group_t *group, const uint8_t *addr)
3421 {
3422 	ASSERT(group->mrg_type == MAC_RING_TYPE_RX);
3423 	ASSERT(group->mrg_info.mgi_addmac != NULL);
3424 
3425 	return (group->mrg_info.mgi_addmac(group->mrg_info.mgi_driver, addr));
3426 }
3427 
3428 /*
3429  * Remove the association between MAC address and receive group.
3430  */
3431 int
3432 mac_group_remmac(mac_group_t *group, const uint8_t *addr)
3433 {
3434 	ASSERT(group->mrg_type == MAC_RING_TYPE_RX);
3435 	ASSERT(group->mrg_info.mgi_remmac != NULL);
3436 
3437 	return (group->mrg_info.mgi_remmac(group->mrg_info.mgi_driver, addr));
3438 }
3439 
3440 /*
3441  * Release a ring in use by marking it MR_FREE.
3442  * Any other client may reserve it for its use.
3443  */
3444 void
3445 mac_release_tx_ring(mac_ring_handle_t rh)
3446 {
3447 	mac_ring_t *ring = (mac_ring_t *)rh;
3448 	mac_group_t *group = (mac_group_t *)ring->mr_gh;
3449 	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
3450 
3451 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
3452 	ASSERT(ring->mr_state != MR_FREE);
3453 
3454 	/*
3455 	 * Default tx ring will be released by mac_stop().
3456 	 */
3457 	if (rh == mip->mi_default_tx_ring)
3458 		return;
3459 
3460 	mac_stop_ring(ring);
3461 
3462 	ring->mr_state = MR_FREE;
3463 	ring->mr_flag = 0;
3464 }
3465 
3466 /*
3467  * Find a ring from its index.
3468  */
3469 mac_ring_t *
3470 mac_find_ring(mac_group_t *group, int index)
3471 {
3472 	mac_ring_t *ring = group->mrg_rings;
3473 
3474 	for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next)
3475 		if (ring->mr_index == index)
3476 			break;
3477 
3478 	return (ring);
3479 }
3480 /*
3481  * Add a ring to an existing group.
3482  *
3483  * The ring must be either passed directly (for example if the ring
3484  * movement is initiated by the framework), or specified through a driver
3485  * index (for example when the ring is added by the driver.
3486  *
3487  * The caller needs to call mac_perim_enter() before calling this function.
3488  */
3489 int
3490 i_mac_group_add_ring(mac_group_t *group, mac_ring_t *ring, int index)
3491 {
3492 	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
3493 	mac_capab_rings_t *cap_rings;
3494 	boolean_t driver_call = (ring == NULL);
3495 	mac_group_type_t group_type;
3496 	int ret = 0;
3497 
3498 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
3499 
3500 	switch (group->mrg_type) {
3501 	case MAC_RING_TYPE_RX:
3502 		cap_rings = &mip->mi_rx_rings_cap;
3503 		group_type = mip->mi_rx_group_type;
3504 		break;
3505 	case MAC_RING_TYPE_TX:
3506 		cap_rings = &mip->mi_tx_rings_cap;
3507 		group_type = mip->mi_tx_group_type;
3508 		break;
3509 	default:
3510 		ASSERT(B_FALSE);
3511 	}
3512 
3513 	/*
3514 	 * There should be no ring with the same ring index in the target
3515 	 * group.
3516 	 */
3517 	ASSERT(mac_find_ring(group, driver_call ? index : ring->mr_index) ==
3518 	    NULL);
3519 
3520 	if (driver_call) {
3521 		/*
3522 		 * The function is called as a result of a request from
3523 		 * a driver to add a ring to an existing group, for example
3524 		 * from the aggregation driver. Allocate a new mac_ring_t
3525 		 * for that ring.
3526 		 */
3527 		ring = mac_init_ring(mip, group, index, cap_rings);
3528 		ASSERT(group->mrg_state > MAC_GROUP_STATE_UNINIT);
3529 	} else {
3530 		/*
3531 		 * The function is called as a result of a MAC layer request
3532 		 * to add a ring to an existing group. In this case the
3533 		 * ring is being moved between groups, which requires
3534 		 * the underlying driver to support dynamic grouping,
3535 		 * and the mac_ring_t already exists.
3536 		 */
3537 		ASSERT(group_type == MAC_GROUP_TYPE_DYNAMIC);
3538 		ASSERT(cap_rings->mr_gaddring != NULL);
3539 		ASSERT(ring->mr_gh == NULL);
3540 	}
3541 
3542 	/*
3543 	 * At this point the ring should not be in use, and it should be
3544 	 * of the right for the target group.
3545 	 */
3546 	ASSERT(ring->mr_state < MR_INUSE);
3547 	ASSERT(ring->mr_srs == NULL);
3548 	ASSERT(ring->mr_type == group->mrg_type);
3549 
3550 	if (!driver_call) {
3551 		/*
3552 		 * Add the driver level hardware ring if the process was not
3553 		 * initiated by the driver, and the target group is not the
3554 		 * group.
3555 		 */
3556 		if (group->mrg_driver != NULL) {
3557 			cap_rings->mr_gaddring(group->mrg_driver,
3558 			    ring->mr_driver, ring->mr_type);
3559 		}
3560 
3561 		/*
3562 		 * Insert the ring ahead existing rings.
3563 		 */
3564 		ring->mr_next = group->mrg_rings;
3565 		group->mrg_rings = ring;
3566 		ring->mr_gh = (mac_group_handle_t)group;
3567 		group->mrg_cur_count++;
3568 	}
3569 
3570 	/*
3571 	 * If the group has not been actively used, we're done.
3572 	 */
3573 	if (group->mrg_index != -1 &&
3574 	    group->mrg_state < MAC_GROUP_STATE_RESERVED)
3575 		return (0);
3576 
3577 	/*
3578 	 * Set up SRS/SR according to the ring type.
3579 	 */
3580 	switch (ring->mr_type) {
3581 	case MAC_RING_TYPE_RX:
3582 		/*
3583 		 * Setup SRS on top of the new ring if the group is
3584 		 * reserved for someones exclusive use.
3585 		 */
3586 		if (group->mrg_state == MAC_GROUP_STATE_RESERVED) {
3587 			flow_entry_t *flent;
3588 			mac_client_impl_t *mcip;
3589 
3590 			mcip = MAC_RX_GROUP_ONLY_CLIENT(group);
3591 			ASSERT(mcip != NULL);
3592 			flent = mcip->mci_flent;
3593 			ASSERT(flent->fe_rx_srs_cnt > 0);
3594 			mac_srs_group_setup(mcip, flent, group, SRST_LINK);
3595 		}
3596 		break;
3597 	case MAC_RING_TYPE_TX:
3598 		/*
3599 		 * For TX this function is only invoked during the
3600 		 * initial creation of a group when a share is
3601 		 * associated with a MAC client. So the datapath is not
3602 		 * yet setup, and will be setup later after the
3603 		 * group has been reserved and populated.
3604 		 */
3605 		break;
3606 	default:
3607 		ASSERT(B_FALSE);
3608 	}
3609 
3610 	/*
3611 	 * Start the ring if needed. Failure causes to undo the grouping action.
3612 	 */
3613 	if ((ret = mac_start_ring(ring)) != 0) {
3614 		if (ring->mr_type == MAC_RING_TYPE_RX) {
3615 			if (ring->mr_srs != NULL) {
3616 				mac_rx_srs_remove(ring->mr_srs);
3617 				ring->mr_srs = NULL;
3618 			}
3619 		}
3620 		if (!driver_call) {
3621 			cap_rings->mr_gremring(group->mrg_driver,
3622 			    ring->mr_driver, ring->mr_type);
3623 		}
3624 		group->mrg_cur_count--;
3625 		group->mrg_rings = ring->mr_next;
3626 
3627 		ring->mr_gh = NULL;
3628 
3629 		if (driver_call)
3630 			mac_ring_free(mip, ring);
3631 
3632 		return (ret);
3633 	}
3634 
3635 	/*
3636 	 * Update the ring's state.
3637 	 */
3638 	ring->mr_state = MR_INUSE;
3639 	MAC_RING_UNMARK(ring, MR_INCIPIENT);
3640 	return (0);
3641 }
3642 
3643 /*
3644  * Remove a ring from it's current group. MAC internal function for dynamic
3645  * grouping.
3646  *
3647  * The caller needs to call mac_perim_enter() before calling this function.
3648  */
3649 void
3650 i_mac_group_rem_ring(mac_group_t *group, mac_ring_t *ring,
3651     boolean_t driver_call)
3652 {
3653 	mac_impl_t *mip = (mac_impl_t *)group->mrg_mh;
3654 	mac_capab_rings_t *cap_rings = NULL;
3655 	mac_group_type_t group_type;
3656 
3657 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
3658 
3659 	ASSERT(mac_find_ring(group, ring->mr_index) == ring);
3660 	ASSERT((mac_group_t *)ring->mr_gh == group);
3661 	ASSERT(ring->mr_type == group->mrg_type);
3662 
3663 	switch (ring->mr_type) {
3664 	case MAC_RING_TYPE_RX:
3665 		group_type = mip->mi_rx_group_type;
3666 		cap_rings = &mip->mi_rx_rings_cap;
3667 
3668 		if (group->mrg_state >= MAC_GROUP_STATE_RESERVED)
3669 			mac_stop_ring(ring);
3670 
3671 		/*
3672 		 * Only hardware classified packets hold a reference to the
3673 		 * ring all the way up the Rx path. mac_rx_srs_remove()
3674 		 * will take care of quiescing the Rx path and removing the
3675 		 * SRS. The software classified path neither holds a reference
3676 		 * nor any association with the ring in mac_rx.
3677 		 */
3678 		if (ring->mr_srs != NULL) {
3679 			mac_rx_srs_remove(ring->mr_srs);
3680 			ring->mr_srs = NULL;
3681 		}
3682 		ring->mr_state = MR_FREE;
3683 		ring->mr_flag = 0;
3684 
3685 		break;
3686 	case MAC_RING_TYPE_TX:
3687 		/*
3688 		 * For TX this function is only invoked in two
3689 		 * cases:
3690 		 *
3691 		 * 1) In the case of a failure during the
3692 		 * initial creation of a group when a share is
3693 		 * associated with a MAC client. So the SRS is not
3694 		 * yet setup, and will be setup later after the
3695 		 * group has been reserved and populated.
3696 		 *
3697 		 * 2) From mac_release_tx_group() when freeing
3698 		 * a TX SRS.
3699 		 *
3700 		 * In both cases the SRS and its soft rings are
3701 		 * already quiesced.
3702 		 */
3703 		ASSERT(!driver_call);
3704 		group_type = mip->mi_tx_group_type;
3705 		cap_rings = &mip->mi_tx_rings_cap;
3706 		break;
3707 	default:
3708 		ASSERT(B_FALSE);
3709 	}
3710 
3711 	/*
3712 	 * Remove the ring from the group.
3713 	 */
3714 	if (ring == group->mrg_rings)
3715 		group->mrg_rings = ring->mr_next;
3716 	else {
3717 		mac_ring_t *pre;
3718 
3719 		pre = group->mrg_rings;
3720 		while (pre->mr_next != ring)
3721 			pre = pre->mr_next;
3722 		pre->mr_next = ring->mr_next;
3723 	}
3724 	group->mrg_cur_count--;
3725 
3726 	if (!driver_call) {
3727 		ASSERT(group_type == MAC_GROUP_TYPE_DYNAMIC);
3728 		ASSERT(cap_rings->mr_gremring != NULL);
3729 
3730 		/*
3731 		 * Remove the driver level hardware ring.
3732 		 */
3733 		if (group->mrg_driver != NULL) {
3734 			cap_rings->mr_gremring(group->mrg_driver,
3735 			    ring->mr_driver, ring->mr_type);
3736 		}
3737 	}
3738 
3739 	ring->mr_gh = NULL;
3740 	if (driver_call) {
3741 		mac_ring_free(mip, ring);
3742 	} else {
3743 		ring->mr_state = MR_FREE;
3744 		ring->mr_flag = 0;
3745 	}
3746 }
3747 
3748 /*
3749  * Move a ring to the target group. If needed, remove the ring from the group
3750  * that it currently belongs to.
3751  *
3752  * The caller need to enter MAC's perimeter by calling mac_perim_enter().
3753  */
3754 static int
3755 mac_group_mov_ring(mac_impl_t *mip, mac_group_t *d_group, mac_ring_t *ring)
3756 {
3757 	mac_group_t *s_group = (mac_group_t *)ring->mr_gh;
3758 	int rv;
3759 
3760 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
3761 	ASSERT(d_group != NULL);
3762 	ASSERT(s_group->mrg_mh == d_group->mrg_mh);
3763 
3764 	if (s_group == d_group)
3765 		return (0);
3766 
3767 	/*
3768 	 * Remove it from current group first.
3769 	 */
3770 	if (s_group != NULL)
3771 		i_mac_group_rem_ring(s_group, ring, B_FALSE);
3772 
3773 	/*
3774 	 * Add it to the new group.
3775 	 */
3776 	rv = i_mac_group_add_ring(d_group, ring, 0);
3777 	if (rv != 0) {
3778 		/*
3779 		 * Failed to add ring back to source group. If
3780 		 * that fails, the ring is stuck in limbo, log message.
3781 		 */
3782 		if (i_mac_group_add_ring(s_group, ring, 0)) {
3783 			cmn_err(CE_WARN, "%s: failed to move ring %p\n",
3784 			    mip->mi_name, (void *)ring);
3785 		}
3786 	}
3787 
3788 	return (rv);
3789 }
3790 
3791 /*
3792  * Find a MAC address according to its value.
3793  */
3794 mac_address_t *
3795 mac_find_macaddr(mac_impl_t *mip, uint8_t *mac_addr)
3796 {
3797 	mac_address_t *map;
3798 
3799 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
3800 
3801 	for (map = mip->mi_addresses; map != NULL; map = map->ma_next) {
3802 		if (bcmp(mac_addr, map->ma_addr, map->ma_len) == 0)
3803 			break;
3804 	}
3805 
3806 	return (map);
3807 }
3808 
3809 /*
3810  * Check whether the MAC address is shared by multiple clients.
3811  */
3812 boolean_t
3813 mac_check_macaddr_shared(mac_address_t *map)
3814 {
3815 	ASSERT(MAC_PERIM_HELD((mac_handle_t)map->ma_mip));
3816 
3817 	return (map->ma_nusers > 1);
3818 }
3819 
3820 /*
3821  * Remove the specified MAC address from the MAC address list and free it.
3822  */
3823 static void
3824 mac_free_macaddr(mac_address_t *map)
3825 {
3826 	mac_impl_t *mip = map->ma_mip;
3827 
3828 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
3829 	ASSERT(mip->mi_addresses != NULL);
3830 
3831 	map = mac_find_macaddr(mip, map->ma_addr);
3832 
3833 	ASSERT(map != NULL);
3834 	ASSERT(map->ma_nusers == 0);
3835 
3836 	if (map == mip->mi_addresses) {
3837 		mip->mi_addresses = map->ma_next;
3838 	} else {
3839 		mac_address_t *pre;
3840 
3841 		pre = mip->mi_addresses;
3842 		while (pre->ma_next != map)
3843 			pre = pre->ma_next;
3844 		pre->ma_next = map->ma_next;
3845 	}
3846 
3847 	kmem_free(map, sizeof (mac_address_t));
3848 }
3849 
3850 /*
3851  * Add a MAC address reference for a client. If the desired MAC address
3852  * exists, add a reference to it. Otherwise, add the new address by adding
3853  * it to a reserved group or setting promiscuous mode. Won't try different
3854  * group is the group is non-NULL, so the caller must explictly share
3855  * default group when needed.
3856  *
3857  * Note, the primary MAC address is initialized at registration time, so
3858  * to add it to default group only need to activate it if its reference
3859  * count is still zero. Also, some drivers may not have advertised RINGS
3860  * capability.
3861  */
3862 int
3863 mac_add_macaddr(mac_impl_t *mip, mac_group_t *group, uint8_t *mac_addr,
3864     boolean_t use_hw)
3865 {
3866 	mac_address_t *map;
3867 	int err = 0;
3868 	boolean_t allocated_map = B_FALSE;
3869 
3870 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
3871 
3872 	map = mac_find_macaddr(mip, mac_addr);
3873 
3874 	/*
3875 	 * If the new MAC address has not been added. Allocate a new one
3876 	 * and set it up.
3877 	 */
3878 	if (map == NULL) {
3879 		map = kmem_zalloc(sizeof (mac_address_t), KM_SLEEP);
3880 		map->ma_len = mip->mi_type->mt_addr_length;
3881 		bcopy(mac_addr, map->ma_addr, map->ma_len);
3882 		map->ma_nusers = 0;
3883 		map->ma_group = group;
3884 		map->ma_mip = mip;
3885 
3886 		/* add the new MAC address to the head of the address list */
3887 		map->ma_next = mip->mi_addresses;
3888 		mip->mi_addresses = map;
3889 
3890 		allocated_map = B_TRUE;
3891 	}
3892 
3893 	ASSERT(map->ma_group == group);
3894 
3895 	/*
3896 	 * If the MAC address is already in use, simply account for the
3897 	 * new client.
3898 	 */
3899 	if (map->ma_nusers++ > 0)
3900 		return (0);
3901 
3902 	/*
3903 	 * Activate this MAC address by adding it to the reserved group.
3904 	 */
3905 	if (group != NULL) {
3906 		err = mac_group_addmac(group, (const uint8_t *)mac_addr);
3907 		if (err == 0) {
3908 			map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
3909 			return (0);
3910 		}
3911 	}
3912 
3913 	/*
3914 	 * The MAC address addition failed. If the client requires a
3915 	 * hardware classified MAC address, fail the operation.
3916 	 */
3917 	if (use_hw) {
3918 		err = ENOSPC;
3919 		goto bail;
3920 	}
3921 
3922 	/*
3923 	 * Try promiscuous mode.
3924 	 *
3925 	 * For drivers that don't advertise RINGS capability, do
3926 	 * nothing for the primary address.
3927 	 */
3928 	if ((group == NULL) &&
3929 	    (bcmp(map->ma_addr, mip->mi_addr, map->ma_len) == 0)) {
3930 		map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
3931 		return (0);
3932 	}
3933 
3934 	/*
3935 	 * Enable promiscuous mode in order to receive traffic
3936 	 * to the new MAC address.
3937 	 */
3938 	if ((err = i_mac_promisc_set(mip, B_TRUE)) == 0) {
3939 		map->ma_type = MAC_ADDRESS_TYPE_UNICAST_PROMISC;
3940 		return (0);
3941 	}
3942 
3943 	/*
3944 	 * Free the MAC address that could not be added. Don't free
3945 	 * a pre-existing address, it could have been the entry
3946 	 * for the primary MAC address which was pre-allocated by
3947 	 * mac_init_macaddr(), and which must remain on the list.
3948 	 */
3949 bail:
3950 	map->ma_nusers--;
3951 	if (allocated_map)
3952 		mac_free_macaddr(map);
3953 	return (err);
3954 }
3955 
3956 /*
3957  * Remove a reference to a MAC address. This may cause to remove the MAC
3958  * address from an associated group or to turn off promiscuous mode.
3959  * The caller needs to handle the failure properly.
3960  */
3961 int
3962 mac_remove_macaddr(mac_address_t *map)
3963 {
3964 	mac_impl_t *mip = map->ma_mip;
3965 	int err = 0;
3966 
3967 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
3968 
3969 	ASSERT(map == mac_find_macaddr(mip, map->ma_addr));
3970 
3971 	/*
3972 	 * If it's not the last client using this MAC address, only update
3973 	 * the MAC clients count.
3974 	 */
3975 	if (--map->ma_nusers > 0)
3976 		return (0);
3977 
3978 	/*
3979 	 * The MAC address is no longer used by any MAC client, so remove
3980 	 * it from its associated group, or turn off promiscuous mode
3981 	 * if it was enabled for the MAC address.
3982 	 */
3983 	switch (map->ma_type) {
3984 	case MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED:
3985 		/*
3986 		 * Don't free the preset primary address for drivers that
3987 		 * don't advertise RINGS capability.
3988 		 */
3989 		if (map->ma_group == NULL)
3990 			return (0);
3991 
3992 		err = mac_group_remmac(map->ma_group, map->ma_addr);
3993 		break;
3994 	case MAC_ADDRESS_TYPE_UNICAST_PROMISC:
3995 		err = i_mac_promisc_set(mip, B_FALSE);
3996 		break;
3997 	default:
3998 		ASSERT(B_FALSE);
3999 	}
4000 
4001 	if (err != 0)
4002 		return (err);
4003 
4004 	/*
4005 	 * We created MAC address for the primary one at registration, so we
4006 	 * won't free it here. mac_fini_macaddr() will take care of it.
4007 	 */
4008 	if (bcmp(map->ma_addr, mip->mi_addr, map->ma_len) != 0)
4009 		mac_free_macaddr(map);
4010 
4011 	return (0);
4012 }
4013 
4014 /*
4015  * Update an existing MAC address. The caller need to make sure that the new
4016  * value has not been used.
4017  */
4018 int
4019 mac_update_macaddr(mac_address_t *map, uint8_t *mac_addr)
4020 {
4021 	mac_impl_t *mip = map->ma_mip;
4022 	int err = 0;
4023 
4024 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4025 	ASSERT(mac_find_macaddr(mip, mac_addr) == NULL);
4026 
4027 	switch (map->ma_type) {
4028 	case MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED:
4029 		/*
4030 		 * Update the primary address for drivers that are not
4031 		 * RINGS capable.
4032 		 */
4033 		if (map->ma_group == NULL) {
4034 			err = mip->mi_unicst(mip->mi_driver, (const uint8_t *)
4035 			    mac_addr);
4036 			if (err != 0)
4037 				return (err);
4038 			break;
4039 		}
4040 
4041 		/*
4042 		 * If this MAC address is not currently in use,
4043 		 * simply break out and update the value.
4044 		 */
4045 		if (map->ma_nusers == 0)
4046 			break;
4047 
4048 		/*
4049 		 * Need to replace the MAC address associated with a group.
4050 		 */
4051 		err = mac_group_remmac(map->ma_group, map->ma_addr);
4052 		if (err != 0)
4053 			return (err);
4054 
4055 		err = mac_group_addmac(map->ma_group, mac_addr);
4056 
4057 		/*
4058 		 * Failure hints hardware error. The MAC layer needs to
4059 		 * have error notification facility to handle this.
4060 		 * Now, simply try to restore the value.
4061 		 */
4062 		if (err != 0)
4063 			(void) mac_group_addmac(map->ma_group, map->ma_addr);
4064 
4065 		break;
4066 	case MAC_ADDRESS_TYPE_UNICAST_PROMISC:
4067 		/*
4068 		 * Need to do nothing more if in promiscuous mode.
4069 		 */
4070 		break;
4071 	default:
4072 		ASSERT(B_FALSE);
4073 	}
4074 
4075 	/*
4076 	 * Successfully replaced the MAC address.
4077 	 */
4078 	if (err == 0)
4079 		bcopy(mac_addr, map->ma_addr, map->ma_len);
4080 
4081 	return (err);
4082 }
4083 
4084 /*
4085  * Freshen the MAC address with new value. Its caller must have updated the
4086  * hardware MAC address before calling this function.
4087  * This funcitons is supposed to be used to handle the MAC address change
4088  * notification from underlying drivers.
4089  */
4090 void
4091 mac_freshen_macaddr(mac_address_t *map, uint8_t *mac_addr)
4092 {
4093 	mac_impl_t *mip = map->ma_mip;
4094 
4095 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4096 	ASSERT(mac_find_macaddr(mip, mac_addr) == NULL);
4097 
4098 	/*
4099 	 * Freshen the MAC address with new value.
4100 	 */
4101 	bcopy(mac_addr, map->ma_addr, map->ma_len);
4102 	bcopy(mac_addr, mip->mi_addr, map->ma_len);
4103 
4104 	/*
4105 	 * Update all MAC clients that share this MAC address.
4106 	 */
4107 	mac_unicast_update_clients(mip, map);
4108 }
4109 
4110 /*
4111  * Set up the primary MAC address.
4112  */
4113 void
4114 mac_init_macaddr(mac_impl_t *mip)
4115 {
4116 	mac_address_t *map;
4117 
4118 	/*
4119 	 * The reference count is initialized to zero, until it's really
4120 	 * activated.
4121 	 */
4122 	map = kmem_zalloc(sizeof (mac_address_t), KM_SLEEP);
4123 	map->ma_len = mip->mi_type->mt_addr_length;
4124 	bcopy(mip->mi_addr, map->ma_addr, map->ma_len);
4125 
4126 	/*
4127 	 * If driver advertises RINGS capability, it shouldn't have initialized
4128 	 * its primary MAC address. For other drivers, including VNIC, the
4129 	 * primary address must work after registration.
4130 	 */
4131 	if (mip->mi_rx_groups == NULL)
4132 		map->ma_type = MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED;
4133 
4134 	/*
4135 	 * The primary MAC address is reserved for default group according
4136 	 * to current design.
4137 	 */
4138 	map->ma_group = mip->mi_rx_groups;
4139 	map->ma_mip = mip;
4140 
4141 	mip->mi_addresses = map;
4142 }
4143 
4144 /*
4145  * Clean up the primary MAC address. Note, only one primary MAC address
4146  * is allowed. All other MAC addresses must have been freed appropriately.
4147  */
4148 void
4149 mac_fini_macaddr(mac_impl_t *mip)
4150 {
4151 	mac_address_t *map = mip->mi_addresses;
4152 
4153 	if (map == NULL)
4154 		return;
4155 
4156 	/*
4157 	 * If mi_addresses is initialized, there should be exactly one
4158 	 * entry left on the list with no users.
4159 	 */
4160 	ASSERT(map->ma_nusers == 0);
4161 	ASSERT(map->ma_next == NULL);
4162 
4163 	kmem_free(map, sizeof (mac_address_t));
4164 	mip->mi_addresses = NULL;
4165 }
4166 
4167 /*
4168  * Logging related functions.
4169  */
4170 
4171 /* Write the Flow description to the log file */
4172 int
4173 mac_write_flow_desc(flow_entry_t *flent, mac_client_impl_t *mcip)
4174 {
4175 	flow_desc_t		*fdesc;
4176 	mac_resource_props_t	*mrp;
4177 	net_desc_t		ndesc;
4178 
4179 	bzero(&ndesc, sizeof (net_desc_t));
4180 
4181 	/*
4182 	 * Grab the fe_lock to see a self-consistent fe_flow_desc.
4183 	 * Updates to the fe_flow_desc are done under the fe_lock
4184 	 */
4185 	mutex_enter(&flent->fe_lock);
4186 	fdesc = &flent->fe_flow_desc;
4187 	mrp = &flent->fe_resource_props;
4188 
4189 	ndesc.nd_name = flent->fe_flow_name;
4190 	ndesc.nd_devname = mcip->mci_name;
4191 	bcopy(fdesc->fd_src_mac, ndesc.nd_ehost, ETHERADDRL);
4192 	bcopy(fdesc->fd_dst_mac, ndesc.nd_edest, ETHERADDRL);
4193 	ndesc.nd_sap = htonl(fdesc->fd_sap);
4194 	ndesc.nd_isv4 = (uint8_t)fdesc->fd_ipversion == IPV4_VERSION;
4195 	ndesc.nd_bw_limit = mrp->mrp_maxbw;
4196 	if (ndesc.nd_isv4) {
4197 		ndesc.nd_saddr[3] = htonl(fdesc->fd_local_addr.s6_addr32[3]);
4198 		ndesc.nd_daddr[3] = htonl(fdesc->fd_remote_addr.s6_addr32[3]);
4199 	} else {
4200 		bcopy(&fdesc->fd_local_addr, ndesc.nd_saddr, IPV6_ADDR_LEN);
4201 		bcopy(&fdesc->fd_remote_addr, ndesc.nd_daddr, IPV6_ADDR_LEN);
4202 	}
4203 	ndesc.nd_sport = htons(fdesc->fd_local_port);
4204 	ndesc.nd_dport = htons(fdesc->fd_remote_port);
4205 	ndesc.nd_protocol = (uint8_t)fdesc->fd_protocol;
4206 	mutex_exit(&flent->fe_lock);
4207 
4208 	return (exacct_commit_netinfo((void *)&ndesc, EX_NET_FLDESC_REC));
4209 }
4210 
4211 /* Write the Flow statistics to the log file */
4212 int
4213 mac_write_flow_stats(flow_entry_t *flent)
4214 {
4215 	flow_stats_t	*fl_stats;
4216 	net_stat_t	nstat;
4217 
4218 	fl_stats = &flent->fe_flowstats;
4219 	nstat.ns_name = flent->fe_flow_name;
4220 	nstat.ns_ibytes = fl_stats->fs_rbytes;
4221 	nstat.ns_obytes = fl_stats->fs_obytes;
4222 	nstat.ns_ipackets = fl_stats->fs_ipackets;
4223 	nstat.ns_opackets = fl_stats->fs_opackets;
4224 	nstat.ns_ierrors = fl_stats->fs_ierrors;
4225 	nstat.ns_oerrors = fl_stats->fs_oerrors;
4226 
4227 	return (exacct_commit_netinfo((void *)&nstat, EX_NET_FLSTAT_REC));
4228 }
4229 
4230 /* Write the Link Description to the log file */
4231 int
4232 mac_write_link_desc(mac_client_impl_t *mcip)
4233 {
4234 	net_desc_t		ndesc;
4235 	flow_entry_t		*flent = mcip->mci_flent;
4236 
4237 	bzero(&ndesc, sizeof (net_desc_t));
4238 
4239 	ndesc.nd_name = mcip->mci_name;
4240 	ndesc.nd_devname = mcip->mci_name;
4241 	ndesc.nd_isv4 = B_TRUE;
4242 	/*
4243 	 * Grab the fe_lock to see a self-consistent fe_flow_desc.
4244 	 * Updates to the fe_flow_desc are done under the fe_lock
4245 	 * after removing the flent from the flow table.
4246 	 */
4247 	mutex_enter(&flent->fe_lock);
4248 	bcopy(flent->fe_flow_desc.fd_src_mac, ndesc.nd_ehost, ETHERADDRL);
4249 	mutex_exit(&flent->fe_lock);
4250 
4251 	return (exacct_commit_netinfo((void *)&ndesc, EX_NET_LNDESC_REC));
4252 }
4253 
4254 /* Write the Link statistics to the log file */
4255 int
4256 mac_write_link_stats(mac_client_impl_t *mcip)
4257 {
4258 	net_stat_t	nstat;
4259 
4260 	nstat.ns_name = mcip->mci_name;
4261 	nstat.ns_ibytes = mcip->mci_stat_ibytes;
4262 	nstat.ns_obytes = mcip->mci_stat_obytes;
4263 	nstat.ns_ipackets = mcip->mci_stat_ipackets;
4264 	nstat.ns_opackets = mcip->mci_stat_opackets;
4265 	nstat.ns_ierrors = mcip->mci_stat_ierrors;
4266 	nstat.ns_oerrors = mcip->mci_stat_oerrors;
4267 
4268 	return (exacct_commit_netinfo((void *)&nstat, EX_NET_LNSTAT_REC));
4269 }
4270 
4271 /*
4272  * For a given flow, if the descrition has not been logged before, do it now.
4273  * If it is a VNIC, then we have collected information about it from the MAC
4274  * table, so skip it.
4275  */
4276 /*ARGSUSED*/
4277 static int
4278 mac_log_flowinfo(flow_entry_t *flent, void *args)
4279 {
4280 	mac_client_impl_t	*mcip = flent->fe_mcip;
4281 
4282 	if (mcip == NULL)
4283 		return (0);
4284 
4285 	/*
4286 	 * If the name starts with "vnic", and fe_user_generated is true (to
4287 	 * exclude the mcast and active flow entries created implicitly for
4288 	 * a vnic, it is a VNIC flow.  i.e. vnic1 is a vnic flow,
4289 	 * vnic/bge1/mcast1 is not and neither is vnic/bge1/active.
4290 	 */
4291 	if (strncasecmp(flent->fe_flow_name, "vnic", 4) == 0 &&
4292 	    (flent->fe_type & FLOW_USER) != 0) {
4293 		return (0);
4294 	}
4295 
4296 	if (!flent->fe_desc_logged) {
4297 		/*
4298 		 * We don't return error because we want to continu the
4299 		 * walk in case this is the last walk which means we
4300 		 * need to reset fe_desc_logged in all the flows.
4301 		 */
4302 		if (mac_write_flow_desc(flent, mcip) != 0)
4303 			return (0);
4304 		flent->fe_desc_logged = B_TRUE;
4305 	}
4306 
4307 	/*
4308 	 * Regardless of the error, we want to proceed in case we have to
4309 	 * reset fe_desc_logged.
4310 	 */
4311 	(void) mac_write_flow_stats(flent);
4312 
4313 	if (mcip != NULL && !(mcip->mci_state_flags & MCIS_DESC_LOGGED))
4314 		flent->fe_desc_logged = B_FALSE;
4315 
4316 	return (0);
4317 }
4318 
4319 typedef struct i_mac_log_state_s {
4320 	boolean_t	mi_last;
4321 	int		mi_fenable;
4322 	int		mi_lenable;
4323 } i_mac_log_state_t;
4324 
4325 /*
4326  * Walk the mac_impl_ts and log the description for each mac client of this mac,
4327  * if it hasn't already been done. Additionally, log statistics for the link as
4328  * well. Walk the flow table and log information for each flow as well.
4329  * If it is the last walk (mci_last), then we turn off mci_desc_logged (and
4330  * also fe_desc_logged, if flow logging is on) since we want to log the
4331  * description if and when logging is restarted.
4332  */
4333 /*ARGSUSED*/
4334 static uint_t
4335 i_mac_log_walker(mod_hash_key_t key, mod_hash_val_t *val, void *arg)
4336 {
4337 	mac_impl_t		*mip = (mac_impl_t *)val;
4338 	i_mac_log_state_t	*lstate = (i_mac_log_state_t *)arg;
4339 	int			ret;
4340 	mac_client_impl_t	*mcip;
4341 
4342 	/*
4343 	 * Only walk the client list for NIC and etherstub
4344 	 */
4345 	if ((mip->mi_state_flags & MIS_DISABLED) ||
4346 	    ((mip->mi_state_flags & MIS_IS_VNIC) &&
4347 	    (mac_get_lower_mac_handle((mac_handle_t)mip) != NULL)))
4348 		return (MH_WALK_CONTINUE);
4349 
4350 	for (mcip = mip->mi_clients_list; mcip != NULL;
4351 	    mcip = mcip->mci_client_next) {
4352 		if (!MCIP_DATAPATH_SETUP(mcip))
4353 			continue;
4354 		if (lstate->mi_lenable) {
4355 			if (!(mcip->mci_state_flags & MCIS_DESC_LOGGED)) {
4356 				ret = mac_write_link_desc(mcip);
4357 				if (ret != 0) {
4358 				/*
4359 				 * We can't terminate it if this is the last
4360 				 * walk, else there might be some links with
4361 				 * mi_desc_logged set to true, which means
4362 				 * their description won't be logged the next
4363 				 * time logging is started (similarly for the
4364 				 * flows within such links). We can continue
4365 				 * without walking the flow table (i.e. to
4366 				 * set fe_desc_logged to false) because we
4367 				 * won't have written any flow stuff for this
4368 				 * link as we haven't logged the link itself.
4369 				 */
4370 					if (lstate->mi_last)
4371 						return (MH_WALK_CONTINUE);
4372 					else
4373 						return (MH_WALK_TERMINATE);
4374 				}
4375 				mcip->mci_state_flags |= MCIS_DESC_LOGGED;
4376 			}
4377 		}
4378 
4379 		if (mac_write_link_stats(mcip) != 0 && !lstate->mi_last)
4380 			return (MH_WALK_TERMINATE);
4381 
4382 		if (lstate->mi_last)
4383 			mcip->mci_state_flags &= ~MCIS_DESC_LOGGED;
4384 
4385 		if (lstate->mi_fenable) {
4386 			if (mcip->mci_subflow_tab != NULL) {
4387 				(void) mac_flow_walk(mcip->mci_subflow_tab,
4388 				    mac_log_flowinfo, mip);
4389 			}
4390 		}
4391 	}
4392 	return (MH_WALK_CONTINUE);
4393 }
4394 
4395 /*
4396  * The timer thread that runs every mac_logging_interval seconds and logs
4397  * link and/or flow information.
4398  */
4399 /* ARGSUSED */
4400 void
4401 mac_log_linkinfo(void *arg)
4402 {
4403 	i_mac_log_state_t	lstate;
4404 
4405 	rw_enter(&i_mac_impl_lock, RW_READER);
4406 	if (!mac_flow_log_enable && !mac_link_log_enable) {
4407 		rw_exit(&i_mac_impl_lock);
4408 		return;
4409 	}
4410 	lstate.mi_fenable = mac_flow_log_enable;
4411 	lstate.mi_lenable = mac_link_log_enable;
4412 	lstate.mi_last = B_FALSE;
4413 	rw_exit(&i_mac_impl_lock);
4414 
4415 	mod_hash_walk(i_mac_impl_hash, i_mac_log_walker, &lstate);
4416 
4417 	rw_enter(&i_mac_impl_lock, RW_WRITER);
4418 	if (mac_flow_log_enable || mac_link_log_enable) {
4419 		mac_logging_timer = timeout(mac_log_linkinfo, NULL,
4420 		    SEC_TO_TICK(mac_logging_interval));
4421 	}
4422 	rw_exit(&i_mac_impl_lock);
4423 }
4424 
4425 typedef struct i_mac_fastpath_state_s {
4426 	boolean_t	mf_disable;
4427 	int		mf_err;
4428 } i_mac_fastpath_state_t;
4429 
4430 /*ARGSUSED*/
4431 static uint_t
4432 i_mac_fastpath_disable_walker(mod_hash_key_t key, mod_hash_val_t *val,
4433     void *arg)
4434 {
4435 	i_mac_fastpath_state_t	*state = arg;
4436 	mac_handle_t		mh = (mac_handle_t)val;
4437 
4438 	if (state->mf_disable)
4439 		state->mf_err = mac_fastpath_disable(mh);
4440 	else
4441 		mac_fastpath_enable(mh);
4442 
4443 	return (state->mf_err == 0 ? MH_WALK_CONTINUE : MH_WALK_TERMINATE);
4444 }
4445 
4446 /*
4447  * Start the logging timer.
4448  */
4449 int
4450 mac_start_logusage(mac_logtype_t type, uint_t interval)
4451 {
4452 	i_mac_fastpath_state_t state = {B_TRUE, 0};
4453 	int err;
4454 
4455 	rw_enter(&i_mac_impl_lock, RW_WRITER);
4456 	switch (type) {
4457 	case MAC_LOGTYPE_FLOW:
4458 		if (mac_flow_log_enable) {
4459 			rw_exit(&i_mac_impl_lock);
4460 			return (0);
4461 		}
4462 		/* FALLTHRU */
4463 	case MAC_LOGTYPE_LINK:
4464 		if (mac_link_log_enable) {
4465 			rw_exit(&i_mac_impl_lock);
4466 			return (0);
4467 		}
4468 		break;
4469 	default:
4470 		ASSERT(0);
4471 	}
4472 
4473 	/* Disable fastpath */
4474 	mod_hash_walk(i_mac_impl_hash, i_mac_fastpath_disable_walker, &state);
4475 	if ((err = state.mf_err) != 0) {
4476 		/* Reenable fastpath  */
4477 		state.mf_disable = B_FALSE;
4478 		state.mf_err = 0;
4479 		mod_hash_walk(i_mac_impl_hash,
4480 		    i_mac_fastpath_disable_walker, &state);
4481 		rw_exit(&i_mac_impl_lock);
4482 		return (err);
4483 	}
4484 
4485 	switch (type) {
4486 	case MAC_LOGTYPE_FLOW:
4487 		mac_flow_log_enable = B_TRUE;
4488 		/* FALLTHRU */
4489 	case MAC_LOGTYPE_LINK:
4490 		mac_link_log_enable = B_TRUE;
4491 		break;
4492 	}
4493 
4494 	mac_logging_interval = interval;
4495 	rw_exit(&i_mac_impl_lock);
4496 	mac_log_linkinfo(NULL);
4497 	return (0);
4498 }
4499 
4500 /*
4501  * Stop the logging timer if both Link and Flow logging are turned off.
4502  */
4503 void
4504 mac_stop_logusage(mac_logtype_t type)
4505 {
4506 	i_mac_log_state_t	lstate;
4507 	i_mac_fastpath_state_t	state = {B_FALSE, 0};
4508 
4509 	rw_enter(&i_mac_impl_lock, RW_WRITER);
4510 	lstate.mi_fenable = mac_flow_log_enable;
4511 	lstate.mi_lenable = mac_link_log_enable;
4512 
4513 	/* Last walk */
4514 	lstate.mi_last = B_TRUE;
4515 
4516 	switch (type) {
4517 	case MAC_LOGTYPE_FLOW:
4518 		if (lstate.mi_fenable) {
4519 			ASSERT(mac_link_log_enable);
4520 			mac_flow_log_enable = B_FALSE;
4521 			mac_link_log_enable = B_FALSE;
4522 			break;
4523 		}
4524 		/* FALLTHRU */
4525 	case MAC_LOGTYPE_LINK:
4526 		if (!lstate.mi_lenable || mac_flow_log_enable) {
4527 			rw_exit(&i_mac_impl_lock);
4528 			return;
4529 		}
4530 		mac_link_log_enable = B_FALSE;
4531 		break;
4532 	default:
4533 		ASSERT(0);
4534 	}
4535 
4536 	/* Reenable fastpath */
4537 	mod_hash_walk(i_mac_impl_hash, i_mac_fastpath_disable_walker, &state);
4538 
4539 	rw_exit(&i_mac_impl_lock);
4540 	(void) untimeout(mac_logging_timer);
4541 	mac_logging_timer = 0;
4542 
4543 	/* Last walk */
4544 	mod_hash_walk(i_mac_impl_hash, i_mac_log_walker, &lstate);
4545 }
4546 
4547 /*
4548  * Walk the rx and tx SRS/SRs for a flow and update the priority value.
4549  */
4550 void
4551 mac_flow_update_priority(mac_client_impl_t *mcip, flow_entry_t *flent)
4552 {
4553 	pri_t			pri;
4554 	int			count;
4555 	mac_soft_ring_set_t	*mac_srs;
4556 
4557 	if (flent->fe_rx_srs_cnt <= 0)
4558 		return;
4559 
4560 	if (((mac_soft_ring_set_t *)flent->fe_rx_srs[0])->srs_type ==
4561 	    SRST_FLOW) {
4562 		pri = FLOW_PRIORITY(mcip->mci_min_pri,
4563 		    mcip->mci_max_pri,
4564 		    flent->fe_resource_props.mrp_priority);
4565 	} else {
4566 		pri = mcip->mci_max_pri;
4567 	}
4568 
4569 	for (count = 0; count < flent->fe_rx_srs_cnt; count++) {
4570 		mac_srs = flent->fe_rx_srs[count];
4571 		mac_update_srs_priority(mac_srs, pri);
4572 	}
4573 	/*
4574 	 * If we have a Tx SRS, we need to modify all the threads associated
4575 	 * with it.
4576 	 */
4577 	if (flent->fe_tx_srs != NULL)
4578 		mac_update_srs_priority(flent->fe_tx_srs, pri);
4579 }
4580 
4581 /*
4582  * RX and TX rings are reserved according to different semantics depending
4583  * on the requests from the MAC clients and type of rings:
4584  *
4585  * On the Tx side, by default we reserve individual rings, independently from
4586  * the groups.
4587  *
4588  * On the Rx side, the reservation is at the granularity of the group
4589  * of rings, and used for v12n level 1 only. It has a special case for the
4590  * primary client.
4591  *
4592  * If a share is allocated to a MAC client, we allocate a TX group and an
4593  * RX group to the client, and assign TX rings and RX rings to these
4594  * groups according to information gathered from the driver through
4595  * the share capability.
4596  *
4597  * The foreseable evolution of Rx rings will handle v12n level 2 and higher
4598  * to allocate individual rings out of a group and program the hw classifier
4599  * based on IP address or higher level criteria.
4600  */
4601 
4602 /*
4603  * mac_reserve_tx_ring()
4604  * Reserve a unused ring by marking it with MR_INUSE state.
4605  * As reserved, the ring is ready to function.
4606  *
4607  * Notes for Hybrid I/O:
4608  *
4609  * If a specific ring is needed, it is specified through the desired_ring
4610  * argument. Otherwise that argument is set to NULL.
4611  * If the desired ring was previous allocated to another client, this
4612  * function swaps it with a new ring from the group of unassigned rings.
4613  */
4614 mac_ring_t *
4615 mac_reserve_tx_ring(mac_impl_t *mip, mac_ring_t *desired_ring)
4616 {
4617 	mac_group_t *group;
4618 	mac_ring_t *ring;
4619 
4620 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4621 
4622 	if (mip->mi_tx_groups == NULL)
4623 		return (NULL);
4624 
4625 	/*
4626 	 * Find an available ring and start it before changing its status.
4627 	 * The unassigned rings are at the end of the mi_tx_groups
4628 	 * array.
4629 	 */
4630 	group = mip->mi_tx_groups + mip->mi_tx_group_count;
4631 
4632 	for (ring = group->mrg_rings; ring != NULL;
4633 	    ring = ring->mr_next) {
4634 		if (desired_ring == NULL) {
4635 			if (ring->mr_state == MR_FREE)
4636 				/* wanted any free ring and found one */
4637 				break;
4638 		} else {
4639 			mac_ring_t *sring;
4640 			mac_client_impl_t *client;
4641 			mac_soft_ring_set_t *srs;
4642 
4643 			if (ring != desired_ring)
4644 				/* wants a desired ring but this one ain't it */
4645 				continue;
4646 
4647 			if (ring->mr_state == MR_FREE)
4648 				break;
4649 
4650 			/*
4651 			 * Found the desired ring but it's already in use.
4652 			 * Swap it with a new ring.
4653 			 */
4654 
4655 			/* find the client which owns that ring */
4656 			for (client = mip->mi_clients_list; client != NULL;
4657 			    client = client->mci_client_next) {
4658 				srs = MCIP_TX_SRS(client);
4659 				if (srs != NULL && mac_tx_srs_ring_present(srs,
4660 				    desired_ring)) {
4661 					/* found our ring */
4662 					break;
4663 				}
4664 			}
4665 			if (client == NULL) {
4666 				/*
4667 				 * The TX ring is in use, but it's not
4668 				 * associated with any clients, so it
4669 				 * has to be the default ring. In that
4670 				 * case we can simply assign a new ring
4671 				 * as the default ring, and we're done.
4672 				 */
4673 				ASSERT(mip->mi_default_tx_ring ==
4674 				    (mac_ring_handle_t)desired_ring);
4675 
4676 				/*
4677 				 * Quiesce all clients on top of
4678 				 * the NIC to make sure there are no
4679 				 * pending threads still relying on
4680 				 * that default ring, for example
4681 				 * the multicast path.
4682 				 */
4683 				for (client = mip->mi_clients_list;
4684 				    client != NULL;
4685 				    client = client->mci_client_next) {
4686 					mac_tx_client_quiesce(client,
4687 					    SRS_QUIESCE);
4688 				}
4689 
4690 				mip->mi_default_tx_ring = (mac_ring_handle_t)
4691 				    mac_reserve_tx_ring(mip, NULL);
4692 
4693 				/* resume the clients */
4694 				for (client = mip->mi_clients_list;
4695 				    client != NULL;
4696 				    client = client->mci_client_next)
4697 					mac_tx_client_restart(client);
4698 
4699 				break;
4700 			}
4701 
4702 			/*
4703 			 * Note that we cannot simply invoke the group
4704 			 * add/rem routines since the client doesn't have a
4705 			 * TX group. So we need to instead add/remove
4706 			 * the rings from the SRS.
4707 			 */
4708 			ASSERT(client->mci_share == NULL);
4709 
4710 			/* first quiece the client */
4711 			mac_tx_client_quiesce(client, SRS_QUIESCE);
4712 
4713 			/* give a new ring to the client... */
4714 			sring = mac_reserve_tx_ring(mip, NULL);
4715 			if (sring != NULL) {
4716 				/*
4717 				 * There are no other available ring
4718 				 * on that MAC instance. The client
4719 				 * will fallback to the shared TX
4720 				 * ring.
4721 				 */
4722 				mac_tx_srs_add_ring(srs, sring);
4723 			}
4724 
4725 			/* ... in exchange for our desired ring */
4726 			mac_tx_srs_del_ring(srs, desired_ring);
4727 
4728 			/* restart the client */
4729 			mac_tx_client_restart(client);
4730 
4731 			if (mip->mi_default_tx_ring ==
4732 			    (mac_ring_handle_t)desired_ring) {
4733 				/*
4734 				 * The desired ring is the default ring,
4735 				 * and there are one or more clients
4736 				 * using that default ring directly.
4737 				 */
4738 				mip->mi_default_tx_ring =
4739 				    (mac_ring_handle_t)sring;
4740 				/*
4741 				 * Find clients using default ring and
4742 				 * swap it with the new default ring.
4743 				 */
4744 				for (client = mip->mi_clients_list;
4745 				    client != NULL;
4746 				    client = client->mci_client_next) {
4747 					srs = MCIP_TX_SRS(client);
4748 					if (srs != NULL &&
4749 					    mac_tx_srs_ring_present(srs,
4750 					    desired_ring)) {
4751 						/* first quiece the client */
4752 						mac_tx_client_quiesce(client,
4753 						    SRS_QUIESCE);
4754 
4755 						/*
4756 						 * Give it the new default
4757 						 * ring, and remove the old
4758 						 * one.
4759 						 */
4760 						if (sring != NULL) {
4761 							mac_tx_srs_add_ring(srs,
4762 							    sring);
4763 						}
4764 						mac_tx_srs_del_ring(srs,
4765 						    desired_ring);
4766 
4767 						/* restart the client */
4768 						mac_tx_client_restart(client);
4769 					}
4770 				}
4771 			}
4772 			break;
4773 		}
4774 	}
4775 
4776 	if (ring != NULL) {
4777 		if (mac_start_ring(ring) != 0)
4778 			return (NULL);
4779 		ring->mr_state = MR_INUSE;
4780 	}
4781 
4782 	return (ring);
4783 }
4784 
4785 /*
4786  * Minimum number of rings to leave in the default TX group when allocating
4787  * rings to new clients.
4788  */
4789 static uint_t mac_min_rx_default_rings = 1;
4790 
4791 /*
4792  * Populate a zero-ring group with rings. If the share is non-NULL,
4793  * the rings are chosen according to that share.
4794  * Invoked after allocating a new RX or TX group through
4795  * mac_reserve_rx_group() or mac_reserve_tx_group(), respectively.
4796  * Returns zero on success, an errno otherwise.
4797  */
4798 int
4799 i_mac_group_allocate_rings(mac_impl_t *mip, mac_ring_type_t ring_type,
4800     mac_group_t *src_group, mac_group_t *new_group, mac_share_handle_t share)
4801 {
4802 	mac_ring_t **rings, *tmp_ring[1], *ring;
4803 	uint_t nrings;
4804 	int rv, i, j;
4805 
4806 	ASSERT(mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC &&
4807 	    mip->mi_tx_group_type == MAC_GROUP_TYPE_DYNAMIC);
4808 	ASSERT(new_group->mrg_cur_count == 0);
4809 
4810 	/*
4811 	 * First find the rings to allocate to the group.
4812 	 */
4813 	if (share != NULL) {
4814 		/* get rings through ms_squery() */
4815 		mip->mi_share_capab.ms_squery(share, ring_type, NULL, &nrings);
4816 		ASSERT(nrings != 0);
4817 		rings = kmem_alloc(nrings * sizeof (mac_ring_handle_t),
4818 		    KM_SLEEP);
4819 		mip->mi_share_capab.ms_squery(share, ring_type,
4820 		    (mac_ring_handle_t *)rings, &nrings);
4821 	} else {
4822 		/* this function is called for TX only with a share */
4823 		ASSERT(ring_type == MAC_RING_TYPE_RX);
4824 		/*
4825 		 * Pick one ring from default group.
4826 		 *
4827 		 * for now pick the second ring which requires the first ring
4828 		 * at index 0 to stay in the default group, since it is the
4829 		 * ring which carries the multicast traffic.
4830 		 * We need a better way for a driver to indicate this,
4831 		 * for example a per-ring flag.
4832 		 */
4833 		for (ring = src_group->mrg_rings; ring != NULL;
4834 		    ring = ring->mr_next) {
4835 			if (ring->mr_index != 0)
4836 				break;
4837 		}
4838 		ASSERT(ring != NULL);
4839 		nrings = 1;
4840 		tmp_ring[0] = ring;
4841 		rings = tmp_ring;
4842 	}
4843 
4844 	switch (ring_type) {
4845 	case MAC_RING_TYPE_RX:
4846 		if (src_group->mrg_cur_count - nrings <
4847 		    mac_min_rx_default_rings) {
4848 			/* we ran out of rings */
4849 			return (ENOSPC);
4850 		}
4851 
4852 		/* move receive rings to new group */
4853 		for (i = 0; i < nrings; i++) {
4854 			rv = mac_group_mov_ring(mip, new_group, rings[i]);
4855 			if (rv != 0) {
4856 				/* move rings back on failure */
4857 				for (j = 0; j < i; j++) {
4858 					(void) mac_group_mov_ring(mip,
4859 					    src_group, rings[j]);
4860 				}
4861 				return (rv);
4862 			}
4863 		}
4864 		break;
4865 
4866 	case MAC_RING_TYPE_TX: {
4867 		mac_ring_t *tmp_ring;
4868 
4869 		/* move the TX rings to the new group */
4870 		ASSERT(src_group == NULL);
4871 		for (i = 0; i < nrings; i++) {
4872 			/* get the desired ring */
4873 			tmp_ring = mac_reserve_tx_ring(mip, rings[i]);
4874 			ASSERT(tmp_ring == rings[i]);
4875 			rv = mac_group_mov_ring(mip, new_group, rings[i]);
4876 			if (rv != 0) {
4877 				/* cleanup on failure */
4878 				for (j = 0; j < i; j++) {
4879 					(void) mac_group_mov_ring(mip,
4880 					    mip->mi_tx_groups +
4881 					    mip->mi_tx_group_count, rings[j]);
4882 				}
4883 			}
4884 		}
4885 		break;
4886 	}
4887 	}
4888 
4889 	if (share != NULL) {
4890 		/* add group to share */
4891 		mip->mi_share_capab.ms_sadd(share, new_group->mrg_driver);
4892 		/* free temporary array of rings */
4893 		kmem_free(rings, nrings * sizeof (mac_ring_handle_t));
4894 	}
4895 
4896 	return (0);
4897 }
4898 
4899 void
4900 mac_rx_group_add_client(mac_group_t *grp, mac_client_impl_t *mcip)
4901 {
4902 	mac_grp_client_t *mgcp;
4903 
4904 	for (mgcp = grp->mrg_clients; mgcp != NULL; mgcp = mgcp->mgc_next) {
4905 		if (mgcp->mgc_client == mcip)
4906 			break;
4907 	}
4908 
4909 	VERIFY(mgcp == NULL);
4910 
4911 	mgcp = kmem_zalloc(sizeof (mac_grp_client_t), KM_SLEEP);
4912 	mgcp->mgc_client = mcip;
4913 	mgcp->mgc_next = grp->mrg_clients;
4914 	grp->mrg_clients = mgcp;
4915 
4916 }
4917 
4918 void
4919 mac_rx_group_remove_client(mac_group_t *grp, mac_client_impl_t *mcip)
4920 {
4921 	mac_grp_client_t *mgcp, **pprev;
4922 
4923 	for (pprev = &grp->mrg_clients, mgcp = *pprev; mgcp != NULL;
4924 	    pprev = &mgcp->mgc_next, mgcp = *pprev) {
4925 		if (mgcp->mgc_client == mcip)
4926 			break;
4927 	}
4928 
4929 	ASSERT(mgcp != NULL);
4930 
4931 	*pprev = mgcp->mgc_next;
4932 	kmem_free(mgcp, sizeof (mac_grp_client_t));
4933 }
4934 
4935 /*
4936  * mac_reserve_rx_group()
4937  *
4938  * Finds an available group and exclusively reserves it for a client.
4939  * The group is chosen to suit the flow's resource controls (bandwidth and
4940  * fanout requirements) and the address type.
4941  * If the requestor is the pimary MAC then return the group with the
4942  * largest number of rings, otherwise the default ring when available.
4943  */
4944 mac_group_t *
4945 mac_reserve_rx_group(mac_client_impl_t *mcip, uint8_t *mac_addr,
4946     mac_rx_group_reserve_type_t rtype)
4947 {
4948 	mac_share_handle_t	share = mcip->mci_share;
4949 	mac_impl_t		*mip = mcip->mci_mip;
4950 	mac_group_t		*grp = NULL;
4951 	int			i, start, loopcount;
4952 	int			err;
4953 	mac_address_t		*map;
4954 
4955 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mip));
4956 
4957 	/* Check if a group already has this mac address (case of VLANs) */
4958 	if ((map = mac_find_macaddr(mip, mac_addr)) != NULL)
4959 		return (map->ma_group);
4960 
4961 	if (mip->mi_rx_groups == NULL || mip->mi_rx_group_count == 0 ||
4962 	    rtype == MAC_RX_NO_RESERVE)
4963 		return (NULL);
4964 
4965 	/*
4966 	 * Try to exclusively reserve a RX group.
4967 	 *
4968 	 * For flows requires SW_RING it always goes to the default group
4969 	 * (Until we can explicitely call out default groups (CR 6695600),
4970 	 * we assume that the default group is always at position zero);
4971 	 *
4972 	 * For flows requires HW_DEFAULT_RING (unicast flow of the primary
4973 	 * client), try to reserve the default RX group only.
4974 	 *
4975 	 * For flows requires HW_RING (unicast flow of other clients), try
4976 	 * to reserve non-default RX group then the default group.
4977 	 */
4978 	switch (rtype) {
4979 	case MAC_RX_RESERVE_DEFAULT:
4980 		start = 0;
4981 		loopcount = 1;
4982 		break;
4983 	case MAC_RX_RESERVE_NONDEFAULT:
4984 		start = 1;
4985 		loopcount = mip->mi_rx_group_count;
4986 	}
4987 
4988 	for (i = start; i < start + loopcount; i++) {
4989 		grp = &mip->mi_rx_groups[i % mip->mi_rx_group_count];
4990 
4991 		DTRACE_PROBE3(rx__group__trying, char *, mip->mi_name,
4992 		    int, grp->mrg_index, mac_group_state_t, grp->mrg_state);
4993 
4994 		/*
4995 		 * Check to see whether this mac client is the only client
4996 		 * on this RX group. If not, we cannot exclusively reserve
4997 		 * this RX group.
4998 		 */
4999 		if (!MAC_RX_GROUP_NO_CLIENT(grp) &&
5000 		    (MAC_RX_GROUP_ONLY_CLIENT(grp) != mcip)) {
5001 			continue;
5002 		}
5003 
5004 		/*
5005 		 * This group could already be SHARED by other multicast
5006 		 * flows on this client. In that case, the group would
5007 		 * be shared and has already been started.
5008 		 */
5009 		ASSERT(grp->mrg_state != MAC_GROUP_STATE_UNINIT);
5010 
5011 		if ((grp->mrg_state == MAC_GROUP_STATE_REGISTERED) &&
5012 		    (mac_start_group(grp) != 0)) {
5013 			continue;
5014 		}
5015 
5016 		if ((i % mip->mi_rx_group_count) == 0 ||
5017 		    mip->mi_rx_group_type != MAC_GROUP_TYPE_DYNAMIC) {
5018 			break;
5019 		}
5020 
5021 		ASSERT(grp->mrg_cur_count == 0);
5022 
5023 		/*
5024 		 * Populate the group. Rings should be taken
5025 		 * from the default group at position 0 for now.
5026 		 */
5027 
5028 		err = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_RX,
5029 		    &mip->mi_rx_groups[0], grp, share);
5030 		if (err == 0)
5031 			break;
5032 
5033 		DTRACE_PROBE3(rx__group__reserve__alloc__rings, char *,
5034 		    mip->mi_name, int, grp->mrg_index, int, err);
5035 
5036 		/*
5037 		 * It's a dynamic group but the grouping operation failed.
5038 		 */
5039 		mac_stop_group(grp);
5040 	}
5041 
5042 	if (i == start + loopcount)
5043 		return (NULL);
5044 
5045 	ASSERT(grp != NULL);
5046 
5047 	DTRACE_PROBE2(rx__group__reserved,
5048 	    char *, mip->mi_name, int, grp->mrg_index);
5049 	return (grp);
5050 }
5051 
5052 /*
5053  * mac_rx_release_group()
5054  *
5055  * This is called when there are no clients left for the group.
5056  * The group is stopped and marked MAC_GROUP_STATE_REGISTERED,
5057  * and if it is a non default group, the shares are removed and
5058  * all rings are assigned back to default group.
5059  */
5060 void
5061 mac_release_rx_group(mac_client_impl_t *mcip, mac_group_t *group)
5062 {
5063 	mac_impl_t	*mip = mcip->mci_mip;
5064 	mac_ring_t	*ring;
5065 
5066 	ASSERT(group != &mip->mi_rx_groups[0]);
5067 
5068 	/*
5069 	 * This is the case where there are no clients left. Any
5070 	 * SRS etc on this group have also be quiesced.
5071 	 */
5072 	for (ring = group->mrg_rings; ring != NULL; ring = ring->mr_next) {
5073 		if (ring->mr_classify_type == MAC_HW_CLASSIFIER) {
5074 			ASSERT(group->mrg_state == MAC_GROUP_STATE_RESERVED);
5075 			/*
5076 			 * Remove the SRS associated with the HW ring.
5077 			 * As a result, polling will be disabled.
5078 			 */
5079 			ring->mr_srs = NULL;
5080 		}
5081 		ASSERT(ring->mr_state == MR_INUSE);
5082 		mac_stop_ring(ring);
5083 		ring->mr_state = MR_FREE;
5084 		ring->mr_flag = 0;
5085 	}
5086 
5087 	/* remove group from share */
5088 	if (mcip->mci_share != NULL) {
5089 		mip->mi_share_capab.ms_sremove(mcip->mci_share,
5090 		    group->mrg_driver);
5091 	}
5092 
5093 	if (mip->mi_rx_group_type == MAC_GROUP_TYPE_DYNAMIC) {
5094 		mac_ring_t *ring;
5095 
5096 		/*
5097 		 * Rings were dynamically allocated to group.
5098 		 * Move rings back to default group.
5099 		 */
5100 		while ((ring = group->mrg_rings) != NULL) {
5101 			(void) mac_group_mov_ring(mip,
5102 			    &mip->mi_rx_groups[0], ring);
5103 		}
5104 	}
5105 	mac_stop_group(group);
5106 	/*
5107 	 * Possible improvement: See if we can assign the group just released
5108 	 * to a another client of the mip
5109 	 */
5110 }
5111 
5112 /*
5113  * Reserves a TX group for the specified share. Invoked by mac_tx_srs_setup()
5114  * when a share was allocated to the client.
5115  */
5116 mac_group_t *
5117 mac_reserve_tx_group(mac_impl_t *mip, mac_share_handle_t share)
5118 {
5119 	mac_group_t *grp;
5120 	int rv, i;
5121 
5122 	/*
5123 	 * TX groups are currently allocated only to MAC clients
5124 	 * which are associated with a share. Since we have a fixed
5125 	 * number of share and groups, and we already successfully
5126 	 * allocated a share, find an available TX group.
5127 	 */
5128 	ASSERT(share != NULL);
5129 	ASSERT(mip->mi_tx_group_free > 0);
5130 
5131 	for (i = 0; i <  mip->mi_tx_group_count; i++) {
5132 		grp = &mip->mi_tx_groups[i];
5133 
5134 		if ((grp->mrg_state == MAC_GROUP_STATE_RESERVED) ||
5135 		    (grp->mrg_state == MAC_GROUP_STATE_UNINIT))
5136 			continue;
5137 
5138 		rv = mac_start_group(grp);
5139 		ASSERT(rv == 0);
5140 
5141 		grp->mrg_state = MAC_GROUP_STATE_RESERVED;
5142 		break;
5143 	}
5144 
5145 	ASSERT(grp != NULL);
5146 
5147 	/*
5148 	 * Populate the group. Rings should be taken from the group
5149 	 * of unassigned rings, which is past the array of TX
5150 	 * groups adversized by the driver.
5151 	 */
5152 	rv = i_mac_group_allocate_rings(mip, MAC_RING_TYPE_TX, NULL,
5153 	    grp, share);
5154 	if (rv != 0) {
5155 		DTRACE_PROBE3(tx__group__reserve__alloc__rings,
5156 		    char *, mip->mi_name, int, grp->mrg_index, int, rv);
5157 
5158 		mac_stop_group(grp);
5159 		grp->mrg_state = MAC_GROUP_STATE_UNINIT;
5160 
5161 		return (NULL);
5162 	}
5163 
5164 	mip->mi_tx_group_free--;
5165 
5166 	return (grp);
5167 }
5168 
5169 void
5170 mac_release_tx_group(mac_impl_t *mip, mac_group_t *grp)
5171 {
5172 	mac_client_impl_t *mcip = grp->mrg_tx_client;
5173 	mac_share_handle_t share = mcip->mci_share;
5174 	mac_ring_t *ring;
5175 
5176 	ASSERT(mip->mi_tx_group_type == MAC_GROUP_TYPE_DYNAMIC);
5177 	ASSERT(share != NULL);
5178 	ASSERT(grp->mrg_state == MAC_GROUP_STATE_RESERVED);
5179 
5180 	mip->mi_share_capab.ms_sremove(share, grp->mrg_driver);
5181 	while ((ring = grp->mrg_rings) != NULL) {
5182 		/* move the ring back to the pool */
5183 		(void) mac_group_mov_ring(mip, mip->mi_tx_groups +
5184 		    mip->mi_tx_group_count, ring);
5185 	}
5186 	mac_stop_group(grp);
5187 	mac_set_rx_group_state(grp, MAC_GROUP_STATE_REGISTERED);
5188 	grp->mrg_tx_client = NULL;
5189 	mip->mi_tx_group_free++;
5190 }
5191 
5192 /*
5193  * This is a 1-time control path activity initiated by the client (IP).
5194  * The mac perimeter protects against other simultaneous control activities,
5195  * for example an ioctl that attempts to change the degree of fanout and
5196  * increase or decrease the number of softrings associated with this Tx SRS.
5197  */
5198 static mac_tx_notify_cb_t *
5199 mac_client_tx_notify_add(mac_client_impl_t *mcip,
5200     mac_tx_notify_t notify, void *arg)
5201 {
5202 	mac_cb_info_t *mcbi;
5203 	mac_tx_notify_cb_t *mtnfp;
5204 
5205 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
5206 
5207 	mtnfp = kmem_zalloc(sizeof (mac_tx_notify_cb_t), KM_SLEEP);
5208 	mtnfp->mtnf_fn = notify;
5209 	mtnfp->mtnf_arg = arg;
5210 	mtnfp->mtnf_link.mcb_objp = mtnfp;
5211 	mtnfp->mtnf_link.mcb_objsize = sizeof (mac_tx_notify_cb_t);
5212 	mtnfp->mtnf_link.mcb_flags = MCB_TX_NOTIFY_CB_T;
5213 
5214 	mcbi = &mcip->mci_tx_notify_cb_info;
5215 	mutex_enter(mcbi->mcbi_lockp);
5216 	mac_callback_add(mcbi, &mcip->mci_tx_notify_cb_list, &mtnfp->mtnf_link);
5217 	mutex_exit(mcbi->mcbi_lockp);
5218 	return (mtnfp);
5219 }
5220 
5221 static void
5222 mac_client_tx_notify_remove(mac_client_impl_t *mcip, mac_tx_notify_cb_t *mtnfp)
5223 {
5224 	mac_cb_info_t	*mcbi;
5225 	mac_cb_t	**cblist;
5226 
5227 	ASSERT(MAC_PERIM_HELD((mac_handle_t)mcip->mci_mip));
5228 
5229 	if (!mac_callback_find(&mcip->mci_tx_notify_cb_info,
5230 	    &mcip->mci_tx_notify_cb_list, &mtnfp->mtnf_link)) {
5231 		cmn_err(CE_WARN,
5232 		    "mac_client_tx_notify_remove: callback not "
5233 		    "found, mcip 0x%p mtnfp 0x%p", (void *)mcip, (void *)mtnfp);
5234 		return;
5235 	}
5236 
5237 	mcbi = &mcip->mci_tx_notify_cb_info;
5238 	cblist = &mcip->mci_tx_notify_cb_list;
5239 	mutex_enter(mcbi->mcbi_lockp);
5240 	if (mac_callback_remove(mcbi, cblist, &mtnfp->mtnf_link))
5241 		kmem_free(mtnfp, sizeof (mac_tx_notify_cb_t));
5242 	else
5243 		mac_callback_remove_wait(&mcip->mci_tx_notify_cb_info);
5244 	mutex_exit(mcbi->mcbi_lockp);
5245 }
5246 
5247 /*
5248  * mac_client_tx_notify():
5249  * call to add and remove flow control callback routine.
5250  */
5251 mac_tx_notify_handle_t
5252 mac_client_tx_notify(mac_client_handle_t mch, mac_tx_notify_t callb_func,
5253     void *ptr)
5254 {
5255 	mac_client_impl_t	*mcip = (mac_client_impl_t *)mch;
5256 	mac_tx_notify_cb_t	*mtnfp = NULL;
5257 
5258 	i_mac_perim_enter(mcip->mci_mip);
5259 
5260 	if (callb_func != NULL) {
5261 		/* Add a notify callback */
5262 		mtnfp = mac_client_tx_notify_add(mcip, callb_func, ptr);
5263 	} else {
5264 		mac_client_tx_notify_remove(mcip, (mac_tx_notify_cb_t *)ptr);
5265 	}
5266 	i_mac_perim_exit(mcip->mci_mip);
5267 
5268 	return ((mac_tx_notify_handle_t)mtnfp);
5269 }
5270