xref: /titanic_41/usr/src/uts/common/io/mac/mac_sched.c (revision 61d7aa6fb64ef65c47c92904fc9ecb48c64eecf5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  * Copyright 2011 Joyent, Inc.  All rights reserved.
25  * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
26  */
27 
28 #include <sys/types.h>
29 #include <sys/callb.h>
30 #include <sys/sdt.h>
31 #include <sys/strsubr.h>
32 #include <sys/strsun.h>
33 #include <sys/vlan.h>
34 #include <sys/stack.h>
35 #include <sys/archsystm.h>
36 #include <inet/ipsec_impl.h>
37 #include <inet/ip_impl.h>
38 #include <inet/sadb.h>
39 #include <inet/ipsecesp.h>
40 #include <inet/ipsecah.h>
41 #include <inet/ip6.h>
42 
43 #include <sys/mac_impl.h>
44 #include <sys/mac_client_impl.h>
45 #include <sys/mac_client_priv.h>
46 #include <sys/mac_soft_ring.h>
47 #include <sys/mac_flow_impl.h>
48 
49 static mac_tx_cookie_t mac_tx_single_ring_mode(mac_soft_ring_set_t *, mblk_t *,
50     uintptr_t, uint16_t, mblk_t **);
51 static mac_tx_cookie_t mac_tx_serializer_mode(mac_soft_ring_set_t *, mblk_t *,
52     uintptr_t, uint16_t, mblk_t **);
53 static mac_tx_cookie_t mac_tx_fanout_mode(mac_soft_ring_set_t *, mblk_t *,
54     uintptr_t, uint16_t, mblk_t **);
55 static mac_tx_cookie_t mac_tx_bw_mode(mac_soft_ring_set_t *, mblk_t *,
56     uintptr_t, uint16_t, mblk_t **);
57 static mac_tx_cookie_t mac_tx_aggr_mode(mac_soft_ring_set_t *, mblk_t *,
58     uintptr_t, uint16_t, mblk_t **);
59 
60 typedef struct mac_tx_mode_s {
61 	mac_tx_srs_mode_t	mac_tx_mode;
62 	mac_tx_func_t		mac_tx_func;
63 } mac_tx_mode_t;
64 
65 /*
66  * There are seven modes of operation on the Tx side. These modes get set
67  * in mac_tx_srs_setup(). Except for the experimental TX_SERIALIZE mode,
68  * none of the other modes are user configurable. They get selected by
69  * the system depending upon whether the link (or flow) has multiple Tx
70  * rings, has a bandwidth limit configured, is an aggr, etc.
71  *
72  * When the Tx SRS is operating in aggr mode (st_mode) or if there are
73  * multiple Tx rings owned by the Tx SRS, then each Tx ring (pseudo or
74  * otherwise) will have a soft ring associated with it. These soft rings
75  * are stored in the srs_tx_soft_rings[] array.
76  *
77  * Additionally, in the case of aggr there is the st_soft_rings[] array
78  * in the mac_srs_tx_t structure. This array stores the same set of
79  * soft rings that are present in the srs_tx_soft_rings[] array, but
80  * indexed differently: the soft ring associated with a pseudo Tx ring
81  * is saved at the pseudo ring's mr_index in st_soft_rings[]. This
82  * helps in quickly getting the soft ring associated with the Tx ring
83  * when aggr_find_tx_ring() returns the pseudo Tx ring that is to be
84  * used for transmit.
85  */
86 mac_tx_mode_t mac_tx_mode_list[] = {
87 	{SRS_TX_DEFAULT,	mac_tx_single_ring_mode},
88 	{SRS_TX_SERIALIZE,	mac_tx_serializer_mode},
89 	{SRS_TX_FANOUT,		mac_tx_fanout_mode},
90 	{SRS_TX_BW,		mac_tx_bw_mode},
91 	{SRS_TX_BW_FANOUT,	mac_tx_bw_mode},
92 	{SRS_TX_AGGR,		mac_tx_aggr_mode},
93 	{SRS_TX_BW_AGGR,	mac_tx_bw_mode}
94 };
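
/*
 * For illustration, a sketch of how the table above resolves a mode to
 * its transmit function (the helper below is hypothetical and not part
 * of this file; mac_tx_srs_setup() does the equivalent assignment when
 * it selects st_mode):
 *
 *	mac_tx_func_t
 *	mac_tx_mode_to_func(mac_tx_srs_mode_t mode)
 *	{
 *		uint_t i;
 *
 *		for (i = 0; i < sizeof (mac_tx_mode_list) /
 *		    sizeof (mac_tx_mode_t); i++) {
 *			if (mac_tx_mode_list[i].mac_tx_mode == mode)
 *				return (mac_tx_mode_list[i].mac_tx_func);
 *		}
 *		return (mac_tx_single_ring_mode);
 *	}
 */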
95 
96 /*
97  * Soft Ring Set (SRS) - The runtime code that deals with
98  * dynamic polling from the hardware, bandwidth enforcement,
99  * fanout, etc.
100  *
101  * We try to use H/W classification on the NIC and assign traffic for
102  * a MAC address to a particular Rx ring or ring group. There is a
103  * 1-1 mapping between a SRS and a Rx ring. The SRS dynamically
104  * switches the underlying Rx ring between interrupt and
105  * polling mode and enforces any specified B/W control.
106  *
107  * There is always a SRS created and tied to each H/W and S/W rule.
108  * Whenever we create a H/W rule, we always add the same rule to the
109  * S/W classifier and tie a SRS to it.
110  *
111  * In case a B/W control is specified, it is broken into bytes
112  * per tick and as soon as the quota for a tick is exhausted,
113  * the underlying Rx ring is forced into poll mode for the remainder
114  * of the tick. The SRS poll thread only polls for bytes that are
115  * allowed to come into the SRS. We typically let 4x the configured
116  * B/W worth of packets come into the SRS (to prevent unnecessary
117  * drops due to bursts) but only process the specified amount.
118  *
119  * A MAC client (e.g. a VNIC or aggr) can have 1 or more
120  * Rx rings (and corresponding SRSs) assigned to it. The SRS
121  * in turn can have softrings to do protocol level fanout or
122  * softrings to do S/W based fanout or both. In case the NIC
123  * has no Rx rings, we do S/W classification to the respective SRS.
124  * The S/W classification rule is always set up and ready. This
125  * allows the MAC layer to reassign Rx rings whenever needed
126  * while packets continue to flow via the default path and
127  * get S/W classified to the correct SRS.
128  *
129  * SRSes are used on both the Tx and Rx sides. They use the same
130  * data structure, but the processing routines have slightly different
131  * semantics due to the fact that the Rx side needs to do dynamic
132  * polling, etc.
133  *
134  * Dynamic Polling Notes
135  * =====================
136  *
137  * Each soft ring set is capable of switching its Rx ring between
138  * interrupt and poll mode and actively 'polls' for packets in
139  * poll mode. If the SRS is implementing a B/W limit, it makes
140  * sure that only the maximum allowed packets are pulled in poll mode
141  * and goes to poll mode as soon as the B/W limit is exceeded. As
142  * such, there is no extra overhead in implementing B/W limits.
143  *
144  * In poll mode, it's better to keep the pipeline going where the
145  * SRS worker thread keeps processing packets and the poll thread
146  * keeps bringing more packets in (especially if they get to run
147  * on different CPUs). This also prevents the overhead associated
148  * with excessive signalling (on NUMA machines, this can be
149  * pretty devastating). The exception is the latency-optimized case,
150  * where the worker thread does no work and the interrupt and poll
151  * threads are allowed to do their own drains.
152  *
153  * We use the following policy to control Dynamic Polling:
154  * 1) We switch to poll mode anytime the processing
155  *    thread causes a backlog to build up in SRS and
156  *    its associated Soft Rings (sr_poll_pkt_cnt > 0).
157  * 2) As long as the backlog stays under the low water
158  *    mark (sr_lowat), we poll the H/W for more packets.
159  * 3) If the backlog (sr_poll_pkt_cnt) exceeds low
160  *    water mark, we stay in poll mode but don't poll
161  *    the H/W for more packets.
162  * 4) Anytime in polling mode, if we poll the H/W for
163  *    packets and find nothing plus we have an existing
164  *    backlog (sr_poll_pkt_cnt > 0), we stay in polling
165  *    mode but don't poll the H/W for packets anymore
166  *    (let the polling thread go to sleep).
167  * 5) Once the backlog is relieved (packets are processed),
168  *    we re-enable polling (by signalling the poll thread)
169  *    only when the backlog dips below sr_poll_thres.
170  * 6) sr_hiwat is used exclusively when we are not
171  *    polling capable; it decides when to drop
172  *    packets so the SRS queue length doesn't grow
173  *    without bound.
174  *
175  * NOTE: Also see the block level comment on top of mac_soft_ring.c
176  */
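
/*
 * For illustration, the policy above condensed into a sketch (the real
 * decisions are spread across mac_rx_srs_drain() and
 * mac_rx_srs_poll_ring() below; the fields are those of mac_srs_rx_t):
 *
 *	if (sr_poll_pkt_cnt == 0)
 *		re-enable interrupts;			(no backlog)
 *	else if (sr_poll_pkt_cnt <= sr_lowat)
 *		poll the H/W for more packets;		(rules 1 and 2)
 *	else
 *		stay in poll mode but don't poll;	(rules 3 and 4)
 *
 * and on the processing side, as the backlog drains:
 *
 *	if (sr_poll_pkt_cnt <= sr_poll_thres)
 *		signal the poll thread again;		(rule 5)
 */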
177 
178 /*
179  * mac_latency_optimize
180  *
181  * Controls whether the poll thread can process the packets inline
182  * or let the SRS worker thread do the processing. This applies if
183  * the SRS was not being processed. For latency sensitive traffic,
184  * this needs to be true to allow inline processing. For throughput
185  * under load, this should be false.
186  *
187  * This tunable (and other similar ones) should be rolled into a link-
188  * or flow-specific workload hint that can be set using dladm
189  * linkprop (instead of multiple such tunables).
190  */
191 boolean_t mac_latency_optimize = B_TRUE;
192 
193 /*
194  * MAC_RX_SRS_ENQUEUE_CHAIN and MAC_TX_SRS_ENQUEUE_CHAIN
195  *
196  * Queue an mp or chain in the soft ring set and increment the
197  * local count (srs_count) for the SRS and the shared counter
198  * (srs_poll_pkt_cnt - shared between the SRS and its soft rings
199  * to track the total unprocessed packets so that polling works
200  * correctly).
201  *
202  * The size (total bytes queued) counters are incremented only
203  * if we are doing B/W control.
204  */
205 #define	MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) {		\
206 	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));			\
207 	if ((mac_srs)->srs_last != NULL)				\
208 		(mac_srs)->srs_last->b_next = (head);			\
209 	else								\
210 		(mac_srs)->srs_first = (head);				\
211 	(mac_srs)->srs_last = (tail);					\
212 	(mac_srs)->srs_count += count;					\
213 }
214 
215 #define	MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) {	\
216 	mac_srs_rx_t	*srs_rx = &(mac_srs)->srs_rx;			\
217 									\
218 	MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz);		\
219 	srs_rx->sr_poll_pkt_cnt += count;				\
220 	ASSERT(srs_rx->sr_poll_pkt_cnt > 0);				\
221 	if ((mac_srs)->srs_type & SRST_BW_CONTROL) {			\
222 		(mac_srs)->srs_size += (sz);				\
223 		mutex_enter(&(mac_srs)->srs_bw->mac_bw_lock);		\
224 		(mac_srs)->srs_bw->mac_bw_sz += (sz);			\
225 		mutex_exit(&(mac_srs)->srs_bw->mac_bw_lock);		\
226 	}								\
227 }
228 
229 #define	MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) {	\
230 	mac_srs->srs_state |= SRS_ENQUEUED;				\
231 	MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz);		\
232 	if ((mac_srs)->srs_type & SRST_BW_CONTROL) {			\
233 		(mac_srs)->srs_size += (sz);				\
234 		(mac_srs)->srs_bw->mac_bw_sz += (sz);			\
235 	}								\
236 }
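
/*
 * For illustration, how a caller typically prepares the arguments for
 * the enqueue macros above (a sketch of the pattern used by the poll
 * thread in mac_rx_srs_poll_ring(); 'head' is the chain just received):
 *
 *	mblk_t	*mp, *tail = head;
 *	int	cnt = 0;
 *	size_t	sz = 0;
 *
 *	for (mp = head; mp != NULL; mp = mp->b_next) {
 *		tail = mp;
 *		sz += msgdsize(mp);
 *		cnt++;
 *	}
 *	mutex_enter(&(mac_srs)->srs_lock);
 *	MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, cnt, sz);
 *	mutex_exit(&(mac_srs)->srs_lock);
 */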
237 
238 /*
239  * Turn polling on routines
240  */
241 #define	MAC_SRS_POLLING_ON(mac_srs) {					\
242 	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));			\
243 	if (((mac_srs)->srs_state &					\
244 	    (SRS_POLLING_CAPAB|SRS_POLLING)) == SRS_POLLING_CAPAB) {	\
245 		(mac_srs)->srs_state |= SRS_POLLING;			\
246 		(void) mac_hwring_disable_intr((mac_ring_handle_t)	\
247 		    (mac_srs)->srs_ring);				\
248 		(mac_srs)->srs_rx.sr_poll_on++;				\
249 	}								\
250 }
251 
252 #define	MAC_SRS_WORKER_POLLING_ON(mac_srs) {				\
253 	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));			\
254 	if (((mac_srs)->srs_state &					\
255 	    (SRS_POLLING_CAPAB|SRS_WORKER|SRS_POLLING)) == 		\
256 	    (SRS_POLLING_CAPAB|SRS_WORKER)) {				\
257 		(mac_srs)->srs_state |= SRS_POLLING;			\
258 		(void) mac_hwring_disable_intr((mac_ring_handle_t)	\
259 		    (mac_srs)->srs_ring);				\
260 		(mac_srs)->srs_rx.sr_worker_poll_on++;			\
261 	}								\
262 }
263 
264 /*
265  * MAC_SRS_POLL_RING
266  *
267  * Signal the SRS poll thread to poll the underlying H/W ring,
268  * provided it isn't already polling (i.e. SRS_GET_PKTS is not set).
269  *
270  * The poll thread gets to run only from mac_rx_srs_drain() and only
271  * if the drain is being done by the worker thread.
272  */
273 #define	MAC_SRS_POLL_RING(mac_srs) {					\
274 	mac_srs_rx_t	*srs_rx = &(mac_srs)->srs_rx;			\
275 									\
276 	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));			\
277 	srs_rx->sr_poll_thr_sig++;					\
278 	if (((mac_srs)->srs_state & 					\
279 	    (SRS_POLLING_CAPAB|SRS_WORKER|SRS_GET_PKTS)) ==		\
280 		(SRS_WORKER|SRS_POLLING_CAPAB)) {			\
281 		(mac_srs)->srs_state |= SRS_GET_PKTS;			\
282 		cv_signal(&(mac_srs)->srs_cv);   			\
283 	} else {							\
284 		srs_rx->sr_poll_thr_busy++;				\
285 	}								\
286 }
287 
288 /*
289  * MAC_SRS_CHECK_BW_CONTROL
290  *
291  * Check to see if the next tick has started so we can reset the
292  * SRS_BW_ENFORCED flag and allow more packets to come into the
293  * system.
294  */
295 #define	MAC_SRS_CHECK_BW_CONTROL(mac_srs) {				\
296 	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));			\
297 	ASSERT(((mac_srs)->srs_type & SRST_TX) ||			\
298 	    MUTEX_HELD(&(mac_srs)->srs_bw->mac_bw_lock));		\
299 	clock_t now = ddi_get_lbolt();					\
300 	if ((mac_srs)->srs_bw->mac_bw_curr_time != now) {		\
301 		(mac_srs)->srs_bw->mac_bw_curr_time = now;		\
302 		(mac_srs)->srs_bw->mac_bw_used = 0;	       		\
303 		if ((mac_srs)->srs_bw->mac_bw_state & SRS_BW_ENFORCED)	\
304 			(mac_srs)->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED; \
305 	}								\
306 }
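
/*
 * For illustration, the per-tick accounting this macro enables (a
 * sketch; the actual checks live in the drain routines): with the B/W
 * limit expressed as mac_bw_limit bytes per tick,
 *
 *	MAC_SRS_CHECK_BW_CONTROL(mac_srs);
 *	if ((mac_srs)->srs_bw->mac_bw_used + sz >
 *	    (mac_srs)->srs_bw->mac_bw_limit) {
 *		(mac_srs)->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
 *		... queue the chain and wait for the next tick ...
 *	} else {
 *		(mac_srs)->srs_bw->mac_bw_used += sz;
 *		... process the chain ...
 *	}
 */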
307 
308 /*
309  * MAC_SRS_WORKER_WAKEUP
310  *
311  * Wake up the SRS worker thread to process the queue as long as
312  * no one else is processing it. If we are optimizing for
313  * latency, we wake up the worker thread immediately; otherwise we
314  * wait mac_srs_worker_wakeup_ticks before the worker thread is
315  * woken up.
316  */
317 int mac_srs_worker_wakeup_ticks = 0;
318 #define	MAC_SRS_WORKER_WAKEUP(mac_srs) {				\
319 	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));			\
320 	if (!((mac_srs)->srs_state & SRS_PROC) &&			\
321 		(mac_srs)->srs_tid == NULL) {				\
322 		if (((mac_srs)->srs_state & SRS_LATENCY_OPT) ||		\
323 			(mac_srs_worker_wakeup_ticks == 0))		\
324 			cv_signal(&(mac_srs)->srs_async);		\
325 		else							\
326 			(mac_srs)->srs_tid =				\
327 				timeout(mac_srs_fire, (mac_srs),	\
328 					mac_srs_worker_wakeup_ticks);	\
329 	}								\
330 }
331 
332 #define	TX_BANDWIDTH_MODE(mac_srs)				\
333 	((mac_srs)->srs_tx.st_mode == SRS_TX_BW ||		\
334 	    (mac_srs)->srs_tx.st_mode == SRS_TX_BW_FANOUT ||	\
335 	    (mac_srs)->srs_tx.st_mode == SRS_TX_BW_AGGR)
336 
337 #define	TX_SRS_TO_SOFT_RING(mac_srs, head, hint) {			\
338 	if (tx_mode == SRS_TX_BW_FANOUT)				\
339 		(void) mac_tx_fanout_mode(mac_srs, head, hint, 0, NULL);\
340 	else								\
341 		(void) mac_tx_aggr_mode(mac_srs, head, hint, 0, NULL);	\
342 }
343 
344 /*
345  * MAC_TX_SRS_BLOCK
346  *
347  * Always called from the mac_tx_srs_drain() function. SRS_TX_BLOCKED
348  * will be set only if srs_tx_woken_up is FALSE. If
349  * srs_tx_woken_up is TRUE, it indicates that the wakeup arrived
350  * before we grabbed srs_lock to set SRS_TX_BLOCKED. We need to
351  * attempt to transmit again, and not setting SRS_TX_BLOCKED does
352  * that.
353  */
354 #define	MAC_TX_SRS_BLOCK(srs, mp)	{			\
355 	ASSERT(MUTEX_HELD(&(srs)->srs_lock));			\
356 	if ((srs)->srs_tx.st_woken_up) {			\
357 		(srs)->srs_tx.st_woken_up = B_FALSE;		\
358 	} else {						\
359 		ASSERT(!((srs)->srs_state & SRS_TX_BLOCKED));	\
360 		(srs)->srs_state |= SRS_TX_BLOCKED;		\
361 		(srs)->srs_tx.st_stat.mts_blockcnt++;		\
362 	}							\
363 }
364 
365 /*
366  * MAC_TX_SRS_TEST_HIWAT
367  *
368  * Called before queueing a packet onto Tx SRS to test and set
369  * SRS_TX_HIWAT if srs_count exceeds srs_tx_hiwat.
370  */
371 #define	MAC_TX_SRS_TEST_HIWAT(srs, mp, tail, cnt, sz, cookie) {		\
372 	boolean_t enqueue = 1;						\
373 									\
374 	if ((srs)->srs_count > (srs)->srs_tx.st_hiwat) {		\
375 		/*							\
376 		 * flow-controlled. Store srs in cookie so that it	\
377 		 * can be returned as mac_tx_cookie_t to client		\
378 		 */							\
379 		(srs)->srs_state |= SRS_TX_HIWAT;			\
380 		cookie = (mac_tx_cookie_t)srs;				\
381 		(srs)->srs_tx.st_hiwat_cnt++;				\
382 		if ((srs)->srs_count > (srs)->srs_tx.st_max_q_cnt) {	\
383 			/* increment freed stats */			\
384 			(srs)->srs_tx.st_stat.mts_sdrops += cnt;	\
385 			/*						\
386 			 * b_prev may be set to the fanout hint		\
387 			 * hence can't use freemsg directly		\
388 			 */						\
389 			mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);	\
390 			DTRACE_PROBE1(tx_queued_hiwat,			\
391 			    mac_soft_ring_set_t *, srs);		\
392 			enqueue = 0;					\
393 		}							\
394 	}								\
395 	if (enqueue)							\
396 		MAC_TX_SRS_ENQUEUE_CHAIN(srs, mp, tail, cnt, sz);	\
397 }
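
/*
 * For illustration, what a non-NULL cookie means to a MAC client (a
 * sketch; exact client behaviour varies):
 *
 *	cookie = mac_tx(mch, mp_chain, hint, MAC_TX_NO_ENQUEUE, &ret_mp);
 *	if (cookie != NULL)
 *		... flow controlled: hold off transmitting until ...
 *		... notified that the Tx path has unblocked ...
 */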
398 
399 /* Some utility macros */
400 #define	MAC_SRS_BW_LOCK(srs)						\
401 	if (!(srs->srs_type & SRST_TX))					\
402 		mutex_enter(&srs->srs_bw->mac_bw_lock);
403 
404 #define	MAC_SRS_BW_UNLOCK(srs)						\
405 	if (!(srs->srs_type & SRST_TX))					\
406 		mutex_exit(&srs->srs_bw->mac_bw_lock);
407 
408 #define	MAC_TX_SRS_DROP_MESSAGE(srs, mp, cookie) {		\
409 	mac_pkt_drop(NULL, NULL, mp, B_FALSE);			\
410 	/* increment freed stats */				\
411 	mac_srs->srs_tx.st_stat.mts_sdrops++;			\
412 	cookie = (mac_tx_cookie_t)srs;				\
413 }
414 
415 #define	MAC_TX_SET_NO_ENQUEUE(srs, mp_chain, ret_mp, cookie) {		\
416 	mac_srs->srs_state |= SRS_TX_WAKEUP_CLIENT;			\
417 	cookie = (mac_tx_cookie_t)srs;					\
418 	*ret_mp = mp_chain;						\
419 }
420 
421 /*
422  * MAC_RX_SRS_TOODEEP
423  *
424  * Macro called as part of receive-side processing to determine if handling
425  * can occur in situ (in the interrupt thread) or if it should be left to a
426  * worker thread.  Note that the constant used to make this determination is
427  * not entirely made-up, and is a result of some empirical validation. That
428  * said, the constant is left as a static variable to allow it to be
429  * dynamically tuned in the field if and as needed.
430  */
431 static uintptr_t mac_rx_srs_stack_needed = 10240;
432 static uint_t mac_rx_srs_stack_toodeep;
433 
434 #ifndef STACK_GROWTH_DOWN
435 #error Downward stack growth assumed.
436 #endif
437 
438 #define	MAC_RX_SRS_TOODEEP() (STACK_BIAS + (uintptr_t)getfp() - \
439 	(uintptr_t)curthread->t_stkbase < mac_rx_srs_stack_needed && \
440 	++mac_rx_srs_stack_toodeep)
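
/*
 * For illustration, the intended use of the check above (a sketch of
 * the pattern, not a verbatim excerpt from the Rx path):
 *
 *	if (MAC_RX_SRS_TOODEEP()) {
 *		... too little stack left: queue the chain and let ...
 *		... the SRS worker thread process it instead ...
 *	}
 */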
441 
442 
443 /*
444  * Drop the rx packet and advance to the next one in the chain.
445  */
446 static void
447 mac_rx_drop_pkt(mac_soft_ring_set_t *srs, mblk_t *mp)
448 {
449 	mac_srs_rx_t	*srs_rx = &srs->srs_rx;
450 
451 	ASSERT(mp->b_next == NULL);
452 	mutex_enter(&srs->srs_lock);
453 	MAC_UPDATE_SRS_COUNT_LOCKED(srs, 1);
454 	MAC_UPDATE_SRS_SIZE_LOCKED(srs, msgdsize(mp));
455 	mutex_exit(&srs->srs_lock);
456 
457 	srs_rx->sr_stat.mrs_sdrops++;
458 	freemsg(mp);
459 }
460 
461 /* DATAPATH RUNTIME ROUTINES */
462 
463 /*
464  * mac_srs_fire
465  *
466  * Timer callback routine for waking up the SRS worker thread.
467  */
468 static void
469 mac_srs_fire(void *arg)
470 {
471 	mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)arg;
472 
473 	mutex_enter(&mac_srs->srs_lock);
474 	if (mac_srs->srs_tid == 0) {
475 		mutex_exit(&mac_srs->srs_lock);
476 		return;
477 	}
478 
479 	mac_srs->srs_tid = 0;
480 	if (!(mac_srs->srs_state & SRS_PROC))
481 		cv_signal(&mac_srs->srs_async);
482 
483 	mutex_exit(&mac_srs->srs_lock);
484 }
485 
486 /*
487  * 'hint' is the fanout_hint (a uint64_t) given to us by the TCP/IP stack;
488  * it is used on the Tx path.
489  */
490 #define	HASH_HINT(hint)	\
491 	((hint) ^ ((hint) >> 24) ^ ((hint) >> 16) ^ ((hint) >> 8))
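
/*
 * For illustration (a sketch): a Tx fanout caller spreads packets over
 * its soft rings with something like
 *
 *	idx = HASH_HINT(hint) % tx_soft_ring_count;
 *	softring = mac_srs->srs_tx_soft_rings[idx];
 *
 * where tx_soft_ring_count stands in for the Tx soft ring count (the
 * exact field is used in mac_tx_fanout_mode()).
 */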
492 
493 
494 /*
495  * hash based on the src address, dst address and the port information.
496  */
497 #define	HASH_ADDR(src, dst, ports)					\
498 	(ntohl((src) + (dst)) ^ ((ports) >> 24) ^ ((ports) >> 16) ^	\
499 	((ports) >> 8) ^ (ports))
500 
501 #define	COMPUTE_INDEX(key, sz)	(key % sz)
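
/*
 * For illustration, how HASH_ADDR and COMPUTE_INDEX combine for a v4
 * TCP packet in the fanout routines below (a sketch; 'ports' is the
 * 2x16-bit source/destination port pair read as a single uint32_t):
 *
 *	hash = HASH_ADDR(ipha->ipha_src, ipha->ipha_dst, ports);
 *	indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count);
 *
 * indx then selects mac_srs->srs_tcp_soft_rings[indx].
 */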
502 
503 #define	FANOUT_ENQUEUE_MP(head, tail, cnt, bw_ctl, sz, sz0, mp) {	\
504 	if ((tail) != NULL) {						\
505 		ASSERT((tail)->b_next == NULL);				\
506 		(tail)->b_next = (mp);					\
507 	} else {							\
508 		ASSERT((head) == NULL);					\
509 		(head) = (mp);						\
510 	}								\
511 	(tail) = (mp);							\
512 	(cnt)++;							\
513 	if ((bw_ctl))							\
514 		(sz) += (sz0);						\
515 }
516 
517 #define	MAC_FANOUT_DEFAULT	0
518 #define	MAC_FANOUT_RND_ROBIN	1
519 int mac_fanout_type = MAC_FANOUT_DEFAULT;
520 
521 #define	MAX_SR_TYPES	3
522 /* fanout types for port based hashing */
523 enum pkt_type {
524 	V4_TCP = 0,
525 	V4_UDP,
526 	OTH,
527 	UNDEF
528 };
529 
530 /*
531  * Pair of local and remote ports in the transport header
532  */
533 #define	PORTS_SIZE 4
534 
535 /*
536  * mac_rx_srs_proto_fanout
537  *
538  * This routine delivers packets destined to an SRS into one of the
539  * protocol soft rings.
540  *
541  * Given a chain of packets we need to split it up into multiple sub chains
542  * destined for the TCP, UDP or OTH soft ring. Instead of entering
543  * the soft ring one packet at a time, we want to enter it in the form of a
544  * chain; otherwise we get a start/stop behaviour where the worker thread
545  * goes to sleep and then the next packet comes in, forcing it to wake up, etc.
546  */
547 static void
548 mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
549 {
550 	struct ether_header		*ehp;
551 	struct ether_vlan_header	*evhp;
552 	uint32_t			sap;
553 	ipha_t				*ipha;
554 	uint8_t				*dstaddr;
555 	size_t				hdrsize;
556 	mblk_t				*mp;
557 	mblk_t				*headmp[MAX_SR_TYPES];
558 	mblk_t				*tailmp[MAX_SR_TYPES];
559 	int				cnt[MAX_SR_TYPES];
560 	size_t				sz[MAX_SR_TYPES];
561 	size_t				sz1;
562 	boolean_t			bw_ctl;
563 	boolean_t			hw_classified;
564 	boolean_t			dls_bypass;
565 	boolean_t			is_ether;
566 	boolean_t			is_unicast;
567 	enum pkt_type			type;
568 	mac_client_impl_t		*mcip = mac_srs->srs_mcip;
569 
570 	is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER);
571 	bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0);
572 
573 	/*
574 	 * If we don't have a Rx ring, S/W classification would have done
575 	 * its job and it's a packet meant for us. If we were polling on
576 	 * the default ring (i.e. there was a ring assigned to this SRS),
577 	 * then we need to make sure that the mac address really belongs
578 	 * to us.
579 	 */
580 	hw_classified = mac_srs->srs_ring != NULL &&
581 	    mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER;
582 
583 	/*
584 	 * Special clients (eg. VLAN, non ether, etc) need DLS
585 	 * processing in the Rx path. SRST_DLS_BYPASS will be clear for
586 	 * such SRSs. Another way of disabling bypass is to set the
587 	 * MCIS_RX_BYPASS_DISABLE flag.
588 	 */
589 	dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) &&
590 	    ((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0);
591 
592 	bzero(headmp, MAX_SR_TYPES * sizeof (mblk_t *));
593 	bzero(tailmp, MAX_SR_TYPES * sizeof (mblk_t *));
594 	bzero(cnt, MAX_SR_TYPES * sizeof (int));
595 	bzero(sz, MAX_SR_TYPES * sizeof (size_t));
596 
597 	/*
598 	 * We got a chain from SRS that we need to send to the soft rings.
599 	 * Since squeues for TCP & IPv4 sap poll their soft rings (for
600 	 * performance reasons), we need to separate out v4_tcp and v4_udp;
601 	 * the rest goes into other.
602 	 */
603 	while (head != NULL) {
604 		mp = head;
605 		head = head->b_next;
606 		mp->b_next = NULL;
607 
608 		type = OTH;
609 		sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);
610 
611 		if (is_ether) {
612 			/*
613 			 * At this point we can be sure the packet at least
614 			 * has an ether header.
615 			 */
616 			if (sz1 < sizeof (struct ether_header)) {
617 				mac_rx_drop_pkt(mac_srs, mp);
618 				continue;
619 			}
620 			ehp = (struct ether_header *)mp->b_rptr;
621 
622 			/*
623 			 * Determine if this is a VLAN or non-VLAN packet.
624 			 */
625 			if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) {
626 				evhp = (struct ether_vlan_header *)mp->b_rptr;
627 				sap = ntohs(evhp->ether_type);
628 				hdrsize = sizeof (struct ether_vlan_header);
629 				/*
630 				 * Check if the VID of the packet, if any,
631 				 * belongs to this client.
632 				 */
633 				if (!mac_client_check_flow_vid(mcip,
634 				    VLAN_ID(ntohs(evhp->ether_tci)))) {
635 					mac_rx_drop_pkt(mac_srs, mp);
636 					continue;
637 				}
638 			} else {
639 				hdrsize = sizeof (struct ether_header);
640 			}
641 			is_unicast =
642 			    ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0);
643 			dstaddr = (uint8_t *)&ehp->ether_dhost;
644 		} else {
645 			mac_header_info_t		mhi;
646 
647 			if (mac_header_info((mac_handle_t)mcip->mci_mip,
648 			    mp, &mhi) != 0) {
649 				mac_rx_drop_pkt(mac_srs, mp);
650 				continue;
651 			}
652 			hdrsize = mhi.mhi_hdrsize;
653 			sap = mhi.mhi_bindsap;
654 			is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST);
655 			dstaddr = (uint8_t *)mhi.mhi_daddr;
656 		}
657 
658 		if (!dls_bypass) {
659 			FANOUT_ENQUEUE_MP(headmp[type], tailmp[type],
660 			    cnt[type], bw_ctl, sz[type], sz1, mp);
661 			continue;
662 		}
663 
664 		if (sap == ETHERTYPE_IP) {
665 			/*
666 			 * If we are H/W classified, but we have promisc
667 			 * on, then we need to check for the unicast address.
668 			 */
669 			if (hw_classified && mcip->mci_promisc_list != NULL) {
670 				mac_address_t		*map;
671 
672 				rw_enter(&mcip->mci_rw_lock, RW_READER);
673 				map = mcip->mci_unicast;
674 				if (bcmp(dstaddr, map->ma_addr,
675 				    map->ma_len) == 0)
676 					type = UNDEF;
677 				rw_exit(&mcip->mci_rw_lock);
678 			} else if (is_unicast) {
679 				type = UNDEF;
680 			}
681 		}
682 
683 		/*
684 		 * This needs to become a contract with the driver for
685 		 * the fast path.
686 		 *
687 		 * In the normal case the packet will have at least the L2
688 		 * header and the IP + Transport header in the same mblk.
689 		 * This is usually the case when the NIC driver sends up
690 		 * the packet. This is also true when the stack generates
691 		 * a packet that is looped back and when the stack uses the
692 		 * fastpath mechanism. The normal case is optimized for
693 		 * performance and may bypass DLS. All other cases go through
694 		 * the 'OTH' type path without DLS bypass.
695 		 */
696 
697 		ipha = (ipha_t *)(mp->b_rptr + hdrsize);
698 		if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha))
699 			type = OTH;
700 
701 		if (type == OTH) {
702 			FANOUT_ENQUEUE_MP(headmp[type], tailmp[type],
703 			    cnt[type], bw_ctl, sz[type], sz1, mp);
704 			continue;
705 		}
706 
707 		ASSERT(type == UNDEF);
708 		/*
709 		 * We look for at least 4 bytes past the IP header to get
710 		 * the port information. If we get an IP fragment, we don't
711 		 * have the port information, and we use just the protocol
712 		 * information.
713 		 */
714 		switch (ipha->ipha_protocol) {
715 		case IPPROTO_TCP:
716 			type = V4_TCP;
717 			mp->b_rptr += hdrsize;
718 			break;
719 		case IPPROTO_UDP:
720 			type = V4_UDP;
721 			mp->b_rptr += hdrsize;
722 			break;
723 		default:
724 			type = OTH;
725 			break;
726 		}
727 
728 		FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], cnt[type],
729 		    bw_ctl, sz[type], sz1, mp);
730 	}
731 
732 	for (type = V4_TCP; type < UNDEF; type++) {
733 		if (headmp[type] != NULL) {
734 			mac_soft_ring_t			*softring;
735 
736 			ASSERT(tailmp[type]->b_next == NULL);
737 			switch (type) {
738 			case V4_TCP:
739 				softring = mac_srs->srs_tcp_soft_rings[0];
740 				break;
741 			case V4_UDP:
742 				softring = mac_srs->srs_udp_soft_rings[0];
743 				break;
744 			case OTH:
745 				softring = mac_srs->srs_oth_soft_rings[0];
746 			}
747 			mac_rx_soft_ring_process(mcip, softring,
748 			    headmp[type], tailmp[type], cnt[type], sz[type]);
749 		}
750 	}
751 }
752 
753 int	fanout_unaligned = 0;
754 
755 /*
756  * mac_rx_srs_long_fanout
757  *
758  * The fanout routine for VLANs, and for anything else that isn't performing
759  * explicit dls bypass.  Returns -1 on an error (drop the packet due to a
760  * malformed packet), 0 on success, with values written in *indx and *type.
761  */
762 static int
763 mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp,
764     uint32_t sap, size_t hdrsize, enum pkt_type *type, uint_t *indx)
765 {
766 	ip6_t		*ip6h;
767 	ipha_t		*ipha;
768 	uint8_t		*whereptr;
769 	uint_t		hash;
770 	uint16_t	remlen;
771 	uint8_t		nexthdr;
772 	uint16_t	hdr_len;
773 	uint32_t	src_val, dst_val;
774 	boolean_t	modifiable = B_TRUE;
775 	boolean_t	v6;
776 
777 	ASSERT(MBLKL(mp) >= hdrsize);
778 
779 	if (sap == ETHERTYPE_IPV6) {
780 		v6 = B_TRUE;
781 		hdr_len = IPV6_HDR_LEN;
782 	} else if (sap == ETHERTYPE_IP) {
783 		v6 = B_FALSE;
784 		hdr_len = IP_SIMPLE_HDR_LENGTH;
785 	} else {
786 		*indx = 0;
787 		*type = OTH;
788 		return (0);
789 	}
790 
791 	ip6h = (ip6_t *)(mp->b_rptr + hdrsize);
792 	ipha = (ipha_t *)ip6h;
793 
794 	if ((uint8_t *)ip6h == mp->b_wptr) {
795 		/*
796 		 * The first mblk_t only includes the mac header.
797 		 * Note that it is safe to change the mp pointer here,
798 		 * as the subsequent operation does not assume mp
799 		 * points to the start of the mac header.
800 		 */
801 		mp = mp->b_cont;
802 
803 		/*
804 		 * Make sure the IP header points to an entire one.
805 		 */
806 		if (mp == NULL)
807 			return (-1);
808 
809 		if (MBLKL(mp) < hdr_len) {
810 			modifiable = (DB_REF(mp) == 1);
811 
812 			if (modifiable && !pullupmsg(mp, hdr_len))
813 				return (-1);
814 		}
815 
816 		ip6h = (ip6_t *)mp->b_rptr;
817 		ipha = (ipha_t *)ip6h;
818 	}
819 
820 	if (!modifiable || !(OK_32PTR((char *)ip6h)) ||
821 	    ((uint8_t *)ip6h + hdr_len > mp->b_wptr)) {
822 		/*
823 		 * If either the IP header is not aligned, or it does not hold
824 		 * the complete simple structure (a pullupmsg() is not an
825 		 * option since it would result in an unaligned IP header),
826 		 * fanout to the default ring.
827 		 *
828 		 * Note that this may cause packet reordering.
829 		 */
830 		*indx = 0;
831 		*type = OTH;
832 		fanout_unaligned++;
833 		return (0);
834 	}
835 
836 	/*
837 	 * Extract next-header, full header length, and source-hash value
838 	 * using v4/v6 specific fields.
839 	 */
840 	if (v6) {
841 		remlen = ntohs(ip6h->ip6_plen);
842 		nexthdr = ip6h->ip6_nxt;
843 		src_val = V4_PART_OF_V6(ip6h->ip6_src);
844 		dst_val = V4_PART_OF_V6(ip6h->ip6_dst);
845 		/*
846 		 * Do src/dst based fanout when mac_ip_hdr_length_v6()
847 		 * fails because of a malformed packet or because mblks
848 		 * would need to be concatenated using pullupmsg() to
849 		 * locate the transport header.
850 		 */
851 		if (!mac_ip_hdr_length_v6(ip6h, mp->b_wptr, &hdr_len, &nexthdr,
852 		    NULL)) {
853 			goto src_dst_based_fanout;
854 		}
855 	} else {
856 		hdr_len = IPH_HDR_LENGTH(ipha);
857 		remlen = ntohs(ipha->ipha_length) - hdr_len;
858 		nexthdr = ipha->ipha_protocol;
859 		src_val = (uint32_t)ipha->ipha_src;
860 		dst_val = (uint32_t)ipha->ipha_dst;
861 		/*
862 		 * Catch IPv4 fragment case here.  IPv6 has nexthdr == FRAG
863 		 * for its equivalent case.
864 		 */
865 		if ((ntohs(ipha->ipha_fragment_offset_and_flags) &
866 		    (IPH_MF | IPH_OFFSET)) != 0) {
867 			goto src_dst_based_fanout;
868 		}
869 	}
870 	if (remlen < MIN_EHDR_LEN)
871 		return (-1);
872 	whereptr = (uint8_t *)ip6h + hdr_len;
873 
874 	/* If the transport is one of the below, we do port/SPI based fanout */
875 	switch (nexthdr) {
876 	case IPPROTO_TCP:
877 	case IPPROTO_UDP:
878 	case IPPROTO_SCTP:
879 	case IPPROTO_ESP:
880 		/*
881 		 * If the ports or SPI in the transport header are not part of
882 		 * the mblk, do src/dst based fanout instead of calling
883 		 * pullupmsg().
884 		 */
885 		if (mp->b_cont == NULL || whereptr + PORTS_SIZE <= mp->b_wptr)
886 			break;	/* out of switch... */
887 		/* FALLTHRU */
888 	default:
889 		goto src_dst_based_fanout;
890 	}
891 
892 	switch (nexthdr) {
893 	case IPPROTO_TCP:
894 		hash = HASH_ADDR(src_val, dst_val, *(uint32_t *)whereptr);
895 		*indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count);
896 		*type = OTH;
897 		break;
898 	case IPPROTO_UDP:
899 	case IPPROTO_SCTP:
900 	case IPPROTO_ESP:
901 		if (mac_fanout_type == MAC_FANOUT_DEFAULT) {
902 			hash = HASH_ADDR(src_val, dst_val,
903 			    *(uint32_t *)whereptr);
904 			*indx = COMPUTE_INDEX(hash,
905 			    mac_srs->srs_udp_ring_count);
906 		} else {
907 			*indx = mac_srs->srs_ind % mac_srs->srs_udp_ring_count;
908 			mac_srs->srs_ind++;
909 		}
910 		*type = OTH;
911 		break;
912 	}
913 	return (0);
914 
915 src_dst_based_fanout:
916 	hash = HASH_ADDR(src_val, dst_val, (uint32_t)0);
917 	*indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);
918 	*type = OTH;
919 	return (0);
920 }
921 
922 /*
923  * mac_rx_srs_fanout
924  *
925  * This routine delivers packets destined to an SRS into a soft ring member
926  * of the set.
927  *
928  * Given a chain of packets we need to split it up into multiple sub chains
929  * destined for one of the TCP, UDP or OTH soft rings. Instead of entering
930  * the soft ring one packet at a time, we want to enter it in the form of a
931  * chain; otherwise we get a start/stop behaviour where the worker thread
932  * goes to sleep and then the next packet comes in, forcing it to wake up, etc.
933  *
934  * Note:
935  * Since we know the maximum fanout possible, we create a 2D array
936  * of 'softring types * MAX_SR_FANOUT' for the head, tail, cnt and sz
937  * variables so that we can enter the softrings with a chain. We need
938  * MAX_SR_FANOUT so we can allocate the arrays on the stack (a kmem_alloc
939  * for each packet would be expensive). If we ever want the
940  * ability to do unlimited fanout, we should probably declare a head,
941  * tail, cnt, sz with each soft ring (a data struct which contains a softring
942  * along with these members) and create an array of this uber struct so we
943  * don't have to do kmem_alloc.
944  */
945 int	fanout_oth1 = 0;
946 int	fanout_oth2 = 0;
947 int	fanout_oth3 = 0;
948 int	fanout_oth4 = 0;
949 int	fanout_oth5 = 0;
950 
951 static void
952 mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
953 {
954 	struct ether_header		*ehp;
955 	struct ether_vlan_header	*evhp;
956 	uint32_t			sap;
957 	ipha_t				*ipha;
958 	uint8_t				*dstaddr;
959 	uint_t				indx;
960 	size_t				ports_offset;
961 	size_t				ipha_len;
962 	size_t				hdrsize;
963 	uint_t				hash;
964 	mblk_t				*mp;
965 	mblk_t				*headmp[MAX_SR_TYPES][MAX_SR_FANOUT];
966 	mblk_t				*tailmp[MAX_SR_TYPES][MAX_SR_FANOUT];
967 	int				cnt[MAX_SR_TYPES][MAX_SR_FANOUT];
968 	size_t				sz[MAX_SR_TYPES][MAX_SR_FANOUT];
969 	size_t				sz1;
970 	boolean_t			bw_ctl;
971 	boolean_t			hw_classified;
972 	boolean_t			dls_bypass;
973 	boolean_t			is_ether;
974 	boolean_t			is_unicast;
975 	int				fanout_cnt;
976 	enum pkt_type			type;
977 	mac_client_impl_t		*mcip = mac_srs->srs_mcip;
978 
979 	is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER);
980 	bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0);
981 
982 	/*
983 	 * If we don't have a Rx ring, S/W classification would have done
984 	 * its job and it's a packet meant for us. If we were polling on
985 	 * the default ring (i.e. there was a ring assigned to this SRS),
986 	 * then we need to make sure that the mac address really belongs
987 	 * to us.
988 	 */
989 	hw_classified = mac_srs->srs_ring != NULL &&
990 	    mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER;
991 
992 	/*
993 	 * Special clients (eg. VLAN, non ether, etc) need DLS
994 	 * processing in the Rx path. SRST_DLS_BYPASS will be clear for
995 	 * such SRSs. Another way of disabling bypass is to set the
996 	 * MCIS_RX_BYPASS_DISABLE flag.
997 	 */
998 	dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) &&
999 	    ((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0);
1000 
1001 	/*
1002 	 * Since the softrings are never destroyed and we always
1003 	 * create an equal number of softrings for TCP, UDP and the rest,
1004 	 * it's OK to check one of them for the count and use it without
1005 	 * any lock. In the future, if soft rings get destroyed because
1006 	 * of a reduction in fanout, we will need to ensure that happens
1007 	 * behind SRS_PROC.
1008 	 */
1009 	fanout_cnt = mac_srs->srs_tcp_ring_count;
1010 
1011 	bzero(headmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *));
1012 	bzero(tailmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *));
1013 	bzero(cnt, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (int));
1014 	bzero(sz, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (size_t));
1015 
1016 	/*
1017 	 * We got a chain from SRS that we need to send to the soft rings.
1018 	 * Since squeues for TCP & IPv4 sap poll their soft rings (for
1019 	 * performance reasons), we need to separate out v4_tcp and v4_udp;
1020 	 * the rest goes into other.
1021 	 */
1022 	while (head != NULL) {
1023 		mp = head;
1024 		head = head->b_next;
1025 		mp->b_next = NULL;
1026 
1027 		type = OTH;
1028 		sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);
1029 
1030 		if (is_ether) {
1031 			/*
1032 			 * At this point we can be sure the packet at least
1033 			 * has an ether header.
1034 			 */
1035 			if (sz1 < sizeof (struct ether_header)) {
1036 				mac_rx_drop_pkt(mac_srs, mp);
1037 				continue;
1038 			}
1039 			ehp = (struct ether_header *)mp->b_rptr;
1040 
1041 			/*
1042 			 * Determine if this is a VLAN or non-VLAN packet.
1043 			 */
1044 			if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) {
1045 				evhp = (struct ether_vlan_header *)mp->b_rptr;
1046 				sap = ntohs(evhp->ether_type);
1047 				hdrsize = sizeof (struct ether_vlan_header);
1048 				/*
1049 				 * Check if the VID of the packet, if any,
1050 				 * belongs to this client.
1051 				 */
1052 				if (!mac_client_check_flow_vid(mcip,
1053 				    VLAN_ID(ntohs(evhp->ether_tci)))) {
1054 					mac_rx_drop_pkt(mac_srs, mp);
1055 					continue;
1056 				}
1057 			} else {
1058 				hdrsize = sizeof (struct ether_header);
1059 			}
1060 			is_unicast =
1061 			    ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0);
1062 			dstaddr = (uint8_t *)&ehp->ether_dhost;
1063 		} else {
1064 			mac_header_info_t		mhi;
1065 
1066 			if (mac_header_info((mac_handle_t)mcip->mci_mip,
1067 			    mp, &mhi) != 0) {
1068 				mac_rx_drop_pkt(mac_srs, mp);
1069 				continue;
1070 			}
1071 			hdrsize = mhi.mhi_hdrsize;
1072 			sap = mhi.mhi_bindsap;
1073 			is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST);
1074 			dstaddr = (uint8_t *)mhi.mhi_daddr;
1075 		}
1076 
1077 		if (!dls_bypass) {
1078 			if (mac_rx_srs_long_fanout(mac_srs, mp, sap,
1079 			    hdrsize, &type, &indx) == -1) {
1080 				mac_rx_drop_pkt(mac_srs, mp);
1081 				continue;
1082 			}
1083 
1084 			FANOUT_ENQUEUE_MP(headmp[type][indx],
1085 			    tailmp[type][indx], cnt[type][indx], bw_ctl,
1086 			    sz[type][indx], sz1, mp);
1087 			continue;
1088 		}
1089 
1090 
1091 		/*
1092 		 * If we are using the default Rx ring where H/W or S/W
1093 		 * classification has not happened, we need to verify if
1094 		 * this unicast packet really belongs to us.
1095 		 */
1096 		if (sap == ETHERTYPE_IP) {
1097 			/*
1098 			 * If we are H/W classified, but we have promisc
1099 			 * on, then we need to check for the unicast address.
1100 			 */
1101 			if (hw_classified && mcip->mci_promisc_list != NULL) {
1102 				mac_address_t		*map;
1103 
1104 				rw_enter(&mcip->mci_rw_lock, RW_READER);
1105 				map = mcip->mci_unicast;
1106 				if (bcmp(dstaddr, map->ma_addr,
1107 				    map->ma_len) == 0)
1108 					type = UNDEF;
1109 				rw_exit(&mcip->mci_rw_lock);
1110 			} else if (is_unicast) {
1111 				type = UNDEF;
1112 			}
1113 		}
1114 
1115 		/*
1116 		 * This needs to become a contract with the driver for
1117 		 * the fast path.
1118 		 */
1119 
1120 		ipha = (ipha_t *)(mp->b_rptr + hdrsize);
1121 		if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha)) {
1122 			type = OTH;
1123 			fanout_oth1++;
1124 		}
1125 
1126 		if (type != OTH) {
1127 			uint16_t	frag_offset_flags;
1128 
1129 			switch (ipha->ipha_protocol) {
1130 			case IPPROTO_TCP:
1131 			case IPPROTO_UDP:
1132 			case IPPROTO_SCTP:
1133 			case IPPROTO_ESP:
1134 				ipha_len = IPH_HDR_LENGTH(ipha);
1135 				if ((uchar_t *)ipha + ipha_len + PORTS_SIZE >
1136 				    mp->b_wptr) {
1137 					type = OTH;
1138 					break;
1139 				}
1140 				frag_offset_flags =
1141 				    ntohs(ipha->ipha_fragment_offset_and_flags);
1142 				if ((frag_offset_flags &
1143 				    (IPH_MF | IPH_OFFSET)) != 0) {
1144 					type = OTH;
1145 					fanout_oth3++;
1146 					break;
1147 				}
1148 				ports_offset = hdrsize + ipha_len;
1149 				break;
1150 			default:
1151 				type = OTH;
1152 				fanout_oth4++;
1153 				break;
1154 			}
1155 		}
1156 
1157 		if (type == OTH) {
1158 			if (mac_rx_srs_long_fanout(mac_srs, mp, sap,
1159 			    hdrsize, &type, &indx) == -1) {
1160 				mac_rx_drop_pkt(mac_srs, mp);
1161 				continue;
1162 			}
1163 
1164 			FANOUT_ENQUEUE_MP(headmp[type][indx],
1165 			    tailmp[type][indx], cnt[type][indx], bw_ctl,
1166 			    sz[type][indx], sz1, mp);
1167 			continue;
1168 		}
1169 
1170 		ASSERT(type == UNDEF);
1171 
1172 		/*
1173 		 * XXX-Sunay: We should hold srs_lock since ring_count
1174 		 * below can change. But if we are always called from
1175 		 * mac_rx_srs_drain and SRS_PROC is set, then we can
1176 		 * enforce that ring_count can't be changed i.e.
1177 		 * to change fanout type or ring count, the calling
1178 		 * thread needs to be behind SRS_PROC.
1179 		 */
1180 		switch (ipha->ipha_protocol) {
1181 		case IPPROTO_TCP:
1182 			/*
1183 			 * Note that for ESP, we fanout on SPI and it is at the
1184 			 * same offset as the 2x16-bit ports. So it is clumped
1185 			 * along with TCP, UDP and SCTP.
1186 			 */
1187 			hash = HASH_ADDR(ipha->ipha_src, ipha->ipha_dst,
1188 			    *(uint32_t *)(mp->b_rptr + ports_offset));
1189 			indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count);
1190 			type = V4_TCP;
1191 			mp->b_rptr += hdrsize;
1192 			break;
1193 		case IPPROTO_UDP:
1194 		case IPPROTO_SCTP:
1195 		case IPPROTO_ESP:
1196 			if (mac_fanout_type == MAC_FANOUT_DEFAULT) {
1197 				hash = HASH_ADDR(ipha->ipha_src, ipha->ipha_dst,
1198 				    *(uint32_t *)(mp->b_rptr + ports_offset));
1199 				indx = COMPUTE_INDEX(hash,
1200 				    mac_srs->srs_udp_ring_count);
1201 			} else {
1202 				indx = mac_srs->srs_ind %
1203 				    mac_srs->srs_udp_ring_count;
1204 				mac_srs->srs_ind++;
1205 			}
1206 			type = V4_UDP;
1207 			mp->b_rptr += hdrsize;
1208 			break;
1209 		default:
1210 			indx = 0;
1211 			type = OTH;
1212 		}
1213 
1214 		FANOUT_ENQUEUE_MP(headmp[type][indx], tailmp[type][indx],
1215 		    cnt[type][indx], bw_ctl, sz[type][indx], sz1, mp);
1216 	}
1217 
1218 	for (type = V4_TCP; type < UNDEF; type++) {
1219 		int	i;
1220 
1221 		for (i = 0; i < fanout_cnt; i++) {
1222 			if (headmp[type][i] != NULL) {
1223 				mac_soft_ring_t	*softring;
1224 
1225 				ASSERT(tailmp[type][i]->b_next == NULL);
1226 				switch (type) {
1227 				case V4_TCP:
1228 					softring =
1229 					    mac_srs->srs_tcp_soft_rings[i];
1230 					break;
1231 				case V4_UDP:
1232 					softring =
1233 					    mac_srs->srs_udp_soft_rings[i];
1234 					break;
1235 				case OTH:
1236 					softring =
1237 					    mac_srs->srs_oth_soft_rings[i];
1238 					break;
1239 				}
1240 				mac_rx_soft_ring_process(mcip,
1241 				    softring, headmp[type][i], tailmp[type][i],
1242 				    cnt[type][i], sz[type][i]);
1243 			}
1244 		}
1245 	}
1246 }
1247 
1248 #define	SRS_BYTES_TO_PICKUP	150000
1249 ssize_t	max_bytes_to_pickup = SRS_BYTES_TO_PICKUP;
1250 
1251 /*
1252  * mac_rx_srs_poll_ring
1253  *
1254  * The SRS poll thread uses this routine to poll the underlying hardware
1255  * Rx ring to get a chain of packets. It can process that chain inline
1256  * if mac_latency_optimize is set (the default) or signal the SRS worker
1257  * thread to do the remaining processing.
1258  *
1259  * Since packets come into the system via the interrupt or poll path, we also
1260  * update the stats and deal with promiscuous clients here.
1261  */
1262 void
1263 mac_rx_srs_poll_ring(mac_soft_ring_set_t *mac_srs)
1264 {
1265 	kmutex_t 		*lock = &mac_srs->srs_lock;
1266 	kcondvar_t 		*async = &mac_srs->srs_cv;
1267 	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;
1268 	mblk_t 			*head, *tail, *mp;
1269 	callb_cpr_t 		cprinfo;
1270 	ssize_t 		bytes_to_pickup;
1271 	size_t 			sz;
1272 	int			count;
1273 	mac_client_impl_t	*smcip;
1274 
1275 	CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "mac_srs_poll");
1276 	mutex_enter(lock);
1277 
1278 start:
1279 	for (;;) {
1280 		if (mac_srs->srs_state & SRS_PAUSE)
1281 			goto done;
1282 
1283 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
1284 		cv_wait(async, lock);
1285 		CALLB_CPR_SAFE_END(&cprinfo, lock);
1286 
1287 		if (mac_srs->srs_state & SRS_PAUSE)
1288 			goto done;
1289 
1290 check_again:
1291 		if (mac_srs->srs_type & SRST_BW_CONTROL) {
1292 			/*
1293 			 * We pick as many bytes as we are allowed to queue.
1294 			 * It's possible that we will exceed the total
1295 			 * packets queued in case this SRS is part of an
1296 			 * Rx ring group, since > 1 poll thread can be pulling
1297 			 * up to the max allowed packets at the same time,
1298 			 * but that should be OK.
1299 			 */
1300 			mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1301 			bytes_to_pickup =
1302 			    mac_srs->srs_bw->mac_bw_drop_threshold -
1303 			    mac_srs->srs_bw->mac_bw_sz;
1304 			/*
1305 			 * We shouldn't have been signalled if we
1306 			 * have 0 or fewer bytes to pick, but since
1307 			 * some of the byte accounting is driver
1308 			 * dependent, we do the safety check.
1309 			 */
1310 			if (bytes_to_pickup < 0)
1311 				bytes_to_pickup = 0;
1312 			mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1313 		} else {
1314 			/*
1315 			 * TODO: Need to change the polling API
1316 			 * to add a packet count and a flag which
1317 			 * tells the driver whether we want packets
1318 			 * based on a count, or bytes, or all the
1319 			 * packets queued in the driver/HW. This
1320 			 * way, we never have to check the limits
1321 			 * on poll path. We truly let only as many
1322 			 * packets enter the system as we are willing
1323 			 * to process or queue.
1324 			 *
1325 			 * Something along the lines of
1326 			 * pkts_to_pickup = mac_soft_ring_max_q_cnt -
1327 			 *	mac_srs->srs_poll_pkt_cnt
1328 			 */
1329 
1330 			/*
1331 			 * Since we are not doing B/W control, pick
1332 			 * as many packets as allowed.
1333 			 */
1334 			bytes_to_pickup = max_bytes_to_pickup;
1335 		}
1336 
1337 		/* Poll the underlying Hardware */
1338 		mutex_exit(lock);
1339 		head = MAC_HWRING_POLL(mac_srs->srs_ring, (int)bytes_to_pickup);
1340 		mutex_enter(lock);
1341 
1342 		ASSERT((mac_srs->srs_state & SRS_POLL_THR_OWNER) ==
1343 		    SRS_POLL_THR_OWNER);
1344 
1345 		mp = tail = head;
1346 		count = 0;
1347 		sz = 0;
1348 		while (mp != NULL) {
1349 			tail = mp;
1350 			sz += msgdsize(mp);
1351 			mp = mp->b_next;
1352 			count++;
1353 		}
1354 
1355 		if (head != NULL) {
1356 			tail->b_next = NULL;
1357 			smcip = mac_srs->srs_mcip;
1358 
1359 			SRS_RX_STAT_UPDATE(mac_srs, pollbytes, sz);
1360 			SRS_RX_STAT_UPDATE(mac_srs, pollcnt, count);
1361 
1362 			/*
1363 			 * If there are any promiscuous mode callbacks
1364 			 * defined for this MAC client, pass them a copy
1365 			 * if appropriate and also update the counters.
1366 			 */
1367 			if (smcip != NULL) {
1368 				if (smcip->mci_mip->mi_promisc_list != NULL) {
1369 					mutex_exit(lock);
1370 					mac_promisc_dispatch(smcip->mci_mip,
1371 					    head, NULL);
1372 					mutex_enter(lock);
1373 				}
1374 			}
1375 			if (mac_srs->srs_type & SRST_BW_CONTROL) {
1376 				mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1377 				mac_srs->srs_bw->mac_bw_polled += sz;
1378 				mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1379 			}
1380 			MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail,
1381 			    count, sz);
1382 			if (count <= 10)
1383 				srs_rx->sr_stat.mrs_chaincntundr10++;
1384 			else if (count > 10 && count <= 50)
1385 				srs_rx->sr_stat.mrs_chaincnt10to50++;
1386 			else
1387 				srs_rx->sr_stat.mrs_chaincntover50++;
1388 		}
1389 
1390 		/*
1391 		 * We are guaranteed that SRS_PROC will be set if we
1392 		 * are here. Also, the poll thread gets to run only if
1393 		 * the drain was being done by a worker thread, although
1394 		 * it's possible that the worker thread is still running
1395 		 * and the poll thread was sent down to keep the pipeline
1396 		 * going instead of doing a complete drain and then
1397 		 * trying to poll the NIC.
1398 		 *
1399 		 * So we need to check the SRS_WORKER flag to make sure
1400 		 * that the worker thread is not processing the queue
1401 		 * in parallel with us. The flags and conditions are
1402 		 * protected by the srs_lock to prevent any race. We
1403 		 * ensure that we don't drop the srs_lock from now
1404 		 * till the end, and similarly we don't drop the srs_lock
1405 		 * in mac_rx_srs_drain() till similar condition checks
1406 		 * are complete. mac_rx_srs_drain() needs to ensure
1407 		 * that the SRS_WORKER flag remains set as long as it's
1408 		 * processing the queue.
1409 		 */
1410 		if (!(mac_srs->srs_state & SRS_WORKER) &&
1411 		    (mac_srs->srs_first != NULL)) {
1412 			/*
1413 			 * We have packets to process and worker thread
1414 			 * is not running. Check to see if poll thread is
1415 			 * allowed to process.
1416 			 */
1417 			if (mac_srs->srs_state & SRS_LATENCY_OPT) {
1418 				mac_srs->srs_drain_func(mac_srs, SRS_POLL_PROC);
1419 				if (!(mac_srs->srs_state & SRS_PAUSE) &&
1420 				    srs_rx->sr_poll_pkt_cnt <=
1421 				    srs_rx->sr_lowat) {
1422 					srs_rx->sr_poll_again++;
1423 					goto check_again;
1424 				}
1425 				/*
1426 				 * We are already above the low water mark
1427 				 * so stay in polling mode, but there is no
1428 				 * need to poll. Once we dip below
1429 				 * the polling threshold, the processing
1430 				 * thread (soft ring) will signal us
1431 				 * to poll again (MAC_UPDATE_SRS_COUNT).
1432 				 */
1433 				srs_rx->sr_poll_drain_no_poll++;
1434 				mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS);
1435 				/*
1436 				 * In the B/W control case, it's possible
1437 				 * that the backlog built up due to the
1438 				 * B/W limit being reached and packets
1439 				 * are queued only in the SRS. In this case,
1440 				 * we should schedule the worker thread,
1441 				 * since no one else will wake us up.
1442 				 */
1443 				if ((mac_srs->srs_type & SRST_BW_CONTROL) &&
1444 				    (mac_srs->srs_tid == NULL)) {
1445 					mac_srs->srs_tid =
1446 					    timeout(mac_srs_fire, mac_srs, 1);
1447 					srs_rx->sr_poll_worker_wakeup++;
1448 				}
1449 			} else {
1450 				/*
1451 				 * Wakeup the worker thread for more processing.
1452 				 * We optimize for throughput in this case.
1453 				 */
1454 				mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS);
1455 				MAC_SRS_WORKER_WAKEUP(mac_srs);
1456 				srs_rx->sr_poll_sig_worker++;
1457 			}
1458 		} else if ((mac_srs->srs_first == NULL) &&
1459 		    !(mac_srs->srs_state & SRS_WORKER)) {
1460 			/*
1461 			 * There is nothing queued in the SRS and
1462 			 * no worker thread running, and we
1463 			 * didn't get anything from the H/W
1464 			 * either (head == NULL).
1465 			 */
1466 			ASSERT(head == NULL);
1467 			mac_srs->srs_state &=
1468 			    ~(SRS_PROC|SRS_GET_PKTS);
1469 
1470 			/*
1471 			 * If we have packets in the soft ring, don't allow
1472 			 * more packets to come into this SRS by keeping the
1473 			 * interrupts off but not polling the H/W. The
1474 			 * poll thread will get signaled as soon as
1475 			 * srs_poll_pkt_cnt dips below the poll threshold.
1476 			 */
1477 			if (srs_rx->sr_poll_pkt_cnt == 0) {
1478 				srs_rx->sr_poll_intr_enable++;
1479 				MAC_SRS_POLLING_OFF(mac_srs);
1480 			} else {
1481 				/*
1482 				 * We know nothing is queued in SRS
1483 				 * since we are here after checking
1484 				 * srs_first is NULL. The backlog
1485 				 * is entirely due to packets queued
1486 				 * in Soft ring which will wake us up
1487 				 * and get the interface out of polling
1488 				 * mode once the backlog dips below
1489 				 * sr_poll_thres.
1490 				 */
1491 				srs_rx->sr_poll_no_poll++;
1492 			}
1493 		} else {
1494 			/*
1495 			 * Worker thread is already running.
1496 			 * Nothing much to do. If the polling
1497 			 * was enabled, worker thread will deal
1498 			 * with that.
1499 			 */
1500 			mac_srs->srs_state &= ~SRS_GET_PKTS;
1501 			srs_rx->sr_poll_goto_sleep++;
1502 		}
1503 	}
1504 done:
1505 	mac_srs->srs_state |= SRS_POLL_THR_QUIESCED;
1506 	cv_signal(&mac_srs->srs_async);
1507 	/*
1508 	 * If this is a temporary quiesce then wait for the restart signal
1509 	 * from the srs worker. Then clear the flags and signal the srs worker
1510 	 * to ensure a positive handshake and go back to start.
1511 	 */
1512 	while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_POLL_THR_RESTART)))
1513 		cv_wait(async, lock);
1514 	if (mac_srs->srs_state & SRS_POLL_THR_RESTART) {
1515 		ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED));
1516 		mac_srs->srs_state &=
1517 		    ~(SRS_POLL_THR_QUIESCED | SRS_POLL_THR_RESTART);
1518 		cv_signal(&mac_srs->srs_async);
1519 		goto start;
1520 	} else {
1521 		mac_srs->srs_state |= SRS_POLL_THR_EXITED;
1522 		cv_signal(&mac_srs->srs_async);
1523 		CALLB_CPR_EXIT(&cprinfo);
1524 		thread_exit();
1525 	}
1526 }
1527 
1528 /*
1529  * mac_srs_pick_chain
1530  *
1531  * In the bandwidth control case, check how many packets can be processed
1532  * and return them in a subchain.
1533  */
1534 static mblk_t *
1535 mac_srs_pick_chain(mac_soft_ring_set_t *mac_srs, mblk_t **chain_tail,
1536     size_t *chain_sz, int *chain_cnt)
1537 {
1538 	mblk_t 			*head = NULL;
1539 	mblk_t 			*tail = NULL;
1540 	size_t			sz;
1541 	size_t 			tsz = 0;
1542 	int			cnt = 0;
1543 	mblk_t 			*mp;
1544 
1545 	ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
1546 	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1547 	if (((mac_srs->srs_bw->mac_bw_used + mac_srs->srs_size) <=
1548 	    mac_srs->srs_bw->mac_bw_limit) ||
1549 	    (mac_srs->srs_bw->mac_bw_limit == 0)) {
1550 		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1551 		head = mac_srs->srs_first;
1552 		mac_srs->srs_first = NULL;
1553 		*chain_tail = mac_srs->srs_last;
1554 		mac_srs->srs_last = NULL;
1555 		*chain_sz = mac_srs->srs_size;
1556 		*chain_cnt = mac_srs->srs_count;
1557 		mac_srs->srs_count = 0;
1558 		mac_srs->srs_size = 0;
1559 		return (head);
1560 	}
1561 
1562 	/*
1563 	 * Can't clear the entire backlog.
1564 	 * Need to find how many packets to pick
1565 	 */
1566 	ASSERT(MUTEX_HELD(&mac_srs->srs_bw->mac_bw_lock));
1567 	while ((mp = mac_srs->srs_first) != NULL) {
1568 		sz = msgdsize(mp);
1569 		if ((tsz + sz + mac_srs->srs_bw->mac_bw_used) >
1570 		    mac_srs->srs_bw->mac_bw_limit) {
1571 			if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED))
1572 				mac_srs->srs_bw->mac_bw_state |=
1573 				    SRS_BW_ENFORCED;
1574 			break;
1575 		}
1576 
1577 		/*
1578 		 * The _size & cnt is  decremented from the softrings
1579 		 * when they send up the packet for polling to work
1580 		 * properly.
1581 		 */
1582 		tsz += sz;
1583 		cnt++;
1584 		mac_srs->srs_count--;
1585 		mac_srs->srs_size -= sz;
1586 		if (tail != NULL)
1587 			tail->b_next = mp;
1588 		else
1589 			head = mp;
1590 		tail = mp;
1591 		mac_srs->srs_first = mac_srs->srs_first->b_next;
1592 	}
1593 	mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1594 	if (mac_srs->srs_first == NULL)
1595 		mac_srs->srs_last = NULL;
1596 
1597 	if (tail != NULL)
1598 		tail->b_next = NULL;
1599 	*chain_tail = tail;
1600 	*chain_cnt = cnt;
1601 	*chain_sz = tsz;
1602 
1603 	return (head);
1604 }
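
/*
 * For illustration, how a bandwidth-mode drain consumes the subchain
 * (a sketch; the real caller is mac_rx_srs_drain_bw(), the bandwidth
 * control counterpart of mac_rx_srs_drain() below):
 *
 *	head = mac_srs_pick_chain(mac_srs, &tail, &sz, &cnt);
 *	if (head == NULL)
 *		... B/W enforced: leave the backlog for the next tick ...
 *	else
 *		... deliver the subchain of cnt packets / sz bytes ...
 */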
1605 
1606 /*
1607  * mac_rx_srs_drain
1608  *
1609  * The SRS drain routine. Gets to run to clear the queue. Any thread
1610  * (worker, interrupt, poll) can call this based on the processing model.
1611  * The first thing we do is disable interrupts if possible and then
1612  * drain the queue. We also try to poll the underlying hardware if
1613  * there is a dedicated hardware Rx ring assigned to this SRS.
1614  *
1615  * There is an equivalent drain routine for bandwidth control mode,
1616  * mac_rx_srs_drain_bw. There is some code duplication between the two
1617  * routines but they are highly performance sensitive and are easier
1618  * to read/debug if they stay separate. Any code changes here might
1619  * need to be applied to mac_rx_srs_drain_bw as well.
1620  */
1621 void
1622 mac_rx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
1623 {
1624 	mblk_t 			*head;
1625 	mblk_t			*tail;
1626 	timeout_id_t 		tid;
1627 	int			cnt = 0;
1628 	mac_client_impl_t	*mcip = mac_srs->srs_mcip;
1629 	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;
1630 
1631 	ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
1632 	ASSERT(!(mac_srs->srs_type & SRST_BW_CONTROL));
1633 
1634 	/* If we are blanked i.e. can't do upcalls, then we are done */
1635 	if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) {
1636 		ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) ||
1637 		    (mac_srs->srs_state & SRS_PAUSE));
1638 		goto out;
1639 	}
1640 
1641 	if (mac_srs->srs_first == NULL)
1642 		goto out;
1643 
1644 	if (!(mac_srs->srs_state & SRS_LATENCY_OPT) &&
1645 	    (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)) {
1646 		/*
1647 		 * In the normal case, the SRS worker thread does no
1648 		 * work and we wait for a backlog to build up before
1649 		 * we switch into polling mode. When we are
1650 		 * optimizing for throughput, we use the worker thread
1651 		 * as well. The goal is to let the worker thread process
1652 		 * the queue and the poll thread feed packets into
1653 		 * the queue. As such, we should signal the poll
1654 		 * thread to try and get more packets.
1655 		 *
1656 		 * We could have pulled this check into the POLL_RING
1657 		 * macro itself but keeping it explicit here makes
1658 		 * the architecture easier to understand.
1659 		 */
1660 		MAC_SRS_POLL_RING(mac_srs);
1661 	}
1662 
1663 again:
1664 	head = mac_srs->srs_first;
1665 	mac_srs->srs_first = NULL;
1666 	tail = mac_srs->srs_last;
1667 	mac_srs->srs_last = NULL;
1668 	cnt = mac_srs->srs_count;
1669 	mac_srs->srs_count = 0;
1670 
1671 	ASSERT(head != NULL);
1672 	ASSERT(tail != NULL);
1673 
1674 	if ((tid = mac_srs->srs_tid) != 0)
1675 		mac_srs->srs_tid = 0;
1676 
1677 	mac_srs->srs_state |= (SRS_PROC|proc_type);
1678 
1679 
1680 	/*
1681 	 * mcip is NULL for broadcast and multicast flows. The promisc
1682 	 * callbacks for broadcast and multicast packets are delivered from
1683 	 * mac_rx() and we don't need to worry about that case in this path.
1684 	 */
1685 	if (mcip != NULL) {
1686 		if (mcip->mci_promisc_list != NULL) {
1687 			mutex_exit(&mac_srs->srs_lock);
1688 			mac_promisc_client_dispatch(mcip, head);
1689 			mutex_enter(&mac_srs->srs_lock);
1690 		}
1691 		if (MAC_PROTECT_ENABLED(mcip, MPT_IPNOSPOOF)) {
1692 			mutex_exit(&mac_srs->srs_lock);
1693 			mac_protect_intercept_dhcp(mcip, head);
1694 			mutex_enter(&mac_srs->srs_lock);
1695 		}
1696 	}
1697 
1698 	/*
1699 	 * Check if the SRS itself is doing the processing.
1700 	 * This direct path does not apply when subflows are present. In this
1701 	 * case, packets need to be dispatched to a soft ring according to the
1702 	 * flow's bandwidth and other resource constraints.
1703 	 */
1704 	if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) {
1705 		mac_direct_rx_t		proc;
1706 		void			*arg1;
1707 		mac_resource_handle_t	arg2;
1708 
1709 		/*
1710 		 * This is the case when an Rx ring is directly
1711 		 * assigned and we have a fully classified
1712 		 * protocol chain. We can deal with it in
1713 		 * one shot.
1714 		 */
1715 		proc = srs_rx->sr_func;
1716 		arg1 = srs_rx->sr_arg1;
1717 		arg2 = srs_rx->sr_arg2;
1718 
1719 		mac_srs->srs_state |= SRS_CLIENT_PROC;
1720 		mutex_exit(&mac_srs->srs_lock);
1721 		if (tid != 0) {
1722 			(void) untimeout(tid);
1723 			tid = 0;
1724 		}
1725 
1726 		proc(arg1, arg2, head, NULL);
1727 		/*
1728 		 * Decrement the size and count here itself
1729 		 * since the packets have been processed.
1730 		 */
1731 		mutex_enter(&mac_srs->srs_lock);
1732 		MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
1733 		if (mac_srs->srs_state & SRS_CLIENT_WAIT)
1734 			cv_signal(&mac_srs->srs_client_cv);
1735 		mac_srs->srs_state &= ~SRS_CLIENT_PROC;
1736 	} else {
1737 		/* Some kind of softrings based fanout is required */
1738 		mutex_exit(&mac_srs->srs_lock);
1739 		if (tid != 0) {
1740 			(void) untimeout(tid);
1741 			tid = 0;
1742 		}
1743 
1744 		/*
1745 		 * Since the fanout routines can deal with chains,
1746 		 * shoot the entire chain up.
1747 		 */
1748 		if (mac_srs->srs_type & SRST_FANOUT_SRC_IP)
1749 			mac_rx_srs_fanout(mac_srs, head);
1750 		else
1751 			mac_rx_srs_proto_fanout(mac_srs, head);
1752 		mutex_enter(&mac_srs->srs_lock);
1753 	}
1754 
1755 	if (!(mac_srs->srs_state & (SRS_BLANK|SRS_PAUSE)) &&
1756 	    (mac_srs->srs_first != NULL)) {
1757 		/*
1758 		 * More packets arrived while we were clearing the
1759 		 * SRS. This can happen because of one of the
1760 		 * three conditions below:
1761 		 * 1) The driver is using multiple worker threads
1762 		 *    to send the packets to us.
1763 		 * 2) The driver has a race in switching
1764 		 *    between interrupt and polling mode or
1765 		 * 3) Packets are arriving in this SRS via the
1766 		 *    S/W classification as well.
1767 		 *
1768 		 * We should switch to polling mode and see if we
1769 		 * need to send the poll thread down. Also, signal
1770 		 * the worker thread to process what's just arrived.
1771 		 */
1772 		MAC_SRS_POLLING_ON(mac_srs);
1773 		if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat) {
1774 			srs_rx->sr_drain_poll_sig++;
1775 			MAC_SRS_POLL_RING(mac_srs);
1776 		}
1777 
1778 		/*
1779 		 * If we didn't signal the poll thread, we need
1780 		 * to deal with the pending packets ourselves.
1781 		 */
1782 		if (proc_type == SRS_WORKER) {
1783 			srs_rx->sr_drain_again++;
1784 			goto again;
1785 		} else {
1786 			srs_rx->sr_drain_worker_sig++;
1787 			cv_signal(&mac_srs->srs_async);
1788 		}
1789 	}
1790 
1791 out:
1792 	if (mac_srs->srs_state & SRS_GET_PKTS) {
1793 		/*
1794 		 * Poll thread is already running. Leave the
1795 		 * SRS_PROC flag set and hand over control to the
1796 		 * poll thread.
1797 		 */
1798 		mac_srs->srs_state &= ~proc_type;
1799 		srs_rx->sr_drain_poll_running++;
1800 		return;
1801 	}
1802 
1803 	/*
1804 	 * Even if there are no packets queued in SRS, we
1805 	 * need to make sure that the shared counter is
1806 	 * clear and any associated softrings have cleared
1807 	 * all the backlog. Otherwise, leave the interface
1808 	 * in polling mode and the poll thread will get
1809 	 * signalled once the count goes down to zero.
1810 	 *
1811 	 * If someone is already draining the queue (SRS_PROC is
1812 	 * set) when the srs_poll_pkt_cnt goes down to zero,
1813 	 * then it means that drain is already running and we
1814 	 * will turn off polling at that time if there is
1815 	 * no backlog.
1816 	 *
1817 	 * As long as there are packets queued either
1818 	 * in the soft ring set or its soft rings, we will leave
1819 	 * the interface in polling mode (even if the drain
1820 	 * was done by the interrupt thread). We signal
1821 	 * the poll thread as well if we have dipped below
1822 	 * the low water mark.
1823 	 *
1824 	 * NOTE: We can't use the MAC_SRS_POLLING_ON macro
1825 	 * since that turns polling on only for the worker thread.
1826 	 * It's not worth turning polling on for the interrupt
1827 	 * thread (since the NIC will not issue another interrupt)
1828 	 * unless a backlog builds up.
1829 	 */
1830 	if ((srs_rx->sr_poll_pkt_cnt > 0) &&
1831 	    (mac_srs->srs_state & SRS_POLLING_CAPAB)) {
1832 		mac_srs->srs_state &= ~(SRS_PROC|proc_type);
1833 		srs_rx->sr_drain_keep_polling++;
1834 		MAC_SRS_POLLING_ON(mac_srs);
1835 		if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)
1836 			MAC_SRS_POLL_RING(mac_srs);
1837 		return;
1838 	}
1839 
1840 	/* Nothing else to do. Get out of poll mode */
1841 	MAC_SRS_POLLING_OFF(mac_srs);
1842 	mac_srs->srs_state &= ~(SRS_PROC|proc_type);
1843 	srs_rx->sr_drain_finish_intr++;
1844 }
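
/*
 * Condensed view of the exit logic above (illustrative only):
 *
 *	SRS_GET_PKTS set		-> poll thread owns the SRS;
 *					   leave SRS_PROC, return
 *	sr_poll_pkt_cnt > 0 &&
 *	SRS_POLLING_CAPAB		-> stay in polling mode, kick
 *					   the poll thread if below lowat
 *	otherwise			-> MAC_SRS_POLLING_OFF, back to
 *					   interrupt mode
 */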
1845 
1846 /*
1847  * mac_rx_srs_drain_bw
1848  *
1849  * The SRS BW drain routine. Gets to run to clear the queue. Any thread
1850  * (worker, interrupt, poll) can call this based on the processing model.
1851  * The first thing we do is disable interrupts if possible and then
1852  * drain the queue. We also try to poll the underlying hardware if
1853  * there is a dedicated hardware Rx ring assigned to this SRS.
1854  *
1855  * There is an equivalent drain routine in non-bandwidth control mode,
1856  * mac_rx_srs_drain. There is some code duplication between the two
1857  * routines but they are highly performance sensitive and are easier
1858  * to read/debug if they stay separate. Any code changes here might
1859  * also apply to mac_rx_srs_drain.
1860  */
1861 void
1862 mac_rx_srs_drain_bw(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
1863 {
1864 	mblk_t 			*head;
1865 	mblk_t			*tail;
1866 	timeout_id_t 		tid;
1867 	size_t			sz = 0;
1868 	int			cnt = 0;
1869 	mac_client_impl_t	*mcip = mac_srs->srs_mcip;
1870 	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;
1871 	clock_t			now;
1872 
1873 	ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
1874 	ASSERT(mac_srs->srs_type & SRST_BW_CONTROL);
1875 again:
1876 	/* Check if we are doing B/W control */
1877 	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1878 	now = ddi_get_lbolt();
1879 	if (mac_srs->srs_bw->mac_bw_curr_time != now) {
1880 		mac_srs->srs_bw->mac_bw_curr_time = now;
1881 		mac_srs->srs_bw->mac_bw_used = 0;
1882 		if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)
1883 			mac_srs->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED;
1884 	} else if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) {
1885 		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1886 		goto done;
1887 	} else if (mac_srs->srs_bw->mac_bw_used >
1888 	    mac_srs->srs_bw->mac_bw_limit) {
1889 		mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
1890 		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1891 		goto done;
1892 	}
1893 	mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1894 
1895 	/* If we are blanked, i.e. can't do upcalls, then we are done */
1896 	if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) {
1897 		ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) ||
1898 		    (mac_srs->srs_state & SRS_PAUSE));
1899 		goto done;
1900 	}
1901 
1902 	sz = 0;
1903 	cnt = 0;
1904 	if ((head = mac_srs_pick_chain(mac_srs, &tail, &sz, &cnt)) == NULL) {
1905 		/*
1906 		 * We couldn't pick up a single packet.
1907 		 */
1908 		mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1909 		if ((mac_srs->srs_bw->mac_bw_used == 0) &&
1910 		    (mac_srs->srs_size != 0) &&
1911 		    !(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
1912 			/*
1913 			 * Seems like the configured B/W doesn't
1914 			 * even allow processing of 1 packet
1915 			 * per tick.
1916 			 *
1917 			 * XXX: raise the limit to processing
1918 			 * at least 1 packet per tick.
1919 			 */
1920 			mac_srs->srs_bw->mac_bw_limit +=
1921 			    mac_srs->srs_bw->mac_bw_limit;
1922 			mac_srs->srs_bw->mac_bw_drop_threshold +=
1923 			    mac_srs->srs_bw->mac_bw_drop_threshold;
1924 			cmn_err(CE_NOTE, "mac_rx_srs_drain: srs(%p) "
1925 			    "raised B/W limit to %d since not even a "
1926 			    "single packet can be processed per "
1927 			    "tick %d\n", (void *)mac_srs,
1928 			    (int)mac_srs->srs_bw->mac_bw_limit,
1929 			    (int)msgdsize(mac_srs->srs_first));
1930 		}
1931 		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1932 		goto done;
1933 	}
1934 
1935 	ASSERT(head != NULL);
1936 	ASSERT(tail != NULL);
1937 
1938 	/* zero bandwidth: drop all and return to interrupt mode */
1939 	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1940 	if (mac_srs->srs_bw->mac_bw_limit == 0) {
1941 		srs_rx->sr_stat.mrs_sdrops += cnt;
1942 		ASSERT(mac_srs->srs_bw->mac_bw_sz >= sz);
1943 		mac_srs->srs_bw->mac_bw_sz -= sz;
1944 		mac_srs->srs_bw->mac_bw_drop_bytes += sz;
1945 		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1946 		mac_pkt_drop(NULL, NULL, head, B_FALSE);
1947 		goto leave_poll;
1948 	} else {
1949 		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1950 	}
1951 
1952 	if ((tid = mac_srs->srs_tid) != 0)
1953 		mac_srs->srs_tid = 0;
1954 
1955 	mac_srs->srs_state |= (SRS_PROC|proc_type);
1956 	MAC_SRS_WORKER_POLLING_ON(mac_srs);
1957 
1958 	/*
1959 	 * mcip is NULL for broadcast and multicast flows. The promisc
1960 	 * callbacks for broadcast and multicast packets are delivered from
1961 	 * mac_rx() and we don't need to worry about that case in this path.
1962 	 */
1963 	if (mcip != NULL) {
1964 		if (mcip->mci_promisc_list != NULL) {
1965 			mutex_exit(&mac_srs->srs_lock);
1966 			mac_promisc_client_dispatch(mcip, head);
1967 			mutex_enter(&mac_srs->srs_lock);
1968 		}
1969 		if (MAC_PROTECT_ENABLED(mcip, MPT_IPNOSPOOF)) {
1970 			mutex_exit(&mac_srs->srs_lock);
1971 			mac_protect_intercept_dhcp(mcip, head);
1972 			mutex_enter(&mac_srs->srs_lock);
1973 		}
1974 	}
1975 
1976 	/*
1977 	 * Check if the SRS itself is doing the processing.
1978 	 * This direct path does not apply when subflows are present. In this
1979 	 * case, packets need to be dispatched to a soft ring according to the
1980 	 * flow's bandwidth and other resource constraints.
1981 	 */
1982 	if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) {
1983 		mac_direct_rx_t		proc;
1984 		void			*arg1;
1985 		mac_resource_handle_t	arg2;
1986 
1987 		/*
1988 		 * This is the case when an Rx ring is directly
1989 		 * assigned and we have a fully classified
1990 		 * protocol chain. We can deal with it in
1991 		 * one shot.
1992 		 */
1993 		proc = srs_rx->sr_func;
1994 		arg1 = srs_rx->sr_arg1;
1995 		arg2 = srs_rx->sr_arg2;
1996 
1997 		mac_srs->srs_state |= SRS_CLIENT_PROC;
1998 		mutex_exit(&mac_srs->srs_lock);
1999 		if (tid != 0) {
2000 			(void) untimeout(tid);
2001 			tid = 0;
2002 		}
2003 
2004 		proc(arg1, arg2, head, NULL);
2005 		/*
2006 		 * Decrement the size and count here itself
2007 		 * since the packets have been processed.
2008 		 */
2009 		mutex_enter(&mac_srs->srs_lock);
2010 		MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
2011 		MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz);
2012 
2013 		if (mac_srs->srs_state & SRS_CLIENT_WAIT)
2014 			cv_signal(&mac_srs->srs_client_cv);
2015 		mac_srs->srs_state &= ~SRS_CLIENT_PROC;
2016 	} else {
2017 		/* Some kind of softrings based fanout is required */
2018 		mutex_exit(&mac_srs->srs_lock);
2019 		if (tid != 0) {
2020 			(void) untimeout(tid);
2021 			tid = 0;
2022 		}
2023 
2024 		/*
2025 		 * Since the fanout routines can deal with chains,
2026 		 * shoot the entire chain up.
2027 		 */
2028 		if (mac_srs->srs_type & SRST_FANOUT_SRC_IP)
2029 			mac_rx_srs_fanout(mac_srs, head);
2030 		else
2031 			mac_rx_srs_proto_fanout(mac_srs, head);
2032 		mutex_enter(&mac_srs->srs_lock);
2033 	}
2034 
2035 	/*
2036 	 * Send the poll thread to pick up any packets arrived
2037 	 * so far. This also serves as the last check in case
2038 	 * nothing else is queued in the SRS. The poll thread
2039 	 * is signalled only in the case the drain was done
2040 	 * by the worker thread and SRS_WORKER is set. The
2041 	 * worker thread can run in parallel as long as the
2042 	 * SRS_WORKER flag is set. When we have nothing else to
2043 	 * process, we can exit while leaving SRS_PROC set
2044 	 * which gives the poll thread control to process and
2045 	 * cleanup once it returns from the NIC.
2046 	 *
2047 	 * If we have nothing else to process, we need to
2048 	 * ensure that we keep holding the srs_lock till
2049 	 * all the checks below are done and control is
2050 	 * handed to the poll thread if it was running.
2051 	 */
2052 	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
2053 	if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
2054 		if (mac_srs->srs_first != NULL) {
2055 			if (proc_type == SRS_WORKER) {
2056 				mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
2057 				if (srs_rx->sr_poll_pkt_cnt <=
2058 				    srs_rx->sr_lowat)
2059 					MAC_SRS_POLL_RING(mac_srs);
2060 				goto again;
2061 			} else {
2062 				cv_signal(&mac_srs->srs_async);
2063 			}
2064 		}
2065 	}
2066 	mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
2067 
2068 done:
2069 
2070 	if (mac_srs->srs_state & SRS_GET_PKTS) {
2071 		/*
2072 		 * Poll thread is already running. Leave the
2073 		 * SRS_PROC flag set and hand over control to the
2074 		 * poll thread.
2075 		 */
2076 		mac_srs->srs_state &= ~proc_type;
2077 		return;
2078 	}
2079 
2080 	/*
2081 	 * If we can't process packets because we have exceeded
2082 	 * B/W limit for this tick, just set the timeout
2083 	 * and leave.
2084 	 *
2085 	 * Even if there are no packets queued in SRS, we
2086 	 * need to make sure that the shared counter is
2087 	 * clear and any associated softrings have cleared
2088 	 * all the backlog. Otherwise, leave the interface
2089 	 * in polling mode and the poll thread will get
2090 	 * signalled once the count goes down to zero.
2091 	 *
2092 	 * If someone is already draining the queue (SRS_PROC is
2093 	 * set) when the srs_poll_pkt_cnt goes down to zero,
2094 	 * then it means that drain is already running and we
2095 	 * will turn off polling at that time if there is
2096 	 * no backlog. As long as there are packets queued either
2097 	 * in the soft ring set or its soft rings, we will leave
2098 	 * the interface in polling mode.
2099 	 */
2100 	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
2101 	if ((mac_srs->srs_state & SRS_POLLING_CAPAB) &&
2102 	    ((mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) ||
2103 	    (srs_rx->sr_poll_pkt_cnt > 0))) {
2104 		MAC_SRS_POLLING_ON(mac_srs);
2105 		mac_srs->srs_state &= ~(SRS_PROC|proc_type);
2106 		if ((mac_srs->srs_first != NULL) &&
2107 		    (mac_srs->srs_tid == NULL))
2108 			mac_srs->srs_tid = timeout(mac_srs_fire,
2109 			    mac_srs, 1);
2110 		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
2111 		return;
2112 	}
2113 	mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
2114 
2115 leave_poll:
2116 
2117 	/* Nothing else to do. Get out of poll mode */
2118 	MAC_SRS_POLLING_OFF(mac_srs);
2119 	mac_srs->srs_state &= ~(SRS_PROC|proc_type);
2120 }
2121 
2122 /*
2123  * mac_srs_worker
2124  *
2125  * The SRS worker routine. Drains the queue when no one else is
2126  * processing it.
2127  */
2128 void
2129 mac_srs_worker(mac_soft_ring_set_t *mac_srs)
2130 {
2131 	kmutex_t 		*lock = &mac_srs->srs_lock;
2132 	kcondvar_t 		*async = &mac_srs->srs_async;
2133 	callb_cpr_t		cprinfo;
2134 	boolean_t		bw_ctl_flag;
2135 
2136 	CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "srs_worker");
2137 	mutex_enter(lock);
2138 
2139 start:
2140 	for (;;) {
2141 		bw_ctl_flag = B_FALSE;
2142 		if (mac_srs->srs_type & SRST_BW_CONTROL) {
2143 			MAC_SRS_BW_LOCK(mac_srs);
2144 			MAC_SRS_CHECK_BW_CONTROL(mac_srs);
2145 			if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)
2146 				bw_ctl_flag = B_TRUE;
2147 			MAC_SRS_BW_UNLOCK(mac_srs);
2148 		}
2149 		/*
2150 		 * The SRS_BW_ENFORCED flag may change since we have dropped
2151 		 * the mac_bw_lock. However, the drain function can handle both
2152 		 * a drainable SRS and a bandwidth-controlled SRS, and the
2153 		 * effect of scheduling a timeout is to wake up the worker
2154 		 * thread, which in turn will call the drain function. Since
2155 		 * we release the srs_lock atomically only in the cv_wait, there
2156 		 * isn't a fear of waiting forever.
2157 		 */
2158 		while (((mac_srs->srs_state & SRS_PROC) ||
2159 		    (mac_srs->srs_first == NULL) || bw_ctl_flag ||
2160 		    (mac_srs->srs_state & SRS_TX_BLOCKED)) &&
2161 		    !(mac_srs->srs_state & SRS_PAUSE)) {
2162 			/*
2163 			 * If we have packets queued and we are here
2164 			 * because B/W control is in place, we better
2165 			 * schedule the worker wakeup after 1 tick
2166 			 * to see if bandwidth control can be relaxed.
2167 			 */
2168 			if (bw_ctl_flag && mac_srs->srs_tid == NULL) {
2169 				/*
2170 				 * We need to ensure that a timer is already
2171 				 * scheduled or we force-schedule one for
2172 				 * later so that we can continue processing
2173 				 * after this quantum is over.
2174 				 */
2175 				mac_srs->srs_tid = timeout(mac_srs_fire,
2176 				    mac_srs, 1);
2177 			}
2178 wait:
2179 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
2180 			cv_wait(async, lock);
2181 			CALLB_CPR_SAFE_END(&cprinfo, lock);
2182 
2183 			if (mac_srs->srs_state & SRS_PAUSE)
2184 				goto done;
2185 			if (mac_srs->srs_state & SRS_PROC)
2186 				goto wait;
2187 
2188 			if (mac_srs->srs_first != NULL &&
2189 			    mac_srs->srs_type & SRST_BW_CONTROL) {
2190 				MAC_SRS_BW_LOCK(mac_srs);
2191 				if (mac_srs->srs_bw->mac_bw_state &
2192 				    SRS_BW_ENFORCED) {
2193 					MAC_SRS_CHECK_BW_CONTROL(mac_srs);
2194 				}
2195 				bw_ctl_flag = mac_srs->srs_bw->mac_bw_state &
2196 				    SRS_BW_ENFORCED;
2197 				MAC_SRS_BW_UNLOCK(mac_srs);
2198 			}
2199 		}
2200 
2201 		if (mac_srs->srs_state & SRS_PAUSE)
2202 			goto done;
2203 		mac_srs->srs_drain_func(mac_srs, SRS_WORKER);
2204 	}
2205 done:
2206 	/*
2207 	 * The Rx SRS quiesce logic first cuts off packet supply to the SRS
2208 	 * from both hard and soft classifications and waits for such threads
2209 	 * to finish before signaling the worker. So at this point the only
2210 	 * thread left that could be competing with the worker is the poll
2211 	 * thread. In the case of Tx, there shouldn't be any thread holding
2212 	 * SRS_PROC at this point.
2213 	 */
2214 	if (!(mac_srs->srs_state & SRS_PROC)) {
2215 		mac_srs->srs_state |= SRS_PROC;
2216 	} else {
2217 		ASSERT((mac_srs->srs_type & SRST_TX) == 0);
2218 		/*
2219 		 * Poll thread still owns the SRS and is still running
2220 		 */
2221 		ASSERT((mac_srs->srs_poll_thr == NULL) ||
2222 		    ((mac_srs->srs_state & SRS_POLL_THR_OWNER) ==
2223 		    SRS_POLL_THR_OWNER));
2224 	}
2225 	mac_srs_worker_quiesce(mac_srs);
2226 	/*
2227 	 * Wait for the SRS_RESTART or SRS_CONDEMNED signal from the initiator
2228 	 * of the quiesce operation
2229 	 */
2230 	while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_RESTART)))
2231 		cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock);
2232 
2233 	if (mac_srs->srs_state & SRS_RESTART) {
2234 		ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED));
2235 		mac_srs_worker_restart(mac_srs);
2236 		mac_srs->srs_state &= ~SRS_PROC;
2237 		goto start;
2238 	}
2239 
2240 	if (!(mac_srs->srs_state & SRS_CONDEMNED_DONE))
2241 		mac_srs_worker_quiesce(mac_srs);
2242 
2243 	mac_srs->srs_state &= ~SRS_PROC;
2244 	/* The macro drops the srs_lock */
2245 	CALLB_CPR_EXIT(&cprinfo);
2246 	thread_exit();
2247 }
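
/*
 * Timing sketch for the bandwidth-controlled wait above (illustrative;
 * assumes hz = 100, i.e. one tick = 10 ms): with SRS_BW_ENFORCED set
 * and packets queued, the worker arms a one-tick timer and sleeps:
 *
 *	mac_srs->srs_tid = timeout(mac_srs_fire, mac_srs, 1);
 *	cv_wait(async, lock);
 *
 * About 10 ms later mac_srs_fire() wakes the worker, the loop re-runs
 * MAC_SRS_CHECK_BW_CONTROL() against the new lbolt value (resetting
 * mac_bw_used and clearing SRS_BW_ENFORCED), and srs_drain_func()
 * drains the next quantum.
 */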
2248 
2249 /*
2250  * mac_rx_srs_subflow_process
2251  *
2252  * Receive side routine called from interrupt path when there are
2253  * sub flows present on this SRS.
2254  */
2255 /* ARGSUSED */
2256 void
2257 mac_rx_srs_subflow_process(void *arg, mac_resource_handle_t srs,
2258     mblk_t *mp_chain, boolean_t loopback)
2259 {
2260 	flow_entry_t		*flent = NULL;
2261 	flow_entry_t		*prev_flent = NULL;
2262 	mblk_t			*mp = NULL;
2263 	mblk_t			*tail = NULL;
2264 	mac_soft_ring_set_t	*mac_srs = (mac_soft_ring_set_t *)srs;
2265 	mac_client_impl_t	*mcip;
2266 
2267 	mcip = mac_srs->srs_mcip;
2268 	ASSERT(mcip != NULL);
2269 
2270 	/*
2271 	 * We need to determine the SRS for every packet
2272 	 * by walking the flow table; if we don't get any,
2273 	 * then we proceed using the SRS we came with.
2274 	 */
2275 	mp = tail = mp_chain;
2276 	while (mp != NULL) {
2277 
2278 		/*
2279 		 * We will increment the stats for the matching subflow
2280 		 * when we get the bytes/pkt count for the classified packets
2281 		 * later in mac_rx_srs_process.
2282 		 */
2283 		(void) mac_flow_lookup(mcip->mci_subflow_tab, mp,
2284 		    FLOW_INBOUND, &flent);
2285 
2286 		if (mp == mp_chain || flent == prev_flent) {
2287 			if (prev_flent != NULL)
2288 				FLOW_REFRELE(prev_flent);
2289 			prev_flent = flent;
2290 			flent = NULL;
2291 			tail = mp;
2292 			mp = mp->b_next;
2293 			continue;
2294 		}
2295 		tail->b_next = NULL;
2296 		/*
2297 		 * A NULL flent indicates this is for the mac_srs itself.
2298 		 * XXX-venu : probably assert for fe_rx_srs_cnt == 0.
2299 		 */
2300 		if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) {
2301 			mac_rx_srs_process(arg,
2302 			    (mac_resource_handle_t)mac_srs, mp_chain,
2303 			    loopback);
2304 		} else {
2305 			(prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1,
2306 			    prev_flent->fe_cb_arg2, mp_chain, loopback);
2307 			FLOW_REFRELE(prev_flent);
2308 		}
2309 		prev_flent = flent;
2310 		flent = NULL;
2311 		mp_chain = mp;
2312 		tail = mp;
2313 		mp = mp->b_next;
2314 	}
2315 	/* Last chain */
2316 	ASSERT(mp_chain != NULL);
2317 	if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) {
2318 		mac_rx_srs_process(arg,
2319 		    (mac_resource_handle_t)mac_srs, mp_chain, loopback);
2320 	} else {
2321 		(prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1,
2322 		    prev_flent->fe_cb_arg2, mp_chain, loopback);
2323 		FLOW_REFRELE(prev_flent);
2324 	}
2325 }
2326 
2327 /*
2328  * mac_rx_srs_process
2329  *
2330  * Receive side routine called from the interrupt path.
2331  *
2332  * loopback is set to force a context switch on the loopback
2333  * path between MAC clients.
2334  */
2335 /* ARGSUSED */
2336 void
2337 mac_rx_srs_process(void *arg, mac_resource_handle_t srs, mblk_t *mp_chain,
2338     boolean_t loopback)
2339 {
2340 	mac_soft_ring_set_t	*mac_srs = (mac_soft_ring_set_t *)srs;
2341 	mblk_t			*mp, *tail, *head;
2342 	int			count = 0;
2343 	int			count1;
2344 	size_t			sz = 0;
2345 	size_t			chain_sz, sz1;
2346 	mac_bw_ctl_t		*mac_bw;
2347 	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;
2348 
2349 	/*
2350 	 * Set the tail, count and sz. We set the sz irrespective
2351 	 * of whether we are doing B/W control or not for the
2352 	 * purpose of updating the stats.
2353 	 */
2354 	mp = tail = mp_chain;
2355 	while (mp != NULL) {
2356 		tail = mp;
2357 		count++;
2358 		sz += msgdsize(mp);
2359 		mp = mp->b_next;
2360 	}
2361 
2362 	mutex_enter(&mac_srs->srs_lock);
2363 
2364 	if (loopback) {
2365 		SRS_RX_STAT_UPDATE(mac_srs, lclbytes, sz);
2366 		SRS_RX_STAT_UPDATE(mac_srs, lclcnt, count);
2367 
2368 	} else {
2369 		SRS_RX_STAT_UPDATE(mac_srs, intrbytes, sz);
2370 		SRS_RX_STAT_UPDATE(mac_srs, intrcnt, count);
2371 	}
2372 
2373 	/*
2374 	 * If the SRS is already being processed; has been blanked;
2375 	 * can be processed by the worker thread only; or the B/W limit
2376 	 * has been reached, then queue the chain and check if the
2377 	 * worker thread needs to be awakened.
2378 	 */
2379 	if (mac_srs->srs_type & SRST_BW_CONTROL) {
2380 		mac_bw = mac_srs->srs_bw;
2381 		ASSERT(mac_bw != NULL);
2382 		mutex_enter(&mac_bw->mac_bw_lock);
2383 		mac_bw->mac_bw_intr += sz;
2384 		if (mac_bw->mac_bw_limit == 0) {
2385 			/* zero bandwidth: drop all */
2386 			srs_rx->sr_stat.mrs_sdrops += count;
2387 			mac_bw->mac_bw_drop_bytes += sz;
2388 			mutex_exit(&mac_bw->mac_bw_lock);
2389 			mutex_exit(&mac_srs->srs_lock);
2390 			mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
2391 			return;
2392 		} else {
2393 			if ((mac_bw->mac_bw_sz + sz) <=
2394 			    mac_bw->mac_bw_drop_threshold) {
2395 				mutex_exit(&mac_bw->mac_bw_lock);
2396 				MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain,
2397 				    tail, count, sz);
2398 			} else {
2399 				mp = mp_chain;
2400 				chain_sz = 0;
2401 				count1 = 0;
2402 				tail = NULL;
2403 				head = NULL;
2404 				while (mp != NULL) {
2405 					sz1 = msgdsize(mp);
2406 					if (mac_bw->mac_bw_sz + chain_sz + sz1 >
2407 					    mac_bw->mac_bw_drop_threshold)
2408 						break;
2409 					chain_sz += sz1;
2410 					count1++;
2411 					tail = mp;
2412 					mp = mp->b_next;
2413 				}
2414 				mutex_exit(&mac_bw->mac_bw_lock);
2415 				if (tail != NULL) {
2416 					head = tail->b_next;
2417 					tail->b_next = NULL;
2418 					MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs,
2419 					    mp_chain, tail, count1, chain_sz);
2420 					sz -= chain_sz;
2421 					count -= count1;
2422 				} else {
2423 					/* Can't pick up any */
2424 					head = mp_chain;
2425 				}
2426 				if (head != NULL) {
2427 					/* Drop any packet over the threshold */
2428 					srs_rx->sr_stat.mrs_sdrops += count;
2429 					mutex_enter(&mac_bw->mac_bw_lock);
2430 					mac_bw->mac_bw_drop_bytes += sz;
2431 					mutex_exit(&mac_bw->mac_bw_lock);
2432 					freemsgchain(head);
2433 				}
2434 			}
2435 			MAC_SRS_WORKER_WAKEUP(mac_srs);
2436 			mutex_exit(&mac_srs->srs_lock);
2437 			return;
2438 		}
2439 	}
2440 
2441 	/*
2442 	 * If the total number of packets queued in the SRS and
2443 	 * its associated soft rings exceeds the max allowed,
2444 	 * then drop the chain. If we are polling capable, this
2445 	 * shouldn't be happening.
2446 	 */
2447 	if (!(mac_srs->srs_type & SRST_BW_CONTROL) &&
2448 	    (srs_rx->sr_poll_pkt_cnt > srs_rx->sr_hiwat)) {
2449 		mac_bw = mac_srs->srs_bw;
2450 		srs_rx->sr_stat.mrs_sdrops += count;
2451 		mutex_enter(&mac_bw->mac_bw_lock);
2452 		mac_bw->mac_bw_drop_bytes += sz;
2453 		mutex_exit(&mac_bw->mac_bw_lock);
2454 		freemsgchain(mp_chain);
2455 		mutex_exit(&mac_srs->srs_lock);
2456 		return;
2457 	}
2458 
2459 	MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain, tail, count, sz);
2460 
2461 	if (!(mac_srs->srs_state & SRS_PROC)) {
2462 		/*
2463 		 * If we are coming via loopback, if we are not optimizing for
2464 		 * latency, or if our stack is running deep, we should signal
2465 		 * the worker thread.
2466 		 */
2467 		if (loopback || !(mac_srs->srs_state & SRS_LATENCY_OPT) ||
2468 		    MAC_RX_SRS_TOODEEP()) {
2469 			/*
2470 			 * For loopback, we need to let the worker take
2471 			 * over as we don't want to continue in the same
2472 			 * thread even if we can. This could lead to stack
2473 			 * overflows and may also end up using
2474 			 * resources (cpu) incorrectly.
2475 			 */
2476 			cv_signal(&mac_srs->srs_async);
2477 		} else {
2478 			/*
2479 			 * Seems like no one is processing the SRS and
2480 			 * there is no backlog. We also inline process
2481 			 * our packet if it's a single packet in the non
2482 			 * latency optimized case (in the latency optimized
2483 			 * case, we inline process chains of any size).
2484 			 */
2485 			mac_srs->srs_drain_func(mac_srs, SRS_PROC_FAST);
2486 		}
2487 	}
2488 	mutex_exit(&mac_srs->srs_lock);
2489 }
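
/*
 * Decision summary for the fast path above (illustrative): after the
 * chain is enqueued, a context switch to the worker is forced only
 * when it is needed for correctness or latency is not the priority:
 *
 *	loopback			-> cv_signal(&srs_async)
 *	!SRS_LATENCY_OPT		-> cv_signal(&srs_async)
 *	MAC_RX_SRS_TOODEEP()		-> cv_signal(&srs_async)
 *	otherwise			-> srs_drain_func(SRS_PROC_FAST)
 */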
2490 
2491 /* TX SIDE ROUTINES (RUNTIME) */
2492 
2493 /*
2494  * mac_tx_srs_no_desc
2495  *
2496  * This routine is called by the Tx single ring default mode
2497  * when the Tx ring runs out of descs.
2498  */
2499 mac_tx_cookie_t
2500 mac_tx_srs_no_desc(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2501     uint16_t flag, mblk_t **ret_mp)
2502 {
2503 	mac_tx_cookie_t cookie = NULL;
2504 	mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
2505 	boolean_t wakeup_worker = B_TRUE;
2506 	uint32_t tx_mode = srs_tx->st_mode;
2507 	int cnt, sz;
2508 	mblk_t *tail;
2509 
2510 	ASSERT(tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_BW);
2511 	if (flag & MAC_DROP_ON_NO_DESC) {
2512 		MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
2513 	} else {
2514 		if (mac_srs->srs_first != NULL)
2515 			wakeup_worker = B_FALSE;
2516 		MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
2517 		if (flag & MAC_TX_NO_ENQUEUE) {
2518 			/*
2519 			 * If TX_QUEUED is not set, queue the
2520 			 * packet and let mac_tx_srs_drain()
2521 			 * set the TX_BLOCKED bit for the
2522 			 * reasons explained above. Otherwise,
2523 			 * return the mblks.
2524 			 */
2525 			if (wakeup_worker) {
2526 				MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
2527 				    mp_chain, tail, cnt, sz);
2528 			} else {
2529 				MAC_TX_SET_NO_ENQUEUE(mac_srs,
2530 				    mp_chain, ret_mp, cookie);
2531 			}
2532 		} else {
2533 			MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain,
2534 			    tail, cnt, sz, cookie);
2535 		}
2536 		if (wakeup_worker)
2537 			cv_signal(&mac_srs->srs_async);
2538 	}
2539 	return (cookie);
2540 }
2541 
2542 /*
2543  * mac_tx_srs_enqueue
2544  *
2545  * This routine is called when Tx SRS is operating in either serializer
2546  * or bandwidth mode. In serializer mode, a packet will get enqueued
2547  * when a thread cannot enter SRS exclusively. In bandwidth mode,
2548  * packets get queued if the allowed byte-count limit for a tick is
2549  * exceeded. The action that gets taken when MAC_DROP_ON_NO_DESC or
2550  * MAC_TX_NO_ENQUEUE is set is different than when operating in either
2551  * the default mode or fanout mode. Here packets get dropped or
2552  * returned back to the caller only after hi-watermark worth of data
2553  * is queued.
2554  */
2555 static mac_tx_cookie_t
2556 mac_tx_srs_enqueue(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2557     uint16_t flag, uintptr_t fanout_hint, mblk_t **ret_mp)
2558 {
2559 	mac_tx_cookie_t cookie = NULL;
2560 	int cnt, sz;
2561 	mblk_t *tail;
2562 	boolean_t wakeup_worker = B_TRUE;
2563 
2564 	/*
2565 	 * Ignore fanout hint if we don't have multiple tx rings.
2566 	 */
2567 	if (!MAC_TX_SOFT_RINGS(mac_srs))
2568 		fanout_hint = 0;
2569 
2570 	if (mac_srs->srs_first != NULL)
2571 		wakeup_worker = B_FALSE;
2572 	MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
2573 	if (flag & MAC_DROP_ON_NO_DESC) {
2574 		if (mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) {
2575 			MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
2576 		} else {
2577 			MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
2578 			    mp_chain, tail, cnt, sz);
2579 		}
2580 	} else if (flag & MAC_TX_NO_ENQUEUE) {
2581 		if ((mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) ||
2582 		    (mac_srs->srs_state & SRS_TX_WAKEUP_CLIENT)) {
2583 			MAC_TX_SET_NO_ENQUEUE(mac_srs, mp_chain,
2584 			    ret_mp, cookie);
2585 		} else {
2586 			mp_chain->b_prev = (mblk_t *)fanout_hint;
2587 			MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
2588 			    mp_chain, tail, cnt, sz);
2589 		}
2590 	} else {
2591 		/*
2592 		 * If you are BW_ENFORCED, just enqueue the
2593 		 * packet. srs_worker will drain it at the
2594 		 * prescribed rate. Before enqueueing, save
2595 		 * the fanout hint.
2596 		 */
2597 		mp_chain->b_prev = (mblk_t *)fanout_hint;
2598 		MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain,
2599 		    tail, cnt, sz, cookie);
2600 	}
2601 	if (wakeup_worker)
2602 		cv_signal(&mac_srs->srs_async);
2603 	return (cookie);
2604 }
2605 
2606 /*
2607  * There are seven tx modes:
2608  *
2609  * 1) Default mode (SRS_TX_DEFAULT)
2610  * 2) Serialization mode (SRS_TX_SERIALIZE)
2611  * 3) Fanout mode (SRS_TX_FANOUT)
2612  * 4) Bandwidth mode (SRS_TX_BW)
2613  * 5) Fanout and Bandwidth mode (SRS_TX_BW_FANOUT)
2614  * 6) aggr Tx mode (SRS_TX_AGGR)
2615  * 7) aggr Tx bw mode (SRS_TX_BW_AGGR)
2616  *
2617  * The tx mode in which an SRS operates is decided in mac_tx_srs_setup()
2618  * based on the number of Tx rings requested for an SRS and whether
2619  * bandwidth control is requested or not.
2620  *
2621  * The default mode (i.e., no fanout/no bandwidth) is used when the
2622  * underlying NIC does not have Tx rings or just one Tx ring. In this mode,
2623  * the SRS acts as a pass-thru. Packets will go directly to mac_tx_send().
2624  * When the underlying Tx ring runs out of Tx descs, it starts queueing up
2625  * packets in SRS. When flow-control is relieved, the srs_worker drains
2626  * the queued packets and informs blocked clients to restart sending
2627  * packets.
2628  *
2629  * In the SRS_TX_SERIALIZE mode, all calls to mac_tx() are serialized. This
2630  * mode is used when the link has no Tx rings or only one Tx ring.
2631  *
2632  * In the SRS_TX_FANOUT mode, packets will be fanned out to multiple
2633  * Tx rings. Each Tx ring will have a soft ring associated with it.
2634  * These soft rings will be hung off the Tx SRS. Queueing if it happens
2635  * due to lack of Tx desc will be in individual soft ring (and not srs)
2636  * associated with Tx ring.
2637  *
2638  * In the TX_BW mode, tx srs will allow packets to go down to Tx ring
2639  * only if bw is available. Otherwise the packets will be queued in
2640  * SRS. If fanout to multiple Tx rings is configured, the packets will
2641  * be fanned out among the soft rings associated with the Tx rings.
2642  *
2643  * In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. This routine
2644  * invokes an aggr function, aggr_find_tx_ring(), to find a pseudo Tx ring
2645  * belonging to a port on which the packet has to be sent. Aggr will
2646  * always have a pseudo Tx ring associated with it even when it is an
2647  * aggregation over a single NIC that has no Tx rings. Even in such a
2648  * case, the single pseudo Tx ring will have a soft ring associated with
2649  * it and the soft ring will hang off the SRS.
2650  *
2651  * If a bandwidth is specified for an aggr, SRS_TX_BW_AGGR mode is used.
2652  * In this mode, the bandwidth is first applied on the outgoing packets
2653  * and later the mac_tx_aggr_mode() function is called to send the packet out
2654  * of one of the pseudo Tx rings.
2655  *
2656  * The following flags are used in srs_state to indicate flow control
2657  * conditions: SRS_TX_BLOCKED, SRS_TX_HIWAT, SRS_TX_WAKEUP_CLIENT.
2658  * SRS_TX_BLOCKED indicates out of Tx descs. The SRS expects a wakeup
2659  * from the driver below.
2660  * SRS_TX_HIWAT indicates that the packet count enqueued in the Tx SRS
2661  * exceeded the Tx hiwat and flow-control pressure is applied back to
2662  * the clients, which expect a wakeup when flow-control is relieved.
2663  * SRS_TX_WAKEUP_CLIENT gets set when (flag == MAC_TX_NO_ENQUEUE) and
2664  * mblks get returned back to the client due to lack of Tx descs or due
2665  * to bw control reasons; the client expects a wakeup when relieved.
2666  *
2667  * The fourth argument to mac_tx() is the flag. Normally it will be 0 but
2668  * some clients set the following values too: MAC_DROP_ON_NO_DESC,
2669  * MAC_TX_NO_ENQUEUE.
2670  * Mac clients that do not want packets to be enqueued in the mac layer set
2671  * MAC_DROP_ON_NO_DESC value. The packets won't be queued in the Tx SRS or
2672  * Tx soft rings but instead get dropped when the NIC runs out of desc. The
2673  * behaviour of this flag is different when the Tx is running in serializer
2674  * or bandwidth mode. Under these (serializer, bandwidth) modes, packets
2675  * get dropped when the Tx high watermark is reached.
2676  * There are some mac clients like vsw, aggr that want the mblks to be
2677  * returned back to clients instead of being queued in Tx SRS (or Tx soft
2678  * rings) under flow-control (i.e., out of desc or exceeding bw limits)
2679  * conditions. These clients call mac_tx() with MAC_TX_NO_ENQUEUE flag set.
2680  * In the default and Tx fanout mode, the un-transmitted mblks will be
2681  * returned back to the clients when the driver runs out of Tx descs.
2682  * SRS_TX_WAKEUP_CLIENT (or S_RING_WAKEUP_CLIENT) will be set in SRS (or
2683  * soft ring) so that the clients can be woken up when Tx descs become
2684  * available. When running in serializer or bandwidth mode,
2685  * SRS_TX_WAKEUP_CLIENT will be set when the tx hi-watermark is reached.
2686  */
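
/*
 * Illustrative caller sketch (hypothetical client code; mch, mp_chain
 * and fanout_hint are assumed to exist, and mac_tx() is the entry
 * point declared in mac_client_priv.h): a client that would rather
 * drop than have packets queued in the mac layer might do:
 *
 *	mblk_t		*ret_mp = NULL;
 *	mac_tx_cookie_t	cookie;
 *
 *	cookie = mac_tx(mch, mp_chain, fanout_hint,
 *	    MAC_DROP_ON_NO_DESC, &ret_mp);
 *	if (cookie != NULL) {
 *		(flow controlled; wait for the tx notify callback
 *		carrying this cookie before sending again)
 *	}
 *
 * With MAC_TX_NO_ENQUEUE instead, the un-transmitted tail of the
 * chain comes back in ret_mp and SRS_TX_WAKEUP_CLIENT is set so the
 * client is notified when the condition is relieved.
 */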
2687 
2688 mac_tx_func_t
2689 mac_tx_get_func(uint32_t mode)
2690 {
2691 	return (mac_tx_mode_list[mode].mac_tx_func);
2692 }
2693 
2694 /* ARGSUSED */
2695 static mac_tx_cookie_t
2696 mac_tx_single_ring_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2697     uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
2698 {
2699 	mac_srs_tx_t		*srs_tx = &mac_srs->srs_tx;
2700 	mac_tx_stats_t		stats;
2701 	mac_tx_cookie_t		cookie = NULL;
2702 
2703 	ASSERT(srs_tx->st_mode == SRS_TX_DEFAULT);
2704 
2705 	/* Regular case with a single Tx ring */
2706 	/*
2707 	 * SRS_TX_BLOCKED is set when underlying NIC runs
2708 	 * out of Tx descs and messages start getting
2709 	 * queued. It won't get reset until
2710 	 * tx_srs_drain() completely drains out the
2711 	 * messages.
2712 	 */
2713 	if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) {
2714 		/* Tx descs/resources not available */
2715 		mutex_enter(&mac_srs->srs_lock);
2716 		if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) {
2717 			cookie = mac_tx_srs_no_desc(mac_srs, mp_chain,
2718 			    flag, ret_mp);
2719 			mutex_exit(&mac_srs->srs_lock);
2720 			return (cookie);
2721 		}
2722 		/*
2723 		 * While we were computing mblk count, the
2724 		 * flow control condition got relieved.
2725 		 * Continue with the transmission.
2726 		 */
2727 		mutex_exit(&mac_srs->srs_lock);
2728 	}
2729 
2730 	mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
2731 	    mp_chain, &stats);
2732 
2733 	/*
2734 	 * Multiple threads could be here sending packets.
2735 	 * Under such conditions, it is not possible to
2736 	 * atomically set the SRS_TX_BLOCKED bit to indicate
2737 	 * out of tx desc condition. To atomically set
2738 	 * this, we queue the returned packet and do
2739 	 * the setting of SRS_TX_BLOCKED in
2740 	 * mac_tx_srs_drain().
2741 	 */
2742 	if (mp_chain != NULL) {
2743 		mutex_enter(&mac_srs->srs_lock);
2744 		cookie = mac_tx_srs_no_desc(mac_srs, mp_chain, flag, ret_mp);
2745 		mutex_exit(&mac_srs->srs_lock);
2746 		return (cookie);
2747 	}
2748 	SRS_TX_STATS_UPDATE(mac_srs, &stats);
2749 
2750 	return (NULL);
2751 }
2752 
2753 /*
2754  * mac_tx_serialize_mode
2755  *
2756  * This is an experimental mode implemented as per the request of PAE.
2757  * In this mode, all callers attempting to send a packet to the NIC
2758  * will get serialized. Only one thread at any time will access the
2759  * NIC to send the packet out.
2760  */
2761 /* ARGSUSED */
2762 static mac_tx_cookie_t
2763 mac_tx_serializer_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2764     uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
2765 {
2766 	mac_tx_stats_t		stats;
2767 	mac_tx_cookie_t		cookie = NULL;
2768 	mac_srs_tx_t		*srs_tx = &mac_srs->srs_tx;
2769 
2770 	/* Single ring, serialize below */
2771 	ASSERT(srs_tx->st_mode == SRS_TX_SERIALIZE);
2772 	mutex_enter(&mac_srs->srs_lock);
2773 	if ((mac_srs->srs_first != NULL) ||
2774 	    (mac_srs->srs_state & SRS_PROC)) {
2775 		/*
2776 		 * In serialization mode, queue all packets until
2777 		 * TX_HIWAT is set.
2778 		 * If drop bit is set, drop if TX_HIWAT is set.
2779 		 * If no_enqueue is set, still enqueue until hiwat
2780 		 * is set and return mblks after TX_HIWAT is set.
2781 		 */
2782 		cookie = mac_tx_srs_enqueue(mac_srs, mp_chain,
2783 		    flag, NULL, ret_mp);
2784 		mutex_exit(&mac_srs->srs_lock);
2785 		return (cookie);
2786 	}
2787 	/*
2788 	 * No packets queued, nothing on proc and no flow
2789 	 * control condition. Fast-path, ok. Do inline
2790 	 * processing.
2791 	 */
2792 	mac_srs->srs_state |= SRS_PROC;
2793 	mutex_exit(&mac_srs->srs_lock);
2794 
2795 	mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
2796 	    mp_chain, &stats);
2797 
2798 	mutex_enter(&mac_srs->srs_lock);
2799 	mac_srs->srs_state &= ~SRS_PROC;
2800 	if (mp_chain != NULL) {
2801 		cookie = mac_tx_srs_enqueue(mac_srs,
2802 		    mp_chain, flag, NULL, ret_mp);
2803 	}
2804 	if (mac_srs->srs_first != NULL) {
2805 		/*
2806 		 * We processed our packet inline and new
2807 		 * packets got queued while we were
2808 		 * processing. Wake up the srs worker.
2809 		 */
2810 		cv_signal(&mac_srs->srs_async);
2811 	}
2812 	mutex_exit(&mac_srs->srs_lock);
2813 
2814 	if (cookie == NULL)
2815 		SRS_TX_STATS_UPDATE(mac_srs, &stats);
2816 
2817 	return (cookie);
2818 }
2819 
2820 /*
2821  * mac_tx_fanout_mode
2822  *
2823  * In this mode, the SRS will have access to multiple Tx rings to send
2824  * the packet out. The fanout hint that is passed as an argument is
2825  * used to find an appropriate ring to fanout the traffic. Each Tx
2826  * ring, in turn,  will have a soft ring associated with it. If a Tx
2827  * ring, in turn, will have a soft ring associated with it. If a Tx
2828  * ring runs out of Tx descs, the returned packet will be queued in
2829  * queue any packets.
2830  */
2831 
2832 #define	MAC_TX_SOFT_RING_PROCESS(chain) {		       		\
2833 	index = COMPUTE_INDEX(hash, mac_srs->srs_tx_ring_count);	\
2834 	softring = mac_srs->srs_tx_soft_rings[index];			\
2835 	cookie = mac_tx_soft_ring_process(softring, chain, flag, ret_mp); \
2836 	DTRACE_PROBE2(tx__fanout, uint64_t, hash, uint_t, index);	\
2837 }
2838 
2839 static mac_tx_cookie_t
2840 mac_tx_fanout_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2841     uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
2842 {
2843 	mac_soft_ring_t		*softring;
2844 	uint64_t		hash;
2845 	uint_t			index;
2846 	mac_tx_cookie_t		cookie = NULL;
2847 
2848 	ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT ||
2849 	    mac_srs->srs_tx.st_mode == SRS_TX_BW_FANOUT);
2850 	if (fanout_hint != 0) {
2851 		/*
2852 		 * The hint is specified by the caller, simply pass the
2853 		 * whole chain to the soft ring.
2854 		 */
2855 		hash = HASH_HINT(fanout_hint);
2856 		MAC_TX_SOFT_RING_PROCESS(mp_chain);
2857 	} else {
2858 		mblk_t *last_mp, *cur_mp, *sub_chain;
2859 		uint64_t last_hash = 0;
2860 		uint_t media = mac_srs->srs_mcip->mci_mip->mi_info.mi_media;
2861 
2862 		/*
2863 		 * Compute the hash from the contents (headers) of the
2864 		 * packets of the mblk chain. Split the chains into
2865 		 * subchains of the same conversation.
2866 		 *
2867 		 * Since more than one ring may be used for
2868 		 * sub-chains of the same call, and since the caller
2869 		 * does not maintain per-conversation state (it
2870 		 * passed a zero hint), unsent subchains will be
2871 		 * dropped.
2872 		 */
2873 
2874 		flag |= MAC_DROP_ON_NO_DESC;
2875 		ret_mp = NULL;
2876 
2877 		ASSERT(ret_mp == NULL);
2878 
2879 		sub_chain = NULL;
2880 		last_mp = NULL;
2881 
2882 		for (cur_mp = mp_chain; cur_mp != NULL;
2883 		    cur_mp = cur_mp->b_next) {
2884 			hash = mac_pkt_hash(media, cur_mp, MAC_PKT_HASH_L4,
2885 			    B_TRUE);
2886 			if (last_hash != 0 && hash != last_hash) {
2887 				/*
2888 				 * Starting a different subchain, send current
2889 				 * chain out.
2890 				 */
2891 				ASSERT(last_mp != NULL);
2892 				last_mp->b_next = NULL;
2893 				MAC_TX_SOFT_RING_PROCESS(sub_chain);
2894 				sub_chain = NULL;
2895 			}
2896 
2897 			/* add packet to subchain */
2898 			if (sub_chain == NULL)
2899 				sub_chain = cur_mp;
2900 			last_mp = cur_mp;
2901 			last_hash = hash;
2902 		}
2903 
2904 		if (sub_chain != NULL) {
2905 			/* send last subchain */
2906 			ASSERT(last_mp != NULL);
2907 			last_mp->b_next = NULL;
2908 			MAC_TX_SOFT_RING_PROCESS(sub_chain);
2909 		}
2910 
2911 		cookie = NULL;
2912 	}
2913 
2914 	return (cookie);
2915 }
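
/*
 * Worked example for the zero-hint path above (illustrative): if the
 * per-packet L4 hashes of a chain come out as A A B B A, the loop
 * emits three sub-chains, (A A), (B B) and (A), each pushed through
 * MAC_TX_SOFT_RING_PROCESS() on the ring its hash selects. Because
 * the caller kept no per-conversation state, a sub-chain that cannot
 * be sent is dropped (MAC_DROP_ON_NO_DESC) rather than returned.
 */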
2916 
2917 /*
2918  * mac_tx_bw_mode
2919  *
2920  * In the bandwidth mode, Tx srs will allow packets to go down to Tx ring
2921  * only if bw is available. Otherwise the packets will be queued in
2922  * SRS. If the SRS has multiple Tx rings, then packets will get fanned
2923  * out to the Tx rings.
2924  */
2925 static mac_tx_cookie_t
2926 mac_tx_bw_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2927     uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
2928 {
2929 	int			cnt, sz;
2930 	mblk_t			*tail;
2931 	mac_tx_cookie_t		cookie = NULL;
2932 	mac_srs_tx_t		*srs_tx = &mac_srs->srs_tx;
2933 	clock_t			now;
2934 
2935 	ASSERT(TX_BANDWIDTH_MODE(mac_srs));
2936 	ASSERT(mac_srs->srs_type & SRST_BW_CONTROL);
2937 	mutex_enter(&mac_srs->srs_lock);
2938 	if (mac_srs->srs_bw->mac_bw_limit == 0) {
2939 		/*
2940 		 * zero bandwidth, no traffic is sent: drop the packets,
2941 		 * or return the whole chain if the caller requests all
2942 		 * unsent packets back.
2943 		 */
2944 		if (flag & MAC_TX_NO_ENQUEUE) {
2945 			cookie = (mac_tx_cookie_t)mac_srs;
2946 			*ret_mp = mp_chain;
2947 		} else {
2948 			MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
2949 		}
2950 		mutex_exit(&mac_srs->srs_lock);
2951 		return (cookie);
2952 	} else if ((mac_srs->srs_first != NULL) ||
2953 	    (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
2954 		cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag,
2955 		    fanout_hint, ret_mp);
2956 		mutex_exit(&mac_srs->srs_lock);
2957 		return (cookie);
2958 	}
2959 	MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
2960 	now = ddi_get_lbolt();
2961 	if (mac_srs->srs_bw->mac_bw_curr_time != now) {
2962 		mac_srs->srs_bw->mac_bw_curr_time = now;
2963 		mac_srs->srs_bw->mac_bw_used = 0;
2964 	} else if (mac_srs->srs_bw->mac_bw_used >
2965 	    mac_srs->srs_bw->mac_bw_limit) {
2966 		mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
2967 		MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
2968 		    mp_chain, tail, cnt, sz);
2969 		/*
2970 		 * Wake up the worker thread. Note that the
2971 		 * worker thread has to be woken up so that it
2972 		 * can fire up the timer to be woken up
2973 		 * on the next tick. Also, once
2974 		 * BW_ENFORCED is set, it can only be
2975 		 * reset by the srs_worker thread. Until then
2976 		 * all packets will get queued up in the SRS
2977 		 * and hence this code path won't be
2978 		 * entered until BW_ENFORCED is reset.
2979 		 */
2980 		cv_signal(&mac_srs->srs_async);
2981 		mutex_exit(&mac_srs->srs_lock);
2982 		return (cookie);
2983 	}
2984 
2985 	mac_srs->srs_bw->mac_bw_used += sz;
2986 	mutex_exit(&mac_srs->srs_lock);
2987 
2988 	if (srs_tx->st_mode == SRS_TX_BW_FANOUT) {
2989 		mac_soft_ring_t *softring;
2990 		uint_t indx, hash;
2991 
2992 		hash = HASH_HINT(fanout_hint);
2993 		indx = COMPUTE_INDEX(hash,
2994 		    mac_srs->srs_tx_ring_count);
2995 		softring = mac_srs->srs_tx_soft_rings[indx];
2996 		return (mac_tx_soft_ring_process(softring, mp_chain, flag,
2997 		    ret_mp));
2998 	} else if (srs_tx->st_mode == SRS_TX_BW_AGGR) {
2999 		return (mac_tx_aggr_mode(mac_srs, mp_chain,
3000 		    fanout_hint, flag, ret_mp));
3001 	} else {
3002 		mac_tx_stats_t		stats;
3003 
3004 		mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
3005 		    mp_chain, &stats);
3006 
3007 		if (mp_chain != NULL) {
3008 			mutex_enter(&mac_srs->srs_lock);
3009 			MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
3010 			if (mac_srs->srs_bw->mac_bw_used > sz)
3011 				mac_srs->srs_bw->mac_bw_used -= sz;
3012 			else
3013 				mac_srs->srs_bw->mac_bw_used = 0;
3014 			cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag,
3015 			    fanout_hint, ret_mp);
3016 			mutex_exit(&mac_srs->srs_lock);
3017 			return (cookie);
3018 		}
3019 		SRS_TX_STATS_UPDATE(mac_srs, &stats);
3020 
3021 		return (NULL);
3022 	}
3023 }
3024 
3025 /*
3026  * mac_tx_aggr_mode
3027  *
3028  * This routine invokes an aggr function, aggr_find_tx_ring(), to find
3029  * a (pseudo) Tx ring belonging to a port on which the packet has to
3030  * be sent. aggr_find_tx_ring() first finds the outgoing port based on
3031  * L2/L3/L4 policy and then uses the fanout_hint passed to it to pick
3032  * a Tx ring from the selected port.
3033  *
3034  * Note that a port can be deleted from the aggregation. In such a case,
3035  * the aggregation layer first separates the port from the rest of the
3036  * ports making sure that port (and thus any Tx rings associated with
3037  * it) won't get selected in the call to aggr_find_tx_ring() function.
3038  * Later calls are made to mac_group_rem_ring() passing pseudo Tx ring
3039  * handles one by one which in turn will quiesce the Tx SRS and remove
3040  * the soft ring associated with the pseudo Tx ring. Unlike Rx side
3041  * where a cookie is used to protect against mac_rx_ring() calls on
3042  * rings that have been removed, no such cookie is needed on the Tx
3043  * side as the pseudo Tx ring won't be available anymore to
3044  * aggr_find_tx_ring() once the port has been removed.
3045  */
3046 static mac_tx_cookie_t
3047 mac_tx_aggr_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
3048     uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
3049 {
3050 	mac_srs_tx_t		*srs_tx = &mac_srs->srs_tx;
3051 	mac_tx_ring_fn_t	find_tx_ring_fn;
3052 	mac_ring_handle_t	ring = NULL;
3053 	void			*arg;
3054 	mac_soft_ring_t		*sringp;
3055 
3056 	find_tx_ring_fn = srs_tx->st_capab_aggr.mca_find_tx_ring_fn;
3057 	arg = srs_tx->st_capab_aggr.mca_arg;
3058 	if (find_tx_ring_fn(arg, mp_chain, fanout_hint, &ring) == NULL)
3059 		return (NULL);
3060 	sringp = srs_tx->st_soft_rings[((mac_ring_t *)ring)->mr_index];
3061 	return (mac_tx_soft_ring_process(sringp, mp_chain, flag, ret_mp));
3062 }
3063 
3064 void
3065 mac_tx_invoke_callbacks(mac_client_impl_t *mcip, mac_tx_cookie_t cookie)
3066 {
3067 	mac_cb_t *mcb;
3068 	mac_tx_notify_cb_t *mtnfp;
3069 
3070 	/* Wakeup callback registered clients */
3071 	MAC_CALLBACK_WALKER_INC(&mcip->mci_tx_notify_cb_info);
3072 	for (mcb = mcip->mci_tx_notify_cb_list; mcb != NULL;
3073 	    mcb = mcb->mcb_nextp) {
3074 		mtnfp = (mac_tx_notify_cb_t *)mcb->mcb_objp;
3075 		mtnfp->mtnf_fn(mtnfp->mtnf_arg, cookie);
3076 	}
3077 	MAC_CALLBACK_WALKER_DCR(&mcip->mci_tx_notify_cb_info,
3078 	    &mcip->mci_tx_notify_cb_list);
3079 }
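
/*
 * Registration-side sketch (hypothetical client code; assumes the
 * mac_client_tx_notify() interface exported by mac_client.h):
 *
 *	static void
 *	xx_tx_notify(void *arg, mac_tx_cookie_t cookie)
 *	{
 *		(the SRS/soft ring named by cookie has drained;
 *		resume transmitting)
 *	}
 *
 *	(void) mac_client_tx_notify(mch, xx_tx_notify, xx_arg);
 *
 * mac_tx_invoke_callbacks() above walks the resulting list and hands
 * each callback the cookie returned by the blocked mac_tx() call.
 */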
3080 
3081 /* ARGSUSED */
3082 void
3083 mac_tx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
3084 {
3085 	mblk_t			*head, *tail;
3086 	size_t			sz;
3087 	uint32_t		tx_mode;
3088 	uint_t			saved_pkt_count;
3089 	mac_tx_stats_t		stats;
3090 	mac_srs_tx_t		*srs_tx = &mac_srs->srs_tx;
3091 	clock_t			now;
3092 
3093 	saved_pkt_count = 0;
3094 	ASSERT(mutex_owned(&mac_srs->srs_lock));
3095 	ASSERT(!(mac_srs->srs_state & SRS_PROC));
3096 
3097 	mac_srs->srs_state |= SRS_PROC;
3098 
3099 	tx_mode = srs_tx->st_mode;
3100 	if (tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_SERIALIZE) {
3101 		if (mac_srs->srs_first != NULL) {
3102 			head = mac_srs->srs_first;
3103 			tail = mac_srs->srs_last;
3104 			saved_pkt_count = mac_srs->srs_count;
3105 			mac_srs->srs_first = NULL;
3106 			mac_srs->srs_last = NULL;
3107 			mac_srs->srs_count = 0;
3108 			mutex_exit(&mac_srs->srs_lock);
3109 
3110 			head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
3111 			    head, &stats);
3112 
3113 			mutex_enter(&mac_srs->srs_lock);
3114 			if (head != NULL) {
3115 				/* Device out of tx desc, set block */
3116 				if (head->b_next == NULL)
3117 					VERIFY(head == tail);
3118 				tail->b_next = mac_srs->srs_first;
3119 				mac_srs->srs_first = head;
3120 				mac_srs->srs_count +=
3121 				    (saved_pkt_count - stats.mts_opackets);
3122 				if (mac_srs->srs_last == NULL)
3123 					mac_srs->srs_last = tail;
3124 				MAC_TX_SRS_BLOCK(mac_srs, head);
3125 			} else {
3126 				srs_tx->st_woken_up = B_FALSE;
3127 				SRS_TX_STATS_UPDATE(mac_srs, &stats);
3128 			}
3129 		}
3130 	} else if (tx_mode == SRS_TX_BW) {
3131 		/*
3132 		 * We are here because the timer fired and we have some data
3133 		 * to transmit. Also, mac_tx_srs_worker should have reset the
3134 		 * SRS_BW_ENFORCED flag.
3135 		 */
3136 		ASSERT(!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED));
3137 		head = tail = mac_srs->srs_first;
3138 		while (mac_srs->srs_first != NULL) {
3139 			tail = mac_srs->srs_first;
3140 			tail->b_prev = NULL;
3141 			mac_srs->srs_first = tail->b_next;
3142 			if (mac_srs->srs_first == NULL)
3143 				mac_srs->srs_last = NULL;
3144 			mac_srs->srs_count--;
3145 			sz = msgdsize(tail);
3146 			mac_srs->srs_size -= sz;
3147 			saved_pkt_count++;
3148 			MAC_TX_UPDATE_BW_INFO(mac_srs, sz);
3149 
3150 			if (mac_srs->srs_bw->mac_bw_used <
3151 			    mac_srs->srs_bw->mac_bw_limit)
3152 				continue;
3153 
3154 			now = ddi_get_lbolt();
3155 			if (mac_srs->srs_bw->mac_bw_curr_time != now) {
3156 				mac_srs->srs_bw->mac_bw_curr_time = now;
3157 				mac_srs->srs_bw->mac_bw_used = sz;
3158 				continue;
3159 			}
3160 			mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
3161 			break;
3162 		}
3163 
3164 		ASSERT((head == NULL && tail == NULL) ||
3165 		    (head != NULL && tail != NULL));
3166 		if (tail != NULL) {
3167 			tail->b_next = NULL;
3168 			mutex_exit(&mac_srs->srs_lock);
3169 
3170 			head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
3171 			    head, &stats);
3172 
3173 			mutex_enter(&mac_srs->srs_lock);
3174 			if (head != NULL) {
3175 				uint_t size_sent;
3176 
3177 				/* Device out of tx desc, set block */
3178 				if (head->b_next == NULL)
3179 					VERIFY(head == tail);
3180 				tail->b_next = mac_srs->srs_first;
3181 				mac_srs->srs_first = head;
3182 				mac_srs->srs_count +=
3183 				    (saved_pkt_count - stats.mts_opackets);
3184 				if (mac_srs->srs_last == NULL)
3185 					mac_srs->srs_last = tail;
3186 				size_sent = sz - stats.mts_obytes;
3187 				mac_srs->srs_size += size_sent;
3188 				mac_srs->srs_bw->mac_bw_sz += size_sent;
3189 				if (mac_srs->srs_bw->mac_bw_used > size_sent) {
3190 					mac_srs->srs_bw->mac_bw_used -=
3191 					    size_sent;
3192 				} else {
3193 					mac_srs->srs_bw->mac_bw_used = 0;
3194 				}
3195 				MAC_TX_SRS_BLOCK(mac_srs, head);
3196 			} else {
3197 				srs_tx->st_woken_up = B_FALSE;
3198 				SRS_TX_STATS_UPDATE(mac_srs, &stats);
3199 			}
3200 		}
3201 	} else if (tx_mode == SRS_TX_BW_FANOUT || tx_mode == SRS_TX_BW_AGGR) {
3202 		mblk_t *prev;
3203 		uint64_t hint;
3204 
3205 		/*
3206 		 * We are here because the timer fired and we
3207 		 * have some quota to transmit.
3208 		 */
3209 		prev = NULL;
3210 		head = tail = mac_srs->srs_first;
3211 		while (mac_srs->srs_first != NULL) {
3212 			tail = mac_srs->srs_first;
3213 			mac_srs->srs_first = tail->b_next;
3214 			if (mac_srs->srs_first == NULL)
3215 				mac_srs->srs_last = NULL;
3216 			mac_srs->srs_count--;
3217 			sz = msgdsize(tail);
3218 			mac_srs->srs_size -= sz;
3219 			mac_srs->srs_bw->mac_bw_used += sz;
3220 			if (prev == NULL)
3221 				hint = (ulong_t)tail->b_prev;
3222 			if (hint != (ulong_t)tail->b_prev) {
3223 				prev->b_next = NULL;
3224 				mutex_exit(&mac_srs->srs_lock);
3225 				TX_SRS_TO_SOFT_RING(mac_srs, head, hint);
3226 				head = tail;
3227 				hint = (ulong_t)tail->b_prev;
3228 				mutex_enter(&mac_srs->srs_lock);
3229 			}
3230 
3231 			prev = tail;
3232 			tail->b_prev = NULL;
3233 			if (mac_srs->srs_bw->mac_bw_used <
3234 			    mac_srs->srs_bw->mac_bw_limit)
3235 				continue;
3236 
3237 			now = ddi_get_lbolt();
3238 			if (mac_srs->srs_bw->mac_bw_curr_time != now) {
3239 				mac_srs->srs_bw->mac_bw_curr_time = now;
3240 				mac_srs->srs_bw->mac_bw_used = 0;
3241 				continue;
3242 			}
3243 			mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
3244 			break;
3245 		}
3246 		ASSERT((head == NULL && tail == NULL) ||
3247 		    (head != NULL && tail != NULL));
3248 		if (tail != NULL) {
3249 			tail->b_next = NULL;
3250 			mutex_exit(&mac_srs->srs_lock);
3251 			TX_SRS_TO_SOFT_RING(mac_srs, head, hint);
3252 			mutex_enter(&mac_srs->srs_lock);
3253 		}
3254 	}
3255 	/*
3256 	 * The SRS_TX_FANOUT case is not considered here because
3257 	 * packets won't be queued in the SRS for that case. Packets
3258 	 * will be sent directly to the soft rings underneath, and if
3259 	 * there is any queueing at all, it would be in the Tx-side
3260 	 * soft rings.
3261 	 */
3262 
3263 	/*
3264 	 * When srs_count becomes 0, reset SRS_TX_HIWAT and
3265 	 * SRS_TX_WAKEUP_CLIENT, and wake up the registered clients.
3266 	 */
3267 	if (mac_srs->srs_count == 0 && (mac_srs->srs_state &
3268 	    (SRS_TX_HIWAT | SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED))) {
3269 		mac_client_impl_t *mcip = mac_srs->srs_mcip;
3270 		boolean_t wakeup_required = B_FALSE;
3271 
3272 		if (mac_srs->srs_state &
3273 		    (SRS_TX_HIWAT|SRS_TX_WAKEUP_CLIENT)) {
3274 			wakeup_required = B_TRUE;
3275 		}
3276 		mac_srs->srs_state &= ~(SRS_TX_HIWAT |
3277 		    SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED);
3278 		mutex_exit(&mac_srs->srs_lock);
3279 		if (wakeup_required) {
3280 			mac_tx_invoke_callbacks(mcip, (mac_tx_cookie_t)mac_srs);
3281 			/*
3282 			 * If the client is not the primary MAC client, then we
3283 			 * need to send the notification to the client's upper
3284 			 * MAC, i.e. mci_upper_mip.
3285 			 */
3286 			mac_tx_notify(mcip->mci_upper_mip != NULL ?
3287 			    mcip->mci_upper_mip : mcip->mci_mip);
3288 		}
3289 		mutex_enter(&mac_srs->srs_lock);
3290 	}
3291 	mac_srs->srs_state &= ~SRS_PROC;
3292 }
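
/*
 * Illustrative recap of the bandwidth window logic used by the drain
 * paths above (a sketch, not live code; it mirrors the
 * SRS_TX_BW_FANOUT branch -- the SRS_TX_BW branch instead charges the
 * just-dequeued packet's size to the new tick):
 *
 *	if (bw->mac_bw_used < bw->mac_bw_limit)
 *		continue;			(under quota, keep sending)
 *	now = ddi_get_lbolt();
 *	if (bw->mac_bw_curr_time != now) {
 *		bw->mac_bw_curr_time = now;	(new tick, reopen window)
 *		bw->mac_bw_used = 0;
 *		continue;
 *	}
 *	bw->mac_bw_state |= SRS_BW_ENFORCED;	(over quota this tick)
 *	break;				(wait for the timer to refire)
 */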
3293 
3294 /*
3295  * Given a packet, get the flow_entry that identifies the flow
3296  * to which that packet belongs. The flow_entry will contain
3297  * the transmit function to be used to send the packet. If the
3298  * function returns NULL, the packet should be sent using the
3299  * underlying NIC.
3300  */
3301 static flow_entry_t *
3302 mac_tx_classify(mac_impl_t *mip, mblk_t *mp)
3303 {
3304 	flow_entry_t		*flent = NULL;
3305 	mac_client_impl_t	*mcip;
3306 	int	err;
3307 
3308 	/*
3309 	 * Do classification on the packet.
3310 	 */
3311 	err = mac_flow_lookup(mip->mi_flow_tab, mp, FLOW_OUTBOUND, &flent);
3312 	if (err != 0)
3313 		return (NULL);
3314 
3315 	/*
3316 	 * This flent might just be an additional one on the MAC client,
3317 	 * i.e. one used for classification purposes (different fdesc);
3318 	 * however, the resources, SRS et al., are on the mci_flent, so if
3319 	 * this isn't the mci_flent, we need to get it.
3320 	 */
3321 	if ((mcip = flent->fe_mcip) != NULL && mcip->mci_flent != flent) {
3322 		FLOW_REFRELE(flent);
3323 		flent = mcip->mci_flent;
3324 		FLOW_TRY_REFHOLD(flent, err);
3325 		if (err != 0)
3326 			return (NULL);
3327 	}
3328 
3329 	return (flent);
3330 }
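
/*
 * Illustrative sketch (not part of this file) of the expected calling
 * pattern, mirroring the slowpath of mac_tx_send() below: a non-NULL
 * return is a held flow entry that the caller must release with
 * FLOW_REFRELE(); a NULL return means the packet should go straight
 * down to the NIC.
 *
 *	flow_entry_t *flent = mac_tx_classify(mip, mp);
 *
 *	if (flent == NULL) {
 *		MAC_TX(mip, ring, mp, src_mcip);
 *	} else {
 *		(flent->fe_cb_fn)(flent->fe_cb_arg1,
 *		    flent->fe_cb_arg2, mp, do_switch);
 *		FLOW_REFRELE(flent);
 *	}
 */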
3331 
3332 /*
3333  * This macro is only meant to be used by mac_tx_send().
3334  */
3335 #define	CHECK_VID_AND_ADD_TAG(mp) {			\
3336 	if (vid_check) {				\
3337 		int err = 0;				\
3338 							\
3339 		MAC_VID_CHECK(src_mcip, (mp), err);	\
3340 		if (err != 0) {				\
3341 			freemsg((mp));			\
3342 			(mp) = next;			\
3343 			oerrors++;			\
3344 			continue;			\
3345 		}					\
3346 	}						\
3347 	if (add_tag) {					\
3348 		(mp) = mac_add_vlan_tag((mp), 0, vid);	\
3349 		if ((mp) == NULL) {			\
3350 			(mp) = next;			\
3351 			oerrors++;			\
3352 			continue;			\
3353 		}					\
3354 	}						\
3355 }
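
/*
 * To make the macro's implicit contract visible, here is a
 * hypothetical expansion context (illustrative only): it relies on
 * vid_check, add_tag, vid, src_mcip, next and oerrors being in scope,
 * and it must sit inside a loop, since it uses 'continue' to skip to
 * the next packet.
 *
 *	while (mp != NULL) {
 *		next = mp->b_next;
 *		...
 *		CHECK_VID_AND_ADD_TAG(mp);
 *		...
 *		mp = next;
 *	}
 */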
3356 
3357 mblk_t *
3358 mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain,
3359     mac_tx_stats_t *stats)
3360 {
3361 	mac_client_impl_t *src_mcip = (mac_client_impl_t *)mch;
3362 	mac_impl_t *mip = src_mcip->mci_mip;
3363 	uint_t obytes = 0, opackets = 0, oerrors = 0;
3364 	mblk_t *mp = NULL, *next;
3365 	boolean_t vid_check, add_tag;
3366 	uint16_t vid = 0;
3367 
3368 	if (mip->mi_nclients > 1) {
3369 		vid_check = MAC_VID_CHECK_NEEDED(src_mcip);
3370 		add_tag = MAC_TAG_NEEDED(src_mcip);
3371 		if (add_tag)
3372 			vid = mac_client_vid(mch);
3373 	} else {
3374 		ASSERT(mip->mi_nclients == 1);
3375 		vid_check = add_tag = B_FALSE;
3376 	}
3377 
3378 	/*
3379 	 * Fastpath: if there's only one active client, we simply send
3380 	 * the packet down to the underlying NIC.
3381 	 */
3382 	if (mip->mi_nactiveclients == 1) {
3383 		DTRACE_PROBE2(fastpath,
3384 		    mac_client_impl_t *, src_mcip, mblk_t *, mp_chain);
3385 
3386 		mp = mp_chain;
3387 		while (mp != NULL) {
3388 			next = mp->b_next;
3389 			mp->b_next = NULL;
3390 			opackets++;
3391 			obytes += (mp->b_cont == NULL ? MBLKL(mp) :
3392 			    msgdsize(mp));
3393 
3394 			CHECK_VID_AND_ADD_TAG(mp);
3395 			MAC_TX(mip, ring, mp, src_mcip);
3396 
3397 			/*
3398 			 * If the driver is out of descriptors and does a
3399 			 * partial send, it will return a chain of unsent
3400 			 * mblks. Adjust the accounting stats.
3401 			 */
3402 			if (mp != NULL) {
3403 				opackets--;
3404 				obytes -= msgdsize(mp);
3405 				mp->b_next = next;
3406 				break;
3407 			}
3408 			mp = next;
3409 		}
3410 		goto done;
3411 	}
3412 
3413 	/*
3414 	 * No fastpath, we either have more than one MAC client
3415 	 * defined on top of the same MAC, or one or more MAC
3416 	 * client promiscuous callbacks.
3417 	 */
3418 	DTRACE_PROBE3(slowpath, mac_client_impl_t *,
3419 	    src_mcip, int, mip->mi_nclients, mblk_t *, mp_chain);
3420 
3421 	mp = mp_chain;
3422 	while (mp != NULL) {
3423 		flow_entry_t *dst_flow_ent;
3424 		void *flow_cookie;
3425 		size_t	pkt_size;
3426 		mblk_t *mp1;
3427 
3428 		next = mp->b_next;
3429 		mp->b_next = NULL;
3430 		opackets++;
3431 		pkt_size = (mp->b_cont == NULL ? MBLKL(mp) : msgdsize(mp));
3432 		obytes += pkt_size;
3433 		CHECK_VID_AND_ADD_TAG(mp);
3434 
3435 		/*
3436 		 * Find the destination.
3437 		 */
3438 		dst_flow_ent = mac_tx_classify(mip, mp);
3439 
3440 		if (dst_flow_ent != NULL) {
3441 			size_t	hdrsize;
3442 			int	err = 0;
3443 
3444 			if (mip->mi_info.mi_nativemedia == DL_ETHER) {
3445 				struct ether_vlan_header *evhp =
3446 				    (struct ether_vlan_header *)mp->b_rptr;
3447 
3448 				if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN)
3449 					hdrsize = sizeof (*evhp);
3450 				else
3451 					hdrsize = sizeof (struct ether_header);
3452 			} else {
3453 				mac_header_info_t	mhi;
3454 
3455 				err = mac_header_info((mac_handle_t)mip,
3456 				    mp, &mhi);
3457 				if (err == 0)
3458 					hdrsize = mhi.mhi_hdrsize;
3459 			}
3460 
3461 			/*
3462 			 * Got a matching flow. It's either another
3463 			 * MAC client, or a broadcast/multicast flow.
3464 			 * Make sure the packet size is within the
3465 			 * allowed size. If not, drop the packet and
3466 			 * move on to the next one.
3467 			 */
3468 			if (err != 0 ||
3469 			    (pkt_size - hdrsize) > mip->mi_sdu_max) {
3470 				oerrors++;
3471 				DTRACE_PROBE2(loopback__drop, size_t, pkt_size,
3472 				    mblk_t *, mp);
3473 				freemsg(mp);
3474 				mp = next;
3475 				FLOW_REFRELE(dst_flow_ent);
3476 				continue;
3477 			}
3478 			flow_cookie = mac_flow_get_client_cookie(dst_flow_ent);
3479 			if (flow_cookie != NULL) {
3480 				/*
3481 				 * The vnic_bcast_send function expects
3482 				 * to receive the sender MAC client
3483 				 * as value for arg2.
3484 				 */
3485 				mac_bcast_send(flow_cookie, src_mcip, mp,
3486 				    B_TRUE);
3487 			} else {
3488 				/*
3489 				 * Loop the packet back to a local MAC
3490 				 * client. We force a context switch
3491 				 * if both source and destination MAC
3492 				 * clients are used by IP, i.e.
3493 				 * bypass is set.
3494 				 */
3495 				boolean_t do_switch;
3496 				mac_client_impl_t *dst_mcip =
3497 				    dst_flow_ent->fe_mcip;
3498 
3499 				/*
3500 				 * Check if there are promiscuous mode
3501 				 * callbacks defined. This check is
3502 				 * done here in the 'else' case and
3503 				 * not in other cases because this
3504 				 * path is for local loopback
3505 				 * communication which does not go
3506 				 * through MAC_TX(). For paths that go
3507 				 * through MAC_TX(), the promisc_list
3508 				 * check is done inside the MAC_TX()
3509 				 * macro.
3510 				 */
3511 				if (mip->mi_promisc_list != NULL)
3512 					mac_promisc_dispatch(mip, mp, src_mcip);
3513 
3514 				do_switch = ((src_mcip->mci_state_flags &
3515 				    dst_mcip->mci_state_flags &
3516 				    MCIS_CLIENT_POLL_CAPABLE) != 0);
3517 
3518 				if ((mp1 = mac_fix_cksum(mp)) != NULL) {
3519 					(dst_flow_ent->fe_cb_fn)(
3520 					    dst_flow_ent->fe_cb_arg1,
3521 					    dst_flow_ent->fe_cb_arg2,
3522 					    mp1, do_switch);
3523 				}
3524 			}
3525 			FLOW_REFRELE(dst_flow_ent);
3526 		} else {
3527 			/*
3528 			 * Unknown destination, send via the underlying
3529 			 * NIC.
3530 			 */
3531 			MAC_TX(mip, ring, mp, src_mcip);
3532 			if (mp != NULL) {
3533 				/*
3534 				 * Adjust for the last packet that
3535 				 * could not be transmitted.
3536 				 */
3537 				opackets--;
3538 				obytes -= pkt_size;
3539 				mp->b_next = next;
3540 				break;
3541 			}
3542 		}
3543 		mp = next;
3544 	}
3545 
3546 done:
3547 	stats->mts_obytes = obytes;
3548 	stats->mts_opackets = opackets;
3549 	stats->mts_oerrors = oerrors;
3550 	return (mp);
3551 }
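
/*
 * Sketch of the mac_tx_send() caller contract (illustrative only,
 * mirroring the SRS_TX_BW drain logic earlier in this file): a
 * non-NULL return is the chain of unsent mblks, which the caller
 * requeues and uses to trigger flow control.
 *
 *	mac_tx_stats_t stats;
 *	mblk_t *rest;
 *
 *	rest = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
 *	    head, &stats);
 *	if (rest != NULL) {
 *		(requeue 'rest' on the SRS and block until
 *		mac_tx_srs_wakeup() reports descriptors again)
 *	} else {
 *		SRS_TX_STATS_UPDATE(mac_srs, &stats);
 *	}
 */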
3552 
3553 /*
3554  * mac_tx_srs_ring_present
3555  *
3556  * Returns whether the specified ring is part of the specified SRS.
3557  */
3558 boolean_t
3559 mac_tx_srs_ring_present(mac_soft_ring_set_t *srs, mac_ring_t *tx_ring)
3560 {
3561 	int i;
3562 	mac_soft_ring_t *soft_ring;
3563 
3564 	if (srs->srs_tx.st_arg2 == tx_ring)
3565 		return (B_TRUE);
3566 
3567 	for (i = 0; i < srs->srs_tx_ring_count; i++) {
3568 		soft_ring =  srs->srs_tx_soft_rings[i];
3569 		if (soft_ring->s_ring_tx_arg2 == tx_ring)
3570 			return (B_TRUE);
3571 	}
3572 
3573 	return (B_FALSE);
3574 }
3575 
3576 /*
3577  * mac_tx_srs_get_soft_ring
3578  *
3579  * Returns the TX soft ring associated with the given ring, if present.
3580  */
3581 mac_soft_ring_t *
3582 mac_tx_srs_get_soft_ring(mac_soft_ring_set_t *srs, mac_ring_t *tx_ring)
3583 {
3584 	int		i;
3585 	mac_soft_ring_t	*soft_ring;
3586 
3587 	if (srs->srs_tx.st_arg2 == tx_ring)
3588 		return (NULL);
3589 
3590 	for (i = 0; i < srs->srs_tx_ring_count; i++) {
3591 		soft_ring =  srs->srs_tx_soft_rings[i];
3592 		if (soft_ring->s_ring_tx_arg2 == tx_ring)
3593 			return (soft_ring);
3594 	}
3595 
3596 	return (NULL);
3597 }
3598 
3599 /*
3600  * mac_tx_srs_wakeup
3601  *
3602  * Called when Tx descriptors become available. Wake up the appropriate
3603  * worker thread after resetting the SRS_TX_BLOCKED/S_RING_BLOCK bit
3604  * in the state field.
3605  */
3606 void
3607 mac_tx_srs_wakeup(mac_soft_ring_set_t *mac_srs, mac_ring_handle_t ring)
3608 {
3609 	int i;
3610 	mac_soft_ring_t *sringp;
3611 	mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
3612 
3613 	mutex_enter(&mac_srs->srs_lock);
3614 	/*
3615 	 * srs_tx_ring_count == 0 is the single ring mode case. In
3616 	 * this mode, there will not be Tx soft rings associated
3617 	 * with the SRS.
3618 	 */
3619 	if (!MAC_TX_SOFT_RINGS(mac_srs)) {
3620 		if (srs_tx->st_arg2 == ring &&
3621 		    mac_srs->srs_state & SRS_TX_BLOCKED) {
3622 			mac_srs->srs_state &= ~SRS_TX_BLOCKED;
3623 			srs_tx->st_stat.mts_unblockcnt++;
3624 			cv_signal(&mac_srs->srs_async);
3625 		}
3626 		/*
3627 		 * A wakeup can come in before tx_srs_drain() can
3628 		 * grab the srs lock and set SRS_TX_BLOCKED, so
3629 		 * always set the woken_up flag when we come here.
3630 		 */
3631 		srs_tx->st_woken_up = B_TRUE;
3632 		mutex_exit(&mac_srs->srs_lock);
3633 		return;
3634 	}
3635 
3636 	/*
3637 	 * If we are here, it is the FANOUT, BW_FANOUT,
3638 	 * AGGR or BW_AGGR mode case.
3639 	 */
3640 	for (i = 0; i < mac_srs->srs_tx_ring_count; i++) {
3641 		sringp = mac_srs->srs_tx_soft_rings[i];
3642 		mutex_enter(&sringp->s_ring_lock);
3643 		if (sringp->s_ring_tx_arg2 == ring) {
3644 			if (sringp->s_ring_state & S_RING_BLOCK) {
3645 				sringp->s_ring_state &= ~S_RING_BLOCK;
3646 				sringp->s_st_stat.mts_unblockcnt++;
3647 				cv_signal(&sringp->s_ring_async);
3648 			}
3649 			sringp->s_ring_tx_woken_up = B_TRUE;
3650 		}
3651 		mutex_exit(&sringp->s_ring_lock);
3652 	}
3653 	mutex_exit(&mac_srs->srs_lock);
3654 }
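
/*
 * Hedged sketch of how this path is reached (not part of this file):
 * a driver that stalled a Tx ring for lack of descriptors is expected
 * to notify the MAC layer from its Tx-completion path once
 * descriptors are reclaimed -- e.g. via the mac_tx_ring_update()
 * provider routine -- which ultimately leads here to clear
 * SRS_TX_BLOCKED/S_RING_BLOCK and signal the worker:
 *
 *	(in a driver's Tx recycle path, illustrative)
 *	if (ring_was_stalled)
 *		mac_tx_ring_update(mac_handle, tx_ring_handle);
 */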
3655 
3656 /*
3657  * Once the driver is done draining, send a MAC_NOTE_TX notification to unleash
3658  * the blocked clients again.
3659  */
3660 void
3661 mac_tx_notify(mac_impl_t *mip)
3662 {
3663 	i_mac_notify(mip, MAC_NOTE_TX);
3664 }
3665 
3666 /*
3667  * RX SOFTRING RELATED FUNCTIONS
3668  *
3669  * These functions really belong in mac_soft_ring.c and are here
3670  * only for a short period.
3671  */
3672 
3673 #define	SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) {	       	\
3674 	/*								\
3675 	 * Enqueue our mblk chain.					\
3676 	 */								\
3677 	ASSERT(MUTEX_HELD(&(ringp)->s_ring_lock));			\
3678 									\
3679 	if ((ringp)->s_ring_last != NULL)				\
3680 		(ringp)->s_ring_last->b_next = (mp);			\
3681 	else								\
3682 		(ringp)->s_ring_first = (mp);				\
3683 	(ringp)->s_ring_last = (tail);					\
3684 	(ringp)->s_ring_count += (cnt);					\
3685 	ASSERT((ringp)->s_ring_count > 0);				\
3686 	if ((ringp)->s_ring_type & ST_RING_BW_CTL) {			\
3687 		(ringp)->s_ring_size += sz;				\
3688 	}								\
3689 }
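
/*
 * Worked example of the macro above (illustrative): with
 * s_ring_first -> a -> b and s_ring_last == b, enqueueing the chain
 * c -> d (mp == c, tail == d, cnt == 2) links b->b_next = c and
 * leaves s_ring_first == a, s_ring_last == d, s_ring_count
 * incremented by 2, and, for ST_RING_BW_CTL rings, s_ring_size
 * incremented by sz.
 */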
3690 
3691 /*
3692  * Default entry point to deliver a packet chain to a MAC client.
3693  * If the MAC client has flows, do the classification with these
3694  * flows as well.
3695  */
3696 /* ARGSUSED */
3697 void
3698 mac_rx_deliver(void *arg1, mac_resource_handle_t mrh, mblk_t *mp_chain,
3699     mac_header_info_t *arg3)
3700 {
3701 	mac_client_impl_t *mcip = arg1;
3702 
3703 	if (mcip->mci_nvids == 1 &&
3704 	    !(mcip->mci_state_flags & MCIS_STRIP_DISABLE)) {
3705 		/*
3706 		 * If the client has exactly one VID associated with it
3707 		 * and stripping of the VLAN header is not disabled,
3708 		 * remove the VLAN tag from the packet before
3709 		 * passing it on to the client's receive callback.
3710 		 * Note that this needs to be done after we dispatch
3711 		 * the packet to the promiscuous listeners of the
3712 		 * client, since they expect to see the whole
3713 		 * frame including the VLAN headers.
3714 		 */
3715 		mp_chain = mac_strip_vlan_tag_chain(mp_chain);
3716 	}
3717 
3718 	mcip->mci_rx_fn(mcip->mci_rx_arg, mrh, mp_chain, B_FALSE);
3719 }
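
/*
 * Illustrative shape of the receive callback invoked above (the
 * function and argument names here are hypothetical): mci_rx_fn has
 * the mac_rx_t signature, and the final boolean indicates a loopback
 * delivery (B_FALSE in the call above).
 *
 *	static void
 *	my_client_rx(void *arg, mac_resource_handle_t mrh,
 *	    mblk_t *mp_chain, boolean_t loopback)
 *	{
 *		(walk mp_chain via b_next and consume the packets)
 *	}
 */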
3720 
3721 /*
3722  * mac_rx_soft_ring_process
3723  *
3724  * Process a chain for a given soft ring. If the number of packets
3725  * queued in the SRS and its associated soft rings (including this
3726  * one) is very small (tracked by srs_poll_pkt_cnt), then allow the
3727  * entering thread (interrupt or poll thread) to do the processing
3728  * inline. This helps keep the latency down under low load.
3729  *
3730  * The proc and arg for each mblk are already stored in the mblk at
3731  * the appropriate places.
3732  */
3733 /* ARGSUSED */
3734 void
3735 mac_rx_soft_ring_process(mac_client_impl_t *mcip, mac_soft_ring_t *ringp,
3736     mblk_t *mp_chain, mblk_t *tail, int cnt, size_t sz)
3737 {
3738 	mac_direct_rx_t		proc;
3739 	void			*arg1;
3740 	mac_resource_handle_t	arg2;
3741 	mac_soft_ring_set_t	*mac_srs = ringp->s_ring_set;
3742 
3743 	ASSERT(ringp != NULL);
3744 	ASSERT(mp_chain != NULL);
3745 	ASSERT(tail != NULL);
3746 	ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
3747 
3748 	mutex_enter(&ringp->s_ring_lock);
3749 	ringp->s_ring_total_inpkt += cnt;
3750 	ringp->s_ring_total_rbytes += sz;
3751 	if ((mac_srs->srs_rx.sr_poll_pkt_cnt <= 1) &&
3752 	    !(ringp->s_ring_type & ST_RING_WORKER_ONLY)) {
3753 		/* If being processed or blanking is on, enqueue and return */
3754 		if (ringp->s_ring_state & S_RING_BLANK ||
3755 		    ringp->s_ring_state & S_RING_PROC) {
3756 			SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
3757 			mutex_exit(&ringp->s_ring_lock);
3758 			return;
3759 		}
3760 		proc = ringp->s_ring_rx_func;
3761 		arg1 = ringp->s_ring_rx_arg1;
3762 		arg2 = ringp->s_ring_rx_arg2;
3763 		/*
3764 		 * See if anything is already queued. If this is the
3765 		 * first packet, do inline processing; else queue the
3766 		 * packet and do the drain.
3767 		 */
3768 		if (ringp->s_ring_first == NULL) {
3769 			/*
3770 			 * Fast-path, ok to process and nothing queued.
3771 			 */
3772 			ringp->s_ring_run = curthread;
3773 			ringp->s_ring_state |= (S_RING_PROC);
3774 
3775 			mutex_exit(&ringp->s_ring_lock);
3776 
3777 			/*
3778 			 * We have a chain of just one packet, so
3779 			 * go through this fast path.
3780 			 */
3781 			ASSERT(mp_chain->b_next == NULL);
3782 
3783 			(*proc)(arg1, arg2, mp_chain, NULL);
3784 
3785 			ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
3786 			/*
3787 			 * If we have a soft ring set which is doing
3788 			 * bandwidth control, we need to decrement
3789 			 * srs_size and count so that the SRS has an
3790 			 * accurate idea of how much data is really
3791 			 * queued between the SRS and its soft rings. We
3792 			 * decrement the counters only when the packet
3793 			 * gets processed by both SRS and the soft ring.
3794 			 */
3795 			mutex_enter(&mac_srs->srs_lock);
3796 			MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
3797 			MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz);
3798 			mutex_exit(&mac_srs->srs_lock);
3799 
3800 			mutex_enter(&ringp->s_ring_lock);
3801 			ringp->s_ring_run = NULL;
3802 			ringp->s_ring_state &= ~S_RING_PROC;
3803 			if (ringp->s_ring_state & S_RING_CLIENT_WAIT)
3804 				cv_signal(&ringp->s_ring_client_cv);
3805 
3806 			if ((ringp->s_ring_first == NULL) ||
3807 			    (ringp->s_ring_state & S_RING_BLANK)) {
3808 				/*
3809 				 * We processed inline our packet and
3810 				 * nothing new has arrived or our
3811 				 * receiver doesn't want to receive
3812 				 * any packets. We are done.
3813 				 */
3814 				mutex_exit(&ringp->s_ring_lock);
3815 				return;
3816 			}
3817 		} else {
3818 			SOFT_RING_ENQUEUE_CHAIN(ringp,
3819 			    mp_chain, tail, cnt, sz);
3820 		}
3821 
3822 		/*
3823 		 * We are here because either we couldn't do inline
3824 		 * processing (because something was already
3825 		 * queued), or we had a chain of more than one
3826 		 * packet, or something else arrived after we were
3827 		 * done with inline processing.
3828 		 */
3829 		ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
3830 		ASSERT(ringp->s_ring_first != NULL);
3831 
3832 		ringp->s_ring_drain_func(ringp);
3833 		mutex_exit(&ringp->s_ring_lock);
3834 		return;
3835 	} else {
3836 		/* ST_RING_WORKER_ONLY case */
3837 		SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
3838 		mac_soft_ring_worker_wakeup(ringp);
3839 		mutex_exit(&ringp->s_ring_lock);
3840 	}
3841 }
3842 
3843 /*
3844  * TX SOFTRING RELATED FUNCTIONS
3845  *
3846  * These functions really belong in mac_soft_ring.c and are here
3847  * only for a short period.
3848  */
3849 
3850 #define	TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) {	       	\
3851 	ASSERT(MUTEX_HELD(&ringp->s_ring_lock));			\
3852 	ringp->s_ring_state |= S_RING_ENQUEUED;				\
3853 	SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);	\
3854 }
3855 
3856 /*
3857  * mac_tx_sring_enqueue
3858  *
3859  * When we are out of transmit descriptors and we already have a
3860  * queue that exceeds hiwat (or the client called us with
3861  * MAC_TX_NO_ENQUEUE or MAC_DROP_ON_NO_DESC flag), return the
3862  * soft ring pointer as the opaque cookie so that the client can
3863  * enable flow control.
3864  */
3865 static mac_tx_cookie_t
3866 mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag,
3867     mblk_t **ret_mp)
3868 {
3869 	int cnt;
3870 	size_t sz;
3871 	mblk_t *tail;
3872 	mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
3873 	mac_tx_cookie_t cookie = NULL;
3874 	boolean_t wakeup_worker = B_TRUE;
3875 
3876 	ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
3877 	MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
3878 	if (flag & MAC_DROP_ON_NO_DESC) {
3879 		mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
3880 		/* increment freed stats */
3881 		ringp->s_ring_drops += cnt;
3882 		cookie = (mac_tx_cookie_t)ringp;
3883 	} else {
3884 		if (ringp->s_ring_first != NULL)
3885 			wakeup_worker = B_FALSE;
3886 
3887 		if (flag & MAC_TX_NO_ENQUEUE) {
3888 			/*
3889 			 * If nothing is queued yet, queue the packet
3890 			 * and let mac_tx_soft_ring_drain() set the
3891 			 * S_RING_BLOCK bit if Tx descs stay exhausted.
3892 			 * Otherwise, set S_RING_WAKEUP_CLIENT and
3893 			 * return the mblks to the caller.
3894 			 */
3895 			if (wakeup_worker) {
3896 				TX_SOFT_RING_ENQUEUE_CHAIN(ringp,
3897 				    mp_chain, tail, cnt, sz);
3898 			} else {
3899 				ringp->s_ring_state |= S_RING_WAKEUP_CLIENT;
3900 				cookie = (mac_tx_cookie_t)ringp;
3901 				*ret_mp = mp_chain;
3902 			}
3903 		} else {
3904 			boolean_t enqueue = B_TRUE;
3905 
3906 			if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
3907 				/*
3908 				 * Flow-controlled. Store ringp in the
3909 				 * cookie so that it can be returned to
3910 				 * the client as a mac_tx_cookie_t.
3911 				 */
3912 				ringp->s_ring_state |= S_RING_TX_HIWAT;
3913 				cookie = (mac_tx_cookie_t)ringp;
3914 				ringp->s_ring_hiwat_cnt++;
3915 				if (ringp->s_ring_count >
3916 				    ringp->s_ring_tx_max_q_cnt) {
3917 					/* increment freed stats */
3918 					ringp->s_ring_drops += cnt;
3919 					/*
3920 					 * b_prev may be set to the fanout hint
3921 					 * hence we can't use freemsg() directly.
3922 					 */
3923 					mac_pkt_drop(NULL, NULL,
3924 					    mp_chain, B_FALSE);
3925 					DTRACE_PROBE1(tx_queued_hiwat,
3926 					    mac_soft_ring_t *, ringp);
3927 					enqueue = B_FALSE;
3928 				}
3929 			}
3930 			if (enqueue) {
3931 				TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain,
3932 				    tail, cnt, sz);
3933 			}
3934 		}
3935 		if (wakeup_worker)
3936 			cv_signal(&ringp->s_ring_async);
3937 	}
3938 	return (cookie);
3939 }
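
/*
 * Hedged sketch of how the returned cookie is consumed (not part of
 * this file): a transmit that returns a non-NULL cookie means the
 * client is flow-controlled on that soft ring and should stop sending
 * until the Tx notification machinery seen earlier
 * (mac_tx_invoke_callbacks()/mac_tx_notify()) signals that it may
 * resume.
 *
 *	mac_tx_cookie_t cookie;
 *	mblk_t *rest = NULL;
 *
 *	cookie = mac_tx_soft_ring_process(ringp, chain, flag, &rest);
 *	if (cookie != NULL) {
 *		(stop sending; 'rest', if set, holds the mblks
 *		returned for MAC_TX_NO_ENQUEUE)
 *	}
 */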
3940 
3941 
3942 /*
3943  * mac_tx_soft_ring_process
3944  *
3945  * This routine is called when fanning out outgoing traffic among
3946  * multiple Tx rings.
3947  * Note that a soft ring is associated with a h/w Tx ring.
3948  */
3949 mac_tx_cookie_t
3950 mac_tx_soft_ring_process(mac_soft_ring_t *ringp, mblk_t *mp_chain,
3951     uint16_t flag, mblk_t **ret_mp)
3952 {
3953 	mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
3954 	int	cnt;
3955 	size_t	sz;
3956 	mblk_t	*tail;
3957 	mac_tx_cookie_t cookie = NULL;
3958 
3959 	ASSERT(ringp != NULL);
3960 	ASSERT(mp_chain != NULL);
3961 	ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
3962 	/*
3963 	 * The following modes can come here: SRS_TX_BW_FANOUT,
3964 	 * SRS_TX_FANOUT, SRS_TX_AGGR, SRS_TX_BW_AGGR.
3965 	 */
3966 	ASSERT(MAC_TX_SOFT_RINGS(mac_srs));
3967 	ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT ||
3968 	    mac_srs->srs_tx.st_mode == SRS_TX_BW_FANOUT ||
3969 	    mac_srs->srs_tx.st_mode == SRS_TX_AGGR ||
3970 	    mac_srs->srs_tx.st_mode == SRS_TX_BW_AGGR);
3971 
3972 	if (ringp->s_ring_type & ST_RING_WORKER_ONLY) {
3973 		/* Serialization mode */
3974 
3975 		mutex_enter(&ringp->s_ring_lock);
3976 		if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
3977 			cookie = mac_tx_sring_enqueue(ringp, mp_chain,
3978 			    flag, ret_mp);
3979 			mutex_exit(&ringp->s_ring_lock);
3980 			return (cookie);
3981 		}
3982 		MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
3983 		TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
3984 		if (ringp->s_ring_state & (S_RING_BLOCK | S_RING_PROC)) {
3985 			/*
3986 			 * If the ring is blocked due to lack of Tx
3987 			 * descs, just return. The worker thread
3988 			 * will get scheduled when Tx descs
3989 			 * become available.
3990 			 */
3991 			mutex_exit(&ringp->s_ring_lock);
3992 			return (cookie);
3993 		}
3994 		mac_soft_ring_worker_wakeup(ringp);
3995 		mutex_exit(&ringp->s_ring_lock);
3996 		return (cookie);
3997 	} else {
3998 		/* Default fanout mode */
3999 		/*
4000 		 * S_RING_BLOCK is set when the underlying NIC runs
4001 		 * out of Tx descs and messages start getting
4002 		 * queued. It won't get reset until
4003 		 * tx_srs_drain() completely drains out the
4004 		 * messages.
4005 		 */
4006 		mac_tx_stats_t		stats;
4007 
4008 		if (ringp->s_ring_state & S_RING_ENQUEUED) {
4009 			/* Tx descs/resources not available */
4010 			mutex_enter(&ringp->s_ring_lock);
4011 			if (ringp->s_ring_state & S_RING_ENQUEUED) {
4012 				cookie = mac_tx_sring_enqueue(ringp, mp_chain,
4013 				    flag, ret_mp);
4014 				mutex_exit(&ringp->s_ring_lock);
4015 				return (cookie);
4016 			}
4017 			/*
4018 			 * While we were computing mblk count, the
4019 			 * flow control condition got relieved.
4020 			 * Continue with the transmission.
4021 			 */
4022 			mutex_exit(&ringp->s_ring_lock);
4023 		}
4024 
4025 		mp_chain = mac_tx_send(ringp->s_ring_tx_arg1,
4026 		    ringp->s_ring_tx_arg2, mp_chain, &stats);
4027 
4028 		/*
4029 		 * Multiple threads could be here sending packets.
4030 		 * Under such conditions, it is not possible to
4031 		 * atomically set the S_RING_BLOCK bit to indicate
4032 		 * an out-of-Tx-desc condition. To set it
4033 		 * atomically, we queue the returned packet and do
4034 		 * the setting of S_RING_BLOCK in
4035 		 * mac_tx_soft_ring_drain().
4036 		 */
4037 		if (mp_chain != NULL) {
4038 			mutex_enter(&ringp->s_ring_lock);
4039 			cookie =
4040 			    mac_tx_sring_enqueue(ringp, mp_chain, flag, ret_mp);
4041 			mutex_exit(&ringp->s_ring_lock);
4042 			return (cookie);
4043 		}
4044 		SRS_TX_STATS_UPDATE(mac_srs, &stats);
4045 		SOFTRING_TX_STATS_UPDATE(ringp, &stats);
4046 
4047 		return (NULL);
4048 	}
4049 }
4050