xref: /titanic_52/usr/src/uts/common/io/mac/mac_sched.c (revision 6d2259e1baf8d4ac11c96570f45ecdcd9771a68d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/callb.h>
28 #include <sys/sdt.h>
29 #include <sys/strsubr.h>
30 #include <sys/strsun.h>
31 #include <sys/vlan.h>
32 #include <inet/ipsec_impl.h>
33 #include <inet/ip_impl.h>
34 #include <inet/sadb.h>
35 #include <inet/ipsecesp.h>
36 #include <inet/ipsecah.h>
37 #include <inet/ip6.h>
38 
39 #include <sys/mac_impl.h>
40 #include <sys/mac_client_impl.h>
41 #include <sys/mac_client_priv.h>
42 #include <sys/mac_soft_ring.h>
43 #include <sys/mac_flow_impl.h>
44 
/*
 * Forward declarations of the per-mode Tx entry points referenced by
 * mac_tx_mode_list[] below. All four share the same signature.
 */
45 static mac_tx_cookie_t mac_tx_single_ring_mode(mac_soft_ring_set_t *, mblk_t *,
46     uintptr_t, uint16_t, mblk_t **);
47 static mac_tx_cookie_t mac_tx_serializer_mode(mac_soft_ring_set_t *, mblk_t *,
48     uintptr_t, uint16_t, mblk_t **);
49 static mac_tx_cookie_t mac_tx_fanout_mode(mac_soft_ring_set_t *, mblk_t *,
50     uintptr_t, uint16_t, mblk_t **);
51 static mac_tx_cookie_t mac_tx_bw_mode(mac_soft_ring_set_t *, mblk_t *,
52     uintptr_t, uint16_t, mblk_t **);
53 
/* Pairs a Tx SRS mode with the function that implements it. */
54 typedef struct mac_tx_mode_s {
55 	mac_tx_srs_mode_t	mac_tx_mode;
56 	mac_tx_func_t		mac_tx_func;
57 } mac_tx_mode_t;
58 
59 /*
60  * There are five modes of operation on the Tx side. These modes get set
61  * in mac_tx_srs_setup(). Except for the experimental TX_SERIALIZE mode,
62  * none of the other modes are user configurable. They get selected by
63  * the system depending upon whether the link (or flow) has multiple Tx
64  * rings or a bandwidth configured, etc.
 *
 * Note that SRS_TX_BW and SRS_TX_BW_FANOUT are both served by
 * mac_tx_bw_mode().
65  */
66 mac_tx_mode_t mac_tx_mode_list[] = {
67 	{SRS_TX_DEFAULT,	mac_tx_single_ring_mode},
68 	{SRS_TX_SERIALIZE,	mac_tx_serializer_mode},
69 	{SRS_TX_FANOUT,		mac_tx_fanout_mode},
70 	{SRS_TX_BW,		mac_tx_bw_mode},
71 	{SRS_TX_BW_FANOUT,	mac_tx_bw_mode}
72 };
73 
74 /*
75  * Soft Ring Set (SRS) - The Run time code that deals with
76  * dynamic polling from the hardware, bandwidth enforcement,
77  * fanout etc.
78  *
79  * We try to use H/W classification on NIC and assign traffic for
80  * a MAC address to a particular Rx ring or ring group. There is a
81  * 1-1 mapping between a SRS and a Rx ring. The SRS dynamically
82  * switches the underlying Rx ring between interrupt and
83  * polling mode and enforces any specified B/W control.
84  *
85  * There is always a SRS created and tied to each H/W and S/W rule.
86  * Whenever we create a H/W rule, we always add the same rule to
87  * S/W classifier and tie a SRS to it.
88  *
89  * In case a B/W control is specified, it is broken into bytes
90  * per ticks and as soon as the quota for a tick is exhausted,
91  * the underlying Rx ring is forced into poll mode for remainder of
92  * the tick. The SRS poll thread only polls for bytes that are
93  * allowed to come in the SRS. We typically let 4x the configured
94  * B/W worth of packets to come in the SRS (to prevent unnecessary
95  * drops due to bursts) but only process the specified amount.
96  *
97  * A MAC client (e.g. a VNIC or aggr) can have 1 or more
98  * Rx rings (and corresponding SRSs) assigned to it. The SRS
99  * in turn can have softrings to do protocol level fanout or
100  * softrings to do S/W based fanout or both. In case the NIC
101  * has no Rx rings, we do S/W classification to respective SRS.
102  * The S/W classification rule is always setup and ready. This
103  * allows the MAC layer to reassign Rx rings whenever needed
104  * but packets still continue to flow via the default path and
105  * getting S/W classified to correct SRS.
106  *
107  * The SRS's are used on both Tx and Rx side. They use the same
108  * data structure but the processing routines have slightly different
109  * semantics due to the fact that Rx side needs to do dynamic
110  * polling etc.
111  *
112  * Dynamic Polling Notes
113  * =====================
114  *
115  * Each Soft ring set is capable of switching its Rx ring between
116  * interrupt and poll mode and actively 'polls' for packets in
117  * poll mode. If the SRS is implementing a B/W limit, it makes
118  * sure that only Max allowed packets are pulled in poll mode
119  * and goes to poll mode as soon as B/W limit is exceeded. As
120  * such, there are no overheads to implement B/W limits.
121  *
122  * In poll mode, it's better to keep the pipeline going where the
123  * SRS worker thread keeps processing packets and poll thread
124  * keeps bringing more packets (especially if they get to run
125  * on different CPUs). This also prevents the overheads associated
126  * with excessive signalling (on NUMA machines, this can be
127  * pretty devastating). The exception is latency optimized case
128  * where worker thread does no work and interrupt and poll thread
129  * are allowed to do their own drain.
130  *
131  * We use the following policy to control Dynamic Polling:
132  * 1) We switch to poll mode anytime the processing
133  *    thread causes a backlog to build up in SRS and
134  *    its associated Soft Rings (sr_poll_pkt_cnt > 0).
135  * 2) As long as the backlog stays under the low water
136  *    mark (sr_lowat), we poll the H/W for more packets.
137  * 3) If the backlog (sr_poll_pkt_cnt) exceeds low
138  *    water mark, we stay in poll mode but don't poll
139  *    the H/W for more packets.
140  * 4) Anytime in polling mode, if we poll the H/W for
141  *    packets and find nothing plus we have an existing
142  *    backlog (sr_poll_pkt_cnt > 0), we stay in polling
143  *    mode but don't poll the H/W for packets anymore
144  *    (let the polling thread go to sleep).
145  * 5) Once the backlog is relieved (packets are processed)
146  *    we reenable polling (by signalling the poll thread)
147  *    only when the backlog dips below sr_poll_thres.
148  * 6) sr_hiwat is used exclusively when we are not
149  *    polling capable and is used to decide when to
150  *    drop packets so the SRS queue length doesn't grow
151  *    infinitely.
152  *
153  * NOTE: Also see the block level comment on top of mac_soft_ring.c
154  */
155 
156 /*
157  * mac_latency_optimize
158  *
159  * Controls whether the poll thread can process the packets inline
160  * or let the SRS worker thread do the processing. This applies if
161  * the SRS was not being processed. For latency sensitive traffic,
162  * this needs to be true to allow inline processing. For throughput
163  * under load, this should be false.
164  *
165  * This (and other similar) tunables should be rolled into a link
166  * or flow specific workload hint that can be set using dladm
167  * linkprop (instead of multiple such tunables).
 *
 * The default is B_TRUE.
168  */
169 boolean_t mac_latency_optimize = B_TRUE;
170 
/*
 * MAC_SRS_ENQUEUE_CHAIN, MAC_RX_SRS_ENQUEUE_CHAIN and
 * MAC_TX_SRS_ENQUEUE_CHAIN
 *
 * Queue a mp or chain (head .. tail, 'count' packets, 'sz' bytes) at the
 * end of the soft ring set queue.
 *
 * MAC_SRS_ENQUEUE_CHAIN is the common part: it links the chain after
 * srs_last (or makes it srs_first when the queue is empty) and adds
 * 'count' to the local count (srs_count). It asserts that srs_lock is
 * held and does not use 'sz'.
 *
 * MAC_RX_SRS_ENQUEUE_CHAIN additionally adds 'count' to the shared
 * counter sr_poll_pkt_cnt (shared between the SRS and its soft rings to
 * track the total unprocessed packets for polling to work correctly).
 *
 * MAC_TX_SRS_ENQUEUE_CHAIN additionally sets SRS_ENQUEUED in srs_state.
 *
 * The size (total bytes queued) counters srs_size and mac_bw_sz are
 * incremented only if we are doing B/W control (SRST_BW_CONTROL). The Rx
 * variant takes mac_bw_lock around the mac_bw_sz update; the Tx variant
 * does not (compare MAC_SRS_BW_LOCK, which is also a no-op for Tx SRSs).
 *
 * Every macro argument is parenthesized on use so that an expression
 * (e.g. '*srsp') can be passed safely.
 */
#define	MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) {		\
	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));			\
	if ((mac_srs)->srs_last != NULL)				\
		(mac_srs)->srs_last->b_next = (head);			\
	else								\
		(mac_srs)->srs_first = (head);				\
	(mac_srs)->srs_last = (tail);					\
	(mac_srs)->srs_count += (count);				\
}

#define	MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) {	\
	mac_srs_rx_t	*srs_rx = &(mac_srs)->srs_rx;			\
									\
	MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz);		\
	srs_rx->sr_poll_pkt_cnt += (count);				\
	ASSERT(srs_rx->sr_poll_pkt_cnt > 0);				\
	if ((mac_srs)->srs_type & SRST_BW_CONTROL) {			\
		(mac_srs)->srs_size += (sz);				\
		mutex_enter(&(mac_srs)->srs_bw->mac_bw_lock);		\
		(mac_srs)->srs_bw->mac_bw_sz += (sz);			\
		mutex_exit(&(mac_srs)->srs_bw->mac_bw_lock);		\
	}								\
}

#define	MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) {	\
	(mac_srs)->srs_state |= SRS_ENQUEUED;				\
	MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz);		\
	if ((mac_srs)->srs_type & SRST_BW_CONTROL) {			\
		(mac_srs)->srs_size += (sz);				\
		(mac_srs)->srs_bw->mac_bw_sz += (sz);			\
	}								\
}
215 
216 /*
217  * Turn polling on routines
 *
 * MAC_SRS_POLLING_ON
 *
 * If the SRS is polling capable (SRS_POLLING_CAPAB) and is not already
 * polling, mark it SRS_POLLING, disable interrupts on its ring through
 * mac_hwring_disable_intr() (the return value is ignored) and bump the
 * sr_poll_on counter. The caller must hold srs_lock.
218  */
219 #define	MAC_SRS_POLLING_ON(mac_srs) {					\
220 	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));			\
221 	if (((mac_srs)->srs_state &					\
222 	    (SRS_POLLING_CAPAB|SRS_POLLING)) == SRS_POLLING_CAPAB) {	\
223 		(mac_srs)->srs_state |= SRS_POLLING;			\
224 		(void) mac_hwring_disable_intr((mac_ring_handle_t)	\
225 		    (mac_srs)->srs_ring);				\
226 		(mac_srs)->srs_rx.sr_poll_on++;				\
227 	}								\
228 }
229 
/*
 * MAC_SRS_WORKER_POLLING_ON
 *
 * Same as MAC_SRS_POLLING_ON, but only acts when SRS_WORKER is also set
 * in srs_state, and accounts the transition in sr_worker_poll_on instead
 * of sr_poll_on.
 */
230 #define	MAC_SRS_WORKER_POLLING_ON(mac_srs) {				\
231 	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));			\
232 	if (((mac_srs)->srs_state &					\
233 	    (SRS_POLLING_CAPAB|SRS_WORKER|SRS_POLLING)) == 		\
234 	    (SRS_POLLING_CAPAB|SRS_WORKER)) {				\
235 		(mac_srs)->srs_state |= SRS_POLLING;			\
236 		(void) mac_hwring_disable_intr((mac_ring_handle_t)	\
237 		    (mac_srs)->srs_ring);				\
238 		(mac_srs)->srs_rx.sr_worker_poll_on++;			\
239 	}								\
240 }
241 
242 /*
243  * MAC_SRS_POLL_RING
244  *
245  * Signal the SRS poll thread to poll the underlying H/W ring
246  * provided it wasn't already polling (SRS_GET_PKTS was set).
247  *
248  * Poll thread gets to run only from mac_rx_srs_drain() and only
249  * if the drain was being done by the worker thread.
 *
 * The signal (on srs_cv) is sent only when the SRS is polling capable,
 * SRS_WORKER is set and SRS_GET_PKTS is clear; SRS_GET_PKTS is then set.
 * sr_poll_thr_sig counts every invocation, sr_poll_thr_busy counts the
 * invocations that did not result in a signal. The caller must hold
 * srs_lock.
250  */
251 #define	MAC_SRS_POLL_RING(mac_srs) {					\
252 	mac_srs_rx_t	*srs_rx = &(mac_srs)->srs_rx;			\
253 									\
254 	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));			\
255 	srs_rx->sr_poll_thr_sig++;					\
256 	if (((mac_srs)->srs_state & 					\
257 	    (SRS_POLLING_CAPAB|SRS_WORKER|SRS_GET_PKTS)) ==		\
258 		(SRS_WORKER|SRS_POLLING_CAPAB)) {			\
259 		(mac_srs)->srs_state |= SRS_GET_PKTS;			\
260 		cv_signal(&(mac_srs)->srs_cv);   			\
261 	} else {							\
262 		srs_rx->sr_poll_thr_busy++;				\
263 	}								\
264 }
265 
266 /*
267  * MAC_SRS_CHECK_BW_CONTROL
268  *
269  * Check to see if next tick has started so we can reset the
270  * SRS_BW_ENFORCED flag and allow more packets to come in the
271  * system.
 *
 * A new tick is detected by comparing mac_bw_curr_time with lbolt; when
 * they differ, mac_bw_curr_time is updated, the per-tick usage
 * (mac_bw_used) is reset to 0 and SRS_BW_ENFORCED is cleared.
 *
 * The caller must hold srs_lock and, unless this is a Tx SRS (SRST_TX),
 * mac_bw_lock as well.
272  */
273 #define	MAC_SRS_CHECK_BW_CONTROL(mac_srs) {				\
274 	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));			\
275 	ASSERT(((mac_srs)->srs_type & SRST_TX) ||			\
276 	    MUTEX_HELD(&(mac_srs)->srs_bw->mac_bw_lock));		\
277 	if ((mac_srs)->srs_bw->mac_bw_curr_time != lbolt) {    		\
278 		(mac_srs)->srs_bw->mac_bw_curr_time = lbolt;   		\
279 		(mac_srs)->srs_bw->mac_bw_used = 0;	       		\
280 		if ((mac_srs)->srs_bw->mac_bw_state & SRS_BW_ENFORCED)	\
281 			(mac_srs)->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED; \
282 	}								\
283 }
284 
285 /*
286  * MAC_SRS_WORKER_WAKEUP
287  *
288  * Wake up the SRS worker thread to process the queue as long as
289  * no one else is processing the queue. If we are optimizing for
290  * latency, we wake up the worker thread immediately or else we
291  * wait mac_srs_worker_wakeup_ticks before worker thread gets
292  * woken up.
 *
 * Nothing is done when SRS_PROC is set or when a wakeup timeout is
 * already pending (srs_tid != NULL). The worker is signalled on
 * srs_async immediately when SRS_LATENCY_OPT is set or when
 * mac_srs_worker_wakeup_ticks is 0 (its default); otherwise a timeout
 * for mac_srs_fire() is armed and its id stored in srs_tid. The caller
 * must hold srs_lock.
293  */
294 int mac_srs_worker_wakeup_ticks = 0;
295 #define	MAC_SRS_WORKER_WAKEUP(mac_srs) {				\
296 	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));			\
297 	if (!((mac_srs)->srs_state & SRS_PROC) &&			\
298 		(mac_srs)->srs_tid == NULL) {				\
299 		if (((mac_srs)->srs_state & SRS_LATENCY_OPT) ||		\
300 			(mac_srs_worker_wakeup_ticks == 0))		\
301 			cv_signal(&(mac_srs)->srs_async);		\
302 		else							\
303 			(mac_srs)->srs_tid =				\
304 				timeout(mac_srs_fire, (mac_srs),	\
305 					mac_srs_worker_wakeup_ticks);	\
306 	}								\
307 }
308 
/*
 * TX_SINGLE_RING_MODE
 *
 * True when the Tx SRS mode (srs_tx.st_mode) is SRS_TX_DEFAULT,
 * SRS_TX_SERIALIZE or SRS_TX_BW, i.e. every mode in mac_tx_mode_list[]
 * other than the two fanout modes.
 */
309 #define	TX_SINGLE_RING_MODE(mac_srs)				\
310 	((mac_srs)->srs_tx.st_mode == SRS_TX_DEFAULT || 	\
311 	    (mac_srs)->srs_tx.st_mode == SRS_TX_SERIALIZE ||	\
312 	    (mac_srs)->srs_tx.st_mode == SRS_TX_BW)
313 
/*
 * TX_BANDWIDTH_MODE
 *
 * True when the Tx SRS mode is one of the two bandwidth modes,
 * SRS_TX_BW or SRS_TX_BW_FANOUT.
 */
314 #define	TX_BANDWIDTH_MODE(mac_srs)				\
315 	((mac_srs)->srs_tx.st_mode == SRS_TX_BW ||		\
316 	    (mac_srs)->srs_tx.st_mode == SRS_TX_BW_FANOUT)
317 
/*
 * TX_SRS_TO_SOFT_RING
 *
 * Pick one of the SRS's 'oth' soft rings by hashing the fanout hint and
 * hand the chain 'head' to it through mac_tx_soft_ring_process(). The
 * return value of mac_tx_soft_ring_process() is ignored.
 *
 * NOTE: the selected ring is stored in a variable named 'softring' that
 * the caller must have in scope; it is not a macro parameter.
 *
 * All parameters are parenthesized on use so expressions can be passed.
 */
#define	TX_SRS_TO_SOFT_RING(mac_srs, head, hint) {			\
	uint_t hash, indx;						\
	hash = HASH_HINT(hint);						\
	indx = COMPUTE_INDEX(hash, (mac_srs)->srs_oth_ring_count);	\
	softring = (mac_srs)->srs_oth_soft_rings[indx];			\
	(void) (mac_tx_soft_ring_process(softring, (head), 0, NULL));	\
}
325 
326 /*
327  * MAC_TX_SRS_BLOCK
328  *
329  * Always called from mac_tx_srs_drain() function. SRS_TX_BLOCKED
330  * will be set only if srs_tx.st_woken_up is FALSE. If
331  * st_woken_up is TRUE, it indicates that the wakeup arrived
332  * before we grabbed srs_lock to set SRS_TX_BLOCKED. We need to
333  * attempt to transmit again and not setting SRS_TX_BLOCKED does
334  * that.
 *
 * A pending st_woken_up indication is consumed (reset to B_FALSE).
 * When the SRS does get blocked, st_blocked_cnt is incremented. The
 * caller must hold srs_lock. The 'mp' argument is not used.
335  */
336 #define	MAC_TX_SRS_BLOCK(srs, mp)	{			\
337 	ASSERT(MUTEX_HELD(&(srs)->srs_lock));			\
338 	if ((srs)->srs_tx.st_woken_up) {			\
339 		(srs)->srs_tx.st_woken_up = B_FALSE;		\
340 	} else {						\
341 		ASSERT(!((srs)->srs_state & SRS_TX_BLOCKED));	\
342 		(srs)->srs_state |= SRS_TX_BLOCKED;		\
343 		(srs)->srs_tx.st_blocked_cnt++;			\
344 	}							\
345 }
346 
/*
 * MAC_TX_SRS_TEST_HIWAT
 *
 * Called before queueing a packet onto Tx SRS to test and set
 * SRS_TX_HIWAT if srs_count exceeds srs_tx.st_hiwat.
 *
 * srs		Tx soft ring set
 * mp, tail	first and last packet of the chain to queue
 * cnt, sz	number of packets and bytes in the chain
 * cookie	lvalue; set to the SRS when the SRS is flow-controlled,
 *		left untouched otherwise
 *
 * When srs_count also exceeds st_max_q_cnt the chain is dropped (and
 * accounted in st_drop_count) instead of being queued; in every other
 * case the chain is queued with MAC_TX_SRS_ENQUEUE_CHAIN.
 *
 * The chain that is dropped is the 'mp' argument; the macro no longer
 * depends on the caller having a local variable called mp_chain.
 */
#define	MAC_TX_SRS_TEST_HIWAT(srs, mp, tail, cnt, sz, cookie) {		\
	boolean_t enqueue = B_TRUE;					\
									\
	if ((srs)->srs_count > (srs)->srs_tx.st_hiwat) {		\
		/*							\
		 * flow-controlled. Store srs in cookie so that it	\
		 * can be returned as mac_tx_cookie_t to client		\
		 */							\
		(srs)->srs_state |= SRS_TX_HIWAT;			\
		cookie = (mac_tx_cookie_t)(srs);			\
		(srs)->srs_tx.st_hiwat_cnt++;				\
		if ((srs)->srs_count > (srs)->srs_tx.st_max_q_cnt) {	\
			/* increment freed stats */			\
			(srs)->srs_tx.st_drop_count += (cnt);		\
			/*						\
			 * b_prev may be set to the fanout hint		\
			 * hence can't use freemsg directly		\
			 */						\
			mac_pkt_drop(NULL, NULL, (mp), B_FALSE);	\
			DTRACE_PROBE1(tx_queued_hiwat,			\
			    mac_soft_ring_set_t *, srs);		\
			enqueue = B_FALSE;				\
		}							\
	}								\
	if (enqueue)							\
		MAC_TX_SRS_ENQUEUE_CHAIN(srs, mp, tail, cnt, sz);	\
}
380 
/*
 * Some utility macros
 *
 * MAC_SRS_BW_LOCK / MAC_SRS_BW_UNLOCK
 *
 * Enter/exit the SRS's mac_bw_lock, but only for SRSs that are not Tx
 * SRSs (SRST_TX clear); for a Tx SRS both macros do nothing.
 *
 * The bodies are brace blocks (like the other multi-statement macros in
 * this file) so that the internal 'if' cannot capture an 'else' that
 * follows the macro invocation, and 'srs' is parenthesized on use.
 */
#define	MAC_SRS_BW_LOCK(srs) {						\
	if (!((srs)->srs_type & SRST_TX))				\
		mutex_enter(&(srs)->srs_bw->mac_bw_lock);		\
}

#define	MAC_SRS_BW_UNLOCK(srs) {					\
	if (!((srs)->srs_type & SRST_TX))				\
		mutex_exit(&(srs)->srs_bw->mac_bw_lock);		\
}
389 
/*
 * MAC_TX_SRS_DROP_MESSAGE
 *
 * Drop 'mp' through mac_pkt_drop(), account one drop in the SRS's
 * st_drop_count and store the SRS in 'cookie' (an lvalue).
 *
 * The counter is updated through the 'srs' argument; the macro does not
 * require a variable named mac_srs in the caller's scope.
 */
#define	MAC_TX_SRS_DROP_MESSAGE(srs, mp, cookie) {		\
	mac_pkt_drop(NULL, NULL, (mp), B_FALSE);		\
	/* increment freed stats */				\
	(srs)->srs_tx.st_drop_count++;				\
	cookie = (mac_tx_cookie_t)(srs);			\
}

/*
 * MAC_TX_SET_NO_ENQUEUE
 *
 * Do not queue 'mp_chain': set SRS_TX_WAKEUP_CLIENT on the SRS, store
 * the SRS in 'cookie' (an lvalue) and hand the chain back to the caller
 * through '*ret_mp'.
 */
#define	MAC_TX_SET_NO_ENQUEUE(srs, mp_chain, ret_mp, cookie) {		\
	(srs)->srs_state |= SRS_TX_WAKEUP_CLIENT;			\
	cookie = (mac_tx_cookie_t)(srs);				\
	*(ret_mp) = (mp_chain);						\
}
402 
403 /*
404  * Drop the rx packet and advance to the next one in the chain.
405  */
406 static void
407 mac_rx_drop_pkt(mac_soft_ring_set_t *srs, mblk_t *mp)
408 {
409 	mac_srs_rx_t	*srs_rx = &srs->srs_rx;
410 
411 	ASSERT(mp->b_next == NULL);
412 	mutex_enter(&srs->srs_lock);
413 	MAC_UPDATE_SRS_COUNT_LOCKED(srs, 1);
414 	MAC_UPDATE_SRS_SIZE_LOCKED(srs, msgdsize(mp));
415 	mutex_exit(&srs->srs_lock);
416 
417 	srs_rx->sr_drop_count++;
418 	freemsg(mp);
419 }
420 
421 /* DATAPATH RUNTIME ROUTINES */
422 
423 /*
424  * mac_srs_fire
425  *
426  * Timer callback routine for waking up the SRS worker thread.
427  */
428 static void
429 mac_srs_fire(void *arg)
430 {
431 	mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)arg;
432 
433 	mutex_enter(&mac_srs->srs_lock);
434 	if (mac_srs->srs_tid == 0) {
435 		mutex_exit(&mac_srs->srs_lock);
436 		return;
437 	}
438 
439 	mac_srs->srs_tid = 0;
440 	if (!(mac_srs->srs_state & SRS_PROC))
441 		cv_signal(&mac_srs->srs_async);
442 
443 	mutex_exit(&mac_srs->srs_lock);
444 }
445 
/*
 * 'hint' is fanout_hint (type of uint64_t) which is given by the TCP/IP stack,
 * and it is used on the TX path.
 */
#define	HASH_HINT(hint)	(((hint) << 17) | ((hint) >> 16))

/*
 * hash based on the src address and the port information.
 * 'src' is converted with ntohl(); 'ports' is folded in byte by byte.
 */
#define	HASH_ADDR(src, ports)					\
	(ntohl((src)) ^ ((ports) >> 24) ^ ((ports) >> 16) ^	\
	((ports) >> 8) ^ (ports))

/*
 * Map a hash key onto one of 'sz' slots. Both arguments are parenthesized
 * so that expressions such as COMPUTE_INDEX(a + b, n) bind as intended.
 * 'sz' must not be zero.
 */
#define	COMPUTE_INDEX(key, sz)	((key) % (sz))
460 
/*
 * FANOUT_ENQUEUE_MP
 *
 * Append the single packet 'mp' to the sub-chain described by 'head' and
 * 'tail' (both lvalues), increment the packet count 'cnt' and, when
 * 'bw_ctl' is true, add the packet size 'sz0' to the byte count 'sz'.
 */
#define	FANOUT_ENQUEUE_MP(head, tail, cnt, bw_ctl, sz, sz0, mp) {	\
	if ((tail) == NULL) {						\
		/* empty sub-chain: mp becomes its head */		\
		ASSERT((head) == NULL);					\
		(head) = (mp);						\
	} else {							\
		/* link mp behind the current last packet */		\
		ASSERT((tail)->b_next == NULL);				\
		(tail)->b_next = (mp);					\
	}								\
	(tail) = (mp);							\
	(cnt)++;							\
	if ((bw_ctl))							\
		(sz) += (sz0);						\
}
474 
/*
 * mac_fanout_type selects how mac_rx_srs_long_fanout() picks a soft ring
 * for UDP/SCTP/ESP packets: MAC_FANOUT_DEFAULT hashes the source address
 * and ports, any other value distributes the packets round robin.
 */
475 #define	MAC_FANOUT_DEFAULT	0
476 #define	MAC_FANOUT_RND_ROBIN	1
477 int mac_fanout_type = MAC_FANOUT_DEFAULT;
478 
/*
 * Number of soft ring types; used to size the per-type arrays in the
 * fanout routines. It matches the number of pkt_type values that precede
 * UNDEF.
 */
479 #define	MAX_SR_TYPES	3
480 /* fanout types for port based hashing */
481 enum pkt_type {
482 	V4_TCP = 0,
483 	V4_UDP,
484 	OTH,
	/* not a soft ring type: transient marker and loop bound */
485 	UNDEF
486 };
487 
488 /*
489  * In general we do port based hashing to spread traffic over different
490  * softrings. The below tunable allows to override that behavior. Setting it
491  * to B_TRUE allows to do a fanout based on src ipv6 address. This behavior
492  * is also the applicable to ipv6 packets carrying multiple optional headers
493  * and other uncommon packet types.
494  */
495 boolean_t mac_src_ipv6_fanout = B_FALSE;
496 
497 /*
498  * Pair of local and remote ports in the transport header
499  */
500 #define	PORTS_SIZE 4
501 
502 /*
503  * mac_rx_srs_proto_fanout
504  *
505  * This routine delivers packets destined to an SRS into one of the
506  * protocol soft rings.
507  *
508  * Given a chain of packets we need to split it up into multiple sub chains
509  * destined into TCP, UDP or OTH soft ring. Instead of entering
510  * the soft ring one packet at a time, we want to enter it in the form of a
511  * chain otherwise we get this start/stop behaviour where the worker thread
512  * goes to sleep and then next packets comes in forcing it to wake up etc.
 *
 * mac_srs	the soft ring set the packets were queued on
 * head		chain of packets linked through b_next; the chain is
 *		consumed (every packet is either dropped or handed to a
 *		soft ring)
 *
 * Only the first soft ring of each type (index 0) is used here.
513  */
514 static void
515 mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
516 {
517 	struct ether_header		*ehp;
518 	struct ether_vlan_header	*evhp;
519 	uint32_t			sap;
520 	ipha_t				*ipha;
521 	uint8_t				*dstaddr;
522 	size_t				hdrsize;
523 	mblk_t				*mp;
524 	mblk_t				*headmp[MAX_SR_TYPES];
525 	mblk_t				*tailmp[MAX_SR_TYPES];
526 	int				cnt[MAX_SR_TYPES];
527 	size_t				sz[MAX_SR_TYPES];
528 	size_t				sz1;
529 	boolean_t			bw_ctl;
530 	boolean_t			hw_classified;
531 	boolean_t			dls_bypass;
532 	boolean_t			is_ether;
533 	boolean_t			is_unicast;
534 	enum pkt_type			type;
535 	mac_client_impl_t		*mcip = mac_srs->srs_mcip;
536 
537 	is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER);
538 	bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0);
539 
540 	/*
541 	 * If we don't have a Rx ring, S/W classification would have done
542 	 * its job and its a packet meant for us. If we were polling on
543 	 * the default ring (i.e. there was a ring assigned to this SRS),
544 	 * then we need to make sure that the mac address really belongs
545 	 * to us.
546 	 */
547 	hw_classified = mac_srs->srs_ring != NULL &&
548 	    mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER;
549 
550 	/*
551 	 * Special clients (eg. VLAN, non ether, etc) need DLS
552 	 * processing in the Rx path. SRST_DLS_BYPASS will be clear for
553 	 * such SRSs. Another way of disabling bypass is to set the
554 	 * MCIS_RX_BYPASS_DISABLE flag.
555 	 */
556 	dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) &&
557 	    ((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0);
558 
	/* Start with empty sub-chains and zeroed counters for every type. */
559 	bzero(headmp, MAX_SR_TYPES * sizeof (mblk_t *));
560 	bzero(tailmp, MAX_SR_TYPES * sizeof (mblk_t *));
561 	bzero(cnt, MAX_SR_TYPES * sizeof (int));
562 	bzero(sz, MAX_SR_TYPES * sizeof (size_t));
563 
564 	/*
565 	 * We got a chain from SRS that we need to send to the soft rings.
566 	 * Since squeues for TCP & IPv4 sap poll their soft rings (for
567 	 * performance reasons), we need to separate out v4_tcp, v4_udp
568 	 * and the rest goes in other.
569 	 */
570 	while (head != NULL) {
		/* Detach the first packet from the chain. */
571 		mp = head;
572 		head = head->b_next;
573 		mp->b_next = NULL;
574 
575 		type = OTH;
		/* sz1 is the size of the whole message, not of the first mblk */
576 		sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);
577 
578 		if (is_ether) {
579 			/*
580 			 * At this point we can be sure the packet at least
581 			 * has an ether header.
582 			 */
583 			if (sz1 < sizeof (struct ether_header)) {
584 				mac_rx_drop_pkt(mac_srs, mp);
585 				continue;
586 			}
587 			ehp = (struct ether_header *)mp->b_rptr;
588 
589 			/*
590 			 * Determine if this is a VLAN or non-VLAN packet.
			 *
			 * NOTE(review): only sizeof (struct ether_header)
			 * bytes have been checked above, yet the VLAN
			 * header fields are read below - confirm that a
			 * shorter first mblk cannot reach this point.
591 			 */
592 			if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) {
593 				evhp = (struct ether_vlan_header *)mp->b_rptr;
594 				sap = ntohs(evhp->ether_type);
595 				hdrsize = sizeof (struct ether_vlan_header);
596 				/*
597 				 * Check if the VID of the packet, if any,
598 				 * belongs to this client.
599 				 */
600 				if (!mac_client_check_flow_vid(mcip,
601 				    VLAN_ID(ntohs(evhp->ether_tci)))) {
602 					mac_rx_drop_pkt(mac_srs, mp);
603 					continue;
604 				}
605 			} else {
606 				hdrsize = sizeof (struct ether_header);
607 			}
			/* unicast: low bit of the first destination byte clear */
608 			is_unicast =
609 			    ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0);
610 			dstaddr = (uint8_t *)&ehp->ether_dhost;
611 		} else {
612 			mac_header_info_t		mhi;
613 
			/* Non-ethernet media: let the MAC plugin parse the header. */
614 			if (mac_header_info((mac_handle_t)mcip->mci_mip,
615 			    mp, &mhi) != 0) {
616 				mac_rx_drop_pkt(mac_srs, mp);
617 				continue;
618 			}
619 			hdrsize = mhi.mhi_hdrsize;
620 			sap = mhi.mhi_bindsap;
621 			is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST);
622 			dstaddr = (uint8_t *)mhi.mhi_daddr;
623 		}
624 
		/* Without DLS bypass everything goes to the OTH soft ring. */
625 		if (!dls_bypass) {
626 			FANOUT_ENQUEUE_MP(headmp[type], tailmp[type],
627 			    cnt[type], bw_ctl, sz[type], sz1, mp);
628 			continue;
629 		}
630 
		/*
		 * type is switched from OTH to UNDEF for IPv4 packets that
		 * may take the TCP/UDP path; the final type is decided
		 * from the IP protocol further down.
		 */
631 		if (sap == ETHERTYPE_IP) {
632 			/*
633 			 * If we are H/W classified, but we have promisc
634 			 * on, then we need to check for the unicast address.
635 			 */
636 			if (hw_classified && mcip->mci_promisc_list != NULL) {
637 				mac_address_t		*map;
638 
639 				rw_enter(&mcip->mci_rw_lock, RW_READER);
640 				map = mcip->mci_unicast;
641 				if (bcmp(dstaddr, map->ma_addr,
642 				    map->ma_len) == 0)
643 					type = UNDEF;
644 				rw_exit(&mcip->mci_rw_lock);
645 			} else if (is_unicast) {
646 				type = UNDEF;
647 			}
648 		}
649 
650 		/*
651 		 * This needs to become a contract with the driver for
652 		 * the fast path.
653 		 *
654 		 * In the normal case the packet will have at least the L2
655 		 * header and the IP + Transport header in the same mblk.
656 		 * This is usually the case when the NIC driver sends up
657 		 * the packet. This is also true when the stack generates
658 		 * a packet that is looped back and when the stack uses the
659 		 * fastpath mechanism. The normal case is optimized for
660 		 * performance and may bypass DLS. All other cases go through
661 		 * the 'OTH' type path without DLS bypass.
662 		 */
663 
664 		ipha = (ipha_t *)(mp->b_rptr + hdrsize);
665 		if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha))
666 			type = OTH;
667 
668 		if (type == OTH) {
669 			FANOUT_ENQUEUE_MP(headmp[type], tailmp[type],
670 			    cnt[type], bw_ctl, sz[type], sz1, mp);
671 			continue;
672 		}
673 
674 		ASSERT(type == UNDEF);
675 		/*
676 		 * We look for at least 4 bytes past the IP header to get
677 		 * the port information. If we get an IP fragment, we don't
678 		 * have the port information, and we use just the protocol
679 		 * information.
		 *
		 * For TCP and UDP b_rptr is advanced past the L2 header
		 * before the packet is queued; other protocols keep the
		 * L2 header and go to the OTH soft ring.
680 		 */
681 		switch (ipha->ipha_protocol) {
682 		case IPPROTO_TCP:
683 			type = V4_TCP;
684 			mp->b_rptr += hdrsize;
685 			break;
686 		case IPPROTO_UDP:
687 			type = V4_UDP;
688 			mp->b_rptr += hdrsize;
689 			break;
690 		default:
691 			type = OTH;
692 			break;
693 		}
694 
695 		FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], cnt[type],
696 		    bw_ctl, sz[type], sz1, mp);
697 	}
698 
	/* Hand every non-empty sub-chain to the first soft ring of its type. */
699 	for (type = V4_TCP; type < UNDEF; type++) {
700 		if (headmp[type] != NULL) {
701 			mac_soft_ring_t			*softring;
702 
703 			ASSERT(tailmp[type]->b_next == NULL);
			/*
			 * NOTE(review): no default case; softring is always
			 * assigned only because the loop stops before UNDEF.
			 */
704 			switch (type) {
705 			case V4_TCP:
706 				softring = mac_srs->srs_tcp_soft_rings[0];
707 				break;
708 			case V4_UDP:
709 				softring = mac_srs->srs_udp_soft_rings[0];
710 				break;
711 			case OTH:
712 				softring = mac_srs->srs_oth_soft_rings[0];
713 			}
714 			mac_rx_soft_ring_process(mcip, softring,
715 			    headmp[type], tailmp[type], cnt[type], sz[type]);
716 		}
717 	}
718 }
719 
720 int	fanout_unalligned = 0;
721 
722 /*
723  * mac_rx_srs_long_fanout
724  *
725  * The fanout routine for IPv6
726  */
727 static int
728 mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp,
729     uint32_t sap, size_t hdrsize, enum pkt_type *type, uint_t *indx)
730 {
731 	ip6_t		*ip6h;
732 	uint8_t		*whereptr;
733 	uint_t		hash;
734 	uint16_t	remlen;
735 	uint8_t		nexthdr;
736 	uint16_t	hdr_len;
737 
738 	if (sap == ETHERTYPE_IPV6) {
739 		boolean_t	modifiable = B_TRUE;
740 
741 		ASSERT(MBLKL(mp) >= hdrsize);
742 
743 		ip6h = (ip6_t *)(mp->b_rptr + hdrsize);
744 		if ((unsigned char *)ip6h == mp->b_wptr) {
745 			/*
746 			 * The first mblk_t only includes the mac header.
747 			 * Note that it is safe to change the mp pointer here,
748 			 * as the subsequent operation does not assume mp
749 			 * points to the start of the mac header.
750 			 */
751 			mp = mp->b_cont;
752 
753 			/*
754 			 * Make sure ip6h holds the full ip6_t structure.
755 			 */
756 			if (mp == NULL)
757 				return (-1);
758 
759 			if (MBLKL(mp) < IPV6_HDR_LEN) {
760 				modifiable = (DB_REF(mp) == 1);
761 
762 				if (modifiable &&
763 				    !pullupmsg(mp, IPV6_HDR_LEN)) {
764 					return (-1);
765 				}
766 			}
767 
768 			ip6h = (ip6_t *)mp->b_rptr;
769 		}
770 
771 		if (!modifiable || !(OK_32PTR((char *)ip6h)) ||
772 		    ((unsigned char *)ip6h + IPV6_HDR_LEN > mp->b_wptr)) {
773 			/*
774 			 * If either ip6h is not alligned, or ip6h does not
775 			 * hold the complete ip6_t structure (a pullupmsg()
776 			 * is not an option since it would result in an
777 			 * unalligned ip6h), fanout to the default ring. Note
778 			 * that this may cause packets reordering.
779 			 */
780 			*indx = 0;
781 			*type = OTH;
782 			fanout_unalligned++;
783 			return (0);
784 		}
785 
786 		remlen = ntohs(ip6h->ip6_plen);
787 		nexthdr = ip6h->ip6_nxt;
788 
789 		if (remlen < MIN_EHDR_LEN)
790 			return (-1);
791 		/*
792 		 * Do src based fanout if below tunable is set to B_TRUE or
793 		 * when mac_ip_hdr_length_v6() fails because of malformed
794 		 * packets or because mblk's need to be concatenated using
795 		 * pullupmsg().
796 		 */
797 		if (mac_src_ipv6_fanout || !mac_ip_hdr_length_v6(mp, ip6h,
798 		    &hdr_len, &nexthdr)) {
799 			goto src_based_fanout;
800 		}
801 		whereptr = (uint8_t *)ip6h + hdr_len;
802 
803 		/* If the transport is one of below, we do port based fanout */
804 		switch (nexthdr) {
805 		case IPPROTO_TCP:
806 		case IPPROTO_UDP:
807 		case IPPROTO_SCTP:
808 		case IPPROTO_ESP:
809 			/*
810 			 * If the ports in the transport header is not part of
811 			 * the mblk, do src_based_fanout, instead of calling
812 			 * pullupmsg().
813 			 */
814 			if (mp->b_cont != NULL &&
815 			    whereptr + PORTS_SIZE > mp->b_wptr) {
816 				goto src_based_fanout;
817 			}
818 			break;
819 		default:
820 			break;
821 		}
822 
823 		switch (nexthdr) {
824 		case IPPROTO_TCP:
825 			hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src),
826 			    *(uint32_t *)whereptr);
827 			*indx = COMPUTE_INDEX(hash,
828 			    mac_srs->srs_tcp_ring_count);
829 			*type = OTH;
830 			break;
831 
832 		case IPPROTO_UDP:
833 		case IPPROTO_SCTP:
834 		case IPPROTO_ESP:
835 			if (mac_fanout_type == MAC_FANOUT_DEFAULT) {
836 				hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src),
837 				    *(uint32_t *)whereptr);
838 				*indx = COMPUTE_INDEX(hash,
839 				    mac_srs->srs_udp_ring_count);
840 			} else {
841 				*indx = mac_srs->srs_ind %
842 				    mac_srs->srs_udp_ring_count;
843 				mac_srs->srs_ind++;
844 			}
845 			*type = OTH;
846 			break;
847 
848 			/* For all other protocol, do source based fanout */
849 		default:
850 			goto src_based_fanout;
851 		}
852 	} else {
853 		*indx = 0;
854 		*type = OTH;
855 	}
856 	return (0);
857 
858 src_based_fanout:
859 	hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), (uint32_t)0);
860 	*indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);
861 	*type = OTH;
862 	return (0);
863 }
864 
865 /*
866  * mac_rx_srs_fanout
867  *
868  * This routine delivers packets destined to an SRS into a soft ring member
869  * of the set.
870  *
871  * Given a chain of packets we need to split it up into multiple sub chains
872  * destined for one of the TCP, UDP or OTH soft rings. Instead of entering
873  * the soft ring one packet at a time, we want to enter it in the form of a
874  * chain otherwise we get this start/stop behaviour where the worker thread
875  * goes to sleep and then next packets comes in forcing it to wake up etc.
876  *
877  * Note:
878  * Since we know what is the maximum fanout possible, we create a 2D array
879  * of 'softring types * MAX_SR_FANOUT' for the head, tail, cnt and sz
880  * variables so that we can enter the softrings with chain. We need the
881  * MAX_SR_FANOUT so we can allocate the arrays on the stack (a kmem_alloc
882  * for each packet would be expensive). If we ever want to have the
883  * ability to have unlimited fanout, we should probably declare a head,
884  * tail, cnt, sz with each soft ring (a data struct which contains a softring
885  * along with these members) and create an array of this uber struct so we
886  * don't have to do kmem_alloc.
887  */
888 int	fanout_oth1 = 0;
889 int	fanout_oth2 = 0;
890 int	fanout_oth3 = 0;
891 int	fanout_oth4 = 0;
892 int	fanout_oth5 = 0;
893 
894 static void
895 mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
896 {
897 	struct ether_header		*ehp;
898 	struct ether_vlan_header	*evhp;
899 	uint32_t			sap;
900 	ipha_t				*ipha;
901 	uint8_t				*dstaddr;
902 	uint_t				indx;
903 	size_t				ports_offset;
904 	size_t				ipha_len;
905 	size_t				hdrsize;
906 	uint_t				hash;
907 	mblk_t				*mp;
908 	mblk_t				*headmp[MAX_SR_TYPES][MAX_SR_FANOUT];
909 	mblk_t				*tailmp[MAX_SR_TYPES][MAX_SR_FANOUT];
910 	int				cnt[MAX_SR_TYPES][MAX_SR_FANOUT];
911 	size_t				sz[MAX_SR_TYPES][MAX_SR_FANOUT];
912 	size_t				sz1;
913 	boolean_t			bw_ctl;
914 	boolean_t			hw_classified;
915 	boolean_t			dls_bypass;
916 	boolean_t			is_ether;
917 	boolean_t			is_unicast;
918 	int				fanout_cnt;
919 	enum pkt_type			type;
920 	mac_client_impl_t		*mcip = mac_srs->srs_mcip;
921 
922 	is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER);
923 	bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0);
924 
925 	/*
926 	 * If we don't have a Rx ring, S/W classification would have done
927 	 * its job and its a packet meant for us. If we were polling on
928 	 * the default ring (i.e. there was a ring assigned to this SRS),
929 	 * then we need to make sure that the mac address really belongs
930 	 * to us.
931 	 */
932 	hw_classified = mac_srs->srs_ring != NULL &&
933 	    mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER;
934 
935 	/*
936 	 * Special clients (eg. VLAN, non ether, etc) need DLS
937 	 * processing in the Rx path. SRST_DLS_BYPASS will be clear for
938 	 * such SRSs. Another way of disabling bypass is to set the
939 	 * MCIS_RX_BYPASS_DISABLE flag.
940 	 */
941 	dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) &&
942 	    ((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0);
943 
944 	/*
945 	 * Since the softrings are never destroyed and we always
946 	 * create equal number of softrings for TCP, UDP and rest,
947 	 * its OK to check one of them for count and use it without
948 	 * any lock. In future, if soft rings get destroyed because
949 	 * of reduction in fanout, we will need to ensure that happens
950 	 * behind the SRS_PROC.
951 	 */
952 	fanout_cnt = mac_srs->srs_tcp_ring_count;
953 
954 	bzero(headmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *));
955 	bzero(tailmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *));
956 	bzero(cnt, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (int));
957 	bzero(sz, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (size_t));
958 
959 	/*
960 	 * We got a chain from SRS that we need to send to the soft rings.
961 	 * Since squeues for TCP & IPv4 sap poll their soft rings (for
962 	 * performance reasons), we need to separate out v4_tcp, v4_udp
963 	 * and the rest goes in other.
964 	 */
965 	while (head != NULL) {
966 		mp = head;
967 		head = head->b_next;
968 		mp->b_next = NULL;
969 
970 		type = OTH;
971 		sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);
972 
973 		if (is_ether) {
974 			/*
975 			 * At this point we can be sure the packet at least
976 			 * has an ether header.
977 			 */
978 			if (sz1 < sizeof (struct ether_header)) {
979 				mac_rx_drop_pkt(mac_srs, mp);
980 				continue;
981 			}
982 			ehp = (struct ether_header *)mp->b_rptr;
983 
984 			/*
985 			 * Determine if this is a VLAN or non-VLAN packet.
986 			 */
987 			if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) {
988 				evhp = (struct ether_vlan_header *)mp->b_rptr;
989 				sap = ntohs(evhp->ether_type);
990 				hdrsize = sizeof (struct ether_vlan_header);
991 				/*
992 				 * Check if the VID of the packet, if any,
993 				 * belongs to this client.
994 				 */
995 				if (!mac_client_check_flow_vid(mcip,
996 				    VLAN_ID(ntohs(evhp->ether_tci)))) {
997 					mac_rx_drop_pkt(mac_srs, mp);
998 					continue;
999 				}
1000 			} else {
1001 				hdrsize = sizeof (struct ether_header);
1002 			}
1003 			is_unicast =
1004 			    ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0);
1005 			dstaddr = (uint8_t *)&ehp->ether_dhost;
1006 		} else {
1007 			mac_header_info_t		mhi;
1008 
1009 			if (mac_header_info((mac_handle_t)mcip->mci_mip,
1010 			    mp, &mhi) != 0) {
1011 				mac_rx_drop_pkt(mac_srs, mp);
1012 				continue;
1013 			}
1014 			hdrsize = mhi.mhi_hdrsize;
1015 			sap = mhi.mhi_bindsap;
1016 			is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST);
1017 			dstaddr = (uint8_t *)mhi.mhi_daddr;
1018 		}
1019 
1020 		if (!dls_bypass) {
1021 			if (mac_rx_srs_long_fanout(mac_srs, mp, sap,
1022 			    hdrsize, &type, &indx) == -1) {
1023 				mac_rx_drop_pkt(mac_srs, mp);
1024 				continue;
1025 			}
1026 
1027 			FANOUT_ENQUEUE_MP(headmp[type][indx],
1028 			    tailmp[type][indx], cnt[type][indx], bw_ctl,
1029 			    sz[type][indx], sz1, mp);
1030 			continue;
1031 		}
1032 
1033 
1034 		/*
1035 		 * If we are using the default Rx ring where H/W or S/W
1036 		 * classification has not happened, we need to verify if
1037 		 * this unicast packet really belongs to us.
1038 		 */
1039 		if (sap == ETHERTYPE_IP) {
1040 			/*
1041 			 * If we are H/W classified, but we have promisc
1042 			 * on, then we need to check for the unicast address.
1043 			 */
1044 			if (hw_classified && mcip->mci_promisc_list != NULL) {
1045 				mac_address_t		*map;
1046 
1047 				rw_enter(&mcip->mci_rw_lock, RW_READER);
1048 				map = mcip->mci_unicast;
1049 				if (bcmp(dstaddr, map->ma_addr,
1050 				    map->ma_len) == 0)
1051 					type = UNDEF;
1052 				rw_exit(&mcip->mci_rw_lock);
1053 			} else if (is_unicast) {
1054 				type = UNDEF;
1055 			}
1056 		}
1057 
1058 		/*
1059 		 * This needs to become a contract with the driver for
1060 		 * the fast path.
1061 		 */
1062 
1063 		ipha = (ipha_t *)(mp->b_rptr + hdrsize);
1064 		if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha)) {
1065 			type = OTH;
1066 			fanout_oth1++;
1067 		}
1068 
1069 		if (type != OTH) {
1070 			uint16_t	frag_offset_flags;
1071 
1072 			switch (ipha->ipha_protocol) {
1073 			case IPPROTO_TCP:
1074 			case IPPROTO_UDP:
1075 			case IPPROTO_SCTP:
1076 			case IPPROTO_ESP:
1077 				ipha_len = IPH_HDR_LENGTH(ipha);
1078 				if ((uchar_t *)ipha + ipha_len + PORTS_SIZE >
1079 				    mp->b_wptr) {
1080 					type = OTH;
1081 					break;
1082 				}
1083 				frag_offset_flags =
1084 				    ntohs(ipha->ipha_fragment_offset_and_flags);
1085 				if ((frag_offset_flags &
1086 				    (IPH_MF | IPH_OFFSET)) != 0) {
1087 					type = OTH;
1088 					fanout_oth3++;
1089 					break;
1090 				}
1091 				ports_offset = hdrsize + ipha_len;
1092 				break;
1093 			default:
1094 				type = OTH;
1095 				fanout_oth4++;
1096 				break;
1097 			}
1098 		}
1099 
1100 		if (type == OTH) {
1101 			if (mac_rx_srs_long_fanout(mac_srs, mp, sap,
1102 			    hdrsize, &type, &indx) == -1) {
1103 				mac_rx_drop_pkt(mac_srs, mp);
1104 				continue;
1105 			}
1106 
1107 			FANOUT_ENQUEUE_MP(headmp[type][indx],
1108 			    tailmp[type][indx], cnt[type][indx], bw_ctl,
1109 			    sz[type][indx], sz1, mp);
1110 			continue;
1111 		}
1112 
1113 		ASSERT(type == UNDEF);
1114 
1115 		/*
1116 		 * XXX-Sunay: We should hold srs_lock since ring_count
1117 		 * below can change. But if we are always called from
1118 		 * mac_rx_srs_drain and SRS_PROC is set, then we can
1119 		 * enforce that ring_count can't be changed i.e.
1120 		 * to change fanout type or ring count, the calling
1121 		 * thread needs to be behind SRS_PROC.
1122 		 */
1123 		switch (ipha->ipha_protocol) {
1124 		case IPPROTO_TCP:
1125 			/*
1126 			 * Note that for ESP, we fanout on SPI and it is at the
1127 			 * same offset as the 2x16-bit ports. So it is clumped
1128 			 * along with TCP, UDP and SCTP.
1129 			 */
1130 			hash = HASH_ADDR(ipha->ipha_src,
1131 			    *(uint32_t *)(mp->b_rptr + ports_offset));
1132 			indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count);
1133 			type = V4_TCP;
1134 			mp->b_rptr += hdrsize;
1135 			break;
1136 		case IPPROTO_UDP:
1137 		case IPPROTO_SCTP:
1138 		case IPPROTO_ESP:
1139 			if (mac_fanout_type == MAC_FANOUT_DEFAULT) {
1140 				hash = HASH_ADDR(ipha->ipha_src,
1141 				    *(uint32_t *)(mp->b_rptr + ports_offset));
1142 				indx = COMPUTE_INDEX(hash,
1143 				    mac_srs->srs_udp_ring_count);
1144 			} else {
1145 				indx = mac_srs->srs_ind %
1146 				    mac_srs->srs_udp_ring_count;
1147 				mac_srs->srs_ind++;
1148 			}
1149 			type = V4_UDP;
1150 			mp->b_rptr += hdrsize;
1151 			break;
1152 		default:
1153 			indx = 0;
1154 			type = OTH;
1155 		}
1156 
1157 		FANOUT_ENQUEUE_MP(headmp[type][indx], tailmp[type][indx],
1158 		    cnt[type][indx], bw_ctl, sz[type][indx], sz1, mp);
1159 	}
1160 
1161 	for (type = V4_TCP; type < UNDEF; type++) {
1162 		int	i;
1163 
1164 		for (i = 0; i < fanout_cnt; i++) {
1165 			if (headmp[type][i] != NULL) {
1166 				mac_soft_ring_t	*softring;
1167 
1168 				ASSERT(tailmp[type][i]->b_next == NULL);
1169 				switch (type) {
1170 				case V4_TCP:
1171 					softring =
1172 					    mac_srs->srs_tcp_soft_rings[i];
1173 					break;
1174 				case V4_UDP:
1175 					softring =
1176 					    mac_srs->srs_udp_soft_rings[i];
1177 					break;
1178 				case OTH:
1179 					softring =
1180 					    mac_srs->srs_oth_soft_rings[i];
1181 					break;
1182 				}
1183 				mac_rx_soft_ring_process(mcip,
1184 				    softring, headmp[type][i], tailmp[type][i],
1185 				    cnt[type][i], sz[type][i]);
1186 			}
1187 		}
1188 	}
1189 }
1190 
/*
 * Byte budget that mac_rx_srs_poll_ring() passes to MAC_HWRING_POLL() when
 * the SRS is not under bandwidth control. max_bytes_to_pickup is a global
 * variable whose initial value is SRS_BYTES_TO_PICKUP.
 */
#define	SRS_BYTES_TO_PICKUP	150000
ssize_t	max_bytes_to_pickup = SRS_BYTES_TO_PICKUP;
1193 
1194 /*
1195  * mac_rx_srs_poll_ring
1196  *
1197  * This SRS Poll thread uses this routine to poll the underlying hardware
1198  * Rx ring to get a chain of packets. It can inline process that chain
1199  * if mac_latency_optimize is set (default) or signal the SRS worker thread
1200  * to do the remaining processing.
1201  *
1202  * Since packets come in the system via interrupt or poll path, we also
1203  * update the stats and deal with promiscous clients here.
1204  */
1205 void
1206 mac_rx_srs_poll_ring(mac_soft_ring_set_t *mac_srs)
1207 {
1208 	kmutex_t 		*lock = &mac_srs->srs_lock;
1209 	kcondvar_t 		*async = &mac_srs->srs_cv;
1210 	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;
1211 	mblk_t 			*head, *tail, *mp;
1212 	callb_cpr_t 		cprinfo;
1213 	ssize_t 		bytes_to_pickup;
1214 	size_t 			sz;
1215 	int			count;
1216 	mac_client_impl_t	*smcip;
1217 
1218 	CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "mac_srs_poll");
1219 	mutex_enter(lock);
1220 
1221 start:
1222 	for (;;) {
1223 		if (mac_srs->srs_state & SRS_PAUSE)
1224 			goto done;
1225 
1226 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
1227 		cv_wait(async, lock);
1228 		CALLB_CPR_SAFE_END(&cprinfo, lock);
1229 
1230 		if (mac_srs->srs_state & SRS_PAUSE)
1231 			goto done;
1232 
1233 check_again:
1234 		if (mac_srs->srs_type & SRST_BW_CONTROL) {
1235 			/*
1236 			 * We pick as many bytes as we are allowed to queue.
1237 			 * Its possible that we will exceed the total
1238 			 * packets queued in case this SRS is part of the
1239 			 * Rx ring group since > 1 poll thread can be pulling
1240 			 * upto the max allowed packets at the same time
1241 			 * but that should be OK.
1242 			 */
1243 			mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1244 			bytes_to_pickup =
1245 			    mac_srs->srs_bw->mac_bw_drop_threshold -
1246 			    mac_srs->srs_bw->mac_bw_sz;
1247 			/*
1248 			 * We shouldn't have been signalled if we
1249 			 * have 0 or less bytes to pick but since
1250 			 * some of the bytes accounting is driver
1251 			 * dependant, we do the safety check.
1252 			 */
1253 			if (bytes_to_pickup < 0)
1254 				bytes_to_pickup = 0;
1255 			mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1256 		} else {
1257 			/*
1258 			 * ToDO: Need to change the polling API
1259 			 * to add a packet count and a flag which
1260 			 * tells the driver whether we want packets
1261 			 * based on a count, or bytes, or all the
1262 			 * packets queued in the driver/HW. This
1263 			 * way, we never have to check the limits
1264 			 * on poll path. We truly let only as many
1265 			 * packets enter the system as we are willing
1266 			 * to process or queue.
1267 			 *
1268 			 * Something along the lines of
1269 			 * pkts_to_pickup = mac_soft_ring_max_q_cnt -
1270 			 *	mac_srs->srs_poll_pkt_cnt
1271 			 */
1272 
1273 			/*
1274 			 * Since we are not doing B/W control, pick
1275 			 * as many packets as allowed.
1276 			 */
1277 			bytes_to_pickup = max_bytes_to_pickup;
1278 		}
1279 
1280 		/* Poll the underlying Hardware */
1281 		mutex_exit(lock);
1282 		head = MAC_HWRING_POLL(mac_srs->srs_ring, (int)bytes_to_pickup);
1283 		mutex_enter(lock);
1284 
1285 		ASSERT((mac_srs->srs_state & SRS_POLL_THR_OWNER) ==
1286 		    SRS_POLL_THR_OWNER);
1287 
1288 		mp = tail = head;
1289 		count = 0;
1290 		sz = 0;
1291 		while (mp != NULL) {
1292 			tail = mp;
1293 			sz += msgdsize(mp);
1294 			mp = mp->b_next;
1295 			count++;
1296 		}
1297 
1298 		if (head != NULL) {
1299 			tail->b_next = NULL;
1300 			smcip = mac_srs->srs_mcip;
1301 
1302 			if ((mac_srs->srs_type & SRST_FLOW) ||
1303 			    (smcip == NULL)) {
1304 				FLOW_STAT_UPDATE(mac_srs->srs_flent,
1305 				    rbytes, sz);
1306 				FLOW_STAT_UPDATE(mac_srs->srs_flent,
1307 				    ipackets, count);
1308 			}
1309 
1310 			/*
1311 			 * If there are any promiscuous mode callbacks
1312 			 * defined for this MAC client, pass them a copy
1313 			 * if appropriate and also update the counters.
1314 			 */
1315 			if (smcip != NULL) {
1316 				smcip->mci_stat_ibytes += sz;
1317 				smcip->mci_stat_ipackets += count;
1318 
1319 				if (smcip->mci_mip->mi_promisc_list != NULL) {
1320 					mutex_exit(lock);
1321 					mac_promisc_dispatch(smcip->mci_mip,
1322 					    head, NULL);
1323 					mutex_enter(lock);
1324 				}
1325 			}
1326 			if (mac_srs->srs_type & SRST_BW_CONTROL) {
1327 				mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1328 				mac_srs->srs_bw->mac_bw_polled += sz;
1329 				mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1330 			}
1331 			srs_rx->sr_poll_count += count;
1332 			MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail,
1333 			    count, sz);
1334 			if (count <= 10)
1335 				srs_rx->sr_chain_cnt_undr10++;
1336 			else if (count > 10 && count <= 50)
1337 				srs_rx->sr_chain_cnt_10to50++;
1338 			else
1339 				srs_rx->sr_chain_cnt_over50++;
1340 		}
1341 
1342 		/*
1343 		 * We are guaranteed that SRS_PROC will be set if we
1344 		 * are here. Also, poll thread gets to run only if
1345 		 * the drain was being done by a worker thread although
1346 		 * its possible that worker thread is still running
1347 		 * and poll thread was sent down to keep the pipeline
1348 		 * going instead of doing a complete drain and then
1349 		 * trying to poll the NIC.
1350 		 *
1351 		 * So we need to check SRS_WORKER flag to make sure
1352 		 * that the worker thread is not processing the queue
1353 		 * in parallel to us. The flags and conditions are
1354 		 * protected by the srs_lock to prevent any race. We
1355 		 * ensure that we don't drop the srs_lock from now
1356 		 * till the end and similarly we don't drop the srs_lock
1357 		 * in mac_rx_srs_drain() till similar condition check
1358 		 * are complete. The mac_rx_srs_drain() needs to ensure
1359 		 * that SRS_WORKER flag remains set as long as its
1360 		 * processing the queue.
1361 		 */
1362 		if (!(mac_srs->srs_state & SRS_WORKER) &&
1363 		    (mac_srs->srs_first != NULL)) {
1364 			/*
1365 			 * We have packets to process and worker thread
1366 			 * is not running. Check to see if poll thread is
1367 			 * allowed to process.
1368 			 */
1369 			if (mac_srs->srs_state & SRS_LATENCY_OPT) {
1370 				mac_srs->srs_drain_func(mac_srs, SRS_POLL_PROC);
1371 				if (!(mac_srs->srs_state & SRS_PAUSE) &&
1372 				    srs_rx->sr_poll_pkt_cnt <=
1373 				    srs_rx->sr_lowat) {
1374 					srs_rx->sr_poll_again++;
1375 					goto check_again;
1376 				}
1377 				/*
1378 				 * We are already above low water mark
1379 				 * so stay in the polling mode but no
1380 				 * need to poll. Once we dip below
1381 				 * the polling threshold, the processing
1382 				 * thread (soft ring) will signal us
1383 				 * to poll again (MAC_UPDATE_SRS_COUNT)
1384 				 */
1385 				srs_rx->sr_poll_drain_no_poll++;
1386 				mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS);
1387 				/*
1388 				 * In B/W control case, its possible
1389 				 * that the backlog built up due to
1390 				 * B/W limit being reached and packets
1391 				 * are queued only in SRS. In this case,
1392 				 * we should schedule worker thread
1393 				 * since no one else will wake us up.
1394 				 */
1395 				if ((mac_srs->srs_type & SRST_BW_CONTROL) &&
1396 				    (mac_srs->srs_tid == NULL)) {
1397 					mac_srs->srs_tid =
1398 					    timeout(mac_srs_fire, mac_srs, 1);
1399 					srs_rx->sr_poll_worker_wakeup++;
1400 				}
1401 			} else {
1402 				/*
1403 				 * Wakeup the worker thread for more processing.
1404 				 * We optimize for throughput in this case.
1405 				 */
1406 				mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS);
1407 				MAC_SRS_WORKER_WAKEUP(mac_srs);
1408 				srs_rx->sr_poll_sig_worker++;
1409 			}
1410 		} else if ((mac_srs->srs_first == NULL) &&
1411 		    !(mac_srs->srs_state & SRS_WORKER)) {
1412 			/*
1413 			 * There is nothing queued in SRS and
1414 			 * no worker thread running. Plus we
1415 			 * didn't get anything from the H/W
1416 			 * as well (head == NULL);
1417 			 */
1418 			ASSERT(head == NULL);
1419 			mac_srs->srs_state &=
1420 			    ~(SRS_PROC|SRS_GET_PKTS);
1421 
1422 			/*
1423 			 * If we have a packets in soft ring, don't allow
1424 			 * more packets to come into this SRS by keeping the
1425 			 * interrupts off but not polling the H/W. The
1426 			 * poll thread will get signaled as soon as
1427 			 * srs_poll_pkt_cnt dips below poll threshold.
1428 			 */
1429 			if (srs_rx->sr_poll_pkt_cnt == 0) {
1430 				srs_rx->sr_poll_intr_enable++;
1431 				MAC_SRS_POLLING_OFF(mac_srs);
1432 			} else {
1433 				/*
1434 				 * We know nothing is queued in SRS
1435 				 * since we are here after checking
1436 				 * srs_first is NULL. The backlog
1437 				 * is entirely due to packets queued
1438 				 * in Soft ring which will wake us up
1439 				 * and get the interface out of polling
1440 				 * mode once the backlog dips below
1441 				 * sr_poll_thres.
1442 				 */
1443 				srs_rx->sr_poll_no_poll++;
1444 			}
1445 		} else {
1446 			/*
1447 			 * Worker thread is already running.
1448 			 * Nothing much to do. If the polling
1449 			 * was enabled, worker thread will deal
1450 			 * with that.
1451 			 */
1452 			mac_srs->srs_state &= ~SRS_GET_PKTS;
1453 			srs_rx->sr_poll_goto_sleep++;
1454 		}
1455 	}
1456 done:
1457 	mac_srs->srs_state |= SRS_POLL_THR_QUIESCED;
1458 	cv_signal(&mac_srs->srs_async);
1459 	/*
1460 	 * If this is a temporary quiesce then wait for the restart signal
1461 	 * from the srs worker. Then clear the flags and signal the srs worker
1462 	 * to ensure a positive handshake and go back to start.
1463 	 */
1464 	while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_POLL_THR_RESTART)))
1465 		cv_wait(async, lock);
1466 	if (mac_srs->srs_state & SRS_POLL_THR_RESTART) {
1467 		ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED));
1468 		mac_srs->srs_state &=
1469 		    ~(SRS_POLL_THR_QUIESCED | SRS_POLL_THR_RESTART);
1470 		cv_signal(&mac_srs->srs_async);
1471 		goto start;
1472 	} else {
1473 		mac_srs->srs_state |= SRS_POLL_THR_EXITED;
1474 		cv_signal(&mac_srs->srs_async);
1475 		CALLB_CPR_EXIT(&cprinfo);
1476 		thread_exit();
1477 	}
1478 }
1479 
1480 /*
1481  * mac_srs_pick_chain
1482  *
1483  * In Bandwidth control case, checks how many packets can be processed
1484  * and return them in a sub chain.
1485  */
1486 static mblk_t *
1487 mac_srs_pick_chain(mac_soft_ring_set_t *mac_srs, mblk_t **chain_tail,
1488     size_t *chain_sz, int *chain_cnt)
1489 {
1490 	mblk_t 			*head = NULL;
1491 	mblk_t 			*tail = NULL;
1492 	size_t			sz;
1493 	size_t 			tsz = 0;
1494 	int			cnt = 0;
1495 	mblk_t 			*mp;
1496 
1497 	ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
1498 	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1499 	if (((mac_srs->srs_bw->mac_bw_used + mac_srs->srs_size) <=
1500 	    mac_srs->srs_bw->mac_bw_limit) ||
1501 	    (mac_srs->srs_bw->mac_bw_limit == 0)) {
1502 		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1503 		head = mac_srs->srs_first;
1504 		mac_srs->srs_first = NULL;
1505 		*chain_tail = mac_srs->srs_last;
1506 		mac_srs->srs_last = NULL;
1507 		*chain_sz = mac_srs->srs_size;
1508 		*chain_cnt = mac_srs->srs_count;
1509 		mac_srs->srs_count = 0;
1510 		mac_srs->srs_size = 0;
1511 		return (head);
1512 	}
1513 
1514 	/*
1515 	 * Can't clear the entire backlog.
1516 	 * Need to find how many packets to pick
1517 	 */
1518 	ASSERT(MUTEX_HELD(&mac_srs->srs_bw->mac_bw_lock));
1519 	while ((mp = mac_srs->srs_first) != NULL) {
1520 		sz = msgdsize(mp);
1521 		if ((tsz + sz + mac_srs->srs_bw->mac_bw_used) >
1522 		    mac_srs->srs_bw->mac_bw_limit) {
1523 			if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED))
1524 				mac_srs->srs_bw->mac_bw_state |=
1525 				    SRS_BW_ENFORCED;
1526 			break;
1527 		}
1528 
1529 		/*
1530 		 * The _size & cnt is  decremented from the softrings
1531 		 * when they send up the packet for polling to work
1532 		 * properly.
1533 		 */
1534 		tsz += sz;
1535 		cnt++;
1536 		mac_srs->srs_count--;
1537 		mac_srs->srs_size -= sz;
1538 		if (tail != NULL)
1539 			tail->b_next = mp;
1540 		else
1541 			head = mp;
1542 		tail = mp;
1543 		mac_srs->srs_first = mac_srs->srs_first->b_next;
1544 	}
1545 	mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1546 	if (mac_srs->srs_first == NULL)
1547 		mac_srs->srs_last = NULL;
1548 
1549 	if (tail != NULL)
1550 		tail->b_next = NULL;
1551 	*chain_tail = tail;
1552 	*chain_cnt = cnt;
1553 	*chain_sz = tsz;
1554 
1555 	return (head);
1556 }
1557 
1558 /*
1559  * mac_rx_srs_drain
1560  *
1561  * The SRS drain routine. Gets to run to clear the queue. Any thread
1562  * (worker, interrupt, poll) can call this based on processing model.
1563  * The first thing we do is disable interrupts if possible and then
1564  * drain the queue. we also try to poll the underlying hardware if
1565  * there is a dedicated hardware Rx ring assigned to this SRS.
1566  *
1567  * There is a equivalent drain routine in bandwidth control mode
1568  * mac_rx_srs_drain_bw. There is some code duplication between the two
1569  * routines but they are highly performance sensitive and are easier
1570  * to read/debug if they stay separate. Any code changes here might
1571  * also apply to mac_rx_srs_drain_bw as well.
1572  */
1573 void
1574 mac_rx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
1575 {
1576 	mblk_t 			*head;
1577 	mblk_t			*tail;
1578 	timeout_id_t 		tid;
1579 	int			cnt = 0;
1580 	mac_client_impl_t	*mcip = mac_srs->srs_mcip;
1581 	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;
1582 
1583 	ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
1584 	ASSERT(!(mac_srs->srs_type & SRST_BW_CONTROL));
1585 
1586 	/* If we are blanked i.e. can't do upcalls, then we are done */
1587 	if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) {
1588 		ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) ||
1589 		    (mac_srs->srs_state & SRS_PAUSE));
1590 		goto out;
1591 	}
1592 
1593 	if (mac_srs->srs_first == NULL)
1594 		goto out;
1595 
1596 	if (!(mac_srs->srs_state & SRS_LATENCY_OPT) &&
1597 	    (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)) {
1598 		/*
1599 		 * In the normal case, the SRS worker thread does no
1600 		 * work and we wait for a backlog to build up before
1601 		 * we switch into polling mode. In case we are
1602 		 * optimizing for throughput, we use the worker thread
1603 		 * as well. The goal is to let worker thread process
1604 		 * the queue and poll thread to feed packets into
1605 		 * the queue. As such, we should signal the poll
1606 		 * thread to try and get more packets.
1607 		 *
1608 		 * We could have pulled this check in the POLL_RING
1609 		 * macro itself but keeping it explicit here makes
1610 		 * the architecture more human understandable.
1611 		 */
1612 		MAC_SRS_POLL_RING(mac_srs);
1613 	}
1614 
1615 again:
1616 	head = mac_srs->srs_first;
1617 	mac_srs->srs_first = NULL;
1618 	tail = mac_srs->srs_last;
1619 	mac_srs->srs_last = NULL;
1620 	cnt = mac_srs->srs_count;
1621 	mac_srs->srs_count = 0;
1622 
1623 	ASSERT(head != NULL);
1624 	ASSERT(tail != NULL);
1625 
1626 	if ((tid = mac_srs->srs_tid) != 0)
1627 		mac_srs->srs_tid = 0;
1628 
1629 	mac_srs->srs_state |= (SRS_PROC|proc_type);
1630 
1631 
1632 	/*
1633 	 * mcip is NULL for broadcast and multicast flows. The promisc
1634 	 * callbacks for broadcast and multicast packets are delivered from
1635 	 * mac_rx() and we don't need to worry about that case in this path
1636 	 */
1637 	if (mcip != NULL && mcip->mci_promisc_list != NULL) {
1638 		mutex_exit(&mac_srs->srs_lock);
1639 		mac_promisc_client_dispatch(mcip, head);
1640 		mutex_enter(&mac_srs->srs_lock);
1641 	}
1642 
1643 	/*
1644 	 * Check if SRS itself is doing the processing
1645 	 * This direct path does not apply when subflows are present. In this
1646 	 * case, packets need to be dispatched to a soft ring according to the
1647 	 * flow's bandwidth and other resources contraints.
1648 	 */
1649 	if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) {
1650 		mac_direct_rx_t		proc;
1651 		void			*arg1;
1652 		mac_resource_handle_t	arg2;
1653 
1654 		/*
1655 		 * This is the case when a Rx is directly
1656 		 * assigned and we have a fully classified
1657 		 * protocol chain. We can deal with it in
1658 		 * one shot.
1659 		 */
1660 		proc = srs_rx->sr_func;
1661 		arg1 = srs_rx->sr_arg1;
1662 		arg2 = srs_rx->sr_arg2;
1663 
1664 		mac_srs->srs_state |= SRS_CLIENT_PROC;
1665 		mutex_exit(&mac_srs->srs_lock);
1666 		if (tid != 0) {
1667 			(void) untimeout(tid);
1668 			tid = 0;
1669 		}
1670 
1671 		proc(arg1, arg2, head, NULL);
1672 		/*
1673 		 * Decrement the size and count here itelf
1674 		 * since the packet has been processed.
1675 		 */
1676 		mutex_enter(&mac_srs->srs_lock);
1677 		MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
1678 		if (mac_srs->srs_state & SRS_CLIENT_WAIT)
1679 			cv_signal(&mac_srs->srs_client_cv);
1680 		mac_srs->srs_state &= ~SRS_CLIENT_PROC;
1681 	} else {
1682 		/* Some kind of softrings based fanout is required */
1683 		mutex_exit(&mac_srs->srs_lock);
1684 		if (tid != 0) {
1685 			(void) untimeout(tid);
1686 			tid = 0;
1687 		}
1688 
1689 		/*
1690 		 * Since the fanout routines can deal with chains,
1691 		 * shoot the entire chain up.
1692 		 */
1693 		if (mac_srs->srs_type & SRST_FANOUT_SRC_IP)
1694 			mac_rx_srs_fanout(mac_srs, head);
1695 		else
1696 			mac_rx_srs_proto_fanout(mac_srs, head);
1697 		mutex_enter(&mac_srs->srs_lock);
1698 	}
1699 
1700 	if (!(mac_srs->srs_state & (SRS_BLANK|SRS_PAUSE)) &&
1701 	    (mac_srs->srs_first != NULL)) {
1702 		/*
1703 		 * More packets arrived while we were clearing the
1704 		 * SRS. This can be possible because of one of
1705 		 * three conditions below:
1706 		 * 1) The driver is using multiple worker threads
1707 		 *    to send the packets to us.
1708 		 * 2) The driver has a race in switching
1709 		 *    between interrupt and polling mode or
1710 		 * 3) Packets are arriving in this SRS via the
1711 		 *    S/W classification as well.
1712 		 *
1713 		 * We should switch to polling mode and see if we
1714 		 * need to send the poll thread down. Also, signal
1715 		 * the worker thread to process whats just arrived.
1716 		 */
1717 		MAC_SRS_POLLING_ON(mac_srs);
1718 		if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat) {
1719 			srs_rx->sr_drain_poll_sig++;
1720 			MAC_SRS_POLL_RING(mac_srs);
1721 		}
1722 
1723 		/*
1724 		 * If we didn't signal the poll thread, we need
1725 		 * to deal with the pending packets ourselves.
1726 		 */
1727 		if (proc_type == SRS_WORKER) {
1728 			srs_rx->sr_drain_again++;
1729 			goto again;
1730 		} else {
1731 			srs_rx->sr_drain_worker_sig++;
1732 			cv_signal(&mac_srs->srs_async);
1733 		}
1734 	}
1735 
1736 out:
1737 	if (mac_srs->srs_state & SRS_GET_PKTS) {
1738 		/*
1739 		 * Poll thread is already running. Leave the
1740 		 * SRS_RPOC set and hand over the control to
1741 		 * poll thread.
1742 		 */
1743 		mac_srs->srs_state &= ~proc_type;
1744 		srs_rx->sr_drain_poll_running++;
1745 		return;
1746 	}
1747 
1748 	/*
1749 	 * Even if there are no packets queued in SRS, we
1750 	 * need to make sure that the shared counter is
1751 	 * clear and any associated softrings have cleared
1752 	 * all the backlog. Otherwise, leave the interface
1753 	 * in polling mode and the poll thread will get
1754 	 * signalled once the count goes down to zero.
1755 	 *
1756 	 * If someone is already draining the queue (SRS_PROC is
1757 	 * set) when the srs_poll_pkt_cnt goes down to zero,
1758 	 * then it means that drain is already running and we
1759 	 * will turn off polling at that time if there is
1760 	 * no backlog.
1761 	 *
1762 	 * As long as there are packets queued either
1763 	 * in soft ring set or its soft rings, we will leave
1764 	 * the interface in polling mode (even if the drain
1765 	 * was done being the interrupt thread). We signal
1766 	 * the poll thread as well if we have dipped below
1767 	 * low water mark.
1768 	 *
1769 	 * NOTE: We can't use the MAC_SRS_POLLING_ON macro
1770 	 * since that turn polling on only for worker thread.
1771 	 * Its not worth turning polling on for interrupt
1772 	 * thread (since NIC will not issue another interrupt)
1773 	 * unless a backlog builds up.
1774 	 */
1775 	if ((srs_rx->sr_poll_pkt_cnt > 0) &&
1776 	    (mac_srs->srs_state & SRS_POLLING_CAPAB)) {
1777 		mac_srs->srs_state &= ~(SRS_PROC|proc_type);
1778 		srs_rx->sr_drain_keep_polling++;
1779 		MAC_SRS_POLLING_ON(mac_srs);
1780 		if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)
1781 			MAC_SRS_POLL_RING(mac_srs);
1782 		return;
1783 	}
1784 
1785 	/* Nothing else to do. Get out of poll mode */
1786 	MAC_SRS_POLLING_OFF(mac_srs);
1787 	mac_srs->srs_state &= ~(SRS_PROC|proc_type);
1788 	srs_rx->sr_drain_finish_intr++;
1789 }
1790 
1791 /*
1792  * mac_rx_srs_drain_bw
1793  *
1794  * The SRS BW drain routine. Gets to run to clear the queue. Any thread
1795  * (worker, interrupt, poll) can call this based on processing model.
1796  * The first thing we do is disable interrupts if possible and then
1797  * drain the queue. we also try to poll the underlying hardware if
1798  * there is a dedicated hardware Rx ring assigned to this SRS.
1799  *
1800  * There is a equivalent drain routine in non bandwidth control mode
1801  * mac_rx_srs_drain. There is some code duplication between the two
1802  * routines but they are highly performance sensitive and are easier
1803  * to read/debug if they stay separate. Any code changes here might
1804  * also apply to mac_rx_srs_drain as well.
1805  */
1806 void
1807 mac_rx_srs_drain_bw(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
1808 {
1809 	mblk_t 			*head;
1810 	mblk_t			*tail;
1811 	timeout_id_t 		tid;
1812 	size_t			sz = 0;
1813 	int			cnt = 0;
1814 	mac_client_impl_t	*mcip = mac_srs->srs_mcip;
1815 	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;
1816 
1817 	ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
1818 	ASSERT(mac_srs->srs_type & SRST_BW_CONTROL);
1819 again:
1820 	/* Check if we are doing B/W control */
1821 	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1822 	if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) {
1823 		mac_srs->srs_bw->mac_bw_curr_time = lbolt;
1824 		mac_srs->srs_bw->mac_bw_used = 0;
1825 		if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)
1826 			mac_srs->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED;
1827 	} else if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) {
1828 		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1829 		goto done;
1830 	} else if (mac_srs->srs_bw->mac_bw_used >
1831 	    mac_srs->srs_bw->mac_bw_limit) {
1832 		mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
1833 		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1834 		goto done;
1835 	}
1836 	mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1837 
1838 	/* If we are blanked i.e. can't do upcalls, then we are done */
1839 	if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) {
1840 		ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) ||
1841 		    (mac_srs->srs_state & SRS_PAUSE));
1842 		goto done;
1843 	}
1844 
1845 	sz = 0;
1846 	cnt = 0;
1847 	if ((head = mac_srs_pick_chain(mac_srs, &tail, &sz, &cnt)) == NULL) {
1848 		/*
1849 		 * We couldn't pick up a single packet.
1850 		 */
1851 		mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1852 		if ((mac_srs->srs_bw->mac_bw_used == 0) &&
1853 		    (mac_srs->srs_size != 0) &&
1854 		    !(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
1855 			/*
1856 			 * Seems like configured B/W doesn't
1857 			 * even allow processing of 1 packet
1858 			 * per tick.
1859 			 *
1860 			 * XXX: raise the limit to processing
1861 			 * at least 1 packet per tick.
1862 			 */
1863 			mac_srs->srs_bw->mac_bw_limit +=
1864 			    mac_srs->srs_bw->mac_bw_limit;
1865 			mac_srs->srs_bw->mac_bw_drop_threshold +=
1866 			    mac_srs->srs_bw->mac_bw_drop_threshold;
1867 			cmn_err(CE_NOTE, "mac_rx_srs_drain: srs(%p) "
1868 			    "raised B/W limit to %d since not even a "
1869 			    "single packet can be processed per "
1870 			    "tick %d\n", (void *)mac_srs,
1871 			    (int)mac_srs->srs_bw->mac_bw_limit,
1872 			    (int)msgdsize(mac_srs->srs_first));
1873 		}
1874 		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1875 		goto done;
1876 	}
1877 
1878 	ASSERT(head != NULL);
1879 	ASSERT(tail != NULL);
1880 
1881 	/* zero bandwidth: drop all and return to interrupt mode */
1882 	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1883 	if (mac_srs->srs_bw->mac_bw_limit == 0) {
1884 		srs_rx->sr_drop_count += cnt;
1885 		ASSERT(mac_srs->srs_bw->mac_bw_sz >= sz);
1886 		mac_srs->srs_bw->mac_bw_sz -= sz;
1887 		mac_srs->srs_bw->mac_bw_drop_bytes += sz;
1888 		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1889 		mac_pkt_drop(NULL, NULL, head, B_FALSE);
1890 		goto leave_poll;
1891 	} else {
1892 		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1893 	}
1894 
1895 	if ((tid = mac_srs->srs_tid) != 0)
1896 		mac_srs->srs_tid = 0;
1897 
1898 	mac_srs->srs_state |= (SRS_PROC|proc_type);
1899 	MAC_SRS_WORKER_POLLING_ON(mac_srs);
1900 
1901 	/*
1902 	 * mcip is NULL for broadcast and multicast flows. The promisc
1903 	 * callbacks for broadcast and multicast packets are delivered from
1904 	 * mac_rx() and we don't need to worry about that case in this path
1905 	 */
1906 	if (mcip != NULL && mcip->mci_promisc_list != NULL) {
1907 		mutex_exit(&mac_srs->srs_lock);
1908 		mac_promisc_client_dispatch(mcip, head);
1909 		mutex_enter(&mac_srs->srs_lock);
1910 	}
1911 
1912 	/*
1913 	 * Check if SRS itself is doing the processing
1914 	 * This direct path does not apply when subflows are present. In this
1915 	 * case, packets need to be dispatched to a soft ring according to the
1916 	 * flow's bandwidth and other resources contraints.
1917 	 */
1918 	if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) {
1919 		mac_direct_rx_t		proc;
1920 		void			*arg1;
1921 		mac_resource_handle_t	arg2;
1922 
1923 		/*
1924 		 * This is the case when a Rx is directly
1925 		 * assigned and we have a fully classified
1926 		 * protocol chain. We can deal with it in
1927 		 * one shot.
1928 		 */
1929 		proc = srs_rx->sr_func;
1930 		arg1 = srs_rx->sr_arg1;
1931 		arg2 = srs_rx->sr_arg2;
1932 
1933 		mac_srs->srs_state |= SRS_CLIENT_PROC;
1934 		mutex_exit(&mac_srs->srs_lock);
1935 		if (tid != 0) {
1936 			(void) untimeout(tid);
1937 			tid = 0;
1938 		}
1939 
1940 		proc(arg1, arg2, head, NULL);
1941 		/*
1942 		 * Decrement the size and count here itelf
1943 		 * since the packet has been processed.
1944 		 */
1945 		mutex_enter(&mac_srs->srs_lock);
1946 		MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
1947 		MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz);
1948 
1949 		if (mac_srs->srs_state & SRS_CLIENT_WAIT)
1950 			cv_signal(&mac_srs->srs_client_cv);
1951 		mac_srs->srs_state &= ~SRS_CLIENT_PROC;
1952 	} else {
1953 		/* Some kind of softrings based fanout is required */
1954 		mutex_exit(&mac_srs->srs_lock);
1955 		if (tid != 0) {
1956 			(void) untimeout(tid);
1957 			tid = 0;
1958 		}
1959 
1960 		/*
1961 		 * Since the fanout routines can deal with chains,
1962 		 * shoot the entire chain up.
1963 		 */
1964 		if (mac_srs->srs_type & SRST_FANOUT_SRC_IP)
1965 			mac_rx_srs_fanout(mac_srs, head);
1966 		else
1967 			mac_rx_srs_proto_fanout(mac_srs, head);
1968 		mutex_enter(&mac_srs->srs_lock);
1969 	}
1970 
1971 	/*
1972 	 * Send the poll thread to pick up any packets arrived
1973 	 * so far. This also serves as the last check in case
1974 	 * nothing else is queued in the SRS. The poll thread
1975 	 * is signalled only in the case the drain was done
1976 	 * by the worker thread and SRS_WORKER is set. The
1977 	 * worker thread can run in parallel as long as the
1978 	 * SRS_WORKER flag is set. We we have nothing else to
1979 	 * process, we can exit while leaving SRS_PROC set
1980 	 * which gives the poll thread control to process and
1981 	 * cleanup once it returns from the NIC.
1982 	 *
1983 	 * If we have nothing else to process, we need to
1984 	 * ensure that we keep holding the srs_lock till
1985 	 * all the checks below are done and control is
1986 	 * handed to the poll thread if it was running.
1987 	 */
1988 	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
1989 	if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
1990 		if (mac_srs->srs_first != NULL) {
1991 			if (proc_type == SRS_WORKER) {
1992 				mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
1993 				if (srs_rx->sr_poll_pkt_cnt <=
1994 				    srs_rx->sr_lowat)
1995 					MAC_SRS_POLL_RING(mac_srs);
1996 				goto again;
1997 			} else {
1998 				cv_signal(&mac_srs->srs_async);
1999 			}
2000 		}
2001 	}
2002 	mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
2003 
2004 done:
2005 
2006 	if (mac_srs->srs_state & SRS_GET_PKTS) {
2007 		/*
2008 		 * Poll thread is already running. Leave the
2009 		 * SRS_RPOC set and hand over the control to
2010 		 * poll thread.
2011 		 */
2012 		mac_srs->srs_state &= ~proc_type;
2013 		return;
2014 	}
2015 
2016 	/*
2017 	 * If we can't process packets because we have exceeded
2018 	 * B/W limit for this tick, just set the timeout
2019 	 * and leave.
2020 	 *
2021 	 * Even if there are no packets queued in SRS, we
2022 	 * need to make sure that the shared counter is
2023 	 * clear and any associated softrings have cleared
2024 	 * all the backlog. Otherwise, leave the interface
2025 	 * in polling mode and the poll thread will get
2026 	 * signalled once the count goes down to zero.
2027 	 *
2028 	 * If someone is already draining the queue (SRS_PROC is
2029 	 * set) when the srs_poll_pkt_cnt goes down to zero,
2030 	 * then it means that drain is already running and we
2031 	 * will turn off polling at that time if there is
2032 	 * no backlog. As long as there are packets queued either
2033 	 * is soft ring set or its soft rings, we will leave
2034 	 * the interface in polling mode.
2035 	 */
2036 	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
2037 	if ((mac_srs->srs_state & SRS_POLLING_CAPAB) &&
2038 	    ((mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) ||
2039 	    (srs_rx->sr_poll_pkt_cnt > 0))) {
2040 		MAC_SRS_POLLING_ON(mac_srs);
2041 		mac_srs->srs_state &= ~(SRS_PROC|proc_type);
2042 		if ((mac_srs->srs_first != NULL) &&
2043 		    (mac_srs->srs_tid == NULL))
2044 			mac_srs->srs_tid = timeout(mac_srs_fire,
2045 			    mac_srs, 1);
2046 		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
2047 		return;
2048 	}
2049 	mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
2050 
2051 leave_poll:
2052 
2053 	/* Nothing else to do. Get out of poll mode */
2054 	MAC_SRS_POLLING_OFF(mac_srs);
2055 	mac_srs->srs_state &= ~(SRS_PROC|proc_type);
2056 }
2057 
2058 /*
2059  * mac_srs_worker
2060  *
2061  * The SRS worker routine. Drains the queue when no one else is
2062  * processing it.
2063  */
2064 void
2065 mac_srs_worker(mac_soft_ring_set_t *mac_srs)
2066 {
2067 	kmutex_t 		*lock = &mac_srs->srs_lock;
2068 	kcondvar_t 		*async = &mac_srs->srs_async;
2069 	callb_cpr_t		cprinfo;
2070 	boolean_t		bw_ctl_flag;
2071 
2072 	CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "srs_worker");
2073 	mutex_enter(lock);
2074 
2075 start:
2076 	for (;;) {
2077 		bw_ctl_flag = B_FALSE;
2078 		if (mac_srs->srs_type & SRST_BW_CONTROL) {
2079 			MAC_SRS_BW_LOCK(mac_srs);
2080 			MAC_SRS_CHECK_BW_CONTROL(mac_srs);
2081 			if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)
2082 				bw_ctl_flag = B_TRUE;
2083 			MAC_SRS_BW_UNLOCK(mac_srs);
2084 		}
2085 		/*
2086 		 * The SRS_BW_ENFORCED flag may change since we have dropped
2087 		 * the mac_bw_lock. However the drain function can handle both
2088 		 * a drainable SRS or a bandwidth controlled SRS, and the
2089 		 * effect of scheduling a timeout is to wakeup the worker
2090 		 * thread which in turn will call the drain function. Since
2091 		 * we release the srs_lock atomically only in the cv_wait there
2092 		 * isn't a fear of waiting for ever.
2093 		 */
2094 		while (((mac_srs->srs_state & SRS_PROC) ||
2095 		    (mac_srs->srs_first == NULL) || bw_ctl_flag ||
2096 		    (mac_srs->srs_state & SRS_TX_BLOCKED)) &&
2097 		    !(mac_srs->srs_state & SRS_PAUSE)) {
2098 			/*
2099 			 * If we have packets queued and we are here
2100 			 * because B/W control is in place, we better
2101 			 * schedule the worker wakeup after 1 tick
2102 			 * to see if bandwidth control can be relaxed.
2103 			 */
2104 			if (bw_ctl_flag && mac_srs->srs_tid == NULL) {
2105 				/*
2106 				 * We need to ensure that a timer  is already
2107 				 * scheduled or we force  schedule one for
2108 				 * later so that we can continue processing
2109 				 * after this  quanta is over.
2110 				 */
2111 				mac_srs->srs_tid = timeout(mac_srs_fire,
2112 				    mac_srs, 1);
2113 			}
2114 wait:
2115 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
2116 			cv_wait(async, lock);
2117 			CALLB_CPR_SAFE_END(&cprinfo, lock);
2118 
2119 			if (mac_srs->srs_state & SRS_PAUSE)
2120 				goto done;
2121 			if (mac_srs->srs_state & SRS_PROC)
2122 				goto wait;
2123 
2124 			if (mac_srs->srs_first != NULL &&
2125 			    mac_srs->srs_type & SRST_BW_CONTROL) {
2126 				MAC_SRS_BW_LOCK(mac_srs);
2127 				if (mac_srs->srs_bw->mac_bw_state &
2128 				    SRS_BW_ENFORCED) {
2129 					MAC_SRS_CHECK_BW_CONTROL(mac_srs);
2130 				}
2131 				bw_ctl_flag = mac_srs->srs_bw->mac_bw_state &
2132 				    SRS_BW_ENFORCED;
2133 				MAC_SRS_BW_UNLOCK(mac_srs);
2134 			}
2135 		}
2136 
2137 		if (mac_srs->srs_state & SRS_PAUSE)
2138 			goto done;
2139 		mac_srs->srs_drain_func(mac_srs, SRS_WORKER);
2140 	}
2141 done:
2142 	/*
2143 	 * The Rx SRS quiesce logic first cuts off packet supply to the SRS
2144 	 * from both hard and soft classifications and waits for such threads
2145 	 * to finish before signaling the worker. So at this point the only
2146 	 * thread left that could be competing with the worker is the poll
2147 	 * thread. In the case of Tx, there shouldn't be any thread holding
2148 	 * SRS_PROC at this point.
2149 	 */
2150 	if (!(mac_srs->srs_state & SRS_PROC)) {
2151 		mac_srs->srs_state |= SRS_PROC;
2152 	} else {
2153 		ASSERT((mac_srs->srs_type & SRST_TX) == 0);
2154 		/*
2155 		 * Poll thread still owns the SRS and is still running
2156 		 */
2157 		ASSERT((mac_srs->srs_poll_thr == NULL) ||
2158 		    ((mac_srs->srs_state & SRS_POLL_THR_OWNER) ==
2159 		    SRS_POLL_THR_OWNER));
2160 	}
2161 	mac_srs_worker_quiesce(mac_srs);
2162 	/*
2163 	 * Wait for the SRS_RESTART or SRS_CONDEMNED signal from the initiator
2164 	 * of the quiesce operation
2165 	 */
2166 	while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_RESTART)))
2167 		cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock);
2168 
2169 	if (mac_srs->srs_state & SRS_RESTART) {
2170 		ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED));
2171 		mac_srs_worker_restart(mac_srs);
2172 		mac_srs->srs_state &= ~SRS_PROC;
2173 		goto start;
2174 	}
2175 
2176 	if (!(mac_srs->srs_state & SRS_CONDEMNED_DONE))
2177 		mac_srs_worker_quiesce(mac_srs);
2178 
2179 	mac_srs->srs_state &= ~SRS_PROC;
2180 	/* The macro drops the srs_lock */
2181 	CALLB_CPR_EXIT(&cprinfo);
2182 	thread_exit();
2183 }
2184 
2185 /*
2186  * mac_rx_srs_subflow_process
2187  *
2188  * Receive side routine called from interrupt path when there are
2189  * sub flows present on this SRS.
2190  */
2191 /* ARGSUSED */
2192 void
2193 mac_rx_srs_subflow_process(void *arg, mac_resource_handle_t srs,
2194     mblk_t *mp_chain, boolean_t loopback)
2195 {
2196 	flow_entry_t		*flent = NULL;
2197 	flow_entry_t		*prev_flent = NULL;
2198 	mblk_t			*mp = NULL;
2199 	mblk_t			*tail = NULL;
2200 	mac_soft_ring_set_t	*mac_srs = (mac_soft_ring_set_t *)srs;
2201 	mac_client_impl_t	*mcip;
2202 
2203 	mcip = mac_srs->srs_mcip;
2204 	ASSERT(mcip != NULL);
2205 
2206 	/*
2207 	 * We need to determine the SRS for every packet
2208 	 * by walking the flow table, if we don't get any,
2209 	 * then we proceed using the SRS we came with.
2210 	 */
2211 	mp = tail = mp_chain;
2212 	while (mp != NULL) {
2213 
2214 		/*
2215 		 * We will increment the stats for the mactching subflow.
2216 		 * when we get the bytes/pkt count for the classified packets
2217 		 * later in mac_rx_srs_process.
2218 		 */
2219 		(void) mac_flow_lookup(mcip->mci_subflow_tab, mp,
2220 		    FLOW_INBOUND, &flent);
2221 
2222 		if (mp == mp_chain || flent == prev_flent) {
2223 			if (prev_flent != NULL)
2224 				FLOW_REFRELE(prev_flent);
2225 			prev_flent = flent;
2226 			flent = NULL;
2227 			tail = mp;
2228 			mp = mp->b_next;
2229 			continue;
2230 		}
2231 		tail->b_next = NULL;
2232 		/*
2233 		 * A null indicates, this is for the mac_srs itself.
2234 		 * XXX-venu : probably assert for fe_rx_srs_cnt == 0.
2235 		 */
2236 		if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) {
2237 			mac_rx_srs_process(arg,
2238 			    (mac_resource_handle_t)mac_srs, mp_chain,
2239 			    loopback);
2240 		} else {
2241 			(prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1,
2242 			    prev_flent->fe_cb_arg2, mp_chain, loopback);
2243 			FLOW_REFRELE(prev_flent);
2244 		}
2245 		prev_flent = flent;
2246 		flent = NULL;
2247 		mp_chain = mp;
2248 		tail = mp;
2249 		mp = mp->b_next;
2250 	}
2251 	/* Last chain */
2252 	ASSERT(mp_chain != NULL);
2253 	if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) {
2254 		mac_rx_srs_process(arg,
2255 		    (mac_resource_handle_t)mac_srs, mp_chain, loopback);
2256 	} else {
2257 		(prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1,
2258 		    prev_flent->fe_cb_arg2, mp_chain, loopback);
2259 		FLOW_REFRELE(prev_flent);
2260 	}
2261 }
2262 
2263 /*
2264  * mac_rx_srs_process
2265  *
2266  * Receive side routine called from the interrupt path.
2267  *
2268  * loopback is set to force a context switch on the loopback
2269  * path between MAC clients.
2270  */
2271 /* ARGSUSED */
2272 void
2273 mac_rx_srs_process(void *arg, mac_resource_handle_t srs, mblk_t *mp_chain,
2274     boolean_t loopback)
2275 {
2276 	mac_soft_ring_set_t	*mac_srs = (mac_soft_ring_set_t *)srs;
2277 	mblk_t			*mp, *tail, *head;
2278 	int			count = 0;
2279 	int			count1;
2280 	size_t			sz = 0;
2281 	size_t			chain_sz, sz1;
2282 	mac_bw_ctl_t		*mac_bw;
2283 	mac_client_impl_t	*smcip;
2284 	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;
2285 
2286 	/*
2287 	 * Set the tail, count and sz. We set the sz irrespective
2288 	 * of whether we are doing B/W control or not for the
2289 	 * purpose of updating the stats.
2290 	 */
2291 	mp = tail = mp_chain;
2292 	while (mp != NULL) {
2293 		tail = mp;
2294 		count++;
2295 		sz += msgdsize(mp);
2296 		mp = mp->b_next;
2297 	}
2298 
2299 	mutex_enter(&mac_srs->srs_lock);
2300 	smcip = mac_srs->srs_mcip;
2301 
2302 	if (mac_srs->srs_type & SRST_FLOW || smcip == NULL) {
2303 		FLOW_STAT_UPDATE(mac_srs->srs_flent, rbytes, sz);
2304 		FLOW_STAT_UPDATE(mac_srs->srs_flent, ipackets, count);
2305 	}
2306 	if (smcip != NULL) {
2307 		smcip->mci_stat_ibytes += sz;
2308 		smcip->mci_stat_ipackets += count;
2309 	}
2310 
2311 	/*
2312 	 * If the SRS in already being processed; has been blanked;
2313 	 * can be processed by worker thread only; or the B/W limit
2314 	 * has been reached, then queue the chain and check if
2315 	 * worker thread needs to be awakend.
2316 	 */
2317 	if (mac_srs->srs_type & SRST_BW_CONTROL) {
2318 		mac_bw = mac_srs->srs_bw;
2319 		ASSERT(mac_bw != NULL);
2320 		mutex_enter(&mac_bw->mac_bw_lock);
2321 		/* Count the packets and bytes via interrupt */
2322 		srs_rx->sr_intr_count += count;
2323 		mac_bw->mac_bw_intr += sz;
2324 		if (mac_bw->mac_bw_limit == 0) {
2325 			/* zero bandwidth: drop all */
2326 			srs_rx->sr_drop_count += count;
2327 			mac_bw->mac_bw_drop_bytes += sz;
2328 			mutex_exit(&mac_bw->mac_bw_lock);
2329 			mutex_exit(&mac_srs->srs_lock);
2330 			mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
2331 			return;
2332 		} else {
2333 			if ((mac_bw->mac_bw_sz + sz) <=
2334 			    mac_bw->mac_bw_drop_threshold) {
2335 				mutex_exit(&mac_bw->mac_bw_lock);
2336 				MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain,
2337 				    tail, count, sz);
2338 			} else {
2339 				mp = mp_chain;
2340 				chain_sz = 0;
2341 				count1 = 0;
2342 				tail = NULL;
2343 				head = NULL;
2344 				while (mp != NULL) {
2345 					sz1 = msgdsize(mp);
2346 					if (mac_bw->mac_bw_sz + chain_sz + sz1 >
2347 					    mac_bw->mac_bw_drop_threshold)
2348 						break;
2349 					chain_sz += sz1;
2350 					count1++;
2351 					tail = mp;
2352 					mp = mp->b_next;
2353 				}
2354 				mutex_exit(&mac_bw->mac_bw_lock);
2355 				if (tail != NULL) {
2356 					head = tail->b_next;
2357 					tail->b_next = NULL;
2358 					MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs,
2359 					    mp_chain, tail, count1, chain_sz);
2360 					sz -= chain_sz;
2361 					count -= count1;
2362 				} else {
2363 					/* Can't pick up any */
2364 					head = mp_chain;
2365 				}
2366 				if (head != NULL) {
2367 					/* Drop any packet over the threshold */
2368 					srs_rx->sr_drop_count += count;
2369 					mutex_enter(&mac_bw->mac_bw_lock);
2370 					mac_bw->mac_bw_drop_bytes += sz;
2371 					mutex_exit(&mac_bw->mac_bw_lock);
2372 					freemsgchain(head);
2373 				}
2374 			}
2375 			MAC_SRS_WORKER_WAKEUP(mac_srs);
2376 			mutex_exit(&mac_srs->srs_lock);
2377 			return;
2378 		}
2379 	}
2380 
2381 	/*
2382 	 * If the total number of packets queued in the SRS and
2383 	 * its associated soft rings exceeds the max allowed,
2384 	 * then drop the chain. If we are polling capable, this
2385 	 * shouldn't be happening.
2386 	 */
2387 	if (!(mac_srs->srs_type & SRST_BW_CONTROL) &&
2388 	    (srs_rx->sr_poll_pkt_cnt > srs_rx->sr_hiwat)) {
2389 		mac_bw = mac_srs->srs_bw;
2390 		srs_rx->sr_drop_count += count;
2391 		mutex_enter(&mac_bw->mac_bw_lock);
2392 		mac_bw->mac_bw_drop_bytes += sz;
2393 		mutex_exit(&mac_bw->mac_bw_lock);
2394 		freemsgchain(mp_chain);
2395 		mutex_exit(&mac_srs->srs_lock);
2396 		return;
2397 	}
2398 
2399 	MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain, tail, count, sz);
2400 	/* Count the packets entering via interrupt path */
2401 	srs_rx->sr_intr_count += count;
2402 
2403 	if (!(mac_srs->srs_state & SRS_PROC)) {
2404 		/*
2405 		 * If we are coming via loopback or if we are not
2406 		 * optimizing for latency, we should signal the
2407 		 * worker thread.
2408 		 */
2409 		if (loopback || !(mac_srs->srs_state & SRS_LATENCY_OPT)) {
2410 			/*
2411 			 * For loopback, We need to let the worker take
2412 			 * over as we don't want to continue in the same
2413 			 * thread even if we can. This could lead to stack
2414 			 * overflows and may also end up using
2415 			 * resources (cpu) incorrectly.
2416 			 */
2417 			cv_signal(&mac_srs->srs_async);
2418 		} else {
2419 			/*
2420 			 * Seems like no one is processing the SRS and
2421 			 * there is no backlog. We also inline process
2422 			 * our packet if its a single packet in non
2423 			 * latency optimized case (in latency optimized
2424 			 * case, we inline process chains of any size).
2425 			 */
2426 			mac_srs->srs_drain_func(mac_srs, SRS_PROC_FAST);
2427 		}
2428 	}
2429 	mutex_exit(&mac_srs->srs_lock);
2430 }
2431 
2432 /* TX SIDE ROUTINES (RUNTIME) */
2433 
2434 /*
2435  * mac_tx_srs_no_desc
2436  *
2437  * This routine is called by Tx single ring default mode
2438  * when Tx ring runs out of descs.
2439  */
2440 mac_tx_cookie_t
2441 mac_tx_srs_no_desc(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2442     uint16_t flag, mblk_t **ret_mp)
2443 {
2444 	mac_tx_cookie_t cookie = NULL;
2445 	mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
2446 	boolean_t wakeup_worker = B_TRUE;
2447 	uint32_t tx_mode = srs_tx->st_mode;
2448 	int cnt, sz;
2449 	mblk_t *tail;
2450 
2451 	ASSERT(tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_BW);
2452 	if (flag & MAC_DROP_ON_NO_DESC) {
2453 		MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
2454 	} else {
2455 		if (mac_srs->srs_first != NULL)
2456 			wakeup_worker = B_FALSE;
2457 		MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
2458 		if (flag & MAC_TX_NO_ENQUEUE) {
2459 			/*
2460 			 * If TX_QUEUED is not set, queue the
2461 			 * packet and let mac_tx_srs_drain()
2462 			 * set the TX_BLOCKED bit for the
2463 			 * reasons explained above. Otherwise,
2464 			 * return the mblks.
2465 			 */
2466 			if (wakeup_worker) {
2467 				MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
2468 				    mp_chain, tail, cnt, sz);
2469 			} else {
2470 				MAC_TX_SET_NO_ENQUEUE(mac_srs,
2471 				    mp_chain, ret_mp, cookie);
2472 			}
2473 		} else {
2474 			MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain,
2475 			    tail, cnt, sz, cookie);
2476 		}
2477 		if (wakeup_worker)
2478 			cv_signal(&mac_srs->srs_async);
2479 	}
2480 	return (cookie);
2481 }
2482 
2483 /*
2484  * mac_tx_srs_enqueue
2485  *
2486  * This routine is called when Tx SRS is operating in either serializer
2487  * or bandwidth mode. In serializer mode, a packet will get enqueued
2488  * when a thread cannot enter SRS exclusively. In bandwidth mode,
2489  * packets gets queued if allowed byte-count limit for a tick is
2490  * exceeded. The action that gets taken when MAC_DROP_ON_NO_DESC and
2491  * MAC_TX_NO_ENQUEUE is set is different than when operaing in either
2492  * the default mode or fanout mode. Here packets get dropped or
2493  * returned back to the caller only after hi-watermark worth of data
2494  * is queued.
2495  */
2496 static mac_tx_cookie_t
2497 mac_tx_srs_enqueue(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2498     uint16_t flag, uintptr_t fanout_hint, mblk_t **ret_mp)
2499 {
2500 	mac_tx_cookie_t cookie = NULL;
2501 	int cnt, sz;
2502 	mblk_t *tail;
2503 	boolean_t wakeup_worker = B_TRUE;
2504 
2505 	/*
2506 	 * Ignore fanout hint if we don't have multiple tx rings.
2507 	 */
2508 	if (!TX_MULTI_RING_MODE(mac_srs))
2509 		fanout_hint = 0;
2510 
2511 	if (mac_srs->srs_first != NULL)
2512 		wakeup_worker = B_FALSE;
2513 	MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
2514 	if (flag & MAC_DROP_ON_NO_DESC) {
2515 		if (mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) {
2516 			MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
2517 		} else {
2518 			MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
2519 			    mp_chain, tail, cnt, sz);
2520 		}
2521 	} else if (flag & MAC_TX_NO_ENQUEUE) {
2522 		if ((mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) ||
2523 		    (mac_srs->srs_state & SRS_TX_WAKEUP_CLIENT)) {
2524 			MAC_TX_SET_NO_ENQUEUE(mac_srs, mp_chain,
2525 			    ret_mp, cookie);
2526 		} else {
2527 			mp_chain->b_prev = (mblk_t *)fanout_hint;
2528 			MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
2529 			    mp_chain, tail, cnt, sz);
2530 		}
2531 	} else {
2532 		/*
2533 		 * If you are BW_ENFORCED, just enqueue the
2534 		 * packet. srs_worker will drain it at the
2535 		 * prescribed rate. Before enqueueing, save
2536 		 * the fanout hint.
2537 		 */
2538 		mp_chain->b_prev = (mblk_t *)fanout_hint;
2539 		MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain,
2540 		    tail, cnt, sz, cookie);
2541 	}
2542 	if (wakeup_worker)
2543 		cv_signal(&mac_srs->srs_async);
2544 	return (cookie);
2545 }
2546 
2547 /*
2548  * There are five tx modes:
2549  *
2550  * 1) Default mode (SRS_TX_DEFAULT)
2551  * 2) Serialization mode (SRS_TX_SERIALIZE)
2552  * 3) Fanout mode (SRS_TX_FANOUT)
2553  * 4) Bandwidth mode (SRS_TX_BW)
2554  * 5) Fanout and Bandwidth mode (SRS_TX_BW_FANOUT)
2555  *
2556  * The tx mode in which an SRS operates is decided in mac_tx_srs_setup()
2557  * based on the number of Tx rings requested for an SRS and whether
2558  * bandwidth control is requested or not.
2559  *
2560  * In the default mode (i.e., no fanout/no bandwidth), the SRS acts as a
2561  * pass-thru. Packets will go directly to mac_tx_send(). When the underlying
2562  * Tx ring runs out of Tx descs, it starts queueing up packets in SRS.
2563  * When flow-control is relieved, the srs_worker drains the queued
2564  * packets and informs blocked clients to restart sending packets.
2565  *
2566  * In the SRS_TX_SERIALIZE mode, all calls to mac_tx() are serialized.
2567  *
2568  * In the SRS_TX_FANOUT mode, packets will be fanned out to multiple
2569  * Tx rings. Each Tx ring will have a soft ring associated with it.
2570  * These soft rings will be hung off the Tx SRS. Queueing if it happens
2571  * due to lack of Tx desc will be in individual soft ring (and not srs)
2572  * associated with Tx ring.
2573  *
2574  * In the TX_BW mode, tx srs will allow packets to go down to Tx ring
2575  * only if bw is available. Otherwise the packets will be queued in
2576  * SRS. If fanout to multiple Tx rings is configured, the packets will
2577  * be fanned out among the soft rings associated with the Tx rings.
2578  *
2579  * Three flags are used in srs_state for indicating flow control
2580  * conditions: SRS_TX_BLOCKED, SRS_TX_HIWAT, SRS_TX_WAKEUP_CLIENT.
2581  * SRS_TX_BLOCKED indicates out of Tx descs. SRS expects a wakeup from the
2582  * driver below.
2583  * SRS_TX_HIWAT indicates packet count enqueued in Tx SRS exceeded Tx hiwat
2584  * and flow-control pressure is applied back to clients. The clients expect
2585  * wakeup when flow-control is relieved.
2586  * SRS_TX_WAKEUP_CLIENT gets set when (flag == MAC_TX_NO_ENQUEUE) and mblk
2587  * got returned back to client either due to lack of Tx descs or due to bw
2588  * control reasons. The clients expect a wakeup when condition is relieved.
2589  *
2590  * The fourth argument to mac_tx() is the flag. Normally it will be 0 but
2591  * some clients set the following values too: MAC_DROP_ON_NO_DESC,
2592  * MAC_TX_NO_ENQUEUE
2593  * Mac clients that do not want packets to be enqueued in the mac layer set
2594  * MAC_DROP_ON_NO_DESC value. The packets won't be queued in the Tx SRS or
2595  * Tx soft rings but instead get dropped when the NIC runs out of desc. The
2596  * behaviour of this flag is different when the Tx is running in serializer
2597  * or bandwidth mode. Under these (serializer, bandwidth) modes, packets
2598  * get dropped when the Tx high watermark is reached.
2599  * There are some mac clients like vsw, aggr that want the mblks to be
2600  * returned back to clients instead of being queued in Tx SRS (or Tx soft
2601  * rings) under flow-control (i.e., out of desc or exceeding bw limits)
2602  * conditions. These clients call mac_tx() with MAC_TX_NO_ENQUEUE flag set.
2603  * In the default and Tx fanout mode, the un-transmitted mblks will be
2604  * returned back to the clients when the driver runs out of Tx descs.
2605  * SRS_TX_WAKEUP_CLIENT (or S_RING_WAKEUP_CLIENT) will be set in SRS (or
2606  * soft ring) so that the clients can be woken up when Tx desc become
2607  * available. When running in serializer or bandwidth mode,
2608  * SRS_TX_WAKEUP_CLIENT will be set when tx hi-watermark is reached.
2609  */
2610 
2611 mac_tx_func_t
2612 mac_tx_get_func(uint32_t mode)
2613 {
2614 	return (mac_tx_mode_list[mode].mac_tx_func);
2615 }
2616 
2617 /* ARGSUSED */
2618 static mac_tx_cookie_t
2619 mac_tx_single_ring_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2620     uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
2621 {
2622 	mac_srs_tx_t		*srs_tx = &mac_srs->srs_tx;
2623 	boolean_t		is_subflow;
2624 	mac_tx_stats_t		stats;
2625 	mac_tx_cookie_t		cookie = NULL;
2626 
2627 	ASSERT(srs_tx->st_mode == SRS_TX_DEFAULT);
2628 
2629 	/* Regular case with a single Tx ring */
2630 	/*
2631 	 * SRS_TX_BLOCKED is set when underlying NIC runs
2632 	 * out of Tx descs and messages start getting
2633 	 * queued. It won't get reset until
2634 	 * tx_srs_drain() completely drains out the
2635 	 * messages.
2636 	 */
2637 	if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) {
2638 		/* Tx descs/resources not available */
2639 		mutex_enter(&mac_srs->srs_lock);
2640 		if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) {
2641 			cookie = mac_tx_srs_no_desc(mac_srs, mp_chain,
2642 			    flag, ret_mp);
2643 			mutex_exit(&mac_srs->srs_lock);
2644 			return (cookie);
2645 		}
2646 		/*
2647 		 * While we were computing mblk count, the
2648 		 * flow control condition got relieved.
2649 		 * Continue with the transmission.
2650 		 */
2651 		mutex_exit(&mac_srs->srs_lock);
2652 	}
2653 
2654 	is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);
2655 
2656 	mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
2657 	    mp_chain, (is_subflow ? &stats : NULL));
2658 
2659 	/*
2660 	 * Multiple threads could be here sending packets.
2661 	 * Under such conditions, it is not possible to
2662 	 * automically set SRS_TX_BLOCKED bit to indicate
2663 	 * out of tx desc condition. To atomically set
2664 	 * this, we queue the returned packet and do
2665 	 * the setting of SRS_TX_BLOCKED in
2666 	 * mac_tx_srs_drain().
2667 	 */
2668 	if (mp_chain != NULL) {
2669 		mutex_enter(&mac_srs->srs_lock);
2670 		cookie = mac_tx_srs_no_desc(mac_srs, mp_chain, flag, ret_mp);
2671 		mutex_exit(&mac_srs->srs_lock);
2672 		return (cookie);
2673 	}
2674 
2675 	if (is_subflow)
2676 		FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats);
2677 
2678 	return (NULL);
2679 }
2680 
2681 /*
2682  * mac_tx_serialize_mode
2683  *
2684  * This is an experimental mode implemented as per the request of PAE.
2685  * In this mode, all callers attempting to send a packet to the NIC
2686  * will get serialized. Only one thread at any time will access the
2687  * NIC to send the packet out.
2688  */
2689 /* ARGSUSED */
2690 static mac_tx_cookie_t
2691 mac_tx_serializer_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2692     uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
2693 {
2694 	boolean_t		is_subflow;
2695 	mac_tx_stats_t		stats;
2696 	mac_tx_cookie_t		cookie = NULL;
2697 	mac_srs_tx_t		*srs_tx = &mac_srs->srs_tx;
2698 
2699 	/* Single ring, serialize below */
2700 	ASSERT(srs_tx->st_mode == SRS_TX_SERIALIZE);
2701 	mutex_enter(&mac_srs->srs_lock);
2702 	if ((mac_srs->srs_first != NULL) ||
2703 	    (mac_srs->srs_state & SRS_PROC)) {
2704 		/*
2705 		 * In serialization mode, queue all packets until
2706 		 * TX_HIWAT is set.
2707 		 * If drop bit is set, drop if TX_HIWAT is set.
2708 		 * If no_enqueue is set, still enqueue until hiwat
2709 		 * is set and return mblks after TX_HIWAT is set.
2710 		 */
2711 		cookie = mac_tx_srs_enqueue(mac_srs, mp_chain,
2712 		    flag, NULL, ret_mp);
2713 		mutex_exit(&mac_srs->srs_lock);
2714 		return (cookie);
2715 	}
2716 	/*
2717 	 * No packets queued, nothing on proc and no flow
2718 	 * control condition. Fast-path, ok. Do inline
2719 	 * processing.
2720 	 */
2721 	mac_srs->srs_state |= SRS_PROC;
2722 	mutex_exit(&mac_srs->srs_lock);
2723 
2724 	is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);
2725 
2726 	mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
2727 	    mp_chain, (is_subflow ? &stats : NULL));
2728 
2729 	mutex_enter(&mac_srs->srs_lock);
2730 	mac_srs->srs_state &= ~SRS_PROC;
2731 	if (mp_chain != NULL) {
2732 		cookie = mac_tx_srs_enqueue(mac_srs,
2733 		    mp_chain, flag, NULL, ret_mp);
2734 	}
2735 	if (mac_srs->srs_first != NULL) {
2736 		/*
2737 		 * We processed inline our packet and a new
2738 		 * packet/s got queued while we were
2739 		 * processing. Wakeup srs worker
2740 		 */
2741 		cv_signal(&mac_srs->srs_async);
2742 	}
2743 	mutex_exit(&mac_srs->srs_lock);
2744 
2745 	if (is_subflow && cookie == NULL)
2746 		FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats);
2747 
2748 	return (cookie);
2749 }
2750 
2751 /*
2752  * mac_tx_fanout_mode
2753  *
2754  * In this mode, the SRS will have access to multiple Tx rings to send
2755  * the packet out. The fanout hint that is passed as an argument is
2756  * used to find an appropriate ring to fanout the traffic. Each Tx
2757  * ring, in turn,  will have a soft ring associated with it. If a Tx
2758  * ring runs out of Tx desc's the returned packet will be queued in
2759  * the soft ring associated with that Tx ring. The srs itself will not
2760  * queue any packets.
2761  */
2762 
/*
 * Helper for mac_tx_fanout_mode() only. Selects the Tx soft ring that
 * `hash' maps to and hands `chain' to it. Besides its argument, the macro
 * reads the caller's mac_srs, hash, flag and ret_mp, and writes the
 * caller's index, softring and cookie.
 */
#define	MAC_TX_SOFT_RING_PROCESS(chain) {				\
	index = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);	\
	softring = mac_srs->srs_oth_soft_rings[index];			\
	cookie = mac_tx_soft_ring_process(softring, (chain), flag,	\
	    ret_mp);							\
	DTRACE_PROBE2(tx__fanout, uint64_t, hash, uint_t, index);	\
}
2769 
2770 static mac_tx_cookie_t
2771 mac_tx_fanout_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2772     uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
2773 {
2774 	mac_soft_ring_t		*softring;
2775 	uint64_t		hash;
2776 	uint_t			index;
2777 	mac_tx_cookie_t		cookie = NULL;
2778 
2779 	ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT);
2780 	if (fanout_hint != 0) {
2781 		/*
2782 		 * The hint is specified by the caller, simply pass the
2783 		 * whole chain to the soft ring.
2784 		 */
2785 		hash = HASH_HINT(fanout_hint);
2786 		MAC_TX_SOFT_RING_PROCESS(mp_chain);
2787 	} else {
2788 		mblk_t *last_mp, *cur_mp, *sub_chain;
2789 		uint64_t last_hash = 0;
2790 		uint_t media = mac_srs->srs_mcip->mci_mip->mi_info.mi_media;
2791 
2792 		/*
2793 		 * Compute the hash from the contents (headers) of the
2794 		 * packets of the mblk chain. Split the chains into
2795 		 * subchains of the same conversation.
2796 		 *
2797 		 * Since there may be more than one ring used for
2798 		 * sub-chains of the same call, and since the caller
2799 		 * does not maintain per conversation state since it
2800 		 * passed a zero hint, unsent subchains will be
2801 		 * dropped.
2802 		 */
2803 
2804 		flag |= MAC_DROP_ON_NO_DESC;
2805 		ret_mp = NULL;
2806 
2807 		ASSERT(ret_mp == NULL);
2808 
2809 		sub_chain = NULL;
2810 		last_mp = NULL;
2811 
2812 		for (cur_mp = mp_chain; cur_mp != NULL;
2813 		    cur_mp = cur_mp->b_next) {
2814 			hash = mac_pkt_hash(media, cur_mp, MAC_PKT_HASH_L4,
2815 			    B_TRUE);
2816 			if (last_hash != 0 && hash != last_hash) {
2817 				/*
2818 				 * Starting a different subchain, send current
2819 				 * chain out.
2820 				 */
2821 				ASSERT(last_mp != NULL);
2822 				last_mp->b_next = NULL;
2823 				MAC_TX_SOFT_RING_PROCESS(sub_chain);
2824 				sub_chain = NULL;
2825 			}
2826 
2827 			/* add packet to subchain */
2828 			if (sub_chain == NULL)
2829 				sub_chain = cur_mp;
2830 			last_mp = cur_mp;
2831 			last_hash = hash;
2832 		}
2833 
2834 		if (sub_chain != NULL) {
2835 			/* send last subchain */
2836 			ASSERT(last_mp != NULL);
2837 			last_mp->b_next = NULL;
2838 			MAC_TX_SOFT_RING_PROCESS(sub_chain);
2839 		}
2840 
2841 		cookie = NULL;
2842 	}
2843 
2844 	return (cookie);
2845 }
2846 
2847 /*
2848  * mac_tx_bw_mode
2849  *
2850  * In the bandwidth mode, Tx srs will allow packets to go down to Tx ring
2851  * only if bw is available. Otherwise the packets will be queued in
2852  * SRS. If the SRS has multiple Tx rings, then packets will get fanned
2853  * out to a Tx rings.
2854  */
2855 static mac_tx_cookie_t
2856 mac_tx_bw_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
2857     uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
2858 {
2859 	int			cnt, sz;
2860 	mblk_t			*tail;
2861 	mac_tx_cookie_t		cookie = NULL;
2862 	mac_srs_tx_t		*srs_tx = &mac_srs->srs_tx;
2863 
2864 	ASSERT(TX_BANDWIDTH_MODE(mac_srs));
2865 	ASSERT(mac_srs->srs_type & SRST_BW_CONTROL);
2866 	mutex_enter(&mac_srs->srs_lock);
2867 	if (mac_srs->srs_bw->mac_bw_limit == 0) {
2868 		/*
2869 		 * zero bandwidth, no traffic is sent: drop the packets,
2870 		 * or return the whole chain if the caller requests all
2871 		 * unsent packets back.
2872 		 */
2873 		if (flag & MAC_TX_NO_ENQUEUE) {
2874 			cookie = (mac_tx_cookie_t)mac_srs;
2875 			*ret_mp = mp_chain;
2876 		} else {
2877 			MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
2878 		}
2879 		mutex_exit(&mac_srs->srs_lock);
2880 		return (cookie);
2881 	} else if ((mac_srs->srs_first != NULL) ||
2882 	    (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
2883 		cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag,
2884 		    fanout_hint, ret_mp);
2885 		mutex_exit(&mac_srs->srs_lock);
2886 		return (cookie);
2887 	}
2888 	MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
2889 	if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) {
2890 		mac_srs->srs_bw->mac_bw_curr_time = lbolt;
2891 		mac_srs->srs_bw->mac_bw_used = 0;
2892 	} else if (mac_srs->srs_bw->mac_bw_used >
2893 	    mac_srs->srs_bw->mac_bw_limit) {
2894 		mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
2895 		MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
2896 		    mp_chain, tail, cnt, sz);
2897 		/*
2898 		 * Wakeup worker thread. Note that worker
2899 		 * thread has to be woken up so that it
2900 		 * can fire up the timer to be woken up
2901 		 * on the next tick. Also once
2902 		 * BW_ENFORCED is set, it can only be
2903 		 * reset by srs_worker thread. Until then
2904 		 * all packets will get queued up in SRS
2905 		 * and hence this this code path won't be
2906 		 * entered until BW_ENFORCED is reset.
2907 		 */
2908 		cv_signal(&mac_srs->srs_async);
2909 		mutex_exit(&mac_srs->srs_lock);
2910 		return (cookie);
2911 	}
2912 
2913 	mac_srs->srs_bw->mac_bw_used += sz;
2914 	mutex_exit(&mac_srs->srs_lock);
2915 
2916 	if (srs_tx->st_mode == SRS_TX_BW_FANOUT) {
2917 		mac_soft_ring_t *softring;
2918 		uint_t indx, hash;
2919 
2920 		hash = HASH_HINT(fanout_hint);
2921 		indx = COMPUTE_INDEX(hash,
2922 		    mac_srs->srs_oth_ring_count);
2923 		softring = mac_srs->srs_oth_soft_rings[indx];
2924 		return (mac_tx_soft_ring_process(softring, mp_chain, flag,
2925 		    ret_mp));
2926 	} else {
2927 		boolean_t		is_subflow;
2928 		mac_tx_stats_t		stats;
2929 
2930 		is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);
2931 
2932 		mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
2933 		    mp_chain, (is_subflow ? &stats : NULL));
2934 
2935 		if (mp_chain != NULL) {
2936 			mutex_enter(&mac_srs->srs_lock);
2937 			MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
2938 			if (mac_srs->srs_bw->mac_bw_used > sz)
2939 				mac_srs->srs_bw->mac_bw_used -= sz;
2940 			else
2941 				mac_srs->srs_bw->mac_bw_used = 0;
2942 			cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag,
2943 			    fanout_hint, ret_mp);
2944 			mutex_exit(&mac_srs->srs_lock);
2945 			return (cookie);
2946 		}
2947 		if (is_subflow)
2948 			FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats);
2949 
2950 		return (NULL);
2951 	}
2952 }
2953 
2954 /* ARGSUSED */
2955 void
2956 mac_tx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
2957 {
2958 	mblk_t			*head, *tail;
2959 	size_t			sz;
2960 	uint32_t		tx_mode;
2961 	uint_t			saved_pkt_count;
2962 	boolean_t		is_subflow;
2963 	mac_tx_stats_t		stats;
2964 	mac_srs_tx_t		*srs_tx = &mac_srs->srs_tx;
2965 
2966 	saved_pkt_count = 0;
2967 	ASSERT(mutex_owned(&mac_srs->srs_lock));
2968 	ASSERT(!(mac_srs->srs_state & SRS_PROC));
2969 
2970 	mac_srs->srs_state |= SRS_PROC;
2971 
2972 	is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);
2973 	tx_mode = srs_tx->st_mode;
2974 	if (tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_SERIALIZE) {
2975 		if (mac_srs->srs_first != NULL) {
2976 			head = mac_srs->srs_first;
2977 			tail = mac_srs->srs_last;
2978 			saved_pkt_count = mac_srs->srs_count;
2979 			mac_srs->srs_first = NULL;
2980 			mac_srs->srs_last = NULL;
2981 			mac_srs->srs_count = 0;
2982 			mutex_exit(&mac_srs->srs_lock);
2983 
2984 			head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
2985 			    head, &stats);
2986 
2987 			mutex_enter(&mac_srs->srs_lock);
2988 			if (head != NULL) {
2989 				/* Device out of tx desc, set block */
2990 				if (head->b_next == NULL)
2991 					VERIFY(head == tail);
2992 				tail->b_next = mac_srs->srs_first;
2993 				mac_srs->srs_first = head;
2994 				mac_srs->srs_count +=
2995 				    (saved_pkt_count - stats.ts_opackets);
2996 				if (mac_srs->srs_last == NULL)
2997 					mac_srs->srs_last = tail;
2998 				MAC_TX_SRS_BLOCK(mac_srs, head);
2999 			} else {
3000 				srs_tx->st_woken_up = B_FALSE;
3001 				if (is_subflow) {
3002 					FLOW_TX_STATS_UPDATE(
3003 					    mac_srs->srs_flent, &stats);
3004 				}
3005 			}
3006 		}
3007 	} else if (tx_mode == SRS_TX_BW) {
3008 		/*
3009 		 * We are here because the timer fired and we have some data
3010 		 * to tranmit. Also mac_tx_srs_worker should have reset
3011 		 * SRS_BW_ENFORCED flag
3012 		 */
3013 		ASSERT(!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED));
3014 		head = tail = mac_srs->srs_first;
3015 		while (mac_srs->srs_first != NULL) {
3016 			tail = mac_srs->srs_first;
3017 			tail->b_prev = NULL;
3018 			mac_srs->srs_first = tail->b_next;
3019 			if (mac_srs->srs_first == NULL)
3020 				mac_srs->srs_last = NULL;
3021 			mac_srs->srs_count--;
3022 			sz = msgdsize(tail);
3023 			mac_srs->srs_size -= sz;
3024 			saved_pkt_count++;
3025 			MAC_TX_UPDATE_BW_INFO(mac_srs, sz);
3026 
3027 			if (mac_srs->srs_bw->mac_bw_used <
3028 			    mac_srs->srs_bw->mac_bw_limit)
3029 				continue;
3030 
3031 			if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) {
3032 				mac_srs->srs_bw->mac_bw_curr_time = lbolt;
3033 				mac_srs->srs_bw->mac_bw_used = sz;
3034 				continue;
3035 			}
3036 			mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
3037 			break;
3038 		}
3039 
3040 		ASSERT((head == NULL && tail == NULL) ||
3041 		    (head != NULL && tail != NULL));
3042 		if (tail != NULL) {
3043 			tail->b_next = NULL;
3044 			mutex_exit(&mac_srs->srs_lock);
3045 
3046 			head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
3047 			    head, &stats);
3048 
3049 			mutex_enter(&mac_srs->srs_lock);
3050 			if (head != NULL) {
3051 				uint_t size_sent;
3052 
3053 				/* Device out of tx desc, set block */
3054 				if (head->b_next == NULL)
3055 					VERIFY(head == tail);
3056 				tail->b_next = mac_srs->srs_first;
3057 				mac_srs->srs_first = head;
3058 				mac_srs->srs_count +=
3059 				    (saved_pkt_count - stats.ts_opackets);
3060 				if (mac_srs->srs_last == NULL)
3061 					mac_srs->srs_last = tail;
3062 				size_sent = sz - stats.ts_obytes;
3063 				mac_srs->srs_size += size_sent;
3064 				mac_srs->srs_bw->mac_bw_sz += size_sent;
3065 				if (mac_srs->srs_bw->mac_bw_used > size_sent) {
3066 					mac_srs->srs_bw->mac_bw_used -=
3067 					    size_sent;
3068 				} else {
3069 					mac_srs->srs_bw->mac_bw_used = 0;
3070 				}
3071 				MAC_TX_SRS_BLOCK(mac_srs, head);
3072 			} else {
3073 				srs_tx->st_woken_up = B_FALSE;
3074 				if (is_subflow) {
3075 					FLOW_TX_STATS_UPDATE(
3076 					    mac_srs->srs_flent, &stats);
3077 				}
3078 			}
3079 		}
3080 	} else if (tx_mode == SRS_TX_BW_FANOUT) {
3081 		mblk_t *prev;
3082 		mac_soft_ring_t *softring;
3083 		uint64_t hint;
3084 
3085 		/*
3086 		 * We are here because the timer fired and we
3087 		 * have some quota to tranmit.
3088 		 */
3089 		prev = NULL;
3090 		head = tail = mac_srs->srs_first;
3091 		while (mac_srs->srs_first != NULL) {
3092 			tail = mac_srs->srs_first;
3093 			mac_srs->srs_first = tail->b_next;
3094 			if (mac_srs->srs_first == NULL)
3095 				mac_srs->srs_last = NULL;
3096 			mac_srs->srs_count--;
3097 			sz = msgdsize(tail);
3098 			mac_srs->srs_size -= sz;
3099 			mac_srs->srs_bw->mac_bw_used += sz;
3100 			if (prev == NULL)
3101 				hint = (ulong_t)tail->b_prev;
3102 			if (hint != (ulong_t)tail->b_prev) {
3103 				prev->b_next = NULL;
3104 				mutex_exit(&mac_srs->srs_lock);
3105 				TX_SRS_TO_SOFT_RING(mac_srs, head, hint);
3106 				head = tail;
3107 				hint = (ulong_t)tail->b_prev;
3108 				mutex_enter(&mac_srs->srs_lock);
3109 			}
3110 
3111 			prev = tail;
3112 			tail->b_prev = NULL;
3113 			if (mac_srs->srs_bw->mac_bw_used <
3114 			    mac_srs->srs_bw->mac_bw_limit)
3115 				continue;
3116 
3117 			if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) {
3118 				mac_srs->srs_bw->mac_bw_curr_time = lbolt;
3119 				mac_srs->srs_bw->mac_bw_used = 0;
3120 				continue;
3121 			}
3122 			mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
3123 			break;
3124 		}
3125 		ASSERT((head == NULL && tail == NULL) ||
3126 		    (head != NULL && tail != NULL));
3127 		if (tail != NULL) {
3128 			tail->b_next = NULL;
3129 			mutex_exit(&mac_srs->srs_lock);
3130 			TX_SRS_TO_SOFT_RING(mac_srs, head, hint);
3131 			mutex_enter(&mac_srs->srs_lock);
3132 		}
3133 	}
3134 	/*
3135 	 * SRS_TX_FANOUT case not considered here because packets
3136 	 * won't be queued in the SRS for this case. Packets will
3137 	 * be sent directly to soft rings underneath and if there
3138 	 * is any queueing at all, it would be in Tx side soft
3139 	 * rings.
3140 	 */
3141 
3142 	/*
3143 	 * When srs_count becomes 0, reset SRS_TX_HIWAT and
3144 	 * SRS_TX_WAKEUP_CLIENT and wakeup registered clients.
3145 	 */
3146 	if (mac_srs->srs_count == 0 && (mac_srs->srs_state &
3147 	    (SRS_TX_HIWAT | SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED))) {
3148 		mac_tx_notify_cb_t *mtnfp;
3149 		mac_cb_t *mcb;
3150 		mac_client_impl_t *mcip = mac_srs->srs_mcip;
3151 		boolean_t wakeup_required = B_FALSE;
3152 
3153 		if (mac_srs->srs_state &
3154 		    (SRS_TX_HIWAT|SRS_TX_WAKEUP_CLIENT)) {
3155 			wakeup_required = B_TRUE;
3156 		}
3157 		mac_srs->srs_state &= ~(SRS_TX_HIWAT |
3158 		    SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED);
3159 		mutex_exit(&mac_srs->srs_lock);
3160 		if (wakeup_required) {
3161 			/* Wakeup callback registered clients */
3162 			MAC_CALLBACK_WALKER_INC(&mcip->mci_tx_notify_cb_info);
3163 			for (mcb = mcip->mci_tx_notify_cb_list; mcb != NULL;
3164 			    mcb = mcb->mcb_nextp) {
3165 				mtnfp = (mac_tx_notify_cb_t *)mcb->mcb_objp;
3166 				mtnfp->mtnf_fn(mtnfp->mtnf_arg,
3167 				    (mac_tx_cookie_t)mac_srs);
3168 			}
3169 			MAC_CALLBACK_WALKER_DCR(&mcip->mci_tx_notify_cb_info,
3170 			    &mcip->mci_tx_notify_cb_list);
3171 			/*
3172 			 * If the client is not the primary MAC client, then we
3173 			 * need to send the notification to the clients upper
3174 			 * MAC, i.e. mci_upper_mip.
3175 			 */
3176 			mac_tx_notify(mcip->mci_upper_mip != NULL ?
3177 			    mcip->mci_upper_mip : mcip->mci_mip);
3178 		}
3179 		mutex_enter(&mac_srs->srs_lock);
3180 	}
3181 	mac_srs->srs_state &= ~SRS_PROC;
3182 }
3183 
3184 /*
3185  * Given a packet, get the flow_entry that identifies the flow
3186  * to which that packet belongs. The flow_entry will contain
3187  * the transmit function to be used to send the packet. If the
3188  * function returns NULL, the packet should be sent using the
3189  * underlying NIC.
3190  */
3191 static flow_entry_t *
3192 mac_tx_classify(mac_impl_t *mip, mblk_t *mp)
3193 {
3194 	flow_entry_t		*flent = NULL;
3195 	mac_client_impl_t	*mcip;
3196 	int	err;
3197 
3198 	/*
3199 	 * Do classification on the packet.
3200 	 */
3201 	err = mac_flow_lookup(mip->mi_flow_tab, mp, FLOW_OUTBOUND, &flent);
3202 	if (err != 0)
3203 		return (NULL);
3204 
3205 	/*
3206 	 * This flent might just be an additional one on the MAC client,
3207 	 * i.e. for classification purposes (different fdesc), however
3208 	 * the resources, SRS et. al., are in the mci_flent, so if
3209 	 * this isn't the mci_flent, we need to get it.
3210 	 */
3211 	if ((mcip = flent->fe_mcip) != NULL && mcip->mci_flent != flent) {
3212 		FLOW_REFRELE(flent);
3213 		flent = mcip->mci_flent;
3214 		FLOW_TRY_REFHOLD(flent, err);
3215 		if (err != 0)
3216 			return (NULL);
3217 	}
3218 
3219 	return (flent);
3220 }
3221 
/*
 * For use inside the per-packet loops of mac_tx_send() only.
 *
 * Optionally validates the VID of (mp) and optionally inserts a VLAN tag.
 * On either failure the packet is accounted as an error, (mp) advances
 * to `next' and the enclosing loop is continued. The macro relies on the
 * caller's vid_check, add_tag, vid, src_mcip, next and oerrors.
 */
#define	CHECK_VID_AND_ADD_TAG(mp) {				\
	if (vid_check) {					\
		int vid_err = 0;				\
								\
		MAC_VID_CHECK(src_mcip, (mp), vid_err);		\
		if (vid_err != 0) {				\
			freemsg((mp));				\
			oerrors++;				\
			(mp) = next;				\
			continue;				\
		}						\
	}							\
	if (add_tag) {						\
		(mp) = mac_add_vlan_tag((mp), 0, vid);		\
		if ((mp) == NULL) {				\
			oerrors++;				\
			(mp) = next;				\
			continue;				\
		}						\
	}							\
}
3246 
3247 mblk_t *
3248 mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain,
3249     mac_tx_stats_t *stats)
3250 {
3251 	mac_client_impl_t *src_mcip = (mac_client_impl_t *)mch;
3252 	mac_impl_t *mip = src_mcip->mci_mip;
3253 	uint_t obytes = 0, opackets = 0, oerrors = 0;
3254 	mblk_t *mp = NULL, *next;
3255 	boolean_t vid_check, add_tag;
3256 	uint16_t vid = 0;
3257 
3258 	if (mip->mi_nclients > 1) {
3259 		vid_check = MAC_VID_CHECK_NEEDED(src_mcip);
3260 		add_tag = MAC_TAG_NEEDED(src_mcip);
3261 		if (add_tag)
3262 			vid = mac_client_vid(mch);
3263 	} else {
3264 		ASSERT(mip->mi_nclients == 1);
3265 		vid_check = add_tag = B_FALSE;
3266 	}
3267 
3268 	/*
3269 	 * Fastpath: if there's only one client, and there's no
3270 	 * multicast listeners, we simply send the packet down to the
3271 	 * underlying NIC.
3272 	 */
3273 	if (mip->mi_nactiveclients == 1 && mip->mi_promisc_list == NULL)  {
3274 		DTRACE_PROBE2(fastpath,
3275 		    mac_client_impl_t *, src_mcip, mblk_t *, mp_chain);
3276 
3277 		mp = mp_chain;
3278 		while (mp != NULL) {
3279 			next = mp->b_next;
3280 			mp->b_next = NULL;
3281 			opackets++;
3282 			obytes += (mp->b_cont == NULL ? MBLKL(mp) :
3283 			    msgdsize(mp));
3284 
3285 			CHECK_VID_AND_ADD_TAG(mp);
3286 			MAC_TX(mip, ring, mp,
3287 			    ((src_mcip->mci_state_flags & MCIS_SHARE_BOUND) !=
3288 			    0));
3289 
3290 			/*
3291 			 * If the driver is out of descriptors and does a
3292 			 * partial send it will return a chain of unsent
3293 			 * mblks. Adjust the accounting stats.
3294 			 */
3295 			if (mp != NULL) {
3296 				opackets--;
3297 				obytes -= msgdsize(mp);
3298 				mp->b_next = next;
3299 				break;
3300 			}
3301 			mp = next;
3302 		}
3303 		goto done;
3304 	}
3305 
3306 	/*
3307 	 * No fastpath, we either have more than one MAC client
3308 	 * defined on top of the same MAC, or one or more MAC
3309 	 * client promiscuous callbacks.
3310 	 */
3311 	DTRACE_PROBE3(slowpath, mac_client_impl_t *,
3312 	    src_mcip, int, mip->mi_nclients, mblk_t *, mp_chain);
3313 
3314 	mp = mp_chain;
3315 	while (mp != NULL) {
3316 		flow_entry_t *dst_flow_ent;
3317 		void *flow_cookie;
3318 		size_t	pkt_size;
3319 		mblk_t *mp1;
3320 
3321 		next = mp->b_next;
3322 		mp->b_next = NULL;
3323 		opackets++;
3324 		pkt_size = (mp->b_cont == NULL ? MBLKL(mp) : msgdsize(mp));
3325 		obytes += pkt_size;
3326 		CHECK_VID_AND_ADD_TAG(mp);
3327 
3328 		/*
3329 		 * Check if there are promiscuous mode callbacks defined.
3330 		 */
3331 		if (mip->mi_promisc_list != NULL)
3332 			mac_promisc_dispatch(mip, mp, src_mcip);
3333 
3334 		/*
3335 		 * Find the destination.
3336 		 */
3337 		dst_flow_ent = mac_tx_classify(mip, mp);
3338 
3339 		if (dst_flow_ent != NULL) {
3340 			size_t	hdrsize;
3341 			int	err = 0;
3342 
3343 			if (mip->mi_info.mi_nativemedia == DL_ETHER) {
3344 				struct ether_vlan_header *evhp =
3345 				    (struct ether_vlan_header *)mp->b_rptr;
3346 
3347 				if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN)
3348 					hdrsize = sizeof (*evhp);
3349 				else
3350 					hdrsize = sizeof (struct ether_header);
3351 			} else {
3352 				mac_header_info_t	mhi;
3353 
3354 				err = mac_header_info((mac_handle_t)mip,
3355 				    mp, &mhi);
3356 				if (err == 0)
3357 					hdrsize = mhi.mhi_hdrsize;
3358 			}
3359 
3360 			/*
3361 			 * Got a matching flow. It's either another
3362 			 * MAC client, or a broadcast/multicast flow.
3363 			 * Make sure the packet size is within the
3364 			 * allowed size. If not drop the packet and
3365 			 * move to next packet.
3366 			 */
3367 			if (err != 0 ||
3368 			    (pkt_size - hdrsize) > mip->mi_sdu_max) {
3369 				oerrors++;
3370 				DTRACE_PROBE2(loopback__drop, size_t, pkt_size,
3371 				    mblk_t *, mp);
3372 				freemsg(mp);
3373 				mp = next;
3374 				FLOW_REFRELE(dst_flow_ent);
3375 				continue;
3376 			}
3377 			flow_cookie = mac_flow_get_client_cookie(dst_flow_ent);
3378 			if (flow_cookie != NULL) {
3379 				/*
3380 				 * The vnic_bcast_send function expects
3381 				 * to receive the sender MAC client
3382 				 * as value for arg2.
3383 				 */
3384 				mac_bcast_send(flow_cookie, src_mcip, mp,
3385 				    B_TRUE);
3386 			} else {
3387 				/*
3388 				 * loopback the packet to a
3389 				 * local MAC client. We force a context
3390 				 * switch if both source and destination
3391 				 * MAC clients are used by IP, i.e. bypass
3392 				 * is set.
3393 				 */
3394 				boolean_t do_switch;
3395 				mac_client_impl_t *dst_mcip =
3396 				    dst_flow_ent->fe_mcip;
3397 
3398 				do_switch = ((src_mcip->mci_state_flags &
3399 				    dst_mcip->mci_state_flags &
3400 				    MCIS_CLIENT_POLL_CAPABLE) != 0);
3401 
3402 				if ((mp1 = mac_fix_cksum(mp)) != NULL) {
3403 					(dst_flow_ent->fe_cb_fn)(
3404 					    dst_flow_ent->fe_cb_arg1,
3405 					    dst_flow_ent->fe_cb_arg2,
3406 					    mp1, do_switch);
3407 				}
3408 			}
3409 			FLOW_REFRELE(dst_flow_ent);
3410 		} else {
3411 			/*
3412 			 * Unknown destination, send via the underlying
3413 			 * NIC.
3414 			 */
3415 			MAC_TX(mip, ring, mp,
3416 			    ((src_mcip->mci_state_flags & MCIS_SHARE_BOUND) !=
3417 			    0));
3418 			if (mp != NULL) {
3419 				/*
3420 				 * Adjust for the last packet that
3421 				 * could not be transmitted
3422 				 */
3423 				opackets--;
3424 				obytes -= pkt_size;
3425 				mp->b_next = next;
3426 				break;
3427 			}
3428 		}
3429 		mp = next;
3430 	}
3431 
3432 done:
3433 	src_mcip->mci_stat_obytes += obytes;
3434 	src_mcip->mci_stat_opackets += opackets;
3435 	src_mcip->mci_stat_oerrors += oerrors;
3436 
3437 	if (stats != NULL) {
3438 		stats->ts_opackets = opackets;
3439 		stats->ts_obytes = obytes;
3440 		stats->ts_oerrors = oerrors;
3441 	}
3442 	return (mp);
3443 }
3444 
3445 /*
3446  * mac_tx_srs_ring_present
3447  *
3448  * Returns whether the specified ring is part of the specified SRS.
3449  */
3450 boolean_t
3451 mac_tx_srs_ring_present(mac_soft_ring_set_t *srs, mac_ring_t *tx_ring)
3452 {
3453 	int i;
3454 	mac_soft_ring_t *soft_ring;
3455 
3456 	if (srs->srs_tx.st_arg2 == tx_ring)
3457 		return (B_TRUE);
3458 
3459 	for (i = 0; i < srs->srs_oth_ring_count; i++) {
3460 		soft_ring =  srs->srs_oth_soft_rings[i];
3461 		if (soft_ring->s_ring_tx_arg2 == tx_ring)
3462 			return (B_TRUE);
3463 	}
3464 
3465 	return (B_FALSE);
3466 }
3467 
3468 /*
3469  * mac_tx_srs_wakeup
3470  *
3471  * Called when Tx desc become available. Wakeup the appropriate worker
3472  * thread after resetting the SRS_TX_BLOCKED/S_RING_BLOCK bit in the
3473  * state field.
3474  */
3475 void
3476 mac_tx_srs_wakeup(mac_soft_ring_set_t *mac_srs, mac_ring_handle_t ring)
3477 {
3478 	int i;
3479 	mac_soft_ring_t *sringp;
3480 	mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
3481 
3482 	mutex_enter(&mac_srs->srs_lock);
3483 	if (TX_SINGLE_RING_MODE(mac_srs)) {
3484 		if (srs_tx->st_arg2 == ring &&
3485 		    mac_srs->srs_state & SRS_TX_BLOCKED) {
3486 			mac_srs->srs_state &= ~SRS_TX_BLOCKED;
3487 			srs_tx->st_unblocked_cnt++;
3488 			cv_signal(&mac_srs->srs_async);
3489 		}
3490 		/*
3491 		 * A wakeup can come before tx_srs_drain() could
3492 		 * grab srs lock and set SRS_TX_BLOCKED. So
3493 		 * always set woken_up flag when we come here.
3494 		 */
3495 		srs_tx->st_woken_up = B_TRUE;
3496 		mutex_exit(&mac_srs->srs_lock);
3497 		return;
3498 	}
3499 
3500 	/* If you are here, it is for FANOUT or BW_FANOUT case */
3501 	ASSERT(TX_MULTI_RING_MODE(mac_srs));
3502 	for (i = 0; i < mac_srs->srs_oth_ring_count; i++) {
3503 		sringp = mac_srs->srs_oth_soft_rings[i];
3504 		mutex_enter(&sringp->s_ring_lock);
3505 		if (sringp->s_ring_tx_arg2 == ring) {
3506 			if (sringp->s_ring_state & S_RING_BLOCK) {
3507 				sringp->s_ring_state &= ~S_RING_BLOCK;
3508 				sringp->s_ring_unblocked_cnt++;
3509 				cv_signal(&sringp->s_ring_async);
3510 			}
3511 			sringp->s_ring_tx_woken_up = B_TRUE;
3512 		}
3513 		mutex_exit(&sringp->s_ring_lock);
3514 	}
3515 	mutex_exit(&mac_srs->srs_lock);
3516 }
3517 
3518 /*
3519  * Once the driver is done draining, send a MAC_NOTE_TX notification to unleash
3520  * the blocked clients again.
3521  */
3522 void
3523 mac_tx_notify(mac_impl_t *mip)
3524 {
3525 	i_mac_notify(mip, MAC_NOTE_TX);
3526 }
3527 
3528 /*
3529  * RX SOFTRING RELATED FUNCTIONS
3530  *
3531  * These functions really belong in mac_soft_ring.c and here for
3532  * a short period.
3533  */
3534 
/*
 * Append the chain (mp)..(tail), holding (cnt) packets and (sz) bytes,
 * to the queue of soft ring (ringp). s_ring_lock must be held. The byte
 * count is only tracked for rings of type ST_RING_BW_CTL.
 */
#define	SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) {		\
	/*								\
	 * Enqueue our mblk chain.					\
	 */								\
	ASSERT(MUTEX_HELD(&(ringp)->s_ring_lock));			\
									\
	if ((ringp)->s_ring_last != NULL)				\
		(ringp)->s_ring_last->b_next = (mp);			\
	else								\
		(ringp)->s_ring_first = (mp);				\
	(ringp)->s_ring_last = (tail);					\
	(ringp)->s_ring_count += (cnt);					\
	ASSERT((ringp)->s_ring_count > 0);				\
	if ((ringp)->s_ring_type & ST_RING_BW_CTL) {			\
		(ringp)->s_ring_size += (sz);				\
	}								\
}
3552 
3553 /*
3554  * Default entry point to deliver a packet chain to a MAC client.
3555  * If the MAC client has flows, do the classification with these
3556  * flows as well.
3557  */
3558 /* ARGSUSED */
3559 void
3560 mac_rx_deliver(void *arg1, mac_resource_handle_t mrh, mblk_t *mp_chain,
3561     mac_header_info_t *arg3)
3562 {
3563 	mac_client_impl_t *mcip = arg1;
3564 
3565 	if (mcip->mci_nvids == 1 &&
3566 	    !(mcip->mci_state_flags & MCIS_STRIP_DISABLE)) {
3567 		/*
3568 		 * If the client has exactly one VID associated with it
3569 		 * and striping of VLAN header is not disabled,
3570 		 * remove the VLAN tag from the packet before
3571 		 * passing it on to the client's receive callback.
3572 		 * Note that this needs to be done after we dispatch
3573 		 * the packet to the promiscuous listeners of the
3574 		 * client, since they expect to see the whole
3575 		 * frame including the VLAN headers.
3576 		 */
3577 		mp_chain = mac_strip_vlan_tag_chain(mp_chain);
3578 	}
3579 
3580 	mcip->mci_rx_fn(mcip->mci_rx_arg, mrh, mp_chain, B_FALSE);
3581 }
3582 
3583 /*
3584  * mac_rx_soft_ring_process
3585  *
3586  * process a chain for a given soft ring. The number of packets queued
3587  * in the SRS and its associated soft rings (including this one) is
3588  * very small (tracked by srs_poll_pkt_cnt), then allow the entering
3589  * thread (interrupt or poll thread) to do inline processing. This
3590  * helps keep the latency down under low load.
3591  *
3592  * The proc and arg for each mblk is already stored in the mblk in
3593  * appropriate places.
3594  */
3595 /* ARGSUSED */
3596 void
3597 mac_rx_soft_ring_process(mac_client_impl_t *mcip, mac_soft_ring_t *ringp,
3598     mblk_t *mp_chain, mblk_t *tail, int cnt, size_t sz)
3599 {
3600 	mac_direct_rx_t		proc;
3601 	void			*arg1;
3602 	mac_resource_handle_t	arg2;
3603 	mac_soft_ring_set_t	*mac_srs = ringp->s_ring_set;
3604 
3605 	ASSERT(ringp != NULL);
3606 	ASSERT(mp_chain != NULL);
3607 	ASSERT(tail != NULL);
3608 	ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
3609 
3610 	mutex_enter(&ringp->s_ring_lock);
3611 	ringp->s_ring_total_inpkt += cnt;
3612 	if ((mac_srs->srs_rx.sr_poll_pkt_cnt <= 1) &&
3613 	    !(ringp->s_ring_type & ST_RING_WORKER_ONLY)) {
3614 		/* If on processor or blanking on, then enqueue and return */
3615 		if (ringp->s_ring_state & S_RING_BLANK ||
3616 		    ringp->s_ring_state & S_RING_PROC) {
3617 			SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
3618 			mutex_exit(&ringp->s_ring_lock);
3619 			return;
3620 		}
3621 		proc = ringp->s_ring_rx_func;
3622 		arg1 = ringp->s_ring_rx_arg1;
3623 		arg2 = ringp->s_ring_rx_arg2;
3624 		/*
3625 		 * See if anything is already queued. If we are the
3626 		 * first packet, do inline processing else queue the
3627 		 * packet and do the drain.
3628 		 */
3629 		if (ringp->s_ring_first == NULL) {
3630 			/*
3631 			 * Fast-path, ok to process and nothing queued.
3632 			 */
3633 			ringp->s_ring_run = curthread;
3634 			ringp->s_ring_state |= (S_RING_PROC);
3635 
3636 			mutex_exit(&ringp->s_ring_lock);
3637 
3638 			/*
3639 			 * We are the chain of 1 packet so
3640 			 * go through this fast path.
3641 			 */
3642 			ASSERT(mp_chain->b_next == NULL);
3643 
3644 			(*proc)(arg1, arg2, mp_chain, NULL);
3645 
3646 			ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
3647 			/*
3648 			 * If we have a soft ring set which is doing
3649 			 * bandwidth control, we need to decrement
3650 			 * srs_size and count so it the SRS can have a
3651 			 * accurate idea of what is the real data
3652 			 * queued between SRS and its soft rings. We
3653 			 * decrement the counters only when the packet
3654 			 * gets processed by both SRS and the soft ring.
3655 			 */
3656 			mutex_enter(&mac_srs->srs_lock);
3657 			MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
3658 			MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz);
3659 			mutex_exit(&mac_srs->srs_lock);
3660 
3661 			mutex_enter(&ringp->s_ring_lock);
3662 			ringp->s_ring_run = NULL;
3663 			ringp->s_ring_state &= ~S_RING_PROC;
3664 			if (ringp->s_ring_state & S_RING_CLIENT_WAIT)
3665 				cv_signal(&ringp->s_ring_client_cv);
3666 
3667 			if ((ringp->s_ring_first == NULL) ||
3668 			    (ringp->s_ring_state & S_RING_BLANK)) {
3669 				/*
3670 				 * We processed inline our packet and
3671 				 * nothing new has arrived or our
3672 				 * receiver doesn't want to receive
3673 				 * any packets. We are done.
3674 				 */
3675 				mutex_exit(&ringp->s_ring_lock);
3676 				return;
3677 			}
3678 		} else {
3679 			SOFT_RING_ENQUEUE_CHAIN(ringp,
3680 			    mp_chain, tail, cnt, sz);
3681 		}
3682 
3683 		/*
3684 		 * We are here because either we couldn't do inline
3685 		 * processing (because something was already
3686 		 * queued), or we had a chain of more than one
3687 		 * packet, or something else arrived after we were
3688 		 * done with inline processing.
3689 		 */
3690 		ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
3691 		ASSERT(ringp->s_ring_first != NULL);
3692 
3693 		ringp->s_ring_drain_func(ringp);
3694 		mutex_exit(&ringp->s_ring_lock);
3695 		return;
3696 	} else {
3697 		/* ST_RING_WORKER_ONLY case */
3698 		SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
3699 		mac_soft_ring_worker_wakeup(ringp);
3700 		mutex_exit(&ringp->s_ring_lock);
3701 	}
3702 }
3703 
3704 /*
3705  * TX SOFTRING RELATED FUNCTIONS
3706  *
3707  * These functions really belong in mac_soft_ring.c and here for
3708  * a short period.
3709  */
3710 
/*
 * Tx flavour of SOFT_RING_ENQUEUE_CHAIN: marks the soft ring as
 * S_RING_ENQUEUED and then appends the chain (mp)..(tail). s_ring_lock
 * must be held.
 */
#define	TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) {		\
	ASSERT(MUTEX_HELD(&(ringp)->s_ring_lock));			\
	(ringp)->s_ring_state |= S_RING_ENQUEUED;			\
	SOFT_RING_ENQUEUE_CHAIN((ringp), (mp), (tail), (cnt), (sz));	\
}
3716 
3717 /*
3718  * mac_tx_sring_queued
3719  *
3720  * When we are out of transmit descriptors and we already have a
3721  * queue that exceeds hiwat (or the client called us with
3722  * MAC_TX_NO_ENQUEUE or MAC_DROP_ON_NO_DESC flag), return the
3723  * soft ring pointer as the opaque cookie for the client enable
3724  * flow control.
3725  */
3726 static mac_tx_cookie_t
3727 mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag,
3728     mblk_t **ret_mp)
3729 {
3730 	int cnt;
3731 	size_t sz;
3732 	mblk_t *tail;
3733 	mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
3734 	mac_tx_cookie_t cookie = NULL;
3735 	boolean_t wakeup_worker = B_TRUE;
3736 
3737 	ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
3738 	MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
3739 	if (flag & MAC_DROP_ON_NO_DESC) {
3740 		mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
3741 		/* increment freed stats */
3742 		ringp->s_ring_drops += cnt;
3743 		cookie = (mac_tx_cookie_t)ringp;
3744 	} else {
3745 		if (ringp->s_ring_first != NULL)
3746 			wakeup_worker = B_FALSE;
3747 
3748 		if (flag & MAC_TX_NO_ENQUEUE) {
3749 			/*
3750 			 * If QUEUED is not set, queue the packet
3751 			 * and let mac_tx_soft_ring_drain() set
3752 			 * the TX_BLOCKED bit for the reasons
3753 			 * explained above. Otherwise, return the
3754 			 * mblks.
3755 			 */
3756 			if (wakeup_worker) {
3757 				TX_SOFT_RING_ENQUEUE_CHAIN(ringp,
3758 				    mp_chain, tail, cnt, sz);
3759 			} else {
3760 				ringp->s_ring_state |= S_RING_WAKEUP_CLIENT;
3761 				cookie = (mac_tx_cookie_t)ringp;
3762 				*ret_mp = mp_chain;
3763 			}
3764 		} else {
3765 			boolean_t enqueue = B_TRUE;
3766 
3767 			if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
3768 				/*
3769 				 * flow-controlled. Store ringp in cookie
3770 				 * so that it can be returned as
3771 				 * mac_tx_cookie_t to client
3772 				 */
3773 				ringp->s_ring_state |= S_RING_TX_HIWAT;
3774 				cookie = (mac_tx_cookie_t)ringp;
3775 				ringp->s_ring_hiwat_cnt++;
3776 				if (ringp->s_ring_count >
3777 				    ringp->s_ring_tx_max_q_cnt) {
3778 					/* increment freed stats */
3779 					ringp->s_ring_drops += cnt;
3780 					/*
3781 					 * b_prev may be set to the fanout hint
3782 					 * hence can't use freemsg directly
3783 					 */
3784 					mac_pkt_drop(NULL, NULL,
3785 					    mp_chain, B_FALSE);
3786 					DTRACE_PROBE1(tx_queued_hiwat,
3787 					    mac_soft_ring_t *, ringp);
3788 					enqueue = B_FALSE;
3789 				}
3790 			}
3791 			if (enqueue) {
3792 				TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain,
3793 				    tail, cnt, sz);
3794 			}
3795 		}
3796 		if (wakeup_worker)
3797 			cv_signal(&ringp->s_ring_async);
3798 	}
3799 	return (cookie);
3800 }
3801 
3802 
3803 /*
3804  * mac_tx_soft_ring_process
3805  *
3806  * This routine is called when fanning out outgoing traffic among
3807  * multipe Tx rings.
3808  * Note that a soft ring is associated with a h/w Tx ring.
3809  */
3810 mac_tx_cookie_t
3811 mac_tx_soft_ring_process(mac_soft_ring_t *ringp, mblk_t *mp_chain,
3812     uint16_t flag, mblk_t **ret_mp)
3813 {
3814 	mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
3815 	int	cnt;
3816 	size_t	sz;
3817 	mblk_t	*tail;
3818 	mac_tx_cookie_t cookie = NULL;
3819 
3820 	ASSERT(ringp != NULL);
3821 	ASSERT(mp_chain != NULL);
3822 	ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
3823 	/*
3824 	 * Only two modes can come here; either it can be
3825 	 * SRS_TX_BW_FANOUT or SRS_TX_FANOUT
3826 	 */
3827 	ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT ||
3828 	    mac_srs->srs_tx.st_mode == SRS_TX_BW_FANOUT);
3829 
3830 	if (ringp->s_ring_type & ST_RING_WORKER_ONLY) {
3831 		/* Serialization mode */
3832 
3833 		mutex_enter(&ringp->s_ring_lock);
3834 		if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
3835 			cookie = mac_tx_sring_enqueue(ringp, mp_chain,
3836 			    flag, ret_mp);
3837 			mutex_exit(&ringp->s_ring_lock);
3838 			return (cookie);
3839 		}
3840 		MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
3841 		TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
3842 		if (ringp->s_ring_state & (S_RING_BLOCK | S_RING_PROC)) {
3843 			/*
3844 			 * If ring is blocked due to lack of Tx
3845 			 * descs, just return. Worker thread
3846 			 * will get scheduled when Tx desc's
3847 			 * become available.
3848 			 */
3849 			mutex_exit(&ringp->s_ring_lock);
3850 			return (cookie);
3851 		}
3852 		mac_soft_ring_worker_wakeup(ringp);
3853 		mutex_exit(&ringp->s_ring_lock);
3854 		return (cookie);
3855 	} else {
3856 		/* Default fanout mode */
3857 		/*
3858 		 * S_RING_BLOCKED is set when underlying NIC runs
3859 		 * out of Tx descs and messages start getting
3860 		 * queued. It won't get reset until
3861 		 * tx_srs_drain() completely drains out the
3862 		 * messages.
3863 		 */
3864 		boolean_t		is_subflow;
3865 		mac_tx_stats_t		stats;
3866 
3867 		if (ringp->s_ring_state & S_RING_ENQUEUED) {
3868 			/* Tx descs/resources not available */
3869 			mutex_enter(&ringp->s_ring_lock);
3870 			if (ringp->s_ring_state & S_RING_ENQUEUED) {
3871 				cookie = mac_tx_sring_enqueue(ringp, mp_chain,
3872 				    flag, ret_mp);
3873 				mutex_exit(&ringp->s_ring_lock);
3874 				return (cookie);
3875 			}
3876 			/*
3877 			 * While we were computing mblk count, the
3878 			 * flow control condition got relieved.
3879 			 * Continue with the transmission.
3880 			 */
3881 			mutex_exit(&ringp->s_ring_lock);
3882 		}
3883 		is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);
3884 
3885 		mp_chain = mac_tx_send(ringp->s_ring_tx_arg1,
3886 		    ringp->s_ring_tx_arg2, mp_chain,
3887 		    (is_subflow ? &stats : NULL));
3888 
3889 		/*
3890 		 * Multiple threads could be here sending packets.
3891 		 * Under such conditions, it is not possible to
3892 		 * automically set S_RING_BLOCKED bit to indicate
3893 		 * out of tx desc condition. To atomically set
3894 		 * this, we queue the returned packet and do
3895 		 * the setting of S_RING_BLOCKED in
3896 		 * mac_tx_soft_ring_drain().
3897 		 */
3898 		if (mp_chain != NULL) {
3899 			mutex_enter(&ringp->s_ring_lock);
3900 			cookie =
3901 			    mac_tx_sring_enqueue(ringp, mp_chain, flag, ret_mp);
3902 			mutex_exit(&ringp->s_ring_lock);
3903 			return (cookie);
3904 		}
3905 		if (is_subflow) {
3906 			FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats);
3907 		}
3908 		return (NULL);
3909 	}
3910 }
3911