/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Squeues: General purpose serialization mechanism
 * ------------------------------------------------
 *
 * Background:
 * -----------
 *
 * This is a general purpose, high-performance serialization mechanism
 * currently used by TCP/IP. It is implemented by means of a per-CPU queue,
 * a worker thread and a polling thread, which are bound to the CPU
 * associated with the squeue. The squeue is strictly FIFO for both the read
 * and write side, and only one thread can process it at any given time.
 * The design goal of the squeue was to offer a very high degree of
 * parallelization (on a per H/W execution pipeline basis) with at
 * most one queuing.
 *
 * Modules needing protection typically call the squeue_enter() or
 * squeue_enter_chain() routine as soon as a thread enters the module
 * from either direction. For each packet, the processing function
 * and argument are stored in the mblk itself. When the packet is ready
 * to be processed, the squeue retrieves the stored function and calls
 * it with the supplied argument and a pointer to the packet itself.
 * The called function can assume that no other thread is processing
 * the squeue while it is executing.
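 *
 * As a minimal illustrative sketch of that calling convention (not an
 * actual caller; my_conn_input() is a hypothetical handler and tag is
 * one of the SQTAG_* debug values), the processing function and its
 * conn_t argument travel in the mblk's b_queue and b_prev fields, which
 * is where squeue_enter() below retrieves them:
 *
 *	void
 *	my_conn_input(void *arg, mblk_t *mp, void *arg2)
 *	{
 *		conn_t *connp = (conn_t *)arg;	(arg2 is the squeue_t *)
 *
 *		... runs with no other thread processing this squeue ...
 *	}
 *
 *	SQUEUE_ENTER_ONE(connp->conn_sqp, mp, my_conn_input, connp,
 *	    SQ_PROCESS, tag);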
 *
 * Squeue/connection binding:
 * --------------------------
 *
 * TCP/IP uses an IP classifier in conjunction with squeues, where specific
 * connections are assigned to specific squeues (based on various policies)
 * at connection creation time. Once assigned, the connection-to-squeue
 * mapping is normally never changed and all future packets for that
 * connection are processed on that squeue. The connection ("conn") to
 * squeue mapping is stored in the "conn_t" member "conn_sqp".
 *
 * Since the processing of a connection cuts across multiple layers
 * but still allows packets for different connections to be processed on
 * other CPUs/squeues, squeues are also termed a "Vertical Perimeter" or
 * "Per Connection Vertical Perimeter".
 *
 * Processing Model:
 * -----------------
 *
 * An squeue doesn't necessarily process packets with its own worker
 * thread. Callers can choose to just queue the packet, process the
 * packet if nothing is queued, or drain and process. The first two
 * modes are typically employed when the packet was generated while
 * already doing the processing behind the squeue, and the last mode
 * (drain and process) is typically employed when the thread is entering
 * the squeue for the first time. The squeue still imposes a finite time
 * limit for which an external thread can do processing, after which it
 * switches processing to its own worker thread.
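 *
 * For example (an illustrative summary only; these are the process_flag
 * values accepted by squeue_enter() in this file):
 *
 *	SQ_FILL		just queue the packet for the worker thread
 *	SQ_NODRAIN	process the packet inline only if nothing is
 *			already queued, otherwise just queue it
 *	SQ_PROCESS	process inline, or queue and drain the backlog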
 *
 * Once created, squeues are never deleted. Hence squeue pointers are
 * always valid. This means that functions outside the squeue can still
 * refer safely to conn_sqp and there is no need for ref counts.
 *
 * Only a thread executing in the squeue can change the squeue of the
 * connection. It does so by calling a squeue framework function to do this.
 * After changing the squeue, the thread must leave the squeue. It must not
 * continue to execute any code that needs squeue protection.
 *
 * The squeue framework, after entering the squeue, checks if the current
 * squeue matches the conn_sqp. If the check fails, the packet is delivered
 * to the right squeue.
 *
 * Polling Model:
 * --------------
 *
 * Squeues can control the rate of packet arrival into themselves from the
 * NIC or a specific Rx ring within a NIC. As part of capability negotiation
 * between IP and the MAC layer, an squeue is created for each TCP soft ring
 * (or TCP Rx ring - to be implemented in future). As part of this
 * negotiation, squeues get a cookie for the underlying soft ring or Rx
 * ring, a function to turn off incoming packets and a function to call
 * to poll for packets. This helps schedule the receive side packet
 * processing so that queue backlog doesn't build up and packet processing
 * doesn't keep getting disturbed by high priority interrupts. As part
 * of this mode, as soon as a backlog starts building, the squeue turns off
 * interrupts and switches to poll mode. In poll mode, when the poll
 * thread goes down to retrieve packets, it retrieves them in the form of
 * a chain, which improves performance even more. As the squeue/softring
 * system gets more packets, it gets more efficient by switching to
 * polling more often and dealing with larger packet chains.
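 *
 * As a sketch, the per-ring state this file relies on for the
 * negotiation described above (the ill_rx_ring_t fields used below)
 * amounts to:
 *
 *	rr_rx, rr_rx_handle		poll function and its cookie
 *	rr_intr_disable, rr_intr_enable, rr_intr_handle
 *					interrupt control for the ring
 *	rr_ip_accept, rr_ill		IP function (and ill) that vets a
 *					polled chain before it is enqueued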
 *
 */

#include <sys/types.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/cpuvar.h>
#include <sys/condvar_impl.h>
#include <sys/systm.h>
#include <sys/callb.h>
#include <sys/sdt.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>

#include <inet/ipclassifier.h>
#include <inet/udp_impl.h>

#include <sys/squeue_impl.h>

static void squeue_fire(void *);
static void squeue_drain(squeue_t *, uint_t, hrtime_t);
static void squeue_worker(squeue_t *sqp);
static void squeue_polling_thread(squeue_t *sqp);

kmem_cache_t *squeue_cache;

#define	SQUEUE_MSEC_TO_NSEC 1000000

int squeue_drain_ms = 20;
int squeue_workerwait_ms = 0;

/* The values above converted to ticks or nanoseconds */
static int squeue_drain_ns = 0;
static int squeue_workerwait_tick = 0;

#define	MAX_BYTES_TO_PICKUP	150000

#define	ENQUEUE_CHAIN(sqp, mp, tail, cnt) {			\
	/*							\
	 * Enqueue our mblk chain.				\
	 */							\
	ASSERT(MUTEX_HELD(&(sqp)->sq_lock));			\
								\
	if ((sqp)->sq_last != NULL)				\
		(sqp)->sq_last->b_next = (mp);			\
	else							\
		(sqp)->sq_first = (mp);				\
	(sqp)->sq_last = (tail);				\
	(sqp)->sq_count += (cnt);				\
	ASSERT((sqp)->sq_count > 0);				\
	DTRACE_PROBE4(squeue__enqueuechain, squeue_t *, sqp,	\
		mblk_t *, mp, mblk_t *, tail, int, cnt);	\
								\
}

#define	SQS_POLLING_ON(sqp, sq_poll_capable, rx_ring) {		\
	ASSERT(MUTEX_HELD(&(sqp)->sq_lock));			\
	if (sq_poll_capable) {					\
		ASSERT(rx_ring != NULL);			\
		ASSERT(sqp->sq_state & SQS_POLL_CAPAB);		\
		if (!(sqp->sq_state & SQS_POLLING)) {		\
			sqp->sq_state |= SQS_POLLING;		\
			rx_ring->rr_intr_disable(rx_ring->rr_intr_handle); \
		}						\
	}							\
}

#define	SQS_POLLING_OFF(sqp, sq_poll_capable, rx_ring) {	\
	ASSERT(MUTEX_HELD(&(sqp)->sq_lock));			\
	if (sq_poll_capable) {					\
		ASSERT(rx_ring != NULL);			\
		ASSERT(sqp->sq_state & SQS_POLL_CAPAB);		\
		if (sqp->sq_state & SQS_POLLING) {		\
			sqp->sq_state &= ~SQS_POLLING;		\
			rx_ring->rr_intr_enable(rx_ring->rr_intr_handle); \
		}						\
	}							\
}

#define	SQS_POLL_RING(sqp, sq_poll_capable) {			\
	ASSERT(MUTEX_HELD(&(sqp)->sq_lock));			\
	if (sq_poll_capable) {					\
		ASSERT(sqp->sq_state & SQS_POLL_CAPAB);		\
		if (!(sqp->sq_state & SQS_GET_PKTS)) {		\
			sqp->sq_state |= SQS_GET_PKTS;		\
			cv_signal(&sqp->sq_poll_cv);		\
		}						\
	}							\
}

#ifdef DEBUG
#define	SQUEUE_DBG_SET(sqp, mp, proc, connp, tag) {		\
	(sqp)->sq_curmp = (mp);					\
	(sqp)->sq_curproc = (proc);				\
	(sqp)->sq_connp = (connp);				\
	(mp)->b_tag = (sqp)->sq_tag = (tag);			\
}

#define	SQUEUE_DBG_CLEAR(sqp)	{				\
	(sqp)->sq_curmp = NULL;					\
	(sqp)->sq_curproc = NULL;				\
	(sqp)->sq_connp = NULL;					\
}
#else
#define	SQUEUE_DBG_SET(sqp, mp, proc, connp, tag)
#define	SQUEUE_DBG_CLEAR(sqp)
#endif

void
squeue_init(void)
{
	squeue_cache = kmem_cache_create("squeue_cache",
	    sizeof (squeue_t), 64, NULL, NULL, NULL, NULL, NULL, 0);

	squeue_drain_ns = squeue_drain_ms * SQUEUE_MSEC_TO_NSEC;
	squeue_workerwait_tick = MSEC_TO_TICK_ROUNDUP(squeue_workerwait_ms);
}

/* ARGSUSED */
squeue_t *
squeue_create(clock_t wait, pri_t pri)
{
	squeue_t *sqp = kmem_cache_alloc(squeue_cache, KM_SLEEP);

	bzero(sqp, sizeof (squeue_t));
	sqp->sq_bind = PBIND_NONE;
	sqp->sq_priority = pri;
	sqp->sq_wait = MSEC_TO_TICK(wait);
	sqp->sq_worker = thread_create(NULL, 0, squeue_worker,
	    sqp, 0, &p0, TS_RUN, pri);

	sqp->sq_poll_thr = thread_create(NULL, 0, squeue_polling_thread,
	    sqp, 0, &p0, TS_RUN, pri);

	sqp->sq_enter = squeue_enter;
	sqp->sq_drain = squeue_drain;

	return (sqp);
}

/*
 * Bind the squeue worker thread to the specified CPU, given by CPU id.
 * If the CPU id value is -1, bind the worker thread to the value
 * specified in the sq_bind field. If a thread is already bound to a
 * different CPU, unbind it from the old CPU and bind it to the new one.
 */

void
squeue_bind(squeue_t *sqp, processorid_t bind)
{
	mutex_enter(&sqp->sq_lock);
	ASSERT(sqp->sq_bind != PBIND_NONE || bind != PBIND_NONE);
	ASSERT(MUTEX_HELD(&cpu_lock));

	if (sqp->sq_state & SQS_BOUND) {
		if (sqp->sq_bind == bind) {
			mutex_exit(&sqp->sq_lock);
			return;
		}
		thread_affinity_clear(sqp->sq_worker);
	} else {
		sqp->sq_state |= SQS_BOUND;
	}

	if (bind != PBIND_NONE)
		sqp->sq_bind = bind;

	thread_affinity_set(sqp->sq_worker, sqp->sq_bind);
	mutex_exit(&sqp->sq_lock);
}
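
/*
 * An illustrative sketch (not an actual caller) of creating an squeue
 * and binding its worker thread: the wait value, priority and CPU choice
 * are hypothetical (cp is a chosen cpu_t *), and cpu_lock must be held
 * across squeue_bind(), as asserted above.
 *
 *	squeue_t *sqp = squeue_create(squeue_workerwait_ms, minclsyspri);
 *
 *	mutex_enter(&cpu_lock);
 *	squeue_bind(sqp, cp->cpu_id);
 *	mutex_exit(&cpu_lock);
 */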

void
squeue_unbind(squeue_t *sqp)
{
	mutex_enter(&sqp->sq_lock);
	if (!(sqp->sq_state & SQS_BOUND)) {
		mutex_exit(&sqp->sq_lock);
		return;
	}

	sqp->sq_state &= ~SQS_BOUND;
	thread_affinity_clear(sqp->sq_worker);
	mutex_exit(&sqp->sq_lock);
}

void
squeue_worker_wakeup(squeue_t *sqp)
{
	timeout_id_t tid = (sqp)->sq_tid;

	ASSERT(MUTEX_HELD(&(sqp)->sq_lock));

	if (sqp->sq_wait == 0) {
		ASSERT(tid == 0);
		ASSERT(!(sqp->sq_state & SQS_TMO_PROG));
		sqp->sq_awaken = lbolt;
		cv_signal(&sqp->sq_worker_cv);
		mutex_exit(&sqp->sq_lock);
		return;
	}

	/*
	 * The queue isn't being processed, so take
	 * any post-enqueue actions needed before leaving.
	 */
	if (tid != 0) {
		/*
		 * Waiting for an enter() to process mblk(s).
		 */
		clock_t	waited = lbolt - sqp->sq_awaken;

		if (TICK_TO_MSEC(waited) >= sqp->sq_wait) {
			/*
			 * Time is up and we have a worker thread
			 * waiting for work, so schedule it.
			 */
			sqp->sq_tid = 0;
			sqp->sq_awaken = lbolt;
			cv_signal(&sqp->sq_worker_cv);
			mutex_exit(&sqp->sq_lock);
			(void) untimeout(tid);
			return;
		}
		mutex_exit(&sqp->sq_lock);
		return;
	} else if (sqp->sq_state & SQS_TMO_PROG) {
		mutex_exit(&sqp->sq_lock);
		return;
	} else {
		clock_t	wait = sqp->sq_wait;
		/*
		 * Wait up to sqp->sq_wait ms for an
		 * enter() to process this queue. We
		 * don't want to contend on timeout locks
		 * with sq_lock held for performance reasons,
		 * so drop the sq_lock before calling timeout(),
		 * but we need to check if the timeout is still
		 * required after re-acquiring the sq_lock. Once
		 * the sq_lock is dropped, someone else could
		 * have processed the packet or the timeout could
		 * have already fired.
		 */
		sqp->sq_state |= SQS_TMO_PROG;
		mutex_exit(&sqp->sq_lock);
		tid = timeout(squeue_fire, sqp, wait);
		mutex_enter(&sqp->sq_lock);
		/* Check again if we still need the timeout */
		if (((sqp->sq_state & (SQS_PROC|SQS_TMO_PROG)) ==
		    SQS_TMO_PROG) && (sqp->sq_tid == 0) &&
		    (sqp->sq_first != NULL)) {
			sqp->sq_state &= ~SQS_TMO_PROG;
			sqp->sq_tid = tid;
			mutex_exit(&sqp->sq_lock);
			return;
		} else {
			if (sqp->sq_state & SQS_TMO_PROG) {
				sqp->sq_state &= ~SQS_TMO_PROG;
				mutex_exit(&sqp->sq_lock);
				(void) untimeout(tid);
			} else {
				/*
				 * The timer fired before we could
				 * reacquire the sq_lock. squeue_fire
				 * removes the SQS_TMO_PROG flag
				 * and we don't need to do anything
				 * else.
				 */
				mutex_exit(&sqp->sq_lock);
			}
		}
	}

	ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock));
}

/*
 * squeue_enter() - enter squeue sqp with mblk mp (which can be
 * a chain), while tail points to the end and cnt is the number of
 * mblks in the chain.
 *
 * For a chain of a single packet (i.e. mp == tail), go through the
 * fast path if no one is processing the squeue and nothing is queued.
 *
 * The proc and arg for each mblk are already stored in the mblk in
 * the appropriate places.
 *
 * The process_flag specifies if we are allowed to process the mblk
 * and drain in the entering thread context. If process_flag is
 * SQ_FILL, then we just queue the mblk and return (after signaling
 * the worker thread if no one else is processing the squeue).
 */
/* ARGSUSED */
void
squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
    int process_flag, uint8_t tag)
{
	conn_t		*connp;
	sqproc_t	proc;
	hrtime_t	now;

	ASSERT(sqp != NULL);
	ASSERT(mp != NULL);
	ASSERT(tail != NULL);
	ASSERT(cnt > 0);
	ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock));

	mutex_enter(&sqp->sq_lock);

	/*
	 * Try to process the packet if SQ_FILL flag is not set and
	 * we are allowed to process the squeue. The SQ_NODRAIN is
	 * ignored if the packet chain consists of more than 1 packet.
	 */
	if (!(sqp->sq_state & SQS_PROC) && ((process_flag == SQ_PROCESS) ||
	    (process_flag == SQ_NODRAIN && sqp->sq_first == NULL))) {
		/*
		 * See if anything is already queued. If we are the
		 * first packet, do inline processing else queue the
		 * packet and do the drain.
		 */
		if (sqp->sq_first == NULL && cnt == 1) {
			/*
			 * Fast-path, ok to process and nothing queued.
			 */
			sqp->sq_state |= (SQS_PROC|SQS_FAST);
			sqp->sq_run = curthread;
			mutex_exit(&sqp->sq_lock);

			/*
			 * We have a chain of one packet, so
			 * go through this fast path.
			 */
			ASSERT(mp->b_prev != NULL);
			ASSERT(mp->b_queue != NULL);
			connp = (conn_t *)mp->b_prev;
			mp->b_prev = NULL;
			proc = (sqproc_t)mp->b_queue;
			mp->b_queue = NULL;
			ASSERT(proc != NULL && connp != NULL);
			ASSERT(mp->b_next == NULL);

			/*
			 * Handle squeue switching. More details in the
			 * block comment at the top of the file.
			 */
			if (connp->conn_sqp == sqp) {
				SQUEUE_DBG_SET(sqp, mp, proc, connp,
				    tag);
				connp->conn_on_sqp = B_TRUE;
				DTRACE_PROBE3(squeue__proc__start, squeue_t *,
				    sqp, mblk_t *, mp, conn_t *, connp);
				(*proc)(connp, mp, sqp);
				DTRACE_PROBE2(squeue__proc__end, squeue_t *,
				    sqp, conn_t *, connp);
				connp->conn_on_sqp = B_FALSE;
				SQUEUE_DBG_CLEAR(sqp);
				CONN_DEC_REF(connp);
			} else {
				SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc,
				    connp, SQ_FILL, SQTAG_SQUEUE_CHANGE);
			}
			ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock));
			mutex_enter(&sqp->sq_lock);
			sqp->sq_state &= ~(SQS_PROC|SQS_FAST);
			sqp->sq_run = NULL;
			if (sqp->sq_first == NULL ||
			    process_flag == SQ_NODRAIN) {
				if (sqp->sq_first != NULL) {
					squeue_worker_wakeup(sqp);
					return;
				}
				/*
				 * We processed our packet inline and nothing
				 * new has arrived, so we are done. In case any
				 * control actions are pending, wake up the
				 * worker.
				 */
				if (sqp->sq_state & SQS_WORKER_THR_CONTROL)
					cv_signal(&sqp->sq_worker_cv);
				mutex_exit(&sqp->sq_lock);
				return;
			}
		} else {
			ENQUEUE_CHAIN(sqp, mp, tail, cnt);
#ifdef DEBUG
			mp->b_tag = tag;
#endif
		}
		/*
		 * We are here because either we couldn't do inline
		 * processing (because something was already queued),
		 * or we had a chain of more than one packet,
		 * or something else arrived after we were done with
		 * inline processing.
		 */
		ASSERT(MUTEX_HELD(&sqp->sq_lock));
		ASSERT(sqp->sq_first != NULL);
		now = gethrtime();
		sqp->sq_drain(sqp, SQS_ENTER, now + squeue_drain_ns);

		/*
		 * If we didn't do a complete drain, the worker
		 * thread was already signalled by squeue_drain.
		 * In case any control actions are pending, wake
		 * up the worker.
		 */
		sqp->sq_run = NULL;
		if (sqp->sq_state & SQS_WORKER_THR_CONTROL)
			cv_signal(&sqp->sq_worker_cv);
		mutex_exit(&sqp->sq_lock);
		return;
	} else {
		/*
		 * We let a thread processing a squeue reenter only
		 * once. This helps the case of an incoming connection
		 * where a SYN-ACK-ACK that triggers the conn_ind
		 * doesn't have to queue the packet if the listener and
		 * eager are on the same squeue. It also helps the
		 * loopback connection where the two ends are bound
		 * to the same squeue (which is typical on single
		 * CPU machines).
		 *
		 * We let the thread reenter only once for fear of
		 * the stack getting blown by multiple traversals.
		 */
		connp = (conn_t *)mp->b_prev;
		if (!(sqp->sq_state & SQS_REENTER) &&
		    (process_flag != SQ_FILL) && (sqp->sq_first == NULL) &&
		    (sqp->sq_run == curthread) && (cnt == 1) &&
		    (connp->conn_on_sqp == B_FALSE)) {
			sqp->sq_state |= SQS_REENTER;
			mutex_exit(&sqp->sq_lock);

			ASSERT(mp->b_prev != NULL);
			ASSERT(mp->b_queue != NULL);

			mp->b_prev = NULL;
			proc = (sqproc_t)mp->b_queue;
			mp->b_queue = NULL;

			/*
			 * Handle squeue switching. More details in the
			 * block comment at the top of the file.
			 */
			if (connp->conn_sqp == sqp) {
				connp->conn_on_sqp = B_TRUE;
				DTRACE_PROBE3(squeue__proc__start, squeue_t *,
				    sqp, mblk_t *, mp, conn_t *, connp);
				(*proc)(connp, mp, sqp);
				DTRACE_PROBE2(squeue__proc__end, squeue_t *,
				    sqp, conn_t *, connp);
				connp->conn_on_sqp = B_FALSE;
				CONN_DEC_REF(connp);
			} else {
				SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc,
				    connp, SQ_FILL, SQTAG_SQUEUE_CHANGE);
			}

			mutex_enter(&sqp->sq_lock);
			sqp->sq_state &= ~SQS_REENTER;
			mutex_exit(&sqp->sq_lock);
			return;
		}

		/*
		 * The queue is already being processed or there are already
		 * one or more packets on the queue. Enqueue the
		 * packet and wake up the squeue worker thread if the
		 * squeue is not being processed.
		 */
#ifdef DEBUG
		mp->b_tag = tag;
#endif

		ENQUEUE_CHAIN(sqp, mp, tail, cnt);
		if (!(sqp->sq_state & SQS_PROC)) {
			squeue_worker_wakeup(sqp);
			return;
		}
		/*
		 * In case any control actions are pending, wake
		 * up the worker.
		 */
		if (sqp->sq_state & SQS_WORKER_THR_CONTROL)
			cv_signal(&sqp->sq_worker_cv);
		mutex_exit(&sqp->sq_lock);
		return;
	}
}

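/*
 * For instance, a hedged sketch of handing an already-built chain of
 * cnt mblks (head through tail, each mblk carrying its own proc/conn
 * in b_queue/b_prev) to a connection's squeue without draining it in
 * the calling context (tag is a debugging SQTAG_* value):
 *
 *	squeue_enter(connp->conn_sqp, head, tail, cnt, SQ_FILL, tag);
 */
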
/*
 * PRIVATE FUNCTIONS
 */

static void
squeue_fire(void *arg)
{
	squeue_t	*sqp = arg;
	uint_t		state;

	mutex_enter(&sqp->sq_lock);

	state = sqp->sq_state;
	if (sqp->sq_tid == 0 && !(state & SQS_TMO_PROG)) {
		mutex_exit(&sqp->sq_lock);
		return;
	}

	sqp->sq_tid = 0;
	/*
	 * The timeout fired before we got a chance to set it.
	 * Process it anyway but remove the SQS_TMO_PROG so that
	 * the thread trying to set the timeout knows that it has
	 * already been processed.
	 */
	if (state & SQS_TMO_PROG)
		sqp->sq_state &= ~SQS_TMO_PROG;

	if (!(state & SQS_PROC)) {
		sqp->sq_awaken = lbolt;
		cv_signal(&sqp->sq_worker_cv);
	}
	mutex_exit(&sqp->sq_lock);
}

static void
squeue_drain(squeue_t *sqp, uint_t proc_type, hrtime_t expire)
{
	mblk_t		*mp;
	mblk_t		*head;
	sqproc_t	proc;
	conn_t		*connp;
	timeout_id_t	tid;
	ill_rx_ring_t	*sq_rx_ring = sqp->sq_rx_ring;
	hrtime_t	now;
	boolean_t	did_wakeup = B_FALSE;
	boolean_t	sq_poll_capable;

	sq_poll_capable = (sqp->sq_state & SQS_POLL_CAPAB) != 0;
again:
	ASSERT(mutex_owned(&sqp->sq_lock));
	ASSERT(!(sqp->sq_state & (SQS_POLL_THR_QUIESCED |
	    SQS_POLL_QUIESCE_DONE)));

	head = sqp->sq_first;
	sqp->sq_first = NULL;
	sqp->sq_last = NULL;
	sqp->sq_count = 0;

	if ((tid = sqp->sq_tid) != 0)
		sqp->sq_tid = 0;

	sqp->sq_state |= SQS_PROC | proc_type;

	/*
	 * We have a backlog built up. Switch to polling mode if the
	 * device underneath allows it. We need to do this so that
	 * more packets don't come in and disturb us (by contending
	 * for sq_lock or by a higher priority thread preempting us).
	 *
	 * The worker thread is allowed to do active polling while we
	 * just disable the interrupts for drain by non-worker (kernel
	 * or userland) threads so they can peacefully process the
	 * packets during the time allocated to them.
	 */
	SQS_POLLING_ON(sqp, sq_poll_capable, sq_rx_ring);
	mutex_exit(&sqp->sq_lock);

	if (tid != 0)
		(void) untimeout(tid);

	while ((mp = head) != NULL) {
		head = mp->b_next;
		mp->b_next = NULL;

		proc = (sqproc_t)mp->b_queue;
		mp->b_queue = NULL;
		connp = (conn_t *)mp->b_prev;
		mp->b_prev = NULL;

		/*
		 * Handle squeue switching. More details in the
		 * block comment at the top of the file.
		 */
		if (connp->conn_sqp == sqp) {
			SQUEUE_DBG_SET(sqp, mp, proc, connp,
			    mp->b_tag);
			connp->conn_on_sqp = B_TRUE;
			DTRACE_PROBE3(squeue__proc__start, squeue_t *,
			    sqp, mblk_t *, mp, conn_t *, connp);
			(*proc)(connp, mp, sqp);
			DTRACE_PROBE2(squeue__proc__end, squeue_t *,
			    sqp, conn_t *, connp);
			connp->conn_on_sqp = B_FALSE;
			CONN_DEC_REF(connp);
		} else {
			SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp,
			    SQ_FILL, SQTAG_SQUEUE_CHANGE);
		}
	}

	SQUEUE_DBG_CLEAR(sqp);

	mutex_enter(&sqp->sq_lock);

	/*
	 * Check if there is still work to do (either more arrived or the
	 * timer expired). If we are the worker thread and we are polling
	 * capable, continue doing the work since no one else is around to
	 * do the work anyway (but signal the poll thread to retrieve some
	 * packets in the meantime). If we are not the worker thread, just
	 * signal the worker thread to take up the work if the processing
	 * time has expired.
	 */
	if (sqp->sq_first != NULL) {
		/*
		 * Still more to process. If the time quantum has not
		 * expired, we should let the drain go on. The worker thread
		 * is allowed to drain as long as there is anything left.
		 */
		now = gethrtime();
		if ((now < expire) || (proc_type == SQS_WORKER)) {
			/*
			 * If the time has not expired, or we are the worker
			 * thread and this squeue is polling capable,
			 * continue to do the drain.
			 *
			 * We turn off interrupts for all userland threads
			 * doing drain but we do active polling only for
			 * the worker thread.
			 */
			if (proc_type == SQS_WORKER)
				SQS_POLL_RING(sqp, sq_poll_capable);
			goto again;
		} else {
			did_wakeup = B_TRUE;
			sqp->sq_awaken = lbolt;
			cv_signal(&sqp->sq_worker_cv);
		}
	}

	/*
	 * If the poll thread is already running, just return. The
	 * poll thread continues to hold the proc and will finish
	 * processing.
	 */
	if (sqp->sq_state & SQS_GET_PKTS) {
		ASSERT(!(sqp->sq_state & (SQS_POLL_THR_QUIESCED |
		    SQS_POLL_QUIESCE_DONE)));
		sqp->sq_state &= ~proc_type;
		return;
	}

	/*
	 * If we are the worker thread and no work is left, send the poll
	 * thread down once more to see if something arrived. Otherwise,
	 * turn the interrupts back on and we are done.
	 */
	if ((proc_type == SQS_WORKER) &&
	    (sqp->sq_state & SQS_POLL_CAPAB)) {
		/*
		 * Do one last check to see if anything arrived
		 * in the NIC. We leave the SQS_PROC set to ensure
		 * that the poll thread keeps the PROC and can decide
		 * if it needs to turn polling off or continue
		 * processing.
		 *
		 * If we drop the SQS_PROC here and the poll thread comes
		 * up empty handed, it cannot safely turn polling off
		 * since someone else could have acquired the PROC
		 * and started draining. The previously running poll
		 * thread and the current thread doing drain would end
		 * up in a race for turning polling on/off and more
		 * complex code would be required to deal with it.
		 *
		 * It's a lot simpler for the drain to hand the SQS_PROC to
		 * the poll thread (if running) and let the poll thread
		 * finish without worrying about racing with any other
		 * thread.
		 */
		ASSERT(!(sqp->sq_state & (SQS_POLL_THR_QUIESCED |
		    SQS_POLL_QUIESCE_DONE)));
		SQS_POLL_RING(sqp, sq_poll_capable);
		sqp->sq_state &= ~proc_type;
	} else {
		/*
		 * The squeue is either not capable of polling or the
		 * poll thread already finished processing and didn't
		 * find anything. Since there is nothing queued and
		 * we already turned polling on (for all threads doing
		 * the drain), we should turn polling off and relinquish
		 * the PROC.
		 */
		ASSERT(!(sqp->sq_state & (SQS_POLL_THR_QUIESCED |
		    SQS_POLL_QUIESCE_DONE)));
		SQS_POLLING_OFF(sqp, sq_poll_capable, sq_rx_ring);
		sqp->sq_state &= ~(SQS_PROC | proc_type);
		if (!did_wakeup && sqp->sq_first != NULL) {
			squeue_worker_wakeup(sqp);
			mutex_enter(&sqp->sq_lock);
		}
		/*
		 * If we are not the worker and there is a pending quiesce
		 * event, wake up the worker
		 */
		if ((proc_type != SQS_WORKER) &&
		    (sqp->sq_state & SQS_WORKER_THR_CONTROL))
			cv_signal(&sqp->sq_worker_cv);
	}
}

/*
 * Quiesce, Restart, or Cleanup of the squeue poll thread.
 *
 * Quiesce and Restart: After an squeue poll thread has been quiesced, it does
 * not attempt to poll the underlying soft ring any more. The quiesce is
 * triggered by the mac layer when it wants to quiesce a soft ring. Typically
 * control operations such as changing the fanout of a NIC or VNIC (dladm
 * setlinkprop) need to quiesce data flow before changing the wiring.
 * The operation is done by the mac layer, but it calls back into IP to
 * quiesce the soft ring. After completing the operation (say increase or
 * decrease of the fanout) the mac layer then calls back into IP to restart
 * the quiesced soft ring.
 *
 * Cleanup: This is triggered when the squeue binding to a soft ring is
 * removed permanently. Typically interface plumb and unplumb would trigger
 * this. It can also be triggered from the mac layer when a soft ring is
 * being deleted say as the result of a fanout reduction. Since squeues are
 * never deleted, the cleanup marks the squeue as fit for recycling and
 * moves it to the zeroth squeue set.
 */
static void
squeue_poll_thr_control(squeue_t *sqp)
{
	if (sqp->sq_state & SQS_POLL_THR_RESTART) {
		/* Restart implies a previous quiesce */
		ASSERT(sqp->sq_state & SQS_POLL_THR_QUIESCED);
		sqp->sq_state &= ~(SQS_POLL_THR_QUIESCED |
		    SQS_POLL_THR_RESTART);
		sqp->sq_state |= SQS_POLL_CAPAB;
		cv_signal(&sqp->sq_worker_cv);
		return;
	}

	if (sqp->sq_state & SQS_POLL_THR_QUIESCE) {
		sqp->sq_state |= SQS_POLL_THR_QUIESCED;
		sqp->sq_state &= ~SQS_POLL_THR_QUIESCE;
		cv_signal(&sqp->sq_worker_cv);
		return;
	}
}

/*
 * POLLING Notes
 *
 * With polling mode, we want to do as much processing as we possibly can
 * in worker thread context. The sweet spot is when the worker thread keeps
 * doing work all the time in polling mode and writers etc. keep dumping
 * packets to the worker thread. Occasionally, we send the poll thread
 * (running at lower priority) down to the NIC to get a chain of packets
 * to feed to the worker. Sending the poll thread down to the NIC is
 * dependent on 3 criteria:
 *
 * 1) It's always driven from squeue_drain and only if the worker thread is
 *	doing the drain.
 * 2) We cleared the backlog once and more packets arrived in between.
 *	Before starting the drain again, send the poll thread down if
 *	the drain is being done by the worker thread.
 * 3) Before exiting squeue_drain, if the poll thread is not already
 *	working and we are the worker thread, try to poll one more time.
 *
 * For latency's sake, we do allow any thread calling squeue_enter
 * to process its packet provided:
 *
 * 1) Nothing is queued
 * 2) If more packets arrived in between, the non-worker threads are allowed
 *	to do the drain till their time quantum expires provided SQS_GET_PKTS
 *	wasn't set in between.
 *
 * Avoiding deadlocks with interrupts
 * ==================================
 *
 * One of the big problems is that we can't send the poll thread down while
 * holding the sq_lock since the thread can block. So we drop the sq_lock
 * before calling sq_get_pkts(). We keep holding the SQS_PROC as long as the
 * poll thread is running so that no other thread can acquire the
 * perimeter in between. If the squeue_drain gets done (no more work
 * left), it leaves the SQS_PROC set if the poll thread is running.
 */

/*
 * This is the squeue poll thread. In poll mode, it polls the underlying
 * TCP softring and feeds packets into the squeue. The worker thread then
 * drains the squeue. The poll thread also responds to control signals for
 * quiescing, restarting, or cleanup of an squeue. These are driven by
 * control operations like plumb/unplumb or as a result of dynamic Rx ring
 * related operations that are driven from the mac layer.
 */
static void
squeue_polling_thread(squeue_t *sqp)
{
	kmutex_t *lock = &sqp->sq_lock;
	kcondvar_t *async = &sqp->sq_poll_cv;
	ip_mac_rx_t sq_get_pkts;
	ip_accept_t ip_accept;
	ill_rx_ring_t *sq_rx_ring;
	ill_t *sq_ill;
	mblk_t *head, *tail, *mp;
	uint_t cnt;
	void *sq_mac_handle;
	callb_cpr_t cprinfo;
	size_t bytes_to_pickup;
	uint32_t ctl_state;

	CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "sq_poll");
	mutex_enter(lock);

	for (;;) {
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(async, lock);
		CALLB_CPR_SAFE_END(&cprinfo, lock);

		ctl_state = sqp->sq_state & (SQS_POLL_THR_CONTROL |
		    SQS_POLL_THR_QUIESCED);
		if (ctl_state != 0) {
			/*
			 * If the squeue is quiesced, then wait for a control
			 * request. A quiesced squeue must not poll the
			 * underlying soft ring.
			 */
			if (ctl_state == SQS_POLL_THR_QUIESCED)
				continue;
			/*
			 * Act on control requests to quiesce, cleanup or
			 * restart an squeue
			 */
			squeue_poll_thr_control(sqp);
			continue;
		}

		if (!(sqp->sq_state & SQS_POLL_CAPAB))
			continue;

		ASSERT((sqp->sq_state &
		    (SQS_PROC|SQS_POLLING|SQS_GET_PKTS)) ==
		    (SQS_PROC|SQS_POLLING|SQS_GET_PKTS));

poll_again:
		sq_rx_ring = sqp->sq_rx_ring;
		sq_get_pkts = sq_rx_ring->rr_rx;
		sq_mac_handle = sq_rx_ring->rr_rx_handle;
		ip_accept = sq_rx_ring->rr_ip_accept;
		sq_ill = sq_rx_ring->rr_ill;
		bytes_to_pickup = MAX_BYTES_TO_PICKUP;
		mutex_exit(lock);
		head = sq_get_pkts(sq_mac_handle, bytes_to_pickup);
		mp = NULL;
		if (head != NULL) {
			/*
			 * We got the packet chain from the mac layer. It
			 * would be nice to be able to process it inline
			 * for better performance but we need to give
			 * IP a chance to look at this chain to ensure
			 * that packets are really meant for this squeue
			 * and do the IP processing.
			 */
			mp = ip_accept(sq_ill, sq_rx_ring, sqp, head,
			    &tail, &cnt);
		}
		mutex_enter(lock);
		if (mp != NULL)
			ENQUEUE_CHAIN(sqp, mp, tail, cnt);

		ASSERT((sqp->sq_state &
		    (SQS_PROC|SQS_POLLING|SQS_GET_PKTS)) ==
		    (SQS_PROC|SQS_POLLING|SQS_GET_PKTS));

		if (sqp->sq_first != NULL && !(sqp->sq_state & SQS_WORKER)) {
			/*
			 * We have packets to process and the worker thread
			 * is not running.  Check to see if the poll thread is
			 * allowed to process. Let it do processing only if it
			 * picked up some packets from the NIC, otherwise
			 * wake up the worker thread.
			 */
997*da14cebeSEric Cheng 			if (mp != NULL) {
998*da14cebeSEric Cheng 				hrtime_t  now;
999d19d6468Sbw 
1000d19d6468Sbw 				now = gethrtime();
1001*da14cebeSEric Cheng 				sqp->sq_run = curthread;
1002*da14cebeSEric Cheng 				sqp->sq_drain(sqp, SQS_POLL_PROC, now +
1003*da14cebeSEric Cheng 				    squeue_drain_ns);
1004*da14cebeSEric Cheng 				sqp->sq_run = NULL;
1005*da14cebeSEric Cheng 
1006*da14cebeSEric Cheng 				if (sqp->sq_first == NULL)
1007*da14cebeSEric Cheng 					goto poll_again;
10087c478bd9Sstevel@tonic-gate 
10097c478bd9Sstevel@tonic-gate 				/*
1010*da14cebeSEric Cheng 				 * Couldn't do the entire drain because the
1011*da14cebeSEric Cheng 				 * time limit expired, let the
1012*da14cebeSEric Cheng 				 * worker thread take over.
10137c478bd9Sstevel@tonic-gate 				 */
1014*da14cebeSEric Cheng 			}
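
			/*
			 * A minimal sketch of the time bound assumed inside
			 * the drain routine (the exact check lives in
			 * squeue_drain(), not shown here):
			 *
			 *	while ((mp = sqp->sq_first) != NULL) {
			 *		... process mp ...
			 *		if (gethrtime() >= expire)
			 *			break;
			 *	}
			 *
			 * which is why a long backlog falls through to the
			 * worker-thread handoff below instead of pinning the
			 * poll thread indefinitely.
			 */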

			sqp->sq_awaken = lbolt;
			/*
			 * Set SQS_PROC_HELD so the worker thread can
			 * distinguish where it is called from. We could
			 * remove the SQS_PROC flag here and turn off
			 * polling so that it wouldn't matter who gets the
			 * processing, but we get better performance this
			 * way and save the cost of turning polling off and
			 * possibly back on as soon as we start draining
			 * again.
			 *
			 * We can't remove the SQS_PROC flag without turning
			 * polling off until we can guarantee that control
			 * will return to squeue_drain immediately.
			 */
			sqp->sq_state |= SQS_PROC_HELD;
			sqp->sq_state &= ~SQS_GET_PKTS;
			cv_signal(&sqp->sq_worker_cv);
		} else if (sqp->sq_first == NULL &&
		    !(sqp->sq_state & SQS_WORKER)) {
			/*
			 * Nothing is queued and the worker thread is not
			 * running. Since we hold the proc, no other thread
			 * is processing the squeue. This means that there
			 * is no work to be done and nothing is queued in
			 * the squeue or in the NIC. Turn polling off and
			 * go back to interrupt mode.
			 */
			sqp->sq_state &= ~(SQS_PROC|SQS_GET_PKTS);
			/* LINTED: constant in conditional context */
			SQS_POLLING_OFF(sqp, B_TRUE, sq_rx_ring);
		} else {
			/*
			 * The worker thread is already running, so we don't
			 * need to do anything. Indicate that the poll
			 * thread is done.
			 */
			sqp->sq_state &= ~SQS_GET_PKTS;
		}
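
		/*
		 * When the first branch above hands off, the protocol with
		 * squeue_worker() below is:
		 *
		 *  poll thread			worker thread
		 *  -----------			-------------
		 *  set SQS_PROC_HELD
		 *  clear SQS_GET_PKTS
		 *  cv_signal(sq_worker_cv)
		 *				sees SQS_PROC_HELD and breaks
		 *				out of its wait loop, clears
		 *				SQS_PROC_HELD and drains with
		 *				SQS_PROC still set
		 */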
		if (sqp->sq_state & SQS_POLL_THR_CONTROL) {
			/*
			 * Act on control requests to quiesce, cleanup or
			 * restart an squeue
			 */
			squeue_poll_thr_control(sqp);
		}
	}
}

/*
 * The squeue worker thread acts on any control requests to quiesce, cleanup
 * or restart an ill_rx_ring_t by calling this function. The worker thread
 * synchronizes with the squeue poll thread to complete the request and
 * finally wakes up the requestor when the request is completed.
 */
static void
squeue_worker_thr_control(squeue_t *sqp)
{
	ill_t	*ill;
	ill_rx_ring_t	*rx_ring;

	ASSERT(MUTEX_HELD(&sqp->sq_lock));

	if (sqp->sq_state & SQS_POLL_RESTART) {
		/* Restart implies a previous quiesce. */
		ASSERT((sqp->sq_state & (SQS_PROC_HELD |
		    SQS_POLL_QUIESCE_DONE | SQS_PROC | SQS_WORKER)) ==
		    (SQS_POLL_QUIESCE_DONE | SQS_PROC | SQS_WORKER));
		/*
		 * Request the squeue poll thread to restart and wait until
		 * it actually restarts.
		 */
		sqp->sq_state &= ~SQS_POLL_QUIESCE_DONE;
		sqp->sq_state |= SQS_POLL_THR_RESTART;
		cv_signal(&sqp->sq_poll_cv);
		while (sqp->sq_state & SQS_POLL_THR_QUIESCED)
			cv_wait(&sqp->sq_worker_cv, &sqp->sq_lock);
		sqp->sq_state &= ~(SQS_POLL_RESTART | SQS_PROC |
		    SQS_WORKER);
		/*
		 * Signal any waiter that is waiting for the restart
		 * to complete.
		 */
		sqp->sq_state |= SQS_POLL_RESTART_DONE;
		cv_signal(&sqp->sq_ctrlop_done_cv);
		return;
	}
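
	/*
	 * For reference, the poll thread's side of this handshake
	 * (squeue_poll_thr_control(), defined earlier) is assumed to do
	 * roughly the following on seeing SQS_POLL_THR_RESTART, which is
	 * what terminates the cv_wait() loop above:
	 *
	 *	sqp->sq_state &= ~(SQS_POLL_THR_QUIESCED |
	 *	    SQS_POLL_THR_RESTART);
	 *	sqp->sq_state |= SQS_POLL_CAPAB;
	 *	cv_signal(&sqp->sq_worker_cv);
	 */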

	if (sqp->sq_state & SQS_PROC_HELD) {
		/* The squeue poll thread handed control to us */
		ASSERT(sqp->sq_state & SQS_PROC);
	}

	/*
	 * Prevent any other thread from processing the squeue
	 * until we finish the control actions by setting SQS_PROC,
	 * but allow ourselves to reenter by setting SQS_WORKER.
	 */
	sqp->sq_state |= (SQS_PROC | SQS_WORKER);

	/* Signal the squeue poll thread and wait for it to quiesce itself */
	if (!(sqp->sq_state & SQS_POLL_THR_QUIESCED)) {
		sqp->sq_state |= SQS_POLL_THR_QUIESCE;
		cv_signal(&sqp->sq_poll_cv);
		while (!(sqp->sq_state & SQS_POLL_THR_QUIESCED))
			cv_wait(&sqp->sq_worker_cv, &sqp->sq_lock);
	}

	rx_ring = sqp->sq_rx_ring;
	ill = rx_ring->rr_ill;
	/*
	 * The lock hierarchy is as follows:
	 * cpu_lock -> ill_lock -> sqset_lock -> sq_lock
	 */
	mutex_exit(&sqp->sq_lock);
	mutex_enter(&ill->ill_lock);
	mutex_enter(&sqp->sq_lock);
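
	/*
	 * Example of the rule the exit/enter dance above follows: a lock
	 * earlier in the hierarchy must never be acquired while holding a
	 * later one. Holding sq_lock and needing ill_lock therefore means
	 *
	 *	mutex_exit(&sqp->sq_lock);	release the later lock
	 *	mutex_enter(&ill->ill_lock);	take the earlier lock
	 *	mutex_enter(&sqp->sq_lock);	re-take in hierarchy order
	 *
	 * rather than taking ill_lock directly, which could deadlock
	 * against a thread acquiring the locks in the listed order.
	 */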

	SQS_POLLING_OFF(sqp, (sqp->sq_state & SQS_POLL_CAPAB) != 0,
	    sqp->sq_rx_ring);
	sqp->sq_state &= ~(SQS_POLL_CAPAB | SQS_GET_PKTS | SQS_PROC_HELD);
	if (sqp->sq_state & SQS_POLL_CLEANUP) {
		/*
		 * Disassociate this squeue from its ill_rx_ring_t.
		 * The rr_sqp and sq_rx_ring fields are protected by the
		 * corresponding squeue, ill_lock* and sq_lock. Holding any
		 * of them will ensure that the ring-to-squeue mapping does
		 * not change.
		 */
		ASSERT(!(sqp->sq_state & SQS_DEFAULT));

		sqp->sq_rx_ring = NULL;
		rx_ring->rr_sqp = NULL;

		sqp->sq_state &= ~(SQS_POLL_CLEANUP | SQS_POLL_THR_QUIESCED |
		    SQS_POLL_QUIESCE_DONE);
		sqp->sq_ill = NULL;

		rx_ring->rr_rx_handle = NULL;
		rx_ring->rr_intr_handle = NULL;
		rx_ring->rr_intr_enable = NULL;
		rx_ring->rr_intr_disable = NULL;
		sqp->sq_state |= SQS_POLL_CLEANUP_DONE;
	} else {
		sqp->sq_state &= ~SQS_POLL_QUIESCE;
		sqp->sq_state |= SQS_POLL_QUIESCE_DONE;
	}
	/*
	 * Signal any waiter that is waiting for the quiesce or cleanup
	 * to complete and also wait for it to actually see and reset the
	 * SQS_POLL_CLEANUP_DONE flag.
	 */
	cv_signal(&sqp->sq_ctrlop_done_cv);
	mutex_exit(&ill->ill_lock);
	if (sqp->sq_state & SQS_POLL_CLEANUP_DONE) {
		cv_wait(&sqp->sq_worker_cv, &sqp->sq_lock);
		sqp->sq_state &= ~(SQS_PROC | SQS_WORKER);
	}
}

static void
squeue_worker(squeue_t *sqp)
{
	kmutex_t *lock = &sqp->sq_lock;
	kcondvar_t *async = &sqp->sq_worker_cv;
	callb_cpr_t cprinfo;
	hrtime_t now;

	CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "sq_worker");
	mutex_enter(lock);

	for (;;) {
		for (;;) {
			/*
			 * If the poll thread has handed control to us,
			 * we need to break out of the wait.
			 */
			if (sqp->sq_state & SQS_PROC_HELD)
				break;

			/*
			 * If the squeue is not being processed and we
			 * either have messages to drain or some thread has
			 * signaled some control activity, we need to break.
			 */
			if (!(sqp->sq_state & SQS_PROC) &&
			    ((sqp->sq_state & SQS_WORKER_THR_CONTROL) ||
			    (sqp->sq_first != NULL)))
				break;

			/*
			 * If we have started some control action, then
			 * check for the SQS_WORKER flag (since we don't
			 * release the squeue) to make sure we own the
			 * squeue and break out.
			 */
			if ((sqp->sq_state & SQS_WORKER_THR_CONTROL) &&
			    (sqp->sq_state & SQS_WORKER))
				break;

			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(async, lock);
			CALLB_CPR_SAFE_END(&cprinfo, lock);
		}
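
		/*
		 * Equivalently, the loop above waits until
		 *
		 *	(sq_state & SQS_PROC_HELD) ||
		 *	(!(sq_state & SQS_PROC) &&
		 *	    ((sq_state & SQS_WORKER_THR_CONTROL) ||
		 *	    sq_first != NULL)) ||
		 *	((sq_state & SQS_WORKER_THR_CONTROL) &&
		 *	    (sq_state & SQS_WORKER))
		 *
		 * becomes true, rechecking after every wakeup on
		 * sq_worker_cv.
		 */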
		if (sqp->sq_state & SQS_WORKER_THR_CONTROL) {
			squeue_worker_thr_control(sqp);
			continue;
		}
		ASSERT(!(sqp->sq_state & (SQS_POLL_THR_QUIESCED |
		    SQS_POLL_CLEANUP_DONE | SQS_POLL_QUIESCE_DONE |
		    SQS_WORKER_THR_CONTROL | SQS_POLL_THR_CONTROL)));

		if (sqp->sq_state & SQS_PROC_HELD)
			sqp->sq_state &= ~SQS_PROC_HELD;

		now = gethrtime();
		sqp->sq_run = curthread;
		sqp->sq_drain(sqp, SQS_WORKER, now + squeue_drain_ns);
		sqp->sq_run = NULL;
	}
}

uintptr_t *
squeue_getprivate(squeue_t *sqp, sqprivate_t p)
{
	ASSERT(p < SQPRIVATE_MAX);

	return (&sqp->sq_private[p]);
}
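
/*
 * A minimal usage sketch for the accessor above (SQPRIVATE_TCP and
 * my_state_t are illustrative; any slot value below SQPRIVATE_MAX behaves
 * the same way):
 *
 *	*squeue_getprivate(sqp, SQPRIVATE_TCP) = (uintptr_t)my_state;
 *	...
 *	my_state = (my_state_t *)*squeue_getprivate(sqp, SQPRIVATE_TCP);
 *
 * The squeue only provides the storage; the caller owns the lifetime and
 * any synchronization of the stored value.
 */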