xref: /titanic_52/usr/src/uts/common/inet/squeue.c (revision bd670b35a010421b6e1a5536c34453a827007c81)
17c478bd9Sstevel@tonic-gate /*
27c478bd9Sstevel@tonic-gate  * CDDL HEADER START
37c478bd9Sstevel@tonic-gate  *
47c478bd9Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
5e824d57fSjohnlev  * Common Development and Distribution License (the "License").
6e824d57fSjohnlev  * You may not use this file except in compliance with the License.
77c478bd9Sstevel@tonic-gate  *
87c478bd9Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
97c478bd9Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
107c478bd9Sstevel@tonic-gate  * See the License for the specific language governing permissions
117c478bd9Sstevel@tonic-gate  * and limitations under the License.
127c478bd9Sstevel@tonic-gate  *
137c478bd9Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
147c478bd9Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
157c478bd9Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
167c478bd9Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
177c478bd9Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
187c478bd9Sstevel@tonic-gate  *
197c478bd9Sstevel@tonic-gate  * CDDL HEADER END
207c478bd9Sstevel@tonic-gate  */
217c478bd9Sstevel@tonic-gate /*
22f3124163SAnders Persson  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
237c478bd9Sstevel@tonic-gate  * Use is subject to license terms.
247c478bd9Sstevel@tonic-gate  */
257c478bd9Sstevel@tonic-gate 
267c478bd9Sstevel@tonic-gate /*
27da14cebeSEric Cheng  * Squeues: General purpose serialization mechanism
28da14cebeSEric Cheng  * ------------------------------------------------
297c478bd9Sstevel@tonic-gate  *
30da14cebeSEric Cheng  * Background:
31da14cebeSEric Cheng  * -----------
327c478bd9Sstevel@tonic-gate  *
 * This is a general purpose high-performance serialization mechanism
 * currently used by TCP/IP. It is implemented by means of a per CPU queue,
 * a worker thread and a polling thread which are bound to the CPU
 * associated with the squeue. The squeue is strictly FIFO for both read
 * and write side and only one thread can process it at any given time.
 * The design goal of squeue was to offer a very high degree of
 * parallelization (on a per H/W execution pipeline basis) with at
 * most one queuing.
417c478bd9Sstevel@tonic-gate  *
 * The modules needing protection typically call the SQUEUE_ENTER_ONE() or
 * SQUEUE_ENTER() macro as soon as a thread enters the module
 * from either direction. For each packet, the processing function
 * and argument are stored in the mblk itself. When the packet is ready
 * to be processed, the squeue retrieves the stored function and calls
 * it with the supplied argument and the pointer to the packet itself.
 * The called function can assume that no other thread is processing
 * the squeue when it is executing.
507c478bd9Sstevel@tonic-gate  *
51da14cebeSEric Cheng  * Squeue/connection binding:
52da14cebeSEric Cheng  * --------------------------
537c478bd9Sstevel@tonic-gate  *
54da14cebeSEric Cheng  * TCP/IP uses an IP classifier in conjunction with squeue where specific
55da14cebeSEric Cheng  * connections are assigned to specific squeue (based on various policies),
56da14cebeSEric Cheng  * at the connection creation time. Once assigned, the connection to
57da14cebeSEric Cheng  * squeue mapping is never changed and all future packets for that
58da14cebeSEric Cheng  * connection are processed on that squeue. The connection ("conn") to
59da14cebeSEric Cheng  * squeue mapping is stored in "conn_t" member "conn_sqp".
607c478bd9Sstevel@tonic-gate  *
 * Since the processing of the connection cuts across multiple layers
 * but still allows packets for different connections to be processed on
 * other CPU/squeues, squeues are also termed as "Vertical Perimeter" or
 * "Per Connection Vertical Perimeter".
657c478bd9Sstevel@tonic-gate  *
66da14cebeSEric Cheng  * Processing Model:
67da14cebeSEric Cheng  * -----------------
687c478bd9Sstevel@tonic-gate  *
 * A squeue doesn't necessarily process packets with its own worker thread.
 * The callers can pick if they just want to queue the packet, process
 * their packet if nothing is queued or drain and process. The first two
 * modes are typically employed when the packet was generated while
 * already doing the processing behind the squeue and the last mode (drain
 * and process) is typically employed when the thread is entering the squeue
 * for the first time. The squeue still imposes a finite time limit
 * for which an external thread can do processing after which it switches
 * processing to its own worker thread.
787c478bd9Sstevel@tonic-gate  *
 * Once created, squeues are never deleted. Hence squeue pointers are
 * always valid. This means that functions outside the squeue can still
 * refer safely to conn_sqp and there is no need for ref counts.
827c478bd9Sstevel@tonic-gate  *
83da14cebeSEric Cheng  * Only a thread executing in the squeue can change the squeue of the
84da14cebeSEric Cheng  * connection. It does so by calling a squeue framework function to do this.
85da14cebeSEric Cheng  * After changing the squeue, the thread must leave the squeue. It must not
86da14cebeSEric Cheng  * continue to execute any code that needs squeue protection.
877c478bd9Sstevel@tonic-gate  *
 * The squeue framework, after entering the squeue, checks if the current
 * squeue matches the conn_sqp. If the check fails, the packet is delivered
 * to the right squeue.
917c478bd9Sstevel@tonic-gate  *
92da14cebeSEric Cheng  * Polling Model:
93da14cebeSEric Cheng  * --------------
947c478bd9Sstevel@tonic-gate  *
 * Squeues can control the rate of packet arrival into themselves from the
 * NIC or specific Rx ring within a NIC. As part of capability negotiation
 * between IP and MAC layer, squeues are created for each TCP soft ring
 * (or TCP Rx ring - to be implemented in future). As part of this
 * negotiation, squeues get a cookie for the underlying soft ring or Rx
 * ring, a function to turn off incoming packets and a function to call
 * to poll for packets. This helps schedule the receive side packet
 * processing so that queue backlog doesn't build up and packet processing
 * doesn't keep getting disturbed by high priority interrupts. As part
 * of this mode, as soon as a backlog starts building, the squeue turns off
 * the interrupts and switches to poll mode. In poll mode, when the poll
 * thread goes down to retrieve packets, it retrieves them in the form of
 * a chain which improves performance even more. As the squeue/softring
 * system gets more packets, it gets more efficient by switching to
 * polling more often and dealing with larger packet chains.
1107c478bd9Sstevel@tonic-gate  *
1117c478bd9Sstevel@tonic-gate  */
1127c478bd9Sstevel@tonic-gate 
1137c478bd9Sstevel@tonic-gate #include <sys/types.h>
1147c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h>
1157c478bd9Sstevel@tonic-gate #include <sys/debug.h>
1167c478bd9Sstevel@tonic-gate #include <sys/kmem.h>
1177c478bd9Sstevel@tonic-gate #include <sys/cpuvar.h>
1187c478bd9Sstevel@tonic-gate #include <sys/condvar_impl.h>
1197c478bd9Sstevel@tonic-gate #include <sys/systm.h>
1207c478bd9Sstevel@tonic-gate #include <sys/callb.h>
1217c478bd9Sstevel@tonic-gate #include <sys/sdt.h>
1227c478bd9Sstevel@tonic-gate #include <sys/ddi.h>
123da14cebeSEric Cheng #include <sys/sunddi.h>
1247c478bd9Sstevel@tonic-gate 
1257c478bd9Sstevel@tonic-gate #include <inet/ipclassifier.h>
126d045b987Smasputra #include <inet/udp_impl.h>
1277c478bd9Sstevel@tonic-gate 
1287c478bd9Sstevel@tonic-gate #include <sys/squeue_impl.h>
1297c478bd9Sstevel@tonic-gate 
/*
 * Forward declarations of the file-local (static) squeue routines.
 */
static void squeue_fire(void *);
static void squeue_drain(squeue_t *, uint_t, hrtime_t);
static void squeue_worker(squeue_t *sqp);
static void squeue_polling_thread(squeue_t *sqp);

/* kmem cache that squeue_t structures are allocated from; see squeue_init(). */
kmem_cache_t *squeue_cache;

/* Multiplier used to convert a millisecond count into nanoseconds. */
#define	SQUEUE_MSEC_TO_NSEC 1000000

/*
 * Tunables, expressed in milliseconds. squeue_init() converts each of them
 * once into the derived static below.
 */
int squeue_drain_ms = 20;
int squeue_workerwait_ms = 0;

/* The values above converted to ticks or nano seconds */
static int squeue_drain_ns = 0;
static int squeue_workerwait_tick = 0;

/*
 * NOTE(review): not referenced in the part of the file reviewed here;
 * presumably an upper bound on the bytes fetched by one poll -- confirm.
 */
#define	MAX_BYTES_TO_PICKUP	150000
1477c478bd9Sstevel@tonic-gate 
/*
 * Append the mblk chain that starts at mp and ends at tail (cnt messages
 * in total) to the squeue's pending list and add cnt to sq_count.
 * The caller must hold sq_lock.
 */
#define	ENQUEUE_CHAIN(sqp, mp, tail, cnt) {			\
	ASSERT(MUTEX_HELD(&(sqp)->sq_lock));			\
								\
	/* An empty list has no last element to link from. */	\
	if ((sqp)->sq_last == NULL)				\
		(sqp)->sq_first = (mp);				\
	else							\
		(sqp)->sq_last->b_next = (mp);			\
	(sqp)->sq_last = (tail);				\
	(sqp)->sq_count += (cnt);				\
	ASSERT((sqp)->sq_count > 0);				\
	DTRACE_PROBE4(squeue__enqueuechain, squeue_t *, sqp,	\
		mblk_t *, mp, mblk_t *, tail, int, cnt);	\
}
1657c478bd9Sstevel@tonic-gate 
/*
 * Blank the receive ring (in this case it is the soft ring). When
 * blanked, the soft ring will not send any more packets up.
 * Blanking may not succeed when there is a CPU already in the soft
 * ring sending packets up. In that case, SQS_POLLING will not be
 * set.
 *
 * Nothing is done unless sq_poll_capable is true and the squeue is not
 * already in SQS_POLLING state. The caller must hold sq_lock.
 *
 * Every use of the sqp and rx_ring parameters is parenthesized so that
 * arbitrary pointer expressions can be passed safely.
 */
#define	SQS_POLLING_ON(sqp, sq_poll_capable, rx_ring) {		\
	ASSERT(MUTEX_HELD(&(sqp)->sq_lock));			\
	if (sq_poll_capable) {					\
		ASSERT((rx_ring) != NULL);			\
		ASSERT((sqp)->sq_state & SQS_POLL_CAPAB);	\
		if (!((sqp)->sq_state & SQS_POLLING)) {		\
			/* Only mark polling if the ring agreed. */ \
			if ((rx_ring)->rr_intr_disable(		\
			    (rx_ring)->rr_intr_handle))		\
				(sqp)->sq_state |= SQS_POLLING;	\
		}						\
	}							\
}
1847c478bd9Sstevel@tonic-gate 
/*
 * Leave poll mode: if sq_poll_capable is true and the squeue is currently
 * in SQS_POLLING state, clear that state and call the ring's
 * rr_intr_enable() with its interrupt handle. The caller must hold sq_lock.
 *
 * Every use of the sqp and rx_ring parameters is parenthesized so that
 * arbitrary pointer expressions can be passed safely.
 */
#define	SQS_POLLING_OFF(sqp, sq_poll_capable, rx_ring) {	\
	ASSERT(MUTEX_HELD(&(sqp)->sq_lock));			\
	if (sq_poll_capable) {					\
		ASSERT((rx_ring) != NULL);			\
		ASSERT((sqp)->sq_state & SQS_POLL_CAPAB);	\
		if ((sqp)->sq_state & SQS_POLLING) {		\
			(sqp)->sq_state &= ~SQS_POLLING;	\
			(rx_ring)->rr_intr_enable(		\
			    (rx_ring)->rr_intr_handle);		\
		}						\
	}							\
}
1967c478bd9Sstevel@tonic-gate 
/*
 * Wakeup poll thread only if SQS_POLLING is set.
 *
 * When the squeue is in SQS_POLLING state and SQS_GET_PKTS is not yet set,
 * set SQS_GET_PKTS and signal sq_poll_cv. The caller must hold sq_lock.
 *
 * Every use of the sqp parameter is parenthesized so that an arbitrary
 * pointer expression can be passed safely.
 */
#define	SQS_POLL_RING(sqp) {					\
	ASSERT(MUTEX_HELD(&(sqp)->sq_lock));			\
	if ((sqp)->sq_state & SQS_POLLING) {			\
		ASSERT((sqp)->sq_state & SQS_POLL_CAPAB);	\
		if (!((sqp)->sq_state & SQS_GET_PKTS)) {	\
			(sqp)->sq_state |= SQS_GET_PKTS;	\
			cv_signal(&(sqp)->sq_poll_cv);		\
		}						\
	}							\
}
208da14cebeSEric Cheng 
/*
 * Debug bookkeeping, compiled in only under DEBUG.
 *
 * SQUEUE_DBG_SET records on the squeue the mblk, the processing function
 * and the conn being handled, and stamps both the mblk (b_tag) and the
 * squeue (sq_tag) with the supplied tag.
 *
 * SQUEUE_DBG_CLEAR resets the three recorded pointers to NULL; sq_tag is
 * left as it was.
 *
 * In non-DEBUG builds both macros expand to nothing.
 */
#ifdef DEBUG
#define	SQUEUE_DBG_SET(sqp, mp, proc, connp, tag) {		\
	(sqp)->sq_curmp = (mp);					\
	(sqp)->sq_curproc = (proc);				\
	(sqp)->sq_connp = (connp);				\
	(mp)->b_tag = (sqp)->sq_tag = (tag);			\
}

#define	SQUEUE_DBG_CLEAR(sqp)	{				\
	(sqp)->sq_curmp = NULL;					\
	(sqp)->sq_curproc = NULL;				\
	(sqp)->sq_connp = NULL;					\
}
#else
#define	SQUEUE_DBG_SET(sqp, mp, proc, connp, tag)
#define	SQUEUE_DBG_CLEAR(sqp)
#endif
226da14cebeSEric Cheng 
2277c478bd9Sstevel@tonic-gate void
2287c478bd9Sstevel@tonic-gate squeue_init(void)
2297c478bd9Sstevel@tonic-gate {
2307c478bd9Sstevel@tonic-gate 	squeue_cache = kmem_cache_create("squeue_cache",
2317c478bd9Sstevel@tonic-gate 	    sizeof (squeue_t), 64, NULL, NULL, NULL, NULL, NULL, 0);
2327c478bd9Sstevel@tonic-gate 
233da14cebeSEric Cheng 	squeue_drain_ns = squeue_drain_ms * SQUEUE_MSEC_TO_NSEC;
2347c478bd9Sstevel@tonic-gate 	squeue_workerwait_tick = MSEC_TO_TICK_ROUNDUP(squeue_workerwait_ms);
2357c478bd9Sstevel@tonic-gate }
2367c478bd9Sstevel@tonic-gate 
2377c478bd9Sstevel@tonic-gate /* ARGSUSED */
2387c478bd9Sstevel@tonic-gate squeue_t *
239da14cebeSEric Cheng squeue_create(clock_t wait, pri_t pri)
2407c478bd9Sstevel@tonic-gate {
2417c478bd9Sstevel@tonic-gate 	squeue_t *sqp = kmem_cache_alloc(squeue_cache, KM_SLEEP);
2427c478bd9Sstevel@tonic-gate 
2437c478bd9Sstevel@tonic-gate 	bzero(sqp, sizeof (squeue_t));
244da14cebeSEric Cheng 	sqp->sq_bind = PBIND_NONE;
245da14cebeSEric Cheng 	sqp->sq_priority = pri;
2467c478bd9Sstevel@tonic-gate 	sqp->sq_wait = MSEC_TO_TICK(wait);
2477c478bd9Sstevel@tonic-gate 	sqp->sq_worker = thread_create(NULL, 0, squeue_worker,
2487c478bd9Sstevel@tonic-gate 	    sqp, 0, &p0, TS_RUN, pri);
2497c478bd9Sstevel@tonic-gate 
250da14cebeSEric Cheng 	sqp->sq_poll_thr = thread_create(NULL, 0, squeue_polling_thread,
251da14cebeSEric Cheng 	    sqp, 0, &p0, TS_RUN, pri);
252da14cebeSEric Cheng 
253da14cebeSEric Cheng 	sqp->sq_enter = squeue_enter;
254da14cebeSEric Cheng 	sqp->sq_drain = squeue_drain;
255da14cebeSEric Cheng 
2567c478bd9Sstevel@tonic-gate 	return (sqp);
2577c478bd9Sstevel@tonic-gate }
2587c478bd9Sstevel@tonic-gate 
259da14cebeSEric Cheng /*
260da14cebeSEric Cheng  * Bind squeue worker thread to the specified CPU, given by CPU id.
261da14cebeSEric Cheng  * If the CPU id  value is -1, bind the worker thread to the value
262da14cebeSEric Cheng  * specified in sq_bind field. If a thread is already bound to a
263da14cebeSEric Cheng  * different CPU, unbind it from the old CPU and bind to the new one.
264da14cebeSEric Cheng  */
265da14cebeSEric Cheng 
2667c478bd9Sstevel@tonic-gate void
2677c478bd9Sstevel@tonic-gate squeue_bind(squeue_t *sqp, processorid_t bind)
2687c478bd9Sstevel@tonic-gate {
2697c478bd9Sstevel@tonic-gate 	mutex_enter(&sqp->sq_lock);
270da14cebeSEric Cheng 	ASSERT(sqp->sq_bind != PBIND_NONE || bind != PBIND_NONE);
271da14cebeSEric Cheng 	ASSERT(MUTEX_HELD(&cpu_lock));
272da14cebeSEric Cheng 
2737c478bd9Sstevel@tonic-gate 	if (sqp->sq_state & SQS_BOUND) {
274da14cebeSEric Cheng 		if (sqp->sq_bind == bind) {
2757c478bd9Sstevel@tonic-gate 			mutex_exit(&sqp->sq_lock);
2767c478bd9Sstevel@tonic-gate 			return;
2777c478bd9Sstevel@tonic-gate 		}
278da14cebeSEric Cheng 		thread_affinity_clear(sqp->sq_worker);
279da14cebeSEric Cheng 	} else {
2807c478bd9Sstevel@tonic-gate 		sqp->sq_state |= SQS_BOUND;
281da14cebeSEric Cheng 	}
282da14cebeSEric Cheng 
283da14cebeSEric Cheng 	if (bind != PBIND_NONE)
284da14cebeSEric Cheng 		sqp->sq_bind = bind;
2857c478bd9Sstevel@tonic-gate 
2867c478bd9Sstevel@tonic-gate 	thread_affinity_set(sqp->sq_worker, sqp->sq_bind);
287da14cebeSEric Cheng 	mutex_exit(&sqp->sq_lock);
2887c478bd9Sstevel@tonic-gate }
2897c478bd9Sstevel@tonic-gate 
2907c478bd9Sstevel@tonic-gate void
2917c478bd9Sstevel@tonic-gate squeue_unbind(squeue_t *sqp)
2927c478bd9Sstevel@tonic-gate {
2937c478bd9Sstevel@tonic-gate 	mutex_enter(&sqp->sq_lock);
2947c478bd9Sstevel@tonic-gate 	if (!(sqp->sq_state & SQS_BOUND)) {
2957c478bd9Sstevel@tonic-gate 		mutex_exit(&sqp->sq_lock);
2967c478bd9Sstevel@tonic-gate 		return;
2977c478bd9Sstevel@tonic-gate 	}
2987c478bd9Sstevel@tonic-gate 
2997c478bd9Sstevel@tonic-gate 	sqp->sq_state &= ~SQS_BOUND;
3007c478bd9Sstevel@tonic-gate 	thread_affinity_clear(sqp->sq_worker);
301da14cebeSEric Cheng 	mutex_exit(&sqp->sq_lock);
302da14cebeSEric Cheng }
303da14cebeSEric Cheng 
304da14cebeSEric Cheng void
305da14cebeSEric Cheng squeue_worker_wakeup(squeue_t *sqp)
306da14cebeSEric Cheng {
307da14cebeSEric Cheng 	timeout_id_t tid = (sqp)->sq_tid;
308da14cebeSEric Cheng 
309da14cebeSEric Cheng 	ASSERT(MUTEX_HELD(&(sqp)->sq_lock));
310da14cebeSEric Cheng 
311da14cebeSEric Cheng 	if (sqp->sq_wait == 0) {
312da14cebeSEric Cheng 		ASSERT(tid == 0);
313da14cebeSEric Cheng 		ASSERT(!(sqp->sq_state & SQS_TMO_PROG));
314da14cebeSEric Cheng 		sqp->sq_awaken = lbolt;
315da14cebeSEric Cheng 		cv_signal(&sqp->sq_worker_cv);
316da14cebeSEric Cheng 		mutex_exit(&sqp->sq_lock);
317da14cebeSEric Cheng 		return;
318da14cebeSEric Cheng 	}
319da14cebeSEric Cheng 
320da14cebeSEric Cheng 	/*
321da14cebeSEric Cheng 	 * Queue isn't being processed, so take
322da14cebeSEric Cheng 	 * any post enqueue actions needed before leaving.
323da14cebeSEric Cheng 	 */
324da14cebeSEric Cheng 	if (tid != 0) {
325da14cebeSEric Cheng 		/*
326da14cebeSEric Cheng 		 * Waiting for an enter() to process mblk(s).
327da14cebeSEric Cheng 		 */
328da14cebeSEric Cheng 		clock_t	waited = lbolt - sqp->sq_awaken;
329da14cebeSEric Cheng 
330da14cebeSEric Cheng 		if (TICK_TO_MSEC(waited) >= sqp->sq_wait) {
331da14cebeSEric Cheng 			/*
332da14cebeSEric Cheng 			 * Times up and have a worker thread
333da14cebeSEric Cheng 			 * waiting for work, so schedule it.
334da14cebeSEric Cheng 			 */
335da14cebeSEric Cheng 			sqp->sq_tid = 0;
336da14cebeSEric Cheng 			sqp->sq_awaken = lbolt;
337da14cebeSEric Cheng 			cv_signal(&sqp->sq_worker_cv);
338da14cebeSEric Cheng 			mutex_exit(&sqp->sq_lock);
339da14cebeSEric Cheng 			(void) untimeout(tid);
340da14cebeSEric Cheng 			return;
341da14cebeSEric Cheng 		}
342da14cebeSEric Cheng 		mutex_exit(&sqp->sq_lock);
343da14cebeSEric Cheng 		return;
344da14cebeSEric Cheng 	} else if (sqp->sq_state & SQS_TMO_PROG) {
345da14cebeSEric Cheng 		mutex_exit(&sqp->sq_lock);
346da14cebeSEric Cheng 		return;
347da14cebeSEric Cheng 	} else {
348da14cebeSEric Cheng 		clock_t	wait = sqp->sq_wait;
349da14cebeSEric Cheng 		/*
350da14cebeSEric Cheng 		 * Wait up to sqp->sq_wait ms for an
351da14cebeSEric Cheng 		 * enter() to process this queue. We
352da14cebeSEric Cheng 		 * don't want to contend on timeout locks
353da14cebeSEric Cheng 		 * with sq_lock held for performance reasons,
354da14cebeSEric Cheng 		 * so drop the sq_lock before calling timeout
355da14cebeSEric Cheng 		 * but we need to check if timeout is required
356da14cebeSEric Cheng 		 * after re acquiring the sq_lock. Once
357da14cebeSEric Cheng 		 * the sq_lock is dropped, someone else could
358da14cebeSEric Cheng 		 * have processed the packet or the timeout could
359da14cebeSEric Cheng 		 * have already fired.
360da14cebeSEric Cheng 		 */
361da14cebeSEric Cheng 		sqp->sq_state |= SQS_TMO_PROG;
362da14cebeSEric Cheng 		mutex_exit(&sqp->sq_lock);
363da14cebeSEric Cheng 		tid = timeout(squeue_fire, sqp, wait);
364da14cebeSEric Cheng 		mutex_enter(&sqp->sq_lock);
365da14cebeSEric Cheng 		/* Check again if we still need the timeout */
366da14cebeSEric Cheng 		if (((sqp->sq_state & (SQS_PROC|SQS_TMO_PROG)) ==
367da14cebeSEric Cheng 		    SQS_TMO_PROG) && (sqp->sq_tid == 0) &&
368da14cebeSEric Cheng 		    (sqp->sq_first != NULL)) {
369da14cebeSEric Cheng 				sqp->sq_state &= ~SQS_TMO_PROG;
370da14cebeSEric Cheng 				sqp->sq_tid = tid;
371da14cebeSEric Cheng 				mutex_exit(&sqp->sq_lock);
372da14cebeSEric Cheng 				return;
373da14cebeSEric Cheng 		} else {
374da14cebeSEric Cheng 			if (sqp->sq_state & SQS_TMO_PROG) {
375da14cebeSEric Cheng 				sqp->sq_state &= ~SQS_TMO_PROG;
376da14cebeSEric Cheng 				mutex_exit(&sqp->sq_lock);
377da14cebeSEric Cheng 				(void) untimeout(tid);
378da14cebeSEric Cheng 			} else {
379da14cebeSEric Cheng 				/*
380da14cebeSEric Cheng 				 * The timer fired before we could
381da14cebeSEric Cheng 				 * reacquire the sq_lock. squeue_fire
382da14cebeSEric Cheng 				 * removes the SQS_TMO_PROG flag
383da14cebeSEric Cheng 				 * and we don't need to	do anything
384da14cebeSEric Cheng 				 * else.
385da14cebeSEric Cheng 				 */
386da14cebeSEric Cheng 				mutex_exit(&sqp->sq_lock);
387da14cebeSEric Cheng 			}
388da14cebeSEric Cheng 		}
389da14cebeSEric Cheng 	}
390da14cebeSEric Cheng 
391da14cebeSEric Cheng 	ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock));
3927c478bd9Sstevel@tonic-gate }
3937c478bd9Sstevel@tonic-gate 
3947c478bd9Sstevel@tonic-gate /*
 * squeue_enter() - enter squeue sqp with mblk mp (which can be
 * a chain), while tail points to the end and cnt is the number of
 * mblks in the chain.
3987c478bd9Sstevel@tonic-gate  *
3997c478bd9Sstevel@tonic-gate  * For a chain of single packet (i.e. mp == tail), go through the
4007c478bd9Sstevel@tonic-gate  * fast path if no one is processing the squeue and nothing is queued.
4017c478bd9Sstevel@tonic-gate  *
4027c478bd9Sstevel@tonic-gate  * The proc and arg for each mblk is already stored in the mblk in
4037c478bd9Sstevel@tonic-gate  * appropriate places.
404da14cebeSEric Cheng  *
405da14cebeSEric Cheng  * The process_flag specifies if we are allowed to process the mblk
406da14cebeSEric Cheng  * and drain in the entering thread context. If process_flag is
407da14cebeSEric Cheng  * SQ_FILL, then we just queue the mblk and return (after signaling
408da14cebeSEric Cheng  * the worker thread if no one else is processing the squeue).
409*bd670b35SErik Nordmark  *
410*bd670b35SErik Nordmark  * The ira argument can be used when the count is one.
411*bd670b35SErik Nordmark  * For a chain the caller needs to prepend any needed mblks from
412*bd670b35SErik Nordmark  * ip_recv_attr_to_mblk().
4137c478bd9Sstevel@tonic-gate  */
414da14cebeSEric Cheng /* ARGSUSED */
4157c478bd9Sstevel@tonic-gate void
416da14cebeSEric Cheng squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
417*bd670b35SErik Nordmark     ip_recv_attr_t *ira, int process_flag, uint8_t tag)
4187c478bd9Sstevel@tonic-gate {
419da14cebeSEric Cheng 	conn_t		*connp;
4207c478bd9Sstevel@tonic-gate 	sqproc_t	proc;
421d19d6468Sbw 	hrtime_t	now;
4227c478bd9Sstevel@tonic-gate 
4237c478bd9Sstevel@tonic-gate 	ASSERT(sqp != NULL);
4247c478bd9Sstevel@tonic-gate 	ASSERT(mp != NULL);
4257c478bd9Sstevel@tonic-gate 	ASSERT(tail != NULL);
4267c478bd9Sstevel@tonic-gate 	ASSERT(cnt > 0);
4277c478bd9Sstevel@tonic-gate 	ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock));
428*bd670b35SErik Nordmark 	ASSERT(ira == NULL || cnt == 1);
4297c478bd9Sstevel@tonic-gate 
4307c478bd9Sstevel@tonic-gate 	mutex_enter(&sqp->sq_lock);
431da14cebeSEric Cheng 
432da14cebeSEric Cheng 	/*
433da14cebeSEric Cheng 	 * Try to process the packet if SQ_FILL flag is not set and
434da14cebeSEric Cheng 	 * we are allowed to process the squeue. The SQ_NODRAIN is
435da14cebeSEric Cheng 	 * ignored if the packet chain consists of more than 1 packet.
436da14cebeSEric Cheng 	 */
437da14cebeSEric Cheng 	if (!(sqp->sq_state & SQS_PROC) && ((process_flag == SQ_PROCESS) ||
438da14cebeSEric Cheng 	    (process_flag == SQ_NODRAIN && sqp->sq_first == NULL))) {
4397c478bd9Sstevel@tonic-gate 		/*
4407c478bd9Sstevel@tonic-gate 		 * See if anything is already queued. If we are the
4417c478bd9Sstevel@tonic-gate 		 * first packet, do inline processing else queue the
4427c478bd9Sstevel@tonic-gate 		 * packet and do the drain.
4437c478bd9Sstevel@tonic-gate 		 */
4447c478bd9Sstevel@tonic-gate 		if (sqp->sq_first == NULL && cnt == 1) {
4457c478bd9Sstevel@tonic-gate 			/*
4467c478bd9Sstevel@tonic-gate 			 * Fast-path, ok to process and nothing queued.
4477c478bd9Sstevel@tonic-gate 			 */
4487c478bd9Sstevel@tonic-gate 			sqp->sq_state |= (SQS_PROC|SQS_FAST);
449da14cebeSEric Cheng 			sqp->sq_run = curthread;
4507c478bd9Sstevel@tonic-gate 			mutex_exit(&sqp->sq_lock);
4517c478bd9Sstevel@tonic-gate 
4527c478bd9Sstevel@tonic-gate 			/*
4537c478bd9Sstevel@tonic-gate 			 * We are the chain of 1 packet so
4547c478bd9Sstevel@tonic-gate 			 * go through this fast path.
4557c478bd9Sstevel@tonic-gate 			 */
456da14cebeSEric Cheng 			ASSERT(mp->b_prev != NULL);
457da14cebeSEric Cheng 			ASSERT(mp->b_queue != NULL);
458da14cebeSEric Cheng 			connp = (conn_t *)mp->b_prev;
459da14cebeSEric Cheng 			mp->b_prev = NULL;
460da14cebeSEric Cheng 			proc = (sqproc_t)mp->b_queue;
461da14cebeSEric Cheng 			mp->b_queue = NULL;
462da14cebeSEric Cheng 			ASSERT(proc != NULL && connp != NULL);
463da14cebeSEric Cheng 			ASSERT(mp->b_next == NULL);
464da14cebeSEric Cheng 
465da14cebeSEric Cheng 			/*
466da14cebeSEric Cheng 			 * Handle squeue switching. More details in the
467da14cebeSEric Cheng 			 * block comment at the top of the file
468da14cebeSEric Cheng 			 */
469da14cebeSEric Cheng 			if (connp->conn_sqp == sqp) {
470da14cebeSEric Cheng 				SQUEUE_DBG_SET(sqp, mp, proc, connp,
471da14cebeSEric Cheng 				    tag);
472da14cebeSEric Cheng 				connp->conn_on_sqp = B_TRUE;
473da14cebeSEric Cheng 				DTRACE_PROBE3(squeue__proc__start, squeue_t *,
474da14cebeSEric Cheng 				    sqp, mblk_t *, mp, conn_t *, connp);
475*bd670b35SErik Nordmark 				(*proc)(connp, mp, sqp, ira);
476da14cebeSEric Cheng 				DTRACE_PROBE2(squeue__proc__end, squeue_t *,
477da14cebeSEric Cheng 				    sqp, conn_t *, connp);
478da14cebeSEric Cheng 				connp->conn_on_sqp = B_FALSE;
479da14cebeSEric Cheng 				SQUEUE_DBG_CLEAR(sqp);
480da14cebeSEric Cheng 				CONN_DEC_REF(connp);
481da14cebeSEric Cheng 			} else {
482da14cebeSEric Cheng 				SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc,
483*bd670b35SErik Nordmark 				    connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE);
484da14cebeSEric Cheng 			}
485da14cebeSEric Cheng 			ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock));
486da14cebeSEric Cheng 			mutex_enter(&sqp->sq_lock);
487da14cebeSEric Cheng 			sqp->sq_state &= ~(SQS_PROC|SQS_FAST);
488da14cebeSEric Cheng 			sqp->sq_run = NULL;
489da14cebeSEric Cheng 			if (sqp->sq_first == NULL ||
490da14cebeSEric Cheng 			    process_flag == SQ_NODRAIN) {
491da14cebeSEric Cheng 				if (sqp->sq_first != NULL) {
492da14cebeSEric Cheng 					squeue_worker_wakeup(sqp);
493da14cebeSEric Cheng 					return;
494da14cebeSEric Cheng 				}
495da14cebeSEric Cheng 				/*
496da14cebeSEric Cheng 				 * We processed inline our packet and nothing
497da14cebeSEric Cheng 				 * new has arrived. We are done. In case any
498da14cebeSEric Cheng 				 * control actions are pending, wake up the
499da14cebeSEric Cheng 				 * worker.
500da14cebeSEric Cheng 				 */
501da14cebeSEric Cheng 				if (sqp->sq_state & SQS_WORKER_THR_CONTROL)
502da14cebeSEric Cheng 					cv_signal(&sqp->sq_worker_cv);
503da14cebeSEric Cheng 				mutex_exit(&sqp->sq_lock);
504da14cebeSEric Cheng 				return;
505da14cebeSEric Cheng 			}
506da14cebeSEric Cheng 		} else {
507*bd670b35SErik Nordmark 			if (ira != NULL) {
508*bd670b35SErik Nordmark 				mblk_t	*attrmp;
509*bd670b35SErik Nordmark 
510*bd670b35SErik Nordmark 				ASSERT(cnt == 1);
511*bd670b35SErik Nordmark 				attrmp = ip_recv_attr_to_mblk(ira);
512*bd670b35SErik Nordmark 				if (attrmp == NULL) {
513*bd670b35SErik Nordmark 					mutex_exit(&sqp->sq_lock);
514*bd670b35SErik Nordmark 					ip_drop_input("squeue: "
515*bd670b35SErik Nordmark 					    "ip_recv_attr_to_mblk",
516*bd670b35SErik Nordmark 					    mp, NULL);
517*bd670b35SErik Nordmark 					/* Caller already set b_prev/b_next */
518*bd670b35SErik Nordmark 					mp->b_prev = mp->b_next = NULL;
519*bd670b35SErik Nordmark 					freemsg(mp);
520*bd670b35SErik Nordmark 					return;
521*bd670b35SErik Nordmark 				}
522*bd670b35SErik Nordmark 				ASSERT(attrmp->b_cont == NULL);
523*bd670b35SErik Nordmark 				attrmp->b_cont = mp;
524*bd670b35SErik Nordmark 				/* Move connp and func to new */
525*bd670b35SErik Nordmark 				attrmp->b_queue = mp->b_queue;
526*bd670b35SErik Nordmark 				mp->b_queue = NULL;
527*bd670b35SErik Nordmark 				attrmp->b_prev = mp->b_prev;
528*bd670b35SErik Nordmark 				mp->b_prev = NULL;
529*bd670b35SErik Nordmark 
530*bd670b35SErik Nordmark 				ASSERT(mp == tail);
531*bd670b35SErik Nordmark 				tail = mp = attrmp;
532*bd670b35SErik Nordmark 			}
533*bd670b35SErik Nordmark 
534da14cebeSEric Cheng 			ENQUEUE_CHAIN(sqp, mp, tail, cnt);
535da14cebeSEric Cheng #ifdef DEBUG
536da14cebeSEric Cheng 			mp->b_tag = tag;
537da14cebeSEric Cheng #endif
538da14cebeSEric Cheng 		}
539da14cebeSEric Cheng 		/*
540da14cebeSEric Cheng 		 * We are here because either we couldn't do inline
541da14cebeSEric Cheng 		 * processing (because something was already queued),
542da14cebeSEric Cheng 		 * or we had a chain of more than one packet,
543da14cebeSEric Cheng 		 * or something else arrived after we were done with
544da14cebeSEric Cheng 		 * inline processing.
545da14cebeSEric Cheng 		 */
546da14cebeSEric Cheng 		ASSERT(MUTEX_HELD(&sqp->sq_lock));
547da14cebeSEric Cheng 		ASSERT(sqp->sq_first != NULL);
548da14cebeSEric Cheng 		now = gethrtime();
549da14cebeSEric Cheng 		sqp->sq_drain(sqp, SQS_ENTER, now + squeue_drain_ns);
550da14cebeSEric Cheng 
551da14cebeSEric Cheng 		/*
552da14cebeSEric Cheng 		 * If we didn't do a complete drain, the worker
553da14cebeSEric Cheng 		 * thread was already signalled by squeue_drain.
554da14cebeSEric Cheng 		 * In case any control actions are pending, wake
555da14cebeSEric Cheng 		 * up the worker.
556da14cebeSEric Cheng 		 */
557da14cebeSEric Cheng 		sqp->sq_run = NULL;
558da14cebeSEric Cheng 		if (sqp->sq_state & SQS_WORKER_THR_CONTROL)
559da14cebeSEric Cheng 			cv_signal(&sqp->sq_worker_cv);
560da14cebeSEric Cheng 		mutex_exit(&sqp->sq_lock);
561da14cebeSEric Cheng 		return;
562da14cebeSEric Cheng 	} else {
563da14cebeSEric Cheng 		/*
564da14cebeSEric Cheng 		 * We let a thread processing a squeue reenter only
565da14cebeSEric Cheng 		 * once. This helps the case of incoming connection
566da14cebeSEric Cheng 		 * where a SYN-ACK-ACK that triggers the conn_ind
567da14cebeSEric Cheng 		 * doesn't have to queue the packet if listener and
568da14cebeSEric Cheng 		 * eager are on the same squeue. Also helps the
569da14cebeSEric Cheng 		 * loopback connection where the two ends are bound
570da14cebeSEric Cheng 		 * to the same squeue (which is typical on single
571da14cebeSEric Cheng 		 * CPU machines).
572da14cebeSEric Cheng 		 *
573da14cebeSEric Cheng 		 * We let the thread reenter only once for the fear
574da14cebeSEric Cheng 		 * of stack getting blown with multiple traversal.
575da14cebeSEric Cheng 		 */
576da14cebeSEric Cheng 		connp = (conn_t *)mp->b_prev;
577da14cebeSEric Cheng 		if (!(sqp->sq_state & SQS_REENTER) &&
578da14cebeSEric Cheng 		    (process_flag != SQ_FILL) && (sqp->sq_first == NULL) &&
579da14cebeSEric Cheng 		    (sqp->sq_run == curthread) && (cnt == 1) &&
580da14cebeSEric Cheng 		    (connp->conn_on_sqp == B_FALSE)) {
581da14cebeSEric Cheng 			sqp->sq_state |= SQS_REENTER;
582da14cebeSEric Cheng 			mutex_exit(&sqp->sq_lock);
583da14cebeSEric Cheng 
584da14cebeSEric Cheng 			ASSERT(mp->b_prev != NULL);
585da14cebeSEric Cheng 			ASSERT(mp->b_queue != NULL);
586da14cebeSEric Cheng 
5877c478bd9Sstevel@tonic-gate 			mp->b_prev = NULL;
5887c478bd9Sstevel@tonic-gate 			proc = (sqproc_t)mp->b_queue;
5897c478bd9Sstevel@tonic-gate 			mp->b_queue = NULL;
5907c478bd9Sstevel@tonic-gate 
591da14cebeSEric Cheng 			/*
592da14cebeSEric Cheng 			 * Handle squeue switching. More details in the
593da14cebeSEric Cheng 			 * block comment at the top of the file
594da14cebeSEric Cheng 			 */
595da14cebeSEric Cheng 			if (connp->conn_sqp == sqp) {
596da14cebeSEric Cheng 				connp->conn_on_sqp = B_TRUE;
5977c478bd9Sstevel@tonic-gate 				DTRACE_PROBE3(squeue__proc__start, squeue_t *,
598da14cebeSEric Cheng 				    sqp, mblk_t *, mp, conn_t *, connp);
599*bd670b35SErik Nordmark 				(*proc)(connp, mp, sqp, ira);
6007c478bd9Sstevel@tonic-gate 				DTRACE_PROBE2(squeue__proc__end, squeue_t *,
601da14cebeSEric Cheng 				    sqp, conn_t *, connp);
602da14cebeSEric Cheng 				connp->conn_on_sqp = B_FALSE;
603da14cebeSEric Cheng 				CONN_DEC_REF(connp);
604da14cebeSEric Cheng 			} else {
605da14cebeSEric Cheng 				SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc,
606*bd670b35SErik Nordmark 				    connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE);
6077c478bd9Sstevel@tonic-gate 			}
6087c478bd9Sstevel@tonic-gate 
6097c478bd9Sstevel@tonic-gate 			mutex_enter(&sqp->sq_lock);
610da14cebeSEric Cheng 			sqp->sq_state &= ~SQS_REENTER;
6117c478bd9Sstevel@tonic-gate 			mutex_exit(&sqp->sq_lock);
6127c478bd9Sstevel@tonic-gate 			return;
6137c478bd9Sstevel@tonic-gate 		}
614da14cebeSEric Cheng 
615da14cebeSEric Cheng 		/*
616da14cebeSEric Cheng 		 * Queue is already being processed or there is already
617da14cebeSEric Cheng 		 * one or more packets on the queue. Enqueue the
618da14cebeSEric Cheng 		 * packet and wakeup the squeue worker thread if the
619da14cebeSEric Cheng 		 * squeue is not being processed.
620da14cebeSEric Cheng 		 */
621da14cebeSEric Cheng #ifdef DEBUG
6227c478bd9Sstevel@tonic-gate 		mp->b_tag = tag;
6237c478bd9Sstevel@tonic-gate #endif
624*bd670b35SErik Nordmark 		if (ira != NULL) {
625*bd670b35SErik Nordmark 			mblk_t	*attrmp;
6267c478bd9Sstevel@tonic-gate 
627*bd670b35SErik Nordmark 			ASSERT(cnt == 1);
628*bd670b35SErik Nordmark 			attrmp = ip_recv_attr_to_mblk(ira);
629*bd670b35SErik Nordmark 			if (attrmp == NULL) {
630*bd670b35SErik Nordmark 				mutex_exit(&sqp->sq_lock);
631*bd670b35SErik Nordmark 				ip_drop_input("squeue: ip_recv_attr_to_mblk",
632*bd670b35SErik Nordmark 				    mp, NULL);
633*bd670b35SErik Nordmark 				/* Caller already set b_prev/b_next */
634*bd670b35SErik Nordmark 				mp->b_prev = mp->b_next = NULL;
635*bd670b35SErik Nordmark 				freemsg(mp);
636*bd670b35SErik Nordmark 				return;
637*bd670b35SErik Nordmark 			}
638*bd670b35SErik Nordmark 			ASSERT(attrmp->b_cont == NULL);
639*bd670b35SErik Nordmark 			attrmp->b_cont = mp;
640*bd670b35SErik Nordmark 			/* Move connp and func to new */
641*bd670b35SErik Nordmark 			attrmp->b_queue = mp->b_queue;
642*bd670b35SErik Nordmark 			mp->b_queue = NULL;
643*bd670b35SErik Nordmark 			attrmp->b_prev = mp->b_prev;
644*bd670b35SErik Nordmark 			mp->b_prev = NULL;
645*bd670b35SErik Nordmark 
646*bd670b35SErik Nordmark 			ASSERT(mp == tail);
647*bd670b35SErik Nordmark 			tail = mp = attrmp;
648*bd670b35SErik Nordmark 		}
6497c478bd9Sstevel@tonic-gate 		ENQUEUE_CHAIN(sqp, mp, tail, cnt);
6507c478bd9Sstevel@tonic-gate 		if (!(sqp->sq_state & SQS_PROC)) {
651da14cebeSEric Cheng 			squeue_worker_wakeup(sqp);
6527c478bd9Sstevel@tonic-gate 			return;
6537c478bd9Sstevel@tonic-gate 		}
6547c478bd9Sstevel@tonic-gate 		/*
655da14cebeSEric Cheng 		 * In case any control actions are pending, wake
656da14cebeSEric Cheng 		 * up the worker.
6577c478bd9Sstevel@tonic-gate 		 */
658da14cebeSEric Cheng 		if (sqp->sq_state & SQS_WORKER_THR_CONTROL)
659da14cebeSEric Cheng 			cv_signal(&sqp->sq_worker_cv);
6607c478bd9Sstevel@tonic-gate 		mutex_exit(&sqp->sq_lock);
6617c478bd9Sstevel@tonic-gate 		return;
6627c478bd9Sstevel@tonic-gate 	}
6637c478bd9Sstevel@tonic-gate }
6647c478bd9Sstevel@tonic-gate 
6657c478bd9Sstevel@tonic-gate /*
6667c478bd9Sstevel@tonic-gate  * PRIVATE FUNCTIONS
6677c478bd9Sstevel@tonic-gate  */
6687c478bd9Sstevel@tonic-gate 
6697c478bd9Sstevel@tonic-gate static void
6707c478bd9Sstevel@tonic-gate squeue_fire(void *arg)
6717c478bd9Sstevel@tonic-gate {
6727c478bd9Sstevel@tonic-gate 	squeue_t	*sqp = arg;
6737c478bd9Sstevel@tonic-gate 	uint_t		state;
6747c478bd9Sstevel@tonic-gate 
6757c478bd9Sstevel@tonic-gate 	mutex_enter(&sqp->sq_lock);
6767c478bd9Sstevel@tonic-gate 
6777c478bd9Sstevel@tonic-gate 	state = sqp->sq_state;
6787c478bd9Sstevel@tonic-gate 	if (sqp->sq_tid == 0 && !(state & SQS_TMO_PROG)) {
6797c478bd9Sstevel@tonic-gate 		mutex_exit(&sqp->sq_lock);
6807c478bd9Sstevel@tonic-gate 		return;
6817c478bd9Sstevel@tonic-gate 	}
6827c478bd9Sstevel@tonic-gate 
6837c478bd9Sstevel@tonic-gate 	sqp->sq_tid = 0;
6847c478bd9Sstevel@tonic-gate 	/*
6857c478bd9Sstevel@tonic-gate 	 * The timeout fired before we got a chance to set it.
6867c478bd9Sstevel@tonic-gate 	 * Process it anyway but remove the SQS_TMO_PROG so that
6877c478bd9Sstevel@tonic-gate 	 * the guy trying to set the timeout knows that it has
6887c478bd9Sstevel@tonic-gate 	 * already been processed.
6897c478bd9Sstevel@tonic-gate 	 */
6907c478bd9Sstevel@tonic-gate 	if (state & SQS_TMO_PROG)
6917c478bd9Sstevel@tonic-gate 		sqp->sq_state &= ~SQS_TMO_PROG;
6927c478bd9Sstevel@tonic-gate 
6937c478bd9Sstevel@tonic-gate 	if (!(state & SQS_PROC)) {
6947c478bd9Sstevel@tonic-gate 		sqp->sq_awaken = lbolt;
695da14cebeSEric Cheng 		cv_signal(&sqp->sq_worker_cv);
6967c478bd9Sstevel@tonic-gate 	}
6977c478bd9Sstevel@tonic-gate 	mutex_exit(&sqp->sq_lock);
6987c478bd9Sstevel@tonic-gate }
6997c478bd9Sstevel@tonic-gate 
7007c478bd9Sstevel@tonic-gate static void
701d19d6468Sbw squeue_drain(squeue_t *sqp, uint_t proc_type, hrtime_t expire)
7027c478bd9Sstevel@tonic-gate {
7037c478bd9Sstevel@tonic-gate 	mblk_t		*mp;
7047c478bd9Sstevel@tonic-gate 	mblk_t 		*head;
7057c478bd9Sstevel@tonic-gate 	sqproc_t 	proc;
7067c478bd9Sstevel@tonic-gate 	conn_t		*connp;
7077c478bd9Sstevel@tonic-gate 	timeout_id_t 	tid;
7087c478bd9Sstevel@tonic-gate 	ill_rx_ring_t	*sq_rx_ring = sqp->sq_rx_ring;
709d19d6468Sbw 	hrtime_t 	now;
710da14cebeSEric Cheng 	boolean_t	did_wakeup = B_FALSE;
711da14cebeSEric Cheng 	boolean_t	sq_poll_capable;
712*bd670b35SErik Nordmark 	ip_recv_attr_t	*ira, iras;
7137c478bd9Sstevel@tonic-gate 
714da14cebeSEric Cheng 	sq_poll_capable = (sqp->sq_state & SQS_POLL_CAPAB) != 0;
715da14cebeSEric Cheng again:
7167c478bd9Sstevel@tonic-gate 	ASSERT(mutex_owned(&sqp->sq_lock));
717da14cebeSEric Cheng 	ASSERT(!(sqp->sq_state & (SQS_POLL_THR_QUIESCED |
718da14cebeSEric Cheng 	    SQS_POLL_QUIESCE_DONE)));
7197c478bd9Sstevel@tonic-gate 
720da14cebeSEric Cheng 	head = sqp->sq_first;
721da14cebeSEric Cheng 	sqp->sq_first = NULL;
722da14cebeSEric Cheng 	sqp->sq_last = NULL;
723da14cebeSEric Cheng 	sqp->sq_count = 0;
7247c478bd9Sstevel@tonic-gate 
7257c478bd9Sstevel@tonic-gate 	if ((tid = sqp->sq_tid) != 0)
7267c478bd9Sstevel@tonic-gate 		sqp->sq_tid = 0;
7277c478bd9Sstevel@tonic-gate 
7287c478bd9Sstevel@tonic-gate 	sqp->sq_state |= SQS_PROC | proc_type;
729da14cebeSEric Cheng 
7307c478bd9Sstevel@tonic-gate 	/*
7317c478bd9Sstevel@tonic-gate 	 * We have backlog built up. Switch to polling mode if the
732da14cebeSEric Cheng 	 * device underneath allows it. Need to do it so that
733da14cebeSEric Cheng 	 * more packets don't come in and disturb us (by contending
734da14cebeSEric Cheng 	 * for sq_lock or higher priority thread preempting us).
735da14cebeSEric Cheng 	 *
736da14cebeSEric Cheng 	 * The worker thread is allowed to do active polling while we
737da14cebeSEric Cheng 	 * just disable the interrupts for drain by non worker (kernel
738da14cebeSEric Cheng 	 * or userland) threads so they can peacefully process the
739da14cebeSEric Cheng 	 * packets during time allocated to them.
7407c478bd9Sstevel@tonic-gate 	 */
741da14cebeSEric Cheng 	SQS_POLLING_ON(sqp, sq_poll_capable, sq_rx_ring);
7427c478bd9Sstevel@tonic-gate 	mutex_exit(&sqp->sq_lock);
7437c478bd9Sstevel@tonic-gate 
7447c478bd9Sstevel@tonic-gate 	if (tid != 0)
7457c478bd9Sstevel@tonic-gate 		(void) untimeout(tid);
746da14cebeSEric Cheng 
7477c478bd9Sstevel@tonic-gate 	while ((mp = head) != NULL) {
748da14cebeSEric Cheng 
7497c478bd9Sstevel@tonic-gate 		head = mp->b_next;
7507c478bd9Sstevel@tonic-gate 		mp->b_next = NULL;
7517c478bd9Sstevel@tonic-gate 
7527c478bd9Sstevel@tonic-gate 		proc = (sqproc_t)mp->b_queue;
7537c478bd9Sstevel@tonic-gate 		mp->b_queue = NULL;
7547c478bd9Sstevel@tonic-gate 		connp = (conn_t *)mp->b_prev;
7557c478bd9Sstevel@tonic-gate 		mp->b_prev = NULL;
7567c478bd9Sstevel@tonic-gate 
757*bd670b35SErik Nordmark 		/* Is there an ip_recv_attr_t to handle? */
758*bd670b35SErik Nordmark 		if (ip_recv_attr_is_mblk(mp)) {
759*bd670b35SErik Nordmark 			mblk_t	*attrmp = mp;
760*bd670b35SErik Nordmark 
761*bd670b35SErik Nordmark 			ASSERT(attrmp->b_cont != NULL);
762*bd670b35SErik Nordmark 
763*bd670b35SErik Nordmark 			mp = attrmp->b_cont;
764*bd670b35SErik Nordmark 			attrmp->b_cont = NULL;
765*bd670b35SErik Nordmark 			ASSERT(mp->b_queue == NULL);
766*bd670b35SErik Nordmark 			ASSERT(mp->b_prev == NULL);
767*bd670b35SErik Nordmark 
768*bd670b35SErik Nordmark 			if (!ip_recv_attr_from_mblk(attrmp, &iras)) {
769*bd670b35SErik Nordmark 				/* The ill or ip_stack_t disappeared on us */
770*bd670b35SErik Nordmark 				ip_drop_input("ip_recv_attr_from_mblk",
771*bd670b35SErik Nordmark 				    mp, NULL);
772*bd670b35SErik Nordmark 				ira_cleanup(&iras, B_TRUE);
773*bd670b35SErik Nordmark 				CONN_DEC_REF(connp);
774*bd670b35SErik Nordmark 				continue;
775*bd670b35SErik Nordmark 			}
776*bd670b35SErik Nordmark 			ira = &iras;
777*bd670b35SErik Nordmark 		} else {
778*bd670b35SErik Nordmark 			ira = NULL;
779*bd670b35SErik Nordmark 		}
780*bd670b35SErik Nordmark 
781*bd670b35SErik Nordmark 
782da14cebeSEric Cheng 		/*
783da14cebeSEric Cheng 		 * Handle squeue switching. More details in the
784da14cebeSEric Cheng 		 * block comment at the top of the file
785da14cebeSEric Cheng 		 */
786da14cebeSEric Cheng 		if (connp->conn_sqp == sqp) {
787da14cebeSEric Cheng 			SQUEUE_DBG_SET(sqp, mp, proc, connp,
788da14cebeSEric Cheng 			    mp->b_tag);
7897c478bd9Sstevel@tonic-gate 			connp->conn_on_sqp = B_TRUE;
7907c478bd9Sstevel@tonic-gate 			DTRACE_PROBE3(squeue__proc__start, squeue_t *,
7917c478bd9Sstevel@tonic-gate 			    sqp, mblk_t *, mp, conn_t *, connp);
792*bd670b35SErik Nordmark 			(*proc)(connp, mp, sqp, ira);
7937c478bd9Sstevel@tonic-gate 			DTRACE_PROBE2(squeue__proc__end, squeue_t *,
7947c478bd9Sstevel@tonic-gate 			    sqp, conn_t *, connp);
7957c478bd9Sstevel@tonic-gate 			connp->conn_on_sqp = B_FALSE;
7967c478bd9Sstevel@tonic-gate 			CONN_DEC_REF(connp);
797da14cebeSEric Cheng 		} else {
798*bd670b35SErik Nordmark 			SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp, ira,
799da14cebeSEric Cheng 			    SQ_FILL, SQTAG_SQUEUE_CHANGE);
800da14cebeSEric Cheng 		}
801*bd670b35SErik Nordmark 		if (ira != NULL)
802*bd670b35SErik Nordmark 			ira_cleanup(ira, B_TRUE);
8037c478bd9Sstevel@tonic-gate 	}
8047c478bd9Sstevel@tonic-gate 
805da14cebeSEric Cheng 	SQUEUE_DBG_CLEAR(sqp);
8067c478bd9Sstevel@tonic-gate 
8077c478bd9Sstevel@tonic-gate 	mutex_enter(&sqp->sq_lock);
8087c478bd9Sstevel@tonic-gate 
809da14cebeSEric Cheng 	/*
810da14cebeSEric Cheng 	 * Check if there is still work to do (either more arrived or timer
811da14cebeSEric Cheng 	 * expired). If we are the worker thread and we are polling capable,
812da14cebeSEric Cheng 	 * continue doing the work since no one else is around to do the
813da14cebeSEric Cheng 	 * work anyway (but signal the poll thread to retrieve some packets
814da14cebeSEric Cheng 	 * in the meanwhile). If we are not the worker thread, just
815da14cebeSEric Cheng 	 * signal the worker thread to take up the work if processing time
816da14cebeSEric Cheng 	 * has expired.
817da14cebeSEric Cheng 	 */
8187c478bd9Sstevel@tonic-gate 	if (sqp->sq_first != NULL) {
819da14cebeSEric Cheng 		/*
820da14cebeSEric Cheng 		 * Still more to process. If time quanta not expired, we
821da14cebeSEric Cheng 		 * should let the drain go on. The worker thread is allowed
822da14cebeSEric Cheng 		 * to drain as long as there is anything left.
823da14cebeSEric Cheng 		 */
824da14cebeSEric Cheng 		now = gethrtime();
825da14cebeSEric Cheng 		if ((now < expire) || (proc_type == SQS_WORKER)) {
826da14cebeSEric Cheng 			/*
827da14cebeSEric Cheng 			 * If time not expired or we are worker thread and
828da14cebeSEric Cheng 			 * this squeue is polling capable, continue to do
829da14cebeSEric Cheng 			 * the drain.
830da14cebeSEric Cheng 			 *
831da14cebeSEric Cheng 			 * We turn off interrupts for all userland threads
832da14cebeSEric Cheng 			 * doing drain but we do active polling only for
833da14cebeSEric Cheng 			 * worker thread.
834efe28d82SRajagopal Kunhappan 			 *
835efe28d82SRajagopal Kunhappan 			 * Calling SQS_POLL_RING() even in the case of
836efe28d82SRajagopal Kunhappan 			 * SQS_POLLING_ON() not succeeding is ok as
837efe28d82SRajagopal Kunhappan 			 * SQS_POLL_RING() will not wake up poll thread
838efe28d82SRajagopal Kunhappan 			 * if SQS_POLLING bit is not set.
839da14cebeSEric Cheng 			 */
840da14cebeSEric Cheng 			if (proc_type == SQS_WORKER)
841efe28d82SRajagopal Kunhappan 				SQS_POLL_RING(sqp);
842da14cebeSEric Cheng 			goto again;
843da14cebeSEric Cheng 		} else {
844da14cebeSEric Cheng 			did_wakeup = B_TRUE;
845da14cebeSEric Cheng 			sqp->sq_awaken = lbolt;
846da14cebeSEric Cheng 			cv_signal(&sqp->sq_worker_cv);
847da14cebeSEric Cheng 		}
848da14cebeSEric Cheng 	}
849da14cebeSEric Cheng 
850da14cebeSEric Cheng 	/*
851da14cebeSEric Cheng 	 * If the poll thread is already running, just return. The
852da14cebeSEric Cheng 	 * poll thread continues to hold the proc and will finish
853da14cebeSEric Cheng 	 * processing.
854da14cebeSEric Cheng 	 */
855da14cebeSEric Cheng 	if (sqp->sq_state & SQS_GET_PKTS) {
856da14cebeSEric Cheng 		ASSERT(!(sqp->sq_state & (SQS_POLL_THR_QUIESCED |
857da14cebeSEric Cheng 		    SQS_POLL_QUIESCE_DONE)));
858da14cebeSEric Cheng 		sqp->sq_state &= ~proc_type;
859da14cebeSEric Cheng 		return;
860da14cebeSEric Cheng 	}
861da14cebeSEric Cheng 
862da14cebeSEric Cheng 	/*
863da14cebeSEric Cheng 	 *
864da14cebeSEric Cheng 	 * If we are the worker thread and no work is left, send the poll
865da14cebeSEric Cheng 	 * thread down once more to see if something arrived. Otherwise,
866da14cebeSEric Cheng 	 * turn the interrupts back on and we are done.
867da14cebeSEric Cheng 	 */
868efe28d82SRajagopal Kunhappan 	if ((proc_type == SQS_WORKER) && (sqp->sq_state & SQS_POLLING)) {
869da14cebeSEric Cheng 		/*
870da14cebeSEric Cheng 		 * Do one last check to see if anything arrived
871da14cebeSEric Cheng 		 * in the NIC. We leave the SQS_PROC set to ensure
872da14cebeSEric Cheng 		 * that poll thread keeps the PROC and can decide
873da14cebeSEric Cheng 		 * if it needs to turn polling off or continue
874da14cebeSEric Cheng 		 * processing.
875da14cebeSEric Cheng 		 *
876da14cebeSEric Cheng 		 * If we drop the SQS_PROC here and poll thread comes
877da14cebeSEric Cheng 		 * up empty handed, it can not safely turn polling off
878da14cebeSEric Cheng 		 * since someone else could have acquired the PROC
879da14cebeSEric Cheng 		 * and started draining. The previously running poll
880da14cebeSEric Cheng 		 * thread and the current thread doing drain would end
881da14cebeSEric Cheng 		 * up in a race for turning polling on/off and more
882da14cebeSEric Cheng 		 * complex code would be required to deal with it.
883da14cebeSEric Cheng 		 *
884da14cebeSEric Cheng 		 * Its lot simpler for drain to hand the SQS_PROC to
885da14cebeSEric Cheng 		 * poll thread (if running) and let poll thread finish
886da14cebeSEric Cheng 		 * without worrying about racing with any other thread.
887da14cebeSEric Cheng 		 */
888da14cebeSEric Cheng 		ASSERT(!(sqp->sq_state & (SQS_POLL_THR_QUIESCED |
889da14cebeSEric Cheng 		    SQS_POLL_QUIESCE_DONE)));
890efe28d82SRajagopal Kunhappan 		SQS_POLL_RING(sqp);
891da14cebeSEric Cheng 		sqp->sq_state &= ~proc_type;
892da14cebeSEric Cheng 	} else {
893da14cebeSEric Cheng 		/*
894efe28d82SRajagopal Kunhappan 		 * The squeue is either not capable of polling or the
895efe28d82SRajagopal Kunhappan 		 * attempt to blank (i.e., turn SQS_POLLING_ON()) was
896efe28d82SRajagopal Kunhappan 		 * unsuccessful or poll thread already finished
897efe28d82SRajagopal Kunhappan 		 * processing and didn't find anything. Since there
898efe28d82SRajagopal Kunhappan 		 * is nothing queued and we already turn polling on
899efe28d82SRajagopal Kunhappan 		 * (for all threads doing drain), we should turn
900efe28d82SRajagopal Kunhappan 		 * polling off and relinquish the PROC.
901da14cebeSEric Cheng 		 */
902da14cebeSEric Cheng 		ASSERT(!(sqp->sq_state & (SQS_POLL_THR_QUIESCED |
903da14cebeSEric Cheng 		    SQS_POLL_QUIESCE_DONE)));
904da14cebeSEric Cheng 		SQS_POLLING_OFF(sqp, sq_poll_capable, sq_rx_ring);
905da14cebeSEric Cheng 		sqp->sq_state &= ~(SQS_PROC | proc_type);
906da14cebeSEric Cheng 		if (!did_wakeup && sqp->sq_first != NULL) {
907da14cebeSEric Cheng 			squeue_worker_wakeup(sqp);
908da14cebeSEric Cheng 			mutex_enter(&sqp->sq_lock);
909da14cebeSEric Cheng 		}
910da14cebeSEric Cheng 		/*
911da14cebeSEric Cheng 		 * If we are not the worker and there is a pending quiesce
912da14cebeSEric Cheng 		 * event, wake up the worker
913da14cebeSEric Cheng 		 */
914da14cebeSEric Cheng 		if ((proc_type != SQS_WORKER) &&
915da14cebeSEric Cheng 		    (sqp->sq_state & SQS_WORKER_THR_CONTROL))
916da14cebeSEric Cheng 			cv_signal(&sqp->sq_worker_cv);
917da14cebeSEric Cheng 	}
918da14cebeSEric Cheng }
919da14cebeSEric Cheng 
920da14cebeSEric Cheng /*
921da14cebeSEric Cheng  * Quiesce, Restart, or Cleanup of the squeue poll thread.
922da14cebeSEric Cheng  *
923da14cebeSEric Cheng  * Quiesce and Restart: After an squeue poll thread has been quiesced, it does
924da14cebeSEric Cheng  * not attempt to poll the underlying soft ring any more. The quiesce is
925da14cebeSEric Cheng  * triggered by the mac layer when it wants to quiesce a soft ring. Typically
926da14cebeSEric Cheng  * control operations such as changing the fanout of a NIC or VNIC (dladm
927da14cebeSEric Cheng  * setlinkprop) need to quiesce data flow before changing the wiring.
928da14cebeSEric Cheng  * The operation is done by the mac layer, but it calls back into IP to
929da14cebeSEric Cheng  * quiesce the soft ring. After completing the operation (say increase or
930da14cebeSEric Cheng  * decrease of the fanout) the mac layer then calls back into IP to restart
931da14cebeSEric Cheng  * the quiesced soft ring.
932da14cebeSEric Cheng  *
933da14cebeSEric Cheng  * Cleanup: This is triggered when the squeue binding to a soft ring is
934da14cebeSEric Cheng  * removed permanently. Typically interface plumb and unplumb would trigger
935da14cebeSEric Cheng  * this. It can also be triggered from the mac layer when a soft ring is
936da14cebeSEric Cheng  * being deleted say as the result of a fanout reduction. Since squeues are
937da14cebeSEric Cheng  * never deleted, the cleanup marks the squeue as fit for recycling and
938da14cebeSEric Cheng  * moves it to the zeroth squeue set.
939da14cebeSEric Cheng  */
940da14cebeSEric Cheng static void
941da14cebeSEric Cheng squeue_poll_thr_control(squeue_t *sqp)
942da14cebeSEric Cheng {
943da14cebeSEric Cheng 	if (sqp->sq_state & SQS_POLL_THR_RESTART) {
944da14cebeSEric Cheng 		/* Restart implies a previous quiesce */
945da14cebeSEric Cheng 		ASSERT(sqp->sq_state & SQS_POLL_THR_QUIESCED);
946da14cebeSEric Cheng 		sqp->sq_state &= ~(SQS_POLL_THR_QUIESCED |
947da14cebeSEric Cheng 		    SQS_POLL_THR_RESTART);
948da14cebeSEric Cheng 		sqp->sq_state |= SQS_POLL_CAPAB;
949da14cebeSEric Cheng 		cv_signal(&sqp->sq_worker_cv);
950da14cebeSEric Cheng 		return;
951da14cebeSEric Cheng 	}
952da14cebeSEric Cheng 
953da14cebeSEric Cheng 	if (sqp->sq_state & SQS_POLL_THR_QUIESCE) {
954da14cebeSEric Cheng 		sqp->sq_state |= SQS_POLL_THR_QUIESCED;
955da14cebeSEric Cheng 		sqp->sq_state &= ~SQS_POLL_THR_QUIESCE;
956da14cebeSEric Cheng 		cv_signal(&sqp->sq_worker_cv);
957da14cebeSEric Cheng 		return;
958da14cebeSEric Cheng 	}
959da14cebeSEric Cheng }
960da14cebeSEric Cheng 
961da14cebeSEric Cheng /*
962da14cebeSEric Cheng  * POLLING Notes
963da14cebeSEric Cheng  *
964da14cebeSEric Cheng  * With polling mode, we want to do as much processing as we possibly can
965da14cebeSEric Cheng  * in worker thread context. The sweet spot is worker thread keeps doing
966da14cebeSEric Cheng  * work all the time in polling mode and writers etc. keep dumping packets
967da14cebeSEric Cheng  * to worker thread. Occasionally, we send the poll thread (running at
968da14cebeSEric Cheng  * lower priority) to the NIC to get a chain of packets to feed to the worker.
969da14cebeSEric Cheng  * Sending the poll thread down to the NIC depends on 3 criteria:
970da14cebeSEric Cheng  *
971da14cebeSEric Cheng  * 1) Its always driven from squeue_drain and only if worker thread is
972da14cebeSEric Cheng  *	doing the drain.
973da14cebeSEric Cheng  * 2) We clear the backlog once and more packets arrived in between.
974da14cebeSEric Cheng  *	Before starting drain again, send the poll thread down if
975da14cebeSEric Cheng  *	the drain is being done by worker thread.
976da14cebeSEric Cheng  * 3) Before exiting the squeue_drain, if the poll thread is not already
977da14cebeSEric Cheng  *	working and we are the worker thread, try to poll one more time.
978da14cebeSEric Cheng  *
979da14cebeSEric Cheng  * For latency sake, we do allow any thread calling squeue_enter
980da14cebeSEric Cheng  * to process its packet provided:
981da14cebeSEric Cheng  *
982da14cebeSEric Cheng  * 1) Nothing is queued
983da14cebeSEric Cheng  * 2) If more packets arrived in between, the non worker thread are allowed
984da14cebeSEric Cheng  *	to do the drain till their time quanta expired provided SQS_GET_PKTS
985da14cebeSEric Cheng  *	wasn't set in between.
986da14cebeSEric Cheng  *
987da14cebeSEric Cheng  * Avoiding deadlocks with interrupts
988da14cebeSEric Cheng  * ==================================
989da14cebeSEric Cheng  *
990da14cebeSEric Cheng  * One of the big problems is that we can't send poll_thr down while holding
991da14cebeSEric Cheng  * the sq_lock since the thread can block. So we drop the sq_lock before
992da14cebeSEric Cheng  * calling sq_get_pkts(). We keep holding the SQS_PROC as long as the
993da14cebeSEric Cheng  * poll thread is running so that no other thread can acquire the
994da14cebeSEric Cheng  * perimeter in between. If the squeue_drain gets done (no more work
995da14cebeSEric Cheng  * left), it leaves the SQS_PROC set if poll thread is running.
996da14cebeSEric Cheng  */
997da14cebeSEric Cheng 
998da14cebeSEric Cheng /*
999da14cebeSEric Cheng  * This is the squeue poll thread. In poll mode, it polls the underlying
1000da14cebeSEric Cheng  * TCP softring and feeds packets into the squeue. The worker thread then
1001da14cebeSEric Cheng  * drains the squeue. The poll thread also responds to control signals for
1002da14cebeSEric Cheng  * quiesceing, restarting, or cleanup of an squeue. These are driven by
1003da14cebeSEric Cheng  * control operations like plumb/unplumb or as a result of dynamic Rx ring
1004da14cebeSEric Cheng  * related operations that are driven from the mac layer.
1005da14cebeSEric Cheng  */
1006da14cebeSEric Cheng static void
1007da14cebeSEric Cheng squeue_polling_thread(squeue_t *sqp)
1008da14cebeSEric Cheng {
1009da14cebeSEric Cheng 	kmutex_t *lock = &sqp->sq_lock;
1010da14cebeSEric Cheng 	kcondvar_t *async = &sqp->sq_poll_cv;
1011da14cebeSEric Cheng 	ip_mac_rx_t sq_get_pkts;
1012da14cebeSEric Cheng 	ip_accept_t ip_accept;
1013da14cebeSEric Cheng 	ill_rx_ring_t *sq_rx_ring;
1014da14cebeSEric Cheng 	ill_t *sq_ill;
1015da14cebeSEric Cheng 	mblk_t *head, *tail, *mp;
1016da14cebeSEric Cheng 	uint_t cnt;
1017da14cebeSEric Cheng 	void *sq_mac_handle;
1018da14cebeSEric Cheng 	callb_cpr_t cprinfo;
1019da14cebeSEric Cheng 	size_t bytes_to_pickup;
1020da14cebeSEric Cheng 	uint32_t ctl_state;
1021da14cebeSEric Cheng 
1022da14cebeSEric Cheng 	CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "sq_poll");
1023da14cebeSEric Cheng 	mutex_enter(lock);
1024da14cebeSEric Cheng 
1025da14cebeSEric Cheng 	for (;;) {
1026da14cebeSEric Cheng 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
1027da14cebeSEric Cheng 		cv_wait(async, lock);
1028da14cebeSEric Cheng 		CALLB_CPR_SAFE_END(&cprinfo, lock);
1029da14cebeSEric Cheng 
1030da14cebeSEric Cheng 		ctl_state = sqp->sq_state & (SQS_POLL_THR_CONTROL |
1031da14cebeSEric Cheng 		    SQS_POLL_THR_QUIESCED);
1032da14cebeSEric Cheng 		if (ctl_state != 0) {
1033da14cebeSEric Cheng 			/*
1034da14cebeSEric Cheng 			 * If the squeue is quiesced, then wait for a control
1035da14cebeSEric Cheng 			 * request. A quiesced squeue must not poll the
1036da14cebeSEric Cheng 			 * underlying soft ring.
1037da14cebeSEric Cheng 			 */
1038da14cebeSEric Cheng 			if (ctl_state == SQS_POLL_THR_QUIESCED)
1039da14cebeSEric Cheng 				continue;
1040da14cebeSEric Cheng 			/*
1041da14cebeSEric Cheng 			 * Act on control requests to quiesce, cleanup or
1042da14cebeSEric Cheng 			 * restart an squeue
1043da14cebeSEric Cheng 			 */
1044da14cebeSEric Cheng 			squeue_poll_thr_control(sqp);
1045da14cebeSEric Cheng 			continue;
1046da14cebeSEric Cheng 		}
1047da14cebeSEric Cheng 
1048da14cebeSEric Cheng 		if (!(sqp->sq_state & SQS_POLL_CAPAB))
1049da14cebeSEric Cheng 			continue;
1050da14cebeSEric Cheng 
1051da14cebeSEric Cheng 		ASSERT((sqp->sq_state &
1052da14cebeSEric Cheng 		    (SQS_PROC|SQS_POLLING|SQS_GET_PKTS)) ==
1053da14cebeSEric Cheng 		    (SQS_PROC|SQS_POLLING|SQS_GET_PKTS));
1054da14cebeSEric Cheng 
1055da14cebeSEric Cheng poll_again:
1056da14cebeSEric Cheng 		sq_rx_ring = sqp->sq_rx_ring;
1057da14cebeSEric Cheng 		sq_get_pkts = sq_rx_ring->rr_rx;
1058da14cebeSEric Cheng 		sq_mac_handle = sq_rx_ring->rr_rx_handle;
1059da14cebeSEric Cheng 		ip_accept = sq_rx_ring->rr_ip_accept;
1060da14cebeSEric Cheng 		sq_ill = sq_rx_ring->rr_ill;
1061da14cebeSEric Cheng 		bytes_to_pickup = MAX_BYTES_TO_PICKUP;
1062da14cebeSEric Cheng 		mutex_exit(lock);
1063da14cebeSEric Cheng 		head = sq_get_pkts(sq_mac_handle, bytes_to_pickup);
1064da14cebeSEric Cheng 		mp = NULL;
1065da14cebeSEric Cheng 		if (head != NULL) {
1066da14cebeSEric Cheng 			/*
1067da14cebeSEric Cheng 			 * We got the packet chain from the mac layer. It
1068da14cebeSEric Cheng 			 * would be nice to be able to process it inline
1069da14cebeSEric Cheng 			 * for better performance but we need to give
1070da14cebeSEric Cheng 			 * IP a chance to look at this chain to ensure
1071da14cebeSEric Cheng 			 * that packets are really meant for this squeue
1072da14cebeSEric Cheng 			 * and do the IP processing.
1073da14cebeSEric Cheng 			 */
1074da14cebeSEric Cheng 			mp = ip_accept(sq_ill, sq_rx_ring, sqp, head,
1075da14cebeSEric Cheng 			    &tail, &cnt);
1076da14cebeSEric Cheng 		}
1077da14cebeSEric Cheng 		mutex_enter(lock);
1078*bd670b35SErik Nordmark 		if (mp != NULL) {
1079*bd670b35SErik Nordmark 			/*
1080*bd670b35SErik Nordmark 			 * The ip_accept function has already added an
1081*bd670b35SErik Nordmark 			 * ip_recv_attr_t mblk if that is needed.
1082*bd670b35SErik Nordmark 			 */
1083da14cebeSEric Cheng 			ENQUEUE_CHAIN(sqp, mp, tail, cnt);
1084*bd670b35SErik Nordmark 		}
1085da14cebeSEric Cheng 		ASSERT((sqp->sq_state &
1086da14cebeSEric Cheng 		    (SQS_PROC|SQS_POLLING|SQS_GET_PKTS)) ==
1087da14cebeSEric Cheng 		    (SQS_PROC|SQS_POLLING|SQS_GET_PKTS));
1088da14cebeSEric Cheng 
1089da14cebeSEric Cheng 		if (sqp->sq_first != NULL && !(sqp->sq_state & SQS_WORKER)) {
1090da14cebeSEric Cheng 			/*
1091da14cebeSEric Cheng 			 * We have packets to process and worker thread
1092da14cebeSEric Cheng 			 * is not running.  Check to see if poll thread is
1093da14cebeSEric Cheng 			 * allowed to process. Let it do processing only if it
1094da14cebeSEric Cheng 			 * picked up some packets from the NIC otherwise
1095da14cebeSEric Cheng 			 * wakeup the worker thread.
1096da14cebeSEric Cheng 			 */
1097da14cebeSEric Cheng 			if (mp != NULL) {
1098da14cebeSEric Cheng 				hrtime_t  now;
1099d19d6468Sbw 
1100d19d6468Sbw 				now = gethrtime();
1101da14cebeSEric Cheng 				sqp->sq_run = curthread;
1102da14cebeSEric Cheng 				sqp->sq_drain(sqp, SQS_POLL_PROC, now +
1103da14cebeSEric Cheng 				    squeue_drain_ns);
1104da14cebeSEric Cheng 				sqp->sq_run = NULL;
1105da14cebeSEric Cheng 
1106da14cebeSEric Cheng 				if (sqp->sq_first == NULL)
1107da14cebeSEric Cheng 					goto poll_again;
11087c478bd9Sstevel@tonic-gate 
11097c478bd9Sstevel@tonic-gate 				/*
1110da14cebeSEric Cheng 				 * Couldn't do the entire drain because the
1111da14cebeSEric Cheng 				 * time limit expired, let the
1112da14cebeSEric Cheng 				 * worker thread take over.
11137c478bd9Sstevel@tonic-gate 				 */
1114da14cebeSEric Cheng 			}
1115da14cebeSEric Cheng 
11167c478bd9Sstevel@tonic-gate 			sqp->sq_awaken = lbolt;
1117da14cebeSEric Cheng 			/*
1118da14cebeSEric Cheng 			 * Put the SQS_PROC_HELD on so the worker
1119da14cebeSEric Cheng 			 * thread can distinguish where its called from. We
1120da14cebeSEric Cheng 			 * can remove the SQS_PROC flag here and turn off the
1121da14cebeSEric Cheng 			 * polling so that it wouldn't matter who gets the
1122da14cebeSEric Cheng 			 * processing but we get better performance this way
1123da14cebeSEric Cheng 			 * and save the cost of turn polling off and possibly
1124da14cebeSEric Cheng 			 * on again as soon as we start draining again.
1125da14cebeSEric Cheng 			 *
1126da14cebeSEric Cheng 			 * We can't remove the SQS_PROC flag without turning
1127da14cebeSEric Cheng 			 * polling off until we can guarantee that control
1128da14cebeSEric Cheng 			 * will return to squeue_drain immediately.
1129da14cebeSEric Cheng 			 */
1130da14cebeSEric Cheng 			sqp->sq_state |= SQS_PROC_HELD;
1131da14cebeSEric Cheng 			sqp->sq_state &= ~SQS_GET_PKTS;
1132da14cebeSEric Cheng 			cv_signal(&sqp->sq_worker_cv);
1133da14cebeSEric Cheng 		} else if (sqp->sq_first == NULL &&
1134da14cebeSEric Cheng 		    !(sqp->sq_state & SQS_WORKER)) {
1135da14cebeSEric Cheng 			/*
1136da14cebeSEric Cheng 			 * Nothing queued and worker thread not running.
1137da14cebeSEric Cheng 			 * Since we hold the proc, no other thread is
1138da14cebeSEric Cheng 			 * processing the squeue. This means that there
1139da14cebeSEric Cheng 			 * is no work to be done and nothing is queued
1140da14cebeSEric Cheng 			 * in squeue or in NIC. Turn polling off and go
1141da14cebeSEric Cheng 			 * back to interrupt mode.
1142da14cebeSEric Cheng 			 */
1143da14cebeSEric Cheng 			sqp->sq_state &= ~(SQS_PROC|SQS_GET_PKTS);
1144da14cebeSEric Cheng 			/* LINTED: constant in conditional context */
1145da14cebeSEric Cheng 			SQS_POLLING_OFF(sqp, B_TRUE, sq_rx_ring);
11464cc34124SThirumalai Srinivasan 
11474cc34124SThirumalai Srinivasan 			/*
11484cc34124SThirumalai Srinivasan 			 * If there is a pending control operation
11494cc34124SThirumalai Srinivasan 			 * wake up the worker, since it is currently
11504cc34124SThirumalai Srinivasan 			 * not running.
11514cc34124SThirumalai Srinivasan 			 */
11524cc34124SThirumalai Srinivasan 			if (sqp->sq_state & SQS_WORKER_THR_CONTROL)
11534cc34124SThirumalai Srinivasan 				cv_signal(&sqp->sq_worker_cv);
1154da14cebeSEric Cheng 		} else {
1155da14cebeSEric Cheng 			/*
1156da14cebeSEric Cheng 			 * Worker thread is already running. We don't need
1157da14cebeSEric Cheng 			 * to do anything. Indicate that poll thread is done.
1158da14cebeSEric Cheng 			 */
1159da14cebeSEric Cheng 			sqp->sq_state &= ~SQS_GET_PKTS;
1160da14cebeSEric Cheng 		}
1161da14cebeSEric Cheng 		if (sqp->sq_state & SQS_POLL_THR_CONTROL) {
1162da14cebeSEric Cheng 			/*
1163da14cebeSEric Cheng 			 * Act on control requests to quiesce, cleanup or
1164da14cebeSEric Cheng 			 * restart an squeue
1165da14cebeSEric Cheng 			 */
1166da14cebeSEric Cheng 			squeue_poll_thr_control(sqp);
1167da14cebeSEric Cheng 		}
11687c478bd9Sstevel@tonic-gate 	}
11697c478bd9Sstevel@tonic-gate }
11707c478bd9Sstevel@tonic-gate 
11717c478bd9Sstevel@tonic-gate /*
1172da14cebeSEric Cheng  * The squeue worker thread acts on any control requests to quiesce, cleanup
1173da14cebeSEric Cheng  * or restart an ill_rx_ring_t by calling this function. The worker thread
1174da14cebeSEric Cheng  * synchronizes with the squeue poll thread to complete the request and finally
1175da14cebeSEric Cheng  * wakes up the requestor when the request is completed.
11767c478bd9Sstevel@tonic-gate  */
1177da14cebeSEric Cheng static void
1178da14cebeSEric Cheng squeue_worker_thr_control(squeue_t *sqp)
1179da14cebeSEric Cheng {
1180da14cebeSEric Cheng 	ill_t	*ill;
1181da14cebeSEric Cheng 	ill_rx_ring_t	*rx_ring;
11827c478bd9Sstevel@tonic-gate 
1183da14cebeSEric Cheng 	ASSERT(MUTEX_HELD(&sqp->sq_lock));
1184da14cebeSEric Cheng 
1185da14cebeSEric Cheng 	if (sqp->sq_state & SQS_POLL_RESTART) {
1186da14cebeSEric Cheng 		/* Restart implies a previous quiesce. */
1187da14cebeSEric Cheng 		ASSERT((sqp->sq_state & (SQS_PROC_HELD |
1188da14cebeSEric Cheng 		    SQS_POLL_QUIESCE_DONE | SQS_PROC | SQS_WORKER)) ==
1189da14cebeSEric Cheng 		    (SQS_POLL_QUIESCE_DONE | SQS_PROC | SQS_WORKER));
1190da14cebeSEric Cheng 		/*
1191da14cebeSEric Cheng 		 * Request the squeue poll thread to restart and wait till
1192da14cebeSEric Cheng 		 * it actually restarts.
1193da14cebeSEric Cheng 		 */
1194da14cebeSEric Cheng 		sqp->sq_state &= ~SQS_POLL_QUIESCE_DONE;
1195da14cebeSEric Cheng 		sqp->sq_state |= SQS_POLL_THR_RESTART;
1196da14cebeSEric Cheng 		cv_signal(&sqp->sq_poll_cv);
1197da14cebeSEric Cheng 		while (sqp->sq_state & SQS_POLL_THR_QUIESCED)
1198da14cebeSEric Cheng 			cv_wait(&sqp->sq_worker_cv, &sqp->sq_lock);
1199da14cebeSEric Cheng 		sqp->sq_state &= ~(SQS_POLL_RESTART | SQS_PROC |
1200da14cebeSEric Cheng 		    SQS_WORKER);
1201da14cebeSEric Cheng 		/*
1202da14cebeSEric Cheng 		 * Signal any waiter that is waiting for the restart
1203da14cebeSEric Cheng 		 * to complete
1204da14cebeSEric Cheng 		 */
1205da14cebeSEric Cheng 		sqp->sq_state |= SQS_POLL_RESTART_DONE;
1206da14cebeSEric Cheng 		cv_signal(&sqp->sq_ctrlop_done_cv);
1207da14cebeSEric Cheng 		return;
1208da14cebeSEric Cheng 	}
1209da14cebeSEric Cheng 
1210da14cebeSEric Cheng 	if (sqp->sq_state & SQS_PROC_HELD) {
1211da14cebeSEric Cheng 		/* The squeue poll thread handed control to us */
1212da14cebeSEric Cheng 		ASSERT(sqp->sq_state & SQS_PROC);
1213da14cebeSEric Cheng 	}
12147c478bd9Sstevel@tonic-gate 
12157c478bd9Sstevel@tonic-gate 	/*
1216da14cebeSEric Cheng 	 * Prevent any other thread from processing the squeue
1217da14cebeSEric Cheng 	 * until we finish the control actions by setting SQS_PROC.
1218da14cebeSEric Cheng 	 * But allow ourself to reenter by setting SQS_WORKER
12197c478bd9Sstevel@tonic-gate 	 */
1220da14cebeSEric Cheng 	sqp->sq_state |= (SQS_PROC | SQS_WORKER);
1221da14cebeSEric Cheng 
1222da14cebeSEric Cheng 	/* Signal the squeue poll thread and wait for it to quiesce itself */
1223da14cebeSEric Cheng 	if (!(sqp->sq_state & SQS_POLL_THR_QUIESCED)) {
1224da14cebeSEric Cheng 		sqp->sq_state |= SQS_POLL_THR_QUIESCE;
1225da14cebeSEric Cheng 		cv_signal(&sqp->sq_poll_cv);
1226da14cebeSEric Cheng 		while (!(sqp->sq_state & SQS_POLL_THR_QUIESCED))
1227da14cebeSEric Cheng 			cv_wait(&sqp->sq_worker_cv, &sqp->sq_lock);
1228da14cebeSEric Cheng 	}
1229da14cebeSEric Cheng 
1230da14cebeSEric Cheng 	rx_ring = sqp->sq_rx_ring;
1231da14cebeSEric Cheng 	ill = rx_ring->rr_ill;
1232da14cebeSEric Cheng 	/*
1233da14cebeSEric Cheng 	 * The lock hierarchy is as follows.
1234da14cebeSEric Cheng 	 * cpu_lock -> ill_lock -> sqset_lock -> sq_lock
1235da14cebeSEric Cheng 	 */
1236da14cebeSEric Cheng 	mutex_exit(&sqp->sq_lock);
1237da14cebeSEric Cheng 	mutex_enter(&ill->ill_lock);
1238da14cebeSEric Cheng 	mutex_enter(&sqp->sq_lock);
1239da14cebeSEric Cheng 
1240da14cebeSEric Cheng 	SQS_POLLING_OFF(sqp, (sqp->sq_state & SQS_POLL_CAPAB) != 0,
1241da14cebeSEric Cheng 	    sqp->sq_rx_ring);
1242da14cebeSEric Cheng 	sqp->sq_state &= ~(SQS_POLL_CAPAB | SQS_GET_PKTS | SQS_PROC_HELD);
1243da14cebeSEric Cheng 	if (sqp->sq_state & SQS_POLL_CLEANUP) {
1244da14cebeSEric Cheng 		/*
1245da14cebeSEric Cheng 		 * Disassociate this squeue from its ill_rx_ring_t.
1246da14cebeSEric Cheng 		 * The rr_sqp, sq_rx_ring fields are protected by the
1247da14cebeSEric Cheng 		 * corresponding squeue, ill_lock* and sq_lock. Holding any
1248da14cebeSEric Cheng 		 * of them will ensure that the ring to squeue mapping does
1249da14cebeSEric Cheng 		 * not change.
1250da14cebeSEric Cheng 		 */
1251da14cebeSEric Cheng 		ASSERT(!(sqp->sq_state & SQS_DEFAULT));
1252da14cebeSEric Cheng 
1253da14cebeSEric Cheng 		sqp->sq_rx_ring = NULL;
1254da14cebeSEric Cheng 		rx_ring->rr_sqp = NULL;
1255da14cebeSEric Cheng 
1256da14cebeSEric Cheng 		sqp->sq_state &= ~(SQS_POLL_CLEANUP | SQS_POLL_THR_QUIESCED |
1257da14cebeSEric Cheng 		    SQS_POLL_QUIESCE_DONE);
1258da14cebeSEric Cheng 		sqp->sq_ill = NULL;
1259da14cebeSEric Cheng 
1260da14cebeSEric Cheng 		rx_ring->rr_rx_handle = NULL;
1261da14cebeSEric Cheng 		rx_ring->rr_intr_handle = NULL;
1262da14cebeSEric Cheng 		rx_ring->rr_intr_enable = NULL;
1263da14cebeSEric Cheng 		rx_ring->rr_intr_disable = NULL;
1264da14cebeSEric Cheng 		sqp->sq_state |= SQS_POLL_CLEANUP_DONE;
1265da14cebeSEric Cheng 	} else {
1266da14cebeSEric Cheng 		sqp->sq_state &= ~SQS_POLL_QUIESCE;
1267da14cebeSEric Cheng 		sqp->sq_state |= SQS_POLL_QUIESCE_DONE;
1268da14cebeSEric Cheng 	}
1269da14cebeSEric Cheng 	/*
1270da14cebeSEric Cheng 	 * Signal any waiter that is waiting for the quiesce or cleanup
1271da14cebeSEric Cheng 	 * to complete and also wait for it to actually see and reset the
1272da14cebeSEric Cheng 	 * SQS_POLL_CLEANUP_DONE.
1273da14cebeSEric Cheng 	 */
1274da14cebeSEric Cheng 	cv_signal(&sqp->sq_ctrlop_done_cv);
1275da14cebeSEric Cheng 	mutex_exit(&ill->ill_lock);
1276da14cebeSEric Cheng 	if (sqp->sq_state & SQS_POLL_CLEANUP_DONE) {
1277da14cebeSEric Cheng 		cv_wait(&sqp->sq_worker_cv, &sqp->sq_lock);
1278da14cebeSEric Cheng 		sqp->sq_state &= ~(SQS_PROC | SQS_WORKER);
12797c478bd9Sstevel@tonic-gate 	}
12807c478bd9Sstevel@tonic-gate }
12817c478bd9Sstevel@tonic-gate 
12827c478bd9Sstevel@tonic-gate static void
12837c478bd9Sstevel@tonic-gate squeue_worker(squeue_t *sqp)
12847c478bd9Sstevel@tonic-gate {
12857c478bd9Sstevel@tonic-gate 	kmutex_t *lock = &sqp->sq_lock;
1286da14cebeSEric Cheng 	kcondvar_t *async = &sqp->sq_worker_cv;
12877c478bd9Sstevel@tonic-gate 	callb_cpr_t cprinfo;
1288d19d6468Sbw 	hrtime_t now;
12897c478bd9Sstevel@tonic-gate 
1290da14cebeSEric Cheng 	CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "sq_worker");
12917c478bd9Sstevel@tonic-gate 	mutex_enter(lock);
12927c478bd9Sstevel@tonic-gate 
12937c478bd9Sstevel@tonic-gate 	for (;;) {
1294da14cebeSEric Cheng 		for (;;) {
1295da14cebeSEric Cheng 			/*
1296da14cebeSEric Cheng 			 * If the poll thread has handed control to us
1297da14cebeSEric Cheng 			 * we need to break out of the wait.
1298da14cebeSEric Cheng 			 */
1299da14cebeSEric Cheng 			if (sqp->sq_state & SQS_PROC_HELD)
1300da14cebeSEric Cheng 				break;
1301da14cebeSEric Cheng 
1302da14cebeSEric Cheng 			/*
1303da14cebeSEric Cheng 			 * If the squeue is not being processed and we either
1304da14cebeSEric Cheng 			 * have messages to drain or some thread has signaled
1305da14cebeSEric Cheng 			 * some control activity we need to break
1306da14cebeSEric Cheng 			 */
1307da14cebeSEric Cheng 			if (!(sqp->sq_state & SQS_PROC) &&
1308da14cebeSEric Cheng 			    ((sqp->sq_state & SQS_WORKER_THR_CONTROL) ||
1309da14cebeSEric Cheng 			    (sqp->sq_first != NULL)))
1310da14cebeSEric Cheng 				break;
1311da14cebeSEric Cheng 
1312da14cebeSEric Cheng 			/*
1313da14cebeSEric Cheng 			 * If we have started some control action, then check
1314da14cebeSEric Cheng 			 * for the SQS_WORKER flag (since we don't
1315da14cebeSEric Cheng 			 * release the squeue) to make sure we own the squeue
1316da14cebeSEric Cheng 			 * and break out
1317da14cebeSEric Cheng 			 */
1318da14cebeSEric Cheng 			if ((sqp->sq_state & SQS_WORKER_THR_CONTROL) &&
1319da14cebeSEric Cheng 			    (sqp->sq_state & SQS_WORKER))
1320da14cebeSEric Cheng 				break;
1321da14cebeSEric Cheng 
13227c478bd9Sstevel@tonic-gate 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
13237c478bd9Sstevel@tonic-gate 			cv_wait(async, lock);
13247c478bd9Sstevel@tonic-gate 			CALLB_CPR_SAFE_END(&cprinfo, lock);
13257c478bd9Sstevel@tonic-gate 		}
1326da14cebeSEric Cheng 		if (sqp->sq_state & SQS_WORKER_THR_CONTROL) {
1327da14cebeSEric Cheng 			squeue_worker_thr_control(sqp);
1328da14cebeSEric Cheng 			continue;
13297c478bd9Sstevel@tonic-gate 		}
1330da14cebeSEric Cheng 		ASSERT(!(sqp->sq_state & (SQS_POLL_THR_QUIESCED |
1331da14cebeSEric Cheng 		    SQS_POLL_CLEANUP_DONE | SQS_POLL_QUIESCE_DONE |
1332da14cebeSEric Cheng 		    SQS_WORKER_THR_CONTROL | SQS_POLL_THR_CONTROL)));
13337c478bd9Sstevel@tonic-gate 
1334da14cebeSEric Cheng 		if (sqp->sq_state & SQS_PROC_HELD)
1335da14cebeSEric Cheng 			sqp->sq_state &= ~SQS_PROC_HELD;
1336da14cebeSEric Cheng 
1337d19d6468Sbw 		now = gethrtime();
13387c478bd9Sstevel@tonic-gate 		sqp->sq_run = curthread;
1339da14cebeSEric Cheng 		sqp->sq_drain(sqp, SQS_WORKER, now +  squeue_drain_ns);
13407c478bd9Sstevel@tonic-gate 		sqp->sq_run = NULL;
13417c478bd9Sstevel@tonic-gate 	}
13427c478bd9Sstevel@tonic-gate }
13437c478bd9Sstevel@tonic-gate 
13447c478bd9Sstevel@tonic-gate uintptr_t *
13457c478bd9Sstevel@tonic-gate squeue_getprivate(squeue_t *sqp, sqprivate_t p)
13467c478bd9Sstevel@tonic-gate {
13477c478bd9Sstevel@tonic-gate 	ASSERT(p < SQPRIVATE_MAX);
13487c478bd9Sstevel@tonic-gate 
13497c478bd9Sstevel@tonic-gate 	return (&sqp->sq_private[p]);
13507c478bd9Sstevel@tonic-gate }
13510f1702c5SYu Xiangning 
13520f1702c5SYu Xiangning /* ARGSUSED */
13530f1702c5SYu Xiangning void
1354*bd670b35SErik Nordmark squeue_wakeup_conn(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
13550f1702c5SYu Xiangning {
13560f1702c5SYu Xiangning 	conn_t *connp = (conn_t *)arg;
13570f1702c5SYu Xiangning 	squeue_t *sqp = connp->conn_sqp;
13580f1702c5SYu Xiangning 
13590f1702c5SYu Xiangning 	/*
13600f1702c5SYu Xiangning 	 * Mark the squeue as paused before waking up the thread stuck
13610f1702c5SYu Xiangning 	 * in squeue_synch_enter().
13620f1702c5SYu Xiangning 	 */
13630f1702c5SYu Xiangning 	mutex_enter(&sqp->sq_lock);
13640f1702c5SYu Xiangning 	sqp->sq_state |= SQS_PAUSE;
13650f1702c5SYu Xiangning 
13660f1702c5SYu Xiangning 	/*
13670f1702c5SYu Xiangning 	 * Notify the thread that it's OK to proceed; that is done by
13680f1702c5SYu Xiangning 	 * clearing the MSGWAITSYNC flag. The synch thread will free the mblk.
13690f1702c5SYu Xiangning 	 */
13700f1702c5SYu Xiangning 	ASSERT(mp->b_flag & MSGWAITSYNC);
13710f1702c5SYu Xiangning 	mp->b_flag &= ~MSGWAITSYNC;
13720f1702c5SYu Xiangning 	cv_broadcast(&connp->conn_sq_cv);
13730f1702c5SYu Xiangning 
13740f1702c5SYu Xiangning 	/*
13750f1702c5SYu Xiangning 	 * We are doing something on behalf of another thread, so we have to
13760f1702c5SYu Xiangning 	 * pause and wait until it finishes.
13770f1702c5SYu Xiangning 	 */
13780f1702c5SYu Xiangning 	while (sqp->sq_state & SQS_PAUSE) {
13790f1702c5SYu Xiangning 		cv_wait(&sqp->sq_synch_cv, &sqp->sq_lock);
13800f1702c5SYu Xiangning 	}
13810f1702c5SYu Xiangning 	mutex_exit(&sqp->sq_lock);
13820f1702c5SYu Xiangning }
13830f1702c5SYu Xiangning 
13840f1702c5SYu Xiangning int
1385f3124163SAnders Persson squeue_synch_enter(squeue_t *sqp, conn_t *connp, mblk_t *use_mp)
13860f1702c5SYu Xiangning {
13870f1702c5SYu Xiangning 	mutex_enter(&sqp->sq_lock);
13880f1702c5SYu Xiangning 	if (sqp->sq_first == NULL && !(sqp->sq_state & SQS_PROC)) {
13890f1702c5SYu Xiangning 		/*
13900f1702c5SYu Xiangning 		 * We are OK to proceed if the squeue is empty, and
13910f1702c5SYu Xiangning 		 * no one owns the squeue.
13920f1702c5SYu Xiangning 		 *
13930f1702c5SYu Xiangning 		 * The caller won't own the squeue as this is called from the
13940f1702c5SYu Xiangning 		 * application.
13950f1702c5SYu Xiangning 		 */
13960f1702c5SYu Xiangning 		ASSERT(sqp->sq_run == NULL);
13970f1702c5SYu Xiangning 
13980f1702c5SYu Xiangning 		sqp->sq_state |= SQS_PROC;
13990f1702c5SYu Xiangning 		sqp->sq_run = curthread;
14000f1702c5SYu Xiangning 		mutex_exit(&sqp->sq_lock);
14010f1702c5SYu Xiangning 
14020f1702c5SYu Xiangning #if SQUEUE_DEBUG
14030f1702c5SYu Xiangning 		sqp->sq_curmp = NULL;
14040f1702c5SYu Xiangning 		sqp->sq_curproc = NULL;
14050f1702c5SYu Xiangning 		sqp->sq_connp = connp;
14060f1702c5SYu Xiangning #endif
14070f1702c5SYu Xiangning 		connp->conn_on_sqp = B_TRUE;
14080f1702c5SYu Xiangning 		return (0);
14090f1702c5SYu Xiangning 	} else {
14100f1702c5SYu Xiangning 		mblk_t  *mp;
14110f1702c5SYu Xiangning 
1412f3124163SAnders Persson 		mp = (use_mp == NULL) ? allocb(0, BPRI_MED) : use_mp;
14130f1702c5SYu Xiangning 		if (mp == NULL) {
14140f1702c5SYu Xiangning 			mutex_exit(&sqp->sq_lock);
14150f1702c5SYu Xiangning 			return (ENOMEM);
14160f1702c5SYu Xiangning 		}
14170f1702c5SYu Xiangning 
14180f1702c5SYu Xiangning 		/*
14190f1702c5SYu Xiangning 		 * We mark the mblk as awaiting synchronous squeue access
14200f1702c5SYu Xiangning 		 * by setting the MSGWAITSYNC flag. Once squeue_wakeup_conn
14210f1702c5SYu Xiangning 		 * fires, MSGWAITSYNC is cleared, at which point we know we
14220f1702c5SYu Xiangning 		 * have exclusive access.
14230f1702c5SYu Xiangning 		 */
14240f1702c5SYu Xiangning 		mp->b_flag |= MSGWAITSYNC;
14250f1702c5SYu Xiangning 
14260f1702c5SYu Xiangning 		CONN_INC_REF(connp);
14270f1702c5SYu Xiangning 		SET_SQUEUE(mp, squeue_wakeup_conn, connp);
14280f1702c5SYu Xiangning 		ENQUEUE_CHAIN(sqp, mp, mp, 1);
14290f1702c5SYu Xiangning 
14300f1702c5SYu Xiangning 		ASSERT(sqp->sq_run != curthread);
14310f1702c5SYu Xiangning 
14320f1702c5SYu Xiangning 		/* Wait until the enqueued mblk get processed. */
14330f1702c5SYu Xiangning 		while (mp->b_flag & MSGWAITSYNC)
14340f1702c5SYu Xiangning 			cv_wait(&connp->conn_sq_cv, &sqp->sq_lock);
14350f1702c5SYu Xiangning 		mutex_exit(&sqp->sq_lock);
14360f1702c5SYu Xiangning 
1437f3124163SAnders Persson 		if (use_mp == NULL)
14380f1702c5SYu Xiangning 			freeb(mp);
14390f1702c5SYu Xiangning 
14400f1702c5SYu Xiangning 		return (0);
14410f1702c5SYu Xiangning 	}
14420f1702c5SYu Xiangning }
14430f1702c5SYu Xiangning 
14440f1702c5SYu Xiangning void
1445f3124163SAnders Persson squeue_synch_exit(squeue_t *sqp, conn_t *connp)
14460f1702c5SYu Xiangning {
14470f1702c5SYu Xiangning 	mutex_enter(&sqp->sq_lock);
14480f1702c5SYu Xiangning 	if (sqp->sq_run == curthread) {
14490f1702c5SYu Xiangning 		ASSERT(sqp->sq_state & SQS_PROC);
14500f1702c5SYu Xiangning 
14510f1702c5SYu Xiangning 		sqp->sq_state &= ~SQS_PROC;
14520f1702c5SYu Xiangning 		sqp->sq_run = NULL;
14530f1702c5SYu Xiangning 		connp->conn_on_sqp = B_FALSE;
14540f1702c5SYu Xiangning 
14550f1702c5SYu Xiangning 		if (sqp->sq_first == NULL) {
14560f1702c5SYu Xiangning 			mutex_exit(&sqp->sq_lock);
14570f1702c5SYu Xiangning 		} else {
14580f1702c5SYu Xiangning 			/*
14590f1702c5SYu Xiangning 			 * If this was a normal thread, then it would
14600f1702c5SYu Xiangning 			 * (most likely) continue processing the pending
14610f1702c5SYu Xiangning 			 * requests. Since the just completed operation
14620f1702c5SYu Xiangning 			 * was executed synchronously, the thread should
14630f1702c5SYu Xiangning 			 * not be delayed. To compensate, wake up the
14640f1702c5SYu Xiangning 			 * worker thread right away when there are outstanding
14650f1702c5SYu Xiangning 			 * requests.
14660f1702c5SYu Xiangning 			 */
14670f1702c5SYu Xiangning 			sqp->sq_awaken = lbolt;
14680f1702c5SYu Xiangning 			cv_signal(&sqp->sq_worker_cv);
14690f1702c5SYu Xiangning 			mutex_exit(&sqp->sq_lock);
14700f1702c5SYu Xiangning 		}
14710f1702c5SYu Xiangning 	} else {
14720f1702c5SYu Xiangning 		/*
14730f1702c5SYu Xiangning 		 * The caller doesn't own the squeue, clear the SQS_PAUSE flag,
14740f1702c5SYu Xiangning 		 * and wake up the squeue owner, such that owner can continue
14750f1702c5SYu Xiangning 		 * processing.
14760f1702c5SYu Xiangning 		 */
14770f1702c5SYu Xiangning 		ASSERT(sqp->sq_state & SQS_PAUSE);
14780f1702c5SYu Xiangning 		sqp->sq_state &= ~SQS_PAUSE;
14790f1702c5SYu Xiangning 
14800f1702c5SYu Xiangning 		/* There should be only one thread blocking on sq_synch_cv. */
14810f1702c5SYu Xiangning 		cv_signal(&sqp->sq_synch_cv);
14820f1702c5SYu Xiangning 		mutex_exit(&sqp->sq_lock);
14830f1702c5SYu Xiangning 	}
14840f1702c5SYu Xiangning }
1485