/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2017 Joyent, Inc.
 */

/*
 * IP interface to squeues.
 *
 * IP uses squeues to force serialization of packets, both incoming and
 * outgoing. Each squeue is associated with a connection instance (conn_t)
 * above, and a soft ring (if enabled) below. Each CPU will have a default
 * squeue for outbound connections, and each soft ring of an interface will
 * have an squeue to which it sends incoming packets. squeues are never
 * destroyed, and if they become unused they are kept around against future
 * needs.
 *
 * IP organizes its squeues using squeue sets (squeue_set_t). For each CPU
 * in the system there will be one squeue set, all of whose squeues will be
 * bound to that CPU, plus one additional set known as the unbound set. Sets
 * associated with CPUs will have one default squeue, for outbound
 * connections, and a linked list of squeues used by various NICs for inbound
 * packets. The unbound set also has a linked list of squeues, but no default
 * squeue.
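 *
 * An illustrative layout (sets are appended to sqset_global_list in
 * creation order; here two CPUs, with one NIC ring squeue on the second):
 *
 *	[0] unbound set: no default squeue; sqs_head -> unused squeues
 *	[1] CPU a's set: sqs_default -> outbound squeue; sqs_head -> NULL
 *	[2] CPU b's set: sqs_default -> outbound squeue;
 *	    sqs_head -> ring squeue (SQS_ILL_BOUND)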
 *
 * When a CPU goes offline, its squeue set is destroyed and all its squeues
 * are moved to the unbound set. When a CPU comes online, a new squeue set is
 * created and the unbound set is searched for a default squeue formerly bound
 * to this CPU. If no default squeue is found, a new one is created.
 *
 * Two fields of the squeue_t, namely sq_next and sq_set, are owned by IP
 * and not the squeue code. squeue.c will not touch them, and we can modify
 * them without holding the squeue lock because of the guarantee that squeues
 * are never destroyed. The sqset_lock must be held, however.
 *
 * All the squeue sets are protected by a single lock, the sqset_lock. This
 * is also used to protect the sq_next and sq_set fields of an squeue_t.
 *
 * The lock order is: cpu_lock --> ill_lock --> sqset_lock --> sq_lock
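 * For example, ip_squeue_cpu_move() is entered with cpu_lock held, takes
 * sqset_lock to move an squeue between sets, and squeue_bind() acquires
 * sq_lock last.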
 *
 * There are two modes of associating connections with squeues. The first
 * mode associates each connection with the CPU that creates the connection
 * (either at open time or at accept time). The second mode associates each
 * connection with a random CPU, effectively distributing load over all CPUs
 * and all squeues in the system. The mode is controlled by the
 * ip_squeue_fanout variable.
 *
 * NOTE: The fact that there is an association between each connection and
 * an squeue, and between each squeue and a CPU, does not mean that each
 * connection is always processed on this CPU and on this CPU only. Any
 * thread calling squeue_enter() may process the connection on whatever CPU
 * it is scheduled on. The squeue to CPU binding is only relevant for the
 * worker thread.
 *
 * INTERFACE:
 *
 * squeue_t *ip_squeue_get(ill_rx_ring_t)
 *
 * Returns the squeue associated with an ill receive ring. If the ring is
 * not bound to a CPU, and we're currently servicing the interrupt which
 * generated the packet, then bind the squeue to the CPU.
 *
 *
 * DR Notes
 * ========
 *
 * ip_squeue_init() registers a call-back function with the CPU DR
 * subsystem using register_cpu_setup_func(). The call-back function does two
 * things:
 *
 * o When the CPU is going off-line or unconfigured, the worker thread is
 *	unbound from the CPU. This allows the CPU unconfig code to move it to
 *	another CPU.
 *
 * o When the CPU is going online, it creates a new squeue for this CPU if
 *	necessary and binds the squeue worker thread to this CPU.
 *
 * TUNABLES:
 *
 * ip_squeue_fanout: used when TCP calls IP_SQUEUE_GET(). If 1, then
 * pick the default squeue from a random CPU, otherwise use our CPU's default
 * squeue.
 *
 * ip_squeue_fanout can be accessed and changed using ndd on /dev/tcp or
 * /dev/ip.
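 *
 * A minimal sketch of the resulting selection (see ip_squeue_random()
 * below; 'hint' stands for any caller-supplied value, such as a hash
 * derived from the connection):
 *
 *	if (ip_squeue_fanout)
 *		sqs = sqset_global_list[(hint % (sqset_global_size - 1)) + 1];
 *	else
 *		sqs = CPU->cpu_squeue_set;
 *	sq = sqs->sqs_default;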
 */

#include <sys/types.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/cpuvar.h>
#include <sys/cmn_err.h>

#include <inet/common.h>
#include <inet/ip.h>
#include <netinet/ip6.h>
#include <inet/ip_if.h>
#include <inet/ip_ire.h>
#include <inet/nd.h>
#include <inet/ipclassifier.h>
#include <sys/conf.h>
#include <sys/sunddi.h>
#include <sys/dlpi.h>
#include <sys/squeue_impl.h>
#include <sys/tihdr.h>
#include <inet/udp_impl.h>
#include <sys/strsubr.h>
#include <sys/zone.h>
#include <sys/dld.h>
#include <sys/atomic.h>

/*
 * List of all created squeue sets. The list and its size are protected by
 * sqset_lock.
 */
static squeue_set_t	**sqset_global_list; /* list 0 is the unbound list */
static uint_t		sqset_global_size;
kmutex_t		sqset_lock;

static void (*ip_squeue_create_callback)(squeue_t *) = NULL;

static squeue_t *ip_squeue_create(pri_t);
static squeue_set_t *ip_squeue_set_create(processorid_t);
static int ip_squeue_cpu_setup(cpu_setup_t, int, void *);
static void ip_squeue_set_move(squeue_t *, squeue_set_t *);
static void ip_squeue_set_destroy(cpu_t *);
static void ip_squeue_clean(void *, mblk_t *, void *);

#define	CPU_ISON(c) ((c) != NULL && CPU_ACTIVE(c) && \
	((c)->cpu_flags & CPU_EXISTS))

static squeue_t *
ip_squeue_create(pri_t pri)
{
	squeue_t *sqp;

	sqp = squeue_create(pri);
	ASSERT(sqp != NULL);
	if (ip_squeue_create_callback != NULL)
		ip_squeue_create_callback(sqp);
	return (sqp);
}

/*
 * Create a new squeue_set. If id == -1, then we're creating the unbound set,
 * which should only happen once when we are first initialized. Otherwise id
 * is the id of the CPU that needs a set, either because we are initializing
 * or because the CPU has come online.
 *
 * If id != -1, then we need at a minimum to provide a default squeue for the
 * new set. We search the unbound set for candidates, and if none are found we
 * create a new one.
 */
static squeue_set_t *
ip_squeue_set_create(processorid_t id)
{
	squeue_set_t	*sqs;
	squeue_set_t	*src = sqset_global_list[0];
	squeue_t	**lastsqp, *sq;
	squeue_t	**defaultq_lastp = NULL;

	sqs = kmem_zalloc(sizeof (squeue_set_t), KM_SLEEP);
	sqs->sqs_cpuid = id;

	if (id == -1) {
		ASSERT(sqset_global_size == 0);
		sqset_global_list[0] = sqs;
		sqset_global_size = 1;
		return (sqs);
	}

	/*
	 * When we create an squeue set with id != -1, we need to give it a
	 * default squeue, in order to support fanout of conns across
	 * CPUs. Try to find a former default squeue that matches this
	 * cpu id on the unbound squeue set. If no such squeue is found,
	 * find some non-default TCP squeue that is free. If still no such
	 * candidate is found, create a new squeue.
	 */

	ASSERT(MUTEX_HELD(&cpu_lock));
	mutex_enter(&sqset_lock);
	lastsqp = &src->sqs_head;

	while (*lastsqp) {
		if ((*lastsqp)->sq_bind == id &&
		    (*lastsqp)->sq_state & SQS_DEFAULT) {
			/*
			 * Exact match. Former default squeue of cpu 'id'
			 */
			ASSERT(!((*lastsqp)->sq_state & SQS_ILL_BOUND));
			defaultq_lastp = lastsqp;
			break;
		}
		if (defaultq_lastp == NULL &&
		    !((*lastsqp)->sq_state & (SQS_ILL_BOUND | SQS_DEFAULT))) {
			/*
			 * A free non-default TCP squeue
			 */
			defaultq_lastp = lastsqp;
		}
		lastsqp = &(*lastsqp)->sq_next;
	}

	if (defaultq_lastp != NULL) {
		/* Remove from src set and set SQS_DEFAULT */
		sq = *defaultq_lastp;
		*defaultq_lastp = sq->sq_next;
		sq->sq_next = NULL;
		if (!(sq->sq_state & SQS_DEFAULT)) {
			mutex_enter(&sq->sq_lock);
			sq->sq_state |= SQS_DEFAULT;
			mutex_exit(&sq->sq_lock);
		}
	} else {
		sq = ip_squeue_create(SQUEUE_DEFAULT_PRIORITY);
		sq->sq_state |= SQS_DEFAULT;
	}

	sq->sq_set = sqs;
	sqs->sqs_default = sq;
	squeue_bind(sq, id); /* this locks squeue mutex */

	ASSERT(sqset_global_size <= NCPU);
	sqset_global_list[sqset_global_size++] = sqs;
	mutex_exit(&sqset_lock);
	return (sqs);
}

/*
 * Called by ip_squeue_add_ring() to find an squeue to associate with a new
 * ring.
 */

squeue_t *
ip_squeue_getfree(pri_t pri)
{
	squeue_set_t	*sqs = sqset_global_list[0];
	squeue_t	*sq;

	mutex_enter(&sqset_lock);
	for (sq = sqs->sqs_head; sq != NULL; sq = sq->sq_next) {
		/*
		 * Select a non-default TCP squeue that is free i.e. not
		 * bound to any ill.
		 */
		if (!(sq->sq_state & (SQS_DEFAULT | SQS_ILL_BOUND)))
			break;
	}

	if (sq == NULL) {
		sq = ip_squeue_create(pri);
		sq->sq_set = sqs;
		sq->sq_next = sqs->sqs_head;
		sqs->sqs_head = sq;
	}

	ASSERT(!(sq->sq_state & (SQS_POLL_THR_CONTROL | SQS_WORKER_THR_CONTROL |
	    SQS_POLL_CLEANUP_DONE | SQS_POLL_QUIESCE_DONE |
	    SQS_POLL_THR_QUIESCED)));

	mutex_enter(&sq->sq_lock);
	sq->sq_state |= SQS_ILL_BOUND;
	mutex_exit(&sq->sq_lock);
	mutex_exit(&sqset_lock);

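	/*
	 * A recycled squeue may have been created at a different priority;
	 * move its worker and poll threads to the priority this caller
	 * requested.
	 */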
	if (sq->sq_priority != pri) {
		thread_lock(sq->sq_worker);
		(void) thread_change_pri(sq->sq_worker, pri, 0);
		thread_unlock(sq->sq_worker);

		thread_lock(sq->sq_poll_thr);
		(void) thread_change_pri(sq->sq_poll_thr, pri, 0);
		thread_unlock(sq->sq_poll_thr);

		sq->sq_priority = pri;
	}
	return (sq);
}

/*
 * Initialize IP squeues.
 */
void
ip_squeue_init(void (*callback)(squeue_t *))
{
	int i;
	squeue_set_t	*sqs;

	ASSERT(sqset_global_list == NULL);

	ip_squeue_create_callback = callback;
	squeue_init();
	mutex_init(&sqset_lock, NULL, MUTEX_DEFAULT, NULL);
	sqset_global_list =
	    kmem_zalloc(sizeof (squeue_set_t *) * (NCPU+1), KM_SLEEP);
	sqset_global_size = 0;
	/*
	 * We are called at system boot time and we don't
	 * expect memory allocation failure.
	 */
	sqs = ip_squeue_set_create(-1);
	ASSERT(sqs != NULL);

	mutex_enter(&cpu_lock);
	/* Create an squeue set for each active CPU available */
	for (i = 0; i < NCPU; i++) {
		cpu_t *cp = cpu_get(i);
		if (CPU_ISON(cp) && cp->cpu_squeue_set == NULL) {
			/*
			 * We are called at system boot time and we don't
			 * expect memory allocation failure.
			 */
			cp->cpu_squeue_set = ip_squeue_set_create(cp->cpu_id);
			ASSERT(cp->cpu_squeue_set != NULL);
		}
	}

	register_cpu_setup_func(ip_squeue_cpu_setup, NULL);
	mutex_exit(&cpu_lock);
}

/*
 * Get a default squeue, either from the current CPU or a CPU derived by hash
 * from the index argument, depending upon the setting of ip_squeue_fanout.
 */
squeue_t *
ip_squeue_random(uint_t index)
{
	squeue_set_t *sqs = NULL;
	squeue_t *sq;

	/*
	 * The minimum value of sqset_global_size is 2, one for the unbound
	 * squeue set and another for the squeue set of the zeroth CPU.
	 * Even though the value could be changing, it can never go below 2,
	 * so the assert does not need the lock protection.
	 */
	ASSERT(sqset_global_size > 1);

	/* Protect against changes to sqset_global_list */
	mutex_enter(&sqset_lock);

	if (!ip_squeue_fanout)
		sqs = CPU->cpu_squeue_set;

	/*
	 * sqset_global_list[0] corresponds to the unbound squeue set.
	 * The computation below picks a set other than the unbound set.
	 */
	if (sqs == NULL)
		sqs = sqset_global_list[(index % (sqset_global_size - 1)) + 1];
	sq = sqs->sqs_default;

	mutex_exit(&sqset_lock);
	ASSERT(sq != NULL);
	return (sq);
}

/*
 * Move squeue from its current set to newset. Not used for default squeues.
 * Bind or unbind the worker thread as appropriate.
 */

static void
ip_squeue_set_move(squeue_t *sq, squeue_set_t *newset)
{
	squeue_set_t	*set;
	squeue_t	**lastsqp;
	processorid_t	cpuid = newset->sqs_cpuid;

	ASSERT(!(sq->sq_state & SQS_DEFAULT));
	ASSERT(!MUTEX_HELD(&sq->sq_lock));
	ASSERT(MUTEX_HELD(&sqset_lock));

	set = sq->sq_set;
	if (set == newset)
		return;

	lastsqp = &set->sqs_head;
	while (*lastsqp != sq)
		lastsqp = &(*lastsqp)->sq_next;

	*lastsqp = sq->sq_next;
	sq->sq_next = newset->sqs_head;
	newset->sqs_head = sq;
	sq->sq_set = newset;
	if (cpuid == -1)
		squeue_unbind(sq);
	else
		squeue_bind(sq, cpuid);
}

/*
 * Move squeue from its current set to cpuid's set and bind to cpuid.
 */

int
ip_squeue_cpu_move(squeue_t *sq, processorid_t cpuid)
{
	cpu_t *cpu;
	squeue_set_t *set;

	if (sq->sq_state & SQS_DEFAULT)
		return (-1);

	ASSERT(MUTEX_HELD(&cpu_lock));

	cpu = cpu_get(cpuid);
	if (!CPU_ISON(cpu))
		return (-1);

	mutex_enter(&sqset_lock);
	set = cpu->cpu_squeue_set;
	if (set != NULL)
		ip_squeue_set_move(sq, set);
	mutex_exit(&sqset_lock);
	return ((set == NULL) ? -1 : 0);
}

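/*
 * A minimal usage sketch (hypothetical caller): rebind a ring's squeue to
 * CPU 3. cpu_lock must be held, as ip_squeue_cpu_move() asserts, and -1
 * indicates failure (an offline CPU, a missing set, or a default squeue):
 *
 *	mutex_enter(&cpu_lock);
 *	if (ip_squeue_cpu_move(rx_ring->rr_sqp, 3) == -1)
 *		cmn_err(CE_NOTE, "could not rebind squeue");
 *	mutex_exit(&cpu_lock);
 */
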
/*
 * The mac layer is calling, asking us to move an squeue to a
 * new CPU. This routine is called with cpu_lock held.
 */
void
ip_squeue_bind_ring(ill_t *ill, ill_rx_ring_t *rx_ring, processorid_t cpuid)
{
	ASSERT(ILL_MAC_PERIM_HELD(ill));
	ASSERT(rx_ring->rr_ill == ill);

	mutex_enter(&ill->ill_lock);
	if (rx_ring->rr_ring_state == RR_FREE ||
	    rx_ring->rr_ring_state == RR_FREE_INPROG) {
		mutex_exit(&ill->ill_lock);
		return;
	}

	if (ip_squeue_cpu_move(rx_ring->rr_sqp, cpuid) != -1)
		rx_ring->rr_ring_state = RR_SQUEUE_BOUND;

	mutex_exit(&ill->ill_lock);
}

void *
ip_squeue_add_ring(ill_t *ill, void *mrp)
{
	mac_rx_fifo_t		*mrfp = (mac_rx_fifo_t *)mrp;
	ill_rx_ring_t		*rx_ring, *ring_tbl;
	int			ip_rx_index;
	squeue_t		*sq = NULL;
	pri_t			pri;

	ASSERT(ILL_MAC_PERIM_HELD(ill));
	ASSERT(mrfp->mrf_type == MAC_RX_FIFO);
	ASSERT(ill->ill_dld_capab != NULL);

	ring_tbl = ill->ill_dld_capab->idc_poll.idp_ring_tbl;

	mutex_enter(&ill->ill_lock);
	for (ip_rx_index = 0; ip_rx_index < ILL_MAX_RINGS; ip_rx_index++) {
		rx_ring = &ring_tbl[ip_rx_index];
		if (rx_ring->rr_ring_state == RR_FREE)
			break;
	}

	if (ip_rx_index == ILL_MAX_RINGS) {
		/*
		 * We ran out of ILL_MAX_RINGS worth of rx_ring structures.
		 * If we have devices which can exceed this limit,
		 * ILL_MAX_RINGS should be made configurable. Meanwhile this
		 * causes no panic: the driver will pass ip_input a NULL
		 * handle, which will make IP use the default squeue, and
		 * polling mode will not be used for this ring.
		 */
		cmn_err(CE_NOTE,
		    "Reached maximum number of receiving rings (%d) for %s\n",
		    ILL_MAX_RINGS, ill->ill_name);
		mutex_exit(&ill->ill_lock);
		return (NULL);
	}

	bzero(rx_ring, sizeof (ill_rx_ring_t));
	rx_ring->rr_rx = mrfp->mrf_receive;
	/* XXX: Hard code it to tcp accept for now */
	rx_ring->rr_ip_accept = (ip_accept_t)ip_accept_tcp;

	rx_ring->rr_intr_handle = mrfp->mrf_intr_handle;
	rx_ring->rr_intr_enable = (ip_mac_intr_enable_t)mrfp->mrf_intr_enable;
	rx_ring->rr_intr_disable =
	    (ip_mac_intr_disable_t)mrfp->mrf_intr_disable;
	rx_ring->rr_rx_handle = mrfp->mrf_rx_arg;
	rx_ring->rr_ill = ill;

	pri = mrfp->mrf_flow_priority;

	sq = ip_squeue_getfree(pri);

	mutex_enter(&sq->sq_lock);
	sq->sq_rx_ring = rx_ring;
	rx_ring->rr_sqp = sq;

	sq->sq_state |= SQS_POLL_CAPAB;

	rx_ring->rr_ring_state = RR_SQUEUE_UNBOUND;
	sq->sq_ill = ill;
	mutex_exit(&sq->sq_lock);
	mutex_exit(&ill->ill_lock);

	DTRACE_PROBE4(ill__ring__add, char *, ill->ill_name, ill_t *, ill, int,
	    ip_rx_index, void *, mrfp->mrf_rx_arg);

	/* Assign the squeue to the specified CPU as well */
	mutex_enter(&cpu_lock);
	ip_squeue_bind_ring(ill, rx_ring, mrfp->mrf_cpu_id);
	mutex_exit(&cpu_lock);

	return (rx_ring);
}

/*
 * Sanitize the squeue, etc. Some of the processing
 * needs to be done from inside the perimeter.
 */
void
ip_squeue_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
{
	squeue_t *sqp;

	ASSERT(ILL_MAC_PERIM_HELD(ill));
	ASSERT(rx_ring != NULL);

	/* Just clean one squeue */
	mutex_enter(&ill->ill_lock);
	if (rx_ring->rr_ring_state == RR_FREE) {
		mutex_exit(&ill->ill_lock);
		return;
	}
	rx_ring->rr_ring_state = RR_FREE_INPROG;
	sqp = rx_ring->rr_sqp;

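	/*
	 * Ask the worker thread to run the poll-cleanup protocol and wait
	 * for it to acknowledge. ill_lock is dropped before waiting; the
	 * sq_state handshake is protected by sq_lock.
	 */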
	mutex_enter(&sqp->sq_lock);
	sqp->sq_state |= SQS_POLL_CLEANUP;
	cv_signal(&sqp->sq_worker_cv);
	mutex_exit(&ill->ill_lock);
	while (!(sqp->sq_state & SQS_POLL_CLEANUP_DONE))
		cv_wait(&sqp->sq_ctrlop_done_cv, &sqp->sq_lock);
	sqp->sq_state &= ~SQS_POLL_CLEANUP_DONE;

	ASSERT(!(sqp->sq_state & (SQS_POLL_THR_CONTROL |
	    SQS_WORKER_THR_CONTROL | SQS_POLL_QUIESCE_DONE |
	    SQS_POLL_THR_QUIESCED)));

	cv_signal(&sqp->sq_worker_cv);
	mutex_exit(&sqp->sq_lock);

	/*
	 * Move the squeue to sqset_global_list[0] which holds the set of
	 * squeues not bound to any cpu. Note that the squeue is still
	 * considered bound to an ill as long as SQS_ILL_BOUND is set.
	 */
	mutex_enter(&sqset_lock);
	ip_squeue_set_move(sqp, sqset_global_list[0]);
	mutex_exit(&sqset_lock);

	/*
	 * A CPU going offline can also trigger a move of the squeue to the
	 * unbound set sqset_global_list[0]. However the squeue won't be
	 * recycled for the next use as long as the SQS_ILL_BOUND flag
	 * is set. Hence we clear the SQS_ILL_BOUND flag only towards the
	 * end, after the move.
	 */
	mutex_enter(&sqp->sq_lock);
	sqp->sq_state &= ~SQS_ILL_BOUND;
	mutex_exit(&sqp->sq_lock);

	mutex_enter(&ill->ill_lock);
	rx_ring->rr_ring_state = RR_FREE;
	mutex_exit(&ill->ill_lock);
}

/*
 * Stop the squeue from polling. This needs to be done
 * from inside the perimeter.
 */
void
ip_squeue_quiesce_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
{
	squeue_t *sqp;

	ASSERT(ILL_MAC_PERIM_HELD(ill));
	ASSERT(rx_ring != NULL);

	sqp = rx_ring->rr_sqp;
	mutex_enter(&sqp->sq_lock);
	sqp->sq_state |= SQS_POLL_QUIESCE;
	cv_signal(&sqp->sq_worker_cv);
	while (!(sqp->sq_state & SQS_POLL_QUIESCE_DONE))
		cv_wait(&sqp->sq_ctrlop_done_cv, &sqp->sq_lock);

	mutex_exit(&sqp->sq_lock);
}

/*
 * Restart polling etc. Needs to be inside the perimeter to
 * prevent races.
 */
void
ip_squeue_restart_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
{
	squeue_t *sqp;

	ASSERT(ILL_MAC_PERIM_HELD(ill));
	ASSERT(rx_ring != NULL);

	sqp = rx_ring->rr_sqp;
	mutex_enter(&sqp->sq_lock);
	/*
	 * Handle a change in the number of rings between the quiesce and
	 * restart operations by checking for a previous quiesce before
	 * attempting a restart.
	 */
	if (!(sqp->sq_state & SQS_POLL_QUIESCE_DONE)) {
		mutex_exit(&sqp->sq_lock);
		return;
	}
	sqp->sq_state |= SQS_POLL_RESTART;
	cv_signal(&sqp->sq_worker_cv);
	while (!(sqp->sq_state & SQS_POLL_RESTART_DONE))
		cv_wait(&sqp->sq_ctrlop_done_cv, &sqp->sq_lock);
	sqp->sq_state &= ~SQS_POLL_RESTART_DONE;
	mutex_exit(&sqp->sq_lock);
}

/*
 * Sanitize all squeues associated with the ill.
 */
void
ip_squeue_clean_all(ill_t *ill)
{
	int idx;
	ill_rx_ring_t	*rx_ring;

	for (idx = 0; idx < ILL_MAX_RINGS; idx++) {
		rx_ring = &ill->ill_dld_capab->idc_poll.idp_ring_tbl[idx];
		ip_squeue_clean_ring(ill, rx_ring);
	}
}

/*
 * Used by IP to get the squeue associated with a ring. If the squeue isn't
 * yet bound to a CPU, and we're being called directly from the NIC's
 * interrupt, then we know what CPU we want to assign the squeue to, so
 * dispatch that task to a taskq.
 */
squeue_t *
ip_squeue_get(ill_rx_ring_t *ill_rx_ring)
{
	squeue_t	*sqp;

	if ((ill_rx_ring == NULL) || ((sqp = ill_rx_ring->rr_sqp) == NULL))
		return (IP_SQUEUE_GET(CPU_PSEUDO_RANDOM()));

	return (sqp);
}

/*
 * Called when a CPU goes offline. Its squeue_set_t is destroyed, and all
 * squeues are unbound and moved to the unbound set.
 */
static void
ip_squeue_set_destroy(cpu_t *cpu)
{
	int i;
	squeue_t *sqp, *lastsqp = NULL;
	squeue_set_t *sqs, *unbound = sqset_global_list[0];

	mutex_enter(&sqset_lock);
	if ((sqs = cpu->cpu_squeue_set) == NULL) {
		mutex_exit(&sqset_lock);
		return;
	}

	/* Move all squeues to the unbound set */

	for (sqp = sqs->sqs_head; sqp; lastsqp = sqp, sqp = sqp->sq_next) {
		squeue_unbind(sqp);
		sqp->sq_set = unbound;
	}
	if (sqs->sqs_head) {
		lastsqp->sq_next = unbound->sqs_head;
		unbound->sqs_head = sqs->sqs_head;
	}

	/* Also move the default squeue to the unbound set */

	sqp = sqs->sqs_default;
	ASSERT(sqp != NULL);
	ASSERT((sqp->sq_state & (SQS_DEFAULT|SQS_ILL_BOUND)) == SQS_DEFAULT);

	sqp->sq_next = unbound->sqs_head;
	unbound->sqs_head = sqp;
	squeue_unbind(sqp);
	sqp->sq_set = unbound;

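	/* Remove this set from the global list, swapping in the last entry */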
	for (i = 1; i < sqset_global_size; i++)
		if (sqset_global_list[i] == sqs)
			break;

	ASSERT(i < sqset_global_size);
	sqset_global_list[i] = sqset_global_list[sqset_global_size - 1];
	sqset_global_list[sqset_global_size - 1] = NULL;
	sqset_global_size--;

	mutex_exit(&sqset_lock);
	kmem_free(sqs, sizeof (*sqs));
}

/*
 * Reconfiguration callback
 */
/* ARGSUSED */
static int
ip_squeue_cpu_setup(cpu_setup_t what, int id, void *arg)
{
	cpu_t *cp = cpu_get(id);

	ASSERT(MUTEX_HELD(&cpu_lock));
	switch (what) {
	case CPU_CONFIG:
	case CPU_ON:
	case CPU_INIT:
	case CPU_CPUPART_IN:
		if (CPU_ISON(cp) && cp->cpu_squeue_set == NULL)
			cp->cpu_squeue_set = ip_squeue_set_create(cp->cpu_id);
		break;
	case CPU_UNCONFIG:
	case CPU_OFF:
	case CPU_CPUPART_OUT:
		if (cp->cpu_squeue_set != NULL) {
			ip_squeue_set_destroy(cp);
			cp->cpu_squeue_set = NULL;
		}
		break;
	default:
		break;
	}
	return (0);
}
767