xref: /titanic_51/usr/src/uts/common/inet/ip/ip_squeue.c (revision fc51f9bbbff02dbd8c3adf640b1a184ceeb58fa5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * IP interface to squeues.
28  *
29  * IP uses squeues to force serialization of packets, both incoming and
30  * outgoing. Each squeue is associated with a connection instance (conn_t)
31  * above, and a soft ring (if enabled) below. Each CPU will have a default
32  * squeue for outbound connections, and each soft ring of an interface will
33  * have an squeue to which it sends incoming packets. squeues are never
34  * destroyed, and if they become unused they are kept around against future
35  * needs.
36  *
37  * IP organizes its squeues using squeue sets (squeue_set_t). For each CPU
38  * in the system there will be one squeue set, all of whose squeues will be
39  * bound to that CPU, plus one additional set known as the unbound set. Sets
40  * associated with CPUs will have one default squeue, for outbound
41  * connections, and a linked list of squeues used by various NICs for inbound
42  * packets. The unbound set also has a linked list of squeues, but no default
43  * squeue.
44  *
45  * When a CPU goes offline its squeue set is destroyed, and all its squeues
46  * are moved to the unbound set. When a CPU comes online, a new squeue set is
47  * created and the unbound set is searched for a default squeue formerly bound
48  * to this CPU. If no default squeue is found, a new one is created.
49  *
50  * Two fields of the squeue_t, namely sq_next and sq_set, are owned by IP
51  * and not the squeue code. squeue.c will not touch them, and we can modify
52  * them without holding the squeue lock because of the guarantee that squeues
53  * are never destroyed. ip_squeue locks must be held, however.
54  *
55  * All the squeue sets are protected by a single lock, the sqset_lock. This
56  * is also used to protect the sq_next and sq_set fields of an squeue_t.
57  *
58  * The lock order is: cpu_lock --> ill_lock --> sqset_lock --> sq_lock
59  *
60  * There are two modes of associating connection with squeues. The first mode
61  * associates each connection with the CPU that creates the connection (either
62  * during open time or during accept time). The second mode associates each
63  * connection with a random CPU, effectively distributing load over all CPUs
64  * and all squeues in the system. The mode is controlled by the
65  * ip_squeue_fanout variable.
66  *
67  * NOTE: The fact that there is an association between each connection and
68  * squeue and squeue and CPU does not mean that each connection is always
69  * processed on this CPU and on this CPU only. Any thread calling squeue_enter()
70  * may process the connection on whatever CPU it is scheduled. The squeue to CPU
71  * binding is only relevant for the worker thread.
72  *
73  * INTERFACE:
74  *
75  * squeue_t *ip_squeue_get(ill_rx_ring_t)
76  *
77  * Returns the squeue associated with an ill receive ring. If no ring is
78  * given, or the ring has no squeue attached, a default squeue chosen via
79  * IP_SQUEUE_GET() is returned instead.
80  *
81  *
82  * DR Notes
83  * ========
84  *
85  * The ip_squeue_init() registers a call-back function with the CPU DR
86  * subsystem using register_cpu_setup_func(). The call-back function does two
87  * things:
88  *
89  * o When the CPU is going off-line or unconfigured, the worker thread is
90  *	unbound from the CPU. This allows the CPU unconfig code to move it to
91  *	another CPU.
92  *
93  * o When the CPU is going online, it creates a new squeue for this CPU if
94  *	necessary and binds the squeue worker thread to this CPU.
95  *
96  * TUNABLES:
97  *
98  * ip_squeue_fanout: used when TCP calls IP_SQUEUE_GET(). If 1, then
99  * pick the default squeue from a random CPU, otherwise use our CPU's default
100  * squeue.
101  *
102  * ip_squeue_fanout can be accessed and changed using ndd on /dev/tcp or
103  * /dev/ip.
104  *
105  * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
106  * created. This is the time squeue code waits before waking up the worker
107  * thread after queuing a request.
108  */
109 
110 #include <sys/types.h>
111 #include <sys/debug.h>
112 #include <sys/kmem.h>
113 #include <sys/cpuvar.h>
114 #include <sys/cmn_err.h>
115 
116 #include <inet/common.h>
117 #include <inet/ip.h>
118 #include <netinet/ip6.h>
119 #include <inet/ip_if.h>
120 #include <inet/ip_ire.h>
121 #include <inet/nd.h>
122 #include <inet/ipclassifier.h>
123 #include <sys/types.h>
124 #include <sys/conf.h>
125 #include <sys/sunddi.h>
126 #include <sys/dlpi.h>
127 #include <sys/squeue_impl.h>
128 #include <sys/tihdr.h>
129 #include <inet/udp_impl.h>
130 #include <sys/strsubr.h>
131 #include <sys/zone.h>
132 #include <sys/dld.h>
133 #include <sys/atomic.h>
134 
135 /*
136  * List of all created squeue sets. The list and its size are protected by
137  * sqset_lock.
138  */
139 static squeue_set_t	**sqset_global_list; /* list 0 is the unbound list */
140 static uint_t		sqset_global_size;
141 kmutex_t		sqset_lock;
142 
143 static void (*ip_squeue_create_callback)(squeue_t *) = NULL;
144 
145 /*
146  * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
147  *	created. This is the time squeue code waits before waking up the worker
148  *	thread after queuing a request.
149  */
150 uint_t ip_squeue_worker_wait = 10;
151 
152 static squeue_t *ip_squeue_create(pri_t);
153 static squeue_set_t *ip_squeue_set_create(processorid_t);
154 static int ip_squeue_cpu_setup(cpu_setup_t, int, void *);
155 static void ip_squeue_set_move(squeue_t *, squeue_set_t *);
156 static void ip_squeue_set_destroy(cpu_t *);
157 static void ip_squeue_clean(void *, mblk_t *, void *);
158 
/*
 * Evaluates to true when c is a non-NULL cpu pointer for which CPU_ACTIVE()
 * holds and whose cpu_flags has CPU_EXISTS set. Every use of the argument is
 * parenthesized so that an arbitrary pointer expression (e.g. &x or
 * *(p + 1)) expands correctly. Note that c is evaluated more than once, so
 * it must not have side effects.
 */
#define	CPU_ISON(c)	\
	((c) != NULL && CPU_ACTIVE(c) && ((c)->cpu_flags & CPU_EXISTS))
160 
161 static squeue_t *
162 ip_squeue_create(pri_t pri)
163 {
164 	squeue_t *sqp;
165 
166 	sqp = squeue_create(ip_squeue_worker_wait, pri);
167 	ASSERT(sqp != NULL);
168 	if (ip_squeue_create_callback != NULL)
169 		ip_squeue_create_callback(sqp);
170 	return (sqp);
171 }
172 
173 /*
174  * Create a new squeue_set. If id == -1, then we're creating the unbound set,
175  * which should only happen once when we are first initialized. Otherwise id
176  * is the id of the CPU that needs a set, either because we are initializing
177  * or because the CPU has come online.
178  *
179  * If id != -1, then we need at a minimum to provide a default squeue for the
180  * new set. We search the unbound set for candidates, and if none are found we
181  * create a new one.
182  */
183 static squeue_set_t *
184 ip_squeue_set_create(processorid_t id)
185 {
186 	squeue_set_t	*sqs;
187 	squeue_set_t	*src = sqset_global_list[0];
188 	squeue_t	**lastsqp, *sq;
189 	squeue_t	**defaultq_lastp = NULL;
190 
191 	sqs = kmem_zalloc(sizeof (squeue_set_t), KM_SLEEP);
192 	sqs->sqs_cpuid = id;
193 
194 	if (id == -1) {
195 		ASSERT(sqset_global_size == 0);
196 		sqset_global_list[0] = sqs;
197 		sqset_global_size = 1;
198 		return (sqs);
199 	}
200 
201 	/*
202 	 * When we create an squeue set id != -1, we need to give it a
203 	 * default squeue, in order to support fanout of conns across
204 	 * CPUs. Try to find a former default squeue that matches this
205 	 * cpu id on the unbound squeue set. If no such squeue is found,
206 	 * find some non-default TCP squeue and steal it. If still no such
207 	 * candidate is found, create a new squeue.
208 	 */
209 
210 	ASSERT(MUTEX_HELD(&cpu_lock));
211 	mutex_enter(&sqset_lock);
212 	lastsqp = &src->sqs_head;
213 
214 	while (*lastsqp) {
215 		if ((*lastsqp)->sq_bind == id &&
216 		    (*lastsqp)->sq_state & SQS_DEFAULT) {
217 			defaultq_lastp = lastsqp;
218 			break;
219 		}
220 		if (defaultq_lastp == NULL &&
221 		    !((*lastsqp)->sq_state & SQS_DEFAULT)) {
222 			defaultq_lastp = lastsqp;
223 		}
224 		lastsqp = &(*lastsqp)->sq_next;
225 
226 	}
227 	if (defaultq_lastp) {
228 		/* Remove from src set and set SQS_DEFAULT */
229 		sq = *defaultq_lastp;
230 		*defaultq_lastp = sq->sq_next;
231 		sq->sq_next = NULL;
232 		if (!(sq->sq_state & SQS_DEFAULT)) {
233 			mutex_enter(&sq->sq_lock);
234 			sq->sq_state |= SQS_DEFAULT;
235 			mutex_exit(&sq->sq_lock);
236 		}
237 	} else {
238 		sq = ip_squeue_create(SQUEUE_DEFAULT_PRIORITY);
239 		sq->sq_state |= SQS_DEFAULT;
240 	}
241 
242 	sq->sq_set = sqs;
243 	sqs->sqs_default = sq;
244 	squeue_bind(sq, id); /* this locks squeue mutex */
245 
246 	ASSERT(sqset_global_size <= NCPU);
247 	sqset_global_list[sqset_global_size++] = sqs;
248 	mutex_exit(&sqset_lock);
249 	return (sqs);
250 }
251 
252 /*
253  * Called by ill_ring_add() to find an squeue to associate with a new ring.
254  */
255 
256 squeue_t *
257 ip_squeue_getfree(pri_t pri)
258 {
259 	squeue_set_t	*sqs = sqset_global_list[0];
260 	squeue_t	*sq;
261 
262 	mutex_enter(&sqset_lock);
263 	for (sq = sqs->sqs_head; sq != NULL; sq = sq->sq_next) {
264 		/*
265 		 * Select a non-default squeue
266 		 */
267 		if (!(sq->sq_state & (SQS_DEFAULT | SQS_ILL_BOUND)))
268 			break;
269 	}
270 
271 	if (sq == NULL) {
272 		sq = ip_squeue_create(pri);
273 		sq->sq_set = sqs;
274 		sq->sq_next = sqs->sqs_head;
275 		sqs->sqs_head = sq;
276 	}
277 
278 	ASSERT(!(sq->sq_state & (SQS_POLL_THR_CONTROL | SQS_WORKER_THR_CONTROL |
279 	    SQS_POLL_CLEANUP_DONE | SQS_POLL_QUIESCE_DONE |
280 	    SQS_POLL_THR_QUIESCED)));
281 
282 	mutex_enter(&sq->sq_lock);
283 	sq->sq_state |= SQS_ILL_BOUND;
284 	mutex_exit(&sq->sq_lock);
285 	mutex_exit(&sqset_lock);
286 
287 	if (sq->sq_priority != pri) {
288 		thread_lock(sq->sq_worker);
289 		(void) thread_change_pri(sq->sq_worker, pri, 0);
290 		thread_unlock(sq->sq_worker);
291 
292 		thread_lock(sq->sq_poll_thr);
293 		(void) thread_change_pri(sq->sq_poll_thr, pri, 0);
294 		thread_unlock(sq->sq_poll_thr);
295 
296 		sq->sq_priority = pri;
297 	}
298 	return (sq);
299 }
300 
301 /*
302  * Initialize IP squeues.
303  */
304 void
305 ip_squeue_init(void (*callback)(squeue_t *))
306 {
307 	int i;
308 	squeue_set_t	*sqs;
309 
310 	ASSERT(sqset_global_list == NULL);
311 
312 	ip_squeue_create_callback = callback;
313 	squeue_init();
314 	mutex_init(&sqset_lock, NULL, MUTEX_DEFAULT, NULL);
315 	sqset_global_list =
316 	    kmem_zalloc(sizeof (squeue_set_t *) * (NCPU+1), KM_SLEEP);
317 	sqset_global_size = 0;
318 	/*
319 	 * We are called at system boot time and we don't
320 	 * expect memory allocation failure.
321 	 */
322 	sqs = ip_squeue_set_create(-1);
323 	ASSERT(sqs != NULL);
324 
325 	mutex_enter(&cpu_lock);
326 	/* Create squeue for each active CPU available */
327 	for (i = 0; i < NCPU; i++) {
328 		cpu_t *cp = cpu_get(i);
329 		if (CPU_ISON(cp) && cp->cpu_squeue_set == NULL) {
330 			/*
331 			 * We are called at system boot time and we don't
332 			 * expect memory allocation failure then
333 			 */
334 			cp->cpu_squeue_set = ip_squeue_set_create(cp->cpu_id);
335 			ASSERT(cp->cpu_squeue_set != NULL);
336 		}
337 	}
338 
339 	register_cpu_setup_func(ip_squeue_cpu_setup, NULL);
340 	mutex_exit(&cpu_lock);
341 }
342 
343 /*
344  * Get a default squeue, either from the current CPU or a CPU derived by hash
345  * from the index argument, depending upon the setting of ip_squeue_fanout.
346  */
347 squeue_t *
348 ip_squeue_random(uint_t index)
349 {
350 	squeue_set_t *sqs = NULL;
351 	squeue_t *sq;
352 
353 	/*
354 	 * The minimum value of sqset_global_size is 2, one for the unbound
355 	 * squeue set and another for the squeue set of the zeroth CPU.
356 	 * Even though the value could be changing, it can never go below 2,
357 	 * so the assert does not need the lock protection.
358 	 */
359 	ASSERT(sqset_global_size > 1);
360 
361 	/* Protect against changes to sqset_global_list */
362 	mutex_enter(&sqset_lock);
363 
364 	if (!ip_squeue_fanout)
365 		sqs = CPU->cpu_squeue_set;
366 
367 	/*
368 	 * sqset_global_list[0] corresponds to the unbound squeue set.
369 	 * The computation below picks a set other than the unbound set.
370 	 */
371 	if (sqs == NULL)
372 		sqs = sqset_global_list[(index % (sqset_global_size - 1)) + 1];
373 	sq = sqs->sqs_default;
374 
375 	mutex_exit(&sqset_lock);
376 	ASSERT(sq);
377 	return (sq);
378 }
379 
380 /*
381  * Move squeue from its current set to newset. Not used for default squeues.
382  * Bind or unbind the worker thread as appropriate.
383  */
384 
385 static void
386 ip_squeue_set_move(squeue_t *sq, squeue_set_t *newset)
387 {
388 	squeue_set_t	*set;
389 	squeue_t	**lastsqp;
390 	processorid_t	cpuid = newset->sqs_cpuid;
391 
392 	ASSERT(!(sq->sq_state & SQS_DEFAULT));
393 	ASSERT(!MUTEX_HELD(&sq->sq_lock));
394 	ASSERT(MUTEX_HELD(&sqset_lock));
395 
396 	set = sq->sq_set;
397 	if (set == newset)
398 		return;
399 
400 	lastsqp = &set->sqs_head;
401 	while (*lastsqp != sq)
402 		lastsqp = &(*lastsqp)->sq_next;
403 
404 	*lastsqp = sq->sq_next;
405 	sq->sq_next = newset->sqs_head;
406 	newset->sqs_head = sq;
407 	sq->sq_set = newset;
408 	if (cpuid == -1)
409 		squeue_unbind(sq);
410 	else
411 		squeue_bind(sq, cpuid);
412 }
413 
414 /*
415  * Move squeue from its current set to cpuid's set and bind to cpuid.
416  */
417 
418 int
419 ip_squeue_cpu_move(squeue_t *sq, processorid_t cpuid)
420 {
421 	cpu_t *cpu;
422 	squeue_set_t *set;
423 
424 	if (sq->sq_state & SQS_DEFAULT)
425 		return (-1);
426 
427 	ASSERT(MUTEX_HELD(&cpu_lock));
428 
429 	cpu = cpu_get(cpuid);
430 	if (!CPU_ISON(cpu))
431 		return (-1);
432 
433 	mutex_enter(&sqset_lock);
434 	set = cpu->cpu_squeue_set;
435 	if (set != NULL)
436 		ip_squeue_set_move(sq, set);
437 	mutex_exit(&sqset_lock);
438 	return ((set == NULL) ? -1 : 0);
439 }
440 
441 /*
442  * The mac layer is calling, asking us to move an squeue to a
443  * new CPU. This routine is called with cpu_lock held.
444  */
445 void
446 ip_squeue_bind_ring(ill_t *ill, ill_rx_ring_t *rx_ring, processorid_t cpuid)
447 {
448 	ASSERT(ILL_MAC_PERIM_HELD(ill));
449 	ASSERT(rx_ring->rr_ill == ill);
450 
451 	mutex_enter(&ill->ill_lock);
452 	if (rx_ring->rr_ring_state == RR_FREE ||
453 	    rx_ring->rr_ring_state == RR_FREE_INPROG) {
454 		mutex_exit(&ill->ill_lock);
455 		return;
456 	}
457 
458 	if (ip_squeue_cpu_move(rx_ring->rr_sqp, cpuid) != -1)
459 		rx_ring->rr_ring_state = RR_SQUEUE_BOUND;
460 
461 	mutex_exit(&ill->ill_lock);
462 }
463 
464 void *
465 ip_squeue_add_ring(ill_t *ill, void *mrp)
466 {
467 	mac_rx_fifo_t		*mrfp = (mac_rx_fifo_t *)mrp;
468 	ill_rx_ring_t		*rx_ring, *ring_tbl;
469 	int			ip_rx_index;
470 	squeue_t		*sq = NULL;
471 	pri_t			pri;
472 
473 	ASSERT(ILL_MAC_PERIM_HELD(ill));
474 	ASSERT(mrfp->mrf_type == MAC_RX_FIFO);
475 	ASSERT(ill->ill_dld_capab != NULL);
476 
477 	ring_tbl = ill->ill_dld_capab->idc_poll.idp_ring_tbl;
478 
479 	mutex_enter(&ill->ill_lock);
480 	for (ip_rx_index = 0; ip_rx_index < ILL_MAX_RINGS; ip_rx_index++) {
481 		rx_ring = &ring_tbl[ip_rx_index];
482 		if (rx_ring->rr_ring_state == RR_FREE)
483 			break;
484 	}
485 
486 	if (ip_rx_index == ILL_MAX_RINGS) {
487 		/*
488 		 * We ran out of ILL_MAX_RINGS worth rx_ring structures. If
489 		 * we have devices which can overwhelm this limit,
490 		 * ILL_MAX_RING should be made configurable. Meanwhile it
491 		 * cause no panic because driver will pass ip_input a NULL
492 		 * handle which will make IP allocate the default squeue and
493 		 * Polling mode will not be used for this ring.
494 		 */
495 		cmn_err(CE_NOTE,
496 		    "Reached maximum number of receiving rings (%d) for %s\n",
497 		    ILL_MAX_RINGS, ill->ill_name);
498 		mutex_exit(&ill->ill_lock);
499 		return (NULL);
500 	}
501 
502 	bzero(rx_ring, sizeof (ill_rx_ring_t));
503 	rx_ring->rr_rx = (ip_mac_rx_t)mrfp->mrf_receive;
504 	/* XXX: Hard code it to tcp accept for now */
505 	rx_ring->rr_ip_accept = (ip_accept_t)ip_accept_tcp;
506 
507 	rx_ring->rr_intr_handle = mrfp->mrf_intr_handle;
508 	rx_ring->rr_intr_enable = (ip_mac_intr_enable_t)mrfp->mrf_intr_enable;
509 	rx_ring->rr_intr_disable =
510 	    (ip_mac_intr_disable_t)mrfp->mrf_intr_disable;
511 	rx_ring->rr_rx_handle = mrfp->mrf_rx_arg;
512 	rx_ring->rr_ill = ill;
513 
514 	pri = mrfp->mrf_flow_priority;
515 
516 	sq = ip_squeue_getfree(pri);
517 
518 	mutex_enter(&sq->sq_lock);
519 	sq->sq_rx_ring = rx_ring;
520 	rx_ring->rr_sqp = sq;
521 
522 	sq->sq_state |= SQS_POLL_CAPAB;
523 
524 	rx_ring->rr_ring_state = RR_SQUEUE_UNBOUND;
525 	sq->sq_ill = ill;
526 	mutex_exit(&sq->sq_lock);
527 	mutex_exit(&ill->ill_lock);
528 
529 	DTRACE_PROBE4(ill__ring__add, char *, ill->ill_name, ill_t *, ill, int,
530 	    ip_rx_index, void *, mrfp->mrf_rx_arg);
531 
532 	/* Assign the squeue to the specified CPU as well */
533 	mutex_enter(&cpu_lock);
534 	(void) ip_squeue_bind_ring(ill, rx_ring, mrfp->mrf_cpu_id);
535 	mutex_exit(&cpu_lock);
536 
537 	return (rx_ring);
538 }
539 
540 /*
541  * sanitize the squeue etc. Some of the processing
542  * needs to be done from inside the perimeter.
543  */
544 void
545 ip_squeue_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
546 {
547 	squeue_t *sqp;
548 
549 	ASSERT(ILL_MAC_PERIM_HELD(ill));
550 	ASSERT(rx_ring != NULL);
551 
552 	/* Just clean one squeue */
553 	mutex_enter(&ill->ill_lock);
554 	if (rx_ring->rr_ring_state == RR_FREE) {
555 		mutex_exit(&ill->ill_lock);
556 		return;
557 	}
558 	rx_ring->rr_ring_state = RR_FREE_INPROG;
559 	sqp = rx_ring->rr_sqp;
560 
561 	mutex_enter(&sqp->sq_lock);
562 	sqp->sq_state |= SQS_POLL_CLEANUP;
563 	cv_signal(&sqp->sq_worker_cv);
564 	mutex_exit(&ill->ill_lock);
565 	while (!(sqp->sq_state & SQS_POLL_CLEANUP_DONE))
566 		cv_wait(&sqp->sq_ctrlop_done_cv, &sqp->sq_lock);
567 	sqp->sq_state &= ~(SQS_POLL_CLEANUP_DONE | SQS_ILL_BOUND);
568 
569 	ASSERT(!(sqp->sq_state & (SQS_POLL_THR_CONTROL |
570 	    SQS_WORKER_THR_CONTROL | SQS_POLL_QUIESCE_DONE |
571 	    SQS_POLL_THR_QUIESCED)));
572 
573 	cv_signal(&sqp->sq_worker_cv);
574 	mutex_exit(&sqp->sq_lock);
575 
576 	/*
577 	 * Logically free the squeue. It goes back to the set of unused
578 	 * squeues
579 	 */
580 	mutex_enter(&sqset_lock);
581 	ip_squeue_set_move(sqp, sqset_global_list[0]);
582 	mutex_exit(&sqset_lock);
583 
584 	mutex_enter(&ill->ill_lock);
585 	rx_ring->rr_ring_state = RR_FREE;
586 	mutex_exit(&ill->ill_lock);
587 }
588 
589 /*
590  * Stop the squeue from polling. This needs to be done
591  * from inside the perimeter.
592  */
593 void
594 ip_squeue_quiesce_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
595 {
596 	squeue_t *sqp;
597 
598 	ASSERT(ILL_MAC_PERIM_HELD(ill));
599 	ASSERT(rx_ring != NULL);
600 
601 	sqp = rx_ring->rr_sqp;
602 	mutex_enter(&sqp->sq_lock);
603 	sqp->sq_state |= SQS_POLL_QUIESCE;
604 	cv_signal(&sqp->sq_worker_cv);
605 	while (!(sqp->sq_state & SQS_POLL_QUIESCE_DONE))
606 		cv_wait(&sqp->sq_ctrlop_done_cv, &sqp->sq_lock);
607 
608 	mutex_exit(&sqp->sq_lock);
609 }
610 
611 /*
612  * Restart polling etc. Needs to be inside the perimeter to
613  * prevent races.
614  */
615 void
616 ip_squeue_restart_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
617 {
618 	squeue_t *sqp;
619 
620 	ASSERT(ILL_MAC_PERIM_HELD(ill));
621 	ASSERT(rx_ring != NULL);
622 
623 	sqp = rx_ring->rr_sqp;
624 	mutex_enter(&sqp->sq_lock);
625 	/*
626 	 * Handle change in number of rings between the quiesce and
627 	 * restart operations by checking for a previous quiesce before
628 	 * attempting a restart.
629 	 */
630 	if (!(sqp->sq_state & SQS_POLL_QUIESCE_DONE)) {
631 		mutex_exit(&sqp->sq_lock);
632 		return;
633 	}
634 	sqp->sq_state |= SQS_POLL_RESTART;
635 	cv_signal(&sqp->sq_worker_cv);
636 	while (!(sqp->sq_state & SQS_POLL_RESTART_DONE))
637 		cv_wait(&sqp->sq_ctrlop_done_cv, &sqp->sq_lock);
638 	sqp->sq_state &= ~SQS_POLL_RESTART_DONE;
639 	mutex_exit(&sqp->sq_lock);
640 }
641 
642 /*
643  * sanitize all squeues associated with the ill.
644  */
645 void
646 ip_squeue_clean_all(ill_t *ill)
647 {
648 	int idx;
649 	ill_rx_ring_t	*rx_ring;
650 
651 	for (idx = 0; idx < ILL_MAX_RINGS; idx++) {
652 		rx_ring = &ill->ill_dld_capab->idc_poll.idp_ring_tbl[idx];
653 		ip_squeue_clean_ring(ill, rx_ring);
654 	}
655 }
656 
657 /*
658  * Used by IP to get the squeue associated with a ring. If the squeue isn't
659  * yet bound to a CPU, and we're being called directly from the NIC's
660  * interrupt, then we know what CPU we want to assign the squeue to, so
661  * dispatch that task to a taskq.
662  */
663 squeue_t *
664 ip_squeue_get(ill_rx_ring_t *ill_rx_ring)
665 {
666 	squeue_t 	*sqp;
667 
668 	if ((ill_rx_ring == NULL) || ((sqp = ill_rx_ring->rr_sqp) == NULL))
669 		return (IP_SQUEUE_GET(lbolt));
670 
671 	return (sqp);
672 }
673 
674 /*
675  * Called when a CPU goes offline. It's squeue_set_t is destroyed, and all
676  * squeues are unboudn and moved to the unbound set.
677  */
678 static void
679 ip_squeue_set_destroy(cpu_t *cpu)
680 {
681 	int i;
682 	squeue_t *sqp, *lastsqp = NULL;
683 	squeue_set_t *sqs, *unbound = sqset_global_list[0];
684 
685 	mutex_enter(&sqset_lock);
686 	if ((sqs = cpu->cpu_squeue_set) == NULL) {
687 		mutex_exit(&sqset_lock);
688 		return;
689 	}
690 
691 	/* Move all squeues to unbound set */
692 
693 	for (sqp = sqs->sqs_head; sqp; lastsqp = sqp, sqp = sqp->sq_next) {
694 		squeue_unbind(sqp);
695 		sqp->sq_set = unbound;
696 	}
697 	if (sqs->sqs_head) {
698 		lastsqp->sq_next = unbound->sqs_head;
699 		unbound->sqs_head = sqs->sqs_head;
700 	}
701 
702 	/* Also move default squeue to unbound set */
703 
704 	sqp = sqs->sqs_default;
705 	ASSERT(sqp);
706 	ASSERT((sqp->sq_state & (SQS_DEFAULT|SQS_ILL_BOUND)) == SQS_DEFAULT);
707 
708 	sqp->sq_next = unbound->sqs_head;
709 	unbound->sqs_head = sqp;
710 	squeue_unbind(sqp);
711 	sqp->sq_set = unbound;
712 
713 	for (i = 1; i < sqset_global_size; i++)
714 		if (sqset_global_list[i] == sqs)
715 			break;
716 
717 	ASSERT(i < sqset_global_size);
718 	sqset_global_list[i] = sqset_global_list[sqset_global_size - 1];
719 	sqset_global_list[sqset_global_size - 1] = NULL;
720 	sqset_global_size--;
721 
722 	mutex_exit(&sqset_lock);
723 	kmem_free(sqs, sizeof (*sqs));
724 }
725 
726 /*
727  * Reconfiguration callback
728  */
729 /* ARGSUSED */
730 static int
731 ip_squeue_cpu_setup(cpu_setup_t what, int id, void *arg)
732 {
733 	cpu_t *cp = cpu_get(id);
734 
735 	ASSERT(MUTEX_HELD(&cpu_lock));
736 	switch (what) {
737 	case CPU_CONFIG:
738 	case CPU_ON:
739 	case CPU_INIT:
740 	case CPU_CPUPART_IN:
741 		if (CPU_ISON(cp) && cp->cpu_squeue_set == NULL)
742 			cp->cpu_squeue_set = ip_squeue_set_create(cp->cpu_id);
743 		break;
744 	case CPU_UNCONFIG:
745 	case CPU_OFF:
746 	case CPU_CPUPART_OUT:
747 		if (cp->cpu_squeue_set != NULL) {
748 			ip_squeue_set_destroy(cp);
749 			cp->cpu_squeue_set = NULL;
750 		}
751 		break;
752 	default:
753 		break;
754 	}
755 	return (0);
756 }
757