xref: /titanic_51/usr/src/uts/common/inet/ip/ip_squeue.c (revision 261a51afbf7133d9f7c89f1388050677f56b7d1a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * IP interface to squeues.
30  *
31  * IP creates an squeue instance for each CPU. The squeue pointer is saved in
32  * cpu_squeue field of the cpu structure. Each squeue is associated with a
33  * connection instance (conn_t).
34  *
35  * For CPUs available at system startup time the squeue creation and association
36  * with CPU happens at MP initialization time. For CPUs added during dynamic
37  * reconfiguration, the initialization happens when the new CPU is configured in
38  * the system. The squeue is chosen using IP_SQUEUE_GET macro which will either
39  * return per-CPU squeue or random squeue based on the ip_squeue_fanout
40  * variable.
41  *
42  * There are two modes of associating connection with squeues. The first mode
43  * associates each connection with the CPU that creates the connection (either
44  * during open time or during accept time). The second mode associates each
45  * connection with a random CPU, effectively distributing load over all CPUs
46  * and all squeues in the system. The mode is controlled by the
47  * ip_squeue_fanout variable.
48  *
49  * NOTE: The fact that there is an association between each connection and
50  * squeue and squeue and CPU does not mean that each connection is always
51  * processed on this CPU and on this CPU only. Any thread calling squeue_enter()
52  * may process the connection on whatever CPU it is scheduled. The squeue to CPU
53  * binding is only relevant for the worker thread.
54  *
55  * The list of all created squeues is kept in squeue_set structure. This list is
56  * used when ip_squeue_fanout is set and the load is distributed across all
57  * squeues.
58  *
59  * INTERFACE:
60  *
61  * squeue_t *ip_squeue_get(hint)
62  *
63  * 	Find an squeue based on the 'hint' value. The hint is used as an index
64  * 	in the array of IP squeues available. The way hint is computed may
65  * 	affect the effectiveness of the squeue distribution. Currently squeues
66  * 	are assigned in round-robin fashion using lbolt as a hint.
67  *
68  *
69  * DR Notes
70  * ========
71  *
72  * The ip_squeue_init() registers a call-back function with the CPU DR
73  * subsystem using register_cpu_setup_func(). The call-back function does two
74  * things:
75  *
76  * o When the CPU is going off-line or unconfigured, the worker thread is
77  *	unbound from the CPU. This allows the CPU unconfig code to move it to
78  *	another CPU.
79  *
80  * o When the CPU is going online, it creates a new squeue for this CPU if
81  *	necessary and binds the squeue worker thread to this CPU.
82  *
83  * TUNABLES:
84  *
85  * ip_squeue_bind: if set to 1 each squeue worker thread is bound to the CPU
86  * 	associated with an squeue instance.
87  *
88  * ip_squeue_profile: if set to 1 squeue profiling is enabled. NOTE: squeue.c
89  *	should be compiled with SQUEUE_PROFILE enabled for this variable to have
90  *	an impact.
91  *
92  * ip_squeue_fanout: if set to 1 use ip_squeue_get() to find an squeue,
93  *	otherwise get it from CPU->cpu_squeue.
94  *
95  * ip_squeue_bind, ip_squeue_profile and ip_squeue_fanout can be accessed and
96  * changed using ndd on /dev/tcp or /dev/ip.
97  *
98  * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
99  *	created. This is the time squeue code waits before waking up the worker
100  *	thread after queuing a request.
101  */
102 
103 #include <sys/types.h>
104 #include <sys/debug.h>
105 #include <sys/kmem.h>
106 #include <sys/cpuvar.h>
107 
108 #include <sys/cmn_err.h>
109 
110 #include <inet/common.h>
111 #include <inet/ip.h>
112 #include <inet/ip_if.h>
113 #include <inet/nd.h>
114 #include <inet/ipclassifier.h>
115 #include <sys/types.h>
116 #include <sys/conf.h>
117 #include <sys/sunddi.h>
118 #include <sys/dlpi.h>
119 #include <sys/squeue_impl.h>
120 
121 /*
122  * We allow multiple NICs to bind to the same CPU but want to preserve 1 <-> 1
123  * mapping between squeue and NIC (or Rx ring) for performance reasons so
124  * each squeue can uniquely own a NIC or a Rx ring and do polling
125  * (PSARC 2004/630). So we allow up to  MAX_SQUEUES_PER_CPU squeues per CPU.
126  * We start by creating MIN_SQUEUES_PER_CPU squeues per CPU but more squeues
127  * can be created dynamically as needed.
128  */
129 #define	MAX_SQUEUES_PER_CPU	32
130 #define	MIN_SQUEUES_PER_CPU	1
131 uint_t	ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;
132 
133 #define	IP_NUM_SOFT_RINGS	2
134 uint_t ip_soft_rings_cnt = IP_NUM_SOFT_RINGS;
135 
136 /*
137  * List of all created squeue sets. The size is protected by cpu_lock
138  */
139 squeue_set_t	**sqset_global_list;
140 uint_t		sqset_global_size;
141 
142 int ip_squeue_bind = B_TRUE;
143 int ip_squeue_profile = B_TRUE;
144 static void (*ip_squeue_create_callback)(squeue_t *) = NULL;
145 
146 /*
147  * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
148  *	created. This is the time squeue code waits before waking up the worker
149  *	thread after queuing a request.
150  */
151 uint_t ip_squeue_worker_wait = 10;
152 
153 static squeue_set_t *ip_squeue_set_create(cpu_t *, boolean_t);
154 static int ip_squeue_cpu_setup(cpu_setup_t, int, void *);
155 
156 static void ip_squeue_set_bind(squeue_set_t *);
157 static void ip_squeue_set_unbind(squeue_set_t *);
158 static squeue_t *ip_find_unused_squeue(squeue_set_t *, cpu_t *, boolean_t);
159 
160 #define	CPU_ISON(c) (c != NULL && CPU_ACTIVE(c) && (c->cpu_flags & CPU_EXISTS))
161 
162 /*
163  * Create squeue set containing ip_squeues_per_cpu number of squeues
164  * for this CPU and bind them all to the CPU.
165  */
166 static squeue_set_t *
167 ip_squeue_set_create(cpu_t *cp, boolean_t reuse)
168 {
169 	int i;
170 	squeue_set_t	*sqs;
171 	squeue_t 	*sqp;
172 	char 		sqname[64];
173 	processorid_t 	id = cp->cpu_id;
174 
175 	if (reuse) {
176 		int i;
177 
178 		/*
179 		 * We may already have an squeue created for this CPU. Try to
180 		 * find one and reuse it if possible.
181 		 */
182 		for (i = 0; i < sqset_global_size; i++) {
183 			sqs = sqset_global_list[i];
184 			if (id == sqs->sqs_bind)
185 				return (sqs);
186 		}
187 	}
188 
189 	sqs = kmem_zalloc(sizeof (squeue_set_t) +
190 	    (sizeof (squeue_t *) * MAX_SQUEUES_PER_CPU), KM_SLEEP);
191 	mutex_init(&sqs->sqs_lock, NULL, MUTEX_DEFAULT, NULL);
192 	sqs->sqs_list = (squeue_t **)&sqs[1];
193 	sqs->sqs_max_size = MAX_SQUEUES_PER_CPU;
194 	sqs->sqs_bind = id;
195 
196 	for (i = 0; i < ip_squeues_per_cpu; i++) {
197 		bzero(sqname, sizeof (sqname));
198 
199 		(void) snprintf(sqname, sizeof (sqname),
200 		    "ip_squeue_cpu_%d/%d/%d", cp->cpu_seqid,
201 		    cp->cpu_id, i);
202 
203 		sqp = squeue_create(sqname, id, ip_squeue_worker_wait,
204 		    minclsyspri);
205 
206 		/*
207 		 * The first squeue in each squeue_set is the DEFAULT
208 		 * squeue.
209 		 */
210 		sqp->sq_state |= SQS_DEFAULT;
211 
212 		ASSERT(sqp != NULL);
213 
214 		squeue_profile_enable(sqp);
215 		sqs->sqs_list[sqs->sqs_size++] = sqp;
216 
217 		if (ip_squeue_create_callback != NULL)
218 			ip_squeue_create_callback(sqp);
219 	}
220 
221 	if (ip_squeue_bind && cpu_is_online(cp))
222 		ip_squeue_set_bind(sqs);
223 
224 	sqset_global_list[sqset_global_size++] = sqs;
225 	ASSERT(sqset_global_size <= NCPU);
226 	return (sqs);
227 }
228 
229 /*
230  * Initialize IP squeues.
231  */
232 void
233 ip_squeue_init(void (*callback)(squeue_t *))
234 {
235 	int i;
236 
237 	ASSERT(sqset_global_list == NULL);
238 
239 	if (ip_squeues_per_cpu < MIN_SQUEUES_PER_CPU)
240 		ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;
241 	else if (ip_squeues_per_cpu > MAX_SQUEUES_PER_CPU)
242 		ip_squeues_per_cpu = MAX_SQUEUES_PER_CPU;
243 
244 	ip_squeue_create_callback = callback;
245 	squeue_init();
246 	sqset_global_list =
247 	    kmem_zalloc(sizeof (squeue_set_t *) * NCPU, KM_SLEEP);
248 	sqset_global_size = 0;
249 	mutex_enter(&cpu_lock);
250 
251 	/* Create squeue for each active CPU available */
252 	for (i = 0; i < NCPU; i++) {
253 		cpu_t *cp = cpu[i];
254 		if (CPU_ISON(cp) && cp->cpu_squeue_set == NULL) {
255 			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_FALSE);
256 		}
257 	}
258 
259 	register_cpu_setup_func(ip_squeue_cpu_setup, NULL);
260 
261 	mutex_exit(&cpu_lock);
262 
263 	if (ip_squeue_profile)
264 		squeue_profile_start();
265 }
266 
267 /*
268  * Get squeue_t structure based on index.
269  * Since the squeue list can only grow, no need to grab any lock.
270  */
271 squeue_t *
272 ip_squeue_random(uint_t index)
273 {
274 	squeue_set_t *sqs;
275 
276 	sqs = sqset_global_list[index % sqset_global_size];
277 	return (sqs->sqs_list[index % sqs->sqs_size]);
278 }
279 
280 /* ARGSUSED */
281 void
282 ip_squeue_clean(void *arg1, mblk_t *mp, void *arg2)
283 {
284 	squeue_t	*sqp = arg2;
285 	ill_rx_ring_t	*ring = sqp->sq_rx_ring;
286 	ill_t		*ill;
287 
288 	ASSERT(sqp != NULL);
289 
290 	if (ring == NULL) {
291 		return;
292 	}
293 
294 	/*
295 	 * Clean up squeue
296 	 */
297 	mutex_enter(&sqp->sq_lock);
298 	sqp->sq_state &= ~(SQS_ILL_BOUND|SQS_POLL_CAPAB);
299 	sqp->sq_rx_ring = NULL;
300 	mutex_exit(&sqp->sq_lock);
301 
302 	ill = ring->rr_ill;
303 	if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
304 		ASSERT(ring->rr_handle != NULL);
305 		ill->ill_dls_capab->ill_dls_unbind(ring->rr_handle);
306 	}
307 
308 	/*
309 	 * Cleanup the ring
310 	 */
311 
312 	ring->rr_blank = NULL;
313 	ring->rr_handle = NULL;
314 	ring->rr_sqp = NULL;
315 
316 	/*
317 	 * Signal ill that cleanup is done
318 	 */
319 	mutex_enter(&ill->ill_lock);
320 	ring->rr_ring_state = ILL_RING_FREE;
321 	cv_signal(&ill->ill_cv);
322 	mutex_exit(&ill->ill_lock);
323 }
324 
325 typedef struct ip_taskq_arg {
326 	ill_t		*ip_taskq_ill;
327 	ill_rx_ring_t	*ip_taskq_ill_rx_ring;
328 	cpu_t		*ip_taskq_cpu;
329 } ip_taskq_arg_t;
330 
331 /*
332  * Do a Rx ring to squeue binding. Find a unique squeue that is not
333  * managing a receive ring. If no such squeue exists, dynamically
334  * create a new one in the squeue set.
335  *
336  * The function runs via the system taskq. The ill passed as an
337  * argument can't go away since we hold a ref. The lock order is
338  * ill_lock -> sqs_lock -> sq_lock.
339  *
340  * If we are binding a Rx ring to a squeue attached to the offline CPU,
341  * no need to check that because squeues are never destroyed once
342  * created.
343  */
344 /* ARGSUSED */
345 static void
346 ip_squeue_extend(void *arg)
347 {
348 	ip_taskq_arg_t	*sq_arg = (ip_taskq_arg_t *)arg;
349 	ill_t		*ill = sq_arg->ip_taskq_ill;
350 	ill_rx_ring_t	*ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
351 	cpu_t		*intr_cpu = sq_arg->ip_taskq_cpu;
352 	squeue_set_t 	*sqs;
353 	squeue_t 	*sqp = NULL;
354 
355 	ASSERT(ill != NULL);
356 	ASSERT(ill_rx_ring != NULL);
357 	kmem_free(arg, sizeof (ip_taskq_arg_t));
358 
359 	/*
360 	 * Make sure the CPU that originally took the interrupt still
361 	 * exists.
362 	 */
363 	if (!CPU_ISON(intr_cpu))
364 		intr_cpu = CPU;
365 
366 	sqs = intr_cpu->cpu_squeue_set;
367 
368 	/*
369 	 * If this ill represents link aggregation, then there might be
370 	 * multiple NICs trying to register them selves at the same time
371 	 * and in order to ensure that test and assignment of free rings
372 	 * is sequential, we need to hold the ill_lock.
373 	 */
374 	mutex_enter(&ill->ill_lock);
375 	sqp = ip_find_unused_squeue(sqs, intr_cpu, B_FALSE);
376 	if (sqp == NULL) {
377 		/*
378 		 * We hit the max limit of squeues allowed per CPU.
379 		 * Assign this rx_ring to DEFAULT squeue of the
380 		 * interrupted CPU but the squeue will not manage
381 		 * the ring. Also print a warning.
382 		 */
383 		cmn_err(CE_NOTE, "ip_squeue_extend: CPU/sqset = %d/%p already "
384 		    "has max number of squeues. System performance might "
385 		    "become suboptimal\n", sqs->sqs_bind, (void *)sqs);
386 
387 		/* the first squeue in the list is the default squeue */
388 		sqp = sqs->sqs_list[0];
389 		ASSERT(sqp != NULL);
390 		ill_rx_ring->rr_sqp = sqp;
391 		ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
392 
393 		mutex_exit(&ill->ill_lock);
394 		ill_waiter_dcr(ill);
395 		return;
396 	}
397 
398 	ASSERT(MUTEX_HELD(&sqp->sq_lock));
399 	sqp->sq_rx_ring = ill_rx_ring;
400 	ill_rx_ring->rr_sqp = sqp;
401 	ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
402 
403 	sqp->sq_state |= (SQS_ILL_BOUND|SQS_POLL_CAPAB);
404 	mutex_exit(&sqp->sq_lock);
405 
406 	mutex_exit(&ill->ill_lock);
407 
408 	/* ill_waiter_dcr will also signal any waiters on ill_ring_state */
409 	ill_waiter_dcr(ill);
410 }
411 
412 /*
413  * Do a Rx ring to squeue binding. Find a unique squeue that is not
414  * managing a receive ring. If no such squeue exists, dynamically
415  * create a new one in the squeue set.
416  *
417  * The function runs via the system taskq. The ill passed as an
418  * argument can't go away since we hold a ref. The lock order is
419  * ill_lock -> sqs_lock -> sq_lock.
420  *
421  * If we are binding a Rx ring to a squeue attached to the offline CPU,
422  * no need to check that because squeues are never destroyed once
423  * created.
424  */
425 /* ARGSUSED */
426 static void
427 ip_squeue_soft_ring_affinity(void *arg)
428 {
429 	ip_taskq_arg_t		*sq_arg = (ip_taskq_arg_t *)arg;
430 	ill_t			*ill = sq_arg->ip_taskq_ill;
431 	ill_dls_capab_t	*ill_soft_ring = ill->ill_dls_capab;
432 	ill_rx_ring_t		*ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
433 	cpu_t			*intr_cpu = sq_arg->ip_taskq_cpu;
434 	cpu_t			*bind_cpu;
435 	int			cpu_id = intr_cpu->cpu_id;
436 	int			min_cpu_id, max_cpu_id;
437 	boolean_t		enough_uniq_cpus = B_FALSE;
438 	boolean_t		enough_cpus = B_FALSE;
439 	squeue_set_t 		*sqs, *last_sqs;
440 	squeue_t 		*sqp = NULL;
441 	int			i, j;
442 
443 	ASSERT(ill != NULL);
444 	kmem_free(arg, sizeof (ip_taskq_arg_t));
445 
446 	/*
447 	 * Make sure the CPU that originally took the interrupt still
448 	 * exists.
449 	 */
450 	if (!CPU_ISON(intr_cpu)) {
451 		intr_cpu = CPU;
452 		cpu_id = intr_cpu->cpu_id;
453 	}
454 
455 	/*
456 	 * If this ill represents link aggregation, then there might be
457 	 * multiple NICs trying to register them selves at the same time
458 	 * and in order to ensure that test and assignment of free rings
459 	 * is sequential, we need to hold the ill_lock.
460 	 */
461 	mutex_enter(&ill->ill_lock);
462 
463 	if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
464 		mutex_exit(&ill->ill_lock);
465 		return;
466 	}
467 	/*
468 	 * We need to fanout the interrupts from the NIC. We do that by
469 	 * telling the driver underneath to create soft rings and use
470 	 * worker threads (if the driver advertized SOFT_RING capability)
471 	 * Its still a big performance win to if we can fanout to the
472 	 * threads on the same core that is taking interrupts.
473 	 *
474 	 * Since we don't know the interrupt to CPU binding, we don't
475 	 * assign any squeues or affinity to worker threads in the NIC.
476 	 * At the time of the first interrupt, we know which CPU is
477 	 * taking interrupts and try to find other threads on the same
478 	 * core. Assuming, ip_threads_per_cpu is correct and cpus are
479 	 * numbered sequentially for each core (XXX need something better
480 	 * than this in future), find the lowest number and highest
481 	 * number thread for that core.
482 	 *
483 	 * If we have one more thread per core than number of soft rings,
484 	 * then don't assign any worker threads to the H/W thread (cpu)
485 	 * taking interrupts (capability negotiation tries to ensure this)
486 	 *
487 	 * If the number of threads per core are same as the number of
488 	 * soft rings, then assign the worker affinity and squeue to
489 	 * the same cpu.
490 	 *
491 	 * Otherwise, just fanout to higher number CPUs starting from
492 	 * the interrupted CPU.
493 	 */
494 
495 	min_cpu_id = (cpu_id / ip_threads_per_cpu) * ip_threads_per_cpu;
496 	max_cpu_id = min_cpu_id + ip_threads_per_cpu;
497 
498 	/*
499 	 * Quickly check if there are enough CPUs present for fanout
500 	 * and also max_cpu_id is less than the id of the active CPU.
501 	 * We use the cpu_id stored in the last squeue_set to get
502 	 * an idea. The scheme is by no means perfect since it doesn't
503 	 * take into account CPU DR operations and the fact that
504 	 * interrupts themselves might change. An ideal scenario
505 	 * would be to ensure that interrupts run cpus by themselves
506 	 * and worker threads never have affinity to those CPUs. If
507 	 * the interrupts move to CPU which had a worker thread, it
508 	 * should be changed. Probably callbacks similar to CPU offline
509 	 * are needed to make it work perfectly.
510 	 */
511 	last_sqs = sqset_global_list[sqset_global_size - 1];
512 	if (ip_threads_per_cpu <= ncpus && max_cpu_id <= last_sqs->sqs_bind) {
513 		if ((max_cpu_id - min_cpu_id) >
514 		    ill_soft_ring->ill_dls_soft_ring_cnt)
515 			enough_uniq_cpus = B_TRUE;
516 		else if ((max_cpu_id - min_cpu_id) >=
517 		    ill_soft_ring->ill_dls_soft_ring_cnt)
518 			enough_cpus = B_TRUE;
519 	}
520 
521 	j = 0;
522 	for (i = 0; i < (ill_soft_ring->ill_dls_soft_ring_cnt + j); i++) {
523 		if (enough_uniq_cpus) {
524 			if ((min_cpu_id + i) == cpu_id) {
525 				j++;
526 				continue;
527 			}
528 			bind_cpu = cpu[min_cpu_id + i];
529 		} else if (enough_cpus) {
530 			bind_cpu = cpu[min_cpu_id + i];
531 		} else {
532 			/* bind_cpu = cpu[(cpu_id + i) % last_sqs->sqs_bind]; */
533 			bind_cpu = cpu[(cpu_id + i) % ncpus];
534 		}
535 
536 		/*
537 		 * Check if the CPU actually exist and active. If not,
538 		 * use the interrupted CPU. ip_find_unused_squeue() will
539 		 * find the right CPU to fanout anyway.
540 		 */
541 		if (!CPU_ISON(bind_cpu))
542 			bind_cpu = intr_cpu;
543 
544 		sqs = bind_cpu->cpu_squeue_set;
545 		ASSERT(sqs != NULL);
546 		ill_rx_ring = &ill_soft_ring->ill_ring_tbl[i - j];
547 
548 		sqp = ip_find_unused_squeue(sqs, bind_cpu, B_TRUE);
549 		if (sqp == NULL) {
550 			/*
551 			 * We hit the max limit of squeues allowed per CPU.
552 			 * Assign this rx_ring to DEFAULT squeue of the
553 			 * interrupted CPU but thesqueue will not manage
554 			 * the ring. Also print a warning.
555 			 */
556 			cmn_err(CE_NOTE, "ip_squeue_soft_ring: CPU/sqset = "
557 			    "%d/%p already has max number of squeues. System "
558 			    "performance might become suboptimal\n",
559 			    sqs->sqs_bind, (void *)sqs);
560 
561 			/* the first squeue in the list is the default squeue */
562 			sqp = intr_cpu->cpu_squeue_set->sqs_list[0];
563 			ASSERT(sqp != NULL);
564 
565 			ill_rx_ring->rr_sqp = sqp;
566 			ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
567 			continue;
568 
569 		}
570 		ASSERT(MUTEX_HELD(&sqp->sq_lock));
571 		ill_rx_ring->rr_sqp = sqp;
572 		sqp->sq_rx_ring = ill_rx_ring;
573 		ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
574 		sqp->sq_state |= SQS_ILL_BOUND;
575 
576 		/* assign affinity to soft ring */
577 		if (ip_squeue_bind && (sqp->sq_state & SQS_BOUND)) {
578 			ill_soft_ring->ill_dls_bind(ill_rx_ring->rr_handle,
579 			    sqp->sq_bind);
580 		}
581 		mutex_exit(&sqp->sq_lock);
582 	}
583 	mutex_exit(&ill->ill_lock);
584 
585 	ill_soft_ring->ill_dls_change_status(ill_soft_ring->ill_tx_handle,
586 	    SOFT_RING_FANOUT);
587 
588 	mutex_enter(&ill->ill_lock);
589 	ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
590 	mutex_exit(&ill->ill_lock);
591 
592 	/* ill_waiter_dcr will also signal any waiters on ill_ring_state */
593 	ill_waiter_dcr(ill);
594 }
595 
596 /* ARGSUSED */
597 void
598 ip_soft_ring_assignment(ill_t *ill, ill_rx_ring_t *ip_ring,
599     mblk_t *mp_chain, struct mac_header_info_s *mhip)
600 {
601 	ip_taskq_arg_t	*taskq_arg;
602 	boolean_t	refheld;
603 
604 	ASSERT(servicing_interrupt());
605 
606 	mutex_enter(&ill->ill_lock);
607 	if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
608 		taskq_arg = (ip_taskq_arg_t *)
609 		    kmem_zalloc(sizeof (ip_taskq_arg_t), KM_NOSLEEP);
610 
611 		if (taskq_arg == NULL)
612 			goto out;
613 
614 		taskq_arg->ip_taskq_ill = ill;
615 		taskq_arg->ip_taskq_ill_rx_ring = NULL;
616 		taskq_arg->ip_taskq_cpu = CPU;
617 
618 		/*
619 		 * Set ILL_SOFT_RING_ASSIGN flag. We don't want
620 		 * the next interrupt to schedule a task for calling
621 		 * ip_squeue_soft_ring_affinity();
622 		 */
623 		ill->ill_state_flags |= ILL_SOFT_RING_ASSIGN;
624 	} else {
625 		mutex_exit(&ill->ill_lock);
626 		goto out;
627 	}
628 	mutex_exit(&ill->ill_lock);
629 	refheld = ill_waiter_inc(ill);
630 	if (refheld) {
631 		if (taskq_dispatch(system_taskq,
632 		    ip_squeue_soft_ring_affinity, taskq_arg, TQ_NOSLEEP))
633 			goto out;
634 
635 		/* release ref on ill if taskq dispatch fails */
636 		ill_waiter_dcr(ill);
637 	}
638 	/*
639 	 * Turn on CAPAB_SOFT_RING so that affinity assignment
640 	 * can be tried again later.
641 	 */
642 	mutex_enter(&ill->ill_lock);
643 	ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
644 	mutex_exit(&ill->ill_lock);
645 	kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
646 
647 out:
648 	ip_input(ill, NULL, mp_chain, mhip);
649 }
650 
651 static squeue_t *
652 ip_find_unused_squeue(squeue_set_t *sqs, cpu_t *bind_cpu, boolean_t fanout)
653 {
654 	int 		i;
655 	squeue_set_t	*best_sqs = NULL;
656 	squeue_set_t	*curr_sqs = NULL;
657 	int		min_sq = 0;
658 	squeue_t 	*sqp = NULL;
659 	char		sqname[64];
660 
661 	/*
662 	 * If fanout is set and the passed squeue_set already has some
663 	 * squeues which are managing the NICs, try to find squeues on
664 	 * unused CPU.
665 	 */
666 	if (sqs->sqs_size > 1 && fanout) {
667 		/*
668 		 * First check to see if any squeue on the CPU passed
669 		 * is managing a NIC.
670 		 */
671 		for (i = 0; i < sqs->sqs_size; i++) {
672 			mutex_enter(&sqs->sqs_list[i]->sq_lock);
673 			if ((sqs->sqs_list[i]->sq_state & SQS_ILL_BOUND) &&
674 			    !(sqs->sqs_list[i]->sq_state & SQS_DEFAULT)) {
675 				mutex_exit(&sqs->sqs_list[i]->sq_lock);
676 				break;
677 			}
678 			mutex_exit(&sqs->sqs_list[i]->sq_lock);
679 		}
680 		if (i != sqs->sqs_size) {
681 			best_sqs = sqset_global_list[sqset_global_size - 1];
682 			min_sq = best_sqs->sqs_size;
683 
684 			for (i = sqset_global_size - 2; i >= 0; i--) {
685 				curr_sqs = sqset_global_list[i];
686 				if (curr_sqs->sqs_size < min_sq) {
687 					best_sqs = curr_sqs;
688 					min_sq = curr_sqs->sqs_size;
689 				}
690 			}
691 
692 			ASSERT(best_sqs != NULL);
693 			sqs = best_sqs;
694 			bind_cpu = cpu[sqs->sqs_bind];
695 		}
696 	}
697 
698 	mutex_enter(&sqs->sqs_lock);
699 
700 	for (i = 0; i < sqs->sqs_size; i++) {
701 		mutex_enter(&sqs->sqs_list[i]->sq_lock);
702 		if ((sqs->sqs_list[i]->sq_state &
703 		    (SQS_DEFAULT|SQS_ILL_BOUND)) == 0) {
704 			sqp = sqs->sqs_list[i];
705 			break;
706 		}
707 		mutex_exit(&sqs->sqs_list[i]->sq_lock);
708 	}
709 
710 	if (sqp == NULL) {
711 		/* Need to create a new squeue */
712 		if (sqs->sqs_size == sqs->sqs_max_size) {
713 			/*
714 			 * Reached the max limit for squeue
715 			 * we can allocate on this CPU.
716 			 */
717 			mutex_exit(&sqs->sqs_lock);
718 			return (NULL);
719 		}
720 
721 		bzero(sqname, sizeof (sqname));
722 		(void) snprintf(sqname, sizeof (sqname),
723 		    "ip_squeue_cpu_%d/%d/%d", bind_cpu->cpu_seqid,
724 		    bind_cpu->cpu_id, sqs->sqs_size);
725 
726 		sqp = squeue_create(sqname, bind_cpu->cpu_id,
727 		    ip_squeue_worker_wait, minclsyspri);
728 
729 		ASSERT(sqp != NULL);
730 
731 		squeue_profile_enable(sqp);
732 		sqs->sqs_list[sqs->sqs_size++] = sqp;
733 
734 		if (ip_squeue_create_callback != NULL)
735 			ip_squeue_create_callback(sqp);
736 
737 		mutex_enter(&cpu_lock);
738 		if (ip_squeue_bind && cpu_is_online(bind_cpu)) {
739 			squeue_bind(sqp, -1);
740 		}
741 		mutex_exit(&cpu_lock);
742 
743 		mutex_enter(&sqp->sq_lock);
744 	}
745 
746 	mutex_exit(&sqs->sqs_lock);
747 	ASSERT(sqp != NULL);
748 	return (sqp);
749 }
750 
751 /*
752  * Find the squeue assigned to manage this Rx ring. If the Rx ring is not
753  * owned by a squeue yet, do the assignment. When the NIC registers it
754  * Rx rings with IP, we don't know where the interrupts will land and
755  * hence we need to wait till this point to do the assignment.
756  */
757 squeue_t *
758 ip_squeue_get(ill_rx_ring_t *ill_rx_ring)
759 {
760 	squeue_t 	*sqp;
761 	ill_t 		*ill;
762 	int		interrupt;
763 	ip_taskq_arg_t	*taskq_arg;
764 	boolean_t	refheld;
765 
766 	if (ill_rx_ring == NULL)
767 		return (IP_SQUEUE_GET(lbolt));
768 
769 	sqp = ill_rx_ring->rr_sqp;
770 	/*
771 	 * Do a quick check. If it's not NULL, we are done.
772 	 * Squeues are never destroyed so worse we will bind
773 	 * this connection to a suboptimal squeue.
774 	 *
775 	 * This is the fast path case.
776 	 */
777 	if (sqp != NULL)
778 		return (sqp);
779 
780 	ill = ill_rx_ring->rr_ill;
781 	ASSERT(ill != NULL);
782 
783 	interrupt = servicing_interrupt();
784 	taskq_arg = (ip_taskq_arg_t *)kmem_zalloc(sizeof (ip_taskq_arg_t),
785 	    KM_NOSLEEP);
786 
787 	mutex_enter(&ill->ill_lock);
788 	/*
789 	 * Check sqp under the lock again for atomicity. Possible race with
790 	 * a previously scheduled ip_squeue_get -> ip_squeue_extend.
791 	 * Do the ring to squeue binding only if we are in interrupt context
792 	 * AND the ring is not already bound AND there is no one else trying
793 	 * the bind already.
794 	 */
795 	sqp = ill_rx_ring->rr_sqp;
796 	if (sqp != NULL || !interrupt ||
797 	    ill_rx_ring->rr_ring_state != ILL_RING_INUSE || taskq_arg == NULL) {
798 		/*
799 		 * Note that the ring might get bound once we drop the lock
800 		 * below, if a previous request is in progress i.e. if the ring
801 		 * state is ILL_RING_INPROC. The incoming connection on whose
802 		 * behalf we are currently here might get a suboptimal squeue
803 		 * via the call to IP_SQUEUE_GET below, but there is no
804 		 * correctness issue.
805 		 */
806 		mutex_exit(&ill->ill_lock);
807 		if (taskq_arg != NULL)
808 			kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
809 		if (sqp != NULL)
810 			return (sqp);
811 		return (IP_SQUEUE_GET(lbolt));
812 	}
813 
814 	/*
815 	 * No sqp assigned yet. Can't really do that in interrupt
816 	 * context. Assign the default sqp to this connection and
817 	 * trigger creation of new sqp and binding it to this ring
818 	 * via taskq. Need to make sure ill stays around.
819 	 */
820 	taskq_arg->ip_taskq_ill = ill;
821 	taskq_arg->ip_taskq_ill_rx_ring = ill_rx_ring;
822 	taskq_arg->ip_taskq_cpu = CPU;
823 	ill_rx_ring->rr_ring_state = ILL_RING_INPROC;
824 	mutex_exit(&ill->ill_lock);
825 	refheld = ill_waiter_inc(ill);
826 	if (refheld) {
827 		if (taskq_dispatch(system_taskq, ip_squeue_extend,
828 		    taskq_arg, TQ_NOSLEEP) != NULL) {
829 			return (IP_SQUEUE_GET(lbolt));
830 		}
831 	}
832 	/*
833 	 * The ill is closing and we could not get a reference on the ill OR
834 	 * taskq_dispatch failed probably due to memory allocation failure.
835 	 * We will try again next time.
836 	 */
837 	mutex_enter(&ill->ill_lock);
838 	ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
839 	mutex_exit(&ill->ill_lock);
840 	kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
841 	if (refheld)
842 		ill_waiter_dcr(ill);
843 
844 	return (IP_SQUEUE_GET(lbolt));
845 }
846 
847 /*
848  * NDD hooks for setting ip_squeue_xxx tuneables.
849  */
850 
851 /* ARGSUSED */
852 int
853 ip_squeue_bind_set(queue_t *q, mblk_t *mp, char *value,
854     caddr_t addr, cred_t *cr)
855 {
856 	int *bind_enabled = (int *)addr;
857 	long new_value;
858 	int i;
859 
860 	if (ddi_strtol(value, NULL, 10, &new_value) != 0)
861 		return (EINVAL);
862 
863 	if (ip_squeue_bind == new_value)
864 		return (0);
865 
866 	*bind_enabled = new_value;
867 	mutex_enter(&cpu_lock);
868 	if (new_value == 0) {
869 		for (i = 0; i < sqset_global_size; i++)
870 			ip_squeue_set_unbind(sqset_global_list[i]);
871 	} else {
872 		for (i = 0; i < sqset_global_size; i++)
873 			ip_squeue_set_bind(sqset_global_list[i]);
874 	}
875 
876 	mutex_exit(&cpu_lock);
877 	return (0);
878 }
879 
880 /*
881  * Set squeue profiling.
882  * 0 means "disable"
883  * 1 means "enable"
884  * 2 means "enable and reset"
885  */
886 /* ARGSUSED */
887 int
888 ip_squeue_profile_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
889     cred_t *cr)
890 {
891 	int *profile_enabled = (int *)cp;
892 	long new_value;
893 	squeue_set_t *sqs;
894 
895 	if (ddi_strtol(value, NULL, 10, &new_value) != 0)
896 		return (EINVAL);
897 
898 	if (new_value == 0)
899 		squeue_profile_stop();
900 	else if (new_value == 1)
901 		squeue_profile_start();
902 	else if (new_value == 2) {
903 		int i, j;
904 
905 		squeue_profile_stop();
906 		mutex_enter(&cpu_lock);
907 		for (i = 0; i < sqset_global_size; i++) {
908 			sqs = sqset_global_list[i];
909 			for (j = 0; j < sqs->sqs_size; j++) {
910 				squeue_profile_reset(sqs->sqs_list[j]);
911 			}
912 		}
913 		mutex_exit(&cpu_lock);
914 
915 		new_value = 1;
916 		squeue_profile_start();
917 	}
918 	*profile_enabled = new_value;
919 
920 	return (0);
921 }
922 
923 /*
924  * Reconfiguration callback
925  */
926 
927 /* ARGSUSED */
928 static int
929 ip_squeue_cpu_setup(cpu_setup_t what, int id, void *arg)
930 {
931 	cpu_t *cp = cpu[id];
932 
933 	ASSERT(MUTEX_HELD(&cpu_lock));
934 	switch (what) {
935 	case CPU_CONFIG:
936 		/*
937 		 * A new CPU is added. Create an squeue for it but do not bind
938 		 * it yet.
939 		 */
940 		if (cp->cpu_squeue_set == NULL)
941 			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
942 		break;
943 	case CPU_ON:
944 	case CPU_INIT:
945 	case CPU_CPUPART_IN:
946 		if (cp->cpu_squeue_set == NULL) {
947 			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
948 		}
949 		if (ip_squeue_bind)
950 			ip_squeue_set_bind(cp->cpu_squeue_set);
951 		break;
952 	case CPU_UNCONFIG:
953 	case CPU_OFF:
954 	case CPU_CPUPART_OUT:
955 		ASSERT((cp->cpu_squeue_set != NULL) ||
956 		    (cp->cpu_flags & CPU_OFFLINE));
957 
958 		if (cp->cpu_squeue_set != NULL) {
959 			ip_squeue_set_unbind(cp->cpu_squeue_set);
960 		}
961 		break;
962 	default:
963 		break;
964 	}
965 	return (0);
966 }
967 
968 /* ARGSUSED */
969 static void
970 ip_squeue_set_bind(squeue_set_t *sqs)
971 {
972 	int i;
973 	squeue_t *sqp;
974 
975 	if (!ip_squeue_bind)
976 		return;
977 
978 	mutex_enter(&sqs->sqs_lock);
979 	for (i = 0; i < sqs->sqs_size; i++) {
980 		sqp = sqs->sqs_list[i];
981 		if (sqp->sq_state & SQS_BOUND)
982 			continue;
983 		squeue_bind(sqp, -1);
984 	}
985 	mutex_exit(&sqs->sqs_lock);
986 }
987 
988 static void
989 ip_squeue_set_unbind(squeue_set_t *sqs)
990 {
991 	int i;
992 	squeue_t *sqp;
993 
994 	mutex_enter(&sqs->sqs_lock);
995 	for (i = 0; i < sqs->sqs_size; i++) {
996 		sqp = sqs->sqs_list[i];
997 
998 		/*
999 		 * CPU is going offline. Remove the thread affinity
1000 		 * for any soft ring threads the squeue is managing.
1001 		 */
1002 		if (sqp->sq_state & SQS_ILL_BOUND) {
1003 			ill_rx_ring_t	*ring = sqp->sq_rx_ring;
1004 			ill_t		*ill = ring->rr_ill;
1005 
1006 			if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
1007 				ASSERT(ring->rr_handle != NULL);
1008 				ill->ill_dls_capab->ill_dls_unbind(
1009 					ring->rr_handle);
1010 			}
1011 		}
1012 		if (!(sqp->sq_state & SQS_BOUND))
1013 			continue;
1014 		squeue_unbind(sqp);
1015 	}
1016 	mutex_exit(&sqs->sqs_lock);
1017 }
1018