xref: /illumos-gate/usr/src/uts/common/inet/ip/ip_squeue.c (revision a07094369b21309434206d9b3601d162693466fc)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * IP interface to squeues.
31  *
32  * IP creates an squeue instance for each CPU. The squeue pointer is saved in
33  * cpu_squeue field of the cpu structure. Each squeue is associated with a
34  * connection instance (conn_t).
35  *
36  * For CPUs available at system startup time the squeue creation and association
37  * with CPU happens at MP initialization time. For CPUs added during dynamic
38  * reconfiguration, the initialization happens when the new CPU is configured in
39  * the system. The squeue is chosen using IP_SQUEUE_GET macro which will either
40  * return per-CPU squeue or random squeue based on the ip_squeue_fanout
41  * variable.
42  *
43  * There are two modes of associating connection with squeues. The first mode
44  * associates each connection with the CPU that creates the connection (either
45  * during open time or during accept time). The second mode associates each
46  * connection with a random CPU, effectively distributing load over all CPUs
47  * and all squeues in the system. The mode is controlled by the
48  * ip_squeue_fanout variable.
49  *
50  * NOTE: The fact that there is an association between each connection and
51  * squeue and squeue and CPU does not mean that each connection is always
52  * processed on this CPU and on this CPU only. Any thread calling squeue_enter()
53  * may process the connection on whatever CPU it is scheduled. The squeue to CPU
54  * binding is only relevant for the worker thread.
55  *
56  * The list of all created squeues is kept in squeue_set structure. This list is
57  * used when ip_squeue_fanout is set and the load is distributed across all
58  * squeues.
59  *
60  * INTERFACE:
61  *
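 * squeue_t *ip_squeue_random(index)
 *
 * 	Find an squeue based on the 'index' value. The index is used (modulo
 * 	the respective sizes) to pick first an squeue set and then an squeue
 * 	within that set. The way the index is computed may affect the
 * 	effectiveness of the squeue distribution.
 *
 * squeue_t *ip_squeue_get(ill_rx_ring)
 *
 * 	Find the squeue assigned to an Rx ring, falling back to
 * 	IP_SQUEUE_GET(lbolt) when the ring has no squeue yet. Currently squeues
 * 	are assigned in round-robin fashion using lbolt as a hint.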
68  *
69  *
70  * DR Notes
71  * ========
72  *
73  * The ip_squeue_init() registers a call-back function with the CPU DR
74  * subsystem using register_cpu_setup_func(). The call-back function does two
75  * things:
76  *
77  * o When the CPU is going off-line or unconfigured, the worker thread is
78  *	unbound from the CPU. This allows the CPU unconfig code to move it to
79  *	another CPU.
80  *
81  * o When the CPU is going online, it creates a new squeue for this CPU if
82  *	necessary and binds the squeue worker thread to this CPU.
83  *
 * TUNABLES:
85  *
86  * ip_squeue_bind: if set to 1 each squeue worker thread is bound to the CPU
87  * 	associated with an squeue instance.
88  *
89  * ip_squeue_profile: if set to 1 squeue profiling is enabled. NOTE: squeue.c
90  *	should be compiled with SQUEUE_PROFILE enabled for this variable to have
91  *	an impact.
92  *
93  * ip_squeue_fanout: if set to 1 use ip_squeue_get() to find an squeue,
94  *	otherwise get it from CPU->cpu_squeue.
95  *
96  * ip_squeue_bind, ip_squeue_profile and ip_squeue_fanout can be accessed and
97  * changed using ndd on /dev/tcp or /dev/ip.
98  *
99  * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
100  *	created. This is the time squeue code waits before waking up the worker
101  *	thread after queuing a request.
102  */
103 
104 #include <sys/types.h>
105 #include <sys/debug.h>
106 #include <sys/kmem.h>
107 #include <sys/cpuvar.h>
108 
109 #include <sys/cmn_err.h>
110 
111 #include <inet/common.h>
112 #include <inet/ip.h>
113 #include <inet/ip_if.h>
114 #include <inet/mi.h>
115 #include <inet/nd.h>
116 #include <inet/ipclassifier.h>
117 #include <sys/types.h>
118 #include <sys/conf.h>
119 #include <sys/sunddi.h>
120 #include <sys/ddi.h>
121 #include <sys/squeue_impl.h>
122 
123 
124 /*
125  * We allow multiple NICs to bind to the same CPU but want to preserve 1 <-> 1
126  * mapping between squeue and NIC (or Rx ring) for performance reasons so
127  * each squeue can uniquely own a NIC or a Rx ring and do polling
128  * (PSARC 2004/630). So we allow up to  MAX_SQUEUES_PER_CPU squeues per CPU.
129  * We start by creating MIN_SQUEUES_PER_CPU squeues per CPU but more squeues
130  * can be created dynamically as needed.
131  */
/* Capacity of each squeue set's sqs_list[] (see ip_squeue_set_create()). */
#define	MAX_SQUEUES_PER_CPU	32
#define	MIN_SQUEUES_PER_CPU	1
/*
 * Number of squeues created up front for each CPU. ip_squeue_init() clamps
 * this to the [MIN_SQUEUES_PER_CPU, MAX_SQUEUES_PER_CPU] range.
 */
uint_t	ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;

#define	IP_NUM_SOFT_RINGS	2
/* Not referenced again in this file; presumably consumed elsewhere in IP. */
uint_t ip_soft_rings_cnt = IP_NUM_SOFT_RINGS;

/*
 * List of all created squeue sets. The size is protected by cpu_lock.
 * The array itself is allocated with NCPU slots by ip_squeue_init().
 */
squeue_set_t	**sqset_global_list;
uint_t		sqset_global_size;

/* Non-zero: bind squeue worker threads to the CPU of their squeue set. */
int ip_squeue_bind = B_TRUE;
/* Non-zero: ip_squeue_init() starts squeue profiling. */
int ip_squeue_profile = B_TRUE;
/* Optional hook invoked on every squeue created here; set by ip_squeue_init */
static void (*ip_squeue_create_callback)(squeue_t *) = NULL;

/*
 * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
 *	created. This is the time squeue code waits before waking up the worker
 *	thread after queuing a request.
 */
uint_t ip_squeue_worker_wait = 10;

static squeue_set_t *ip_squeue_set_create(cpu_t *, boolean_t);
static int ip_squeue_cpu_setup(cpu_setup_t, int, void *);

static void ip_squeue_set_bind(squeue_set_t *);
static void ip_squeue_set_unbind(squeue_set_t *);
static squeue_t *ip_find_unused_squeue(squeue_set_t *, cpu_t *, boolean_t);
162 
/*
 * Evaluates to non-zero when 'c' is a non-NULL cpu pointer for which
 * CPU_ACTIVE() holds and whose cpu_flags has CPU_EXISTS set.
 *
 * Every use of the argument is parenthesized so that expression arguments
 * (e.g. "cp + 1") expand correctly. Note that 'c' is still evaluated more
 * than once, so it must not have side effects.
 */
#define	CPU_ISON(c) \
	((c) != NULL && CPU_ACTIVE(c) && ((c)->cpu_flags & CPU_EXISTS))
164 
165 /*
166  * Create squeue set containing ip_squeues_per_cpu number of squeues
167  * for this CPU and bind them all to the CPU.
168  */
169 static squeue_set_t *
170 ip_squeue_set_create(cpu_t *cp, boolean_t reuse)
171 {
172 	int i;
173 	squeue_set_t	*sqs;
174 	squeue_t 	*sqp;
175 	char 		sqname[64];
176 	processorid_t 	id = cp->cpu_id;
177 
178 	if (reuse) {
179 		int i;
180 
181 		/*
182 		 * We may already have an squeue created for this CPU. Try to
183 		 * find one and reuse it if possible.
184 		 */
185 		for (i = 0; i < sqset_global_size; i++) {
186 			sqs = sqset_global_list[i];
187 			if (id == sqs->sqs_bind)
188 				return (sqs);
189 		}
190 	}
191 
192 	sqs = kmem_zalloc(sizeof (squeue_set_t) +
193 	    (sizeof (squeue_t *) * MAX_SQUEUES_PER_CPU), KM_SLEEP);
194 	mutex_init(&sqs->sqs_lock, NULL, MUTEX_DEFAULT, NULL);
195 	sqs->sqs_list = (squeue_t **)&sqs[1];
196 	sqs->sqs_max_size = MAX_SQUEUES_PER_CPU;
197 	sqs->sqs_bind = id;
198 
199 	for (i = 0; i < ip_squeues_per_cpu; i++) {
200 		bzero(sqname, sizeof (sqname));
201 
202 		(void) snprintf(sqname, sizeof (sqname),
203 		    "ip_squeue_cpu_%d/%d/%d", cp->cpu_seqid,
204 		    cp->cpu_id, i);
205 
206 		sqp = squeue_create(sqname, id, ip_squeue_worker_wait,
207 		    minclsyspri);
208 
209 		/*
210 		 * The first squeue in each squeue_set is the DEFAULT
211 		 * squeue.
212 		 */
213 		sqp->sq_state |= SQS_DEFAULT;
214 
215 		ASSERT(sqp != NULL);
216 
217 		squeue_profile_enable(sqp);
218 		sqs->sqs_list[sqs->sqs_size++] = sqp;
219 
220 		if (ip_squeue_create_callback != NULL)
221 			ip_squeue_create_callback(sqp);
222 	}
223 
224 	if (ip_squeue_bind && cpu_is_online(cp))
225 		ip_squeue_set_bind(sqs);
226 
227 	sqset_global_list[sqset_global_size++] = sqs;
228 	ASSERT(sqset_global_size <= NCPU);
229 	return (sqs);
230 }
231 
232 /*
233  * Initialize IP squeues.
234  */
235 void
236 ip_squeue_init(void (*callback)(squeue_t *))
237 {
238 	int i;
239 
240 	ASSERT(sqset_global_list == NULL);
241 
242 	if (ip_squeues_per_cpu < MIN_SQUEUES_PER_CPU)
243 		ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;
244 	else if (ip_squeues_per_cpu > MAX_SQUEUES_PER_CPU)
245 		ip_squeues_per_cpu = MAX_SQUEUES_PER_CPU;
246 
247 	ip_squeue_create_callback = callback;
248 	squeue_init();
249 	sqset_global_list =
250 	    kmem_zalloc(sizeof (squeue_set_t *) * NCPU, KM_SLEEP);
251 	sqset_global_size = 0;
252 	mutex_enter(&cpu_lock);
253 
254 	/* Create squeue for each active CPU available */
255 	for (i = 0; i < NCPU; i++) {
256 		cpu_t *cp = cpu[i];
257 		if (CPU_ISON(cp) && cp->cpu_squeue_set == NULL) {
258 			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_FALSE);
259 		}
260 	}
261 
262 	register_cpu_setup_func(ip_squeue_cpu_setup, NULL);
263 
264 	mutex_exit(&cpu_lock);
265 
266 	if (ip_squeue_profile)
267 		squeue_profile_start();
268 }
269 
270 /*
271  * Get squeue_t structure based on index.
272  * Since the squeue list can only grow, no need to grab any lock.
273  */
274 squeue_t *
275 ip_squeue_random(uint_t index)
276 {
277 	squeue_set_t *sqs;
278 
279 	sqs = sqset_global_list[index % sqset_global_size];
280 	return (sqs->sqs_list[index % sqs->sqs_size]);
281 }
282 
283 /* ARGSUSED */
284 void
285 ip_squeue_clean(void *arg1, mblk_t *mp, void *arg2)
286 {
287 	squeue_t	*sqp = arg2;
288 	ill_rx_ring_t	*ring = sqp->sq_rx_ring;
289 	ill_t		*ill;
290 
291 	ASSERT(sqp != NULL);
292 
293 	if (ring == NULL) {
294 		return;
295 	}
296 
297 	/*
298 	 * Clean up squeue
299 	 */
300 	mutex_enter(&sqp->sq_lock);
301 	sqp->sq_state &= ~(SQS_ILL_BOUND|SQS_POLL_CAPAB);
302 	sqp->sq_rx_ring = NULL;
303 	mutex_exit(&sqp->sq_lock);
304 
305 	ill = ring->rr_ill;
306 	if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
307 		ASSERT(ring->rr_handle != NULL);
308 		ill->ill_dls_capab->ill_dls_unbind(ring->rr_handle);
309 	}
310 
311 	/*
312 	 * Cleanup the ring
313 	 */
314 
315 	ring->rr_blank = NULL;
316 	ring->rr_handle = NULL;
317 	ring->rr_sqp = NULL;
318 
319 	/*
320 	 * Signal ill that cleanup is done
321 	 */
322 	mutex_enter(&ill->ill_lock);
323 	ring->rr_ring_state = ILL_RING_FREE;
324 	cv_signal(&ill->ill_cv);
325 	mutex_exit(&ill->ill_lock);
326 }
327 
/*
 * Argument block passed to the taskq callbacks ip_squeue_extend() and
 * ip_squeue_soft_ring_affinity(). It is allocated by the dispatching code
 * (ip_squeue_get() / ip_soft_ring_assignment()) and freed by the callback.
 */
typedef struct ip_taskq_arg {
	ill_t		*ip_taskq_ill;		/* ill the work is for */
	ill_rx_ring_t	*ip_taskq_ill_rx_ring;	/* ring to bind; may be NULL */
	cpu_t		*ip_taskq_cpu;		/* CPU at dispatch time */
} ip_taskq_arg_t;
333 
334 /*
335  * Do a Rx ring to squeue binding. Find a unique squeue that is not
336  * managing a receive ring. If no such squeue exists, dynamically
337  * create a new one in the squeue set.
338  *
339  * The function runs via the system taskq. The ill passed as an
340  * argument can't go away since we hold a ref. The lock order is
341  * ill_lock -> sqs_lock -> sq_lock.
342  *
343  * If we are binding a Rx ring to a squeue attached to the offline CPU,
344  * no need to check that because squeues are never destroyed once
345  * created.
346  */
347 /* ARGSUSED */
348 static void
349 ip_squeue_extend(void *arg)
350 {
351 	ip_taskq_arg_t	*sq_arg = (ip_taskq_arg_t *)arg;
352 	ill_t		*ill = sq_arg->ip_taskq_ill;
353 	ill_rx_ring_t	*ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
354 	cpu_t		*intr_cpu = sq_arg->ip_taskq_cpu;
355 	squeue_set_t 	*sqs;
356 	squeue_t 	*sqp = NULL;
357 
358 	ASSERT(ill != NULL);
359 	ASSERT(ill_rx_ring != NULL);
360 	kmem_free(arg, sizeof (ip_taskq_arg_t));
361 
362 	/*
363 	 * Make sure the CPU that originally took the interrupt still
364 	 * exists.
365 	 */
366 	if (!CPU_ISON(intr_cpu))
367 		intr_cpu = CPU;
368 
369 	sqs = intr_cpu->cpu_squeue_set;
370 
371 	/*
372 	 * If this ill represents link aggregation, then there might be
373 	 * multiple NICs trying to register them selves at the same time
374 	 * and in order to ensure that test and assignment of free rings
375 	 * is sequential, we need to hold the ill_lock.
376 	 */
377 	mutex_enter(&ill->ill_lock);
378 	sqp = ip_find_unused_squeue(sqs, intr_cpu, B_FALSE);
379 	if (sqp == NULL) {
380 		/*
381 		 * We hit the max limit of squeues allowed per CPU.
382 		 * Assign this rx_ring to DEFAULT squeue of the
383 		 * interrupted CPU but the squeue will not manage
384 		 * the ring. Also print a warning.
385 		 */
386 		cmn_err(CE_NOTE, "ip_squeue_extend: CPU/sqset = %d/%p already "
387 		    "has max number of squeues. System performance might "
388 		    "become suboptimal\n", sqs->sqs_bind, (void *)sqs);
389 
390 		/* the first squeue in the list is the default squeue */
391 		sqp = sqs->sqs_list[0];
392 		ASSERT(sqp != NULL);
393 		ill_rx_ring->rr_sqp = sqp;
394 		ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
395 
396 		mutex_exit(&ill->ill_lock);
397 		ill_waiter_dcr(ill);
398 		return;
399 	}
400 
401 	ASSERT(MUTEX_HELD(&sqp->sq_lock));
402 	sqp->sq_rx_ring = ill_rx_ring;
403 	ill_rx_ring->rr_sqp = sqp;
404 	ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
405 
406 	sqp->sq_state |= (SQS_ILL_BOUND|SQS_POLL_CAPAB);
407 	mutex_exit(&sqp->sq_lock);
408 
409 	mutex_exit(&ill->ill_lock);
410 
411 	/* ill_waiter_dcr will also signal any waiters on ill_ring_state */
412 	ill_waiter_dcr(ill);
413 }
414 
415 /*
416  * Do a Rx ring to squeue binding. Find a unique squeue that is not
417  * managing a receive ring. If no such squeue exists, dynamically
418  * create a new one in the squeue set.
419  *
420  * The function runs via the system taskq. The ill passed as an
421  * argument can't go away since we hold a ref. The lock order is
422  * ill_lock -> sqs_lock -> sq_lock.
423  *
424  * If we are binding a Rx ring to a squeue attached to the offline CPU,
425  * no need to check that because squeues are never destroyed once
426  * created.
427  */
428 /* ARGSUSED */
429 static void
430 ip_squeue_soft_ring_affinity(void *arg)
431 {
432 	ip_taskq_arg_t		*sq_arg = (ip_taskq_arg_t *)arg;
433 	ill_t			*ill = sq_arg->ip_taskq_ill;
434 	ill_dls_capab_t	*ill_soft_ring = ill->ill_dls_capab;
435 	ill_rx_ring_t		*ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
436 	cpu_t			*intr_cpu = sq_arg->ip_taskq_cpu;
437 	cpu_t			*bind_cpu;
438 	int			cpu_id = intr_cpu->cpu_id;
439 	int			min_cpu_id, max_cpu_id;
440 	boolean_t		enough_uniq_cpus = B_FALSE;
441 	boolean_t		enough_cpus = B_FALSE;
442 	squeue_set_t 		*sqs, *last_sqs;
443 	squeue_t 		*sqp = NULL;
444 	int			i, j;
445 
446 	ASSERT(ill != NULL);
447 	kmem_free(arg, sizeof (ip_taskq_arg_t));
448 
449 	/*
450 	 * Make sure the CPU that originally took the interrupt still
451 	 * exists.
452 	 */
453 	if (!CPU_ISON(intr_cpu)) {
454 		intr_cpu = CPU;
455 		cpu_id = intr_cpu->cpu_id;
456 	}
457 
458 	/*
459 	 * If this ill represents link aggregation, then there might be
460 	 * multiple NICs trying to register them selves at the same time
461 	 * and in order to ensure that test and assignment of free rings
462 	 * is sequential, we need to hold the ill_lock.
463 	 */
464 	mutex_enter(&ill->ill_lock);
465 
466 	if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
467 		mutex_exit(&ill->ill_lock);
468 		return;
469 	}
470 	/*
471 	 * We need to fanout the interrupts from the NIC. We do that by
472 	 * telling the driver underneath to create soft rings and use
473 	 * worker threads (if the driver advertized SOFT_RING capability)
474 	 * Its still a big performance win to if we can fanout to the
475 	 * threads on the same core that is taking interrupts.
476 	 *
477 	 * Since we don't know the interrupt to CPU binding, we don't
478 	 * assign any squeues or affinity to worker threads in the NIC.
479 	 * At the time of the first interrupt, we know which CPU is
480 	 * taking interrupts and try to find other threads on the same
481 	 * core. Assuming, ip_threads_per_cpu is correct and cpus are
482 	 * numbered sequentially for each core (XXX need something better
483 	 * than this in future), find the lowest number and highest
484 	 * number thread for that core.
485 	 *
486 	 * If we have one more thread per core than number of soft rings,
487 	 * then don't assign any worker threads to the H/W thread (cpu)
488 	 * taking interrupts (capability negotiation tries to ensure this)
489 	 *
490 	 * If the number of threads per core are same as the number of
491 	 * soft rings, then assign the worker affinity and squeue to
492 	 * the same cpu.
493 	 *
494 	 * Otherwise, just fanout to higher number CPUs starting from
495 	 * the interrupted CPU.
496 	 */
497 
498 	min_cpu_id = (cpu_id / ip_threads_per_cpu) * ip_threads_per_cpu;
499 	max_cpu_id = min_cpu_id + ip_threads_per_cpu;
500 
501 	cmn_err(CE_CONT, "soft_ring_affinity: min/max/intr = %d/%d/%d\n",
502 	    min_cpu_id, max_cpu_id, (int)intr_cpu->cpu_id);
503 
504 	/*
505 	 * Quickly check if there are enough CPUs present for fanout
506 	 * and also max_cpu_id is less than the id of the active CPU.
507 	 * We use the cpu_id stored in the last squeue_set to get
508 	 * an idea. The scheme is by no means perfect since it doesn't
509 	 * take into account CPU DR operations and the fact that
510 	 * interrupts themselves might change. An ideal scenario
511 	 * would be to ensure that interrupts run cpus by themselves
512 	 * and worker threads never have affinity to those CPUs. If
513 	 * the interrupts move to CPU which had a worker thread, it
514 	 * should be changed. Probably callbacks similar to CPU offline
515 	 * are needed to make it work perfectly.
516 	 */
517 	last_sqs = sqset_global_list[sqset_global_size - 1];
518 	if (ip_threads_per_cpu <= ncpus && max_cpu_id <= last_sqs->sqs_bind) {
519 		if ((max_cpu_id - min_cpu_id) >
520 		    ill_soft_ring->ill_dls_soft_ring_cnt)
521 			enough_uniq_cpus = B_TRUE;
522 		else if ((max_cpu_id - min_cpu_id) >=
523 		    ill_soft_ring->ill_dls_soft_ring_cnt)
524 			enough_cpus = B_TRUE;
525 	}
526 
527 	j = 0;
528 	for (i = 0; i < (ill_soft_ring->ill_dls_soft_ring_cnt + j); i++) {
529 		if (enough_uniq_cpus) {
530 			if ((min_cpu_id + i) == cpu_id) {
531 				j++;
532 				continue;
533 			}
534 			bind_cpu = cpu[min_cpu_id + i];
535 		} else if (enough_cpus) {
536 			bind_cpu = cpu[min_cpu_id + i];
537 		} else {
538 			/* bind_cpu = cpu[(cpu_id + i) % last_sqs->sqs_bind]; */
539 			bind_cpu = cpu[(cpu_id + i) % ncpus];
540 		}
541 
542 		/*
543 		 * Check if the CPU actually exist and active. If not,
544 		 * use the interrupted CPU. ip_find_unused_squeue() will
545 		 * find the right CPU to fanout anyway.
546 		 */
547 		if (!CPU_ISON(bind_cpu))
548 			bind_cpu = intr_cpu;
549 
550 		sqs = bind_cpu->cpu_squeue_set;
551 		ASSERT(sqs != NULL);
552 		ill_rx_ring = &ill_soft_ring->ill_ring_tbl[i - j];
553 
554 		sqp = ip_find_unused_squeue(sqs, bind_cpu, B_TRUE);
555 		if (sqp == NULL) {
556 			/*
557 			 * We hit the max limit of squeues allowed per CPU.
558 			 * Assign this rx_ring to DEFAULT squeue of the
559 			 * interrupted CPU but thesqueue will not manage
560 			 * the ring. Also print a warning.
561 			 */
562 			cmn_err(CE_NOTE, "ip_squeue_soft_ring: CPU/sqset = "
563 			    "%d/%p already has max number of squeues. System "
564 			    "performance might become suboptimal\n",
565 			    sqs->sqs_bind, (void *)sqs);
566 
567 			/* the first squeue in the list is the default squeue */
568 			sqp = intr_cpu->cpu_squeue_set->sqs_list[0];
569 			ASSERT(sqp != NULL);
570 
571 			ill_rx_ring->rr_sqp = sqp;
572 			ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
573 			continue;
574 
575 		}
576 		ASSERT(MUTEX_HELD(&sqp->sq_lock));
577 		ill_rx_ring->rr_sqp = sqp;
578 		sqp->sq_rx_ring = ill_rx_ring;
579 		ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
580 		sqp->sq_state |= SQS_ILL_BOUND;
581 
582 		/* assign affinity to soft ring */
583 		if (ip_squeue_bind && (sqp->sq_state & SQS_BOUND)) {
584 			ill_soft_ring->ill_dls_bind(ill_rx_ring->rr_handle,
585 			    sqp->sq_bind);
586 		}
587 		mutex_exit(&sqp->sq_lock);
588 
589 		cmn_err(CE_CONT, "soft_ring_affinity: ring = %d, bind = %d\n",
590 		    i - j, sqp->sq_bind);
591 	}
592 	mutex_exit(&ill->ill_lock);
593 
594 	ill_soft_ring->ill_dls_change_status(ill_soft_ring->ill_tx_handle,
595 	    SOFT_RING_SRC_HASH);
596 
597 	/* ill_waiter_dcr will also signal any waiters on ill_ring_state */
598 	ill_waiter_dcr(ill);
599 }
600 
601 void
602 ip_soft_ring_assignment(ill_t *ill, ill_rx_ring_t *ip_ring,
603 mblk_t *mp_chain, size_t hdrlen)
604 {
605 	ip_taskq_arg_t	*taskq_arg;
606 	boolean_t	refheld;
607 
608 	ASSERT(servicing_interrupt());
609 	ASSERT(ip_ring == NULL);
610 
611 	mutex_enter(&ill->ill_lock);
612 	if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
613 		taskq_arg = (ip_taskq_arg_t *)
614 		    kmem_zalloc(sizeof (ip_taskq_arg_t), KM_NOSLEEP);
615 
616 		if (taskq_arg == NULL)
617 			goto out;
618 
619 		taskq_arg->ip_taskq_ill = ill;
620 		taskq_arg->ip_taskq_ill_rx_ring = ip_ring;
621 		taskq_arg->ip_taskq_cpu = CPU;
622 
623 		/*
624 		 * Set ILL_SOFT_RING_ASSIGN flag. We don't want
625 		 * the next interrupt to schedule a task for calling
626 		 * ip_squeue_soft_ring_affinity();
627 		 */
628 		ill->ill_state_flags |= ILL_SOFT_RING_ASSIGN;
629 	} else {
630 		mutex_exit(&ill->ill_lock);
631 		goto out;
632 	}
633 	mutex_exit(&ill->ill_lock);
634 	refheld = ill_waiter_inc(ill);
635 	if (refheld) {
636 		if (taskq_dispatch(system_taskq,
637 		    ip_squeue_soft_ring_affinity, taskq_arg, TQ_NOSLEEP))
638 			goto out;
639 
640 		/* release ref on ill if taskq dispatch fails */
641 		ill_waiter_dcr(ill);
642 	}
643 	/*
644 	 * Turn on CAPAB_SOFT_RING so that affinity assignment
645 	 * can be tried again later.
646 	 */
647 	mutex_enter(&ill->ill_lock);
648 	ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
649 	mutex_exit(&ill->ill_lock);
650 	kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
651 
652 out:
653 	ip_input(ill, ip_ring, mp_chain, hdrlen);
654 }
655 
656 static squeue_t *
657 ip_find_unused_squeue(squeue_set_t *sqs, cpu_t *bind_cpu, boolean_t fanout)
658 {
659 	int 		i;
660 	squeue_set_t	*best_sqs = NULL;
661 	squeue_set_t	*curr_sqs = NULL;
662 	int		min_sq = 0;
663 	squeue_t 	*sqp = NULL;
664 	char		sqname[64];
665 
666 	/*
667 	 * If fanout is set and the passed squeue_set already has some
668 	 * squeues which are managing the NICs, try to find squeues on
669 	 * unused CPU.
670 	 */
671 	if (sqs->sqs_size > 1 && fanout) {
672 		/*
673 		 * First check to see if any squeue on the CPU passed
674 		 * is managing a NIC.
675 		 */
676 		for (i = 0; i < sqs->sqs_size; i++) {
677 			mutex_enter(&sqs->sqs_list[i]->sq_lock);
678 			if ((sqs->sqs_list[i]->sq_state & SQS_ILL_BOUND) &&
679 			    !(sqs->sqs_list[i]->sq_state & SQS_DEFAULT)) {
680 				mutex_exit(&sqs->sqs_list[i]->sq_lock);
681 				break;
682 			}
683 			mutex_exit(&sqs->sqs_list[i]->sq_lock);
684 		}
685 		if (i != sqs->sqs_size) {
686 			best_sqs = sqset_global_list[sqset_global_size - 1];
687 			min_sq = best_sqs->sqs_size;
688 
689 			for (i = sqset_global_size - 2; i >= 0; i--) {
690 				curr_sqs = sqset_global_list[i];
691 				if (curr_sqs->sqs_size < min_sq) {
692 					best_sqs = curr_sqs;
693 					min_sq = curr_sqs->sqs_size;
694 				}
695 			}
696 
697 			ASSERT(best_sqs != NULL);
698 			sqs = best_sqs;
699 			bind_cpu = cpu[sqs->sqs_bind];
700 		}
701 	}
702 
703 	mutex_enter(&sqs->sqs_lock);
704 
705 	for (i = 0; i < sqs->sqs_size; i++) {
706 		mutex_enter(&sqs->sqs_list[i]->sq_lock);
707 		if ((sqs->sqs_list[i]->sq_state &
708 		    (SQS_DEFAULT|SQS_ILL_BOUND)) == 0) {
709 			sqp = sqs->sqs_list[i];
710 			break;
711 		}
712 		mutex_exit(&sqs->sqs_list[i]->sq_lock);
713 	}
714 
715 	if (sqp == NULL) {
716 		/* Need to create a new squeue */
717 		if (sqs->sqs_size == sqs->sqs_max_size) {
718 			/*
719 			 * Reached the max limit for squeue
720 			 * we can allocate on this CPU.
721 			 */
722 			mutex_exit(&sqs->sqs_lock);
723 			return (NULL);
724 		}
725 
726 		bzero(sqname, sizeof (sqname));
727 		(void) snprintf(sqname, sizeof (sqname),
728 		    "ip_squeue_cpu_%d/%d/%d", bind_cpu->cpu_seqid,
729 		    bind_cpu->cpu_id, sqs->sqs_size);
730 
731 		sqp = squeue_create(sqname, bind_cpu->cpu_id,
732 		    ip_squeue_worker_wait, minclsyspri);
733 
734 		ASSERT(sqp != NULL);
735 
736 		squeue_profile_enable(sqp);
737 		sqs->sqs_list[sqs->sqs_size++] = sqp;
738 
739 		if (ip_squeue_create_callback != NULL)
740 			ip_squeue_create_callback(sqp);
741 
742 		mutex_enter(&cpu_lock);
743 		if (ip_squeue_bind && cpu_is_online(bind_cpu)) {
744 			squeue_bind(sqp, -1);
745 		}
746 		mutex_exit(&cpu_lock);
747 
748 		mutex_enter(&sqp->sq_lock);
749 	}
750 
751 	mutex_exit(&sqs->sqs_lock);
752 	ASSERT(sqp != NULL);
753 	return (sqp);
754 }
755 
756 /*
757  * Find the squeue assigned to manage this Rx ring. If the Rx ring is not
758  * owned by a squeue yet, do the assignment. When the NIC registers it
759  * Rx rings with IP, we don't know where the interrupts will land and
760  * hence we need to wait till this point to do the assignment.
761  */
762 squeue_t *
763 ip_squeue_get(ill_rx_ring_t *ill_rx_ring)
764 {
765 	squeue_t 	*sqp;
766 	ill_t 		*ill;
767 	int		interrupt;
768 	ip_taskq_arg_t	*taskq_arg;
769 	boolean_t	refheld;
770 
771 	if (ill_rx_ring == NULL)
772 		return (IP_SQUEUE_GET(lbolt));
773 
774 	sqp = ill_rx_ring->rr_sqp;
775 	/*
776 	 * Do a quick check. If it's not NULL, we are done.
777 	 * Squeues are never destroyed so worse we will bind
778 	 * this connection to a suboptimal squeue.
779 	 *
780 	 * This is the fast path case.
781 	 */
782 	if (sqp != NULL)
783 		return (sqp);
784 
785 	ill = ill_rx_ring->rr_ill;
786 	ASSERT(ill != NULL);
787 
788 	interrupt = servicing_interrupt();
789 	taskq_arg = (ip_taskq_arg_t *)kmem_zalloc(sizeof (ip_taskq_arg_t),
790 	    KM_NOSLEEP);
791 
792 	mutex_enter(&ill->ill_lock);
793 	if (!interrupt || ill_rx_ring->rr_ring_state != ILL_RING_INUSE ||
794 		taskq_arg == NULL) {
795 		/*
796 		 * Do the ring to squeue binding only if we are in interrupt
797 		 * context and there is no one else trying the bind already.
798 		 */
799 		mutex_exit(&ill->ill_lock);
800 		if (taskq_arg != NULL)
801 			kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
802 		return (IP_SQUEUE_GET(lbolt));
803 	}
804 
805 	/*
806 	 * No sqp assigned yet. Can't really do that in interrupt
807 	 * context. Assign the default sqp to this connection and
808 	 * trigger creation of new sqp and binding it to this ring
809 	 * via taskq. Need to make sure ill stays around.
810 	 */
811 	taskq_arg->ip_taskq_ill = ill;
812 	taskq_arg->ip_taskq_ill_rx_ring = ill_rx_ring;
813 	taskq_arg->ip_taskq_cpu = CPU;
814 	ill_rx_ring->rr_ring_state = ILL_RING_INPROC;
815 	mutex_exit(&ill->ill_lock);
816 	refheld = ill_waiter_inc(ill);
817 	if (refheld) {
818 		if (taskq_dispatch(system_taskq, ip_squeue_extend,
819 		    taskq_arg, TQ_NOSLEEP) != NULL) {
820 			return (IP_SQUEUE_GET(lbolt));
821 		}
822 	}
823 	/*
824 	 * The ill is closing and we could not get a reference on the ill OR
825 	 * taskq_dispatch failed probably due to memory allocation failure.
826 	 * We will try again next time.
827 	 */
828 	mutex_enter(&ill->ill_lock);
829 	ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
830 	mutex_exit(&ill->ill_lock);
831 	kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
832 	if (refheld)
833 		ill_waiter_dcr(ill);
834 
835 	return (IP_SQUEUE_GET(lbolt));
836 }
837 
838 /*
839  * NDD hooks for setting ip_squeue_xxx tuneables.
840  */
841 
842 /* ARGSUSED */
843 int
844 ip_squeue_bind_set(queue_t *q, mblk_t *mp, char *value,
845     caddr_t addr, cred_t *cr)
846 {
847 	int *bind_enabled = (int *)addr;
848 	long new_value;
849 	int i;
850 
851 	if (ddi_strtol(value, NULL, 10, &new_value) != 0)
852 		return (EINVAL);
853 
854 	if (ip_squeue_bind == new_value)
855 		return (0);
856 
857 	*bind_enabled = new_value;
858 	mutex_enter(&cpu_lock);
859 	if (new_value == 0) {
860 		for (i = 0; i < sqset_global_size; i++)
861 			ip_squeue_set_unbind(sqset_global_list[i]);
862 	} else {
863 		for (i = 0; i < sqset_global_size; i++)
864 			ip_squeue_set_bind(sqset_global_list[i]);
865 	}
866 
867 	mutex_exit(&cpu_lock);
868 	return (0);
869 }
870 
871 /*
872  * Set squeue profiling.
873  * 0 means "disable"
874  * 1 means "enable"
875  * 2 means "enable and reset"
876  */
877 /* ARGSUSED */
878 int
879 ip_squeue_profile_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
880     cred_t *cr)
881 {
882 	int *profile_enabled = (int *)cp;
883 	long new_value;
884 	squeue_set_t *sqs;
885 
886 	if (ddi_strtol(value, NULL, 10, &new_value) != 0)
887 		return (EINVAL);
888 
889 	if (new_value == 0)
890 		squeue_profile_stop();
891 	else if (new_value == 1)
892 		squeue_profile_start();
893 	else if (new_value == 2) {
894 		int i, j;
895 
896 		squeue_profile_stop();
897 		mutex_enter(&cpu_lock);
898 		for (i = 0; i < sqset_global_size; i++) {
899 			sqs = sqset_global_list[i];
900 			for (j = 0; j < sqs->sqs_size; j++) {
901 				squeue_profile_reset(sqs->sqs_list[j]);
902 			}
903 		}
904 		mutex_exit(&cpu_lock);
905 
906 		new_value = 1;
907 		squeue_profile_start();
908 	}
909 	*profile_enabled = new_value;
910 
911 	return (0);
912 }
913 
914 /*
915  * Reconfiguration callback
916  */
917 
918 /* ARGSUSED */
919 static int
920 ip_squeue_cpu_setup(cpu_setup_t what, int id, void *arg)
921 {
922 	cpu_t *cp = cpu[id];
923 
924 	ASSERT(MUTEX_HELD(&cpu_lock));
925 	switch (what) {
926 	case CPU_CONFIG:
927 		/*
928 		 * A new CPU is added. Create an squeue for it but do not bind
929 		 * it yet.
930 		 */
931 		if (cp->cpu_squeue_set == NULL)
932 			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
933 		break;
934 	case CPU_ON:
935 	case CPU_INIT:
936 	case CPU_CPUPART_IN:
937 		if (cp->cpu_squeue_set == NULL) {
938 			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
939 		}
940 		if (ip_squeue_bind)
941 			ip_squeue_set_bind(cp->cpu_squeue_set);
942 		break;
943 	case CPU_UNCONFIG:
944 	case CPU_OFF:
945 	case CPU_CPUPART_OUT:
946 		ASSERT((cp->cpu_squeue_set != NULL) ||
947 		    (cp->cpu_flags & CPU_OFFLINE));
948 
949 		if (cp->cpu_squeue_set != NULL) {
950 			ip_squeue_set_unbind(cp->cpu_squeue_set);
951 		}
952 		break;
953 	default:
954 		break;
955 	}
956 	return (0);
957 }
958 
959 /* ARGSUSED */
960 static void
961 ip_squeue_set_bind(squeue_set_t *sqs)
962 {
963 	int i;
964 	squeue_t *sqp;
965 
966 	if (!ip_squeue_bind)
967 		return;
968 
969 	mutex_enter(&sqs->sqs_lock);
970 	for (i = 0; i < sqs->sqs_size; i++) {
971 		sqp = sqs->sqs_list[i];
972 		if (sqp->sq_state & SQS_BOUND)
973 			continue;
974 		squeue_bind(sqp, -1);
975 	}
976 	mutex_exit(&sqs->sqs_lock);
977 }
978 
979 static void
980 ip_squeue_set_unbind(squeue_set_t *sqs)
981 {
982 	int i;
983 	squeue_t *sqp;
984 
985 	mutex_enter(&sqs->sqs_lock);
986 	for (i = 0; i < sqs->sqs_size; i++) {
987 		sqp = sqs->sqs_list[i];
988 
989 		/*
990 		 * CPU is going offline. Remove the thread affinity
991 		 * for any soft ring threads the squeue is managing.
992 		 */
993 		if (sqp->sq_state & SQS_ILL_BOUND) {
994 			ill_rx_ring_t	*ring = sqp->sq_rx_ring;
995 			ill_t		*ill = ring->rr_ill;
996 
997 			if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
998 				ASSERT(ring->rr_handle != NULL);
999 				ill->ill_dls_capab->ill_dls_unbind(
1000 					ring->rr_handle);
1001 			}
1002 		}
1003 		if (!(sqp->sq_state & SQS_BOUND))
1004 			continue;
1005 		squeue_unbind(sqp);
1006 	}
1007 	mutex_exit(&sqs->sqs_lock);
1008 }
1009