xref: /illumos-gate/usr/src/uts/common/inet/ip/ip_squeue.c (revision 6cefaae1e90a413ba01560575bb3998e1a3df40e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * IP interface to squeues.
28  *
29  * IP creates an squeue instance for each CPU. The squeue pointer is saved in
30  * the cpu_squeue field of the cpu structure. Each squeue is associated with a
31  * connection instance (conn_t).
32  *
33  * For CPUs available at system startup time the squeue creation and association
34  * with CPU happens at MP initialization time. For CPUs added during dynamic
35  * reconfiguration, the initialization happens when the new CPU is configured in
36  * the system. The squeue is chosen using the IP_SQUEUE_GET macro, which either
37  * returns the per-CPU squeue or a random squeue based on the ip_squeue_fanout
38  * variable.
39  *
40  * There are two modes of associating connection with squeues. The first mode
41  * associates each connection with the CPU that creates the connection (either
42  * during open time or during accept time). The second mode associates each
43  * connection with a random CPU, effectively distributing load over all CPUs
44  * and all squeues in the system. The mode is controlled by the
45  * ip_squeue_fanout variable.
46  *
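 * Conceptually (an illustrative sketch only; the real IP_SQUEUE_GET macro
 * lives in ip.h and may differ in detail), the choice looks like:
 *
 *	sqp = ip_squeue_fanout ? ip_squeue_random(lbolt) :
 *	    CPU->cpu_squeue_set->sqs_list[0];
 *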
47  * NOTE: The fact that there is an association between each connection and
48  * squeue, and between each squeue and CPU, does not mean that each connection
49  * is always processed on that CPU and on that CPU only. Any thread calling
50  * squeue_enter() may process the connection on whatever CPU it is scheduled
51  * on. The squeue-to-CPU binding is only relevant for the worker thread.
52  *
53  * The list of all created squeues is kept in squeue_set structure. This list is
54  * used when ip_squeue_fanout is set and the load is distributed across all
55  * squeues.
56  *
57  * INTERFACE:
58  *
59  * squeue_t *ip_squeue_get(hint)
60  *
61  * 	Find an squeue based on the 'hint' value. The hint is used as an index
62  * 	in the array of IP squeues available. The way hint is computed may
63  * 	affect the effectiveness of the squeue distribution. Currently squeues
64  * 	are assigned in round-robin fashion using lbolt as a hint.
65  *
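 * 	A minimal usage sketch (illustrative only; the real callers live in
 * 	the conn/tcp code paths):
 *
 * 		squeue_t *sqp = IP_SQUEUE_GET(lbolt);
 * 		squeue_enter(sqp, mp, proc, connp, NULL);
 *
 * 	where 'proc' is whatever squeue routine the caller wants run for the
 * 	connection 'connp'.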
66  *
67  * DR Notes
68  * ========
69  *
70  * The ip_squeue_init() registers a call-back function with the CPU DR
71  * subsystem using register_cpu_setup_func(). The call-back function does two
72  * things:
73  *
74  * o When the CPU is going off-line or unconfigured, the worker thread is
75  *	unbound from the CPU. This allows the CPU unconfig code to move it to
76  *	another CPU.
77  *
78  * o When the CPU is going online, it creates a new squeue for this CPU if
79  *	necessary and binds the squeue worker thread to this CPU.
80  *
 81  * TUNABLES:
82  *
83  * ip_squeue_bind: if set to 1 each squeue worker thread is bound to the CPU
84  * 	associated with an squeue instance.
85  *
86  * ip_squeue_profile: if set to 1 squeue profiling is enabled. NOTE: squeue.c
87  *	should be compiled with SQUEUE_PROFILE enabled for this variable to have
88  *	an impact.
89  *
90  * ip_squeue_fanout: if set to 1 use ip_squeue_get() to find an squeue,
91  *	otherwise get it from CPU->cpu_squeue.
92  *
93  * ip_squeue_bind, ip_squeue_profile and ip_squeue_fanout can be accessed and
94  * changed using ndd on /dev/tcp or /dev/ip.
95  *
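 *	For example (illustrative):
 *
 *		# ndd -set /dev/ip ip_squeue_fanout 1
 *		# ndd /dev/ip ip_squeue_bind
 *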
96  * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
97  *	created. This is the time squeue code waits before waking up the worker
98  *	thread after queuing a request.
99  */
100 
101 #include <sys/types.h>
102 #include <sys/debug.h>
103 #include <sys/kmem.h>
104 #include <sys/cpuvar.h>
105 
106 #include <sys/cmn_err.h>
107 
108 #include <inet/common.h>
109 #include <inet/ip.h>
110 #include <inet/ip_if.h>
111 #include <inet/nd.h>
112 #include <inet/ipclassifier.h>
114 #include <sys/conf.h>
115 #include <sys/sunddi.h>
116 #include <sys/dlpi.h>
117 #include <sys/squeue_impl.h>
118 #include <sys/atomic.h>
119 
120 /*
121  * We allow multiple NICs to bind to the same CPU but want to preserve 1 <-> 1
122  * mapping between squeue and NIC (or Rx ring) for performance reasons so
123  * each squeue can uniquely own a NIC or a Rx ring and do polling
124  * (PSARC 2004/630). So we allow up to MAX_SQUEUES_PER_CPU squeues per CPU.
125  * We start by creating MIN_SQUEUES_PER_CPU squeues per CPU but more squeues
126  * can be created dynamically as needed.
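 *
 * More squeues per CPU can be requested at boot time via /etc/system,
 * e.g. (an illustrative setting, not a recommendation):
 *
 *	set ip:ip_squeues_per_cpu = 2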
127  */
128 #define	MAX_SQUEUES_PER_CPU	32
129 #define	MIN_SQUEUES_PER_CPU	1
130 uint_t	ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;
131 
132 #define	IP_NUM_SOFT_RINGS	2
133 uint_t ip_soft_rings_cnt = IP_NUM_SOFT_RINGS;
134 
135 /*
136  * List of all created squeue sets. The size is protected by cpu_lock
137  */
138 squeue_set_t	**sqset_global_list;
139 uint_t		sqset_global_size;
140 
141 int ip_squeue_bind = B_TRUE;
142 int ip_squeue_profile = B_TRUE;
143 static void (*ip_squeue_create_callback)(squeue_t *) = NULL;
144 
145 /*
146  * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
147  *	created. This is the time squeue code waits before waking up the worker
148  *	thread after queuing a request.
149  */
150 uint_t ip_squeue_worker_wait = 10;
151 
152 static squeue_set_t *ip_squeue_set_create(cpu_t *, boolean_t);
153 static int ip_squeue_cpu_setup(cpu_setup_t, int, void *);
154 
155 static void ip_squeue_set_bind(squeue_set_t *);
156 static void ip_squeue_set_unbind(squeue_set_t *);
157 static squeue_t *ip_find_unused_squeue(squeue_set_t *, boolean_t);
158 static void ip_squeue_clean(void *, mblk_t *, void *);
159 static void ip_squeue_clean_ring(ill_t *, ill_rx_ring_t *);
160 
161 #define	CPU_ISON(c) ((c) != NULL && CPU_ACTIVE(c) && ((c)->cpu_flags & CPU_EXISTS))
162 
163 /*
164  * Create squeue set containing ip_squeues_per_cpu number of squeues
165  * for this CPU and bind them all to the CPU.
166  */
167 static squeue_set_t *
168 ip_squeue_set_create(cpu_t *cp, boolean_t reuse)
169 {
170 	int i;
171 	squeue_set_t	*sqs;
172 	squeue_t 	*sqp;
173 	char 		sqname[64];
174 	processorid_t 	id = cp->cpu_id;
175 
176 	if (reuse) {
179 		/*
180 		 * We may already have an squeue created for this CPU. Try to
181 		 * find one and reuse it if possible.
182 		 */
183 		for (i = 0; i < sqset_global_size; i++) {
184 			sqs = sqset_global_list[i];
185 			if (id == sqs->sqs_bind)
186 				return (sqs);
187 		}
188 	}
189 
190 	sqs = kmem_zalloc(sizeof (squeue_set_t) +
191 	    (sizeof (squeue_t *) * MAX_SQUEUES_PER_CPU), KM_SLEEP);
192 	mutex_init(&sqs->sqs_lock, NULL, MUTEX_DEFAULT, NULL);
193 	sqs->sqs_list = (squeue_t **)&sqs[1];
194 	sqs->sqs_max_size = MAX_SQUEUES_PER_CPU;
195 	sqs->sqs_bind = id;
196 
197 	for (i = 0; i < ip_squeues_per_cpu; i++) {
198 		bzero(sqname, sizeof (sqname));
199 
200 		(void) snprintf(sqname, sizeof (sqname),
201 		    "ip_squeue_cpu_%d/%d/%d", cp->cpu_seqid,
202 		    cp->cpu_id, i);
203 
204 		sqp = squeue_create(sqname, id, ip_squeue_worker_wait,
205 		    minclsyspri);
206 
207 		ASSERT(sqp != NULL);
208 
209 		/*
210 		 * The first squeue in each squeue_set is the DEFAULT
211 		 * squeue.
212 		 */
213 		sqp->sq_state |= SQS_DEFAULT;
214 
215 		squeue_profile_enable(sqp);
216 		sqs->sqs_list[sqs->sqs_size++] = sqp;
217 
218 		if (ip_squeue_create_callback != NULL)
219 			ip_squeue_create_callback(sqp);
220 	}
221 
222 	if (ip_squeue_bind && cpu_is_online(cp))
223 		ip_squeue_set_bind(sqs);
224 
225 	sqset_global_list[sqset_global_size++] = sqs;
226 	ASSERT(sqset_global_size <= NCPU);
227 	return (sqs);
228 }
229 
230 /*
231  * Initialize IP squeues.
232  */
233 void
234 ip_squeue_init(void (*callback)(squeue_t *))
235 {
236 	int i;
237 
238 	ASSERT(sqset_global_list == NULL);
239 
240 	if (ip_squeues_per_cpu < MIN_SQUEUES_PER_CPU)
241 		ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;
242 	else if (ip_squeues_per_cpu > MAX_SQUEUES_PER_CPU)
243 		ip_squeues_per_cpu = MAX_SQUEUES_PER_CPU;
244 
245 	ip_squeue_create_callback = callback;
246 	squeue_init();
247 	sqset_global_list =
248 	    kmem_zalloc(sizeof (squeue_set_t *) * NCPU, KM_SLEEP);
249 	sqset_global_size = 0;
250 	mutex_enter(&cpu_lock);
251 
252 	/* Create squeue for each active CPU available */
253 	for (i = 0; i < NCPU; i++) {
254 		cpu_t *cp = cpu[i];
255 		if (CPU_ISON(cp) && cp->cpu_squeue_set == NULL) {
256 			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_FALSE);
257 		}
258 	}
259 
260 	register_cpu_setup_func(ip_squeue_cpu_setup, NULL);
261 
262 	mutex_exit(&cpu_lock);
263 
264 	if (ip_squeue_profile)
265 		squeue_profile_start();
266 }
267 
268 /*
269  * Get squeue_t structure based on index.
270  * Since the squeue list can only grow, no need to grab any lock.
271  */
272 squeue_t *
273 ip_squeue_random(uint_t index)
274 {
275 	squeue_set_t *sqs;
276 
277 	sqs = sqset_global_list[index % sqset_global_size];
278 	return (sqs->sqs_list[index % sqs->sqs_size]);
279 }
280 
281 /* ARGSUSED */
282 static void
283 ip_squeue_clean(void *arg1, mblk_t *mp, void *arg2)
284 {
285 	squeue_t	*sqp = arg2;
286 	ill_rx_ring_t	*ring = (ill_rx_ring_t *)mp->b_wptr;
287 	ill_t		*ill;
288 
289 	ASSERT(sqp != NULL);
290 	mp->b_wptr = NULL;
291 
292 	if (ring == NULL) {
293 		return;
294 	}
295 
296 	/*
297 	 * Clean up squeue
298 	 */
299 	mutex_enter(&sqp->sq_lock);
300 	sqp->sq_state &= ~(SQS_ILL_BOUND|SQS_POLL_CAPAB);
301 	sqp->sq_rx_ring = NULL;
302 	mutex_exit(&sqp->sq_lock);
303 
304 	ill = ring->rr_ill;
305 	if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
306 		ASSERT(ring->rr_handle != NULL);
307 		ill->ill_dls_capab->ill_dls_unbind(ring->rr_handle);
308 	}
309 
310 	/*
311 	 * Cleanup the ring
312 	 */
313 
314 	ring->rr_blank = NULL;
315 	ring->rr_handle = NULL;
316 	ring->rr_sqp = NULL;
317 
318 	/*
319 	 * Signal ill that cleanup is done
320 	 */
321 	mutex_enter(&ill->ill_lock);
322 	ring->rr_ring_state = ILL_RING_FREE;
323 	cv_signal(&ill->ill_cv);
324 	mutex_exit(&ill->ill_lock);
325 }
326 
327 /*
328  * Clean up one squeue element. ill_inuse_ref is protected by ill_lock.
329  * The real cleanup happens behind the squeue via the ip_squeue_clean()
330  * function, but we need to protect ourselves from two threads trying to
331  * clean up at the same time (possible with one port going down for aggr
332  * and someone tearing down the entire aggr simultaneously). So we use
333  * ill_inuse_ref, protected by ill_lock, to indicate when the cleanup has
334  * started (1 ref) and when the cleanup is done (0 ref). When a new ring
335  * gets assigned to a squeue, we start by putting 2 refs on ill_inuse_ref.
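 *
 * The rr_ring_state transitions driven by this file are roughly as follows
 * (an illustrative summary; see also ip_squeue_get() and ip_squeue_extend()):
 *
 *	ILL_RING_INUSE -> ILL_RING_INPROC -> ILL_RING_INUSE
 *	    (squeue binding, ip_squeue_get()/ip_squeue_extend())
 *	ILL_RING_INUSE -> ILL_RING_BEING_FREED -> ILL_RING_FREE
 *	    (cleanup, ip_squeue_clean_ring()/ip_squeue_clean())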
336  */
337 static void
338 ip_squeue_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
339 {
340 	conn_t *connp;
341 	squeue_t *sqp;
342 	mblk_t *mp;
343 
344 	ASSERT(rx_ring != NULL);
345 
346 	/* Just clean one squeue */
347 	mutex_enter(&ill->ill_lock);
348 	/*
349 	 * Reset the ILL_SOFT_RING_ASSIGN bit so that
350 	 * ip_squeue_soft_ring_affinity() will not go
351 	 * ahead with assigning rings.
352 	 */
353 	ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
354 	while (rx_ring->rr_ring_state == ILL_RING_INPROC)
355 		/* Some operations pending on the ring. Wait */
356 		cv_wait(&ill->ill_cv, &ill->ill_lock);
357 
358 	if (rx_ring->rr_ring_state != ILL_RING_INUSE) {
359 		/*
360 		 * Someone already trying to clean
361 		 * this squeue or it's already been cleaned.
362 		 */
363 		mutex_exit(&ill->ill_lock);
364 		return;
365 	}
366 	sqp = rx_ring->rr_sqp;
367 
368 	if (sqp == NULL) {
369 		/*
370 		 * The rx_ring never had a squeue assigned to it.
371 		 * We are under ill_lock, so we can clean it up
372 		 * right here since no one else can get to it.
373 		 */
374 		rx_ring->rr_blank = NULL;
375 		rx_ring->rr_handle = NULL;
376 		rx_ring->rr_sqp = NULL;
377 		rx_ring->rr_ring_state = ILL_RING_FREE;
378 		mutex_exit(&ill->ill_lock);
379 		return;
380 	}
381 
382 	/* Indicate that it's being cleaned */
383 	rx_ring->rr_ring_state = ILL_RING_BEING_FREED;
384 	ASSERT(sqp != NULL);
385 	mutex_exit(&ill->ill_lock);
386 
387 	/*
388 	 * Use the preallocated ill_unbind_conn for this purpose
389 	 */
390 	connp = ill->ill_dls_capab->ill_unbind_conn;
391 
392 	if (connp->conn_tcp->tcp_closemp.b_prev == NULL) {
393 		connp->conn_tcp->tcp_closemp_used = B_TRUE;
394 	} else {
395 		cmn_err(CE_PANIC, "ip_squeue_clean_ring: "
396 		    "concurrent use of tcp_closemp_used: connp %p tcp %p\n",
397 		    (void *)connp, (void *)connp->conn_tcp);
398 	}
399 
400 	TCP_DEBUG_GETPCSTACK(connp->conn_tcp->tcmp_stk, 15);
401 	mp = &connp->conn_tcp->tcp_closemp;
402 	CONN_INC_REF(connp);
403 
404 	/*
405 	 * Since the sq_rx_ring field of the default squeue is NULL,
406 	 * ip_squeue_clean() will have no way to get the ring if we
407 	 * don't pass the pointer to it. We use b_wptr to do so
408 	 * as use of b_wptr for any other purpose is not expected.
409 	 */
410 
411 	ASSERT(mp->b_wptr == NULL);
412 	mp->b_wptr = (unsigned char *)rx_ring;
413 	squeue_enter(sqp, mp, ip_squeue_clean, connp, NULL);
414 
415 	mutex_enter(&ill->ill_lock);
416 	while (rx_ring->rr_ring_state != ILL_RING_FREE)
417 		cv_wait(&ill->ill_cv, &ill->ill_lock);
418 	mutex_exit(&ill->ill_lock);
419 }
420 
421 void
422 ip_squeue_clean_all(ill_t *ill)
423 {
424 	int idx;
425 
426 	/*
427 	 * No need to clean if poll_capab isn't set for this ill
428 	 */
429 	if (!(ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING)))
430 		return;
431 
432 	for (idx = 0; idx < ILL_MAX_RINGS; idx++) {
433 		ill_rx_ring_t *ipr = &ill->ill_dls_capab->ill_ring_tbl[idx];
434 
435 		ip_squeue_clean_ring(ill, ipr);
436 	}
437 
438 	ill->ill_capabilities &= ~(ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING);
439 }
440 
441 typedef struct ip_taskq_arg {
442 	ill_t		*ip_taskq_ill;
443 	ill_rx_ring_t	*ip_taskq_ill_rx_ring;
444 	cpu_t		*ip_taskq_cpu;
445 } ip_taskq_arg_t;
446 
447 /*
448  * Do an Rx ring to squeue binding. Find a unique squeue that is not
449  * managing a receive ring. If no such squeue exists, dynamically
450  * create a new one in the squeue set.
451  *
452  * The function runs via the system taskq. The ill passed as an
453  * argument can't go away since we hold a ref. The lock order is
454  * ill_lock -> sqs_lock -> sq_lock.
455  *
456  * If we end up binding an Rx ring to a squeue attached to an offline
457  * CPU, that is fine and requires no special check, because squeues are
458  * never destroyed once created.
459  */
460 /* ARGSUSED */
461 static void
462 ip_squeue_extend(void *arg)
463 {
464 	ip_taskq_arg_t	*sq_arg = (ip_taskq_arg_t *)arg;
465 	ill_t		*ill = sq_arg->ip_taskq_ill;
466 	ill_rx_ring_t	*ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
467 	cpu_t		*intr_cpu = sq_arg->ip_taskq_cpu;
468 	squeue_set_t 	*sqs;
469 	squeue_t 	*sqp = NULL;
470 
471 	ASSERT(ill != NULL);
472 	ASSERT(ill_rx_ring != NULL);
473 	kmem_free(arg, sizeof (ip_taskq_arg_t));
474 
475 	/*
476 	 * Make sure the CPU that originally took the interrupt still
477 	 * exists.
478 	 */
479 	if (!CPU_ISON(intr_cpu))
480 		intr_cpu = CPU;
481 
482 	sqs = intr_cpu->cpu_squeue_set;
483 
484 	/*
485 	 * If this ill represents link aggregation, then there might be
486 	 * multiple NICs trying to register themselves at the same time,
487 	 * and in order to ensure that the test and assignment of free rings
488 	 * is sequential, we need to hold the ill_lock.
489 	 */
490 	mutex_enter(&ill->ill_lock);
491 	sqp = ip_find_unused_squeue(sqs, B_FALSE);
492 	if (sqp == NULL) {
493 		/*
494 		 * We hit the max limit of squeues allowed per CPU.
495 		 * Assign this rx_ring to the DEFAULT squeue of the
496 		 * interrupted CPU, but the squeue will not manage
497 		 * the ring. Also print a warning.
498 		 */
499 		cmn_err(CE_NOTE, "ip_squeue_extend: CPU/sqset = %d/%p already "
500 		    "has max number of squeues. System performance might "
501 		    "become suboptimal\n", sqs->sqs_bind, (void *)sqs);
502 
503 		/* the first squeue in the list is the default squeue */
504 		sqp = sqs->sqs_list[0];
505 		ASSERT(sqp != NULL);
506 		ill_rx_ring->rr_sqp = sqp;
507 		ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
508 
509 		mutex_exit(&ill->ill_lock);
510 		ill_waiter_dcr(ill);
511 		return;
512 	}
513 
514 	ASSERT(MUTEX_HELD(&sqp->sq_lock));
515 	sqp->sq_rx_ring = ill_rx_ring;
516 	ill_rx_ring->rr_sqp = sqp;
517 	ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
518 
519 	sqp->sq_state |= (SQS_ILL_BOUND|SQS_POLL_CAPAB);
520 	mutex_exit(&sqp->sq_lock);
521 
522 	mutex_exit(&ill->ill_lock);
523 
524 	/* ill_waiter_dcr will also signal any waiters on ill_ring_state */
525 	ill_waiter_dcr(ill);
526 }
527 
528 /*
529  * Bind each of this ill's soft rings to a squeue. We try to fan out to
530  * squeues on CPUs that share a core with the CPU that took the
531  * interrupt, falling back to other CPUs when that is not possible (see
532  * the comment inside the function for details).
533  *
534  * The function runs via the system taskq. The ill passed as an
535  * argument can't go away since we hold a ref. The lock order is
536  * ill_lock -> sqs_lock -> sq_lock.
537  *
538  * Squeues are never destroyed once created, so it does not matter if a
539  * chosen squeue ends up attached to a CPU that later goes offline.
540  */
541 /* ARGSUSED */
542 static void
543 ip_squeue_soft_ring_affinity(void *arg)
544 {
545 	ip_taskq_arg_t		*sq_arg = (ip_taskq_arg_t *)arg;
546 	ill_t			*ill = sq_arg->ip_taskq_ill;
547 	ill_dls_capab_t	*ill_soft_ring = ill->ill_dls_capab;
548 	ill_rx_ring_t		*ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
549 	cpu_t			*intr_cpu = sq_arg->ip_taskq_cpu;
550 	cpu_t			*bind_cpu;
551 	int			cpu_id = intr_cpu->cpu_id;
552 	int			min_cpu_id, max_cpu_id;
553 	boolean_t		enough_uniq_cpus = B_FALSE;
554 	boolean_t		enough_cpus = B_FALSE;
555 	squeue_set_t 		*sqs, *last_sqs;
556 	squeue_t 		*sqp = NULL;
557 	int			i, j;
558 
559 	ASSERT(ill != NULL);
560 	kmem_free(arg, sizeof (ip_taskq_arg_t));
561 
562 	/*
563 	 * Make sure the CPU that originally took the interrupt still
564 	 * exists.
565 	 */
566 	if (!CPU_ISON(intr_cpu)) {
567 		intr_cpu = CPU;
568 		cpu_id = intr_cpu->cpu_id;
569 	}
570 
571 	/*
572 	 * If this ill represents link aggregation, then there might be
573 	 * multiple NICs trying to register themselves at the same time,
574 	 * and in order to ensure that the test and assignment of free rings
575 	 * is sequential, we need to hold the ill_lock.
576 	 */
577 	mutex_enter(&ill->ill_lock);
578 
579 	if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
580 		mutex_exit(&ill->ill_lock);
581 		return;
582 	}
583 	/*
584 	 * We need to fan out the interrupts from the NIC. We do that by
585 	 * telling the driver underneath to create soft rings and use
586 	 * worker threads (if the driver advertised the SOFT_RING capability).
587 	 * It is still a big performance win if we can fan out to the
588 	 * threads on the same core that is taking the interrupts.
589 	 *
590 	 * Since we don't know the interrupt to CPU binding, we don't
591 	 * assign any squeues or affinity to worker threads in the NIC.
592 	 * At the time of the first interrupt, we know which CPU is
593 	 * taking interrupts and try to find other threads on the same
594 	 * core. Assuming ip_threads_per_cpu is correct and CPUs are
595 	 * numbered sequentially within each core (XXX need something better
596 	 * than this in the future), find the lowest and highest numbered
597 	 * threads for that core.
598 	 *
599 	 * If we have one more thread per core than the number of soft rings,
600 	 * then don't assign any worker thread to the H/W thread (CPU)
601 	 * taking interrupts (capability negotiation tries to ensure this).
602 	 *
603 	 * If the number of threads per core is the same as the number of
604 	 * soft rings, then assign the worker affinity and squeue to
605 	 * the same CPU.
606 	 *
607 	 * Otherwise, just fan out to higher numbered CPUs starting from
608 	 * the interrupted CPU.
609 	 */
610 
611 	min_cpu_id = (cpu_id / ip_threads_per_cpu) * ip_threads_per_cpu;
612 	max_cpu_id = min_cpu_id + ip_threads_per_cpu;
613 
614 	/*
615 	 * Quickly check whether there are enough CPUs present for fanout
616 	 * and whether max_cpu_id falls within the range of CPUs that have
617 	 * squeue sets; we use the cpu_id stored in the last squeue_set to
618 	 * get an idea. The scheme is by no means perfect since it doesn't
619 	 * take into account CPU DR operations and the fact that
620 	 * interrupts themselves might change. An ideal scenario
621 	 * would be to ensure that interrupts run on CPUs by themselves
622 	 * and worker threads never have affinity to those CPUs. If
623 	 * the interrupts move to a CPU which had a worker thread, the
624 	 * affinity should be changed. Probably callbacks similar to the
625 	 * CPU offline ones are needed to make this work perfectly.
626 	 */
627 	last_sqs = sqset_global_list[sqset_global_size - 1];
628 	if (ip_threads_per_cpu <= ncpus && max_cpu_id <= last_sqs->sqs_bind) {
629 		if ((max_cpu_id - min_cpu_id) >
630 		    ill_soft_ring->ill_dls_soft_ring_cnt)
631 			enough_uniq_cpus = B_TRUE;
632 		else if ((max_cpu_id - min_cpu_id) >=
633 		    ill_soft_ring->ill_dls_soft_ring_cnt)
634 			enough_cpus = B_TRUE;
635 	}
636 
637 	j = 0;
638 	for (i = 0; i < (ill_soft_ring->ill_dls_soft_ring_cnt + j); i++) {
639 		if (enough_uniq_cpus) {
640 			if ((min_cpu_id + i) == cpu_id) {
641 				j++;
642 				continue;
643 			}
644 			bind_cpu = cpu[min_cpu_id + i];
645 		} else if (enough_cpus) {
646 			bind_cpu = cpu[min_cpu_id + i];
647 		} else {
648 			/* bind_cpu = cpu[(cpu_id + i) % last_sqs->sqs_bind]; */
649 			bind_cpu = cpu[(cpu_id + i) % ncpus];
650 		}
651 
652 		/*
653 		 * Check if the CPU actually exists and is active. If not,
654 		 * use the interrupted CPU. ip_find_unused_squeue() will
655 		 * find the right CPU to fan out to anyway.
656 		 */
657 		if (!CPU_ISON(bind_cpu))
658 			bind_cpu = intr_cpu;
659 
660 		sqs = bind_cpu->cpu_squeue_set;
661 		ASSERT(sqs != NULL);
662 		ill_rx_ring = &ill_soft_ring->ill_ring_tbl[i - j];
663 
664 		sqp = ip_find_unused_squeue(sqs, B_TRUE);
665 		if (sqp == NULL) {
666 			/*
667 			 * We hit the max limit of squeues allowed per CPU.
668 			 * Assign this rx_ring to the DEFAULT squeue of the
669 			 * interrupted CPU, but the squeue will not manage
670 			 * the ring. Also print a warning.
671 			 */
672 			cmn_err(CE_NOTE, "ip_squeue_soft_ring: CPU/sqset = "
673 			    "%d/%p already has max number of squeues. System "
674 			    "performance might become suboptimal\n",
675 			    sqs->sqs_bind, (void *)sqs);
676 
677 			/* the first squeue in the list is the default squeue */
678 			sqp = intr_cpu->cpu_squeue_set->sqs_list[0];
679 			ASSERT(sqp != NULL);
680 
681 			ill_rx_ring->rr_sqp = sqp;
682 			ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
683 			continue;
684 
685 		}
686 		ASSERT(MUTEX_HELD(&sqp->sq_lock));
687 		ill_rx_ring->rr_sqp = sqp;
688 		sqp->sq_rx_ring = ill_rx_ring;
689 		ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
690 		sqp->sq_state |= SQS_ILL_BOUND;
691 
692 		/* assign affinity to soft ring */
693 		if (ip_squeue_bind && (sqp->sq_state & SQS_BOUND)) {
694 			ill_soft_ring->ill_dls_bind(ill_rx_ring->rr_handle,
695 			    sqp->sq_bind);
696 		}
697 		mutex_exit(&sqp->sq_lock);
698 	}
699 	mutex_exit(&ill->ill_lock);
700 
701 	ill_soft_ring->ill_dls_change_status(ill_soft_ring->ill_tx_handle,
702 	    SOFT_RING_FANOUT);
703 
704 	mutex_enter(&ill->ill_lock);
705 	ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
706 	mutex_exit(&ill->ill_lock);
707 
708 	/* ill_waiter_dcr will also signal any waiters on ill_ring_state */
709 	ill_waiter_dcr(ill);
710 }
711 
712 /* ARGSUSED */
713 void
714 ip_soft_ring_assignment(ill_t *ill, ill_rx_ring_t *ip_ring,
715     mblk_t *mp_chain, struct mac_header_info_s *mhip)
716 {
717 	ip_taskq_arg_t	*taskq_arg;
718 	boolean_t	refheld;
719 
720 	mutex_enter(&ill->ill_lock);
721 	if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
722 		taskq_arg = (ip_taskq_arg_t *)
723 		    kmem_zalloc(sizeof (ip_taskq_arg_t), KM_NOSLEEP);
724 
725 		if (taskq_arg == NULL)
726 			goto out;
727 
728 		taskq_arg->ip_taskq_ill = ill;
729 		taskq_arg->ip_taskq_ill_rx_ring = NULL;
730 		taskq_arg->ip_taskq_cpu = CPU;
731 
732 		/*
733 		 * Set the ILL_SOFT_RING_ASSIGN flag. We don't want
734 		 * the next interrupt to schedule another task for calling
735 		 * ip_squeue_soft_ring_affinity().
736 		 */
737 		ill->ill_state_flags |= ILL_SOFT_RING_ASSIGN;
738 	} else {
739 		mutex_exit(&ill->ill_lock);
740 		goto out;
741 	}
742 	mutex_exit(&ill->ill_lock);
743 	refheld = ill_waiter_inc(ill);
744 	if (refheld) {
745 		if (taskq_dispatch(system_taskq,
746 		    ip_squeue_soft_ring_affinity, taskq_arg, TQ_NOSLEEP))
747 			goto out;
748 
749 		/* release ref on ill if taskq dispatch fails */
750 		ill_waiter_dcr(ill);
751 	}
752 	/*
753 	 * Clear ILL_SOFT_RING_ASSIGN so that the affinity assignment
754 	 * can be tried again later.
755 	 */
756 	mutex_enter(&ill->ill_lock);
757 	ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
758 	mutex_exit(&ill->ill_lock);
759 	kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
760 
761 out:
762 	ip_input(ill, NULL, mp_chain, mhip);
763 }
764 
765 static squeue_t *
766 ip_find_unused_squeue(squeue_set_t *sqs, boolean_t fanout)
767 {
768 	int 		i;
769 	squeue_set_t	*best_sqs = NULL;
770 	squeue_set_t	*curr_sqs = NULL;
771 	int		min_sq = 0;
772 	squeue_t 	*sqp = NULL;
773 	char		sqname[64];
774 	cpu_t		*bind_cpu;
775 
776 	/*
777 	 * If fanout is set and the passed squeue_set already has some
778 	 * squeues which are managing NICs, try to find squeues on an
779 	 * unused CPU.
780 	 */
781 	if (sqs->sqs_size > 1 && fanout) {
782 		/*
783 		 * First check to see if any squeue on the CPU passed
784 		 * is managing a NIC.
785 		 */
786 		mutex_enter(&sqs->sqs_lock);
787 		for (i = 0; i < sqs->sqs_size; i++) {
788 			mutex_enter(&sqs->sqs_list[i]->sq_lock);
789 			if ((sqs->sqs_list[i]->sq_state & SQS_ILL_BOUND) &&
790 			    !(sqs->sqs_list[i]->sq_state & SQS_DEFAULT)) {
791 				mutex_exit(&sqs->sqs_list[i]->sq_lock);
792 				break;
793 			}
794 			mutex_exit(&sqs->sqs_list[i]->sq_lock);
795 		}
796 		mutex_exit(&sqs->sqs_lock);
797 		if (i != sqs->sqs_size) {
798 			best_sqs = NULL;
799 
800 			for (i = sqset_global_size - 1; i >= 0; i--) {
801 				curr_sqs = sqset_global_list[i];
802 				/*
803 				 * Make sure the CPU that this sqs is
804 				 * bound to is still valid. There could
805 				 * be squeue sets around whose CPUs have
806 				 * been DR'd out.
807 				 */
808 				mutex_enter(&cpu_lock);
809 				if (cpu_get(curr_sqs->sqs_bind) != NULL) {
810 					if (best_sqs == NULL) {
811 						best_sqs = curr_sqs;
812 						min_sq = curr_sqs->sqs_size;
813 					} else if (curr_sqs->sqs_size <
814 					    min_sq) {
815 						best_sqs = curr_sqs;
816 						min_sq = curr_sqs->sqs_size;
817 					}
818 				}
819 				mutex_exit(&cpu_lock);
820 			}
821 
822 			ASSERT(best_sqs != NULL);
823 			sqs = best_sqs;
824 		}
825 	}
826 
827 	mutex_enter(&sqs->sqs_lock);
828 
829 	for (i = 0; i < sqs->sqs_size; i++) {
830 		mutex_enter(&sqs->sqs_list[i]->sq_lock);
831 		if ((sqs->sqs_list[i]->sq_state &
832 		    (SQS_DEFAULT|SQS_ILL_BOUND)) == 0) {
833 			sqp = sqs->sqs_list[i];
834 			break;
835 		}
836 		mutex_exit(&sqs->sqs_list[i]->sq_lock);
837 	}
838 
839 	if (sqp == NULL) {
840 		/* Need to create a new squeue */
841 		if (sqs->sqs_size == sqs->sqs_max_size) {
842 			/*
843 			 * Reached the max limit of squeues
844 			 * we can allocate on this CPU.
845 			 */
846 			mutex_exit(&sqs->sqs_lock);
847 			return (NULL);
848 		}
849 
850 		mutex_enter(&cpu_lock);
851 		if ((bind_cpu = cpu_get(sqs->sqs_bind)) == NULL) {
852 			/* Too bad, CPU got DR'd out, return NULL */
853 			mutex_exit(&cpu_lock);
854 			mutex_exit(&sqs->sqs_lock);
855 			return (NULL);
856 		}
857 
858 		bzero(sqname, sizeof (sqname));
859 		(void) snprintf(sqname, sizeof (sqname),
860 		    "ip_squeue_cpu_%d/%d/%d", bind_cpu->cpu_seqid,
861 		    bind_cpu->cpu_id, sqs->sqs_size);
862 		mutex_exit(&cpu_lock);
863 
864 		sqp = squeue_create(sqname, sqs->sqs_bind,
865 		    ip_squeue_worker_wait, minclsyspri);
866 
867 		ASSERT(sqp != NULL);
868 
869 		squeue_profile_enable(sqp);
870 		/*
871 		 * Other functions scanning sqs_list don't take sqs_lock.
872 		 * Once sqp is stored in sqs_list[], its global visibility is
873 		 * ensured before the sqs_size counter is incremented.
874 		 */
875 		sqs->sqs_list[sqs->sqs_size] = sqp;
876 		membar_producer();
877 		sqs->sqs_size++;
878 
879 		if (ip_squeue_create_callback != NULL)
880 			ip_squeue_create_callback(sqp);
881 
882 		if (ip_squeue_bind) {
883 			mutex_enter(&cpu_lock);
884 			bind_cpu = cpu_get(sqs->sqs_bind);
885 			if (bind_cpu != NULL && cpu_is_online(bind_cpu)) {
886 				squeue_bind(sqp, -1);
887 			}
888 			mutex_exit(&cpu_lock);
889 		}
890 		mutex_enter(&sqp->sq_lock);
891 	}
892 
893 	mutex_exit(&sqs->sqs_lock);
894 	ASSERT(sqp != NULL);
895 	return (sqp);
896 }
897 
898 /*
899  * Find the squeue assigned to manage this Rx ring. If the Rx ring is not
900  * owned by a squeue yet, do the assignment. When the NIC registers its
901  * Rx rings with IP, we don't know where the interrupts will land, and
902  * hence we need to wait until this point to do the assignment.
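 *
 * A rough usage sketch (illustrative only; the real callers live in the
 * conn/tcp fanout paths):
 *
 *	sqp = ip_squeue_get(ill_rx_ring);
 *	squeue_enter(sqp, mp, proc, connp, NULL);
 *
 * where 'proc' is the squeue routine the caller wants run for 'connp'.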
903  */
904 squeue_t *
905 ip_squeue_get(ill_rx_ring_t *ill_rx_ring)
906 {
907 	squeue_t 	*sqp;
908 	ill_t 		*ill;
909 	int		interrupt;
910 	ip_taskq_arg_t	*taskq_arg;
911 	boolean_t	refheld;
912 
913 	if (ill_rx_ring == NULL)
914 		return (IP_SQUEUE_GET(lbolt));
915 
916 	sqp = ill_rx_ring->rr_sqp;
917 	/*
918 	 * Do a quick check. If it's not NULL, we are done.
919 	 * Squeues are never destroyed, so at worst we will bind
920 	 * this connection to a suboptimal squeue.
921 	 *
922 	 * This is the fast path case.
923 	 */
924 	if (sqp != NULL)
925 		return (sqp);
926 
927 	ill = ill_rx_ring->rr_ill;
928 	ASSERT(ill != NULL);
929 
930 	interrupt = servicing_interrupt();
931 	taskq_arg = (ip_taskq_arg_t *)kmem_zalloc(sizeof (ip_taskq_arg_t),
932 	    KM_NOSLEEP);
933 
934 	mutex_enter(&ill->ill_lock);
935 	/*
936 	 * Check sqp under the lock again for atomicity. Possible race with
937 	 * a previously scheduled ip_squeue_get -> ip_squeue_extend.
938 	 * Do the ring to squeue binding only if we are in interrupt context
939 	 * AND the ring is not already bound AND there is no one else trying
940 	 * the bind already.
941 	 */
942 	sqp = ill_rx_ring->rr_sqp;
943 	if (sqp != NULL || !interrupt ||
944 	    ill_rx_ring->rr_ring_state != ILL_RING_INUSE || taskq_arg == NULL) {
945 		/*
946 		 * Note that the ring might get bound once we drop the lock
947 		 * below, if a previous request is in progress i.e. if the ring
948 		 * state is ILL_RING_INPROC. The incoming connection on whose
949 		 * behalf we are currently here might get a suboptimal squeue
950 		 * via the call to IP_SQUEUE_GET below, but there is no
951 		 * correctness issue.
952 		 */
953 		mutex_exit(&ill->ill_lock);
954 		if (taskq_arg != NULL)
955 			kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
956 		if (sqp != NULL)
957 			return (sqp);
958 		return (IP_SQUEUE_GET(lbolt));
959 	}
960 
961 	/*
962 	 * No sqp assigned yet. We can't do the squeue creation and binding
963 	 * in interrupt context. Assign the default sqp to this connection
964 	 * and trigger creation of a new sqp and its binding to this ring
965 	 * via the taskq. Need to make sure the ill stays around.
966 	 */
967 	taskq_arg->ip_taskq_ill = ill;
968 	taskq_arg->ip_taskq_ill_rx_ring = ill_rx_ring;
969 	taskq_arg->ip_taskq_cpu = CPU;
970 	ill_rx_ring->rr_ring_state = ILL_RING_INPROC;
971 	mutex_exit(&ill->ill_lock);
972 	refheld = ill_waiter_inc(ill);
973 	if (refheld) {
974 		if (taskq_dispatch(system_taskq, ip_squeue_extend,
975 		    taskq_arg, TQ_NOSLEEP) != NULL) {
976 			return (IP_SQUEUE_GET(lbolt));
977 		}
978 	}
979 	/*
980 	 * Either the ill is closing and we could not get a reference on it, or
981 	 * taskq_dispatch failed, probably due to memory allocation failure.
982 	 * We will try again next time.
983 	 */
984 	mutex_enter(&ill->ill_lock);
985 	ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
986 	mutex_exit(&ill->ill_lock);
987 	kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
988 	if (refheld)
989 		ill_waiter_dcr(ill);
990 
991 	return (IP_SQUEUE_GET(lbolt));
992 }
993 
994 /*
995  * NDD hooks for setting ip_squeue_xxx tuneables.
996  */
997 
998 /* ARGSUSED */
999 int
1000 ip_squeue_bind_set(queue_t *q, mblk_t *mp, char *value,
1001     caddr_t addr, cred_t *cr)
1002 {
1003 	int *bind_enabled = (int *)addr;
1004 	long new_value;
1005 	int i;
1006 
1007 	if (ddi_strtol(value, NULL, 10, &new_value) != 0)
1008 		return (EINVAL);
1009 
1010 	if (ip_squeue_bind == new_value)
1011 		return (0);
1012 
1013 	*bind_enabled = new_value;
1014 	mutex_enter(&cpu_lock);
1015 	if (new_value == 0) {
1016 		for (i = 0; i < sqset_global_size; i++)
1017 			ip_squeue_set_unbind(sqset_global_list[i]);
1018 	} else {
1019 		for (i = 0; i < sqset_global_size; i++)
1020 			ip_squeue_set_bind(sqset_global_list[i]);
1021 	}
1022 
1023 	mutex_exit(&cpu_lock);
1024 	return (0);
1025 }
1026 
1027 /*
1028  * Set squeue profiling.
1029  * 0 means "disable"
1030  * 1 means "enable"
1031  * 2 means "enable and reset"
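 *
 * For example (illustrative): "ndd -set /dev/ip ip_squeue_profile 2" resets
 * the accumulated counters and then re-enables profiling.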
1032  */
1033 /* ARGSUSED */
1034 int
1035 ip_squeue_profile_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
1036     cred_t *cr)
1037 {
1038 	int *profile_enabled = (int *)cp;
1039 	long new_value;
1040 	squeue_set_t *sqs;
1041 
1042 	if (ddi_strtol(value, NULL, 10, &new_value) != 0)
1043 		return (EINVAL);
1044 
1045 	if (new_value == 0)
1046 		squeue_profile_stop();
1047 	else if (new_value == 1)
1048 		squeue_profile_start();
1049 	else if (new_value == 2) {
1050 		int i, j;
1051 
1052 		squeue_profile_stop();
1053 		mutex_enter(&cpu_lock);
1054 		for (i = 0; i < sqset_global_size; i++) {
1055 			sqs = sqset_global_list[i];
1056 			for (j = 0; j < sqs->sqs_size; j++) {
1057 				squeue_profile_reset(sqs->sqs_list[j]);
1058 			}
1059 		}
1060 		mutex_exit(&cpu_lock);
1061 
1062 		new_value = 1;
1063 		squeue_profile_start();
1064 	}
1065 	*profile_enabled = new_value;
1066 
1067 	return (0);
1068 }
1069 
1070 /*
1071  * Reconfiguration callback
1072  */
1073 
1074 /* ARGSUSED */
1075 static int
1076 ip_squeue_cpu_setup(cpu_setup_t what, int id, void *arg)
1077 {
1078 	cpu_t *cp = cpu[id];
1079 
1080 	ASSERT(MUTEX_HELD(&cpu_lock));
1081 	switch (what) {
1082 	case CPU_CONFIG:
1083 		/*
1084 		 * A new CPU is added. Create an squeue for it but do not bind
1085 		 * it yet.
1086 		 */
1087 		if (cp->cpu_squeue_set == NULL)
1088 			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
1089 		break;
1090 	case CPU_ON:
1091 	case CPU_INIT:
1092 	case CPU_CPUPART_IN:
1093 		if (cp->cpu_squeue_set == NULL) {
1094 			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
1095 		}
1096 		if (ip_squeue_bind)
1097 			ip_squeue_set_bind(cp->cpu_squeue_set);
1098 		break;
1099 	case CPU_UNCONFIG:
1100 	case CPU_OFF:
1101 	case CPU_CPUPART_OUT:
1102 		ASSERT((cp->cpu_squeue_set != NULL) ||
1103 		    (cp->cpu_flags & CPU_OFFLINE));
1104 
1105 		if (cp->cpu_squeue_set != NULL) {
1106 			ip_squeue_set_unbind(cp->cpu_squeue_set);
1107 		}
1108 		break;
1109 	default:
1110 		break;
1111 	}
1112 	return (0);
1113 }
1114 
1115 /* ARGSUSED */
1116 static void
1117 ip_squeue_set_bind(squeue_set_t *sqs)
1118 {
1119 	int i;
1120 	squeue_t *sqp;
1121 
1122 	if (!ip_squeue_bind)
1123 		return;
1124 
1125 	mutex_enter(&sqs->sqs_lock);
1126 	for (i = 0; i < sqs->sqs_size; i++) {
1127 		sqp = sqs->sqs_list[i];
1128 		if (sqp->sq_state & SQS_BOUND)
1129 			continue;
1130 		squeue_bind(sqp, -1);
1131 	}
1132 	mutex_exit(&sqs->sqs_lock);
1133 }
1134 
1135 static void
1136 ip_squeue_set_unbind(squeue_set_t *sqs)
1137 {
1138 	int i;
1139 	squeue_t *sqp;
1140 
1141 	mutex_enter(&sqs->sqs_lock);
1142 	for (i = 0; i < sqs->sqs_size; i++) {
1143 		sqp = sqs->sqs_list[i];
1144 
1145 		/*
1146 		 * CPU is going offline. Remove the thread affinity
1147 		 * for any soft ring threads the squeue is managing.
1148 		 */
1149 		if (sqp->sq_state & SQS_ILL_BOUND) {
1150 			ill_rx_ring_t	*ring = sqp->sq_rx_ring;
1151 			ill_t		*ill = ring->rr_ill;
1152 
1153 			if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
1154 				ASSERT(ring->rr_handle != NULL);
1155 				ill->ill_dls_capab->ill_dls_unbind(
1156 				    ring->rr_handle);
1157 			}
1158 		}
1159 		if (!(sqp->sq_state & SQS_BOUND))
1160 			continue;
1161 		squeue_unbind(sqp);
1162 	}
1163 	mutex_exit(&sqs->sqs_lock);
1164 }
1165