xref: /illumos-gate/usr/src/uts/common/inet/ip/ip_squeue.c (revision 628e3cbed6489fa1db545d8524a06cd6535af456)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * IP interface to squeues.
30  *
31  * IP creates one or more squeue instances for each CPU. The squeues are kept
32  * in the squeue_set hanging off the cpu_squeue_set field of the cpu structure.
33  * Each connection instance (conn_t) is associated with an squeue.
34  *
35  * For CPUs available at system startup time, squeue creation and association
36  * with the CPU happen at MP initialization time. For CPUs added during dynamic
37  * reconfiguration, the initialization happens when the new CPU is configured in
38  * the system. The squeue is chosen using the IP_SQUEUE_GET macro, which returns
39  * either the per-CPU squeue or a random squeue depending on the ip_squeue_fanout
40  * variable.
41  *
42  * There are two modes of associating connection with squeues. The first mode
43  * associates each connection with the CPU that creates the connection (either
44  * during open time or during accept time). The second mode associates each
45  * connection with a random CPU, effectively distributing load over all CPUs
46  * and all squeues in the system. The mode is controlled by the
47  * ip_squeue_fanout variable.
48  *
49  * NOTE: The fact that each connection is associated with an squeue, and each
50  * squeue with a CPU, does not mean that the connection is always processed on
51  * that CPU and on that CPU only. Any thread calling squeue_enter() may process
52  * the connection on whatever CPU it is currently scheduled on. The squeue to
53  * CPU binding is only relevant for the worker thread.
54  *
55  * The list of all created squeue sets is kept in the sqset_global_list array.
56  * This list is used when ip_squeue_fanout is set and the load is distributed
57  * across all squeues.
58  *
59  * INTERFACE:
60  *
61  * squeue_t *ip_squeue_get(hint)
62  *
63  * 	Find an squeue based on the 'hint' value. The hint is used as an index
64  * 	in the array of IP squeues available. The way hint is computed may
65  * 	affect the effectiveness of the squeue distribution. Currently squeues
66  * 	are assigned in round-robin fashion using lbolt as a hint.
67  *
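 * 	As a rough illustration of how the hint is consumed (a simplified
 * 	sketch only, not the actual definition, which lives in the IP header
 * 	files), the IP_SQUEUE_GET(hint) macro mentioned above behaves like:
 *
 * 		sqp = ip_squeue_fanout ? ip_squeue_random(hint) :
 * 		    CPU->cpu_squeue_set->sqs_list[0];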
68  *
69  * DR Notes
70  * ========
71  *
72  * ip_squeue_init() registers a callback function with the CPU DR
73  * subsystem using register_cpu_setup_func(). The callback function does two
74  * things:
75  *
76  * o When the CPU is going off-line or unconfigured, the worker thread is
77  *	unbound from the CPU. This allows the CPU unconfig code to move it to
78  *	another CPU.
79  *
80  * o When the CPU is going online, it creates a new squeue for this CPU if
81  *	necessary and binds the squeue worker thread to this CPU.
82  *
83  * TUNABLES:
84  *
85  * ip_squeue_bind: if set to 1 each squeue worker thread is bound to the CPU
86  * 	associated with an squeue instance.
87  *
88  * ip_squeue_profile: if set to 1 squeue profiling is enabled. NOTE: squeue.c
89  *	should be compiled with SQUEUE_PROFILE enabled for this variable to have
90  *	an impact.
91  *
92  * ip_squeue_fanout: if set to 1 use ip_squeue_random() to pick an squeue,
93  *	otherwise use the default squeue from CPU->cpu_squeue_set.
94  *
95  * ip_squeue_bind, ip_squeue_profile and ip_squeue_fanout can be accessed and
96  * changed using ndd on /dev/tcp or /dev/ip.
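 *	For example (illustrative only; the ndd parameter names are assumed
 *	to match the variable names above):
 *
 *		ndd -get /dev/ip ip_squeue_bind
 *		ndd -set /dev/ip ip_squeue_fanout 1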
97  *
98  * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
99  *	created. This is the time squeue code waits before waking up the worker
100  *	thread after queuing a request.
101  */
102 
103 #include <sys/types.h>
104 #include <sys/debug.h>
105 #include <sys/kmem.h>
106 #include <sys/cpuvar.h>
107 
108 #include <sys/cmn_err.h>
109 
110 #include <inet/common.h>
111 #include <inet/ip.h>
112 #include <inet/ip_if.h>
113 #include <inet/nd.h>
114 #include <inet/ipclassifier.h>
115 #include <sys/types.h>
116 #include <sys/conf.h>
117 #include <sys/sunddi.h>
118 #include <sys/dlpi.h>
119 #include <sys/squeue_impl.h>
120 
121 /*
122  * We allow multiple NICs to bind to the same CPU, but for performance reasons
123  * we want to preserve a 1 <-> 1 mapping between an squeue and a NIC (or Rx
124  * ring) so that each squeue can uniquely own a NIC or Rx ring and do polling
125  * (PSARC 2004/630). Hence we allow up to MAX_SQUEUES_PER_CPU squeues per CPU.
126  * We start by creating MIN_SQUEUES_PER_CPU squeues per CPU, but more squeues
127  * can be created dynamically as needed.
128  */
129 #define	MAX_SQUEUES_PER_CPU	32
130 #define	MIN_SQUEUES_PER_CPU	1
131 uint_t	ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;
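
/*
 * As an illustrative example (not a recommendation), an administrator could
 * raise the initial per-CPU squeue count by adding the following line to
 * /etc/system and rebooting:
 *
 *	set ip:ip_squeues_per_cpu = 2
 */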
132 
133 #define	IP_NUM_SOFT_RINGS	2
134 uint_t ip_soft_rings_cnt = IP_NUM_SOFT_RINGS;
135 
136 /*
137  * List of all created squeue sets. The size is protected by cpu_lock.
138  */
139 squeue_set_t	**sqset_global_list;
140 uint_t		sqset_global_size;
141 
142 int ip_squeue_bind = B_TRUE;
143 int ip_squeue_profile = B_TRUE;
144 static void (*ip_squeue_create_callback)(squeue_t *) = NULL;
145 
146 /*
147  * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
148  *	created. This is the time squeue code waits before waking up the worker
149  *	thread after queuing a request.
150  */
151 uint_t ip_squeue_worker_wait = 10;
152 
153 static squeue_set_t *ip_squeue_set_create(cpu_t *, boolean_t);
154 static int ip_squeue_cpu_setup(cpu_setup_t, int, void *);
155 
156 static void ip_squeue_set_bind(squeue_set_t *);
157 static void ip_squeue_set_unbind(squeue_set_t *);
158 static squeue_t *ip_find_unused_squeue(squeue_set_t *, boolean_t);
159 static void ip_squeue_clean(void *, mblk_t *, void *);
160 static void ip_squeue_clean_ring(ill_t *, ill_rx_ring_t *);
161 
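/* A CPU is usable here only if it exists and is active (online). */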
162 #define	CPU_ISON(c) (c != NULL && CPU_ACTIVE(c) && (c->cpu_flags & CPU_EXISTS))
163 
164 /*
165  * Create squeue set containing ip_squeues_per_cpu number of squeues
166  * for this CPU and bind them all to the CPU.
167  */
168 static squeue_set_t *
169 ip_squeue_set_create(cpu_t *cp, boolean_t reuse)
170 {
171 	int i;
172 	squeue_set_t	*sqs;
173 	squeue_t 	*sqp;
174 	char 		sqname[64];
175 	processorid_t 	id = cp->cpu_id;
176 
177 	if (reuse) {
178 		int i;
179 
180 		/*
181 		 * We may already have an squeue created for this CPU. Try to
182 		 * find one and reuse it if possible.
183 		 */
184 		for (i = 0; i < sqset_global_size; i++) {
185 			sqs = sqset_global_list[i];
186 			if (id == sqs->sqs_bind)
187 				return (sqs);
188 		}
189 	}
190 
191 	sqs = kmem_zalloc(sizeof (squeue_set_t) +
192 	    (sizeof (squeue_t *) * MAX_SQUEUES_PER_CPU), KM_SLEEP);
193 	mutex_init(&sqs->sqs_lock, NULL, MUTEX_DEFAULT, NULL);
194 	sqs->sqs_list = (squeue_t **)&sqs[1];
195 	sqs->sqs_max_size = MAX_SQUEUES_PER_CPU;
196 	sqs->sqs_bind = id;
197 
198 	for (i = 0; i < ip_squeues_per_cpu; i++) {
199 		bzero(sqname, sizeof (sqname));
200 
201 		(void) snprintf(sqname, sizeof (sqname),
202 		    "ip_squeue_cpu_%d/%d/%d", cp->cpu_seqid,
203 		    cp->cpu_id, i);
204 
205 		sqp = squeue_create(sqname, id, ip_squeue_worker_wait,
206 		    minclsyspri);
207 
208 		ASSERT(sqp != NULL);
209 
210 		/*
211 		 * The first squeue in each squeue_set is the DEFAULT
212 		 * squeue.
213 		 */
214 		sqp->sq_state |= SQS_DEFAULT;
215 
216 		squeue_profile_enable(sqp);
217 		sqs->sqs_list[sqs->sqs_size++] = sqp;
218 
219 		if (ip_squeue_create_callback != NULL)
220 			ip_squeue_create_callback(sqp);
221 	}
222 
223 	if (ip_squeue_bind && cpu_is_online(cp))
224 		ip_squeue_set_bind(sqs);
225 
226 	sqset_global_list[sqset_global_size++] = sqs;
227 	ASSERT(sqset_global_size <= NCPU);
228 	return (sqs);
229 }
230 
231 /*
232  * Initialize IP squeues.
233  */
234 void
235 ip_squeue_init(void (*callback)(squeue_t *))
236 {
237 	int i;
238 
239 	ASSERT(sqset_global_list == NULL);
240 
241 	if (ip_squeues_per_cpu < MIN_SQUEUES_PER_CPU)
242 		ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;
243 	else if (ip_squeues_per_cpu > MAX_SQUEUES_PER_CPU)
244 		ip_squeues_per_cpu = MAX_SQUEUES_PER_CPU;
245 
246 	ip_squeue_create_callback = callback;
247 	squeue_init();
248 	sqset_global_list =
249 	    kmem_zalloc(sizeof (squeue_set_t *) * NCPU, KM_SLEEP);
250 	sqset_global_size = 0;
251 	mutex_enter(&cpu_lock);
252 
253 	/* Create an squeue set for each active CPU available */
254 	for (i = 0; i < NCPU; i++) {
255 		cpu_t *cp = cpu[i];
256 		if (CPU_ISON(cp) && cp->cpu_squeue_set == NULL) {
257 			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_FALSE);
258 		}
259 	}
260 
261 	register_cpu_setup_func(ip_squeue_cpu_setup, NULL);
262 
263 	mutex_exit(&cpu_lock);
264 
265 	if (ip_squeue_profile)
266 		squeue_profile_start();
267 }
268 
269 /*
270  * Get squeue_t structure based on index.
271  * Since the squeue list can only grow, no need to grab any lock.
272  */
273 squeue_t *
274 ip_squeue_random(uint_t index)
275 {
276 	squeue_set_t *sqs;
277 
278 	sqs = sqset_global_list[index % sqset_global_size];
279 	return (sqs->sqs_list[index % sqs->sqs_size]);
280 }
281 
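/*
 * Squeue callback, queued via squeue_enter() by ip_squeue_clean_ring():
 * detach the Rx ring passed in mp->b_wptr from the squeue, unbind any
 * soft ring, and signal the waiting ill when the cleanup is done.
 */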
282 /* ARGSUSED */
283 static void
284 ip_squeue_clean(void *arg1, mblk_t *mp, void *arg2)
285 {
286 	squeue_t	*sqp = arg2;
287 	ill_rx_ring_t	*ring = (ill_rx_ring_t *)mp->b_wptr;
288 	ill_t		*ill;
289 
290 	ASSERT(sqp != NULL);
291 	mp->b_wptr = NULL;
292 
293 	if (ring == NULL) {
294 		return;
295 	}
296 
297 	/*
298 	 * Clean up squeue
299 	 */
300 	mutex_enter(&sqp->sq_lock);
301 	sqp->sq_state &= ~(SQS_ILL_BOUND|SQS_POLL_CAPAB);
302 	sqp->sq_rx_ring = NULL;
303 	mutex_exit(&sqp->sq_lock);
304 
305 	ill = ring->rr_ill;
306 	if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
307 		ASSERT(ring->rr_handle != NULL);
308 		ill->ill_dls_capab->ill_dls_unbind(ring->rr_handle);
309 	}
310 
311 	/*
312 	 * Cleanup the ring
313 	 */
314 
315 	ring->rr_blank = NULL;
316 	ring->rr_handle = NULL;
317 	ring->rr_sqp = NULL;
318 
319 	/*
320 	 * Signal ill that cleanup is done
321 	 */
322 	mutex_enter(&ill->ill_lock);
323 	ring->rr_ring_state = ILL_RING_FREE;
324 	cv_signal(&ill->ill_cv);
325 	mutex_exit(&ill->ill_lock);
326 }
327 
328 /*
329  * Clean up one Rx ring. The real cleanup happens behind the squeue via the
330  * ip_squeue_clean() function, but we need to protect ourselves from two
331  * threads trying to clean up at the same time (possible with one port going
332  * down for aggr and someone tearing down the entire aggr simultaneously).
333  * So we use the ring's rr_ring_state, protected by ill_lock, to track the
334  * cleanup: ILL_RING_INUSE means the ring is live, ILL_RING_BEING_FREED means
335  * a cleanup is in progress, and ILL_RING_FREE (signalled via ill_cv) means
336  * the cleanup is done.
337  */
338 static void
339 ip_squeue_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
340 {
341 	conn_t *connp;
342 	squeue_t *sqp;
343 	mblk_t *mp;
344 
345 	ASSERT(rx_ring != NULL);
346 
347 	/* Just clean one squeue */
348 	mutex_enter(&ill->ill_lock);
349 	/*
350 	 * Reset the ILL_SOFT_RING_ASSIGN bit so that
351 	 * ip_squeue_soft_ring_affinity() will not go
352 	 * ahead with assigning rings.
353 	 */
354 	ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
355 	while (rx_ring->rr_ring_state == ILL_RING_INPROC)
356 		/* Some operations pending on the ring. Wait */
357 		cv_wait(&ill->ill_cv, &ill->ill_lock);
358 
359 	if (rx_ring->rr_ring_state != ILL_RING_INUSE) {
360 		/*
361 		 * Someone already trying to clean
362 		 * this squeue or it's already been cleaned.
363 		 */
364 		mutex_exit(&ill->ill_lock);
365 		return;
366 	}
367 	sqp = rx_ring->rr_sqp;
368 
369 	if (sqp == NULL) {
370 		/*
371 		 * The rx_ring never had a squeue assigned to it.
372 		 * We are under ill_lock so we can clean it up
373 		 * here itself since no one can get to it.
374 		 */
375 		rx_ring->rr_blank = NULL;
376 		rx_ring->rr_handle = NULL;
377 		rx_ring->rr_sqp = NULL;
378 		rx_ring->rr_ring_state = ILL_RING_FREE;
379 		mutex_exit(&ill->ill_lock);
380 		return;
381 	}
382 
383 	/* Indicate that it's being cleaned */
384 	rx_ring->rr_ring_state = ILL_RING_BEING_FREED;
385 	ASSERT(sqp != NULL);
386 	mutex_exit(&ill->ill_lock);
387 
388 	/*
389 	 * Use the preallocated ill_unbind_conn for this purpose
390 	 */
391 	connp = ill->ill_dls_capab->ill_unbind_conn;
392 
393 	if (connp->conn_tcp->tcp_closemp.b_prev == NULL) {
394 		connp->conn_tcp->tcp_closemp_used = B_TRUE;
395 	} else {
396 		cmn_err(CE_PANIC, "ip_squeue_clean_ring: "
397 		    "concurrent use of tcp_closemp_used: connp %p tcp %p\n",
398 		    (void *)connp, (void *)connp->conn_tcp);
399 	}
400 
401 	TCP_DEBUG_GETPCSTACK(connp->conn_tcp->tcmp_stk, 15);
402 	mp = &connp->conn_tcp->tcp_closemp;
403 	CONN_INC_REF(connp);
404 
405 	/*
406 	 * Since the field sq_rx_ring for default squeue is NULL,
407 	 * ip_squeue_clean() will have no way to get the ring if we
408 	 * don't pass the pointer to it. We use b_wptr to do so
409 	 * as use of b_wptr for any other purpose is not expected.
410 	 */
411 
412 	ASSERT(mp->b_wptr == NULL);
413 	mp->b_wptr = (unsigned char *)rx_ring;
414 	squeue_enter(sqp, mp, ip_squeue_clean, connp, NULL);
415 
416 	mutex_enter(&ill->ill_lock);
417 	while (rx_ring->rr_ring_state != ILL_RING_FREE)
418 		cv_wait(&ill->ill_cv, &ill->ill_lock);
419 	mutex_exit(&ill->ill_lock);
420 }
421 
422 void
423 ip_squeue_clean_all(ill_t *ill)
424 {
425 	int idx;
426 
427 	/*
428 	 * No need to clean if the ill has neither polling nor soft ring capability
429 	 */
430 	if (!(ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING)))
431 		return;
432 
433 	for (idx = 0; idx < ILL_MAX_RINGS; idx++) {
434 		ill_rx_ring_t *ipr = &ill->ill_dls_capab->ill_ring_tbl[idx];
435 
436 		ip_squeue_clean_ring(ill, ipr);
437 	}
438 
439 	ill->ill_capabilities &= ~(ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING);
440 }
441 
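/*
 * Argument block handed to ip_squeue_extend() and
 * ip_squeue_soft_ring_affinity() when they are dispatched via the system
 * taskq; it is freed by the taskq function itself.
 */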
442 typedef struct ip_taskq_arg {
443 	ill_t		*ip_taskq_ill;
444 	ill_rx_ring_t	*ip_taskq_ill_rx_ring;
445 	cpu_t		*ip_taskq_cpu;
446 } ip_taskq_arg_t;
447 
448 /*
449  * Do a Rx ring to squeue binding. Find a unique squeue that is not
450  * managing a receive ring. If no such squeue exists, dynamically
451  * create a new one in the squeue set.
452  *
453  * The function runs via the system taskq. The ill passed as an
454  * argument can't go away since we hold a ref. The lock order is
455  * ill_lock -> sqs_lock -> sq_lock.
456  *
457  * We might end up binding an Rx ring to an squeue attached to an offline
458  * CPU; there is no need to check for that because squeues are never
459  * destroyed once created.
460  */
461 /* ARGSUSED */
462 static void
463 ip_squeue_extend(void *arg)
464 {
465 	ip_taskq_arg_t	*sq_arg = (ip_taskq_arg_t *)arg;
466 	ill_t		*ill = sq_arg->ip_taskq_ill;
467 	ill_rx_ring_t	*ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
468 	cpu_t		*intr_cpu = sq_arg->ip_taskq_cpu;
469 	squeue_set_t 	*sqs;
470 	squeue_t 	*sqp = NULL;
471 
472 	ASSERT(ill != NULL);
473 	ASSERT(ill_rx_ring != NULL);
474 	kmem_free(arg, sizeof (ip_taskq_arg_t));
475 
476 	/*
477 	 * Make sure the CPU that originally took the interrupt still
478 	 * exists.
479 	 */
480 	if (!CPU_ISON(intr_cpu))
481 		intr_cpu = CPU;
482 
483 	sqs = intr_cpu->cpu_squeue_set;
484 
485 	/*
486 	 * If this ill represents link aggregation, then there might be
487 	 * multiple NICs trying to register themselves at the same time,
488 	 * and in order to ensure that test and assignment of free rings
489 	 * is sequential, we need to hold the ill_lock.
490 	 */
491 	mutex_enter(&ill->ill_lock);
492 	sqp = ip_find_unused_squeue(sqs, B_FALSE);
493 	if (sqp == NULL) {
494 		/*
495 		 * We hit the max limit of squeues allowed per CPU.
496 		 * Assign this rx_ring to DEFAULT squeue of the
497 		 * interrupted CPU but the squeue will not manage
498 		 * the ring. Also print a warning.
499 		 */
500 		cmn_err(CE_NOTE, "ip_squeue_extend: CPU/sqset = %d/%p already "
501 		    "has max number of squeues. System performance might "
502 		    "become suboptimal\n", sqs->sqs_bind, (void *)sqs);
503 
504 		/* the first squeue in the list is the default squeue */
505 		sqp = sqs->sqs_list[0];
506 		ASSERT(sqp != NULL);
507 		ill_rx_ring->rr_sqp = sqp;
508 		ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
509 
510 		mutex_exit(&ill->ill_lock);
511 		ill_waiter_dcr(ill);
512 		return;
513 	}
514 
515 	ASSERT(MUTEX_HELD(&sqp->sq_lock));
516 	sqp->sq_rx_ring = ill_rx_ring;
517 	ill_rx_ring->rr_sqp = sqp;
518 	ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
519 
520 	sqp->sq_state |= (SQS_ILL_BOUND|SQS_POLL_CAPAB);
521 	mutex_exit(&sqp->sq_lock);
522 
523 	mutex_exit(&ill->ill_lock);
524 
525 	/* ill_waiter_dcr will also signal any waiters on ill_ring_state */
526 	ill_waiter_dcr(ill);
527 }
528 
529 /*
530  * Assign the ill's soft rings to squeues and set up worker thread affinity:
531  * for each soft ring, find an squeue that is not already managing a ring,
532  * preferably on the same core as the interrupted CPU, or dynamically create
533  * a new one in that CPU's squeue set.
534  *
535  * The function runs via the system taskq. The ill passed as an
536  * argument can't go away since we hold a ref. The lock order is
537  * ill_lock -> sqs_lock -> sq_lock.
538  *
539  * We might end up binding a soft ring to an squeue attached to an offline
540  * CPU; no need to check for that because squeues are never destroyed.
541  */
542 /* ARGSUSED */
543 static void
544 ip_squeue_soft_ring_affinity(void *arg)
545 {
546 	ip_taskq_arg_t		*sq_arg = (ip_taskq_arg_t *)arg;
547 	ill_t			*ill = sq_arg->ip_taskq_ill;
548 	ill_dls_capab_t	*ill_soft_ring = ill->ill_dls_capab;
549 	ill_rx_ring_t		*ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
550 	cpu_t			*intr_cpu = sq_arg->ip_taskq_cpu;
551 	cpu_t			*bind_cpu;
552 	int			cpu_id = intr_cpu->cpu_id;
553 	int			min_cpu_id, max_cpu_id;
554 	boolean_t		enough_uniq_cpus = B_FALSE;
555 	boolean_t		enough_cpus = B_FALSE;
556 	squeue_set_t 		*sqs, *last_sqs;
557 	squeue_t 		*sqp = NULL;
558 	int			i, j;
559 
560 	ASSERT(ill != NULL);
561 	kmem_free(arg, sizeof (ip_taskq_arg_t));
562 
563 	/*
564 	 * Make sure the CPU that originally took the interrupt still
565 	 * exists.
566 	 */
567 	if (!CPU_ISON(intr_cpu)) {
568 		intr_cpu = CPU;
569 		cpu_id = intr_cpu->cpu_id;
570 	}
571 
572 	/*
573 	 * If this ill represents link aggregation, then there might be
574 	 * multiple NICs trying to register themselves at the same time,
575 	 * and in order to ensure that test and assignment of free rings
576 	 * is sequential, we need to hold the ill_lock.
577 	 */
578 	mutex_enter(&ill->ill_lock);
579 
580 	if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
581 		mutex_exit(&ill->ill_lock);
582 		return;
583 	}
584 	/*
585 	 * We need to fan out the interrupts from the NIC. We do that by
586 	 * telling the driver underneath to create soft rings and use
587 	 * worker threads (if the driver advertised the SOFT_RING capability).
588 	 * It is still a big performance win if we can fan out to the
589 	 * threads on the same core that is taking interrupts.
590 	 *
591 	 * Since we don't know the interrupt to CPU binding, we don't
592 	 * assign any squeues or affinity to worker threads in the NIC.
593 	 * At the time of the first interrupt, we know which CPU is
594 	 * taking interrupts and try to find other threads on the same
595 	 * core. Assuming ip_threads_per_cpu is correct and cpus are
596 	 * numbered sequentially for each core (XXX need something better
597 	 * than this in future), find the lowest numbered and highest
598 	 * numbered thread for that core.
599 	 *
600 	 * If we have one more thread per core than the number of soft rings,
601 	 * then don't assign any worker threads to the H/W thread (cpu)
602 	 * taking interrupts (capability negotiation tries to ensure this).
603 	 *
604 	 * If the number of threads per core is the same as the number of
605 	 * soft rings, then assign the worker affinity and squeue to
606 	 * the same cpu.
607 	 *
608 	 * Otherwise, just fan out to higher numbered CPUs starting from
609 	 * the interrupted CPU.
610 	 */
611 
612 	min_cpu_id = (cpu_id / ip_threads_per_cpu) * ip_threads_per_cpu;
613 	max_cpu_id = min_cpu_id + ip_threads_per_cpu;
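	/*
	 * Worked example (illustrative numbers only): with
	 * ip_threads_per_cpu == 4 and an interrupted cpu_id of 6,
	 * min_cpu_id is 4 and max_cpu_id is 8, so the candidate CPUs
	 * for fanout are 4 through 7 on the interrupted core.
	 */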
614 
615 	/*
616 	 * Quickly check that there are enough CPUs present for fanout
617 	 * and that max_cpu_id does not exceed the highest CPU id that
618 	 * has an squeue set; we use the cpu id stored in the last
619 	 * squeue_set to get an idea. The scheme is by no means perfect
620 	 * since it doesn't take into account CPU DR operations and the
621 	 * fact that interrupts themselves might move. An ideal scenario
622 	 * would be to ensure that interrupts run on CPUs by themselves
623 	 * and worker threads never have affinity to those CPUs. If
624 	 * the interrupts move to a CPU which had a worker thread, the
625 	 * affinity should be changed. Probably callbacks similar to CPU
626 	 * offline are needed to make it work perfectly.
627 	 */
628 	last_sqs = sqset_global_list[sqset_global_size - 1];
629 	if (ip_threads_per_cpu <= ncpus && max_cpu_id <= last_sqs->sqs_bind) {
630 		if ((max_cpu_id - min_cpu_id) >
631 		    ill_soft_ring->ill_dls_soft_ring_cnt)
632 			enough_uniq_cpus = B_TRUE;
633 		else if ((max_cpu_id - min_cpu_id) >=
634 		    ill_soft_ring->ill_dls_soft_ring_cnt)
635 			enough_cpus = B_TRUE;
636 	}
637 
638 	j = 0;
639 	for (i = 0; i < (ill_soft_ring->ill_dls_soft_ring_cnt + j); i++) {
640 		if (enough_uniq_cpus) {
641 			if ((min_cpu_id + i) == cpu_id) {
642 				j++;
643 				continue;
644 			}
645 			bind_cpu = cpu[min_cpu_id + i];
646 		} else if (enough_cpus) {
647 			bind_cpu = cpu[min_cpu_id + i];
648 		} else {
649 			/* bind_cpu = cpu[(cpu_id + i) % last_sqs->sqs_bind]; */
650 			bind_cpu = cpu[(cpu_id + i) % ncpus];
651 		}
652 
653 		/*
654 		 * Check if the CPU actually exists and is active. If not,
655 		 * use the interrupted CPU. ip_find_unused_squeue() will
656 		 * find the right CPU to fan out to anyway.
657 		 */
658 		if (!CPU_ISON(bind_cpu))
659 			bind_cpu = intr_cpu;
660 
661 		sqs = bind_cpu->cpu_squeue_set;
662 		ASSERT(sqs != NULL);
663 		ill_rx_ring = &ill_soft_ring->ill_ring_tbl[i - j];
664 
665 		sqp = ip_find_unused_squeue(sqs, B_TRUE);
666 		if (sqp == NULL) {
667 			/*
668 			 * We hit the max limit of squeues allowed per CPU.
669 			 * Assign this rx_ring to the DEFAULT squeue of the
670 			 * interrupted CPU, but the squeue will not manage
671 			 * the ring. Also print a warning.
672 			 */
673 			cmn_err(CE_NOTE, "ip_squeue_soft_ring: CPU/sqset = "
674 			    "%d/%p already has max number of squeues. System "
675 			    "performance might become suboptimal\n",
676 			    sqs->sqs_bind, (void *)sqs);
677 
678 			/* the first squeue in the list is the default squeue */
679 			sqp = intr_cpu->cpu_squeue_set->sqs_list[0];
680 			ASSERT(sqp != NULL);
681 
682 			ill_rx_ring->rr_sqp = sqp;
683 			ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
684 			continue;
685 
686 		}
687 		ASSERT(MUTEX_HELD(&sqp->sq_lock));
688 		ill_rx_ring->rr_sqp = sqp;
689 		sqp->sq_rx_ring = ill_rx_ring;
690 		ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
691 		sqp->sq_state |= SQS_ILL_BOUND;
692 
693 		/* assign affinity to soft ring */
694 		if (ip_squeue_bind && (sqp->sq_state & SQS_BOUND)) {
695 			ill_soft_ring->ill_dls_bind(ill_rx_ring->rr_handle,
696 			    sqp->sq_bind);
697 		}
698 		mutex_exit(&sqp->sq_lock);
699 	}
700 	mutex_exit(&ill->ill_lock);
701 
702 	ill_soft_ring->ill_dls_change_status(ill_soft_ring->ill_tx_handle,
703 	    SOFT_RING_FANOUT);
704 
705 	mutex_enter(&ill->ill_lock);
706 	ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
707 	mutex_exit(&ill->ill_lock);
708 
709 	/* ill_waiter_dcr will also signal any waiters on ill_ring_state */
710 	ill_waiter_dcr(ill);
711 }
712 
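/*
 * Receive-side entry point used while soft ring assignment is pending for
 * this ill: schedule ip_squeue_soft_ring_affinity() once via the system
 * taskq (guarded by the ILL_SOFT_RING_ASSIGN flag) and pass the inbound
 * packet chain on to ip_input().
 */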
713 /* ARGSUSED */
714 void
715 ip_soft_ring_assignment(ill_t *ill, ill_rx_ring_t *ip_ring,
716     mblk_t *mp_chain, struct mac_header_info_s *mhip)
717 {
718 	ip_taskq_arg_t	*taskq_arg;
719 	boolean_t	refheld;
720 
721 	mutex_enter(&ill->ill_lock);
722 	if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
723 		taskq_arg = (ip_taskq_arg_t *)
724 		    kmem_zalloc(sizeof (ip_taskq_arg_t), KM_NOSLEEP);
725 
726 		if (taskq_arg == NULL)
727 			goto out;
728 
729 		taskq_arg->ip_taskq_ill = ill;
730 		taskq_arg->ip_taskq_ill_rx_ring = NULL;
731 		taskq_arg->ip_taskq_cpu = CPU;
732 
733 		/*
734 		 * Set the ILL_SOFT_RING_ASSIGN flag so that the next
735 		 * interrupt does not schedule another task to call
736 		 * ip_squeue_soft_ring_affinity().
737 		 */
738 		ill->ill_state_flags |= ILL_SOFT_RING_ASSIGN;
739 	} else {
740 		mutex_exit(&ill->ill_lock);
741 		goto out;
742 	}
743 	mutex_exit(&ill->ill_lock);
744 	refheld = ill_waiter_inc(ill);
745 	if (refheld) {
746 		if (taskq_dispatch(system_taskq,
747 		    ip_squeue_soft_ring_affinity, taskq_arg, TQ_NOSLEEP))
748 			goto out;
749 
750 		/* release ref on ill if taskq dispatch fails */
751 		ill_waiter_dcr(ill);
752 	}
753 	/*
754 	 * Clear the ILL_SOFT_RING_ASSIGN flag so that affinity assignment
755 	 * can be tried again later.
756 	 */
757 	mutex_enter(&ill->ill_lock);
758 	ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
759 	mutex_exit(&ill->ill_lock);
760 	kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
761 
762 out:
763 	ip_input(ill, NULL, mp_chain, mhip);
764 }
765 
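/*
 * Find an squeue in the given squeue set that is not already bound to an
 * Rx ring (and is not the DEFAULT squeue), creating a new squeue if the
 * set is not yet full. If 'fanout' is set and this set already manages a
 * NIC, prefer the squeue set with the fewest squeues whose CPU is still
 * present. Returns with the chosen squeue's sq_lock held, or NULL if the
 * set already has the maximum number of squeues or its CPU was DR'd out.
 */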
766 static squeue_t *
767 ip_find_unused_squeue(squeue_set_t *sqs, boolean_t fanout)
768 {
769 	int 		i;
770 	squeue_set_t	*best_sqs = NULL;
771 	squeue_set_t	*curr_sqs = NULL;
772 	int		min_sq = 0;
773 	squeue_t 	*sqp = NULL;
774 	char		sqname[64];
775 	cpu_t		*bind_cpu;
776 
777 	/*
778 	 * If fanout is set and the passed squeue_set already has some
779 	 * squeues which are managing NICs, try to find an squeue on a
780 	 * less loaded CPU.
781 	 */
782 	if (sqs->sqs_size > 1 && fanout) {
783 		/*
784 		 * First check to see if any squeue on the CPU passed
785 		 * is managing a NIC.
786 		 */
787 		for (i = 0; i < sqs->sqs_size; i++) {
788 			mutex_enter(&sqs->sqs_list[i]->sq_lock);
789 			if ((sqs->sqs_list[i]->sq_state & SQS_ILL_BOUND) &&
790 			    !(sqs->sqs_list[i]->sq_state & SQS_DEFAULT)) {
791 				mutex_exit(&sqs->sqs_list[i]->sq_lock);
792 				break;
793 			}
794 			mutex_exit(&sqs->sqs_list[i]->sq_lock);
795 		}
796 		if (i != sqs->sqs_size) {
797 			best_sqs = NULL;
798 
799 			for (i = sqset_global_size - 1; i >= 0; i--) {
800 				curr_sqs = sqset_global_list[i];
801 				/*
802 				 * Check and make sure the CPU that sqs
803 				 * is bound to is valid. There could be
804 				 * sqs's around whose CPUs could have
805 				 * been DR'd out.
806 				 */
807 				mutex_enter(&cpu_lock);
808 				if (cpu_get(curr_sqs->sqs_bind) != NULL) {
809 					if (best_sqs == NULL) {
810 						best_sqs = curr_sqs;
811 						min_sq = curr_sqs->sqs_size;
812 					} else if (curr_sqs->sqs_size <
813 					    min_sq) {
814 						best_sqs = curr_sqs;
815 						min_sq = curr_sqs->sqs_size;
816 					}
817 				}
818 				mutex_exit(&cpu_lock);
819 			}
820 
821 			ASSERT(best_sqs != NULL);
822 			sqs = best_sqs;
823 		}
824 	}
825 
826 	mutex_enter(&sqs->sqs_lock);
827 
828 	for (i = 0; i < sqs->sqs_size; i++) {
829 		mutex_enter(&sqs->sqs_list[i]->sq_lock);
830 		if ((sqs->sqs_list[i]->sq_state &
831 		    (SQS_DEFAULT|SQS_ILL_BOUND)) == 0) {
832 			sqp = sqs->sqs_list[i];
833 			break;
834 		}
835 		mutex_exit(&sqs->sqs_list[i]->sq_lock);
836 	}
837 
838 	if (sqp == NULL) {
839 		/* Need to create a new squeue */
840 		if (sqs->sqs_size == sqs->sqs_max_size) {
841 			/*
842 			 * Reached the maximum number of squeues
843 			 * we can allocate on this CPU.
844 			 */
845 			mutex_exit(&sqs->sqs_lock);
846 			return (NULL);
847 		}
848 
849 		mutex_enter(&cpu_lock);
850 		if ((bind_cpu = cpu_get(sqs->sqs_bind)) == NULL) {
851 			/* Too bad, CPU got DR'd out, return NULL */
852 			mutex_exit(&cpu_lock);
853 			mutex_exit(&sqs->sqs_lock);
854 			return (NULL);
855 		}
856 
857 		bzero(sqname, sizeof (sqname));
858 		(void) snprintf(sqname, sizeof (sqname),
859 		    "ip_squeue_cpu_%d/%d/%d", bind_cpu->cpu_seqid,
860 		    bind_cpu->cpu_id, sqs->sqs_size);
861 		mutex_exit(&cpu_lock);
862 
863 		sqp = squeue_create(sqname, sqs->sqs_bind,
864 		    ip_squeue_worker_wait, minclsyspri);
865 
866 		ASSERT(sqp != NULL);
867 
868 		squeue_profile_enable(sqp);
869 		sqs->sqs_list[sqs->sqs_size++] = sqp;
870 
871 		if (ip_squeue_create_callback != NULL)
872 			ip_squeue_create_callback(sqp);
873 
874 		if (ip_squeue_bind) {
875 			mutex_enter(&cpu_lock);
876 			bind_cpu = cpu_get(sqs->sqs_bind);
877 			if (bind_cpu != NULL && cpu_is_online(bind_cpu)) {
878 				squeue_bind(sqp, -1);
879 			}
880 			mutex_exit(&cpu_lock);
881 		}
882 		mutex_enter(&sqp->sq_lock);
883 	}
884 
885 	mutex_exit(&sqs->sqs_lock);
886 	ASSERT(sqp != NULL);
887 	return (sqp);
888 }
889 
890 /*
891  * Find the squeue assigned to manage this Rx ring. If the Rx ring is not
892  * owned by an squeue yet, do the assignment. When the NIC registers its
893  * Rx rings with IP, we don't know where the interrupts will land and
894  * hence we need to wait till this point to do the assignment.
895  */
896 squeue_t *
897 ip_squeue_get(ill_rx_ring_t *ill_rx_ring)
898 {
899 	squeue_t 	*sqp;
900 	ill_t 		*ill;
901 	int		interrupt;
902 	ip_taskq_arg_t	*taskq_arg;
903 	boolean_t	refheld;
904 
905 	if (ill_rx_ring == NULL)
906 		return (IP_SQUEUE_GET(lbolt));
907 
908 	sqp = ill_rx_ring->rr_sqp;
909 	/*
910 	 * Do a quick check. If it's not NULL, we are done.
911 	 * Squeues are never destroyed, so at worst we will bind
912 	 * this connection to a suboptimal squeue.
913 	 *
914 	 * This is the fast path case.
915 	 */
916 	if (sqp != NULL)
917 		return (sqp);
918 
919 	ill = ill_rx_ring->rr_ill;
920 	ASSERT(ill != NULL);
921 
922 	interrupt = servicing_interrupt();
923 	taskq_arg = (ip_taskq_arg_t *)kmem_zalloc(sizeof (ip_taskq_arg_t),
924 	    KM_NOSLEEP);
925 
926 	mutex_enter(&ill->ill_lock);
927 	/*
928 	 * Check sqp under the lock again for atomicity. Possible race with
929 	 * a previously scheduled ip_squeue_get -> ip_squeue_extend.
930 	 * Do the ring to squeue binding only if we are in interrupt context
931 	 * AND the ring is not already bound AND there is no one else trying
932 	 * the bind already.
933 	 */
934 	sqp = ill_rx_ring->rr_sqp;
935 	if (sqp != NULL || !interrupt ||
936 	    ill_rx_ring->rr_ring_state != ILL_RING_INUSE || taskq_arg == NULL) {
937 		/*
938 		 * Note that the ring might get bound once we drop the lock
939 		 * below, if a previous request is in progress i.e. if the ring
940 		 * state is ILL_RING_INPROC. The incoming connection on whose
941 		 * behalf we are currently here might get a suboptimal squeue
942 		 * via the call to IP_SQUEUE_GET below, but there is no
943 		 * correctness issue.
944 		 */
945 		mutex_exit(&ill->ill_lock);
946 		if (taskq_arg != NULL)
947 			kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
948 		if (sqp != NULL)
949 			return (sqp);
950 		return (IP_SQUEUE_GET(lbolt));
951 	}
952 
953 	/*
954 	 * No sqp assigned yet. Can't really do that in interrupt
955 	 * context. Assign the default sqp to this connection and
956 	 * trigger creation of new sqp and binding it to this ring
957 	 * via taskq. Need to make sure ill stays around.
958 	 */
959 	taskq_arg->ip_taskq_ill = ill;
960 	taskq_arg->ip_taskq_ill_rx_ring = ill_rx_ring;
961 	taskq_arg->ip_taskq_cpu = CPU;
962 	ill_rx_ring->rr_ring_state = ILL_RING_INPROC;
963 	mutex_exit(&ill->ill_lock);
964 	refheld = ill_waiter_inc(ill);
965 	if (refheld) {
966 		if (taskq_dispatch(system_taskq, ip_squeue_extend,
967 		    taskq_arg, TQ_NOSLEEP) != NULL) {
968 			return (IP_SQUEUE_GET(lbolt));
969 		}
970 	}
971 	/*
972 	 * The ill is closing and we could not get a reference on the ill OR
973 	 * taskq_dispatch failed probably due to memory allocation failure.
974 	 * We will try again next time.
975 	 */
976 	mutex_enter(&ill->ill_lock);
977 	ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
978 	mutex_exit(&ill->ill_lock);
979 	kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
980 	if (refheld)
981 		ill_waiter_dcr(ill);
982 
983 	return (IP_SQUEUE_GET(lbolt));
984 }
985 
986 /*
987  * NDD hooks for setting ip_squeue_xxx tuneables.
988  */
989 
990 /* ARGSUSED */
991 int
992 ip_squeue_bind_set(queue_t *q, mblk_t *mp, char *value,
993     caddr_t addr, cred_t *cr)
994 {
995 	int *bind_enabled = (int *)addr;
996 	long new_value;
997 	int i;
998 
999 	if (ddi_strtol(value, NULL, 10, &new_value) != 0)
1000 		return (EINVAL);
1001 
1002 	if (ip_squeue_bind == new_value)
1003 		return (0);
1004 
1005 	*bind_enabled = new_value;
1006 	mutex_enter(&cpu_lock);
1007 	if (new_value == 0) {
1008 		for (i = 0; i < sqset_global_size; i++)
1009 			ip_squeue_set_unbind(sqset_global_list[i]);
1010 	} else {
1011 		for (i = 0; i < sqset_global_size; i++)
1012 			ip_squeue_set_bind(sqset_global_list[i]);
1013 	}
1014 
1015 	mutex_exit(&cpu_lock);
1016 	return (0);
1017 }
1018 
1019 /*
1020  * Set squeue profiling.
1021  * 0 means "disable"
1022  * 1 means "enable"
1023  * 2 means "enable and reset"
1024  */
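/*
 * For example, "ndd -set /dev/ip ip_squeue_profile 2" (parameter name
 * assumed to match the variable) stops profiling, resets the per-squeue
 * counters and then starts profiling again.
 */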
1025 /* ARGSUSED */
1026 int
1027 ip_squeue_profile_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
1028     cred_t *cr)
1029 {
1030 	int *profile_enabled = (int *)cp;
1031 	long new_value;
1032 	squeue_set_t *sqs;
1033 
1034 	if (ddi_strtol(value, NULL, 10, &new_value) != 0)
1035 		return (EINVAL);
1036 
1037 	if (new_value == 0)
1038 		squeue_profile_stop();
1039 	else if (new_value == 1)
1040 		squeue_profile_start();
1041 	else if (new_value == 2) {
1042 		int i, j;
1043 
1044 		squeue_profile_stop();
1045 		mutex_enter(&cpu_lock);
1046 		for (i = 0; i < sqset_global_size; i++) {
1047 			sqs = sqset_global_list[i];
1048 			for (j = 0; j < sqs->sqs_size; j++) {
1049 				squeue_profile_reset(sqs->sqs_list[j]);
1050 			}
1051 		}
1052 		mutex_exit(&cpu_lock);
1053 
1054 		new_value = 1;
1055 		squeue_profile_start();
1056 	}
1057 	*profile_enabled = new_value;
1058 
1059 	return (0);
1060 }
1061 
1062 /*
1063  * Reconfiguration callback
1064  */
1065 
1066 /* ARGSUSED */
1067 static int
1068 ip_squeue_cpu_setup(cpu_setup_t what, int id, void *arg)
1069 {
1070 	cpu_t *cp = cpu[id];
1071 
1072 	ASSERT(MUTEX_HELD(&cpu_lock));
1073 	switch (what) {
1074 	case CPU_CONFIG:
1075 		/*
1076 		 * A new CPU is added. Create an squeue for it but do not bind
1077 		 * it yet.
1078 		 */
1079 		if (cp->cpu_squeue_set == NULL)
1080 			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
1081 		break;
1082 	case CPU_ON:
1083 	case CPU_INIT:
1084 	case CPU_CPUPART_IN:
1085 		if (cp->cpu_squeue_set == NULL) {
1086 			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
1087 		}
1088 		if (ip_squeue_bind)
1089 			ip_squeue_set_bind(cp->cpu_squeue_set);
1090 		break;
1091 	case CPU_UNCONFIG:
1092 	case CPU_OFF:
1093 	case CPU_CPUPART_OUT:
1094 		ASSERT((cp->cpu_squeue_set != NULL) ||
1095 		    (cp->cpu_flags & CPU_OFFLINE));
1096 
1097 		if (cp->cpu_squeue_set != NULL) {
1098 			ip_squeue_set_unbind(cp->cpu_squeue_set);
1099 		}
1100 		break;
1101 	default:
1102 		break;
1103 	}
1104 	return (0);
1105 }
1106 
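/*
 * Bind the worker thread of every squeue in the set to the set's CPU
 * (squeues that are already bound are skipped).
 */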
1107 /* ARGSUSED */
1108 static void
1109 ip_squeue_set_bind(squeue_set_t *sqs)
1110 {
1111 	int i;
1112 	squeue_t *sqp;
1113 
1114 	if (!ip_squeue_bind)
1115 		return;
1116 
1117 	mutex_enter(&sqs->sqs_lock);
1118 	for (i = 0; i < sqs->sqs_size; i++) {
1119 		sqp = sqs->sqs_list[i];
1120 		if (sqp->sq_state & SQS_BOUND)
1121 			continue;
1122 		squeue_bind(sqp, -1);
1123 	}
1124 	mutex_exit(&sqs->sqs_lock);
1125 }
1126 
1127 static void
1128 ip_squeue_set_unbind(squeue_set_t *sqs)
1129 {
1130 	int i;
1131 	squeue_t *sqp;
1132 
1133 	mutex_enter(&sqs->sqs_lock);
1134 	for (i = 0; i < sqs->sqs_size; i++) {
1135 		sqp = sqs->sqs_list[i];
1136 
1137 		/*
1138 		 * CPU is going offline. Remove the thread affinity
1139 		 * for any soft ring threads the squeue is managing.
1140 		 */
1141 		if (sqp->sq_state & SQS_ILL_BOUND) {
1142 			ill_rx_ring_t	*ring = sqp->sq_rx_ring;
1143 			ill_t		*ill = ring->rr_ill;
1144 
1145 			if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
1146 				ASSERT(ring->rr_handle != NULL);
1147 				ill->ill_dls_capab->ill_dls_unbind(
1148 				    ring->rr_handle);
1149 			}
1150 		}
1151 		if (!(sqp->sq_state & SQS_BOUND))
1152 			continue;
1153 		squeue_unbind(sqp);
1154 	}
1155 	mutex_exit(&sqs->sqs_lock);
1156 }
1157