xref: /illumos-gate/usr/src/uts/common/inet/ip/ip_squeue.c (revision 7247f8883be6bcac5fe4735b6f87f873387dbbef)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * IP interface to squeues.
30  *
31  * IP creates an squeue instance for each CPU. The squeue pointer is saved in
32  * the cpu_squeue field of the cpu structure. Each squeue is associated with a
33  * connection instance (conn_t).
34  *
35  * For CPUs available at system startup time, squeue creation and association
36  * with the CPU happen at MP initialization time. For CPUs added during dynamic
37  * reconfiguration, the initialization happens when the new CPU is configured in
38  * the system. The squeue is chosen using the IP_SQUEUE_GET macro, which returns
39  * either the per-CPU squeue or a random squeue, depending on the
40  * ip_squeue_fanout variable.
41  *
42  * There are two modes of associating a connection with squeues. The first mode
43  * associates each connection with the CPU that creates the connection (either
44  * during open time or during accept time). The second mode associates each
45  * connection with a random CPU, effectively distributing load over all CPUs
46  * and all squeues in the system. The mode is controlled by the
47  * ip_squeue_fanout variable.
48  *
49  * NOTE: The fact that each connection is associated with an squeue, and each
50  * squeue with a CPU, does not mean that a connection is always processed on
51  * that CPU and on that CPU only. Any thread calling squeue_enter() may process
52  * the connection on whatever CPU it happens to be scheduled on. The squeue to
53  * CPU binding is only relevant for the worker thread.
54  *
55  * The list of all created squeues is kept in squeue_set structure. This list is
56  * used when ip_squeue_fanout is set and the load is distributed across all
57  * squeues.
58  *
59  * INTERFACE:
60  *
61  * squeue_t *ip_squeue_get(hint)
62  *
63  * 	Find an squeue based on the 'hint' value. The hint is used as an index
64  * 	in the array of IP squeues available. The way hint is computed may
65  * 	affect the effectiveness of the squeue distribution. Currently squeues
66  * 	are assigned in round-robin fashion using lbolt as a hint.
67  *
68  *
69  * DR Notes
70  * ========
71  *
72  * ip_squeue_init() registers a call-back function with the CPU DR
73  * subsystem using register_cpu_setup_func(). The call-back function does two
74  * things:
75  *
76  * o When the CPU is going off-line or unconfigured, the worker thread is
77  *	unbound from the CPU. This allows the CPU unconfig code to move it to
78  *	another CPU.
79  *
80  * o When the CPU is going online, it creates a new squeue for this CPU if
81  *	necessary and binds the squeue worker thread to this CPU.
82  *
83  * TUNABLES:
84  *
85  * ip_squeue_bind: if set to 1 each squeue worker thread is bound to the CPU
86  * 	associated with an squeue instance.
87  *
88  * ip_squeue_profile: if set to 1 squeue profiling is enabled. NOTE: squeue.c
89  *	should be compiled with SQUEUE_PROFILE enabled for this variable to have
90  *	an impact.
91  *
92  * ip_squeue_fanout: if set to 1 use ip_squeue_get() to find an squeue,
93  *	otherwise get it from CPU->cpu_squeue.
94  *
95  * ip_squeue_bind, ip_squeue_profile and ip_squeue_fanout can be accessed and
96  * changed using ndd on /dev/tcp or /dev/ip.
97  *
98  * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
99  *	created. This is the time squeue code waits before waking up the worker
100  *	thread after queuing a request.
101  */
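
/*
 * Example (illustration only, not compiled): the block comment above notes
 * that ip_squeue_bind, ip_squeue_profile and ip_squeue_fanout can be read
 * and changed with ndd on /dev/tcp or /dev/ip. Assuming the ndd parameter
 * names match the variable names, an administrator might do:
 *
 *	ndd /dev/ip ip_squeue_fanout		(read the current value)
 *	ndd -set /dev/ip ip_squeue_fanout 1	(switch to fanout mode)
 *
 * The parameter names and device nodes are taken from the comment above;
 * the authoritative list is whatever the IP/TCP ndd tables register.
 */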
102 
103 #include <sys/types.h>
104 #include <sys/debug.h>
105 #include <sys/kmem.h>
106 #include <sys/cpuvar.h>
107 
108 #include <sys/cmn_err.h>
109 
110 #include <inet/common.h>
111 #include <inet/ip.h>
112 #include <inet/ip_if.h>
113 #include <inet/nd.h>
114 #include <inet/ipclassifier.h>
115 #include <sys/types.h>
116 #include <sys/conf.h>
117 #include <sys/sunddi.h>
118 #include <sys/dlpi.h>
119 #include <sys/squeue_impl.h>
120 
121 /*
122  * We allow multiple NICs to bind to the same CPU, but for performance reasons
123  * we want to preserve a 1 <-> 1 mapping between an squeue and a NIC (or Rx
124  * ring) so that each squeue can uniquely own a NIC or Rx ring and do polling
125  * (PSARC 2004/630). So we allow up to MAX_SQUEUES_PER_CPU squeues per CPU.
126  * We start by creating MIN_SQUEUES_PER_CPU squeues per CPU but more squeues
127  * can be created dynamically as needed.
128  */
129 #define	MAX_SQUEUES_PER_CPU	32
130 #define	MIN_SQUEUES_PER_CPU	1
131 uint_t	ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;
132 
133 #define	IP_NUM_SOFT_RINGS	2
134 uint_t ip_soft_rings_cnt = IP_NUM_SOFT_RINGS;
135 
136 /*
137  * List of all created squeue sets. The size is protected by cpu_lock.
138  */
139 squeue_set_t	**sqset_global_list;
140 uint_t		sqset_global_size;
141 
142 int ip_squeue_bind = B_TRUE;
143 int ip_squeue_profile = B_TRUE;
144 static void (*ip_squeue_create_callback)(squeue_t *) = NULL;
145 
146 /*
147  * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
148  *	created. This is the time squeue code waits before waking up the worker
149  *	thread after queuing a request.
150  */
151 uint_t ip_squeue_worker_wait = 10;
152 
153 static squeue_set_t *ip_squeue_set_create(cpu_t *, boolean_t);
154 static int ip_squeue_cpu_setup(cpu_setup_t, int, void *);
155 
156 static void ip_squeue_set_bind(squeue_set_t *);
157 static void ip_squeue_set_unbind(squeue_set_t *);
158 static squeue_t *ip_find_unused_squeue(squeue_set_t *, boolean_t);
159 static void ip_squeue_clean(void *, mblk_t *, void *);
160 static void ip_squeue_clean_ring(ill_t *, ill_rx_ring_t *);
161 
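/* A CPU is considered "on" if it exists and is active (not offline). */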
162 #define	CPU_ISON(c) (c != NULL && CPU_ACTIVE(c) && (c->cpu_flags & CPU_EXISTS))
163 
164 /*
165  * Create squeue set containing ip_squeues_per_cpu number of squeues
166  * for this CPU and bind them all to the CPU.
167  */
168 static squeue_set_t *
169 ip_squeue_set_create(cpu_t *cp, boolean_t reuse)
170 {
171 	int i;
172 	squeue_set_t	*sqs;
173 	squeue_t 	*sqp;
174 	char 		sqname[64];
175 	processorid_t 	id = cp->cpu_id;
176 
177 	if (reuse) {
178 		int i;
179 
180 		/*
181 		 * We may already have an squeue created for this CPU. Try to
182 		 * find one and reuse it if possible.
183 		 */
184 		for (i = 0; i < sqset_global_size; i++) {
185 			sqs = sqset_global_list[i];
186 			if (id == sqs->sqs_bind)
187 				return (sqs);
188 		}
189 	}
190 
191 	sqs = kmem_zalloc(sizeof (squeue_set_t) +
192 	    (sizeof (squeue_t *) * MAX_SQUEUES_PER_CPU), KM_SLEEP);
193 	mutex_init(&sqs->sqs_lock, NULL, MUTEX_DEFAULT, NULL);
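	/*
	 * The squeue pointer array lives immediately after the squeue_set_t
	 * in the single allocation above, so sqs_list simply points just
	 * past the end of the structure.
	 */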
194 	sqs->sqs_list = (squeue_t **)&sqs[1];
195 	sqs->sqs_max_size = MAX_SQUEUES_PER_CPU;
196 	sqs->sqs_bind = id;
197 
198 	for (i = 0; i < ip_squeues_per_cpu; i++) {
199 		bzero(sqname, sizeof (sqname));
200 
201 		(void) snprintf(sqname, sizeof (sqname),
202 		    "ip_squeue_cpu_%d/%d/%d", cp->cpu_seqid,
203 		    cp->cpu_id, i);
204 
205 		sqp = squeue_create(sqname, id, ip_squeue_worker_wait,
206 		    minclsyspri);
207 
208 		ASSERT(sqp != NULL);
209 
210 		/*
211 		 * The first squeue in each squeue_set is the DEFAULT
212 		 * squeue.
213 		 */
214 		sqp->sq_state |= SQS_DEFAULT;
215 
216 		squeue_profile_enable(sqp);
217 		sqs->sqs_list[sqs->sqs_size++] = sqp;
218 
219 		if (ip_squeue_create_callback != NULL)
220 			ip_squeue_create_callback(sqp);
221 	}
222 
223 	if (ip_squeue_bind && cpu_is_online(cp))
224 		ip_squeue_set_bind(sqs);
225 
226 	sqset_global_list[sqset_global_size++] = sqs;
227 	ASSERT(sqset_global_size <= NCPU);
228 	return (sqs);
229 }
230 
231 /*
232  * Initialize IP squeues.
233  */
234 void
235 ip_squeue_init(void (*callback)(squeue_t *))
236 {
237 	int i;
238 
239 	ASSERT(sqset_global_list == NULL);
240 
241 	if (ip_squeues_per_cpu < MIN_SQUEUES_PER_CPU)
242 		ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;
243 	else if (ip_squeues_per_cpu > MAX_SQUEUES_PER_CPU)
244 		ip_squeues_per_cpu = MAX_SQUEUES_PER_CPU;
245 
246 	ip_squeue_create_callback = callback;
247 	squeue_init();
248 	sqset_global_list =
249 	    kmem_zalloc(sizeof (squeue_set_t *) * NCPU, KM_SLEEP);
250 	sqset_global_size = 0;
251 	mutex_enter(&cpu_lock);
252 
253 	/* Create squeue for each active CPU available */
254 	for (i = 0; i < NCPU; i++) {
255 		cpu_t *cp = cpu[i];
256 		if (CPU_ISON(cp) && cp->cpu_squeue_set == NULL) {
257 			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_FALSE);
258 		}
259 	}
260 
261 	register_cpu_setup_func(ip_squeue_cpu_setup, NULL);
262 
263 	mutex_exit(&cpu_lock);
264 
265 	if (ip_squeue_profile)
266 		squeue_profile_start();
267 }
268 
269 /*
270  * Get squeue_t structure based on index.
271  * Since the squeue list can only grow, no need to grab any lock.
272  */
273 squeue_t *
274 ip_squeue_random(uint_t index)
275 {
276 	squeue_set_t *sqs;
277 
278 	sqs = sqset_global_list[index % sqset_global_size];
279 	return (sqs->sqs_list[index % sqs->sqs_size]);
280 }
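
/*
 * Illustration only, not the authoritative definition: based on the block
 * comment at the top of this file, the IP_SQUEUE_GET(hint) macro used by
 * callers is assumed to reduce to something like
 *
 *	ip_squeue_fanout ? ip_squeue_random(hint) :
 *	    <the default (first) squeue of the current CPU's squeue set>
 *
 * i.e. it either fans out across all squeue sets using the hint (currently
 * lbolt) or sticks with the current CPU's default squeue.
 */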
281 
282 /* ARGSUSED */
283 static void
284 ip_squeue_clean(void *arg1, mblk_t *mp, void *arg2)
285 {
286 	squeue_t	*sqp = arg2;
287 	ill_rx_ring_t	*ring = sqp->sq_rx_ring;
288 	ill_t		*ill;
289 
290 	ASSERT(sqp != NULL);
291 
292 	if (ring == NULL) {
293 		return;
294 	}
295 
296 	/*
297 	 * Clean up squeue
298 	 */
299 	mutex_enter(&sqp->sq_lock);
300 	sqp->sq_state &= ~(SQS_ILL_BOUND|SQS_POLL_CAPAB);
301 	sqp->sq_rx_ring = NULL;
302 	mutex_exit(&sqp->sq_lock);
303 
304 	ill = ring->rr_ill;
305 	if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
306 		ASSERT(ring->rr_handle != NULL);
307 		ill->ill_dls_capab->ill_dls_unbind(ring->rr_handle);
308 	}
309 
310 	/*
311 	 * Cleanup the ring
312 	 */
313 
314 	ring->rr_blank = NULL;
315 	ring->rr_handle = NULL;
316 	ring->rr_sqp = NULL;
317 
318 	/*
319 	 * Signal ill that cleanup is done
320 	 */
321 	mutex_enter(&ill->ill_lock);
322 	ring->rr_ring_state = ILL_RING_FREE;
323 	cv_signal(&ill->ill_cv);
324 	mutex_exit(&ill->ill_lock);
325 }
326 
327 /*
328  * Clean up one squeue element. ill_inuse_ref is protected by ill_lock.
329  * The real cleanup happens behind the squeue via the ip_squeue_clean function,
330  * but we need to protect ourselves from two threads trying to clean up at the
331  * same time (possible with one port going down for an aggr and someone tearing
332  * down the entire aggr simultaneously). So we use ill_inuse_ref, protected by
333  * ill_lock, to indicate when the cleanup has started (1 ref) and when the
334  * cleanup is done (0 refs). When a new ring gets assigned to an squeue, we
335  * start by putting 2 refs on ill_inuse_ref.
336  */
337 static void
338 ip_squeue_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
339 {
340 	conn_t *connp;
341 	squeue_t *sqp;
342 	mblk_t *mp;
343 
344 	ASSERT(rx_ring != NULL);
345 
346 	/* Just clean one squeue */
347 	mutex_enter(&ill->ill_lock);
348 	/*
349 	 * Reset the ILL_SOFT_RING_ASSIGN bit so that
350 	 * ip_squeue_soft_ring_affinity() will not go
351 	 * ahead with assigning rings.
352 	 */
353 	ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
354 	while (rx_ring->rr_ring_state == ILL_RING_INPROC)
355 		/* Some operations pending on the ring. Wait */
356 		cv_wait(&ill->ill_cv, &ill->ill_lock);
357 
358 	if (rx_ring->rr_ring_state != ILL_RING_INUSE) {
359 		/*
360 		 * Someone is already trying to clean
361 		 * this squeue, or it has already been cleaned.
362 		 */
363 		mutex_exit(&ill->ill_lock);
364 		return;
365 	}
366 	sqp = rx_ring->rr_sqp;
367 
368 	if (sqp == NULL) {
369 		/*
370 		 * The rx_ring never had a squeue assigned to it.
371 		 * We are under ill_lock so we can clean it up
372 		 * here itself since no one can get to it.
373 		 */
374 		rx_ring->rr_blank = NULL;
375 		rx_ring->rr_handle = NULL;
376 		rx_ring->rr_sqp = NULL;
377 		rx_ring->rr_ring_state = ILL_RING_FREE;
378 		mutex_exit(&ill->ill_lock);
379 		return;
380 	}
381 
382 	/* Indicate that it's being cleaned */
383 	rx_ring->rr_ring_state = ILL_RING_BEING_FREED;
384 	ASSERT(sqp != NULL);
385 	mutex_exit(&ill->ill_lock);
386 
387 	/*
388 	 * Use the preallocated ill_unbind_conn for this purpose
389 	 */
390 	connp = ill->ill_dls_capab->ill_unbind_conn;
391 
392 	if (connp->conn_tcp->tcp_closemp.b_prev == NULL) {
393 		connp->conn_tcp->tcp_closemp_used = B_TRUE;
394 	} else {
395 		cmn_err(CE_PANIC, "ip_squeue_clean_ring: "
396 		    "concurrent use of tcp_closemp_used: connp %p tcp %p\n",
397 		    (void *)connp, (void *)connp->conn_tcp);
398 	}
399 
400 	TCP_DEBUG_GETPCSTACK(connp->conn_tcp->tcmp_stk, 15);
401 	mp = &connp->conn_tcp->tcp_closemp;
402 	CONN_INC_REF(connp);
403 	squeue_enter(sqp, mp, ip_squeue_clean, connp, NULL);
404 
405 	mutex_enter(&ill->ill_lock);
406 	while (rx_ring->rr_ring_state != ILL_RING_FREE)
407 		cv_wait(&ill->ill_cv, &ill->ill_lock);
408 	mutex_exit(&ill->ill_lock);
409 }
410 
411 void
412 ip_squeue_clean_all(ill_t *ill)
413 {
414 	int idx;
415 
416 	/*
417 	 * No need to clean if poll_capab isn't set for this ill
418 	 */
419 	if (!(ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING)))
420 		return;
421 
422 	for (idx = 0; idx < ILL_MAX_RINGS; idx++) {
423 		ill_rx_ring_t *ipr = &ill->ill_dls_capab->ill_ring_tbl[idx];
424 
425 		ip_squeue_clean_ring(ill, ipr);
426 	}
427 
428 	ill->ill_capabilities &= ~(ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING);
429 }
430 
431 typedef struct ip_taskq_arg {
432 	ill_t		*ip_taskq_ill;
433 	ill_rx_ring_t	*ip_taskq_ill_rx_ring;
434 	cpu_t		*ip_taskq_cpu;
435 } ip_taskq_arg_t;
436 
437 /*
438  * Do a Rx ring to squeue binding. Find a unique squeue that is not
439  * managing a receive ring. If no such squeue exists, dynamically
440  * create a new one in the squeue set.
441  *
442  * The function runs via the system taskq. The ill passed as an
443  * argument can't go away since we hold a ref. The lock order is
444  * ill_lock -> sqs_lock -> sq_lock.
445  *
446  * If we end up binding a Rx ring to an squeue attached to an offline
447  * CPU, there is no need to check for that, because squeues are never
448  * destroyed once created.
449  */
450 /* ARGSUSED */
451 static void
452 ip_squeue_extend(void *arg)
453 {
454 	ip_taskq_arg_t	*sq_arg = (ip_taskq_arg_t *)arg;
455 	ill_t		*ill = sq_arg->ip_taskq_ill;
456 	ill_rx_ring_t	*ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
457 	cpu_t		*intr_cpu = sq_arg->ip_taskq_cpu;
458 	squeue_set_t 	*sqs;
459 	squeue_t 	*sqp = NULL;
460 
461 	ASSERT(ill != NULL);
462 	ASSERT(ill_rx_ring != NULL);
463 	kmem_free(arg, sizeof (ip_taskq_arg_t));
464 
465 	/*
466 	 * Make sure the CPU that originally took the interrupt still
467 	 * exists.
468 	 */
469 	if (!CPU_ISON(intr_cpu))
470 		intr_cpu = CPU;
471 
472 	sqs = intr_cpu->cpu_squeue_set;
473 
474 	/*
475 	 * If this ill represents link aggregation, then there might be
476 	 * multiple NICs trying to register themselves at the same time,
477 	 * and in order to ensure that test and assignment of free rings
478 	 * is sequential, we need to hold the ill_lock.
479 	 */
480 	mutex_enter(&ill->ill_lock);
481 	sqp = ip_find_unused_squeue(sqs, B_FALSE);
482 	if (sqp == NULL) {
483 		/*
484 		 * We hit the max limit of squeues allowed per CPU.
485 		 * Assign this rx_ring to the DEFAULT squeue of the
486 		 * interrupted CPU, but the squeue will not manage
487 		 * the ring. Also print a warning.
488 		 */
489 		cmn_err(CE_NOTE, "ip_squeue_extend: CPU/sqset = %d/%p already "
490 		    "has max number of squeues. System performance might "
491 		    "become suboptimal\n", sqs->sqs_bind, (void *)sqs);
492 
493 		/* the first squeue in the list is the default squeue */
494 		sqp = sqs->sqs_list[0];
495 		ASSERT(sqp != NULL);
496 		ill_rx_ring->rr_sqp = sqp;
497 		ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
498 
499 		mutex_exit(&ill->ill_lock);
500 		ill_waiter_dcr(ill);
501 		return;
502 	}
503 
504 	ASSERT(MUTEX_HELD(&sqp->sq_lock));
505 	sqp->sq_rx_ring = ill_rx_ring;
506 	ill_rx_ring->rr_sqp = sqp;
507 	ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
508 
509 	sqp->sq_state |= (SQS_ILL_BOUND|SQS_POLL_CAPAB);
510 	mutex_exit(&sqp->sq_lock);
511 
512 	mutex_exit(&ill->ill_lock);
513 
514 	/* ill_waiter_dcr will also signal any waiters on ill_ring_state */
515 	ill_waiter_dcr(ill);
516 }
517 
518 /*
519  * Assign the ill's soft rings to squeues. For each soft ring, find an
520  * unused squeue (preferably on a CPU close to the interrupted CPU) that
521  * is not already managing a receive ring; if no such squeue exists,
522  * dynamically create a new one in the squeue set.
523  *
524  * The function runs via the system taskq. The ill passed as an
525  * argument can't go away since we hold a ref. The lock order is
526  * ill_lock -> sqs_lock -> sq_lock.
527  *
528  * If we end up binding a ring to an squeue attached to an offline CPU,
529  * there is no need to check for that, because squeues are never
530  * destroyed once created.
531  */
531 /* ARGSUSED */
532 static void
533 ip_squeue_soft_ring_affinity(void *arg)
534 {
535 	ip_taskq_arg_t		*sq_arg = (ip_taskq_arg_t *)arg;
536 	ill_t			*ill = sq_arg->ip_taskq_ill;
537 	ill_dls_capab_t	*ill_soft_ring = ill->ill_dls_capab;
538 	ill_rx_ring_t		*ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
539 	cpu_t			*intr_cpu = sq_arg->ip_taskq_cpu;
540 	cpu_t			*bind_cpu;
541 	int			cpu_id = intr_cpu->cpu_id;
542 	int			min_cpu_id, max_cpu_id;
543 	boolean_t		enough_uniq_cpus = B_FALSE;
544 	boolean_t		enough_cpus = B_FALSE;
545 	squeue_set_t 		*sqs, *last_sqs;
546 	squeue_t 		*sqp = NULL;
547 	int			i, j;
548 
549 	ASSERT(ill != NULL);
550 	kmem_free(arg, sizeof (ip_taskq_arg_t));
551 
552 	/*
553 	 * Make sure the CPU that originally took the interrupt still
554 	 * exists.
555 	 */
556 	if (!CPU_ISON(intr_cpu)) {
557 		intr_cpu = CPU;
558 		cpu_id = intr_cpu->cpu_id;
559 	}
560 
561 	/*
562 	 * If this ill represents link aggregation, then there might be
563 	 * multiple NICs trying to register themselves at the same time,
564 	 * and in order to ensure that test and assignment of free rings
565 	 * is sequential, we need to hold the ill_lock.
566 	 */
567 	mutex_enter(&ill->ill_lock);
568 
569 	if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
570 		mutex_exit(&ill->ill_lock);
571 		return;
572 	}
573 	/*
574 	 * We need to fan out the interrupts from the NIC. We do that by
575 	 * telling the driver underneath to create soft rings and use
576 	 * worker threads (if the driver advertised the SOFT_RING capability).
577 	 * It is still a big performance win if we can fan out to the
578 	 * threads on the same core that is taking interrupts.
579 	 *
580 	 * Since we don't know the interrupt to CPU binding, we don't
581 	 * assign any squeues or affinity to worker threads in the NIC.
582 	 * At the time of the first interrupt, we know which CPU is
583 	 * taking interrupts and try to find other threads on the same
584 	 * core. Assuming ip_threads_per_cpu is correct and CPUs are
585 	 * numbered sequentially for each core (XXX need something better
586 	 * than this in the future), find the lowest numbered and highest
587 	 * numbered thread for that core.
588 	 *
589 	 * If we have one more thread per core than the number of soft rings,
590 	 * then don't assign any worker threads to the H/W thread (cpu)
591 	 * taking interrupts (capability negotiation tries to ensure this).
592 	 *
593 	 * If the number of threads per core is the same as the number of
594 	 * soft rings, then assign the worker affinity and squeue to
595 	 * the same cpu.
596 	 *
597 	 * Otherwise, just fan out to higher numbered CPUs starting from
598 	 * the interrupted CPU.
599 	 */
600 
601 	min_cpu_id = (cpu_id / ip_threads_per_cpu) * ip_threads_per_cpu;
602 	max_cpu_id = min_cpu_id + ip_threads_per_cpu;
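	/*
	 * For example (illustrative numbers only): with ip_threads_per_cpu
	 * equal to 4 and an interrupt taken on cpu_id 6, min_cpu_id is 4 and
	 * max_cpu_id is 8, i.e. the candidate fanout range is cpus 4..7.
	 */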
603 
604 	/*
605 	 * Quickly check if there are enough CPUs present for fanout and
606 	 * also that max_cpu_id does not exceed the id bound to the last
607 	 * squeue_set. We use the cpu_id stored in the last squeue_set to
608 	 * get an idea. The scheme is by no means perfect since it doesn't
609 	 * take into account CPU DR operations and the fact that
610 	 * interrupts themselves might change. An ideal scenario
611 	 * would be to ensure that interrupts run on CPUs by themselves
612 	 * and worker threads never have affinity to those CPUs. If
613 	 * the interrupts move to a CPU which had a worker thread, the
614 	 * affinity should be changed. Probably callbacks similar to CPU
615 	 * offline are needed to make it work perfectly.
616 	 */
617 	last_sqs = sqset_global_list[sqset_global_size - 1];
618 	if (ip_threads_per_cpu <= ncpus && max_cpu_id <= last_sqs->sqs_bind) {
619 		if ((max_cpu_id - min_cpu_id) >
620 		    ill_soft_ring->ill_dls_soft_ring_cnt)
621 			enough_uniq_cpus = B_TRUE;
622 		else if ((max_cpu_id - min_cpu_id) >=
623 		    ill_soft_ring->ill_dls_soft_ring_cnt)
624 			enough_cpus = B_TRUE;
625 	}
626 
627 	j = 0;
628 	for (i = 0; i < (ill_soft_ring->ill_dls_soft_ring_cnt + j); i++) {
629 		if (enough_uniq_cpus) {
630 			if ((min_cpu_id + i) == cpu_id) {
631 				j++;
632 				continue;
633 			}
634 			bind_cpu = cpu[min_cpu_id + i];
635 		} else if (enough_cpus) {
636 			bind_cpu = cpu[min_cpu_id + i];
637 		} else {
638 			/* bind_cpu = cpu[(cpu_id + i) % last_sqs->sqs_bind]; */
639 			bind_cpu = cpu[(cpu_id + i) % ncpus];
640 		}
641 
642 		/*
643 		 * Check if the CPU actually exists and is active. If not,
644 		 * use the interrupted CPU. ip_find_unused_squeue() will
645 		 * find the right CPU to fan out to anyway.
646 		 */
647 		if (!CPU_ISON(bind_cpu))
648 			bind_cpu = intr_cpu;
649 
650 		sqs = bind_cpu->cpu_squeue_set;
651 		ASSERT(sqs != NULL);
652 		ill_rx_ring = &ill_soft_ring->ill_ring_tbl[i - j];
653 
654 		sqp = ip_find_unused_squeue(sqs, B_TRUE);
655 		if (sqp == NULL) {
656 			/*
657 			 * We hit the max limit of squeues allowed per CPU.
658 			 * Assign this rx_ring to the DEFAULT squeue of the
659 			 * interrupted CPU, but the squeue will not manage
660 			 * the ring. Also print a warning.
661 			 */
662 			cmn_err(CE_NOTE, "ip_squeue_soft_ring: CPU/sqset = "
663 			    "%d/%p already has max number of squeues. System "
664 			    "performance might become suboptimal\n",
665 			    sqs->sqs_bind, (void *)sqs);
666 
667 			/* the first squeue in the list is the default squeue */
668 			sqp = intr_cpu->cpu_squeue_set->sqs_list[0];
669 			ASSERT(sqp != NULL);
670 
671 			ill_rx_ring->rr_sqp = sqp;
672 			ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
673 			continue;
674 
675 		}
676 		ASSERT(MUTEX_HELD(&sqp->sq_lock));
677 		ill_rx_ring->rr_sqp = sqp;
678 		sqp->sq_rx_ring = ill_rx_ring;
679 		ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
680 		sqp->sq_state |= SQS_ILL_BOUND;
681 
682 		/* assign affinity to soft ring */
683 		if (ip_squeue_bind && (sqp->sq_state & SQS_BOUND)) {
684 			ill_soft_ring->ill_dls_bind(ill_rx_ring->rr_handle,
685 			    sqp->sq_bind);
686 		}
687 		mutex_exit(&sqp->sq_lock);
688 	}
689 	mutex_exit(&ill->ill_lock);
690 
691 	ill_soft_ring->ill_dls_change_status(ill_soft_ring->ill_tx_handle,
692 	    SOFT_RING_FANOUT);
693 
694 	mutex_enter(&ill->ill_lock);
695 	ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
696 	mutex_exit(&ill->ill_lock);
697 
698 	/* ill_waiter_dcr will also signal any waiters on ill_ring_state */
699 	ill_waiter_dcr(ill);
700 }
701 
702 /* ARGSUSED */
703 void
704 ip_soft_ring_assignment(ill_t *ill, ill_rx_ring_t *ip_ring,
705     mblk_t *mp_chain, struct mac_header_info_s *mhip)
706 {
707 	ip_taskq_arg_t	*taskq_arg;
708 	boolean_t	refheld;
709 
710 	ASSERT(servicing_interrupt());
711 
712 	mutex_enter(&ill->ill_lock);
713 	if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
714 		taskq_arg = (ip_taskq_arg_t *)
715 		    kmem_zalloc(sizeof (ip_taskq_arg_t), KM_NOSLEEP);
716 
717 		if (taskq_arg == NULL)
718 			goto out;
719 
720 		taskq_arg->ip_taskq_ill = ill;
721 		taskq_arg->ip_taskq_ill_rx_ring = NULL;
722 		taskq_arg->ip_taskq_cpu = CPU;
723 
724 		/*
725 		 * Set the ILL_SOFT_RING_ASSIGN flag so that the next
726 		 * interrupt does not schedule another task to call
727 		 * ip_squeue_soft_ring_affinity().
728 		 */
729 		ill->ill_state_flags |= ILL_SOFT_RING_ASSIGN;
730 	} else {
731 		mutex_exit(&ill->ill_lock);
732 		goto out;
733 	}
734 	mutex_exit(&ill->ill_lock);
735 	refheld = ill_waiter_inc(ill);
736 	if (refheld) {
737 		if (taskq_dispatch(system_taskq,
738 		    ip_squeue_soft_ring_affinity, taskq_arg, TQ_NOSLEEP))
739 			goto out;
740 
741 		/* release ref on ill if taskq dispatch fails */
742 		ill_waiter_dcr(ill);
743 	}
744 	/*
745 	 * Clear ILL_SOFT_RING_ASSIGN so that affinity assignment
746 	 * can be tried again later.
747 	 */
748 	mutex_enter(&ill->ill_lock);
749 	ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
750 	mutex_exit(&ill->ill_lock);
751 	kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
752 
753 out:
754 	ip_input(ill, NULL, mp_chain, mhip);
755 }
756 
757 static squeue_t *
758 ip_find_unused_squeue(squeue_set_t *sqs, boolean_t fanout)
759 {
760 	int 		i;
761 	squeue_set_t	*best_sqs = NULL;
762 	squeue_set_t	*curr_sqs = NULL;
763 	int		min_sq = 0;
764 	squeue_t 	*sqp = NULL;
765 	char		sqname[64];
766 	cpu_t		*bind_cpu;
767 
768 	/*
769 	 * If fanout is set and the passed squeue_set already has some
770 	 * squeues which are managing the NICs, try to find squeues on an
771 	 * unused CPU.
772 	 */
773 	if (sqs->sqs_size > 1 && fanout) {
774 		/*
775 		 * First check to see if any squeue on the CPU passed
776 		 * is managing a NIC.
777 		 */
778 		for (i = 0; i < sqs->sqs_size; i++) {
779 			mutex_enter(&sqs->sqs_list[i]->sq_lock);
780 			if ((sqs->sqs_list[i]->sq_state & SQS_ILL_BOUND) &&
781 			    !(sqs->sqs_list[i]->sq_state & SQS_DEFAULT)) {
782 				mutex_exit(&sqs->sqs_list[i]->sq_lock);
783 				break;
784 			}
785 			mutex_exit(&sqs->sqs_list[i]->sq_lock);
786 		}
787 		if (i != sqs->sqs_size) {
788 			best_sqs = NULL;
789 
790 			for (i = sqset_global_size - 1; i >= 0; i--) {
791 				curr_sqs = sqset_global_list[i];
792 				/*
793 				 * Check and make sure the CPU that sqs
794 				 * is bound to is valid. There could be
795 				 * sqs's around whose CPUs could have
796 				 * been DR'd out. Also note that cpu_lock
797 				 * is not held here; that is ok since we
798 				 * take cpu_lock later when we access
799 				 * cpu_t members.
800 				 */
801 				if (cpu_get(curr_sqs->sqs_bind) != NULL) {
802 					if (best_sqs == NULL) {
803 						best_sqs = curr_sqs;
804 						min_sq = curr_sqs->sqs_size;
805 					} else if (curr_sqs->sqs_size <
806 					    min_sq) {
807 						best_sqs = curr_sqs;
808 						min_sq = curr_sqs->sqs_size;
809 					}
810 				}
811 			}
812 
813 			ASSERT(best_sqs != NULL);
814 			sqs = best_sqs;
815 		}
816 	}
817 
818 	mutex_enter(&sqs->sqs_lock);
819 
820 	for (i = 0; i < sqs->sqs_size; i++) {
821 		mutex_enter(&sqs->sqs_list[i]->sq_lock);
822 		if ((sqs->sqs_list[i]->sq_state &
823 		    (SQS_DEFAULT|SQS_ILL_BOUND)) == 0) {
824 			sqp = sqs->sqs_list[i];
825 			break;
826 		}
827 		mutex_exit(&sqs->sqs_list[i]->sq_lock);
828 	}
829 
830 	if (sqp == NULL) {
831 		/* Need to create a new squeue */
832 		if (sqs->sqs_size == sqs->sqs_max_size) {
833 			/*
834 			 * Reached the max limit of squeues
835 			 * we can allocate on this CPU.
836 			 */
837 			mutex_exit(&sqs->sqs_lock);
838 			return (NULL);
839 		}
840 
841 		mutex_enter(&cpu_lock);
842 		if ((bind_cpu = cpu_get(sqs->sqs_bind)) == NULL) {
843 			/* Too bad, CPU got DR'd out, return NULL */
844 			mutex_exit(&cpu_lock);
845 			mutex_exit(&sqs->sqs_lock);
846 			return (NULL);
847 		}
848 
849 		bzero(sqname, sizeof (sqname));
850 		(void) snprintf(sqname, sizeof (sqname),
851 		    "ip_squeue_cpu_%d/%d/%d", bind_cpu->cpu_seqid,
852 		    bind_cpu->cpu_id, sqs->sqs_size);
853 		mutex_exit(&cpu_lock);
854 
855 		sqp = squeue_create(sqname, sqs->sqs_bind,
856 		    ip_squeue_worker_wait, minclsyspri);
857 
858 		ASSERT(sqp != NULL);
859 
860 		squeue_profile_enable(sqp);
861 		sqs->sqs_list[sqs->sqs_size++] = sqp;
862 
863 		if (ip_squeue_create_callback != NULL)
864 			ip_squeue_create_callback(sqp);
865 
866 		if (ip_squeue_bind) {
867 			mutex_enter(&cpu_lock);
868 			bind_cpu = cpu_get(sqs->sqs_bind);
869 			if (bind_cpu != NULL && cpu_is_online(bind_cpu)) {
870 				squeue_bind(sqp, -1);
871 			}
872 			mutex_exit(&cpu_lock);
873 		}
874 		mutex_enter(&sqp->sq_lock);
875 	}
876 
877 	mutex_exit(&sqs->sqs_lock);
878 	ASSERT(sqp != NULL);
879 	return (sqp);
880 }
881 
882 /*
883  * Find the squeue assigned to manage this Rx ring. If the Rx ring is not
884  * owned by an squeue yet, do the assignment. When the NIC registers its
885  * Rx rings with IP, we don't know where the interrupts will land, and
886  * hence we need to wait until this point to do the assignment.
887  */
888 squeue_t *
889 ip_squeue_get(ill_rx_ring_t *ill_rx_ring)
890 {
891 	squeue_t 	*sqp;
892 	ill_t 		*ill;
893 	int		interrupt;
894 	ip_taskq_arg_t	*taskq_arg;
895 	boolean_t	refheld;
896 
897 	if (ill_rx_ring == NULL)
898 		return (IP_SQUEUE_GET(lbolt));
899 
900 	sqp = ill_rx_ring->rr_sqp;
901 	/*
902 	 * Do a quick check. If it's not NULL, we are done.
903 	 * Squeues are never destroyed, so at worst we will bind
904 	 * this connection to a suboptimal squeue.
905 	 *
906 	 * This is the fast path case.
907 	 */
908 	if (sqp != NULL)
909 		return (sqp);
910 
911 	ill = ill_rx_ring->rr_ill;
912 	ASSERT(ill != NULL);
913 
914 	interrupt = servicing_interrupt();
915 	taskq_arg = (ip_taskq_arg_t *)kmem_zalloc(sizeof (ip_taskq_arg_t),
916 	    KM_NOSLEEP);
917 
918 	mutex_enter(&ill->ill_lock);
919 	/*
920 	 * Check sqp under the lock again for atomicity. Possible race with
921 	 * a previously scheduled ip_squeue_get -> ip_squeue_extend.
922 	 * Do the ring to squeue binding only if we are in interrupt context
923 	 * AND the ring is not already bound AND there is no one else trying
924 	 * the bind already.
925 	 */
926 	sqp = ill_rx_ring->rr_sqp;
927 	if (sqp != NULL || !interrupt ||
928 	    ill_rx_ring->rr_ring_state != ILL_RING_INUSE || taskq_arg == NULL) {
929 		/*
930 		 * Note that the ring might get bound once we drop the lock
931 		 * below, if a previous request is in progress i.e. if the ring
932 		 * state is ILL_RING_INPROC. The incoming connection on whose
933 		 * behalf we are currently here might get a suboptimal squeue
934 		 * via the call to IP_SQUEUE_GET below, but there is no
935 		 * correctness issue.
936 		 */
937 		mutex_exit(&ill->ill_lock);
938 		if (taskq_arg != NULL)
939 			kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
940 		if (sqp != NULL)
941 			return (sqp);
942 		return (IP_SQUEUE_GET(lbolt));
943 	}
944 
945 	/*
946 	 * No sqp assigned yet. Can't really do that in interrupt
947 	 * context. Assign the default sqp to this connection and
948 	 * trigger creation of new sqp and binding it to this ring
949 	 * via taskq. Need to make sure ill stays around.
950 	 */
951 	taskq_arg->ip_taskq_ill = ill;
952 	taskq_arg->ip_taskq_ill_rx_ring = ill_rx_ring;
953 	taskq_arg->ip_taskq_cpu = CPU;
954 	ill_rx_ring->rr_ring_state = ILL_RING_INPROC;
955 	mutex_exit(&ill->ill_lock);
956 	refheld = ill_waiter_inc(ill);
957 	if (refheld) {
958 		if (taskq_dispatch(system_taskq, ip_squeue_extend,
959 		    taskq_arg, TQ_NOSLEEP) != NULL) {
960 			return (IP_SQUEUE_GET(lbolt));
961 		}
962 	}
963 	/*
964 	 * The ill is closing and we could not get a reference on the ill OR
965 	 * taskq_dispatch failed probably due to memory allocation failure.
966 	 * We will try again next time.
967 	 */
968 	mutex_enter(&ill->ill_lock);
969 	ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
970 	mutex_exit(&ill->ill_lock);
971 	kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
972 	if (refheld)
973 		ill_waiter_dcr(ill);
974 
975 	return (IP_SQUEUE_GET(lbolt));
976 }
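
/*
 * A hypothetical caller, for illustration only (conn_recv below is a
 * stand-in for whatever per-connection handler the real caller uses):
 *
 *	sqp = ip_squeue_get(ill_rx_ring);
 *	CONN_INC_REF(connp);
 *	squeue_enter(sqp, mp, conn_recv, connp, NULL);
 *
 * The fast path simply returns the squeue already bound to the ring; the
 * slow path hands the binding work to the system taskq and falls back to
 * IP_SQUEUE_GET(lbolt) in the meantime.
 */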
977 
978 /*
979  * NDD hooks for setting ip_squeue_xxx tuneables.
980  * NDD hooks for setting ip_squeue_xxx tunables.
981 
982 /* ARGSUSED */
983 int
984 ip_squeue_bind_set(queue_t *q, mblk_t *mp, char *value,
985     caddr_t addr, cred_t *cr)
986 {
987 	int *bind_enabled = (int *)addr;
988 	long new_value;
989 	int i;
990 
991 	if (ddi_strtol(value, NULL, 10, &new_value) != 0)
992 		return (EINVAL);
993 
994 	if (ip_squeue_bind == new_value)
995 		return (0);
996 
997 	*bind_enabled = new_value;
998 	mutex_enter(&cpu_lock);
999 	if (new_value == 0) {
1000 		for (i = 0; i < sqset_global_size; i++)
1001 			ip_squeue_set_unbind(sqset_global_list[i]);
1002 	} else {
1003 		for (i = 0; i < sqset_global_size; i++)
1004 			ip_squeue_set_bind(sqset_global_list[i]);
1005 	}
1006 
1007 	mutex_exit(&cpu_lock);
1008 	return (0);
1009 }
1010 
1011 /*
1012  * Set squeue profiling.
1013  * 0 means "disable"
1014  * 1 means "enable"
1015  * 2 means "enable and reset"
1016  */
1017 /* ARGSUSED */
1018 int
1019 ip_squeue_profile_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
1020     cred_t *cr)
1021 {
1022 	int *profile_enabled = (int *)cp;
1023 	long new_value;
1024 	squeue_set_t *sqs;
1025 
1026 	if (ddi_strtol(value, NULL, 10, &new_value) != 0)
1027 		return (EINVAL);
1028 
1029 	if (new_value == 0)
1030 		squeue_profile_stop();
1031 	else if (new_value == 1)
1032 		squeue_profile_start();
1033 	else if (new_value == 2) {
1034 		int i, j;
1035 
1036 		squeue_profile_stop();
1037 		mutex_enter(&cpu_lock);
1038 		for (i = 0; i < sqset_global_size; i++) {
1039 			sqs = sqset_global_list[i];
1040 			for (j = 0; j < sqs->sqs_size; j++) {
1041 				squeue_profile_reset(sqs->sqs_list[j]);
1042 			}
1043 		}
1044 		mutex_exit(&cpu_lock);
1045 
1046 		new_value = 1;
1047 		squeue_profile_start();
1048 	}
1049 	*profile_enabled = new_value;
1050 
1051 	return (0);
1052 }
1053 
1054 /*
1055  * Reconfiguration callback
1056  */
1057 
1058 /* ARGSUSED */
1059 static int
1060 ip_squeue_cpu_setup(cpu_setup_t what, int id, void *arg)
1061 {
1062 	cpu_t *cp = cpu[id];
1063 
1064 	ASSERT(MUTEX_HELD(&cpu_lock));
1065 	switch (what) {
1066 	case CPU_CONFIG:
1067 		/*
1068 		 * A new CPU is added. Create an squeue for it but do not bind
1069 		 * it yet.
1070 		 */
1071 		if (cp->cpu_squeue_set == NULL)
1072 			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
1073 		break;
1074 	case CPU_ON:
1075 	case CPU_INIT:
1076 	case CPU_CPUPART_IN:
1077 		if (cp->cpu_squeue_set == NULL) {
1078 			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
1079 		}
1080 		if (ip_squeue_bind)
1081 			ip_squeue_set_bind(cp->cpu_squeue_set);
1082 		break;
1083 	case CPU_UNCONFIG:
1084 	case CPU_OFF:
1085 	case CPU_CPUPART_OUT:
1086 		ASSERT((cp->cpu_squeue_set != NULL) ||
1087 		    (cp->cpu_flags & CPU_OFFLINE));
1088 
1089 		if (cp->cpu_squeue_set != NULL) {
1090 			ip_squeue_set_unbind(cp->cpu_squeue_set);
1091 		}
1092 		break;
1093 	default:
1094 		break;
1095 	}
1096 	return (0);
1097 }
1098 
1099 /* ARGSUSED */
1100 static void
1101 ip_squeue_set_bind(squeue_set_t *sqs)
1102 {
1103 	int i;
1104 	squeue_t *sqp;
1105 
1106 	if (!ip_squeue_bind)
1107 		return;
1108 
1109 	mutex_enter(&sqs->sqs_lock);
1110 	for (i = 0; i < sqs->sqs_size; i++) {
1111 		sqp = sqs->sqs_list[i];
1112 		if (sqp->sq_state & SQS_BOUND)
1113 			continue;
1114 		squeue_bind(sqp, -1);
1115 	}
1116 	mutex_exit(&sqs->sqs_lock);
1117 }
1118 
1119 static void
1120 ip_squeue_set_unbind(squeue_set_t *sqs)
1121 {
1122 	int i;
1123 	squeue_t *sqp;
1124 
1125 	mutex_enter(&sqs->sqs_lock);
1126 	for (i = 0; i < sqs->sqs_size; i++) {
1127 		sqp = sqs->sqs_list[i];
1128 
1129 		/*
1130 		 * CPU is going offline. Remove the thread affinity
1131 		 * for any soft ring threads the squeue is managing.
1132 		 */
1133 		if (sqp->sq_state & SQS_ILL_BOUND) {
1134 			ill_rx_ring_t	*ring = sqp->sq_rx_ring;
1135 			ill_t		*ill = ring->rr_ill;
1136 
1137 			if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
1138 				ASSERT(ring->rr_handle != NULL);
1139 				ill->ill_dls_capab->ill_dls_unbind(
1140 				    ring->rr_handle);
1141 			}
1142 		}
1143 		if (!(sqp->sq_state & SQS_BOUND))
1144 			continue;
1145 		squeue_unbind(sqp);
1146 	}
1147 	mutex_exit(&sqs->sqs_lock);
1148 }
1149