xref: /freebsd/sys/kern/subr_gtaskqueue.c (revision 094fc1ed0f2627525c7b0342efcbad5be7a8546a)
1 /*-
2  * Copyright (c) 2000 Doug Rabson
3  * Copyright (c) 2014 Jeff Roberson
4  * Copyright (c) 2016 Matthew Macy
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/bus.h>
35 #include <sys/cpuset.h>
36 #include <sys/interrupt.h>
37 #include <sys/kernel.h>
38 #include <sys/kthread.h>
39 #include <sys/libkern.h>
40 #include <sys/limits.h>
41 #include <sys/lock.h>
42 #include <sys/malloc.h>
43 #include <sys/mutex.h>
44 #include <sys/proc.h>
45 #include <sys/sched.h>
46 #include <sys/smp.h>
47 #include <sys/gtaskqueue.h>
48 #include <sys/unistd.h>
49 #include <machine/stdarg.h>
50 
51 static MALLOC_DEFINE(M_GTASKQUEUE, "gtaskqueue", "Group Task Queues");
52 static void	gtaskqueue_thread_enqueue(void *);
53 static void	gtaskqueue_thread_loop(void *arg);
54 
55 TASKQGROUP_DEFINE(softirq, mp_ncpus, 1);
56 
57 struct gtaskqueue_busy {
58 	struct gtask	*tb_running;
59 	TAILQ_ENTRY(gtaskqueue_busy) tb_link;
60 };
61 
/* Sentinel tb_running value marking a busy entry that is a drain waiter. */
static struct gtask * const TB_DRAIN_WAITER = (struct gtask *)0x1;
63 
64 struct gtaskqueue {
65 	STAILQ_HEAD(, gtask)	tq_queue;
66 	gtaskqueue_enqueue_fn	tq_enqueue;
67 	void			*tq_context;
68 	char			*tq_name;
69 	TAILQ_HEAD(, gtaskqueue_busy) tq_active;
70 	struct mtx		tq_mutex;
71 	struct thread		**tq_threads;
72 	int			tq_tcount;
73 	int			tq_spin;
74 	int			tq_flags;
75 	int			tq_callouts;
76 	taskqueue_callback_fn	tq_callbacks[TASKQUEUE_NUM_CALLBACKS];
77 	void			*tq_cb_contexts[TASKQUEUE_NUM_CALLBACKS];
78 };
79 
/* Bits stored in gtaskqueue.tq_flags. */
#define	TQ_FLAGS_ACTIVE		0x01
#define	TQ_FLAGS_BLOCKED	0x02
#define	TQ_FLAGS_UNLOCKED_ENQUEUE	0x04

#define	DT_CALLOUT_ARMED	0x01

/*
 * Acquire the queue mutex, using the spin or the default mutex primitive
 * depending on how the queue was created (tq_spin).
 */
#define	TQ_LOCK(tq)							\
	do {								\
		if (!(tq)->tq_spin)					\
			mtx_lock(&(tq)->tq_mutex);			\
		else							\
			mtx_lock_spin(&(tq)->tq_mutex);			\
	} while (0)
#define	TQ_ASSERT_LOCKED(tq)	mtx_assert(&(tq)->tq_mutex, MA_OWNED)

/* Release the queue mutex with the primitive matching TQ_LOCK(). */
#define	TQ_UNLOCK(tq)							\
	do {								\
		if (!(tq)->tq_spin)					\
			mtx_unlock(&(tq)->tq_mutex);			\
		else							\
			mtx_unlock_spin(&(tq)->tq_mutex);		\
	} while (0)
#define	TQ_ASSERT_UNLOCKED(tq)	mtx_assert(&(tq)->tq_mutex, MA_NOTOWNED)
103 
#ifdef INVARIANTS
/*
 * Debugging aid (INVARIANTS kernels only): print the address of a gtask
 * together with its flags, priority, handler and handler argument.
 */
static void
gtask_dump(struct gtask *gtask)
{

	printf("gtask: %p ta_flags=%x ta_priority=%d ta_func=%p ta_context=%p\n",
	    gtask,
	    gtask->ta_flags,
	    gtask->ta_priority,
	    gtask->ta_func,
	    gtask->ta_context);
}
#endif
112 
113 static __inline int
114 TQ_SLEEP(struct gtaskqueue *tq, void *p, struct mtx *m, int pri, const char *wm,
115     int t)
116 {
117 	if (tq->tq_spin)
118 		return (msleep_spin(p, m, wm, t));
119 	return (msleep(p, m, pri, wm, t));
120 }
121 
122 static struct gtaskqueue *
123 _gtaskqueue_create(const char *name, int mflags,
124 		 taskqueue_enqueue_fn enqueue, void *context,
125 		 int mtxflags, const char *mtxname __unused)
126 {
127 	struct gtaskqueue *queue;
128 	char *tq_name;
129 
130 	tq_name = malloc(TASKQUEUE_NAMELEN, M_GTASKQUEUE, mflags | M_ZERO);
131 	if (!tq_name)
132 		return (NULL);
133 
134 	snprintf(tq_name, TASKQUEUE_NAMELEN, "%s", (name) ? name : "taskqueue");
135 
136 	queue = malloc(sizeof(struct gtaskqueue), M_GTASKQUEUE, mflags | M_ZERO);
137 	if (!queue)
138 		return (NULL);
139 
140 	STAILQ_INIT(&queue->tq_queue);
141 	TAILQ_INIT(&queue->tq_active);
142 	queue->tq_enqueue = enqueue;
143 	queue->tq_context = context;
144 	queue->tq_name = tq_name;
145 	queue->tq_spin = (mtxflags & MTX_SPIN) != 0;
146 	queue->tq_flags |= TQ_FLAGS_ACTIVE;
147 	if (enqueue == gtaskqueue_thread_enqueue)
148 		queue->tq_flags |= TQ_FLAGS_UNLOCKED_ENQUEUE;
149 	mtx_init(&queue->tq_mutex, tq_name, NULL, mtxflags);
150 
151 	return (queue);
152 }
153 
154 
155 /*
156  * Signal a taskqueue thread to terminate.
157  */
158 static void
159 gtaskqueue_terminate(struct thread **pp, struct gtaskqueue *tq)
160 {
161 
162 	while (tq->tq_tcount > 0 || tq->tq_callouts > 0) {
163 		wakeup(tq);
164 		TQ_SLEEP(tq, pp, &tq->tq_mutex, PWAIT, "taskqueue_destroy", 0);
165 	}
166 }
167 
168 static void
169 gtaskqueue_free(struct gtaskqueue *queue)
170 {
171 
172 	TQ_LOCK(queue);
173 	queue->tq_flags &= ~TQ_FLAGS_ACTIVE;
174 	gtaskqueue_terminate(queue->tq_threads, queue);
175 	KASSERT(TAILQ_EMPTY(&queue->tq_active), ("Tasks still running?"));
176 	KASSERT(queue->tq_callouts == 0, ("Armed timeout tasks"));
177 	mtx_destroy(&queue->tq_mutex);
178 	free(queue->tq_threads, M_GTASKQUEUE);
179 	free(queue->tq_name, M_GTASKQUEUE);
180 	free(queue, M_GTASKQUEUE);
181 }
182 
183 int
184 grouptaskqueue_enqueue(struct gtaskqueue *queue, struct gtask *gtask)
185 {
186 #ifdef INVARIANTS
187 	if (queue == NULL) {
188 		gtask_dump(gtask);
189 		panic("queue == NULL");
190 	}
191 #endif
192 	TQ_LOCK(queue);
193 	if (gtask->ta_flags & TASK_ENQUEUED) {
194 		TQ_UNLOCK(queue);
195 		return (0);
196 	}
197 	STAILQ_INSERT_TAIL(&queue->tq_queue, gtask, ta_link);
198 	gtask->ta_flags |= TASK_ENQUEUED;
199 	TQ_UNLOCK(queue);
200 	if ((queue->tq_flags & TQ_FLAGS_BLOCKED) == 0)
201 		queue->tq_enqueue(queue->tq_context);
202 	return (0);
203 }
204 
/*
 * Task handler that does nothing; used as the body of the drain barrier.
 */
static void
gtaskqueue_task_nop_fn(void *context)
{

	(void)context;
}
209 
210 /*
211  * Block until all currently queued tasks in this taskqueue
212  * have begun execution.  Tasks queued during execution of
213  * this function are ignored.
214  */
215 static void
216 gtaskqueue_drain_tq_queue(struct gtaskqueue *queue)
217 {
218 	struct gtask t_barrier;
219 
220 	if (STAILQ_EMPTY(&queue->tq_queue))
221 		return;
222 
223 	/*
224 	 * Enqueue our barrier after all current tasks, but with
225 	 * the highest priority so that newly queued tasks cannot
226 	 * pass it.  Because of the high priority, we can not use
227 	 * taskqueue_enqueue_locked directly (which drops the lock
228 	 * anyway) so just insert it at tail while we have the
229 	 * queue lock.
230 	 */
231 	GTASK_INIT(&t_barrier, 0, USHRT_MAX, gtaskqueue_task_nop_fn, &t_barrier);
232 	STAILQ_INSERT_TAIL(&queue->tq_queue, &t_barrier, ta_link);
233 	t_barrier.ta_flags |= TASK_ENQUEUED;
234 
235 	/*
236 	 * Once the barrier has executed, all previously queued tasks
237 	 * have completed or are currently executing.
238 	 */
239 	while (t_barrier.ta_flags & TASK_ENQUEUED)
240 		TQ_SLEEP(queue, &t_barrier, &queue->tq_mutex, PWAIT, "-", 0);
241 }
242 
243 /*
244  * Block until all currently executing tasks for this taskqueue
245  * complete.  Tasks that begin execution during the execution
246  * of this function are ignored.
247  */
248 static void
249 gtaskqueue_drain_tq_active(struct gtaskqueue *queue)
250 {
251 	struct gtaskqueue_busy tb_marker, *tb_first;
252 
253 	if (TAILQ_EMPTY(&queue->tq_active))
254 		return;
255 
256 	/* Block taskq_terminate().*/
257 	queue->tq_callouts++;
258 
259 	/*
260 	 * Wait for all currently executing taskqueue threads
261 	 * to go idle.
262 	 */
263 	tb_marker.tb_running = TB_DRAIN_WAITER;
264 	TAILQ_INSERT_TAIL(&queue->tq_active, &tb_marker, tb_link);
265 	while (TAILQ_FIRST(&queue->tq_active) != &tb_marker)
266 		TQ_SLEEP(queue, &tb_marker, &queue->tq_mutex, PWAIT, "-", 0);
267 	TAILQ_REMOVE(&queue->tq_active, &tb_marker, tb_link);
268 
269 	/*
270 	 * Wakeup any other drain waiter that happened to queue up
271 	 * without any intervening active thread.
272 	 */
273 	tb_first = TAILQ_FIRST(&queue->tq_active);
274 	if (tb_first != NULL && tb_first->tb_running == TB_DRAIN_WAITER)
275 		wakeup(tb_first);
276 
277 	/* Release taskqueue_terminate(). */
278 	queue->tq_callouts--;
279 	if ((queue->tq_flags & TQ_FLAGS_ACTIVE) == 0)
280 		wakeup_one(queue->tq_threads);
281 }
282 
283 void
284 gtaskqueue_block(struct gtaskqueue *queue)
285 {
286 
287 	TQ_LOCK(queue);
288 	queue->tq_flags |= TQ_FLAGS_BLOCKED;
289 	TQ_UNLOCK(queue);
290 }
291 
292 void
293 gtaskqueue_unblock(struct gtaskqueue *queue)
294 {
295 
296 	TQ_LOCK(queue);
297 	queue->tq_flags &= ~TQ_FLAGS_BLOCKED;
298 	if (!STAILQ_EMPTY(&queue->tq_queue))
299 		queue->tq_enqueue(queue->tq_context);
300 	TQ_UNLOCK(queue);
301 }
302 
303 static void
304 gtaskqueue_run_locked(struct gtaskqueue *queue)
305 {
306 	struct gtaskqueue_busy tb;
307 	struct gtaskqueue_busy *tb_first;
308 	struct gtask *gtask;
309 
310 	KASSERT(queue != NULL, ("tq is NULL"));
311 	TQ_ASSERT_LOCKED(queue);
312 	tb.tb_running = NULL;
313 
314 	while (STAILQ_FIRST(&queue->tq_queue)) {
315 		TAILQ_INSERT_TAIL(&queue->tq_active, &tb, tb_link);
316 
317 		/*
318 		 * Carefully remove the first task from the queue and
319 		 * clear its TASK_ENQUEUED flag
320 		 */
321 		gtask = STAILQ_FIRST(&queue->tq_queue);
322 		KASSERT(gtask != NULL, ("task is NULL"));
323 		STAILQ_REMOVE_HEAD(&queue->tq_queue, ta_link);
324 		gtask->ta_flags &= ~TASK_ENQUEUED;
325 		tb.tb_running = gtask;
326 		TQ_UNLOCK(queue);
327 
328 		KASSERT(gtask->ta_func != NULL, ("task->ta_func is NULL"));
329 		gtask->ta_func(gtask->ta_context);
330 
331 		TQ_LOCK(queue);
332 		tb.tb_running = NULL;
333 		wakeup(gtask);
334 
335 		TAILQ_REMOVE(&queue->tq_active, &tb, tb_link);
336 		tb_first = TAILQ_FIRST(&queue->tq_active);
337 		if (tb_first != NULL &&
338 		    tb_first->tb_running == TB_DRAIN_WAITER)
339 			wakeup(tb_first);
340 	}
341 }
342 
343 static int
344 task_is_running(struct gtaskqueue *queue, struct gtask *gtask)
345 {
346 	struct gtaskqueue_busy *tb;
347 
348 	TQ_ASSERT_LOCKED(queue);
349 	TAILQ_FOREACH(tb, &queue->tq_active, tb_link) {
350 		if (tb->tb_running == gtask)
351 			return (1);
352 	}
353 	return (0);
354 }
355 
356 static int
357 gtaskqueue_cancel_locked(struct gtaskqueue *queue, struct gtask *gtask)
358 {
359 
360 	if (gtask->ta_flags & TASK_ENQUEUED)
361 		STAILQ_REMOVE(&queue->tq_queue, gtask, gtask, ta_link);
362 	gtask->ta_flags &= ~TASK_ENQUEUED;
363 	return (task_is_running(queue, gtask) ? EBUSY : 0);
364 }
365 
/*
 * Locked wrapper around gtaskqueue_cancel_locked(): returns EBUSY if the
 * task is currently running, 0 otherwise.
 */
int
gtaskqueue_cancel(struct gtaskqueue *queue, struct gtask *gtask)
{
	int rv;

	TQ_LOCK(queue);
	rv = gtaskqueue_cancel_locked(queue, gtask);
	TQ_UNLOCK(queue);
	return (rv);
}
377 
378 void
379 gtaskqueue_drain(struct gtaskqueue *queue, struct gtask *gtask)
380 {
381 
382 	if (!queue->tq_spin)
383 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, __func__);
384 
385 	TQ_LOCK(queue);
386 	while ((gtask->ta_flags & TASK_ENQUEUED) || task_is_running(queue, gtask))
387 		TQ_SLEEP(queue, gtask, &queue->tq_mutex, PWAIT, "-", 0);
388 	TQ_UNLOCK(queue);
389 }
390 
391 void
392 gtaskqueue_drain_all(struct gtaskqueue *queue)
393 {
394 
395 	if (!queue->tq_spin)
396 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, __func__);
397 
398 	TQ_LOCK(queue);
399 	gtaskqueue_drain_tq_queue(queue);
400 	gtaskqueue_drain_tq_active(queue);
401 	TQ_UNLOCK(queue);
402 }
403 
404 static int
405 _gtaskqueue_start_threads(struct gtaskqueue **tqp, int count, int pri,
406     cpuset_t *mask, const char *name, va_list ap)
407 {
408 	char ktname[MAXCOMLEN + 1];
409 	struct thread *td;
410 	struct gtaskqueue *tq;
411 	int i, error;
412 
413 	if (count <= 0)
414 		return (EINVAL);
415 
416 	vsnprintf(ktname, sizeof(ktname), name, ap);
417 	tq = *tqp;
418 
419 	tq->tq_threads = malloc(sizeof(struct thread *) * count, M_GTASKQUEUE,
420 	    M_NOWAIT | M_ZERO);
421 	if (tq->tq_threads == NULL) {
422 		printf("%s: no memory for %s threads\n", __func__, ktname);
423 		return (ENOMEM);
424 	}
425 
426 	for (i = 0; i < count; i++) {
427 		if (count == 1)
428 			error = kthread_add(gtaskqueue_thread_loop, tqp, NULL,
429 			    &tq->tq_threads[i], RFSTOPPED, 0, "%s", ktname);
430 		else
431 			error = kthread_add(gtaskqueue_thread_loop, tqp, NULL,
432 			    &tq->tq_threads[i], RFSTOPPED, 0,
433 			    "%s_%d", ktname, i);
434 		if (error) {
435 			/* should be ok to continue, taskqueue_free will dtrt */
436 			printf("%s: kthread_add(%s): error %d", __func__,
437 			    ktname, error);
438 			tq->tq_threads[i] = NULL;		/* paranoid */
439 		} else
440 			tq->tq_tcount++;
441 	}
442 	for (i = 0; i < count; i++) {
443 		if (tq->tq_threads[i] == NULL)
444 			continue;
445 		td = tq->tq_threads[i];
446 		if (mask) {
447 			error = cpuset_setthread(td->td_tid, mask);
448 			/*
449 			 * Failing to pin is rarely an actual fatal error;
450 			 * it'll just affect performance.
451 			 */
452 			if (error)
453 				printf("%s: curthread=%llu: can't pin; "
454 				    "error=%d\n",
455 				    __func__,
456 				    (unsigned long long) td->td_tid,
457 				    error);
458 		}
459 		thread_lock(td);
460 		sched_prio(td, pri);
461 		sched_add(td, SRQ_BORING);
462 		thread_unlock(td);
463 	}
464 
465 	return (0);
466 }
467 
468 static int
469 gtaskqueue_start_threads(struct gtaskqueue **tqp, int count, int pri,
470     const char *name, ...)
471 {
472 	va_list ap;
473 	int error;
474 
475 	va_start(ap, name);
476 	error = _gtaskqueue_start_threads(tqp, count, pri, NULL, name, ap);
477 	va_end(ap);
478 	return (error);
479 }
480 
481 static inline void
482 gtaskqueue_run_callback(struct gtaskqueue *tq,
483     enum taskqueue_callback_type cb_type)
484 {
485 	taskqueue_callback_fn tq_callback;
486 
487 	TQ_ASSERT_UNLOCKED(tq);
488 	tq_callback = tq->tq_callbacks[cb_type];
489 	if (tq_callback != NULL)
490 		tq_callback(tq->tq_cb_contexts[cb_type]);
491 }
492 
493 static void
494 gtaskqueue_thread_loop(void *arg)
495 {
496 	struct gtaskqueue **tqp, *tq;
497 
498 	tqp = arg;
499 	tq = *tqp;
500 	gtaskqueue_run_callback(tq, TASKQUEUE_CALLBACK_TYPE_INIT);
501 	TQ_LOCK(tq);
502 	while ((tq->tq_flags & TQ_FLAGS_ACTIVE) != 0) {
503 		/* XXX ? */
504 		gtaskqueue_run_locked(tq);
505 		/*
506 		 * Because taskqueue_run() can drop tq_mutex, we need to
507 		 * check if the TQ_FLAGS_ACTIVE flag wasn't removed in the
508 		 * meantime, which means we missed a wakeup.
509 		 */
510 		if ((tq->tq_flags & TQ_FLAGS_ACTIVE) == 0)
511 			break;
512 		TQ_SLEEP(tq, tq, &tq->tq_mutex, 0, "-", 0);
513 	}
514 	gtaskqueue_run_locked(tq);
515 	/*
516 	 * This thread is on its way out, so just drop the lock temporarily
517 	 * in order to call the shutdown callback.  This allows the callback
518 	 * to look at the taskqueue, even just before it dies.
519 	 */
520 	TQ_UNLOCK(tq);
521 	gtaskqueue_run_callback(tq, TASKQUEUE_CALLBACK_TYPE_SHUTDOWN);
522 	TQ_LOCK(tq);
523 
524 	/* rendezvous with thread that asked us to terminate */
525 	tq->tq_tcount--;
526 	wakeup_one(tq->tq_threads);
527 	TQ_UNLOCK(tq);
528 	kthread_exit();
529 }
530 
/*
 * Enqueue hook for thread-backed queues: 'context' points at the queue
 * pointer; wake one thread sleeping on the queue.
 */
static void
gtaskqueue_thread_enqueue(void *context)
{
	struct gtaskqueue **qref = context;

	wakeup_one(*qref);
}
540 
541 
542 static struct gtaskqueue *
543 gtaskqueue_create_fast(const char *name, int mflags,
544 		 taskqueue_enqueue_fn enqueue, void *context)
545 {
546 	return _gtaskqueue_create(name, mflags, enqueue, context,
547 			MTX_SPIN, "fast_taskqueue");
548 }
549 
550 
551 struct taskqgroup_cpu {
552 	LIST_HEAD(, grouptask)	tgc_tasks;
553 	struct gtaskqueue	*tgc_taskq;
554 	int	tgc_cnt;
555 	int	tgc_cpu;
556 };
557 
558 struct taskqgroup {
559 	struct taskqgroup_cpu tqg_queue[MAXCPU];
560 	struct mtx	tqg_lock;
561 	char *		tqg_name;
562 	int		tqg_adjusting;
563 	int		tqg_stride;
564 	int		tqg_cnt;
565 };
566 
567 struct taskq_bind_task {
568 	struct gtask bt_task;
569 	int	bt_cpuid;
570 };
571 
572 static void
573 taskqgroup_cpu_create(struct taskqgroup *qgroup, int idx, int cpu)
574 {
575 	struct taskqgroup_cpu *qcpu;
576 
577 	qcpu = &qgroup->tqg_queue[idx];
578 	LIST_INIT(&qcpu->tgc_tasks);
579 	qcpu->tgc_taskq = gtaskqueue_create_fast(NULL, M_WAITOK,
580 	    taskqueue_thread_enqueue, &qcpu->tgc_taskq);
581 	gtaskqueue_start_threads(&qcpu->tgc_taskq, 1, PI_SOFT,
582 	    "%s_%d", qgroup->tqg_name, idx);
583 	qcpu->tgc_cpu = cpu;
584 }
585 
586 static void
587 taskqgroup_cpu_remove(struct taskqgroup *qgroup, int idx)
588 {
589 
590 	gtaskqueue_free(qgroup->tqg_queue[idx].tgc_taskq);
591 }
592 
593 /*
594  * Find the taskq with least # of tasks that doesn't currently have any
595  * other queues from the uniq identifier.
596  */
597 static int
598 taskqgroup_find(struct taskqgroup *qgroup, void *uniq)
599 {
600 	struct grouptask *n;
601 	int i, idx, mincnt;
602 	int strict;
603 
604 	mtx_assert(&qgroup->tqg_lock, MA_OWNED);
605 	if (qgroup->tqg_cnt == 0)
606 		return (0);
607 	idx = -1;
608 	mincnt = INT_MAX;
609 	/*
610 	 * Two passes;  First scan for a queue with the least tasks that
611 	 * does not already service this uniq id.  If that fails simply find
612 	 * the queue with the least total tasks;
613 	 */
614 	for (strict = 1; mincnt == INT_MAX; strict = 0) {
615 		for (i = 0; i < qgroup->tqg_cnt; i++) {
616 			if (qgroup->tqg_queue[i].tgc_cnt > mincnt)
617 				continue;
618 			if (strict) {
619 				LIST_FOREACH(n,
620 				    &qgroup->tqg_queue[i].tgc_tasks, gt_list)
621 					if (n->gt_uniq == uniq)
622 						break;
623 				if (n != NULL)
624 					continue;
625 			}
626 			mincnt = qgroup->tqg_queue[i].tgc_cnt;
627 			idx = i;
628 		}
629 	}
630 	if (idx == -1)
631 		panic("taskqgroup_find: Failed to pick a qid.");
632 
633 	return (idx);
634 }
635 
/*
 * smp_started is unusable since it is not set for UP kernels or even for
 * SMP kernels when there is 1 CPU.  This is usually handled by adding a
 * (mp_ncpus == 1) test, but that would be broken here since we need to
 * synchronize with the SI_SUB_SMP ordering.  Even in the pure SMP case
 * smp_started only gives a fuzzy ordering relative to SI_SUB_SMP.
 *
 * So maintain our own flag.  It must be set after all CPUs are started
 * and before SI_SUB_SMP:SI_ORDER_ANY so that the SYSINIT for delayed
 * adjustment is properly delayed.  SI_ORDER_FOURTH is clearly before
 * SI_ORDER_ANY and unclearly after the CPUs are started.  It would be
 * simpler for adjustment to pass a flag indicating if it is delayed.
 */
649 
/* Set to 1 by the SYSINIT below; tested by the attach and adjust code. */
static int tqg_smp_started;

/*
 * SYSINIT handler that records that the SI_SUB_SMP/SI_ORDER_FOURTH point
 * of boot has been reached.
 */
static void
tqg_record_smp_started(void *arg)
{

	(void)arg;
	tqg_smp_started = 1;
}

SYSINIT(tqg_record_smp_started, SI_SUB_SMP, SI_ORDER_FOURTH,
    tqg_record_smp_started, NULL);
660 
661 void
662 taskqgroup_attach(struct taskqgroup *qgroup, struct grouptask *gtask,
663     void *uniq, int irq, char *name)
664 {
665 	cpuset_t mask;
666 	int qid, error;
667 
668 	gtask->gt_uniq = uniq;
669 	snprintf(gtask->gt_name, GROUPTASK_NAMELEN, "%s", name ? name : "grouptask");
670 	gtask->gt_irq = irq;
671 	gtask->gt_cpu = -1;
672 	mtx_lock(&qgroup->tqg_lock);
673 	qid = taskqgroup_find(qgroup, uniq);
674 	qgroup->tqg_queue[qid].tgc_cnt++;
675 	LIST_INSERT_HEAD(&qgroup->tqg_queue[qid].tgc_tasks, gtask, gt_list);
676 	gtask->gt_taskqueue = qgroup->tqg_queue[qid].tgc_taskq;
677 	if (irq != -1 && tqg_smp_started) {
678 		gtask->gt_cpu = qgroup->tqg_queue[qid].tgc_cpu;
679 		CPU_ZERO(&mask);
680 		CPU_SET(qgroup->tqg_queue[qid].tgc_cpu, &mask);
681 		mtx_unlock(&qgroup->tqg_lock);
682 		error = intr_setaffinity(irq, CPU_WHICH_IRQ, &mask);
683 		if (error)
684 			printf("%s: setaffinity failed for %s: %d\n", __func__, gtask->gt_name, error);
685 	} else
686 		mtx_unlock(&qgroup->tqg_lock);
687 }
688 
689 static void
690 taskqgroup_attach_deferred(struct taskqgroup *qgroup, struct grouptask *gtask)
691 {
692 	cpuset_t mask;
693 	int qid, cpu, error;
694 
695 	mtx_lock(&qgroup->tqg_lock);
696 	qid = taskqgroup_find(qgroup, gtask->gt_uniq);
697 	cpu = qgroup->tqg_queue[qid].tgc_cpu;
698 	if (gtask->gt_irq != -1) {
699 		mtx_unlock(&qgroup->tqg_lock);
700 
701 		CPU_ZERO(&mask);
702 		CPU_SET(cpu, &mask);
703 		error = intr_setaffinity(gtask->gt_irq, CPU_WHICH_IRQ, &mask);
704 		mtx_lock(&qgroup->tqg_lock);
705 		if (error)
706 			printf("%s: %s setaffinity failed: %d\n", __func__, gtask->gt_name, error);
707 
708 	}
709 	qgroup->tqg_queue[qid].tgc_cnt++;
710 
711 	LIST_INSERT_HEAD(&qgroup->tqg_queue[qid].tgc_tasks, gtask,
712 			 gt_list);
713 	MPASS(qgroup->tqg_queue[qid].tgc_taskq != NULL);
714 	gtask->gt_taskqueue = qgroup->tqg_queue[qid].tgc_taskq;
715 	mtx_unlock(&qgroup->tqg_lock);
716 }
717 
718 int
719 taskqgroup_attach_cpu(struct taskqgroup *qgroup, struct grouptask *gtask,
720 	void *uniq, int cpu, int irq, char *name)
721 {
722 	cpuset_t mask;
723 	int i, qid, error;
724 
725 	qid = -1;
726 	gtask->gt_uniq = uniq;
727 	snprintf(gtask->gt_name, GROUPTASK_NAMELEN, "%s", name ? name : "grouptask");
728 	gtask->gt_irq = irq;
729 	gtask->gt_cpu = cpu;
730 	mtx_lock(&qgroup->tqg_lock);
731 	if (tqg_smp_started) {
732 		for (i = 0; i < qgroup->tqg_cnt; i++)
733 			if (qgroup->tqg_queue[i].tgc_cpu == cpu) {
734 				qid = i;
735 				break;
736 			}
737 		if (qid == -1) {
738 			mtx_unlock(&qgroup->tqg_lock);
739 			printf("%s: qid not found for %s cpu=%d\n", __func__, gtask->gt_name, cpu);
740 			return (EINVAL);
741 		}
742 	} else
743 		qid = 0;
744 	qgroup->tqg_queue[qid].tgc_cnt++;
745 	LIST_INSERT_HEAD(&qgroup->tqg_queue[qid].tgc_tasks, gtask, gt_list);
746 	gtask->gt_taskqueue = qgroup->tqg_queue[qid].tgc_taskq;
747 	cpu = qgroup->tqg_queue[qid].tgc_cpu;
748 	mtx_unlock(&qgroup->tqg_lock);
749 
750 	CPU_ZERO(&mask);
751 	CPU_SET(cpu, &mask);
752 	if (irq != -1 && tqg_smp_started) {
753 		error = intr_setaffinity(irq, CPU_WHICH_IRQ, &mask);
754 		if (error)
755 			printf("%s: setaffinity failed: %d\n", __func__, error);
756 	}
757 	return (0);
758 }
759 
760 static int
761 taskqgroup_attach_cpu_deferred(struct taskqgroup *qgroup, struct grouptask *gtask)
762 {
763 	cpuset_t mask;
764 	int i, qid, irq, cpu, error;
765 
766 	qid = -1;
767 	irq = gtask->gt_irq;
768 	cpu = gtask->gt_cpu;
769 	MPASS(tqg_smp_started);
770 	mtx_lock(&qgroup->tqg_lock);
771 	for (i = 0; i < qgroup->tqg_cnt; i++)
772 		if (qgroup->tqg_queue[i].tgc_cpu == cpu) {
773 			qid = i;
774 			break;
775 		}
776 	if (qid == -1) {
777 		mtx_unlock(&qgroup->tqg_lock);
778 		printf("%s: qid not found for %s cpu=%d\n", __func__, gtask->gt_name, cpu);
779 		return (EINVAL);
780 	}
781 	qgroup->tqg_queue[qid].tgc_cnt++;
782 	LIST_INSERT_HEAD(&qgroup->tqg_queue[qid].tgc_tasks, gtask, gt_list);
783 	MPASS(qgroup->tqg_queue[qid].tgc_taskq != NULL);
784 	gtask->gt_taskqueue = qgroup->tqg_queue[qid].tgc_taskq;
785 	mtx_unlock(&qgroup->tqg_lock);
786 
787 	CPU_ZERO(&mask);
788 	CPU_SET(cpu, &mask);
789 
790 	if (irq != -1) {
791 		error = intr_setaffinity(irq, CPU_WHICH_IRQ, &mask);
792 		if (error)
793 			printf("%s: setaffinity failed: %d\n", __func__, error);
794 	}
795 	return (0);
796 }
797 
798 void
799 taskqgroup_detach(struct taskqgroup *qgroup, struct grouptask *gtask)
800 {
801 	int i;
802 
803 	mtx_lock(&qgroup->tqg_lock);
804 	for (i = 0; i < qgroup->tqg_cnt; i++)
805 		if (qgroup->tqg_queue[i].tgc_taskq == gtask->gt_taskqueue)
806 			break;
807 	if (i == qgroup->tqg_cnt)
808 		panic("taskqgroup_detach: task %s not in group\n", gtask->gt_name);
809 	qgroup->tqg_queue[i].tgc_cnt--;
810 	LIST_REMOVE(gtask, gt_list);
811 	mtx_unlock(&qgroup->tqg_lock);
812 	gtask->gt_taskqueue = NULL;
813 }
814 
815 static void
816 taskqgroup_binder(void *ctx)
817 {
818 	struct taskq_bind_task *gtask = (struct taskq_bind_task *)ctx;
819 	cpuset_t mask;
820 	int error;
821 
822 	CPU_ZERO(&mask);
823 	CPU_SET(gtask->bt_cpuid, &mask);
824 	error = cpuset_setthread(curthread->td_tid, &mask);
825 	thread_lock(curthread);
826 	sched_bind(curthread, gtask->bt_cpuid);
827 	thread_unlock(curthread);
828 
829 	if (error)
830 		printf("%s: setaffinity failed: %d\n", __func__,
831 		    error);
832 	free(gtask, M_DEVBUF);
833 }
834 
835 static void
836 taskqgroup_bind(struct taskqgroup *qgroup)
837 {
838 	struct taskq_bind_task *gtask;
839 	int i;
840 
841 	/*
842 	 * Bind taskqueue threads to specific CPUs, if they have been assigned
843 	 * one.
844 	 */
845 	if (qgroup->tqg_cnt == 1)
846 		return;
847 
848 	for (i = 0; i < qgroup->tqg_cnt; i++) {
849 		gtask = malloc(sizeof (*gtask), M_DEVBUF, M_WAITOK);
850 		GTASK_INIT(&gtask->bt_task, 0, 0, taskqgroup_binder, gtask);
851 		gtask->bt_cpuid = qgroup->tqg_queue[i].tgc_cpu;
852 		grouptaskqueue_enqueue(qgroup->tqg_queue[i].tgc_taskq,
853 		    &gtask->bt_task);
854 	}
855 }
856 
857 static int
858 _taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride)
859 {
860 	LIST_HEAD(, grouptask) gtask_head = LIST_HEAD_INITIALIZER(NULL);
861 	struct grouptask *gtask;
862 	int i, k, old_cnt, old_cpu, cpu;
863 
864 	mtx_assert(&qgroup->tqg_lock, MA_OWNED);
865 
866 	if (cnt < 1 || cnt * stride > mp_ncpus || !tqg_smp_started) {
867 		printf("%s: failed cnt: %d stride: %d "
868 		    "mp_ncpus: %d tqg_smp_started: %d\n",
869 		    __func__, cnt, stride, mp_ncpus, tqg_smp_started);
870 		return (EINVAL);
871 	}
872 	if (qgroup->tqg_adjusting) {
873 		printf("%s failed: adjusting\n", __func__);
874 		return (EBUSY);
875 	}
876 	qgroup->tqg_adjusting = 1;
877 	old_cnt = qgroup->tqg_cnt;
878 	old_cpu = 0;
879 	if (old_cnt < cnt)
880 		old_cpu = qgroup->tqg_queue[old_cnt].tgc_cpu;
881 	mtx_unlock(&qgroup->tqg_lock);
882 	/*
883 	 * Set up queue for tasks added before boot.
884 	 */
885 	if (old_cnt == 0) {
886 		LIST_SWAP(&gtask_head, &qgroup->tqg_queue[0].tgc_tasks,
887 		    grouptask, gt_list);
888 		qgroup->tqg_queue[0].tgc_cnt = 0;
889 	}
890 
891 	/*
892 	 * If new taskq threads have been added.
893 	 */
894 	cpu = old_cpu;
895 	for (i = old_cnt; i < cnt; i++) {
896 		taskqgroup_cpu_create(qgroup, i, cpu);
897 
898 		for (k = 0; k < stride; k++)
899 			cpu = CPU_NEXT(cpu);
900 	}
901 	mtx_lock(&qgroup->tqg_lock);
902 	qgroup->tqg_cnt = cnt;
903 	qgroup->tqg_stride = stride;
904 
905 	/*
906 	 * Adjust drivers to use new taskqs.
907 	 */
908 	for (i = 0; i < old_cnt; i++) {
909 		while ((gtask = LIST_FIRST(&qgroup->tqg_queue[i].tgc_tasks))) {
910 			LIST_REMOVE(gtask, gt_list);
911 			qgroup->tqg_queue[i].tgc_cnt--;
912 			LIST_INSERT_HEAD(&gtask_head, gtask, gt_list);
913 		}
914 	}
915 	mtx_unlock(&qgroup->tqg_lock);
916 
917 	while ((gtask = LIST_FIRST(&gtask_head))) {
918 		LIST_REMOVE(gtask, gt_list);
919 		if (gtask->gt_cpu == -1)
920 			taskqgroup_attach_deferred(qgroup, gtask);
921 		else if (taskqgroup_attach_cpu_deferred(qgroup, gtask))
922 			taskqgroup_attach_deferred(qgroup, gtask);
923 	}
924 
925 #ifdef INVARIANTS
926 	mtx_lock(&qgroup->tqg_lock);
927 	for (i = 0; i < qgroup->tqg_cnt; i++) {
928 		MPASS(qgroup->tqg_queue[i].tgc_taskq != NULL);
929 		LIST_FOREACH(gtask, &qgroup->tqg_queue[i].tgc_tasks, gt_list)
930 			MPASS(gtask->gt_taskqueue != NULL);
931 	}
932 	mtx_unlock(&qgroup->tqg_lock);
933 #endif
934 	/*
935 	 * If taskq thread count has been reduced.
936 	 */
937 	for (i = cnt; i < old_cnt; i++)
938 		taskqgroup_cpu_remove(qgroup, i);
939 
940 	taskqgroup_bind(qgroup);
941 
942 	mtx_lock(&qgroup->tqg_lock);
943 	qgroup->tqg_adjusting = 0;
944 
945 	return (0);
946 }
947 
948 int
949 taskqgroup_adjust(struct taskqgroup *qgroup, int cnt, int stride)
950 {
951 	int error;
952 
953 	mtx_lock(&qgroup->tqg_lock);
954 	error = _taskqgroup_adjust(qgroup, cnt, stride);
955 	mtx_unlock(&qgroup->tqg_lock);
956 
957 	return (error);
958 }
959 
960 struct taskqgroup *
961 taskqgroup_create(char *name)
962 {
963 	struct taskqgroup *qgroup;
964 
965 	qgroup = malloc(sizeof(*qgroup), M_GTASKQUEUE, M_WAITOK | M_ZERO);
966 	mtx_init(&qgroup->tqg_lock, "taskqgroup", NULL, MTX_DEF);
967 	qgroup->tqg_name = name;
968 	LIST_INIT(&qgroup->tqg_queue[0].tgc_tasks);
969 
970 	return (qgroup);
971 }
972 
/*
 * Placeholder: currently does nothing and releases no resources.
 */
void
taskqgroup_destroy(struct taskqgroup *qgroup)
{

	(void)qgroup;
}
978