xref: /freebsd/sys/kern/kern_switch.c (revision be8e84c2e9240110ce99bb8d14259073340e4ef6)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org>
5  * All rights reserved.
6  * Copyright (c) 2024 The FreeBSD Foundation
7  *
8  * Portions of this software were developed by Olivier Certner
9  * <olce.freebsd@certner.fr> at Kumacom SARL under sponsorship from the FreeBSD
10  * Foundation.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  */
33 
34 #include <sys/cdefs.h>
35 #include "opt_sched.h"
36 
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kdb.h>
40 #include <sys/kernel.h>
41 #include <sys/ktr.h>
42 #include <sys/lock.h>
43 #include <sys/mutex.h>
44 #include <sys/proc.h>
45 #include <sys/queue.h>
46 #include <sys/runq.h>
47 #include <sys/sched.h>
48 #include <sys/smp.h>
49 #include <sys/sysctl.h>
50 
51 #include <machine/cpu.h>
52 
53 /* Change the "#if 0" below to "#if 1" to enable logging of critical_enter/exit. */
54 #if 0
55 #define	KTR_CRITICAL	KTR_SCHED
56 #else
57 #define	KTR_CRITICAL	0
58 #endif
59 
60 #ifdef FULL_PREEMPTION
61 #ifndef PREEMPTION
62 #error "The FULL_PREEMPTION option requires the PREEMPTION option"
63 #endif
64 #endif
65 
66 /*
67  * kern.sched.preemption allows user space to determine if preemption support
68  * is compiled in or not.  It is not currently a boot or runtime flag that
69  * can be changed.
70  */
71 #ifdef PREEMPTION
72 static int kern_sched_preemption = 1;
73 #else
74 static int kern_sched_preemption = 0;
75 #endif
76 SYSCTL_INT(_kern_sched, OID_AUTO, preemption, CTLFLAG_RD,
77     &kern_sched_preemption, 0, "Kernel preemption enabled");
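
/*
 * A minimal user-space sketch of querying this knob via sysctlbyname(3); the
 * program below is hypothetical and not part of this file.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>

#include <stdio.h>

int
main(void)
{
	int preemption;
	size_t len = sizeof(preemption);

	if (sysctlbyname("kern.sched.preemption", &preemption, &len,
	    NULL, 0) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	printf("Kernel preemption: %s\n", preemption ? "enabled" : "disabled");
	return (0);
}
#endif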
78 
79 /*
80  * Support for scheduler stats exported via kern.sched.stats.  All stats may
81  * be reset with kern.sched.stats.reset = 1.  Stats may be defined elsewhere
82  * with SCHED_STAT_DEFINE().
83  */
84 #ifdef SCHED_STATS
85 SYSCTL_NODE(_kern_sched, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
86     "switch stats");
87 
88 /* Switch reasons from mi_switch(9). */
89 DPCPU_DEFINE(long, sched_switch_stats[SWT_COUNT]);
90 SCHED_STAT_DEFINE_VAR(owepreempt,
91     &DPCPU_NAME(sched_switch_stats[SWT_OWEPREEMPT]), "");
92 SCHED_STAT_DEFINE_VAR(turnstile,
93     &DPCPU_NAME(sched_switch_stats[SWT_TURNSTILE]), "");
94 SCHED_STAT_DEFINE_VAR(sleepq,
95     &DPCPU_NAME(sched_switch_stats[SWT_SLEEPQ]), "");
96 SCHED_STAT_DEFINE_VAR(relinquish,
97     &DPCPU_NAME(sched_switch_stats[SWT_RELINQUISH]), "");
98 SCHED_STAT_DEFINE_VAR(needresched,
99     &DPCPU_NAME(sched_switch_stats[SWT_NEEDRESCHED]), "");
100 SCHED_STAT_DEFINE_VAR(idle,
101     &DPCPU_NAME(sched_switch_stats[SWT_IDLE]), "");
102 SCHED_STAT_DEFINE_VAR(iwait,
103     &DPCPU_NAME(sched_switch_stats[SWT_IWAIT]), "");
104 SCHED_STAT_DEFINE_VAR(suspend,
105     &DPCPU_NAME(sched_switch_stats[SWT_SUSPEND]), "");
106 SCHED_STAT_DEFINE_VAR(remotepreempt,
107     &DPCPU_NAME(sched_switch_stats[SWT_REMOTEPREEMPT]), "");
108 SCHED_STAT_DEFINE_VAR(remotewakeidle,
109     &DPCPU_NAME(sched_switch_stats[SWT_REMOTEWAKEIDLE]), "");
110 SCHED_STAT_DEFINE_VAR(bind,
111     &DPCPU_NAME(sched_switch_stats[SWT_BIND]), "");
112 
113 static int
114 sysctl_stats_reset(SYSCTL_HANDLER_ARGS)
115 {
116 	struct sysctl_oid *p;
117 	uintptr_t counter;
118 	int error;
119 	int val;
120 	int i;
121 
122 	val = 0;
123 	error = sysctl_handle_int(oidp, &val, 0, req);
124 	if (error != 0 || req->newptr == NULL)
125 		return (error);
126 	if (val == 0)
127 		return (0);
128 	/*
129 	 * Traverse the list of children of _kern_sched_stats and reset each
130 	 * to 0.  Skip the reset entry.
131 	 */
132 	RB_FOREACH(p, sysctl_oid_list, oidp->oid_parent) {
133 		if (p == oidp || p->oid_arg1 == NULL)
134 			continue;
135 		counter = (uintptr_t)p->oid_arg1;
136 		CPU_FOREACH(i) {
137 			*(long *)(dpcpu_off[i] + counter) = 0;
138 		}
139 	}
140 	return (0);
141 }
142 
143 SYSCTL_PROC(_kern_sched_stats, OID_AUTO, reset,
144     CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_MPSAFE, NULL, 0,
145     sysctl_stats_reset, "I",
146     "Reset scheduler statistics");
147 #endif
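
/*
 * A minimal sketch, assuming a kernel built with SCHED_STATS, of how another
 * scheduler file can define and bump its own per-CPU counter with the
 * SCHED_STAT_DEFINE()/SCHED_STAT_INC() macros from <sys/sched.h>; the counter
 * then appears under kern.sched.stats and is zeroed by the reset handler
 * above.  The 'example_*' names are hypothetical.
 */
#if 0
SCHED_STAT_DEFINE(example_migrations, "Example: thread migrations");

static void
example_note_migration(void)
{

	SCHED_STAT_INC(example_migrations);	/* Per-CPU, no atomics needed. */
}
#endif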
148 
149 /************************************************************************
150  * Functions that manipulate runnability from a thread perspective.	*
151  ************************************************************************/
152 /*
153  * Select the thread that will be run next.
154  */
155 
156 static __noinline struct thread *
157 choosethread_panic(struct thread *td)
158 {
159 
160 	/*
161 	 * If we are in panic, only allow system threads,
162 	 * plus the one we are running in, to be run.
163 	 */
164 retry:
165 	if (((td->td_proc->p_flag & P_SYSTEM) == 0 &&
166 	    (td->td_flags & TDF_INPANIC) == 0)) {
167 		/* note that it is no longer on the run queue */
168 		TD_SET_CAN_RUN(td);
169 		td = sched_choose();
170 		goto retry;
171 	}
172 
173 	TD_SET_RUNNING(td);
174 	return (td);
175 }
176 
177 struct thread *
178 choosethread(void)
179 {
180 	struct thread *td;
181 
182 	td = sched_choose();
183 
184 	if (KERNEL_PANICKED())
185 		return (choosethread_panic(td));
186 
187 	TD_SET_RUNNING(td);
188 	return (td);
189 }
190 
191 /*
192  * Kernel thread preemption implementation.  Critical sections mark
193  * regions of code in which preemptions are not allowed.
194  *
195  * It might seem a good idea to inline critical_enter() but, in order
196  * to prevent instruction reordering by the compiler, a __compiler_membar()
197  * would have to be used here (the same as sched_pin()).  The performance
198  * penalty imposed by the membar could then, in most cases, produce code
199  * slower than the function call itself.
200  */
201 void
202 critical_enter_KBI(void)
203 {
204 #ifdef KTR
205 	struct thread *td = curthread;
206 #endif
207 	critical_enter();
208 	CTR4(KTR_CRITICAL, "critical_enter by thread %p (%ld, %s) to %d", td,
209 	    (long)td->td_proc->p_pid, td->td_name, td->td_critnest);
210 }
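
/*
 * A minimal sketch of the critical section pattern described above: the
 * current thread stays on its CPU, without taking a lock, while it updates
 * per-CPU state.  'example_counter' and 'example_percpu_update' are
 * hypothetical.
 */
#if 0
DPCPU_DEFINE_STATIC(long, example_counter);

static void
example_percpu_update(void)
{

	critical_enter();	/* No preemption (nor migration) from here on. */
	DPCPU_SET(example_counter, DPCPU_GET(example_counter) + 1);
	critical_exit();	/* A deferred preemption, if owed, happens here. */
}
#endif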
211 
212 void __noinline
213 critical_exit_preempt(void)
214 {
215 	struct thread *td;
216 	int flags;
217 
218 	/*
219 	 * If td_critnest is 0, it is possible that we are going to get
220 	 * preempted again before reaching the code below. This happens
221 	 * rarely and is harmless. However, this means td_owepreempt may
222 	 * now be unset.
223 	 */
224 	td = curthread;
225 	if (td->td_critnest != 0)
226 		return;
227 	if (kdb_active)
228 		return;
229 
230 	/*
231 	 * Microoptimization: since we have committed to switching, disable
232 	 * preemption from interrupt handlers while spinning for the
233 	 * thread lock.
234 	 */
235 	td->td_critnest = 1;
236 	thread_lock(td);
237 	td->td_critnest--;
238 	flags = SW_INVOL | SW_PREEMPT;
239 	if (TD_IS_IDLETHREAD(td))
240 		flags |= SWT_IDLE;
241 	else
242 		flags |= SWT_OWEPREEMPT;
243 	mi_switch(flags);
244 }
245 
246 void
247 critical_exit_KBI(void)
248 {
249 #ifdef KTR
250 	struct thread *td = curthread;
251 #endif
252 	critical_exit();
253 	CTR4(KTR_CRITICAL, "critical_exit by thread %p (%ld, %s) to %d", td,
254 	    (long)td->td_proc->p_pid, td->td_name, td->td_critnest);
255 }
256 
257 /************************************************************************
258  * SYSTEM RUN QUEUE manipulations and tests				*
259  ************************************************************************/
260 _Static_assert(RQ_NQS <= 256,
261     "'td_rqindex' must be turned into a bigger unsigned type");
262 /* A macro instead of a function to get the proper calling function's name. */
263 #define CHECK_IDX(idx) ({						\
264 	__typeof(idx) _idx __unused = (idx);					\
265 	KASSERT(0 <= _idx && _idx < RQ_NQS,				\
266 	    ("%s: %s out of range: %d", __func__, __STRING(idx), _idx)); \
267 })
268 
269 /* Status words' individual bit manipulators' internals. */
270 typedef uintptr_t	runq_sw_op(int idx, int sw_idx, rqsw_t sw_bit,
271 			    rqsw_t *swp);
272 static inline uintptr_t	runq_sw_apply(struct runq *rq, int idx,
273 			    runq_sw_op *op);
274 
275 static inline uintptr_t	runq_sw_set_not_empty_op(int idx, int sw_idx,
276 			    rqsw_t sw_bit, rqsw_t *swp);
277 static inline uintptr_t	runq_sw_set_empty_op(int idx, int sw_idx,
278 			    rqsw_t sw_bit, rqsw_t *swp);
279 static inline uintptr_t	runq_sw_is_empty_op(int idx, int sw_idx,
280 			    rqsw_t sw_bit, rqsw_t *swp);
281 
282 /* Status words' individual bit manipulators. */
283 static inline void	runq_sw_set_not_empty(struct runq *rq, int idx);
284 static inline void	runq_sw_set_empty(struct runq *rq, int idx);
285 static inline bool	runq_sw_is_empty(struct runq *rq, int idx);
286 
287 /*
288  * Initialize a run structure.
289  */
290 void
291 runq_init(struct runq *rq)
292 {
293 	int i;
294 
295 	bzero(rq, sizeof(*rq));
296 	for (i = 0; i < RQ_NQS; i++)
297 		TAILQ_INIT(&rq->rq_queues[i]);
298 }
299 
300 /*
301  * Helper to implement functions operating on a particular status word bit.
302  *
303  * The operator is passed the initial 'idx', the index of the corresponding
304  * status word in 'rq_status' in 'sw_idx', a status word with only the bit
305  * for 'idx' set in 'sw_bit', and a pointer to that status word in 'swp'.
306  */
307 static inline uintptr_t
308 runq_sw_apply(struct runq *rq, int idx, runq_sw_op *op)
309 {
310 	rqsw_t *swp;
311 	rqsw_t sw_bit;
312 	int sw_idx;
313 
314 	CHECK_IDX(idx);
315 
316 	sw_idx = RQSW_IDX(idx);
317 	sw_bit = RQSW_BIT(idx);
318 	swp = &rq->rq_status.rq_sw[sw_idx];
319 
320 	return (op(idx, sw_idx, sw_bit, swp));
321 }
322 
323 static inline uintptr_t
324 runq_sw_set_not_empty_op(int idx, int sw_idx, rqsw_t sw_bit, rqsw_t *swp)
325 {
326 	rqsw_t old_sw __unused = *swp;
327 
328 	*swp |= sw_bit;
329 	CTR4(KTR_RUNQ,
330 	    "runq_sw_set_not_empty: idx=%d sw_idx=%d "
331 	    "bits=" RQSW_PRI "->" RQSW_PRI,
332 	    idx, sw_idx, old_sw, *swp);
333 	return (0);
334 }
335 
336 /*
337  * Modify the status words to indicate that some queue is not empty.
338  *
339  * Sets the status bit corresponding to the queue at index 'idx'.
340  */
341 static inline void
342 runq_sw_set_not_empty(struct runq *rq, int idx)
343 {
344 
345 	(void)runq_sw_apply(rq, idx, &runq_sw_set_not_empty_op);
346 }
347 
348 static inline uintptr_t
349 runq_sw_set_empty_op(int idx, int sw_idx, rqsw_t sw_bit, rqsw_t *swp)
350 {
351 	rqsw_t old_sw __unused = *swp;
352 
353 	*swp &= ~sw_bit;
354 	CTR4(KTR_RUNQ,
355 	    "runq_sw_set_empty: idx=%d sw_idx=%d "
356 	    "bits=" RQSW_PRI "->" RQSW_PRI,
357 	    idx, sw_idx, old_sw, *swp);
358 	return (0);
359 }
360 
361 /*
362  * Modify the status words to indicate that some queue is empty.
363  *
364  * Clears the status bit corresponding to the queue at index 'idx'.
365  */
366 static inline void
367 runq_sw_set_empty(struct runq *rq, int idx)
368 {
369 
370 	(void)runq_sw_apply(rq, idx, &runq_sw_set_empty_op);
371 }
372 
373 static inline uintptr_t
374 runq_sw_is_empty_op(int idx, int sw_idx, rqsw_t sw_bit, rqsw_t *swp)
375 {
376 	return ((*swp & sw_bit) == 0);
377 }
378 
379 /*
380  * Returns whether the status words indicate that the queue at 'idx' is empty.
381  */
382 static inline bool
383 runq_sw_is_empty(struct runq *rq, int idx)
384 {
385 	return (runq_sw_apply(rq, idx, &runq_sw_is_empty_op));
386 }
387 
388 /*
389  * Returns whether a particular queue is empty.
390  */
391 bool
392 runq_is_queue_empty(struct runq *rq, int idx)
393 {
394 
395 	return (runq_sw_is_empty(rq, idx));
396 }
397 
398 /*
399  * Add the thread to the queue specified by its priority, and set the
400  * corresponding status bit.
401  */
402 void
403 runq_add(struct runq *rq, struct thread *td, int flags)
404 {
405 
406 	runq_add_idx(rq, td, RQ_PRI_TO_QUEUE_IDX(td->td_priority), flags);
407 }
408 
409 void
410 runq_add_idx(struct runq *rq, struct thread *td, int idx, int flags)
411 {
412 	struct rq_queue *rqq;
413 
414 	/*
415 	 * runq_sw_*() functions assert that 'idx' is non-negative and below
416 	 * 'RQ_NQS', and a static assert earlier in this file ensures that
417 	 * 'RQ_NQS' is no more than 256.
418 	 */
419 	td->td_rqindex = idx;
420 	runq_sw_set_not_empty(rq, idx);
421 	rqq = &rq->rq_queues[idx];
422 	CTR4(KTR_RUNQ, "runq_add_idx: td=%p pri=%d idx=%d rqq=%p",
423 	    td, td->td_priority, idx, rqq);
424 	if (flags & SRQ_PREEMPTED)
425 		TAILQ_INSERT_HEAD(rqq, td, td_runq);
426 	else
427 		TAILQ_INSERT_TAIL(rqq, td, td_runq);
428 }
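
/*
 * A minimal sketch of how a scheduler might feed this interface: a preempted
 * thread is reinserted at the head of its priority queue (SRQ_PREEMPTED) so
 * it is picked up again first.  'example_runq' and 'example_setrunnable' are
 * hypothetical.
 */
#if 0
static struct runq example_runq;

static void
example_setrunnable(struct thread *td, bool preempted)
{

	runq_add(&example_runq, td, preempted ? SRQ_PREEMPTED : 0);
}
#endif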
429 
430 /*
431  * Remove the thread from the queue specified by its priority, and clear the
432  * corresponding status bit if the queue becomes empty.
433  *
434  * Returns whether the corresponding queue is empty after removal.
435  */
436 bool
437 runq_remove(struct runq *rq, struct thread *td)
438 {
439 	struct rq_queue *rqq;
440 	int idx;
441 
442 	KASSERT(td->td_flags & TDF_INMEM, ("runq_remove: Thread swapped out"));
443 	idx = td->td_rqindex;
444 	CHECK_IDX(idx);
445 	rqq = &rq->rq_queues[idx];
446 	CTR4(KTR_RUNQ, "runq_remove: td=%p pri=%d idx=%d rqq=%p",
447 	    td, td->td_priority, idx, rqq);
448 	TAILQ_REMOVE(rqq, td, td_runq);
449 	if (TAILQ_EMPTY(rqq)) {
450 		runq_sw_set_empty(rq, idx);
451 		CTR1(KTR_RUNQ, "runq_remove: queue at idx=%d now empty", idx);
452 		return (true);
453 	}
454 	return (false);
455 }
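
/*
 * A minimal sketch of using the return value for caller-side bookkeeping;
 * 'example_nonempty_queues' and 'example_dequeue' are hypothetical.
 */
#if 0
static int example_nonempty_queues;

static void
example_dequeue(struct runq *rq, struct thread *td)
{

	if (runq_remove(rq, td))
		example_nonempty_queues--;	/* td's queue just drained. */
}
#endif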
456 
457 static inline int
458 runq_findq_status_word(struct runq *const rq, const int w_idx,
459     const rqsw_t w, runq_pred_t *const pred, void *const pred_data)
460 {
461 	struct rq_queue *q;
462 	rqsw_t tw = w;
463 	int idx, b_idx;
464 
465 	while (tw != 0) {
466 		b_idx = RQSW_BSF(tw);
467 		idx = RQSW_TO_QUEUE_IDX(w_idx, b_idx);
468 		q = &rq->rq_queues[idx];
469 		KASSERT(!TAILQ_EMPTY(q),
470 		    ("runq_findq(): No thread on non-empty queue with idx=%d",
471 		    idx));
472 		if (pred(idx, q, pred_data))
473 			return (idx);
474 		tw &= ~RQSW_BIT(idx);
475 	}
476 
477 	return (-1);
478 }
479 
480 /*
481  * Find, within the passed range (bounds included), the index of the first
482  * (i.e., lowest-indexed) non-empty queue that passes pred().
483  *
484  * Considered queues are those with index 'lvl_min' up to 'lvl_max' (bounds
485  * included).  If no queue matches, returns -1.
486  *
487  * This is done by scanning the status words (a set bit indicates a non-empty
488  * queue) and calling pred() with corresponding queue indices.  pred() must
489  * return whether the corresponding queue is accepted.  It is passed private
490  * data through 'pred_data', which can be used both for extra input and output.
491  */
492 int
493 runq_findq(struct runq *const rq, const int lvl_min, const int lvl_max,
494     runq_pred_t *const pred, void *const pred_data)
495 {
496 	const rqsw_t (*const rqsw)[RQSW_NB] = &rq->rq_status.rq_sw;
497 	rqsw_t w;
498 	int i, last, idx;
499 
500 	CHECK_IDX(lvl_min);
501 	CHECK_IDX(lvl_max);
502 	KASSERT(lvl_min <= lvl_max,
503 	    ("lvl_min: %d > lvl_max: %d!", lvl_min, lvl_max));
504 
505 	i = RQSW_IDX(lvl_min);
506 	last = RQSW_IDX(lvl_max);
507 	/* Clear bits for runqueues below 'lvl_min'. */
508 	w = (*rqsw)[i] & ~(RQSW_BIT(lvl_min) - 1);
509 	if (i == last)
510 		goto last_mask;
511 	idx = runq_findq_status_word(rq, i, w, pred, pred_data);
512 	if (idx != -1)
513 		goto return_idx;
514 
515 	for (++i; i < last; ++i) {
516 		w = (*rqsw)[i];
517 		idx = runq_findq_status_word(rq, i, w, pred, pred_data);
518 		if (idx != -1)
519 			goto return_idx;
520 	}
521 
522 	MPASS(i == last);
523 	w = (*rqsw)[i];
524 last_mask:
525 	/* Clear bits for runqueues above 'lvl_max'. */
526 	w &= (RQSW_BIT(lvl_max) - 1) | RQSW_BIT(lvl_max);
527 	idx = runq_findq_status_word(rq, i, w, pred, pred_data);
528 	if (idx != -1)
529 		goto return_idx;
530 	return (-1);
531 return_idx:
532 	CTR4(KTR_RUNQ,
533 	    "runq_findq: bits=" RQSW_PRI "->" RQSW_PRI " i=%d idx=%d",
534 	    (*rqsw)[i], w, i, idx);
535 	return (idx);
536 }
537 
538 static bool
539 runq_first_thread_pred(const int idx, struct rq_queue *const q, void *const data)
540 {
541 	struct thread **const tdp = data;
542 	struct thread *const td = TAILQ_FIRST(q);
543 
544 	*tdp = td;
545 	return (true);
546 }
547 
548 /*
549  * Inline this function for the benefit of this file's internal uses, but make
550  * sure it has an external definition as it is exported.
551  */
552 extern inline struct thread *
553 runq_first_thread_range(struct runq *const rq, const int lvl_min,
554     const int lvl_max)
555 {
556 	struct thread *td = NULL;
557 
558 	(void)runq_findq(rq, lvl_min, lvl_max, runq_first_thread_pred, &td);
559 	return (td);
560 }
561 
562 static inline struct thread *
563 runq_first_thread(struct runq *const rq)
564 {
565 
566 	return (runq_first_thread_range(rq, 0, RQ_NQS - 1));
567 }
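
/*
 * A minimal sketch of a custom pred() for runq_findq(): accept only queues
 * holding a thread that last ran on the caller's CPU.  All 'example_*' names
 * are hypothetical.
 */
#if 0
struct example_lastcpu_data {
	int cpu;
	struct thread *td;
};

static bool
example_lastcpu_pred(const int idx, struct rq_queue *const q, void *const data)
{
	struct example_lastcpu_data *const d = data;
	struct thread *td;

	TAILQ_FOREACH(td, q, td_runq) {
		if (td->td_lastcpu == d->cpu) {
			d->td = td;
			return (true);
		}
	}
	return (false);
}

static struct thread *
example_choose_lastcpu(struct runq *rq)
{
	struct example_lastcpu_data d = { .cpu = PCPU_GET(cpuid), .td = NULL };

	return (runq_findq(rq, 0, RQ_NQS - 1, example_lastcpu_pred, &d) != -1 ?
	    d.td : NULL);
}
#endif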
568 
569 /*
570  * Return true if there are any threads, of any priority, on the run queue,
571  * false otherwise.  Has no side effects.  Supports racy lookups (required by
572  * 4BSD).
573  */
574 bool
575 runq_not_empty(struct runq *rq)
576 {
577 	const rqsw_t (*const rqsw)[RQSW_NB] = &rq->rq_status.rq_sw;
578 	int sw_idx;
579 
580 	for (sw_idx = 0; sw_idx < RQSW_NB; ++sw_idx) {
581 		const rqsw_t w = (*rqsw)[sw_idx];
582 
583 		if (w != 0) {
584 			CTR3(KTR_RUNQ, "runq_not_empty: not empty; "
585 			    "rq=%p, sw_idx=%d, bits=" RQSW_PRI,
586 			    rq, sw_idx, w);
587 			return (true);
588 		}
589 	}
590 
591 	CTR1(KTR_RUNQ, "runq_not_empty: empty; rq=%p", rq);
592 	return (false);
593 }
594 
595 /*
596  * Find the highest priority thread on the run queue.
597  */
598 struct thread *
599 runq_choose(struct runq *rq)
600 {
601 	struct thread *td;
602 
603 	td = runq_first_thread(rq);
604 	if (td != NULL) {
605 		CTR2(KTR_RUNQ, "runq_choose: idx=%d td=%p", td->td_rqindex, td);
606 		return (td);
607 	}
608 
609 	CTR0(KTR_RUNQ, "runq_choose: idlethread");
610 	return (NULL);
611 }
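
/*
 * A minimal sketch of the usual pick-next pattern, pairing runq_choose() with
 * runq_remove() under the appropriate scheduler lock; 'example_pick_next' is
 * hypothetical.
 */
#if 0
static struct thread *
example_pick_next(struct runq *rq)
{
	struct thread *td;

	td = runq_choose(rq);
	if (td != NULL)
		(void)runq_remove(rq, td);
	return (td);		/* NULL means "run the idle thread". */
}
#endif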
612 
613 struct runq_fuzz_pred_data {
614 	int fuzz;
615 	struct thread *td;
616 };
617 
618 static bool
619 runq_fuzz_pred(const int idx, struct rq_queue *const q, void *const data)
620 {
621 	struct runq_fuzz_pred_data *const d = data;
622 	const int fuzz = d->fuzz;
623 	struct thread *td;
624 
625 	td = TAILQ_FIRST(q);
626 
627 	if (fuzz > 1) {
628 		/*
629 		 * Among the first 'fuzz' entries, prefer a
630 		 * thread that last ran on our CPU.
631 		 */
632 		struct thread *td2 = td;
633 		int count = fuzz;
634 		int cpu = PCPU_GET(cpuid);
635 
636 		while (count-- != 0 && td2 != NULL) {
637 			if (td2->td_lastcpu == cpu) {
638 				td = td2;
639 				break;
640 			}
641 			td2 = TAILQ_NEXT(td2, td_runq);
642 		}
643 	}
644 
645 	d->td = td;
646 	return (true);
647 }
648 
649 /*
650  * Find the highest priority thread, preferring one that last ran on this CPU.
651  */
652 struct thread *
653 runq_choose_fuzz(struct runq *rq, int fuzz)
654 {
655 	struct runq_fuzz_pred_data data = {
656 		.fuzz = fuzz,
657 		.td = NULL
658 	};
659 	int idx;
660 
661 	idx = runq_findq(rq, 0, RQ_NQS - 1, runq_fuzz_pred, &data);
662 	if (idx != -1) {
663 		MPASS(data.td != NULL);
664 		CTR2(KTR_RUNQ, "runq_choose_fuzz: idx=%d td=%p", idx, data.td);
665 		return (data.td);
666 	}
667 
668 	MPASS(data.td == NULL);
669 	CTR0(KTR_RUNQ, "runq_choose_fuzz: idlethread");
670 	return (NULL);
671 }
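
/*
 * A minimal sketch of driving the fuzz from a tunable, in the spirit of
 * 4BSD's kern.sched.runq_fuzz; 'example_fuzz' and 'example_choose' are
 * hypothetical.
 */
#if 0
static int example_fuzz = 1;
SYSCTL_INT(_kern_sched, OID_AUTO, example_fuzz, CTLFLAG_RW,
    &example_fuzz, 0, "Number of runnable threads to inspect per choice");

static struct thread *
example_choose(struct runq *rq)
{

	return (runq_choose_fuzz(rq, example_fuzz));
}
#endif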
672