/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
#include "opt_sched.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <machine/cpu.h>

/* Uncomment this to enable logging of critical_enter/exit. */
#if 0
#define KTR_CRITICAL    KTR_SCHED
#else
#define KTR_CRITICAL    0
#endif

#ifdef FULL_PREEMPTION
#ifndef PREEMPTION
#error "The FULL_PREEMPTION option requires the PREEMPTION option"
#endif
#endif

CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS);

/*
 * kern.sched.preemption allows user space to determine if preemption support
 * is compiled in or not.  It is not currently a boot or runtime flag that
 * can be changed.
 */
#ifdef PREEMPTION
static int kern_sched_preemption = 1;
#else
static int kern_sched_preemption = 0;
#endif
SYSCTL_INT(_kern_sched, OID_AUTO, preemption, CTLFLAG_RD,
    &kern_sched_preemption, 0, "Kernel preemption enabled");

/*
 * Support for scheduler stats exported via kern.sched.stats.  All stats may
 * be reset with kern.sched.stats.reset = 1.  Stats may be defined elsewhere
 * with SCHED_STAT_DEFINE().
 */
#ifdef SCHED_STATS
SYSCTL_NODE(_kern_sched, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "switch stats");
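/*
 * Usage sketch (illustrative): each SCHED_STAT_DEFINE_VAR() below attaches
 * one per-CPU counter to a child OID of kern.sched.stats, and mi_switch(9)
 * bumps the counter matching the switch type it was handed, roughly
 *
 *      SCHED_STAT_INC(sched_switch_stats[flags & SW_TYPE_MASK]);
 *
 * From userland the counters can be read and cleared with sysctl(8), e.g.
 * "sysctl kern.sched.stats" and "sysctl kern.sched.stats.reset=1".
 */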
/* Switch reasons from mi_switch(9). */
DPCPU_DEFINE(long, sched_switch_stats[SWT_COUNT]);
SCHED_STAT_DEFINE_VAR(owepreempt,
    &DPCPU_NAME(sched_switch_stats[SWT_OWEPREEMPT]), "");
SCHED_STAT_DEFINE_VAR(turnstile,
    &DPCPU_NAME(sched_switch_stats[SWT_TURNSTILE]), "");
SCHED_STAT_DEFINE_VAR(sleepq,
    &DPCPU_NAME(sched_switch_stats[SWT_SLEEPQ]), "");
SCHED_STAT_DEFINE_VAR(relinquish,
    &DPCPU_NAME(sched_switch_stats[SWT_RELINQUISH]), "");
SCHED_STAT_DEFINE_VAR(needresched,
    &DPCPU_NAME(sched_switch_stats[SWT_NEEDRESCHED]), "");
SCHED_STAT_DEFINE_VAR(idle,
    &DPCPU_NAME(sched_switch_stats[SWT_IDLE]), "");
SCHED_STAT_DEFINE_VAR(iwait,
    &DPCPU_NAME(sched_switch_stats[SWT_IWAIT]), "");
SCHED_STAT_DEFINE_VAR(suspend,
    &DPCPU_NAME(sched_switch_stats[SWT_SUSPEND]), "");
SCHED_STAT_DEFINE_VAR(remotepreempt,
    &DPCPU_NAME(sched_switch_stats[SWT_REMOTEPREEMPT]), "");
SCHED_STAT_DEFINE_VAR(remotewakeidle,
    &DPCPU_NAME(sched_switch_stats[SWT_REMOTEWAKEIDLE]), "");
SCHED_STAT_DEFINE_VAR(bind,
    &DPCPU_NAME(sched_switch_stats[SWT_BIND]), "");

static int
sysctl_stats_reset(SYSCTL_HANDLER_ARGS)
{
        struct sysctl_oid *p;
        uintptr_t counter;
        int error;
        int val;
        int i;

        val = 0;
        error = sysctl_handle_int(oidp, &val, 0, req);
        if (error != 0 || req->newptr == NULL)
                return (error);
        if (val == 0)
                return (0);
        /*
         * Traverse the list of children of _kern_sched_stats and reset each
         * to 0.  Skip the reset entry.
         */
        RB_FOREACH(p, sysctl_oid_list, oidp->oid_parent) {
                if (p == oidp || p->oid_arg1 == NULL)
                        continue;
                counter = (uintptr_t)p->oid_arg1;
                CPU_FOREACH(i) {
                        *(long *)(dpcpu_off[i] + counter) = 0;
                }
        }
        return (0);
}

SYSCTL_PROC(_kern_sched_stats, OID_AUTO, reset,
    CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_stats_reset, "I",
    "Reset scheduler statistics");
#endif

/************************************************************************
 * Functions that manipulate runnability from a thread perspective.    *
 ************************************************************************/
/*
 * Select the thread that will be run next.
 */

static __noinline struct thread *
choosethread_panic(struct thread *td)
{

        /*
         * If we are in panic, only allow system threads,
         * plus the one we are running in, to be run.
         */
retry:
        if (((td->td_proc->p_flag & P_SYSTEM) == 0 &&
            (td->td_flags & TDF_INPANIC) == 0)) {
                /* note that it is no longer on the run queue */
                TD_SET_CAN_RUN(td);
                td = sched_choose();
                goto retry;
        }

        TD_SET_RUNNING(td);
        return (td);
}

struct thread *
choosethread(void)
{
        struct thread *td;

        td = sched_choose();

        if (KERNEL_PANICKED())
                return (choosethread_panic(td));

        TD_SET_RUNNING(td);
        return (td);
}

/*
 * Kernel thread preemption implementation.  Critical sections mark
 * regions of code in which preemptions are not allowed.
 *
 * It might seem a good idea to inline critical_enter() but, in order
 * to prevent instruction reordering by the compiler, a __compiler_membar()
 * would have to be used here (the same as sched_pin()).  The performance
 * penalty imposed by the membar could, then, produce slower code than
 * the function call itself, for most cases.
 */
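/*
 * Usage sketch (illustrative): critical sections nest, so td_critnest counts
 * the depth and an owed preemption is only acted on at the outermost exit:
 *
 *      critical_enter();       -- td_critnest 0 -> 1
 *      critical_enter();       -- td_critnest 1 -> 2
 *      ...                     -- no preemption while the count is non-zero
 *      critical_exit();        -- td_critnest 2 -> 1
 *      critical_exit();        -- td_critnest 1 -> 0; if td_owepreempt was
 *                                 set, critical_exit_preempt() switches here
 */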
void
critical_enter_KBI(void)
{
#ifdef KTR
        struct thread *td = curthread;
#endif
        critical_enter();
        CTR4(KTR_CRITICAL, "critical_enter by thread %p (%ld, %s) to %d", td,
            (long)td->td_proc->p_pid, td->td_name, td->td_critnest);
}

void __noinline
critical_exit_preempt(void)
{
        struct thread *td;
        int flags;

        /*
         * If td_critnest is 0, it is possible that we are going to get
         * preempted again before reaching the code below.  This happens
         * rarely and is harmless.  However, this means td_owepreempt may
         * now be unset.
         */
        td = curthread;
        if (td->td_critnest != 0)
                return;
        if (kdb_active)
                return;

        /*
         * Microoptimization: we committed to switch,
         * disable preemption in interrupt handlers
         * while spinning for the thread lock.
         */
        td->td_critnest = 1;
        thread_lock(td);
        td->td_critnest--;
        flags = SW_INVOL | SW_PREEMPT;
        if (TD_IS_IDLETHREAD(td))
                flags |= SWT_IDLE;
        else
                flags |= SWT_OWEPREEMPT;
        mi_switch(flags);
}

void
critical_exit_KBI(void)
{
#ifdef KTR
        struct thread *td = curthread;
#endif
        critical_exit();
        CTR4(KTR_CRITICAL, "critical_exit by thread %p (%ld, %s) to %d", td,
            (long)td->td_proc->p_pid, td->td_name, td->td_critnest);
}

/************************************************************************
 * SYSTEM RUN QUEUE manipulations and tests                             *
 ************************************************************************/
/*
 * Initialize a run structure.
 */
void
runq_init(struct runq *rq)
{
        int i;

        bzero(rq, sizeof *rq);
        for (i = 0; i < RQ_NQS; i++)
                TAILQ_INIT(&rq->rq_queues[i]);
}

/*
 * Clear the status bit of the queue corresponding to priority level pri,
 * indicating that it is empty.
 */
static __inline void
runq_clrbit(struct runq *rq, int pri)
{
        struct rqbits *rqb;

        rqb = &rq->rq_status;
        CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d",
            rqb->rqb_bits[RQB_WORD(pri)],
            rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri),
            RQB_BIT(pri), RQB_WORD(pri));
        rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri);
}

/*
 * Find the index of the first non-empty run queue.  This is done by
 * scanning the status bits; a set bit indicates a non-empty queue.
 */
static __inline int
runq_findbit(struct runq *rq)
{
        struct rqbits *rqb;
        int pri;
        int i;

        rqb = &rq->rq_status;
        for (i = 0; i < RQB_LEN; i++)
                if (rqb->rqb_bits[i]) {
                        pri = RQB_FFS(rqb->rqb_bits[i]) + (i << RQB_L2BPW);
                        CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d",
                            rqb->rqb_bits[i], i, pri);
                        return (pri);
                }

        return (-1);
}
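/*
 * Worked example of the bitmap indexing (illustrative; the word size is
 * machine-dependent): with RQB_BPW == 32 and RQ_NQS == 64, rq_status holds
 * RQB_LEN == 2 words.  Queue index 37 then maps to word RQB_WORD(37) ==
 * 37 >> RQB_L2BPW == 1 and bit 37 & (RQB_BPW - 1) == 5, and runq_findbit()
 * recovers it as RQB_FFS(word) + (i << RQB_L2BPW) == 5 + 32 == 37.
 */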
static __inline int
runq_findbit_from(struct runq *rq, u_char pri)
{
        struct rqbits *rqb;
        rqb_word_t mask;
        int i;

        /*
         * Set the mask for the first word so we ignore priorities
         * before 'pri'.
         */
        mask = (rqb_word_t)-1 << (pri & (RQB_BPW - 1));
        rqb = &rq->rq_status;
again:
        for (i = RQB_WORD(pri); i < RQB_LEN; mask = -1, i++) {
                mask = rqb->rqb_bits[i] & mask;
                if (mask == 0)
                        continue;
                pri = RQB_FFS(mask) + (i << RQB_L2BPW);
                CTR3(KTR_RUNQ, "runq_findbit_from: bits=%#x i=%d pri=%d",
                    mask, i, pri);
                return (pri);
        }
        if (pri == 0)
                return (-1);
        /*
         * Wrap back around to the beginning of the list just once so we
         * scan the whole thing.
         */
        pri = 0;
        goto again;
}

/*
 * Set the status bit of the queue corresponding to priority level pri,
 * indicating that it is non-empty.
 */
static __inline void
runq_setbit(struct runq *rq, int pri)
{
        struct rqbits *rqb;

        rqb = &rq->rq_status;
        CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d",
            rqb->rqb_bits[RQB_WORD(pri)],
            rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri),
            RQB_BIT(pri), RQB_WORD(pri));
        rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri);
}

/*
 * Add the thread to the queue specified by its priority, and set the
 * corresponding status bit.
 */
void
runq_add(struct runq *rq, struct thread *td, int flags)
{
        struct rqhead *rqh;
        int pri;

        pri = td->td_priority / RQ_PPQ;
        td->td_rqindex = pri;
        runq_setbit(rq, pri);
        rqh = &rq->rq_queues[pri];
        CTR4(KTR_RUNQ, "runq_add: td=%p pri=%d %d rqh=%p",
            td, td->td_priority, pri, rqh);
        if (flags & SRQ_PREEMPTED) {
                TAILQ_INSERT_HEAD(rqh, td, td_runq);
        } else {
                TAILQ_INSERT_TAIL(rqh, td, td_runq);
        }
}

void
runq_add_pri(struct runq *rq, struct thread *td, u_char pri, int flags)
{
        struct rqhead *rqh;

        KASSERT(pri < RQ_NQS, ("runq_add_pri: %d out of range", pri));
        td->td_rqindex = pri;
        runq_setbit(rq, pri);
        rqh = &rq->rq_queues[pri];
        CTR4(KTR_RUNQ, "runq_add_pri: td=%p pri=%d idx=%d rqh=%p",
            td, td->td_priority, pri, rqh);
        if (flags & SRQ_PREEMPTED) {
                TAILQ_INSERT_HEAD(rqh, td, td_runq);
        } else {
                TAILQ_INSERT_TAIL(rqh, td, td_runq);
        }
}
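/*
 * Worked example for runq_add() above (illustrative; assumes the usual
 * RQ_PPQ == 4 grouping of priorities into RQ_NQS queues): a thread with
 * td_priority == 120 is filed on queue 120 / 4 == 30.  The SRQ_PREEMPTED
 * case inserts at the head of that queue, so a preempted thread is picked
 * up again before same-priority threads that were queued at the tail.
 */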
/*
 * Return true if there are runnable processes of any priority on the run
 * queue, false otherwise.  Has no side effects, does not modify the run
 * queue structure.
 */
int
runq_check(struct runq *rq)
{
        struct rqbits *rqb;
        int i;

        rqb = &rq->rq_status;
        for (i = 0; i < RQB_LEN; i++)
                if (rqb->rqb_bits[i]) {
                        CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d",
                            rqb->rqb_bits[i], i);
                        return (1);
                }
        CTR0(KTR_RUNQ, "runq_check: empty");

        return (0);
}

/*
 * Find the highest priority process on the run queue.
 */
struct thread *
runq_choose_fuzz(struct runq *rq, int fuzz)
{
        struct rqhead *rqh;
        struct thread *td;
        int pri;

        while ((pri = runq_findbit(rq)) != -1) {
                rqh = &rq->rq_queues[pri];
                /* fuzz == 1 is normal.. 0 or less are ignored */
                if (fuzz > 1) {
                        /*
                         * In the first couple of entries, check if
                         * there is one for our CPU as a preference.
                         */
                        int count = fuzz;
                        int cpu = PCPU_GET(cpuid);
                        struct thread *td2;
                        td2 = td = TAILQ_FIRST(rqh);

                        while (count-- && td2) {
                                if (td2->td_lastcpu == cpu) {
                                        td = td2;
                                        break;
                                }
                                td2 = TAILQ_NEXT(td2, td_runq);
                        }
                } else
                        td = TAILQ_FIRST(rqh);
                KASSERT(td != NULL, ("runq_choose_fuzz: no proc on busy queue"));
                CTR3(KTR_RUNQ,
                    "runq_choose_fuzz: pri=%d thread=%p rqh=%p", pri, td, rqh);
                return (td);
        }
        CTR1(KTR_RUNQ, "runq_choose_fuzz: idleproc pri=%d", pri);

        return (NULL);
}

/*
 * Find the highest priority process on the run queue.
 */
struct thread *
runq_choose(struct runq *rq)
{
        struct rqhead *rqh;
        struct thread *td;
        int pri;

        while ((pri = runq_findbit(rq)) != -1) {
                rqh = &rq->rq_queues[pri];
                td = TAILQ_FIRST(rqh);
                KASSERT(td != NULL, ("runq_choose: no thread on busy queue"));
                CTR3(KTR_RUNQ,
                    "runq_choose: pri=%d thread=%p rqh=%p", pri, td, rqh);
                return (td);
        }
        CTR1(KTR_RUNQ, "runq_choose: idlethread pri=%d", pri);

        return (NULL);
}

struct thread *
runq_choose_from(struct runq *rq, u_char idx)
{
        struct rqhead *rqh;
        struct thread *td;
        int pri;

        if ((pri = runq_findbit_from(rq, idx)) != -1) {
                rqh = &rq->rq_queues[pri];
                td = TAILQ_FIRST(rqh);
                KASSERT(td != NULL, ("runq_choose: no thread on busy queue"));
                CTR4(KTR_RUNQ,
                    "runq_choose_from: pri=%d thread=%p idx=%d rqh=%p",
                    pri, td, td->td_rqindex, rqh);
                return (td);
        }
        CTR1(KTR_RUNQ, "runq_choose_from: idlethread pri=%d", pri);

        return (NULL);
}
/*
 * Remove the thread from the queue specified by its priority, and clear the
 * corresponding status bit if the queue becomes empty.
 * Caller must set state afterwards.
 */
void
runq_remove(struct runq *rq, struct thread *td)
{

        runq_remove_idx(rq, td, NULL);
}

void
runq_remove_idx(struct runq *rq, struct thread *td, u_char *idx)
{
        struct rqhead *rqh;
        u_char pri;

        KASSERT(td->td_flags & TDF_INMEM,
            ("runq_remove_idx: thread swapped out"));
        pri = td->td_rqindex;
        KASSERT(pri < RQ_NQS, ("runq_remove_idx: Invalid index %d\n", pri));
        rqh = &rq->rq_queues[pri];
        CTR4(KTR_RUNQ, "runq_remove_idx: td=%p, pri=%d %d rqh=%p",
            td, td->td_priority, pri, rqh);
        TAILQ_REMOVE(rqh, td, td_runq);
        if (TAILQ_EMPTY(rqh)) {
                CTR0(KTR_RUNQ, "runq_remove_idx: empty");
                runq_clrbit(rq, pri);
                if (idx != NULL && *idx == pri)
                        *idx = (pri + 1) % RQ_NQS;
        }
}
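/*
 * Consumer-side sketch (illustrative only): schedulers built on this API
 * typically pair the lookup with removal and then mark the thread running,
 * since runq_remove() itself leaves the thread state untouched:
 *
 *      td = runq_choose(rq);
 *      if (td != NULL) {
 *              runq_remove(rq, td);
 *              TD_SET_RUNNING(td);     -- caller must set state afterwards
 *      }
 */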