/*-
 * Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_sched.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <machine/cpu.h>

/* Uncomment this to enable logging of critical_enter/exit. */
#if 0
#define	KTR_CRITICAL	KTR_SCHED
#else
#define	KTR_CRITICAL	0
#endif

#ifdef FULL_PREEMPTION
#ifndef PREEMPTION
#error "The FULL_PREEMPTION option requires the PREEMPTION option"
#endif
#endif

CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS);

/*
 * kern.sched.preemption allows user space to determine if preemption support
 * is compiled in or not.  It is not currently a boot or runtime flag that
 * can be changed.
 */
#ifdef PREEMPTION
static int kern_sched_preemption = 1;
#else
static int kern_sched_preemption = 0;
#endif
SYSCTL_INT(_kern_sched, OID_AUTO, preemption, CTLFLAG_RD,
    &kern_sched_preemption, 0, "Kernel preemption enabled");

/*
 * Support for scheduler stats exported via kern.sched.stats.  All stats may
 * be reset with kern.sched.stats.reset = 1.  Stats may be defined elsewhere
 * with SCHED_STAT_DEFINE().
 */
#ifdef SCHED_STATS
SYSCTL_NODE(_kern_sched, OID_AUTO, stats, CTLFLAG_RW, 0, "switch stats");
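/*
 * Editorial note: each SCHED_STAT_DEFINE_VAR() entry below publishes one
 * switch-reason counter under kern.sched.stats.  The counters are per-CPU
 * longs; the oid's arg1 records the counter's DPCPU offset, which is how
 * sysctl_stats_reset() below locates and zeroes every CPU's instance.
 */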
/* Switch reasons from mi_switch(). */
DPCPU_DEFINE(long, sched_switch_stats[SWT_COUNT]);
SCHED_STAT_DEFINE_VAR(uncategorized,
    &DPCPU_NAME(sched_switch_stats[SWT_NONE]), "");
SCHED_STAT_DEFINE_VAR(preempt,
    &DPCPU_NAME(sched_switch_stats[SWT_PREEMPT]), "");
SCHED_STAT_DEFINE_VAR(owepreempt,
    &DPCPU_NAME(sched_switch_stats[SWT_OWEPREEMPT]), "");
SCHED_STAT_DEFINE_VAR(turnstile,
    &DPCPU_NAME(sched_switch_stats[SWT_TURNSTILE]), "");
SCHED_STAT_DEFINE_VAR(sleepq,
    &DPCPU_NAME(sched_switch_stats[SWT_SLEEPQ]), "");
SCHED_STAT_DEFINE_VAR(relinquish,
    &DPCPU_NAME(sched_switch_stats[SWT_RELINQUISH]), "");
SCHED_STAT_DEFINE_VAR(needresched,
    &DPCPU_NAME(sched_switch_stats[SWT_NEEDRESCHED]), "");
SCHED_STAT_DEFINE_VAR(idle,
    &DPCPU_NAME(sched_switch_stats[SWT_IDLE]), "");
SCHED_STAT_DEFINE_VAR(iwait,
    &DPCPU_NAME(sched_switch_stats[SWT_IWAIT]), "");
SCHED_STAT_DEFINE_VAR(suspend,
    &DPCPU_NAME(sched_switch_stats[SWT_SUSPEND]), "");
SCHED_STAT_DEFINE_VAR(remotepreempt,
    &DPCPU_NAME(sched_switch_stats[SWT_REMOTEPREEMPT]), "");
SCHED_STAT_DEFINE_VAR(remotewakeidle,
    &DPCPU_NAME(sched_switch_stats[SWT_REMOTEWAKEIDLE]), "");

static int
sysctl_stats_reset(SYSCTL_HANDLER_ARGS)
{
	struct sysctl_oid *p;
	uintptr_t counter;
	int error;
	int val;
	int i;

	val = 0;
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (val == 0)
		return (0);
	/*
	 * Traverse the list of children of _kern_sched_stats and reset each
	 * to 0.  Skip the reset entry.  Each child's arg1 holds the DPCPU
	 * offset of its per-CPU counter.
	 */
	SLIST_FOREACH(p, oidp->oid_parent, oid_link) {
		if (p == oidp || p->oid_arg1 == NULL)
			continue;
		counter = (uintptr_t)p->oid_arg1;
		CPU_FOREACH(i) {
			*(long *)(dpcpu_off[i] + counter) = 0;
		}
	}
	return (0);
}

SYSCTL_PROC(_kern_sched_stats, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_WR, NULL,
    0, sysctl_stats_reset, "I", "Reset scheduler statistics");
#endif

/************************************************************************
 * Functions that manipulate runnability from a thread perspective.	*
 ************************************************************************/
/*
 * Select the thread that will be run next.
 */
struct thread *
choosethread(void)
{
	struct thread *td;

retry:
	td = sched_choose();

	/*
	 * If we are panicking, only allow system threads and the thread we
	 * are already running in to be run.
	 */
	if (panicstr && ((td->td_proc->p_flag & P_SYSTEM) == 0 &&
	    (td->td_flags & TDF_INPANIC) == 0)) {
		/* note that it is no longer on the run queue */
		TD_SET_CAN_RUN(td);
		goto retry;
	}

	TD_SET_RUNNING(td);
	return (td);
}

/*
 * Kernel thread preemption implementation.  Critical sections mark
 * regions of code in which preemption is not allowed.
 *
 * It might seem a good idea to inline critical_enter(), but in order
 * to prevent instruction reordering by the compiler a __compiler_membar()
 * would have to be used here (the same as in sched_pin()).  The performance
 * penalty imposed by the membar could then, in most cases, produce slower
 * code than the function call itself.
 */
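/*
 * Illustrative sketch (not part of the original file): a typical caller
 * uses a critical section to keep from being preempted, e.g. while
 * manipulating per-CPU state:
 *
 *	critical_enter();
 *	... operate on PCPU data; no preemption can occur here ...
 *	critical_exit();
 *
 * Sections nest via td_critnest; a preemption deferred via td_owepreempt
 * is performed when the outermost section exits below.
 */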
void
critical_enter(void)
{
	struct thread *td;

	td = curthread;
	td->td_critnest++;
	CTR4(KTR_CRITICAL, "critical_enter by thread %p (%ld, %s) to %d", td,
	    (long)td->td_proc->p_pid, td->td_name, td->td_critnest);
}

void
critical_exit(void)
{
	struct thread *td;
	int flags;

	td = curthread;
	KASSERT(td->td_critnest != 0,
	    ("critical_exit: td_critnest == 0"));

	if (td->td_critnest == 1) {
		td->td_critnest = 0;
		if (td->td_owepreempt && !kdb_active) {
			td->td_critnest = 1;
			thread_lock(td);
			td->td_critnest--;
			flags = SW_INVOL | SW_PREEMPT;
			if (TD_IS_IDLETHREAD(td))
				flags |= SWT_IDLE;
			else
				flags |= SWT_OWEPREEMPT;
			mi_switch(flags, NULL);
			thread_unlock(td);
		}
	} else
		td->td_critnest--;

	CTR4(KTR_CRITICAL, "critical_exit by thread %p (%ld, %s) to %d", td,
	    (long)td->td_proc->p_pid, td->td_name, td->td_critnest);
}

/************************************************************************
 * SYSTEM RUN QUEUE manipulations and tests				*
 ************************************************************************/
/*
 * Initialize a run structure.
 */
void
runq_init(struct runq *rq)
{
	int i;

	bzero(rq, sizeof *rq);
	for (i = 0; i < RQ_NQS; i++)
		TAILQ_INIT(&rq->rq_queues[i]);
}

/*
 * Clear the status bit of the queue corresponding to priority level pri,
 * indicating that it is empty.
 */
static __inline void
runq_clrbit(struct runq *rq, int pri)
{
	struct rqbits *rqb;

	rqb = &rq->rq_status;
	CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d",
	    rqb->rqb_bits[RQB_WORD(pri)],
	    rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri),
	    RQB_BIT(pri), RQB_WORD(pri));
	rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri);
}

/*
 * Find the index of the first non-empty run queue.  This is done by
 * scanning the status bits, where a set bit indicates a non-empty queue.
 */
static __inline int
runq_findbit(struct runq *rq)
{
	struct rqbits *rqb;
	int pri;
	int i;

	rqb = &rq->rq_status;
	for (i = 0; i < RQB_LEN; i++)
		if (rqb->rqb_bits[i]) {
			pri = RQB_FFS(rqb->rqb_bits[i]) + (i << RQB_L2BPW);
			CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d",
			    rqb->rqb_bits[i], i, pri);
			return (pri);
		}

	return (-1);
}

/*
 * Find the index of the first non-empty run queue at or after priority
 * level 'pri', wrapping around to the start of the bitmap at most once.
 */
static __inline int
runq_findbit_from(struct runq *rq, u_char pri)
{
	struct rqbits *rqb;
	rqb_word_t mask;
	int i;

	/*
	 * Set the mask for the first word so we ignore priorities
	 * before 'pri'.
	 */
	mask = (rqb_word_t)-1 << (pri & (RQB_BPW - 1));
	rqb = &rq->rq_status;
again:
	for (i = RQB_WORD(pri); i < RQB_LEN; mask = -1, i++) {
		mask = rqb->rqb_bits[i] & mask;
		if (mask == 0)
			continue;
		pri = RQB_FFS(mask) + (i << RQB_L2BPW);
		CTR3(KTR_RUNQ, "runq_findbit_from: bits=%#x i=%d pri=%d",
		    mask, i, pri);
		return (pri);
	}
	if (pri == 0)
		return (-1);
	/*
	 * Wrap back around to the beginning of the list just once so we
	 * scan the whole thing.
	 */
	pri = 0;
	goto again;
}
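/*
 * Illustrative note (assuming RQB_BPW == 32, i.e. RQB_L2BPW == 5; the
 * actual word width is machine-dependent): priority level 70 maps to word
 * RQB_WORD(70) == 2 and bit RQB_BIT(70) == 1 << 6, since 70 == 2 * 32 + 6.
 * RQB_FFS() returns the index of the lowest set bit of a word, so the scans
 * above always yield the lowest-numbered, i.e. highest priority, non-empty
 * queue.
 */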
/*
 * Set the status bit of the queue corresponding to priority level pri,
 * indicating that it is non-empty.
 */
static __inline void
runq_setbit(struct runq *rq, int pri)
{
	struct rqbits *rqb;

	rqb = &rq->rq_status;
	CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d",
	    rqb->rqb_bits[RQB_WORD(pri)],
	    rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri),
	    RQB_BIT(pri), RQB_WORD(pri));
	rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri);
}

/*
 * Add the thread to the queue specified by its priority, and set the
 * corresponding status bit.
 */
void
runq_add(struct runq *rq, struct thread *td, int flags)
{
	struct rqhead *rqh;
	int pri;

	pri = td->td_priority / RQ_PPQ;
	td->td_rqindex = pri;
	runq_setbit(rq, pri);
	rqh = &rq->rq_queues[pri];
	CTR4(KTR_RUNQ, "runq_add: td=%p pri=%d %d rqh=%p",
	    td, td->td_priority, pri, rqh);
	if (flags & SRQ_PREEMPTED) {
		TAILQ_INSERT_HEAD(rqh, td, td_runq);
	} else {
		TAILQ_INSERT_TAIL(rqh, td, td_runq);
	}
}

/*
 * Add the thread to the queue at the directly specified index 'pri'
 * rather than one derived from td_priority, and set the corresponding
 * status bit.
 */
void
runq_add_pri(struct runq *rq, struct thread *td, u_char pri, int flags)
{
	struct rqhead *rqh;

	KASSERT(pri < RQ_NQS, ("runq_add_pri: %d out of range", pri));
	td->td_rqindex = pri;
	runq_setbit(rq, pri);
	rqh = &rq->rq_queues[pri];
	CTR4(KTR_RUNQ, "runq_add_pri: td=%p pri=%d idx=%d rqh=%p",
	    td, td->td_priority, pri, rqh);
	if (flags & SRQ_PREEMPTED) {
		TAILQ_INSERT_HEAD(rqh, td, td_runq);
	} else {
		TAILQ_INSERT_TAIL(rqh, td, td_runq);
	}
}

/*
 * Return true if there are runnable threads of any priority on the run
 * queue, false otherwise.  Has no side effects and does not modify the
 * run queue structure.
 */
int
runq_check(struct runq *rq)
{
	struct rqbits *rqb;
	int i;

	rqb = &rq->rq_status;
	for (i = 0; i < RQB_LEN; i++)
		if (rqb->rqb_bits[i]) {
			CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d",
			    rqb->rqb_bits[i], i);
			return (1);
		}
	CTR0(KTR_RUNQ, "runq_check: empty");

	return (0);
}

/*
 * Find the highest priority thread on the run queue, with a fuzzy
 * preference for one that last ran on this CPU.
 */
struct thread *
runq_choose_fuzz(struct runq *rq, int fuzz)
{
	struct rqhead *rqh;
	struct thread *td;
	int pri;

	while ((pri = runq_findbit(rq)) != -1) {
		rqh = &rq->rq_queues[pri];
		/* fuzz == 1 is normal; 0 or less are ignored. */
		if (fuzz > 1) {
			/*
			 * In the first couple of entries, check if
			 * there is one for our CPU as a preference.
			 */
			int count = fuzz;
			int cpu = PCPU_GET(cpuid);
			struct thread *td2;
			td2 = td = TAILQ_FIRST(rqh);

			while (count-- && td2) {
				if (td2->td_lastcpu == cpu) {
					td = td2;
					break;
				}
				td2 = TAILQ_NEXT(td2, td_runq);
			}
		} else
			td = TAILQ_FIRST(rqh);
		KASSERT(td != NULL, ("runq_choose_fuzz: no proc on busy queue"));
		CTR3(KTR_RUNQ,
		    "runq_choose_fuzz: pri=%d thread=%p rqh=%p", pri, td, rqh);
		return (td);
	}
	CTR1(KTR_RUNQ, "runq_choose_fuzz: idleproc pri=%d", pri);

	return (NULL);
}
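/*
 * Illustrative note (not in the original): with fuzz == 3, up to the first
 * three threads on the highest priority non-empty queue are inspected and
 * one whose td_lastcpu matches the current CPU is preferred, trading strict
 * FIFO order within the queue for likely cache affinity.
 */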
/*
 * Find the highest priority thread on the run queue.
 */
struct thread *
runq_choose(struct runq *rq)
{
	struct rqhead *rqh;
	struct thread *td;
	int pri;

	while ((pri = runq_findbit(rq)) != -1) {
		rqh = &rq->rq_queues[pri];
		td = TAILQ_FIRST(rqh);
		KASSERT(td != NULL, ("runq_choose: no thread on busy queue"));
		CTR3(KTR_RUNQ,
		    "runq_choose: pri=%d thread=%p rqh=%p", pri, td, rqh);
		return (td);
	}
	CTR1(KTR_RUNQ, "runq_choose: idlethread pri=%d", pri);

	return (NULL);
}

/*
 * Find the highest priority thread on the run queue, starting the search
 * at queue index 'idx' and wrapping around if necessary.
 */
struct thread *
runq_choose_from(struct runq *rq, u_char idx)
{
	struct rqhead *rqh;
	struct thread *td;
	int pri;

	if ((pri = runq_findbit_from(rq, idx)) != -1) {
		rqh = &rq->rq_queues[pri];
		td = TAILQ_FIRST(rqh);
		KASSERT(td != NULL,
		    ("runq_choose_from: no thread on busy queue"));
		CTR4(KTR_RUNQ,
		    "runq_choose_from: pri=%d thread=%p idx=%d rqh=%p",
		    pri, td, td->td_rqindex, rqh);
		return (td);
	}
	CTR1(KTR_RUNQ, "runq_choose_from: idlethread pri=%d", pri);

	return (NULL);
}

/*
 * Remove the thread from the queue specified by its priority, and clear the
 * corresponding status bit if the queue becomes empty.
 * The caller must set the thread's state afterwards.
 */
void
runq_remove(struct runq *rq, struct thread *td)
{

	runq_remove_idx(rq, td, NULL);
}

/*
 * As runq_remove(), but if 'idx' is non-NULL and points at the queue the
 * thread was removed from, advance it past the now-empty queue so a
 * circular scan does not revisit it.
 */
void
runq_remove_idx(struct runq *rq, struct thread *td, u_char *idx)
{
	struct rqhead *rqh;
	u_char pri;

	KASSERT(td->td_flags & TDF_INMEM,
	    ("runq_remove_idx: thread swapped out"));
	pri = td->td_rqindex;
	KASSERT(pri < RQ_NQS, ("runq_remove_idx: Invalid index %d\n", pri));
	rqh = &rq->rq_queues[pri];
	CTR4(KTR_RUNQ, "runq_remove_idx: td=%p, pri=%d %d rqh=%p",
	    td, td->td_priority, pri, rqh);
	TAILQ_REMOVE(rqh, td, td_runq);
	if (TAILQ_EMPTY(rqh)) {
		CTR0(KTR_RUNQ, "runq_remove_idx: empty");
		runq_clrbit(rq, pri);
		if (idx != NULL && *idx == pri)
			*idx = (pri + 1) % RQ_NQS;
	}
}
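/*
 * Illustrative usage sketch (not part of the original file; 'some_td' is a
 * placeholder): a scheduler built on this API typically drives the cycle
 * below with the relevant run queue lock held:
 *
 *	struct runq rq;
 *	struct thread *td;
 *
 *	runq_init(&rq);
 *	runq_add(&rq, some_td, 0);	// queued by td_priority / RQ_PPQ
 *	td = runq_choose(&rq);		// highest priority candidate
 *	if (td != NULL)
 *		runq_remove(&rq, td);	// caller then sets td's state
 */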