1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2002-2007, Jeffrey Roberson <jeff@freebsd.org> 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice unmodified, this list of conditions, and the following 12 * disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 /* 30 * This file implements the ULE scheduler. ULE supports independent CPU 31 * run queues and fine grain locking. It has superior interactive 32 * performance under load even on uni-processor systems. 33 * 34 * etymology: 35 * ULE is the last three letters in schedule. It owes its name to a 36 * generic user created for a scheduling system by Paul Mikesell at 37 * Isilon Systems and a general lack of creativity on the part of the author. 38 */ 39 40 #include <sys/cdefs.h> 41 #include "opt_hwpmc_hooks.h" 42 #include "opt_hwt_hooks.h" 43 #include "opt_sched.h" 44 45 #include <sys/param.h> 46 #include <sys/systm.h> 47 #include <sys/kdb.h> 48 #include <sys/kernel.h> 49 #include <sys/ktr.h> 50 #include <sys/limits.h> 51 #include <sys/lock.h> 52 #include <sys/mutex.h> 53 #include <sys/proc.h> 54 #include <sys/resource.h> 55 #include <sys/resourcevar.h> 56 #include <sys/runq.h> 57 #include <sys/sched.h> 58 #include <sys/sdt.h> 59 #include <sys/smp.h> 60 #include <sys/sx.h> 61 #include <sys/sysctl.h> 62 #include <sys/sysproto.h> 63 #include <sys/turnstile.h> 64 #include <sys/umtxvar.h> 65 #include <sys/vmmeter.h> 66 #include <sys/cpuset.h> 67 #include <sys/sbuf.h> 68 69 #ifdef HWPMC_HOOKS 70 #include <sys/pmckern.h> 71 #endif 72 73 #ifdef HWT_HOOKS 74 #include <dev/hwt/hwt_hook.h> 75 #endif 76 77 #ifdef KDTRACE_HOOKS 78 #include <sys/dtrace_bsd.h> 79 int __read_mostly dtrace_vtime_active; 80 dtrace_vtime_switch_func_t dtrace_vtime_switch_func; 81 #endif 82 83 #include <machine/cpu.h> 84 #include <machine/smp.h> 85 86 #define KTR_ULE 0 87 88 #define TS_NAME_LEN (MAXCOMLEN + sizeof(" td ") + sizeof(__XSTRING(UINT_MAX))) 89 #define TDQ_NAME_LEN (sizeof("sched lock ") + sizeof(__XSTRING(MAXCPU))) 90 #define TDQ_LOADNAME_LEN (sizeof("CPU ") + sizeof(__XSTRING(MAXCPU)) - 1 + sizeof(" load")) 91 92 /* 93 * Thread scheduler specific section. All fields are protected 94 * by the thread lock. 95 */ 96 struct td_sched { 97 short ts_flags; /* TSF_* flags. */ 98 int ts_cpu; /* CPU we are on, or were last on. */ 99 u_int ts_rltick; /* Real last tick, for affinity. 
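					   Compared against "ticks" by
					   SCHED_AFFINITY() to tell whether
					   cache affinity has expired.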
*/ 100 u_int ts_slice; /* Ticks of slice remaining. */ 101 u_int ts_ftick; /* %CPU window's first tick */ 102 u_int ts_ltick; /* %CPU window's last tick */ 103 /* All ticks count below are stored shifted by SCHED_TICK_SHIFT. */ 104 u_int ts_slptime; /* Number of ticks we vol. slept */ 105 u_int ts_runtime; /* Number of ticks we were running */ 106 u_int ts_ticks; /* pctcpu window's running tick count */ 107 #ifdef KTR 108 char ts_name[TS_NAME_LEN]; 109 #endif 110 }; 111 /* flags kept in ts_flags */ 112 #define TSF_BOUND 0x0001 /* Thread can not migrate. */ 113 #define TSF_XFERABLE 0x0002 /* Thread was added as transferable. */ 114 115 #define THREAD_CAN_MIGRATE(td) ((td)->td_pinned == 0) 116 #define THREAD_CAN_SCHED(td, cpu) \ 117 CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask) 118 119 _Static_assert(sizeof(struct thread) + sizeof(struct td_sched) <= 120 sizeof(struct thread0_storage), 121 "increase struct thread0_storage.t0st_sched size"); 122 123 /* 124 * Priority ranges used for interactive and non-interactive timeshare 125 * threads. The timeshare priorities are split up into four ranges. 126 * The first range handles interactive threads. The last three ranges 127 * (NHALF, x, and NHALF) handle non-interactive threads with the outer 128 * ranges supporting nice values. 129 */ 130 #define PRI_TIMESHARE_RANGE (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1) 131 #define PRI_INTERACT_RANGE ((PRI_TIMESHARE_RANGE - SCHED_PRI_NRESV) / 2) 132 #define PRI_BATCH_RANGE (PRI_TIMESHARE_RANGE - PRI_INTERACT_RANGE) 133 134 #define PRI_MIN_INTERACT PRI_MIN_TIMESHARE 135 #define PRI_MAX_INTERACT (PRI_MIN_TIMESHARE + PRI_INTERACT_RANGE - 1) 136 #define PRI_MIN_BATCH (PRI_MIN_TIMESHARE + PRI_INTERACT_RANGE) 137 #define PRI_MAX_BATCH PRI_MAX_TIMESHARE 138 139 /* 140 * These macros determine priorities for non-interactive threads. They are 141 * assigned a priority based on their recent cpu utilization as expressed 142 * by the ratio of ticks to the tick total. NHALF priorities at the start 143 * and end of the MIN to MAX timeshare range are only reachable with negative 144 * or positive nice respectively. 145 * 146 * CPU_RANGE: Length of range for priorities computed from CPU use. 147 * NICE: Priority offset due to the nice value. 148 * 5/4 is to preserve historical nice effect on computation ratios. 149 * NRESV: Number of priority levels reserved to account for nice values. 150 */ 151 #define SCHED_PRI_CPU_RANGE (PRI_BATCH_RANGE - SCHED_PRI_NRESV) 152 #define SCHED_PRI_NICE(nice) (((nice) - PRIO_MIN) * 5 / 4) 153 #define SCHED_PRI_NRESV SCHED_PRI_NICE(PRIO_MAX) 154 155 /* 156 * Runqueue indices for the implemented scheduling policies' priority bounds. 157 * 158 * In ULE's implementation, realtime policy covers the ITHD, REALTIME and 159 * INTERACT (see above) ranges, timesharing the BATCH range (see above), and 160 * idle policy the IDLE range. 161 * 162 * Priorities from these ranges must not be assigned to the same runqueue's 163 * queue. 
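 *
 * Concretely, the blocks of queue indices defined below for the realtime,
 * timeshare and idle policies must not share a boundary queue; the
 * _Static_asserts that follow check exactly that for adjacent blocks.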
164 */ 165 #define RQ_RT_POL_MIN (RQ_PRI_TO_QUEUE_IDX(PRI_MIN_ITHD)) 166 #define RQ_RT_POL_MAX (RQ_PRI_TO_QUEUE_IDX(PRI_MAX_INTERACT)) 167 #define RQ_TS_POL_MIN (RQ_PRI_TO_QUEUE_IDX(PRI_MIN_BATCH)) 168 #define RQ_TS_POL_MAX (RQ_PRI_TO_QUEUE_IDX(PRI_MAX_BATCH)) 169 #define RQ_ID_POL_MIN (RQ_PRI_TO_QUEUE_IDX(PRI_MIN_IDLE)) 170 #define RQ_ID_POL_MAX (RQ_PRI_TO_QUEUE_IDX(PRI_MAX_IDLE)) 171 172 _Static_assert(RQ_RT_POL_MAX != RQ_TS_POL_MIN, 173 "ULE's realtime and timeshare policies' runqueue ranges overlap"); 174 _Static_assert(RQ_TS_POL_MAX != RQ_ID_POL_MIN, 175 "ULE's timeshare and idle policies' runqueue ranges overlap"); 176 177 /* Helper to treat the timeshare range as a circular group of queues. */ 178 #define RQ_TS_POL_MODULO (RQ_TS_POL_MAX - RQ_TS_POL_MIN + 1) 179 180 /* 181 * Cpu percentage computation macros and defines. 182 * 183 * SCHED_TICK_SECS: Max number of seconds to average the cpu usage across. 184 * Must be at most 20 to avoid overflow in sched_pctcpu()'s current formula. 185 * SCHED_TICK_MAX: Max number of hz ticks matching SCHED_TICK_SECS. 186 * SCHED_TICK_SHIFT: Shift factor to avoid rounding away results. 187 * SCHED_TICK_RUN_SHIFTED: Number of shifted ticks running in last window. 188 * SCHED_TICK_LENGTH: Length of last window in shifted ticks or 1 if empty. 189 * SCHED_CPU_DECAY_NUMER: Numerator of %CPU decay factor. 190 * SCHED_CPU_DECAY_DENOM: Denominator of %CPU decay factor. 191 */ 192 #define SCHED_TICK_SECS 11 193 #define SCHED_TICK_MAX(hz) ((hz) * SCHED_TICK_SECS) 194 #define SCHED_TICK_SHIFT 10 195 #define SCHED_TICK_RUN_SHIFTED(ts) ((ts)->ts_ticks) 196 #define SCHED_TICK_LENGTH(ts) (max((ts)->ts_ltick - (ts)->ts_ftick, 1)) 197 #define SCHED_CPU_DECAY_NUMER 10 198 #define SCHED_CPU_DECAY_DENOM 11 199 _Static_assert(SCHED_CPU_DECAY_NUMER >= 0 && SCHED_CPU_DECAY_DENOM > 0 && 200 SCHED_CPU_DECAY_NUMER <= SCHED_CPU_DECAY_DENOM, 201 "Inconsistent values for SCHED_CPU_DECAY_NUMER and/or " 202 "SCHED_CPU_DECAY_DENOM"); 203 204 /* 205 * These determine the interactivity of a process. Interactivity differs from 206 * cpu utilization in that it expresses the voluntary time slept vs time ran 207 * while cpu utilization includes all time not running. This more accurately 208 * models the intent of the thread. 209 * 210 * SLP_RUN_MAX: Maximum amount of sleep time + run time we'll accumulate 211 * before throttling back. 212 * SLP_RUN_FORK: Maximum slp+run time to inherit at fork time. 213 * INTERACT_MAX: Maximum interactivity value. Smaller is better. 214 * INTERACT_THRESH: Threshold for placement on the current runq. 215 */ 216 #define SCHED_SLP_RUN_MAX ((hz * 5) << SCHED_TICK_SHIFT) 217 #define SCHED_SLP_RUN_FORK ((hz / 2) << SCHED_TICK_SHIFT) 218 #define SCHED_INTERACT_MAX (100) 219 #define SCHED_INTERACT_HALF (SCHED_INTERACT_MAX / 2) 220 #define SCHED_INTERACT_THRESH (30) 221 222 /* 223 * These parameters determine the slice behavior for batch work. 224 */ 225 #define SCHED_SLICE_DEFAULT_DIVISOR 10 /* ~94 ms, 12 stathz ticks. */ 226 #define SCHED_SLICE_MIN_DIVISOR 6 /* DEFAULT/MIN = ~16 ms. */ 227 228 /* Flags kept in td_flags. */ 229 #define TDF_PICKCPU TDF_SCHED0 /* Thread should pick new CPU. */ 230 #define TDF_SLICEEND TDF_SCHED2 /* Thread time slice is over. */ 231 232 /* 233 * tickincr: Converts a stathz tick into a hz domain scaled by 234 * the shift factor. Without the shift the error rate 235 * due to rounding would be unacceptably high. 236 * realstathz: stathz is sometimes 0 and run off of hz. 237 * sched_slice: Runtime of each thread before rescheduling. 
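 *			With the defaults (stathz = 127, divisor 10) this is
 *			12 stathz ticks, roughly 94 ms; sched_slice_min is a
 *			sixth of that, roughly 16 ms.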
238 * preempt_thresh: Priority threshold for preemption and remote IPIs. 239 */ 240 static u_int __read_mostly sched_interact = SCHED_INTERACT_THRESH; 241 static int __read_mostly tickincr = 8 << SCHED_TICK_SHIFT; 242 static int __read_mostly realstathz = 127; /* reset during boot. */ 243 static int __read_mostly sched_slice = 10; /* reset during boot. */ 244 static int __read_mostly sched_slice_min = 1; /* reset during boot. */ 245 #ifdef PREEMPTION 246 #ifdef FULL_PREEMPTION 247 static int __read_mostly preempt_thresh = PRI_MAX_IDLE; 248 #else 249 static int __read_mostly preempt_thresh = PRI_MIN_KERN; 250 #endif 251 #else 252 static int __read_mostly preempt_thresh = 0; 253 #endif 254 static int __read_mostly static_boost = PRI_MIN_BATCH; 255 static int __read_mostly sched_idlespins = 10000; 256 static int __read_mostly sched_idlespinthresh = -1; 257 258 /* 259 * tdq - per processor runqs and statistics. A mutex synchronizes access to 260 * most fields. Some fields are loaded or modified without the mutex. 261 * 262 * Locking protocols: 263 * (c) constant after initialization 264 * (f) flag, set with the tdq lock held, cleared on local CPU 265 * (l) all accesses are CPU-local 266 * (ls) stores are performed by the local CPU, loads may be lockless 267 * (t) all accesses are protected by the tdq mutex 268 * (ts) stores are serialized by the tdq mutex, loads may be lockless 269 */ 270 struct tdq { 271 /* 272 * Ordered to improve efficiency of cpu_search() and switch(). 273 * tdq_lock is padded to avoid false sharing with tdq_load and 274 * tdq_cpu_idle. 275 */ 276 struct mtx_padalign tdq_lock; /* run queue lock. */ 277 struct cpu_group *tdq_cg; /* (c) Pointer to cpu topology. */ 278 struct thread *tdq_curthread; /* (t) Current executing thread. */ 279 int tdq_load; /* (ts) Aggregate load. */ 280 int tdq_sysload; /* (ts) For loadavg, !ITHD load. */ 281 int tdq_cpu_idle; /* (ls) cpu_idle() is active. */ 282 int tdq_transferable; /* (ts) Transferable thread count. */ 283 short tdq_switchcnt; /* (l) Switches this tick. */ 284 short tdq_oldswitchcnt; /* (l) Switches last tick. */ 285 u_char tdq_lowpri; /* (ts) Lowest priority thread. */ 286 u_char tdq_owepreempt; /* (f) Remote preemption pending. */ 287 u_char tdq_ts_off; /* (t) TS insertion offset. */ 288 u_char tdq_ts_deq_off; /* (t) TS dequeue offset. */ 289 /* 290 * (t) Number of (stathz) ticks since last offset incrementation 291 * correction. 292 */ 293 u_char tdq_ts_ticks; 294 int tdq_id; /* (c) cpuid. */ 295 struct runq tdq_runq; /* (t) Run queue. */ 296 char tdq_name[TDQ_NAME_LEN]; 297 #ifdef KTR 298 char tdq_loadname[TDQ_LOADNAME_LEN]; 299 #endif 300 }; 301 302 /* Idle thread states and config. */ 303 #define TDQ_RUNNING 1 304 #define TDQ_IDLE 2 305 306 /* Lockless accessors. */ 307 #define TDQ_LOAD(tdq) atomic_load_int(&(tdq)->tdq_load) 308 #define TDQ_TRANSFERABLE(tdq) atomic_load_int(&(tdq)->tdq_transferable) 309 #define TDQ_SWITCHCNT(tdq) (atomic_load_short(&(tdq)->tdq_switchcnt) + \ 310 atomic_load_short(&(tdq)->tdq_oldswitchcnt)) 311 #define TDQ_SWITCHCNT_INC(tdq) (atomic_store_short(&(tdq)->tdq_switchcnt, \ 312 atomic_load_short(&(tdq)->tdq_switchcnt) + 1)) 313 314 #ifdef SMP 315 struct cpu_group __read_mostly *cpu_top; /* CPU topology */ 316 317 #define SCHED_AFFINITY_DEFAULT (max(1, hz / 1000)) 318 /* 319 * This inequality has to be written with a positive difference of ticks to 320 * correctly handle wraparound. 
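 *
 * E.g., once "ticks" has wrapped around while ts_rltick still holds a value
 * near UINT_MAX, the unsigned difference (u_int)ticks - (ts)->ts_rltick is
 * still the small number of ticks that actually elapsed, whereas comparing
 * the two values directly would give the wrong answer.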
321 */ 322 #define SCHED_AFFINITY(ts, t) ((u_int)ticks - (ts)->ts_rltick < (t) * affinity) 323 324 /* 325 * Run-time tunables. 326 */ 327 static int rebalance = 1; 328 static int balance_interval = 128; /* Default set in sched_initticks(). */ 329 static int __read_mostly affinity; 330 static int __read_mostly steal_idle = 1; 331 static int __read_mostly steal_thresh = 2; 332 static int __read_mostly always_steal = 0; 333 static int __read_mostly trysteal_limit = 2; 334 335 /* 336 * One thread queue per processor. 337 */ 338 static struct tdq __read_mostly *balance_tdq; 339 static int balance_ticks; 340 DPCPU_DEFINE_STATIC(struct tdq, tdq); 341 DPCPU_DEFINE_STATIC(uint32_t, randomval); 342 343 #define TDQ_SELF() ((struct tdq *)PCPU_GET(sched)) 344 #define TDQ_CPU(x) (DPCPU_ID_PTR((x), tdq)) 345 #define TDQ_ID(x) ((x)->tdq_id) 346 #else /* !SMP */ 347 static struct tdq tdq_cpu; 348 349 #define TDQ_ID(x) (0) 350 #define TDQ_SELF() (&tdq_cpu) 351 #define TDQ_CPU(x) (&tdq_cpu) 352 #endif 353 354 #define TDQ_LOCK_ASSERT(t, type) mtx_assert(TDQ_LOCKPTR((t)), (type)) 355 #define TDQ_LOCK(t) mtx_lock_spin(TDQ_LOCKPTR((t))) 356 #define TDQ_LOCK_FLAGS(t, f) mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f)) 357 #define TDQ_TRYLOCK(t) mtx_trylock_spin(TDQ_LOCKPTR((t))) 358 #define TDQ_TRYLOCK_FLAGS(t, f) mtx_trylock_spin_flags(TDQ_LOCKPTR((t)), (f)) 359 #define TDQ_UNLOCK(t) mtx_unlock_spin(TDQ_LOCKPTR((t))) 360 #define TDQ_LOCKPTR(t) ((struct mtx *)(&(t)->tdq_lock)) 361 362 static void sched_setpreempt(int); 363 static void sched_priority(struct thread *); 364 static void sched_thread_priority(struct thread *, u_char); 365 static int sched_interact_score(struct thread *); 366 static void sched_interact_update(struct thread *); 367 static void sched_interact_fork(struct thread *); 368 static void sched_pctcpu_update(struct td_sched *, int); 369 370 /* Operations on per processor queues */ 371 static inline struct thread *runq_choose_realtime(struct runq *const rq); 372 static inline struct thread *runq_choose_timeshare(struct runq *const rq, 373 int off); 374 static inline struct thread *runq_choose_idle(struct runq *const rq); 375 static struct thread *tdq_choose(struct tdq *); 376 377 static void tdq_setup(struct tdq *, int i); 378 static void tdq_load_add(struct tdq *, struct thread *); 379 static void tdq_load_rem(struct tdq *, struct thread *); 380 static inline void tdq_runq_add(struct tdq *, struct thread *, int); 381 static inline void tdq_advance_ts_deq_off(struct tdq *, bool); 382 static inline void tdq_runq_rem(struct tdq *, struct thread *); 383 static inline int sched_shouldpreempt(int, int, int); 384 static void tdq_print(int cpu); 385 static void runq_print(struct runq *rq); 386 static int tdq_add(struct tdq *, struct thread *, int); 387 #ifdef SMP 388 static int tdq_move(struct tdq *, struct tdq *); 389 static int tdq_idled(struct tdq *); 390 static void tdq_notify(struct tdq *, int lowpri); 391 392 static bool runq_steal_pred(const int idx, struct rq_queue *const q, 393 void *const data); 394 static inline struct thread *runq_steal_range(struct runq *const rq, 395 const int lvl_min, const int lvl_max, int cpu); 396 static inline struct thread *runq_steal_realtime(struct runq *const rq, 397 int cpu); 398 static inline struct thread *runq_steal_timeshare(struct runq *const rq, 399 int cpu, int off); 400 static inline struct thread *runq_steal_idle(struct runq *const rq, 401 int cpu); 402 static struct thread *tdq_steal(struct tdq *, int); 403 404 static int sched_pickcpu(struct thread *, int); 
405 static void sched_balance(void); 406 static bool sched_balance_pair(struct tdq *, struct tdq *); 407 static inline struct tdq *sched_setcpu(struct thread *, int, int); 408 static inline void thread_unblock_switch(struct thread *, struct mtx *); 409 static int sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS); 410 static int sysctl_kern_sched_topology_spec_internal(struct sbuf *sb, 411 struct cpu_group *cg, int indent); 412 #endif 413 414 static void sched_setup(void *dummy); 415 SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL); 416 417 static void sched_initticks(void *dummy); 418 SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, 419 NULL); 420 421 SDT_PROVIDER_DEFINE(sched); 422 423 SDT_PROBE_DEFINE3(sched, , , change__pri, "struct thread *", 424 "struct proc *", "uint8_t"); 425 SDT_PROBE_DEFINE3(sched, , , dequeue, "struct thread *", 426 "struct proc *", "void *"); 427 SDT_PROBE_DEFINE4(sched, , , enqueue, "struct thread *", 428 "struct proc *", "void *", "int"); 429 SDT_PROBE_DEFINE4(sched, , , lend__pri, "struct thread *", 430 "struct proc *", "uint8_t", "struct thread *"); 431 SDT_PROBE_DEFINE2(sched, , , load__change, "int", "int"); 432 SDT_PROBE_DEFINE2(sched, , , off__cpu, "struct thread *", 433 "struct proc *"); 434 SDT_PROBE_DEFINE(sched, , , on__cpu); 435 SDT_PROBE_DEFINE(sched, , , remain__cpu); 436 SDT_PROBE_DEFINE2(sched, , , surrender, "struct thread *", 437 "struct proc *"); 438 439 /* 440 * Print the threads waiting on a run-queue. 441 */ 442 static void 443 runq_print(struct runq *rq) 444 { 445 struct rq_queue *rqq; 446 struct thread *td; 447 int pri; 448 int j; 449 int i; 450 451 for (i = 0; i < RQSW_NB; i++) { 452 printf("\t\trunq bits %d %#lx\n", 453 i, rq->rq_status.rq_sw[i]); 454 for (j = 0; j < RQSW_BPW; j++) 455 if (rq->rq_status.rq_sw[i] & (1ul << j)) { 456 pri = RQSW_TO_QUEUE_IDX(i, j); 457 rqq = &rq->rq_queues[pri]; 458 TAILQ_FOREACH(td, rqq, td_runq) { 459 printf("\t\t\ttd %p(%s) priority %d rqindex %d pri %d\n", 460 td, td->td_name, td->td_priority, 461 td->td_rqindex, pri); 462 } 463 } 464 } 465 } 466 467 /* 468 * Print the status of a per-cpu thread queue. Should be a ddb show cmd. 469 */ 470 static void __unused 471 tdq_print(int cpu) 472 { 473 struct tdq *tdq; 474 475 tdq = TDQ_CPU(cpu); 476 477 printf("tdq %d:\n", TDQ_ID(tdq)); 478 printf("\tlock %p\n", TDQ_LOCKPTR(tdq)); 479 printf("\tLock name: %s\n", tdq->tdq_name); 480 printf("\tload: %d\n", tdq->tdq_load); 481 printf("\tswitch cnt: %d\n", tdq->tdq_switchcnt); 482 printf("\told switch cnt: %d\n", tdq->tdq_oldswitchcnt); 483 printf("\tTS insert offset: %d\n", tdq->tdq_ts_off); 484 printf("\tTS dequeue offset: %d\n", tdq->tdq_ts_deq_off); 485 printf("\tload transferable: %d\n", tdq->tdq_transferable); 486 printf("\tlowest priority: %d\n", tdq->tdq_lowpri); 487 printf("\trunq:\n"); 488 runq_print(&tdq->tdq_runq); 489 } 490 491 static inline int 492 sched_shouldpreempt(int pri, int cpri, int remote) 493 { 494 /* 495 * If the new priority is not better than the current priority there is 496 * nothing to do. 497 */ 498 if (pri >= cpri) 499 return (0); 500 /* 501 * Always preempt idle. 502 */ 503 if (cpri >= PRI_MIN_IDLE) 504 return (1); 505 /* 506 * If preemption is disabled don't preempt others. 507 */ 508 if (preempt_thresh == 0) 509 return (0); 510 /* 511 * Preempt if we exceed the threshold. 
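	 *
	 * With the default PREEMPTION (but not FULL_PREEMPTION) configuration
	 * preempt_thresh is PRI_MIN_KERN, so only threads whose priority is
	 * numerically at or below PRI_MIN_KERN (interrupt and real-time
	 * priorities, and the top of the kernel range) pass this test;
	 * FULL_PREEMPTION raises the threshold to PRI_MAX_IDLE so that any
	 * newly runnable higher-priority thread preempts.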
512 */ 513 if (pri <= preempt_thresh) 514 return (1); 515 /* 516 * If we're interactive or better and there is non-interactive 517 * or worse running preempt only remote processors. 518 */ 519 if (remote && pri <= PRI_MAX_INTERACT && cpri > PRI_MAX_INTERACT) 520 return (1); 521 return (0); 522 } 523 524 /* 525 * Add a thread to the actual run-queue. Keeps transferable counts up to 526 * date with what is actually on the run-queue. Selects the correct 527 * queue position for timeshare threads. 528 */ 529 static inline void 530 tdq_runq_add(struct tdq *tdq, struct thread *td, int flags) 531 { 532 struct td_sched *ts; 533 u_char pri, idx; 534 535 TDQ_LOCK_ASSERT(tdq, MA_OWNED); 536 THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED); 537 538 pri = td->td_priority; 539 ts = td_get_sched(td); 540 TD_SET_RUNQ(td); 541 if (THREAD_CAN_MIGRATE(td)) { 542 tdq->tdq_transferable++; 543 ts->ts_flags |= TSF_XFERABLE; 544 } 545 if (PRI_MIN_BATCH <= pri && pri <= PRI_MAX_BATCH) { 546 /* 547 * The queues allocated to the batch range are not used as 548 * a simple array but as a "circular" one where the insertion 549 * index (derived from 'pri') is offset by 'tdq_ts_off'. 'idx' 550 * is first set to the offset of the wanted queue in the TS' 551 * selection policy range. 552 */ 553 if ((flags & (SRQ_BORROWING|SRQ_PREEMPTED)) != 0) 554 /* Current queue from which processes are being run. */ 555 idx = tdq->tdq_ts_deq_off; 556 else { 557 idx = (RQ_PRI_TO_QUEUE_IDX(pri) - RQ_TS_POL_MIN + 558 tdq->tdq_ts_off) % RQ_TS_POL_MODULO; 559 /* 560 * We avoid enqueuing low priority threads in the queue 561 * that we are still draining, effectively shortening 562 * the runqueue by one queue. 563 */ 564 if (tdq->tdq_ts_deq_off != tdq->tdq_ts_off && 565 idx == tdq->tdq_ts_deq_off) 566 /* Ensure the dividend is positive. */ 567 idx = (idx - 1 + RQ_TS_POL_MODULO) % 568 RQ_TS_POL_MODULO; 569 } 570 /* Absolute queue index. */ 571 idx += RQ_TS_POL_MIN; 572 runq_add_idx(&tdq->tdq_runq, td, idx, flags); 573 } else 574 runq_add(&tdq->tdq_runq, td, flags); 575 } 576 577 /* 578 * Advance the timesharing dequeue offset to the next non-empty queue or the 579 * insertion offset, whichever is closer. 580 * 581 * If 'deq_queue_known_empty' is true, then the queue where timesharing threads 582 * are currently removed for execution (pointed to by 'tdq_ts_deq_off') is 583 * assumed empty. Otherwise, this condition is checked for. 584 */ 585 static inline void 586 tdq_advance_ts_deq_off(struct tdq *tdq, bool deq_queue_known_empty) 587 { 588 /* 589 * We chose a simple iterative algorithm since the difference between 590 * offsets is small in practice (see sched_clock()). 591 */ 592 while (tdq->tdq_ts_deq_off != tdq->tdq_ts_off) { 593 if (deq_queue_known_empty) 594 deq_queue_known_empty = false; 595 else if (!runq_is_queue_empty(&tdq->tdq_runq, 596 tdq->tdq_ts_deq_off + RQ_TS_POL_MIN)) 597 break; 598 599 tdq->tdq_ts_deq_off = (tdq->tdq_ts_deq_off + 1) % 600 RQ_TS_POL_MODULO; 601 } 602 } 603 604 /* 605 * Remove a thread from a run-queue. This typically happens when a thread 606 * is selected to run. Running threads are not on the queue and the 607 * transferable count does not reflect them. 
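 *
 * An example of the circular batch range maintained here and in
 * tdq_runq_add() above, with queue indices relative to RQ_TS_POL_MIN:
 * if tdq_ts_off = 3 and tdq_ts_deq_off = 1, a batch thread whose priority
 * maps to relative queue 0 is inserted at (0 + 3) % RQ_TS_POL_MODULO = 3,
 * while SRQ_PREEMPTED/SRQ_BORROWING threads go straight to queue 1, the
 * queue currently being drained.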
608 */ 609 static inline void 610 tdq_runq_rem(struct tdq *tdq, struct thread *td) 611 { 612 struct td_sched *ts; 613 bool queue_empty; 614 615 ts = td_get_sched(td); 616 TDQ_LOCK_ASSERT(tdq, MA_OWNED); 617 THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED); 618 if (ts->ts_flags & TSF_XFERABLE) { 619 tdq->tdq_transferable--; 620 ts->ts_flags &= ~TSF_XFERABLE; 621 } 622 queue_empty = runq_remove(&tdq->tdq_runq, td); 623 /* 624 * If thread has a batch priority and the queue from which it was 625 * removed is now empty, advance the batch's queue removal index if it 626 * lags with respect to the batch's queue insertion index, so that we 627 * may eventually be able to advance the latter in sched_clock(). 628 */ 629 if (PRI_MIN_BATCH <= td->td_priority && 630 td->td_priority <= PRI_MAX_BATCH && queue_empty && 631 tdq->tdq_ts_deq_off + RQ_TS_POL_MIN == td->td_rqindex) 632 tdq_advance_ts_deq_off(tdq, true); 633 } 634 635 /* 636 * Load is maintained for all threads RUNNING and ON_RUNQ. Add the load 637 * for this thread to the referenced thread queue. 638 */ 639 static void 640 tdq_load_add(struct tdq *tdq, struct thread *td) 641 { 642 643 TDQ_LOCK_ASSERT(tdq, MA_OWNED); 644 THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED); 645 646 tdq->tdq_load++; 647 if ((td->td_flags & TDF_NOLOAD) == 0) 648 tdq->tdq_sysload++; 649 KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load); 650 SDT_PROBE2(sched, , , load__change, (int)TDQ_ID(tdq), tdq->tdq_load); 651 } 652 653 /* 654 * Remove the load from a thread that is transitioning to a sleep state or 655 * exiting. 656 */ 657 static void 658 tdq_load_rem(struct tdq *tdq, struct thread *td) 659 { 660 661 TDQ_LOCK_ASSERT(tdq, MA_OWNED); 662 THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED); 663 KASSERT(tdq->tdq_load != 0, 664 ("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq))); 665 666 tdq->tdq_load--; 667 if ((td->td_flags & TDF_NOLOAD) == 0) 668 tdq->tdq_sysload--; 669 KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load); 670 SDT_PROBE2(sched, , , load__change, (int)TDQ_ID(tdq), tdq->tdq_load); 671 } 672 673 /* 674 * Bound timeshare latency by decreasing slice size as load increases. We 675 * consider the maximum latency as the sum of the threads waiting to run 676 * aside from curthread and target no more than sched_slice latency but 677 * no less than sched_slice_min runtime. 678 */ 679 static inline u_int 680 tdq_slice(struct tdq *tdq) 681 { 682 int load; 683 684 /* 685 * It is safe to use sys_load here because this is called from 686 * contexts where timeshare threads are running and so there 687 * cannot be higher priority load in the system. 688 */ 689 load = tdq->tdq_sysload - 1; 690 if (load >= SCHED_SLICE_MIN_DIVISOR) 691 return (sched_slice_min); 692 if (load <= 1) 693 return (sched_slice); 694 return (sched_slice / load); 695 } 696 697 /* 698 * Set lowpri to its exact value by searching the run-queue and 699 * evaluating curthread. curthread may be passed as an optimization. 700 */ 701 static void 702 tdq_setlowpri(struct tdq *tdq, struct thread *ctd) 703 { 704 struct thread *td; 705 706 TDQ_LOCK_ASSERT(tdq, MA_OWNED); 707 if (ctd == NULL) 708 ctd = tdq->tdq_curthread; 709 td = tdq_choose(tdq); 710 if (td == NULL || td->td_priority > ctd->td_priority) 711 tdq->tdq_lowpri = ctd->td_priority; 712 else 713 tdq->tdq_lowpri = td->td_priority; 714 } 715 716 #ifdef SMP 717 /* 718 * We need some randomness. Implement a classic Linear Congruential 719 * Generator X_{n+1}=(aX_n+c) mod m. 
These values are optimized for 720 * m = 2^32, a = 69069 and c = 5. We only return the upper 16 bits 721 * of the random state (in the low bits of our answer) to keep 722 * the maximum randomness. 723 */ 724 static uint32_t 725 sched_random(void) 726 { 727 uint32_t *rndptr; 728 729 rndptr = DPCPU_PTR(randomval); 730 *rndptr = *rndptr * 69069 + 5; 731 732 return (*rndptr >> 16); 733 } 734 735 struct cpu_search { 736 cpuset_t *cs_mask; /* The mask of allowed CPUs to choose from. */ 737 int cs_prefer; /* Prefer this CPU and groups including it. */ 738 int cs_running; /* The thread is now running at cs_prefer. */ 739 int cs_pri; /* Min priority for low. */ 740 int cs_load; /* Max load for low, min load for high. */ 741 int cs_trans; /* Min transferable load for high. */ 742 }; 743 744 struct cpu_search_res { 745 int csr_cpu; /* The best CPU found. */ 746 int csr_load; /* The load of cs_cpu. */ 747 }; 748 749 /* 750 * Search the tree of cpu_groups for the lowest or highest loaded CPU. 751 * These routines actually compare the load on all paths through the tree 752 * and find the least loaded cpu on the least loaded path, which may differ 753 * from the least loaded cpu in the system. This balances work among caches 754 * and buses. 755 */ 756 static int 757 cpu_search_lowest(const struct cpu_group *cg, const struct cpu_search *s, 758 struct cpu_search_res *r) 759 { 760 struct cpu_search_res lr; 761 struct tdq *tdq; 762 int c, bload, l, load, p, total; 763 764 total = 0; 765 bload = INT_MAX; 766 r->csr_cpu = -1; 767 768 /* Loop through children CPU groups if there are any. */ 769 if (cg->cg_children > 0) { 770 for (c = cg->cg_children - 1; c >= 0; c--) { 771 load = cpu_search_lowest(&cg->cg_child[c], s, &lr); 772 total += load; 773 774 /* 775 * When balancing do not prefer SMT groups with load >1. 776 * It allows round-robin between SMT groups with equal 777 * load within parent group for more fair scheduling. 778 */ 779 if (__predict_false(s->cs_running) && 780 (cg->cg_child[c].cg_flags & CG_FLAG_THREAD) && 781 load >= 128 && (load & 128) != 0) 782 load += 128; 783 784 if (lr.csr_cpu >= 0 && (load < bload || 785 (load == bload && lr.csr_load < r->csr_load))) { 786 bload = load; 787 r->csr_cpu = lr.csr_cpu; 788 r->csr_load = lr.csr_load; 789 } 790 } 791 return (total); 792 } 793 794 /* Loop through children CPUs otherwise. */ 795 for (c = cg->cg_last; c >= cg->cg_first; c--) { 796 if (!CPU_ISSET(c, &cg->cg_mask)) 797 continue; 798 tdq = TDQ_CPU(c); 799 l = TDQ_LOAD(tdq); 800 if (c == s->cs_prefer) { 801 if (__predict_false(s->cs_running)) 802 l--; 803 p = 128; 804 } else 805 p = 0; 806 load = l * 256; 807 total += load - p; 808 809 /* 810 * Check this CPU is acceptable. 811 * If the threads is already on the CPU, don't look on the TDQ 812 * priority, since it can be the priority of the thread itself. 813 */ 814 if (l > s->cs_load || 815 (atomic_load_char(&tdq->tdq_lowpri) <= s->cs_pri && 816 (!s->cs_running || c != s->cs_prefer)) || 817 !CPU_ISSET(c, s->cs_mask)) 818 continue; 819 820 /* 821 * When balancing do not prefer CPUs with load > 1. 822 * It allows round-robin between CPUs with equal load 823 * within the CPU group for more fair scheduling. 
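		 *
		 * Note the fixed-point scale used here: each thread of load
		 * counts for 256, the preference bonus 'p' is 128 and the
		 * random jitter is taken modulo 128, so preference and jitter
		 * together (at most 255) can reorder CPUs of equal load but
		 * never outweigh a full thread of load.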
824 */ 825 if (__predict_false(s->cs_running) && l > 0) 826 p = 0; 827 828 load -= sched_random() % 128; 829 if (bload > load - p) { 830 bload = load - p; 831 r->csr_cpu = c; 832 r->csr_load = load; 833 } 834 } 835 return (total); 836 } 837 838 static int 839 cpu_search_highest(const struct cpu_group *cg, const struct cpu_search *s, 840 struct cpu_search_res *r) 841 { 842 struct cpu_search_res lr; 843 struct tdq *tdq; 844 int c, bload, l, load, total; 845 846 total = 0; 847 bload = INT_MIN; 848 r->csr_cpu = -1; 849 850 /* Loop through children CPU groups if there are any. */ 851 if (cg->cg_children > 0) { 852 for (c = cg->cg_children - 1; c >= 0; c--) { 853 load = cpu_search_highest(&cg->cg_child[c], s, &lr); 854 total += load; 855 if (lr.csr_cpu >= 0 && (load > bload || 856 (load == bload && lr.csr_load > r->csr_load))) { 857 bload = load; 858 r->csr_cpu = lr.csr_cpu; 859 r->csr_load = lr.csr_load; 860 } 861 } 862 return (total); 863 } 864 865 /* Loop through children CPUs otherwise. */ 866 for (c = cg->cg_last; c >= cg->cg_first; c--) { 867 if (!CPU_ISSET(c, &cg->cg_mask)) 868 continue; 869 tdq = TDQ_CPU(c); 870 l = TDQ_LOAD(tdq); 871 load = l * 256; 872 total += load; 873 874 /* 875 * Check this CPU is acceptable. 876 */ 877 if (l < s->cs_load || TDQ_TRANSFERABLE(tdq) < s->cs_trans || 878 !CPU_ISSET(c, s->cs_mask)) 879 continue; 880 881 load -= sched_random() % 256; 882 if (load > bload) { 883 bload = load; 884 r->csr_cpu = c; 885 } 886 } 887 r->csr_load = bload; 888 return (total); 889 } 890 891 /* 892 * Find the cpu with the least load via the least loaded path that has a 893 * lowpri greater than pri pri. A pri of -1 indicates any priority is 894 * acceptable. 895 */ 896 static inline int 897 sched_lowest(const struct cpu_group *cg, cpuset_t *mask, int pri, int maxload, 898 int prefer, int running) 899 { 900 struct cpu_search s; 901 struct cpu_search_res r; 902 903 s.cs_prefer = prefer; 904 s.cs_running = running; 905 s.cs_mask = mask; 906 s.cs_pri = pri; 907 s.cs_load = maxload; 908 cpu_search_lowest(cg, &s, &r); 909 return (r.csr_cpu); 910 } 911 912 /* 913 * Find the cpu with the highest load via the highest loaded path. 914 */ 915 static inline int 916 sched_highest(const struct cpu_group *cg, cpuset_t *mask, int minload, 917 int mintrans) 918 { 919 struct cpu_search s; 920 struct cpu_search_res r; 921 922 s.cs_mask = mask; 923 s.cs_load = minload; 924 s.cs_trans = mintrans; 925 cpu_search_highest(cg, &s, &r); 926 return (r.csr_cpu); 927 } 928 929 static void 930 sched_balance_group(struct cpu_group *cg) 931 { 932 struct tdq *tdq; 933 struct thread *td; 934 cpuset_t hmask, lmask; 935 int high, low, anylow; 936 937 CPU_FILL(&hmask); 938 for (;;) { 939 high = sched_highest(cg, &hmask, 1, 0); 940 /* Stop if there is no more CPU with transferrable threads. */ 941 if (high == -1) 942 break; 943 CPU_CLR(high, &hmask); 944 CPU_COPY(&hmask, &lmask); 945 /* Stop if there is no more CPU left for low. */ 946 if (CPU_EMPTY(&lmask)) 947 break; 948 tdq = TDQ_CPU(high); 949 if (TDQ_LOAD(tdq) == 1) { 950 /* 951 * There is only one running thread. We can't move 952 * it from here, so tell it to pick new CPU by itself. 
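			 *
			 * Setting TDF_PICKCPU and scheduling an AST (plus an
			 * IPI when the CPU is remote) makes that thread
			 * re-select a CPU for itself the next time it passes
			 * through the scheduler.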
953 */ 954 TDQ_LOCK(tdq); 955 td = tdq->tdq_curthread; 956 if (td->td_lock == TDQ_LOCKPTR(tdq) && 957 (td->td_flags & TDF_IDLETD) == 0 && 958 THREAD_CAN_MIGRATE(td)) { 959 td->td_flags |= TDF_PICKCPU; 960 ast_sched_locked(td, TDA_SCHED); 961 if (high != curcpu) 962 ipi_cpu(high, IPI_AST); 963 } 964 TDQ_UNLOCK(tdq); 965 break; 966 } 967 anylow = 1; 968 nextlow: 969 if (TDQ_TRANSFERABLE(tdq) == 0) 970 continue; 971 low = sched_lowest(cg, &lmask, -1, TDQ_LOAD(tdq) - 1, high, 1); 972 /* Stop if we looked well and found no less loaded CPU. */ 973 if (anylow && low == -1) 974 break; 975 /* Go to next high if we found no less loaded CPU. */ 976 if (low == -1) 977 continue; 978 /* Transfer thread from high to low. */ 979 if (sched_balance_pair(tdq, TDQ_CPU(low))) { 980 /* CPU that got thread can no longer be a donor. */ 981 CPU_CLR(low, &hmask); 982 } else { 983 /* 984 * If failed, then there is no threads on high 985 * that can run on this low. Drop low from low 986 * mask and look for different one. 987 */ 988 CPU_CLR(low, &lmask); 989 anylow = 0; 990 goto nextlow; 991 } 992 } 993 } 994 995 static void 996 sched_balance(void) 997 { 998 struct tdq *tdq; 999 1000 balance_ticks = max(balance_interval / 2, 1) + 1001 (sched_random() % balance_interval); 1002 tdq = TDQ_SELF(); 1003 TDQ_UNLOCK(tdq); 1004 sched_balance_group(cpu_top); 1005 TDQ_LOCK(tdq); 1006 } 1007 1008 /* 1009 * Lock two thread queues using their address to maintain lock order. 1010 */ 1011 static void 1012 tdq_lock_pair(struct tdq *one, struct tdq *two) 1013 { 1014 if (one < two) { 1015 TDQ_LOCK(one); 1016 TDQ_LOCK_FLAGS(two, MTX_DUPOK); 1017 } else { 1018 TDQ_LOCK(two); 1019 TDQ_LOCK_FLAGS(one, MTX_DUPOK); 1020 } 1021 } 1022 1023 /* 1024 * Unlock two thread queues. Order is not important here. 1025 */ 1026 static void 1027 tdq_unlock_pair(struct tdq *one, struct tdq *two) 1028 { 1029 TDQ_UNLOCK(one); 1030 TDQ_UNLOCK(two); 1031 } 1032 1033 /* 1034 * Transfer load between two imbalanced thread queues. Returns true if a thread 1035 * was moved between the queues, and false otherwise. 1036 */ 1037 static bool 1038 sched_balance_pair(struct tdq *high, struct tdq *low) 1039 { 1040 int cpu, lowpri; 1041 bool ret; 1042 1043 ret = false; 1044 tdq_lock_pair(high, low); 1045 1046 /* 1047 * Transfer a thread from high to low. 1048 */ 1049 if (high->tdq_transferable != 0 && high->tdq_load > low->tdq_load) { 1050 lowpri = tdq_move(high, low); 1051 if (lowpri != -1) { 1052 /* 1053 * In case the target isn't the current CPU notify it of 1054 * the new load, possibly sending an IPI to force it to 1055 * reschedule. Otherwise maybe schedule a preemption. 1056 */ 1057 cpu = TDQ_ID(low); 1058 if (cpu != PCPU_GET(cpuid)) 1059 tdq_notify(low, lowpri); 1060 else 1061 sched_setpreempt(low->tdq_lowpri); 1062 ret = true; 1063 } 1064 } 1065 tdq_unlock_pair(high, low); 1066 return (ret); 1067 } 1068 1069 /* 1070 * Move a thread from one thread queue to another. Returns -1 if the source 1071 * queue was empty, else returns the maximum priority of all threads in 1072 * the destination queue prior to the addition of the new thread. In the latter 1073 * case, this priority can be used to determine whether an IPI needs to be 1074 * delivered. 
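 * (sched_balance_pair() above passes the value returned here to
 * tdq_notify() for exactly that purpose.)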
1075 */ 1076 static int 1077 tdq_move(struct tdq *from, struct tdq *to) 1078 { 1079 struct thread *td; 1080 int cpu; 1081 1082 TDQ_LOCK_ASSERT(from, MA_OWNED); 1083 TDQ_LOCK_ASSERT(to, MA_OWNED); 1084 1085 cpu = TDQ_ID(to); 1086 td = tdq_steal(from, cpu); 1087 if (td == NULL) 1088 return (-1); 1089 1090 /* 1091 * Although the run queue is locked the thread may be 1092 * blocked. We can not set the lock until it is unblocked. 1093 */ 1094 thread_lock_block_wait(td); 1095 sched_rem(td); 1096 THREAD_LOCKPTR_ASSERT(td, TDQ_LOCKPTR(from)); 1097 td->td_lock = TDQ_LOCKPTR(to); 1098 td_get_sched(td)->ts_cpu = cpu; 1099 return (tdq_add(to, td, SRQ_YIELDING)); 1100 } 1101 1102 /* 1103 * This tdq has idled. Try to steal a thread from another cpu and switch 1104 * to it. 1105 */ 1106 static int 1107 tdq_idled(struct tdq *tdq) 1108 { 1109 struct cpu_group *cg, *parent; 1110 struct tdq *steal; 1111 cpuset_t mask; 1112 int cpu, switchcnt, goup; 1113 1114 if (smp_started == 0 || steal_idle == 0 || tdq->tdq_cg == NULL) 1115 return (1); 1116 CPU_FILL(&mask); 1117 CPU_CLR(PCPU_GET(cpuid), &mask); 1118 restart: 1119 switchcnt = TDQ_SWITCHCNT(tdq); 1120 for (cg = tdq->tdq_cg, goup = 0; ; ) { 1121 cpu = sched_highest(cg, &mask, steal_thresh, 1); 1122 /* 1123 * We were assigned a thread but not preempted. Returning 1124 * 0 here will cause our caller to switch to it. 1125 */ 1126 if (TDQ_LOAD(tdq)) 1127 return (0); 1128 1129 /* 1130 * We found no CPU to steal from in this group. Escalate to 1131 * the parent and repeat. But if parent has only two children 1132 * groups we can avoid searching this group again by searching 1133 * the other one specifically and then escalating two levels. 1134 */ 1135 if (cpu == -1) { 1136 if (goup) { 1137 cg = cg->cg_parent; 1138 goup = 0; 1139 } 1140 parent = cg->cg_parent; 1141 if (parent == NULL) 1142 return (1); 1143 if (parent->cg_children == 2) { 1144 if (cg == &parent->cg_child[0]) 1145 cg = &parent->cg_child[1]; 1146 else 1147 cg = &parent->cg_child[0]; 1148 goup = 1; 1149 } else 1150 cg = parent; 1151 continue; 1152 } 1153 steal = TDQ_CPU(cpu); 1154 /* 1155 * The data returned by sched_highest() is stale and 1156 * the chosen CPU no longer has an eligible thread. 1157 * 1158 * Testing this ahead of tdq_lock_pair() only catches 1159 * this situation about 20% of the time on an 8 core 1160 * 16 thread Ryzen 7, but it still helps performance. 1161 */ 1162 if (TDQ_LOAD(steal) < steal_thresh || 1163 TDQ_TRANSFERABLE(steal) == 0) 1164 goto restart; 1165 /* 1166 * Try to lock both queues. If we are assigned a thread while 1167 * waited for the lock, switch to it now instead of stealing. 1168 * If we can't get the lock, then somebody likely got there 1169 * first so continue searching. 1170 */ 1171 TDQ_LOCK(tdq); 1172 if (tdq->tdq_load > 0) { 1173 mi_switch(SW_VOL | SWT_IDLE); 1174 return (0); 1175 } 1176 if (TDQ_TRYLOCK_FLAGS(steal, MTX_DUPOK) == 0) { 1177 TDQ_UNLOCK(tdq); 1178 CPU_CLR(cpu, &mask); 1179 continue; 1180 } 1181 /* 1182 * The data returned by sched_highest() is stale and 1183 * the chosen CPU no longer has an eligible thread, or 1184 * we were preempted and the CPU loading info may be out 1185 * of date. The latter is rare. In either case restart 1186 * the search. 1187 */ 1188 if (TDQ_LOAD(steal) < steal_thresh || 1189 TDQ_TRANSFERABLE(steal) == 0 || 1190 switchcnt != TDQ_SWITCHCNT(tdq)) { 1191 tdq_unlock_pair(tdq, steal); 1192 goto restart; 1193 } 1194 /* 1195 * Steal the thread and switch to it. 
		 */
		if (tdq_move(steal, tdq) != -1)
			break;
		/*
		 * We failed to acquire a thread even though it looked
		 * like one was available.  This could be due to affinity
		 * restrictions or for other reasons.  Loop again after
		 * removing this CPU from the set.  The restart logic
		 * above does not restore this CPU to the set due to the
		 * likelihood of failing here again.
		 */
		CPU_CLR(cpu, &mask);
		tdq_unlock_pair(tdq, steal);
	}
	TDQ_UNLOCK(steal);
	mi_switch(SW_VOL | SWT_IDLE);
	return (0);
}

/*
 * Notify a remote cpu of new work.  Sends an IPI if criteria are met.
 *
 * "lowpri" is the minimum scheduling priority among all threads on
 * the queue prior to the addition of the new thread.
 */
static void
tdq_notify(struct tdq *tdq, int lowpri)
{
	int cpu;

	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
	KASSERT(tdq->tdq_lowpri <= lowpri,
	    ("tdq_notify: lowpri %d > tdq_lowpri %d", lowpri, tdq->tdq_lowpri));

	if (tdq->tdq_owepreempt)
		return;

	/*
	 * Check to see if the newly added thread should preempt the one
	 * currently running.
	 */
	if (!sched_shouldpreempt(tdq->tdq_lowpri, lowpri, 1))
		return;

	/*
	 * Make sure that our caller's earlier update to tdq_load is
	 * globally visible before we read tdq_cpu_idle.  Idle thread
	 * accesses both of them without locks, and the order is important.
	 */
	atomic_thread_fence_seq_cst();

	/*
	 * Try to figure out if we can signal the idle thread instead of sending
	 * an IPI.  This check is racy; at worst, we will deliver an IPI
	 * unnecessarily.
	 */
	cpu = TDQ_ID(tdq);
	if (TD_IS_IDLETHREAD(tdq->tdq_curthread) &&
	    (atomic_load_int(&tdq->tdq_cpu_idle) == 0 || cpu_idle_wakeup(cpu)))
		return;

	/*
	 * The run queues have been updated, so any switch on the remote CPU
	 * will satisfy the preemption request.
	 */
	tdq->tdq_owepreempt = 1;
	ipi_cpu(cpu, IPI_PREEMPT);
}

struct runq_steal_pred_data {
	struct thread *td;
	int cpu;
};

static bool
runq_steal_pred(const int idx, struct rq_queue *const q, void *const data)
{
	struct runq_steal_pred_data *const d = data;
	struct thread *td;

	TAILQ_FOREACH(td, q, td_runq) {
		if (THREAD_CAN_MIGRATE(td) && THREAD_CAN_SCHED(td, d->cpu)) {
			d->td = td;
			return (true);
		}
	}

	return (false);
}

/*
 * Steals load contained in queues with indices in the specified range.
 */
static inline struct thread *
runq_steal_range(struct runq *const rq, const int lvl_min, const int lvl_max,
    int cpu)
{
	struct runq_steal_pred_data data = {
		.td = NULL,
		.cpu = cpu,
	};
	int idx;

	idx = runq_findq(rq, lvl_min, lvl_max, &runq_steal_pred, &data);
	if (idx != -1) {
		MPASS(data.td != NULL);
		return (data.td);
	}

	MPASS(data.td == NULL);
	return (NULL);
}

static inline struct thread *
runq_steal_realtime(struct runq *const rq, int cpu)
{

	return (runq_steal_range(rq, RQ_RT_POL_MIN, RQ_RT_POL_MAX, cpu));
}

/*
 * Steals load from a timeshare queue.  Honors the rotating queue head
 * index.
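 *
 * For example, with off = 2 the queues are scanned in the order
 * RQ_TS_POL_MIN + 2 .. RQ_TS_POL_MAX, then RQ_TS_POL_MIN .. RQ_TS_POL_MIN + 1.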
1319 */ 1320 static inline struct thread * 1321 runq_steal_timeshare(struct runq *const rq, int cpu, int off) 1322 { 1323 struct thread *td; 1324 1325 MPASS(0 <= off && off < RQ_TS_POL_MODULO); 1326 1327 td = runq_steal_range(rq, RQ_TS_POL_MIN + off, RQ_TS_POL_MAX, cpu); 1328 if (td != NULL || off == 0) 1329 return (td); 1330 1331 td = runq_steal_range(rq, RQ_TS_POL_MIN, RQ_TS_POL_MIN + off - 1, cpu); 1332 return (td); 1333 } 1334 1335 static inline struct thread * 1336 runq_steal_idle(struct runq *const rq, int cpu) 1337 { 1338 1339 return (runq_steal_range(rq, RQ_ID_POL_MIN, RQ_ID_POL_MAX, cpu)); 1340 } 1341 1342 1343 /* 1344 * Attempt to steal a thread in priority order from a thread queue. 1345 */ 1346 static struct thread * 1347 tdq_steal(struct tdq *tdq, int cpu) 1348 { 1349 struct thread *td; 1350 1351 TDQ_LOCK_ASSERT(tdq, MA_OWNED); 1352 td = runq_steal_realtime(&tdq->tdq_runq, cpu); 1353 if (td != NULL) 1354 return (td); 1355 td = runq_steal_timeshare(&tdq->tdq_runq, cpu, tdq->tdq_ts_deq_off); 1356 if (td != NULL) 1357 return (td); 1358 return (runq_steal_idle(&tdq->tdq_runq, cpu)); 1359 } 1360 1361 /* 1362 * Sets the thread lock and ts_cpu to match the requested cpu. Unlocks the 1363 * current lock and returns with the assigned queue locked. 1364 */ 1365 static inline struct tdq * 1366 sched_setcpu(struct thread *td, int cpu, int flags) 1367 { 1368 1369 struct tdq *tdq; 1370 struct mtx *mtx; 1371 1372 THREAD_LOCK_ASSERT(td, MA_OWNED); 1373 tdq = TDQ_CPU(cpu); 1374 td_get_sched(td)->ts_cpu = cpu; 1375 /* 1376 * If the lock matches just return the queue. 1377 */ 1378 if (td->td_lock == TDQ_LOCKPTR(tdq)) { 1379 KASSERT((flags & SRQ_HOLD) == 0, 1380 ("sched_setcpu: Invalid lock for SRQ_HOLD")); 1381 return (tdq); 1382 } 1383 1384 /* 1385 * The hard case, migration, we need to block the thread first to 1386 * prevent order reversals with other cpus locks. 1387 */ 1388 spinlock_enter(); 1389 mtx = thread_lock_block(td); 1390 if ((flags & SRQ_HOLD) == 0) 1391 mtx_unlock_spin(mtx); 1392 TDQ_LOCK(tdq); 1393 thread_lock_unblock(td, TDQ_LOCKPTR(tdq)); 1394 spinlock_exit(); 1395 return (tdq); 1396 } 1397 1398 SCHED_STAT_DEFINE(pickcpu_intrbind, "Soft interrupt binding"); 1399 SCHED_STAT_DEFINE(pickcpu_idle_affinity, "Picked idle cpu based on affinity"); 1400 SCHED_STAT_DEFINE(pickcpu_affinity, "Picked cpu based on affinity"); 1401 SCHED_STAT_DEFINE(pickcpu_lowest, "Selected lowest load"); 1402 SCHED_STAT_DEFINE(pickcpu_local, "Migrated to current cpu"); 1403 SCHED_STAT_DEFINE(pickcpu_migration, "Selection may have caused migration"); 1404 1405 static int 1406 sched_pickcpu(struct thread *td, int flags) 1407 { 1408 struct cpu_group *cg, *ccg; 1409 struct td_sched *ts; 1410 struct tdq *tdq; 1411 cpuset_t *mask; 1412 int cpu, pri, r, self, intr; 1413 1414 self = PCPU_GET(cpuid); 1415 ts = td_get_sched(td); 1416 KASSERT(!CPU_ABSENT(ts->ts_cpu), ("sched_pickcpu: Start scheduler on " 1417 "absent CPU %d for thread %s.", ts->ts_cpu, td->td_name)); 1418 if (smp_started == 0) 1419 return (self); 1420 /* 1421 * Don't migrate a running thread from sched_switch(). 1422 */ 1423 if ((flags & SRQ_OURSELF) || !THREAD_CAN_MIGRATE(td)) 1424 return (ts->ts_cpu); 1425 /* 1426 * Prefer to run interrupt threads on the processors that generate 1427 * the interrupt. 
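	 *
	 * This case applies when sched_pickcpu() is itself called from
	 * interrupt context (curthread->td_intr_nesting_level != 0) for a
	 * thread in the ithread priority range; the current CPU is then the
	 * one that took the interrupt, so running the handler thread here
	 * keeps its data cache-hot.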
1428 */ 1429 if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_SCHED(td, self) && 1430 curthread->td_intr_nesting_level) { 1431 tdq = TDQ_SELF(); 1432 if (tdq->tdq_lowpri >= PRI_MIN_IDLE) { 1433 SCHED_STAT_INC(pickcpu_idle_affinity); 1434 return (self); 1435 } 1436 ts->ts_cpu = self; 1437 intr = 1; 1438 cg = tdq->tdq_cg; 1439 goto llc; 1440 } else { 1441 intr = 0; 1442 tdq = TDQ_CPU(ts->ts_cpu); 1443 cg = tdq->tdq_cg; 1444 } 1445 /* 1446 * If the thread can run on the last cpu and the affinity has not 1447 * expired and it is idle, run it there. 1448 */ 1449 if (THREAD_CAN_SCHED(td, ts->ts_cpu) && 1450 atomic_load_char(&tdq->tdq_lowpri) >= PRI_MIN_IDLE && 1451 SCHED_AFFINITY(ts, CG_SHARE_L2)) { 1452 if (cg->cg_flags & CG_FLAG_THREAD) { 1453 /* Check all SMT threads for being idle. */ 1454 for (cpu = cg->cg_first; cpu <= cg->cg_last; cpu++) { 1455 pri = 1456 atomic_load_char(&TDQ_CPU(cpu)->tdq_lowpri); 1457 if (CPU_ISSET(cpu, &cg->cg_mask) && 1458 pri < PRI_MIN_IDLE) 1459 break; 1460 } 1461 if (cpu > cg->cg_last) { 1462 SCHED_STAT_INC(pickcpu_idle_affinity); 1463 return (ts->ts_cpu); 1464 } 1465 } else { 1466 SCHED_STAT_INC(pickcpu_idle_affinity); 1467 return (ts->ts_cpu); 1468 } 1469 } 1470 llc: 1471 /* 1472 * Search for the last level cache CPU group in the tree. 1473 * Skip SMT, identical groups and caches with expired affinity. 1474 * Interrupt threads affinity is explicit and never expires. 1475 */ 1476 for (ccg = NULL; cg != NULL; cg = cg->cg_parent) { 1477 if (cg->cg_flags & CG_FLAG_THREAD) 1478 continue; 1479 if (cg->cg_children == 1 || cg->cg_count == 1) 1480 continue; 1481 if (cg->cg_level == CG_SHARE_NONE || 1482 (!intr && !SCHED_AFFINITY(ts, cg->cg_level))) 1483 continue; 1484 ccg = cg; 1485 } 1486 /* Found LLC shared by all CPUs, so do a global search. */ 1487 if (ccg == cpu_top) 1488 ccg = NULL; 1489 cpu = -1; 1490 mask = &td->td_cpuset->cs_mask; 1491 pri = td->td_priority; 1492 r = TD_IS_RUNNING(td); 1493 /* 1494 * Try hard to keep interrupts within found LLC. Search the LLC for 1495 * the least loaded CPU we can run now. For NUMA systems it should 1496 * be within target domain, and it also reduces scheduling overhead. 1497 */ 1498 if (ccg != NULL && intr) { 1499 cpu = sched_lowest(ccg, mask, pri, INT_MAX, ts->ts_cpu, r); 1500 if (cpu >= 0) 1501 SCHED_STAT_INC(pickcpu_intrbind); 1502 } else 1503 /* Search the LLC for the least loaded idle CPU we can run now. */ 1504 if (ccg != NULL) { 1505 cpu = sched_lowest(ccg, mask, max(pri, PRI_MAX_TIMESHARE), 1506 INT_MAX, ts->ts_cpu, r); 1507 if (cpu >= 0) 1508 SCHED_STAT_INC(pickcpu_affinity); 1509 } 1510 /* Search globally for the least loaded CPU we can run now. */ 1511 if (cpu < 0) { 1512 cpu = sched_lowest(cpu_top, mask, pri, INT_MAX, ts->ts_cpu, r); 1513 if (cpu >= 0) 1514 SCHED_STAT_INC(pickcpu_lowest); 1515 } 1516 /* Search globally for the least loaded CPU. */ 1517 if (cpu < 0) { 1518 cpu = sched_lowest(cpu_top, mask, -1, INT_MAX, ts->ts_cpu, r); 1519 if (cpu >= 0) 1520 SCHED_STAT_INC(pickcpu_lowest); 1521 } 1522 KASSERT(cpu >= 0, ("sched_pickcpu: Failed to find a cpu.")); 1523 KASSERT(!CPU_ABSENT(cpu), ("sched_pickcpu: Picked absent CPU %d.", cpu)); 1524 /* 1525 * Compare the lowest loaded cpu to current cpu. 
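	 *
	 * Staying on the current CPU avoids a migration when the new thread
	 * would immediately be the highest priority thread here, the chosen
	 * CPU is not actually idle, and the local load exceeds the chosen
	 * CPU's by at most one thread.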
1526 */ 1527 tdq = TDQ_CPU(cpu); 1528 if (THREAD_CAN_SCHED(td, self) && TDQ_SELF()->tdq_lowpri > pri && 1529 atomic_load_char(&tdq->tdq_lowpri) < PRI_MIN_IDLE && 1530 TDQ_LOAD(TDQ_SELF()) <= TDQ_LOAD(tdq) + 1) { 1531 SCHED_STAT_INC(pickcpu_local); 1532 cpu = self; 1533 } 1534 if (cpu != ts->ts_cpu) 1535 SCHED_STAT_INC(pickcpu_migration); 1536 return (cpu); 1537 } 1538 #endif 1539 1540 static inline struct thread * 1541 runq_choose_realtime(struct runq *const rq) 1542 { 1543 1544 return (runq_first_thread_range(rq, RQ_RT_POL_MIN, RQ_RT_POL_MAX)); 1545 } 1546 1547 static struct thread * 1548 runq_choose_timeshare(struct runq *const rq, int off) 1549 { 1550 struct thread *td; 1551 1552 MPASS(0 <= off && off < RQ_TS_POL_MODULO); 1553 1554 td = runq_first_thread_range(rq, RQ_TS_POL_MIN + off, RQ_TS_POL_MAX); 1555 if (td != NULL || off == 0) 1556 return (td); 1557 1558 td = runq_first_thread_range(rq, RQ_TS_POL_MIN, RQ_TS_POL_MIN + off - 1); 1559 return (td); 1560 } 1561 1562 static inline struct thread * 1563 runq_choose_idle(struct runq *const rq) 1564 { 1565 1566 return (runq_first_thread_range(rq, RQ_ID_POL_MIN, RQ_ID_POL_MAX)); 1567 } 1568 1569 /* 1570 * Pick the highest priority task we have and return it. 1571 */ 1572 static struct thread * 1573 tdq_choose(struct tdq *tdq) 1574 { 1575 struct thread *td; 1576 1577 TDQ_LOCK_ASSERT(tdq, MA_OWNED); 1578 td = runq_choose_realtime(&tdq->tdq_runq); 1579 if (td != NULL) 1580 return (td); 1581 td = runq_choose_timeshare(&tdq->tdq_runq, tdq->tdq_ts_deq_off); 1582 if (td != NULL) { 1583 KASSERT(td->td_priority >= PRI_MIN_BATCH, 1584 ("tdq_choose: Invalid priority on timeshare queue %d", 1585 td->td_priority)); 1586 return (td); 1587 } 1588 td = runq_choose_idle(&tdq->tdq_runq); 1589 if (td != NULL) { 1590 KASSERT(td->td_priority >= PRI_MIN_IDLE, 1591 ("tdq_choose: Invalid priority on idle queue %d", 1592 td->td_priority)); 1593 return (td); 1594 } 1595 1596 return (NULL); 1597 } 1598 1599 /* 1600 * Initialize a thread queue. 1601 */ 1602 static void 1603 tdq_setup(struct tdq *tdq, int id) 1604 { 1605 1606 if (bootverbose) 1607 printf("ULE: setup cpu %d\n", id); 1608 runq_init(&tdq->tdq_runq); 1609 tdq->tdq_id = id; 1610 snprintf(tdq->tdq_name, sizeof(tdq->tdq_name), 1611 "sched lock %d", (int)TDQ_ID(tdq)); 1612 mtx_init(&tdq->tdq_lock, tdq->tdq_name, "sched lock", MTX_SPIN); 1613 #ifdef KTR 1614 snprintf(tdq->tdq_loadname, sizeof(tdq->tdq_loadname), 1615 "CPU %d load", (int)TDQ_ID(tdq)); 1616 #endif 1617 } 1618 1619 #ifdef SMP 1620 static void 1621 sched_setup_smp(void) 1622 { 1623 struct tdq *tdq; 1624 int i; 1625 1626 cpu_top = smp_topo(); 1627 CPU_FOREACH(i) { 1628 tdq = DPCPU_ID_PTR(i, tdq); 1629 tdq_setup(tdq, i); 1630 tdq->tdq_cg = smp_topo_find(cpu_top, i); 1631 if (tdq->tdq_cg == NULL) 1632 panic("Can't find cpu group for %d\n", i); 1633 DPCPU_ID_SET(i, randomval, i * 69069 + 5); 1634 } 1635 PCPU_SET(sched, DPCPU_PTR(tdq)); 1636 balance_tdq = TDQ_SELF(); 1637 } 1638 #endif 1639 1640 /* 1641 * Setup the thread queues and initialize the topology based on MD 1642 * information. 1643 */ 1644 static void 1645 sched_setup(void *dummy) 1646 { 1647 struct tdq *tdq; 1648 1649 #ifdef SMP 1650 sched_setup_smp(); 1651 #else 1652 tdq_setup(TDQ_SELF(), 0); 1653 #endif 1654 tdq = TDQ_SELF(); 1655 1656 /* Add thread0's load since it's running. 
*/ 1657 TDQ_LOCK(tdq); 1658 thread0.td_lock = TDQ_LOCKPTR(tdq); 1659 tdq_load_add(tdq, &thread0); 1660 tdq->tdq_curthread = &thread0; 1661 tdq->tdq_lowpri = thread0.td_priority; 1662 TDQ_UNLOCK(tdq); 1663 } 1664 1665 /* 1666 * This routine determines time constants after stathz and hz are setup. 1667 */ 1668 /* ARGSUSED */ 1669 static void 1670 sched_initticks(void *dummy) 1671 { 1672 int incr; 1673 1674 realstathz = stathz ? stathz : hz; 1675 sched_slice = realstathz / SCHED_SLICE_DEFAULT_DIVISOR; 1676 sched_slice_min = sched_slice / SCHED_SLICE_MIN_DIVISOR; 1677 hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) / 1678 realstathz); 1679 1680 /* 1681 * tickincr is shifted out by 10 to avoid rounding errors due to 1682 * hz not being evenly divisible by stathz on all platforms. 1683 */ 1684 incr = (hz << SCHED_TICK_SHIFT) / realstathz; 1685 /* 1686 * This does not work for values of stathz that are more than 1687 * 1 << SCHED_TICK_SHIFT * hz. In practice this does not happen. 1688 */ 1689 if (incr == 0) 1690 incr = 1; 1691 tickincr = incr; 1692 #ifdef SMP 1693 /* 1694 * Set the default balance interval now that we know 1695 * what realstathz is. 1696 */ 1697 balance_interval = realstathz; 1698 balance_ticks = balance_interval; 1699 affinity = SCHED_AFFINITY_DEFAULT; 1700 #endif 1701 if (sched_idlespinthresh < 0) 1702 sched_idlespinthresh = 2 * max(10000, 6 * hz) / realstathz; 1703 } 1704 1705 /* 1706 * This is the core of the interactivity algorithm. Determines a score based 1707 * on past behavior. It is the ratio of sleep time to run time scaled to 1708 * a [0, 100] integer. This is the voluntary sleep time of a process, which 1709 * differs from the cpu usage because it does not account for time spent 1710 * waiting on a run-queue. Would be prettier if we had floating point. 1711 * 1712 * When a thread's sleep time is greater than its run time the 1713 * calculation is: 1714 * 1715 * scaling factor 1716 * interactivity score = --------------------- 1717 * sleep time / run time 1718 * 1719 * 1720 * When a thread's run time is greater than its sleep time the 1721 * calculation is: 1722 * 1723 * scaling factor 1724 * interactivity score = 2 * scaling factor - --------------------- 1725 * run time / sleep time 1726 */ 1727 static int 1728 sched_interact_score(struct thread *td) 1729 { 1730 struct td_sched *ts; 1731 int div; 1732 1733 ts = td_get_sched(td); 1734 /* 1735 * The score is only needed if this is likely to be an interactive 1736 * task. Don't go through the expense of computing it if there's 1737 * no chance. 1738 */ 1739 if (sched_interact <= SCHED_INTERACT_HALF && 1740 ts->ts_runtime >= ts->ts_slptime) 1741 return (SCHED_INTERACT_HALF); 1742 1743 if (ts->ts_runtime > ts->ts_slptime) { 1744 div = max(1, ts->ts_runtime / SCHED_INTERACT_HALF); 1745 return (SCHED_INTERACT_HALF + 1746 (SCHED_INTERACT_HALF - (ts->ts_slptime / div))); 1747 } 1748 if (ts->ts_slptime > ts->ts_runtime) { 1749 div = max(1, ts->ts_slptime / SCHED_INTERACT_HALF); 1750 return (ts->ts_runtime / div); 1751 } 1752 /* runtime == slptime */ 1753 if (ts->ts_runtime) 1754 return (SCHED_INTERACT_HALF); 1755 1756 /* 1757 * This can happen if slptime and runtime are 0. 1758 */ 1759 return (0); 1760 1761 } 1762 1763 /* 1764 * Scale the scheduling priority according to the "interactivity" of this 1765 * process. 
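 *
 * For a feel of the numbers produced by sched_interact_score() above: a
 * thread that sleeps roughly three times as long as it runs scores about
 * 16, while one that runs three times as long as it sleeps scores about
 * 84, against a default interactivity threshold (sched_interact) of 30.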
1766 */ 1767 static void 1768 sched_priority(struct thread *td) 1769 { 1770 u_int pri, score; 1771 int nice; 1772 1773 if (PRI_BASE(td->td_pri_class) != PRI_TIMESHARE) 1774 return; 1775 1776 nice = td->td_proc->p_nice; 1777 /* 1778 * If the score is interactive we place the thread in the realtime 1779 * queue with a priority that is less than kernel and interrupt 1780 * priorities. These threads are not subject to nice restrictions. 1781 * 1782 * Scores greater than this are placed on the normal timeshare queue 1783 * where the priority is partially decided by the most recent cpu 1784 * utilization and the rest is decided by nice value. 1785 * 1786 * The nice value of the process has a linear effect on the calculated 1787 * score. Negative nice values make it easier for a thread to be 1788 * considered interactive. 1789 */ 1790 score = imax(0, sched_interact_score(td) + nice); 1791 if (score < sched_interact) { 1792 pri = PRI_MIN_INTERACT; 1793 pri += (PRI_MAX_INTERACT - PRI_MIN_INTERACT + 1) * score / 1794 sched_interact; 1795 KASSERT(pri >= PRI_MIN_INTERACT && pri <= PRI_MAX_INTERACT, 1796 ("sched_priority: invalid interactive priority %u score %u", 1797 pri, score)); 1798 } else { 1799 const struct td_sched *const ts = td_get_sched(td); 1800 const u_int run = SCHED_TICK_RUN_SHIFTED(ts); 1801 const u_int run_unshifted __unused = (run + 1802 (1 << SCHED_TICK_SHIFT) / 2) >> SCHED_TICK_SHIFT; 1803 const u_int len = SCHED_TICK_LENGTH(ts); 1804 const u_int nice_pri_off = SCHED_PRI_NICE(nice); 1805 const u_int cpu_pri_off = (((SCHED_PRI_CPU_RANGE - 1) * 1806 run + len / 2) / len + (1 << SCHED_TICK_SHIFT) / 2) >> 1807 SCHED_TICK_SHIFT; 1808 1809 MPASS(cpu_pri_off < SCHED_PRI_CPU_RANGE); 1810 pri = PRI_MIN_BATCH + cpu_pri_off + nice_pri_off; 1811 KASSERT(pri >= PRI_MIN_BATCH && pri <= PRI_MAX_BATCH, 1812 ("sched_priority: Invalid computed priority %u: " 1813 "Should be between %u and %u (PRI_MIN_BATCH: %u; " 1814 "Window size (ticks): %u, runtime (shifted ticks): %u," 1815 "(unshifted ticks): %u => CPU pri off: %u; " 1816 "Nice: %d => nice pri off: %u)", 1817 pri, PRI_MIN_BATCH, PRI_MAX_BATCH, PRI_MIN_BATCH, 1818 len, run, run_unshifted, cpu_pri_off, nice, nice_pri_off)); 1819 } 1820 sched_user_prio(td, pri); 1821 1822 return; 1823 } 1824 1825 /* 1826 * This routine enforces a maximum limit on the amount of scheduling history 1827 * kept. It is called after either the slptime or runtime is adjusted. This 1828 * function is ugly due to integer math. 1829 */ 1830 static void 1831 sched_interact_update(struct thread *td) 1832 { 1833 struct td_sched *ts; 1834 u_int sum; 1835 1836 ts = td_get_sched(td); 1837 sum = ts->ts_runtime + ts->ts_slptime; 1838 if (sum < SCHED_SLP_RUN_MAX) 1839 return; 1840 /* 1841 * This only happens from two places: 1842 * 1) We have added an unusual amount of run time from fork_exit. 1843 * 2) We have added an unusual amount of sleep time from sched_sleep(). 1844 */ 1845 if (sum > SCHED_SLP_RUN_MAX * 2) { 1846 if (ts->ts_runtime > ts->ts_slptime) { 1847 ts->ts_runtime = SCHED_SLP_RUN_MAX; 1848 ts->ts_slptime = 1; 1849 } else { 1850 ts->ts_slptime = SCHED_SLP_RUN_MAX; 1851 ts->ts_runtime = 1; 1852 } 1853 return; 1854 } 1855 /* 1856 * If we have exceeded by more than 1/5th then the algorithm below 1857 * will not bring us back into range. 
Dividing by two here forces 1858 * us into the range of [4/5 * SCHED_SLP_RUN_MAX, SCHED_SLP_RUN_MAX] 1859 */ 1860 if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) { 1861 ts->ts_runtime /= 2; 1862 ts->ts_slptime /= 2; 1863 return; 1864 } 1865 ts->ts_runtime = (ts->ts_runtime / 5) * 4; 1866 ts->ts_slptime = (ts->ts_slptime / 5) * 4; 1867 } 1868 1869 /* 1870 * Scale back the interactivity history when a child thread is created. The 1871 * history is inherited from the parent but the thread may behave totally 1872 * differently. For example, a shell spawning a compiler process. We want 1873 * to learn that the compiler is behaving badly very quickly. 1874 */ 1875 static void 1876 sched_interact_fork(struct thread *td) 1877 { 1878 struct td_sched *ts; 1879 int ratio; 1880 int sum; 1881 1882 ts = td_get_sched(td); 1883 sum = ts->ts_runtime + ts->ts_slptime; 1884 if (sum > SCHED_SLP_RUN_FORK) { 1885 ratio = sum / SCHED_SLP_RUN_FORK; 1886 ts->ts_runtime /= ratio; 1887 ts->ts_slptime /= ratio; 1888 } 1889 } 1890 1891 /* 1892 * Called from proc0_init() to setup the scheduler fields. 1893 */ 1894 void 1895 schedinit(void) 1896 { 1897 struct td_sched *ts0; 1898 1899 /* 1900 * Set up the scheduler specific parts of thread0. 1901 */ 1902 ts0 = td_get_sched(&thread0); 1903 ts0->ts_ftick = (u_int)ticks; 1904 ts0->ts_ltick = ts0->ts_ftick; 1905 ts0->ts_slice = 0; 1906 ts0->ts_cpu = curcpu; /* set valid CPU number */ 1907 } 1908 1909 /* 1910 * schedinit_ap() is needed prior to calling sched_throw(NULL) to ensure that 1911 * the pcpu requirements are met for any calls in the period between curthread 1912 * initialization and sched_throw(). One can safely add threads to the queue 1913 * before sched_throw(), for instance, as long as the thread lock is setup 1914 * correctly. 1915 * 1916 * TDQ_SELF() relies on the below sched pcpu setting; it may be used only 1917 * after schedinit_ap(). 1918 */ 1919 void 1920 schedinit_ap(void) 1921 { 1922 1923 #ifdef SMP 1924 PCPU_SET(sched, DPCPU_PTR(tdq)); 1925 #endif 1926 PCPU_GET(idlethread)->td_lock = TDQ_LOCKPTR(TDQ_SELF()); 1927 } 1928 1929 /* 1930 * This is only somewhat accurate since given many processes of the same 1931 * priority they will switch when their slices run out, which will be 1932 * at most sched_slice stathz ticks. 1933 */ 1934 int 1935 sched_rr_interval(void) 1936 { 1937 1938 /* Convert sched_slice from stathz to hz. */ 1939 return (imax(1, (sched_slice * hz + realstathz / 2) / realstathz)); 1940 } 1941 1942 /* 1943 * Update the percent cpu tracking information when it is requested or the total 1944 * history exceeds the maximum. We keep a sliding history of tick counts that 1945 * slowly decays, for running threads (see comments below for more details). 1946 * This is less precise than the 4BSD mechanism since it happens with less 1947 * regular and frequent events. 1948 */ 1949 static void 1950 sched_pctcpu_update(struct td_sched *ts, int run) 1951 { 1952 const u_int t = (u_int)ticks; 1953 u_int t_max = SCHED_TICK_MAX((u_int)hz); 1954 u_int t_tgt = ((t_max << SCHED_TICK_SHIFT) * SCHED_CPU_DECAY_NUMER / 1955 SCHED_CPU_DECAY_DENOM) >> SCHED_TICK_SHIFT; 1956 const u_int lu_span = t - ts->ts_ltick; 1957 1958 if (lu_span >= t_tgt) { 1959 /* 1960 * Forget all previous ticks if we are more than t_tgt 1961 * (currently, 10s) apart from the last update. Don't account 1962 * for more than 't_tgt' ticks when running. 1963 */ 1964 ts->ts_ticks = run ?
(t_tgt << SCHED_TICK_SHIFT) : 0; 1965 ts->ts_ftick = t - t_tgt; 1966 ts->ts_ltick = t; 1967 return; 1968 } 1969 1970 if (t - ts->ts_ftick >= t_max) { 1971 /* 1972 * First reduce the existing ticks to proportionally occupy only 1973 * what's left of the target window given 'lu_span' will occupy 1974 * the rest. Since sched_clock() is called frequently on 1975 * running threads, these threads have a small 'lu_span', and 1976 * the next formula basically becomes an exponential decay with 1977 * ratio r = SCHED_CPU_DECAY_NUMER / SCHED_CPU_DECAY_DENOM 1978 * (currently, 10/11) and period 1s. However, a sleeping thread 1979 * will see its accounted ticks drop linearly with a high slope 1980 * with respect to 'lu_span', approaching 0 as 'lu_span' 1981 * approaches 't_tgt' (so, continuously with respect to the 1982 * previous case). This rescaling is completely dependent on 1983 * the frequency of calls and the span since last update passed 1984 * at each call. 1985 */ 1986 ts->ts_ticks = SCHED_TICK_RUN_SHIFTED(ts) / 1987 SCHED_TICK_LENGTH(ts) * (t_tgt - lu_span); 1988 ts->ts_ftick = t - t_tgt; 1989 } 1990 1991 if (run) 1992 ts->ts_ticks += lu_span << SCHED_TICK_SHIFT; 1993 ts->ts_ltick = t; 1994 } 1995 1996 /* 1997 * Adjust the priority of a thread. Move it to the appropriate run-queue 1998 * if necessary. This is the back-end for several priority related 1999 * functions. 2000 */ 2001 static void 2002 sched_thread_priority(struct thread *td, u_char prio) 2003 { 2004 struct tdq *tdq; 2005 int oldpri; 2006 2007 KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "prio", 2008 "prio:%d", td->td_priority, "new prio:%d", prio, 2009 KTR_ATTR_LINKED, sched_tdname(curthread)); 2010 SDT_PROBE3(sched, , , change__pri, td, td->td_proc, prio); 2011 if (td != curthread && prio < td->td_priority) { 2012 KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread), 2013 "lend prio", "prio:%d", td->td_priority, "new prio:%d", 2014 prio, KTR_ATTR_LINKED, sched_tdname(td)); 2015 SDT_PROBE4(sched, , , lend__pri, td, td->td_proc, prio, 2016 curthread); 2017 } 2018 THREAD_LOCK_ASSERT(td, MA_OWNED); 2019 if (td->td_priority == prio) 2020 return; 2021 /* 2022 * If the priority has been elevated due to priority 2023 * propagation, we may have to move ourselves to a new 2024 * queue. This could be optimized to not re-add in some 2025 * cases. 2026 */ 2027 if (TD_ON_RUNQ(td) && prio < td->td_priority) { 2028 sched_rem(td); 2029 td->td_priority = prio; 2030 sched_add(td, SRQ_BORROWING | SRQ_HOLDTD); 2031 return; 2032 } 2033 /* 2034 * If the thread is currently running we may have to adjust the lowpri 2035 * information so other cpus are aware of our current priority. 2036 */ 2037 if (TD_IS_RUNNING(td)) { 2038 tdq = TDQ_CPU(td_get_sched(td)->ts_cpu); 2039 oldpri = td->td_priority; 2040 td->td_priority = prio; 2041 if (prio < tdq->tdq_lowpri) 2042 tdq->tdq_lowpri = prio; 2043 else if (tdq->tdq_lowpri == oldpri) 2044 tdq_setlowpri(tdq, td); 2045 return; 2046 } 2047 td->td_priority = prio; 2048 } 2049 2050 /* 2051 * Update a thread's priority when it is lent another thread's 2052 * priority. 2053 */ 2054 void 2055 sched_lend_prio(struct thread *td, u_char prio) 2056 { 2057 2058 td->td_flags |= TDF_BORROWING; 2059 sched_thread_priority(td, prio); 2060 } 2061 2062 /* 2063 * Restore a thread's priority when priority propagation is 2064 * over. The prio argument is the minimum priority the thread 2065 * needs to have to satisfy other possible priority lending 2066 * requests. 
If the thread's regular priority is less 2067 * important than prio, the thread will keep a priority boost 2068 * of prio. 2069 */ 2070 void 2071 sched_unlend_prio(struct thread *td, u_char prio) 2072 { 2073 u_char base_pri; 2074 2075 if (td->td_base_pri >= PRI_MIN_TIMESHARE && 2076 td->td_base_pri <= PRI_MAX_TIMESHARE) 2077 base_pri = td->td_user_pri; 2078 else 2079 base_pri = td->td_base_pri; 2080 if (prio >= base_pri) { 2081 td->td_flags &= ~TDF_BORROWING; 2082 sched_thread_priority(td, base_pri); 2083 } else 2084 sched_lend_prio(td, prio); 2085 } 2086 2087 /* 2088 * Standard entry for setting the priority to an absolute value. 2089 */ 2090 void 2091 sched_prio(struct thread *td, u_char prio) 2092 { 2093 u_char oldprio; 2094 2095 /* First, update the base priority. */ 2096 td->td_base_pri = prio; 2097 2098 /* 2099 * If the thread is borrowing another thread's priority, don't 2100 * ever lower the priority. 2101 */ 2102 if (td->td_flags & TDF_BORROWING && td->td_priority < prio) 2103 return; 2104 2105 /* Change the real priority. */ 2106 oldprio = td->td_priority; 2107 sched_thread_priority(td, prio); 2108 2109 /* 2110 * If the thread is on a turnstile, then let the turnstile update 2111 * its state. 2112 */ 2113 if (TD_ON_LOCK(td) && oldprio != prio) 2114 turnstile_adjust(td, oldprio); 2115 } 2116 2117 /* 2118 * Set the base interrupt thread priority. 2119 */ 2120 void 2121 sched_ithread_prio(struct thread *td, u_char prio) 2122 { 2123 THREAD_LOCK_ASSERT(td, MA_OWNED); 2124 MPASS(td->td_pri_class == PRI_ITHD); 2125 td->td_base_ithread_pri = prio; 2126 sched_prio(td, prio); 2127 } 2128 2129 /* 2130 * Set the base user priority, does not effect current running priority. 2131 */ 2132 void 2133 sched_user_prio(struct thread *td, u_char prio) 2134 { 2135 2136 td->td_base_user_pri = prio; 2137 if (td->td_lend_user_pri <= prio) 2138 return; 2139 td->td_user_pri = prio; 2140 } 2141 2142 void 2143 sched_lend_user_prio(struct thread *td, u_char prio) 2144 { 2145 2146 THREAD_LOCK_ASSERT(td, MA_OWNED); 2147 td->td_lend_user_pri = prio; 2148 td->td_user_pri = min(prio, td->td_base_user_pri); 2149 if (td->td_priority > td->td_user_pri) 2150 sched_prio(td, td->td_user_pri); 2151 else if (td->td_priority != td->td_user_pri) 2152 ast_sched_locked(td, TDA_SCHED); 2153 } 2154 2155 /* 2156 * Like the above but first check if there is anything to do. 2157 */ 2158 void 2159 sched_lend_user_prio_cond(struct thread *td, u_char prio) 2160 { 2161 2162 if (td->td_lend_user_pri == prio) 2163 return; 2164 2165 thread_lock(td); 2166 sched_lend_user_prio(td, prio); 2167 thread_unlock(td); 2168 } 2169 2170 #ifdef SMP 2171 /* 2172 * This tdq is about to idle. Try to steal a thread from another CPU before 2173 * choosing the idle thread. 2174 */ 2175 static void 2176 tdq_trysteal(struct tdq *tdq) 2177 { 2178 struct cpu_group *cg, *parent; 2179 struct tdq *steal; 2180 cpuset_t mask; 2181 int cpu, i, goup; 2182 2183 if (smp_started == 0 || steal_idle == 0 || trysteal_limit == 0 || 2184 tdq->tdq_cg == NULL) 2185 return; 2186 CPU_FILL(&mask); 2187 CPU_CLR(PCPU_GET(cpuid), &mask); 2188 /* We don't want to be preempted while we're iterating. */ 2189 spinlock_enter(); 2190 TDQ_UNLOCK(tdq); 2191 for (i = 1, cg = tdq->tdq_cg, goup = 0; ; ) { 2192 cpu = sched_highest(cg, &mask, steal_thresh, 1); 2193 /* 2194 * If a thread was added while interrupts were disabled don't 2195 * steal one here. 2196 */ 2197 if (TDQ_LOAD(tdq) > 0) { 2198 TDQ_LOCK(tdq); 2199 break; 2200 } 2201 2202 /* 2203 * We found no CPU to steal from in this group. 
Escalate to 2204 * the parent and repeat. But if the parent has only two child 2205 * groups we can avoid searching this group again by searching 2206 * the other one specifically and then escalating two levels. 2207 */ 2208 if (cpu == -1) { 2209 if (goup) { 2210 cg = cg->cg_parent; 2211 goup = 0; 2212 } 2213 if (++i > trysteal_limit) { 2214 TDQ_LOCK(tdq); 2215 break; 2216 } 2217 parent = cg->cg_parent; 2218 if (parent == NULL) { 2219 TDQ_LOCK(tdq); 2220 break; 2221 } 2222 if (parent->cg_children == 2) { 2223 if (cg == &parent->cg_child[0]) 2224 cg = &parent->cg_child[1]; 2225 else 2226 cg = &parent->cg_child[0]; 2227 goup = 1; 2228 } else 2229 cg = parent; 2230 continue; 2231 } 2232 steal = TDQ_CPU(cpu); 2233 /* 2234 * The data returned by sched_highest() is stale and 2235 * the chosen CPU no longer has an eligible thread. 2236 * At this point unconditionally exit the loop to bound 2237 * the time spent in the critical section. 2238 */ 2239 if (TDQ_LOAD(steal) < steal_thresh || 2240 TDQ_TRANSFERABLE(steal) == 0) 2241 continue; 2242 /* 2243 * Try to lock both queues. If we are assigned a thread while 2244 * waiting for the lock, switch to it now instead of stealing. 2245 * If we can't get the lock, then somebody likely got there 2246 * first. 2247 */ 2248 TDQ_LOCK(tdq); 2249 if (tdq->tdq_load > 0) 2250 break; 2251 if (TDQ_TRYLOCK_FLAGS(steal, MTX_DUPOK) == 0) 2252 break; 2253 /* 2254 * The data returned by sched_highest() is stale and 2255 * the chosen CPU no longer has an eligible thread. 2256 */ 2257 if (TDQ_LOAD(steal) < steal_thresh || 2258 TDQ_TRANSFERABLE(steal) == 0) { 2259 TDQ_UNLOCK(steal); 2260 break; 2261 } 2262 /* 2263 * If we fail to acquire one due to affinity restrictions, 2264 * bail out and let the idle thread do a more complete search 2265 * outside of a critical section. 2266 */ 2267 if (tdq_move(steal, tdq) == -1) { 2268 TDQ_UNLOCK(steal); 2269 break; 2270 } 2271 TDQ_UNLOCK(steal); 2272 break; 2273 } 2274 spinlock_exit(); 2275 } 2276 #endif 2277 2278 /* 2279 * Handle migration from sched_switch(). This happens only for 2280 * cpu binding. 2281 */ 2282 static struct mtx * 2283 sched_switch_migrate(struct tdq *tdq, struct thread *td, int flags) 2284 { 2285 struct tdq *tdn; 2286 #ifdef SMP 2287 int lowpri; 2288 #endif 2289 2290 KASSERT(THREAD_CAN_MIGRATE(td) || 2291 (td_get_sched(td)->ts_flags & TSF_BOUND) != 0, 2292 ("Thread %p shouldn't migrate", td)); 2293 KASSERT(!CPU_ABSENT(td_get_sched(td)->ts_cpu), ("sched_switch_migrate: " 2294 "thread %s queued on absent CPU %d.", td->td_name, 2295 td_get_sched(td)->ts_cpu)); 2296 tdn = TDQ_CPU(td_get_sched(td)->ts_cpu); 2297 #ifdef SMP 2298 tdq_load_rem(tdq, td); 2299 /* 2300 * Do the lock dance required to avoid LOR. We have an 2301 * extra spinlock nesting from sched_switch() which will 2302 * prevent preemption while we're holding neither run-queue lock. 2303 */ 2304 TDQ_UNLOCK(tdq); 2305 TDQ_LOCK(tdn); 2306 lowpri = tdq_add(tdn, td, flags); 2307 tdq_notify(tdn, lowpri); 2308 TDQ_UNLOCK(tdn); 2309 TDQ_LOCK(tdq); 2310 #endif 2311 return (TDQ_LOCKPTR(tdn)); 2312 } 2313 2314 /* 2315 * thread_lock_unblock() that does not assume td_lock is blocked. 2316 */ 2317 static inline void 2318 thread_unblock_switch(struct thread *td, struct mtx *mtx) 2319 { 2320 atomic_store_rel_ptr((volatile uintptr_t *)&td->td_lock, 2321 (uintptr_t)mtx); 2322 } 2323 2324 /* 2325 * Switch threads. This function has to handle threads coming in while 2326 * blocked for some reason, running, or idle.
It also must deal with 2327 * migrating a thread from one queue to another as running threads may 2328 * be assigned elsewhere via binding. 2329 */ 2330 void 2331 sched_switch(struct thread *td, int flags) 2332 { 2333 struct thread *newtd; 2334 struct tdq *tdq; 2335 struct td_sched *ts; 2336 struct mtx *mtx; 2337 int srqflag; 2338 int cpuid, preempted; 2339 #ifdef SMP 2340 int pickcpu; 2341 #endif 2342 2343 THREAD_LOCK_ASSERT(td, MA_OWNED); 2344 2345 cpuid = PCPU_GET(cpuid); 2346 tdq = TDQ_SELF(); 2347 ts = td_get_sched(td); 2348 sched_pctcpu_update(ts, 1); 2349 #ifdef SMP 2350 pickcpu = (td->td_flags & TDF_PICKCPU) != 0; 2351 if (pickcpu) 2352 ts->ts_rltick = (u_int)ticks - affinity * MAX_CACHE_LEVELS; 2353 else 2354 ts->ts_rltick = (u_int)ticks; 2355 #endif 2356 td->td_lastcpu = td->td_oncpu; 2357 preempted = (td->td_flags & TDF_SLICEEND) == 0 && 2358 (flags & SW_PREEMPT) != 0; 2359 td->td_flags &= ~(TDF_PICKCPU | TDF_SLICEEND); 2360 ast_unsched_locked(td, TDA_SCHED); 2361 td->td_owepreempt = 0; 2362 atomic_store_char(&tdq->tdq_owepreempt, 0); 2363 if (!TD_IS_IDLETHREAD(td)) 2364 TDQ_SWITCHCNT_INC(tdq); 2365 2366 /* 2367 * Always block the thread lock so we can drop the tdq lock early. 2368 */ 2369 mtx = thread_lock_block(td); 2370 spinlock_enter(); 2371 if (TD_IS_IDLETHREAD(td)) { 2372 MPASS(mtx == TDQ_LOCKPTR(tdq)); 2373 TD_SET_CAN_RUN(td); 2374 } else if (TD_IS_RUNNING(td)) { 2375 MPASS(mtx == TDQ_LOCKPTR(tdq)); 2376 srqflag = SRQ_OURSELF | SRQ_YIELDING | 2377 (preempted ? SRQ_PREEMPTED : 0); 2378 #ifdef SMP 2379 if (THREAD_CAN_MIGRATE(td) && (!THREAD_CAN_SCHED(td, ts->ts_cpu) 2380 || pickcpu)) 2381 ts->ts_cpu = sched_pickcpu(td, 0); 2382 #endif 2383 if (ts->ts_cpu == cpuid) 2384 tdq_runq_add(tdq, td, srqflag); 2385 else 2386 mtx = sched_switch_migrate(tdq, td, srqflag); 2387 } else { 2388 /* This thread must be going to sleep. */ 2389 if (mtx != TDQ_LOCKPTR(tdq)) { 2390 mtx_unlock_spin(mtx); 2391 TDQ_LOCK(tdq); 2392 } 2393 tdq_load_rem(tdq, td); 2394 #ifdef SMP 2395 if (tdq->tdq_load == 0) 2396 tdq_trysteal(tdq); 2397 #endif 2398 } 2399 2400 #if (KTR_COMPILE & KTR_SCHED) != 0 2401 if (TD_IS_IDLETHREAD(td)) 2402 KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "idle", 2403 "prio:%d", td->td_priority); 2404 else 2405 KTR_STATE3(KTR_SCHED, "thread", sched_tdname(td), KTDSTATE(td), 2406 "prio:%d", td->td_priority, "wmesg:\"%s\"", td->td_wmesg, 2407 "lockname:\"%s\"", td->td_lockname); 2408 #endif 2409 2410 /* 2411 * We enter here with the thread blocked and assigned to the 2412 * appropriate cpu run-queue or sleep-queue and with the current 2413 * thread-queue locked. 2414 */ 2415 TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED); 2416 MPASS(td == tdq->tdq_curthread); 2417 newtd = choosethread(); 2418 sched_pctcpu_update(td_get_sched(newtd), 0); 2419 TDQ_UNLOCK(tdq); 2420 2421 /* 2422 * Call the MD code to switch contexts if necessary. 2423 */ 2424 if (td != newtd) { 2425 #ifdef HWPMC_HOOKS 2426 if (PMC_PROC_IS_USING_PMCS(td->td_proc)) 2427 PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT); 2428 #endif 2429 SDT_PROBE2(sched, , , off__cpu, newtd, newtd->td_proc); 2430 2431 #ifdef KDTRACE_HOOKS 2432 /* 2433 * If DTrace has set the active vtime enum to anything 2434 * other than INACTIVE (0), then it should have set the 2435 * function to call. 
2436 */ 2437 if (dtrace_vtime_active) 2438 (*dtrace_vtime_switch_func)(newtd); 2439 #endif 2440 2441 #ifdef HWT_HOOKS 2442 HWT_CALL_HOOK(td, HWT_SWITCH_OUT, NULL); 2443 HWT_CALL_HOOK(newtd, HWT_SWITCH_IN, NULL); 2444 #endif 2445 2446 td->td_oncpu = NOCPU; 2447 cpu_switch(td, newtd, mtx); 2448 cpuid = td->td_oncpu = PCPU_GET(cpuid); 2449 2450 SDT_PROBE0(sched, , , on__cpu); 2451 #ifdef HWPMC_HOOKS 2452 if (PMC_PROC_IS_USING_PMCS(td->td_proc)) 2453 PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN); 2454 #endif 2455 } else { 2456 thread_unblock_switch(td, mtx); 2457 SDT_PROBE0(sched, , , remain__cpu); 2458 } 2459 KASSERT(curthread->td_md.md_spinlock_count == 1, 2460 ("invalid count %d", curthread->td_md.md_spinlock_count)); 2461 2462 KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running", 2463 "prio:%d", td->td_priority); 2464 } 2465 2466 /* 2467 * Adjust thread priorities as a result of a nice request. 2468 */ 2469 void 2470 sched_nice(struct proc *p, int nice) 2471 { 2472 struct thread *td; 2473 2474 PROC_LOCK_ASSERT(p, MA_OWNED); 2475 2476 p->p_nice = nice; 2477 FOREACH_THREAD_IN_PROC(p, td) { 2478 thread_lock(td); 2479 sched_priority(td); 2480 sched_prio(td, td->td_base_user_pri); 2481 thread_unlock(td); 2482 } 2483 } 2484 2485 /* 2486 * Record the sleep time for the interactivity scorer. 2487 */ 2488 void 2489 sched_sleep(struct thread *td, int prio) 2490 { 2491 2492 THREAD_LOCK_ASSERT(td, MA_OWNED); 2493 2494 td->td_slptick = ticks; 2495 if (PRI_BASE(td->td_pri_class) != PRI_TIMESHARE) 2496 return; 2497 if (static_boost == 1 && prio) 2498 sched_prio(td, prio); 2499 else if (static_boost && td->td_priority > static_boost) 2500 sched_prio(td, static_boost); 2501 } 2502 2503 /* 2504 * Schedule a thread to resume execution and record how long it voluntarily 2505 * slept. We also update the pctcpu, interactivity, and priority. 2506 * 2507 * Requires the thread lock on entry, drops on exit. 2508 */ 2509 void 2510 sched_wakeup(struct thread *td, int srqflags) 2511 { 2512 struct td_sched *ts; 2513 int slptick; 2514 2515 THREAD_LOCK_ASSERT(td, MA_OWNED); 2516 ts = td_get_sched(td); 2517 2518 /* 2519 * If we slept for more than a tick update our interactivity and 2520 * priority. 2521 */ 2522 slptick = td->td_slptick; 2523 td->td_slptick = 0; 2524 if (slptick && slptick != ticks) { 2525 ts->ts_slptime += (ticks - slptick) << SCHED_TICK_SHIFT; 2526 sched_interact_update(td); 2527 sched_pctcpu_update(ts, 0); 2528 } 2529 2530 /* 2531 * When resuming an idle ithread, restore its base ithread 2532 * priority. 2533 */ 2534 if (PRI_BASE(td->td_pri_class) == PRI_ITHD && 2535 td->td_priority != td->td_base_ithread_pri) 2536 sched_prio(td, td->td_base_ithread_pri); 2537 2538 /* 2539 * Reset the slice value since we slept and advanced the round-robin. 2540 */ 2541 ts->ts_slice = 0; 2542 sched_add(td, SRQ_BORING | srqflags); 2543 } 2544 2545 /* 2546 * Penalize the parent for creating a new child and initialize the child's 2547 * priority. 2548 */ 2549 void 2550 sched_fork(struct thread *td, struct thread *child) 2551 { 2552 THREAD_LOCK_ASSERT(td, MA_OWNED); 2553 sched_pctcpu_update(td_get_sched(td), 1); 2554 sched_fork_thread(td, child); 2555 /* 2556 * Penalize the parent and child for forking. 2557 */ 2558 sched_interact_fork(child); 2559 sched_priority(child); 2560 td_get_sched(td)->ts_runtime += tickincr; 2561 sched_interact_update(td); 2562 sched_priority(td); 2563 } 2564 2565 /* 2566 * Fork a new thread, may be within the same process. 
2567 */ 2568 void 2569 sched_fork_thread(struct thread *td, struct thread *child) 2570 { 2571 struct td_sched *ts; 2572 struct td_sched *ts2; 2573 struct tdq *tdq; 2574 2575 tdq = TDQ_SELF(); 2576 THREAD_LOCK_ASSERT(td, MA_OWNED); 2577 /* 2578 * Initialize child. 2579 */ 2580 ts = td_get_sched(td); 2581 ts2 = td_get_sched(child); 2582 child->td_oncpu = NOCPU; 2583 child->td_lastcpu = NOCPU; 2584 child->td_lock = TDQ_LOCKPTR(tdq); 2585 child->td_cpuset = cpuset_ref(td->td_cpuset); 2586 child->td_domain.dr_policy = td->td_cpuset->cs_domain; 2587 ts2->ts_cpu = ts->ts_cpu; 2588 ts2->ts_flags = 0; 2589 /* 2590 * Grab our parents cpu estimation information. 2591 */ 2592 ts2->ts_ticks = ts->ts_ticks; 2593 ts2->ts_ltick = ts->ts_ltick; 2594 ts2->ts_ftick = ts->ts_ftick; 2595 /* 2596 * Do not inherit any borrowed priority from the parent. 2597 */ 2598 child->td_priority = child->td_base_pri; 2599 /* 2600 * And update interactivity score. 2601 */ 2602 ts2->ts_slptime = ts->ts_slptime; 2603 ts2->ts_runtime = ts->ts_runtime; 2604 /* Attempt to quickly learn interactivity. */ 2605 ts2->ts_slice = tdq_slice(tdq) - sched_slice_min; 2606 #ifdef KTR 2607 bzero(ts2->ts_name, sizeof(ts2->ts_name)); 2608 #endif 2609 } 2610 2611 /* 2612 * Adjust the priority class of a thread. 2613 */ 2614 void 2615 sched_class(struct thread *td, int class) 2616 { 2617 2618 THREAD_LOCK_ASSERT(td, MA_OWNED); 2619 if (td->td_pri_class == class) 2620 return; 2621 td->td_pri_class = class; 2622 } 2623 2624 /* 2625 * Return some of the child's priority and interactivity to the parent. 2626 */ 2627 void 2628 sched_exit(struct proc *p, struct thread *child) 2629 { 2630 struct thread *td; 2631 2632 KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "proc exit", 2633 "prio:%d", child->td_priority); 2634 PROC_LOCK_ASSERT(p, MA_OWNED); 2635 td = FIRST_THREAD_IN_PROC(p); 2636 sched_exit_thread(td, child); 2637 } 2638 2639 /* 2640 * Penalize another thread for the time spent on this one. This helps to 2641 * worsen the priority and interactivity of processes which schedule batch 2642 * jobs such as make. This has little effect on the make process itself but 2643 * causes new processes spawned by it to receive worse scores immediately. 2644 */ 2645 void 2646 sched_exit_thread(struct thread *td, struct thread *child) 2647 { 2648 2649 KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "thread exit", 2650 "prio:%d", child->td_priority); 2651 /* 2652 * Give the child's runtime to the parent without returning the 2653 * sleep time as a penalty to the parent. This causes shells that 2654 * launch expensive things to mark their children as expensive. 2655 */ 2656 thread_lock(td); 2657 td_get_sched(td)->ts_runtime += td_get_sched(child)->ts_runtime; 2658 sched_interact_update(td); 2659 sched_priority(td); 2660 thread_unlock(td); 2661 } 2662 2663 void 2664 sched_preempt(struct thread *td) 2665 { 2666 struct tdq *tdq; 2667 int flags; 2668 2669 SDT_PROBE2(sched, , , surrender, td, td->td_proc); 2670 2671 thread_lock(td); 2672 tdq = TDQ_SELF(); 2673 TDQ_LOCK_ASSERT(tdq, MA_OWNED); 2674 if (td->td_priority > tdq->tdq_lowpri) { 2675 if (td->td_critnest == 1) { 2676 flags = SW_INVOL | SW_PREEMPT; 2677 flags |= TD_IS_IDLETHREAD(td) ? SWT_REMOTEWAKEIDLE : 2678 SWT_REMOTEPREEMPT; 2679 mi_switch(flags); 2680 /* Switch dropped thread lock. */ 2681 return; 2682 } 2683 td->td_owepreempt = 1; 2684 } else { 2685 tdq->tdq_owepreempt = 0; 2686 } 2687 thread_unlock(td); 2688 } 2689 2690 /* 2691 * Fix priorities on return to user-space. 
Priorities may be elevated due 2692 * to static priorities in msleep() or similar. 2693 */ 2694 void 2695 sched_userret_slowpath(struct thread *td) 2696 { 2697 2698 thread_lock(td); 2699 td->td_priority = td->td_user_pri; 2700 td->td_base_pri = td->td_user_pri; 2701 tdq_setlowpri(TDQ_SELF(), td); 2702 thread_unlock(td); 2703 } 2704 2705 SCHED_STAT_DEFINE(ithread_demotions, "Interrupt thread priority demotions"); 2706 SCHED_STAT_DEFINE(ithread_preemptions, 2707 "Interrupt thread preemptions due to time-sharing"); 2708 2709 /* 2710 * Return time slice for a given thread. For ithreads this is 2711 * sched_slice. For other threads it is tdq_slice(tdq). 2712 */ 2713 static inline u_int 2714 td_slice(struct thread *td, struct tdq *tdq) 2715 { 2716 if (PRI_BASE(td->td_pri_class) == PRI_ITHD) 2717 return (sched_slice); 2718 return (tdq_slice(tdq)); 2719 } 2720 2721 /* 2722 * Handle a stathz tick. This is really only relevant for timeshare 2723 * and interrupt threads. 2724 */ 2725 void 2726 sched_clock(struct thread *td, int cnt) 2727 { 2728 struct tdq *tdq; 2729 struct td_sched *ts; 2730 2731 THREAD_LOCK_ASSERT(td, MA_OWNED); 2732 tdq = TDQ_SELF(); 2733 #ifdef SMP 2734 /* 2735 * We run the long term load balancer infrequently on the first cpu. 2736 */ 2737 if (balance_tdq == tdq && smp_started != 0 && rebalance != 0 && 2738 balance_ticks != 0) { 2739 balance_ticks -= cnt; 2740 if (balance_ticks <= 0) 2741 sched_balance(); 2742 } 2743 #endif 2744 /* 2745 * Save the old switch count so we have a record of the last ticks 2746 * activity. Initialize the new switch count based on our load. 2747 * If there is some activity seed it to reflect that. 2748 */ 2749 tdq->tdq_oldswitchcnt = tdq->tdq_switchcnt; 2750 tdq->tdq_switchcnt = tdq->tdq_load; 2751 2752 /* 2753 * Advance the insert offset once for each tick to ensure that all 2754 * threads get a chance to run. In order not to change too much ULE's 2755 * anti-starvation and "nice" behaviors after the switch to a single 2756 * 256-queue runqueue, since the queue insert offset is incremented by 2757 * 1 at every tick (provided the system is not too loaded) and there are 2758 * now 109 distinct levels for the timesharing selection policy instead 2759 * of 64 before (separate runqueue), we apply a factor 7/4 when 2760 * increasing the insert offset, by incrementing it by 2 instead of 2761 * 1 except for one in four ticks. 2762 */ 2763 if (tdq->tdq_ts_off == tdq->tdq_ts_deq_off) { 2764 tdq->tdq_ts_ticks += cnt; 2765 tdq->tdq_ts_off = (tdq->tdq_ts_off + 2 * cnt - 2766 tdq-> tdq_ts_ticks / 4) % RQ_TS_POL_MODULO; 2767 tdq->tdq_ts_ticks %= 4; 2768 tdq_advance_ts_deq_off(tdq, false); 2769 } 2770 ts = td_get_sched(td); 2771 sched_pctcpu_update(ts, 1); 2772 if ((td->td_pri_class & PRI_FIFO_BIT) || TD_IS_IDLETHREAD(td)) 2773 return; 2774 2775 if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) { 2776 /* 2777 * We used a tick; charge it to the thread so 2778 * that we can compute our interactivity. 2779 */ 2780 td_get_sched(td)->ts_runtime += tickincr * cnt; 2781 sched_interact_update(td); 2782 sched_priority(td); 2783 } 2784 2785 /* 2786 * Force a context switch if the current thread has used up a full 2787 * time slice (default is 100ms). 2788 */ 2789 ts->ts_slice += cnt; 2790 if (ts->ts_slice >= td_slice(td, tdq)) { 2791 ts->ts_slice = 0; 2792 2793 /* 2794 * If an ithread uses a full quantum, demote its 2795 * priority and preempt it. 
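 *
 * Each additional full quantum consumed this way pushes the base
 * priority down by another RQ_PPQ step, but never to or past
 * PRI_MAX_ITHD.  The base ithread priority recorded by
 * sched_ithread_prio() is restored by sched_wakeup() once the
 * ithread has slept and is run again.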
2796 */ 2797 if (PRI_BASE(td->td_pri_class) == PRI_ITHD) { 2798 SCHED_STAT_INC(ithread_preemptions); 2799 td->td_owepreempt = 1; 2800 if (td->td_base_pri + RQ_PPQ < PRI_MAX_ITHD) { 2801 SCHED_STAT_INC(ithread_demotions); 2802 sched_prio(td, td->td_base_pri + RQ_PPQ); 2803 } 2804 } else { 2805 ast_sched_locked(td, TDA_SCHED); 2806 td->td_flags |= TDF_SLICEEND; 2807 } 2808 } 2809 } 2810 2811 u_int 2812 sched_estcpu(struct thread *td __unused) 2813 { 2814 2815 return (0); 2816 } 2817 2818 /* 2819 * Return whether the current CPU has runnable tasks. Used for in-kernel 2820 * cooperative idle threads. 2821 */ 2822 bool 2823 sched_runnable(void) 2824 { 2825 struct tdq *tdq; 2826 2827 tdq = TDQ_SELF(); 2828 return (TDQ_LOAD(tdq) > (TD_IS_IDLETHREAD(curthread) ? 0 : 1)); 2829 } 2830 2831 /* 2832 * Choose the highest priority thread to run. The thread is removed from 2833 * the run-queue while running however the load remains. 2834 */ 2835 struct thread * 2836 sched_choose(void) 2837 { 2838 struct thread *td; 2839 struct tdq *tdq; 2840 2841 tdq = TDQ_SELF(); 2842 TDQ_LOCK_ASSERT(tdq, MA_OWNED); 2843 td = tdq_choose(tdq); 2844 if (td != NULL) { 2845 tdq_runq_rem(tdq, td); 2846 tdq->tdq_lowpri = td->td_priority; 2847 } else { 2848 tdq->tdq_lowpri = PRI_MAX_IDLE; 2849 td = PCPU_GET(idlethread); 2850 } 2851 tdq->tdq_curthread = td; 2852 return (td); 2853 } 2854 2855 /* 2856 * Set owepreempt if the currently running thread has lower priority than "pri". 2857 * Preemption never happens directly in ULE, we always request it once we exit a 2858 * critical section. 2859 */ 2860 static void 2861 sched_setpreempt(int pri) 2862 { 2863 struct thread *ctd; 2864 int cpri; 2865 2866 ctd = curthread; 2867 THREAD_LOCK_ASSERT(ctd, MA_OWNED); 2868 2869 cpri = ctd->td_priority; 2870 if (pri < cpri) 2871 ast_sched_locked(ctd, TDA_SCHED); 2872 if (KERNEL_PANICKED() || pri >= cpri || cold || TD_IS_INHIBITED(ctd)) 2873 return; 2874 if (!sched_shouldpreempt(pri, cpri, 0)) 2875 return; 2876 ctd->td_owepreempt = 1; 2877 } 2878 2879 /* 2880 * Add a thread to a thread queue. Select the appropriate runq and add the 2881 * thread to it. This is the internal function called when the tdq is 2882 * predetermined. 2883 */ 2884 static int 2885 tdq_add(struct tdq *tdq, struct thread *td, int flags) 2886 { 2887 int lowpri; 2888 2889 TDQ_LOCK_ASSERT(tdq, MA_OWNED); 2890 THREAD_LOCK_BLOCKED_ASSERT(td, MA_OWNED); 2891 KASSERT((td->td_inhibitors == 0), 2892 ("sched_add: trying to run inhibited thread")); 2893 KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)), 2894 ("sched_add: bad thread state")); 2895 KASSERT(td->td_flags & TDF_INMEM, 2896 ("sched_add: thread swapped out")); 2897 2898 lowpri = tdq->tdq_lowpri; 2899 if (td->td_priority < lowpri) 2900 tdq->tdq_lowpri = td->td_priority; 2901 tdq_runq_add(tdq, td, flags); 2902 tdq_load_add(tdq, td); 2903 return (lowpri); 2904 } 2905 2906 /* 2907 * Select the target thread queue and add a thread to it. Request 2908 * preemption or IPI a remote processor if required. 2909 * 2910 * Requires the thread lock on entry, drops on exit. 
2911 */ 2912 void 2913 sched_add(struct thread *td, int flags) 2914 { 2915 struct tdq *tdq; 2916 #ifdef SMP 2917 int cpu, lowpri; 2918 #endif 2919 2920 KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add", 2921 "prio:%d", td->td_priority, KTR_ATTR_LINKED, 2922 sched_tdname(curthread)); 2923 KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup", 2924 KTR_ATTR_LINKED, sched_tdname(td)); 2925 SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL, 2926 flags & SRQ_PREEMPTED); 2927 THREAD_LOCK_ASSERT(td, MA_OWNED); 2928 /* 2929 * Recalculate the priority before we select the target cpu or 2930 * run-queue. 2931 */ 2932 if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE) 2933 sched_priority(td); 2934 #ifdef SMP 2935 /* 2936 * Pick the destination cpu and if it isn't ours transfer to the 2937 * target cpu. 2938 */ 2939 cpu = sched_pickcpu(td, flags); 2940 tdq = sched_setcpu(td, cpu, flags); 2941 lowpri = tdq_add(tdq, td, flags); 2942 if (cpu != PCPU_GET(cpuid)) 2943 tdq_notify(tdq, lowpri); 2944 else if (!(flags & SRQ_YIELDING)) 2945 sched_setpreempt(td->td_priority); 2946 #else 2947 tdq = TDQ_SELF(); 2948 /* 2949 * Now that the thread is moving to the run-queue, set the lock 2950 * to the scheduler's lock. 2951 */ 2952 if (td->td_lock != TDQ_LOCKPTR(tdq)) { 2953 TDQ_LOCK(tdq); 2954 if ((flags & SRQ_HOLD) != 0) 2955 td->td_lock = TDQ_LOCKPTR(tdq); 2956 else 2957 thread_lock_set(td, TDQ_LOCKPTR(tdq)); 2958 } 2959 (void)tdq_add(tdq, td, flags); 2960 if (!(flags & SRQ_YIELDING)) 2961 sched_setpreempt(td->td_priority); 2962 #endif 2963 if (!(flags & SRQ_HOLDTD)) 2964 thread_unlock(td); 2965 } 2966 2967 /* 2968 * Remove a thread from a run-queue without running it. This is used 2969 * when we're stealing a thread from a remote queue. Otherwise all threads 2970 * exit by calling sched_exit_thread() and sched_throw() themselves. 2971 */ 2972 void 2973 sched_rem(struct thread *td) 2974 { 2975 struct tdq *tdq; 2976 2977 KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "runq rem", 2978 "prio:%d", td->td_priority); 2979 SDT_PROBE3(sched, , , dequeue, td, td->td_proc, NULL); 2980 tdq = TDQ_CPU(td_get_sched(td)->ts_cpu); 2981 TDQ_LOCK_ASSERT(tdq, MA_OWNED); 2982 MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); 2983 KASSERT(TD_ON_RUNQ(td), 2984 ("sched_rem: thread not on run queue")); 2985 tdq_runq_rem(tdq, td); 2986 tdq_load_rem(tdq, td); 2987 TD_SET_CAN_RUN(td); 2988 if (td->td_priority == tdq->tdq_lowpri) 2989 tdq_setlowpri(tdq, NULL); 2990 } 2991 2992 /* 2993 * Fetch cpu utilization information. Updates on demand. 2994 */ 2995 fixpt_t 2996 sched_pctcpu(struct thread *td) 2997 { 2998 struct td_sched *ts; 2999 u_int len; 3000 fixpt_t pctcpu; 3001 3002 THREAD_LOCK_ASSERT(td, MA_OWNED); 3003 ts = td_get_sched(td); 3004 sched_pctcpu_update(ts, TD_IS_RUNNING(td)); 3005 len = SCHED_TICK_LENGTH(ts); 3006 pctcpu = ((FSHIFT >= SCHED_TICK_SHIFT ? /* Resolved at compile-time. */ 3007 (SCHED_TICK_RUN_SHIFTED(ts) << (FSHIFT - SCHED_TICK_SHIFT)) : 3008 (SCHED_TICK_RUN_SHIFTED(ts) >> (SCHED_TICK_SHIFT - FSHIFT))) + 3009 len / 2) / len; 3010 return (pctcpu); 3011 } 3012 3013 /* 3014 * Enforce affinity settings for a thread. Called after adjustments to 3015 * cpumask. 
3016 */ 3017 void 3018 sched_affinity(struct thread *td) 3019 { 3020 #ifdef SMP 3021 struct td_sched *ts; 3022 3023 THREAD_LOCK_ASSERT(td, MA_OWNED); 3024 ts = td_get_sched(td); 3025 if (THREAD_CAN_SCHED(td, ts->ts_cpu)) 3026 return; 3027 if (TD_ON_RUNQ(td)) { 3028 sched_rem(td); 3029 sched_add(td, SRQ_BORING | SRQ_HOLDTD); 3030 return; 3031 } 3032 if (!TD_IS_RUNNING(td)) 3033 return; 3034 /* 3035 * Force a switch before returning to userspace. If the 3036 * target thread is not running locally send an ipi to force 3037 * the issue. 3038 */ 3039 ast_sched_locked(td, TDA_SCHED); 3040 if (td != curthread) 3041 ipi_cpu(ts->ts_cpu, IPI_PREEMPT); 3042 #endif 3043 } 3044 3045 /* 3046 * Bind a thread to a target cpu. 3047 */ 3048 void 3049 sched_bind(struct thread *td, int cpu) 3050 { 3051 struct td_sched *ts; 3052 3053 THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED); 3054 KASSERT(td == curthread, ("sched_bind: can only bind curthread")); 3055 ts = td_get_sched(td); 3056 if (ts->ts_flags & TSF_BOUND) 3057 sched_unbind(td); 3058 KASSERT(THREAD_CAN_MIGRATE(td), ("%p must be migratable", td)); 3059 ts->ts_flags |= TSF_BOUND; 3060 sched_pin(); 3061 if (PCPU_GET(cpuid) == cpu) 3062 return; 3063 ts->ts_cpu = cpu; 3064 /* When we return from mi_switch we'll be on the correct cpu. */ 3065 mi_switch(SW_VOL | SWT_BIND); 3066 thread_lock(td); 3067 } 3068 3069 /* 3070 * Release a bound thread. 3071 */ 3072 void 3073 sched_unbind(struct thread *td) 3074 { 3075 struct td_sched *ts; 3076 3077 THREAD_LOCK_ASSERT(td, MA_OWNED); 3078 KASSERT(td == curthread, ("sched_unbind: can only bind curthread")); 3079 ts = td_get_sched(td); 3080 if ((ts->ts_flags & TSF_BOUND) == 0) 3081 return; 3082 ts->ts_flags &= ~TSF_BOUND; 3083 sched_unpin(); 3084 } 3085 3086 int 3087 sched_is_bound(struct thread *td) 3088 { 3089 THREAD_LOCK_ASSERT(td, MA_OWNED); 3090 return (td_get_sched(td)->ts_flags & TSF_BOUND); 3091 } 3092 3093 /* 3094 * Basic yield call. 3095 */ 3096 void 3097 sched_relinquish(struct thread *td) 3098 { 3099 thread_lock(td); 3100 mi_switch(SW_VOL | SWT_RELINQUISH); 3101 } 3102 3103 /* 3104 * Return the total system load. 3105 */ 3106 int 3107 sched_load(void) 3108 { 3109 #ifdef SMP 3110 int total; 3111 int i; 3112 3113 total = 0; 3114 CPU_FOREACH(i) 3115 total += atomic_load_int(&TDQ_CPU(i)->tdq_sysload); 3116 return (total); 3117 #else 3118 return (atomic_load_int(&TDQ_SELF()->tdq_sysload)); 3119 #endif 3120 } 3121 3122 int 3123 sched_sizeof_proc(void) 3124 { 3125 return (sizeof(struct proc)); 3126 } 3127 3128 int 3129 sched_sizeof_thread(void) 3130 { 3131 return (sizeof(struct thread) + sizeof(struct td_sched)); 3132 } 3133 3134 #ifdef SMP 3135 #define TDQ_IDLESPIN(tdq) \ 3136 ((tdq)->tdq_cg != NULL && ((tdq)->tdq_cg->cg_flags & CG_FLAG_THREAD) == 0) 3137 #else 3138 #define TDQ_IDLESPIN(tdq) 1 3139 #endif 3140 3141 /* 3142 * The actual idle process. 
3143 */ 3144 void 3145 sched_idletd(void *dummy) 3146 { 3147 struct thread *td; 3148 struct tdq *tdq; 3149 int oldswitchcnt, switchcnt; 3150 int i; 3151 3152 mtx_assert(&Giant, MA_NOTOWNED); 3153 td = curthread; 3154 tdq = TDQ_SELF(); 3155 THREAD_NO_SLEEPING(); 3156 oldswitchcnt = -1; 3157 for (;;) { 3158 if (TDQ_LOAD(tdq)) { 3159 thread_lock(td); 3160 mi_switch(SW_VOL | SWT_IDLE); 3161 } 3162 switchcnt = TDQ_SWITCHCNT(tdq); 3163 #ifdef SMP 3164 if (always_steal || switchcnt != oldswitchcnt) { 3165 oldswitchcnt = switchcnt; 3166 if (tdq_idled(tdq) == 0) 3167 continue; 3168 } 3169 switchcnt = TDQ_SWITCHCNT(tdq); 3170 #else 3171 oldswitchcnt = switchcnt; 3172 #endif 3173 /* 3174 * If we're switching very frequently, spin while checking 3175 * for load rather than entering a low power state that 3176 * may require an IPI. However, don't do any busy 3177 * loops while on SMT machines as this simply steals 3178 * cycles from cores doing useful work. 3179 */ 3180 if (TDQ_IDLESPIN(tdq) && switchcnt > sched_idlespinthresh) { 3181 for (i = 0; i < sched_idlespins; i++) { 3182 if (TDQ_LOAD(tdq)) 3183 break; 3184 cpu_spinwait(); 3185 } 3186 } 3187 3188 /* If there was context switch during spin, restart it. */ 3189 switchcnt = TDQ_SWITCHCNT(tdq); 3190 if (TDQ_LOAD(tdq) != 0 || switchcnt != oldswitchcnt) 3191 continue; 3192 3193 /* Run main MD idle handler. */ 3194 atomic_store_int(&tdq->tdq_cpu_idle, 1); 3195 /* 3196 * Make sure that the tdq_cpu_idle update is globally visible 3197 * before cpu_idle() reads tdq_load. The order is important 3198 * to avoid races with tdq_notify(). 3199 */ 3200 atomic_thread_fence_seq_cst(); 3201 /* 3202 * Checking for again after the fence picks up assigned 3203 * threads often enough to make it worthwhile to do so in 3204 * order to avoid calling cpu_idle(). 3205 */ 3206 if (TDQ_LOAD(tdq) != 0) { 3207 atomic_store_int(&tdq->tdq_cpu_idle, 0); 3208 continue; 3209 } 3210 cpu_idle(switchcnt * 4 > sched_idlespinthresh); 3211 atomic_store_int(&tdq->tdq_cpu_idle, 0); 3212 3213 /* 3214 * Account thread-less hardware interrupts and 3215 * other wakeup reasons equal to context switches. 3216 */ 3217 switchcnt = TDQ_SWITCHCNT(tdq); 3218 if (switchcnt != oldswitchcnt) 3219 continue; 3220 TDQ_SWITCHCNT_INC(tdq); 3221 oldswitchcnt++; 3222 } 3223 } 3224 3225 /* 3226 * sched_throw_grab() chooses a thread from the queue to switch to 3227 * next. It returns with the tdq lock dropped in a spinlock section to 3228 * keep interrupts disabled until the CPU is running in a proper threaded 3229 * context. 3230 */ 3231 static struct thread * 3232 sched_throw_grab(struct tdq *tdq) 3233 { 3234 struct thread *newtd; 3235 3236 newtd = choosethread(); 3237 spinlock_enter(); 3238 TDQ_UNLOCK(tdq); 3239 KASSERT(curthread->td_md.md_spinlock_count == 1, 3240 ("invalid count %d", curthread->td_md.md_spinlock_count)); 3241 return (newtd); 3242 } 3243 3244 /* 3245 * A CPU is entering for the first time. 3246 */ 3247 void 3248 sched_ap_entry(void) 3249 { 3250 struct thread *newtd; 3251 struct tdq *tdq; 3252 3253 tdq = TDQ_SELF(); 3254 3255 /* This should have been setup in schedinit_ap(). */ 3256 THREAD_LOCKPTR_ASSERT(curthread, TDQ_LOCKPTR(tdq)); 3257 3258 TDQ_LOCK(tdq); 3259 /* Correct spinlock nesting. 
*/ 3260 spinlock_exit(); 3261 PCPU_SET(switchtime, cpu_ticks()); 3262 PCPU_SET(switchticks, ticks); 3263 3264 newtd = sched_throw_grab(tdq); 3265 3266 #ifdef HWT_HOOKS 3267 HWT_CALL_HOOK(newtd, HWT_SWITCH_IN, NULL); 3268 #endif 3269 3270 /* doesn't return */ 3271 cpu_throw(NULL, newtd); 3272 } 3273 3274 /* 3275 * A thread is exiting. 3276 */ 3277 void 3278 sched_throw(struct thread *td) 3279 { 3280 struct thread *newtd; 3281 struct tdq *tdq; 3282 3283 tdq = TDQ_SELF(); 3284 3285 MPASS(td != NULL); 3286 THREAD_LOCK_ASSERT(td, MA_OWNED); 3287 THREAD_LOCKPTR_ASSERT(td, TDQ_LOCKPTR(tdq)); 3288 3289 tdq_load_rem(tdq, td); 3290 td->td_lastcpu = td->td_oncpu; 3291 td->td_oncpu = NOCPU; 3292 thread_lock_block(td); 3293 3294 newtd = sched_throw_grab(tdq); 3295 3296 #ifdef HWT_HOOKS 3297 HWT_CALL_HOOK(newtd, HWT_SWITCH_IN, NULL); 3298 #endif 3299 3300 /* doesn't return */ 3301 cpu_switch(td, newtd, TDQ_LOCKPTR(tdq)); 3302 } 3303 3304 /* 3305 * This is called from fork_exit(). Just acquire the correct locks and 3306 * let fork do the rest of the work. 3307 */ 3308 void 3309 sched_fork_exit(struct thread *td) 3310 { 3311 struct tdq *tdq; 3312 int cpuid; 3313 3314 /* 3315 * Finish setting up thread glue so that it begins execution in a 3316 * non-nested critical section with the scheduler lock held. 3317 */ 3318 KASSERT(curthread->td_md.md_spinlock_count == 1, 3319 ("invalid count %d", curthread->td_md.md_spinlock_count)); 3320 cpuid = PCPU_GET(cpuid); 3321 tdq = TDQ_SELF(); 3322 TDQ_LOCK(tdq); 3323 spinlock_exit(); 3324 MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); 3325 td->td_oncpu = cpuid; 3326 KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running", 3327 "prio:%d", td->td_priority); 3328 SDT_PROBE0(sched, , , on__cpu); 3329 } 3330 3331 /* 3332 * Create on first use to catch odd startup conditions. 3333 */ 3334 char * 3335 sched_tdname(struct thread *td) 3336 { 3337 #ifdef KTR 3338 struct td_sched *ts; 3339 3340 ts = td_get_sched(td); 3341 if (ts->ts_name[0] == '\0') 3342 snprintf(ts->ts_name, sizeof(ts->ts_name), 3343 "%s tid %d", td->td_name, td->td_tid); 3344 return (ts->ts_name); 3345 #else 3346 return (td->td_name); 3347 #endif 3348 } 3349 3350 #ifdef KTR 3351 void 3352 sched_clear_tdname(struct thread *td) 3353 { 3354 struct td_sched *ts; 3355 3356 ts = td_get_sched(td); 3357 ts->ts_name[0] = '\0'; 3358 } 3359 #endif 3360 3361 #ifdef SMP 3362 3363 /* 3364 * Build the CPU topology dump string. Is recursively called to collect 3365 * the topology tree. 
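 *
 * Purely as an illustration (the levels, masks and flags depend entirely on
 * the hardware that was detected), the resulting dump has the general shape:
 *
 *   <groups>
 *    <group level="1" cache-level="0">
 *     <cpu count="4" mask="f">0, 1, 2, 3</cpu>
 *     <children>
 *      <group level="2" cache-level="2">
 *       ...
 *      </group>
 *     </children>
 *    </group>
 *   </groups>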
3366 */ 3367 static int 3368 sysctl_kern_sched_topology_spec_internal(struct sbuf *sb, struct cpu_group *cg, 3369 int indent) 3370 { 3371 char cpusetbuf[CPUSETBUFSIZ]; 3372 int i, first; 3373 3374 sbuf_printf(sb, "%*s<group level=\"%d\" cache-level=\"%d\">\n", indent, 3375 "", 1 + indent / 2, cg->cg_level); 3376 sbuf_printf(sb, "%*s <cpu count=\"%d\" mask=\"%s\">", indent, "", 3377 cg->cg_count, cpusetobj_strprint(cpusetbuf, &cg->cg_mask)); 3378 first = TRUE; 3379 for (i = cg->cg_first; i <= cg->cg_last; i++) { 3380 if (CPU_ISSET(i, &cg->cg_mask)) { 3381 if (!first) 3382 sbuf_cat(sb, ", "); 3383 else 3384 first = FALSE; 3385 sbuf_printf(sb, "%d", i); 3386 } 3387 } 3388 sbuf_cat(sb, "</cpu>\n"); 3389 3390 if (cg->cg_flags != 0) { 3391 sbuf_printf(sb, "%*s <flags>", indent, ""); 3392 if ((cg->cg_flags & CG_FLAG_HTT) != 0) 3393 sbuf_cat(sb, "<flag name=\"HTT\">HTT group</flag>"); 3394 if ((cg->cg_flags & CG_FLAG_THREAD) != 0) 3395 sbuf_cat(sb, "<flag name=\"THREAD\">THREAD group</flag>"); 3396 if ((cg->cg_flags & CG_FLAG_SMT) != 0) 3397 sbuf_cat(sb, "<flag name=\"SMT\">SMT group</flag>"); 3398 if ((cg->cg_flags & CG_FLAG_NODE) != 0) 3399 sbuf_cat(sb, "<flag name=\"NODE\">NUMA node</flag>"); 3400 sbuf_cat(sb, "</flags>\n"); 3401 } 3402 3403 if (cg->cg_children > 0) { 3404 sbuf_printf(sb, "%*s <children>\n", indent, ""); 3405 for (i = 0; i < cg->cg_children; i++) 3406 sysctl_kern_sched_topology_spec_internal(sb, 3407 &cg->cg_child[i], indent+2); 3408 sbuf_printf(sb, "%*s </children>\n", indent, ""); 3409 } 3410 sbuf_printf(sb, "%*s</group>\n", indent, ""); 3411 return (0); 3412 } 3413 3414 /* 3415 * Sysctl handler for retrieving topology dump. It's a wrapper for 3416 * the recursive sysctl_kern_smp_topology_spec_internal(). 3417 */ 3418 static int 3419 sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS) 3420 { 3421 struct sbuf *topo; 3422 int err; 3423 3424 KASSERT(cpu_top != NULL, ("cpu_top isn't initialized")); 3425 3426 topo = sbuf_new_for_sysctl(NULL, NULL, 512, req); 3427 if (topo == NULL) 3428 return (ENOMEM); 3429 3430 sbuf_cat(topo, "<groups>\n"); 3431 err = sysctl_kern_sched_topology_spec_internal(topo, cpu_top, 1); 3432 sbuf_cat(topo, "</groups>\n"); 3433 3434 if (err == 0) { 3435 err = sbuf_finish(topo); 3436 } 3437 sbuf_delete(topo); 3438 return (err); 3439 } 3440 3441 #endif 3442 3443 static int 3444 sysctl_kern_quantum(SYSCTL_HANDLER_ARGS) 3445 { 3446 int error, new_val, period; 3447 3448 period = 1000000 / realstathz; 3449 new_val = period * sched_slice; 3450 error = sysctl_handle_int(oidp, &new_val, 0, req); 3451 if (error != 0 || req->newptr == NULL) 3452 return (error); 3453 if (new_val <= 0) 3454 return (EINVAL); 3455 sched_slice = imax(1, (new_val + period / 2) / period); 3456 sched_slice_min = sched_slice / SCHED_SLICE_MIN_DIVISOR; 3457 hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) / 3458 realstathz); 3459 return (0); 3460 } 3461 3462 SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 3463 "Scheduler"); 3464 SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ULE", 0, 3465 "Scheduler name"); 3466 SYSCTL_PROC(_kern_sched, OID_AUTO, quantum, 3467 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, 3468 sysctl_kern_quantum, "I", 3469 "Quantum for timeshare threads in microseconds"); 3470 SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0, 3471 "Quantum for timeshare threads in stathz ticks"); 3472 SYSCTL_UINT(_kern_sched, OID_AUTO, interact, CTLFLAG_RWTUN, &sched_interact, 0, 3473 "Interactivity score threshold"); 3474 
SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RWTUN, 3475 &preempt_thresh, 0, 3476 "Maximal (lowest) priority for preemption"); 3477 SYSCTL_INT(_kern_sched, OID_AUTO, static_boost, CTLFLAG_RWTUN, &static_boost, 0, 3478 "Assign static kernel priorities to sleeping threads"); 3479 SYSCTL_INT(_kern_sched, OID_AUTO, idlespins, CTLFLAG_RWTUN, &sched_idlespins, 0, 3480 "Number of times idle thread will spin waiting for new work"); 3481 SYSCTL_INT(_kern_sched, OID_AUTO, idlespinthresh, CTLFLAG_RW, 3482 &sched_idlespinthresh, 0, 3483 "Threshold before we will permit idle thread spinning"); 3484 #ifdef SMP 3485 SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0, 3486 "Number of hz ticks to keep thread affinity for"); 3487 SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RWTUN, &rebalance, 0, 3488 "Enables the long-term load balancer"); 3489 SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW, 3490 &balance_interval, 0, 3491 "Average period in stathz ticks to run the long-term balancer"); 3492 SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, CTLFLAG_RWTUN, &steal_idle, 0, 3493 "Attempts to steal work from other cores before idling"); 3494 SYSCTL_INT(_kern_sched, OID_AUTO, steal_thresh, CTLFLAG_RWTUN, &steal_thresh, 0, 3495 "Minimum load on remote CPU before we'll steal"); 3496 SYSCTL_INT(_kern_sched, OID_AUTO, trysteal_limit, CTLFLAG_RWTUN, 3497 &trysteal_limit, 0, 3498 "Topological distance limit for stealing threads in sched_switch()"); 3499 SYSCTL_INT(_kern_sched, OID_AUTO, always_steal, CTLFLAG_RWTUN, &always_steal, 0, 3500 "Always run the stealer from the idle thread"); 3501 SYSCTL_PROC(_kern_sched, OID_AUTO, topology_spec, CTLTYPE_STRING | 3502 CTLFLAG_MPSAFE | CTLFLAG_RD, NULL, 0, sysctl_kern_sched_topology_spec, "A", 3503 "XML dump of detected CPU topology"); 3504 #endif 3505 3506 /* ps compat. All cpu percentages from ULE are weighted. */ 3507 static int ccpu = 0; 3508 SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, 3509 "Decay factor used for updating %CPU in 4BSD scheduler"); 3510
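
/*
 * Worked example of the quantum/slice conversion performed by
 * sysctl_kern_quantum() above, assuming a stathz of 127 and a
 * SCHED_SLICE_DEFAULT_DIVISOR of 10 (both values are configuration
 * dependent): sched_slice defaults to 127 / 10 = 12 stathz ticks, so
 * reading kern.sched.quantum reports (1000000 / 127) * 12 = 94488
 * microseconds, and writing kern.sched.quantum=20000 stores
 * imax(1, (20000 + 7874 / 2) / 7874) = 3 ticks back into sched_slice.
 */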