/*-
 * Copyright (c) 2002-2007, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * This file implements the ULE scheduler.  ULE supports independent CPU
 * run queues and fine-grained locking.  It has superior interactive
 * performance under load even on uni-processor systems.
 *
 * etymology:
 *   ULE is the last three letters in schedule.  It owes its name to a
 * generic user created for a scheduling system by Paul Mikesell at
 * Isilon Systems and a general lack of creativity on the part of the author.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_hwpmc_hooks.h"
#include "opt_sched.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/turnstile.h>
#include <sys/umtx.h>
#include <sys/vmmeter.h>
#ifdef KTRACE
#include <sys/uio.h>
#include <sys/ktrace.h>
#endif

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

#include <machine/cpu.h>
#include <machine/smp.h>

#ifndef PREEMPTION
#error "SCHED_ULE requires options PREEMPTION"
#endif

#define KTR_ULE 0

/*
 * Thread scheduler specific section.  All fields are protected
 * by the thread lock.
 */
struct td_sched {
        TAILQ_ENTRY(td_sched) ts_procq; /* Run queue. */
        struct thread *ts_thread;       /* Active associated thread. */
        struct runq *ts_runq;           /* Run-queue we're queued on. */
        short ts_flags;                 /* TSF_* flags. */
        u_char ts_rqindex;              /* Run queue index. */
        u_char ts_cpu;                  /* CPU that we have affinity for. */
        int ts_slice;                   /* Ticks of slice remaining. */
        u_int ts_slptime;               /* Number of ticks we vol. slept */
        u_int ts_runtime;               /* Number of ticks we were running */
        /* The following variables are only used for pctcpu calculation */
        int ts_ltick;                   /* Last tick that we were running on */
        int ts_ftick;                   /* First tick that we were running on */
        int ts_ticks;                   /* Tick count */
#ifdef SMP
        int ts_rltick;                  /* Real last tick, for affinity. */
#endif
};
/* flags kept in ts_flags */
#define TSF_BOUND       0x0001          /* Thread can not migrate. */
#define TSF_XFERABLE    0x0002          /* Thread was added as transferable. */

static struct td_sched td_sched0;

/*
 * Cpu percentage computation macros and defines.
 *
 * SCHED_TICK_SECS:     Number of seconds to average the cpu usage across.
 * SCHED_TICK_TARG:     Number of hz ticks to average the cpu usage across.
 * SCHED_TICK_MAX:      Maximum number of ticks before scaling back.
 * SCHED_TICK_SHIFT:    Shift factor to avoid rounding away results.
 * SCHED_TICK_HZ:       Compute the number of hz ticks for a given ticks count.
 * SCHED_TICK_TOTAL:    Gives the amount of time we've been recording ticks.
 */
#define SCHED_TICK_SECS 10
#define SCHED_TICK_TARG (hz * SCHED_TICK_SECS)
#define SCHED_TICK_MAX (SCHED_TICK_TARG + hz)
#define SCHED_TICK_SHIFT 10
#define SCHED_TICK_HZ(ts) ((ts)->ts_ticks >> SCHED_TICK_SHIFT)
#define SCHED_TICK_TOTAL(ts) (max((ts)->ts_ltick - (ts)->ts_ftick, hz))

/*
 * These macros determine priorities for non-interactive threads.  They are
 * assigned a priority based on their recent cpu utilization as expressed
 * by the ratio of ticks to the tick total.  NHALF priorities at the start
 * and end of the MIN to MAX timeshare range are only reachable with negative
 * or positive nice respectively.
 *
 * PRI_RANGE:   Priority range for utilization dependent priorities.
 * PRI_NRESV:   Number of nice values.
 * PRI_TICKS:   Compute a priority in PRI_RANGE from the ticks count and total.
 * PRI_NICE:    Determines the part of the priority inherited from nice.
 */
#define SCHED_PRI_NRESV (PRIO_MAX - PRIO_MIN)
#define SCHED_PRI_NHALF (SCHED_PRI_NRESV / 2)
#define SCHED_PRI_MIN (PRI_MIN_TIMESHARE + SCHED_PRI_NHALF)
#define SCHED_PRI_MAX (PRI_MAX_TIMESHARE - SCHED_PRI_NHALF)
#define SCHED_PRI_RANGE (SCHED_PRI_MAX - SCHED_PRI_MIN)
#define SCHED_PRI_TICKS(ts) \
        (SCHED_TICK_HZ((ts)) / \
        (roundup(SCHED_TICK_TOTAL((ts)), SCHED_PRI_RANGE) / SCHED_PRI_RANGE))
#define SCHED_PRI_NICE(nice) (nice)

/*
 * These determine the interactivity of a process.  Interactivity differs from
 * cpu utilization in that it expresses the voluntary time slept vs time ran
 * while cpu utilization includes all time not running.  This more accurately
 * models the intent of the thread.
 *
 * SLP_RUN_MAX: Maximum amount of sleep time + run time we'll accumulate
 *              before throttling back.
 * SLP_RUN_FORK:        Maximum slp+run time to inherit at fork time.
 * INTERACT_MAX:        Maximum interactivity value.  Smaller is better.
 * INTERACT_THRESH:     Threshold for placement on the current runq.
 */
#define SCHED_SLP_RUN_MAX ((hz * 5) << SCHED_TICK_SHIFT)
#define SCHED_SLP_RUN_FORK ((hz / 2) << SCHED_TICK_SHIFT)
#define SCHED_INTERACT_MAX (100)
#define SCHED_INTERACT_HALF (SCHED_INTERACT_MAX / 2)
#define SCHED_INTERACT_THRESH (30)
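
/*
 * Example (illustrative, using only the macros above): a timeshare thread
 * that was runnable for the entire averaging window but ran on the CPU
 * about half of that time has SCHED_TICK_HZ(ts) of roughly half of
 * SCHED_TICK_TOTAL(ts).  SCHED_PRI_TICKS() then works out to about
 * SCHED_PRI_RANGE / 2, placing it midway between SCHED_PRI_MIN and
 * SCHED_PRI_MAX before SCHED_PRI_NICE() shifts it by the nice value
 * (-20..20).  A fully cpu-bound thread lands near SCHED_PRI_MAX instead.
 */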

/*
 * tickincr:            Converts a stathz tick into a hz domain scaled by
 *                      the shift factor.  Without the shift the error rate
 *                      due to rounding would be unacceptably high.
 * realstathz:          stathz is sometimes 0 and we run off of hz.
 * sched_slice:         Runtime of each thread before rescheduling.
 * preempt_thresh:      Priority threshold for preemption and remote IPIs.
 */
static int sched_interact = SCHED_INTERACT_THRESH;
static int realstathz;
static int tickincr;
static int sched_slice;
static int preempt_thresh = PRI_MIN_KERN;

/*
 * tdq - per processor runqs and statistics.  All fields are protected by the
 * tdq_lock.  The load and lowpri may be accessed without the lock to avoid
 * excess locking in sched_pickcpu().
 */
struct tdq {
        struct mtx *tdq_lock;           /* Pointer to group lock. */
        struct runq tdq_realtime;       /* real-time run queue. */
        struct runq tdq_timeshare;      /* timeshare run queue. */
        struct runq tdq_idle;           /* Queue of IDLE threads. */
        int tdq_load;                   /* Aggregate load. */
        u_char tdq_idx;                 /* Current insert index. */
        u_char tdq_ridx;                /* Current removal index. */
#ifdef SMP
        u_char tdq_lowpri;              /* Lowest priority thread. */
        int tdq_transferable;           /* Transferable thread count. */
        LIST_ENTRY(tdq) tdq_siblings;   /* Next in tdq group. */
        struct tdq_group *tdq_group;    /* Our processor group. */
#else
        int tdq_sysload;                /* For loadavg, !ITHD load. */
#endif
} __aligned(64);


#ifdef SMP
/*
 * tdq groups are groups of processors which can cheaply share threads.  When
 * one processor in the group goes idle it will check the runqs of the other
 * processors in its group prior to halting and waiting for an interrupt.
 * These groups are suitable for SMT (Symmetric Multi-Threading) and not NUMA.
 * In a NUMA environment we'd want an idle bitmap per group and a two tiered
 * load balancer.
 */
struct tdq_group {
        struct mtx tdg_lock;            /* Protects all fields below. */
        int tdg_cpus;                   /* Count of CPUs in this tdq group. */
        cpumask_t tdg_cpumask;          /* Mask of cpus in this group. */
        cpumask_t tdg_idlemask;         /* Idle cpus in this group. */
        cpumask_t tdg_mask;             /* Bit mask for first cpu. */
        int tdg_load;                   /* Total load of this group. */
        int tdg_transferable;           /* Transferable load of this group. */
        LIST_HEAD(, tdq) tdg_members;   /* Linked list of all members. */
        char tdg_name[16];              /* lock name. */
} __aligned(64);

#define SCHED_AFFINITY_DEFAULT (max(1, hz / 300))
#define SCHED_AFFINITY(ts) ((ts)->ts_rltick > ticks - affinity)

/*
 * Run-time tunables.
 */
static int rebalance = 1;
static int balance_secs = 1;
static int pick_pri = 1;
static int affinity;
static int tryself = 1;
static int steal_htt = 0;
static int steal_idle = 1;
static int steal_thresh = 2;
static int topology = 0;

/*
 * One thread queue per processor.
 */
static volatile cpumask_t tdq_idle;
static int tdg_maxid;
static struct tdq tdq_cpu[MAXCPU];
static struct tdq_group tdq_groups[MAXCPU];
static struct callout balco;
static struct callout gbalco;

#define TDQ_SELF()      (&tdq_cpu[PCPU_GET(cpuid)])
#define TDQ_CPU(x)      (&tdq_cpu[(x)])
#define TDQ_ID(x)       ((int)((x) - tdq_cpu))
#define TDQ_GROUP(x)    (&tdq_groups[(x)])
#define TDG_ID(x)       ((int)((x) - tdq_groups))
#else   /* !SMP */
static struct tdq tdq_cpu;
static struct mtx tdq_lock;

#define TDQ_ID(x)       (0)
#define TDQ_SELF()      (&tdq_cpu)
#define TDQ_CPU(x)      (&tdq_cpu)
#endif

#define TDQ_LOCK_ASSERT(t, type)        mtx_assert(TDQ_LOCKPTR((t)), (type))
#define TDQ_LOCK(t)             mtx_lock_spin(TDQ_LOCKPTR((t)))
#define TDQ_LOCK_FLAGS(t, f)    mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f))
#define TDQ_UNLOCK(t)           mtx_unlock_spin(TDQ_LOCKPTR((t)))
#define TDQ_LOCKPTR(t)          ((t)->tdq_lock)

static void sched_priority(struct thread *);
static void sched_thread_priority(struct thread *, u_char);
static int sched_interact_score(struct thread *);
static void sched_interact_update(struct thread *);
static void sched_interact_fork(struct thread *);
static void sched_pctcpu_update(struct td_sched *);

/* Operations on per processor queues */
static struct td_sched *tdq_choose(struct tdq *);
static void tdq_setup(struct tdq *);
static void tdq_load_add(struct tdq *, struct td_sched *);
static void tdq_load_rem(struct tdq *, struct td_sched *);
static __inline void tdq_runq_add(struct tdq *, struct td_sched *, int);
static __inline void tdq_runq_rem(struct tdq *, struct td_sched *);
void tdq_print(int cpu);
static void runq_print(struct runq *rq);
static void tdq_add(struct tdq *, struct thread *, int);
#ifdef SMP
static void tdq_move(struct tdq *, struct tdq *);
static int tdq_idled(struct tdq *);
static void tdq_notify(struct td_sched *);
static struct td_sched *tdq_steal(struct tdq *, int);
static struct td_sched *runq_steal(struct runq *);
static int sched_pickcpu(struct td_sched *, int);
static void sched_balance(void *);
static void sched_balance_groups(void *);
static void sched_balance_group(struct tdq_group *);
static void sched_balance_pair(struct tdq *, struct tdq *);
static inline struct tdq *sched_setcpu(struct td_sched *, int, int);
static inline struct mtx *thread_block_switch(struct thread *);
static inline void thread_unblock_switch(struct thread *, struct mtx *);
static struct mtx *sched_switch_migrate(struct tdq *, struct thread *, int);

#define THREAD_CAN_MIGRATE(td)  ((td)->td_pinned == 0)
#endif

static void sched_setup(void *dummy);
SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)

static void sched_initticks(void *dummy);
SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, NULL)

/*
 * Print the threads waiting on a run-queue.
 */
static void
runq_print(struct runq *rq)
{
        struct rqhead *rqh;
        struct td_sched *ts;
        int pri;
        int j;
        int i;

        for (i = 0; i < RQB_LEN; i++) {
                printf("\t\trunq bits %d 0x%zx\n",
                    i, rq->rq_status.rqb_bits[i]);
                for (j = 0; j < RQB_BPW; j++)
                        if (rq->rq_status.rqb_bits[i] & (1ul << j)) {
                                pri = j + (i << RQB_L2BPW);
                                rqh = &rq->rq_queues[pri];
                                TAILQ_FOREACH(ts, rqh, ts_procq) {
                                        printf("\t\t\ttd %p(%s) priority %d rqindex %d pri %d\n",
                                            ts->ts_thread,
                                            ts->ts_thread->td_proc->p_comm,
                                            ts->ts_thread->td_priority,
                                            ts->ts_rqindex, pri);
                                }
                        }
        }
}

/*
 * Print the status of a per-cpu thread queue.  Should be a ddb show cmd.
 */
void
tdq_print(int cpu)
{
        struct tdq *tdq;

        tdq = TDQ_CPU(cpu);

        printf("tdq %d:\n", TDQ_ID(tdq));
        printf("\tlockptr %p\n", TDQ_LOCKPTR(tdq));
        printf("\tload: %d\n", tdq->tdq_load);
        printf("\ttimeshare idx: %d\n", tdq->tdq_idx);
        printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx);
        printf("\trealtime runq:\n");
        runq_print(&tdq->tdq_realtime);
        printf("\ttimeshare runq:\n");
        runq_print(&tdq->tdq_timeshare);
        printf("\tidle runq:\n");
        runq_print(&tdq->tdq_idle);
#ifdef SMP
        printf("\tload transferable: %d\n", tdq->tdq_transferable);
        printf("\tlowest priority: %d\n", tdq->tdq_lowpri);
        printf("\tgroup: %d\n", TDG_ID(tdq->tdq_group));
        printf("\tLock name: %s\n", tdq->tdq_group->tdg_name);
#endif
}

#define TS_RQ_PPQ (((PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) + 1) / RQ_NQS)
/*
 * Add a thread to the actual run-queue.  Keeps transferable counts up to
 * date with what is actually on the run-queue.  Selects the correct
 * queue position for timeshare threads.
 */
static __inline void
tdq_runq_add(struct tdq *tdq, struct td_sched *ts, int flags)
{
        TDQ_LOCK_ASSERT(tdq, MA_OWNED);
        THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);
#ifdef SMP
        if (THREAD_CAN_MIGRATE(ts->ts_thread)) {
                tdq->tdq_transferable++;
                tdq->tdq_group->tdg_transferable++;
                ts->ts_flags |= TSF_XFERABLE;
        }
#endif
        if (ts->ts_runq == &tdq->tdq_timeshare) {
                u_char pri;

                pri = ts->ts_thread->td_priority;
                KASSERT(pri <= PRI_MAX_TIMESHARE && pri >= PRI_MIN_TIMESHARE,
                    ("Invalid priority %d on timeshare runq", pri));
                /*
                 * This queue contains only priorities between MIN and MAX
                 * timeshare.  Use the whole queue to represent these values.
                 */
                if ((flags & (SRQ_BORROWING|SRQ_PREEMPTED)) == 0) {
                        pri = (pri - PRI_MIN_TIMESHARE) / TS_RQ_PPQ;
                        pri = (pri + tdq->tdq_idx) % RQ_NQS;
                        /*
                         * This effectively shortens the queue by one so we
                         * can have a one slot difference between idx and
                         * ridx while we wait for threads to drain.
                         */
                        if (tdq->tdq_ridx != tdq->tdq_idx &&
                            pri == tdq->tdq_ridx)
                                pri = (unsigned char)(pri - 1) % RQ_NQS;
                } else
                        pri = tdq->tdq_ridx;
                runq_add_pri(ts->ts_runq, ts, pri, flags);
        } else
                runq_add(ts->ts_runq, ts, flags);
}
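
/*
 * Example of the calendar insert above (illustrative, assuming the stock
 * 64-entry runq and a 64-priority timeshare range, so TS_RQ_PPQ is 1): a
 * thread's scaled priority is its offset from PRI_MIN_TIMESHARE.  If
 * tdq_idx is 10 and the scaled priority is 5, the thread is queued in
 * bucket (5 + 10) % 64 = 15, while a scaled priority of 40 lands in
 * bucket 50.  As sched_clock() advances tdq_idx/tdq_ridx one bucket per
 * tick, lower priority threads simply wait more ticks before the removal
 * index reaches their bucket.
 */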

/*
 * Remove a thread from a run-queue.  This typically happens when a thread
 * is selected to run.  Running threads are not on the queue and the
 * transferable count does not reflect them.
 */
static __inline void
tdq_runq_rem(struct tdq *tdq, struct td_sched *ts)
{
        TDQ_LOCK_ASSERT(tdq, MA_OWNED);
        KASSERT(ts->ts_runq != NULL,
            ("tdq_runq_remove: thread %p null ts_runq", ts->ts_thread));
#ifdef SMP
        if (ts->ts_flags & TSF_XFERABLE) {
                tdq->tdq_transferable--;
                tdq->tdq_group->tdg_transferable--;
                ts->ts_flags &= ~TSF_XFERABLE;
        }
#endif
        if (ts->ts_runq == &tdq->tdq_timeshare) {
                if (tdq->tdq_idx != tdq->tdq_ridx)
                        runq_remove_idx(ts->ts_runq, ts, &tdq->tdq_ridx);
                else
                        runq_remove_idx(ts->ts_runq, ts, NULL);
                /*
                 * For timeshare threads we update the priority here so
                 * the priority reflects the time we've been sleeping.
                 */
                ts->ts_ltick = ticks;
                sched_pctcpu_update(ts);
                sched_priority(ts->ts_thread);
        } else
                runq_remove(ts->ts_runq, ts);
}

/*
 * Load is maintained for all threads RUNNING and ON_RUNQ.  Add the load
 * for this thread to the referenced thread queue.
 */
static void
tdq_load_add(struct tdq *tdq, struct td_sched *ts)
{
        int class;

        TDQ_LOCK_ASSERT(tdq, MA_OWNED);
        THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);
        class = PRI_BASE(ts->ts_thread->td_pri_class);
        tdq->tdq_load++;
        CTR2(KTR_SCHED, "cpu %d load: %d", TDQ_ID(tdq), tdq->tdq_load);
        if (class != PRI_ITHD &&
            (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
#ifdef SMP
                tdq->tdq_group->tdg_load++;
#else
                tdq->tdq_sysload++;
#endif
}

/*
 * Remove the load from a thread that is transitioning to a sleep state or
 * exiting.
 */
static void
tdq_load_rem(struct tdq *tdq, struct td_sched *ts)
{
        int class;

        THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);
        TDQ_LOCK_ASSERT(tdq, MA_OWNED);
        class = PRI_BASE(ts->ts_thread->td_pri_class);
        if (class != PRI_ITHD &&
            (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
#ifdef SMP
                tdq->tdq_group->tdg_load--;
#else
                tdq->tdq_sysload--;
#endif
        KASSERT(tdq->tdq_load != 0,
            ("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq)));
        tdq->tdq_load--;
        CTR1(KTR_SCHED, "load: %d", tdq->tdq_load);
        ts->ts_runq = NULL;
}

#ifdef SMP
/*
 * sched_balance is a simple CPU load balancing algorithm.  It operates by
 * finding the least loaded and most loaded cpu and equalizing their load
 * by migrating some processes.
 *
 * Dealing only with two CPUs at a time has two advantages.  Firstly, most
 * installations will only have 2 cpus.  Secondly, load balancing too much at
 * once can have an unpleasant effect on the system.  The scheduler rarely has
 * enough information to make perfect decisions.  So this algorithm chooses
 * simplicity and more gradual effects on load in larger systems.
 */
static void
sched_balance(void *arg)
{
        struct tdq_group *high;
        struct tdq_group *low;
        struct tdq_group *tdg;
        int cnt;
        int i;

        callout_reset(&balco, max(hz / 2, random() % (hz * balance_secs)),
            sched_balance, NULL);
        if (smp_started == 0 || rebalance == 0)
                return;
        low = high = NULL;
        i = random() % (tdg_maxid + 1);
        for (cnt = 0; cnt <= tdg_maxid; cnt++) {
                tdg = TDQ_GROUP(i);
                /*
                 * Find the CPU with the highest load that has some
                 * threads to transfer.
                 */
                if ((high == NULL || tdg->tdg_load > high->tdg_load) &&
                    tdg->tdg_transferable)
                        high = tdg;
                if (low == NULL || tdg->tdg_load < low->tdg_load)
                        low = tdg;
                if (++i > tdg_maxid)
                        i = 0;
        }
        if (low != NULL && high != NULL && high != low)
                sched_balance_pair(LIST_FIRST(&high->tdg_members),
                    LIST_FIRST(&low->tdg_members));
}

/*
 * Balance load between CPUs in a group.  Will only migrate within the group.
 */
static void
sched_balance_groups(void *arg)
{
        int i;

        callout_reset(&gbalco, max(hz / 2, random() % (hz * balance_secs)),
            sched_balance_groups, NULL);
        if (smp_started == 0 || rebalance == 0)
                return;
        for (i = 0; i <= tdg_maxid; i++)
                sched_balance_group(TDQ_GROUP(i));
}

/*
 * Finds the greatest imbalance between two tdqs in a group.
 */
static void
sched_balance_group(struct tdq_group *tdg)
{
        struct tdq *tdq;
        struct tdq *high;
        struct tdq *low;
        int load;

        if (tdg->tdg_transferable == 0)
                return;
        low = NULL;
        high = NULL;
        LIST_FOREACH(tdq, &tdg->tdg_members, tdq_siblings) {
                load = tdq->tdq_load;
                if (high == NULL || load > high->tdq_load)
                        high = tdq;
                if (low == NULL || load < low->tdq_load)
                        low = tdq;
        }
        if (high != NULL && low != NULL && high != low)
                sched_balance_pair(high, low);
}

/*
 * Lock two thread queues using their address to maintain lock order.
 */
static void
tdq_lock_pair(struct tdq *one, struct tdq *two)
{
        if (one < two) {
                TDQ_LOCK(one);
                TDQ_LOCK_FLAGS(two, MTX_DUPOK);
        } else {
                TDQ_LOCK(two);
                TDQ_LOCK_FLAGS(one, MTX_DUPOK);
        }
}

/*
 * Transfer load between two imbalanced thread queues.
 */
static void
sched_balance_pair(struct tdq *high, struct tdq *low)
{
        int transferable;
        int high_load;
        int low_load;
        int move;
        int diff;
        int i;

        tdq_lock_pair(high, low);
        /*
         * If we're transferring within a group we have to use this specific
         * tdq's transferable count, otherwise we can steal from other members
         * of the group.
         */
        if (high->tdq_group == low->tdq_group) {
                transferable = high->tdq_transferable;
                high_load = high->tdq_load;
                low_load = low->tdq_load;
        } else {
                transferable = high->tdq_group->tdg_transferable;
                high_load = high->tdq_group->tdg_load;
                low_load = low->tdq_group->tdg_load;
        }
        /*
         * Determine what the imbalance is and then adjust that to how many
         * threads we actually have to give up (transferable).
         */
        if (transferable != 0) {
                diff = high_load - low_load;
                move = diff / 2;
                if (diff & 0x1)
                        move++;
                move = min(move, transferable);
                for (i = 0; i < move; i++)
                        tdq_move(high, low);
        }
        TDQ_UNLOCK(high);
        TDQ_UNLOCK(low);
        return;
}
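
/*
 * Worked example of the pair balancing above (illustrative): if the high
 * side reports a load of 7 and the low side a load of 2, diff is 5 and
 * move rounds up to 3, so up to three threads migrate; if only two of
 * them are transferable, move is clamped to 2.  Rounding up guarantees
 * that a one-thread imbalance still triggers a migration.
 */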

/*
 * Move a thread from one thread queue to another.
 */
static void
tdq_move(struct tdq *from, struct tdq *to)
{
        struct td_sched *ts;
        struct thread *td;
        struct tdq *tdq;
        int cpu;

        tdq = from;
        cpu = TDQ_ID(to);
        ts = tdq_steal(tdq, 1);
        if (ts == NULL) {
                struct tdq_group *tdg;

                tdg = tdq->tdq_group;
                LIST_FOREACH(tdq, &tdg->tdg_members, tdq_siblings) {
                        if (tdq == from || tdq->tdq_transferable == 0)
                                continue;
                        ts = tdq_steal(tdq, 1);
                        break;
                }
                if (ts == NULL)
                        return;
        }
        if (tdq == to)
                return;
        td = ts->ts_thread;
        /*
         * Although the run queue is locked the thread may be blocked.  Lock
         * it to clear this.
         */
        thread_lock(td);
        /* Drop recursive lock on from. */
        TDQ_UNLOCK(from);
        sched_rem(td);
        ts->ts_cpu = cpu;
        td->td_lock = TDQ_LOCKPTR(to);
        tdq_add(to, td, SRQ_YIELDING);
        tdq_notify(ts);
}

/*
 * This tdq has idled.  Try to steal a thread from another cpu and switch
 * to it.
 */
static int
tdq_idled(struct tdq *tdq)
{
        struct tdq_group *tdg;
        struct tdq *steal;
        struct td_sched *ts;
        struct thread *td;
        int highload;
        int highcpu;
        int load;
        int cpu;

        /* We don't want to be preempted while we're iterating over tdqs */
        spinlock_enter();
        tdg = tdq->tdq_group;
        /*
         * If we're in a cpu group, try and steal threads from another cpu in
         * the group before idling.
         */
        if (steal_htt && tdg->tdg_cpus > 1 && tdg->tdg_transferable) {
                LIST_FOREACH(steal, &tdg->tdg_members, tdq_siblings) {
                        if (steal == tdq || steal->tdq_transferable == 0)
                                continue;
                        TDQ_LOCK(steal);
                        ts = tdq_steal(steal, 0);
                        if (ts)
                                goto steal;
                        TDQ_UNLOCK(steal);
                }
        }
        for (;;) {
                if (steal_idle == 0)
                        break;
                highcpu = 0;
                highload = 0;
                for (cpu = 0; cpu <= mp_maxid; cpu++) {
                        if (CPU_ABSENT(cpu))
                                continue;
                        steal = TDQ_CPU(cpu);
                        load = TDQ_CPU(cpu)->tdq_transferable;
                        if (load < highload)
                                continue;
                        highload = load;
                        highcpu = cpu;
                }
                if (highload < steal_thresh)
                        break;
                steal = TDQ_CPU(highcpu);
                TDQ_LOCK(steal);
                if (steal->tdq_transferable >= steal_thresh &&
                    (ts = tdq_steal(steal, 1)) != NULL)
                        goto steal;
                TDQ_UNLOCK(steal);
                break;
        }
        spinlock_exit();
        return (1);
steal:
        td = ts->ts_thread;
        thread_lock(td);
        spinlock_exit();
        MPASS(td->td_lock == TDQ_LOCKPTR(steal));
        TDQ_UNLOCK(steal);
        sched_rem(td);
        sched_setcpu(ts, PCPU_GET(cpuid), SRQ_YIELDING);
        tdq_add(tdq, td, SRQ_YIELDING);
        MPASS(td->td_lock == curthread->td_lock);
        mi_switch(SW_VOL, NULL);
        thread_unlock(curthread);

        return (0);
}

/*
 * Notify a remote cpu of new work.  Sends an IPI if criteria are met.
 */
static void
tdq_notify(struct td_sched *ts)
{
        struct thread *ctd;
        struct pcpu *pcpu;
        int cpri;
        int pri;
        int cpu;

        cpu = ts->ts_cpu;
        pri = ts->ts_thread->td_priority;
        pcpu = pcpu_find(cpu);
        ctd = pcpu->pc_curthread;
        cpri = ctd->td_priority;

        /*
         * If our priority is not better than the current priority there is
         * nothing to do.
         */
        if (pri > cpri)
                return;
        /*
         * Always IPI idle.
         */
        if (cpri > PRI_MIN_IDLE)
                goto sendipi;
        /*
         * If we're realtime or better and there is timeshare or worse running
         * send an IPI.
         */
        if (pri < PRI_MAX_REALTIME && cpri > PRI_MAX_REALTIME)
                goto sendipi;
        /*
         * Otherwise only IPI if we exceed the threshold.
         */
        if (pri > preempt_thresh)
                return;
sendipi:
        ctd->td_flags |= TDF_NEEDRESCHED;
        ipi_selected(1 << cpu, IPI_PREEMPT);
}
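
/*
 * Example of the IPI decision above (illustrative; lower numbers are
 * better priorities): waking a timeshare thread onto a cpu running its
 * idle thread always sends an IPI, and waking a realtime thread onto a
 * cpu running a timeshare thread does as well.  A timeshare thread
 * waking onto a cpu already running another timeshare thread of worse
 * priority only sends an IPI when the new priority is numerically at or
 * below preempt_thresh (PRI_MIN_KERN by default); ordinary user threads
 * are simply left on the remote run queue until that cpu reschedules on
 * its own.
 */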

/*
 * Steals load from a timeshare queue.  Honors the rotating queue head
 * index.
 */
static struct td_sched *
runq_steal_from(struct runq *rq, u_char start)
{
        struct td_sched *ts;
        struct rqbits *rqb;
        struct rqhead *rqh;
        int first;
        int bit;
        int pri;
        int i;

        rqb = &rq->rq_status;
        bit = start & (RQB_BPW - 1);
        pri = 0;
        first = 0;
again:
        for (i = RQB_WORD(start); i < RQB_LEN; bit = 0, i++) {
                if (rqb->rqb_bits[i] == 0)
                        continue;
                if (bit != 0) {
                        for (pri = bit; pri < RQB_BPW; pri++)
                                if (rqb->rqb_bits[i] & (1ul << pri))
                                        break;
                        if (pri >= RQB_BPW)
                                continue;
                } else
                        pri = RQB_FFS(rqb->rqb_bits[i]);
                pri += (i << RQB_L2BPW);
                rqh = &rq->rq_queues[pri];
                TAILQ_FOREACH(ts, rqh, ts_procq) {
                        if (first && THREAD_CAN_MIGRATE(ts->ts_thread))
                                return (ts);
                        first = 1;
                }
        }
        if (start != 0) {
                start = 0;
                goto again;
        }

        return (NULL);
}

/*
 * Steals load from a standard linear queue.
 */
static struct td_sched *
runq_steal(struct runq *rq)
{
        struct rqhead *rqh;
        struct rqbits *rqb;
        struct td_sched *ts;
        int word;
        int bit;

        rqb = &rq->rq_status;
        for (word = 0; word < RQB_LEN; word++) {
                if (rqb->rqb_bits[word] == 0)
                        continue;
                for (bit = 0; bit < RQB_BPW; bit++) {
                        if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
                                continue;
                        rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
                        TAILQ_FOREACH(ts, rqh, ts_procq)
                                if (THREAD_CAN_MIGRATE(ts->ts_thread))
                                        return (ts);
                }
        }
        return (NULL);
}

/*
 * Attempt to steal a thread in priority order from a thread queue.
 */
static struct td_sched *
tdq_steal(struct tdq *tdq, int stealidle)
{
        struct td_sched *ts;

        TDQ_LOCK_ASSERT(tdq, MA_OWNED);
        if ((ts = runq_steal(&tdq->tdq_realtime)) != NULL)
                return (ts);
        if ((ts = runq_steal_from(&tdq->tdq_timeshare, tdq->tdq_ridx)) != NULL)
                return (ts);
        if (stealidle)
                return (runq_steal(&tdq->tdq_idle));
        return (NULL);
}

/*
 * Sets the thread lock and ts_cpu to match the requested cpu.  Unlocks the
 * current lock and returns with the assigned queue locked.  If this is
 * via sched_switch() we leave the thread in a blocked state as an
 * optimization.
 */
static inline struct tdq *
sched_setcpu(struct td_sched *ts, int cpu, int flags)
{
        struct thread *td;
        struct tdq *tdq;

        THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);

        tdq = TDQ_CPU(cpu);
        td = ts->ts_thread;
        ts->ts_cpu = cpu;

        /* If the lock matches just return the queue. */
        if (td->td_lock == TDQ_LOCKPTR(tdq))
                return (tdq);
#ifdef notyet
        /*
         * If the thread isn't running its lockptr is a
         * turnstile or a sleepqueue.  We can just lock_set without
         * blocking.
         */
        if (TD_CAN_RUN(td)) {
                TDQ_LOCK(tdq);
                thread_lock_set(td, TDQ_LOCKPTR(tdq));
                return (tdq);
        }
#endif
        /*
         * The hard case, migration, we need to block the thread first to
         * prevent order reversals with other cpus' locks.
         */
        thread_lock_block(td);
        TDQ_LOCK(tdq);
        thread_lock_unblock(td, TDQ_LOCKPTR(tdq));
        return (tdq);
}

/*
 * Find the thread queue running the lowest priority thread.
 */
static int
tdq_lowestpri(void)
{
        struct tdq *tdq;
        int lowpri;
        int lowcpu;
        int lowload;
        int load;
        int cpu;
        int pri;

        lowload = 0;
        lowpri = lowcpu = 0;
        for (cpu = 0; cpu <= mp_maxid; cpu++) {
                if (CPU_ABSENT(cpu))
                        continue;
                tdq = TDQ_CPU(cpu);
                pri = tdq->tdq_lowpri;
                load = TDQ_CPU(cpu)->tdq_load;
                CTR4(KTR_ULE,
                    "cpu %d pri %d lowcpu %d lowpri %d",
                    cpu, pri, lowcpu, lowpri);
                if (pri < lowpri)
                        continue;
                if (lowpri && lowpri == pri && load > lowload)
                        continue;
                lowpri = pri;
                lowcpu = cpu;
                lowload = load;
        }

        return (lowcpu);
}

/*
 * Find the thread queue with the least load.
 */
static int
tdq_lowestload(void)
{
        struct tdq *tdq;
        int lowload;
        int lowpri;
        int lowcpu;
        int load;
        int cpu;
        int pri;

        lowcpu = 0;
        lowload = TDQ_CPU(0)->tdq_load;
        lowpri = TDQ_CPU(0)->tdq_lowpri;
        for (cpu = 1; cpu <= mp_maxid; cpu++) {
                if (CPU_ABSENT(cpu))
                        continue;
                tdq = TDQ_CPU(cpu);
                load = tdq->tdq_load;
                pri = tdq->tdq_lowpri;
                CTR4(KTR_ULE, "cpu %d load %d lowcpu %d lowload %d",
                    cpu, load, lowcpu, lowload);
                if (load > lowload)
                        continue;
                if (load == lowload && pri < lowpri)
                        continue;
                lowcpu = cpu;
                lowload = load;
                lowpri = pri;
        }

        return (lowcpu);
}

/*
 * Pick the destination cpu for sched_add().  Respects affinity and makes
 * a determination based on load or priority of available processors.
 */
static int
sched_pickcpu(struct td_sched *ts, int flags)
{
        struct tdq *tdq;
        int self;
        int pri;
        int cpu;

        cpu = self = PCPU_GET(cpuid);
        if (smp_started == 0)
                return (self);
        /*
         * Don't migrate a running thread from sched_switch().
         */
        if (flags & SRQ_OURSELF) {
                CTR1(KTR_ULE, "YIELDING %d",
                    curthread->td_priority);
                return (self);
        }
        pri = ts->ts_thread->td_priority;
        cpu = ts->ts_cpu;
        /*
         * Regardless of affinity, if the last cpu is idle send it there.
         */
        tdq = TDQ_CPU(cpu);
        if (tdq->tdq_lowpri > PRI_MIN_IDLE) {
                CTR5(KTR_ULE,
                    "ts_cpu %d idle, ltick %d ticks %d pri %d curthread %d",
                    ts->ts_cpu, ts->ts_rltick, ticks, pri,
                    tdq->tdq_lowpri);
                return (ts->ts_cpu);
        }
        /*
         * If we have affinity, try to place it on the cpu we last ran on.
         */
        if (SCHED_AFFINITY(ts) && tdq->tdq_lowpri > pri) {
                CTR5(KTR_ULE,
                    "affinity for %d, ltick %d ticks %d pri %d curthread %d",
                    ts->ts_cpu, ts->ts_rltick, ticks, pri,
                    tdq->tdq_lowpri);
                return (ts->ts_cpu);
        }
        /*
         * Look for an idle group.
         */
        CTR1(KTR_ULE, "tdq_idle %X", tdq_idle);
        cpu = ffs(tdq_idle);
        if (cpu)
                return (--cpu);
        /*
         * If there are no idle cores see if we can run the thread locally.
         * This may improve locality among sleepers and wakers when there
         * is shared data.
         */
        if (tryself && pri < curthread->td_priority) {
                CTR1(KTR_ULE, "tryself %d",
                    curthread->td_priority);
                return (self);
        }
        /*
         * Now search for the cpu running the lowest priority thread with
         * the least load.
         */
        if (pick_pri)
                cpu = tdq_lowestpri();
        else
                cpu = tdq_lowestload();
        return (cpu);
}

#endif  /* SMP */
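
/*
 * Summary of the placement policy above (descriptive): a wakeup first
 * reuses the last cpu if that cpu is idle, then honors short-term
 * affinity when the last cpu is running something of worse priority,
 * then prefers any fully idle group, then the current cpu if the waking
 * thread would preempt it, and only then falls back to a global scan by
 * lowest priority (pick_pri) or lowest load.
 */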

/*
 * Pick the highest priority task we have and return it.
 */
static struct td_sched *
tdq_choose(struct tdq *tdq)
{
        struct td_sched *ts;

        TDQ_LOCK_ASSERT(tdq, MA_OWNED);
        ts = runq_choose(&tdq->tdq_realtime);
        if (ts != NULL)
                return (ts);
        ts = runq_choose_from(&tdq->tdq_timeshare, tdq->tdq_ridx);
        if (ts != NULL) {
                KASSERT(ts->ts_thread->td_priority >= PRI_MIN_TIMESHARE,
                    ("tdq_choose: Invalid priority on timeshare queue %d",
                    ts->ts_thread->td_priority));
                return (ts);
        }

        ts = runq_choose(&tdq->tdq_idle);
        if (ts != NULL) {
                KASSERT(ts->ts_thread->td_priority >= PRI_MIN_IDLE,
                    ("tdq_choose: Invalid priority on idle queue %d",
                    ts->ts_thread->td_priority));
                return (ts);
        }

        return (NULL);
}

/*
 * Initialize a thread queue.
 */
static void
tdq_setup(struct tdq *tdq)
{

        if (bootverbose)
                printf("ULE: setup cpu %d\n", TDQ_ID(tdq));
        runq_init(&tdq->tdq_realtime);
        runq_init(&tdq->tdq_timeshare);
        runq_init(&tdq->tdq_idle);
        tdq->tdq_load = 0;
}

#ifdef SMP
static void
tdg_setup(struct tdq_group *tdg)
{
        if (bootverbose)
                printf("ULE: setup cpu group %d\n", TDG_ID(tdg));
        snprintf(tdg->tdg_name, sizeof(tdg->tdg_name),
            "sched lock %d", (int)TDG_ID(tdg));
        mtx_init(&tdg->tdg_lock, tdg->tdg_name, "sched lock",
            MTX_SPIN | MTX_RECURSE);
        LIST_INIT(&tdg->tdg_members);
        tdg->tdg_load = 0;
        tdg->tdg_transferable = 0;
        tdg->tdg_cpus = 0;
        tdg->tdg_mask = 0;
        tdg->tdg_cpumask = 0;
        tdg->tdg_idlemask = 0;
}

static void
tdg_add(struct tdq_group *tdg, struct tdq *tdq)
{
        if (tdg->tdg_mask == 0)
                tdg->tdg_mask |= 1 << TDQ_ID(tdq);
        tdg->tdg_cpumask |= 1 << TDQ_ID(tdq);
        tdg->tdg_cpus++;
        tdq->tdq_group = tdg;
        tdq->tdq_lock = &tdg->tdg_lock;
        LIST_INSERT_HEAD(&tdg->tdg_members, tdq, tdq_siblings);
        if (bootverbose)
                printf("ULE: adding cpu %d to group %d: cpus %d mask 0x%X\n",
                    TDQ_ID(tdq), TDG_ID(tdg), tdg->tdg_cpus, tdg->tdg_cpumask);
}

static void
sched_setup_topology(void)
{
        struct tdq_group *tdg;
        struct cpu_group *cg;
        int balance_groups;
        struct tdq *tdq;
        int i;
        int j;

        topology = 1;
        balance_groups = 0;
        for (i = 0; i < smp_topology->ct_count; i++) {
                cg = &smp_topology->ct_group[i];
                tdg = &tdq_groups[i];
                /*
                 * Initialize the group.
                 */
                tdg_setup(tdg);
                /*
                 * Find all of the group members and add them.
                 */
                for (j = 0; j < MAXCPU; j++) {
                        if ((cg->cg_mask & (1 << j)) != 0) {
                                tdq = TDQ_CPU(j);
                                tdq_setup(tdq);
                                tdg_add(tdg, tdq);
                        }
                }
                if (tdg->tdg_cpus > 1)
                        balance_groups = 1;
        }
        tdg_maxid = smp_topology->ct_count - 1;
        if (balance_groups)
                sched_balance_groups(NULL);
}

static void
sched_setup_smp(void)
{
        struct tdq_group *tdg;
        struct tdq *tdq;
        int cpus;
        int i;

        for (cpus = 0, i = 0; i < MAXCPU; i++) {
                if (CPU_ABSENT(i))
                        continue;
                tdq = &tdq_cpu[i];
                tdg = &tdq_groups[i];
                /*
                 * Setup a tdq group with one member.
                 */
                tdg_setup(tdg);
                tdq_setup(tdq);
                tdg_add(tdg, tdq);
                cpus++;
        }
        tdg_maxid = cpus - 1;
}

/*
 * Fake a topology with one group containing all CPUs.
 */
static void
sched_fake_topo(void)
{
#ifdef SCHED_FAKE_TOPOLOGY
        static struct cpu_top top;
        static struct cpu_group group;

        top.ct_count = 1;
        top.ct_group = &group;
        group.cg_mask = all_cpus;
        group.cg_count = mp_ncpus;
        group.cg_children = 0;
        smp_topology = &top;
#endif
}
#endif

/*
 * Setup the thread queues and initialize the topology based on MD
 * information.
 */
static void
sched_setup(void *dummy)
{
        struct tdq *tdq;

        tdq = TDQ_SELF();
#ifdef SMP
        /*
         * Initialize long-term cpu balancing algorithm.
         */
        callout_init(&balco, CALLOUT_MPSAFE);
        callout_init(&gbalco, CALLOUT_MPSAFE);
        sched_fake_topo();
        /*
         * Setup tdqs based on a topology configuration or vanilla SMP based
         * on mp_maxid.
         */
        if (smp_topology == NULL)
                sched_setup_smp();
        else
                sched_setup_topology();
        sched_balance(NULL);
#else
        tdq_setup(tdq);
        mtx_init(&tdq_lock, "sched lock", "sched lock", MTX_SPIN | MTX_RECURSE);
        tdq->tdq_lock = &tdq_lock;
#endif
        /*
         * To avoid divide-by-zero, set realstathz to a dummy value in case
         * sched_clock() is called before sched_initticks().
         */
        realstathz = hz;
        sched_slice = (realstathz / 10);        /* ~100ms */
        tickincr = 1 << SCHED_TICK_SHIFT;

        /* Add thread0's load since it's running. */
        TDQ_LOCK(tdq);
        thread0.td_lock = TDQ_LOCKPTR(TDQ_SELF());
        tdq_load_add(tdq, &td_sched0);
        TDQ_UNLOCK(tdq);
}

/*
 * This routine determines the tickincr after stathz and hz are setup.
 */
/* ARGSUSED */
static void
sched_initticks(void *dummy)
{
        int incr;

        realstathz = stathz ? stathz : hz;
        sched_slice = (realstathz / 10);        /* ~100ms */

        /*
         * tickincr is shifted out by 10 to avoid rounding errors due to
         * hz not being evenly divisible by stathz on all platforms.
         */
        incr = (hz << SCHED_TICK_SHIFT) / realstathz;
        /*
         * This does not work for values of stathz that are more than
         * 1 << SCHED_TICK_SHIFT * hz.  In practice this does not happen.
         */
        if (incr == 0)
                incr = 1;
        tickincr = incr;
#ifdef SMP
        /*
         * Set steal thresh to log2(mp_ncpus) but no greater than 4.  This
         * prevents excess thrashing on large machines and excess idle on
         * smaller machines.
         */
        steal_thresh = min(ffs(mp_ncpus) - 1, 4);
        affinity = SCHED_AFFINITY_DEFAULT;
#endif
}
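
/*
 * Example (illustrative, assuming hz = 1000 and stathz = 127): the fixed
 * point increment is (1000 << 10) / 127, about 8062, i.e. roughly 7.87 hz
 * ticks are charged per stathz tick, and sched_slice becomes 12 stathz
 * ticks, which is on the order of 100ms of runtime per slice.
 */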

/*
 * This is the core of the interactivity algorithm.  Determines a score based
 * on past behavior.  It is the ratio of sleep time to run time scaled to
 * a [0, 100] integer.  This is the voluntary sleep time of a process, which
 * differs from the cpu usage because it does not account for time spent
 * waiting on a run-queue.  Would be prettier if we had floating point.
 */
static int
sched_interact_score(struct thread *td)
{
        struct td_sched *ts;
        int div;

        ts = td->td_sched;
        /*
         * The score is only needed if this is likely to be an interactive
         * task.  Don't go through the expense of computing it if there's
         * no chance.
         */
        if (sched_interact <= SCHED_INTERACT_HALF &&
            ts->ts_runtime >= ts->ts_slptime)
                return (SCHED_INTERACT_HALF);

        if (ts->ts_runtime > ts->ts_slptime) {
                div = max(1, ts->ts_runtime / SCHED_INTERACT_HALF);
                return (SCHED_INTERACT_HALF +
                    (SCHED_INTERACT_HALF - (ts->ts_slptime / div)));
        }
        if (ts->ts_slptime > ts->ts_runtime) {
                div = max(1, ts->ts_slptime / SCHED_INTERACT_HALF);
                return (ts->ts_runtime / div);
        }
        /* runtime == slptime */
        if (ts->ts_runtime)
                return (SCHED_INTERACT_HALF);

        /*
         * This can happen if slptime and runtime are 0.
         */
        return (0);

}

/*
 * Scale the scheduling priority according to the "interactivity" of this
 * process.
 */
static void
sched_priority(struct thread *td)
{
        int score;
        int pri;

        if (td->td_pri_class != PRI_TIMESHARE)
                return;
        /*
         * If the score is interactive we place the thread in the realtime
         * queue with a priority that is less than kernel and interrupt
         * priorities.  These threads are not subject to nice restrictions.
         *
         * Scores greater than this are placed on the normal timeshare queue
         * where the priority is partially decided by the most recent cpu
         * utilization and the rest is decided by nice value.
         */
        score = sched_interact_score(td);
        if (score < sched_interact) {
                pri = PRI_MIN_REALTIME;
                pri += ((PRI_MAX_REALTIME - PRI_MIN_REALTIME) / sched_interact)
                    * score;
                KASSERT(pri >= PRI_MIN_REALTIME && pri <= PRI_MAX_REALTIME,
                    ("sched_priority: invalid interactive priority %d score %d",
                    pri, score));
        } else {
                pri = SCHED_PRI_MIN;
                if (td->td_sched->ts_ticks)
                        pri += SCHED_PRI_TICKS(td->td_sched);
                pri += SCHED_PRI_NICE(td->td_proc->p_nice);
                KASSERT(pri >= PRI_MIN_TIMESHARE && pri <= PRI_MAX_TIMESHARE,
                    ("sched_priority: invalid priority %d: nice %d, "
                    "ticks %d ftick %d ltick %d tick pri %d",
                    pri, td->td_proc->p_nice, td->td_sched->ts_ticks,
                    td->td_sched->ts_ftick, td->td_sched->ts_ltick,
                    SCHED_PRI_TICKS(td->td_sched)));
        }
        sched_user_prio(td, pri);

        return;
}
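
/*
 * Worked example of the scoring above (illustrative): a thread that has
 * voluntarily slept three times as long as it has run scores roughly
 * 50 * runtime / slptime, about 16, which is under the default threshold
 * of 30, so sched_priority() places it in the realtime queue.  A thread
 * that has run three times as long as it has slept scores roughly
 * 100 - 50 * slptime / runtime, about 83, and goes to the timeshare
 * queue where recent cpu use and nice decide its priority.
 */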

/*
 * This routine enforces a maximum limit on the amount of scheduling history
 * kept.  It is called after either the slptime or runtime is adjusted.  This
 * function is ugly due to integer math.
 */
static void
sched_interact_update(struct thread *td)
{
        struct td_sched *ts;
        u_int sum;

        ts = td->td_sched;
        sum = ts->ts_runtime + ts->ts_slptime;
        if (sum < SCHED_SLP_RUN_MAX)
                return;
        /*
         * This only happens from two places:
         * 1) We have added an unusual amount of run time from fork_exit.
         * 2) We have added an unusual amount of sleep time from sched_sleep().
         */
        if (sum > SCHED_SLP_RUN_MAX * 2) {
                if (ts->ts_runtime > ts->ts_slptime) {
                        ts->ts_runtime = SCHED_SLP_RUN_MAX;
                        ts->ts_slptime = 1;
                } else {
                        ts->ts_slptime = SCHED_SLP_RUN_MAX;
                        ts->ts_runtime = 1;
                }
                return;
        }
        /*
         * If we have exceeded by more than 1/5th then the algorithm below
         * will not bring us back into range.  Dividing by two here forces
         * us into the range of [4/5 * SCHED_SLP_RUN_MAX, SCHED_SLP_RUN_MAX].
         */
        if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
                ts->ts_runtime /= 2;
                ts->ts_slptime /= 2;
                return;
        }
        ts->ts_runtime = (ts->ts_runtime / 5) * 4;
        ts->ts_slptime = (ts->ts_slptime / 5) * 4;
}

/*
 * Scale back the interactivity history when a child thread is created.  The
 * history is inherited from the parent but the thread may behave totally
 * differently.  For example, a shell spawning a compiler process.  We want
 * to learn that the compiler is behaving badly very quickly.
 */
static void
sched_interact_fork(struct thread *td)
{
        int ratio;
        int sum;

        sum = td->td_sched->ts_runtime + td->td_sched->ts_slptime;
        if (sum > SCHED_SLP_RUN_FORK) {
                ratio = sum / SCHED_SLP_RUN_FORK;
                td->td_sched->ts_runtime /= ratio;
                td->td_sched->ts_slptime /= ratio;
        }
}

/*
 * Called from proc0_init() to setup the scheduler fields.
 */
void
schedinit(void)
{

        /*
         * Set up the scheduler specific parts of proc0.
         */
        proc0.p_sched = NULL; /* XXX */
        thread0.td_sched = &td_sched0;
        td_sched0.ts_ltick = ticks;
        td_sched0.ts_ftick = ticks;
        td_sched0.ts_thread = &thread0;
}

/*
 * This is only somewhat accurate since given many processes of the same
 * priority they will switch when their slices run out, which will be
 * at most sched_slice stathz ticks.
 */
int
sched_rr_interval(void)
{

        /* Convert sched_slice to hz */
        return (hz / (realstathz / sched_slice));
}

/*
 * Update the percent cpu tracking information when it is requested or
 * the total history exceeds the maximum.  We keep a sliding history of
 * tick counts that slowly decays.  This is less precise than the 4BSD
 * mechanism since it happens with less regular and frequent events.
 */
static void
sched_pctcpu_update(struct td_sched *ts)
{

        if (ts->ts_ticks == 0)
                return;
        if (ticks - (hz / 10) < ts->ts_ltick &&
            SCHED_TICK_TOTAL(ts) < SCHED_TICK_MAX)
                return;
        /*
         * Adjust counters and watermark for pctcpu calc.
         */
        if (ts->ts_ltick > ticks - SCHED_TICK_TARG)
                ts->ts_ticks = (ts->ts_ticks / (ticks - ts->ts_ftick)) *
                    SCHED_TICK_TARG;
        else
                ts->ts_ticks = 0;
        ts->ts_ltick = ticks;
        ts->ts_ftick = ts->ts_ltick - SCHED_TICK_TARG;
}
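
/*
 * Example of the rescaling above (illustrative, assuming hz = 1000): a
 * thread that has been charged about 50% cpu over an 11 second window has
 * ts_ticks of roughly 5500 << 10.  Dividing by the 11000-tick window and
 * multiplying by SCHED_TICK_TARG (10000) leaves roughly 5000 << 10 over a
 * window clamped back to 10 seconds, so the 50% estimate is preserved
 * while older history ages out.
 */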

/*
 * Adjust the priority of a thread.  Move it to the appropriate run-queue
 * if necessary.  This is the back-end for several priority related
 * functions.
 */
static void
sched_thread_priority(struct thread *td, u_char prio)
{
        struct td_sched *ts;

        CTR6(KTR_SCHED, "sched_prio: %p(%s) prio %d newprio %d by %p(%s)",
            td, td->td_proc->p_comm, td->td_priority, prio, curthread,
            curthread->td_proc->p_comm);
        ts = td->td_sched;
        THREAD_LOCK_ASSERT(td, MA_OWNED);
        if (td->td_priority == prio)
                return;

        if (TD_ON_RUNQ(td) && prio < td->td_priority) {
                /*
                 * If the priority has been elevated due to priority
                 * propagation, we may have to move ourselves to a new
                 * queue.  This could be optimized to not re-add in some
                 * cases.
                 */
                sched_rem(td);
                td->td_priority = prio;
                sched_add(td, SRQ_BORROWING);
        } else {
#ifdef SMP
                struct tdq *tdq;

                tdq = TDQ_CPU(ts->ts_cpu);
                if (prio < tdq->tdq_lowpri)
                        tdq->tdq_lowpri = prio;
#endif
                td->td_priority = prio;
        }
}

/*
 * Update a thread's priority when it is lent another thread's
 * priority.
 */
void
sched_lend_prio(struct thread *td, u_char prio)
{

        td->td_flags |= TDF_BORROWING;
        sched_thread_priority(td, prio);
}

/*
 * Restore a thread's priority when priority propagation is
 * over.  The prio argument is the minimum priority the thread
 * needs to have to satisfy other possible priority lending
 * requests.  If the thread's regular priority is less
 * important than prio, the thread will keep a priority boost
 * of prio.
 */
void
sched_unlend_prio(struct thread *td, u_char prio)
{
        u_char base_pri;

        if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
            td->td_base_pri <= PRI_MAX_TIMESHARE)
                base_pri = td->td_user_pri;
        else
                base_pri = td->td_base_pri;
        if (prio >= base_pri) {
                td->td_flags &= ~TDF_BORROWING;
                sched_thread_priority(td, base_pri);
        } else
                sched_lend_prio(td, prio);
}

/*
 * Standard entry for setting the priority to an absolute value.
 */
void
sched_prio(struct thread *td, u_char prio)
{
        u_char oldprio;

        /* First, update the base priority. */
        td->td_base_pri = prio;

        /*
         * If the thread is borrowing another thread's priority, don't
         * ever lower the priority.
         */
        if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
                return;

        /* Change the real priority. */
        oldprio = td->td_priority;
        sched_thread_priority(td, prio);

        /*
         * If the thread is on a turnstile, then let the turnstile update
         * its state.
         */
        if (TD_ON_LOCK(td) && oldprio != prio)
                turnstile_adjust(td, oldprio);
}

/*
 * Set the base user priority, does not affect current running priority.
 */
void
sched_user_prio(struct thread *td, u_char prio)
{
        u_char oldprio;

        td->td_base_user_pri = prio;
        if (td->td_flags & TDF_UBORROWING && td->td_user_pri <= prio)
                return;
        oldprio = td->td_user_pri;
        td->td_user_pri = prio;

        if (TD_ON_UPILOCK(td) && oldprio != prio)
                umtx_pi_adjust(td, oldprio);
}

void
sched_lend_user_prio(struct thread *td, u_char prio)
{
        u_char oldprio;

        td->td_flags |= TDF_UBORROWING;

        oldprio = td->td_user_pri;
        td->td_user_pri = prio;

        if (TD_ON_UPILOCK(td) && oldprio != prio)
                umtx_pi_adjust(td, oldprio);
}

void
sched_unlend_user_prio(struct thread *td, u_char prio)
{
        u_char base_pri;

        base_pri = td->td_base_user_pri;
        if (prio >= base_pri) {
                td->td_flags &= ~TDF_UBORROWING;
                sched_user_prio(td, base_pri);
        } else
                sched_lend_user_prio(td, prio);
}

/*
 * Add the thread passed as 'newtd' to the run queue before selecting
 * the next thread to run.  This is only used for KSE.
 */
static void
sched_switchin(struct tdq *tdq, struct thread *td)
{
#ifdef SMP
        spinlock_enter();
        TDQ_UNLOCK(tdq);
        thread_lock(td);
        spinlock_exit();
        sched_setcpu(td->td_sched, TDQ_ID(tdq), SRQ_YIELDING);
#else
        td->td_lock = TDQ_LOCKPTR(tdq);
#endif
        tdq_add(tdq, td, SRQ_YIELDING);
        MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
}

/*
 * Handle migration from sched_switch().  This happens only for
 * cpu binding.
 */
static struct mtx *
sched_switch_migrate(struct tdq *tdq, struct thread *td, int flags)
{
        struct tdq *tdn;

        tdn = TDQ_CPU(td->td_sched->ts_cpu);
#ifdef SMP
        /*
         * Do the lock dance required to avoid LOR.  We grab an extra
         * spinlock nesting to prevent preemption while we're
         * not holding either run-queue lock.
         */
        spinlock_enter();
        thread_block_switch(td);        /* This releases the lock on tdq. */
        TDQ_LOCK(tdn);
        tdq_add(tdn, td, flags);
        tdq_notify(td->td_sched);
        /*
         * After we unlock tdn the new cpu still can't switch into this
         * thread until we've unblocked it in cpu_switch().  The lock
         * pointers may match in the case of HTT cores.  Don't unlock here
         * or we can deadlock when the other CPU runs the IPI handler.
         */
        if (TDQ_LOCKPTR(tdn) != TDQ_LOCKPTR(tdq)) {
                TDQ_UNLOCK(tdn);
                TDQ_LOCK(tdq);
        }
        spinlock_exit();
#endif
        return (TDQ_LOCKPTR(tdn));
}

/*
 * Block a thread for switching.  Similar to thread_block() but does not
 * bump the spin count.
 */
static inline struct mtx *
thread_block_switch(struct thread *td)
{
        struct mtx *lock;

        THREAD_LOCK_ASSERT(td, MA_OWNED);
        lock = td->td_lock;
        td->td_lock = &blocked_lock;
        mtx_unlock_spin(lock);

        return (lock);
}

/*
 * Release a thread that was blocked with thread_block_switch().
 */
static inline void
thread_unblock_switch(struct thread *td, struct mtx *mtx)
{
        atomic_store_rel_ptr((volatile uintptr_t *)&td->td_lock,
            (uintptr_t)mtx);
}

/*
 * Switch threads.  This function has to handle threads coming in while
 * blocked for some reason, running, or idle.  It also must deal with
 * migrating a thread from one queue to another as running threads may
 * be assigned elsewhere via binding.
 */
void
sched_switch(struct thread *td, struct thread *newtd, int flags)
{
        struct tdq *tdq;
        struct td_sched *ts;
        struct mtx *mtx;
        int srqflag;
        int cpuid;

        THREAD_LOCK_ASSERT(td, MA_OWNED);

        cpuid = PCPU_GET(cpuid);
        tdq = TDQ_CPU(cpuid);
        ts = td->td_sched;
        mtx = td->td_lock;
#ifdef SMP
        ts->ts_rltick = ticks;
        if (newtd && newtd->td_priority < tdq->tdq_lowpri)
                tdq->tdq_lowpri = newtd->td_priority;
#endif
        td->td_lastcpu = td->td_oncpu;
        td->td_oncpu = NOCPU;
        td->td_flags &= ~TDF_NEEDRESCHED;
        td->td_owepreempt = 0;
        /*
         * The lock pointer in an idle thread should never change.  Reset it
         * to CAN_RUN as well.
         */
        if (TD_IS_IDLETHREAD(td)) {
                MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
                TD_SET_CAN_RUN(td);
        } else if (TD_IS_RUNNING(td)) {
                MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
                tdq_load_rem(tdq, ts);
                srqflag = (flags & SW_PREEMPT) ?
                    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
                    SRQ_OURSELF|SRQ_YIELDING;
                if (ts->ts_cpu == cpuid)
                        tdq_add(tdq, td, srqflag);
                else
                        mtx = sched_switch_migrate(tdq, td, srqflag);
        } else {
                /* This thread must be going to sleep. */
                TDQ_LOCK(tdq);
                mtx = thread_block_switch(td);
                tdq_load_rem(tdq, ts);
        }
        /*
         * We enter here with the thread blocked and assigned to the
         * appropriate cpu run-queue or sleep-queue and with the current
         * thread-queue locked.
         */
        TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED);
        /*
         * If KSE assigned a new thread just add it here and let choosethread
         * select the best one.
         */
        if (newtd != NULL)
                sched_switchin(tdq, newtd);
        newtd = choosethread();
        /*
         * Call the MD code to switch contexts if necessary.
         */
        if (td != newtd) {
#ifdef HWPMC_HOOKS
                if (PMC_PROC_IS_USING_PMCS(td->td_proc))
                        PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
#endif
                cpu_switch(td, newtd, mtx);
                /*
                 * We may return from cpu_switch on a different cpu.  However,
                 * we always return with td_lock pointing to the current cpu's
                 * run queue lock.
                 */
                cpuid = PCPU_GET(cpuid);
                tdq = TDQ_CPU(cpuid);
                TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)td;
#ifdef HWPMC_HOOKS
                if (PMC_PROC_IS_USING_PMCS(td->td_proc))
                        PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
#endif
        } else
                thread_unblock_switch(td, mtx);
        /*
         * Assert that all went well and return.
         */
#ifdef SMP
        /* We should always get here with the lowest priority td possible */
        tdq->tdq_lowpri = td->td_priority;
#endif
        TDQ_LOCK_ASSERT(tdq, MA_OWNED|MA_NOTRECURSED);
        MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
        td->td_oncpu = cpuid;
}

/*
 * Adjust thread priorities as a result of a nice request.
 */
void
sched_nice(struct proc *p, int nice)
{
        struct thread *td;

        PROC_LOCK_ASSERT(p, MA_OWNED);
        PROC_SLOCK_ASSERT(p, MA_OWNED);

        p->p_nice = nice;
        FOREACH_THREAD_IN_PROC(p, td) {
                thread_lock(td);
                sched_priority(td);
                sched_prio(td, td->td_base_user_pri);
                thread_unlock(td);
        }
}

/*
 * Record the sleep time for the interactivity scorer.
 */
void
sched_sleep(struct thread *td)
{

        THREAD_LOCK_ASSERT(td, MA_OWNED);

        td->td_slptick = ticks;
}

/*
 * Schedule a thread to resume execution and record how long it voluntarily
 * slept.  We also update the pctcpu, interactivity, and priority.
 */
void
sched_wakeup(struct thread *td)
{
        struct td_sched *ts;
        int slptick;

        THREAD_LOCK_ASSERT(td, MA_OWNED);
        ts = td->td_sched;
        /*
         * If we slept for more than a tick update our interactivity and
         * priority.
         */
        slptick = td->td_slptick;
        td->td_slptick = 0;
        if (slptick && slptick != ticks) {
                u_int hzticks;

                hzticks = (ticks - slptick) << SCHED_TICK_SHIFT;
                ts->ts_slptime += hzticks;
                sched_interact_update(td);
                sched_pctcpu_update(ts);
                sched_priority(td);
        }
        /* Reset the slice value after we sleep. */
        ts->ts_slice = sched_slice;
        sched_add(td, SRQ_BORING);
}
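
/*
 * Example (illustrative, assuming hz = 1000): a thread that wakes after
 * sleeping 250ms has 250 << SCHED_TICK_SHIFT added to ts_slptime, which
 * pushes its interactivity score toward the sleepy end and typically
 * improves the priority recomputed by sched_priority() above.
 */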

/*
 * Penalize the parent for creating a new child and initialize the child's
 * priority.
 */
void
sched_fork(struct thread *td, struct thread *child)
{
        THREAD_LOCK_ASSERT(td, MA_OWNED);
        sched_fork_thread(td, child);
        /*
         * Penalize the parent and child for forking.
         */
        sched_interact_fork(child);
        sched_priority(child);
        td->td_sched->ts_runtime += tickincr;
        sched_interact_update(td);
        sched_priority(td);
}

/*
 * Fork a new thread, may be within the same process.
 */
void
sched_fork_thread(struct thread *td, struct thread *child)
{
        struct td_sched *ts;
        struct td_sched *ts2;

        /*
         * Initialize child.
         */
        THREAD_LOCK_ASSERT(td, MA_OWNED);
        sched_newthread(child);
        child->td_lock = TDQ_LOCKPTR(TDQ_SELF());
        ts = td->td_sched;
        ts2 = child->td_sched;
        ts2->ts_cpu = ts->ts_cpu;
        ts2->ts_runq = NULL;
        /*
         * Grab our parent's cpu estimation information and priority.
         */
        ts2->ts_ticks = ts->ts_ticks;
        ts2->ts_ltick = ts->ts_ltick;
        ts2->ts_ftick = ts->ts_ftick;
        child->td_user_pri = td->td_user_pri;
        child->td_base_user_pri = td->td_base_user_pri;
        /*
         * And update interactivity score.
         */
        ts2->ts_slptime = ts->ts_slptime;
        ts2->ts_runtime = ts->ts_runtime;
        ts2->ts_slice = 1;      /* Attempt to quickly learn interactivity. */
}

/*
 * Adjust the priority class of a thread.
 */
void
sched_class(struct thread *td, int class)
{

        THREAD_LOCK_ASSERT(td, MA_OWNED);
        if (td->td_pri_class == class)
                return;

#ifdef SMP
        /*
         * On SMP if we're on the RUNQ we must adjust the transferable
         * count because we could be changing to or from an interrupt
         * class.
         */
        if (TD_ON_RUNQ(td)) {
                struct tdq *tdq;

                tdq = TDQ_CPU(td->td_sched->ts_cpu);
                if (THREAD_CAN_MIGRATE(td)) {
                        tdq->tdq_transferable--;
                        tdq->tdq_group->tdg_transferable--;
                }
                td->td_pri_class = class;
                if (THREAD_CAN_MIGRATE(td)) {
                        tdq->tdq_transferable++;
                        tdq->tdq_group->tdg_transferable++;
                }
        }
#endif
        td->td_pri_class = class;
}

/*
 * Return some of the child's priority and interactivity to the parent.
 */
void
sched_exit(struct proc *p, struct thread *child)
{
        struct thread *td;

        CTR3(KTR_SCHED, "sched_exit: %p(%s) prio %d",
            child, child->td_proc->p_comm, child->td_priority);

        PROC_SLOCK_ASSERT(p, MA_OWNED);
        td = FIRST_THREAD_IN_PROC(p);
        sched_exit_thread(td, child);
}

/*
 * Penalize another thread for the time spent on this one.  This helps to
 * worsen the priority and interactivity of processes which schedule batch
 * jobs such as make.  This has little effect on the make process itself but
 * causes new processes spawned by it to receive worse scores immediately.
 */
void
sched_exit_thread(struct thread *td, struct thread *child)
{

        CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d",
            child, child->td_proc->p_comm, child->td_priority);
	 */
	if ((td->td_pflags & TDP_SA) && td->td_proc == child->td_proc)
		return;
#endif
	/*
	 * Give the child's runtime to the parent without returning the
	 * sleep time as a penalty to the parent. This causes shells that
	 * launch expensive things to mark their children as expensive.
	 */
	thread_lock(td);
	td->td_sched->ts_runtime += child->td_sched->ts_runtime;
	sched_interact_update(td);
	sched_priority(td);
	thread_unlock(td);
}

/*
 * Fix priorities on return to user-space. Priorities may be elevated due
 * to static priorities in msleep() or similar.
 */
void
sched_userret(struct thread *td)
{
	/*
	 * XXX we cheat slightly on the locking here to avoid locking in
	 * the usual case. Setting td_priority here is essentially an
	 * incomplete workaround for not setting it properly elsewhere.
	 * Now that some interrupt handlers are threads, not setting it
	 * properly elsewhere can clobber it in the window between setting
	 * it here and returning to user mode, so don't waste time setting
	 * it perfectly here.
	 */
	KASSERT((td->td_flags & TDF_BORROWING) == 0,
	    ("thread with borrowed priority returning to userland"));
	if (td->td_priority != td->td_user_pri) {
		thread_lock(td);
		td->td_priority = td->td_user_pri;
		td->td_base_pri = td->td_user_pri;
		thread_unlock(td);
	}
}

/*
 * Handle a stathz tick. This is really only relevant for timeshare
 * threads.
 */
void
sched_clock(struct thread *td)
{
	struct tdq *tdq;
	struct td_sched *ts;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	tdq = TDQ_SELF();
	/*
	 * Advance the insert index once for each tick to ensure that all
	 * threads get a chance to run.
	 */
	if (tdq->tdq_idx == tdq->tdq_ridx) {
		tdq->tdq_idx = (tdq->tdq_idx + 1) % RQ_NQS;
		if (TAILQ_EMPTY(&tdq->tdq_timeshare.rq_queues[tdq->tdq_ridx]))
			tdq->tdq_ridx = tdq->tdq_idx;
	}
	ts = td->td_sched;
	/*
	 * We only do slicing code for TIMESHARE threads.
	 */
	if (td->td_pri_class != PRI_TIMESHARE)
		return;
	/*
	 * We used a tick; charge it to the thread so that we can compute our
	 * interactivity.
	 */
	td->td_sched->ts_runtime += tickincr;
	sched_interact_update(td);
	/*
	 * We used up one time slice.
	 */
	if (--ts->ts_slice > 0)
		return;
	/*
	 * We're out of time, recompute priorities and requeue.
	 */
	sched_priority(td);
	td->td_flags |= TDF_NEEDRESCHED;
}

/*
 * Called once per hz tick. Used for cpu utilization information. This
 * is easier than trying to scale based on stathz.
 */
void
sched_tick(void)
{
	struct td_sched *ts;

	ts = curthread->td_sched;
	/* Adjust ticks for pctcpu */
	ts->ts_ticks += 1 << SCHED_TICK_SHIFT;
	ts->ts_ltick = ticks;
	/*
	 * Update if we've exceeded our desired tick threshold by over one
	 * second.
	 */
	if (ts->ts_ftick + SCHED_TICK_MAX < ts->ts_ltick)
		sched_pctcpu_update(ts);
}

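#if 0
/*
 * Illustrative sketch, not part of the scheduler and not compiled: a toy
 * model of the timeshare calendar-queue rotation performed in sched_clock()
 * above. "idx" plays the role of tdq_idx (where newly runnable timeshare
 * threads are inserted), "ridx" the role of tdq_ridx (where threads are
 * removed from), and "removal_queue_empty" stands in for the TAILQ_EMPTY()
 * check. The insert index advances once per stathz tick, but only while it
 * equals the removal index; the removal index follows once its queue has
 * drained, which is how all timeshare threads get a chance to run.
 */
static void
model_advance_calendar(u_char *idx, u_char *ridx, int removal_queue_empty)
{

	if (*idx == *ridx) {
		*idx = (*idx + 1) % RQ_NQS;
		if (removal_queue_empty)
			*ridx = *idx;
	}
}
#endif
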
/*
 * Return whether the current CPU has runnable tasks. Used for in-kernel
 * cooperative idle threads.
 */
int
sched_runnable(void)
{
	struct tdq *tdq;
	int load;

	load = 1;

	tdq = TDQ_SELF();
	if ((curthread->td_flags & TDF_IDLETD) != 0) {
		if (tdq->tdq_load > 0)
			goto out;
	} else
		if (tdq->tdq_load - 1 > 0)
			goto out;
	load = 0;
out:
	return (load);
}

/*
 * Choose the highest priority thread to run. The thread is removed from
 * the run-queue while running; however, the load remains. For SMP we set
 * the tdq in the global idle bitmask if it idles here.
 */
struct thread *
sched_choose(void)
{
#ifdef SMP
	struct tdq_group *tdg;
#endif
	struct td_sched *ts;
	struct tdq *tdq;

	tdq = TDQ_SELF();
	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
	ts = tdq_choose(tdq);
	if (ts) {
		tdq_runq_rem(tdq, ts);
		return (ts->ts_thread);
	}
#ifdef SMP
	/*
	 * We only set the idled bit when all of the cpus in the group are
	 * idle. Otherwise we could get into a situation where a thread
	 * bounces back and forth between two idle cores on separate
	 * physical CPUs.
	 */
	tdg = tdq->tdq_group;
	tdg->tdg_idlemask |= PCPU_GET(cpumask);
	if (tdg->tdg_idlemask == tdg->tdg_cpumask)
		atomic_set_int(&tdq_idle, tdg->tdg_mask);
	tdq->tdq_lowpri = PRI_MAX_IDLE;
#endif
	return (PCPU_GET(idlethread));
}

/*
 * Set owepreempt if necessary. Preemption never happens directly in ULE;
 * we always request it once we exit a critical section.
 */
static inline void
sched_setpreempt(struct thread *td)
{
	struct thread *ctd;
	int cpri;
	int pri;

	ctd = curthread;
	pri = td->td_priority;
	cpri = ctd->td_priority;
	if (td->td_priority < ctd->td_priority)
		curthread->td_flags |= TDF_NEEDRESCHED;
	if (panicstr != NULL || pri >= cpri || cold || TD_IS_INHIBITED(ctd))
		return;
	/*
	 * Always preempt IDLE threads. Otherwise only if the preempting
	 * thread is an ithread.
	 */
	if (pri > preempt_thresh && cpri < PRI_MIN_IDLE)
		return;
	ctd->td_owepreempt = 1;
	return;
}

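#if 0
/*
 * Illustrative sketch, not part of the scheduler and not compiled: a
 * stand-alone restatement of the decision made in sched_setpreempt()
 * above, with the panicstr/cold/TD_IS_INHIBITED() bail-outs omitted.
 * Lower numeric priority is better; "pri" is the newly added thread's
 * priority, "cpri" the currently running thread's, and "thresh" mirrors
 * preempt_thresh. Per the comment above, immediate preemption is only
 * requested for threads at or above the threshold (e.g. ithreads) or
 * when the running thread is in the idle range.
 */
static int
model_owepreempt(int pri, int cpri, int thresh)
{

	if (pri >= cpri)
		return (0);		/* Not strictly higher priority. */
	if (pri <= thresh)
		return (1);		/* Above the preemption threshold. */
	return (cpri >= PRI_MIN_IDLE);	/* Otherwise only preempt idle. */
}
#endif
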
/*
 * Add a thread to a thread queue. Initialize the priority, slice, and
 * runq, then add the thread to the appropriate queue. This is the
 * internal function called when the tdq is predetermined.
 */
void
tdq_add(struct tdq *tdq, struct thread *td, int flags)
{
	struct td_sched *ts;
	int class;
#ifdef SMP
	int cpumask;
#endif

	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
	KASSERT((td->td_inhibitors == 0),
	    ("sched_add: trying to run inhibited thread"));
	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
	    ("sched_add: bad thread state"));
	KASSERT(td->td_flags & TDF_INMEM,
	    ("sched_add: thread swapped out"));

	ts = td->td_sched;
	class = PRI_BASE(td->td_pri_class);
	TD_SET_RUNQ(td);
	if (ts->ts_slice == 0)
		ts->ts_slice = sched_slice;
	/*
	 * Pick the run queue based on priority.
	 */
	if (td->td_priority <= PRI_MAX_REALTIME)
		ts->ts_runq = &tdq->tdq_realtime;
	else if (td->td_priority <= PRI_MAX_TIMESHARE)
		ts->ts_runq = &tdq->tdq_timeshare;
	else
		ts->ts_runq = &tdq->tdq_idle;
#ifdef SMP
	cpumask = 1 << ts->ts_cpu;
	/*
	 * If we had been idle, clear our bit in the group and potentially
	 * the global bitmap.
	 */
	if ((class != PRI_IDLE && class != PRI_ITHD) &&
	    (tdq->tdq_group->tdg_idlemask & cpumask) != 0) {
		/*
		 * Check to see if our group is unidling, and if so, remove it
		 * from the global idle mask.
		 */
		if (tdq->tdq_group->tdg_idlemask ==
		    tdq->tdq_group->tdg_cpumask)
			atomic_clear_int(&tdq_idle, tdq->tdq_group->tdg_mask);
		/*
		 * Now remove ourselves from the group specific idle mask.
		 */
		tdq->tdq_group->tdg_idlemask &= ~cpumask;
	}
	if (td->td_priority < tdq->tdq_lowpri)
		tdq->tdq_lowpri = td->td_priority;
#endif
	tdq_runq_add(tdq, ts, flags);
	tdq_load_add(tdq, ts);
}

/*
 * Select the target thread queue and add a thread to it. Request
 * preemption or IPI a remote processor if required.
 */
void
sched_add(struct thread *td, int flags)
{
	struct td_sched *ts;
	struct tdq *tdq;
#ifdef SMP
	int cpuid;
	int cpu;
#endif
	CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
	    td, td->td_proc->p_comm, td->td_priority, curthread,
	    curthread->td_proc->p_comm);
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	ts = td->td_sched;
	/*
	 * Recalculate the priority before we select the target cpu or
	 * run-queue.
	 */
	if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
		sched_priority(td);
#ifdef SMP
	cpuid = PCPU_GET(cpuid);
	/*
	 * Pick the destination cpu and if it isn't ours transfer to the
	 * target cpu.
	 */
	if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_MIGRATE(td))
		cpu = cpuid;
	else if (!THREAD_CAN_MIGRATE(td))
		cpu = ts->ts_cpu;
	else
		cpu = sched_pickcpu(ts, flags);
	tdq = sched_setcpu(ts, cpu, flags);
	tdq_add(tdq, td, flags);
	if (cpu != cpuid) {
		tdq_notify(ts);
		return;
	}
#else
	tdq = TDQ_SELF();
	TDQ_LOCK(tdq);
	/*
	 * Now that the thread is moving to the run-queue, set the lock
	 * to the scheduler's lock.
	 */
	thread_lock_set(td, TDQ_LOCKPTR(tdq));
	tdq_add(tdq, td, flags);
#endif
	if (!(flags & SRQ_YIELDING))
		sched_setpreempt(td);
}

/*
 * Remove a thread from a run-queue without running it. This is used
 * when we're stealing a thread from a remote queue. Otherwise all threads
 * exit by calling sched_exit_thread() and sched_throw() themselves.
 */
void
sched_rem(struct thread *td)
{
	struct tdq *tdq;
	struct td_sched *ts;

	CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)",
	    td, td->td_proc->p_comm, td->td_priority, curthread,
	    curthread->td_proc->p_comm);
	ts = td->td_sched;
	tdq = TDQ_CPU(ts->ts_cpu);
	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
	KASSERT(TD_ON_RUNQ(td),
	    ("sched_rem: thread not on run queue"));
	tdq_runq_rem(tdq, ts);
	tdq_load_rem(tdq, ts);
	TD_SET_CAN_RUN(td);
}

/*
 * Fetch cpu utilization information. Updates on demand.
 */
fixpt_t
sched_pctcpu(struct thread *td)
{
	fixpt_t pctcpu;
	struct td_sched *ts;

	pctcpu = 0;
	ts = td->td_sched;
	if (ts == NULL)
		return (0);

	thread_lock(td);
	if (ts->ts_ticks) {
		int rtick;

		sched_pctcpu_update(ts);
		/* How many rticks per second? */
		rtick = min(SCHED_TICK_HZ(ts) / SCHED_TICK_SECS, hz);
		pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT;
	}
	thread_unlock(td);

	return (pctcpu);
}

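#if 0
/*
 * Illustrative sketch, not part of the scheduler and not compiled: a
 * restatement of the fixed-point conversion in sched_pctcpu() above.
 * Each hz tick spent running adds 1 << SCHED_TICK_SHIFT to ts_ticks (see
 * sched_tick()), so SCHED_TICK_HZ() recovers the number of hz ticks run
 * and rtick is roughly "ticks run per second" over the averaging window.
 * Worked example, assuming hz == 1000 and FSHIFT == 11 (FSCALE == 2048):
 * a thread that ran half the time has rtick ~= 500, and
 * (2048 * ((2048 * 500) / 1000)) >> 11 == 1024 == FSCALE / 2, i.e. 50%.
 */
static fixpt_t
model_pctcpu(int rtick, int hz_value)
{

	return ((FSCALE * ((FSCALE * rtick) / hz_value)) >> FSHIFT);
}
#endif
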
/*
 * Bind a thread to a target cpu.
 */
void
sched_bind(struct thread *td, int cpu)
{
	struct td_sched *ts;

	THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED);
	ts = td->td_sched;
	if (ts->ts_flags & TSF_BOUND)
		sched_unbind(td);
	ts->ts_flags |= TSF_BOUND;
#ifdef SMP
	sched_pin();
	if (PCPU_GET(cpuid) == cpu)
		return;
	ts->ts_cpu = cpu;
	/* When we return from mi_switch we'll be on the correct cpu. */
	mi_switch(SW_VOL, NULL);
#endif
}

/*
 * Release a bound thread.
 */
void
sched_unbind(struct thread *td)
{
	struct td_sched *ts;

	THREAD_LOCK_ASSERT(td, MA_OWNED);
	ts = td->td_sched;
	if ((ts->ts_flags & TSF_BOUND) == 0)
		return;
	ts->ts_flags &= ~TSF_BOUND;
#ifdef SMP
	sched_unpin();
#endif
}

int
sched_is_bound(struct thread *td)
{
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	return (td->td_sched->ts_flags & TSF_BOUND);
}

/*
 * Basic yield call.
 */
void
sched_relinquish(struct thread *td)
{
	thread_lock(td);
	if (td->td_pri_class == PRI_TIMESHARE)
		sched_prio(td, PRI_MAX_TIMESHARE);
	SCHED_STAT_INC(switch_relinquish);
	mi_switch(SW_VOL, NULL);
	thread_unlock(td);
}

/*
 * Return the total system load.
 */
int
sched_load(void)
{
#ifdef SMP
	int total;
	int i;

	total = 0;
	for (i = 0; i <= tdg_maxid; i++)
		total += TDQ_GROUP(i)->tdg_load;
	return (total);
#else
	return (TDQ_SELF()->tdq_sysload);
#endif
}

int
sched_sizeof_proc(void)
{
	return (sizeof(struct proc));
}

int
sched_sizeof_thread(void)
{
	return (sizeof(struct thread) + sizeof(struct td_sched));
}

/*
 * The actual idle process.
 */
void
sched_idletd(void *dummy)
{
	struct thread *td;
	struct tdq *tdq;

	td = curthread;
	tdq = TDQ_SELF();
	mtx_assert(&Giant, MA_NOTOWNED);
	/* ULE relies on preemption for idle interruption. */
	for (;;) {
#ifdef SMP
		if (tdq_idled(tdq))
			cpu_idle();
#else
		cpu_idle();
#endif
	}
}

/*
 * A CPU is entering for the first time or a thread is exiting.
 */
void
sched_throw(struct thread *td)
{
	struct tdq *tdq;

	tdq = TDQ_SELF();
	if (td == NULL) {
		/* Correct spinlock nesting and acquire the correct lock. */
		TDQ_LOCK(tdq);
		spinlock_exit();
	} else {
		MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
		tdq_load_rem(tdq, td->td_sched);
	}
	KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
	PCPU_SET(switchtime, cpu_ticks());
	PCPU_SET(switchticks, ticks);
	cpu_throw(td, choosethread());	/* doesn't return */
}

/*
 * This is called from fork_exit(). Just acquire the correct locks and
 * let fork do the rest of the work.
 */
void
sched_fork_exit(struct thread *td)
{
	struct td_sched *ts;
	struct tdq *tdq;
	int cpuid;

	/*
	 * Finish setting up thread glue so that it begins execution in a
	 * non-nested critical section with the scheduler lock held.
2595 */ 2596 cpuid = PCPU_GET(cpuid); 2597 tdq = TDQ_CPU(cpuid); 2598 ts = td->td_sched; 2599 if (TD_IS_IDLETHREAD(td)) 2600 td->td_lock = TDQ_LOCKPTR(tdq); 2601 MPASS(td->td_lock == TDQ_LOCKPTR(tdq)); 2602 td->td_oncpu = cpuid; 2603 TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)td; 2604 THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED); 2605 } 2606 2607 static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, 2608 "Scheduler"); 2609 SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ULE", 0, 2610 "Scheduler name"); 2611 SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0, 2612 "Slice size for timeshare threads"); 2613 SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0, 2614 "Interactivity score threshold"); 2615 SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW, &preempt_thresh, 2616 0,"Min priority for preemption, lower priorities have greater precedence"); 2617 #ifdef SMP 2618 SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri, CTLFLAG_RW, &pick_pri, 0, 2619 "Pick the target cpu based on priority rather than load."); 2620 SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0, 2621 "Number of hz ticks to keep thread affinity for"); 2622 SYSCTL_INT(_kern_sched, OID_AUTO, tryself, CTLFLAG_RW, &tryself, 0, ""); 2623 SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0, 2624 "Enables the long-term load balancer"); 2625 SYSCTL_INT(_kern_sched, OID_AUTO, balance_secs, CTLFLAG_RW, &balance_secs, 0, 2626 "Average frequence in seconds to run the long-term balancer"); 2627 SYSCTL_INT(_kern_sched, OID_AUTO, steal_htt, CTLFLAG_RW, &steal_htt, 0, 2628 "Steals work from another hyper-threaded core on idle"); 2629 SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, CTLFLAG_RW, &steal_idle, 0, 2630 "Attempts to steal work from other cores before idling"); 2631 SYSCTL_INT(_kern_sched, OID_AUTO, steal_thresh, CTLFLAG_RW, &steal_thresh, 0, 2632 "Minimum load on remote cpu before we'll steal"); 2633 SYSCTL_INT(_kern_sched, OID_AUTO, topology, CTLFLAG_RD, &topology, 0, 2634 "True when a topology has been specified by the MD code."); 2635 #endif 2636 2637 /* ps compat. All cpu percentages from ULE are weighted. */ 2638 static int ccpu = 0.0; 2639 SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); 2640 2641 2642 #define KERN_SWITCH_INCLUDE 1 2643 #include "kern/kern_switch.c" 2644