1 /*- 2 * Copyright (c) 2002-2003, Jeffrey Roberson <jeff@freebsd.org> 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice unmodified, this list of conditions, and the following 10 * disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27 #include <sys/cdefs.h> 28 __FBSDID("$FreeBSD$"); 29 30 #include <opt_sched.h> 31 32 #define kse td_sched 33 34 #include <sys/param.h> 35 #include <sys/systm.h> 36 #include <sys/kdb.h> 37 #include <sys/kernel.h> 38 #include <sys/ktr.h> 39 #include <sys/lock.h> 40 #include <sys/mutex.h> 41 #include <sys/proc.h> 42 #include <sys/resource.h> 43 #include <sys/resourcevar.h> 44 #include <sys/sched.h> 45 #include <sys/smp.h> 46 #include <sys/sx.h> 47 #include <sys/sysctl.h> 48 #include <sys/sysproto.h> 49 #include <sys/turnstile.h> 50 #include <sys/vmmeter.h> 51 #ifdef KTRACE 52 #include <sys/uio.h> 53 #include <sys/ktrace.h> 54 #endif 55 56 #include <machine/cpu.h> 57 #include <machine/smp.h> 58 59 /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */ 60 /* XXX This is bogus compatability crap for ps */ 61 static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ 62 SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); 63 64 static void sched_setup(void *dummy); 65 SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL) 66 67 static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler"); 68 69 SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ule", 0, 70 "Scheduler name"); 71 72 static int slice_min = 1; 73 SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, ""); 74 75 static int slice_max = 10; 76 SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, ""); 77 78 int realstathz; 79 int tickincr = 1; 80 81 /* 82 * The schedulable entity that can be given a context to run. 83 * A process may have several of these. Probably one per processor 84 * but posibly a few more. In this universe they are grouped 85 * with a KSEG that contains the priority and niceness 86 * for the group. 87 */ 88 struct kse { 89 TAILQ_ENTRY(kse) ke_procq; /* (j/z) Run queue. */ 90 int ke_flags; /* (j) KEF_* flags. */ 91 struct thread *ke_thread; /* (*) Active associated thread. */ 92 fixpt_t ke_pctcpu; /* (j) %cpu during p_swtime. */ 93 char ke_rqindex; /* (j) Run queue index. */ 94 enum { 95 KES_THREAD = 0x0, /* slaved to thread state */ 96 KES_ONRUNQ 97 } ke_state; /* (j) thread sched specific status. */ 98 int ke_slptime; 99 int ke_slice; 100 struct runq *ke_runq; 101 u_char ke_cpu; /* CPU that we have affinity for. */ 102 /* The following variables are only used for pctcpu calculation */ 103 int ke_ltick; /* Last tick that we were running on */ 104 int ke_ftick; /* First tick that we were running on */ 105 int ke_ticks; /* Tick count */ 106 107 }; 108 109 110 #define td_kse td_sched 111 #define td_slptime td_kse->ke_slptime 112 #define ke_proc ke_thread->td_proc 113 #define ke_ksegrp ke_thread->td_ksegrp 114 115 /* flags kept in ke_flags */ 116 #define KEF_SCHED0 0x00001 /* For scheduler-specific use. */ 117 #define KEF_SCHED1 0x00002 /* For scheduler-specific use. */ 118 #define KEF_SCHED2 0x00004 /* For scheduler-specific use. */ 119 #define KEF_SCHED3 0x00008 /* For scheduler-specific use. */ 120 #define KEF_SCHED4 0x00010 121 #define KEF_SCHED5 0x00020 122 #define KEF_DIDRUN 0x02000 /* Thread actually ran. */ 123 #define KEF_EXIT 0x04000 /* Thread is being killed. */ 124 125 /* 126 * These datastructures are allocated within their parent datastructure but 127 * are scheduler specific. 128 */ 129 130 #define ke_assign ke_procq.tqe_next 131 132 #define KEF_ASSIGNED 0x0001 /* Thread is being migrated. */ 133 #define KEF_BOUND 0x0002 /* Thread can not migrate. */ 134 #define KEF_XFERABLE 0x0004 /* Thread was added as transferable. */ 135 #define KEF_HOLD 0x0008 /* Thread is temporarily bound. */ 136 #define KEF_REMOVED 0x0010 /* Thread was removed while ASSIGNED */ 137 #define KEF_INTERNAL 0x0020 138 139 struct kg_sched { 140 struct thread *skg_last_assigned; /* (j) Last thread assigned to */ 141 /* the system scheduler */ 142 int skg_slptime; /* Number of ticks we vol. slept */ 143 int skg_runtime; /* Number of ticks we were running */ 144 int skg_avail_opennings; /* (j) Num unfilled slots in group.*/ 145 int skg_concurrency; /* (j) Num threads requested in group.*/ 146 }; 147 #define kg_last_assigned kg_sched->skg_last_assigned 148 #define kg_avail_opennings kg_sched->skg_avail_opennings 149 #define kg_concurrency kg_sched->skg_concurrency 150 #define kg_runtime kg_sched->skg_runtime 151 #define kg_slptime kg_sched->skg_slptime 152 153 #define SLOT_RELEASE(kg) \ 154 do { \ 155 kg->kg_avail_opennings++; \ 156 CTR3(KTR_RUNQ, "kg %p(%d) Slot released (->%d)", \ 157 kg, \ 158 kg->kg_concurrency, \ 159 kg->kg_avail_opennings); \ 160 /*KASSERT((kg->kg_avail_opennings <= kg->kg_concurrency), \ 161 ("slots out of whack")); */ \ 162 } while (0) 163 164 #define SLOT_USE(kg) \ 165 do { \ 166 kg->kg_avail_opennings--; \ 167 CTR3(KTR_RUNQ, "kg %p(%d) Slot used (->%d)", \ 168 kg, \ 169 kg->kg_concurrency, \ 170 kg->kg_avail_opennings); \ 171 /*KASSERT((kg->kg_avail_opennings >= 0), \ 172 ("slots out of whack"));*/ \ 173 } while (0) 174 175 static struct kse kse0; 176 static struct kg_sched kg_sched0; 177 178 /* 179 * The priority is primarily determined by the interactivity score. Thus, we 180 * give lower(better) priorities to kse groups that use less CPU. The nice 181 * value is then directly added to this to allow nice to have some effect 182 * on latency. 183 * 184 * PRI_RANGE: Total priority range for timeshare threads. 185 * PRI_NRESV: Number of nice values. 186 * PRI_BASE: The start of the dynamic range. 187 */ 188 #define SCHED_PRI_RANGE (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1) 189 #define SCHED_PRI_NRESV ((PRIO_MAX - PRIO_MIN) + 1) 190 #define SCHED_PRI_NHALF (SCHED_PRI_NRESV / 2) 191 #define SCHED_PRI_BASE (PRI_MIN_TIMESHARE) 192 #define SCHED_PRI_INTERACT(score) \ 193 ((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX) 194 195 /* 196 * These determine the interactivity of a process. 197 * 198 * SLP_RUN_MAX: Maximum amount of sleep time + run time we'll accumulate 199 * before throttling back. 200 * SLP_RUN_FORK: Maximum slp+run time to inherit at fork time. 201 * INTERACT_MAX: Maximum interactivity value. Smaller is better. 202 * INTERACT_THRESH: Threshhold for placement on the current runq. 203 */ 204 #define SCHED_SLP_RUN_MAX ((hz * 5) << 10) 205 #define SCHED_SLP_RUN_FORK ((hz / 2) << 10) 206 #define SCHED_INTERACT_MAX (100) 207 #define SCHED_INTERACT_HALF (SCHED_INTERACT_MAX / 2) 208 #define SCHED_INTERACT_THRESH (30) 209 210 /* 211 * These parameters and macros determine the size of the time slice that is 212 * granted to each thread. 213 * 214 * SLICE_MIN: Minimum time slice granted, in units of ticks. 215 * SLICE_MAX: Maximum time slice granted. 216 * SLICE_RANGE: Range of available time slices scaled by hz. 217 * SLICE_SCALE: The number slices granted per val in the range of [0, max]. 218 * SLICE_NICE: Determine the amount of slice granted to a scaled nice. 219 * SLICE_NTHRESH: The nice cutoff point for slice assignment. 220 */ 221 #define SCHED_SLICE_MIN (slice_min) 222 #define SCHED_SLICE_MAX (slice_max) 223 #define SCHED_SLICE_INTERACTIVE (slice_max) 224 #define SCHED_SLICE_NTHRESH (SCHED_PRI_NHALF - 1) 225 #define SCHED_SLICE_RANGE (SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1) 226 #define SCHED_SLICE_SCALE(val, max) (((val) * SCHED_SLICE_RANGE) / (max)) 227 #define SCHED_SLICE_NICE(nice) \ 228 (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_SLICE_NTHRESH)) 229 230 /* 231 * This macro determines whether or not the thread belongs on the current or 232 * next run queue. 233 */ 234 #define SCHED_INTERACTIVE(kg) \ 235 (sched_interact_score(kg) < SCHED_INTERACT_THRESH) 236 #define SCHED_CURR(kg, ke) \ 237 ((ke->ke_thread->td_flags & TDF_BORROWING) || SCHED_INTERACTIVE(kg)) 238 239 /* 240 * Cpu percentage computation macros and defines. 241 * 242 * SCHED_CPU_TIME: Number of seconds to average the cpu usage across. 243 * SCHED_CPU_TICKS: Number of hz ticks to average the cpu usage across. 244 */ 245 246 #define SCHED_CPU_TIME 10 247 #define SCHED_CPU_TICKS (hz * SCHED_CPU_TIME) 248 249 /* 250 * kseq - per processor runqs and statistics. 251 */ 252 struct kseq { 253 struct runq ksq_idle; /* Queue of IDLE threads. */ 254 struct runq ksq_timeshare[2]; /* Run queues for !IDLE. */ 255 struct runq *ksq_next; /* Next timeshare queue. */ 256 struct runq *ksq_curr; /* Current queue. */ 257 int ksq_load_timeshare; /* Load for timeshare. */ 258 int ksq_load; /* Aggregate load. */ 259 short ksq_nice[SCHED_PRI_NRESV]; /* KSEs in each nice bin. */ 260 short ksq_nicemin; /* Least nice. */ 261 #ifdef SMP 262 int ksq_transferable; 263 LIST_ENTRY(kseq) ksq_siblings; /* Next in kseq group. */ 264 struct kseq_group *ksq_group; /* Our processor group. */ 265 volatile struct kse *ksq_assigned; /* assigned by another CPU. */ 266 #else 267 int ksq_sysload; /* For loadavg, !ITHD load. */ 268 #endif 269 }; 270 271 #ifdef SMP 272 /* 273 * kseq groups are groups of processors which can cheaply share threads. When 274 * one processor in the group goes idle it will check the runqs of the other 275 * processors in its group prior to halting and waiting for an interrupt. 276 * These groups are suitable for SMT (Symetric Multi-Threading) and not NUMA. 277 * In a numa environment we'd want an idle bitmap per group and a two tiered 278 * load balancer. 279 */ 280 struct kseq_group { 281 int ksg_cpus; /* Count of CPUs in this kseq group. */ 282 cpumask_t ksg_cpumask; /* Mask of cpus in this group. */ 283 cpumask_t ksg_idlemask; /* Idle cpus in this group. */ 284 cpumask_t ksg_mask; /* Bit mask for first cpu. */ 285 int ksg_load; /* Total load of this group. */ 286 int ksg_transferable; /* Transferable load of this group. */ 287 LIST_HEAD(, kseq) ksg_members; /* Linked list of all members. */ 288 }; 289 #endif 290 291 /* 292 * One kse queue per processor. 293 */ 294 #ifdef SMP 295 static cpumask_t kseq_idle; 296 static int ksg_maxid; 297 static struct kseq kseq_cpu[MAXCPU]; 298 static struct kseq_group kseq_groups[MAXCPU]; 299 static int bal_tick; 300 static int gbal_tick; 301 static int balance_groups; 302 303 #define KSEQ_SELF() (&kseq_cpu[PCPU_GET(cpuid)]) 304 #define KSEQ_CPU(x) (&kseq_cpu[(x)]) 305 #define KSEQ_ID(x) ((x) - kseq_cpu) 306 #define KSEQ_GROUP(x) (&kseq_groups[(x)]) 307 #else /* !SMP */ 308 static struct kseq kseq_cpu; 309 310 #define KSEQ_SELF() (&kseq_cpu) 311 #define KSEQ_CPU(x) (&kseq_cpu) 312 #endif 313 314 static void slot_fill(struct ksegrp *kg); 315 static struct kse *sched_choose(void); /* XXX Should be thread * */ 316 static void sched_slice(struct kse *ke); 317 static void sched_priority(struct ksegrp *kg); 318 static void sched_thread_priority(struct thread *td, u_char prio); 319 static int sched_interact_score(struct ksegrp *kg); 320 static void sched_interact_update(struct ksegrp *kg); 321 static void sched_interact_fork(struct ksegrp *kg); 322 static void sched_pctcpu_update(struct kse *ke); 323 324 /* Operations on per processor queues */ 325 static struct kse * kseq_choose(struct kseq *kseq); 326 static void kseq_setup(struct kseq *kseq); 327 static void kseq_load_add(struct kseq *kseq, struct kse *ke); 328 static void kseq_load_rem(struct kseq *kseq, struct kse *ke); 329 static __inline void kseq_runq_add(struct kseq *kseq, struct kse *ke, int); 330 static __inline void kseq_runq_rem(struct kseq *kseq, struct kse *ke); 331 static void kseq_nice_add(struct kseq *kseq, int nice); 332 static void kseq_nice_rem(struct kseq *kseq, int nice); 333 void kseq_print(int cpu); 334 #ifdef SMP 335 static int kseq_transfer(struct kseq *ksq, struct kse *ke, int class); 336 static struct kse *runq_steal(struct runq *rq); 337 static void sched_balance(void); 338 static void sched_balance_groups(void); 339 static void sched_balance_group(struct kseq_group *ksg); 340 static void sched_balance_pair(struct kseq *high, struct kseq *low); 341 static void kseq_move(struct kseq *from, int cpu); 342 static int kseq_idled(struct kseq *kseq); 343 static void kseq_notify(struct kse *ke, int cpu); 344 static void kseq_assign(struct kseq *); 345 static struct kse *kseq_steal(struct kseq *kseq, int stealidle); 346 #define KSE_CAN_MIGRATE(ke) \ 347 ((ke)->ke_thread->td_pinned == 0 && ((ke)->ke_flags & KEF_BOUND) == 0) 348 #endif 349 350 void 351 kseq_print(int cpu) 352 { 353 struct kseq *kseq; 354 int i; 355 356 kseq = KSEQ_CPU(cpu); 357 358 printf("kseq:\n"); 359 printf("\tload: %d\n", kseq->ksq_load); 360 printf("\tload TIMESHARE: %d\n", kseq->ksq_load_timeshare); 361 #ifdef SMP 362 printf("\tload transferable: %d\n", kseq->ksq_transferable); 363 #endif 364 printf("\tnicemin:\t%d\n", kseq->ksq_nicemin); 365 printf("\tnice counts:\n"); 366 for (i = 0; i < SCHED_PRI_NRESV; i++) 367 if (kseq->ksq_nice[i]) 368 printf("\t\t%d = %d\n", 369 i - SCHED_PRI_NHALF, kseq->ksq_nice[i]); 370 } 371 372 static __inline void 373 kseq_runq_add(struct kseq *kseq, struct kse *ke, int flags) 374 { 375 #ifdef SMP 376 if (KSE_CAN_MIGRATE(ke)) { 377 kseq->ksq_transferable++; 378 kseq->ksq_group->ksg_transferable++; 379 ke->ke_flags |= KEF_XFERABLE; 380 } 381 #endif 382 runq_add(ke->ke_runq, ke, flags); 383 } 384 385 static __inline void 386 kseq_runq_rem(struct kseq *kseq, struct kse *ke) 387 { 388 #ifdef SMP 389 if (ke->ke_flags & KEF_XFERABLE) { 390 kseq->ksq_transferable--; 391 kseq->ksq_group->ksg_transferable--; 392 ke->ke_flags &= ~KEF_XFERABLE; 393 } 394 #endif 395 runq_remove(ke->ke_runq, ke); 396 } 397 398 static void 399 kseq_load_add(struct kseq *kseq, struct kse *ke) 400 { 401 int class; 402 mtx_assert(&sched_lock, MA_OWNED); 403 class = PRI_BASE(ke->ke_ksegrp->kg_pri_class); 404 if (class == PRI_TIMESHARE) 405 kseq->ksq_load_timeshare++; 406 kseq->ksq_load++; 407 CTR1(KTR_SCHED, "load: %d", kseq->ksq_load); 408 if (class != PRI_ITHD && (ke->ke_proc->p_flag & P_NOLOAD) == 0) 409 #ifdef SMP 410 kseq->ksq_group->ksg_load++; 411 #else 412 kseq->ksq_sysload++; 413 #endif 414 if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) 415 kseq_nice_add(kseq, ke->ke_proc->p_nice); 416 } 417 418 static void 419 kseq_load_rem(struct kseq *kseq, struct kse *ke) 420 { 421 int class; 422 mtx_assert(&sched_lock, MA_OWNED); 423 class = PRI_BASE(ke->ke_ksegrp->kg_pri_class); 424 if (class == PRI_TIMESHARE) 425 kseq->ksq_load_timeshare--; 426 if (class != PRI_ITHD && (ke->ke_proc->p_flag & P_NOLOAD) == 0) 427 #ifdef SMP 428 kseq->ksq_group->ksg_load--; 429 #else 430 kseq->ksq_sysload--; 431 #endif 432 kseq->ksq_load--; 433 CTR1(KTR_SCHED, "load: %d", kseq->ksq_load); 434 ke->ke_runq = NULL; 435 if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) 436 kseq_nice_rem(kseq, ke->ke_proc->p_nice); 437 } 438 439 static void 440 kseq_nice_add(struct kseq *kseq, int nice) 441 { 442 mtx_assert(&sched_lock, MA_OWNED); 443 /* Normalize to zero. */ 444 kseq->ksq_nice[nice + SCHED_PRI_NHALF]++; 445 if (nice < kseq->ksq_nicemin || kseq->ksq_load_timeshare == 1) 446 kseq->ksq_nicemin = nice; 447 } 448 449 static void 450 kseq_nice_rem(struct kseq *kseq, int nice) 451 { 452 int n; 453 454 mtx_assert(&sched_lock, MA_OWNED); 455 /* Normalize to zero. */ 456 n = nice + SCHED_PRI_NHALF; 457 kseq->ksq_nice[n]--; 458 KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count.")); 459 460 /* 461 * If this wasn't the smallest nice value or there are more in 462 * this bucket we can just return. Otherwise we have to recalculate 463 * the smallest nice. 464 */ 465 if (nice != kseq->ksq_nicemin || 466 kseq->ksq_nice[n] != 0 || 467 kseq->ksq_load_timeshare == 0) 468 return; 469 470 for (; n < SCHED_PRI_NRESV; n++) 471 if (kseq->ksq_nice[n]) { 472 kseq->ksq_nicemin = n - SCHED_PRI_NHALF; 473 return; 474 } 475 } 476 477 #ifdef SMP 478 /* 479 * sched_balance is a simple CPU load balancing algorithm. It operates by 480 * finding the least loaded and most loaded cpu and equalizing their load 481 * by migrating some processes. 482 * 483 * Dealing only with two CPUs at a time has two advantages. Firstly, most 484 * installations will only have 2 cpus. Secondly, load balancing too much at 485 * once can have an unpleasant effect on the system. The scheduler rarely has 486 * enough information to make perfect decisions. So this algorithm chooses 487 * algorithm simplicity and more gradual effects on load in larger systems. 488 * 489 * It could be improved by considering the priorities and slices assigned to 490 * each task prior to balancing them. There are many pathological cases with 491 * any approach and so the semi random algorithm below may work as well as any. 492 * 493 */ 494 static void 495 sched_balance(void) 496 { 497 struct kseq_group *high; 498 struct kseq_group *low; 499 struct kseq_group *ksg; 500 int cnt; 501 int i; 502 503 bal_tick = ticks + (random() % (hz * 2)); 504 if (smp_started == 0) 505 return; 506 low = high = NULL; 507 i = random() % (ksg_maxid + 1); 508 for (cnt = 0; cnt <= ksg_maxid; cnt++) { 509 ksg = KSEQ_GROUP(i); 510 /* 511 * Find the CPU with the highest load that has some 512 * threads to transfer. 513 */ 514 if ((high == NULL || ksg->ksg_load > high->ksg_load) 515 && ksg->ksg_transferable) 516 high = ksg; 517 if (low == NULL || ksg->ksg_load < low->ksg_load) 518 low = ksg; 519 if (++i > ksg_maxid) 520 i = 0; 521 } 522 if (low != NULL && high != NULL && high != low) 523 sched_balance_pair(LIST_FIRST(&high->ksg_members), 524 LIST_FIRST(&low->ksg_members)); 525 } 526 527 static void 528 sched_balance_groups(void) 529 { 530 int i; 531 532 gbal_tick = ticks + (random() % (hz * 2)); 533 mtx_assert(&sched_lock, MA_OWNED); 534 if (smp_started) 535 for (i = 0; i <= ksg_maxid; i++) 536 sched_balance_group(KSEQ_GROUP(i)); 537 } 538 539 static void 540 sched_balance_group(struct kseq_group *ksg) 541 { 542 struct kseq *kseq; 543 struct kseq *high; 544 struct kseq *low; 545 int load; 546 547 if (ksg->ksg_transferable == 0) 548 return; 549 low = NULL; 550 high = NULL; 551 LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) { 552 load = kseq->ksq_load; 553 if (high == NULL || load > high->ksq_load) 554 high = kseq; 555 if (low == NULL || load < low->ksq_load) 556 low = kseq; 557 } 558 if (high != NULL && low != NULL && high != low) 559 sched_balance_pair(high, low); 560 } 561 562 static void 563 sched_balance_pair(struct kseq *high, struct kseq *low) 564 { 565 int transferable; 566 int high_load; 567 int low_load; 568 int move; 569 int diff; 570 int i; 571 572 /* 573 * If we're transfering within a group we have to use this specific 574 * kseq's transferable count, otherwise we can steal from other members 575 * of the group. 576 */ 577 if (high->ksq_group == low->ksq_group) { 578 transferable = high->ksq_transferable; 579 high_load = high->ksq_load; 580 low_load = low->ksq_load; 581 } else { 582 transferable = high->ksq_group->ksg_transferable; 583 high_load = high->ksq_group->ksg_load; 584 low_load = low->ksq_group->ksg_load; 585 } 586 if (transferable == 0) 587 return; 588 /* 589 * Determine what the imbalance is and then adjust that to how many 590 * kses we actually have to give up (transferable). 591 */ 592 diff = high_load - low_load; 593 move = diff / 2; 594 if (diff & 0x1) 595 move++; 596 move = min(move, transferable); 597 for (i = 0; i < move; i++) 598 kseq_move(high, KSEQ_ID(low)); 599 return; 600 } 601 602 static void 603 kseq_move(struct kseq *from, int cpu) 604 { 605 struct kseq *kseq; 606 struct kseq *to; 607 struct kse *ke; 608 609 kseq = from; 610 to = KSEQ_CPU(cpu); 611 ke = kseq_steal(kseq, 1); 612 if (ke == NULL) { 613 struct kseq_group *ksg; 614 615 ksg = kseq->ksq_group; 616 LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) { 617 if (kseq == from || kseq->ksq_transferable == 0) 618 continue; 619 ke = kseq_steal(kseq, 1); 620 break; 621 } 622 if (ke == NULL) 623 panic("kseq_move: No KSEs available with a " 624 "transferable count of %d\n", 625 ksg->ksg_transferable); 626 } 627 if (kseq == to) 628 return; 629 ke->ke_state = KES_THREAD; 630 kseq_runq_rem(kseq, ke); 631 kseq_load_rem(kseq, ke); 632 kseq_notify(ke, cpu); 633 } 634 635 static int 636 kseq_idled(struct kseq *kseq) 637 { 638 struct kseq_group *ksg; 639 struct kseq *steal; 640 struct kse *ke; 641 642 ksg = kseq->ksq_group; 643 /* 644 * If we're in a cpu group, try and steal kses from another cpu in 645 * the group before idling. 646 */ 647 if (ksg->ksg_cpus > 1 && ksg->ksg_transferable) { 648 LIST_FOREACH(steal, &ksg->ksg_members, ksq_siblings) { 649 if (steal == kseq || steal->ksq_transferable == 0) 650 continue; 651 ke = kseq_steal(steal, 0); 652 if (ke == NULL) 653 continue; 654 ke->ke_state = KES_THREAD; 655 kseq_runq_rem(steal, ke); 656 kseq_load_rem(steal, ke); 657 ke->ke_cpu = PCPU_GET(cpuid); 658 ke->ke_flags |= KEF_INTERNAL | KEF_HOLD; 659 sched_add(ke->ke_thread, SRQ_YIELDING); 660 return (0); 661 } 662 } 663 /* 664 * We only set the idled bit when all of the cpus in the group are 665 * idle. Otherwise we could get into a situation where a KSE bounces 666 * back and forth between two idle cores on seperate physical CPUs. 667 */ 668 ksg->ksg_idlemask |= PCPU_GET(cpumask); 669 if (ksg->ksg_idlemask != ksg->ksg_cpumask) 670 return (1); 671 atomic_set_int(&kseq_idle, ksg->ksg_mask); 672 return (1); 673 } 674 675 static void 676 kseq_assign(struct kseq *kseq) 677 { 678 struct kse *nke; 679 struct kse *ke; 680 681 do { 682 *(volatile struct kse **)&ke = kseq->ksq_assigned; 683 } while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke, NULL)); 684 for (; ke != NULL; ke = nke) { 685 nke = ke->ke_assign; 686 kseq->ksq_group->ksg_load--; 687 kseq->ksq_load--; 688 ke->ke_flags &= ~KEF_ASSIGNED; 689 ke->ke_flags |= KEF_INTERNAL | KEF_HOLD; 690 sched_add(ke->ke_thread, SRQ_YIELDING); 691 } 692 } 693 694 static void 695 kseq_notify(struct kse *ke, int cpu) 696 { 697 struct kseq *kseq; 698 struct thread *td; 699 struct pcpu *pcpu; 700 int class; 701 int prio; 702 703 kseq = KSEQ_CPU(cpu); 704 /* XXX */ 705 class = PRI_BASE(ke->ke_ksegrp->kg_pri_class); 706 if ((class == PRI_TIMESHARE || class == PRI_REALTIME) && 707 (kseq_idle & kseq->ksq_group->ksg_mask)) 708 atomic_clear_int(&kseq_idle, kseq->ksq_group->ksg_mask); 709 kseq->ksq_group->ksg_load++; 710 kseq->ksq_load++; 711 ke->ke_cpu = cpu; 712 ke->ke_flags |= KEF_ASSIGNED; 713 prio = ke->ke_thread->td_priority; 714 715 /* 716 * Place a KSE on another cpu's queue and force a resched. 717 */ 718 do { 719 *(volatile struct kse **)&ke->ke_assign = kseq->ksq_assigned; 720 } while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke->ke_assign, ke)); 721 /* 722 * Without sched_lock we could lose a race where we set NEEDRESCHED 723 * on a thread that is switched out before the IPI is delivered. This 724 * would lead us to miss the resched. This will be a problem once 725 * sched_lock is pushed down. 726 */ 727 pcpu = pcpu_find(cpu); 728 td = pcpu->pc_curthread; 729 if (ke->ke_thread->td_priority < td->td_priority || 730 td == pcpu->pc_idlethread) { 731 td->td_flags |= TDF_NEEDRESCHED; 732 ipi_selected(1 << cpu, IPI_AST); 733 } 734 } 735 736 static struct kse * 737 runq_steal(struct runq *rq) 738 { 739 struct rqhead *rqh; 740 struct rqbits *rqb; 741 struct kse *ke; 742 int word; 743 int bit; 744 745 mtx_assert(&sched_lock, MA_OWNED); 746 rqb = &rq->rq_status; 747 for (word = 0; word < RQB_LEN; word++) { 748 if (rqb->rqb_bits[word] == 0) 749 continue; 750 for (bit = 0; bit < RQB_BPW; bit++) { 751 if ((rqb->rqb_bits[word] & (1ul << bit)) == 0) 752 continue; 753 rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)]; 754 TAILQ_FOREACH(ke, rqh, ke_procq) { 755 if (KSE_CAN_MIGRATE(ke)) 756 return (ke); 757 } 758 } 759 } 760 return (NULL); 761 } 762 763 static struct kse * 764 kseq_steal(struct kseq *kseq, int stealidle) 765 { 766 struct kse *ke; 767 768 /* 769 * Steal from next first to try to get a non-interactive task that 770 * may not have run for a while. 771 */ 772 if ((ke = runq_steal(kseq->ksq_next)) != NULL) 773 return (ke); 774 if ((ke = runq_steal(kseq->ksq_curr)) != NULL) 775 return (ke); 776 if (stealidle) 777 return (runq_steal(&kseq->ksq_idle)); 778 return (NULL); 779 } 780 781 int 782 kseq_transfer(struct kseq *kseq, struct kse *ke, int class) 783 { 784 struct kseq_group *nksg; 785 struct kseq_group *ksg; 786 struct kseq *old; 787 int cpu; 788 int idx; 789 790 if (smp_started == 0) 791 return (0); 792 cpu = 0; 793 /* 794 * If our load exceeds a certain threshold we should attempt to 795 * reassign this thread. The first candidate is the cpu that 796 * originally ran the thread. If it is idle, assign it there, 797 * otherwise, pick an idle cpu. 798 * 799 * The threshold at which we start to reassign kses has a large impact 800 * on the overall performance of the system. Tuned too high and 801 * some CPUs may idle. Too low and there will be excess migration 802 * and context switches. 803 */ 804 old = KSEQ_CPU(ke->ke_cpu); 805 nksg = old->ksq_group; 806 ksg = kseq->ksq_group; 807 if (kseq_idle) { 808 if (kseq_idle & nksg->ksg_mask) { 809 cpu = ffs(nksg->ksg_idlemask); 810 if (cpu) { 811 CTR2(KTR_SCHED, 812 "kseq_transfer: %p found old cpu %X " 813 "in idlemask.", ke, cpu); 814 goto migrate; 815 } 816 } 817 /* 818 * Multiple cpus could find this bit simultaneously 819 * but the race shouldn't be terrible. 820 */ 821 cpu = ffs(kseq_idle); 822 if (cpu) { 823 CTR2(KTR_SCHED, "kseq_transfer: %p found %X " 824 "in idlemask.", ke, cpu); 825 goto migrate; 826 } 827 } 828 idx = 0; 829 #if 0 830 if (old->ksq_load < kseq->ksq_load) { 831 cpu = ke->ke_cpu + 1; 832 CTR2(KTR_SCHED, "kseq_transfer: %p old cpu %X " 833 "load less than ours.", ke, cpu); 834 goto migrate; 835 } 836 /* 837 * No new CPU was found, look for one with less load. 838 */ 839 for (idx = 0; idx <= ksg_maxid; idx++) { 840 nksg = KSEQ_GROUP(idx); 841 if (nksg->ksg_load /*+ (nksg->ksg_cpus * 2)*/ < ksg->ksg_load) { 842 cpu = ffs(nksg->ksg_cpumask); 843 CTR2(KTR_SCHED, "kseq_transfer: %p cpu %X load less " 844 "than ours.", ke, cpu); 845 goto migrate; 846 } 847 } 848 #endif 849 /* 850 * If another cpu in this group has idled, assign a thread over 851 * to them after checking to see if there are idled groups. 852 */ 853 if (ksg->ksg_idlemask) { 854 cpu = ffs(ksg->ksg_idlemask); 855 if (cpu) { 856 CTR2(KTR_SCHED, "kseq_transfer: %p cpu %X idle in " 857 "group.", ke, cpu); 858 goto migrate; 859 } 860 } 861 return (0); 862 migrate: 863 /* 864 * Now that we've found an idle CPU, migrate the thread. 865 */ 866 cpu--; 867 ke->ke_runq = NULL; 868 kseq_notify(ke, cpu); 869 870 return (1); 871 } 872 873 #endif /* SMP */ 874 875 /* 876 * Pick the highest priority task we have and return it. 877 */ 878 879 static struct kse * 880 kseq_choose(struct kseq *kseq) 881 { 882 struct runq *swap; 883 struct kse *ke; 884 int nice; 885 886 mtx_assert(&sched_lock, MA_OWNED); 887 swap = NULL; 888 889 for (;;) { 890 ke = runq_choose(kseq->ksq_curr); 891 if (ke == NULL) { 892 /* 893 * We already swapped once and didn't get anywhere. 894 */ 895 if (swap) 896 break; 897 swap = kseq->ksq_curr; 898 kseq->ksq_curr = kseq->ksq_next; 899 kseq->ksq_next = swap; 900 continue; 901 } 902 /* 903 * If we encounter a slice of 0 the kse is in a 904 * TIMESHARE kse group and its nice was too far out 905 * of the range that receives slices. 906 */ 907 nice = ke->ke_proc->p_nice + (0 - kseq->ksq_nicemin); 908 if (ke->ke_slice == 0 || (nice > SCHED_SLICE_NTHRESH && 909 ke->ke_proc->p_nice != 0)) { 910 runq_remove(ke->ke_runq, ke); 911 sched_slice(ke); 912 ke->ke_runq = kseq->ksq_next; 913 runq_add(ke->ke_runq, ke, 0); 914 continue; 915 } 916 return (ke); 917 } 918 919 return (runq_choose(&kseq->ksq_idle)); 920 } 921 922 static void 923 kseq_setup(struct kseq *kseq) 924 { 925 runq_init(&kseq->ksq_timeshare[0]); 926 runq_init(&kseq->ksq_timeshare[1]); 927 runq_init(&kseq->ksq_idle); 928 kseq->ksq_curr = &kseq->ksq_timeshare[0]; 929 kseq->ksq_next = &kseq->ksq_timeshare[1]; 930 kseq->ksq_load = 0; 931 kseq->ksq_load_timeshare = 0; 932 } 933 934 static void 935 sched_setup(void *dummy) 936 { 937 #ifdef SMP 938 int i; 939 #endif 940 941 slice_min = (hz/100); /* 10ms */ 942 slice_max = (hz/7); /* ~140ms */ 943 944 #ifdef SMP 945 balance_groups = 0; 946 /* 947 * Initialize the kseqs. 948 */ 949 for (i = 0; i < MAXCPU; i++) { 950 struct kseq *ksq; 951 952 ksq = &kseq_cpu[i]; 953 ksq->ksq_assigned = NULL; 954 kseq_setup(&kseq_cpu[i]); 955 } 956 if (smp_topology == NULL) { 957 struct kseq_group *ksg; 958 struct kseq *ksq; 959 int cpus; 960 961 for (cpus = 0, i = 0; i < MAXCPU; i++) { 962 if (CPU_ABSENT(i)) 963 continue; 964 ksq = &kseq_cpu[cpus]; 965 ksg = &kseq_groups[cpus]; 966 /* 967 * Setup a kseq group with one member. 968 */ 969 ksq->ksq_transferable = 0; 970 ksq->ksq_group = ksg; 971 ksg->ksg_cpus = 1; 972 ksg->ksg_idlemask = 0; 973 ksg->ksg_cpumask = ksg->ksg_mask = 1 << i; 974 ksg->ksg_load = 0; 975 ksg->ksg_transferable = 0; 976 LIST_INIT(&ksg->ksg_members); 977 LIST_INSERT_HEAD(&ksg->ksg_members, ksq, ksq_siblings); 978 cpus++; 979 } 980 ksg_maxid = cpus - 1; 981 } else { 982 struct kseq_group *ksg; 983 struct cpu_group *cg; 984 int j; 985 986 for (i = 0; i < smp_topology->ct_count; i++) { 987 cg = &smp_topology->ct_group[i]; 988 ksg = &kseq_groups[i]; 989 /* 990 * Initialize the group. 991 */ 992 ksg->ksg_idlemask = 0; 993 ksg->ksg_load = 0; 994 ksg->ksg_transferable = 0; 995 ksg->ksg_cpus = cg->cg_count; 996 ksg->ksg_cpumask = cg->cg_mask; 997 LIST_INIT(&ksg->ksg_members); 998 /* 999 * Find all of the group members and add them. 1000 */ 1001 for (j = 0; j < MAXCPU; j++) { 1002 if ((cg->cg_mask & (1 << j)) != 0) { 1003 if (ksg->ksg_mask == 0) 1004 ksg->ksg_mask = 1 << j; 1005 kseq_cpu[j].ksq_transferable = 0; 1006 kseq_cpu[j].ksq_group = ksg; 1007 LIST_INSERT_HEAD(&ksg->ksg_members, 1008 &kseq_cpu[j], ksq_siblings); 1009 } 1010 } 1011 if (ksg->ksg_cpus > 1) 1012 balance_groups = 1; 1013 } 1014 ksg_maxid = smp_topology->ct_count - 1; 1015 } 1016 /* 1017 * Stagger the group and global load balancer so they do not 1018 * interfere with each other. 1019 */ 1020 bal_tick = ticks + hz; 1021 if (balance_groups) 1022 gbal_tick = ticks + (hz / 2); 1023 #else 1024 kseq_setup(KSEQ_SELF()); 1025 #endif 1026 mtx_lock_spin(&sched_lock); 1027 kseq_load_add(KSEQ_SELF(), &kse0); 1028 mtx_unlock_spin(&sched_lock); 1029 } 1030 1031 /* 1032 * Scale the scheduling priority according to the "interactivity" of this 1033 * process. 1034 */ 1035 static void 1036 sched_priority(struct ksegrp *kg) 1037 { 1038 int pri; 1039 1040 if (kg->kg_pri_class != PRI_TIMESHARE) 1041 return; 1042 1043 pri = SCHED_PRI_INTERACT(sched_interact_score(kg)); 1044 pri += SCHED_PRI_BASE; 1045 pri += kg->kg_proc->p_nice; 1046 1047 if (pri > PRI_MAX_TIMESHARE) 1048 pri = PRI_MAX_TIMESHARE; 1049 else if (pri < PRI_MIN_TIMESHARE) 1050 pri = PRI_MIN_TIMESHARE; 1051 1052 kg->kg_user_pri = pri; 1053 1054 return; 1055 } 1056 1057 /* 1058 * Calculate a time slice based on the properties of the kseg and the runq 1059 * that we're on. This is only for PRI_TIMESHARE ksegrps. 1060 */ 1061 static void 1062 sched_slice(struct kse *ke) 1063 { 1064 struct kseq *kseq; 1065 struct ksegrp *kg; 1066 1067 kg = ke->ke_ksegrp; 1068 kseq = KSEQ_CPU(ke->ke_cpu); 1069 1070 if (ke->ke_thread->td_flags & TDF_BORROWING) { 1071 ke->ke_slice = SCHED_SLICE_MIN; 1072 return; 1073 } 1074 1075 /* 1076 * Rationale: 1077 * KSEs in interactive ksegs get a minimal slice so that we 1078 * quickly notice if it abuses its advantage. 1079 * 1080 * KSEs in non-interactive ksegs are assigned a slice that is 1081 * based on the ksegs nice value relative to the least nice kseg 1082 * on the run queue for this cpu. 1083 * 1084 * If the KSE is less nice than all others it gets the maximum 1085 * slice and other KSEs will adjust their slice relative to 1086 * this when they first expire. 1087 * 1088 * There is 20 point window that starts relative to the least 1089 * nice kse on the run queue. Slice size is determined by 1090 * the kse distance from the last nice ksegrp. 1091 * 1092 * If the kse is outside of the window it will get no slice 1093 * and will be reevaluated each time it is selected on the 1094 * run queue. The exception to this is nice 0 ksegs when 1095 * a nice -20 is running. They are always granted a minimum 1096 * slice. 1097 */ 1098 if (!SCHED_INTERACTIVE(kg)) { 1099 int nice; 1100 1101 nice = kg->kg_proc->p_nice + (0 - kseq->ksq_nicemin); 1102 if (kseq->ksq_load_timeshare == 0 || 1103 kg->kg_proc->p_nice < kseq->ksq_nicemin) 1104 ke->ke_slice = SCHED_SLICE_MAX; 1105 else if (nice <= SCHED_SLICE_NTHRESH) 1106 ke->ke_slice = SCHED_SLICE_NICE(nice); 1107 else if (kg->kg_proc->p_nice == 0) 1108 ke->ke_slice = SCHED_SLICE_MIN; 1109 else 1110 ke->ke_slice = 0; 1111 } else 1112 ke->ke_slice = SCHED_SLICE_INTERACTIVE; 1113 1114 return; 1115 } 1116 1117 /* 1118 * This routine enforces a maximum limit on the amount of scheduling history 1119 * kept. It is called after either the slptime or runtime is adjusted. 1120 * This routine will not operate correctly when slp or run times have been 1121 * adjusted to more than double their maximum. 1122 */ 1123 static void 1124 sched_interact_update(struct ksegrp *kg) 1125 { 1126 int sum; 1127 1128 sum = kg->kg_runtime + kg->kg_slptime; 1129 if (sum < SCHED_SLP_RUN_MAX) 1130 return; 1131 /* 1132 * If we have exceeded by more than 1/5th then the algorithm below 1133 * will not bring us back into range. Dividing by two here forces 1134 * us into the range of [4/5 * SCHED_INTERACT_MAX, SCHED_INTERACT_MAX] 1135 */ 1136 if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) { 1137 kg->kg_runtime /= 2; 1138 kg->kg_slptime /= 2; 1139 return; 1140 } 1141 kg->kg_runtime = (kg->kg_runtime / 5) * 4; 1142 kg->kg_slptime = (kg->kg_slptime / 5) * 4; 1143 } 1144 1145 static void 1146 sched_interact_fork(struct ksegrp *kg) 1147 { 1148 int ratio; 1149 int sum; 1150 1151 sum = kg->kg_runtime + kg->kg_slptime; 1152 if (sum > SCHED_SLP_RUN_FORK) { 1153 ratio = sum / SCHED_SLP_RUN_FORK; 1154 kg->kg_runtime /= ratio; 1155 kg->kg_slptime /= ratio; 1156 } 1157 } 1158 1159 static int 1160 sched_interact_score(struct ksegrp *kg) 1161 { 1162 int div; 1163 1164 if (kg->kg_runtime > kg->kg_slptime) { 1165 div = max(1, kg->kg_runtime / SCHED_INTERACT_HALF); 1166 return (SCHED_INTERACT_HALF + 1167 (SCHED_INTERACT_HALF - (kg->kg_slptime / div))); 1168 } if (kg->kg_slptime > kg->kg_runtime) { 1169 div = max(1, kg->kg_slptime / SCHED_INTERACT_HALF); 1170 return (kg->kg_runtime / div); 1171 } 1172 1173 /* 1174 * This can happen if slptime and runtime are 0. 1175 */ 1176 return (0); 1177 1178 } 1179 1180 /* 1181 * Very early in the boot some setup of scheduler-specific 1182 * parts of proc0 and of soem scheduler resources needs to be done. 1183 * Called from: 1184 * proc0_init() 1185 */ 1186 void 1187 schedinit(void) 1188 { 1189 /* 1190 * Set up the scheduler specific parts of proc0. 1191 */ 1192 proc0.p_sched = NULL; /* XXX */ 1193 ksegrp0.kg_sched = &kg_sched0; 1194 thread0.td_sched = &kse0; 1195 kse0.ke_thread = &thread0; 1196 kse0.ke_state = KES_THREAD; 1197 kg_sched0.skg_concurrency = 1; 1198 kg_sched0.skg_avail_opennings = 0; /* we are already running */ 1199 } 1200 1201 /* 1202 * This is only somewhat accurate since given many processes of the same 1203 * priority they will switch when their slices run out, which will be 1204 * at most SCHED_SLICE_MAX. 1205 */ 1206 int 1207 sched_rr_interval(void) 1208 { 1209 return (SCHED_SLICE_MAX); 1210 } 1211 1212 static void 1213 sched_pctcpu_update(struct kse *ke) 1214 { 1215 /* 1216 * Adjust counters and watermark for pctcpu calc. 1217 */ 1218 if (ke->ke_ltick > ticks - SCHED_CPU_TICKS) { 1219 /* 1220 * Shift the tick count out so that the divide doesn't 1221 * round away our results. 1222 */ 1223 ke->ke_ticks <<= 10; 1224 ke->ke_ticks = (ke->ke_ticks / (ticks - ke->ke_ftick)) * 1225 SCHED_CPU_TICKS; 1226 ke->ke_ticks >>= 10; 1227 } else 1228 ke->ke_ticks = 0; 1229 ke->ke_ltick = ticks; 1230 ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS; 1231 } 1232 1233 void 1234 sched_thread_priority(struct thread *td, u_char prio) 1235 { 1236 struct kse *ke; 1237 1238 CTR6(KTR_SCHED, "sched_prio: %p(%s) prio %d newprio %d by %p(%s)", 1239 td, td->td_proc->p_comm, td->td_priority, prio, curthread, 1240 curthread->td_proc->p_comm); 1241 ke = td->td_kse; 1242 mtx_assert(&sched_lock, MA_OWNED); 1243 if (td->td_priority == prio) 1244 return; 1245 if (TD_ON_RUNQ(td)) { 1246 /* 1247 * If the priority has been elevated due to priority 1248 * propagation, we may have to move ourselves to a new 1249 * queue. We still call adjustrunqueue below in case kse 1250 * needs to fix things up. 1251 */ 1252 if (prio < td->td_priority && ke->ke_runq != NULL && 1253 (ke->ke_flags & KEF_ASSIGNED) == 0 && 1254 ke->ke_runq != KSEQ_CPU(ke->ke_cpu)->ksq_curr) { 1255 runq_remove(ke->ke_runq, ke); 1256 ke->ke_runq = KSEQ_CPU(ke->ke_cpu)->ksq_curr; 1257 runq_add(ke->ke_runq, ke, 0); 1258 } 1259 /* 1260 * Hold this kse on this cpu so that sched_prio() doesn't 1261 * cause excessive migration. We only want migration to 1262 * happen as the result of a wakeup. 1263 */ 1264 ke->ke_flags |= KEF_HOLD; 1265 adjustrunqueue(td, prio); 1266 ke->ke_flags &= ~KEF_HOLD; 1267 } else 1268 td->td_priority = prio; 1269 } 1270 1271 /* 1272 * Update a thread's priority when it is lent another thread's 1273 * priority. 1274 */ 1275 void 1276 sched_lend_prio(struct thread *td, u_char prio) 1277 { 1278 1279 td->td_flags |= TDF_BORROWING; 1280 sched_thread_priority(td, prio); 1281 } 1282 1283 /* 1284 * Restore a thread's priority when priority propagation is 1285 * over. The prio argument is the minimum priority the thread 1286 * needs to have to satisfy other possible priority lending 1287 * requests. If the thread's regular priority is less 1288 * important than prio, the thread will keep a priority boost 1289 * of prio. 1290 */ 1291 void 1292 sched_unlend_prio(struct thread *td, u_char prio) 1293 { 1294 u_char base_pri; 1295 1296 if (td->td_base_pri >= PRI_MIN_TIMESHARE && 1297 td->td_base_pri <= PRI_MAX_TIMESHARE) 1298 base_pri = td->td_ksegrp->kg_user_pri; 1299 else 1300 base_pri = td->td_base_pri; 1301 if (prio >= base_pri) { 1302 td->td_flags &= ~TDF_BORROWING; 1303 sched_thread_priority(td, base_pri); 1304 } else 1305 sched_lend_prio(td, prio); 1306 } 1307 1308 void 1309 sched_prio(struct thread *td, u_char prio) 1310 { 1311 u_char oldprio; 1312 1313 /* First, update the base priority. */ 1314 td->td_base_pri = prio; 1315 1316 /* 1317 * If the thread is borrowing another thread's priority, don't 1318 * ever lower the priority. 1319 */ 1320 if (td->td_flags & TDF_BORROWING && td->td_priority < prio) 1321 return; 1322 1323 /* Change the real priority. */ 1324 oldprio = td->td_priority; 1325 sched_thread_priority(td, prio); 1326 1327 /* 1328 * If the thread is on a turnstile, then let the turnstile update 1329 * its state. 1330 */ 1331 if (TD_ON_LOCK(td) && oldprio != prio) 1332 turnstile_adjust(td, oldprio); 1333 } 1334 1335 void 1336 sched_switch(struct thread *td, struct thread *newtd, int flags) 1337 { 1338 struct kseq *ksq; 1339 struct kse *ke; 1340 1341 mtx_assert(&sched_lock, MA_OWNED); 1342 1343 ke = td->td_kse; 1344 ksq = KSEQ_SELF(); 1345 1346 td->td_lastcpu = td->td_oncpu; 1347 td->td_oncpu = NOCPU; 1348 td->td_flags &= ~TDF_NEEDRESCHED; 1349 td->td_pflags &= ~TDP_OWEPREEMPT; 1350 1351 /* 1352 * If the KSE has been assigned it may be in the process of switching 1353 * to the new cpu. This is the case in sched_bind(). 1354 */ 1355 if (td == PCPU_GET(idlethread)) { 1356 TD_SET_CAN_RUN(td); 1357 } else if ((ke->ke_flags & KEF_ASSIGNED) == 0) { 1358 /* We are ending our run so make our slot available again */ 1359 SLOT_RELEASE(td->td_ksegrp); 1360 kseq_load_rem(ksq, ke); 1361 if (TD_IS_RUNNING(td)) { 1362 /* 1363 * Don't allow the thread to migrate 1364 * from a preemption. 1365 */ 1366 ke->ke_flags |= KEF_HOLD; 1367 setrunqueue(td, (flags & SW_PREEMPT) ? 1368 SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED : 1369 SRQ_OURSELF|SRQ_YIELDING); 1370 ke->ke_flags &= ~KEF_HOLD; 1371 } else if ((td->td_proc->p_flag & P_HADTHREADS) && 1372 (newtd == NULL || newtd->td_ksegrp != td->td_ksegrp)) 1373 /* 1374 * We will not be on the run queue. 1375 * So we must be sleeping or similar. 1376 * Don't use the slot if we will need it 1377 * for newtd. 1378 */ 1379 slot_fill(td->td_ksegrp); 1380 } 1381 if (newtd != NULL) { 1382 /* 1383 * If we bring in a thread, 1384 * then account for it as if it had been added to the 1385 * run queue and then chosen. 1386 */ 1387 newtd->td_kse->ke_flags |= KEF_DIDRUN; 1388 newtd->td_kse->ke_runq = ksq->ksq_curr; 1389 SLOT_USE(newtd->td_ksegrp); 1390 TD_SET_RUNNING(newtd); 1391 kseq_load_add(KSEQ_SELF(), newtd->td_kse); 1392 } else 1393 newtd = choosethread(); 1394 if (td != newtd) 1395 cpu_switch(td, newtd); 1396 sched_lock.mtx_lock = (uintptr_t)td; 1397 1398 td->td_oncpu = PCPU_GET(cpuid); 1399 } 1400 1401 void 1402 sched_nice(struct proc *p, int nice) 1403 { 1404 struct ksegrp *kg; 1405 struct kse *ke; 1406 struct thread *td; 1407 struct kseq *kseq; 1408 1409 PROC_LOCK_ASSERT(p, MA_OWNED); 1410 mtx_assert(&sched_lock, MA_OWNED); 1411 /* 1412 * We need to adjust the nice counts for running KSEs. 1413 */ 1414 FOREACH_KSEGRP_IN_PROC(p, kg) { 1415 if (kg->kg_pri_class == PRI_TIMESHARE) { 1416 FOREACH_THREAD_IN_GROUP(kg, td) { 1417 ke = td->td_kse; 1418 if (ke->ke_runq == NULL) 1419 continue; 1420 kseq = KSEQ_CPU(ke->ke_cpu); 1421 kseq_nice_rem(kseq, p->p_nice); 1422 kseq_nice_add(kseq, nice); 1423 } 1424 } 1425 } 1426 p->p_nice = nice; 1427 FOREACH_KSEGRP_IN_PROC(p, kg) { 1428 sched_priority(kg); 1429 FOREACH_THREAD_IN_GROUP(kg, td) 1430 td->td_flags |= TDF_NEEDRESCHED; 1431 } 1432 } 1433 1434 void 1435 sched_sleep(struct thread *td) 1436 { 1437 mtx_assert(&sched_lock, MA_OWNED); 1438 1439 td->td_slptime = ticks; 1440 } 1441 1442 void 1443 sched_wakeup(struct thread *td) 1444 { 1445 mtx_assert(&sched_lock, MA_OWNED); 1446 1447 /* 1448 * Let the kseg know how long we slept for. This is because process 1449 * interactivity behavior is modeled in the kseg. 1450 */ 1451 if (td->td_slptime) { 1452 struct ksegrp *kg; 1453 int hzticks; 1454 1455 kg = td->td_ksegrp; 1456 hzticks = (ticks - td->td_slptime) << 10; 1457 if (hzticks >= SCHED_SLP_RUN_MAX) { 1458 kg->kg_slptime = SCHED_SLP_RUN_MAX; 1459 kg->kg_runtime = 1; 1460 } else { 1461 kg->kg_slptime += hzticks; 1462 sched_interact_update(kg); 1463 } 1464 sched_priority(kg); 1465 sched_slice(td->td_kse); 1466 td->td_slptime = 0; 1467 } 1468 setrunqueue(td, SRQ_BORING); 1469 } 1470 1471 /* 1472 * Penalize the parent for creating a new child and initialize the child's 1473 * priority. 1474 */ 1475 void 1476 sched_fork(struct thread *td, struct thread *childtd) 1477 { 1478 1479 mtx_assert(&sched_lock, MA_OWNED); 1480 1481 sched_fork_ksegrp(td, childtd->td_ksegrp); 1482 sched_fork_thread(td, childtd); 1483 } 1484 1485 void 1486 sched_fork_ksegrp(struct thread *td, struct ksegrp *child) 1487 { 1488 struct ksegrp *kg = td->td_ksegrp; 1489 mtx_assert(&sched_lock, MA_OWNED); 1490 1491 child->kg_slptime = kg->kg_slptime; 1492 child->kg_runtime = kg->kg_runtime; 1493 child->kg_user_pri = kg->kg_user_pri; 1494 sched_interact_fork(child); 1495 kg->kg_runtime += tickincr << 10; 1496 sched_interact_update(kg); 1497 } 1498 1499 void 1500 sched_fork_thread(struct thread *td, struct thread *child) 1501 { 1502 struct kse *ke; 1503 struct kse *ke2; 1504 1505 sched_newthread(child); 1506 ke = td->td_kse; 1507 ke2 = child->td_kse; 1508 ke2->ke_slice = 1; /* Attempt to quickly learn interactivity. */ 1509 ke2->ke_cpu = ke->ke_cpu; 1510 ke2->ke_runq = NULL; 1511 1512 /* Grab our parents cpu estimation information. */ 1513 ke2->ke_ticks = ke->ke_ticks; 1514 ke2->ke_ltick = ke->ke_ltick; 1515 ke2->ke_ftick = ke->ke_ftick; 1516 } 1517 1518 void 1519 sched_class(struct ksegrp *kg, int class) 1520 { 1521 struct kseq *kseq; 1522 struct kse *ke; 1523 struct thread *td; 1524 int nclass; 1525 int oclass; 1526 1527 mtx_assert(&sched_lock, MA_OWNED); 1528 if (kg->kg_pri_class == class) 1529 return; 1530 1531 nclass = PRI_BASE(class); 1532 oclass = PRI_BASE(kg->kg_pri_class); 1533 FOREACH_THREAD_IN_GROUP(kg, td) { 1534 ke = td->td_kse; 1535 if ((ke->ke_state != KES_ONRUNQ && 1536 ke->ke_state != KES_THREAD) || ke->ke_runq == NULL) 1537 continue; 1538 kseq = KSEQ_CPU(ke->ke_cpu); 1539 1540 #ifdef SMP 1541 /* 1542 * On SMP if we're on the RUNQ we must adjust the transferable 1543 * count because could be changing to or from an interrupt 1544 * class. 1545 */ 1546 if (ke->ke_state == KES_ONRUNQ) { 1547 if (KSE_CAN_MIGRATE(ke)) { 1548 kseq->ksq_transferable--; 1549 kseq->ksq_group->ksg_transferable--; 1550 } 1551 if (KSE_CAN_MIGRATE(ke)) { 1552 kseq->ksq_transferable++; 1553 kseq->ksq_group->ksg_transferable++; 1554 } 1555 } 1556 #endif 1557 if (oclass == PRI_TIMESHARE) { 1558 kseq->ksq_load_timeshare--; 1559 kseq_nice_rem(kseq, kg->kg_proc->p_nice); 1560 } 1561 if (nclass == PRI_TIMESHARE) { 1562 kseq->ksq_load_timeshare++; 1563 kseq_nice_add(kseq, kg->kg_proc->p_nice); 1564 } 1565 } 1566 1567 kg->kg_pri_class = class; 1568 } 1569 1570 /* 1571 * Return some of the child's priority and interactivity to the parent. 1572 */ 1573 void 1574 sched_exit(struct proc *p, struct thread *childtd) 1575 { 1576 mtx_assert(&sched_lock, MA_OWNED); 1577 sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), childtd); 1578 sched_exit_thread(NULL, childtd); 1579 } 1580 1581 void 1582 sched_exit_ksegrp(struct ksegrp *kg, struct thread *td) 1583 { 1584 /* kg->kg_slptime += td->td_ksegrp->kg_slptime; */ 1585 kg->kg_runtime += td->td_ksegrp->kg_runtime; 1586 sched_interact_update(kg); 1587 } 1588 1589 void 1590 sched_exit_thread(struct thread *td, struct thread *childtd) 1591 { 1592 CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d", 1593 childtd, childtd->td_proc->p_comm, childtd->td_priority); 1594 kseq_load_rem(KSEQ_CPU(childtd->td_kse->ke_cpu), childtd->td_kse); 1595 } 1596 1597 void 1598 sched_clock(struct thread *td) 1599 { 1600 struct kseq *kseq; 1601 struct ksegrp *kg; 1602 struct kse *ke; 1603 1604 mtx_assert(&sched_lock, MA_OWNED); 1605 kseq = KSEQ_SELF(); 1606 #ifdef SMP 1607 if (ticks >= bal_tick) 1608 sched_balance(); 1609 if (ticks >= gbal_tick && balance_groups) 1610 sched_balance_groups(); 1611 /* 1612 * We could have been assigned a non real-time thread without an 1613 * IPI. 1614 */ 1615 if (kseq->ksq_assigned) 1616 kseq_assign(kseq); /* Potentially sets NEEDRESCHED */ 1617 #endif 1618 /* 1619 * sched_setup() apparently happens prior to stathz being set. We 1620 * need to resolve the timers earlier in the boot so we can avoid 1621 * calculating this here. 1622 */ 1623 if (realstathz == 0) { 1624 realstathz = stathz ? stathz : hz; 1625 tickincr = hz / realstathz; 1626 /* 1627 * XXX This does not work for values of stathz that are much 1628 * larger than hz. 1629 */ 1630 if (tickincr == 0) 1631 tickincr = 1; 1632 } 1633 1634 ke = td->td_kse; 1635 kg = ke->ke_ksegrp; 1636 1637 /* Adjust ticks for pctcpu */ 1638 ke->ke_ticks++; 1639 ke->ke_ltick = ticks; 1640 1641 /* Go up to one second beyond our max and then trim back down */ 1642 if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick) 1643 sched_pctcpu_update(ke); 1644 1645 if (td->td_flags & TDF_IDLETD) 1646 return; 1647 /* 1648 * We only do slicing code for TIMESHARE ksegrps. 1649 */ 1650 if (kg->kg_pri_class != PRI_TIMESHARE) 1651 return; 1652 /* 1653 * We used a tick charge it to the ksegrp so that we can compute our 1654 * interactivity. 1655 */ 1656 kg->kg_runtime += tickincr << 10; 1657 sched_interact_update(kg); 1658 1659 /* 1660 * We used up one time slice. 1661 */ 1662 if (--ke->ke_slice > 0) 1663 return; 1664 /* 1665 * We're out of time, recompute priorities and requeue. 1666 */ 1667 kseq_load_rem(kseq, ke); 1668 sched_priority(kg); 1669 sched_slice(ke); 1670 if (SCHED_CURR(kg, ke)) 1671 ke->ke_runq = kseq->ksq_curr; 1672 else 1673 ke->ke_runq = kseq->ksq_next; 1674 kseq_load_add(kseq, ke); 1675 td->td_flags |= TDF_NEEDRESCHED; 1676 } 1677 1678 int 1679 sched_runnable(void) 1680 { 1681 struct kseq *kseq; 1682 int load; 1683 1684 load = 1; 1685 1686 kseq = KSEQ_SELF(); 1687 #ifdef SMP 1688 if (kseq->ksq_assigned) { 1689 mtx_lock_spin(&sched_lock); 1690 kseq_assign(kseq); 1691 mtx_unlock_spin(&sched_lock); 1692 } 1693 #endif 1694 if ((curthread->td_flags & TDF_IDLETD) != 0) { 1695 if (kseq->ksq_load > 0) 1696 goto out; 1697 } else 1698 if (kseq->ksq_load - 1 > 0) 1699 goto out; 1700 load = 0; 1701 out: 1702 return (load); 1703 } 1704 1705 void 1706 sched_userret(struct thread *td) 1707 { 1708 struct ksegrp *kg; 1709 1710 KASSERT((td->td_flags & TDF_BORROWING) == 0, 1711 ("thread with borrowed priority returning to userland")); 1712 kg = td->td_ksegrp; 1713 if (td->td_priority != kg->kg_user_pri) { 1714 mtx_lock_spin(&sched_lock); 1715 td->td_priority = kg->kg_user_pri; 1716 td->td_base_pri = kg->kg_user_pri; 1717 mtx_unlock_spin(&sched_lock); 1718 } 1719 } 1720 1721 struct kse * 1722 sched_choose(void) 1723 { 1724 struct kseq *kseq; 1725 struct kse *ke; 1726 1727 mtx_assert(&sched_lock, MA_OWNED); 1728 kseq = KSEQ_SELF(); 1729 #ifdef SMP 1730 restart: 1731 if (kseq->ksq_assigned) 1732 kseq_assign(kseq); 1733 #endif 1734 ke = kseq_choose(kseq); 1735 if (ke) { 1736 #ifdef SMP 1737 if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE) 1738 if (kseq_idled(kseq) == 0) 1739 goto restart; 1740 #endif 1741 kseq_runq_rem(kseq, ke); 1742 ke->ke_state = KES_THREAD; 1743 return (ke); 1744 } 1745 #ifdef SMP 1746 if (kseq_idled(kseq) == 0) 1747 goto restart; 1748 #endif 1749 return (NULL); 1750 } 1751 1752 void 1753 sched_add(struct thread *td, int flags) 1754 { 1755 struct kseq *kseq; 1756 struct ksegrp *kg; 1757 struct kse *ke; 1758 int preemptive; 1759 int canmigrate; 1760 int class; 1761 1762 CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)", 1763 td, td->td_proc->p_comm, td->td_priority, curthread, 1764 curthread->td_proc->p_comm); 1765 mtx_assert(&sched_lock, MA_OWNED); 1766 ke = td->td_kse; 1767 kg = td->td_ksegrp; 1768 canmigrate = 1; 1769 preemptive = !(flags & SRQ_YIELDING); 1770 class = PRI_BASE(kg->kg_pri_class); 1771 kseq = KSEQ_SELF(); 1772 if ((ke->ke_flags & KEF_INTERNAL) == 0) 1773 SLOT_USE(td->td_ksegrp); 1774 ke->ke_flags &= ~KEF_INTERNAL; 1775 #ifdef SMP 1776 if (ke->ke_flags & KEF_ASSIGNED) { 1777 if (ke->ke_flags & KEF_REMOVED) 1778 ke->ke_flags &= ~KEF_REMOVED; 1779 return; 1780 } 1781 canmigrate = KSE_CAN_MIGRATE(ke); 1782 #endif 1783 KASSERT(ke->ke_state != KES_ONRUNQ, 1784 ("sched_add: kse %p (%s) already in run queue", ke, 1785 ke->ke_proc->p_comm)); 1786 KASSERT(ke->ke_proc->p_sflag & PS_INMEM, 1787 ("sched_add: process swapped out")); 1788 KASSERT(ke->ke_runq == NULL, 1789 ("sched_add: KSE %p is still assigned to a run queue", ke)); 1790 switch (class) { 1791 case PRI_ITHD: 1792 case PRI_REALTIME: 1793 ke->ke_runq = kseq->ksq_curr; 1794 ke->ke_slice = SCHED_SLICE_MAX; 1795 if (canmigrate) 1796 ke->ke_cpu = PCPU_GET(cpuid); 1797 break; 1798 case PRI_TIMESHARE: 1799 if (SCHED_CURR(kg, ke)) 1800 ke->ke_runq = kseq->ksq_curr; 1801 else 1802 ke->ke_runq = kseq->ksq_next; 1803 break; 1804 case PRI_IDLE: 1805 /* 1806 * This is for priority prop. 1807 */ 1808 if (ke->ke_thread->td_priority < PRI_MIN_IDLE) 1809 ke->ke_runq = kseq->ksq_curr; 1810 else 1811 ke->ke_runq = &kseq->ksq_idle; 1812 ke->ke_slice = SCHED_SLICE_MIN; 1813 break; 1814 default: 1815 panic("Unknown pri class."); 1816 break; 1817 } 1818 #ifdef SMP 1819 /* 1820 * Don't migrate running threads here. Force the long term balancer 1821 * to do it. 1822 */ 1823 if (ke->ke_flags & KEF_HOLD) { 1824 ke->ke_flags &= ~KEF_HOLD; 1825 canmigrate = 0; 1826 } 1827 /* 1828 * If this thread is pinned or bound, notify the target cpu. 1829 */ 1830 if (!canmigrate && ke->ke_cpu != PCPU_GET(cpuid) ) { 1831 ke->ke_runq = NULL; 1832 kseq_notify(ke, ke->ke_cpu); 1833 return; 1834 } 1835 /* 1836 * If we had been idle, clear our bit in the group and potentially 1837 * the global bitmap. If not, see if we should transfer this thread. 1838 */ 1839 if ((class == PRI_TIMESHARE || class == PRI_REALTIME) && 1840 (kseq->ksq_group->ksg_idlemask & PCPU_GET(cpumask)) != 0) { 1841 /* 1842 * Check to see if our group is unidling, and if so, remove it 1843 * from the global idle mask. 1844 */ 1845 if (kseq->ksq_group->ksg_idlemask == 1846 kseq->ksq_group->ksg_cpumask) 1847 atomic_clear_int(&kseq_idle, kseq->ksq_group->ksg_mask); 1848 /* 1849 * Now remove ourselves from the group specific idle mask. 1850 */ 1851 kseq->ksq_group->ksg_idlemask &= ~PCPU_GET(cpumask); 1852 } else if (canmigrate && kseq->ksq_load > 1 && class != PRI_ITHD) 1853 if (kseq_transfer(kseq, ke, class)) 1854 return; 1855 ke->ke_cpu = PCPU_GET(cpuid); 1856 #endif 1857 if (td->td_priority < curthread->td_priority && 1858 ke->ke_runq == kseq->ksq_curr) 1859 curthread->td_flags |= TDF_NEEDRESCHED; 1860 if (preemptive && maybe_preempt(td)) 1861 return; 1862 ke->ke_state = KES_ONRUNQ; 1863 1864 kseq_runq_add(kseq, ke, flags); 1865 kseq_load_add(kseq, ke); 1866 } 1867 1868 void 1869 sched_rem(struct thread *td) 1870 { 1871 struct kseq *kseq; 1872 struct kse *ke; 1873 1874 CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)", 1875 td, td->td_proc->p_comm, td->td_priority, curthread, 1876 curthread->td_proc->p_comm); 1877 mtx_assert(&sched_lock, MA_OWNED); 1878 ke = td->td_kse; 1879 SLOT_RELEASE(td->td_ksegrp); 1880 if (ke->ke_flags & KEF_ASSIGNED) { 1881 ke->ke_flags |= KEF_REMOVED; 1882 return; 1883 } 1884 KASSERT((ke->ke_state == KES_ONRUNQ), 1885 ("sched_rem: KSE not on run queue")); 1886 1887 ke->ke_state = KES_THREAD; 1888 kseq = KSEQ_CPU(ke->ke_cpu); 1889 kseq_runq_rem(kseq, ke); 1890 kseq_load_rem(kseq, ke); 1891 } 1892 1893 fixpt_t 1894 sched_pctcpu(struct thread *td) 1895 { 1896 fixpt_t pctcpu; 1897 struct kse *ke; 1898 1899 pctcpu = 0; 1900 ke = td->td_kse; 1901 if (ke == NULL) 1902 return (0); 1903 1904 mtx_lock_spin(&sched_lock); 1905 if (ke->ke_ticks) { 1906 int rtick; 1907 1908 /* 1909 * Don't update more frequently than twice a second. Allowing 1910 * this causes the cpu usage to decay away too quickly due to 1911 * rounding errors. 1912 */ 1913 if (ke->ke_ftick + SCHED_CPU_TICKS < ke->ke_ltick || 1914 ke->ke_ltick < (ticks - (hz / 2))) 1915 sched_pctcpu_update(ke); 1916 /* How many rtick per second ? */ 1917 rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS); 1918 pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT; 1919 } 1920 1921 ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick; 1922 mtx_unlock_spin(&sched_lock); 1923 1924 return (pctcpu); 1925 } 1926 1927 void 1928 sched_bind(struct thread *td, int cpu) 1929 { 1930 struct kse *ke; 1931 1932 mtx_assert(&sched_lock, MA_OWNED); 1933 ke = td->td_kse; 1934 ke->ke_flags |= KEF_BOUND; 1935 #ifdef SMP 1936 if (PCPU_GET(cpuid) == cpu) 1937 return; 1938 /* sched_rem without the runq_remove */ 1939 ke->ke_state = KES_THREAD; 1940 kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke); 1941 kseq_notify(ke, cpu); 1942 /* When we return from mi_switch we'll be on the correct cpu. */ 1943 mi_switch(SW_VOL, NULL); 1944 #endif 1945 } 1946 1947 void 1948 sched_unbind(struct thread *td) 1949 { 1950 mtx_assert(&sched_lock, MA_OWNED); 1951 td->td_kse->ke_flags &= ~KEF_BOUND; 1952 } 1953 1954 int 1955 sched_load(void) 1956 { 1957 #ifdef SMP 1958 int total; 1959 int i; 1960 1961 total = 0; 1962 for (i = 0; i <= ksg_maxid; i++) 1963 total += KSEQ_GROUP(i)->ksg_load; 1964 return (total); 1965 #else 1966 return (KSEQ_SELF()->ksq_sysload); 1967 #endif 1968 } 1969 1970 int 1971 sched_sizeof_ksegrp(void) 1972 { 1973 return (sizeof(struct ksegrp) + sizeof(struct kg_sched)); 1974 } 1975 1976 int 1977 sched_sizeof_proc(void) 1978 { 1979 return (sizeof(struct proc)); 1980 } 1981 1982 int 1983 sched_sizeof_thread(void) 1984 { 1985 return (sizeof(struct thread) + sizeof(struct td_sched)); 1986 } 1987 #define KERN_SWITCH_INCLUDE 1 1988 #include "kern/kern_switch.c" 1989