/*-
 * Copyright (c) 2002-2005, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_hwpmc_hooks.h"
#include "opt_sched.h"

#define	kse td_sched

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/turnstile.h>
#include <sys/umtx.h>
#include <sys/vmmeter.h>
#ifdef KTRACE
#include <sys/uio.h>
#include <sys/ktrace.h>
#endif

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
#endif

#include <machine/cpu.h>
#include <machine/smp.h>

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
/* XXX This is bogus compatibility crap for ps */
static fixpt_t  ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");

static void sched_setup(void *dummy);
SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)

static void sched_initticks(void *dummy);
SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, NULL)

static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler");

SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ule", 0,
    "Scheduler name");

static int slice_min = 1;
SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, "");

static int slice_max = 10;
SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, "");

int realstathz;
int tickincr = 1 << 10;

/*
 * The following data structures are allocated within their parent structure
 * but are scheduler specific.
 */
/*
 * The schedulable entity that can be given a context to run.  A process may
 * have several of these.
 */
struct kse {
	TAILQ_ENTRY(kse) ke_procq;	/* (j/z) Run queue. */
	int		ke_flags;	/* (j) KEF_* flags. */
	struct thread	*ke_thread;	/* (*) Active associated thread. */
	fixpt_t		ke_pctcpu;	/* (j) %cpu during p_swtime. */
	u_char		ke_rqindex;	/* (j) Run queue index. */
	enum {
		KES_THREAD = 0x0,	/* slaved to thread state */
		KES_ONRUNQ
	} ke_state;			/* (j) thread sched specific status. */
	int		ke_slptime;
	int		ke_slice;
	struct runq	*ke_runq;
	u_char		ke_cpu;		/* CPU that we have affinity for. */
	/* The following variables are only used for pctcpu calculation */
	int		ke_ltick;	/* Last tick that we were running on */
	int		ke_ftick;	/* First tick that we were running on */
	int		ke_ticks;	/* Tick count */

};
#define	td_kse			td_sched
#define	td_slptime		td_kse->ke_slptime
#define	ke_proc			ke_thread->td_proc
#define	ke_ksegrp		ke_thread->td_ksegrp
#define	ke_assign		ke_procq.tqe_next
/* flags kept in ke_flags */
#define	KEF_ASSIGNED	0x0001		/* Thread is being migrated. */
#define	KEF_BOUND	0x0002		/* Thread can not migrate. */
#define	KEF_XFERABLE	0x0004		/* Thread was added as transferable. */
#define	KEF_HOLD	0x0008		/* Thread is temporarily bound. */
#define	KEF_REMOVED	0x0010		/* Thread was removed while ASSIGNED */
#define	KEF_INTERNAL	0x0020		/* Thread added due to migration. */
#define	KEF_PREEMPTED	0x0040		/* Thread was preempted */
#define	KEF_DIDRUN	0x02000		/* Thread actually ran. */
#define	KEF_EXIT	0x04000		/* Thread is being killed. */

struct kg_sched {
	struct thread	*skg_last_assigned; /* (j) Last thread assigned to */
					    /* the system scheduler */
	int	skg_slptime;		/* Number of ticks we vol. slept */
	int	skg_runtime;		/* Number of ticks we were running */
	int	skg_avail_opennings;	/* (j) Num unfilled slots in group. */
	int	skg_concurrency;	/* (j) Num threads requested in group. */
};
#define	kg_last_assigned	kg_sched->skg_last_assigned
#define	kg_avail_opennings	kg_sched->skg_avail_opennings
#define	kg_concurrency		kg_sched->skg_concurrency
#define	kg_runtime		kg_sched->skg_runtime
#define	kg_slptime		kg_sched->skg_slptime

#define	SLOT_RELEASE(kg)	(kg)->kg_avail_opennings++
#define	SLOT_USE(kg)		(kg)->kg_avail_opennings--

static struct kse kse0;
static struct kg_sched kg_sched0;

/*
 * The priority is primarily determined by the interactivity score.  Thus, we
 * give lower(better) priorities to kse groups that use less CPU.  The nice
 * value is then directly added to this to allow nice to have some effect
 * on latency.
 *
 * PRI_RANGE:	Total priority range for timeshare threads.
 * PRI_NRESV:	Number of nice values.
 * PRI_BASE:	The start of the dynamic range.
 */
#define	SCHED_PRI_RANGE		(PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
#define	SCHED_PRI_NRESV		((PRIO_MAX - PRIO_MIN) + 1)
#define	SCHED_PRI_NHALF		(SCHED_PRI_NRESV / 2)
#define	SCHED_PRI_BASE		(PRI_MIN_TIMESHARE)
#define	SCHED_PRI_INTERACT(score)					\
    ((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX)

/*
 * These determine the interactivity of a process.
 *
 * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
 *		before throttling back.
 * SLP_RUN_FORK:	Maximum slp+run time to inherit at fork time.
 * INTERACT_MAX:	Maximum interactivity value.  Smaller is better.
 * INTERACT_THRESH:	Threshold for placement on the current runq.
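 *
 * As a rough worked example (assuming hz = 1000 and the <<10 fixed point
 * scaling also used by tickincr), SCHED_SLP_RUN_MAX evaluates to
 * (1000 * 5) << 10 = 5120000, i.e. roughly five seconds of combined
 * sleep + run history before sched_interact_update() starts scaling it back.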
 */
#define	SCHED_SLP_RUN_MAX	((hz * 5) << 10)
#define	SCHED_SLP_RUN_FORK	((hz / 2) << 10)
#define	SCHED_INTERACT_MAX	(100)
#define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
#define	SCHED_INTERACT_THRESH	(30)

/*
 * These parameters and macros determine the size of the time slice that is
 * granted to each thread.
 *
 * SLICE_MIN:	Minimum time slice granted, in units of ticks.
 * SLICE_MAX:	Maximum time slice granted.
 * SLICE_RANGE:	Range of available time slices scaled by hz.
 * SLICE_SCALE:	The number of slices granted per val in the range of [0, max].
 * SLICE_NICE:	Determines the amount of slice granted to a scaled nice.
 * SLICE_NTHRESH:	The nice cutoff point for slice assignment.
 */
#define	SCHED_SLICE_MIN			(slice_min)
#define	SCHED_SLICE_MAX			(slice_max)
#define	SCHED_SLICE_INTERACTIVE		(slice_max)
#define	SCHED_SLICE_NTHRESH		(SCHED_PRI_NHALF - 1)
#define	SCHED_SLICE_RANGE		(SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1)
#define	SCHED_SLICE_SCALE(val, max)	(((val) * SCHED_SLICE_RANGE) / (max))
#define	SCHED_SLICE_NICE(nice)						\
    (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_SLICE_NTHRESH))

/*
 * This macro determines whether or not the thread belongs on the current or
 * next run queue.
 */
#define	SCHED_INTERACTIVE(kg)						\
    (sched_interact_score(kg) < SCHED_INTERACT_THRESH)
#define	SCHED_CURR(kg, ke)						\
    ((ke->ke_thread->td_flags & TDF_BORROWING) ||			\
     (ke->ke_flags & KEF_PREEMPTED) || SCHED_INTERACTIVE(kg))

/*
 * Cpu percentage computation macros and defines.
 *
 * SCHED_CPU_TIME:	Number of seconds to average the cpu usage across.
 * SCHED_CPU_TICKS:	Number of hz ticks to average the cpu usage across.
 */

#define	SCHED_CPU_TIME	10
#define	SCHED_CPU_TICKS	(hz * SCHED_CPU_TIME)

/*
 * kseq - per processor runqs and statistics.
 */
struct kseq {
	struct runq	ksq_idle;		/* Queue of IDLE threads. */
	struct runq	ksq_timeshare[2];	/* Run queues for !IDLE. */
	struct runq	*ksq_next;		/* Next timeshare queue. */
	struct runq	*ksq_curr;		/* Current queue. */
	int		ksq_load_timeshare;	/* Load for timeshare. */
	int		ksq_load;		/* Aggregate load. */
	short		ksq_nice[SCHED_PRI_NRESV]; /* KSEs in each nice bin. */
	short		ksq_nicemin;		/* Least nice. */
#ifdef SMP
	int		ksq_transferable;
	LIST_ENTRY(kseq) ksq_siblings;		/* Next in kseq group. */
	struct kseq_group *ksq_group;		/* Our processor group. */
	volatile struct kse *ksq_assigned;	/* assigned by another CPU. */
#else
	int		ksq_sysload;		/* For loadavg, !ITHD load. */
#endif
};

#ifdef SMP
/*
 * kseq groups are groups of processors which can cheaply share threads.  When
 * one processor in the group goes idle it will check the runqs of the other
 * processors in its group prior to halting and waiting for an interrupt.
 * These groups are suitable for SMT (Symmetric Multi-Threading) and not NUMA.
 * In a NUMA environment we'd want an idle bitmap per group and a two tiered
 * load balancer.
 */
struct kseq_group {
	int	ksg_cpus;		/* Count of CPUs in this kseq group. */
	cpumask_t ksg_cpumask;		/* Mask of cpus in this group. */
	cpumask_t ksg_idlemask;		/* Idle cpus in this group. */
	cpumask_t ksg_mask;		/* Bit mask for first cpu. */
	int	ksg_load;		/* Total load of this group. */
	int	ksg_transferable;	/* Transferable load of this group. */
	LIST_HEAD(, kseq) ksg_members;	/* Linked list of all members. */
};
#endif

/*
 * One kse queue per processor.
 */
#ifdef SMP
static cpumask_t kseq_idle;
static int ksg_maxid;
static struct kseq	kseq_cpu[MAXCPU];
static struct kseq_group kseq_groups[MAXCPU];
static int bal_tick;
static int gbal_tick;
static int balance_groups;

#define	KSEQ_SELF()	(&kseq_cpu[PCPU_GET(cpuid)])
#define	KSEQ_CPU(x)	(&kseq_cpu[(x)])
#define	KSEQ_ID(x)	((x) - kseq_cpu)
#define	KSEQ_GROUP(x)	(&kseq_groups[(x)])
#else	/* !SMP */
static struct kseq	kseq_cpu;

#define	KSEQ_SELF()	(&kseq_cpu)
#define	KSEQ_CPU(x)	(&kseq_cpu)
#endif

static void slot_fill(struct ksegrp *);
static struct kse *sched_choose(void);	/* XXX Should be thread * */
static void sched_slice(struct kse *);
static void sched_priority(struct ksegrp *);
static void sched_thread_priority(struct thread *, u_char);
static int sched_interact_score(struct ksegrp *);
static void sched_interact_update(struct ksegrp *);
static void sched_interact_fork(struct ksegrp *);
static void sched_pctcpu_update(struct kse *);

/* Operations on per processor queues */
static struct kse * kseq_choose(struct kseq *);
static void kseq_setup(struct kseq *);
static void kseq_load_add(struct kseq *, struct kse *);
static void kseq_load_rem(struct kseq *, struct kse *);
static __inline void kseq_runq_add(struct kseq *, struct kse *, int);
static __inline void kseq_runq_rem(struct kseq *, struct kse *);
static void kseq_nice_add(struct kseq *, int);
static void kseq_nice_rem(struct kseq *, int);
void kseq_print(int cpu);
#ifdef SMP
static int kseq_transfer(struct kseq *, struct kse *, int);
static struct kse *runq_steal(struct runq *);
static void sched_balance(void);
static void sched_balance_groups(void);
static void sched_balance_group(struct kseq_group *);
static void sched_balance_pair(struct kseq *, struct kseq *);
static void kseq_move(struct kseq *, int);
static int kseq_idled(struct kseq *);
static void kseq_notify(struct kse *, int);
static void kseq_assign(struct kseq *);
static struct kse *kseq_steal(struct kseq *, int);
#define	KSE_CAN_MIGRATE(ke)						\
    ((ke)->ke_thread->td_pinned == 0 && ((ke)->ke_flags & KEF_BOUND) == 0)
#endif

void
kseq_print(int cpu)
{
	struct kseq *kseq;
	int i;

	kseq = KSEQ_CPU(cpu);

	printf("kseq:\n");
	printf("\tload: %d\n", kseq->ksq_load);
	printf("\tload TIMESHARE: %d\n", kseq->ksq_load_timeshare);
#ifdef SMP
	printf("\tload transferable: %d\n", kseq->ksq_transferable);
#endif
	printf("\tnicemin:\t%d\n", kseq->ksq_nicemin);
	printf("\tnice counts:\n");
	for (i = 0; i < SCHED_PRI_NRESV; i++)
		if (kseq->ksq_nice[i])
			printf("\t\t%d = %d\n",
			    i - SCHED_PRI_NHALF, kseq->ksq_nice[i]);
}

static __inline void
kseq_runq_add(struct kseq *kseq, struct kse *ke, int flags)
{
#ifdef SMP
	if (KSE_CAN_MIGRATE(ke)) {
		kseq->ksq_transferable++;
		kseq->ksq_group->ksg_transferable++;
		ke->ke_flags |= KEF_XFERABLE;
	}
#endif
	if (ke->ke_flags & KEF_PREEMPTED)
		flags |= SRQ_PREEMPTED;
	runq_add(ke->ke_runq, ke, flags);
}

static __inline void
kseq_runq_rem(struct kseq *kseq, struct kse *ke)
{
#ifdef SMP
	if (ke->ke_flags & KEF_XFERABLE) {
		kseq->ksq_transferable--;
		kseq->ksq_group->ksg_transferable--;
		ke->ke_flags &= ~KEF_XFERABLE;
	}
#endif
	runq_remove(ke->ke_runq, ke);
}

static void
kseq_load_add(struct kseq *kseq, struct kse *ke)
{
	int class;
	mtx_assert(&sched_lock, MA_OWNED);
	class = PRI_BASE(ke->ke_ksegrp->kg_pri_class);
	if (class == PRI_TIMESHARE)
		kseq->ksq_load_timeshare++;
	kseq->ksq_load++;
	CTR1(KTR_SCHED, "load: %d", kseq->ksq_load);
	if (class != PRI_ITHD && (ke->ke_proc->p_flag & P_NOLOAD) == 0)
#ifdef SMP
		kseq->ksq_group->ksg_load++;
#else
		kseq->ksq_sysload++;
#endif
	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
		kseq_nice_add(kseq, ke->ke_proc->p_nice);
}

static void
kseq_load_rem(struct kseq *kseq, struct kse *ke)
{
	int class;
	mtx_assert(&sched_lock, MA_OWNED);
	class = PRI_BASE(ke->ke_ksegrp->kg_pri_class);
	if (class == PRI_TIMESHARE)
		kseq->ksq_load_timeshare--;
	if (class != PRI_ITHD && (ke->ke_proc->p_flag & P_NOLOAD) == 0)
#ifdef SMP
		kseq->ksq_group->ksg_load--;
#else
		kseq->ksq_sysload--;
#endif
	kseq->ksq_load--;
	CTR1(KTR_SCHED, "load: %d", kseq->ksq_load);
	ke->ke_runq = NULL;
	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
		kseq_nice_rem(kseq, ke->ke_proc->p_nice);
}

static void
kseq_nice_add(struct kseq *kseq, int nice)
{
	mtx_assert(&sched_lock, MA_OWNED);
	/* Normalize to zero. */
	kseq->ksq_nice[nice + SCHED_PRI_NHALF]++;
	if (nice < kseq->ksq_nicemin || kseq->ksq_load_timeshare == 1)
		kseq->ksq_nicemin = nice;
}

static void
kseq_nice_rem(struct kseq *kseq, int nice)
{
	int n;

	mtx_assert(&sched_lock, MA_OWNED);
	/* Normalize to zero. */
	n = nice + SCHED_PRI_NHALF;
	kseq->ksq_nice[n]--;
	KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count."));

	/*
	 * If this wasn't the smallest nice value or there are more in
	 * this bucket we can just return.  Otherwise we have to recalculate
	 * the smallest nice.
	 */
	if (nice != kseq->ksq_nicemin ||
	    kseq->ksq_nice[n] != 0 ||
	    kseq->ksq_load_timeshare == 0)
		return;

	for (; n < SCHED_PRI_NRESV; n++)
		if (kseq->ksq_nice[n]) {
			kseq->ksq_nicemin = n - SCHED_PRI_NHALF;
			return;
		}
}

#ifdef SMP
/*
 * sched_balance is a simple CPU load balancing algorithm.  It operates by
 * finding the least loaded and most loaded cpu and equalizing their load
 * by migrating some processes.
 *
 * Dealing only with two CPUs at a time has two advantages.  Firstly, most
 * installations will only have 2 cpus.  Secondly, load balancing too much at
 * once can have an unpleasant effect on the system.  The scheduler rarely has
 * enough information to make perfect decisions.  So this algorithm chooses
 * simplicity and more gradual effects on load in larger systems.
 *
 * It could be improved by considering the priorities and slices assigned to
 * each task prior to balancing them.  There are many pathological cases with
 * any approach and so the semi random algorithm below may work as well as any.
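 *
 * For instance (with hypothetical loads), given three groups carrying loads
 * of 4, 1 and 2, the group with load 4 is picked as "high" (provided it has
 * transferable threads), the group with load 1 as "low", and
 * sched_balance_pair() then moves threads from the first member of the high
 * group to the first member of the low group.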
 *
 */
static void
sched_balance(void)
{
	struct kseq_group *high;
	struct kseq_group *low;
	struct kseq_group *ksg;
	int cnt;
	int i;

	bal_tick = ticks + (random() % (hz * 2));
	if (smp_started == 0)
		return;
	low = high = NULL;
	i = random() % (ksg_maxid + 1);
	for (cnt = 0; cnt <= ksg_maxid; cnt++) {
		ksg = KSEQ_GROUP(i);
		/*
		 * Find the CPU with the highest load that has some
		 * threads to transfer.
		 */
		if ((high == NULL || ksg->ksg_load > high->ksg_load)
		    && ksg->ksg_transferable)
			high = ksg;
		if (low == NULL || ksg->ksg_load < low->ksg_load)
			low = ksg;
		if (++i > ksg_maxid)
			i = 0;
	}
	if (low != NULL && high != NULL && high != low)
		sched_balance_pair(LIST_FIRST(&high->ksg_members),
		    LIST_FIRST(&low->ksg_members));
}

static void
sched_balance_groups(void)
{
	int i;

	gbal_tick = ticks + (random() % (hz * 2));
	mtx_assert(&sched_lock, MA_OWNED);
	if (smp_started)
		for (i = 0; i <= ksg_maxid; i++)
			sched_balance_group(KSEQ_GROUP(i));
}

static void
sched_balance_group(struct kseq_group *ksg)
{
	struct kseq *kseq;
	struct kseq *high;
	struct kseq *low;
	int load;

	if (ksg->ksg_transferable == 0)
		return;
	low = NULL;
	high = NULL;
	LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) {
		load = kseq->ksq_load;
		if (high == NULL || load > high->ksq_load)
			high = kseq;
		if (low == NULL || load < low->ksq_load)
			low = kseq;
	}
	if (high != NULL && low != NULL && high != low)
		sched_balance_pair(high, low);
}

static void
sched_balance_pair(struct kseq *high, struct kseq *low)
{
	int transferable;
	int high_load;
	int low_load;
	int move;
	int diff;
	int i;

	/*
	 * If we're transferring within a group we have to use this specific
	 * kseq's transferable count, otherwise we can steal from other members
	 * of the group.
	 */
	if (high->ksq_group == low->ksq_group) {
		transferable = high->ksq_transferable;
		high_load = high->ksq_load;
		low_load = low->ksq_load;
	} else {
		transferable = high->ksq_group->ksg_transferable;
		high_load = high->ksq_group->ksg_load;
		low_load = low->ksq_group->ksg_load;
	}
	if (transferable == 0)
		return;
	/*
	 * Determine what the imbalance is and then adjust that to how many
	 * kses we actually have to give up (transferable).
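	 * For example, with a (hypothetical) high_load of 5 and low_load of
	 * 2, diff is 3 and move rounds up to 2, clipped to the transferable
	 * count.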
	 */
	diff = high_load - low_load;
	move = diff / 2;
	if (diff & 0x1)
		move++;
	move = min(move, transferable);
	for (i = 0; i < move; i++)
		kseq_move(high, KSEQ_ID(low));
	return;
}

static void
kseq_move(struct kseq *from, int cpu)
{
	struct kseq *kseq;
	struct kseq *to;
	struct kse *ke;

	kseq = from;
	to = KSEQ_CPU(cpu);
	ke = kseq_steal(kseq, 1);
	if (ke == NULL) {
		struct kseq_group *ksg;

		ksg = kseq->ksq_group;
		LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) {
			if (kseq == from || kseq->ksq_transferable == 0)
				continue;
			ke = kseq_steal(kseq, 1);
			break;
		}
		if (ke == NULL)
			panic("kseq_move: No KSEs available with a "
			    "transferable count of %d\n",
			    ksg->ksg_transferable);
	}
	if (kseq == to)
		return;
	ke->ke_state = KES_THREAD;
	kseq_runq_rem(kseq, ke);
	kseq_load_rem(kseq, ke);
	kseq_notify(ke, cpu);
}

static int
kseq_idled(struct kseq *kseq)
{
	struct kseq_group *ksg;
	struct kseq *steal;
	struct kse *ke;

	ksg = kseq->ksq_group;
	/*
	 * If we're in a cpu group, try and steal kses from another cpu in
	 * the group before idling.
	 */
	if (ksg->ksg_cpus > 1 && ksg->ksg_transferable) {
		LIST_FOREACH(steal, &ksg->ksg_members, ksq_siblings) {
			if (steal == kseq || steal->ksq_transferable == 0)
				continue;
			ke = kseq_steal(steal, 0);
			if (ke == NULL)
				continue;
			ke->ke_state = KES_THREAD;
			kseq_runq_rem(steal, ke);
			kseq_load_rem(steal, ke);
			ke->ke_cpu = PCPU_GET(cpuid);
			ke->ke_flags |= KEF_INTERNAL | KEF_HOLD;
			sched_add(ke->ke_thread, SRQ_YIELDING);
			return (0);
		}
	}
	/*
	 * We only set the idled bit when all of the cpus in the group are
	 * idle.  Otherwise we could get into a situation where a KSE bounces
	 * back and forth between two idle cores on separate physical CPUs.
	 */
	ksg->ksg_idlemask |= PCPU_GET(cpumask);
	if (ksg->ksg_idlemask != ksg->ksg_cpumask)
		return (1);
	atomic_set_int(&kseq_idle, ksg->ksg_mask);
	return (1);
}

static void
kseq_assign(struct kseq *kseq)
{
	struct kse *nke;
	struct kse *ke;

	do {
		*(volatile struct kse **)&ke = kseq->ksq_assigned;
	} while (!atomic_cmpset_ptr((volatile uintptr_t *)&kseq->ksq_assigned,
	    (uintptr_t)ke, (uintptr_t)NULL));
	for (; ke != NULL; ke = nke) {
		nke = ke->ke_assign;
		kseq->ksq_group->ksg_load--;
		kseq->ksq_load--;
		ke->ke_flags &= ~KEF_ASSIGNED;
		if (ke->ke_flags & KEF_REMOVED) {
			ke->ke_flags &= ~KEF_REMOVED;
			continue;
		}
		ke->ke_flags |= KEF_INTERNAL | KEF_HOLD;
		sched_add(ke->ke_thread, SRQ_YIELDING);
	}
}

static void
kseq_notify(struct kse *ke, int cpu)
{
	struct kseq *kseq;
	struct thread *td;
	struct pcpu *pcpu;
	int class;
	int prio;

	kseq = KSEQ_CPU(cpu);
	/* XXX */
	class = PRI_BASE(ke->ke_ksegrp->kg_pri_class);
	if ((class == PRI_TIMESHARE || class == PRI_REALTIME) &&
	    (kseq_idle & kseq->ksq_group->ksg_mask))
		atomic_clear_int(&kseq_idle, kseq->ksq_group->ksg_mask);
	kseq->ksq_group->ksg_load++;
	kseq->ksq_load++;
	ke->ke_cpu = cpu;
	ke->ke_flags |= KEF_ASSIGNED;
	prio = ke->ke_thread->td_priority;

	/*
	 * Place a KSE on another cpu's queue and force a resched.
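	 * The kse is pushed onto the remote kseq's ksq_assigned list with a
	 * lock-free compare-and-swap; the remote cpu later drains that list
	 * in kseq_assign().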
	 */
	do {
		*(volatile struct kse **)&ke->ke_assign = kseq->ksq_assigned;
	} while (!atomic_cmpset_ptr((volatile uintptr_t *)&kseq->ksq_assigned,
	    (uintptr_t)ke->ke_assign, (uintptr_t)ke));
	/*
	 * Without sched_lock we could lose a race where we set NEEDRESCHED
	 * on a thread that is switched out before the IPI is delivered.  This
	 * would lead us to miss the resched.  This will be a problem once
	 * sched_lock is pushed down.
	 */
	pcpu = pcpu_find(cpu);
	td = pcpu->pc_curthread;
	if (ke->ke_thread->td_priority < td->td_priority ||
	    td == pcpu->pc_idlethread) {
		td->td_flags |= TDF_NEEDRESCHED;
		ipi_selected(1 << cpu, IPI_AST);
	}
}

static struct kse *
runq_steal(struct runq *rq)
{
	struct rqhead *rqh;
	struct rqbits *rqb;
	struct kse *ke;
	int word;
	int bit;

	mtx_assert(&sched_lock, MA_OWNED);
	rqb = &rq->rq_status;
	for (word = 0; word < RQB_LEN; word++) {
		if (rqb->rqb_bits[word] == 0)
			continue;
		for (bit = 0; bit < RQB_BPW; bit++) {
			if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
				continue;
			rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
			TAILQ_FOREACH(ke, rqh, ke_procq) {
				if (KSE_CAN_MIGRATE(ke))
					return (ke);
			}
		}
	}
	return (NULL);
}

static struct kse *
kseq_steal(struct kseq *kseq, int stealidle)
{
	struct kse *ke;

	/*
	 * Steal from next first to try to get a non-interactive task that
	 * may not have run for a while.
	 */
	if ((ke = runq_steal(kseq->ksq_next)) != NULL)
		return (ke);
	if ((ke = runq_steal(kseq->ksq_curr)) != NULL)
		return (ke);
	if (stealidle)
		return (runq_steal(&kseq->ksq_idle));
	return (NULL);
}

int
kseq_transfer(struct kseq *kseq, struct kse *ke, int class)
{
	struct kseq_group *nksg;
	struct kseq_group *ksg;
	struct kseq *old;
	int cpu;
	int idx;

	if (smp_started == 0)
		return (0);
	cpu = 0;
	/*
	 * If our load exceeds a certain threshold we should attempt to
	 * reassign this thread.  The first candidate is the cpu that
	 * originally ran the thread.  If it is idle, assign it there,
	 * otherwise, pick an idle cpu.
	 *
	 * The threshold at which we start to reassign kses has a large impact
	 * on the overall performance of the system.  Tuned too high and
	 * some CPUs may idle.  Too low and there will be excess migration
	 * and context switches.
	 */
	old = KSEQ_CPU(ke->ke_cpu);
	nksg = old->ksq_group;
	ksg = kseq->ksq_group;
	if (kseq_idle) {
		if (kseq_idle & nksg->ksg_mask) {
			cpu = ffs(nksg->ksg_idlemask);
			if (cpu) {
				CTR2(KTR_SCHED,
				    "kseq_transfer: %p found old cpu %X "
				    "in idlemask.", ke, cpu);
				goto migrate;
			}
		}
		/*
		 * Multiple cpus could find this bit simultaneously
		 * but the race shouldn't be terrible.
		 */
		cpu = ffs(kseq_idle);
		if (cpu) {
			CTR2(KTR_SCHED, "kseq_transfer: %p found %X "
			    "in idlemask.", ke, cpu);
			goto migrate;
		}
	}
	idx = 0;
#if 0
	if (old->ksq_load < kseq->ksq_load) {
		cpu = ke->ke_cpu + 1;
		CTR2(KTR_SCHED, "kseq_transfer: %p old cpu %X "
		    "load less than ours.", ke, cpu);
		goto migrate;
	}
	/*
	 * No new CPU was found, look for one with less load.
	 */
	for (idx = 0; idx <= ksg_maxid; idx++) {
		nksg = KSEQ_GROUP(idx);
		if (nksg->ksg_load /*+ (nksg->ksg_cpus * 2)*/ < ksg->ksg_load) {
			cpu = ffs(nksg->ksg_cpumask);
			CTR2(KTR_SCHED, "kseq_transfer: %p cpu %X load less "
			    "than ours.", ke, cpu);
			goto migrate;
		}
	}
#endif
	/*
	 * If another cpu in this group has idled, assign a thread over
	 * to them after checking to see if there are idled groups.
	 */
	if (ksg->ksg_idlemask) {
		cpu = ffs(ksg->ksg_idlemask);
		if (cpu) {
			CTR2(KTR_SCHED, "kseq_transfer: %p cpu %X idle in "
			    "group.", ke, cpu);
			goto migrate;
		}
	}
	return (0);
migrate:
	/*
	 * Now that we've found an idle CPU, migrate the thread.
	 */
	cpu--;
	ke->ke_runq = NULL;
	kseq_notify(ke, cpu);

	return (1);
}

#endif	/* SMP */

/*
 * Pick the highest priority task we have and return it.
 */

static struct kse *
kseq_choose(struct kseq *kseq)
{
	struct runq *swap;
	struct kse *ke;
	int nice;

	mtx_assert(&sched_lock, MA_OWNED);
	swap = NULL;

	for (;;) {
		ke = runq_choose(kseq->ksq_curr);
		if (ke == NULL) {
			/*
			 * We already swapped once and didn't get anywhere.
			 */
			if (swap)
				break;
			swap = kseq->ksq_curr;
			kseq->ksq_curr = kseq->ksq_next;
			kseq->ksq_next = swap;
			continue;
		}
		/*
		 * If we encounter a slice of 0 the kse is in a
		 * TIMESHARE kse group and its nice was too far out
		 * of the range that receives slices.
		 */
		nice = ke->ke_proc->p_nice + (0 - kseq->ksq_nicemin);
#if 0
		if (ke->ke_slice == 0 || (nice > SCHED_SLICE_NTHRESH &&
		    ke->ke_proc->p_nice != 0)) {
			runq_remove(ke->ke_runq, ke);
			sched_slice(ke);
			ke->ke_runq = kseq->ksq_next;
			runq_add(ke->ke_runq, ke, 0);
			continue;
		}
#endif
		return (ke);
	}

	return (runq_choose(&kseq->ksq_idle));
}

static void
kseq_setup(struct kseq *kseq)
{
	runq_init(&kseq->ksq_timeshare[0]);
	runq_init(&kseq->ksq_timeshare[1]);
	runq_init(&kseq->ksq_idle);
	kseq->ksq_curr = &kseq->ksq_timeshare[0];
	kseq->ksq_next = &kseq->ksq_timeshare[1];
	kseq->ksq_load = 0;
	kseq->ksq_load_timeshare = 0;
}

static void
sched_setup(void *dummy)
{
#ifdef SMP
	int i;
#endif

	/*
	 * To avoid divide-by-zero, we set realstathz to a dummy value here
	 * in case sched_clock() is called before sched_initticks().
	 */
	realstathz = hz;
	slice_min = (hz/100);	/* 10ms */
	slice_max = (hz/7);	/* ~140ms */

#ifdef SMP
	balance_groups = 0;
	/*
	 * Initialize the kseqs.
	 */
	for (i = 0; i < MAXCPU; i++) {
		struct kseq *ksq;

		ksq = &kseq_cpu[i];
		ksq->ksq_assigned = NULL;
		kseq_setup(&kseq_cpu[i]);
	}
	if (smp_topology == NULL) {
		struct kseq_group *ksg;
		struct kseq *ksq;
		int cpus;

		for (cpus = 0, i = 0; i < MAXCPU; i++) {
			if (CPU_ABSENT(i))
				continue;
			ksq = &kseq_cpu[i];
			ksg = &kseq_groups[cpus];
			/*
			 * Setup a kseq group with one member.
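			 * (With no smp_topology each cpu is its own group, so
			 * intra-group stealing never applies and idling falls
			 * through to the global kseq_idle mask.)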
			 */
			ksq->ksq_transferable = 0;
			ksq->ksq_group = ksg;
			ksg->ksg_cpus = 1;
			ksg->ksg_idlemask = 0;
			ksg->ksg_cpumask = ksg->ksg_mask = 1 << i;
			ksg->ksg_load = 0;
			ksg->ksg_transferable = 0;
			LIST_INIT(&ksg->ksg_members);
			LIST_INSERT_HEAD(&ksg->ksg_members, ksq, ksq_siblings);
			cpus++;
		}
		ksg_maxid = cpus - 1;
	} else {
		struct kseq_group *ksg;
		struct cpu_group *cg;
		int j;

		for (i = 0; i < smp_topology->ct_count; i++) {
			cg = &smp_topology->ct_group[i];
			ksg = &kseq_groups[i];
			/*
			 * Initialize the group.
			 */
			ksg->ksg_idlemask = 0;
			ksg->ksg_load = 0;
			ksg->ksg_transferable = 0;
			ksg->ksg_cpus = cg->cg_count;
			ksg->ksg_cpumask = cg->cg_mask;
			LIST_INIT(&ksg->ksg_members);
			/*
			 * Find all of the group members and add them.
			 */
			for (j = 0; j < MAXCPU; j++) {
				if ((cg->cg_mask & (1 << j)) != 0) {
					if (ksg->ksg_mask == 0)
						ksg->ksg_mask = 1 << j;
					kseq_cpu[j].ksq_transferable = 0;
					kseq_cpu[j].ksq_group = ksg;
					LIST_INSERT_HEAD(&ksg->ksg_members,
					    &kseq_cpu[j], ksq_siblings);
				}
			}
			if (ksg->ksg_cpus > 1)
				balance_groups = 1;
		}
		ksg_maxid = smp_topology->ct_count - 1;
	}
	/*
	 * Stagger the group and global load balancer so they do not
	 * interfere with each other.
	 */
	bal_tick = ticks + hz;
	if (balance_groups)
		gbal_tick = ticks + (hz / 2);
#else
	kseq_setup(KSEQ_SELF());
#endif
	mtx_lock_spin(&sched_lock);
	kseq_load_add(KSEQ_SELF(), &kse0);
	mtx_unlock_spin(&sched_lock);
}

/* ARGSUSED */
static void
sched_initticks(void *dummy)
{
	mtx_lock_spin(&sched_lock);
	realstathz = stathz ? stathz : hz;
	slice_min = (realstathz/100);	/* 10ms */
	slice_max = (realstathz/7);	/* ~140ms */

	tickincr = (hz << 10) / realstathz;
	/*
	 * XXX This does not work for values of stathz that are much
	 * larger than hz.
	 */
	if (tickincr == 0)
		tickincr = 1;
	mtx_unlock_spin(&sched_lock);
}


/*
 * Scale the scheduling priority according to the "interactivity" of this
 * process.
 */
static void
sched_priority(struct ksegrp *kg)
{
	int pri;

	if (kg->kg_pri_class != PRI_TIMESHARE)
		return;

	pri = SCHED_PRI_INTERACT(sched_interact_score(kg));
	pri += SCHED_PRI_BASE;
	pri += kg->kg_proc->p_nice;

	if (pri > PRI_MAX_TIMESHARE)
		pri = PRI_MAX_TIMESHARE;
	else if (pri < PRI_MIN_TIMESHARE)
		pri = PRI_MIN_TIMESHARE;

	sched_user_prio(kg, pri);

	return;
}

/*
 * Calculate a time slice based on the properties of the kseg and the runq
 * that we're on.  This is only for PRI_TIMESHARE ksegrps.
 */
static void
sched_slice(struct kse *ke)
{
	struct kseq *kseq;
	struct ksegrp *kg;

	kg = ke->ke_ksegrp;
	kseq = KSEQ_CPU(ke->ke_cpu);

	if (ke->ke_thread->td_flags & TDF_BORROWING) {
		ke->ke_slice = SCHED_SLICE_MIN;
		return;
	}

	/*
	 * Rationale:
	 * KSEs in interactive ksegs get a minimal slice so that we
	 * quickly notice if it abuses its advantage.
	 *
	 * KSEs in non-interactive ksegs are assigned a slice that is
	 * based on the kseg's nice value relative to the least nice kseg
	 * on the run queue for this cpu.
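	 * (A sketch with assumed values: if slice_min/slice_max are 1/18
	 * ticks, a kseg whose nice value sits 5 above ksq_nicemin gets
	 * 18 - (5 * 18) / 19 = 14 ticks from SCHED_SLICE_NICE().)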
	 *
	 * If the KSE is less nice than all others it gets the maximum
	 * slice and other KSEs will adjust their slice relative to
	 * this when they first expire.
	 *
	 * There is a 20 point window that starts relative to the least
	 * nice kse on the run queue.  Slice size is determined by
	 * the kse's distance from the least nice ksegrp.
	 *
	 * If the kse is outside of the window it will get no slice
	 * and will be reevaluated each time it is selected on the
	 * run queue.  The exception to this is nice 0 ksegs when
	 * a nice -20 is running.  They are always granted a minimum
	 * slice.
	 */
	if (!SCHED_INTERACTIVE(kg)) {
		int nice;

		nice = kg->kg_proc->p_nice + (0 - kseq->ksq_nicemin);
		if (kseq->ksq_load_timeshare == 0 ||
		    kg->kg_proc->p_nice < kseq->ksq_nicemin)
			ke->ke_slice = SCHED_SLICE_MAX;
		else if (nice <= SCHED_SLICE_NTHRESH)
			ke->ke_slice = SCHED_SLICE_NICE(nice);
		else if (kg->kg_proc->p_nice == 0)
			ke->ke_slice = SCHED_SLICE_MIN;
		else
			ke->ke_slice = SCHED_SLICE_MIN; /* 0 */
	} else
		ke->ke_slice = SCHED_SLICE_INTERACTIVE;

	return;
}

/*
 * This routine enforces a maximum limit on the amount of scheduling history
 * kept.  It is called after either the slptime or runtime is adjusted.
 * This routine will not operate correctly when slp or run times have been
 * adjusted to more than double their maximum.
 */
static void
sched_interact_update(struct ksegrp *kg)
{
	int sum;

	sum = kg->kg_runtime + kg->kg_slptime;
	if (sum < SCHED_SLP_RUN_MAX)
		return;
	/*
	 * If we have exceeded by more than 1/5th then the algorithm below
	 * will not bring us back into range.  Dividing by two here forces
	 * us into the range of [4/5 * SCHED_SLP_RUN_MAX, SCHED_SLP_RUN_MAX]
	 */
	if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
		kg->kg_runtime /= 2;
		kg->kg_slptime /= 2;
		return;
	}
	kg->kg_runtime = (kg->kg_runtime / 5) * 4;
	kg->kg_slptime = (kg->kg_slptime / 5) * 4;
}

static void
sched_interact_fork(struct ksegrp *kg)
{
	int ratio;
	int sum;

	sum = kg->kg_runtime + kg->kg_slptime;
	if (sum > SCHED_SLP_RUN_FORK) {
		ratio = sum / SCHED_SLP_RUN_FORK;
		kg->kg_runtime /= ratio;
		kg->kg_slptime /= ratio;
	}
}

static int
sched_interact_score(struct ksegrp *kg)
{
	int div;

	if (kg->kg_runtime > kg->kg_slptime) {
		div = max(1, kg->kg_runtime / SCHED_INTERACT_HALF);
		return (SCHED_INTERACT_HALF +
		    (SCHED_INTERACT_HALF - (kg->kg_slptime / div)));
	} if (kg->kg_slptime > kg->kg_runtime) {
		div = max(1, kg->kg_slptime / SCHED_INTERACT_HALF);
		return (kg->kg_runtime / div);
	}

	/*
	 * This can happen if slptime and runtime are 0.
	 */
	return (0);

}

/*
 * Very early in the boot some setup of scheduler-specific
 * parts of proc0 and of some scheduler resources needs to be done.
 * Called from:
 *  proc0_init()
 */
void
schedinit(void)
{
	/*
	 * Set up the scheduler specific parts of proc0.
	 */
	proc0.p_sched = NULL;	/* XXX */
	ksegrp0.kg_sched = &kg_sched0;
	thread0.td_sched = &kse0;
	kse0.ke_thread = &thread0;
	kse0.ke_state = KES_THREAD;
	kg_sched0.skg_concurrency = 1;
	kg_sched0.skg_avail_opennings = 0; /* we are already running */
}

/*
 * This is only somewhat accurate since given many processes of the same
 * priority they will switch when their slices run out, which will be
 * at most SCHED_SLICE_MAX.
 */
int
sched_rr_interval(void)
{
	return (SCHED_SLICE_MAX);
}

static void
sched_pctcpu_update(struct kse *ke)
{
	/*
	 * Adjust counters and watermark for pctcpu calc.
	 */
	if (ke->ke_ltick > ticks - SCHED_CPU_TICKS) {
		/*
		 * Shift the tick count out so that the divide doesn't
		 * round away our results.
		 */
		ke->ke_ticks <<= 10;
		ke->ke_ticks = (ke->ke_ticks / (ticks - ke->ke_ftick)) *
		    SCHED_CPU_TICKS;
		ke->ke_ticks >>= 10;
	} else
		ke->ke_ticks = 0;
	ke->ke_ltick = ticks;
	ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS;
}

void
sched_thread_priority(struct thread *td, u_char prio)
{
	struct kse *ke;

	CTR6(KTR_SCHED, "sched_prio: %p(%s) prio %d newprio %d by %p(%s)",
	    td, td->td_proc->p_comm, td->td_priority, prio, curthread,
	    curthread->td_proc->p_comm);
	ke = td->td_kse;
	mtx_assert(&sched_lock, MA_OWNED);
	if (td->td_priority == prio)
		return;
	if (TD_ON_RUNQ(td)) {
		/*
		 * If the priority has been elevated due to priority
		 * propagation, we may have to move ourselves to a new
		 * queue.  We still call adjustrunqueue below in case kse
		 * needs to fix things up.
		 */
		if (prio < td->td_priority && ke->ke_runq != NULL &&
		    (ke->ke_flags & KEF_ASSIGNED) == 0 &&
		    ke->ke_runq != KSEQ_CPU(ke->ke_cpu)->ksq_curr) {
			runq_remove(ke->ke_runq, ke);
			ke->ke_runq = KSEQ_CPU(ke->ke_cpu)->ksq_curr;
			runq_add(ke->ke_runq, ke, 0);
		}
		/*
		 * Hold this kse on this cpu so that sched_prio() doesn't
		 * cause excessive migration.  We only want migration to
		 * happen as the result of a wakeup.
		 */
		ke->ke_flags |= KEF_HOLD;
		adjustrunqueue(td, prio);
		ke->ke_flags &= ~KEF_HOLD;
	} else
		td->td_priority = prio;
}

/*
 * Update a thread's priority when it is lent another thread's
 * priority.
 */
void
sched_lend_prio(struct thread *td, u_char prio)
{

	td->td_flags |= TDF_BORROWING;
	sched_thread_priority(td, prio);
}

/*
 * Restore a thread's priority when priority propagation is
 * over.  The prio argument is the minimum priority the thread
 * needs to have to satisfy other possible priority lending
 * requests.  If the thread's regular priority is less
 * important than prio, the thread will keep a priority boost
 * of prio.
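 *
 * For example (hypothetical values): a timeshare thread whose base user
 * priority is 130 but which was lent priority 80 stays boosted at a prio
 * argument of 100 (100 < 130), while a prio argument of 140 restores the
 * base priority and clears TDF_BORROWING.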
 */
void
sched_unlend_prio(struct thread *td, u_char prio)
{
	u_char base_pri;

	if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
	    td->td_base_pri <= PRI_MAX_TIMESHARE)
		base_pri = td->td_ksegrp->kg_user_pri;
	else
		base_pri = td->td_base_pri;
	if (prio >= base_pri) {
		td->td_flags &= ~TDF_BORROWING;
		sched_thread_priority(td, base_pri);
	} else
		sched_lend_prio(td, prio);
}

void
sched_prio(struct thread *td, u_char prio)
{
	u_char oldprio;

	/* First, update the base priority. */
	td->td_base_pri = prio;

	/*
	 * If the thread is borrowing another thread's priority, don't
	 * ever lower the priority.
	 */
	if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
		return;

	/* Change the real priority. */
	oldprio = td->td_priority;
	sched_thread_priority(td, prio);

	/*
	 * If the thread is on a turnstile, then let the turnstile update
	 * its state.
	 */
	if (TD_ON_LOCK(td) && oldprio != prio)
		turnstile_adjust(td, oldprio);
}

void
sched_user_prio(struct ksegrp *kg, u_char prio)
{
	struct thread *td;
	u_char oldprio;

	kg->kg_base_user_pri = prio;

	/* XXXKSE only for 1:1 */

	td = TAILQ_FIRST(&kg->kg_threads);
	if (td == NULL) {
		kg->kg_user_pri = prio;
		return;
	}

	if (td->td_flags & TDF_UBORROWING && kg->kg_user_pri <= prio)
		return;

	oldprio = kg->kg_user_pri;
	kg->kg_user_pri = prio;

	if (TD_ON_UPILOCK(td) && oldprio != prio)
		umtx_pi_adjust(td, oldprio);
}

void
sched_lend_user_prio(struct thread *td, u_char prio)
{
	u_char oldprio;

	td->td_flags |= TDF_UBORROWING;

	oldprio = td->td_ksegrp->kg_user_pri;
	td->td_ksegrp->kg_user_pri = prio;

	if (TD_ON_UPILOCK(td) && oldprio != prio)
		umtx_pi_adjust(td, oldprio);
}

void
sched_unlend_user_prio(struct thread *td, u_char prio)
{
	struct ksegrp *kg = td->td_ksegrp;
	u_char base_pri;

	base_pri = kg->kg_base_user_pri;
	if (prio >= base_pri) {
		td->td_flags &= ~TDF_UBORROWING;
		sched_user_prio(kg, base_pri);
	} else
		sched_lend_user_prio(td, prio);
}

void
sched_switch(struct thread *td, struct thread *newtd, int flags)
{
	struct kseq *ksq;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);

	ke = td->td_kse;
	ksq = KSEQ_SELF();

	td->td_lastcpu = td->td_oncpu;
	td->td_oncpu = NOCPU;
	td->td_flags &= ~TDF_NEEDRESCHED;
	td->td_owepreempt = 0;

	/*
	 * If the KSE has been assigned it may be in the process of switching
	 * to the new cpu.  This is the case in sched_bind().
	 */
	if (td == PCPU_GET(idlethread)) {
		TD_SET_CAN_RUN(td);
	} else if ((ke->ke_flags & KEF_ASSIGNED) == 0) {
		/* We are ending our run so make our slot available again */
		SLOT_RELEASE(td->td_ksegrp);
		kseq_load_rem(ksq, ke);
		if (TD_IS_RUNNING(td)) {
			/*
			 * Don't allow the thread to migrate
			 * from a preemption.
			 */
			ke->ke_flags |= KEF_HOLD;
			setrunqueue(td, (flags & SW_PREEMPT) ?
			    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
			    SRQ_OURSELF|SRQ_YIELDING);
			ke->ke_flags &= ~KEF_HOLD;
		} else if ((td->td_proc->p_flag & P_HADTHREADS) &&
		    (newtd == NULL || newtd->td_ksegrp != td->td_ksegrp))
			/*
			 * We will not be on the run queue.
			 * So we must be sleeping or similar.
			 * Don't use the slot if we will need it
			 * for newtd.
			 */
			slot_fill(td->td_ksegrp);
	}
	if (newtd != NULL) {
		/*
		 * If we bring in a thread, account for it as if it had been
		 * added to the run queue and then chosen.
		 */
		newtd->td_kse->ke_flags |= KEF_DIDRUN;
		newtd->td_kse->ke_runq = ksq->ksq_curr;
		TD_SET_RUNNING(newtd);
		kseq_load_add(KSEQ_SELF(), newtd->td_kse);
		/*
		 * XXX When we preempt, we've already consumed a slot because
		 * we got here through sched_add().  However, newtd can come
		 * from thread_switchout() which can't SLOT_USE() because
		 * the SLOT code is scheduler dependent.  We must use the
		 * slot here otherwise.
		 */
		if ((flags & SW_PREEMPT) == 0)
			SLOT_USE(newtd->td_ksegrp);
	} else
		newtd = choosethread();
	if (td != newtd) {
#ifdef	HWPMC_HOOKS
		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
#endif
		cpu_switch(td, newtd);
#ifdef	HWPMC_HOOKS
		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
#endif
	}

	sched_lock.mtx_lock = (uintptr_t)td;

	td->td_oncpu = PCPU_GET(cpuid);
}

void
sched_nice(struct proc *p, int nice)
{
	struct ksegrp *kg;
	struct kse *ke;
	struct thread *td;
	struct kseq *kseq;

	PROC_LOCK_ASSERT(p, MA_OWNED);
	mtx_assert(&sched_lock, MA_OWNED);
	/*
	 * We need to adjust the nice counts for running KSEs.
	 */
	FOREACH_KSEGRP_IN_PROC(p, kg) {
		if (kg->kg_pri_class == PRI_TIMESHARE) {
			FOREACH_THREAD_IN_GROUP(kg, td) {
				ke = td->td_kse;
				if (ke->ke_runq == NULL)
					continue;
				kseq = KSEQ_CPU(ke->ke_cpu);
				kseq_nice_rem(kseq, p->p_nice);
				kseq_nice_add(kseq, nice);
			}
		}
	}
	p->p_nice = nice;
	FOREACH_KSEGRP_IN_PROC(p, kg) {
		sched_priority(kg);
		FOREACH_THREAD_IN_GROUP(kg, td)
			td->td_flags |= TDF_NEEDRESCHED;
	}
}

void
sched_sleep(struct thread *td)
{
	mtx_assert(&sched_lock, MA_OWNED);

	td->td_slptime = ticks;
}

void
sched_wakeup(struct thread *td)
{
	mtx_assert(&sched_lock, MA_OWNED);

	/*
	 * Let the kseg know how long we slept for.  This is because process
	 * interactivity behavior is modeled in the kseg.
	 */
	if (td->td_slptime) {
		struct ksegrp *kg;
		int hzticks;

		kg = td->td_ksegrp;
		hzticks = (ticks - td->td_slptime) << 10;
		if (hzticks >= SCHED_SLP_RUN_MAX) {
			kg->kg_slptime = SCHED_SLP_RUN_MAX;
			kg->kg_runtime = 1;
		} else {
			kg->kg_slptime += hzticks;
			sched_interact_update(kg);
		}
		sched_priority(kg);
		sched_slice(td->td_kse);
		td->td_slptime = 0;
	}
	setrunqueue(td, SRQ_BORING);
}

/*
 * Penalize the parent for creating a new child and initialize the child's
 * priority.
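 *
 * (The penalty itself is modest: sched_fork_ksegrp() charges the parent one
 * tickincr of run time, and sched_interact_fork() divides the child's
 * inherited sleep + run history by the integer ratio sum / SCHED_SLP_RUN_FORK
 * whenever that sum exceeds SCHED_SLP_RUN_FORK.)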
 */
void
sched_fork(struct thread *td, struct thread *childtd)
{

	mtx_assert(&sched_lock, MA_OWNED);

	sched_fork_ksegrp(td, childtd->td_ksegrp);
	sched_fork_thread(td, childtd);
}

void
sched_fork_ksegrp(struct thread *td, struct ksegrp *child)
{
	struct ksegrp *kg = td->td_ksegrp;
	mtx_assert(&sched_lock, MA_OWNED);

	child->kg_slptime = kg->kg_slptime;
	child->kg_runtime = kg->kg_runtime;
	child->kg_user_pri = kg->kg_user_pri;
	child->kg_base_user_pri = kg->kg_base_user_pri;
	sched_interact_fork(child);
	kg->kg_runtime += tickincr;
	sched_interact_update(kg);
}

void
sched_fork_thread(struct thread *td, struct thread *child)
{
	struct kse *ke;
	struct kse *ke2;

	sched_newthread(child);
	ke = td->td_kse;
	ke2 = child->td_kse;
	ke2->ke_slice = 1;	/* Attempt to quickly learn interactivity. */
	ke2->ke_cpu = ke->ke_cpu;
	ke2->ke_runq = NULL;

	/* Grab our parent's cpu estimation information. */
	ke2->ke_ticks = ke->ke_ticks;
	ke2->ke_ltick = ke->ke_ltick;
	ke2->ke_ftick = ke->ke_ftick;
}

void
sched_class(struct ksegrp *kg, int class)
{
	struct kseq *kseq;
	struct kse *ke;
	struct thread *td;
	int nclass;
	int oclass;

	mtx_assert(&sched_lock, MA_OWNED);
	if (kg->kg_pri_class == class)
		return;

	nclass = PRI_BASE(class);
	oclass = PRI_BASE(kg->kg_pri_class);
	FOREACH_THREAD_IN_GROUP(kg, td) {
		ke = td->td_kse;
		if ((ke->ke_state != KES_ONRUNQ &&
		    ke->ke_state != KES_THREAD) || ke->ke_runq == NULL)
			continue;
		kseq = KSEQ_CPU(ke->ke_cpu);

#ifdef SMP
		/*
		 * On SMP if we're on the RUNQ we must adjust the transferable
		 * count because we could be changing to or from an interrupt
		 * class.
		 */
		if (ke->ke_state == KES_ONRUNQ) {
			if (KSE_CAN_MIGRATE(ke)) {
				kseq->ksq_transferable--;
				kseq->ksq_group->ksg_transferable--;
			}
			if (KSE_CAN_MIGRATE(ke)) {
				kseq->ksq_transferable++;
				kseq->ksq_group->ksg_transferable++;
			}
		}
#endif
		if (oclass == PRI_TIMESHARE) {
			kseq->ksq_load_timeshare--;
			kseq_nice_rem(kseq, kg->kg_proc->p_nice);
		}
		if (nclass == PRI_TIMESHARE) {
			kseq->ksq_load_timeshare++;
			kseq_nice_add(kseq, kg->kg_proc->p_nice);
		}
	}

	kg->kg_pri_class = class;
}

/*
 * Return some of the child's priority and interactivity to the parent.
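 *
 * (Only the child's run time is added back; the slptime transfer below is
 * currently commented out, and sched_interact_update() afterwards scales the
 * parent's history back if it now exceeds SCHED_SLP_RUN_MAX.)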
 */
void
sched_exit(struct proc *p, struct thread *childtd)
{
	mtx_assert(&sched_lock, MA_OWNED);
	sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), childtd);
	sched_exit_thread(NULL, childtd);
}

void
sched_exit_ksegrp(struct ksegrp *kg, struct thread *td)
{
	/* kg->kg_slptime += td->td_ksegrp->kg_slptime; */
	kg->kg_runtime += td->td_ksegrp->kg_runtime;
	sched_interact_update(kg);
}

void
sched_exit_thread(struct thread *td, struct thread *childtd)
{
	CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d",
	    childtd, childtd->td_proc->p_comm, childtd->td_priority);
	kseq_load_rem(KSEQ_CPU(childtd->td_kse->ke_cpu), childtd->td_kse);
}

void
sched_clock(struct thread *td)
{
	struct kseq *kseq;
	struct ksegrp *kg;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	kseq = KSEQ_SELF();
#ifdef SMP
	if (ticks >= bal_tick)
		sched_balance();
	if (ticks >= gbal_tick && balance_groups)
		sched_balance_groups();
	/*
	 * We could have been assigned a non real-time thread without an
	 * IPI.
	 */
	if (kseq->ksq_assigned)
		kseq_assign(kseq);	/* Potentially sets NEEDRESCHED */
#endif
	ke = td->td_kse;
	kg = ke->ke_ksegrp;

	/* Adjust ticks for pctcpu */
	ke->ke_ticks++;
	ke->ke_ltick = ticks;

	/* Go up to one second beyond our max and then trim back down */
	if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick)
		sched_pctcpu_update(ke);

	if (td->td_flags & TDF_IDLETD)
		return;
	/*
	 * We only do slicing code for TIMESHARE ksegrps.
	 */
	if (kg->kg_pri_class != PRI_TIMESHARE)
		return;
	/*
	 * We used a tick; charge it to the ksegrp so that we can compute our
	 * interactivity.
	 */
	kg->kg_runtime += tickincr;
	sched_interact_update(kg);

	/*
	 * We used up one time slice.
	 */
	if (--ke->ke_slice > 0)
		return;
	/*
	 * We're out of time, recompute priorities and requeue.
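	 * (With the defaults from sched_initticks(), slice_max is roughly
	 * realstathz / 7 stat clock ticks, so a cpu-bound thread ends up
	 * back here, usually onto ksq_next, about seven times per second.)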
	 */
	kseq_load_rem(kseq, ke);
	sched_priority(kg);
	sched_slice(ke);
	if (SCHED_CURR(kg, ke))
		ke->ke_runq = kseq->ksq_curr;
	else
		ke->ke_runq = kseq->ksq_next;
	kseq_load_add(kseq, ke);
	td->td_flags |= TDF_NEEDRESCHED;
}

int
sched_runnable(void)
{
	struct kseq *kseq;
	int load;

	load = 1;

	kseq = KSEQ_SELF();
#ifdef SMP
	if (kseq->ksq_assigned) {
		mtx_lock_spin(&sched_lock);
		kseq_assign(kseq);
		mtx_unlock_spin(&sched_lock);
	}
#endif
	if ((curthread->td_flags & TDF_IDLETD) != 0) {
		if (kseq->ksq_load > 0)
			goto out;
	} else
		if (kseq->ksq_load - 1 > 0)
			goto out;
	load = 0;
out:
	return (load);
}

void
sched_userret(struct thread *td)
{
	struct ksegrp *kg;

	KASSERT((td->td_flags & TDF_BORROWING) == 0,
	    ("thread with borrowed priority returning to userland"));
	kg = td->td_ksegrp;
	if (td->td_priority != kg->kg_user_pri) {
		mtx_lock_spin(&sched_lock);
		td->td_priority = kg->kg_user_pri;
		td->td_base_pri = kg->kg_user_pri;
		mtx_unlock_spin(&sched_lock);
	}
}

struct kse *
sched_choose(void)
{
	struct kseq *kseq;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	kseq = KSEQ_SELF();
#ifdef SMP
restart:
	if (kseq->ksq_assigned)
		kseq_assign(kseq);
#endif
	ke = kseq_choose(kseq);
	if (ke) {
#ifdef SMP
		if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE)
			if (kseq_idled(kseq) == 0)
				goto restart;
#endif
		kseq_runq_rem(kseq, ke);
		ke->ke_state = KES_THREAD;
		ke->ke_flags &= ~KEF_PREEMPTED;
		return (ke);
	}
#ifdef SMP
	if (kseq_idled(kseq) == 0)
		goto restart;
#endif
	return (NULL);
}

void
sched_add(struct thread *td, int flags)
{
	struct kseq *kseq;
	struct ksegrp *kg;
	struct kse *ke;
	int preemptive;
	int canmigrate;
	int class;

	CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
	    td, td->td_proc->p_comm, td->td_priority, curthread,
	    curthread->td_proc->p_comm);
	mtx_assert(&sched_lock, MA_OWNED);
	ke = td->td_kse;
	kg = td->td_ksegrp;
	canmigrate = 1;
	preemptive = !(flags & SRQ_YIELDING);
	class = PRI_BASE(kg->kg_pri_class);
	kseq = KSEQ_SELF();
	if ((ke->ke_flags & KEF_INTERNAL) == 0)
		SLOT_USE(td->td_ksegrp);
	ke->ke_flags &= ~KEF_INTERNAL;
#ifdef SMP
	if (ke->ke_flags & KEF_ASSIGNED) {
		if (ke->ke_flags & KEF_REMOVED)
			ke->ke_flags &= ~KEF_REMOVED;
		return;
	}
	canmigrate = KSE_CAN_MIGRATE(ke);
	/*
	 * Don't migrate running threads here.  Force the long term balancer
	 * to do it.
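	 * (KEF_HOLD is set by callers such as sched_switch() and
	 * sched_thread_priority() to pin the thread to its current cpu for
	 * this one pass; it is consumed and cleared again below.)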
	 */
	if (ke->ke_flags & KEF_HOLD) {
		ke->ke_flags &= ~KEF_HOLD;
		canmigrate = 0;
	}
#endif
	KASSERT(ke->ke_state != KES_ONRUNQ,
	    ("sched_add: kse %p (%s) already in run queue", ke,
	    ke->ke_proc->p_comm));
	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
	    ("sched_add: process swapped out"));
	KASSERT(ke->ke_runq == NULL,
	    ("sched_add: KSE %p is still assigned to a run queue", ke));
	if (flags & SRQ_PREEMPTED)
		ke->ke_flags |= KEF_PREEMPTED;
	switch (class) {
	case PRI_ITHD:
	case PRI_REALTIME:
		ke->ke_runq = kseq->ksq_curr;
		ke->ke_slice = SCHED_SLICE_MAX;
		if (canmigrate)
			ke->ke_cpu = PCPU_GET(cpuid);
		break;
	case PRI_TIMESHARE:
		if (SCHED_CURR(kg, ke))
			ke->ke_runq = kseq->ksq_curr;
		else
			ke->ke_runq = kseq->ksq_next;
		break;
	case PRI_IDLE:
		/*
		 * This is for priority prop.
		 */
		if (ke->ke_thread->td_priority < PRI_MIN_IDLE)
			ke->ke_runq = kseq->ksq_curr;
		else
			ke->ke_runq = &kseq->ksq_idle;
		ke->ke_slice = SCHED_SLICE_MIN;
		break;
	default:
		panic("Unknown pri class.");
		break;
	}
#ifdef SMP
	/*
	 * If this thread is pinned or bound, notify the target cpu.
	 */
	if (!canmigrate && ke->ke_cpu != PCPU_GET(cpuid)) {
		ke->ke_runq = NULL;
		kseq_notify(ke, ke->ke_cpu);
		return;
	}
	/*
	 * If we had been idle, clear our bit in the group and potentially
	 * the global bitmap.  If not, see if we should transfer this thread.
	 */
	if ((class == PRI_TIMESHARE || class == PRI_REALTIME) &&
	    (kseq->ksq_group->ksg_idlemask & PCPU_GET(cpumask)) != 0) {
		/*
		 * Check to see if our group is unidling, and if so, remove it
		 * from the global idle mask.
		 */
		if (kseq->ksq_group->ksg_idlemask ==
		    kseq->ksq_group->ksg_cpumask)
			atomic_clear_int(&kseq_idle, kseq->ksq_group->ksg_mask);
		/*
		 * Now remove ourselves from the group specific idle mask.
		 */
		kseq->ksq_group->ksg_idlemask &= ~PCPU_GET(cpumask);
	} else if (canmigrate && kseq->ksq_load > 1 && class != PRI_ITHD)
		if (kseq_transfer(kseq, ke, class))
			return;
	ke->ke_cpu = PCPU_GET(cpuid);
#endif
	if (td->td_priority < curthread->td_priority &&
	    ke->ke_runq == kseq->ksq_curr)
		curthread->td_flags |= TDF_NEEDRESCHED;
	if (preemptive && maybe_preempt(td))
		return;
	ke->ke_state = KES_ONRUNQ;

	kseq_runq_add(kseq, ke, flags);
	kseq_load_add(kseq, ke);
}

void
sched_rem(struct thread *td)
{
	struct kseq *kseq;
	struct kse *ke;

	CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)",
	    td, td->td_proc->p_comm, td->td_priority, curthread,
	    curthread->td_proc->p_comm);
	mtx_assert(&sched_lock, MA_OWNED);
	ke = td->td_kse;
	SLOT_RELEASE(td->td_ksegrp);
	ke->ke_flags &= ~KEF_PREEMPTED;
	if (ke->ke_flags & KEF_ASSIGNED) {
		ke->ke_flags |= KEF_REMOVED;
		return;
	}
	KASSERT((ke->ke_state == KES_ONRUNQ),
	    ("sched_rem: KSE not on run queue"));

	ke->ke_state = KES_THREAD;
	kseq = KSEQ_CPU(ke->ke_cpu);
	kseq_runq_rem(kseq, ke);
	kseq_load_rem(kseq, ke);
}

fixpt_t
sched_pctcpu(struct thread *td)
{
	fixpt_t pctcpu;
	struct kse *ke;

	pctcpu = 0;
	ke = td->td_kse;
	if (ke == NULL)
		return (0);

	mtx_lock_spin(&sched_lock);
	if (ke->ke_ticks) {
		int rtick;

		/*
		 * Don't update more frequently than twice a second.  Allowing
		 * this causes the cpu usage to decay away too quickly due to
		 * rounding errors.
		 */
		if (ke->ke_ftick + SCHED_CPU_TICKS < ke->ke_ltick ||
		    ke->ke_ltick < (ticks - (hz / 2)))
			sched_pctcpu_update(ke);
		/* How many rtick per second ? */
		rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS);
		pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT;
	}

	ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick;
	mtx_unlock_spin(&sched_lock);

	return (pctcpu);
}

void
sched_bind(struct thread *td, int cpu)
{
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	ke = td->td_kse;
	ke->ke_flags |= KEF_BOUND;
#ifdef SMP
	if (PCPU_GET(cpuid) == cpu)
		return;
	/* sched_rem without the runq_remove */
	ke->ke_state = KES_THREAD;
	kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
	kseq_notify(ke, cpu);
	/* When we return from mi_switch we'll be on the correct cpu. */
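	/*
	 * kseq_notify() above queued us on the target cpu's assigned list,
	 * so the voluntary switch below lets that cpu pick the thread up.
	 */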
	mi_switch(SW_VOL, NULL);
#endif
}

void
sched_unbind(struct thread *td)
{
	mtx_assert(&sched_lock, MA_OWNED);
	td->td_kse->ke_flags &= ~KEF_BOUND;
}

int
sched_is_bound(struct thread *td)
{
	mtx_assert(&sched_lock, MA_OWNED);
	return (td->td_kse->ke_flags & KEF_BOUND);
}

void
sched_relinquish(struct thread *td)
{
	struct ksegrp *kg;

	kg = td->td_ksegrp;
	mtx_lock_spin(&sched_lock);
	if (kg->kg_pri_class == PRI_TIMESHARE)
		sched_prio(td, PRI_MAX_TIMESHARE);
	mi_switch(SW_VOL, NULL);
	mtx_unlock_spin(&sched_lock);
}

int
sched_load(void)
{
#ifdef SMP
	int total;
	int i;

	total = 0;
	for (i = 0; i <= ksg_maxid; i++)
		total += KSEQ_GROUP(i)->ksg_load;
	return (total);
#else
	return (KSEQ_SELF()->ksq_sysload);
#endif
}

int
sched_sizeof_ksegrp(void)
{
	return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
}

int
sched_sizeof_proc(void)
{
	return (sizeof(struct proc));
}

int
sched_sizeof_thread(void)
{
	return (sizeof(struct thread) + sizeof(struct td_sched));
}

void
sched_tick(void)
{
}
#define KERN_SWITCH_INCLUDE 1
#include "kern/kern_switch.c"