1 /*- 2 * Copyright (c) 2002-2003, Jeffrey Roberson <jeff@freebsd.org> 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice unmodified, this list of conditions, and the following 10 * disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27 #include <sys/cdefs.h> 28 __FBSDID("$FreeBSD$"); 29 30 #include <sys/param.h> 31 #include <sys/systm.h> 32 #include <sys/kdb.h> 33 #include <sys/kernel.h> 34 #include <sys/ktr.h> 35 #include <sys/lock.h> 36 #include <sys/mutex.h> 37 #include <sys/proc.h> 38 #include <sys/resource.h> 39 #include <sys/resourcevar.h> 40 #include <sys/sched.h> 41 #include <sys/smp.h> 42 #include <sys/sx.h> 43 #include <sys/sysctl.h> 44 #include <sys/sysproto.h> 45 #include <sys/vmmeter.h> 46 #ifdef KTRACE 47 #include <sys/uio.h> 48 #include <sys/ktrace.h> 49 #endif 50 51 #include <machine/cpu.h> 52 #include <machine/smp.h> 53 54 #define KTR_ULE KTR_NFS 55 56 /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */ 57 /* XXX This is bogus compatability crap for ps */ 58 static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ 59 SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); 60 61 static void sched_setup(void *dummy); 62 SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL) 63 64 static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler"); 65 66 SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ule", 0, 67 "Scheduler name"); 68 69 static int slice_min = 1; 70 SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, ""); 71 72 static int slice_max = 10; 73 SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, ""); 74 75 int realstathz; 76 int tickincr = 1; 77 78 /* 79 * These datastructures are allocated within their parent datastructure but 80 * are scheduler specific. 81 */ 82 83 struct ke_sched { 84 int ske_slice; 85 struct runq *ske_runq; 86 /* The following variables are only used for pctcpu calculation */ 87 int ske_ltick; /* Last tick that we were running on */ 88 int ske_ftick; /* First tick that we were running on */ 89 int ske_ticks; /* Tick count */ 90 /* CPU that we have affinity for. 
 */
	u_char	ske_cpu;
};
#define	ke_slice	ke_sched->ske_slice
#define	ke_runq		ke_sched->ske_runq
#define	ke_ltick	ke_sched->ske_ltick
#define	ke_ftick	ke_sched->ske_ftick
#define	ke_ticks	ke_sched->ske_ticks
#define	ke_cpu		ke_sched->ske_cpu
#define	ke_assign	ke_procq.tqe_next

#define	KEF_ASSIGNED	KEF_SCHED0	/* KSE is being migrated. */
#define	KEF_BOUND	KEF_SCHED1	/* KSE can not migrate. */
#define	KEF_XFERABLE	KEF_SCHED2	/* KSE was added as transferable. */
#define	KEF_HOLD	KEF_SCHED3	/* KSE is temporarily bound. */

struct kg_sched {
	int	skg_slptime;	/* Number of ticks we vol. slept */
	int	skg_runtime;	/* Number of ticks we were running */
};
#define	kg_slptime	kg_sched->skg_slptime
#define	kg_runtime	kg_sched->skg_runtime

struct td_sched {
	int	std_slptime;
};
#define	td_slptime	td_sched->std_slptime

struct td_sched td_sched;
struct ke_sched ke_sched;
struct kg_sched kg_sched;

struct ke_sched *kse0_sched = &ke_sched;
struct kg_sched *ksegrp0_sched = &kg_sched;
struct p_sched *proc0_sched = NULL;
struct td_sched *thread0_sched = &td_sched;

/*
 * The priority is primarily determined by the interactivity score.  Thus, we
 * give lower (better) priorities to kse groups that use less CPU.  The nice
 * value is then directly added to this to allow nice to have some effect
 * on latency.
 *
 * PRI_RANGE:	Total priority range for timeshare threads.
 * PRI_NRESV:	Number of nice values.
 * PRI_BASE:	The start of the dynamic range.
 */
#define	SCHED_PRI_RANGE		(PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
#define	SCHED_PRI_NRESV		((PRIO_MAX - PRIO_MIN) + 1)
#define	SCHED_PRI_NHALF		(SCHED_PRI_NRESV / 2)
#define	SCHED_PRI_BASE		(PRI_MIN_TIMESHARE)
#define	SCHED_PRI_INTERACT(score) \
    ((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX)

/*
 * These determine the interactivity of a process.
 *
 * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
 *		before throttling back.
 * SLP_RUN_FORK:	Maximum slp+run time to inherit at fork time.
 * INTERACT_MAX:	Maximum interactivity value.  Smaller is better.
 * INTERACT_THRESH:	Threshold for placement on the current runq.
 */
#define	SCHED_SLP_RUN_MAX	((hz * 5) << 10)
#define	SCHED_SLP_RUN_FORK	((hz / 2) << 10)
#define	SCHED_INTERACT_MAX	(100)
#define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
#define	SCHED_INTERACT_THRESH	(30)

/*
 * These parameters and macros determine the size of the time slice that is
 * granted to each thread.
 *
 * SLICE_MIN:	Minimum time slice granted, in units of ticks.
 * SLICE_MAX:	Maximum time slice granted.
 * SLICE_RANGE:	Range of available time slices scaled by hz.
 * SLICE_SCALE:	The number of slices granted per val in the range of [0, max].
 * SLICE_NICE:	Determines the amount of slice granted to a scaled nice.
 * SLICE_NTHRESH:	The nice cutoff point for slice assignment.
 */
#define	SCHED_SLICE_MIN			(slice_min)
#define	SCHED_SLICE_MAX			(slice_max)
#define	SCHED_SLICE_INTERACTIVE		(slice_max)
#define	SCHED_SLICE_NTHRESH		(SCHED_PRI_NHALF - 1)
#define	SCHED_SLICE_RANGE		(SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1)
#define	SCHED_SLICE_SCALE(val, max)	(((val) * SCHED_SLICE_RANGE) / (max))
#define	SCHED_SLICE_NICE(nice) \
    (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_SLICE_NTHRESH))

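/*
 * Worked example of the slice macros above (illustrative only; assumes
 * hz = 1000, in which case sched_setup() below sets slice_min = hz / 100
 * = 10 and slice_max = hz / 7 = 142, giving SCHED_SLICE_RANGE = 133, and
 * assumes PRIO_MIN = -20 and PRIO_MAX = 20, so SCHED_PRI_NHALF = 20 and
 * SCHED_SLICE_NTHRESH = 19):
 *
 * A kse whose nice value equals the least nice value on its queue has a
 * scaled nice of 0 and receives SCHED_SLICE_NICE(0) = 142 - (0 * 133) / 19
 * = 142 ticks.  One that is 10 nice levels away receives
 * 142 - (10 * 133) / 19 = 72 ticks.  Anything farther than
 * SCHED_SLICE_NTHRESH levels away gets no slice at all, except that nice 0
 * processes still receive SCHED_SLICE_MIN (see sched_slice() below).
 */
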
/*
 * This macro determines whether or not the kse belongs on the current or
 * next run queue.
 */
#define	SCHED_INTERACTIVE(kg) \
    (sched_interact_score(kg) < SCHED_INTERACT_THRESH)
#define	SCHED_CURR(kg, ke) \
    (ke->ke_thread->td_priority < kg->kg_user_pri || \
    SCHED_INTERACTIVE(kg))

/*
 * Cpu percentage computation macros and defines.
 *
 * SCHED_CPU_TIME:	Number of seconds to average the cpu usage across.
 * SCHED_CPU_TICKS:	Number of hz ticks to average the cpu usage across.
 */

#define	SCHED_CPU_TIME	10
#define	SCHED_CPU_TICKS	(hz * SCHED_CPU_TIME)

/*
 * kseq - per processor runqs and statistics.
 */
struct kseq {
	struct runq	ksq_idle;		/* Queue of IDLE threads. */
	struct runq	ksq_timeshare[2];	/* Run queues for !IDLE. */
	struct runq	*ksq_next;		/* Next timeshare queue. */
	struct runq	*ksq_curr;		/* Current queue. */
	int		ksq_load_timeshare;	/* Load for timeshare. */
	int		ksq_load;		/* Aggregate load. */
	short		ksq_nice[SCHED_PRI_NRESV]; /* KSEs in each nice bin. */
	short		ksq_nicemin;		/* Least nice. */
#ifdef SMP
	int			ksq_transferable;
	LIST_ENTRY(kseq)	ksq_siblings;	/* Next in kseq group. */
	struct kseq_group	*ksq_group;	/* Our processor group. */
	volatile struct kse	*ksq_assigned;	/* assigned by another CPU. */
#else
	int		ksq_sysload;		/* For loadavg, !ITHD load. */
#endif
};

#ifdef SMP
/*
 * kseq groups are groups of processors which can cheaply share threads.  When
 * one processor in the group goes idle it will check the runqs of the other
 * processors in its group prior to halting and waiting for an interrupt.
 * These groups are suitable for SMT (Symmetric Multi-Threading) and not NUMA.
 * In a NUMA environment we'd want an idle bitmap per group and a two tiered
 * load balancer.
 */
struct kseq_group {
	int	ksg_cpus;		/* Count of CPUs in this kseq group. */
	cpumask_t ksg_cpumask;		/* Mask of cpus in this group. */
	cpumask_t ksg_idlemask;		/* Idle cpus in this group. */
	cpumask_t ksg_mask;		/* Bit mask for first cpu. */
	int	ksg_load;		/* Total load of this group. */
	int	ksg_transferable;	/* Transferable load of this group. */
	LIST_HEAD(, kseq) ksg_members;	/* Linked list of all members. */
};
#endif

/*
 * One kse queue per processor.
243 */ 244 #ifdef SMP 245 static cpumask_t kseq_idle; 246 static int ksg_maxid; 247 static struct kseq kseq_cpu[MAXCPU]; 248 static struct kseq_group kseq_groups[MAXCPU]; 249 static int bal_tick; 250 static int gbal_tick; 251 252 #define KSEQ_SELF() (&kseq_cpu[PCPU_GET(cpuid)]) 253 #define KSEQ_CPU(x) (&kseq_cpu[(x)]) 254 #define KSEQ_ID(x) ((x) - kseq_cpu) 255 #define KSEQ_GROUP(x) (&kseq_groups[(x)]) 256 #else /* !SMP */ 257 static struct kseq kseq_cpu; 258 259 #define KSEQ_SELF() (&kseq_cpu) 260 #define KSEQ_CPU(x) (&kseq_cpu) 261 #endif 262 263 static void sched_add_internal(struct thread *td, int preemptive); 264 static void sched_slice(struct kse *ke); 265 static void sched_priority(struct ksegrp *kg); 266 static int sched_interact_score(struct ksegrp *kg); 267 static void sched_interact_update(struct ksegrp *kg); 268 static void sched_interact_fork(struct ksegrp *kg); 269 static void sched_pctcpu_update(struct kse *ke); 270 271 /* Operations on per processor queues */ 272 static struct kse * kseq_choose(struct kseq *kseq); 273 static void kseq_setup(struct kseq *kseq); 274 static void kseq_load_add(struct kseq *kseq, struct kse *ke); 275 static void kseq_load_rem(struct kseq *kseq, struct kse *ke); 276 static __inline void kseq_runq_add(struct kseq *kseq, struct kse *ke); 277 static __inline void kseq_runq_rem(struct kseq *kseq, struct kse *ke); 278 static void kseq_nice_add(struct kseq *kseq, int nice); 279 static void kseq_nice_rem(struct kseq *kseq, int nice); 280 void kseq_print(int cpu); 281 #ifdef SMP 282 static int kseq_transfer(struct kseq *ksq, struct kse *ke, int class); 283 static struct kse *runq_steal(struct runq *rq); 284 static void sched_balance(void); 285 static void sched_balance_groups(void); 286 static void sched_balance_group(struct kseq_group *ksg); 287 static void sched_balance_pair(struct kseq *high, struct kseq *low); 288 static void kseq_move(struct kseq *from, int cpu); 289 static int kseq_idled(struct kseq *kseq); 290 static void kseq_notify(struct kse *ke, int cpu); 291 static void kseq_assign(struct kseq *); 292 static struct kse *kseq_steal(struct kseq *kseq, int stealidle); 293 /* 294 * On P4 Xeons the round-robin interrupt delivery is broken. As a result of 295 * this, we can't pin interrupts to the cpu that they were delivered to, 296 * otherwise all ithreads only run on CPU 0. 
297 */ 298 #ifdef __i386__ 299 #define KSE_CAN_MIGRATE(ke, class) \ 300 ((ke)->ke_thread->td_pinned == 0 && ((ke)->ke_flags & KEF_BOUND) == 0) 301 #else /* !__i386__ */ 302 #define KSE_CAN_MIGRATE(ke, class) \ 303 ((class) != PRI_ITHD && (ke)->ke_thread->td_pinned == 0 && \ 304 ((ke)->ke_flags & KEF_BOUND) == 0) 305 #endif /* !__i386__ */ 306 #endif 307 308 void 309 kseq_print(int cpu) 310 { 311 struct kseq *kseq; 312 int i; 313 314 kseq = KSEQ_CPU(cpu); 315 316 printf("kseq:\n"); 317 printf("\tload: %d\n", kseq->ksq_load); 318 printf("\tload TIMESHARE: %d\n", kseq->ksq_load_timeshare); 319 #ifdef SMP 320 printf("\tload transferable: %d\n", kseq->ksq_transferable); 321 #endif 322 printf("\tnicemin:\t%d\n", kseq->ksq_nicemin); 323 printf("\tnice counts:\n"); 324 for (i = 0; i < SCHED_PRI_NRESV; i++) 325 if (kseq->ksq_nice[i]) 326 printf("\t\t%d = %d\n", 327 i - SCHED_PRI_NHALF, kseq->ksq_nice[i]); 328 } 329 330 static __inline void 331 kseq_runq_add(struct kseq *kseq, struct kse *ke) 332 { 333 #ifdef SMP 334 if (KSE_CAN_MIGRATE(ke, PRI_BASE(ke->ke_ksegrp->kg_pri_class))) { 335 kseq->ksq_transferable++; 336 kseq->ksq_group->ksg_transferable++; 337 ke->ke_flags |= KEF_XFERABLE; 338 } 339 #endif 340 runq_add(ke->ke_runq, ke); 341 } 342 343 static __inline void 344 kseq_runq_rem(struct kseq *kseq, struct kse *ke) 345 { 346 #ifdef SMP 347 if (ke->ke_flags & KEF_XFERABLE) { 348 kseq->ksq_transferable--; 349 kseq->ksq_group->ksg_transferable--; 350 ke->ke_flags &= ~KEF_XFERABLE; 351 } 352 #endif 353 runq_remove(ke->ke_runq, ke); 354 } 355 356 static void 357 kseq_load_add(struct kseq *kseq, struct kse *ke) 358 { 359 int class; 360 mtx_assert(&sched_lock, MA_OWNED); 361 class = PRI_BASE(ke->ke_ksegrp->kg_pri_class); 362 if (class == PRI_TIMESHARE) 363 kseq->ksq_load_timeshare++; 364 kseq->ksq_load++; 365 if (class != PRI_ITHD && (ke->ke_proc->p_flag & P_NOLOAD) == 0) 366 #ifdef SMP 367 kseq->ksq_group->ksg_load++; 368 #else 369 kseq->ksq_sysload++; 370 #endif 371 if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) 372 CTR6(KTR_ULE, 373 "Add kse %p to %p (slice: %d, pri: %d, nice: %d(%d))", 374 ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority, 375 ke->ke_proc->p_nice, kseq->ksq_nicemin); 376 if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) 377 kseq_nice_add(kseq, ke->ke_proc->p_nice); 378 } 379 380 static void 381 kseq_load_rem(struct kseq *kseq, struct kse *ke) 382 { 383 int class; 384 mtx_assert(&sched_lock, MA_OWNED); 385 class = PRI_BASE(ke->ke_ksegrp->kg_pri_class); 386 if (class == PRI_TIMESHARE) 387 kseq->ksq_load_timeshare--; 388 if (class != PRI_ITHD && (ke->ke_proc->p_flag & P_NOLOAD) == 0) 389 #ifdef SMP 390 kseq->ksq_group->ksg_load--; 391 #else 392 kseq->ksq_sysload--; 393 #endif 394 kseq->ksq_load--; 395 ke->ke_runq = NULL; 396 if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) 397 kseq_nice_rem(kseq, ke->ke_proc->p_nice); 398 } 399 400 static void 401 kseq_nice_add(struct kseq *kseq, int nice) 402 { 403 mtx_assert(&sched_lock, MA_OWNED); 404 /* Normalize to zero. */ 405 kseq->ksq_nice[nice + SCHED_PRI_NHALF]++; 406 if (nice < kseq->ksq_nicemin || kseq->ksq_load_timeshare == 1) 407 kseq->ksq_nicemin = nice; 408 } 409 410 static void 411 kseq_nice_rem(struct kseq *kseq, int nice) 412 { 413 int n; 414 415 mtx_assert(&sched_lock, MA_OWNED); 416 /* Normalize to zero. 
 */
	n = nice + SCHED_PRI_NHALF;
	kseq->ksq_nice[n]--;
	KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count."));

	/*
	 * If this wasn't the smallest nice value or there are more in
	 * this bucket we can just return.  Otherwise we have to recalculate
	 * the smallest nice.
	 */
	if (nice != kseq->ksq_nicemin ||
	    kseq->ksq_nice[n] != 0 ||
	    kseq->ksq_load_timeshare == 0)
		return;

	for (; n < SCHED_PRI_NRESV; n++)
		if (kseq->ksq_nice[n]) {
			kseq->ksq_nicemin = n - SCHED_PRI_NHALF;
			return;
		}
}

#ifdef SMP
/*
 * sched_balance is a simple CPU load balancing algorithm.  It operates by
 * finding the least loaded and most loaded cpu and equalizing their load
 * by migrating some processes.
 *
 * Dealing only with two CPUs at a time has two advantages.  Firstly, most
 * installations will only have 2 cpus.  Secondly, load balancing too much at
 * once can have an unpleasant effect on the system.  The scheduler rarely has
 * enough information to make perfect decisions.  So this algorithm favors
 * simplicity and more gradual effects on load in larger systems.
 *
 * It could be improved by considering the priorities and slices assigned to
 * each task prior to balancing them.  There are many pathological cases with
 * any approach and so the semi-random algorithm below may work as well as any.
 *
 */
static void
sched_balance(void)
{
	struct kseq_group *high;
	struct kseq_group *low;
	struct kseq_group *ksg;
	int cnt;
	int i;

	if (smp_started == 0)
		goto out;
	low = high = NULL;
	i = random() % (ksg_maxid + 1);
	for (cnt = 0; cnt <= ksg_maxid; cnt++) {
		ksg = KSEQ_GROUP(i);
		/*
		 * Find the CPU with the highest load that has some
		 * threads to transfer.
		 */
		if ((high == NULL || ksg->ksg_load > high->ksg_load)
		    && ksg->ksg_transferable)
			high = ksg;
		if (low == NULL || ksg->ksg_load < low->ksg_load)
			low = ksg;
		if (++i > ksg_maxid)
			i = 0;
	}
	if (low != NULL && high != NULL && high != low)
		sched_balance_pair(LIST_FIRST(&high->ksg_members),
		    LIST_FIRST(&low->ksg_members));
out:
	bal_tick = ticks + (random() % (hz * 2));
}

static void
sched_balance_groups(void)
{
	int i;

	mtx_assert(&sched_lock, MA_OWNED);
	if (smp_started)
		for (i = 0; i <= ksg_maxid; i++)
			sched_balance_group(KSEQ_GROUP(i));
	gbal_tick = ticks + (random() % (hz * 2));
}

static void
sched_balance_group(struct kseq_group *ksg)
{
	struct kseq *kseq;
	struct kseq *high;
	struct kseq *low;
	int load;

	if (ksg->ksg_transferable == 0)
		return;
	low = NULL;
	high = NULL;
	LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) {
		load = kseq->ksq_load;
		if (high == NULL || load > high->ksq_load)
			high = kseq;
		if (low == NULL || load < low->ksq_load)
			low = kseq;
	}
	if (high != NULL && low != NULL && high != low)
		sched_balance_pair(high, low);
}

static void
sched_balance_pair(struct kseq *high, struct kseq *low)
{
	int transferable;
	int high_load;
	int low_load;
	int move;
	int diff;
	int i;

	/*
	 * If we're transferring within a group we have to use this specific
	 * kseq's transferable count, otherwise we can steal from other members
	 * of the group.
538 */ 539 if (high->ksq_group == low->ksq_group) { 540 transferable = high->ksq_transferable; 541 high_load = high->ksq_load; 542 low_load = low->ksq_load; 543 } else { 544 transferable = high->ksq_group->ksg_transferable; 545 high_load = high->ksq_group->ksg_load; 546 low_load = low->ksq_group->ksg_load; 547 } 548 if (transferable == 0) 549 return; 550 /* 551 * Determine what the imbalance is and then adjust that to how many 552 * kses we actually have to give up (transferable). 553 */ 554 diff = high_load - low_load; 555 move = diff / 2; 556 if (diff & 0x1) 557 move++; 558 move = min(move, transferable); 559 for (i = 0; i < move; i++) 560 kseq_move(high, KSEQ_ID(low)); 561 return; 562 } 563 564 static void 565 kseq_move(struct kseq *from, int cpu) 566 { 567 struct kseq *kseq; 568 struct kseq *to; 569 struct kse *ke; 570 571 kseq = from; 572 to = KSEQ_CPU(cpu); 573 ke = kseq_steal(kseq, 1); 574 if (ke == NULL) { 575 struct kseq_group *ksg; 576 577 ksg = kseq->ksq_group; 578 LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) { 579 if (kseq == from || kseq->ksq_transferable == 0) 580 continue; 581 ke = kseq_steal(kseq, 1); 582 break; 583 } 584 if (ke == NULL) 585 panic("kseq_move: No KSEs available with a " 586 "transferable count of %d\n", 587 ksg->ksg_transferable); 588 } 589 if (kseq == to) 590 return; 591 ke->ke_state = KES_THREAD; 592 kseq_runq_rem(kseq, ke); 593 kseq_load_rem(kseq, ke); 594 kseq_notify(ke, cpu); 595 } 596 597 static int 598 kseq_idled(struct kseq *kseq) 599 { 600 struct kseq_group *ksg; 601 struct kseq *steal; 602 struct kse *ke; 603 604 ksg = kseq->ksq_group; 605 /* 606 * If we're in a cpu group, try and steal kses from another cpu in 607 * the group before idling. 608 */ 609 if (ksg->ksg_cpus > 1 && ksg->ksg_transferable) { 610 LIST_FOREACH(steal, &ksg->ksg_members, ksq_siblings) { 611 if (steal == kseq || steal->ksq_transferable == 0) 612 continue; 613 ke = kseq_steal(steal, 0); 614 if (ke == NULL) 615 continue; 616 ke->ke_state = KES_THREAD; 617 kseq_runq_rem(steal, ke); 618 kseq_load_rem(steal, ke); 619 ke->ke_cpu = PCPU_GET(cpuid); 620 sched_add_internal(ke->ke_thread, 0); 621 return (0); 622 } 623 } 624 /* 625 * We only set the idled bit when all of the cpus in the group are 626 * idle. Otherwise we could get into a situation where a KSE bounces 627 * back and forth between two idle cores on seperate physical CPUs. 628 */ 629 ksg->ksg_idlemask |= PCPU_GET(cpumask); 630 if (ksg->ksg_idlemask != ksg->ksg_cpumask) 631 return (1); 632 atomic_set_int(&kseq_idle, ksg->ksg_mask); 633 return (1); 634 } 635 636 static void 637 kseq_assign(struct kseq *kseq) 638 { 639 struct kse *nke; 640 struct kse *ke; 641 642 do { 643 *(volatile struct kse **)&ke = kseq->ksq_assigned; 644 } while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke, NULL)); 645 for (; ke != NULL; ke = nke) { 646 nke = ke->ke_assign; 647 ke->ke_flags &= ~KEF_ASSIGNED; 648 sched_add_internal(ke->ke_thread, 0); 649 } 650 } 651 652 static void 653 kseq_notify(struct kse *ke, int cpu) 654 { 655 struct kseq *kseq; 656 struct thread *td; 657 struct pcpu *pcpu; 658 int prio; 659 660 ke->ke_cpu = cpu; 661 ke->ke_flags |= KEF_ASSIGNED; 662 prio = ke->ke_thread->td_priority; 663 664 kseq = KSEQ_CPU(cpu); 665 666 /* 667 * Place a KSE on another cpu's queue and force a resched. 
668 */ 669 do { 670 *(volatile struct kse **)&ke->ke_assign = kseq->ksq_assigned; 671 } while(!atomic_cmpset_ptr(&kseq->ksq_assigned, ke->ke_assign, ke)); 672 /* 673 * Without sched_lock we could lose a race where we set NEEDRESCHED 674 * on a thread that is switched out before the IPI is delivered. This 675 * would lead us to miss the resched. This will be a problem once 676 * sched_lock is pushed down. 677 */ 678 pcpu = pcpu_find(cpu); 679 td = pcpu->pc_curthread; 680 if (ke->ke_thread->td_priority < td->td_priority || 681 td == pcpu->pc_idlethread) { 682 td->td_flags |= TDF_NEEDRESCHED; 683 ipi_selected(1 << cpu, IPI_AST); 684 } 685 } 686 687 static struct kse * 688 runq_steal(struct runq *rq) 689 { 690 struct rqhead *rqh; 691 struct rqbits *rqb; 692 struct kse *ke; 693 int word; 694 int bit; 695 696 mtx_assert(&sched_lock, MA_OWNED); 697 rqb = &rq->rq_status; 698 for (word = 0; word < RQB_LEN; word++) { 699 if (rqb->rqb_bits[word] == 0) 700 continue; 701 for (bit = 0; bit < RQB_BPW; bit++) { 702 if ((rqb->rqb_bits[word] & (1ul << bit)) == 0) 703 continue; 704 rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)]; 705 TAILQ_FOREACH(ke, rqh, ke_procq) { 706 if (KSE_CAN_MIGRATE(ke, 707 PRI_BASE(ke->ke_ksegrp->kg_pri_class))) 708 return (ke); 709 } 710 } 711 } 712 return (NULL); 713 } 714 715 static struct kse * 716 kseq_steal(struct kseq *kseq, int stealidle) 717 { 718 struct kse *ke; 719 720 /* 721 * Steal from next first to try to get a non-interactive task that 722 * may not have run for a while. 723 */ 724 if ((ke = runq_steal(kseq->ksq_next)) != NULL) 725 return (ke); 726 if ((ke = runq_steal(kseq->ksq_curr)) != NULL) 727 return (ke); 728 if (stealidle) 729 return (runq_steal(&kseq->ksq_idle)); 730 return (NULL); 731 } 732 733 int 734 kseq_transfer(struct kseq *kseq, struct kse *ke, int class) 735 { 736 struct kseq_group *ksg; 737 int cpu; 738 739 if (smp_started == 0) 740 return (0); 741 cpu = 0; 742 /* 743 * If our load exceeds a certain threshold we should attempt to 744 * reassign this thread. The first candidate is the cpu that 745 * originally ran the thread. If it is idle, assign it there, 746 * otherwise, pick an idle cpu. 747 * 748 * The threshold at which we start to reassign kses has a large impact 749 * on the overall performance of the system. Tuned too high and 750 * some CPUs may idle. Too low and there will be excess migration 751 * and context switches. 752 */ 753 ksg = kseq->ksq_group; 754 if (ksg->ksg_load > ksg->ksg_cpus && kseq_idle) { 755 ksg = KSEQ_CPU(ke->ke_cpu)->ksq_group; 756 if (kseq_idle & ksg->ksg_mask) { 757 cpu = ffs(ksg->ksg_idlemask); 758 if (cpu) 759 goto migrate; 760 } 761 /* 762 * Multiple cpus could find this bit simultaneously 763 * but the race shouldn't be terrible. 764 */ 765 cpu = ffs(kseq_idle); 766 if (cpu) 767 goto migrate; 768 } 769 /* 770 * If another cpu in this group has idled, assign a thread over 771 * to them after checking to see if there are idled groups. 772 */ 773 ksg = kseq->ksq_group; 774 if (ksg->ksg_idlemask) { 775 cpu = ffs(ksg->ksg_idlemask); 776 if (cpu) 777 goto migrate; 778 } 779 /* 780 * No new CPU was found. 781 */ 782 return (0); 783 migrate: 784 /* 785 * Now that we've found an idle CPU, migrate the thread. 786 */ 787 cpu--; 788 ke->ke_runq = NULL; 789 kseq_notify(ke, cpu); 790 791 return (1); 792 } 793 794 #endif /* SMP */ 795 796 /* 797 * Pick the highest priority task we have and return it. 
798 */ 799 800 static struct kse * 801 kseq_choose(struct kseq *kseq) 802 { 803 struct kse *ke; 804 struct runq *swap; 805 806 mtx_assert(&sched_lock, MA_OWNED); 807 swap = NULL; 808 809 for (;;) { 810 ke = runq_choose(kseq->ksq_curr); 811 if (ke == NULL) { 812 /* 813 * We already swapped once and didn't get anywhere. 814 */ 815 if (swap) 816 break; 817 swap = kseq->ksq_curr; 818 kseq->ksq_curr = kseq->ksq_next; 819 kseq->ksq_next = swap; 820 continue; 821 } 822 /* 823 * If we encounter a slice of 0 the kse is in a 824 * TIMESHARE kse group and its nice was too far out 825 * of the range that receives slices. 826 */ 827 if (ke->ke_slice == 0) { 828 runq_remove(ke->ke_runq, ke); 829 sched_slice(ke); 830 ke->ke_runq = kseq->ksq_next; 831 runq_add(ke->ke_runq, ke); 832 continue; 833 } 834 return (ke); 835 } 836 837 return (runq_choose(&kseq->ksq_idle)); 838 } 839 840 static void 841 kseq_setup(struct kseq *kseq) 842 { 843 runq_init(&kseq->ksq_timeshare[0]); 844 runq_init(&kseq->ksq_timeshare[1]); 845 runq_init(&kseq->ksq_idle); 846 kseq->ksq_curr = &kseq->ksq_timeshare[0]; 847 kseq->ksq_next = &kseq->ksq_timeshare[1]; 848 kseq->ksq_load = 0; 849 kseq->ksq_load_timeshare = 0; 850 } 851 852 static void 853 sched_setup(void *dummy) 854 { 855 #ifdef SMP 856 int balance_groups; 857 int i; 858 #endif 859 860 slice_min = (hz/100); /* 10ms */ 861 slice_max = (hz/7); /* ~140ms */ 862 863 #ifdef SMP 864 balance_groups = 0; 865 /* 866 * Initialize the kseqs. 867 */ 868 for (i = 0; i < MAXCPU; i++) { 869 struct kseq *ksq; 870 871 ksq = &kseq_cpu[i]; 872 ksq->ksq_assigned = NULL; 873 kseq_setup(&kseq_cpu[i]); 874 } 875 if (smp_topology == NULL) { 876 struct kseq_group *ksg; 877 struct kseq *ksq; 878 879 for (i = 0; i < MAXCPU; i++) { 880 ksq = &kseq_cpu[i]; 881 ksg = &kseq_groups[i]; 882 /* 883 * Setup a kseq group with one member. 884 */ 885 ksq->ksq_transferable = 0; 886 ksq->ksq_group = ksg; 887 ksg->ksg_cpus = 1; 888 ksg->ksg_idlemask = 0; 889 ksg->ksg_cpumask = ksg->ksg_mask = 1 << i; 890 ksg->ksg_load = 0; 891 ksg->ksg_transferable = 0; 892 LIST_INIT(&ksg->ksg_members); 893 LIST_INSERT_HEAD(&ksg->ksg_members, ksq, ksq_siblings); 894 } 895 } else { 896 struct kseq_group *ksg; 897 struct cpu_group *cg; 898 int j; 899 900 for (i = 0; i < smp_topology->ct_count; i++) { 901 cg = &smp_topology->ct_group[i]; 902 ksg = &kseq_groups[i]; 903 /* 904 * Initialize the group. 905 */ 906 ksg->ksg_idlemask = 0; 907 ksg->ksg_load = 0; 908 ksg->ksg_transferable = 0; 909 ksg->ksg_cpus = cg->cg_count; 910 ksg->ksg_cpumask = cg->cg_mask; 911 LIST_INIT(&ksg->ksg_members); 912 /* 913 * Find all of the group members and add them. 914 */ 915 for (j = 0; j < MAXCPU; j++) { 916 if ((cg->cg_mask & (1 << j)) != 0) { 917 if (ksg->ksg_mask == 0) 918 ksg->ksg_mask = 1 << j; 919 kseq_cpu[j].ksq_transferable = 0; 920 kseq_cpu[j].ksq_group = ksg; 921 LIST_INSERT_HEAD(&ksg->ksg_members, 922 &kseq_cpu[j], ksq_siblings); 923 } 924 } 925 if (ksg->ksg_cpus > 1) 926 balance_groups = 1; 927 } 928 ksg_maxid = smp_topology->ct_count - 1; 929 } 930 /* 931 * Stagger the group and global load balancer so they do not 932 * interfere with each other. 933 */ 934 bal_tick = ticks + hz; 935 if (balance_groups) 936 gbal_tick = ticks + (hz / 2); 937 #else 938 kseq_setup(KSEQ_SELF()); 939 #endif 940 mtx_lock_spin(&sched_lock); 941 kseq_load_add(KSEQ_SELF(), &kse0); 942 mtx_unlock_spin(&sched_lock); 943 } 944 945 /* 946 * Scale the scheduling priority according to the "interactivity" of this 947 * process. 
 */
static void
sched_priority(struct ksegrp *kg)
{
	int pri;

	if (kg->kg_pri_class != PRI_TIMESHARE)
		return;

	pri = SCHED_PRI_INTERACT(sched_interact_score(kg));
	pri += SCHED_PRI_BASE;
	pri += kg->kg_proc->p_nice;

	if (pri > PRI_MAX_TIMESHARE)
		pri = PRI_MAX_TIMESHARE;
	else if (pri < PRI_MIN_TIMESHARE)
		pri = PRI_MIN_TIMESHARE;

	kg->kg_user_pri = pri;

	return;
}

/*
 * Calculate a time slice based on the properties of the kseg and the runq
 * that we're on.  This is only for PRI_TIMESHARE ksegrps.
 */
static void
sched_slice(struct kse *ke)
{
	struct kseq *kseq;
	struct ksegrp *kg;

	kg = ke->ke_ksegrp;
	kseq = KSEQ_CPU(ke->ke_cpu);

	/*
	 * Rationale:
	 * KSEs in interactive ksegs get a minimal slice so that we
	 * quickly notice if one abuses its advantage.
	 *
	 * KSEs in non-interactive ksegs are assigned a slice that is
	 * based on the kseg's nice value relative to the least nice kseg
	 * on the run queue for this cpu.
	 *
	 * If the KSE is less nice than all others it gets the maximum
	 * slice and other KSEs will adjust their slice relative to
	 * this when they first expire.
	 *
	 * There is a 20 point window that starts relative to the least
	 * nice kse on the run queue.  Slice size is determined by
	 * the kse distance from the least nice ksegrp.
	 *
	 * If the kse is outside of the window it will get no slice
	 * and will be reevaluated each time it is selected on the
	 * run queue.  The exception to this is nice 0 ksegs when
	 * a nice -20 is running.  They are always granted a minimum
	 * slice.
	 */
	if (!SCHED_INTERACTIVE(kg)) {
		int nice;

		nice = kg->kg_proc->p_nice + (0 - kseq->ksq_nicemin);
		if (kseq->ksq_load_timeshare == 0 ||
		    kg->kg_proc->p_nice < kseq->ksq_nicemin)
			ke->ke_slice = SCHED_SLICE_MAX;
		else if (nice <= SCHED_SLICE_NTHRESH)
			ke->ke_slice = SCHED_SLICE_NICE(nice);
		else if (kg->kg_proc->p_nice == 0)
			ke->ke_slice = SCHED_SLICE_MIN;
		else
			ke->ke_slice = 0;
	} else
		ke->ke_slice = SCHED_SLICE_INTERACTIVE;

	CTR6(KTR_ULE,
	    "Sliced %p(%d) (nice: %d, nicemin: %d, load: %d, interactive: %d)",
	    ke, ke->ke_slice, kg->kg_proc->p_nice, kseq->ksq_nicemin,
	    kseq->ksq_load_timeshare, SCHED_INTERACTIVE(kg));

	return;
}

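/*
 * Worked example of the interactivity calculation used by sched_slice()
 * above (illustrative only; it relies solely on SCHED_INTERACT_HALF = 50
 * and SCHED_INTERACT_THRESH = 30 as defined earlier):
 *
 * A ksegrp that has slept three times as long as it has run, say
 * kg_slptime = 3000 and kg_runtime = 1000, is scored by
 * sched_interact_score() as 1000 / (3000 / 50) = 16.  That is below
 * SCHED_INTERACT_THRESH, so SCHED_INTERACTIVE() is true and sched_slice()
 * grants it SCHED_SLICE_INTERACTIVE.  With the ratio reversed
 * (kg_runtime = 3000, kg_slptime = 1000) the score is
 * 50 + (50 - 1000 / (3000 / 50)) = 84, well above the threshold, so the
 * ksegrp is treated as a batch job and sliced by its nice distance from
 * ksq_nicemin.
 */
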
/*
 * This routine enforces a maximum limit on the amount of scheduling history
 * kept.  It is called after either the slptime or runtime is adjusted.
 * This routine will not operate correctly when slp or run times have been
 * adjusted to more than double their maximum.
 */
static void
sched_interact_update(struct ksegrp *kg)
{
	int sum;

	sum = kg->kg_runtime + kg->kg_slptime;
	if (sum < SCHED_SLP_RUN_MAX)
		return;
	/*
	 * If we have exceeded by more than 1/5th then the algorithm below
	 * will not bring us back into range.  Dividing by two here forces
	 * us into the range of [4/5 * SCHED_SLP_RUN_MAX, SCHED_SLP_RUN_MAX]
	 */
	if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
		kg->kg_runtime /= 2;
		kg->kg_slptime /= 2;
		return;
	}
	kg->kg_runtime = (kg->kg_runtime / 5) * 4;
	kg->kg_slptime = (kg->kg_slptime / 5) * 4;
}

static void
sched_interact_fork(struct ksegrp *kg)
{
	int ratio;
	int sum;

	sum = kg->kg_runtime + kg->kg_slptime;
	if (sum > SCHED_SLP_RUN_FORK) {
		ratio = sum / SCHED_SLP_RUN_FORK;
		kg->kg_runtime /= ratio;
		kg->kg_slptime /= ratio;
	}
}

static int
sched_interact_score(struct ksegrp *kg)
{
	int div;

	if (kg->kg_runtime > kg->kg_slptime) {
		div = max(1, kg->kg_runtime / SCHED_INTERACT_HALF);
		return (SCHED_INTERACT_HALF +
		    (SCHED_INTERACT_HALF - (kg->kg_slptime / div)));
	}
	if (kg->kg_slptime > kg->kg_runtime) {
		div = max(1, kg->kg_slptime / SCHED_INTERACT_HALF);
		return (kg->kg_runtime / div);
	}

	/*
	 * This can happen if slptime and runtime are 0.
	 */
	return (0);

}

/*
 * This is only somewhat accurate since, given many processes of the same
 * priority, they will switch when their slices run out, which will be
 * at most SCHED_SLICE_MAX.
 */
int
sched_rr_interval(void)
{
	return (SCHED_SLICE_MAX);
}

static void
sched_pctcpu_update(struct kse *ke)
{
	/*
	 * Adjust counters and watermark for pctcpu calc.
	 */
	if (ke->ke_ltick > ticks - SCHED_CPU_TICKS) {
		/*
		 * Shift the tick count out so that the divide doesn't
		 * round away our results.
		 */
		ke->ke_ticks <<= 10;
		ke->ke_ticks = (ke->ke_ticks / (ticks - ke->ke_ftick)) *
		    SCHED_CPU_TICKS;
		ke->ke_ticks >>= 10;
	} else
		ke->ke_ticks = 0;
	ke->ke_ltick = ticks;
	ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS;
}

void
sched_prio(struct thread *td, u_char prio)
{
	struct kse *ke;

	ke = td->td_kse;
	mtx_assert(&sched_lock, MA_OWNED);
	if (TD_ON_RUNQ(td)) {
		/*
		 * If the priority has been elevated due to priority
		 * propagation, we may have to move ourselves to a new
		 * queue.  We still call adjustrunqueue below in case kse
		 * needs to fix things up.
		 */
		if (prio < td->td_priority && ke &&
		    (ke->ke_flags & KEF_ASSIGNED) == 0 &&
		    ke->ke_runq != KSEQ_CPU(ke->ke_cpu)->ksq_curr) {
			runq_remove(ke->ke_runq, ke);
			ke->ke_runq = KSEQ_CPU(ke->ke_cpu)->ksq_curr;
			runq_add(ke->ke_runq, ke);
		}
		/*
		 * Hold this kse on this cpu so that sched_prio() doesn't
		 * cause excessive migration.  We only want migration to
		 * happen as the result of a wakeup.
		 */
		ke->ke_flags |= KEF_HOLD;
		adjustrunqueue(td, prio);
	} else
		td->td_priority = prio;
}

void
sched_switch(struct thread *td, struct thread *newtd)
{
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);

	ke = td->td_kse;

	td->td_last_kse = ke;
	td->td_lastcpu = td->td_oncpu;
	td->td_oncpu = NOCPU;
	td->td_flags &= ~TDF_NEEDRESCHED;
	td->td_pflags &= ~TDP_OWEPREEMPT;

	/*
	 * If the KSE has been assigned it may be in the process of switching
	 * to the new cpu.  This is the case in sched_bind().
1176 */ 1177 if ((ke->ke_flags & KEF_ASSIGNED) == 0) { 1178 if (td == PCPU_GET(idlethread)) { 1179 TD_SET_CAN_RUN(td); 1180 } else if (TD_IS_RUNNING(td)) { 1181 kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke); 1182 /* 1183 * Don't allow the kse to migrate from a preemption. 1184 */ 1185 ke->ke_flags |= KEF_HOLD; 1186 setrunqueue(td); 1187 } else { 1188 if (ke->ke_runq) { 1189 kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke); 1190 } else if ((td->td_flags & TDF_IDLETD) == 0) 1191 kdb_backtrace(); 1192 /* 1193 * We will not be on the run queue. So we must be 1194 * sleeping or similar. 1195 */ 1196 if (td->td_proc->p_flag & P_SA) 1197 kse_reassign(ke); 1198 } 1199 } 1200 if (newtd != NULL) { 1201 kseq_load_add(KSEQ_SELF(), newtd->td_kse); 1202 ke->ke_cpu = PCPU_GET(cpuid); 1203 ke->ke_runq = KSEQ_SELF()->ksq_curr; 1204 } else 1205 newtd = choosethread(); 1206 if (td != newtd) 1207 cpu_switch(td, newtd); 1208 sched_lock.mtx_lock = (uintptr_t)td; 1209 1210 td->td_oncpu = PCPU_GET(cpuid); 1211 } 1212 1213 void 1214 sched_nice(struct proc *p, int nice) 1215 { 1216 struct ksegrp *kg; 1217 struct kse *ke; 1218 struct thread *td; 1219 struct kseq *kseq; 1220 1221 PROC_LOCK_ASSERT(p, MA_OWNED); 1222 mtx_assert(&sched_lock, MA_OWNED); 1223 /* 1224 * We need to adjust the nice counts for running KSEs. 1225 */ 1226 FOREACH_KSEGRP_IN_PROC(p, kg) { 1227 if (kg->kg_pri_class == PRI_TIMESHARE) { 1228 FOREACH_KSE_IN_GROUP(kg, ke) { 1229 if (ke->ke_runq == NULL) 1230 continue; 1231 kseq = KSEQ_CPU(ke->ke_cpu); 1232 kseq_nice_rem(kseq, p->p_nice); 1233 kseq_nice_add(kseq, nice); 1234 } 1235 } 1236 } 1237 p->p_nice = nice; 1238 FOREACH_KSEGRP_IN_PROC(p, kg) { 1239 sched_priority(kg); 1240 FOREACH_THREAD_IN_GROUP(kg, td) 1241 td->td_flags |= TDF_NEEDRESCHED; 1242 } 1243 } 1244 1245 void 1246 sched_sleep(struct thread *td) 1247 { 1248 mtx_assert(&sched_lock, MA_OWNED); 1249 1250 td->td_slptime = ticks; 1251 td->td_base_pri = td->td_priority; 1252 1253 CTR2(KTR_ULE, "sleep kse %p (tick: %d)", 1254 td->td_kse, td->td_slptime); 1255 } 1256 1257 void 1258 sched_wakeup(struct thread *td) 1259 { 1260 mtx_assert(&sched_lock, MA_OWNED); 1261 1262 /* 1263 * Let the kseg know how long we slept for. This is because process 1264 * interactivity behavior is modeled in the kseg. 1265 */ 1266 if (td->td_slptime) { 1267 struct ksegrp *kg; 1268 int hzticks; 1269 1270 kg = td->td_ksegrp; 1271 hzticks = (ticks - td->td_slptime) << 10; 1272 if (hzticks >= SCHED_SLP_RUN_MAX) { 1273 kg->kg_slptime = SCHED_SLP_RUN_MAX; 1274 kg->kg_runtime = 1; 1275 } else { 1276 kg->kg_slptime += hzticks; 1277 sched_interact_update(kg); 1278 } 1279 sched_priority(kg); 1280 if (td->td_kse) 1281 sched_slice(td->td_kse); 1282 CTR2(KTR_ULE, "wakeup kse %p (%d ticks)", 1283 td->td_kse, hzticks); 1284 td->td_slptime = 0; 1285 } 1286 setrunqueue(td); 1287 } 1288 1289 /* 1290 * Penalize the parent for creating a new child and initialize the child's 1291 * priority. 1292 */ 1293 void 1294 sched_fork(struct thread *td, struct proc *p1) 1295 { 1296 1297 mtx_assert(&sched_lock, MA_OWNED); 1298 1299 p1->p_nice = td->td_proc->p_nice; 1300 sched_fork_ksegrp(td, FIRST_KSEGRP_IN_PROC(p1)); 1301 sched_fork_kse(td, FIRST_KSE_IN_PROC(p1)); 1302 sched_fork_thread(td, FIRST_THREAD_IN_PROC(p1)); 1303 } 1304 1305 void 1306 sched_fork_kse(struct thread *td, struct kse *child) 1307 { 1308 struct kse *ke = td->td_kse; 1309 1310 child->ke_slice = 1; /* Attempt to quickly learn interactivity. 
*/ 1311 child->ke_cpu = ke->ke_cpu; 1312 child->ke_runq = NULL; 1313 1314 /* Grab our parents cpu estimation information. */ 1315 child->ke_ticks = ke->ke_ticks; 1316 child->ke_ltick = ke->ke_ltick; 1317 child->ke_ftick = ke->ke_ftick; 1318 } 1319 1320 void 1321 sched_fork_ksegrp(struct thread *td, struct ksegrp *child) 1322 { 1323 struct ksegrp *kg = td->td_ksegrp; 1324 PROC_LOCK_ASSERT(child->kg_proc, MA_OWNED); 1325 1326 child->kg_slptime = kg->kg_slptime; 1327 child->kg_runtime = kg->kg_runtime; 1328 child->kg_user_pri = kg->kg_user_pri; 1329 sched_interact_fork(child); 1330 kg->kg_runtime += tickincr << 10; 1331 sched_interact_update(kg); 1332 1333 CTR6(KTR_ULE, "sched_fork_ksegrp: %d(%d, %d) - %d(%d, %d)", 1334 kg->kg_proc->p_pid, kg->kg_slptime, kg->kg_runtime, 1335 child->kg_proc->p_pid, child->kg_slptime, child->kg_runtime); 1336 } 1337 1338 void 1339 sched_fork_thread(struct thread *td, struct thread *child) 1340 { 1341 } 1342 1343 void 1344 sched_class(struct ksegrp *kg, int class) 1345 { 1346 struct kseq *kseq; 1347 struct kse *ke; 1348 int nclass; 1349 int oclass; 1350 1351 mtx_assert(&sched_lock, MA_OWNED); 1352 if (kg->kg_pri_class == class) 1353 return; 1354 1355 nclass = PRI_BASE(class); 1356 oclass = PRI_BASE(kg->kg_pri_class); 1357 FOREACH_KSE_IN_GROUP(kg, ke) { 1358 if (ke->ke_state != KES_ONRUNQ && 1359 ke->ke_state != KES_THREAD) 1360 continue; 1361 kseq = KSEQ_CPU(ke->ke_cpu); 1362 1363 #ifdef SMP 1364 /* 1365 * On SMP if we're on the RUNQ we must adjust the transferable 1366 * count because could be changing to or from an interrupt 1367 * class. 1368 */ 1369 if (ke->ke_state == KES_ONRUNQ) { 1370 if (KSE_CAN_MIGRATE(ke, oclass)) { 1371 kseq->ksq_transferable--; 1372 kseq->ksq_group->ksg_transferable--; 1373 } 1374 if (KSE_CAN_MIGRATE(ke, nclass)) { 1375 kseq->ksq_transferable++; 1376 kseq->ksq_group->ksg_transferable++; 1377 } 1378 } 1379 #endif 1380 if (oclass == PRI_TIMESHARE) { 1381 kseq->ksq_load_timeshare--; 1382 kseq_nice_rem(kseq, kg->kg_proc->p_nice); 1383 } 1384 if (nclass == PRI_TIMESHARE) { 1385 kseq->ksq_load_timeshare++; 1386 kseq_nice_add(kseq, kg->kg_proc->p_nice); 1387 } 1388 } 1389 1390 kg->kg_pri_class = class; 1391 } 1392 1393 /* 1394 * Return some of the child's priority and interactivity to the parent. 1395 */ 1396 void 1397 sched_exit(struct proc *p, struct thread *td) 1398 { 1399 mtx_assert(&sched_lock, MA_OWNED); 1400 sched_exit_kse(FIRST_KSE_IN_PROC(p), td); 1401 sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), td); 1402 } 1403 1404 void 1405 sched_exit_kse(struct kse *ke, struct thread *td) 1406 { 1407 kseq_load_rem(KSEQ_CPU(td->td_kse->ke_cpu), td->td_kse); 1408 } 1409 1410 void 1411 sched_exit_ksegrp(struct ksegrp *kg, struct thread *td) 1412 { 1413 /* kg->kg_slptime += td->td_ksegrp->kg_slptime; */ 1414 kg->kg_runtime += td->td_ksegrp->kg_runtime; 1415 sched_interact_update(kg); 1416 } 1417 1418 void 1419 sched_exit_thread(struct thread *td, struct thread *child) 1420 { 1421 } 1422 1423 void 1424 sched_clock(struct thread *td) 1425 { 1426 struct kseq *kseq; 1427 struct ksegrp *kg; 1428 struct kse *ke; 1429 1430 mtx_assert(&sched_lock, MA_OWNED); 1431 kseq = KSEQ_SELF(); 1432 #ifdef SMP 1433 if (ticks == bal_tick) 1434 sched_balance(); 1435 if (ticks == gbal_tick) 1436 sched_balance_groups(); 1437 /* 1438 * We could have been assigned a non real-time thread without an 1439 * IPI. 
1440 */ 1441 if (kseq->ksq_assigned) 1442 kseq_assign(kseq); /* Potentially sets NEEDRESCHED */ 1443 #endif 1444 /* 1445 * sched_setup() apparently happens prior to stathz being set. We 1446 * need to resolve the timers earlier in the boot so we can avoid 1447 * calculating this here. 1448 */ 1449 if (realstathz == 0) { 1450 realstathz = stathz ? stathz : hz; 1451 tickincr = hz / realstathz; 1452 /* 1453 * XXX This does not work for values of stathz that are much 1454 * larger than hz. 1455 */ 1456 if (tickincr == 0) 1457 tickincr = 1; 1458 } 1459 1460 ke = td->td_kse; 1461 kg = ke->ke_ksegrp; 1462 1463 /* Adjust ticks for pctcpu */ 1464 ke->ke_ticks++; 1465 ke->ke_ltick = ticks; 1466 1467 /* Go up to one second beyond our max and then trim back down */ 1468 if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick) 1469 sched_pctcpu_update(ke); 1470 1471 if (td->td_flags & TDF_IDLETD) 1472 return; 1473 1474 CTR4(KTR_ULE, "Tick kse %p (slice: %d, slptime: %d, runtime: %d)", 1475 ke, ke->ke_slice, kg->kg_slptime >> 10, kg->kg_runtime >> 10); 1476 /* 1477 * We only do slicing code for TIMESHARE ksegrps. 1478 */ 1479 if (kg->kg_pri_class != PRI_TIMESHARE) 1480 return; 1481 /* 1482 * We used a tick charge it to the ksegrp so that we can compute our 1483 * interactivity. 1484 */ 1485 kg->kg_runtime += tickincr << 10; 1486 sched_interact_update(kg); 1487 1488 /* 1489 * We used up one time slice. 1490 */ 1491 if (--ke->ke_slice > 0) 1492 return; 1493 /* 1494 * We're out of time, recompute priorities and requeue. 1495 */ 1496 kseq_load_rem(kseq, ke); 1497 sched_priority(kg); 1498 sched_slice(ke); 1499 if (SCHED_CURR(kg, ke)) 1500 ke->ke_runq = kseq->ksq_curr; 1501 else 1502 ke->ke_runq = kseq->ksq_next; 1503 kseq_load_add(kseq, ke); 1504 td->td_flags |= TDF_NEEDRESCHED; 1505 } 1506 1507 int 1508 sched_runnable(void) 1509 { 1510 struct kseq *kseq; 1511 int load; 1512 1513 load = 1; 1514 1515 kseq = KSEQ_SELF(); 1516 #ifdef SMP 1517 if (kseq->ksq_assigned) { 1518 mtx_lock_spin(&sched_lock); 1519 kseq_assign(kseq); 1520 mtx_unlock_spin(&sched_lock); 1521 } 1522 #endif 1523 if ((curthread->td_flags & TDF_IDLETD) != 0) { 1524 if (kseq->ksq_load > 0) 1525 goto out; 1526 } else 1527 if (kseq->ksq_load - 1 > 0) 1528 goto out; 1529 load = 0; 1530 out: 1531 return (load); 1532 } 1533 1534 void 1535 sched_userret(struct thread *td) 1536 { 1537 struct ksegrp *kg; 1538 1539 kg = td->td_ksegrp; 1540 1541 if (td->td_priority != kg->kg_user_pri) { 1542 mtx_lock_spin(&sched_lock); 1543 td->td_priority = kg->kg_user_pri; 1544 mtx_unlock_spin(&sched_lock); 1545 } 1546 } 1547 1548 struct kse * 1549 sched_choose(void) 1550 { 1551 struct kseq *kseq; 1552 struct kse *ke; 1553 1554 mtx_assert(&sched_lock, MA_OWNED); 1555 kseq = KSEQ_SELF(); 1556 #ifdef SMP 1557 restart: 1558 if (kseq->ksq_assigned) 1559 kseq_assign(kseq); 1560 #endif 1561 ke = kseq_choose(kseq); 1562 if (ke) { 1563 #ifdef SMP 1564 if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE) 1565 if (kseq_idled(kseq) == 0) 1566 goto restart; 1567 #endif 1568 kseq_runq_rem(kseq, ke); 1569 ke->ke_state = KES_THREAD; 1570 1571 if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) { 1572 CTR4(KTR_ULE, "Run kse %p from %p (slice: %d, pri: %d)", 1573 ke, ke->ke_runq, ke->ke_slice, 1574 ke->ke_thread->td_priority); 1575 } 1576 return (ke); 1577 } 1578 #ifdef SMP 1579 if (kseq_idled(kseq) == 0) 1580 goto restart; 1581 #endif 1582 return (NULL); 1583 } 1584 1585 void 1586 sched_add(struct thread *td) 1587 { 1588 1589 sched_add_internal(td, 1); 1590 } 1591 1592 static void 1593 
sched_add_internal(struct thread *td, int preemptive) 1594 { 1595 struct kseq *kseq; 1596 struct ksegrp *kg; 1597 struct kse *ke; 1598 #ifdef SMP 1599 int canmigrate; 1600 #endif 1601 int class; 1602 1603 mtx_assert(&sched_lock, MA_OWNED); 1604 ke = td->td_kse; 1605 kg = td->td_ksegrp; 1606 if (ke->ke_flags & KEF_ASSIGNED) 1607 return; 1608 kseq = KSEQ_SELF(); 1609 KASSERT((ke->ke_thread != NULL), 1610 ("sched_add: No thread on KSE")); 1611 KASSERT((ke->ke_thread->td_kse != NULL), 1612 ("sched_add: No KSE on thread")); 1613 KASSERT(ke->ke_state != KES_ONRUNQ, 1614 ("sched_add: kse %p (%s) already in run queue", ke, 1615 ke->ke_proc->p_comm)); 1616 KASSERT(ke->ke_proc->p_sflag & PS_INMEM, 1617 ("sched_add: process swapped out")); 1618 KASSERT(ke->ke_runq == NULL, 1619 ("sched_add: KSE %p is still assigned to a run queue", ke)); 1620 1621 class = PRI_BASE(kg->kg_pri_class); 1622 switch (class) { 1623 case PRI_ITHD: 1624 case PRI_REALTIME: 1625 ke->ke_runq = kseq->ksq_curr; 1626 ke->ke_slice = SCHED_SLICE_MAX; 1627 ke->ke_cpu = PCPU_GET(cpuid); 1628 break; 1629 case PRI_TIMESHARE: 1630 if (SCHED_CURR(kg, ke)) 1631 ke->ke_runq = kseq->ksq_curr; 1632 else 1633 ke->ke_runq = kseq->ksq_next; 1634 break; 1635 case PRI_IDLE: 1636 /* 1637 * This is for priority prop. 1638 */ 1639 if (ke->ke_thread->td_priority < PRI_MIN_IDLE) 1640 ke->ke_runq = kseq->ksq_curr; 1641 else 1642 ke->ke_runq = &kseq->ksq_idle; 1643 ke->ke_slice = SCHED_SLICE_MIN; 1644 break; 1645 default: 1646 panic("Unknown pri class."); 1647 break; 1648 } 1649 #ifdef SMP 1650 /* 1651 * Don't migrate running threads here. Force the long term balancer 1652 * to do it. 1653 */ 1654 canmigrate = KSE_CAN_MIGRATE(ke, class); 1655 if (ke->ke_flags & KEF_HOLD) { 1656 ke->ke_flags &= ~KEF_HOLD; 1657 canmigrate = 0; 1658 } 1659 /* 1660 * If this thread is pinned or bound, notify the target cpu. 1661 */ 1662 if (!canmigrate && ke->ke_cpu != PCPU_GET(cpuid) ) { 1663 ke->ke_runq = NULL; 1664 kseq_notify(ke, ke->ke_cpu); 1665 return; 1666 } 1667 /* 1668 * If we had been idle, clear our bit in the group and potentially 1669 * the global bitmap. If not, see if we should transfer this thread. 1670 */ 1671 if ((class == PRI_TIMESHARE || class == PRI_REALTIME) && 1672 (kseq->ksq_group->ksg_idlemask & PCPU_GET(cpumask)) != 0) { 1673 /* 1674 * Check to see if our group is unidling, and if so, remove it 1675 * from the global idle mask. 1676 */ 1677 if (kseq->ksq_group->ksg_idlemask == 1678 kseq->ksq_group->ksg_cpumask) 1679 atomic_clear_int(&kseq_idle, kseq->ksq_group->ksg_mask); 1680 /* 1681 * Now remove ourselves from the group specific idle mask. 1682 */ 1683 kseq->ksq_group->ksg_idlemask &= ~PCPU_GET(cpumask); 1684 } else if (kseq->ksq_load > 1 && canmigrate) 1685 if (kseq_transfer(kseq, ke, class)) 1686 return; 1687 ke->ke_cpu = PCPU_GET(cpuid); 1688 #endif 1689 /* 1690 * XXX With preemption this is not necessary. 1691 */ 1692 if (td->td_priority < curthread->td_priority && 1693 ke->ke_runq == kseq->ksq_curr) 1694 curthread->td_flags |= TDF_NEEDRESCHED; 1695 if (preemptive && maybe_preempt(td)) 1696 return; 1697 ke->ke_ksegrp->kg_runq_kses++; 1698 ke->ke_state = KES_ONRUNQ; 1699 1700 kseq_runq_add(kseq, ke); 1701 kseq_load_add(kseq, ke); 1702 } 1703 1704 void 1705 sched_rem(struct thread *td) 1706 { 1707 struct kseq *kseq; 1708 struct kse *ke; 1709 1710 ke = td->td_kse; 1711 /* 1712 * It is safe to just return here because sched_rem() is only ever 1713 * used in places where we're immediately going to add the 1714 * kse back on again. 
In that case it'll be added with the correct 1715 * thread and priority when the caller drops the sched_lock. 1716 */ 1717 if (ke->ke_flags & KEF_ASSIGNED) 1718 return; 1719 mtx_assert(&sched_lock, MA_OWNED); 1720 KASSERT((ke->ke_state == KES_ONRUNQ), 1721 ("sched_rem: KSE not on run queue")); 1722 1723 ke->ke_state = KES_THREAD; 1724 ke->ke_ksegrp->kg_runq_kses--; 1725 kseq = KSEQ_CPU(ke->ke_cpu); 1726 kseq_runq_rem(kseq, ke); 1727 kseq_load_rem(kseq, ke); 1728 } 1729 1730 fixpt_t 1731 sched_pctcpu(struct thread *td) 1732 { 1733 fixpt_t pctcpu; 1734 struct kse *ke; 1735 1736 pctcpu = 0; 1737 ke = td->td_kse; 1738 if (ke == NULL) 1739 return (0); 1740 1741 mtx_lock_spin(&sched_lock); 1742 if (ke->ke_ticks) { 1743 int rtick; 1744 1745 /* 1746 * Don't update more frequently than twice a second. Allowing 1747 * this causes the cpu usage to decay away too quickly due to 1748 * rounding errors. 1749 */ 1750 if (ke->ke_ftick + SCHED_CPU_TICKS < ke->ke_ltick || 1751 ke->ke_ltick < (ticks - (hz / 2))) 1752 sched_pctcpu_update(ke); 1753 /* How many rtick per second ? */ 1754 rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS); 1755 pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT; 1756 } 1757 1758 ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick; 1759 mtx_unlock_spin(&sched_lock); 1760 1761 return (pctcpu); 1762 } 1763 1764 void 1765 sched_bind(struct thread *td, int cpu) 1766 { 1767 struct kse *ke; 1768 1769 mtx_assert(&sched_lock, MA_OWNED); 1770 ke = td->td_kse; 1771 ke->ke_flags |= KEF_BOUND; 1772 #ifdef SMP 1773 if (PCPU_GET(cpuid) == cpu) 1774 return; 1775 /* sched_rem without the runq_remove */ 1776 ke->ke_state = KES_THREAD; 1777 ke->ke_ksegrp->kg_runq_kses--; 1778 kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke); 1779 kseq_notify(ke, cpu); 1780 /* When we return from mi_switch we'll be on the correct cpu. */ 1781 mi_switch(SW_VOL, NULL); 1782 #endif 1783 } 1784 1785 void 1786 sched_unbind(struct thread *td) 1787 { 1788 mtx_assert(&sched_lock, MA_OWNED); 1789 td->td_kse->ke_flags &= ~KEF_BOUND; 1790 } 1791 1792 int 1793 sched_load(void) 1794 { 1795 #ifdef SMP 1796 int total; 1797 int i; 1798 1799 total = 0; 1800 for (i = 0; i <= ksg_maxid; i++) 1801 total += KSEQ_GROUP(i)->ksg_load; 1802 return (total); 1803 #else 1804 return (KSEQ_SELF()->ksq_sysload); 1805 #endif 1806 } 1807 1808 int 1809 sched_sizeof_kse(void) 1810 { 1811 return (sizeof(struct kse) + sizeof(struct ke_sched)); 1812 } 1813 1814 int 1815 sched_sizeof_ksegrp(void) 1816 { 1817 return (sizeof(struct ksegrp) + sizeof(struct kg_sched)); 1818 } 1819 1820 int 1821 sched_sizeof_proc(void) 1822 { 1823 return (sizeof(struct proc)); 1824 } 1825 1826 int 1827 sched_sizeof_thread(void) 1828 { 1829 return (sizeof(struct thread) + sizeof(struct td_sched)); 1830 } 1831