/*-
 * Copyright (c) 2002-2003, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vmmeter.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif
#ifdef KTRACE
#include <sys/uio.h>
#include <sys/ktrace.h>
#endif

#include <machine/cpu.h>

#define	KTR_ULE	KTR_NFS

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
/* XXX This is bogus compatibility crap for ps */
static fixpt_t  ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");

static void sched_setup(void *dummy);
SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)

static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "SCHED");

static int sched_strict;
SYSCTL_INT(_kern_sched, OID_AUTO, strict, CTLFLAG_RD, &sched_strict, 0, "");

static int slice_min = 1;
SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, "");

static int slice_max = 10;
SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, "");

int realstathz;
int tickincr = 1;

#ifdef SMP
/* Callout to handle load balancing SMP systems. */
static struct callout kseq_lb_callout;
#endif

/*
 * These datastructures are allocated within their parent datastructure but
 * are scheduler specific.
 */

struct ke_sched {
	int		ske_slice;
	struct runq	*ske_runq;
	/* The following variables are only used for pctcpu calculation */
	int		ske_ltick;	/* Last tick that we were running on */
	int		ske_ftick;	/* First tick that we were running on */
	int		ske_ticks;	/* Tick count */
	/* CPU that we have affinity for. */
	u_char		ske_cpu;
};
#define	ke_slice	ke_sched->ske_slice
#define	ke_runq		ke_sched->ske_runq
#define	ke_ltick	ke_sched->ske_ltick
#define	ke_ftick	ke_sched->ske_ftick
#define	ke_ticks	ke_sched->ske_ticks
#define	ke_cpu		ke_sched->ske_cpu

struct kg_sched {
	int	skg_slptime;	/* Number of ticks we vol. slept */
	int	skg_runtime;	/* Number of ticks we were running */
};
#define	kg_slptime	kg_sched->skg_slptime
#define	kg_runtime	kg_sched->skg_runtime

struct td_sched {
	int	std_slptime;
};
#define	td_slptime	td_sched->std_slptime

struct td_sched td_sched;
struct ke_sched ke_sched;
struct kg_sched kg_sched;

struct ke_sched *kse0_sched = &ke_sched;
struct kg_sched *ksegrp0_sched = &kg_sched;
struct p_sched *proc0_sched = NULL;
struct td_sched *thread0_sched = &td_sched;

/*
 * The priority is primarily determined by the interactivity score.  Thus, we
 * give lower (better) priorities to kse groups that use less CPU.  The nice
 * value is then directly added to this to allow nice to have some effect
 * on latency.
 *
 * PRI_RANGE:	Total priority range for timeshare threads.
 * PRI_NRESV:	Number of nice values.
 * PRI_BASE:	The start of the dynamic range.
 */
#define	SCHED_PRI_RANGE		(PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
#define	SCHED_PRI_NRESV		PRIO_TOTAL
#define	SCHED_PRI_NHALF		(PRIO_TOTAL / 2)
#define	SCHED_PRI_NTHRESH	(SCHED_PRI_NHALF - 1)
#define	SCHED_PRI_BASE		(PRI_MIN_TIMESHARE)
#define	SCHED_PRI_INTERACT(score)					\
    ((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX)
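/*
 * Rough worked example of the mapping above (illustrative only; it assumes
 * the stock priority layout where the timeshare band spans
 * PRI_MIN_TIMESHARE..PRI_MAX_TIMESHARE, so SCHED_PRI_RANGE is 64, and uses
 * the SCHED_INTERACT_MAX of 100 defined below):
 *
 *	score 0   -> SCHED_PRI_INTERACT(0)   == 0,  kg_user_pri near the top
 *	score 20  -> SCHED_PRI_INTERACT(20)  == 12  (20 * 64 / 100)
 *	score 100 -> SCHED_PRI_INTERACT(100) == 64, clamped to PRI_MAX_TIMESHARE
 *
 * sched_priority() adds SCHED_PRI_BASE and the nice value and then clamps
 * the result to the timeshare range.
 */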
/*
 * These determine the interactivity of a process.
 *
 * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
 *		before throttling back.
 * SLP_RUN_THROTTLE:	Divisor for reducing slp/run time at fork time.
 * INTERACT_MAX:	Maximum interactivity value.  Smaller is better.
 * INTERACT_THRESH:	Threshold for placement on the current runq.
 */
#define	SCHED_SLP_RUN_MAX	((hz * 2) << 10)
#define	SCHED_SLP_RUN_THROTTLE	(100)
#define	SCHED_INTERACT_MAX	(100)
#define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
#define	SCHED_INTERACT_THRESH	(20)

/*
 * These parameters and macros determine the size of the time slice that is
 * granted to each thread.
 *
 * SLICE_MIN:	Minimum time slice granted, in units of ticks.
 * SLICE_MAX:	Maximum time slice granted.
 * SLICE_RANGE:	Range of available time slices scaled by hz.
 * SLICE_SCALE:	The number of slices granted per val in the range of [0, max].
 * SLICE_NICE:	Determines the amount of slice granted to a scaled nice.
 */
#define	SCHED_SLICE_MIN		(slice_min)
#define	SCHED_SLICE_MAX		(slice_max)
#define	SCHED_SLICE_RANGE	(SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1)
#define	SCHED_SLICE_SCALE(val, max)	(((val) * SCHED_SLICE_RANGE) / (max))
#define	SCHED_SLICE_NICE(nice)						\
    (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_PRI_NTHRESH))
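/*
 * Illustrative numbers only (they assume hz == 1000 and the usual nice
 * range of -20..20, so once sched_setup() has run slice_min == 10,
 * slice_max == 142 and SCHED_PRI_NTHRESH == 19):
 *
 *	SCHED_SLICE_RANGE    == 142 - 10 + 1 == 133
 *	SCHED_SLICE_NICE(0)  == 142            (least nice kse on the queue)
 *	SCHED_SLICE_NICE(10) == 142 - 10 * 133 / 19 == 72
 *	distance > NTHRESH   -> no slice at all (see sched_slice())
 *
 * The "nice" argument here is the distance from the least nice ksegrp on
 * the run queue, not the raw nice value.
 */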
/*
 * This macro determines whether or not the kse belongs on the current or
 * next run queue.
 *
 * XXX nice value should affect how interactive a kg is.
 */
#define	SCHED_INTERACTIVE(kg)						\
    (sched_interact_score(kg) < SCHED_INTERACT_THRESH)
#define	SCHED_CURR(kg, ke)						\
    (ke->ke_thread->td_priority < PRI_MIN_TIMESHARE || SCHED_INTERACTIVE(kg))

/*
 * Cpu percentage computation macros and defines.
 *
 * SCHED_CPU_TIME:	Number of seconds to average the cpu usage across.
 * SCHED_CPU_TICKS:	Number of hz ticks to average the cpu usage across.
 */

#define	SCHED_CPU_TIME	10
#define	SCHED_CPU_TICKS	(hz * SCHED_CPU_TIME)

/*
 * kseq - per processor runqs and statistics.
 */

#define	KSEQ_NCLASS	(PRI_IDLE + 1)	/* Number of run classes. */

struct kseq {
	struct runq	ksq_idle;		/* Queue of IDLE threads. */
	struct runq	ksq_timeshare[2];	/* Run queues for !IDLE. */
	struct runq	*ksq_next;		/* Next timeshare queue. */
	struct runq	*ksq_curr;		/* Current queue. */
	int		ksq_loads[KSEQ_NCLASS];	/* Load for each class */
	int		ksq_load;		/* Aggregate load. */
	short		ksq_nice[PRIO_TOTAL + 1]; /* KSEs in each nice bin. */
	short		ksq_nicemin;		/* Least nice. */
#ifdef SMP
	int		ksq_cpus;	/* Count of CPUs in this kseq. */
	unsigned int	ksq_rslices;	/* Slices on run queue */
#endif
};

/*
 * One kse queue per processor.
 */
#ifdef SMP
struct kseq	kseq_cpu[MAXCPU];
struct kseq	*kseq_idmap[MAXCPU];
#define	KSEQ_SELF()	(kseq_idmap[PCPU_GET(cpuid)])
#define	KSEQ_CPU(x)	(kseq_idmap[(x)])
#else
struct kseq	kseq_cpu;
#define	KSEQ_SELF()	(&kseq_cpu)
#define	KSEQ_CPU(x)	(&kseq_cpu)
#endif

static void sched_slice(struct kse *ke);
static void sched_priority(struct ksegrp *kg);
static int sched_interact_score(struct ksegrp *kg);
static void sched_interact_update(struct ksegrp *kg);
void sched_pctcpu_update(struct kse *ke);
int sched_pickcpu(void);

/* Operations on per processor queues */
static struct kse * kseq_choose(struct kseq *kseq, int steal);
static void kseq_setup(struct kseq *kseq);
static void kseq_add(struct kseq *kseq, struct kse *ke);
static void kseq_rem(struct kseq *kseq, struct kse *ke);
static void kseq_nice_add(struct kseq *kseq, int nice);
static void kseq_nice_rem(struct kseq *kseq, int nice);
void kseq_print(int cpu);
#ifdef SMP
struct kseq * kseq_load_highest(void);
void kseq_balance(void *arg);
void kseq_move(struct kseq *from, int cpu);
#endif

void
kseq_print(int cpu)
{
	struct kseq *kseq;
	int i;

	kseq = KSEQ_CPU(cpu);

	printf("kseq:\n");
	printf("\tload: %d\n", kseq->ksq_load);
	printf("\tload ITHD: %d\n", kseq->ksq_loads[PRI_ITHD]);
	printf("\tload REALTIME: %d\n", kseq->ksq_loads[PRI_REALTIME]);
	printf("\tload TIMESHARE: %d\n", kseq->ksq_loads[PRI_TIMESHARE]);
	printf("\tload IDLE: %d\n", kseq->ksq_loads[PRI_IDLE]);
	printf("\tnicemin:\t%d\n", kseq->ksq_nicemin);
	printf("\tnice counts:\n");
	for (i = 0; i < PRIO_TOTAL + 1; i++)
		if (kseq->ksq_nice[i])
			printf("\t\t%d = %d\n",
			    i - SCHED_PRI_NHALF, kseq->ksq_nice[i]);
}

static void
kseq_add(struct kseq *kseq, struct kse *ke)
{
	mtx_assert(&sched_lock, MA_OWNED);
	kseq->ksq_loads[PRI_BASE(ke->ke_ksegrp->kg_pri_class)]++;
	kseq->ksq_load++;
	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
		CTR6(KTR_ULE,
		    "Add kse %p to %p (slice: %d, pri: %d, nice: %d(%d))",
		    ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority,
		    ke->ke_ksegrp->kg_nice, kseq->ksq_nicemin);
	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
		kseq_nice_add(kseq, ke->ke_ksegrp->kg_nice);
#ifdef SMP
	kseq->ksq_rslices += ke->ke_slice;
#endif
}

static void
kseq_rem(struct kseq *kseq, struct kse *ke)
{
	mtx_assert(&sched_lock, MA_OWNED);
	kseq->ksq_loads[PRI_BASE(ke->ke_ksegrp->kg_pri_class)]--;
	kseq->ksq_load--;
	ke->ke_runq = NULL;
	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
		kseq_nice_rem(kseq, ke->ke_ksegrp->kg_nice);
#ifdef SMP
	kseq->ksq_rslices -= ke->ke_slice;
#endif
}

static void
kseq_nice_add(struct kseq *kseq, int nice)
{
	mtx_assert(&sched_lock, MA_OWNED);
	/* Normalize to zero. */
	kseq->ksq_nice[nice + SCHED_PRI_NHALF]++;
	if (nice < kseq->ksq_nicemin || kseq->ksq_loads[PRI_TIMESHARE] == 1)
		kseq->ksq_nicemin = nice;
}

static void
kseq_nice_rem(struct kseq *kseq, int nice)
{
	int n;

	mtx_assert(&sched_lock, MA_OWNED);
	/* Normalize to zero. */
	n = nice + SCHED_PRI_NHALF;
	kseq->ksq_nice[n]--;
	KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count."));

	/*
	 * If this wasn't the smallest nice value or there are more in
	 * this bucket we can just return.  Otherwise we have to recalculate
	 * the smallest nice.
	 */
	if (nice != kseq->ksq_nicemin ||
	    kseq->ksq_nice[n] != 0 ||
	    kseq->ksq_loads[PRI_TIMESHARE] == 0)
		return;

	for (; n < SCHED_PRI_NRESV + 1; n++)
		if (kseq->ksq_nice[n]) {
			kseq->ksq_nicemin = n - SCHED_PRI_NHALF;
			return;
		}
}
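/*
 * Example of the nicemin bookkeeping (illustrative values): with one
 * timeshare kse at nice -5 and two at nice 0, ksq_nicemin is -5 and the
 * bins at indices 15 and 20 hold counts of 1 and 2.  Removing the nice -5
 * kse empties its bin, so the loop above scans upward from index 15, finds
 * index 20 non-empty, and re-establishes ksq_nicemin as 0.
 */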
#ifdef SMP
/*
 * kseq_balance is a simple CPU load balancing algorithm.  It operates by
 * finding the least loaded and most loaded cpu and equalizing their load
 * by migrating some processes.
 *
 * Dealing only with two CPUs at a time has two advantages.  Firstly, most
 * installations will only have 2 cpus.  Secondly, load balancing too much at
 * once can have an unpleasant effect on the system.  The scheduler rarely has
 * enough information to make perfect decisions.  So this algorithm chooses
 * simplicity and more gradual effects on load in larger systems.
 *
 * It could be improved by considering the priorities and slices assigned to
 * each task prior to balancing them.  There are many pathological cases with
 * any approach and so the semi random algorithm below may work as well as any.
 *
 */
void
kseq_balance(void *arg)
{
	struct kseq *kseq;
	int high_load;
	int low_load;
	int high_cpu;
	int low_cpu;
	int move;
	int diff;
	int i;

	high_cpu = 0;
	low_cpu = 0;
	high_load = 0;
	low_load = -1;

	mtx_lock_spin(&sched_lock);
	if (smp_started == 0)
		goto out;

	for (i = 0; i < mp_maxid; i++) {
		if (CPU_ABSENT(i) || (i & stopped_cpus) != 0)
			continue;
		kseq = KSEQ_CPU(i);
		if (kseq->ksq_load > high_load) {
			high_load = kseq->ksq_load;
			high_cpu = i;
		}
		if (low_load == -1 || kseq->ksq_load < low_load) {
			low_load = kseq->ksq_load;
			low_cpu = i;
		}
	}

	kseq = KSEQ_CPU(high_cpu);

	/*
	 * Nothing to do.
	 */
	if (high_load < kseq->ksq_cpus + 1)
		goto out;

	high_load -= kseq->ksq_cpus;

	if (low_load >= high_load)
		goto out;

	diff = high_load - low_load;
	move = diff / 2;
	if (diff & 0x1)
		move++;

	for (i = 0; i < move; i++)
		kseq_move(kseq, low_cpu);

out:
	mtx_unlock_spin(&sched_lock);
	callout_reset(&kseq_lb_callout, hz, kseq_balance, NULL);

	return;
}
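/*
 * Worked example of the transfer size above (numbers are illustrative):
 * with single-CPU kseqs, a busiest queue of load 5 and an idlest queue of
 * load 1, high_load becomes 5 - 1 == 4, diff is 4 - 1 == 3 and move is
 * rounded up to 2, leaving both queues at a load of 3 once kseq_move()
 * has run twice.
 */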
struct kseq *
kseq_load_highest(void)
{
	struct kseq *kseq;
	int load;
	int cpu;
	int i;

	mtx_assert(&sched_lock, MA_OWNED);
	cpu = 0;
	load = 0;

	for (i = 0; i < mp_maxid; i++) {
		if (CPU_ABSENT(i) || (i & stopped_cpus) != 0)
			continue;
		kseq = KSEQ_CPU(i);
		if (kseq->ksq_load > load) {
			load = kseq->ksq_load;
			cpu = i;
		}
	}
	kseq = KSEQ_CPU(cpu);

	if (load > kseq->ksq_cpus)
		return (kseq);

	return (NULL);
}

void
kseq_move(struct kseq *from, int cpu)
{
	struct kse *ke;

	ke = kseq_choose(from, 1);
	runq_remove(ke->ke_runq, ke);
	ke->ke_state = KES_THREAD;
	kseq_rem(from, ke);

	ke->ke_cpu = cpu;
	sched_add(ke);
}
#endif

/*
 * Pick the highest priority task we have and return it.  If steal is 1 we
 * will return kses that have been denied slices due to their nice being too
 * low.  In the future we should prohibit stealing interrupt threads as well.
 */

struct kse *
kseq_choose(struct kseq *kseq, int steal)
{
	struct kse *ke;
	struct runq *swap;

	mtx_assert(&sched_lock, MA_OWNED);
	swap = NULL;

	for (;;) {
		ke = runq_choose(kseq->ksq_curr);
		if (ke == NULL) {
			/*
			 * We already swapped once and didn't get anywhere.
			 */
			if (swap)
				break;
			swap = kseq->ksq_curr;
			kseq->ksq_curr = kseq->ksq_next;
			kseq->ksq_next = swap;
			continue;
		}
		/*
		 * If we encounter a slice of 0 the kse is in a
		 * TIMESHARE kse group and its nice was too far out
		 * of the range that receives slices.
		 */
		if (ke->ke_slice == 0 && steal == 0) {
			runq_remove(ke->ke_runq, ke);
			sched_slice(ke);
			ke->ke_runq = kseq->ksq_next;
			runq_add(ke->ke_runq, ke);
			continue;
		}
		return (ke);
	}

	return (runq_choose(&kseq->ksq_idle));
}

static void
kseq_setup(struct kseq *kseq)
{
	runq_init(&kseq->ksq_timeshare[0]);
	runq_init(&kseq->ksq_timeshare[1]);
	runq_init(&kseq->ksq_idle);

	kseq->ksq_curr = &kseq->ksq_timeshare[0];
	kseq->ksq_next = &kseq->ksq_timeshare[1];

	kseq->ksq_loads[PRI_ITHD] = 0;
	kseq->ksq_loads[PRI_REALTIME] = 0;
	kseq->ksq_loads[PRI_TIMESHARE] = 0;
	kseq->ksq_loads[PRI_IDLE] = 0;
	kseq->ksq_load = 0;
#ifdef SMP
	kseq->ksq_rslices = 0;
#endif
}

static void
sched_setup(void *dummy)
{
#ifdef SMP
	int i;
#endif

	slice_min = (hz/100);	/* 10ms */
	slice_max = (hz/7);	/* ~140ms */

#ifdef SMP
	/* init kseqs */
	/* Create the idmap. */
#ifdef ULE_HTT_EXPERIMENTAL
	if (smp_topology == NULL) {
#else
	if (1) {
#endif
		for (i = 0; i < MAXCPU; i++) {
			kseq_setup(&kseq_cpu[i]);
			kseq_idmap[i] = &kseq_cpu[i];
			kseq_cpu[i].ksq_cpus = 1;
		}
	} else {
		int j;

		for (i = 0; i < smp_topology->ct_count; i++) {
			struct cpu_group *cg;

			cg = &smp_topology->ct_group[i];
			kseq_setup(&kseq_cpu[i]);

			for (j = 0; j < MAXCPU; j++)
				if ((cg->cg_mask & (1 << j)) != 0)
					kseq_idmap[j] = &kseq_cpu[i];
			kseq_cpu[i].ksq_cpus = cg->cg_count;
		}
	}
	callout_init(&kseq_lb_callout, CALLOUT_MPSAFE);
	kseq_balance(NULL);
#else
	kseq_setup(KSEQ_SELF());
#endif
	mtx_lock_spin(&sched_lock);
	kseq_add(KSEQ_SELF(), &kse0);
	mtx_unlock_spin(&sched_lock);
}

/*
 * Scale the scheduling priority according to the "interactivity" of this
 * process.
 */
static void
sched_priority(struct ksegrp *kg)
{
	int pri;

	if (kg->kg_pri_class != PRI_TIMESHARE)
		return;

	pri = SCHED_PRI_INTERACT(sched_interact_score(kg));
	pri += SCHED_PRI_BASE;
	pri += kg->kg_nice;

	if (pri > PRI_MAX_TIMESHARE)
		pri = PRI_MAX_TIMESHARE;
	else if (pri < PRI_MIN_TIMESHARE)
		pri = PRI_MIN_TIMESHARE;

	kg->kg_user_pri = pri;

	return;
}

/*
 * Calculate a time slice based on the properties of the kseg and the runq
 * that we're on.  This is only for PRI_TIMESHARE ksegrps.
 */
static void
sched_slice(struct kse *ke)
{
	struct kseq *kseq;
	struct ksegrp *kg;

	kg = ke->ke_ksegrp;
	kseq = KSEQ_CPU(ke->ke_cpu);

	/*
	 * Rationale:
	 * KSEs in interactive ksegs get the minimum slice so that we
	 * quickly notice if it abuses its advantage.
	 *
	 * KSEs in non-interactive ksegs are assigned a slice that is
	 * based on the ksegs nice value relative to the least nice kseg
	 * on the run queue for this cpu.
	 *
	 * If the KSE is less nice than all others it gets the maximum
	 * slice and other KSEs will adjust their slice relative to
	 * this when they first expire.
	 *
	 * There is a 20 point window that starts relative to the least
	 * nice kse on the run queue.  Slice size is determined by
	 * the kse distance from the least nice ksegrp.
	 *
	 * If you are outside of the window you will get no slice and
	 * you will be reevaluated each time you are selected on the
	 * run queue.
	 */

	if (!SCHED_INTERACTIVE(kg)) {
		int nice;

		nice = kg->kg_nice + (0 - kseq->ksq_nicemin);
		if (kseq->ksq_loads[PRI_TIMESHARE] == 0 ||
		    kg->kg_nice < kseq->ksq_nicemin)
			ke->ke_slice = SCHED_SLICE_MAX;
		else if (nice <= SCHED_PRI_NTHRESH)
			ke->ke_slice = SCHED_SLICE_NICE(nice);
		else
			ke->ke_slice = 0;
	} else
		ke->ke_slice = SCHED_SLICE_MIN;

	CTR6(KTR_ULE,
	    "Sliced %p(%d) (nice: %d, nicemin: %d, load: %d, interactive: %d)",
	    ke, ke->ke_slice, kg->kg_nice, kseq->ksq_nicemin,
	    kseq->ksq_loads[PRI_TIMESHARE], SCHED_INTERACTIVE(kg));

	/*
	 * Check to see if we need to scale back the slp and run time
	 * in the kg.  This will cause us to forget old interactivity
	 * while maintaining the current ratio.
	 */
	sched_interact_update(kg);

	return;
}
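/*
 * Illustrative slice assignments (same assumptions as the macro examples
 * above: hz == 1000, nice range -20..20): with ksq_nicemin == -5, a kseg
 * at nice -5 is the least nice and gets the maximum slice of 142 ticks, a
 * kseg at nice 5 is 10 points away and gets SCHED_SLICE_NICE(10) == 72
 * ticks, and a kseg at nice 15 is 20 points away, beyond
 * SCHED_PRI_NTHRESH, so it gets a slice of 0 and is revisited by
 * kseq_choose().
 */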
static void
sched_interact_update(struct ksegrp *kg)
{
	/* XXX Fixme, use a linear algorithm and not a while loop. */
	while ((kg->kg_runtime + kg->kg_slptime) > SCHED_SLP_RUN_MAX) {
		kg->kg_runtime = (kg->kg_runtime / 5) * 4;
		kg->kg_slptime = (kg->kg_slptime / 5) * 4;
	}
}
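/*
 * Example of the decay above (illustrative, hz == 1000 gives a
 * SCHED_SLP_RUN_MAX of 2048000): with kg_runtime == 1500000 and
 * kg_slptime == 600000 the sum (2100000) exceeds the cap, so one pass
 * scales both by 4/5 to 1200000 and 480000.  The sum (1680000) is now
 * under the cap and the run/sleep ratio is unchanged, which is what keeps
 * the interactivity score stable while old history decays.
 */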
static int
sched_interact_score(struct ksegrp *kg)
{
	int div;

	if (kg->kg_runtime > kg->kg_slptime) {
		div = max(1, kg->kg_runtime / SCHED_INTERACT_HALF);
		return (SCHED_INTERACT_HALF +
		    (SCHED_INTERACT_HALF - (kg->kg_slptime / div)));
	} else if (kg->kg_slptime > kg->kg_runtime) {
		div = max(1, kg->kg_slptime / SCHED_INTERACT_HALF);
		return (kg->kg_runtime / div);
	}

	/*
	 * This can happen if slptime and runtime are 0.
	 */
	return (0);
}
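/*
 * Rough examples of the score (units cancel; the values are illustrative):
 * a kseg that slept three times as long as it ran, say kg_slptime == 30000
 * and kg_runtime == 10000, scores 10000 / (30000 / 50) == 16, which is
 * under SCHED_INTERACT_THRESH and therefore interactive.  With the values
 * swapped the score is 50 + (50 - 10000 / (30000 / 50)) == 84, well into
 * the non-interactive half of the range.
 */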
/*
 * This is only somewhat accurate since given many processes of the same
 * priority they will switch when their slices run out, which will be
 * at most SCHED_SLICE_MAX.
 */
int
sched_rr_interval(void)
{
	return (SCHED_SLICE_MAX);
}

void
sched_pctcpu_update(struct kse *ke)
{
	/*
	 * Adjust counters and watermark for pctcpu calc.
	 */
	if (ke->ke_ltick > ticks - SCHED_CPU_TICKS) {
		/*
		 * Shift the tick count out so that the divide doesn't
		 * round away our results.
		 */
		ke->ke_ticks <<= 10;
		ke->ke_ticks = (ke->ke_ticks / (ticks - ke->ke_ftick)) *
			    SCHED_CPU_TICKS;
		ke->ke_ticks >>= 10;
	} else
		ke->ke_ticks = 0;
	ke->ke_ltick = ticks;
	ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS;
}
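/*
 * The rescaling above effectively computes
 * ke_ticks * SCHED_CPU_TICKS / (ticks - ke_ftick) in fixed point.  As an
 * illustrative example with hz == 1000 (SCHED_CPU_TICKS == 10000), a kse
 * that accumulated 3000 ticks over the last 15000 hz ticks is trimmed to
 * roughly 2000 ticks, i.e. the same rate expressed over the trailing
 * ten-second window that sched_pctcpu() reports on.
 */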
#ifdef SMP
/* XXX Should be changed to kseq_load_lowest() */
int
sched_pickcpu(void)
{
	struct kseq *kseq;
	int load;
	int cpu;
	int i;

	mtx_assert(&sched_lock, MA_OWNED);
	if (!smp_started)
		return (0);

	load = 0;
	cpu = 0;

	for (i = 0; i < mp_maxid; i++) {
		if (CPU_ABSENT(i) || (i & stopped_cpus) != 0)
			continue;
		kseq = KSEQ_CPU(i);
		if (kseq->ksq_load < load) {
			cpu = i;
			load = kseq->ksq_load;
		}
	}

	CTR1(KTR_RUNQ, "sched_pickcpu: %d", cpu);
	return (cpu);
}
#else
int
sched_pickcpu(void)
{
	return (0);
}
#endif

void
sched_prio(struct thread *td, u_char prio)
{

	mtx_assert(&sched_lock, MA_OWNED);
	if (TD_ON_RUNQ(td)) {
		adjustrunqueue(td, prio);
	} else {
		td->td_priority = prio;
	}
}

void
sched_switchout(struct thread *td)
{
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);

	ke = td->td_kse;

	td->td_last_kse = ke;
	td->td_lastcpu = td->td_oncpu;
	td->td_oncpu = NOCPU;
	td->td_flags &= ~TDF_NEEDRESCHED;

	if (TD_IS_RUNNING(td)) {
		if (td->td_proc->p_flag & P_SA) {
			kseq_rem(KSEQ_CPU(ke->ke_cpu), ke);
			setrunqueue(td);
		} else {
			/*
			 * This queue is always correct except for idle threads
			 * which have a higher priority due to priority
			 * propagation.
			 */
			if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE &&
			    ke->ke_thread->td_priority > PRI_MIN_IDLE)
				ke->ke_runq = KSEQ_SELF()->ksq_curr;
			runq_add(ke->ke_runq, ke);
			/* setrunqueue(td); */
		}
		return;
	}
	if (ke->ke_runq)
		kseq_rem(KSEQ_CPU(ke->ke_cpu), ke);
	/*
	 * We will not be on the run queue.  So we must be
	 * sleeping or similar.
	 */
	if (td->td_proc->p_flag & P_SA)
		kse_reassign(ke);
}

void
sched_switchin(struct thread *td)
{
	/* struct kse *ke = td->td_kse; */
	mtx_assert(&sched_lock, MA_OWNED);

	td->td_oncpu = PCPU_GET(cpuid);
}

void
sched_nice(struct ksegrp *kg, int nice)
{
	struct kse *ke;
	struct thread *td;
	struct kseq *kseq;

	PROC_LOCK_ASSERT(kg->kg_proc, MA_OWNED);
	mtx_assert(&sched_lock, MA_OWNED);
	/*
	 * We need to adjust the nice counts for running KSEs.
	 */
	if (kg->kg_pri_class == PRI_TIMESHARE)
		FOREACH_KSE_IN_GROUP(kg, ke) {
			if (ke->ke_runq == NULL)
				continue;
			kseq = KSEQ_CPU(ke->ke_cpu);
			kseq_nice_rem(kseq, kg->kg_nice);
			kseq_nice_add(kseq, nice);
		}
	kg->kg_nice = nice;
	sched_priority(kg);
	FOREACH_THREAD_IN_GROUP(kg, td)
		td->td_flags |= TDF_NEEDRESCHED;
}

void
sched_sleep(struct thread *td, u_char prio)
{
	mtx_assert(&sched_lock, MA_OWNED);

	td->td_slptime = ticks;
	td->td_priority = prio;

	CTR2(KTR_ULE, "sleep kse %p (tick: %d)",
	    td->td_kse, td->td_slptime);
}

void
sched_wakeup(struct thread *td)
{
	mtx_assert(&sched_lock, MA_OWNED);

	/*
	 * Let the kseg know how long we slept for.  This is because process
	 * interactivity behavior is modeled in the kseg.
	 */
	if (td->td_slptime) {
		struct ksegrp *kg;
		int hzticks;

		kg = td->td_ksegrp;
		hzticks = ticks - td->td_slptime;
		kg->kg_slptime += hzticks << 10;
		sched_interact_update(kg);
		sched_priority(kg);
		if (td->td_kse)
			sched_slice(td->td_kse);
		CTR2(KTR_ULE, "wakeup kse %p (%d ticks)",
		    td->td_kse, hzticks);
		td->td_slptime = 0;
	}
	setrunqueue(td);
	if (td->td_priority < curthread->td_priority)
		curthread->td_flags |= TDF_NEEDRESCHED;
}

/*
 * Penalize the parent for creating a new child and initialize the child's
 * priority.
 */
void
sched_fork(struct proc *p, struct proc *p1)
{

	mtx_assert(&sched_lock, MA_OWNED);

	sched_fork_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1));
	sched_fork_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1));
	sched_fork_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1));
}

void
sched_fork_kse(struct kse *ke, struct kse *child)
{

	child->ke_slice = 1;	/* Attempt to quickly learn interactivity. */
	child->ke_cpu = ke->ke_cpu; /* sched_pickcpu(); */
	child->ke_runq = NULL;

	/*
	 * Claim that we've been running for one second for statistical
	 * purposes.
	 */
	child->ke_ticks = 0;
	child->ke_ltick = ticks;
	child->ke_ftick = ticks - hz;
}

void
sched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child)
{

	PROC_LOCK_ASSERT(child->kg_proc, MA_OWNED);
	/* XXX Need something better here */

	child->kg_slptime = kg->kg_slptime / SCHED_SLP_RUN_THROTTLE;
	child->kg_runtime = kg->kg_runtime / SCHED_SLP_RUN_THROTTLE;
	kg->kg_runtime += tickincr << 10;
	sched_interact_update(kg);

	child->kg_user_pri = kg->kg_user_pri;
	child->kg_nice = kg->kg_nice;
}
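/*
 * The divide by SCHED_SLP_RUN_THROTTLE (100) above means a child inherits
 * only one percent of its parent's sleep and run history.  For example, a
 * parent with kg_runtime == 1000000 and kg_slptime == 500000 hands the
 * child 10000 and 5000, so the child starts nearly neutral and has to earn
 * its own interactivity, while the parent is charged one extra tick of run
 * time for the fork.
 */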
void
sched_fork_thread(struct thread *td, struct thread *child)
{
}

void
sched_class(struct ksegrp *kg, int class)
{
	struct kseq *kseq;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	if (kg->kg_pri_class == class)
		return;

	FOREACH_KSE_IN_GROUP(kg, ke) {
		if (ke->ke_state != KES_ONRUNQ &&
		    ke->ke_state != KES_THREAD)
			continue;
		kseq = KSEQ_CPU(ke->ke_cpu);

		kseq->ksq_loads[PRI_BASE(kg->kg_pri_class)]--;
		kseq->ksq_loads[PRI_BASE(class)]++;

		if (kg->kg_pri_class == PRI_TIMESHARE)
			kseq_nice_rem(kseq, kg->kg_nice);
		else if (class == PRI_TIMESHARE)
			kseq_nice_add(kseq, kg->kg_nice);
	}

	kg->kg_pri_class = class;
}

/*
 * Return some of the child's priority and interactivity to the parent.
 */
void
sched_exit(struct proc *p, struct proc *child)
{
	/* XXX Need something better here */
	mtx_assert(&sched_lock, MA_OWNED);
	sched_exit_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(child));
	sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(child));
}

void
sched_exit_kse(struct kse *ke, struct kse *child)
{
	kseq_rem(KSEQ_CPU(child->ke_cpu), child);
}

void
sched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child)
{
	/* kg->kg_slptime += child->kg_slptime; */
	kg->kg_runtime += child->kg_runtime;
	sched_interact_update(kg);
}

void
sched_exit_thread(struct thread *td, struct thread *child)
{
}

void
sched_clock(struct kse *ke)
{
	struct kseq *kseq;
	struct ksegrp *kg;
	struct thread *td;
#if 0
	struct kse *nke;
#endif

	/*
	 * sched_setup() apparently happens prior to stathz being set.  We
	 * need to resolve the timers earlier in the boot so we can avoid
	 * calculating this here.
	 */
	if (realstathz == 0) {
		realstathz = stathz ? stathz : hz;
		tickincr = hz / realstathz;
		/*
		 * XXX This does not work for values of stathz that are much
		 * larger than hz.
		 */
		if (tickincr == 0)
			tickincr = 1;
	}

	td = ke->ke_thread;
	kg = ke->ke_ksegrp;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((td != NULL), ("schedclock: null thread pointer"));

	/* Adjust ticks for pctcpu */
	ke->ke_ticks++;
	ke->ke_ltick = ticks;

	/* Go up to one second beyond our max and then trim back down */
	if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick)
		sched_pctcpu_update(ke);

	if (td->td_flags & TDF_IDLETD)
		return;

	CTR4(KTR_ULE, "Tick kse %p (slice: %d, slptime: %d, runtime: %d)",
	    ke, ke->ke_slice, kg->kg_slptime >> 10, kg->kg_runtime >> 10);

	/*
	 * We only do slicing code for TIMESHARE ksegrps.
	 */
	if (kg->kg_pri_class != PRI_TIMESHARE)
		return;
	/*
	 * Check for a higher priority task on the run queue.  This can happen
	 * on SMP if another processor woke up a process on our runq.
	 */
	kseq = KSEQ_SELF();
#if 0
	if (kseq->ksq_load > 1 && (nke = kseq_choose(kseq, 0)) != NULL) {
		if (sched_strict &&
		    nke->ke_thread->td_priority < td->td_priority)
			td->td_flags |= TDF_NEEDRESCHED;
		else if (nke->ke_thread->td_priority <
		    td->td_priority SCHED_PRIO_SLOP)

		if (nke->ke_thread->td_priority < td->td_priority)
			td->td_flags |= TDF_NEEDRESCHED;
	}
#endif
	/*
	 * We used a tick; charge it to the ksegrp so that we can compute our
	 * interactivity.
	 */
	kg->kg_runtime += tickincr << 10;
	sched_interact_update(kg);

	/*
	 * We used up one time slice.
	 */
	ke->ke_slice--;
#ifdef SMP
	kseq->ksq_rslices--;
#endif

	if (ke->ke_slice > 0)
		return;
	/*
	 * We're out of time, recompute priorities and requeue.
	 */
	kseq_rem(kseq, ke);
	sched_priority(kg);
	sched_slice(ke);
	if (SCHED_CURR(kg, ke))
		ke->ke_runq = kseq->ksq_curr;
	else
		ke->ke_runq = kseq->ksq_next;
	kseq_add(kseq, ke);
	td->td_flags |= TDF_NEEDRESCHED;
}
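/*
 * On the run/sleep accounting above: kg_runtime and kg_slptime are kept in
 * hz ticks shifted left by 10 bits.  As a rough illustration, assuming
 * hz == 1000 and stathz == 128, tickincr is 1000 / 128 == 7, so each call
 * here charges 7 << 10 to kg_runtime; roughly 128 such calls per second
 * then add about (hz << 10) worth of run time, matching the hzticks << 10
 * that sched_wakeup() adds for sleep time.
 */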
int
sched_runnable(void)
{
	struct kseq *kseq;
	int load;

	load = 1;

	mtx_lock_spin(&sched_lock);
	kseq = KSEQ_SELF();

	if (kseq->ksq_load)
		goto out;
#ifdef SMP
	/*
	 * For SMP we may steal other processor's KSEs.  Just search until we
	 * verify that at least one other cpu has a runnable task.
	 */
	if (smp_started) {
		int i;

		for (i = 0; i < mp_maxid; i++) {
			if (CPU_ABSENT(i) || (i & stopped_cpus) != 0)
				continue;
			kseq = KSEQ_CPU(i);
			if (kseq->ksq_load > kseq->ksq_cpus)
				goto out;
		}
	}
#endif
	load = 0;
out:
	mtx_unlock_spin(&sched_lock);
	return (load);
}

void
sched_userret(struct thread *td)
{
	struct ksegrp *kg;
	struct kseq *kseq;
	struct kse *ke;

	kg = td->td_ksegrp;

	if (td->td_priority != kg->kg_user_pri) {
		mtx_lock_spin(&sched_lock);
		td->td_priority = kg->kg_user_pri;
		kseq = KSEQ_SELF();
		if (td->td_ksegrp->kg_pri_class == PRI_TIMESHARE &&
#ifdef SMP
		    kseq->ksq_load > kseq->ksq_cpus &&
#else
		    kseq->ksq_load > 1 &&
#endif
		    (ke = kseq_choose(kseq, 0)) != NULL &&
		    ke->ke_thread->td_priority < td->td_priority)
			curthread->td_flags |= TDF_NEEDRESCHED;
		mtx_unlock_spin(&sched_lock);
	}
}

struct kse *
sched_choose(void)
{
	struct kseq *kseq;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
#ifdef SMP
retry:
#endif
	kseq = KSEQ_SELF();
	ke = kseq_choose(kseq, 0);
	if (ke) {
		runq_remove(ke->ke_runq, ke);
		ke->ke_state = KES_THREAD;

		if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) {
			CTR4(KTR_ULE, "Run kse %p from %p (slice: %d, pri: %d)",
			    ke, ke->ke_runq, ke->ke_slice,
			    ke->ke_thread->td_priority);
		}
		return (ke);
	}

#ifdef SMP
	if (smp_started) {
		/*
		 * Find the cpu with the highest load and steal one proc.
		 */
		if ((kseq = kseq_load_highest()) == NULL)
			return (NULL);

		/*
		 * Remove this kse from this kseq and runq and then requeue
		 * on the current processor.  Then we will dequeue it
		 * normally above.
		 */
		kseq_move(kseq, PCPU_GET(cpuid));
		goto retry;
	}
#endif

	return (NULL);
}

void
sched_add(struct kse *ke)
{
	struct kseq *kseq;
	struct ksegrp *kg;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((ke->ke_thread != NULL), ("sched_add: No thread on KSE"));
	KASSERT((ke->ke_thread->td_kse != NULL),
	    ("sched_add: No KSE on thread"));
	KASSERT(ke->ke_state != KES_ONRUNQ,
	    ("sched_add: kse %p (%s) already in run queue", ke,
	    ke->ke_proc->p_comm));
	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
	    ("sched_add: process swapped out"));
	KASSERT(ke->ke_runq == NULL,
	    ("sched_add: KSE %p is still assigned to a run queue", ke));

	kg = ke->ke_ksegrp;

	switch (PRI_BASE(kg->kg_pri_class)) {
	case PRI_ITHD:
	case PRI_REALTIME:
		kseq = KSEQ_SELF();
		ke->ke_runq = kseq->ksq_curr;
		ke->ke_slice = SCHED_SLICE_MAX;
		ke->ke_cpu = PCPU_GET(cpuid);
		break;
	case PRI_TIMESHARE:
		kseq = KSEQ_CPU(ke->ke_cpu);
		if (SCHED_CURR(kg, ke))
			ke->ke_runq = kseq->ksq_curr;
		else
			ke->ke_runq = kseq->ksq_next;
		break;
	case PRI_IDLE:
		kseq = KSEQ_CPU(ke->ke_cpu);
		/*
		 * This is for priority prop.
		 */
		if (ke->ke_thread->td_priority > PRI_MIN_IDLE)
			ke->ke_runq = kseq->ksq_curr;
		else
			ke->ke_runq = &kseq->ksq_idle;
		ke->ke_slice = SCHED_SLICE_MIN;
		break;
	default:
		panic("Unknown pri class.\n");
		break;
	}

	ke->ke_ksegrp->kg_runq_kses++;
	ke->ke_state = KES_ONRUNQ;

	runq_add(ke->ke_runq, ke);
	kseq_add(kseq, ke);
}

void
sched_rem(struct kse *ke)
{
	struct kseq *kseq;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue"));

	ke->ke_state = KES_THREAD;
	ke->ke_ksegrp->kg_runq_kses--;
	kseq = KSEQ_CPU(ke->ke_cpu);
	runq_remove(ke->ke_runq, ke);
	kseq_rem(kseq, ke);
}

fixpt_t
sched_pctcpu(struct kse *ke)
{
	fixpt_t pctcpu;

	pctcpu = 0;

	mtx_lock_spin(&sched_lock);
	if (ke->ke_ticks) {
		int rtick;

		/*
		 * Don't update more frequently than twice a second.  Allowing
		 * this causes the cpu usage to decay away too quickly due to
		 * rounding errors.
		 */
		if (ke->ke_ltick < (ticks - (hz / 2)))
			sched_pctcpu_update(ke);
		/* How many rtick per second ? */
		rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS);
		pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT;
	}

	ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick;
	mtx_unlock_spin(&sched_lock);

	return (pctcpu);
}

int
sched_sizeof_kse(void)
{
	return (sizeof(struct kse) + sizeof(struct ke_sched));
}

int
sched_sizeof_ksegrp(void)
{
	return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
}

int
sched_sizeof_proc(void)
{
	return (sizeof(struct proc));
}

int
sched_sizeof_thread(void)
{
	return (sizeof(struct thread) + sizeof(struct td_sched));
}