1 /*- 2 * Copyright (c) 2002-2003, Jeffrey Roberson <jeff@freebsd.org> 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice unmodified, this list of conditions, and the following 10 * disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27 #include <sys/cdefs.h> 28 __FBSDID("$FreeBSD$"); 29 30 #include <sys/param.h> 31 #include <sys/systm.h> 32 #include <sys/kernel.h> 33 #include <sys/ktr.h> 34 #include <sys/lock.h> 35 #include <sys/mutex.h> 36 #include <sys/proc.h> 37 #include <sys/resource.h> 38 #include <sys/sched.h> 39 #include <sys/smp.h> 40 #include <sys/sx.h> 41 #include <sys/sysctl.h> 42 #include <sys/sysproto.h> 43 #include <sys/vmmeter.h> 44 #ifdef DDB 45 #include <ddb/ddb.h> 46 #endif 47 #ifdef KTRACE 48 #include <sys/uio.h> 49 #include <sys/ktrace.h> 50 #endif 51 52 #include <machine/cpu.h> 53 54 #define KTR_ULE KTR_NFS 55 56 /* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */ 57 /* XXX This is bogus compatability crap for ps */ 58 static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ 59 SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); 60 61 static void sched_setup(void *dummy); 62 SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL) 63 64 static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "SCHED"); 65 66 static int sched_strict; 67 SYSCTL_INT(_kern_sched, OID_AUTO, strict, CTLFLAG_RD, &sched_strict, 0, ""); 68 69 static int slice_min = 1; 70 SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, ""); 71 72 static int slice_max = 10; 73 SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, ""); 74 75 int realstathz; 76 int tickincr = 1; 77 78 #ifdef SMP 79 /* Callout to handle load balancing SMP systems. */ 80 static struct callout kseq_lb_callout; 81 #endif 82 83 /* 84 * These datastructures are allocated within their parent datastructure but 85 * are scheduler specific. 86 */ 87 88 struct ke_sched { 89 int ske_slice; 90 struct runq *ske_runq; 91 /* The following variables are only used for pctcpu calculation */ 92 int ske_ltick; /* Last tick that we were running on */ 93 int ske_ftick; /* First tick that we were running on */ 94 int ske_ticks; /* Tick count */ 95 /* CPU that we have affinity for. */ 96 u_char ske_cpu; 97 }; 98 #define ke_slice ke_sched->ske_slice 99 #define ke_runq ke_sched->ske_runq 100 #define ke_ltick ke_sched->ske_ltick 101 #define ke_ftick ke_sched->ske_ftick 102 #define ke_ticks ke_sched->ske_ticks 103 #define ke_cpu ke_sched->ske_cpu 104 105 struct kg_sched { 106 int skg_slptime; /* Number of ticks we vol. slept */ 107 int skg_runtime; /* Number of ticks we were running */ 108 }; 109 #define kg_slptime kg_sched->skg_slptime 110 #define kg_runtime kg_sched->skg_runtime 111 112 struct td_sched { 113 int std_slptime; 114 }; 115 #define td_slptime td_sched->std_slptime 116 117 struct td_sched td_sched; 118 struct ke_sched ke_sched; 119 struct kg_sched kg_sched; 120 121 struct ke_sched *kse0_sched = &ke_sched; 122 struct kg_sched *ksegrp0_sched = &kg_sched; 123 struct p_sched *proc0_sched = NULL; 124 struct td_sched *thread0_sched = &td_sched; 125 126 /* 127 * The priority is primarily determined by the interactivity score. Thus, we 128 * give lower(better) priorities to kse groups that use less CPU. The nice 129 * value is then directly added to this to allow nice to have some effect 130 * on latency. 131 * 132 * PRI_RANGE: Total priority range for timeshare threads. 133 * PRI_NRESV: Number of nice values. 134 * PRI_BASE: The start of the dynamic range. 135 */ 136 #define SCHED_PRI_RANGE (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1) 137 #define SCHED_PRI_NRESV PRIO_TOTAL 138 #define SCHED_PRI_NHALF (PRIO_TOTAL / 2) 139 #define SCHED_PRI_NTHRESH (SCHED_PRI_NHALF - 1) 140 #define SCHED_PRI_BASE (PRI_MIN_TIMESHARE) 141 #define SCHED_PRI_INTERACT(score) \ 142 ((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX) 143 144 /* 145 * These determine the interactivity of a process. 146 * 147 * SLP_RUN_MAX: Maximum amount of sleep time + run time we'll accumulate 148 * before throttling back. 149 * SLP_RUN_THROTTLE: Divisor for reducing slp/run time at fork time. 150 * INTERACT_MAX: Maximum interactivity value. Smaller is better. 151 * INTERACT_THRESH: Threshhold for placement on the current runq. 152 */ 153 #define SCHED_SLP_RUN_MAX ((hz * 2) << 10) 154 #define SCHED_SLP_RUN_THROTTLE (100) 155 #define SCHED_INTERACT_MAX (100) 156 #define SCHED_INTERACT_HALF (SCHED_INTERACT_MAX / 2) 157 #define SCHED_INTERACT_THRESH (20) 158 159 /* 160 * These parameters and macros determine the size of the time slice that is 161 * granted to each thread. 162 * 163 * SLICE_MIN: Minimum time slice granted, in units of ticks. 164 * SLICE_MAX: Maximum time slice granted. 165 * SLICE_RANGE: Range of available time slices scaled by hz. 166 * SLICE_SCALE: The number slices granted per val in the range of [0, max]. 167 * SLICE_NICE: Determine the amount of slice granted to a scaled nice. 168 */ 169 #define SCHED_SLICE_MIN (slice_min) 170 #define SCHED_SLICE_MAX (slice_max) 171 #define SCHED_SLICE_RANGE (SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1) 172 #define SCHED_SLICE_SCALE(val, max) (((val) * SCHED_SLICE_RANGE) / (max)) 173 #define SCHED_SLICE_NICE(nice) \ 174 (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_PRI_NTHRESH)) 175 176 /* 177 * This macro determines whether or not the kse belongs on the current or 178 * next run queue. 179 * 180 * XXX nice value should effect how interactive a kg is. 181 */ 182 #define SCHED_INTERACTIVE(kg) \ 183 (sched_interact_score(kg) < SCHED_INTERACT_THRESH) 184 #define SCHED_CURR(kg, ke) \ 185 (ke->ke_thread->td_priority < PRI_MIN_TIMESHARE || SCHED_INTERACTIVE(kg)) 186 187 /* 188 * Cpu percentage computation macros and defines. 189 * 190 * SCHED_CPU_TIME: Number of seconds to average the cpu usage across. 191 * SCHED_CPU_TICKS: Number of hz ticks to average the cpu usage across. 192 */ 193 194 #define SCHED_CPU_TIME 10 195 #define SCHED_CPU_TICKS (hz * SCHED_CPU_TIME) 196 197 /* 198 * kseq - per processor runqs and statistics. 199 */ 200 201 #define KSEQ_NCLASS (PRI_IDLE + 1) /* Number of run classes. */ 202 203 struct kseq { 204 struct runq ksq_idle; /* Queue of IDLE threads. */ 205 struct runq ksq_timeshare[2]; /* Run queues for !IDLE. */ 206 struct runq *ksq_next; /* Next timeshare queue. */ 207 struct runq *ksq_curr; /* Current queue. */ 208 int ksq_loads[KSEQ_NCLASS]; /* Load for each class */ 209 int ksq_load; /* Aggregate load. */ 210 short ksq_nice[PRIO_TOTAL + 1]; /* KSEs in each nice bin. */ 211 short ksq_nicemin; /* Least nice. */ 212 #ifdef SMP 213 unsigned int ksq_rslices; /* Slices on run queue */ 214 #endif 215 }; 216 217 /* 218 * One kse queue per processor. 219 */ 220 #ifdef SMP 221 struct kseq kseq_cpu[MAXCPU]; 222 #define KSEQ_SELF() (&kseq_cpu[PCPU_GET(cpuid)]) 223 #define KSEQ_CPU(x) (&kseq_cpu[(x)]) 224 #else 225 struct kseq kseq_cpu; 226 #define KSEQ_SELF() (&kseq_cpu) 227 #define KSEQ_CPU(x) (&kseq_cpu) 228 #endif 229 230 static void sched_slice(struct kse *ke); 231 static void sched_priority(struct ksegrp *kg); 232 static int sched_interact_score(struct ksegrp *kg); 233 static void sched_interact_update(struct ksegrp *kg); 234 void sched_pctcpu_update(struct kse *ke); 235 int sched_pickcpu(void); 236 237 /* Operations on per processor queues */ 238 static struct kse * kseq_choose(struct kseq *kseq); 239 static void kseq_setup(struct kseq *kseq); 240 static void kseq_add(struct kseq *kseq, struct kse *ke); 241 static void kseq_rem(struct kseq *kseq, struct kse *ke); 242 static void kseq_nice_add(struct kseq *kseq, int nice); 243 static void kseq_nice_rem(struct kseq *kseq, int nice); 244 void kseq_print(int cpu); 245 #ifdef SMP 246 struct kseq * kseq_load_highest(void); 247 void kseq_balance(void *arg); 248 void kseq_move(struct kseq *from, int cpu); 249 #endif 250 251 void 252 kseq_print(int cpu) 253 { 254 struct kseq *kseq; 255 int i; 256 257 kseq = KSEQ_CPU(cpu); 258 259 printf("kseq:\n"); 260 printf("\tload: %d\n", kseq->ksq_load); 261 printf("\tload ITHD: %d\n", kseq->ksq_loads[PRI_ITHD]); 262 printf("\tload REALTIME: %d\n", kseq->ksq_loads[PRI_REALTIME]); 263 printf("\tload TIMESHARE: %d\n", kseq->ksq_loads[PRI_TIMESHARE]); 264 printf("\tload IDLE: %d\n", kseq->ksq_loads[PRI_IDLE]); 265 printf("\tnicemin:\t%d\n", kseq->ksq_nicemin); 266 printf("\tnice counts:\n"); 267 for (i = 0; i < PRIO_TOTAL + 1; i++) 268 if (kseq->ksq_nice[i]) 269 printf("\t\t%d = %d\n", 270 i - SCHED_PRI_NHALF, kseq->ksq_nice[i]); 271 } 272 273 static void 274 kseq_add(struct kseq *kseq, struct kse *ke) 275 { 276 mtx_assert(&sched_lock, MA_OWNED); 277 kseq->ksq_loads[PRI_BASE(ke->ke_ksegrp->kg_pri_class)]++; 278 kseq->ksq_load++; 279 if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) 280 CTR6(KTR_ULE, "Add kse %p to %p (slice: %d, pri: %d, nice: %d(%d))", 281 ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority, 282 ke->ke_ksegrp->kg_nice, kseq->ksq_nicemin); 283 if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) 284 kseq_nice_add(kseq, ke->ke_ksegrp->kg_nice); 285 #ifdef SMP 286 kseq->ksq_rslices += ke->ke_slice; 287 #endif 288 } 289 290 static void 291 kseq_rem(struct kseq *kseq, struct kse *ke) 292 { 293 mtx_assert(&sched_lock, MA_OWNED); 294 kseq->ksq_loads[PRI_BASE(ke->ke_ksegrp->kg_pri_class)]--; 295 kseq->ksq_load--; 296 ke->ke_runq = NULL; 297 if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) 298 kseq_nice_rem(kseq, ke->ke_ksegrp->kg_nice); 299 #ifdef SMP 300 kseq->ksq_rslices -= ke->ke_slice; 301 #endif 302 } 303 304 static void 305 kseq_nice_add(struct kseq *kseq, int nice) 306 { 307 mtx_assert(&sched_lock, MA_OWNED); 308 /* Normalize to zero. */ 309 kseq->ksq_nice[nice + SCHED_PRI_NHALF]++; 310 if (nice < kseq->ksq_nicemin || kseq->ksq_loads[PRI_TIMESHARE] == 1) 311 kseq->ksq_nicemin = nice; 312 } 313 314 static void 315 kseq_nice_rem(struct kseq *kseq, int nice) 316 { 317 int n; 318 319 mtx_assert(&sched_lock, MA_OWNED); 320 /* Normalize to zero. */ 321 n = nice + SCHED_PRI_NHALF; 322 kseq->ksq_nice[n]--; 323 KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count.")); 324 325 /* 326 * If this wasn't the smallest nice value or there are more in 327 * this bucket we can just return. Otherwise we have to recalculate 328 * the smallest nice. 329 */ 330 if (nice != kseq->ksq_nicemin || 331 kseq->ksq_nice[n] != 0 || 332 kseq->ksq_loads[PRI_TIMESHARE] == 0) 333 return; 334 335 for (; n < SCHED_PRI_NRESV + 1; n++) 336 if (kseq->ksq_nice[n]) { 337 kseq->ksq_nicemin = n - SCHED_PRI_NHALF; 338 return; 339 } 340 } 341 342 #ifdef SMP 343 /* 344 * kseq_balance is a simple CPU load balancing algorithm. It operates by 345 * finding the least loaded and most loaded cpu and equalizing their load 346 * by migrating some processes. 347 * 348 * Dealing only with two CPUs at a time has two advantages. Firstly, most 349 * installations will only have 2 cpus. Secondly, load balancing too much at 350 * once can have an unpleasant effect on the system. The scheduler rarely has 351 * enough information to make perfect decisions. So this algorithm chooses 352 * algorithm simplicity and more gradual effects on load in larger systems. 353 * 354 * It could be improved by considering the priorities and slices assigned to 355 * each task prior to balancing them. There are many pathological cases with 356 * any approach and so the semi random algorithm below may work as well as any. 357 * 358 */ 359 void 360 kseq_balance(void *arg) 361 { 362 struct kseq *kseq; 363 int high_load; 364 int low_load; 365 int high_cpu; 366 int low_cpu; 367 int move; 368 int diff; 369 int i; 370 371 high_cpu = 0; 372 low_cpu = 0; 373 high_load = 0; 374 low_load = -1; 375 376 mtx_lock_spin(&sched_lock); 377 if (smp_started == 0) 378 goto out; 379 380 for (i = 0; i < mp_maxid; i++) { 381 if (CPU_ABSENT(i) || (i & stopped_cpus) != 0) 382 continue; 383 kseq = KSEQ_CPU(i); 384 if (kseq->ksq_load > high_load) { 385 high_load = kseq->ksq_load; 386 high_cpu = i; 387 } 388 if (low_load == -1 || kseq->ksq_load < low_load) { 389 low_load = kseq->ksq_load; 390 low_cpu = i; 391 } 392 } 393 394 /* 395 * Nothing to do. 396 */ 397 if (high_load < 2 || low_load == high_load) 398 goto out; 399 400 diff = high_load - low_load; 401 move = diff / 2; 402 if (diff & 0x1) 403 move++; 404 405 for (i = 0; i < move; i++) 406 kseq_move(KSEQ_CPU(high_cpu), low_cpu); 407 408 out: 409 mtx_unlock_spin(&sched_lock); 410 callout_reset(&kseq_lb_callout, hz, kseq_balance, NULL); 411 412 return; 413 } 414 415 struct kseq * 416 kseq_load_highest(void) 417 { 418 struct kseq *kseq; 419 int load; 420 int cpu; 421 int i; 422 423 mtx_assert(&sched_lock, MA_OWNED); 424 cpu = 0; 425 load = 0; 426 427 for (i = 0; i < mp_maxid; i++) { 428 if (CPU_ABSENT(i) || (i & stopped_cpus) != 0) 429 continue; 430 kseq = KSEQ_CPU(i); 431 if (kseq->ksq_load > load) { 432 load = kseq->ksq_load; 433 cpu = i; 434 } 435 } 436 if (load > 1) 437 return (KSEQ_CPU(cpu)); 438 439 return (NULL); 440 } 441 442 void 443 kseq_move(struct kseq *from, int cpu) 444 { 445 struct kse *ke; 446 447 ke = kseq_choose(from); 448 runq_remove(ke->ke_runq, ke); 449 ke->ke_state = KES_THREAD; 450 kseq_rem(from, ke); 451 452 ke->ke_cpu = cpu; 453 sched_add(ke); 454 } 455 #endif 456 457 struct kse * 458 kseq_choose(struct kseq *kseq) 459 { 460 struct kse *ke; 461 struct runq *swap; 462 463 mtx_assert(&sched_lock, MA_OWNED); 464 swap = NULL; 465 466 for (;;) { 467 ke = runq_choose(kseq->ksq_curr); 468 if (ke == NULL) { 469 /* 470 * We already swaped once and didn't get anywhere. 471 */ 472 if (swap) 473 break; 474 swap = kseq->ksq_curr; 475 kseq->ksq_curr = kseq->ksq_next; 476 kseq->ksq_next = swap; 477 continue; 478 } 479 /* 480 * If we encounter a slice of 0 the kse is in a 481 * TIMESHARE kse group and its nice was too far out 482 * of the range that receives slices. 483 */ 484 if (ke->ke_slice == 0) { 485 runq_remove(ke->ke_runq, ke); 486 sched_slice(ke); 487 ke->ke_runq = kseq->ksq_next; 488 runq_add(ke->ke_runq, ke); 489 continue; 490 } 491 return (ke); 492 } 493 494 return (runq_choose(&kseq->ksq_idle)); 495 } 496 497 static void 498 kseq_setup(struct kseq *kseq) 499 { 500 runq_init(&kseq->ksq_timeshare[0]); 501 runq_init(&kseq->ksq_timeshare[1]); 502 runq_init(&kseq->ksq_idle); 503 504 kseq->ksq_curr = &kseq->ksq_timeshare[0]; 505 kseq->ksq_next = &kseq->ksq_timeshare[1]; 506 507 kseq->ksq_loads[PRI_ITHD] = 0; 508 kseq->ksq_loads[PRI_REALTIME] = 0; 509 kseq->ksq_loads[PRI_TIMESHARE] = 0; 510 kseq->ksq_loads[PRI_IDLE] = 0; 511 kseq->ksq_load = 0; 512 #ifdef SMP 513 kseq->ksq_rslices = 0; 514 #endif 515 } 516 517 static void 518 sched_setup(void *dummy) 519 { 520 int i; 521 522 slice_min = (hz/100); /* 10ms */ 523 slice_max = (hz/7); /* ~140ms */ 524 525 mtx_lock_spin(&sched_lock); 526 /* init kseqs */ 527 for (i = 0; i < MAXCPU; i++) 528 kseq_setup(KSEQ_CPU(i)); 529 530 kseq_add(KSEQ_SELF(), &kse0); 531 mtx_unlock_spin(&sched_lock); 532 #ifdef SMP 533 callout_init(&kseq_lb_callout, 1); 534 kseq_balance(NULL); 535 #endif 536 } 537 538 /* 539 * Scale the scheduling priority according to the "interactivity" of this 540 * process. 541 */ 542 static void 543 sched_priority(struct ksegrp *kg) 544 { 545 int pri; 546 547 if (kg->kg_pri_class != PRI_TIMESHARE) 548 return; 549 550 pri = SCHED_PRI_INTERACT(sched_interact_score(kg)); 551 pri += SCHED_PRI_BASE; 552 pri += kg->kg_nice; 553 554 if (pri > PRI_MAX_TIMESHARE) 555 pri = PRI_MAX_TIMESHARE; 556 else if (pri < PRI_MIN_TIMESHARE) 557 pri = PRI_MIN_TIMESHARE; 558 559 kg->kg_user_pri = pri; 560 561 return; 562 } 563 564 /* 565 * Calculate a time slice based on the properties of the kseg and the runq 566 * that we're on. This is only for PRI_TIMESHARE ksegrps. 567 */ 568 static void 569 sched_slice(struct kse *ke) 570 { 571 struct kseq *kseq; 572 struct ksegrp *kg; 573 574 kg = ke->ke_ksegrp; 575 kseq = KSEQ_CPU(ke->ke_cpu); 576 577 /* 578 * Rationale: 579 * KSEs in interactive ksegs get the minimum slice so that we 580 * quickly notice if it abuses its advantage. 581 * 582 * KSEs in non-interactive ksegs are assigned a slice that is 583 * based on the ksegs nice value relative to the least nice kseg 584 * on the run queue for this cpu. 585 * 586 * If the KSE is less nice than all others it gets the maximum 587 * slice and other KSEs will adjust their slice relative to 588 * this when they first expire. 589 * 590 * There is 20 point window that starts relative to the least 591 * nice kse on the run queue. Slice size is determined by 592 * the kse distance from the last nice ksegrp. 593 * 594 * If you are outside of the window you will get no slice and 595 * you will be reevaluated each time you are selected on the 596 * run queue. 597 * 598 */ 599 600 if (!SCHED_INTERACTIVE(kg)) { 601 int nice; 602 603 nice = kg->kg_nice + (0 - kseq->ksq_nicemin); 604 if (kseq->ksq_loads[PRI_TIMESHARE] == 0 || 605 kg->kg_nice < kseq->ksq_nicemin) 606 ke->ke_slice = SCHED_SLICE_MAX; 607 else if (nice <= SCHED_PRI_NTHRESH) 608 ke->ke_slice = SCHED_SLICE_NICE(nice); 609 else 610 ke->ke_slice = 0; 611 } else 612 ke->ke_slice = SCHED_SLICE_MIN; 613 614 CTR6(KTR_ULE, 615 "Sliced %p(%d) (nice: %d, nicemin: %d, load: %d, interactive: %d)", 616 ke, ke->ke_slice, kg->kg_nice, kseq->ksq_nicemin, 617 kseq->ksq_loads[PRI_TIMESHARE], SCHED_INTERACTIVE(kg)); 618 619 /* 620 * Check to see if we need to scale back the slp and run time 621 * in the kg. This will cause us to forget old interactivity 622 * while maintaining the current ratio. 623 */ 624 sched_interact_update(kg); 625 626 return; 627 } 628 629 static void 630 sched_interact_update(struct ksegrp *kg) 631 { 632 /* XXX Fixme, use a linear algorithm and not a while loop. */ 633 while ((kg->kg_runtime + kg->kg_slptime) > SCHED_SLP_RUN_MAX) { 634 kg->kg_runtime = (kg->kg_runtime / 5) * 4; 635 kg->kg_slptime = (kg->kg_slptime / 5) * 4; 636 } 637 } 638 639 static int 640 sched_interact_score(struct ksegrp *kg) 641 { 642 int div; 643 644 if (kg->kg_runtime > kg->kg_slptime) { 645 div = max(1, kg->kg_runtime / SCHED_INTERACT_HALF); 646 return (SCHED_INTERACT_HALF + 647 (SCHED_INTERACT_HALF - (kg->kg_slptime / div))); 648 } if (kg->kg_slptime > kg->kg_runtime) { 649 div = max(1, kg->kg_slptime / SCHED_INTERACT_HALF); 650 return (kg->kg_runtime / div); 651 } 652 653 /* 654 * This can happen if slptime and runtime are 0. 655 */ 656 return (0); 657 658 } 659 660 /* 661 * This is only somewhat accurate since given many processes of the same 662 * priority they will switch when their slices run out, which will be 663 * at most SCHED_SLICE_MAX. 664 */ 665 int 666 sched_rr_interval(void) 667 { 668 return (SCHED_SLICE_MAX); 669 } 670 671 void 672 sched_pctcpu_update(struct kse *ke) 673 { 674 /* 675 * Adjust counters and watermark for pctcpu calc. 676 */ 677 678 /* 679 * Shift the tick count out so that the divide doesn't round away 680 * our results. 681 */ 682 ke->ke_ticks <<= 10; 683 ke->ke_ticks = (ke->ke_ticks / (ke->ke_ltick - ke->ke_ftick)) * 684 SCHED_CPU_TICKS; 685 ke->ke_ticks >>= 10; 686 ke->ke_ltick = ticks; 687 ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS; 688 } 689 690 #ifdef SMP 691 /* XXX Should be changed to kseq_load_lowest() */ 692 int 693 sched_pickcpu(void) 694 { 695 struct kseq *kseq; 696 int load; 697 int cpu; 698 int i; 699 700 mtx_assert(&sched_lock, MA_OWNED); 701 if (!smp_started) 702 return (0); 703 704 load = 0; 705 cpu = 0; 706 707 for (i = 0; i < mp_maxid; i++) { 708 if (CPU_ABSENT(i) || (i & stopped_cpus) != 0) 709 continue; 710 kseq = KSEQ_CPU(i); 711 if (kseq->ksq_load < load) { 712 cpu = i; 713 load = kseq->ksq_load; 714 } 715 } 716 717 CTR1(KTR_RUNQ, "sched_pickcpu: %d", cpu); 718 return (cpu); 719 } 720 #else 721 int 722 sched_pickcpu(void) 723 { 724 return (0); 725 } 726 #endif 727 728 void 729 sched_prio(struct thread *td, u_char prio) 730 { 731 struct kse *ke; 732 struct runq *rq; 733 734 mtx_assert(&sched_lock, MA_OWNED); 735 ke = td->td_kse; 736 td->td_priority = prio; 737 738 if (TD_ON_RUNQ(td)) { 739 rq = ke->ke_runq; 740 741 runq_remove(rq, ke); 742 runq_add(rq, ke); 743 } 744 } 745 746 void 747 sched_switchout(struct thread *td) 748 { 749 struct kse *ke; 750 751 mtx_assert(&sched_lock, MA_OWNED); 752 753 ke = td->td_kse; 754 755 td->td_last_kse = ke; 756 td->td_lastcpu = td->td_oncpu; 757 td->td_oncpu = NOCPU; 758 td->td_flags &= ~TDF_NEEDRESCHED; 759 760 if (TD_IS_RUNNING(td)) { 761 /* 762 * This queue is always correct except for idle threads which 763 * have a higher priority due to priority propagation. 764 */ 765 if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE && 766 ke->ke_thread->td_priority > PRI_MIN_IDLE) 767 ke->ke_runq = KSEQ_SELF()->ksq_curr; 768 runq_add(ke->ke_runq, ke); 769 /* setrunqueue(td); */ 770 return; 771 } 772 if (ke->ke_runq) 773 kseq_rem(KSEQ_CPU(ke->ke_cpu), ke); 774 /* 775 * We will not be on the run queue. So we must be 776 * sleeping or similar. 777 */ 778 if (td->td_proc->p_flag & P_SA) 779 kse_reassign(ke); 780 } 781 782 void 783 sched_switchin(struct thread *td) 784 { 785 /* struct kse *ke = td->td_kse; */ 786 mtx_assert(&sched_lock, MA_OWNED); 787 788 td->td_oncpu = PCPU_GET(cpuid); 789 } 790 791 void 792 sched_nice(struct ksegrp *kg, int nice) 793 { 794 struct kse *ke; 795 struct thread *td; 796 struct kseq *kseq; 797 798 PROC_LOCK_ASSERT(kg->kg_proc, MA_OWNED); 799 mtx_assert(&sched_lock, MA_OWNED); 800 /* 801 * We need to adjust the nice counts for running KSEs. 802 */ 803 if (kg->kg_pri_class == PRI_TIMESHARE) 804 FOREACH_KSE_IN_GROUP(kg, ke) { 805 if (ke->ke_runq == NULL) 806 continue; 807 kseq = KSEQ_CPU(ke->ke_cpu); 808 kseq_nice_rem(kseq, kg->kg_nice); 809 kseq_nice_add(kseq, nice); 810 } 811 kg->kg_nice = nice; 812 sched_priority(kg); 813 FOREACH_THREAD_IN_GROUP(kg, td) 814 td->td_flags |= TDF_NEEDRESCHED; 815 } 816 817 void 818 sched_sleep(struct thread *td, u_char prio) 819 { 820 mtx_assert(&sched_lock, MA_OWNED); 821 822 td->td_slptime = ticks; 823 td->td_priority = prio; 824 825 CTR2(KTR_ULE, "sleep kse %p (tick: %d)", 826 td->td_kse, td->td_slptime); 827 } 828 829 void 830 sched_wakeup(struct thread *td) 831 { 832 mtx_assert(&sched_lock, MA_OWNED); 833 834 /* 835 * Let the kseg know how long we slept for. This is because process 836 * interactivity behavior is modeled in the kseg. 837 */ 838 if (td->td_slptime) { 839 struct ksegrp *kg; 840 int hzticks; 841 842 kg = td->td_ksegrp; 843 hzticks = ticks - td->td_slptime; 844 kg->kg_slptime += hzticks << 10; 845 sched_interact_update(kg); 846 sched_priority(kg); 847 if (td->td_kse) 848 sched_slice(td->td_kse); 849 CTR2(KTR_ULE, "wakeup kse %p (%d ticks)", 850 td->td_kse, hzticks); 851 td->td_slptime = 0; 852 } 853 setrunqueue(td); 854 if (td->td_priority < curthread->td_priority) 855 curthread->td_flags |= TDF_NEEDRESCHED; 856 } 857 858 /* 859 * Penalize the parent for creating a new child and initialize the child's 860 * priority. 861 */ 862 void 863 sched_fork(struct proc *p, struct proc *p1) 864 { 865 866 mtx_assert(&sched_lock, MA_OWNED); 867 868 sched_fork_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1)); 869 sched_fork_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1)); 870 sched_fork_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1)); 871 } 872 873 void 874 sched_fork_kse(struct kse *ke, struct kse *child) 875 { 876 877 child->ke_slice = 1; /* Attempt to quickly learn interactivity. */ 878 child->ke_cpu = ke->ke_cpu; /* sched_pickcpu(); */ 879 child->ke_runq = NULL; 880 881 /* 882 * Claim that we've been running for one second for statistical 883 * purposes. 884 */ 885 child->ke_ticks = 0; 886 child->ke_ltick = ticks; 887 child->ke_ftick = ticks - hz; 888 } 889 890 void 891 sched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child) 892 { 893 894 PROC_LOCK_ASSERT(child->kg_proc, MA_OWNED); 895 /* XXX Need something better here */ 896 897 child->kg_slptime = kg->kg_slptime / SCHED_SLP_RUN_THROTTLE; 898 child->kg_runtime = kg->kg_runtime / SCHED_SLP_RUN_THROTTLE; 899 kg->kg_runtime += tickincr << 10; 900 sched_interact_update(kg); 901 902 child->kg_user_pri = kg->kg_user_pri; 903 child->kg_nice = kg->kg_nice; 904 } 905 906 void 907 sched_fork_thread(struct thread *td, struct thread *child) 908 { 909 } 910 911 void 912 sched_class(struct ksegrp *kg, int class) 913 { 914 struct kseq *kseq; 915 struct kse *ke; 916 917 mtx_assert(&sched_lock, MA_OWNED); 918 if (kg->kg_pri_class == class) 919 return; 920 921 FOREACH_KSE_IN_GROUP(kg, ke) { 922 if (ke->ke_state != KES_ONRUNQ && 923 ke->ke_state != KES_THREAD) 924 continue; 925 kseq = KSEQ_CPU(ke->ke_cpu); 926 927 kseq->ksq_loads[PRI_BASE(kg->kg_pri_class)]--; 928 kseq->ksq_loads[PRI_BASE(class)]++; 929 930 if (kg->kg_pri_class == PRI_TIMESHARE) 931 kseq_nice_rem(kseq, kg->kg_nice); 932 else if (class == PRI_TIMESHARE) 933 kseq_nice_add(kseq, kg->kg_nice); 934 } 935 936 kg->kg_pri_class = class; 937 } 938 939 /* 940 * Return some of the child's priority and interactivity to the parent. 941 */ 942 void 943 sched_exit(struct proc *p, struct proc *child) 944 { 945 /* XXX Need something better here */ 946 mtx_assert(&sched_lock, MA_OWNED); 947 sched_exit_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(child)); 948 sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(child)); 949 } 950 951 void 952 sched_exit_kse(struct kse *ke, struct kse *child) 953 { 954 kseq_rem(KSEQ_CPU(child->ke_cpu), child); 955 } 956 957 void 958 sched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child) 959 { 960 /* kg->kg_slptime += child->kg_slptime; */ 961 kg->kg_runtime += child->kg_runtime; 962 sched_interact_update(kg); 963 } 964 965 void 966 sched_exit_thread(struct thread *td, struct thread *child) 967 { 968 } 969 970 void 971 sched_clock(struct kse *ke) 972 { 973 struct kseq *kseq; 974 struct ksegrp *kg; 975 struct thread *td; 976 #if 0 977 struct kse *nke; 978 #endif 979 980 /* 981 * sched_setup() apparently happens prior to stathz being set. We 982 * need to resolve the timers earlier in the boot so we can avoid 983 * calculating this here. 984 */ 985 if (realstathz == 0) { 986 realstathz = stathz ? stathz : hz; 987 tickincr = hz / realstathz; 988 /* 989 * XXX This does not work for values of stathz that are much 990 * larger than hz. 991 */ 992 if (tickincr == 0) 993 tickincr = 1; 994 } 995 996 td = ke->ke_thread; 997 kg = ke->ke_ksegrp; 998 999 mtx_assert(&sched_lock, MA_OWNED); 1000 KASSERT((td != NULL), ("schedclock: null thread pointer")); 1001 1002 /* Adjust ticks for pctcpu */ 1003 ke->ke_ticks++; 1004 ke->ke_ltick = ticks; 1005 1006 /* Go up to one second beyond our max and then trim back down */ 1007 if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick) 1008 sched_pctcpu_update(ke); 1009 1010 if (td->td_flags & TDF_IDLETD) 1011 return; 1012 1013 CTR4(KTR_ULE, "Tick kse %p (slice: %d, slptime: %d, runtime: %d)", 1014 ke, ke->ke_slice, kg->kg_slptime >> 10, kg->kg_runtime >> 10); 1015 1016 /* 1017 * We only do slicing code for TIMESHARE ksegrps. 1018 */ 1019 if (kg->kg_pri_class != PRI_TIMESHARE) 1020 return; 1021 /* 1022 * Check for a higher priority task on the run queue. This can happen 1023 * on SMP if another processor woke up a process on our runq. 1024 */ 1025 kseq = KSEQ_SELF(); 1026 #if 0 1027 if (kseq->ksq_load > 1 && (nke = kseq_choose(kseq)) != NULL) { 1028 if (sched_strict && 1029 nke->ke_thread->td_priority < td->td_priority) 1030 td->td_flags |= TDF_NEEDRESCHED; 1031 else if (nke->ke_thread->td_priority < 1032 td->td_priority SCHED_PRIO_SLOP) 1033 1034 if (nke->ke_thread->td_priority < td->td_priority) 1035 td->td_flags |= TDF_NEEDRESCHED; 1036 } 1037 #endif 1038 /* 1039 * We used a tick charge it to the ksegrp so that we can compute our 1040 * interactivity. 1041 */ 1042 kg->kg_runtime += tickincr << 10; 1043 sched_interact_update(kg); 1044 1045 /* 1046 * We used up one time slice. 1047 */ 1048 ke->ke_slice--; 1049 #ifdef SMP 1050 kseq->ksq_rslices--; 1051 #endif 1052 1053 if (ke->ke_slice > 0) 1054 return; 1055 /* 1056 * We're out of time, recompute priorities and requeue. 1057 */ 1058 kseq_rem(kseq, ke); 1059 sched_priority(kg); 1060 sched_slice(ke); 1061 if (SCHED_CURR(kg, ke)) 1062 ke->ke_runq = kseq->ksq_curr; 1063 else 1064 ke->ke_runq = kseq->ksq_next; 1065 kseq_add(kseq, ke); 1066 td->td_flags |= TDF_NEEDRESCHED; 1067 } 1068 1069 int 1070 sched_runnable(void) 1071 { 1072 struct kseq *kseq; 1073 int load; 1074 1075 load = 1; 1076 1077 mtx_lock_spin(&sched_lock); 1078 kseq = KSEQ_SELF(); 1079 1080 if (kseq->ksq_load) 1081 goto out; 1082 #ifdef SMP 1083 /* 1084 * For SMP we may steal other processor's KSEs. Just search until we 1085 * verify that at least on other cpu has a runnable task. 1086 */ 1087 if (smp_started) { 1088 int i; 1089 1090 for (i = 0; i < mp_maxid; i++) { 1091 if (CPU_ABSENT(i) || (i & stopped_cpus) != 0) 1092 continue; 1093 kseq = KSEQ_CPU(i); 1094 if (kseq->ksq_load > 1) 1095 goto out; 1096 } 1097 } 1098 #endif 1099 load = 0; 1100 out: 1101 mtx_unlock_spin(&sched_lock); 1102 return (load); 1103 } 1104 1105 void 1106 sched_userret(struct thread *td) 1107 { 1108 struct ksegrp *kg; 1109 struct kseq *kseq; 1110 struct kse *ke; 1111 1112 kg = td->td_ksegrp; 1113 1114 if (td->td_priority != kg->kg_user_pri) { 1115 mtx_lock_spin(&sched_lock); 1116 td->td_priority = kg->kg_user_pri; 1117 kseq = KSEQ_SELF(); 1118 if (td->td_ksegrp->kg_pri_class == PRI_TIMESHARE && 1119 kseq->ksq_load > 1 && 1120 (ke = kseq_choose(kseq)) != NULL && 1121 ke->ke_thread->td_priority < td->td_priority) 1122 curthread->td_flags |= TDF_NEEDRESCHED; 1123 mtx_unlock_spin(&sched_lock); 1124 } 1125 } 1126 1127 struct kse * 1128 sched_choose(void) 1129 { 1130 struct kseq *kseq; 1131 struct kse *ke; 1132 1133 mtx_assert(&sched_lock, MA_OWNED); 1134 #ifdef SMP 1135 retry: 1136 #endif 1137 kseq = KSEQ_SELF(); 1138 ke = kseq_choose(kseq); 1139 if (ke) { 1140 runq_remove(ke->ke_runq, ke); 1141 ke->ke_state = KES_THREAD; 1142 1143 if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) { 1144 CTR4(KTR_ULE, "Run kse %p from %p (slice: %d, pri: %d)", 1145 ke, ke->ke_runq, ke->ke_slice, 1146 ke->ke_thread->td_priority); 1147 } 1148 return (ke); 1149 } 1150 1151 #ifdef SMP 1152 if (smp_started) { 1153 /* 1154 * Find the cpu with the highest load and steal one proc. 1155 */ 1156 if ((kseq = kseq_load_highest()) == NULL) 1157 return (NULL); 1158 1159 /* 1160 * Remove this kse from this kseq and runq and then requeue 1161 * on the current processor. Then we will dequeue it 1162 * normally above. 1163 */ 1164 kseq_move(kseq, PCPU_GET(cpuid)); 1165 goto retry; 1166 } 1167 #endif 1168 1169 return (NULL); 1170 } 1171 1172 void 1173 sched_add(struct kse *ke) 1174 { 1175 struct kseq *kseq; 1176 struct ksegrp *kg; 1177 1178 mtx_assert(&sched_lock, MA_OWNED); 1179 KASSERT((ke->ke_thread != NULL), ("sched_add: No thread on KSE")); 1180 KASSERT((ke->ke_thread->td_kse != NULL), 1181 ("sched_add: No KSE on thread")); 1182 KASSERT(ke->ke_state != KES_ONRUNQ, 1183 ("sched_add: kse %p (%s) already in run queue", ke, 1184 ke->ke_proc->p_comm)); 1185 KASSERT(ke->ke_proc->p_sflag & PS_INMEM, 1186 ("sched_add: process swapped out")); 1187 KASSERT(ke->ke_runq == NULL, 1188 ("sched_add: KSE %p is still assigned to a run queue", ke)); 1189 1190 kg = ke->ke_ksegrp; 1191 1192 switch (PRI_BASE(kg->kg_pri_class)) { 1193 case PRI_ITHD: 1194 case PRI_REALTIME: 1195 kseq = KSEQ_SELF(); 1196 ke->ke_runq = kseq->ksq_curr; 1197 ke->ke_slice = SCHED_SLICE_MAX; 1198 ke->ke_cpu = PCPU_GET(cpuid); 1199 break; 1200 case PRI_TIMESHARE: 1201 kseq = KSEQ_CPU(ke->ke_cpu); 1202 if (SCHED_CURR(kg, ke)) 1203 ke->ke_runq = kseq->ksq_curr; 1204 else 1205 ke->ke_runq = kseq->ksq_next; 1206 break; 1207 case PRI_IDLE: 1208 kseq = KSEQ_CPU(ke->ke_cpu); 1209 /* 1210 * This is for priority prop. 1211 */ 1212 if (ke->ke_thread->td_priority > PRI_MIN_IDLE) 1213 ke->ke_runq = kseq->ksq_curr; 1214 else 1215 ke->ke_runq = &kseq->ksq_idle; 1216 ke->ke_slice = SCHED_SLICE_MIN; 1217 break; 1218 default: 1219 panic("Unknown pri class.\n"); 1220 break; 1221 } 1222 1223 ke->ke_ksegrp->kg_runq_kses++; 1224 ke->ke_state = KES_ONRUNQ; 1225 1226 runq_add(ke->ke_runq, ke); 1227 kseq_add(kseq, ke); 1228 } 1229 1230 void 1231 sched_rem(struct kse *ke) 1232 { 1233 struct kseq *kseq; 1234 1235 mtx_assert(&sched_lock, MA_OWNED); 1236 KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue")); 1237 1238 ke->ke_state = KES_THREAD; 1239 ke->ke_ksegrp->kg_runq_kses--; 1240 kseq = KSEQ_CPU(ke->ke_cpu); 1241 runq_remove(ke->ke_runq, ke); 1242 kseq_rem(kseq, ke); 1243 } 1244 1245 fixpt_t 1246 sched_pctcpu(struct kse *ke) 1247 { 1248 fixpt_t pctcpu; 1249 1250 pctcpu = 0; 1251 1252 mtx_lock_spin(&sched_lock); 1253 if (ke->ke_ticks) { 1254 int rtick; 1255 1256 /* 1257 * Don't update more frequently than twice a second. Allowing 1258 * this causes the cpu usage to decay away too quickly due to 1259 * rounding errors. 1260 */ 1261 if (ke->ke_ltick < (ticks - (hz / 2))) 1262 sched_pctcpu_update(ke); 1263 1264 /* How many rtick per second ? */ 1265 rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS); 1266 pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT; 1267 } 1268 1269 ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick; 1270 mtx_unlock_spin(&sched_lock); 1271 1272 return (pctcpu); 1273 } 1274 1275 int 1276 sched_sizeof_kse(void) 1277 { 1278 return (sizeof(struct kse) + sizeof(struct ke_sched)); 1279 } 1280 1281 int 1282 sched_sizeof_ksegrp(void) 1283 { 1284 return (sizeof(struct ksegrp) + sizeof(struct kg_sched)); 1285 } 1286 1287 int 1288 sched_sizeof_proc(void) 1289 { 1290 return (sizeof(struct proc)); 1291 } 1292 1293 int 1294 sched_sizeof_thread(void) 1295 { 1296 return (sizeof(struct thread) + sizeof(struct td_sched)); 1297 } 1298