/*-
 * Copyright (c) 2002-2003, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/resource.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
#include <sys/vmmeter.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif
#ifdef KTRACE
#include <sys/uio.h>
#include <sys/ktrace.h>
#endif

#include <machine/cpu.h>

#define	KTR_ULE		KTR_NFS

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
/* XXX This is bogus compatibility crap for ps */
static fixpt_t	ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");

static void sched_setup(void *dummy);
SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)

static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "SCHED");

static int sched_strict;
SYSCTL_INT(_kern_sched, OID_AUTO, strict, CTLFLAG_RD, &sched_strict, 0, "");

static int slice_min = 1;
SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, "");

static int slice_max = 2;
SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, "");

int realstathz;
int tickincr = 1;

/*
 * These data structures are allocated within their parent data structure
 * but are scheduler specific.
 */

struct ke_sched {
	int		ske_slice;
	struct runq	*ske_runq;
	/* The following variables are only used for pctcpu calculation */
	int		ske_ltick;	/* Last tick that we were running on */
	int		ske_ftick;	/* First tick that we were running on */
	int		ske_ticks;	/* Tick count */
	/* CPU that we have affinity for. */
	u_char		ske_cpu;
};
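
/*
 * ske_ftick and ske_ltick bracket the window of ticks over which ske_ticks
 * was accumulated; sched_pctcpu() turns this into a %cpu figure averaged
 * over the last SCHED_CPU_TIME seconds.  As a rough example, a kse that was
 * on cpu for half of the statclock ticks in that window reports about 50%.
 */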
#define	ke_slice	ke_sched->ske_slice
#define	ke_runq		ke_sched->ske_runq
#define	ke_ltick	ke_sched->ske_ltick
#define	ke_ftick	ke_sched->ske_ftick
#define	ke_ticks	ke_sched->ske_ticks
#define	ke_cpu		ke_sched->ske_cpu

struct kg_sched {
	int	skg_slptime;		/* Number of ticks we vol. slept */
	int	skg_runtime;		/* Number of ticks we were running */
};
#define	kg_slptime	kg_sched->skg_slptime
#define	kg_runtime	kg_sched->skg_runtime

struct td_sched {
	int	std_slptime;
};
#define	td_slptime	td_sched->std_slptime

struct td_sched td_sched;
struct ke_sched ke_sched;
struct kg_sched kg_sched;

struct ke_sched *kse0_sched = &ke_sched;
struct kg_sched *ksegrp0_sched = &kg_sched;
struct p_sched *proc0_sched = NULL;
struct td_sched *thread0_sched = &td_sched;

/*
 * This priority range has 20 priorities on either end that are reachable
 * only through nice values.
 *
 * PRI_RANGE:	Total priority range for timeshare threads.
 * PRI_NRESV:	Reserved priorities for nice.
 * PRI_BASE:	The start of the dynamic range.
 * DYN_RANGE:	Number of priorities that are available in the dynamic
 *		priority range.
 */
#define	SCHED_PRI_RANGE		(PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
#define	SCHED_PRI_NRESV		PRIO_TOTAL
#define	SCHED_PRI_NHALF		(PRIO_TOTAL / 2)
#define	SCHED_PRI_NTHRESH	(SCHED_PRI_NHALF - 1)
#define	SCHED_PRI_BASE		((SCHED_PRI_NRESV / 2) + PRI_MIN_TIMESHARE)
#define	SCHED_DYN_RANGE		(SCHED_PRI_RANGE - SCHED_PRI_NRESV)
#define	SCHED_PRI_INTERACT(score)					\
    ((score) * SCHED_DYN_RANGE / SCHED_INTERACT_RANGE)

/*
 * These determine the interactivity of a process.
 *
 * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
 *		before throttling back.
 * SLP_RUN_THROTTLE:	Divisor for reducing slp/run time.
 * INTERACT_RANGE:	Range of interactivity values.  Smaller is better.
 * INTERACT_HALF:	Convenience define, half of the interactivity range.
 * INTERACT_THRESH:	Threshold for placement on the current runq.
 */
#define	SCHED_SLP_RUN_MAX	((hz / 10) << 10)
#define	SCHED_SLP_RUN_THROTTLE	(10)
#define	SCHED_INTERACT_RANGE	(100)
#define	SCHED_INTERACT_HALF	(SCHED_INTERACT_RANGE / 2)
#define	SCHED_INTERACT_THRESH	(10)

/*
 * These parameters and macros determine the size of the time slice that is
 * granted to each thread.
 *
 * SLICE_MIN:	Minimum time slice granted, in units of ticks.
 * SLICE_MAX:	Maximum time slice granted.
 * SLICE_RANGE:	Range of available time slices scaled by hz.
 * SLICE_SCALE:	The number of slices granted per val in the range of [0, max].
 * SLICE_NICE:	Determines the amount of slice granted to a scaled nice.
 */
#define	SCHED_SLICE_MIN			(slice_min)
#define	SCHED_SLICE_MAX			(slice_max)
#define	SCHED_SLICE_RANGE		(SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1)
#define	SCHED_SLICE_SCALE(val, max)	(((val) * SCHED_SLICE_RANGE) / (max))
#define	SCHED_SLICE_NICE(nice)						\
    (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_PRI_NTHRESH))

/*
 * This macro determines whether or not the kse belongs on the current or
 * next run queue.
 *
 * XXX nice value should affect how interactive a kg is.
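 *
 * As a rough example: interrupt and realtime threads (td_priority below
 * PRI_MIN_TIMESHARE) always land on the current queue, as does any
 * timeshare kg whose interactivity score is under SCHED_INTERACT_THRESH
 * (10); a batch job that mostly runs and rarely sleeps scores 50 or above
 * and waits on the next queue instead.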
 */
#define	SCHED_INTERACTIVE(kg)						\
    (sched_interact_score(kg) < SCHED_INTERACT_THRESH)
#define	SCHED_CURR(kg, ke)						\
    (ke->ke_thread->td_priority < PRI_MIN_TIMESHARE || SCHED_INTERACTIVE(kg))

/*
 * Cpu percentage computation macros and defines.
 *
 * SCHED_CPU_TIME:	Number of seconds to average the cpu usage across.
 * SCHED_CPU_TICKS:	Number of hz ticks to average the cpu usage across.
 */

#define	SCHED_CPU_TIME	10
#define	SCHED_CPU_TICKS	(hz * SCHED_CPU_TIME)

/*
 * kseq - per processor runqs and statistics.
 */

#define	KSEQ_NCLASS	(PRI_IDLE + 1)	/* Number of run classes. */

struct kseq {
	struct runq	ksq_idle;		/* Queue of IDLE threads. */
	struct runq	ksq_timeshare[2];	/* Run queues for !IDLE. */
	struct runq	*ksq_next;		/* Next timeshare queue. */
	struct runq	*ksq_curr;		/* Current queue. */
	int		ksq_loads[KSEQ_NCLASS];	/* Load for each class */
	int		ksq_load;		/* Aggregate load. */
	short		ksq_nice[PRIO_TOTAL + 1]; /* KSEs in each nice bin. */
	short		ksq_nicemin;		/* Least nice. */
#ifdef SMP
	unsigned int	ksq_rslices;	/* Slices on run queue */
#endif
};

/*
 * One kse queue per processor.
 */
#ifdef SMP
struct kseq	kseq_cpu[MAXCPU];
#define	KSEQ_SELF()	(&kseq_cpu[PCPU_GET(cpuid)])
#define	KSEQ_CPU(x)	(&kseq_cpu[(x)])
#else
struct kseq	kseq_cpu;
#define	KSEQ_SELF()	(&kseq_cpu)
#define	KSEQ_CPU(x)	(&kseq_cpu)
#endif

static void sched_slice(struct kse *ke);
static void sched_priority(struct ksegrp *kg);
static int sched_interact_score(struct ksegrp *kg);
void sched_pctcpu_update(struct kse *ke);
int sched_pickcpu(void);

/* Operations on per processor queues */
static struct kse * kseq_choose(struct kseq *kseq);
static void kseq_setup(struct kseq *kseq);
static void kseq_add(struct kseq *kseq, struct kse *ke);
static void kseq_rem(struct kseq *kseq, struct kse *ke);
static void kseq_nice_add(struct kseq *kseq, int nice);
static void kseq_nice_rem(struct kseq *kseq, int nice);
void kseq_print(int cpu);
#ifdef SMP
struct kseq * kseq_load_highest(void);
#endif

void
kseq_print(int cpu)
{
	struct kseq *kseq;
	int i;

	kseq = KSEQ_CPU(cpu);

	printf("kseq:\n");
	printf("\tload: %d\n", kseq->ksq_load);
	printf("\tload ITHD: %d\n", kseq->ksq_loads[PRI_ITHD]);
	printf("\tload REALTIME: %d\n", kseq->ksq_loads[PRI_REALTIME]);
	printf("\tload TIMESHARE: %d\n", kseq->ksq_loads[PRI_TIMESHARE]);
	printf("\tload IDLE: %d\n", kseq->ksq_loads[PRI_IDLE]);
	printf("\tnicemin:\t%d\n", kseq->ksq_nicemin);
	printf("\tnice counts:\n");
	for (i = 0; i < PRIO_TOTAL + 1; i++)
		if (kseq->ksq_nice[i])
			printf("\t\t%d = %d\n",
			    i - SCHED_PRI_NHALF, kseq->ksq_nice[i]);
}

static void
kseq_add(struct kseq *kseq, struct kse *ke)
{
	mtx_assert(&sched_lock, MA_OWNED);
	kseq->ksq_loads[PRI_BASE(ke->ke_ksegrp->kg_pri_class)]++;
	kseq->ksq_load++;
	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
		CTR6(KTR_ULE, "Add kse %p to %p (slice: %d, pri: %d, nice: %d(%d))",
		    ke, ke->ke_runq, ke->ke_slice, ke->ke_thread->td_priority,
		    ke->ke_ksegrp->kg_nice, kseq->ksq_nicemin);
	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
		kseq_nice_add(kseq, ke->ke_ksegrp->kg_nice);
#ifdef SMP
	kseq->ksq_rslices += ke->ke_slice;
#endif
}
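
/*
 * kseq_add() and kseq_rem() keep the per-class load counts, the aggregate
 * load, the nice distribution and (on SMP) the count of queued slices in
 * sync as kses are placed on and removed from a cpu's run queues.
 */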
static void
kseq_rem(struct kseq *kseq, struct kse *ke)
{
	mtx_assert(&sched_lock, MA_OWNED);
	kseq->ksq_loads[PRI_BASE(ke->ke_ksegrp->kg_pri_class)]--;
	kseq->ksq_load--;
	ke->ke_runq = NULL;
	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
		kseq_nice_rem(kseq, ke->ke_ksegrp->kg_nice);
#ifdef SMP
	kseq->ksq_rslices -= ke->ke_slice;
#endif
}

static void
kseq_nice_add(struct kseq *kseq, int nice)
{
	mtx_assert(&sched_lock, MA_OWNED);
	/* Normalize to zero. */
	kseq->ksq_nice[nice + SCHED_PRI_NHALF]++;
	if (nice < kseq->ksq_nicemin || kseq->ksq_loads[PRI_TIMESHARE] == 1)
		kseq->ksq_nicemin = nice;
}

static void
kseq_nice_rem(struct kseq *kseq, int nice)
{
	int n;

	mtx_assert(&sched_lock, MA_OWNED);
	/* Normalize to zero. */
	n = nice + SCHED_PRI_NHALF;
	kseq->ksq_nice[n]--;
	KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count."));

	/*
	 * If this wasn't the smallest nice value or there are more in
	 * this bucket we can just return.  Otherwise we have to recalculate
	 * the smallest nice.
	 */
	if (nice != kseq->ksq_nicemin ||
	    kseq->ksq_nice[n] != 0 ||
	    kseq->ksq_loads[PRI_TIMESHARE] == 0)
		return;

	for (; n < SCHED_PRI_NRESV + 1; n++)
		if (kseq->ksq_nice[n]) {
			kseq->ksq_nicemin = n - SCHED_PRI_NHALF;
			return;
		}
}

#ifdef SMP
struct kseq *
kseq_load_highest(void)
{
	struct kseq *kseq;
	int load;
	int cpu;
	int i;

	mtx_assert(&sched_lock, MA_OWNED);
	cpu = 0;
	load = 0;

	for (i = 0; i < mp_maxid; i++) {
		if (CPU_ABSENT(i))
			continue;
		kseq = KSEQ_CPU(i);
		if (kseq->ksq_load > load) {
			load = kseq->ksq_load;
			cpu = i;
		}
	}
	if (load > 1)
		return (KSEQ_CPU(cpu));

	return (NULL);
}
#endif

struct kse *
kseq_choose(struct kseq *kseq)
{
	struct kse *ke;
	struct runq *swap;

	mtx_assert(&sched_lock, MA_OWNED);
	swap = NULL;

	for (;;) {
		ke = runq_choose(kseq->ksq_curr);
		if (ke == NULL) {
			/*
			 * We already swapped once and didn't get anywhere.
			 */
			if (swap)
				break;
			swap = kseq->ksq_curr;
			kseq->ksq_curr = kseq->ksq_next;
			kseq->ksq_next = swap;
			continue;
		}
		/*
		 * If we encounter a slice of 0 the kse is in a
		 * TIMESHARE kse group and its nice was too far out
		 * of the range that receives slices.
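		 * (For example, a nice +20 kse sharing a cpu with a nice 0
		 * kse falls outside the SCHED_PRI_NTHRESH window; it is
		 * requeued on the next queue, with a recomputed slice,
		 * rather than run now.)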
		 */
		if (ke->ke_slice == 0) {
			runq_remove(ke->ke_runq, ke);
			sched_slice(ke);
			ke->ke_runq = kseq->ksq_next;
			runq_add(ke->ke_runq, ke);
			continue;
		}
		return (ke);
	}

	return (runq_choose(&kseq->ksq_idle));
}

static void
kseq_setup(struct kseq *kseq)
{
	runq_init(&kseq->ksq_timeshare[0]);
	runq_init(&kseq->ksq_timeshare[1]);
	runq_init(&kseq->ksq_idle);

	kseq->ksq_curr = &kseq->ksq_timeshare[0];
	kseq->ksq_next = &kseq->ksq_timeshare[1];

	kseq->ksq_loads[PRI_ITHD] = 0;
	kseq->ksq_loads[PRI_REALTIME] = 0;
	kseq->ksq_loads[PRI_TIMESHARE] = 0;
	kseq->ksq_loads[PRI_IDLE] = 0;
	kseq->ksq_load = 0;
#ifdef SMP
	kseq->ksq_rslices = 0;
#endif
}

static void
sched_setup(void *dummy)
{
	int i;

	slice_min = (hz/100);
	slice_max = (hz/10);

	mtx_lock_spin(&sched_lock);
	/* init kseqs */
	for (i = 0; i < MAXCPU; i++)
		kseq_setup(KSEQ_CPU(i));

	kseq_add(KSEQ_SELF(), &kse0);
	mtx_unlock_spin(&sched_lock);
}

/*
 * Scale the scheduling priority according to the "interactivity" of this
 * process.
 */
static void
sched_priority(struct ksegrp *kg)
{
	int pri;

	if (kg->kg_pri_class != PRI_TIMESHARE)
		return;

	pri = SCHED_PRI_INTERACT(sched_interact_score(kg));
	pri += SCHED_PRI_BASE;
	pri += kg->kg_nice;

	if (pri > PRI_MAX_TIMESHARE)
		pri = PRI_MAX_TIMESHARE;
	else if (pri < PRI_MIN_TIMESHARE)
		pri = PRI_MIN_TIMESHARE;

	kg->kg_user_pri = pri;

	return;
}

/*
 * Calculate a time slice based on the properties of the kseg and the runq
 * that we're on.  This is only for PRI_TIMESHARE ksegrps.
 */
static void
sched_slice(struct kse *ke)
{
	struct kseq *kseq;
	struct ksegrp *kg;

	kg = ke->ke_ksegrp;
	kseq = KSEQ_CPU(ke->ke_cpu);

	/*
	 * Rationale:
	 * KSEs in interactive ksegs get the minimum slice so that we
	 * quickly notice if it abuses its advantage.
	 *
	 * KSEs in non-interactive ksegs are assigned a slice that is
	 * based on the ksegs nice value relative to the least nice kseg
	 * on the run queue for this cpu.
	 *
	 * If the KSE is less nice than all others it gets the maximum
	 * slice and other KSEs will adjust their slice relative to
	 * this when they first expire.
	 *
	 * There is a 20 point window that starts relative to the least
	 * nice kse on the run queue.  Slice size is determined by
	 * the kse distance from the least nice ksegrp.
	 *
	 * If you are outside of the window you will get no slice and
	 * you will be reevaluated each time you are selected on the
	 * run queue.
	 *
	 */

	if (!SCHED_INTERACTIVE(kg)) {
		int nice;

		nice = kg->kg_nice + (0 - kseq->ksq_nicemin);
		if (kseq->ksq_loads[PRI_TIMESHARE] == 0 ||
		    kg->kg_nice < kseq->ksq_nicemin)
			ke->ke_slice = SCHED_SLICE_MAX;
		else if (nice <= SCHED_PRI_NTHRESH)
			ke->ke_slice = SCHED_SLICE_NICE(nice);
		else
			ke->ke_slice = 0;
	} else
		ke->ke_slice = SCHED_SLICE_MIN;

	CTR6(KTR_ULE,
	    "Sliced %p(%d) (nice: %d, nicemin: %d, load: %d, interactive: %d)",
	    ke, ke->ke_slice, kg->kg_nice, kseq->ksq_nicemin,
	    kseq->ksq_loads[PRI_TIMESHARE], SCHED_INTERACTIVE(kg));

	/*
	 * Check to see if we need to scale back the slp and run time
	 * in the kg.  This will cause us to forget old interactivity
	 * while maintaining the current ratio.
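	 * (As a rough example, with hz = 1000 the combined history is
	 * capped at about hz/10 = 100 ticks worth of <<10-scaled time;
	 * beyond that both values are divided by SCHED_SLP_RUN_THROTTLE,
	 * preserving the score while old behavior decays quickly.)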
	 */
	CTR4(KTR_ULE, "Slp vs Run %p (Slp %d, Run %d, Score %d)",
	    ke, kg->kg_slptime >> 10, kg->kg_runtime >> 10,
	    sched_interact_score(kg));

	if ((kg->kg_runtime + kg->kg_slptime) > SCHED_SLP_RUN_MAX) {
		kg->kg_runtime /= SCHED_SLP_RUN_THROTTLE;
		kg->kg_slptime /= SCHED_SLP_RUN_THROTTLE;
	}
	CTR4(KTR_ULE, "Slp vs Run(2) %p (Slp %d, Run %d, Score %d)",
	    ke, kg->kg_slptime >> 10, kg->kg_runtime >> 10,
	    sched_interact_score(kg));

	return;
}

static int
sched_interact_score(struct ksegrp *kg)
{
	int big;
	int small;
	int base;

	if (kg->kg_runtime > kg->kg_slptime) {
		big = kg->kg_runtime;
		small = kg->kg_slptime;
		base = SCHED_INTERACT_HALF;
	} else {
		big = kg->kg_slptime;
		small = kg->kg_runtime;
		base = 0;
	}

	big /= SCHED_INTERACT_HALF;
	if (big != 0)
		small /= big;
	else
		small = 0;

	small += base;
	/* XXX Factor in nice */
	return (small);
}

/*
 * This is only somewhat accurate since given many processes of the same
 * priority they will switch when their slices run out, which will be
 * at most SCHED_SLICE_MAX.
 */
int
sched_rr_interval(void)
{
	return (SCHED_SLICE_MAX);
}

void
sched_pctcpu_update(struct kse *ke)
{
	/*
	 * Adjust counters and watermark for pctcpu calc.
	 *
	 * Shift the tick count out so that the divide doesn't round away
	 * our results.
	 */
	ke->ke_ticks <<= 10;
	ke->ke_ticks = (ke->ke_ticks / (ke->ke_ltick - ke->ke_ftick)) *
	    SCHED_CPU_TICKS;
	ke->ke_ticks >>= 10;
	ke->ke_ltick = ticks;
	ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS;
}

#ifdef SMP
/* XXX Should be changed to kseq_load_lowest() */
int
sched_pickcpu(void)
{
	struct kseq *kseq;
	int load;
	int cpu;
	int i;

	mtx_assert(&sched_lock, MA_OWNED);
	if (!smp_started)
		return (0);

	load = 0;
	cpu = 0;

	for (i = 0; i < mp_maxid; i++) {
		if (CPU_ABSENT(i))
			continue;
		kseq = KSEQ_CPU(i);
		if (kseq->ksq_load < load) {
			cpu = i;
			load = kseq->ksq_load;
		}
	}

	CTR1(KTR_RUNQ, "sched_pickcpu: %d", cpu);
	return (cpu);
}
#else
int
sched_pickcpu(void)
{
	return (0);
}
#endif

void
sched_prio(struct thread *td, u_char prio)
{
	struct kse *ke;
	struct runq *rq;

	mtx_assert(&sched_lock, MA_OWNED);
	ke = td->td_kse;
	td->td_priority = prio;

	if (TD_ON_RUNQ(td)) {
		rq = ke->ke_runq;

		runq_remove(rq, ke);
		runq_add(rq, ke);
	}
}

void
sched_switchout(struct thread *td)
{
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);

	ke = td->td_kse;

	td->td_last_kse = ke;
	td->td_lastcpu = td->td_oncpu;
	td->td_oncpu = NOCPU;
	td->td_flags &= ~TDF_NEEDRESCHED;

	if (TD_IS_RUNNING(td)) {
		runq_add(ke->ke_runq, ke);
		/* setrunqueue(td); */
		return;
	}
	if (ke->ke_runq)
		kseq_rem(KSEQ_CPU(ke->ke_cpu), ke);
	/*
	 * We will not be on the run queue.  So we must be
	 * sleeping or similar.
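	 * For a threaded (KSE) process the kse is handed back to the
	 * ksegrp below so another runnable thread may be assigned to it.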
	 */
	if (td->td_proc->p_flag & P_THREADED)
		kse_reassign(ke);
}

void
sched_switchin(struct thread *td)
{
	/* struct kse *ke = td->td_kse; */
	mtx_assert(&sched_lock, MA_OWNED);

	td->td_oncpu = PCPU_GET(cpuid);

	if (td->td_ksegrp->kg_pri_class == PRI_TIMESHARE &&
	    td->td_priority != td->td_ksegrp->kg_user_pri)
		curthread->td_flags |= TDF_NEEDRESCHED;
}

void
sched_nice(struct ksegrp *kg, int nice)
{
	struct kse *ke;
	struct thread *td;
	struct kseq *kseq;

	PROC_LOCK_ASSERT(kg->kg_proc, MA_OWNED);
	mtx_assert(&sched_lock, MA_OWNED);
	/*
	 * We need to adjust the nice counts for running KSEs.
	 */
	if (kg->kg_pri_class == PRI_TIMESHARE)
		FOREACH_KSE_IN_GROUP(kg, ke) {
			if (ke->ke_state != KES_ONRUNQ &&
			    ke->ke_state != KES_THREAD)
				continue;
			kseq = KSEQ_CPU(ke->ke_cpu);
			kseq_nice_rem(kseq, kg->kg_nice);
			kseq_nice_add(kseq, nice);
		}
	kg->kg_nice = nice;
	sched_priority(kg);
	FOREACH_THREAD_IN_GROUP(kg, td)
		td->td_flags |= TDF_NEEDRESCHED;
}

void
sched_sleep(struct thread *td, u_char prio)
{
	mtx_assert(&sched_lock, MA_OWNED);

	td->td_slptime = ticks;
	td->td_priority = prio;

	CTR2(KTR_ULE, "sleep kse %p (tick: %d)",
	    td->td_kse, td->td_slptime);
}

void
sched_wakeup(struct thread *td)
{
	mtx_assert(&sched_lock, MA_OWNED);

	/*
	 * Let the kseg know how long we slept for.  This is because process
	 * interactivity behavior is modeled in the kseg.
	 */
	if (td->td_slptime) {
		struct ksegrp *kg;
		int hzticks;

		kg = td->td_ksegrp;
		hzticks = ticks - td->td_slptime;
		kg->kg_slptime += hzticks << 10;
		sched_priority(kg);
		CTR2(KTR_ULE, "wakeup kse %p (%d ticks)",
		    td->td_kse, hzticks);
		td->td_slptime = 0;
	}
	setrunqueue(td);
	if (td->td_priority < curthread->td_priority)
		curthread->td_flags |= TDF_NEEDRESCHED;
}

/*
 * Penalize the parent for creating a new child and initialize the child's
 * priority.
 */
void
sched_fork(struct proc *p, struct proc *p1)
{

	mtx_assert(&sched_lock, MA_OWNED);

	sched_fork_ksegrp(FIRST_KSEGRP_IN_PROC(p), FIRST_KSEGRP_IN_PROC(p1));
	sched_fork_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(p1));
	sched_fork_thread(FIRST_THREAD_IN_PROC(p), FIRST_THREAD_IN_PROC(p1));
}

void
sched_fork_kse(struct kse *ke, struct kse *child)
{

	child->ke_slice = ke->ke_slice;
	child->ke_cpu = ke->ke_cpu;	/* sched_pickcpu(); */
	child->ke_runq = NULL;

	/*
	 * Claim that we've been running for one second for statistical
	 * purposes.
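	 * (Backdating ke_ftick by hz gives the child's first pctcpu
	 * calculation a sane, non-zero window to divide by.)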
	 */
	child->ke_ticks = 0;
	child->ke_ltick = ticks;
	child->ke_ftick = ticks - hz;
}

void
sched_fork_ksegrp(struct ksegrp *kg, struct ksegrp *child)
{

	PROC_LOCK_ASSERT(child->kg_proc, MA_OWNED);
	/* XXX Need something better here */
	if (kg->kg_slptime > kg->kg_runtime) {
		child->kg_slptime = SCHED_DYN_RANGE;
		child->kg_runtime = kg->kg_slptime / SCHED_DYN_RANGE;
	} else {
		child->kg_runtime = SCHED_DYN_RANGE;
		child->kg_slptime = kg->kg_runtime / SCHED_DYN_RANGE;
	}

	child->kg_user_pri = kg->kg_user_pri;
	child->kg_nice = kg->kg_nice;
}

void
sched_fork_thread(struct thread *td, struct thread *child)
{
}

void
sched_class(struct ksegrp *kg, int class)
{
	struct kseq *kseq;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
	if (kg->kg_pri_class == class)
		return;

	FOREACH_KSE_IN_GROUP(kg, ke) {
		if (ke->ke_state != KES_ONRUNQ &&
		    ke->ke_state != KES_THREAD)
			continue;
		kseq = KSEQ_CPU(ke->ke_cpu);

		kseq->ksq_loads[PRI_BASE(kg->kg_pri_class)]--;
		kseq->ksq_loads[PRI_BASE(class)]++;

		if (kg->kg_pri_class == PRI_TIMESHARE)
			kseq_nice_rem(kseq, kg->kg_nice);
		else if (class == PRI_TIMESHARE)
			kseq_nice_add(kseq, kg->kg_nice);
	}

	kg->kg_pri_class = class;
}

/*
 * Return some of the child's priority and interactivity to the parent.
 */
void
sched_exit(struct proc *p, struct proc *child)
{
	/* XXX Need something better here */
	mtx_assert(&sched_lock, MA_OWNED);
	sched_exit_kse(FIRST_KSE_IN_PROC(p), FIRST_KSE_IN_PROC(child));
}

void
sched_exit_kse(struct kse *ke, struct kse *child)
{
	kseq_rem(KSEQ_CPU(child->ke_cpu), child);
}

void
sched_exit_ksegrp(struct ksegrp *kg, struct ksegrp *child)
{
}

void
sched_exit_thread(struct thread *td, struct thread *child)
{
}

void
sched_clock(struct kse *ke)
{
	struct kseq *kseq;
	struct ksegrp *kg;
	struct thread *td;
#if 0
	struct kse *nke;
#endif

	/*
	 * sched_setup() apparently happens prior to stathz being set.  We
	 * need to resolve the timers earlier in the boot so we can avoid
	 * calculating this here.
	 */
	if (realstathz == 0) {
		realstathz = stathz ? stathz : hz;
		tickincr = hz / realstathz;
		/*
		 * XXX This does not work for values of stathz that are much
		 * larger than hz.
		 */
		if (tickincr == 0)
			tickincr = 1;
	}

	td = ke->ke_thread;
	kg = ke->ke_ksegrp;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((td != NULL), ("schedclock: null thread pointer"));

	/* Adjust ticks for pctcpu */
	ke->ke_ticks++;
	ke->ke_ltick = ticks;

	/* Go up to one second beyond our max and then trim back down */
	if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick)
		sched_pctcpu_update(ke);

	if (td->td_flags & TDF_IDLETD)
		return;

	CTR4(KTR_ULE, "Tick kse %p (slice: %d, slptime: %d, runtime: %d)",
	    ke, ke->ke_slice, kg->kg_slptime >> 10, kg->kg_runtime >> 10);

	/*
	 * We only do slicing code for TIMESHARE ksegrps.
	 */
	if (kg->kg_pri_class != PRI_TIMESHARE)
		return;
	/*
	 * Check for a higher priority task on the run queue.  This can happen
	 * on SMP if another processor woke up a process on our runq.
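	 * (The check itself is currently compiled out below.)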
	 */
	kseq = KSEQ_SELF();
#if 0
	if (kseq->ksq_load > 1 && (nke = kseq_choose(kseq)) != NULL) {
		if (sched_strict &&
		    nke->ke_thread->td_priority < td->td_priority)
			td->td_flags |= TDF_NEEDRESCHED;
		else if (nke->ke_thread->td_priority <
		    td->td_priority SCHED_PRIO_SLOP)

		if (nke->ke_thread->td_priority < td->td_priority)
			td->td_flags |= TDF_NEEDRESCHED;
	}
#endif
	/*
	 * We used a tick; charge it to the ksegrp so that we can compute our
	 * interactivity.
	 */
	kg->kg_runtime += tickincr << 10;

	/*
	 * We used up one time slice.
	 */
	ke->ke_slice--;
#ifdef SMP
	kseq->ksq_rslices--;
#endif

	if (ke->ke_slice > 0)
		return;
	/*
	 * We're out of time, recompute priorities and requeue.
	 */
	kseq_rem(kseq, ke);
	sched_priority(kg);
	sched_slice(ke);
	if (SCHED_CURR(kg, ke))
		ke->ke_runq = kseq->ksq_curr;
	else
		ke->ke_runq = kseq->ksq_next;
	kseq_add(kseq, ke);
	td->td_flags |= TDF_NEEDRESCHED;
}

int
sched_runnable(void)
{
	struct kseq *kseq;
	int load;

	load = 1;

	mtx_lock_spin(&sched_lock);
	kseq = KSEQ_SELF();

	if (kseq->ksq_load)
		goto out;
#ifdef SMP
	/*
	 * For SMP we may steal other processors' KSEs.  Just search until we
	 * verify that at least one other cpu has a runnable task.
	 */
	if (smp_started) {
		int i;

		for (i = 0; i < mp_maxid; i++) {
			if (CPU_ABSENT(i))
				continue;
			kseq = KSEQ_CPU(i);
			if (kseq->ksq_load > 1)
				goto out;
		}
	}
#endif
	load = 0;
out:
	mtx_unlock_spin(&sched_lock);
	return (load);
}

void
sched_userret(struct thread *td)
{
	struct ksegrp *kg;

	kg = td->td_ksegrp;

	if (td->td_priority != kg->kg_user_pri) {
		mtx_lock_spin(&sched_lock);
		td->td_priority = kg->kg_user_pri;
		mtx_unlock_spin(&sched_lock);
	}
}

struct kse *
sched_choose(void)
{
	struct kseq *kseq;
	struct kse *ke;

	mtx_assert(&sched_lock, MA_OWNED);
#ifdef SMP
retry:
#endif
	kseq = KSEQ_SELF();
	ke = kseq_choose(kseq);
	if (ke) {
		runq_remove(ke->ke_runq, ke);
		ke->ke_state = KES_THREAD;

		if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE) {
			CTR4(KTR_ULE, "Run kse %p from %p (slice: %d, pri: %d)",
			    ke, ke->ke_runq, ke->ke_slice,
			    ke->ke_thread->td_priority);
		}
		return (ke);
	}

#ifdef SMP
	if (smp_started) {
		/*
		 * Find the cpu with the highest load and steal one proc.
		 */
		if ((kseq = kseq_load_highest()) == NULL)
			return (NULL);

		/*
		 * Remove this kse from this kseq and runq and then requeue
		 * on the current processor.  Then we will dequeue it
		 * normally above.
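		 * (kseq_load_highest() only returns a queue whose load is
		 * greater than one, so we never steal a cpu's only runnable
		 * kse.)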
		 */
		ke = kseq_choose(kseq);
		runq_remove(ke->ke_runq, ke);
		ke->ke_state = KES_THREAD;
		kseq_rem(kseq, ke);

		ke->ke_cpu = PCPU_GET(cpuid);
		sched_add(ke);
		goto retry;
	}
#endif

	return (NULL);
}

void
sched_add(struct kse *ke)
{
	struct kseq *kseq;
	struct ksegrp *kg;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((ke->ke_thread != NULL), ("sched_add: No thread on KSE"));
	KASSERT((ke->ke_thread->td_kse != NULL),
	    ("sched_add: No KSE on thread"));
	KASSERT(ke->ke_state != KES_ONRUNQ,
	    ("sched_add: kse %p (%s) already in run queue", ke,
	    ke->ke_proc->p_comm));
	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
	    ("sched_add: process swapped out"));
	KASSERT(ke->ke_runq == NULL,
	    ("sched_add: KSE %p is still assigned to a run queue", ke));

	kg = ke->ke_ksegrp;

	switch (PRI_BASE(kg->kg_pri_class)) {
	case PRI_ITHD:
	case PRI_REALTIME:
		kseq = KSEQ_SELF();
		ke->ke_runq = kseq->ksq_curr;
		ke->ke_slice = SCHED_SLICE_MAX;
		ke->ke_cpu = PCPU_GET(cpuid);
		break;
	case PRI_TIMESHARE:
		kseq = KSEQ_CPU(ke->ke_cpu);
		if (SCHED_CURR(kg, ke))
			ke->ke_runq = kseq->ksq_curr;
		else
			ke->ke_runq = kseq->ksq_next;
		break;
	case PRI_IDLE:
		kseq = KSEQ_CPU(ke->ke_cpu);
		/*
		 * This is for priority propagation.
		 */
		if (ke->ke_thread->td_priority < PRI_MAX_TIMESHARE)
			ke->ke_runq = kseq->ksq_curr;
		else
			ke->ke_runq = &kseq->ksq_idle;
		ke->ke_slice = SCHED_SLICE_MIN;
		break;
	default:
		panic("Unknown pri class.\n");
		break;
	}

	ke->ke_ksegrp->kg_runq_kses++;
	ke->ke_state = KES_ONRUNQ;

	runq_add(ke->ke_runq, ke);
	kseq_add(kseq, ke);
}

void
sched_rem(struct kse *ke)
{
	struct kseq *kseq;

	mtx_assert(&sched_lock, MA_OWNED);
	KASSERT((ke->ke_state == KES_ONRUNQ), ("KSE not on run queue"));

	ke->ke_state = KES_THREAD;
	ke->ke_ksegrp->kg_runq_kses--;
	kseq = KSEQ_CPU(ke->ke_cpu);
	runq_remove(ke->ke_runq, ke);
	kseq_rem(kseq, ke);
}

fixpt_t
sched_pctcpu(struct kse *ke)
{
	fixpt_t pctcpu;

	pctcpu = 0;

	mtx_lock_spin(&sched_lock);
	if (ke->ke_ticks) {
		int rtick;

		/* Update to account for time potentially spent sleeping */
		ke->ke_ltick = ticks;
		sched_pctcpu_update(ke);

		/* How many rtick per second ? */
		rtick = ke->ke_ticks / SCHED_CPU_TIME;
		pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT;
	}

	ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick;
	mtx_unlock_spin(&sched_lock);

	return (pctcpu);
}

int
sched_sizeof_kse(void)
{
	return (sizeof(struct kse) + sizeof(struct ke_sched));
}

int
sched_sizeof_ksegrp(void)
{
	return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
}

int
sched_sizeof_proc(void)
{
	return (sizeof(struct proc));
}

int
sched_sizeof_thread(void)
{
	return (sizeof(struct thread) + sizeof(struct td_sched));
}