1 /*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 1982, 1986, 1990, 1991, 1993
5 * The Regents of the University of California. All rights reserved.
6 * (c) UNIX System Laboratories, Inc.
7 * All or some portions of this file are derived from material licensed
8 * to the University of California by American Telephone and Telegraph
9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
10 * the permission of UNIX System Laboratories, Inc.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. Neither the name of the University nor the names of its contributors
21 * may be used to endorse or promote products derived from this software
22 * without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * SUCH DAMAGE.
35 */
36
37 #include <sys/cdefs.h>
38 #include "opt_hwpmc_hooks.h"
39 #include "opt_hwt_hooks.h"
40 #include "opt_sched.h"
41
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/cpuset.h>
45 #include <sys/kernel.h>
46 #include <sys/ktr.h>
47 #include <sys/lock.h>
48 #include <sys/kthread.h>
49 #include <sys/mutex.h>
50 #include <sys/proc.h>
51 #include <sys/resourcevar.h>
52 #include <sys/runq.h>
53 #include <sys/sched.h>
54 #include <sys/sdt.h>
55 #include <sys/smp.h>
56 #include <sys/sysctl.h>
57 #include <sys/sx.h>
58 #include <sys/turnstile.h>
59 #include <sys/umtxvar.h>
60 #include <machine/pcb.h>
61 #include <machine/smp.h>
62
63 #ifdef HWPMC_HOOKS
64 #include <sys/pmckern.h>
65 #endif
66
67 #ifdef HWT_HOOKS
68 #include <dev/hwt/hwt_hook.h>
69 #endif
70
71 #ifdef KDTRACE_HOOKS
72 #include <sys/dtrace_bsd.h>
73 int __read_mostly dtrace_vtime_active;
74 dtrace_vtime_switch_func_t dtrace_vtime_switch_func;
75 #endif
76
77 /*
78 * INVERSE_ESTCPU_WEIGHT is only suitable for statclock() frequencies in
79 * the range 100-256 Hz (approximately).
80 */
81 #ifdef SMP
82 #define INVERSE_ESTCPU_WEIGHT (8 * smp_cpus)
83 #else
84 #define INVERSE_ESTCPU_WEIGHT 8 /* 1 / (priorities per estcpu level). */
85 #endif
86 #define NICE_WEIGHT 1 /* Priorities per nice level. */
87 #define ESTCPULIM(e) \
88 min((e), INVERSE_ESTCPU_WEIGHT * \
89 (NICE_WEIGHT * (PRIO_MAX - PRIO_MIN) + \
90 PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) \
91 + INVERSE_ESTCPU_WEIGHT - 1)
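/*
 * For illustration only: on a UP kernel INVERSE_ESTCPU_WEIGHT is 8, so the
 * limit above expands to
 *	min((e), 8 * ((PRIO_MAX - PRIO_MIN) +
 *	    PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) + 7),
 * which keeps ts_estcpu / INVERSE_ESTCPU_WEIGHT within roughly the nice
 * span plus the timeshare priority span consumed by resetpriority().
 */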
92
93 #define TS_NAME_LEN (MAXCOMLEN + sizeof(" td ") + sizeof(__XSTRING(UINT_MAX)))
94
95 /*
96 * The schedulable entity that runs a context.
97 * This is an extension to the thread structure and is tailored to
98 * the requirements of this scheduler.
99 * All fields are protected by the scheduler lock.
100 */
101 struct td_sched {
102 fixpt_t ts_pctcpu; /* %cpu during p_swtime. */
103 u_int ts_estcpu; /* Estimated cpu utilization. */
104 int ts_cpticks; /* Ticks of cpu time. */
105 int ts_slptime; /* Seconds !RUNNING. */
106 int ts_slice; /* Remaining part of time slice. */
107 int ts_flags;
108 struct runq *ts_runq; /* runq the thread is currently on */
109 #ifdef KTR
110 char ts_name[TS_NAME_LEN];
111 #endif
112 };
113
114 /* flags kept in td_flags */
115 #define TDF_DIDRUN TDF_SCHED0 /* thread actually ran. */
116 #define TDF_BOUND TDF_SCHED1 /* Bound to one CPU. */
117 #define TDF_SLICEEND TDF_SCHED2 /* Thread time slice is over. */
118
119 /* flags kept in ts_flags */
120 #define TSF_AFFINITY 0x0001 /* Has a non-"full" CPU set. */
121
122 #define SKE_RUNQ_PCPU(ts) \
123 ((ts)->ts_runq != 0 && (ts)->ts_runq != &runq)
124
125 #define THREAD_CAN_SCHED(td, cpu) \
126 CPU_ISSET((cpu), &(td)->td_cpuset->cs_mask)
127
128 _Static_assert(sizeof(struct thread) + sizeof(struct td_sched) <=
129 sizeof(struct thread0_storage),
130 "increase struct thread0_storage.t0st_sched size");
131
132 static struct mtx sched_lock;
133
134 static int realstathz = 127; /* stathz is sometimes 0, so we run off of hz. */
135 static int sched_tdcnt; /* Total runnable threads in the system. */
136 static int sched_slice = 12; /* Thread run time before rescheduling. */
137
138 static void setup_runqs(void);
139 static void schedcpu(void);
140 static void schedcpu_thread(void);
141 static void sched_priority(struct thread *td, u_char prio);
142 static void sched_setup(void *dummy);
143 static void maybe_resched(struct thread *td);
144 static void updatepri(struct thread *td);
145 static void resetpriority(struct thread *td);
146 static void resetpriority_thread(struct thread *td);
147 #ifdef SMP
148 static int sched_pickcpu(struct thread *td);
149 static int forward_wakeup(int cpunum);
150 static void kick_other_cpu(int pri, int cpuid);
151 #endif
152
153 static struct kproc_desc sched_kp = {
154 "schedcpu",
155 schedcpu_thread,
156 NULL
157 };
158 SYSINIT(schedcpu, SI_SUB_LAST, SI_ORDER_FIRST, kproc_start,
159 &sched_kp);
160 SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL);
161
162 static void sched_initticks(void *dummy);
163 SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks,
164 NULL);
165
166 /*
167 * Global run queue.
168 */
169 static struct runq runq;
170
171 #ifdef SMP
172 /*
173 * Per-CPU run queues
174 */
175 static struct runq runq_pcpu[MAXCPU];
176 long runq_length[MAXCPU];
177
178 static cpuset_t idle_cpus_mask;
179 #endif
180
181 struct pcpuidlestat {
182 u_int idlecalls;
183 u_int oldidlecalls;
184 };
185 DPCPU_DEFINE_STATIC(struct pcpuidlestat, idlestat);
186
187 static void
188 setup_runqs(void)
189 {
190 #ifdef SMP
191 int i;
192
193 for (i = 0; i < MAXCPU; ++i)
194 runq_init(&runq_pcpu[i]);
195 #endif
196
197 runq_init(&runq);
198 }
199
200 static int
201 sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
202 {
203 int error, new_val, period;
204
205 period = 1000000 / realstathz;
206 new_val = period * sched_slice;
207 error = sysctl_handle_int(oidp, &new_val, 0, req);
208 if (error != 0 || req->newptr == NULL)
209 return (error);
210 if (new_val <= 0)
211 return (EINVAL);
212 sched_slice = imax(1, (new_val + period / 2) / period);
213 hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) /
214 realstathz);
215 return (0);
216 }
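/*
 * Worked example (using the defaults above, realstathz = 127 and
 * sched_slice = 12): a read of kern.sched.quantum reports
 * (1000000 / 127) * 12 = 94488us, and writing e.g. 50000us back yields
 * imax(1, (50000 + 3937) / 7874) = 6 stathz ticks for sched_slice.
 */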
217
218 SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
219 "Scheduler");
220
221 SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "4BSD", 0,
222 "Scheduler name");
223 SYSCTL_PROC(_kern_sched, OID_AUTO, quantum,
224 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
225 sysctl_kern_quantum, "I",
226 "Quantum for timeshare threads in microseconds");
227 SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0,
228 "Quantum for timeshare threads in stathz ticks");
229 #ifdef SMP
230 /* Enable forwarding of wakeups to all other cpus */
231 static SYSCTL_NODE(_kern_sched, OID_AUTO, ipiwakeup,
232 CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
233 "Kernel SMP");
234
235 static int runq_fuzz = 1;
236 SYSCTL_INT(_kern_sched, OID_AUTO, runq_fuzz, CTLFLAG_RW, &runq_fuzz, 0, "");
237
238 static int forward_wakeup_enabled = 1;
239 SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, enabled, CTLFLAG_RW,
240 &forward_wakeup_enabled, 0,
241 "Forwarding of wakeup to idle CPUs");
242
243 static int forward_wakeups_requested = 0;
244 SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, requested, CTLFLAG_RD,
245 &forward_wakeups_requested, 0,
246 "Requests for Forwarding of wakeup to idle CPUs");
247
248 static int forward_wakeups_delivered = 0;
249 SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, delivered, CTLFLAG_RD,
250 &forward_wakeups_delivered, 0,
251 "Completed Forwarding of wakeup to idle CPUs");
252
253 static int forward_wakeup_use_mask = 1;
254 SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, usemask, CTLFLAG_RW,
255 &forward_wakeup_use_mask, 0,
256 "Use the mask of idle cpus");
257
258 static int forward_wakeup_use_loop = 0;
259 SYSCTL_INT(_kern_sched_ipiwakeup, OID_AUTO, useloop, CTLFLAG_RW,
260 &forward_wakeup_use_loop, 0,
261 "Use a loop to find idle cpus");
262
263 #endif
264 #if 0
265 static int sched_followon = 0;
266 SYSCTL_INT(_kern_sched, OID_AUTO, followon, CTLFLAG_RW,
267 &sched_followon, 0,
268 "allow threads to share a quantum");
269 #endif
270
271 SDT_PROVIDER_DEFINE(sched);
272
273 SDT_PROBE_DEFINE3(sched, , , change__pri, "struct thread *",
274 "struct proc *", "uint8_t");
275 SDT_PROBE_DEFINE3(sched, , , dequeue, "struct thread *",
276 "struct proc *", "void *");
277 SDT_PROBE_DEFINE4(sched, , , enqueue, "struct thread *",
278 "struct proc *", "void *", "int");
279 SDT_PROBE_DEFINE4(sched, , , lend__pri, "struct thread *",
280 "struct proc *", "uint8_t", "struct thread *");
281 SDT_PROBE_DEFINE2(sched, , , load__change, "int", "int");
282 SDT_PROBE_DEFINE2(sched, , , off__cpu, "struct thread *",
283 "struct proc *");
284 SDT_PROBE_DEFINE(sched, , , on__cpu);
285 SDT_PROBE_DEFINE(sched, , , remain__cpu);
286 SDT_PROBE_DEFINE2(sched, , , surrender, "struct thread *",
287 "struct proc *");
288
289 static __inline void
290 sched_load_add(void)
291 {
292
293 sched_tdcnt++;
294 KTR_COUNTER0(KTR_SCHED, "load", "global load", sched_tdcnt);
295 SDT_PROBE2(sched, , , load__change, NOCPU, sched_tdcnt);
296 }
297
298 static __inline void
299 sched_load_rem(void)
300 {
301
302 sched_tdcnt--;
303 KTR_COUNTER0(KTR_SCHED, "load", "global load", sched_tdcnt);
304 SDT_PROBE2(sched, , , load__change, NOCPU, sched_tdcnt);
305 }
306 /*
307 * Arrange to reschedule if necessary, taking the priorities and
308 * schedulers into account.
309 */
310 static void
311 maybe_resched(struct thread *td)
312 {
313
314 THREAD_LOCK_ASSERT(td, MA_OWNED);
315 if (td->td_priority < curthread->td_priority)
316 ast_sched_locked(curthread, TDA_SCHED);
317 }
318
319 /*
320 * This function is called when a thread is about to be put on run queue
321 * because it has been made runnable or its priority has been adjusted. It
322 * determines if the new thread should preempt the current thread. If so,
323 * it sets td_owepreempt to request a preemption.
324 */
325 int
326 maybe_preempt(struct thread *td)
327 {
328 #ifdef PREEMPTION
329 struct thread *ctd;
330 int cpri, pri;
331
332 /*
333 * The new thread should not preempt the current thread if any of the
334 * following conditions are true:
335 *
336 * - The kernel is in the throes of crashing (panicstr).
337 * - The current thread has a higher (numerically lower) or
338 * equivalent priority. Note that this prevents curthread from
339 * trying to preempt to itself.
340 * - The current thread has an inhibitor set or is in the process of
341 * exiting. In this case, the current thread is about to switch
342 * out anyway, so there's no point in preempting. If we did,
343 * the current thread would not be properly resumed as well, so
344 * just avoid that whole landmine.
345 * - If the new thread's priority is not a realtime priority and
346 * the current thread's priority is not an idle priority and
347 * FULL_PREEMPTION is disabled.
348 *
349 * If all of these conditions are false, but the current thread is in
350 * a nested critical section, then we have to defer the preemption
351 * until we exit the critical section. Otherwise, switch immediately
352 * to the new thread.
353 */
354 ctd = curthread;
355 THREAD_LOCK_ASSERT(td, MA_OWNED);
356 KASSERT((td->td_inhibitors == 0),
357 ("maybe_preempt: trying to run inhibited thread"));
358 pri = td->td_priority;
359 cpri = ctd->td_priority;
360 if (KERNEL_PANICKED() || pri >= cpri /* || dumping */ ||
361 TD_IS_INHIBITED(ctd))
362 return (0);
363 #ifndef FULL_PREEMPTION
364 if (pri > PRI_MAX_ITHD && cpri < PRI_MIN_IDLE)
365 return (0);
366 #endif
367
368 CTR0(KTR_PROC, "maybe_preempt: scheduling preemption");
369 ctd->td_owepreempt = 1;
370 return (1);
371 #else
372 return (0);
373 #endif
374 }
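/*
 * Put differently: with FULL_PREEMPTION disabled, the check above only
 * requests an immediate preemption when the incoming thread runs at
 * interrupt-thread priority or better, or when the thread currently on
 * the CPU belongs to the idle class; everything else waits for the next
 * natural switch point.
 */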
375
376 /*
377 * Constants for digital decay and forget:
378 * 90% of (ts_estcpu) usage in 5 * loadav time
379 * 95% of (ts_pctcpu) usage in 60 seconds (load insensitive)
380 * Note that, as ps(1) mentions, this can let percentages
381 * total over 100% (I've seen 137.9% for 3 processes).
382 *
383 * Note that schedclock() updates ts_estcpu and p_cpticks asynchronously.
384 *
385 * We wish to decay away 90% of ts_estcpu in (5 * loadavg) seconds.
386 * That is, the system wants to compute a value of decay such
387 * that the following for loop:
388 * for (i = 0; i < (5 * loadavg); i++)
389 * ts_estcpu *= decay;
390 * will compute
391 * ts_estcpu *= 0.1;
392 * for all values of loadavg:
393 *
394 * Mathematically this loop can be expressed by saying:
395 * decay ** (5 * loadavg) ~= .1
396 *
397 * The system computes decay as:
398 * decay = (2 * loadavg) / (2 * loadavg + 1)
399 *
400 * We wish to prove that the system's computation of decay
401 * will always fulfill the equation:
402 * decay ** (5 * loadavg) ~= .1
403 *
404 * If we compute b as:
405 * b = 2 * loadavg
406 * then
407 * decay = b / (b + 1)
408 *
409 * We now need to prove two things:
410 * 1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
411 * 2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
412 *
413 * Facts:
414 * For x close to zero, exp(x) =~ 1 + x, since
415 * exp(x) = 0! + x**1/1! + x**2/2! + ... .
416 * therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
417 * For x close to zero, ln(1+x) =~ x, since
418 * ln(1+x) = x - x**2/2 + x**3/3 - ... -1 < x < 1
419 * therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
420 * ln(.1) =~ -2.30
421 *
422 * Proof of (1):
423 * Solve (factor)**(power) =~ .1 given power (5*loadav):
424 * solving for factor,
425 * ln(factor) =~ (-2.30/5*loadav), or
426 * factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
427 * exp(-1/b) =~ (b-1)/b =~ b/(b+1). QED
428 *
429 * Proof of (2):
430 * Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
431 * solving for power,
432 * power*ln(b/(b+1)) =~ -2.30, or
433 * power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav. QED
434 *
435 * Actual power values for the implemented algorithm are as follows:
436 * loadav: 1 2 3 4
437 * power: 5.68 10.32 14.94 19.55
438 */
439
440 /* calculations for digital decay to forget 90% of usage in 5*loadav sec */
441 #define loadfactor(loadav) (2 * (loadav))
442 #define decay_cpu(loadfac, cpu) (((loadfac) * (cpu)) / ((loadfac) + FSCALE))
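/*
 * For illustration: with a load average of 1.0, loadfac is 2 * FSCALE and
 * decay_cpu() reduces to (2 * FSCALE * (cpu)) / (3 * FSCALE) = 2 * (cpu) / 3,
 * i.e. each schedcpu() pass multiplies ts_estcpu by roughly 2/3.
 */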
443
444 /* decay 95% of `ts_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
445 static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
446 SYSCTL_UINT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0,
447 "Decay factor used for updating %CPU");
448
449 /*
450 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
451 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
452 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
453 *
454 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
455 * 1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
456 *
457 * If you don't want to bother with the faster/more-accurate formula, you
458 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
459 * (more general) method of calculating the %age of CPU used by a process.
460 */
461 #define CCPU_SHIFT 11
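/*
 * Rough steady-state check (assuming FSHIFT == 11, i.e. FSCALE == 2048):
 * with realstathz == 100 a fully CPU-bound thread contributes its ~100
 * statclock ticks per second directly (FSHIFT - CCPU_SHIFT == 0), so
 * ts_pctcpu converges on p with p == (p * ccpu) / FSCALE + 100, giving
 * p =~ 2050 =~ FSCALE, which ps(1) reports as roughly 100%.
 */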
462
463 /*
464 * Recompute process priorities, every hz ticks.
465 * MP-safe, called without the Giant mutex.
466 */
467 /* ARGSUSED */
468 static void
469 schedcpu(void)
470 {
471 fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
472 struct thread *td;
473 struct proc *p;
474 struct td_sched *ts;
475 int awake;
476
477 sx_slock(&allproc_lock);
478 FOREACH_PROC_IN_SYSTEM(p) {
479 PROC_LOCK(p);
480 if (p->p_state == PRS_NEW) {
481 PROC_UNLOCK(p);
482 continue;
483 }
484 FOREACH_THREAD_IN_PROC(p, td) {
485 awake = 0;
486 ts = td_get_sched(td);
487 thread_lock(td);
488 /*
489 * Increment sleep time (if sleeping). We
490 * ignore overflow, as above.
491 */
492 /*
493 * The td_sched slptimes are not touched in wakeup
494 * because the thread may not HAVE everything in
495 * memory? XXX I think this is out of date.
496 */
497 if (TD_ON_RUNQ(td)) {
498 awake = 1;
499 td->td_flags &= ~TDF_DIDRUN;
500 } else if (TD_IS_RUNNING(td)) {
501 awake = 1;
502 /* Do not clear TDF_DIDRUN */
503 } else if (td->td_flags & TDF_DIDRUN) {
504 awake = 1;
505 td->td_flags &= ~TDF_DIDRUN;
506 }
507
508 /*
509 * ts_pctcpu is only for ps and ttyinfo().
510 */
511 ts->ts_pctcpu = (ts->ts_pctcpu * ccpu) >> FSHIFT;
512 /*
513 * If the td_sched has been idle the entire second,
514 * stop recalculating its priority until
515 * it wakes up.
516 */
517 if (ts->ts_cpticks != 0) {
518 #if (FSHIFT >= CCPU_SHIFT)
519 ts->ts_pctcpu += (realstathz == 100)
520 ? ((fixpt_t) ts->ts_cpticks) <<
521 (FSHIFT - CCPU_SHIFT) :
522 100 * (((fixpt_t) ts->ts_cpticks)
523 << (FSHIFT - CCPU_SHIFT)) / realstathz;
524 #else
525 ts->ts_pctcpu += ((FSCALE - ccpu) *
526 (ts->ts_cpticks *
527 FSCALE / realstathz)) >> FSHIFT;
528 #endif
529 ts->ts_cpticks = 0;
530 }
531 /*
532 * If there are ANY running threads in this process,
533 * then don't count it as sleeping.
534 * XXX: this is broken.
535 */
536 if (awake) {
537 if (ts->ts_slptime > 1) {
538 /*
539 * In an ideal world, this should not
540 * happen, because whoever woke us
541 * up from the long sleep should have
542 * unwound the slptime and reset our
543 * priority before we run at the stale
544 * priority. Should KASSERT at some
545 * point when all the cases are fixed.
546 */
547 updatepri(td);
548 }
549 ts->ts_slptime = 0;
550 } else
551 ts->ts_slptime++;
552 if (ts->ts_slptime > 1) {
553 thread_unlock(td);
554 continue;
555 }
556 ts->ts_estcpu = decay_cpu(loadfac, ts->ts_estcpu);
557 resetpriority(td);
558 resetpriority_thread(td);
559 thread_unlock(td);
560 }
561 PROC_UNLOCK(p);
562 }
563 sx_sunlock(&allproc_lock);
564 }
565
566 /*
567 * Main loop for a kthread that executes schedcpu once a second.
568 */
569 static void
570 schedcpu_thread(void)
571 {
572
573 for (;;) {
574 schedcpu();
575 pause("-", hz);
576 }
577 }
578
579 /*
580 * Recalculate the priority of a process after it has slept for a while.
581 * For all load averages >= 1 and max ts_estcpu of 255, sleeping for at
582 * least six times the loadfactor will decay ts_estcpu to zero.
583 */
584 static void
585 updatepri(struct thread *td)
586 {
587 struct td_sched *ts;
588 fixpt_t loadfac;
589 unsigned int newcpu;
590
591 ts = td_get_sched(td);
592 loadfac = loadfactor(averunnable.ldavg[0]);
593 if (ts->ts_slptime > 5 * loadfac)
594 ts->ts_estcpu = 0;
595 else {
596 newcpu = ts->ts_estcpu;
597 ts->ts_slptime--; /* was incremented in schedcpu() */
598 while (newcpu && --ts->ts_slptime)
599 newcpu = decay_cpu(loadfac, newcpu);
600 ts->ts_estcpu = newcpu;
601 }
602 }
603
604 /*
605 * Compute the priority of a process when running in user mode.
606 * Arrange to reschedule if the resulting priority is better
607 * than that of the current process.
608 */
609 static void
610 resetpriority(struct thread *td)
611 {
612 u_int newpriority;
613
614 if (td->td_pri_class != PRI_TIMESHARE)
615 return;
616 newpriority = PUSER +
617 td_get_sched(td)->ts_estcpu / INVERSE_ESTCPU_WEIGHT +
618 NICE_WEIGHT * (td->td_proc->p_nice - PRIO_MIN);
619 newpriority = min(max(newpriority, PRI_MIN_TIMESHARE),
620 PRI_MAX_TIMESHARE);
621 sched_user_prio(td, newpriority);
622 }
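/*
 * For illustration, assuming the usual PRIO_MIN of -20: a freshly started
 * nice-0 thread with ts_estcpu == 0 gets PUSER + NICE_WEIGHT * 20, and
 * accumulating another INVERSE_ESTCPU_WEIGHT of ts_estcpu pushes the
 * computed user priority one step closer to PRI_MAX_TIMESHARE.
 */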
623
624 /*
625 * Update the thread's priority when the associated process's user
626 * priority changes.
627 */
628 static void
629 resetpriority_thread(struct thread *td)
630 {
631
632 /* Only change threads with a time sharing user priority. */
633 if (td->td_priority < PRI_MIN_TIMESHARE ||
634 td->td_priority > PRI_MAX_TIMESHARE)
635 return;
636
637 /* XXX the whole needresched thing is broken, but not silly. */
638 maybe_resched(td);
639
640 sched_prio(td, td->td_user_pri);
641 }
642
643 /* ARGSUSED */
644 static void
645 sched_setup(void *dummy)
646 {
647
648 setup_runqs();
649
650 /* Account for thread0. */
651 sched_load_add();
652 }
653
654 /*
655 * This routine determines time constants after stathz and hz are setup.
656 */
657 static void
658 sched_initticks(void *dummy)
659 {
660
661 realstathz = stathz ? stathz : hz;
662 sched_slice = realstathz / 10; /* ~100ms */
663 hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) /
664 realstathz);
665 }
666
667 /* External interfaces start here */
668
669 /*
670 * Very early in the boot some setup of scheduler-specific
671 * parts of proc0 and of some scheduler resources needs to be done.
672 * Called from:
673 * proc0_init()
674 */
675 void
676 schedinit(void)
677 {
678
679 /*
680 * Set up the scheduler specific parts of thread0.
681 */
682 thread0.td_lock = &sched_lock;
683 td_get_sched(&thread0)->ts_slice = sched_slice;
684 mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN);
685 }
686
687 void
688 schedinit_ap(void)
689 {
690
691 /* Nothing needed. */
692 }
693
694 bool
695 sched_runnable(void)
696 {
697 #ifdef SMP
698 return (runq_not_empty(&runq) ||
699 runq_not_empty(&runq_pcpu[PCPU_GET(cpuid)]));
700 #else
701 return (runq_not_empty(&runq));
702 #endif
703 }
704
705 int
706 sched_rr_interval(void)
707 {
708
709 /* Convert sched_slice from stathz to hz. */
710 return (imax(1, (sched_slice * hz + realstathz / 2) / realstathz));
711 }
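/*
 * For example, with the defaults above (sched_slice = 12, realstathz = 127)
 * and assuming hz = 1000, this returns (12 * 1000 + 63) / 127 = 94 ticks,
 * i.e. a round-robin interval of roughly 94ms.
 */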
712
713 SCHED_STAT_DEFINE(ithread_demotions, "Interrupt thread priority demotions");
714 SCHED_STAT_DEFINE(ithread_preemptions,
715 "Interrupt thread preemptions due to time-sharing");
716
717 /*
718 * We adjust the priority of the current process. The priority of a
719 * process gets worse as it accumulates CPU time. The cpu usage
720 * estimator (ts_estcpu) is increased here. resetpriority() will
721 * compute a different priority each time ts_estcpu increases by
722 * INVERSE_ESTCPU_WEIGHT (until PRI_MAX_TIMESHARE is reached). The
723 * cpu usage estimator ramps up quite quickly when the process is
724 * running (linearly), and decays away exponentially, at a rate which
725 * is proportionally slower when the system is busy. The basic
726 * principle is that the system will 90% forget that the process used
727 * a lot of CPU time in 5 * loadav seconds. This causes the system to
728 * favor processes which haven't run much recently, and to round-robin
729 * among other processes.
730 */
731 static void
732 sched_clock_tick(struct thread *td)
733 {
734 struct pcpuidlestat *stat;
735 struct td_sched *ts;
736
737 THREAD_LOCK_ASSERT(td, MA_OWNED);
738 ts = td_get_sched(td);
739
740 ts->ts_cpticks++;
741 ts->ts_estcpu = ESTCPULIM(ts->ts_estcpu + 1);
742 if ((ts->ts_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
743 resetpriority(td);
744 resetpriority_thread(td);
745 }
746
747 /*
748 * Force a context switch if the current thread has used up a full
749 * time slice (default is 100ms).
750 */
751 if (!TD_IS_IDLETHREAD(td) && --ts->ts_slice <= 0) {
752 ts->ts_slice = sched_slice;
753
754 /*
755 * If an ithread uses a full quantum, demote its
756 * priority and preempt it.
757 */
758 if (PRI_BASE(td->td_pri_class) == PRI_ITHD) {
759 SCHED_STAT_INC(ithread_preemptions);
760 td->td_owepreempt = 1;
761 if (td->td_base_pri + RQ_PPQ < PRI_MAX_ITHD) {
762 SCHED_STAT_INC(ithread_demotions);
763 sched_prio(td, td->td_base_pri + RQ_PPQ);
764 }
765 } else {
766 td->td_flags |= TDF_SLICEEND;
767 ast_sched_locked(td, TDA_SCHED);
768 }
769 }
770
771 stat = DPCPU_PTR(idlestat);
772 stat->oldidlecalls = stat->idlecalls;
773 stat->idlecalls = 0;
774 }
775
776 void
777 sched_clock(struct thread *td, int cnt)
778 {
779
780 for ( ; cnt > 0; cnt--)
781 sched_clock_tick(td);
782 }
783
784 /*
785 * Charge child's scheduling CPU usage to parent.
786 */
787 void
788 sched_exit(struct proc *p, struct thread *td)
789 {
790
791 KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "proc exit",
792 "prio:%d", td->td_priority);
793
794 PROC_LOCK_ASSERT(p, MA_OWNED);
795 sched_exit_thread(FIRST_THREAD_IN_PROC(p), td);
796 }
797
798 void
799 sched_exit_thread(struct thread *td, struct thread *child)
800 {
801
802 KTR_STATE1(KTR_SCHED, "thread", sched_tdname(child), "exit",
803 "prio:%d", child->td_priority);
804 thread_lock(td);
805 td_get_sched(td)->ts_estcpu = ESTCPULIM(td_get_sched(td)->ts_estcpu +
806 td_get_sched(child)->ts_estcpu);
807 thread_unlock(td);
808 thread_lock(child);
809 if ((child->td_flags & TDF_NOLOAD) == 0)
810 sched_load_rem();
811 thread_unlock(child);
812 }
813
814 void
815 sched_fork(struct thread *td, struct thread *childtd)
816 {
817 sched_fork_thread(td, childtd);
818 }
819
820 void
821 sched_fork_thread(struct thread *td, struct thread *childtd)
822 {
823 struct td_sched *ts, *tsc;
824
825 childtd->td_oncpu = NOCPU;
826 childtd->td_lastcpu = NOCPU;
827 childtd->td_lock = &sched_lock;
828 childtd->td_cpuset = cpuset_ref(td->td_cpuset);
829 childtd->td_domain.dr_policy = td->td_cpuset->cs_domain;
830 childtd->td_priority = childtd->td_base_pri;
831 ts = td_get_sched(childtd);
832 bzero(ts, sizeof(*ts));
833 tsc = td_get_sched(td);
834 ts->ts_estcpu = tsc->ts_estcpu;
835 ts->ts_flags |= (tsc->ts_flags & TSF_AFFINITY);
836 ts->ts_slice = 1;
837 }
838
839 void
840 sched_nice(struct proc *p, int nice)
841 {
842 struct thread *td;
843
844 PROC_LOCK_ASSERT(p, MA_OWNED);
845 p->p_nice = nice;
846 FOREACH_THREAD_IN_PROC(p, td) {
847 thread_lock(td);
848 resetpriority(td);
849 resetpriority_thread(td);
850 thread_unlock(td);
851 }
852 }
853
854 void
855 sched_class(struct thread *td, int class)
856 {
857 THREAD_LOCK_ASSERT(td, MA_OWNED);
858 td->td_pri_class = class;
859 }
860
861 /*
862 * Adjust the priority of a thread.
863 */
864 static void
865 sched_priority(struct thread *td, u_char prio)
866 {
867
868 KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "priority change",
869 "prio:%d", td->td_priority, "new prio:%d", prio, KTR_ATTR_LINKED,
870 sched_tdname(curthread));
871 SDT_PROBE3(sched, , , change__pri, td, td->td_proc, prio);
872 if (td != curthread && prio > td->td_priority) {
873 KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread),
874 "lend prio", "prio:%d", td->td_priority, "new prio:%d",
875 prio, KTR_ATTR_LINKED, sched_tdname(td));
876 SDT_PROBE4(sched, , , lend__pri, td, td->td_proc, prio,
877 curthread);
878 }
879 THREAD_LOCK_ASSERT(td, MA_OWNED);
880 if (td->td_priority == prio)
881 return;
882 td->td_priority = prio;
883 if (TD_ON_RUNQ(td) && td->td_rqindex != RQ_PRI_TO_QUEUE_IDX(prio)) {
884 sched_rem(td);
885 sched_add(td, SRQ_BORING | SRQ_HOLDTD);
886 }
887 }
888
889 /*
890 * Update a thread's priority when it is lent another thread's
891 * priority.
892 */
893 void
894 sched_lend_prio(struct thread *td, u_char prio)
895 {
896
897 td->td_flags |= TDF_BORROWING;
898 sched_priority(td, prio);
899 }
900
901 /*
902 * Restore a thread's priority when priority propagation is
903 * over. The prio argument is the minimum priority the thread
904 * needs to have to satisfy other possible priority lending
905 * requests. If the thread's regular priority is less
906 * important than prio, the thread will keep a priority boost
907 * of prio.
908 */
909 void
910 sched_unlend_prio(struct thread *td, u_char prio)
911 {
912 u_char base_pri;
913
914 if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
915 td->td_base_pri <= PRI_MAX_TIMESHARE)
916 base_pri = td->td_user_pri;
917 else
918 base_pri = td->td_base_pri;
919 if (prio >= base_pri) {
920 td->td_flags &= ~TDF_BORROWING;
921 sched_prio(td, base_pri);
922 } else
923 sched_lend_prio(td, prio);
924 }
925
926 void
927 sched_prio(struct thread *td, u_char prio)
928 {
929 u_char oldprio;
930
931 /* First, update the base priority. */
932 td->td_base_pri = prio;
933
934 /*
935 * If the thread is borrowing another thread's priority, don't ever
936 * lower the priority.
937 */
938 if (td->td_flags & TDF_BORROWING && td->td_priority < prio)
939 return;
940
941 /* Change the real priority. */
942 oldprio = td->td_priority;
943 sched_priority(td, prio);
944
945 /*
946 * If the thread is on a turnstile, then let the turnstile update
947 * its state.
948 */
949 if (TD_ON_LOCK(td) && oldprio != prio)
950 turnstile_adjust(td, oldprio);
951 }
952
953 void
954 sched_ithread_prio(struct thread *td, u_char prio)
955 {
956 THREAD_LOCK_ASSERT(td, MA_OWNED);
957 MPASS(td->td_pri_class == PRI_ITHD);
958 td->td_base_ithread_pri = prio;
959 sched_prio(td, prio);
960 }
961
962 void
963 sched_user_prio(struct thread *td, u_char prio)
964 {
965
966 THREAD_LOCK_ASSERT(td, MA_OWNED);
967 td->td_base_user_pri = prio;
968 if (td->td_lend_user_pri <= prio)
969 return;
970 td->td_user_pri = prio;
971 }
972
973 void
974 sched_lend_user_prio(struct thread *td, u_char prio)
975 {
976
977 THREAD_LOCK_ASSERT(td, MA_OWNED);
978 td->td_lend_user_pri = prio;
979 td->td_user_pri = min(prio, td->td_base_user_pri);
980 if (td->td_priority > td->td_user_pri)
981 sched_prio(td, td->td_user_pri);
982 else if (td->td_priority != td->td_user_pri)
983 ast_sched_locked(td, TDA_SCHED);
984 }
985
986 /*
987 * Like the above but first check if there is anything to do.
988 */
989 void
990 sched_lend_user_prio_cond(struct thread *td, u_char prio)
991 {
992
993 if (td->td_lend_user_pri == prio)
994 return;
995
996 thread_lock(td);
997 sched_lend_user_prio(td, prio);
998 thread_unlock(td);
999 }
1000
1001 void
1002 sched_sleep(struct thread *td, int pri)
1003 {
1004
1005 THREAD_LOCK_ASSERT(td, MA_OWNED);
1006 td->td_slptick = ticks;
1007 td_get_sched(td)->ts_slptime = 0;
1008 if (pri != 0 && PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
1009 sched_prio(td, pri);
1010 }
1011
1012 void
1013 sched_switch(struct thread *td, int flags)
1014 {
1015 struct thread *newtd;
1016 struct mtx *tmtx;
1017 int preempted;
1018
1019 tmtx = &sched_lock;
1020
1021 THREAD_LOCK_ASSERT(td, MA_OWNED);
1022
1023 td->td_lastcpu = td->td_oncpu;
1024 preempted = (td->td_flags & TDF_SLICEEND) == 0 &&
1025 (flags & SW_PREEMPT) != 0;
1026 td->td_flags &= ~TDF_SLICEEND;
1027 ast_unsched_locked(td, TDA_SCHED);
1028 td->td_owepreempt = 0;
1029 td->td_oncpu = NOCPU;
1030
1031 /*
1032 * At the last moment, if this thread is still marked RUNNING,
1033 * then put it back on the run queue as it has not been suspended
1034 * or stopped or anything else similar. We never put the idle
1035 * threads on the run queue, however.
1036 */
1037 if (td->td_flags & TDF_IDLETD) {
1038 TD_SET_CAN_RUN(td);
1039 #ifdef SMP
1040 CPU_CLR(PCPU_GET(cpuid), &idle_cpus_mask);
1041 #endif
1042 } else {
1043 if (TD_IS_RUNNING(td)) {
1044 /* Put us back on the run queue. */
1045 sched_add(td, SRQ_HOLDTD | SRQ_OURSELF | SRQ_YIELDING |
1046 (preempted ? SRQ_PREEMPTED : 0));
1047 }
1048 }
1049
1050 /*
1051 * Switch to the sched lock to fix things up and pick
1052 * a new thread. Block the td_lock in order to avoid
1053 * breaking the critical path.
1054 */
1055 if (td->td_lock != &sched_lock) {
1056 mtx_lock_spin(&sched_lock);
1057 tmtx = thread_lock_block(td);
1058 mtx_unlock_spin(tmtx);
1059 }
1060
1061 if ((td->td_flags & TDF_NOLOAD) == 0)
1062 sched_load_rem();
1063
1064 newtd = choosethread();
1065 MPASS(newtd->td_lock == &sched_lock);
1066
1067 #if (KTR_COMPILE & KTR_SCHED) != 0
1068 if (TD_IS_IDLETHREAD(td))
1069 KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "idle",
1070 "prio:%d", td->td_priority);
1071 else
1072 KTR_STATE3(KTR_SCHED, "thread", sched_tdname(td), KTDSTATE(td),
1073 "prio:%d", td->td_priority, "wmesg:\"%s\"", td->td_wmesg,
1074 "lockname:\"%s\"", td->td_lockname);
1075 #endif
1076
1077 if (td != newtd) {
1078 #ifdef HWPMC_HOOKS
1079 if (PMC_PROC_IS_USING_PMCS(td->td_proc))
1080 PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
1081 #endif
1082
1083 #ifdef HWT_HOOKS
1084 HWT_CALL_HOOK(td, HWT_SWITCH_OUT, NULL);
1085 HWT_CALL_HOOK(newtd, HWT_SWITCH_IN, NULL);
1086 #endif
1087
1088 SDT_PROBE2(sched, , , off__cpu, newtd, newtd->td_proc);
1089
1090 /* I feel sleepy */
1091 lock_profile_release_lock(&sched_lock.lock_object, true);
1092 #ifdef KDTRACE_HOOKS
1093 /*
1094 * If DTrace has set the active vtime enum to anything
1095 * other than INACTIVE (0), then it should have set the
1096 * function to call.
1097 */
1098 if (dtrace_vtime_active)
1099 (*dtrace_vtime_switch_func)(newtd);
1100 #endif
1101
1102 cpu_switch(td, newtd, tmtx);
1103 lock_profile_obtain_lock_success(&sched_lock.lock_object, true,
1104 0, 0, __FILE__, __LINE__);
1105 /*
1106 * Where am I? What year is it?
1107 * We are in the same thread that went to sleep above,
1108 * but any amount of time may have passed. All our context
1109 * will still be available as will local variables.
1110 * PCPU values however may have changed as we may have
1111 * changed CPU so don't trust cached values of them.
1112 * New threads will go to fork_exit() instead of here
1113 * so if you change things here you may need to change
1114 * things there too.
1115 *
1116 * If the thread above was exiting it will never wake
1117 * up again here, so either it has saved everything it
1118 * needed to, or the thread_wait() or wait() will
1119 * need to reap it.
1120 */
1121
1122 SDT_PROBE0(sched, , , on__cpu);
1123 #ifdef HWPMC_HOOKS
1124 if (PMC_PROC_IS_USING_PMCS(td->td_proc))
1125 PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
1126 #endif
1127 } else {
1128 td->td_lock = &sched_lock;
1129 SDT_PROBE0(sched, , , remain__cpu);
1130 }
1131
1132 KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running",
1133 "prio:%d", td->td_priority);
1134
1135 #ifdef SMP
1136 if (td->td_flags & TDF_IDLETD)
1137 CPU_SET(PCPU_GET(cpuid), &idle_cpus_mask);
1138 #endif
1139 sched_lock.mtx_lock = (uintptr_t)td;
1140 td->td_oncpu = PCPU_GET(cpuid);
1141 spinlock_enter();
1142 mtx_unlock_spin(&sched_lock);
1143 }
1144
1145 void
1146 sched_wakeup(struct thread *td, int srqflags)
1147 {
1148 struct td_sched *ts;
1149
1150 THREAD_LOCK_ASSERT(td, MA_OWNED);
1151 ts = td_get_sched(td);
1152 if (ts->ts_slptime > 1) {
1153 updatepri(td);
1154 resetpriority(td);
1155 }
1156 td->td_slptick = 0;
1157 ts->ts_slptime = 0;
1158 ts->ts_slice = sched_slice;
1159
1160 /*
1161 * When resuming an idle ithread, restore its base ithread
1162 * priority.
1163 */
1164 if (PRI_BASE(td->td_pri_class) == PRI_ITHD &&
1165 td->td_base_pri != td->td_base_ithread_pri)
1166 sched_prio(td, td->td_base_ithread_pri);
1167
1168 sched_add(td, srqflags);
1169 }
1170
1171 #ifdef SMP
1172 static int
1173 forward_wakeup(int cpunum)
1174 {
1175 struct pcpu *pc;
1176 cpuset_t dontuse, map, map2;
1177 u_int id, me;
1178 int iscpuset;
1179
1180 mtx_assert(&sched_lock, MA_OWNED);
1181
1182 CTR0(KTR_RUNQ, "forward_wakeup()");
1183
1184 if ((!forward_wakeup_enabled) ||
1185 (forward_wakeup_use_mask == 0 && forward_wakeup_use_loop == 0))
1186 return (0);
1187 if (!smp_started || KERNEL_PANICKED())
1188 return (0);
1189
1190 forward_wakeups_requested++;
1191
1192 /*
1193 * Check the idle mask we received against what we calculated
1194 * before in the old version.
1195 */
1196 me = PCPU_GET(cpuid);
1197
1198 /* Don't bother if we should be doing it ourself. */
1199 if (CPU_ISSET(me, &idle_cpus_mask) &&
1200 (cpunum == NOCPU || me == cpunum))
1201 return (0);
1202
1203 CPU_SETOF(me, &dontuse);
1204 CPU_OR(&dontuse, &dontuse, &stopped_cpus);
1205 CPU_OR(&dontuse, &dontuse, &hlt_cpus_mask);
1206 CPU_ZERO(&map2);
1207 if (forward_wakeup_use_loop) {
1208 STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
1209 id = pc->pc_cpuid;
1210 if (!CPU_ISSET(id, &dontuse) &&
1211 pc->pc_curthread == pc->pc_idlethread) {
1212 CPU_SET(id, &map2);
1213 }
1214 }
1215 }
1216
1217 if (forward_wakeup_use_mask) {
1218 map = idle_cpus_mask;
1219 CPU_ANDNOT(&map, &map, &dontuse);
1220
1221 /* If they are both on, compare and use loop if different. */
1222 if (forward_wakeup_use_loop) {
1223 if (CPU_CMP(&map, &map2)) {
1224 printf("map != map2, loop method preferred\n");
1225 map = map2;
1226 }
1227 }
1228 } else {
1229 map = map2;
1230 }
1231
1232 /* If we only allow a specific CPU, then mask off all the others. */
1233 if (cpunum != NOCPU) {
1234 KASSERT((cpunum <= mp_maxcpus),("forward_wakeup: bad cpunum."));
1235 iscpuset = CPU_ISSET(cpunum, &map);
1236 if (iscpuset == 0)
1237 CPU_ZERO(&map);
1238 else
1239 CPU_SETOF(cpunum, &map);
1240 }
1241 if (!CPU_EMPTY(&map)) {
1242 forward_wakeups_delivered++;
1243 STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
1244 id = pc->pc_cpuid;
1245 if (!CPU_ISSET(id, &map))
1246 continue;
1247 if (cpu_idle_wakeup(pc->pc_cpuid))
1248 CPU_CLR(id, &map);
1249 }
1250 if (!CPU_EMPTY(&map))
1251 ipi_selected(map, IPI_AST);
1252 return (1);
1253 }
1254 if (cpunum == NOCPU)
1255 printf("forward_wakeup: Idle processor not found\n");
1256 return (0);
1257 }
1258
1259 static void
1260 kick_other_cpu(int pri, int cpuid)
1261 {
1262 struct pcpu *pcpu;
1263 int cpri;
1264
1265 pcpu = pcpu_find(cpuid);
1266 if (CPU_ISSET(cpuid, &idle_cpus_mask)) {
1267 forward_wakeups_delivered++;
1268 if (!cpu_idle_wakeup(cpuid))
1269 ipi_cpu(cpuid, IPI_AST);
1270 return;
1271 }
1272
1273 cpri = pcpu->pc_curthread->td_priority;
1274 if (pri >= cpri)
1275 return;
1276
1277 #if defined(IPI_PREEMPTION) && defined(PREEMPTION)
1278 #if !defined(FULL_PREEMPTION)
1279 if (pri <= PRI_MAX_ITHD)
1280 #endif /* ! FULL_PREEMPTION */
1281 {
1282 ipi_cpu(cpuid, IPI_PREEMPT);
1283 return;
1284 }
1285 #endif /* defined(IPI_PREEMPTION) && defined(PREEMPTION) */
1286
1287 if (pcpu->pc_curthread->td_lock == &sched_lock) {
1288 ast_sched_locked(pcpu->pc_curthread, TDA_SCHED);
1289 ipi_cpu(cpuid, IPI_AST);
1290 }
1291 }
1292 #endif /* SMP */
1293
1294 #ifdef SMP
1295 static int
1296 sched_pickcpu(struct thread *td)
1297 {
1298 int best, cpu;
1299
1300 mtx_assert(&sched_lock, MA_OWNED);
1301
1302 if (td->td_lastcpu != NOCPU && THREAD_CAN_SCHED(td, td->td_lastcpu))
1303 best = td->td_lastcpu;
1304 else
1305 best = NOCPU;
1306 CPU_FOREACH(cpu) {
1307 if (!THREAD_CAN_SCHED(td, cpu))
1308 continue;
1309
1310 if (best == NOCPU)
1311 best = cpu;
1312 else if (runq_length[cpu] < runq_length[best])
1313 best = cpu;
1314 }
1315 KASSERT(best != NOCPU, ("no valid CPUs"));
1316
1317 return (best);
1318 }
1319 #endif
1320
1321 void
1322 sched_add(struct thread *td, int flags)
1323 #ifdef SMP
1324 {
1325 cpuset_t tidlemsk;
1326 struct td_sched *ts;
1327 u_int cpu, cpuid;
1328 int forwarded = 0;
1329 int single_cpu = 0;
1330
1331 ts = td_get_sched(td);
1332 THREAD_LOCK_ASSERT(td, MA_OWNED);
1333 KASSERT((td->td_inhibitors == 0),
1334 ("sched_add: trying to run inhibited thread"));
1335 KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
1336 ("sched_add: bad thread state"));
1337 KASSERT(td->td_flags & TDF_INMEM,
1338 ("sched_add: thread swapped out"));
1339
1340 KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add",
1341 "prio:%d", td->td_priority, KTR_ATTR_LINKED,
1342 sched_tdname(curthread));
1343 KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup",
1344 KTR_ATTR_LINKED, sched_tdname(td));
1345 SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL,
1346 flags & SRQ_PREEMPTED);
1347
1348 /*
1349 * Now that the thread is moving to the run-queue, set the lock
1350 * to the scheduler's lock.
1351 */
1352 if (td->td_lock != &sched_lock) {
1353 mtx_lock_spin(&sched_lock);
1354 if ((flags & SRQ_HOLD) != 0)
1355 td->td_lock = &sched_lock;
1356 else
1357 thread_lock_set(td, &sched_lock);
1358 }
1359 TD_SET_RUNQ(td);
1360
1361 /*
1362 * If SMP is started and the thread is pinned or otherwise limited to
1363 * a specific set of CPUs, queue the thread to a per-CPU run queue.
1364 * Otherwise, queue the thread to the global run queue.
1365 *
1366 * If SMP has not yet been started we must use the global run queue
1367 * as per-CPU state may not be initialized yet and we may crash if we
1368 * try to access the per-CPU run queues.
1369 */
1370 if (smp_started && (td->td_pinned != 0 || td->td_flags & TDF_BOUND ||
1371 ts->ts_flags & TSF_AFFINITY)) {
1372 if (td->td_pinned != 0)
1373 cpu = td->td_lastcpu;
1374 else if (td->td_flags & TDF_BOUND) {
1375 /* Find CPU from bound runq. */
1376 KASSERT(SKE_RUNQ_PCPU(ts),
1377 ("sched_add: bound td_sched not on cpu runq"));
1378 cpu = ts->ts_runq - &runq_pcpu[0];
1379 } else
1380 /* Find a valid CPU for our cpuset */
1381 cpu = sched_pickcpu(td);
1382 ts->ts_runq = &runq_pcpu[cpu];
1383 single_cpu = 1;
1384 CTR3(KTR_RUNQ,
1385 "sched_add: Put td_sched:%p(td:%p) on cpu%d runq", ts, td,
1386 cpu);
1387 } else {
1388 CTR2(KTR_RUNQ,
1389 "sched_add: adding td_sched:%p (td:%p) to gbl runq", ts,
1390 td);
1391 cpu = NOCPU;
1392 ts->ts_runq = &runq;
1393 }
1394
1395 if ((td->td_flags & TDF_NOLOAD) == 0)
1396 sched_load_add();
1397 runq_add(ts->ts_runq, td, flags);
1398 if (cpu != NOCPU)
1399 runq_length[cpu]++;
1400
1401 cpuid = PCPU_GET(cpuid);
1402 if (single_cpu && cpu != cpuid) {
1403 kick_other_cpu(td->td_priority, cpu);
1404 } else {
1405 if (!single_cpu) {
1406 tidlemsk = idle_cpus_mask;
1407 CPU_ANDNOT(&tidlemsk, &tidlemsk, &hlt_cpus_mask);
1408 CPU_CLR(cpuid, &tidlemsk);
1409
1410 if (!CPU_ISSET(cpuid, &idle_cpus_mask) &&
1411 ((flags & SRQ_INTR) == 0) &&
1412 !CPU_EMPTY(&tidlemsk))
1413 forwarded = forward_wakeup(cpu);
1414 }
1415
1416 if (!forwarded) {
1417 if (!maybe_preempt(td))
1418 maybe_resched(td);
1419 }
1420 }
1421 if ((flags & SRQ_HOLDTD) == 0)
1422 thread_unlock(td);
1423 }
1424 #else /* SMP */
1425 {
1426 struct td_sched *ts;
1427
1428 ts = td_get_sched(td);
1429 THREAD_LOCK_ASSERT(td, MA_OWNED);
1430 KASSERT((td->td_inhibitors == 0),
1431 ("sched_add: trying to run inhibited thread"));
1432 KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
1433 ("sched_add: bad thread state"));
1434 KASSERT(td->td_flags & TDF_INMEM,
1435 ("sched_add: thread swapped out"));
1436 KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq add",
1437 "prio:%d", td->td_priority, KTR_ATTR_LINKED,
1438 sched_tdname(curthread));
1439 KTR_POINT1(KTR_SCHED, "thread", sched_tdname(curthread), "wokeup",
1440 KTR_ATTR_LINKED, sched_tdname(td));
1441 SDT_PROBE4(sched, , , enqueue, td, td->td_proc, NULL,
1442 flags & SRQ_PREEMPTED);
1443
1444 /*
1445 * Now that the thread is moving to the run-queue, set the lock
1446 * to the scheduler's lock.
1447 */
1448 if (td->td_lock != &sched_lock) {
1449 mtx_lock_spin(&sched_lock);
1450 if ((flags & SRQ_HOLD) != 0)
1451 td->td_lock = &sched_lock;
1452 else
1453 thread_lock_set(td, &sched_lock);
1454 }
1455 TD_SET_RUNQ(td);
1456 CTR2(KTR_RUNQ, "sched_add: adding td_sched:%p (td:%p) to runq", ts, td);
1457 ts->ts_runq = &runq;
1458
1459 if ((td->td_flags & TDF_NOLOAD) == 0)
1460 sched_load_add();
1461 runq_add(ts->ts_runq, td, flags);
1462 if (!maybe_preempt(td))
1463 maybe_resched(td);
1464 if ((flags & SRQ_HOLDTD) == 0)
1465 thread_unlock(td);
1466 }
1467 #endif /* SMP */
1468
1469 void
1470 sched_rem(struct thread *td)
1471 {
1472 struct td_sched *ts;
1473
1474 ts = td_get_sched(td);
1475 KASSERT(td->td_flags & TDF_INMEM,
1476 ("sched_rem: thread swapped out"));
1477 KASSERT(TD_ON_RUNQ(td),
1478 ("sched_rem: thread not on run queue"));
1479 mtx_assert(&sched_lock, MA_OWNED);
1480 KTR_STATE2(KTR_SCHED, "thread", sched_tdname(td), "runq rem",
1481 "prio:%d", td->td_priority, KTR_ATTR_LINKED,
1482 sched_tdname(curthread));
1483 SDT_PROBE3(sched, , , dequeue, td, td->td_proc, NULL);
1484
1485 if ((td->td_flags & TDF_NOLOAD) == 0)
1486 sched_load_rem();
1487 #ifdef SMP
1488 if (ts->ts_runq != &runq)
1489 runq_length[ts->ts_runq - runq_pcpu]--;
1490 #endif
1491 runq_remove(ts->ts_runq, td);
1492 TD_SET_CAN_RUN(td);
1493 }
1494
1495 /*
1496 * Select threads to run. Note that running threads still consume a
1497 * slot.
1498 */
1499 struct thread *
1500 sched_choose(void)
1501 {
1502 struct thread *td;
1503 struct runq *rq;
1504
1505 mtx_assert(&sched_lock, MA_OWNED);
1506 #ifdef SMP
1507 struct thread *tdcpu;
1508
1509 rq = &runq;
1510 td = runq_choose_fuzz(&runq, runq_fuzz);
1511 tdcpu = runq_choose(&runq_pcpu[PCPU_GET(cpuid)]);
1512
1513 if (td == NULL ||
1514 (tdcpu != NULL &&
1515 tdcpu->td_priority < td->td_priority)) {
1516 CTR2(KTR_RUNQ, "choosing td %p from pcpu runq %d", tdcpu,
1517 PCPU_GET(cpuid));
1518 td = tdcpu;
1519 rq = &runq_pcpu[PCPU_GET(cpuid)];
1520 } else {
1521 CTR1(KTR_RUNQ, "choosing td_sched %p from main runq", td);
1522 }
1523
1524 #else
1525 rq = &runq;
1526 td = runq_choose(&runq);
1527 #endif
1528
1529 if (td) {
1530 #ifdef SMP
1531 if (td == tdcpu)
1532 runq_length[PCPU_GET(cpuid)]--;
1533 #endif
1534 runq_remove(rq, td);
1535 td->td_flags |= TDF_DIDRUN;
1536
1537 KASSERT(td->td_flags & TDF_INMEM,
1538 ("sched_choose: thread swapped out"));
1539 return (td);
1540 }
1541 return (PCPU_GET(idlethread));
1542 }
1543
1544 void
1545 sched_preempt(struct thread *td)
1546 {
1547 int flags;
1548
1549 SDT_PROBE2(sched, , , surrender, td, td->td_proc);
1550 if (td->td_critnest > 1) {
1551 td->td_owepreempt = 1;
1552 } else {
1553 thread_lock(td);
1554 flags = SW_INVOL | SW_PREEMPT;
1555 flags |= TD_IS_IDLETHREAD(td) ? SWT_REMOTEWAKEIDLE :
1556 SWT_REMOTEPREEMPT;
1557 mi_switch(flags);
1558 }
1559 }
1560
1561 void
1562 sched_userret_slowpath(struct thread *td)
1563 {
1564
1565 thread_lock(td);
1566 td->td_priority = td->td_user_pri;
1567 td->td_base_pri = td->td_user_pri;
1568 thread_unlock(td);
1569 }
1570
1571 void
1572 sched_bind(struct thread *td, int cpu)
1573 {
1574 #ifdef SMP
1575 struct td_sched *ts = td_get_sched(td);
1576 #endif
1577
1578 THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED);
1579 KASSERT(td == curthread, ("sched_bind: can only bind curthread"));
1580
1581 td->td_flags |= TDF_BOUND;
1582 #ifdef SMP
1583 ts->ts_runq = &runq_pcpu[cpu];
1584 if (PCPU_GET(cpuid) == cpu)
1585 return;
1586
1587 mi_switch(SW_VOL | SWT_BIND);
1588 thread_lock(td);
1589 #endif
1590 }
1591
1592 void
1593 sched_unbind(struct thread *td)
1594 {
1595 THREAD_LOCK_ASSERT(td, MA_OWNED);
1596 KASSERT(td == curthread, ("sched_unbind: can only unbind curthread"));
1597 td->td_flags &= ~TDF_BOUND;
1598 }
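/*
 * Illustrative usage of the pair above: a kernel thread that must run on a
 * particular CPU takes its thread lock, calls sched_bind(curthread, cpu),
 * drops the lock, performs its per-CPU work, and finally re-locks the
 * thread and calls sched_unbind(curthread).
 */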
1599
1600 int
1601 sched_is_bound(struct thread *td)
1602 {
1603 THREAD_LOCK_ASSERT(td, MA_OWNED);
1604 return (td->td_flags & TDF_BOUND);
1605 }
1606
1607 void
1608 sched_relinquish(struct thread *td)
1609 {
1610 thread_lock(td);
1611 mi_switch(SW_VOL | SWT_RELINQUISH);
1612 }
1613
1614 int
1615 sched_load(void)
1616 {
1617 return (sched_tdcnt);
1618 }
1619
1620 int
1621 sched_sizeof_proc(void)
1622 {
1623 return (sizeof(struct proc));
1624 }
1625
1626 int
1627 sched_sizeof_thread(void)
1628 {
1629 return (sizeof(struct thread) + sizeof(struct td_sched));
1630 }
1631
1632 fixpt_t
1633 sched_pctcpu(struct thread *td)
1634 {
1635 struct td_sched *ts;
1636
1637 THREAD_LOCK_ASSERT(td, MA_OWNED);
1638 ts = td_get_sched(td);
1639 return (ts->ts_pctcpu);
1640 }
1641
1642 #ifdef RACCT
1643 /*
1644 * Calculates the contribution to the thread cpu usage for the latest
1645 * (unfinished) second.
1646 */
1647 fixpt_t
1648 sched_pctcpu_delta(struct thread *td)
1649 {
1650 struct td_sched *ts;
1651 fixpt_t delta;
1652 int realstathz;
1653
1654 THREAD_LOCK_ASSERT(td, MA_OWNED);
1655 ts = td_get_sched(td);
1656 delta = 0;
1657 realstathz = stathz ? stathz : hz;
1658 if (ts->ts_cpticks != 0) {
1659 #if (FSHIFT >= CCPU_SHIFT)
1660 delta = (realstathz == 100)
1661 ? ((fixpt_t) ts->ts_cpticks) <<
1662 (FSHIFT - CCPU_SHIFT) :
1663 100 * (((fixpt_t) ts->ts_cpticks)
1664 << (FSHIFT - CCPU_SHIFT)) / realstathz;
1665 #else
1666 delta = ((FSCALE - ccpu) *
1667 (ts->ts_cpticks *
1668 FSCALE / realstathz)) >> FSHIFT;
1669 #endif
1670 }
1671
1672 return (delta);
1673 }
1674 #endif
1675
1676 u_int
1677 sched_estcpu(struct thread *td)
1678 {
1679
1680 return (td_get_sched(td)->ts_estcpu);
1681 }
1682
1683 /*
1684 * The actual idle process.
1685 */
1686 void
1687 sched_idletd(void *dummy)
1688 {
1689 struct pcpuidlestat *stat;
1690
1691 THREAD_NO_SLEEPING();
1692 stat = DPCPU_PTR(idlestat);
1693 for (;;) {
1694 mtx_assert(&Giant, MA_NOTOWNED);
1695
1696 while (!sched_runnable()) {
1697 cpu_idle(stat->idlecalls + stat->oldidlecalls > 64);
1698 stat->idlecalls++;
1699 }
1700
1701 mtx_lock_spin(&sched_lock);
1702 mi_switch(SW_VOL | SWT_IDLE);
1703 }
1704 }
1705
1706 static void
1707 sched_throw_tail(struct thread *td)
1708 {
1709 struct thread *newtd;
1710
1711 mtx_assert(&sched_lock, MA_OWNED);
1712 KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
1713
1714 newtd = choosethread();
1715
1716 #ifdef HWT_HOOKS
1717 if (td)
1718 HWT_CALL_HOOK(td, HWT_SWITCH_OUT, NULL);
1719 HWT_CALL_HOOK(newtd, HWT_SWITCH_IN, NULL);
1720 #endif
1721
1722 cpu_throw(td, newtd); /* doesn't return */
1723 }
1724
1725 /*
1726 * A CPU is entering for the first time.
1727 */
1728 void
1729 sched_ap_entry(void)
1730 {
1731
1732 /*
1733 * Correct spinlock nesting. The idle thread context that we are
1734 * borrowing was created so that it would start out with a single
1735 * spin lock (sched_lock) held in fork_trampoline(). Since we've
1736 * explicitly acquired locks in this function, the nesting count
1737 * is now 2 rather than 1. Since we are nested, calling
1738 * spinlock_exit() will simply adjust the counts without allowing
1739 * spin lock using code to interrupt us.
1740 */
1741 mtx_lock_spin(&sched_lock);
1742 spinlock_exit();
1743 PCPU_SET(switchtime, cpu_ticks());
1744 PCPU_SET(switchticks, ticks);
1745
1746 sched_throw_tail(NULL);
1747 }
1748
1749 /*
1750 * A thread is exiting.
1751 */
1752 void
1753 sched_throw(struct thread *td)
1754 {
1755
1756 MPASS(td != NULL);
1757 MPASS(td->td_lock == &sched_lock);
1758
1759 lock_profile_release_lock(&sched_lock.lock_object, true);
1760 td->td_lastcpu = td->td_oncpu;
1761 td->td_oncpu = NOCPU;
1762
1763 sched_throw_tail(td);
1764 }
1765
1766 void
1767 sched_fork_exit(struct thread *td)
1768 {
1769
1770 /*
1771 * Finish setting up thread glue so that it begins execution in a
1772 * non-nested critical section with sched_lock held but not recursed.
1773 */
1774 td->td_oncpu = PCPU_GET(cpuid);
1775 sched_lock.mtx_lock = (uintptr_t)td;
1776 lock_profile_obtain_lock_success(&sched_lock.lock_object, true,
1777 0, 0, __FILE__, __LINE__);
1778 THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED);
1779
1780 KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running",
1781 "prio:%d", td->td_priority);
1782 SDT_PROBE0(sched, , , on__cpu);
1783 }
1784
1785 char *
1786 sched_tdname(struct thread *td)
1787 {
1788 #ifdef KTR
1789 struct td_sched *ts;
1790
1791 ts = td_get_sched(td);
1792 if (ts->ts_name[0] == '\0')
1793 snprintf(ts->ts_name, sizeof(ts->ts_name),
1794 "%s tid %d", td->td_name, td->td_tid);
1795 return (ts->ts_name);
1796 #else
1797 return (td->td_name);
1798 #endif
1799 }
1800
1801 #ifdef KTR
1802 void
1803 sched_clear_tdname(struct thread *td)
1804 {
1805 struct td_sched *ts;
1806
1807 ts = td_get_sched(td);
1808 ts->ts_name[0] = '\0';
1809 }
1810 #endif
1811
1812 void
1813 sched_affinity(struct thread *td)
1814 {
1815 #ifdef SMP
1816 struct td_sched *ts;
1817 int cpu;
1818
1819 THREAD_LOCK_ASSERT(td, MA_OWNED);
1820
1821 /*
1822 * Set the TSF_AFFINITY flag if there is at least one CPU this
1823 * thread can't run on.
1824 */
1825 ts = td_get_sched(td);
1826 ts->ts_flags &= ~TSF_AFFINITY;
1827 CPU_FOREACH(cpu) {
1828 if (!THREAD_CAN_SCHED(td, cpu)) {
1829 ts->ts_flags |= TSF_AFFINITY;
1830 break;
1831 }
1832 }
1833
1834 /*
1835 * If this thread can run on all CPUs, nothing else to do.
1836 */
1837 if (!(ts->ts_flags & TSF_AFFINITY))
1838 return;
1839
1840 /* Pinned threads and bound threads should be left alone. */
1841 if (td->td_pinned != 0 || td->td_flags & TDF_BOUND)
1842 return;
1843
1844 switch (TD_GET_STATE(td)) {
1845 case TDS_RUNQ:
1846 /*
1847 * If we are on a per-CPU runqueue that is in the set,
1848 * then nothing needs to be done.
1849 */
1850 if (ts->ts_runq != &runq &&
1851 THREAD_CAN_SCHED(td, ts->ts_runq - runq_pcpu))
1852 return;
1853
1854 /* Put this thread on a valid per-CPU runqueue. */
1855 sched_rem(td);
1856 sched_add(td, SRQ_HOLDTD | SRQ_BORING);
1857 break;
1858 case TDS_RUNNING:
1859 /*
1860 * See if our current CPU is in the set. If not, force a
1861 * context switch.
1862 */
1863 if (THREAD_CAN_SCHED(td, td->td_oncpu))
1864 return;
1865
1866 ast_sched_locked(td, TDA_SCHED);
1867 if (td != curthread)
1868 ipi_cpu(cpu, IPI_AST);
1869 break;
1870 default:
1871 break;
1872 }
1873 #endif
1874 }
1875