xref: /freebsd/sys/kern/kern_switch.c (revision b3a1f9373a31b644f8a65de1ba35929af3f6a9fe)
1 /*-
2  * Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 /***
28 Here is the logic:
29 
30 If there are N processors, then there are at most N KSEs (kernel
31 schedulable entities) working to process threads that belong to a
32 KSEGROUP (kg). If there are X of these KSEs actually running at the
33 moment in question, then there are at most M = (N - X) of these KSEs on
34 the run queue, as running KSEs are not on the queue.
35 
36 Runnable threads are queued off the KSEGROUP in priority order.
37 If there are M or more threads runnable, the top M threads
38 (by priority) are 'preassigned' to the M KSEs not running. The KSEs take
39 their priority from those threads and are put on the run queue.
40 
41 The last thread that had a priority high enough to have a KSE associated
42 with it, AND IS ON THE RUN QUEUE is pointed to by
43 kg->kg_last_assigned. If no threads queued off the KSEGROUP have KSEs
44 assigned, either because all the available KSEs are actively running or
45 because there are no threads queued, that pointer is NULL.
46 
47 When a KSE is removed from the run queue to be run, we know
48 it was associated with the highest priority thread in the queue (at the head
49 of the queue). If it is also the last assigned we know M was 1 and must
50 now be 0. Since the thread is no longer queued that pointer must be
51 removed from it. Since we know there were no more KSEs available,
52 (M was 1 and is now 0) and since we are not FREEING our KSE
53 but using it, we know there are STILL no more KSEs available, we can prove
54 that the next thread in the ksegrp list will not have a KSE to assign to
55 it, so we can show that the pointer must be made 'invalid' (NULL).
56 
57 The pointer exists so that when a new thread is made runnable, it can
58 have its priority compared with the last assigned thread to see if
59 it should 'steal' its KSE or not, i.e. whether it is 'earlier'
60 on the list than that thread or later. If it is earlier, then the KSE is
61 removed from the last assigned (which is now not assigned a KSE)
62 and reassigned to the new thread, which is placed earlier in the list.
63 The pointer is then backed up to the previous thread (which may or may not
64 be the new thread).
65 
66 When a thread sleeps or is removed, the KSE becomes available and if there
67 are queued threads that are not assigned KSEs, the highest priority one of
68 them is assigned the KSE, which is then placed back on the run queue at
69 the appropriate place, and the kg->kg_last_assigned pointer is adjusted down
70 to point to it.
71 
72 The following diagram shows 2 KSEs and 3 threads from a single process.
73 
74  RUNQ: --->KSE---KSE--...    (KSEs queued at priorities from threads)
75               \    \____
76                \        \
77     KSEGROUP---thread--thread--thread    (queued in priority order)
78         \                 /
79          \_______________/
80           (last_assigned)
81 
82 The result of this scheme is that the M available KSEs are always
83 queued at the priorities they have inherited from the M highest priority
84 threads for that KSEGROUP. If this situation changes, the KSEs are
85 reassigned to keep this true.
86 ***/
87 
88 #include <sys/cdefs.h>
89 __FBSDID("$FreeBSD$");
90 
91 #include "opt_sched.h"
92 
93 #ifndef KERN_SWITCH_INCLUDE
94 #include <sys/param.h>
95 #include <sys/systm.h>
96 #include <sys/kdb.h>
97 #include <sys/kernel.h>
98 #include <sys/ktr.h>
99 #include <sys/lock.h>
100 #include <sys/mutex.h>
101 #include <sys/proc.h>
102 #include <sys/queue.h>
103 #include <sys/sched.h>
104 #else  /* KERN_SWITCH_INCLUDE */
105 #if defined(SMP) && (defined(__i386__) || defined(__amd64__))
106 #include <sys/smp.h>
107 #endif
108 #if defined(SMP) && defined(SCHED_4BSD)
109 #include <sys/sysctl.h>
110 #endif
111 
112 /* Uncomment this to enable logging of critical_enter/exit. */
113 #if 0
114 #define	KTR_CRITICAL	KTR_SCHED
115 #else
116 #define	KTR_CRITICAL	0
117 #endif
118 
119 #ifdef FULL_PREEMPTION
120 #ifndef PREEMPTION
121 #error "The FULL_PREEMPTION option requires the PREEMPTION option"
122 #endif
123 #endif
124 
125 CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS);
126 
127 #define td_kse td_sched
128 
129 /*
130  * kern.sched.preemption allows user space to determine if preemption support
131  * is compiled in or not.  It is not currently a boot or runtime flag that
132  * can be changed.
133  */
134 #ifdef PREEMPTION
135 static int kern_sched_preemption = 1;
136 #else
137 static int kern_sched_preemption = 0;
138 #endif
139 SYSCTL_INT(_kern_sched, OID_AUTO, preemption, CTLFLAG_RD,
140     &kern_sched_preemption, 0, "Kernel preemption enabled");
141 
142 /************************************************************************
143  * Functions that manipulate runnability from a thread perspective.	*
144  ************************************************************************/
145 /*
146  * Select the KSE that will be run next.  From that find the thread, and
147  * remove it from the KSEGRP's run queue.  If there is thread clustering,
148  * this is where it would be implemented.
149  */
150 struct thread *
151 choosethread(void)
152 {
153 	struct kse *ke;
154 	struct thread *td;
155 	struct ksegrp *kg;
156 
157 #if defined(SMP) && (defined(__i386__) || defined(__amd64__))
158 	if (smp_active == 0 && PCPU_GET(cpuid) != 0) {
159 		/* Shutting down, run the idle thread on the APs. */
160 		td = PCPU_GET(idlethread);
161 		ke = td->td_kse;
162 		CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td);
163 		ke->ke_flags |= KEF_DIDRUN;
164 		TD_SET_RUNNING(td);
165 		return (td);
166 	}
167 #endif
168 
169 retry:
170 	ke = sched_choose();
171 	if (ke) {
172 		td = ke->ke_thread;
173 		KASSERT((td->td_kse == ke), ("kse/thread mismatch"));
174 		kg = ke->ke_ksegrp;
175 		if (td->td_proc->p_flag & P_HADTHREADS) {
176 			if (kg->kg_last_assigned == td) {
177 				kg->kg_last_assigned = TAILQ_PREV(td,
178 				    threadqueue, td_runq);
179 			}
180 			TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
181 		}
182 		CTR2(KTR_RUNQ, "choosethread: td=%p pri=%d",
183 		    td, td->td_priority);
184 	} else {
185 		/* Simulate runq_choose() having returned the idle thread */
186 		td = PCPU_GET(idlethread);
187 		ke = td->td_kse;
188 		CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td);
189 	}
190 	ke->ke_flags |= KEF_DIDRUN;
191 
192 	/*
193 	 * If we are panicking, only allow system threads,
194 	 * plus the one we are running in, to be run.
195 	 */
196 	if (panicstr && ((td->td_proc->p_flag & P_SYSTEM) == 0 &&
197 	    (td->td_flags & TDF_INPANIC) == 0)) {
198 		/* note that it is no longer on the run queue */
199 		TD_SET_CAN_RUN(td);
200 		goto retry;
201 	}
202 
203 	TD_SET_RUNNING(td);
204 	return (td);
205 }
206 
207 /*
208  * Given a surplus system slot, try to assign a new runnable thread to it.
209  * Called from:
210  *  sched_thread_exit()  (local)
211  *  sched_switch()  (local)
213  *  remrunqueue()  (local)  (not at the moment)
214  */
215 static void
216 slot_fill(struct ksegrp *kg)
217 {
218 	struct thread *td;
219 
220 	mtx_assert(&sched_lock, MA_OWNED);
221 	while (kg->kg_avail_opennings > 0) {
222 		/*
223 		 * Find the first unassigned thread
224 		 */
225 		if ((td = kg->kg_last_assigned) != NULL)
226 			td = TAILQ_NEXT(td, td_runq);
227 		else
228 			td = TAILQ_FIRST(&kg->kg_runq);
229 
230 		/*
231 		 * If we found one, send it to the system scheduler.
232 		 */
233 		if (td) {
234 			kg->kg_last_assigned = td;
235 			sched_add(td, SRQ_YIELDING);
236 			CTR2(KTR_RUNQ, "slot_fill: td%p -> kg%p", td, kg);
237 		} else {
238 			/* no threads to use up the slots. quit now */
239 			break;
240 		}
241 	}
242 }
243 
244 #ifdef	SCHED_4BSD
245 /*
246  * Remove a thread from its KSEGRP's run queue.
247  * This in turn may remove it from a KSE if it was already assigned
248  * to one, possibly causing a new thread to be assigned to the KSE
249  * and the KSE getting a new priority.
250  */
251 static void
252 remrunqueue(struct thread *td)
253 {
254 	struct thread *td2, *td3;
255 	struct ksegrp *kg;
256 	struct kse *ke;
257 
258 	mtx_assert(&sched_lock, MA_OWNED);
259 	KASSERT((TD_ON_RUNQ(td)), ("remrunqueue: Bad state on run queue"));
260 	kg = td->td_ksegrp;
261 	ke = td->td_kse;
262 	CTR1(KTR_RUNQ, "remrunqueue: td%p", td);
263 	TD_SET_CAN_RUN(td);
264 	/*
265 	 * If it is not a threaded process, take the shortcut.
266 	 */
267 	if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
268 		/* Remove from the system run queue and free up a slot. */
269 		sched_rem(td);
270 		return;
271 	}
272 	td3 = TAILQ_PREV(td, threadqueue, td_runq);
273 	TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
274 	if (ke->ke_state == KES_ONRUNQ) {
275 		/*
276 		 * This thread has been assigned to the system run queue.
277 		 * We need to dissociate it and try to assign the
278 		 * KSE to the next available thread. Then, we should
279 		 * see if we need to move the KSE in the run queues.
280 		 */
281 		sched_rem(td);
282 		td2 = kg->kg_last_assigned;
283 		KASSERT((td2 != NULL), ("last assigned has wrong value"));
284 		if (td2 == td)
285 			kg->kg_last_assigned = td3;
286 		/* slot_fill(kg); */ /* will replace it with another */
287 	}
288 }
289 #endif
290 
291 /*
292  * Change the priority of a thread that is on the run queue.
293  */
294 void
295 adjustrunqueue(struct thread *td, int newpri)
296 {
297 	struct ksegrp *kg;
298 	struct kse *ke;
299 
300 	mtx_assert(&sched_lock, MA_OWNED);
301 	KASSERT((TD_ON_RUNQ(td)), ("adjustrunqueue: Bad state on run queue"));
302 
303 	ke = td->td_kse;
304 	CTR1(KTR_RUNQ, "adjustrunqueue: td%p", td);
305 	/*
306 	 * If it is not a threaded process, take the shortcut.
307 	 */
308 	if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
309 		/* We only care about the kse in the run queue. */
310 		td->td_priority = newpri;
311 		if (ke->ke_rqindex != (newpri / RQ_PPQ)) {
312 			sched_rem(td);
313 			sched_add(td, SRQ_BORING);
314 		}
315 		return;
316 	}
317 
318 	/* It is a threaded process */
319 	kg = td->td_ksegrp;
320 	if (ke->ke_state == KES_ONRUNQ
321 #ifdef SCHED_ULE
322 	 || ((ke->ke_flags & KEF_ASSIGNED) != 0 &&
323 	     (ke->ke_flags & KEF_REMOVED) == 0)
324 #endif
325 	   ) {
326 		if (kg->kg_last_assigned == td) {
327 			kg->kg_last_assigned =
328 			    TAILQ_PREV(td, threadqueue, td_runq);
329 		}
330 		sched_rem(td);
331 	}
332 	TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
333 	TD_SET_CAN_RUN(td);
334 	td->td_priority = newpri;
335 	setrunqueue(td, SRQ_BORING);
336 }
337 
338 /*
339  * This function is called when a thread is about to be put on a
340  * ksegrp run queue because it has been made runnable or its
341  * priority has been adjusted and the ksegrp does not have a
342  * free kse slot.  It determines if a thread from the same ksegrp
343  * should be preempted.  If so, it switches threads directly when the
344  * target is on the same cpu, or notifies another cpu that it should
345  * switch threads.
346  */
347 
348 static void
349 maybe_preempt_in_ksegrp(struct thread *td)
350 #if  !defined(SMP)
351 {
352 	struct thread *running_thread;
353 
354 	mtx_assert(&sched_lock, MA_OWNED);
355 	running_thread = curthread;
356 
357 	if (running_thread->td_ksegrp != td->td_ksegrp)
358 		return;
359 
360 	if (td->td_priority >= running_thread->td_priority)
361 		return;
362 #ifdef PREEMPTION
363 #ifndef FULL_PREEMPTION
364 	if (td->td_priority > PRI_MAX_ITHD) {
365 		running_thread->td_flags |= TDF_NEEDRESCHED;
366 		return;
367 	}
368 #endif /* FULL_PREEMPTION */
369 
370 	if (running_thread->td_critnest > 1)
371 		running_thread->td_owepreempt = 1;
372 	else
373 		mi_switch(SW_INVOL, NULL);
374 
375 #else /* PREEMPTION */
376 	running_thread->td_flags |= TDF_NEEDRESCHED;
377 #endif /* PREEMPTION */
378 	return;
379 }
380 
381 #else /* SMP */
382 {
383 	struct thread *running_thread;
384 	int worst_pri;
385 	struct ksegrp *kg;
386 	cpumask_t cpumask, dontuse;
387 	struct pcpu *pc;
388 	struct pcpu *best_pcpu;
389 	struct thread *cputhread;
390 
391 	mtx_assert(&sched_lock, MA_OWNED);
392 
393 	running_thread = curthread;
394 
395 #if !defined(KSEG_PEEMPT_BEST_CPU)
396 	if (running_thread->td_ksegrp != td->td_ksegrp) {
397 #endif
398 		kg = td->td_ksegrp;
399 
400 		/* if someone is ahead of this thread, wait our turn */
401 		if (td != TAILQ_FIRST(&kg->kg_runq))
402 			return;
403 
404 		worst_pri = td->td_priority;
405 		best_pcpu = NULL;
406 		dontuse   = stopped_cpus | idle_cpus_mask;
407 
408 		/*
409 		 * Find the cpu with the worst (numerically highest) priority that
410 		 * is running a thread from the same ksegrp.  If multiple qualify,
411 		 * prefer first the thread's last-run cpu and then the current cpu.
412 		 */
413 
414 		SLIST_FOREACH(pc, &cpuhead, pc_allcpu) {
415 			cpumask   = pc->pc_cpumask;
416 			cputhread = pc->pc_curthread;
417 
418 			if ((cpumask & dontuse)  ||
419 			    cputhread->td_ksegrp != kg)
420 				continue;
421 
422 			if (cputhread->td_priority > worst_pri) {
423 				worst_pri = cputhread->td_priority;
424 				best_pcpu = pc;
425 				continue;
426 			}
427 
428 			if (cputhread->td_priority == worst_pri &&
429 			    best_pcpu != NULL &&
430 			    (td->td_lastcpu == pc->pc_cpuid ||
431 				(PCPU_GET(cpumask) == cpumask &&
432 				    td->td_lastcpu != best_pcpu->pc_cpuid)))
433 			    best_pcpu = pc;
434 		}
435 
436 		/* Check if we need to preempt someone */
437 		if (best_pcpu == NULL)
438 			return;
439 
440 #if defined(IPI_PREEMPTION) && defined(PREEMPTION)
441 #if !defined(FULL_PREEMPTION)
442 		if (td->td_priority <= PRI_MAX_ITHD)
443 #endif /* ! FULL_PREEMPTION */
444 			{
445 				ipi_selected(best_pcpu->pc_cpumask, IPI_PREEMPT);
446 				return;
447 			}
448 #endif /* defined(IPI_PREEMPTION) && defined(PREEMPTION) */
449 
450 		if (PCPU_GET(cpuid) != best_pcpu->pc_cpuid) {
451 			best_pcpu->pc_curthread->td_flags |= TDF_NEEDRESCHED;
452 			ipi_selected(best_pcpu->pc_cpumask, IPI_AST);
453 			return;
454 		}
455 #if !defined(KSEG_PEEMPT_BEST_CPU)
456 	}
457 #endif
458 
459 	if (td->td_priority >= running_thread->td_priority)
460 		return;
461 #ifdef PREEMPTION
462 
463 #if !defined(FULL_PREEMPTION)
464 	if (td->td_priority > PRI_MAX_ITHD) {
465 		running_thread->td_flags |= TDF_NEEDRESCHED;
		return;
466 	}
467 #endif /* ! FULL_PREEMPTION */
468 
469 	if (running_thread->td_critnest > 1)
470 		running_thread->td_owepreempt = 1;
471 	else
472 		mi_switch(SW_INVOL, NULL);
473 
474 #else /* PREEMPTION */
475 	running_thread->td_flags |= TDF_NEEDRESCHED;
476 #endif /* PREEMPTION */
477 	return;
478 }
479 #endif /* !SMP */
480 
481 
482 int limitcount;
483 void
484 setrunqueue(struct thread *td, int flags)
485 {
486 	struct ksegrp *kg;
487 	struct thread *td2;
488 	struct thread *tda;
489 
490 	CTR3(KTR_RUNQ, "setrunqueue: td:%p kg:%p pid:%d",
491 	    td, td->td_ksegrp, td->td_proc->p_pid);
492 	CTR5(KTR_SCHED, "setrunqueue: %p(%s) prio %d by %p(%s)",
493             td, td->td_proc->p_comm, td->td_priority, curthread,
494             curthread->td_proc->p_comm);
495 	mtx_assert(&sched_lock, MA_OWNED);
496 	KASSERT((td->td_inhibitors == 0),
497 			("setrunqueue: trying to run inhibitted thread"));
498 	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
499 	    ("setrunqueue: bad thread state"));
500 	TD_SET_RUNQ(td);
501 	kg = td->td_ksegrp;
502 	if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
503 		/*
504 		 * Common path optimisation: Only one of everything
505 		 * and the KSE is always already attached.
506 		 * Totally ignore the ksegrp run queue.
507 		 */
508 		if (kg->kg_avail_opennings != 1) {
509 			if (limitcount < 1) {
510 				limitcount++;
511 				printf("pid %d: corrected slot count (%d->1)\n",
512 				    td->td_proc->p_pid, kg->kg_avail_opennings);
513 
514 			}
515 			kg->kg_avail_opennings = 1;
516 		}
517 		sched_add(td, flags);
518 		return;
519 	}
520 
521 	/*
522 	 * If the concurrency has been reduced, and we would land in the
523 	 * assigned section, then keep removing entries from the
524 	 * system run queue until we are no longer in that section
525 	 * or until there is room for us to be put in that section.
526 	 * What we MUST avoid is the case where threads of lower
527 	 * priority than the new one are scheduled, but the new one can not
528 	 * be scheduled itself. That would lead to a non-contiguous set
529 	 * of scheduled threads, and everything would break.
530 	 */
531 	tda = kg->kg_last_assigned;
532 	while ((kg->kg_avail_opennings <= 0) &&
533 	    (tda && (tda->td_priority > td->td_priority))) {
534 		/*
535 		 * None free, but there is one we can commandeer.
536 		 */
537 		CTR2(KTR_RUNQ,
538 		    "setrunqueue: kg:%p: take slot from td: %p", kg, tda);
539 		sched_rem(tda);
540 		tda = kg->kg_last_assigned =
541 		    TAILQ_PREV(tda, threadqueue, td_runq);
542 	}
543 
544 	/*
545 	 * Add the thread to the ksegrp's run queue at
546 	 * the appropriate place.
547 	 */
548 	TAILQ_FOREACH(td2, &kg->kg_runq, td_runq) {
549 		if (td2->td_priority > td->td_priority) {
550 			TAILQ_INSERT_BEFORE(td2, td, td_runq);
551 			break;
552 		}
553 	}
554 	if (td2 == NULL) {
555 		/* We ran off the end of the TAILQ or it was empty. */
556 		TAILQ_INSERT_TAIL(&kg->kg_runq, td, td_runq);
557 	}
558 
559 	/*
560 	 * If we have a slot to use, then put the thread on the system
561 	 * run queue and if needed, readjust the last_assigned pointer.
562 	 * It may be that we need to schedule something anyhow,
563 	 * even if the available slots are negative, so that
564 	 * all the items before last_assigned are scheduled.
565 	 */
566 	if (kg->kg_avail_opennings > 0) {
567 		if (tda == NULL) {
568 			/*
569 			 * No pre-existing last assigned so whoever is first
570 			 * gets the slot.. (maybe us)
571 			 * gets the slot (maybe us).
572 			td2 = TAILQ_FIRST(&kg->kg_runq);
573 			kg->kg_last_assigned = td2;
574 		} else if (tda->td_priority > td->td_priority) {
575 			td2 = td;
576 		} else {
577 			/*
578 			 * We are past last_assigned, so
579 			 * give the next slot to whatever is next,
580 			 * which may or may not be us.
581 			 */
582 			td2 = TAILQ_NEXT(tda, td_runq);
583 			kg->kg_last_assigned = td2;
584 		}
585 		sched_add(td2, flags);
586 	} else {
587 		CTR3(KTR_RUNQ, "setrunqueue: held: td%p kg%p pid%d",
588 			td, td->td_ksegrp, td->td_proc->p_pid);
589 		if ((flags & SRQ_YIELDING) == 0)
590 			maybe_preempt_in_ksegrp(td);
591 	}
592 }
593 
594 /*
595  * Kernel thread preemption implementation.  Critical sections mark
596  * regions of code in which preemptions are not allowed.
597  */
598 void
599 critical_enter(void)
600 {
601 	struct thread *td;
602 
603 	td = curthread;
604 	td->td_critnest++;
605 	CTR4(KTR_CRITICAL, "critical_enter by thread %p (%ld, %s) to %d", td,
606 	    (long)td->td_proc->p_pid, td->td_proc->p_comm, td->td_critnest);
607 }
608 
609 void
610 critical_exit(void)
611 {
612 	struct thread *td;
613 
614 	td = curthread;
615 	KASSERT(td->td_critnest != 0,
616 	    ("critical_exit: td_critnest == 0"));
617 #ifdef PREEMPTION
618 	if (td->td_critnest == 1) {
619 		td->td_critnest = 0;
620 		mtx_assert(&sched_lock, MA_NOTOWNED);
621 		if (td->td_owepreempt) {
622 			td->td_critnest = 1;
623 			mtx_lock_spin(&sched_lock);
624 			td->td_critnest--;
625 			mi_switch(SW_INVOL, NULL);
626 			mtx_unlock_spin(&sched_lock);
627 		}
628 	} else
629 #endif
630 		td->td_critnest--;
631 
632 	CTR4(KTR_CRITICAL, "critical_exit by thread %p (%ld, %s) to %d", td,
633 	    (long)td->td_proc->p_pid, td->td_proc->p_comm, td->td_critnest);
634 }
635 
636 /*
637  * This function is called when a thread is about to be put on run queue
638  * because it has been made runnable or its priority has been adjusted.  It
639  * determines whether we should immediately preempt to the new thread.  If so,
640  * it switches to it and eventually returns true.  If not, it returns false
641  * so that the caller may place the thread on an appropriate run queue.
642  */
643 int
644 maybe_preempt(struct thread *td)
645 {
646 #ifdef PREEMPTION
647 	struct thread *ctd;
648 	int cpri, pri;
649 #endif
650 
651 	mtx_assert(&sched_lock, MA_OWNED);
652 #ifdef PREEMPTION
653 	/*
654 	 * The new thread should not preempt the current thread if any of the
655 	 * following conditions are true:
656 	 *
657 	 *  - The kernel is in the throes of crashing (panicstr).
658 	 *  - The current thread has a higher (numerically lower) or
659 	 *    equivalent priority.  Note that this prevents curthread from
660 	 *    trying to preempt to itself.
661 	 *  - It is too early in the boot for context switches (cold is set).
662 	 *  - The current thread has an inhibitor set or is in the process of
663 	 *    exiting.  In this case, the current thread is about to switch
664 	 *    out anyways, so there's no point in preempting.  If we did,
665 	 *    the current thread would not be properly resumed as well, so
666 	 *    just avoid that whole landmine.
667 	 *  - If the new thread's priority is not an interrupt thread priority and
668 	 *    the current thread's priority is not an idle priority and
669 	 *    FULL_PREEMPTION is disabled.
670 	 *
671 	 * If all of these conditions are false, but the current thread is in
672 	 * a nested critical section, then we have to defer the preemption
673 	 * until we exit the critical section.  Otherwise, switch immediately
674 	 * to the new thread.
675 	 */
676 	ctd = curthread;
677 	KASSERT ((ctd->td_kse != NULL && ctd->td_kse->ke_thread == ctd),
678 	  ("thread has no (or wrong) sched-private part."));
679 	KASSERT((td->td_inhibitors == 0),
680 			("maybe_preempt: trying to run inhibitted thread"));
681 	pri = td->td_priority;
682 	cpri = ctd->td_priority;
683 	if (panicstr != NULL || pri >= cpri || cold /* || dumping */ ||
684 	    TD_IS_INHIBITED(ctd) || td->td_kse->ke_state != KES_THREAD)
685 		return (0);
686 #ifndef FULL_PREEMPTION
687 	if (pri > PRI_MAX_ITHD && cpri < PRI_MIN_IDLE)
688 		return (0);
689 #endif
690 
691 	if (ctd->td_critnest > 1) {
692 		CTR1(KTR_PROC, "maybe_preempt: in critical section %d",
693 		    ctd->td_critnest);
694 		ctd->td_owepreempt = 1;
695 		return (0);
696 	}
697 
698 	/*
699 	 * Thread is runnable but not yet put on system run queue.
700 	 */
701 	MPASS(TD_ON_RUNQ(td));
702 	MPASS(td->td_sched->ke_state != KES_ONRUNQ);
703 	if (td->td_proc->p_flag & P_HADTHREADS) {
704 		/*
705 		 * If this is a threaded process we actually ARE on the
706 		 * ksegrp run queue so take it off that first.
707 		 * Also undo any damage done to the last_assigned pointer.
708 		 * XXX Fix setrunqueue so this isn't needed
709 		 */
710 		struct ksegrp *kg;
711 
712 		kg = td->td_ksegrp;
713 		if (kg->kg_last_assigned == td)
714 			kg->kg_last_assigned =
715 			    TAILQ_PREV(td, threadqueue, td_runq);
716 		TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
717 	}
718 
719 	TD_SET_RUNNING(td);
720 	CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td,
721 	    td->td_proc->p_pid, td->td_proc->p_comm);
722 	mi_switch(SW_INVOL|SW_PREEMPT, td);
723 	return (1);
724 #else
725 	return (0);
726 #endif
727 }
728 
729 #if 0
730 #ifndef PREEMPTION
731 /* XXX: There should be a non-static version of this. */
732 static void
733 printf_caddr_t(void *data)
734 {
735 	printf("%s", (char *)data);
736 }
737 static char preempt_warning[] =
738     "WARNING: Kernel preemption is disabled, expect reduced performance.\n";
739 SYSINIT(preempt_warning, SI_SUB_COPYRIGHT, SI_ORDER_ANY, printf_caddr_t,
740     preempt_warning)
741 #endif
742 #endif
743 
744 /************************************************************************
745  * SYSTEM RUN QUEUE manipulations and tests				*
746  ************************************************************************/
747 /*
748  * Initialize a run structure.
749  */
750 void
751 runq_init(struct runq *rq)
752 {
753 	int i;
754 
755 	bzero(rq, sizeof *rq);
756 	for (i = 0; i < RQ_NQS; i++)
757 		TAILQ_INIT(&rq->rq_queues[i]);
758 }
759 
760 /*
761  * Clear the status bit of the queue corresponding to priority level pri,
762  * indicating that it is empty.
763  */
764 static __inline void
765 runq_clrbit(struct runq *rq, int pri)
766 {
767 	struct rqbits *rqb;
768 
769 	rqb = &rq->rq_status;
770 	CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d",
771 	    rqb->rqb_bits[RQB_WORD(pri)],
772 	    rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri),
773 	    RQB_BIT(pri), RQB_WORD(pri));
774 	rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri);
775 }
776 
777 /*
778  * Find the index of the first non-empty run queue.  This is done by
779  * scanning the status bits; a set bit indicates a non-empty queue.
780  */
781 static __inline int
782 runq_findbit(struct runq *rq)
783 {
784 	struct rqbits *rqb;
785 	int pri;
786 	int i;
787 
788 	rqb = &rq->rq_status;
789 	for (i = 0; i < RQB_LEN; i++)
790 		if (rqb->rqb_bits[i]) {
791 			pri = RQB_FFS(rqb->rqb_bits[i]) + (i << RQB_L2BPW);
792 			CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d",
793 			    rqb->rqb_bits[i], i, pri);
794 			return (pri);
795 		}
796 
797 	return (-1);
798 }
799 
800 /*
801  * Set the status bit of the queue corresponding to priority level pri,
802  * indicating that it is non-empty.
803  */
804 static __inline void
805 runq_setbit(struct runq *rq, int pri)
806 {
807 	struct rqbits *rqb;
808 
809 	rqb = &rq->rq_status;
810 	CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d",
811 	    rqb->rqb_bits[RQB_WORD(pri)],
812 	    rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri),
813 	    RQB_BIT(pri), RQB_WORD(pri));
814 	rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri);
815 }
816 
817 /*
818  * Add the KSE to the queue specified by its priority, and set the
819  * corresponding status bit.
820  */
821 void
822 runq_add(struct runq *rq, struct kse *ke, int flags)
823 {
824 	struct rqhead *rqh;
825 	int pri;
826 
827 	pri = ke->ke_thread->td_priority / RQ_PPQ;
828 	ke->ke_rqindex = pri;
829 	runq_setbit(rq, pri);
830 	rqh = &rq->rq_queues[pri];
831 	CTR5(KTR_RUNQ, "runq_add: td=%p ke=%p pri=%d %d rqh=%p",
832 	    ke->ke_thread, ke, ke->ke_thread->td_priority, pri, rqh);
833 	if (flags & SRQ_PREEMPTED) {
834 		TAILQ_INSERT_HEAD(rqh, ke, ke_procq);
835 	} else {
836 		TAILQ_INSERT_TAIL(rqh, ke, ke_procq);
837 	}
838 }
839 
840 /*
841  * Return true if there are runnable processes of any priority on the run
842  * queue, false otherwise.  Has no side effects, does not modify the run
843  * queue structure.
844  */
845 int
846 runq_check(struct runq *rq)
847 {
848 	struct rqbits *rqb;
849 	int i;
850 
851 	rqb = &rq->rq_status;
852 	for (i = 0; i < RQB_LEN; i++)
853 		if (rqb->rqb_bits[i]) {
854 			CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d",
855 			    rqb->rqb_bits[i], i);
856 			return (1);
857 		}
858 	CTR0(KTR_RUNQ, "runq_check: empty");
859 
860 	return (0);
861 }
862 
863 #if defined(SMP) && defined(SCHED_4BSD)
864 int runq_fuzz = 1;
865 SYSCTL_INT(_kern_sched, OID_AUTO, runq_fuzz, CTLFLAG_RW, &runq_fuzz, 0, "");
866 #endif
867 
868 /*
869  * Find the highest priority process on the run queue.
870  */
871 struct kse *
872 runq_choose(struct runq *rq)
873 {
874 	struct rqhead *rqh;
875 	struct kse *ke;
876 	int pri;
877 
878 	mtx_assert(&sched_lock, MA_OWNED);
879 	while ((pri = runq_findbit(rq)) != -1) {
880 		rqh = &rq->rq_queues[pri];
881 #if defined(SMP) && defined(SCHED_4BSD)
882 		/* fuzz == 1 is normal; 0 or less is ignored. */
883 		if (runq_fuzz > 1) {
884 			/*
885 			 * In the first couple of entries, check if
886 			 * there is one for our CPU as a preference.
887 			 */
888 			int count = runq_fuzz;
889 			int cpu = PCPU_GET(cpuid);
890 			struct kse *ke2;
891 			ke2 = ke = TAILQ_FIRST(rqh);
892 
893 			while (count-- && ke2) {
894 				if (ke2->ke_thread->td_lastcpu == cpu) {
895 					ke = ke2;
896 					break;
897 				}
898 				ke2 = TAILQ_NEXT(ke2, ke_procq);
899 			}
900 		} else
901 #endif
902 			ke = TAILQ_FIRST(rqh);
903 		KASSERT(ke != NULL, ("runq_choose: no proc on busy queue"));
904 		CTR3(KTR_RUNQ,
905 		    "runq_choose: pri=%d kse=%p rqh=%p", pri, ke, rqh);
906 		return (ke);
907 	}
908 	CTR1(KTR_RUNQ, "runq_choose: idleproc pri=%d", pri);
909 
910 	return (NULL);
911 }
912 
913 /*
914  * Remove the KSE from the queue specified by its priority, and clear the
915  * corresponding status bit if the queue becomes empty.
916  * Caller must set ke->ke_state afterwards.
917  */
918 void
919 runq_remove(struct runq *rq, struct kse *ke)
920 {
921 	struct rqhead *rqh;
922 	int pri;
923 
924 	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
925 		("runq_remove: process swapped out"));
926 	pri = ke->ke_rqindex;
927 	rqh = &rq->rq_queues[pri];
928 	CTR5(KTR_RUNQ, "runq_remove: td=%p, ke=%p pri=%d %d rqh=%p",
929 	    ke->ke_thread, ke, ke->ke_thread->td_priority, pri, rqh);
930 	KASSERT(ke != NULL, ("runq_remove: no proc on busy queue"));
931 	TAILQ_REMOVE(rqh, ke, ke_procq);
932 	if (TAILQ_EMPTY(rqh)) {
933 		CTR0(KTR_RUNQ, "runq_remove: empty");
934 		runq_clrbit(rq, pri);
935 	}
936 }
937 
938 /****** functions that are temporarily here ***********/
939 #include <vm/uma.h>
940 extern struct mtx kse_zombie_lock;
941 
942 /*
943  * Allocate scheduler-specific per-process resources.
944  * The thread and ksegrp have already been linked in.
945  * In this case just set the default concurrency value.
946  *
947  * Called from:
948  *  proc_init() (UMA init method)
949  */
950 void
951 sched_newproc(struct proc *p, struct ksegrp *kg, struct thread *td)
952 {
953 
954 	/* This can go in sched_fork */
955 	sched_init_concurrency(kg);
956 }
957 
958 /*
959  * A thread is being either created or recycled.
960  * Fix up the per-scheduler resources associated with it.
961  * Called from:
962  *  sched_fork_thread()
963  *  thread_dtor()  (*may go away)
964  *  thread_init()  (*may go away)
965  */
966 void
967 sched_newthread(struct thread *td)
968 {
969 	struct td_sched *ke;
970 
971 	ke = (struct td_sched *) (td + 1);
972 	bzero(ke, sizeof(*ke));
973 	td->td_sched     = ke;
974 	ke->ke_thread	= td;
975 	ke->ke_state	= KES_THREAD;
976 }
977 
978 /*
979  * Set up an initial concurrency of 1
980  * and set the given thread (if given) to be using that
981  * concurrency slot.
982  * May be used "offline", before the ksegrp is attached to the world,
983  * and thus wouldn't need sched_lock in that case.
984  * Called from:
985  *  thr_create()
986  *  proc_init() (UMA) via sched_newproc()
987  */
988 void
989 sched_init_concurrency(struct ksegrp *kg)
990 {
991 
992 	CTR1(KTR_RUNQ,"kg %p init slots and concurrency to 1", kg);
993 	kg->kg_concurrency = 1;
994 	kg->kg_avail_opennings = 1;
995 }
996 
997 /*
998  * Change the concurrency of an existing ksegrp to N
999  * Called from:
1000  *  kse_create()
1001  *  kse_exit()
1002  *  thread_exit()
1003  *  thread_single()
1004  */
1005 void
1006 sched_set_concurrency(struct ksegrp *kg, int concurrency)
1007 {
1008 
1009 	CTR4(KTR_RUNQ,"kg %p set concurrency to %d, slots %d -> %d",
1010 	    kg,
1011 	    concurrency,
1012 	    kg->kg_avail_opennings,
1013 	    kg->kg_avail_opennings + (concurrency - kg->kg_concurrency));
1014 	kg->kg_avail_opennings += (concurrency - kg->kg_concurrency);
1015 	kg->kg_concurrency = concurrency;
1016 }
1017 
1018 /*
1019  * Called from thread_exit() for all exiting threads.
1020  *
1021  * Not to be confused with sched_exit_thread(), which is only called from
1022  * thread_exit() for threads exiting without the rest of the process exiting,
1023  * because it is also called from
1024  * sched_exit() and we wouldn't want to call it twice.
1025  * XXX This can probably be fixed.
1026  */
1027 void
1028 sched_thread_exit(struct thread *td)
1029 {
1030 
1031 	SLOT_RELEASE(td->td_ksegrp);
1032 	slot_fill(td->td_ksegrp);
1033 }
1034 
1035 #endif /* KERN_SWITCH_INCLUDE */
1036