xref: /freebsd/sys/kern/kern_switch.c (revision d056fa046c6a91b90cd98165face0e42a33a5173)
1 /*-
2  * Copyright (c) 2001 Jake Burkholder <jake@FreeBSD.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 /***
28 Here is the logic..
29 
30 If there are N processors, then there are at most N KSEs (kernel
31 schedulable entities) working to process threads that belong to a
32 KSEGROUP (kg). If there are X of these KSEs actually running at the
33 moment in question, then there are at most M (where M = N - X) of these
34 KSEs on the run queue, as running KSEs are not on the queue.
35 
36 Runnable threads are queued off the KSEGROUP in priority order.
37 If there are M or more threads runnable, the top M threads
38 (by priority) are 'preassigned' to the M KSEs not running. The KSEs take
39 their priority from those threads and are put on the run queue.
40 
41 The last thread that had a priority high enough to have a KSE associated
42 with it, AND IS ON THE RUN QUEUE, is pointed to by
43 kg->kg_last_assigned. If no threads queued off the KSEGROUP have KSEs
44 assigned, either because all the available KSEs are actively running or
45 because there are no threads queued, that pointer is NULL.
46 
47 When a KSE is removed from the run queue to run, we know it was
48 associated with the highest priority thread in the queue (at the head
49 of the queue). If it is also the last assigned we know M was 1 and must
50 now be 0. Since the thread is no longer queued, that pointer must be
51 cleared. Since we know there were no more KSEs available
52 (M was 1 and is now 0), and since we are not FREEING our KSE
53 but using it, we know there are STILL no more KSEs available; we can prove
54 that the next thread in the ksegrp list will not have a KSE to assign to
55 it, so the pointer must be made 'invalid' (NULL).
56 
57 The pointer exists so that when a new thread is made runnable, it can
58 have its priority compared with the last assigned thread to see if
59 it should 'steal' its KSE or not, i.e. whether it is 'earlier'
60 on the list than that thread or later. If it is earlier, then the KSE is
61 removed from the last assigned (which is now not assigned a KSE)
62 and reassigned to the new thread, which is placed earlier in the list.
63 The pointer is then backed up to the previous thread (which may or may not
64 be the new thread).
65 
66 When a thread sleeps or is removed, the KSE becomes available and if there
67 are queued threads that are not assigned KSEs, the highest priority one of
68 them is assigned the KSE, which is then placed back on the run queue at
69 the appropriate place, and the kg->kg_last_assigned pointer is adjusted down
70 to point to it.
71 
72 The following diagram shows 2 KSEs and 3 threads from a single process.
73 
74  RUNQ: --->KSE---KSE--...    (KSEs queued at priorities from threads)
75               \    \____
76                \        \
77     KSEGROUP---thread--thread--thread    (queued in priority order)
78         \                 /
79          \_______________/
80           (last_assigned)
81 
82 The result of this scheme is that the M available KSEs are always
83 queued at the priorities they have inherited from the M highest priority
84 threads for that KSEGROUP. If this situation changes, the KSEs are
85 reassigned to keep this true.
86 ***/
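/*
 * Illustrative sketch, not compiled: a hypothetical debug helper that
 * checks the invariants described above, using only fields referenced
 * elsewhere in this file (kg_runq, kg_last_assigned, td_runq,
 * td_priority).  The name ksegrp_check_runq and the local "seen_last"
 * are made up for this example.
 */
#if 0
static void
ksegrp_check_runq(struct ksegrp *kg)
{
	struct thread *td, *prev;
	int seen_last;

	prev = NULL;
	seen_last = (kg->kg_last_assigned == NULL);
	TAILQ_FOREACH(td, &kg->kg_runq, td_runq) {
		/* Threads are queued in non-decreasing priority order. */
		if (prev != NULL)
			KASSERT(prev->td_priority <= td->td_priority,
			    ("ksegrp run queue out of order"));
		if (td == kg->kg_last_assigned)
			seen_last = 1;
		prev = td;
	}
	/* kg_last_assigned, when set, must be a member of the queue. */
	KASSERT(seen_last, ("kg_last_assigned not on the ksegrp run queue"));
}
#endif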
87 
88 #include <sys/cdefs.h>
89 __FBSDID("$FreeBSD$");
90 
91 #include "opt_sched.h"
92 
93 #ifndef KERN_SWITCH_INCLUDE
94 #include <sys/param.h>
95 #include <sys/systm.h>
96 #include <sys/kdb.h>
97 #include <sys/kernel.h>
98 #include <sys/ktr.h>
99 #include <sys/lock.h>
100 #include <sys/mutex.h>
101 #include <sys/proc.h>
102 #include <sys/queue.h>
103 #include <sys/sched.h>
104 #else  /* KERN_SWITCH_INCLUDE */
105 #if defined(SMP) && (defined(__i386__) || defined(__amd64__))
106 #include <sys/smp.h>
107 #endif
108 #if defined(SMP) && defined(SCHED_4BSD)
109 #include <sys/sysctl.h>
110 #endif
111 
112 /* Uncomment this to enable logging of critical_enter/exit. */
113 #if 0
114 #define	KTR_CRITICAL	KTR_SCHED
115 #else
116 #define	KTR_CRITICAL	0
117 #endif
118 
119 #ifdef FULL_PREEMPTION
120 #ifndef PREEMPTION
121 #error "The FULL_PREEMPTION option requires the PREEMPTION option"
122 #endif
123 #endif
124 
125 CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS);
126 
127 #define td_kse td_sched
128 
129 /*
130  * kern.sched.preemption allows user space to determine if preemption support
131  * is compiled in or not.  It is not currently a boot or runtime flag that
132  * can be changed.
133  */
134 #ifdef PREEMPTION
135 static int kern_sched_preemption = 1;
136 #else
137 static int kern_sched_preemption = 0;
138 #endif
139 SYSCTL_INT(_kern_sched, OID_AUTO, preemption, CTLFLAG_RD,
140     &kern_sched_preemption, 0, "Kernel preemption enabled");
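/*
 * A quick way to inspect this from user space (standard sysctl(8) usage;
 * the value reported depends on the kernel configuration):
 *
 *	$ sysctl kern.sched.preemption
 *	kern.sched.preemption: 1
 */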
141 
142 /************************************************************************
143  * Functions that manipulate runnability from a thread perspective.	*
144  ************************************************************************/
145 /*
146  * Select the KSE that will be run next.  From that find the thread, and
147  * remove it from the KSEGRP's run queue.  If there is thread clustering,
148  * this will be what does it.
149  */
150 struct thread *
151 choosethread(void)
152 {
153 	struct kse *ke;
154 	struct thread *td;
155 	struct ksegrp *kg;
156 
157 #if defined(SMP) && (defined(__i386__) || defined(__amd64__))
158 	if (smp_active == 0 && PCPU_GET(cpuid) != 0) {
159 		/* Shutting down, run idlethread on APs */
160 		td = PCPU_GET(idlethread);
161 		ke = td->td_kse;
162 		CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td);
163 		ke->ke_flags |= KEF_DIDRUN;
164 		TD_SET_RUNNING(td);
165 		return (td);
166 	}
167 #endif
168 
169 retry:
170 	ke = sched_choose();
171 	if (ke) {
172 		td = ke->ke_thread;
173 		KASSERT((td->td_kse == ke), ("kse/thread mismatch"));
174 		kg = ke->ke_ksegrp;
175 		if (td->td_proc->p_flag & P_HADTHREADS) {
176 			if (kg->kg_last_assigned == td) {
177 				kg->kg_last_assigned = TAILQ_PREV(td,
178 				    threadqueue, td_runq);
179 			}
180 			TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
181 		}
182 		CTR2(KTR_RUNQ, "choosethread: td=%p pri=%d",
183 		    td, td->td_priority);
184 	} else {
185 		/* Simulate runq_choose() having returned the idle thread */
186 		td = PCPU_GET(idlethread);
187 		ke = td->td_kse;
188 		CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td);
189 	}
190 	ke->ke_flags |= KEF_DIDRUN;
191 
192 	/*
193 	 * If we are in panic, only allow system threads,
194 	 * plus the one we are running in, to be run.
195 	 */
196 	if (panicstr && ((td->td_proc->p_flag & P_SYSTEM) == 0 &&
197 	    (td->td_flags & TDF_INPANIC) == 0)) {
198 		/* note that it is no longer on the run queue */
199 		TD_SET_CAN_RUN(td);
200 		goto retry;
201 	}
202 
203 	TD_SET_RUNNING(td);
204 	return (td);
205 }
206 
207 /*
208  * Given a surplus system slot, try to assign a new runnable thread to it.
209  * Called from:
210  *  sched_thread_exit()  (local)
211  *  sched_switch()  (local)
213  *  remrunqueue()  (local)  (not at the moment)
214  */
215 static void
216 slot_fill(struct ksegrp *kg)
217 {
218 	struct thread *td;
219 
220 	mtx_assert(&sched_lock, MA_OWNED);
221 	while (kg->kg_avail_opennings > 0) {
222 		/*
223 		 * Find the first unassigned thread
224 		 */
225 		if ((td = kg->kg_last_assigned) != NULL)
226 			td = TAILQ_NEXT(td, td_runq);
227 		else
228 			td = TAILQ_FIRST(&kg->kg_runq);
229 
230 		/*
231 		 * If we found one, send it to the system scheduler.
232 		 */
233 		if (td) {
234 			kg->kg_last_assigned = td;
235 			sched_add(td, SRQ_YIELDING);
236 			CTR2(KTR_RUNQ, "slot_fill: td%p -> kg%p", td, kg);
237 		} else {
238 			/* no threads to use up the slots. quit now */
239 			break;
240 		}
241 	}
242 }
243 
244 #ifdef	SCHED_4BSD
245 /*
246  * Remove a thread from its KSEGRP's run queue.
247  * This in turn may remove it from a KSE if it was already assigned
248  * to one, possibly causing a new thread to be assigned to the KSE
249  * and the KSE getting a new priority.
250  */
251 static void
252 remrunqueue(struct thread *td)
253 {
254 	struct thread *td2, *td3;
255 	struct ksegrp *kg;
256 	struct kse *ke;
257 
258 	mtx_assert(&sched_lock, MA_OWNED);
259 	KASSERT((TD_ON_RUNQ(td)), ("remrunqueue: Bad state on run queue"));
260 	kg = td->td_ksegrp;
261 	ke = td->td_kse;
262 	CTR1(KTR_RUNQ, "remrunqueue: td%p", td);
263 	TD_SET_CAN_RUN(td);
264 	/*
265 	 * If it is not a threaded process, take the shortcut.
266 	 */
267 	if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
268 		/* remove from the sys run queue and free up a slot */
269 		sched_rem(td);
270 		return;
271 	}
272    	td3 = TAILQ_PREV(td, threadqueue, td_runq);
273 	TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
274 	if (ke->ke_state == KES_ONRUNQ) {
275 		/*
276 		 * This thread has been assigned to the system run queue.
277 		 * We need to dissociate it and try to assign the
278 		 * KSE to the next available thread. Then, we should
279 		 * see if we need to move the KSE in the run queues.
280 		 */
281 		sched_rem(td);
282 		td2 = kg->kg_last_assigned;
283 		KASSERT((td2 != NULL), ("last assigned has wrong value"));
284 		if (td2 == td)
285 			kg->kg_last_assigned = td3;
286 		/* slot_fill(kg); */ /* will replace it with another */
287 	}
288 }
289 #endif
290 
291 /*
292  * Change the priority of a thread that is on the run queue.
293  */
294 void
295 adjustrunqueue(struct thread *td, int newpri)
296 {
297 	struct ksegrp *kg;
298 	struct kse *ke;
299 
300 	mtx_assert(&sched_lock, MA_OWNED);
301 	KASSERT((TD_ON_RUNQ(td)), ("adjustrunqueue: Bad state on run queue"));
302 
303 	ke = td->td_kse;
304 	CTR1(KTR_RUNQ, "adjustrunqueue: td%p", td);
305 	/*
306 	 * If it is not a threaded process, take the shortcut.
307 	 */
308 	if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
309 		/* We only care about the kse in the run queue. */
310 		td->td_priority = newpri;
311 #ifndef SCHED_CORE
312 		if (ke->ke_rqindex != (newpri / RQ_PPQ))
313 #else
314 		if (ke->ke_rqindex != newpri)
315 #endif
316 		{
317 			sched_rem(td);
318 			sched_add(td, SRQ_BORING);
319 		}
320 		return;
321 	}
322 
323 	/* It is a threaded process */
324 	kg = td->td_ksegrp;
325 	if (ke->ke_state == KES_ONRUNQ
326 #ifdef SCHED_ULE
327 	 || ((ke->ke_flags & KEF_ASSIGNED) != 0 &&
328 	     (ke->ke_flags & KEF_REMOVED) == 0)
329 #endif
330 	   ) {
331 		if (kg->kg_last_assigned == td) {
332 			kg->kg_last_assigned =
333 			    TAILQ_PREV(td, threadqueue, td_runq);
334 		}
335 		sched_rem(td);
336 	}
337 	TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
338 	TD_SET_CAN_RUN(td);
339 	td->td_priority = newpri;
340 	setrunqueue(td, SRQ_BORING);
341 }
342 
343 /*
344  * This function is called when a thread is about to be put on a
345  * ksegrp run queue because it has been made runnable or its
346  * priority has been adjusted and the ksegrp does not have a
347  * free kse slot.  It determines if a thread from the same ksegrp
348  * should be preempted.  If so, it tries to switch threads
349  * if the thread is on the same cpu or notifies another cpu that
350  * it should switch threads.
351  */
352 
353 static void
354 maybe_preempt_in_ksegrp(struct thread *td)
355 #if  !defined(SMP)
356 {
357 	struct thread *running_thread;
358 
359 	mtx_assert(&sched_lock, MA_OWNED);
360 	running_thread = curthread;
361 
362 	if (running_thread->td_ksegrp != td->td_ksegrp)
363 		return;
364 
365 	if (td->td_priority >= running_thread->td_priority)
366 		return;
367 #ifdef PREEMPTION
368 #ifndef FULL_PREEMPTION
369 	if (td->td_priority > PRI_MAX_ITHD) {
370 		running_thread->td_flags |= TDF_NEEDRESCHED;
371 		return;
372 	}
373 #endif /* FULL_PREEMPTION */
374 
375 	if (running_thread->td_critnest > 1)
376 		running_thread->td_owepreempt = 1;
377 	else
378 		mi_switch(SW_INVOL, NULL);
379 
380 #else /* PREEMPTION */
381 	running_thread->td_flags |= TDF_NEEDRESCHED;
382 #endif /* PREEMPTION */
383 	return;
384 }
385 
386 #else /* SMP */
387 {
388 	struct thread *running_thread;
389 	int worst_pri;
390 	struct ksegrp *kg;
391 	cpumask_t cpumask,dontuse;
392 	struct pcpu *pc;
393 	struct pcpu *best_pcpu;
394 	struct thread *cputhread;
395 
396 	mtx_assert(&sched_lock, MA_OWNED);
397 
398 	running_thread = curthread;
399 
400 #if !defined(KSEG_PEEMPT_BEST_CPU)
401 	if (running_thread->td_ksegrp != td->td_ksegrp) {
402 #endif
403 		kg = td->td_ksegrp;
404 
405 		/* if someone is ahead of this thread, wait our turn */
406 		if (td != TAILQ_FIRST(&kg->kg_runq))
407 			return;
408 
409 		worst_pri = td->td_priority;
410 		best_pcpu = NULL;
411 		dontuse   = stopped_cpus | idle_cpus_mask;
412 
413 		/*
414 		 * Find the cpu running the worst priority thread from the
415 		 * same ksegrp - if multiple exist, prefer first the cpu this
416 		 * thread last ran on and then the current cpu.
417 		 */
418 
419 		SLIST_FOREACH(pc, &cpuhead, pc_allcpu) {
420 			cpumask   = pc->pc_cpumask;
421 			cputhread = pc->pc_curthread;
422 
423 			if ((cpumask & dontuse)  ||
424 			    cputhread->td_ksegrp != kg)
425 				continue;
426 
427 			if (cputhread->td_priority > worst_pri) {
428 				worst_pri = cputhread->td_priority;
429 				best_pcpu = pc;
430 				continue;
431 			}
432 
433 			if (cputhread->td_priority == worst_pri &&
434 			    best_pcpu != NULL &&
435 			    (td->td_lastcpu == pc->pc_cpuid ||
436 				(PCPU_GET(cpumask) == cpumask &&
437 				    td->td_lastcpu != best_pcpu->pc_cpuid)))
438 			    best_pcpu = pc;
439 		}
440 
441 		/* Check if we need to preempt someone */
442 		if (best_pcpu == NULL)
443 			return;
444 
445 #if defined(IPI_PREEMPTION) && defined(PREEMPTION)
446 #if !defined(FULL_PREEMPTION)
447 		if (td->td_priority <= PRI_MAX_ITHD)
448 #endif /* ! FULL_PREEMPTION */
449 			{
450 				ipi_selected(best_pcpu->pc_cpumask, IPI_PREEMPT);
451 				return;
452 			}
453 #endif /* defined(IPI_PREEMPTION) && defined(PREEMPTION) */
454 
455 		if (PCPU_GET(cpuid) != best_pcpu->pc_cpuid) {
456 			best_pcpu->pc_curthread->td_flags |= TDF_NEEDRESCHED;
457 			ipi_selected(best_pcpu->pc_cpumask, IPI_AST);
458 			return;
459 		}
460 #if !defined(KSEG_PEEMPT_BEST_CPU)
461 	}
462 #endif
463 
464 	if (td->td_priority >= running_thread->td_priority)
465 		return;
466 #ifdef PREEMPTION
467 
468 #if !defined(FULL_PREEMPTION)
469 	if (td->td_priority > PRI_MAX_ITHD) {
470 		running_thread->td_flags |= TDF_NEEDRESCHED;
		return;
471 	}
472 #endif /* ! FULL_PREEMPTION */
473 
474 	if (running_thread->td_critnest > 1)
475 		running_thread->td_owepreempt = 1;
476 	else
477 		mi_switch(SW_INVOL, NULL);
478 
479 #else /* PREEMPTION */
480 	running_thread->td_flags |= TDF_NEEDRESCHED;
481 #endif /* PREEMPTION */
482 	return;
483 }
484 #endif /* !SMP */
485 
486 
487 int limitcount;
488 void
489 setrunqueue(struct thread *td, int flags)
490 {
491 	struct ksegrp *kg;
492 	struct thread *td2;
493 	struct thread *tda;
494 
495 	CTR3(KTR_RUNQ, "setrunqueue: td:%p kg:%p pid:%d",
496 	    td, td->td_ksegrp, td->td_proc->p_pid);
497 	CTR5(KTR_SCHED, "setrunqueue: %p(%s) prio %d by %p(%s)",
498             td, td->td_proc->p_comm, td->td_priority, curthread,
499             curthread->td_proc->p_comm);
500 	mtx_assert(&sched_lock, MA_OWNED);
501 	KASSERT((td->td_inhibitors == 0),
502 			("setrunqueue: trying to run inhibited thread"));
503 	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
504 	    ("setrunqueue: bad thread state"));
505 	TD_SET_RUNQ(td);
506 	kg = td->td_ksegrp;
507 	if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
508 		/*
509 		 * Common path optimisation: Only one of everything
510 		 * and the KSE is always already attached.
511 		 * Totally ignore the ksegrp run queue.
512 		 */
513 		if (kg->kg_avail_opennings != 1) {
514 			if (limitcount < 1) {
515 				limitcount++;
516 				printf("pid %d: corrected slot count (%d->1)\n",
517 				    td->td_proc->p_pid, kg->kg_avail_opennings);
518 
519 			}
520 			kg->kg_avail_opennings = 1;
521 		}
522 		sched_add(td, flags);
523 		return;
524 	}
525 
526 	/*
527 	 * If the concurrency has been reduced, and we would land in the
528 	 * assigned section, then keep removing entries from the
529 	 * system run queue until we are no longer in that section
530 	 * or there is room for us to be put in that section.
531 	 * What we MUST avoid is the case where threads of lower
532 	 * priority than the new one are scheduled, but the new one can not
533 	 * be scheduled itself. That would lead to a non-contiguous set
534 	 * of scheduled threads, and everything would break.
535 	 */
536 	tda = kg->kg_last_assigned;
537 	while ((kg->kg_avail_opennings <= 0) &&
538 	    (tda && (tda->td_priority > td->td_priority))) {
539 		/*
540 		 * None free, but there is one we can commandeer.
541 		 */
542 		CTR2(KTR_RUNQ,
543 		    "setrunqueue: kg:%p: take slot from td: %p", kg, tda);
544 		sched_rem(tda);
545 		tda = kg->kg_last_assigned =
546 		    TAILQ_PREV(tda, threadqueue, td_runq);
547 	}
548 
549 	/*
550 	 * Add the thread to the ksegrp's run queue at
551 	 * the appropriate place.
552 	 */
553 	TAILQ_FOREACH(td2, &kg->kg_runq, td_runq) {
554 		if (td2->td_priority > td->td_priority) {
555 			TAILQ_INSERT_BEFORE(td2, td, td_runq);
556 			break;
557 		}
558 	}
559 	if (td2 == NULL) {
560 		/* We ran off the end of the TAILQ or it was empty. */
561 		TAILQ_INSERT_TAIL(&kg->kg_runq, td, td_runq);
562 	}
563 
564 	/*
565 	 * If we have a slot to use, then put the thread on the system
566 	 * run queue and if needed, readjust the last_assigned pointer.
567 	 * It may be that we need to schedule something anyhow
568 	 * even if the available slots are negative, so that
569 	 * all the items < last_assigned are scheduled.
570 	 */
571 	if (kg->kg_avail_opennings > 0) {
572 		if (tda == NULL) {
573 			/*
574 			 * No pre-existing last assigned so whoever is first
575 			 * gets the slot.. (maybe us)
576 			 */
577 			td2 = TAILQ_FIRST(&kg->kg_runq);
578 			kg->kg_last_assigned = td2;
579 		} else if (tda->td_priority > td->td_priority) {
580 			td2 = td;
581 		} else {
582 			/*
583 			 * We are past last_assigned, so
584 			 * give the next slot to whatever is next,
585 			 * which may or may not be us.
586 			 */
587 			td2 = TAILQ_NEXT(tda, td_runq);
588 			kg->kg_last_assigned = td2;
589 		}
590 		sched_add(td2, flags);
591 	} else {
592 		CTR3(KTR_RUNQ, "setrunqueue: held: td%p kg%p pid%d",
593 			td, td->td_ksegrp, td->td_proc->p_pid);
594 		if ((flags & SRQ_YIELDING) == 0)
595 			maybe_preempt_in_ksegrp(td);
596 	}
597 }
598 
599 /*
600  * Kernel thread preemption implementation.  Critical sections mark
601  * regions of code in which preemptions are not allowed.
602  */
603 void
604 critical_enter(void)
605 {
606 	struct thread *td;
607 
608 	td = curthread;
609 	td->td_critnest++;
610 	CTR4(KTR_CRITICAL, "critical_enter by thread %p (%ld, %s) to %d", td,
611 	    (long)td->td_proc->p_pid, td->td_proc->p_comm, td->td_critnest);
612 }
613 
614 void
615 critical_exit(void)
616 {
617 	struct thread *td;
618 
619 	td = curthread;
620 	KASSERT(td->td_critnest != 0,
621 	    ("critical_exit: td_critnest == 0"));
622 #ifdef PREEMPTION
623 	if (td->td_critnest == 1) {
624 		td->td_critnest = 0;
625 		mtx_assert(&sched_lock, MA_NOTOWNED);
626 		if (td->td_owepreempt) {
627 			td->td_critnest = 1;
628 			mtx_lock_spin(&sched_lock);
629 			td->td_critnest--;
630 			mi_switch(SW_INVOL, NULL);
631 			mtx_unlock_spin(&sched_lock);
632 		}
633 	} else
634 #endif
635 		td->td_critnest--;
636 
637 	CTR4(KTR_CRITICAL, "critical_exit by thread %p (%ld, %s) to %d", td,
638 	    (long)td->td_proc->p_pid, td->td_proc->p_comm, td->td_critnest);
639 }
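/*
 * Minimal usage sketch (illustrative only, not part of this file's logic):
 * critical sections simply bracket short stretches of code that must not be
 * preempted, typically per-CPU state updates.  The pcpu field
 * "example_count" below is hypothetical.
 */
#if 0
	critical_enter();
	/* Preemption of this thread is deferred until critical_exit(). */
	PCPU_SET(example_count, PCPU_GET(example_count) + 1);	/* hypothetical */
	critical_exit();
#endif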
640 
641 /*
642  * This function is called when a thread is about to be put on run queue
643  * because it has been made runnable or its priority has been adjusted.  It
644  * determines whether we should immediately preempt to the new thread.  If so,
645  * it switches to it and eventually returns true.  If not, it returns false
646  * so that the caller may place the thread on an appropriate run queue.
647  */
648 int
649 maybe_preempt(struct thread *td)
650 {
651 #ifdef PREEMPTION
652 	struct thread *ctd;
653 	int cpri, pri;
654 #endif
655 
656 	mtx_assert(&sched_lock, MA_OWNED);
657 #ifdef PREEMPTION
658 	/*
659 	 * The new thread should not preempt the current thread if any of the
660 	 * following conditions are true:
661 	 *
662 	 *  - The kernel is in the throes of crashing (panicstr).
663 	 *  - The current thread has a higher (numerically lower) or
664 	 *    equivalent priority.  Note that this prevents curthread from
665 	 *    trying to preempt to itself.
666 	 *  - It is too early in the boot for context switches (cold is set).
667 	 *  - The current thread has an inhibitor set or is in the process of
668 	 *    exiting.  In this case, the current thread is about to switch
669 	 *    out anyways, so there's no point in preempting.  If we did,
670 	 *    the current thread would not be properly resumed as well, so
671 	 *    just avoid that whole landmine.
672 	 *  - If the new thread's priority is not a realtime priority and
673 	 *    the current thread's priority is not an idle priority and
674 	 *    FULL_PREEMPTION is disabled.
675 	 *
676 	 * If all of these conditions are false, but the current thread is in
677 	 * a nested critical section, then we have to defer the preemption
678 	 * until we exit the critical section.  Otherwise, switch immediately
679 	 * to the new thread.
680 	 */
681 	ctd = curthread;
682 	KASSERT ((ctd->td_kse != NULL && ctd->td_kse->ke_thread == ctd),
683 	  ("thread has no (or wrong) sched-private part."));
684 	KASSERT((td->td_inhibitors == 0),
685 			("maybe_preempt: trying to run inhibited thread"));
686 	pri = td->td_priority;
687 	cpri = ctd->td_priority;
688 	if (panicstr != NULL || pri >= cpri || cold /* || dumping */ ||
689 	    TD_IS_INHIBITED(ctd) || td->td_kse->ke_state != KES_THREAD)
690 		return (0);
691 #ifndef FULL_PREEMPTION
692 	if (pri > PRI_MAX_ITHD && cpri < PRI_MIN_IDLE)
693 		return (0);
694 #endif
695 
696 	if (ctd->td_critnest > 1) {
697 		CTR1(KTR_PROC, "maybe_preempt: in critical section %d",
698 		    ctd->td_critnest);
699 		ctd->td_owepreempt = 1;
700 		return (0);
701 	}
702 
703 	/*
704 	 * Thread is runnable but not yet put on system run queue.
705 	 */
706 	MPASS(TD_ON_RUNQ(td));
707 	MPASS(td->td_sched->ke_state != KES_ONRUNQ);
708 	if (td->td_proc->p_flag & P_HADTHREADS) {
709 		/*
710 		 * If this is a threaded process we actually ARE on the
711 		 * ksegrp run queue so take it off that first.
712 		 * Also undo any damage done to the last_assigned pointer.
713 		 * XXX Fix setrunqueue so this isn't needed
714 		 */
715 		struct ksegrp *kg;
716 
717 		kg = td->td_ksegrp;
718 		if (kg->kg_last_assigned == td)
719 			kg->kg_last_assigned =
720 			    TAILQ_PREV(td, threadqueue, td_runq);
721 		TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
722 	}
723 
724 	TD_SET_RUNNING(td);
725 	CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td,
726 	    td->td_proc->p_pid, td->td_proc->p_comm);
727 	mi_switch(SW_INVOL|SW_PREEMPT, td);
728 	return (1);
729 #else
730 	return (0);
731 #endif
732 }
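/*
 * Hedged sketch of the caller's side (not code from this file): a scheduler's
 * add routine would typically only queue the thread when maybe_preempt() did
 * not already switch to it.  The "runq" and "flags" names here stand in for
 * whatever the calling scheduler actually uses.
 */
#if 0
	if (!maybe_preempt(td))
		runq_add(&runq, td->td_kse, flags);
#endif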
733 
734 #if 0
735 #ifndef PREEMPTION
736 /* XXX: There should be a non-static version of this. */
737 static void
738 printf_caddr_t(void *data)
739 {
740 	printf("%s", (char *)data);
741 }
742 static char preempt_warning[] =
743     "WARNING: Kernel preemption is disabled, expect reduced performance.\n";
744 SYSINIT(preempt_warning, SI_SUB_COPYRIGHT, SI_ORDER_ANY, printf_caddr_t,
745     preempt_warning)
746 #endif
747 #endif
748 
749 /************************************************************************
750  * SYSTEM RUN QUEUE manipulations and tests				*
751  ************************************************************************/
752 /*
753  * Initialize a run structure.
754  */
755 void
756 runq_init(struct runq *rq)
757 {
758 	int i;
759 
760 	bzero(rq, sizeof *rq);
761 	for (i = 0; i < RQ_NQS; i++)
762 		TAILQ_INIT(&rq->rq_queues[i]);
763 }
764 
765 /*
766  * Clear the status bit of the queue corresponding to priority level pri,
767  * indicating that it is empty.
768  */
769 static __inline void
770 runq_clrbit(struct runq *rq, int pri)
771 {
772 	struct rqbits *rqb;
773 
774 	rqb = &rq->rq_status;
775 	CTR4(KTR_RUNQ, "runq_clrbit: bits=%#x %#x bit=%#x word=%d",
776 	    rqb->rqb_bits[RQB_WORD(pri)],
777 	    rqb->rqb_bits[RQB_WORD(pri)] & ~RQB_BIT(pri),
778 	    RQB_BIT(pri), RQB_WORD(pri));
779 	rqb->rqb_bits[RQB_WORD(pri)] &= ~RQB_BIT(pri);
780 }
781 
782 /*
783  * Find the index of the first non-empty run queue.  This is done by
784  * scanning the status bits; a set bit indicates a non-empty queue.
785  */
786 static __inline int
787 runq_findbit(struct runq *rq)
788 {
789 	struct rqbits *rqb;
790 	int pri;
791 	int i;
792 
793 	rqb = &rq->rq_status;
794 	for (i = 0; i < RQB_LEN; i++)
795 		if (rqb->rqb_bits[i]) {
796 			pri = RQB_FFS(rqb->rqb_bits[i]) + (i << RQB_L2BPW);
797 			CTR3(KTR_RUNQ, "runq_findbit: bits=%#x i=%d pri=%d",
798 			    rqb->rqb_bits[i], i, pri);
799 			return (pri);
800 		}
801 
802 	return (-1);
803 }
804 
805 /*
806  * Set the status bit of the queue corresponding to priority level pri,
807  * indicating that it is non-empty.
808  */
809 static __inline void
810 runq_setbit(struct runq *rq, int pri)
811 {
812 	struct rqbits *rqb;
813 
814 	rqb = &rq->rq_status;
815 	CTR4(KTR_RUNQ, "runq_setbit: bits=%#x %#x bit=%#x word=%d",
816 	    rqb->rqb_bits[RQB_WORD(pri)],
817 	    rqb->rqb_bits[RQB_WORD(pri)] | RQB_BIT(pri),
818 	    RQB_BIT(pri), RQB_WORD(pri));
819 	rqb->rqb_bits[RQB_WORD(pri)] |= RQB_BIT(pri);
820 }
821 
822 /*
823  * Add the KSE to the queue specified by its priority, and set the
824  * corresponding status bit.
825  */
826 void
827 runq_add(struct runq *rq, struct kse *ke, int flags)
828 {
829 	struct rqhead *rqh;
830 	int pri;
831 
832 	pri = ke->ke_thread->td_priority / RQ_PPQ;
833 	ke->ke_rqindex = pri;
834 	runq_setbit(rq, pri);
835 	rqh = &rq->rq_queues[pri];
836 	CTR5(KTR_RUNQ, "runq_add: td=%p ke=%p pri=%d %d rqh=%p",
837 	    ke->ke_thread, ke, ke->ke_thread->td_priority, pri, rqh);
838 	if (flags & SRQ_PREEMPTED) {
839 		TAILQ_INSERT_HEAD(rqh, ke, ke_procq);
840 	} else {
841 		TAILQ_INSERT_TAIL(rqh, ke, ke_procq);
842 	}
843 }
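/*
 * Worked example of the index arithmetic above (numbers assume the usual
 * runq.h parameters, RQ_PPQ == 4 and 32-bit status words with
 * RQB_L2BPW == 5; treat them as an illustration, not a guarantee):
 *
 *	td_priority = 130
 *	pri  = 130 / RQ_PPQ            = 32	(run queue index)
 *	word = RQB_WORD(32) = 32 >> 5  = 1
 *	bit  = RQB_BIT(32)  = 1 << (32 & 31) = 1 << 0
 *
 * so the KSE is linked onto rq_queues[32] and runq_setbit() sets bit 0 of
 * rq_status.rqb_bits[1].
 */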
844 
845 /*
846  * Return true if there are runnable processes of any priority on the run
847  * queue, false otherwise.  Has no side effects and does not modify the run
848  * queue structure.
849  */
850 int
851 runq_check(struct runq *rq)
852 {
853 	struct rqbits *rqb;
854 	int i;
855 
856 	rqb = &rq->rq_status;
857 	for (i = 0; i < RQB_LEN; i++)
858 		if (rqb->rqb_bits[i]) {
859 			CTR2(KTR_RUNQ, "runq_check: bits=%#x i=%d",
860 			    rqb->rqb_bits[i], i);
861 			return (1);
862 		}
863 	CTR0(KTR_RUNQ, "runq_check: empty");
864 
865 	return (0);
866 }
867 
868 #if defined(SMP) && defined(SCHED_4BSD)
869 int runq_fuzz = 1;
870 SYSCTL_INT(_kern_sched, OID_AUTO, runq_fuzz, CTLFLAG_RW, &runq_fuzz, 0, "");
871 #endif
872 
873 /*
874  * Find the highest priority process on the run queue.
875  */
876 struct kse *
877 runq_choose(struct runq *rq)
878 {
879 	struct rqhead *rqh;
880 	struct kse *ke;
881 	int pri;
882 
883 	mtx_assert(&sched_lock, MA_OWNED);
884 	while ((pri = runq_findbit(rq)) != -1) {
885 		rqh = &rq->rq_queues[pri];
886 #if defined(SMP) && defined(SCHED_4BSD)
887 		/* fuzz == 1 is normal.. 0 or less are ignored */
888 		if (runq_fuzz > 1) {
889 			/*
890 			 * In the first couple of entries, check if
891 			 * there is one for our CPU as a preference.
892 			 */
893 			int count = runq_fuzz;
894 			int cpu = PCPU_GET(cpuid);
895 			struct kse *ke2;
896 			ke2 = ke = TAILQ_FIRST(rqh);
897 
898 			while (count-- && ke2) {
899 				if (ke2->ke_thread->td_lastcpu == cpu) {
900 					ke = ke2;
901 					break;
902 				}
903 				ke2 = TAILQ_NEXT(ke2, ke_procq);
904 			}
905 		} else
906 #endif
907 			ke = TAILQ_FIRST(rqh);
908 		KASSERT(ke != NULL, ("runq_choose: no proc on busy queue"));
909 		CTR3(KTR_RUNQ,
910 		    "runq_choose: pri=%d kse=%p rqh=%p", pri, ke, rqh);
911 		return (ke);
912 	}
913 	CTR1(KTR_RUNQ, "runq_choose: idleproc pri=%d", pri);
914 
915 	return (NULL);
916 }
917 
918 /*
919  * Remove the KSE from the queue specified by its priority, and clear the
920  * corresponding status bit if the queue becomes empty.
921  * Caller must set ke->ke_state afterwards.
922  */
923 void
924 runq_remove(struct runq *rq, struct kse *ke)
925 {
926 	struct rqhead *rqh;
927 	int pri;
928 
929 	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
930 		("runq_remove: process swapped out"));
931 	pri = ke->ke_rqindex;
932 	rqh = &rq->rq_queues[pri];
933 	CTR5(KTR_RUNQ, "runq_remove: td=%p, ke=%p pri=%d %d rqh=%p",
934 	    ke->ke_thread, ke, ke->ke_thread->td_priority, pri, rqh);
935 	KASSERT(ke != NULL, ("runq_remove: no proc on busy queue"));
936 	TAILQ_REMOVE(rqh, ke, ke_procq);
937 	if (TAILQ_EMPTY(rqh)) {
938 		CTR0(KTR_RUNQ, "runq_remove: empty");
939 		runq_clrbit(rq, pri);
940 	}
941 }
942 
943 /****** functions that are temporarily here ***********/
944 #include <vm/uma.h>
945 extern struct mtx kse_zombie_lock;
946 
947 /*
948  * Allocate scheduler-specific per-process resources.
949  * The thread and ksegrp have already been linked in.
950  * In this case just set the default concurrency value.
951  *
952  * Called from:
953  *  proc_init() (UMA init method)
954  */
955 void
956 sched_newproc(struct proc *p, struct ksegrp *kg, struct thread *td)
957 {
958 
959 	/* This can go in sched_fork */
960 	sched_init_concurrency(kg);
961 }
962 
963 /*
963  * A thread is being either created or recycled.
965  * Fix up the per-scheduler resources associated with it.
966  * Called from:
967  *  sched_fork_thread()
968  *  thread_dtor()  (*may go away)
969  *  thread_init()  (*may go away)
970  */
971 void
972 sched_newthread(struct thread *td)
973 {
974 	struct td_sched *ke;
975 
976 	ke = (struct td_sched *) (td + 1);
977 	bzero(ke, sizeof(*ke));
978 	td->td_sched     = ke;
979 	ke->ke_thread	= td;
980 	ke->ke_state	= KES_THREAD;
981 }
982 
983 /*
984  * Set up an initial concurrency of 1
985  * and set the given thread (if given) to be using that
986  * concurrency slot.
987  * May be used "offline", i.e. before the ksegrp is attached to the world,
988  * and thus wouldn't need schedlock in that case.
989  * Called from:
990  *  thr_create()
991  *  proc_init() (UMA) via sched_newproc()
992  */
993 void
994 sched_init_concurrency(struct ksegrp *kg)
995 {
996 
997 	CTR1(KTR_RUNQ,"kg %p init slots and concurrency to 1", kg);
998 	kg->kg_concurrency = 1;
999 	kg->kg_avail_opennings = 1;
1000 }
1001 
1002 /*
1003  * Change the concurrency of an existing ksegrp to N
1004  * Called from:
1005  *  kse_create()
1006  *  kse_exit()
1007  *  thread_exit()
1008  *  thread_single()
1009  */
1010 void
1011 sched_set_concurrency(struct ksegrp *kg, int concurrency)
1012 {
1013 
1014 	CTR4(KTR_RUNQ,"kg %p set concurrency to %d, slots %d -> %d",
1015 	    kg,
1016 	    concurrency,
1017 	    kg->kg_avail_opennings,
1018 	    kg->kg_avail_opennings + (concurrency - kg->kg_concurrency));
1019 	kg->kg_avail_opennings += (concurrency - kg->kg_concurrency);
1020 	kg->kg_concurrency = concurrency;
1021 }
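/*
 * For example (hypothetical numbers): raising a ksegrp's concurrency from
 * 1 to 3 while it has 0 available openings leaves kg_avail_opennings at
 * 0 + (3 - 1) = 2; dropping it back to 1 subtracts 2 again, which can
 * temporarily drive kg_avail_opennings negative, a state the code above
 * tolerates (see the setrunqueue() comment about negative available slots).
 */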
1022 
1023 /*
1024  * Called from thread_exit() for all exiting threads
1025  *
1026  * Not to be confused with sched_exit_thread(), which is only called from
1027  * thread_exit() for threads exiting without the rest of the process exiting,
1028  * because it is also called from sched_exit() and we wouldn't want to call
1029  * it twice.
1030  * XXX This can probably be fixed.
1031  */
1032 void
1033 sched_thread_exit(struct thread *td)
1034 {
1035 
1036 	SLOT_RELEASE(td->td_ksegrp);
1037 	slot_fill(td->td_ksegrp);
1038 }
1039 
1040 #endif /* KERN_SWITCH_INCLUDE */
1041