xref: /titanic_50/usr/src/uts/common/os/sched.c (revision 965005c81e0f731867d47892b9fb677030b102df)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 
23 /*
24  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
25  * Use is subject to license terms.
26  */
27 
28 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
29 /*	  All Rights Reserved	*/
30 
31 
32 #pragma ident	"%Z%%M%	%I%	%E% SMI"
33 
34 #include <sys/param.h>
35 #include <sys/types.h>
36 #include <sys/sysmacros.h>
37 #include <sys/systm.h>
38 #include <sys/proc.h>
39 #include <sys/cpuvar.h>
40 #include <sys/var.h>
41 #include <sys/tuneable.h>
42 #include <sys/cmn_err.h>
43 #include <sys/buf.h>
44 #include <sys/disp.h>
45 #include <sys/vmsystm.h>
46 #include <sys/vmparam.h>
47 #include <sys/class.h>
48 #include <sys/vtrace.h>
49 #include <sys/modctl.h>
50 #include <sys/debug.h>
51 #include <sys/tnf_probe.h>
52 #include <sys/procfs.h>
53 
54 #include <vm/seg.h>
55 #include <vm/seg_kp.h>
56 #include <vm/as.h>
57 #include <vm/rm.h>
58 #include <vm/seg_kmem.h>
59 #include <sys/callb.h>
60 
/*
 * Swapper stop bookkeeping.
 *
 * sched() bumps runout just before it stops itself because no process
 * is eligible to be swapped in.  It bumps runin just before it stops
 * itself because a pass could not make progress (not enough memory to
 * swap the chosen process in and nothing could be swapped out).
 */
char	runout;		/* swapper stopped: no one to swap in */
char	runin;		/* swapper stopped: waiting for memory */
char	wake_sched;	/* flag tells clock to wake swapper on next tick */
char	wake_sched_sec;	/* flag tells clock to wake swapper after a second */
70 
71 /*
72  * The swapper swaps processes to reduce memory demand and runs
73  * when avefree < desfree.  The swapper resorts to SOFTSWAP when
74  * avefree < desfree which results in swapping out all processes
75  * sleeping for more than maxslp seconds.  HARDSWAP occurs when the
76  * system is on the verge of thrashing and this results in swapping
77  * out runnable threads or threads sleeping for less than maxslp secs.
78  *
79  * The swapper runs through all the active processes in the system
80  * and invokes the scheduling class specific swapin/swapout routine
81  * for every thread in the process to obtain an effective priority
82  * for the process.  A priority of -1 implies that the thread isn't
83  * swappable.  This effective priority is used to find the most
84  * eligible process to swapout or swapin.
85  *
86  * NOTE:  Threads which have been swapped are not linked on any
87  *	  queue and their dispatcher lock points at the "swapped_lock".
88  *
89  * Processes containing threads with the TS_DONT_SWAP flag set cannot be
90  * swapped out immediately by the swapper.  This is due to the fact that
91  * such threads may be holding locks which may be needed by the swapper
92  * to push its pages out.  The TS_SWAPENQ flag is set on such threads
93  * to prevent them running in user mode.  When such threads reach a
94  * safe point (i.e., are not holding any locks - CL_TRAPRET), they
95  * queue themseleves onto the swap queue which is processed by the
96  * swapper.  This results in reducing memory demand when the system
97  * is desparate for memory as the thread can't run in user mode.
98  *
99  * The swap queue consists of threads, linked via t_link, which are
100  * haven't been swapped, are runnable but not on the run queue.  The
101  * swap queue is protected by the "swapped_lock".  The dispatcher
102  * lock (t_lockp) of all threads on the swap queue points at the
103  * "swapped_lock".  Thus, the entire queue and/or threads on the
104  * queue can be locked by acquiring "swapped_lock".
105  */
106 static kthread_t *tswap_queue;
107 extern disp_lock_t swapped_lock; /* protects swap queue and threads on it */
108 
109 int	maxslp = 0;
110 pgcnt_t	avefree;	/* 5 sec moving average of free memory */
111 pgcnt_t	avefree30;	/* 30 sec moving average of free memory */
112 
/*
 * Lower bound, in pages, on the amount of memory sched() wants to see
 * available before it swaps a process in.  A floor is needed because
 * the recorded size of a swapped process (p_swrss) is usually only
 * 2 pages (the kernel stack pages): almost all of the user pages of a
 * process have already been stolen by pageout by the time the swapper
 * decides to swap it out.
 */
int	min_procsize = 12;
122 
123 static int	swapin(proc_t *);
124 static int	swapout(proc_t *, uint_t *, int);
125 static void	process_swap_queue();
126 
127 #ifdef __sparc
128 extern void lwp_swapin(kthread_t *);
129 #endif /* __sparc */
130 
131 /*
132  * Counters to keep track of the number of swapins or swapouts.
133  */
134 uint_t tot_swapped_in, tot_swapped_out;
135 uint_t softswap, hardswap, swapqswap;
136 
/*
 * Evaluates to nonzero if process (p) must never be considered for
 * swapping: it is a system process (SSYS), it is in the SIDL or SZOMB
 * state, or it has no address space of its own (p_as is NULL or is
 * the kernel address space, kas).  Note that (p) is evaluated more
 * than once.
 */
#define	not_swappable(p)					\
	(!(((p)->p_flag & SSYS) == 0 && (p)->p_stat != SIDL &&	\
	    (p)->p_stat != SZOMB && (p)->p_as != NULL &&	\
	    (p)->p_as != &kas))
144 
145 /*
146  * Memory scheduler.
147  */
148 void
149 sched()
150 {
151 	kthread_id_t	t;
152 	pri_t		proc_pri;
153 	pri_t		thread_pri;
154 	pri_t		swapin_pri;
155 	int		desperate;
156 	pgcnt_t		needs;
157 	int		divisor;
158 	proc_t		*prp;
159 	proc_t		*swapout_prp;
160 	proc_t		*swapin_prp;
161 	spgcnt_t	avail;
162 	int		chosen_pri;
163 	time_t		swapout_time;
164 	time_t		swapin_proc_time;
165 	callb_cpr_t	cprinfo;
166 	kmutex_t	swap_cpr_lock;
167 
168 	mutex_init(&swap_cpr_lock, NULL, MUTEX_DEFAULT, NULL);
169 	CALLB_CPR_INIT(&cprinfo, &swap_cpr_lock, callb_generic_cpr, "sched");
170 	if (maxslp == 0)
171 		maxslp = MAXSLP;
172 loop:
173 	needs = 0;
174 	desperate = 0;
175 
176 	swapin_pri = v.v_nglobpris;
177 	swapin_prp = NULL;
178 	chosen_pri = -1;
179 
180 	process_swap_queue();
181 
182 	/*
183 	 * Set desperate if
184 	 * 	1.  At least 2 runnable processes (on average).
185 	 *	2.  Short (5 sec) and longer (30 sec) average is less
186 	 *	    than minfree and desfree respectively.
187 	 *	3.  Pagein + pageout rate is excessive.
188 	 */
189 	if (avenrun[0] >= 2 * FSCALE &&
190 	    (MAX(avefree, avefree30) < desfree) &&
191 	    (pginrate + pgoutrate > maxpgio || avefree < minfree)) {
192 		TRACE_4(TR_FAC_SCHED, TR_DESPERATE,
193 		    "desp:avefree: %d, avefree30: %d, freemem: %d"
194 		    " pginrate: %d\n", avefree, avefree30, freemem, pginrate);
195 		desperate = 1;
196 		goto unload;
197 	}
198 
199 	/*
200 	 * Search list of processes to swapin and swapout deadwood.
201 	 */
202 	swapin_proc_time = 0;
203 top:
204 	mutex_enter(&pidlock);
205 	for (prp = practive; prp != NULL; prp = prp->p_next) {
206 		if (not_swappable(prp))
207 			continue;
208 
209 		/*
210 		 * Look at processes with at least one swapped lwp.
211 		 */
212 		if (prp->p_swapcnt) {
213 			time_t proc_time;
214 
215 			/*
216 			 * Higher priority processes are good candidates
217 			 * to swapin.
218 			 */
219 			mutex_enter(&prp->p_lock);
220 			proc_pri = -1;
221 			t = prp->p_tlist;
222 			proc_time = 0;
223 			do {
224 				if (t->t_schedflag & TS_LOAD)
225 					continue;
226 
227 				thread_lock(t);
228 				thread_pri = CL_SWAPIN(t, 0);
229 				thread_unlock(t);
230 
231 				if (t->t_stime - proc_time > 0)
232 					proc_time = t->t_stime;
233 				if (thread_pri > proc_pri)
234 					proc_pri = thread_pri;
235 			} while ((t = t->t_forw) != prp->p_tlist);
236 			mutex_exit(&prp->p_lock);
237 
238 			if (proc_pri == -1)
239 				continue;
240 
241 			TRACE_3(TR_FAC_SCHED, TR_CHOOSE_SWAPIN,
242 			    "prp %p epri %d proc_time %d",
243 			    prp, proc_pri, proc_time);
244 
245 			/*
246 			 * Swapin processes with a high effective priority.
247 			 */
248 			if (swapin_prp == NULL || proc_pri > chosen_pri) {
249 				swapin_prp = prp;
250 				chosen_pri = proc_pri;
251 				swapin_pri = proc_pri;
252 				swapin_proc_time = proc_time;
253 			}
254 		} else {
255 			/*
256 			 * No need to soft swap if we have sufficient
257 			 * memory.
258 			 */
259 			if (avefree > desfree ||
260 			    avefree < desfree && freemem > desfree)
261 				continue;
262 
263 			/*
264 			 * Skip processes that are exiting
265 			 * or whose address spaces are locked.
266 			 */
267 			mutex_enter(&prp->p_lock);
268 			if ((prp->p_flag & SEXITING) ||
269 			    (prp->p_as != NULL && AS_ISPGLCK(prp->p_as))) {
270 				mutex_exit(&prp->p_lock);
271 				continue;
272 			}
273 
274 			/*
275 			 * Softswapping to kick out deadwood.
276 			 */
277 			proc_pri = -1;
278 			t = prp->p_tlist;
279 			do {
280 				if ((t->t_schedflag & (TS_SWAPENQ |
281 				    TS_ON_SWAPQ | TS_LOAD)) != TS_LOAD)
282 					continue;
283 
284 				thread_lock(t);
285 				thread_pri = CL_SWAPOUT(t, SOFTSWAP);
286 				thread_unlock(t);
287 				if (thread_pri > proc_pri)
288 					proc_pri = thread_pri;
289 			} while ((t = t->t_forw) != prp->p_tlist);
290 
291 			if (proc_pri != -1) {
292 				uint_t swrss;
293 
294 				mutex_exit(&pidlock);
295 
296 				TRACE_1(TR_FAC_SCHED, TR_SOFTSWAP,
297 				    "softswap:prp %p", prp);
298 
299 				(void) swapout(prp, &swrss, SOFTSWAP);
300 				softswap++;
301 				prp->p_swrss += swrss;
302 				mutex_exit(&prp->p_lock);
303 				goto top;
304 			}
305 			mutex_exit(&prp->p_lock);
306 		}
307 	}
308 	if (swapin_prp != NULL)
309 		mutex_enter(&swapin_prp->p_lock);
310 	mutex_exit(&pidlock);
311 
312 	if (swapin_prp == NULL) {
313 		TRACE_3(TR_FAC_SCHED, TR_RUNOUT,
314 		"schedrunout:runout nswapped: %d, avefree: %ld freemem: %ld",
315 		    nswapped, avefree, freemem);
316 
317 		t = curthread;
318 		thread_lock(t);
319 		runout++;
320 		t->t_schedflag |= (TS_ALLSTART & ~TS_CSTART);
321 		t->t_whystop = PR_SUSPENDED;
322 		t->t_whatstop = SUSPEND_NORMAL;
323 		(void) new_mstate(t, LMS_SLEEP);
324 		mutex_enter(&swap_cpr_lock);
325 		CALLB_CPR_SAFE_BEGIN(&cprinfo);
326 		mutex_exit(&swap_cpr_lock);
327 		thread_stop(t);		/* change state and drop lock */
328 		swtch();
329 		mutex_enter(&swap_cpr_lock);
330 		CALLB_CPR_SAFE_END(&cprinfo, &swap_cpr_lock);
331 		mutex_exit(&swap_cpr_lock);
332 		goto loop;
333 	}
334 
335 	/*
336 	 * Decide how deserving this process is to be brought in.
337 	 * Needs is an estimate of how much core the process will
338 	 * need.  If the process has been out for a while, then we
339 	 * will bring it in with 1/2 the core needed, otherwise
340 	 * we are conservative.
341 	 */
342 	divisor = 1;
343 	swapout_time = (lbolt - swapin_proc_time) / hz;
344 	if (swapout_time > maxslp / 2)
345 		divisor = 2;
346 
347 	needs = MIN(swapin_prp->p_swrss, lotsfree);
348 	needs = MAX(needs, min_procsize);
349 	needs = needs / divisor;
350 
351 	/*
352 	 * Use freemem, since we want processes to be swapped
353 	 * in quickly.
354 	 */
355 	avail = freemem - deficit;
356 	if (avail > (spgcnt_t)needs) {
357 		deficit += needs;
358 
359 		TRACE_2(TR_FAC_SCHED, TR_SWAPIN_VALUES,
360 		    "swapin_values: prp %p needs %lu", swapin_prp, needs);
361 
362 		if (swapin(swapin_prp)) {
363 			mutex_exit(&swapin_prp->p_lock);
364 			goto loop;
365 		}
366 		deficit -= MIN(needs, deficit);
367 		mutex_exit(&swapin_prp->p_lock);
368 	} else {
369 		mutex_exit(&swapin_prp->p_lock);
370 		/*
371 		 * If deficit is high, too many processes have been
372 		 * swapped in so wait a sec before attempting to
373 		 * swapin more.
374 		 */
375 		if (freemem > needs) {
376 			TRACE_2(TR_FAC_SCHED, TR_HIGH_DEFICIT,
377 			    "deficit: prp %p needs %lu", swapin_prp, needs);
378 			goto block;
379 		}
380 	}
381 
382 	TRACE_2(TR_FAC_SCHED, TR_UNLOAD,
383 	    "unload: prp %p needs %lu", swapin_prp, needs);
384 
385 unload:
386 	/*
387 	 * Unload all unloadable modules, free all other memory
388 	 * resources we can find, then look for a thread to hardswap.
389 	 */
390 	modreap();
391 	segkp_cache_free();
392 
393 	swapout_prp = NULL;
394 	mutex_enter(&pidlock);
395 	for (prp = practive; prp != NULL; prp = prp->p_next) {
396 
397 		/*
398 		 * No need to soft swap if we have sufficient
399 		 * memory.
400 		 */
401 		if (not_swappable(prp))
402 			continue;
403 
404 		if (avefree > minfree ||
405 		    avefree < minfree && freemem > desfree) {
406 			swapout_prp = NULL;
407 			break;
408 		}
409 
410 		/*
411 		 * Skip processes that are exiting
412 		 * or whose address spaces are locked.
413 		 */
414 		mutex_enter(&prp->p_lock);
415 		if ((prp->p_flag & SEXITING) ||
416 		    (prp->p_as != NULL && AS_ISPGLCK(prp->p_as))) {
417 			mutex_exit(&prp->p_lock);
418 			continue;
419 		}
420 
421 		proc_pri = -1;
422 		t = prp->p_tlist;
423 		do {
424 			if ((t->t_schedflag & (TS_SWAPENQ |
425 			    TS_ON_SWAPQ | TS_LOAD)) != TS_LOAD)
426 				continue;
427 
428 			thread_lock(t);
429 			thread_pri = CL_SWAPOUT(t, HARDSWAP);
430 			thread_unlock(t);
431 			if (thread_pri > proc_pri)
432 				proc_pri = thread_pri;
433 		} while ((t = t->t_forw) != prp->p_tlist);
434 
435 		mutex_exit(&prp->p_lock);
436 		if (proc_pri == -1)
437 			continue;
438 
439 		/*
440 		 * Swapout processes sleeping with a lower priority
441 		 * than the one currently being swapped in, if any.
442 		 */
443 		if (swapin_prp == NULL || swapin_pri > proc_pri) {
444 			TRACE_2(TR_FAC_SCHED, TR_CHOOSE_SWAPOUT,
445 			    "hardswap: prp %p needs %lu", prp, needs);
446 
447 			if (swapout_prp == NULL || proc_pri < chosen_pri) {
448 				swapout_prp = prp;
449 				chosen_pri = proc_pri;
450 			}
451 		}
452 	}
453 
454 	/*
455 	 * Acquire the "p_lock" before dropping "pidlock"
456 	 * to prevent the proc structure from being freed
457 	 * if the process exits before swapout completes.
458 	 */
459 	if (swapout_prp != NULL)
460 		mutex_enter(&swapout_prp->p_lock);
461 	mutex_exit(&pidlock);
462 
463 	if ((prp = swapout_prp) != NULL) {
464 		uint_t swrss = 0;
465 		int swapped;
466 
467 		swapped = swapout(prp, &swrss, HARDSWAP);
468 		if (swapped) {
469 			/*
470 			 * If desperate, we want to give the space obtained
471 			 * by swapping this process out to processes in core,
472 			 * so we give them a chance by increasing deficit.
473 			 */
474 			prp->p_swrss += swrss;
475 			if (desperate)
476 				deficit += MIN(prp->p_swrss, lotsfree);
477 			hardswap++;
478 		}
479 		mutex_exit(&swapout_prp->p_lock);
480 
481 		if (swapped)
482 			goto loop;
483 	}
484 
485 	/*
486 	 * Delay for 1 second and look again later.
487 	 */
488 	TRACE_3(TR_FAC_SCHED, TR_RUNIN,
489 	    "schedrunin:runin nswapped: %d, avefree: %ld freemem: %ld",
490 	    nswapped, avefree, freemem);
491 
492 block:
493 	t = curthread;
494 	thread_lock(t);
495 	runin++;
496 	t->t_schedflag |= (TS_ALLSTART & ~TS_CSTART);
497 	t->t_whystop = PR_SUSPENDED;
498 	t->t_whatstop = SUSPEND_NORMAL;
499 	(void) new_mstate(t, LMS_SLEEP);
500 	mutex_enter(&swap_cpr_lock);
501 	CALLB_CPR_SAFE_BEGIN(&cprinfo);
502 	mutex_exit(&swap_cpr_lock);
503 	thread_stop(t);		/* change to stop state and drop lock */
504 	swtch();
505 	mutex_enter(&swap_cpr_lock);
506 	CALLB_CPR_SAFE_END(&cprinfo, &swap_cpr_lock);
507 	mutex_exit(&swap_cpr_lock);
508 	goto loop;
509 }
510 
511 /*
512  * Remove the specified thread from the swap queue.
513  */
514 static void
515 swapdeq(kthread_id_t tp)
516 {
517 	kthread_id_t *tpp;
518 
519 	ASSERT(THREAD_LOCK_HELD(tp));
520 	ASSERT(tp->t_schedflag & TS_ON_SWAPQ);
521 
522 	tpp = &tswap_queue;
523 	for (;;) {
524 		ASSERT(*tpp != NULL);
525 		if (*tpp == tp)
526 			break;
527 		tpp = &(*tpp)->t_link;
528 	}
529 	*tpp = tp->t_link;
530 	tp->t_schedflag &= ~TS_ON_SWAPQ;
531 }
532 
533 /*
534  * Swap in lwps.  Returns nonzero on success (i.e., if at least one lwp is
535  * swapped in) and 0 on failure.
536  */
537 static int
538 swapin(proc_t *pp)
539 {
540 	kthread_id_t tp;
541 	int err;
542 	int num_swapped_in = 0;
543 	struct cpu *cpup = CPU;
544 	pri_t thread_pri;
545 
546 	ASSERT(MUTEX_HELD(&pp->p_lock));
547 	ASSERT(pp->p_swapcnt);
548 
549 top:
550 	tp = pp->p_tlist;
551 	do {
552 		/*
553 		 * Only swapin eligible lwps (specified by the scheduling
554 		 * class) which are unloaded and ready to run.
555 		 */
556 		thread_lock(tp);
557 		thread_pri = CL_SWAPIN(tp, 0);
558 		if (thread_pri != -1 && tp->t_state == TS_RUN &&
559 		    (tp->t_schedflag & TS_LOAD) == 0) {
560 			size_t stack_size;
561 			pgcnt_t stack_pages;
562 
563 			ASSERT((tp->t_schedflag & TS_ON_SWAPQ) == 0);
564 
565 			thread_unlock(tp);
566 			/*
567 			 * Now drop the p_lock since the stack needs
568 			 * to brought in.
569 			 */
570 			mutex_exit(&pp->p_lock);
571 
572 			stack_size = swapsize(tp->t_swap);
573 			stack_pages = btopr(stack_size);
574 			/* Kernel probe */
575 			TNF_PROBE_4(swapin_lwp, "vm swap swapin", /* CSTYLED */,
576 				tnf_pid,	pid,		pp->p_pid,
577 				tnf_lwpid,	lwpid,		tp->t_tid,
578 				tnf_kthread_id,	tid,		tp,
579 				tnf_ulong,	page_count,	stack_pages);
580 
581 			rw_enter(&kas.a_lock, RW_READER);
582 			err = segkp_fault(segkp->s_as->a_hat, segkp,
583 			    tp->t_swap, stack_size, F_SOFTLOCK, S_OTHER);
584 			rw_exit(&kas.a_lock);
585 
586 #ifdef __sparc
587 			lwp_swapin(tp);
588 #endif /* __sparc */
589 
590 			/*
591 			 * Re-acquire the p_lock.
592 			 */
593 			mutex_enter(&pp->p_lock);
594 			if (err) {
595 				num_swapped_in = 0;
596 				break;
597 			} else {
598 				CPU_STATS_ADDQ(cpup, vm, swapin, 1);
599 				CPU_STATS_ADDQ(cpup, vm, pgswapin,
600 				    stack_pages);
601 
602 				pp->p_swapcnt--;
603 				pp->p_swrss -= stack_pages;
604 
605 				thread_lock(tp);
606 				tp->t_schedflag |= TS_LOAD;
607 				dq_sruninc(tp);
608 
609 				tp->t_stime = lbolt;	/* set swapin time */
610 				thread_unlock(tp);
611 
612 				nswapped--;
613 				tot_swapped_in++;
614 				num_swapped_in++;
615 
616 				TRACE_2(TR_FAC_SCHED, TR_SWAPIN,
617 				    "swapin: pp %p stack_pages %lu",
618 				    pp, stack_pages);
619 				goto top;
620 			}
621 		}
622 		thread_unlock(tp);
623 	} while ((tp = tp->t_forw) != pp->p_tlist);
624 	return (num_swapped_in);
625 }
626 
627 /*
628  * Swap out lwps.  Returns nonzero on success (i.e., if at least one lwp is
629  * swapped out) and 0 on failure.
630  */
631 static int
632 swapout(proc_t *pp, uint_t *swrss, int swapflags)
633 {
634 	kthread_id_t tp;
635 	pgcnt_t ws_pages = 0;
636 	int err;
637 	int swapped_lwps = 0;
638 	struct as *as = pp->p_as;
639 	struct cpu *cpup = CPU;
640 	pri_t thread_pri;
641 
642 	ASSERT(MUTEX_HELD(&pp->p_lock));
643 
644 	if (pp->p_flag & SEXITING)
645 		return (0);
646 
647 top:
648 	tp = pp->p_tlist;
649 	do {
650 		klwp_t *lwp = ttolwp(tp);
651 
652 		/*
653 		 * Swapout eligible lwps (specified by the scheduling
654 		 * class) which don't have TS_DONT_SWAP set.  Set the
655 		 * "intent to swap" flag (TS_SWAPENQ) on threads
656 		 * which have TS_DONT_SWAP set so that they can be
657 		 * swapped if and when they reach a safe point.
658 		 */
659 		thread_lock(tp);
660 		thread_pri = CL_SWAPOUT(tp, swapflags);
661 		if (thread_pri != -1) {
662 			if (tp->t_schedflag & TS_DONT_SWAP) {
663 				tp->t_schedflag |= TS_SWAPENQ;
664 				tp->t_trapret = 1;
665 				aston(tp);
666 			} else {
667 				pgcnt_t stack_pages;
668 				size_t stack_size;
669 
670 				ASSERT((tp->t_schedflag &
671 				    (TS_DONT_SWAP | TS_LOAD)) == TS_LOAD);
672 
673 				if (lock_try(&tp->t_lock)) {
674 					/*
675 					 * Remove thread from the swap_queue.
676 					 */
677 					if (tp->t_schedflag & TS_ON_SWAPQ) {
678 						ASSERT(!(tp->t_schedflag &
679 						    TS_SWAPENQ));
680 						swapdeq(tp);
681 					} else if (tp->t_state == TS_RUN)
682 						dq_srundec(tp);
683 
684 					tp->t_schedflag &=
685 					    ~(TS_LOAD | TS_SWAPENQ);
686 					lock_clear(&tp->t_lock);
687 
688 					/*
689 					 * Set swapout time if the thread isn't
690 					 * sleeping.
691 					 */
692 					if (tp->t_state != TS_SLEEP)
693 						tp->t_stime = lbolt;
694 					thread_unlock(tp);
695 
696 					nswapped++;
697 					tot_swapped_out++;
698 
699 					lwp->lwp_ru.nswap++;
700 
701 					/*
702 					 * Now drop the p_lock since the
703 					 * stack needs to pushed out.
704 					 */
705 					mutex_exit(&pp->p_lock);
706 
707 					stack_size = swapsize(tp->t_swap);
708 					stack_pages = btopr(stack_size);
709 					ws_pages += stack_pages;
710 					/* Kernel probe */
711 					TNF_PROBE_4(swapout_lwp,
712 						"vm swap swapout",
713 						/* CSTYLED */,
714 						tnf_pid, pid, pp->p_pid,
715 						tnf_lwpid, lwpid, tp->t_tid,
716 						tnf_kthread_id, tid, tp,
717 						tnf_ulong, page_count,
718 							stack_pages);
719 
720 					rw_enter(&kas.a_lock, RW_READER);
721 					err = segkp_fault(segkp->s_as->a_hat,
722 					    segkp, tp->t_swap, stack_size,
723 					    F_SOFTUNLOCK, S_WRITE);
724 					rw_exit(&kas.a_lock);
725 
726 					if (err) {
727 						cmn_err(CE_PANIC,
728 						    "swapout: segkp_fault "
729 						    "failed err: %d", err);
730 					}
731 					CPU_STATS_ADDQ(cpup,
732 					    vm, pgswapout, stack_pages);
733 
734 					mutex_enter(&pp->p_lock);
735 					pp->p_swapcnt++;
736 					swapped_lwps++;
737 					goto top;
738 				}
739 			}
740 		}
741 		thread_unlock(tp);
742 	} while ((tp = tp->t_forw) != pp->p_tlist);
743 
744 	/*
745 	 * Unload address space when all lwps are swapped out.
746 	 */
747 	if (pp->p_swapcnt == pp->p_lwpcnt) {
748 		size_t as_size = 0;
749 
750 		/*
751 		 * Avoid invoking as_swapout() if the process has
752 		 * no MMU resources since pageout will eventually
753 		 * steal pages belonging to this address space.  This
754 		 * saves CPU cycles as the number of pages that are
755 		 * potentially freed or pushed out by the segment
756 		 * swapout operation is very small.
757 		 */
758 		if (rm_asrss(pp->p_as) != 0)
759 			as_size = as_swapout(as);
760 
761 		CPU_STATS_ADDQ(cpup, vm, pgswapout, btop(as_size));
762 		CPU_STATS_ADDQ(cpup, vm, swapout, 1);
763 		ws_pages += btop(as_size);
764 
765 		TRACE_2(TR_FAC_SCHED, TR_SWAPOUT,
766 		    "swapout: pp %p pages_pushed %lu", pp, ws_pages);
767 		/* Kernel probe */
768 		TNF_PROBE_2(swapout_process, "vm swap swapout", /* CSTYLED */,
769 			tnf_pid,	pid,		pp->p_pid,
770 			tnf_ulong,	page_count,	ws_pages);
771 	}
772 	*swrss = ws_pages;
773 	return (swapped_lwps);
774 }
775 
776 void
777 swapout_lwp(klwp_t *lwp)
778 {
779 	kthread_id_t tp = curthread;
780 
781 	ASSERT(curthread == lwptot(lwp));
782 
783 	/*
784 	 * Don't insert the thread onto the swap queue if
785 	 * sufficient memory is available.
786 	 */
787 	if (avefree > desfree || avefree < desfree && freemem > desfree) {
788 		thread_lock(tp);
789 		tp->t_schedflag &= ~TS_SWAPENQ;
790 		thread_unlock(tp);
791 		return;
792 	}
793 
794 	/*
795 	 * Lock the thread, then move it to the swapped queue from the
796 	 * onproc queue and set its state to be TS_RUN.
797 	 */
798 	thread_lock(tp);
799 	ASSERT(tp->t_state == TS_ONPROC);
800 	if (tp->t_schedflag & TS_SWAPENQ) {
801 		tp->t_schedflag &= ~TS_SWAPENQ;
802 
803 		/*
804 		 * Set the state of this thread to be runnable
805 		 * and move it from the onproc queue to the swap queue.
806 		 */
807 		disp_swapped_enq(tp);
808 
809 		/*
810 		 * Insert the thread onto the swap queue.
811 		 */
812 		tp->t_link = tswap_queue;
813 		tswap_queue = tp;
814 		tp->t_schedflag |= TS_ON_SWAPQ;
815 
816 		thread_unlock_nopreempt(tp);
817 
818 		TRACE_1(TR_FAC_SCHED, TR_SWAPOUT_LWP, "swapout_lwp:%x", lwp);
819 
820 		swtch();
821 	} else {
822 		thread_unlock(tp);
823 	}
824 }
825 
826 /*
827  * Swap all threads on the swap queue.
828  */
829 static void
830 process_swap_queue(void)
831 {
832 	kthread_id_t tp;
833 	uint_t ws_pages;
834 	proc_t *pp;
835 	struct cpu *cpup = CPU;
836 	klwp_t *lwp;
837 	int err;
838 
839 	if (tswap_queue == NULL)
840 		return;
841 
842 	/*
843 	 * Acquire the "swapped_lock" which locks the swap queue,
844 	 * and unload the stacks of all threads on it.
845 	 */
846 	disp_lock_enter(&swapped_lock);
847 	while ((tp = tswap_queue) != NULL) {
848 		pgcnt_t stack_pages;
849 		size_t stack_size;
850 
851 		tswap_queue = tp->t_link;
852 		tp->t_link = NULL;
853 
854 		/*
855 		 * Drop the "dispatcher lock" before acquiring "t_lock"
856 		 * to avoid spinning on it since the thread at the front
857 		 * of the swap queue could be pinned before giving up
858 		 * its "t_lock" in resume.
859 		 */
860 		disp_lock_exit(&swapped_lock);
861 		lock_set(&tp->t_lock);
862 
863 		/*
864 		 * Now, re-acquire the "swapped_lock".  Acquiring this lock
865 		 * results in locking the thread since its dispatcher lock
866 		 * (t_lockp) is the "swapped_lock".
867 		 */
868 		disp_lock_enter(&swapped_lock);
869 		ASSERT(tp->t_state == TS_RUN);
870 		ASSERT(tp->t_schedflag & (TS_LOAD | TS_ON_SWAPQ));
871 
872 		tp->t_schedflag &= ~(TS_LOAD | TS_ON_SWAPQ);
873 		tp->t_stime = lbolt;		/* swapout time */
874 		disp_lock_exit(&swapped_lock);
875 		lock_clear(&tp->t_lock);
876 
877 		lwp = ttolwp(tp);
878 		lwp->lwp_ru.nswap++;
879 
880 		pp = ttoproc(tp);
881 		stack_size = swapsize(tp->t_swap);
882 		stack_pages = btopr(stack_size);
883 
884 		/* Kernel probe */
885 		TNF_PROBE_4(swapout_lwp, "vm swap swapout", /* CSTYLED */,
886 			tnf_pid,	pid,		pp->p_pid,
887 			tnf_lwpid,	lwpid,		tp->t_tid,
888 			tnf_kthread_id,	tid,		tp,
889 			tnf_ulong,	page_count,	stack_pages);
890 
891 		rw_enter(&kas.a_lock, RW_READER);
892 		err = segkp_fault(segkp->s_as->a_hat, segkp, tp->t_swap,
893 		    stack_size, F_SOFTUNLOCK, S_WRITE);
894 		rw_exit(&kas.a_lock);
895 
896 		if (err) {
897 			cmn_err(CE_PANIC,
898 			"process_swap_list: segkp_fault failed err: %d", err);
899 		}
900 		CPU_STATS_ADDQ(cpup, vm, pgswapout, stack_pages);
901 
902 		nswapped++;
903 		tot_swapped_out++;
904 		swapqswap++;
905 
906 		/*
907 		 * Don't need p_lock since the swapper is the only
908 		 * thread which increments/decrements p_swapcnt and p_swrss.
909 		 */
910 		ws_pages = stack_pages;
911 		pp->p_swapcnt++;
912 
913 		TRACE_1(TR_FAC_SCHED, TR_SWAPQ_LWP, "swaplist: pp %p", pp);
914 
915 		/*
916 		 * Unload address space when all lwps are swapped out.
917 		 */
918 		if (pp->p_swapcnt == pp->p_lwpcnt) {
919 			size_t as_size = 0;
920 
921 			if (rm_asrss(pp->p_as) != 0)
922 				as_size = as_swapout(pp->p_as);
923 
924 			CPU_STATS_ADDQ(cpup, vm, pgswapout,
925 			    btop(as_size));
926 			CPU_STATS_ADDQ(cpup, vm, swapout, 1);
927 
928 			ws_pages += btop(as_size);
929 
930 			TRACE_2(TR_FAC_SCHED, TR_SWAPQ_PROC,
931 			    "swaplist_proc: pp %p pages_pushed: %lu",
932 			    pp, ws_pages);
933 			/* Kernel probe */
934 			TNF_PROBE_2(swapout_process, "vm swap swapout",
935 				/* CSTYLED */,
936 				tnf_pid,	pid,		pp->p_pid,
937 				tnf_ulong,	page_count,	ws_pages);
938 		}
939 		pp->p_swrss += ws_pages;
940 		disp_lock_enter(&swapped_lock);
941 	}
942 	disp_lock_exit(&swapped_lock);
943 }
944