xref: /titanic_50/usr/src/uts/common/os/mem_cage.c (revision 554ff184129088135ad2643c1c9832174a17be88)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/param.h>
31 #include <sys/thread.h>
32 #include <sys/proc.h>
33 #include <sys/callb.h>
34 #include <sys/vnode.h>
35 #include <sys/debug.h>
36 #include <sys/systm.h>		/* for bzero */
37 #include <sys/memlist.h>
38 #include <sys/cmn_err.h>
39 #include <sys/sysmacros.h>
40 #include <sys/vmsystm.h>	/* for NOMEMWAIT() */
41 #include <sys/atomic.h>		/* used to update kcage_freemem */
42 #include <sys/kmem.h>		/* for kmem_reap */
43 #include <sys/errno.h>
44 #include <sys/mem_cage.h>
45 #include <vm/seg_kmem.h>
46 #include <vm/page.h>
47 #include <vm/hat.h>
48 #include <sys/mem_config.h>
49 #include <sys/lgrp.h>
50 
51 extern pri_t maxclsyspri;
52 
/*
 * Cage statistics are only compiled in for DEBUG kernels.
 */
#ifdef DEBUG
#define	KCAGE_STATS
#endif
56 
57 #ifdef KCAGE_STATS
58 
/*
 * KCAGE_STATS_VERSION is stored in kcage_stats.version and
 * KCAGE_STATS_NSCANS sizes kcage_stats.scans[], which is used as a ring:
 * scan_index wraps modulo KCAGE_STATS_NSCANS.
 */
#define	KCAGE_STATS_VERSION 9	/* can help report generators */
#define	KCAGE_STATS_NSCANS 256	/* depth of scan statistics buffer */
61 
/*
 * One record of per-scan statistics.  kcage_stats.scans[] holds
 * KCAGE_STATS_NSCANS of these; the KCAGE_STAT_*_SCAN macros update the
 * record selected by kcage_stats.scan_index.
 */
struct kcage_stats_scan {
	/* managed by KCAGE_STAT_* macros */
	clock_t	scan_lbolt;
	uint_t	scan_id;

	/* set in kcage_cageout() */
	uint_t	kt_passes;
	clock_t	kt_ticks;
	pgcnt_t	kt_kcage_freemem_start;
	pgcnt_t	kt_kcage_freemem_end;
	pgcnt_t kt_freemem_start;
	pgcnt_t kt_freemem_end;
	uint_t	kt_examined;
	uint_t	kt_cantlock;
	uint_t	kt_gotone;
	uint_t	kt_gotonefree;
	uint_t	kt_skiplevel;
	uint_t	kt_skipshared;
	uint_t	kt_skiprefd;
	uint_t	kt_destroy;

	/* set in kcage_invalidate_page() */
	uint_t	kip_reloclocked;
	uint_t	kip_relocmod;
	uint_t	kip_destroy;
	uint_t	kip_nomem;
	uint_t	kip_demotefailed;

	/* set in kcage_expand() */
	uint_t	ke_wanted;
	uint_t	ke_examined;
	uint_t	ke_lefthole;
	uint_t	ke_gotone;
	uint_t	ke_gotonefree;
};
97 
/*
 * Cage-wide statistics.  The leading version/size fields and the trailing
 * scan_array_size/scan_index fields are filled in by
 * KCAGE_STAT_INIT_SCAN_INDEX; the remaining counters are bumped through
 * the KCAGE_STAT_* macros by the functions named in the comments below.
 */
struct kcage_stats {
	/* managed by KCAGE_STAT_* macros */
	uint_t	version;
	uint_t	size;

	/* set in kcage_cageout */
	uint_t	kt_wakeups;
	uint_t	kt_scans;
	uint_t	kt_cageout_break;

	/* set in kcage_expand */
	uint_t	ke_calls;
	uint_t	ke_nopfn;
	uint_t	ke_nopaget;
	uint_t	ke_isnoreloc;
	uint_t	ke_deleting;
	uint_t	ke_lowfreemem;
	uint_t	ke_terminate;

	/* set in kcage_freemem_add() */
	uint_t	kfa_trottlewake;

	/* set in kcage_freemem_sub() */
	uint_t	kfs_cagewake;

	/* set in kcage_create_throttle */
	uint_t	kct_calls;
	uint_t	kct_cageout;
	uint_t	kct_critical;
	uint_t	kct_exempt;
	uint_t	kct_cagewake;
	uint_t	kct_wait;
	uint_t	kct_progress;
	uint_t	kct_noprogress;
	uint_t	kct_timeout;

	/* set in kcage_cageout_wakeup */
	uint_t	kcw_expandearly;

	/* managed by KCAGE_STAT_* macros */
	uint_t	scan_array_size;
	uint_t	scan_index;
	struct kcage_stats_scan scans[KCAGE_STATS_NSCANS];
};
142 
/* The statistics instance updated by the KCAGE_STAT_* macros. */
static struct kcage_stats kcage_stats;
/*
 * Template record that KCAGE_STAT_INC_SCAN_INDEX assigns to a scans[]
 * slot to reset it; it has static storage and no initializer.
 */
static struct kcage_stats_scan kcage_stats_scan_zero;
145 
/*
 * No real need for atomics here. For the most part the incs and sets are
 * done by the kernel cage thread. There are a few that are done by any
 * number of other threads. Those cases are noted by comments.
 *
 * Macros that expand to an if statement or to several statements are
 * wrapped in do { } while (0) so that each use is exactly one statement.
 * Without the wrapper, using one as the unbraced body of an if/else
 * would either capture the caller's else (KCAGE_STAT_SETZ) or make only
 * the first statement conditional (the *_SCAN_INDEX macros).  Every use
 * must still be followed by a semicolon, as before.
 */
#define	KCAGE_STAT_INCR(m)	kcage_stats.m++

#define	KCAGE_STAT_NINCR(m, v) kcage_stats.m += (v)

#define	KCAGE_STAT_INCR_SCAN(m)	\
	KCAGE_STAT_INCR(scans[kcage_stats.scan_index].m)

#define	KCAGE_STAT_NINCR_SCAN(m, v) \
	KCAGE_STAT_NINCR(scans[kcage_stats.scan_index].m, v)

#define	KCAGE_STAT_SET(m, v)	kcage_stats.m = (v)

/* Set member m to v only if it is still zero. */
#define	KCAGE_STAT_SETZ(m, v)	\
	do { \
		if (kcage_stats.m == 0) \
			kcage_stats.m = (v); \
	} while (0)

#define	KCAGE_STAT_SET_SCAN(m, v)	\
	KCAGE_STAT_SET(scans[kcage_stats.scan_index].m, v)

#define	KCAGE_STAT_SETZ_SCAN(m, v)	\
	KCAGE_STAT_SETZ(scans[kcage_stats.scan_index].m, v)

/*
 * Close out the current scan record (stamp it with lbolt and its index),
 * advance scan_index around the ring and clear the record it now selects.
 */
#define	KCAGE_STAT_INC_SCAN_INDEX \
	do { \
		KCAGE_STAT_SET_SCAN(scan_lbolt, lbolt); \
		KCAGE_STAT_SET_SCAN(scan_id, kcage_stats.scan_index); \
		kcage_stats.scan_index = \
		    (kcage_stats.scan_index + 1) % KCAGE_STATS_NSCANS; \
		kcage_stats.scans[kcage_stats.scan_index] = \
		    kcage_stats_scan_zero; \
	} while (0)

/* Fill in the self-describing header fields and start at slot 0. */
#define	KCAGE_STAT_INIT_SCAN_INDEX \
	do { \
		kcage_stats.version = KCAGE_STATS_VERSION; \
		kcage_stats.size = sizeof (kcage_stats); \
		kcage_stats.scan_array_size = KCAGE_STATS_NSCANS; \
		kcage_stats.scan_index = 0; \
	} while (0)
184 
185 #else /* KCAGE_STATS */
186 
/*
 * KCAGE_STATS is not defined: every statistics macro expands to nothing,
 * so a use followed by a semicolon becomes an empty statement.
 */
#define	KCAGE_STAT_INCR(v)
#define	KCAGE_STAT_NINCR(m, v)
#define	KCAGE_STAT_INCR_SCAN(v)
#define	KCAGE_STAT_NINCR_SCAN(m, v)
#define	KCAGE_STAT_SET(m, v)
#define	KCAGE_STAT_SETZ(m, v)
#define	KCAGE_STAT_SET_SCAN(m, v)
#define	KCAGE_STAT_SETZ_SCAN(m, v)
#define	KCAGE_STAT_INC_SCAN_INDEX
#define	KCAGE_STAT_INIT_SCAN_INDEX
197 
198 #endif /* KCAGE_STATS */
199 
/* Throttled callers of kcage_create_throttle() sleep on this cv. */
static kmutex_t kcage_throttle_mutex;	/* protects kcage_throttle_cv */
static kcondvar_t kcage_throttle_cv;

static kmutex_t kcage_cageout_mutex;	/* protects cv and ready flag */
static kcondvar_t kcage_cageout_cv;	/* cageout thread naps here */
static int kcage_cageout_ready;		/* nonzero when cageout thread ready */
kthread_id_t kcage_cageout_thread;	/* to aid debugging */

/* Taken and released through kcage_range_lock()/kcage_range_unlock(). */
static kmutex_t kcage_range_mutex;	/* protects kcage_glist elements */
209 
/*
 * Cage expansion happens within a range.
 *
 * Each element of the growth list describes the pfn range [base, lim)
 * and a cursor, curr, marking how far the cage has grown into it.
 * When decr is nonzero curr starts at lim and kcage_get_pfn() moves it
 * down toward base; otherwise curr starts at base and moves up toward
 * lim.
 */
struct kcage_glist {
	struct kcage_glist	*next;
	pfn_t			base;
	pfn_t			lim;
	pfn_t			curr;
	int			decr;
};

/* Head of the growth list. */
static struct kcage_glist *kcage_glist;
/* Element that kcage_get_pfn() is currently handing pfns out of. */
static struct kcage_glist *kcage_current_glist;

/*
 * The firstfree element is provided so that kmem_alloc can be avoided
 * until that cage has somewhere to go. This is not currently a problem
 * as early kmem_alloc's use BOP_ALLOC instead of page_create_va.
 *
 * kcage_glist_alloc() takes elements from this free list before falling
 * back to kmem_zalloc(); kcage_glist_free() pushes elements back onto it.
 */
static struct kcage_glist kcage_glist_firstfree;
static struct kcage_glist *kcage_glist_freelist = &kcage_glist_firstfree;
231 
/*
 * Miscellaneous forward references: file-local functions that are used
 * before their definitions appear.
 */
static struct kcage_glist *kcage_glist_alloc(void);
static int kcage_glist_delete(pfn_t, pfn_t, struct kcage_glist **);
static void kcage_cageout(void);
static int kcage_invalidate_page(page_t *, pgcnt_t *);
static int kcage_setnoreloc_pages(page_t *, se_t);
240 
/*
 * Kernel Memory Cage counters and thresholds.
 *
 * kcage_on is set to 1 at the end of kcage_init().
 * kcage_freemem is adjusted atomically by kcage_freemem_add() and
 * kcage_freemem_sub().
 * kcage_needfree is raised by kcage_create_throttle() for the pages its
 * callers are waiting for and lowered again afterwards.
 * kcage_lotsfree, kcage_desfree, kcage_minfree and kcage_throttlefree
 * are given values by kcage_recalc_thresholds() whenever they are zero.
 */
int kcage_on = 0;
pgcnt_t kcage_freemem;
pgcnt_t kcage_needfree;
pgcnt_t kcage_lotsfree;
pgcnt_t kcage_desfree;
pgcnt_t kcage_minfree;
pgcnt_t kcage_throttlefree;
/*
 * NOTE(review): kcage_create_throttle() compares kcage_maxwait with a
 * count of consecutive passes that made no progress, and each pass
 * blocks in an untimed cv_wait(); confirm that "seconds" is still the
 * right unit.
 */
int kcage_maxwait = 10;	/* in seconds */

/* when we use lp for kmem we start the cage at a higher initial value */
pgcnt_t kcage_kmemlp_mincage;

/* DEBUG-only count of pfns that kcage_init() resolved to a page_t. */
#ifdef DEBUG
pgcnt_t	kcage_pagets;
#define	KCAGEPAGETS_INC()	kcage_pagets++
#else
#define	KCAGEPAGETS_INC()
#endif
262 
/*
 * Startup and Dynamic Reconfiguration interfaces.
 * kcage_range_trylock()
 * kcage_range_lock()
 * kcage_range_unlock()
 * kcage_range_islocked()
 * kcage_current_pfn()
 * kcage_range_init()
 * kcage_range_add()
 * kcage_range_delete()
 * kcage_range_delete_post_mem_del()
 * kcage_init()
 * kcage_recalc_thresholds()
 */
273 
274 int
275 kcage_range_trylock(void)
276 {
277 	return (mutex_tryenter(&kcage_range_mutex));
278 }
279 
/*
 * Take the cage range lock (kcage_range_mutex) with mutex_enter().
 * The range add/delete routines and kcage_get_pfn() assert that it is
 * held.
 */
void
kcage_range_lock(void)
{
	mutex_enter(&kcage_range_mutex);
}
285 
/*
 * Release the cage range lock (kcage_range_mutex) with mutex_exit().
 */
void
kcage_range_unlock(void)
{
	mutex_exit(&kcage_range_mutex);
}
291 
292 int
293 kcage_range_islocked(void)
294 {
295 	return (MUTEX_HELD(&kcage_range_mutex));
296 }
297 
298 /*
299  * Called from page_get_contig_pages to get the approximate kcage pfn range
300  * for exclusion from search for contiguous pages. This routine is called
301  * without kcage_range lock (kcage routines can call page_get_contig_pages
302  * through page_relocate) and with the assumption, based on kcage_range_add,
303  * that kcage_current_glist always contain a valid pointer.
304  */
305 
306 int
307 kcage_current_pfn(pfn_t *pfncur)
308 {
309 	struct kcage_glist *lp = kcage_current_glist;
310 
311 	ASSERT(kcage_on);
312 
313 	ASSERT(lp != NULL);
314 
315 	*pfncur = lp->curr;
316 
317 	return (lp->decr);
318 }
319 
320 int
321 kcage_range_init(struct memlist *ml, int decr)
322 {
323 	int ret = 0;
324 
325 	ASSERT(kcage_range_islocked());
326 
327 	if (decr) {
328 		while (ml->next != NULL)
329 			ml = ml->next;
330 	}
331 
332 	while (ml != NULL) {
333 		ret = kcage_range_add(btop(ml->address), btop(ml->size), decr);
334 		if (ret)
335 			break;
336 
337 		ml = (decr ? ml->prev : ml->next);
338 	}
339 
340 	return (ret);
341 }
342 
343 /*
344  * Third arg controls direction of growth: 0: increasing pfns,
345  * 1: decreasing.
346  * Calls to add and delete must be protected by calls to
347  * kcage_range_lock() and kcage_range_unlock().
348  */
349 int
350 kcage_range_add(pfn_t base, pgcnt_t npgs, int decr)
351 {
352 	struct kcage_glist *new, **lpp;
353 	pfn_t lim;
354 
355 	ASSERT(kcage_range_islocked());
356 
357 	ASSERT(npgs != 0);
358 	if (npgs == 0)
359 		return (EINVAL);
360 
361 	lim = base + npgs;
362 
363 	ASSERT(lim > base);
364 	if (lim <= base)
365 		return (EINVAL);
366 
367 	new = kcage_glist_alloc();
368 	if (new == NULL) {
369 		return (ENOMEM);
370 	}
371 
372 	new->base = base;
373 	new->lim = lim;
374 	new->decr = decr;
375 	if (new->decr != 0)
376 		new->curr = new->lim;
377 	else
378 		new->curr = new->base;
379 	/*
380 	 * Any overlapping existing ranges are removed by deleting
381 	 * from the new list as we search for the tail.
382 	 */
383 	lpp = &kcage_glist;
384 	while (*lpp != NULL) {
385 		int ret;
386 		ret = kcage_glist_delete((*lpp)->base, (*lpp)->lim, &new);
387 		if (ret != 0)
388 			return (ret);
389 		lpp = &(*lpp)->next;
390 	}
391 
392 	*lpp = new;
393 
394 	if (kcage_current_glist == NULL) {
395 		kcage_current_glist = kcage_glist;
396 	}
397 
398 	return (0);
399 }
400 
401 /*
402  * Calls to add and delete must be protected by calls to
403  * kcage_range_lock() and kcage_range_unlock().
404  */
405 int
406 kcage_range_delete(pfn_t base, pgcnt_t npgs)
407 {
408 	struct kcage_glist *lp;
409 	pfn_t lim;
410 
411 	ASSERT(kcage_range_islocked());
412 
413 	ASSERT(npgs != 0);
414 	if (npgs == 0)
415 		return (EINVAL);
416 
417 	lim = base + npgs;
418 
419 	ASSERT(lim > base);
420 	if (lim <= base)
421 		return (EINVAL);
422 
423 	/*
424 	 * Check if the delete is OK first as a number of elements
425 	 * might be involved and it will be difficult to go
426 	 * back and undo (can't just add the range back in).
427 	 */
428 	for (lp = kcage_glist; lp != NULL; lp = lp->next) {
429 		/*
430 		 * If there have been no pages allocated from this
431 		 * element, we don't need to check it.
432 		 */
433 		if ((lp->decr == 0 && lp->curr == lp->base) ||
434 		    (lp->decr != 0 && lp->curr == lp->lim))
435 			continue;
436 		/*
437 		 * If the element does not overlap, its OK.
438 		 */
439 		if (base >= lp->lim || lim <= lp->base)
440 			continue;
441 		/*
442 		 * Overlapping element: Does the range to be deleted
443 		 * overlap the area already used? If so fail.
444 		 */
445 		if (lp->decr == 0 && base < lp->curr && lim >= lp->base) {
446 			return (EBUSY);
447 		}
448 		if (lp->decr != 0 && base < lp->lim && lim >= lp->curr) {
449 			return (EBUSY);
450 		}
451 	}
452 	return (kcage_glist_delete(base, lim, &kcage_glist));
453 }
454 
455 /*
456  * Calls to add and delete must be protected by calls to
457  * kcage_range_lock() and kcage_range_unlock().
458  * This routine gets called after successful Solaris memory
459  * delete operation from DR post memory delete routines.
460  */
461 int
462 kcage_range_delete_post_mem_del(pfn_t base, pgcnt_t npgs)
463 {
464 	pfn_t lim;
465 
466 	ASSERT(kcage_range_islocked());
467 
468 	ASSERT(npgs != 0);
469 	if (npgs == 0)
470 		return (EINVAL);
471 
472 	lim = base + npgs;
473 
474 	ASSERT(lim > base);
475 	if (lim <= base)
476 		return (EINVAL);
477 
478 	return (kcage_glist_delete(base, lim, &kcage_glist));
479 }
480 
481 /*
482  * No locking is required here as the whole operation is covered
483  * by the kcage_range_lock().
484  */
485 static struct kcage_glist *
486 kcage_glist_alloc(void)
487 {
488 	struct kcage_glist *new;
489 
490 	if ((new = kcage_glist_freelist) != NULL) {
491 		kcage_glist_freelist = new->next;
492 		bzero(new, sizeof (*new));
493 	} else {
494 		new = kmem_zalloc(sizeof (struct kcage_glist), KM_NOSLEEP);
495 	}
496 	return (new);
497 }
498 
/*
 * Push lp onto the head of the element free list so that a later
 * kcage_glist_alloc() can reuse it.  The element is not handed back to
 * kmem.
 */
static void
kcage_glist_free(struct kcage_glist *lp)
{
	lp->next = kcage_glist_freelist;
	kcage_glist_freelist = lp;
}
505 
/*
 * Remove the pfn range [base, lim) from the list headed at *lpp.
 *
 * Elements completely covered by the range are unlinked and passed to
 * kcage_glist_free().  Elements overlapped at one end only are trimmed
 * in place.  A range that lies strictly inside an element splits it in
 * two, which needs a new element from kcage_glist_alloc().
 *
 * Returns 0 on success, or ENOMEM if that allocation fails; elements
 * handled before the failure stay modified.
 */
static int
kcage_glist_delete(pfn_t base, pfn_t lim, struct kcage_glist **lpp)
{
	/* prev starts out as the head element itself. */
	struct kcage_glist *lp, *prev = *lpp;

	while ((lp = *lpp) != NULL) {
		if (lim > lp->base && base < lp->lim) {
			/* The delete range overlaps this element. */
			if (base <= lp->base && lim >= lp->lim) {
				/* Delete whole element. */
				*lpp = lp->next;
				if (lp == kcage_current_glist) {
					/* This can never happen. */
					ASSERT(kcage_current_glist != prev);
					kcage_current_glist = prev;
				}
				kcage_glist_free(lp);
				/* *lpp already names the next element. */
				continue;
			}

			/* Partial delete. */
			if (base > lp->base && lim < lp->lim) {
				struct kcage_glist *new;

				/*
				 * Remove a section from the middle,
				 * need to allocate a new element.
				 */
				new = kcage_glist_alloc();
				if (new == NULL) {
					return (ENOMEM);
				}

				/*
				 * Transfer unused range to new.
				 * Edit lp in place to preserve
				 * kcage_current_glist.
				 */
				new->decr = lp->decr;
				if (new->decr != 0) {
					/* new takes the low piece, untouched. */
					new->base = lp->base;
					new->lim = base;
					new->curr = base;

					lp->base = lim;
				} else {
					/* new takes the high piece, untouched. */
					new->base = lim;
					new->lim = lp->lim;
					new->curr = new->base;

					lp->lim = base;
				}

				/* Insert new. */
				new->next = lp->next;
				lp->next = new;
				/* Step onto new so the advance below skips it. */
				lpp = &lp->next;
			} else {
				/* Delete part of current block. */
				if (base > lp->base) {
					/* Trim the high end. */
					ASSERT(lim >= lp->lim);
					ASSERT(base < lp->lim);
					if (lp->decr != 0 &&
					    lp->curr == lp->lim)
						lp->curr = base;
					lp->lim = base;
				} else {
					/* Trim the low end. */
					ASSERT(base <= lp->base);
					ASSERT(lim > lp->base);
					if (lp->decr == 0 &&
					    lp->curr == lp->base)
						lp->curr = lim;
					lp->base = lim;
				}
			}
		}
		prev = *lpp;
		lpp = &(*lpp)->next;
	}

	return (0);
}
588 
589 /*
590  * The caller of kcage_get_pfn must hold the kcage_range_lock to make
591  * sure that there are no concurrent calls. The same lock
592  * must be obtained for range add and delete by calling
593  * kcage_range_lock() and kcage_range_unlock().
594  */
595 static pfn_t
596 kcage_get_pfn(void)
597 {
598 	struct kcage_glist *lp;
599 	pfn_t pfn;
600 
601 	ASSERT(kcage_range_islocked());
602 
603 	lp = kcage_current_glist;
604 	while (lp != NULL) {
605 		if (lp->decr != 0) {
606 			if (lp->curr != lp->base) {
607 				pfn = --lp->curr;
608 				return (pfn);
609 			}
610 		} else {
611 			if (lp->curr != lp->lim) {
612 				pfn = lp->curr++;
613 				return (pfn);
614 			}
615 		}
616 
617 		lp = lp->next;
618 		if (lp)
619 			kcage_current_glist = lp;
620 	}
621 
622 	return (PFN_INVALID);
623 }
624 
/*
 * Walk the physical address space of the cage.
 * This routine does not guarantee to return PFNs in the order
 * in which they were allocated to the cage. Instead, it walks
 * each range as they appear on the growth list returning the PFNs
 * range in ascending order.
 *
 * To begin scanning at lower edge of cage, reset should be nonzero.
 * To step through cage, reset should be zero.
 *
 * PFN_INVALID will be returned when the upper end of the cage is
 * reached -- indicating a full scan of the cage has been completed since
 * previous reset. PFN_INVALID will continue to be returned until
 * kcage_walk_cage is reset.
 *
 * It is possible to receive a PFN_INVALID result on reset if a growth
 * list is not installed or if none of the PFNs in the installed list have
 * been allocated to the cage. In other words, there is no cage.
 *
 * Caller need not hold kcage_range_lock while calling this function
 * as the front part of the list is static - pages never come out of
 * the cage.
 *
 * The caller is expected to only be kcage_cageout().
 *
 * The walk position (current element and next pfn) is kept in function
 * static variables, so there is a single walk shared by all callers.
 */
static pfn_t
kcage_walk_cage(int reset)
{
	static struct kcage_glist *lp = NULL;
	static pfn_t pfn;

	if (reset)
		lp = NULL;
	if (lp == NULL) {
		/* (Re)start from the head of the growth list. */
		lp = kcage_glist;
		pfn = PFN_INVALID;
	}
again:
	if (pfn == PFN_INVALID) {
		/* Entering a new element, or there is none left. */
		if (lp == NULL)
			return (PFN_INVALID);

		if (lp->decr != 0) {
			/*
			 * In this range the cage grows from the highest
			 * address towards the lowest.
			 * Arrange to return pfns from curr to lim-1,
			 * inclusive, in ascending order.
			 */

			pfn = lp->curr;
		} else {
			/*
			 * In this range the cage grows from the lowest
			 * address towards the highest.
			 * Arrange to return pfns from base to curr-1,
			 * inclusive, in ascending order.
			 */

			pfn = lp->base;
		}
	}

	if (lp->decr != 0) {		/* decrementing pfn */
		if (pfn == lp->lim) {
			/* Don't go beyond the static part of the glist. */
			if (lp == kcage_current_glist)
				lp = NULL;
			else
				lp = lp->next;
			pfn = PFN_INVALID;
			goto again;
		}

		ASSERT(pfn >= lp->curr && pfn < lp->lim);
	} else {			/* incrementing pfn */
		if (pfn == lp->curr) {
			/* Don't go beyond the static part of the glist. */
			if (lp == kcage_current_glist)
				lp = NULL;
			else
				lp = lp->next;
			pfn = PFN_INVALID;
			goto again;
		}

		ASSERT(pfn >= lp->base && pfn < lp->curr);
	}

	/* Return this pfn and remember the one after it for the next call. */
	return (pfn++);
}
716 
/*
 * Callback functions to recalculate the cage thresholds after
 * Kphysm memory add/delete operations.  They are installed through
 * kcage_kphysm_vectors.
 */

/*
 * Post-add callback: recompute the cage thresholds.  Neither argument
 * is used.
 */
/*ARGSUSED*/
static void
kcage_kphysm_postadd_cb(void *arg, pgcnt_t delta_pages)
{
	kcage_recalc_thresholds();
}
727 
/*
 * Pre-delete callback: currently always returns 0 and ignores both
 * arguments.
 */
/*ARGSUSED*/
static int
kcage_kphysm_predel_cb(void *arg, pgcnt_t delta_pages)
{
	/* TODO: when should cage refuse memory delete requests? */
	return (0);
}
735 
/*
 * Post-delete callback: recompute the cage thresholds.  This is done
 * regardless of the cancelled flag; none of the arguments is used.
 */
/*ARGSUSED*/
static  void
kcage_kphysm_postdel_cb(void *arg, pgcnt_t delta_pages, int cancelled)
{
	kcage_recalc_thresholds();
}
742 
/*
 * Callback vector that kcage_init() registers with
 * kphysm_setup_func_register().
 */
static kphysm_setup_vector_t kcage_kphysm_vectors = {
	KPHYSM_SETUP_VECTOR_VERSION,
	kcage_kphysm_postadd_cb,
	kcage_kphysm_predel_cb,
	kcage_kphysm_postdel_cb
};
749 
750 /*
751  * This is called before a CPR suspend and after a CPR resume.  We have to
752  * turn off kcage_cageout_ready before a suspend, and turn it back on after a
753  * restart.
754  */
755 /*ARGSUSED*/
756 static boolean_t
757 kcage_cageout_cpr(void *arg, int code)
758 {
759 	if (code == CB_CODE_CPR_CHKPT) {
760 		ASSERT(kcage_cageout_ready);
761 		kcage_cageout_ready = 0;
762 		return (B_TRUE);
763 	} else if (code == CB_CODE_CPR_RESUME) {
764 		ASSERT(kcage_cageout_ready == 0);
765 		kcage_cageout_ready = 1;
766 		return (B_TRUE);
767 	}
768 	return (B_FALSE);
769 }
770 
771 /*
772  * kcage_recalc_preferred_size() increases initial cage size to improve large
773  * page availability when lp for kmem is enabled and kpr is disabled
774  */
775 static pgcnt_t
776 kcage_recalc_preferred_size(pgcnt_t preferred_size)
777 {
778 	if (SEGKMEM_USE_LARGEPAGES && segkmem_reloc == 0) {
779 		pgcnt_t lpmincage = kcage_kmemlp_mincage;
780 		if (lpmincage == 0) {
781 			lpmincage = MIN(P2ROUNDUP(((physmem * PAGESIZE) / 8),
782 			    segkmem_heaplp_quantum), 0x40000000UL) / PAGESIZE;
783 		}
784 		kcage_kmemlp_mincage = MIN(lpmincage,
785 			    (segkmem_kmemlp_max / PAGESIZE));
786 		preferred_size = MAX(kcage_kmemlp_mincage, preferred_size);
787 	}
788 	return (preferred_size);
789 }
790 
/*
 * Kcage_init() builds the cage and initializes the cage thresholds.
 * The size of the cage is determined by the argument preferred_size
 * (raised to at least kcage_minfree) or availrmem, whichever is smaller.
 *
 * The caller must hold the kcage_range lock and the cage must not be
 * enabled yet; both are asserted.  On return kcage_on is set, unless
 * registering the kphysm callbacks failed, in which case the function
 * returns early and the cage stays off.
 */
void
kcage_init(pgcnt_t preferred_size)
{
	pgcnt_t wanted;
	pfn_t pfn;
	page_t *pp;
	extern struct vnode kvp;
	extern void page_list_noreloc_startup(page_t *);

	ASSERT(!kcage_on);
	ASSERT(kcage_range_islocked());

	/* increase preferred cage size for lp for kmem */
	preferred_size = kcage_recalc_preferred_size(preferred_size);

	/* Debug note: initialize this now so early expansions can stat */
	KCAGE_STAT_INIT_SCAN_INDEX;

	/*
	 * Initialize cage thresholds and install kphysm callback.
	 * If we can't arrange to have the thresholds track with
	 * available physical memory, then the cage thresholds may
	 * end up over time at levels that adversely affect system
	 * performance; so, bail out.
	 */
	kcage_recalc_thresholds();
	if (kphysm_setup_func_register(&kcage_kphysm_vectors, NULL)) {
		ASSERT(0);		/* Catch this in DEBUG kernels. */
		return;
	}

	/*
	 * Limit startup cage size within the range of kcage_minfree
	 * and availrmem, inclusively.
	 */
	wanted = MIN(MAX(preferred_size, kcage_minfree), availrmem);

	/*
	 * Construct the cage. PFNs are allocated from the glist. It
	 * is assumed that the list has been properly ordered for the
	 * platform by the platform code. Typically, this is as simple
	 * as calling kcage_range_init(phys_avail, decr), where decr is
	 * 1 if the kernel has been loaded into upper end of physical
	 * memory, or 0 if the kernel has been loaded at the low end.
	 *
	 * Note: it is assumed that we are in the startup flow, so there
	 * is no reason to grab the page lock.
	 */
	kcage_freemem = 0;
	pfn = PFN_INVALID;			/* prime for alignment test */
	while (wanted != 0) {
		/* Stop early if the growth list runs out of pfns. */
		if ((pfn = kcage_get_pfn()) == PFN_INVALID)
			break;

		/* A pfn with no page_t still counts against wanted. */
		if ((pp = page_numtopp_nolock(pfn)) != NULL) {
			KCAGEPAGETS_INC();
			/*
			 * Set the noreloc state on the page.
			 * If the page is free and not already
			 * on the noreloc list then move it.
			 */
			if (PP_ISFREE(pp)) {
				if (PP_ISNORELOC(pp) == 0)
					page_list_noreloc_startup(pp);
			} else {
				ASSERT(pp->p_szc == 0);
				PP_SETNORELOC(pp);
			}
		}

		wanted -= 1;
	}

	/*
	 * Need to go through and find kernel allocated pages
	 * and capture them into the Cage.  These will primarily
	 * be pages gotten through boot_alloc().
	 */
	if (kvp.v_pages) {

		/* v_pages is walked as a circular list via p_vpnext. */
		pp = kvp.v_pages;
		do {
			ASSERT(!PP_ISFREE(pp));
			ASSERT(pp->p_szc == 0);
			PP_SETNORELOC(pp);
		} while ((pp = pp->p_vpnext) != kvp.v_pages);

	}

	kcage_on = 1;

	/*
	 * CB_CL_CPR_POST_KERNEL is the class that executes from cpr_suspend()
	 * after the cageout thread is blocked, and executes from cpr_resume()
	 * before the cageout thread is restarted.  By executing in this class,
	 * we are assured that the kernel cage thread won't miss wakeup calls
	 * and also CPR's larger kmem_alloc requests will not fail after
	 * CPR shuts down the cageout kernel thread.
	 */
	(void) callb_add(kcage_cageout_cpr, NULL, CB_CL_CPR_POST_KERNEL,
	    "cageout");

	/*
	 * Coalesce pages to improve large page availability. A better fix
	 * would be to coalesce pages as they are included in the cage.
	 */
	if (SEGKMEM_USE_LARGEPAGES) {
		extern void page_freelist_coalesce_all(int mnode);
		extern int max_mem_nodes;
		int mnode, max_mnodes = max_mem_nodes;
		for (mnode = 0; mnode < max_mnodes; mnode++) {
			page_freelist_coalesce_all(mnode);
		}
	}
}
911 
912 void
913 kcage_recalc_thresholds()
914 {
915 	static int first = 1;
916 	static pgcnt_t init_lotsfree;
917 	static pgcnt_t init_desfree;
918 	static pgcnt_t init_minfree;
919 	static pgcnt_t init_throttlefree;
920 
921 	/* TODO: any reason to take more care than this with live editing? */
922 	mutex_enter(&kcage_cageout_mutex);
923 	mutex_enter(&freemem_lock);
924 
925 	if (first) {
926 		first = 0;
927 		init_lotsfree = kcage_lotsfree;
928 		init_desfree = kcage_desfree;
929 		init_minfree = kcage_minfree;
930 		init_throttlefree = kcage_throttlefree;
931 	} else {
932 		kcage_lotsfree = init_lotsfree;
933 		kcage_desfree = init_desfree;
934 		kcage_minfree = init_minfree;
935 		kcage_throttlefree = init_throttlefree;
936 	}
937 
938 	if (kcage_lotsfree == 0)
939 		kcage_lotsfree = MAX(32, total_pages / 256);
940 
941 	if (kcage_minfree == 0)
942 		kcage_minfree = MAX(32, kcage_lotsfree / 2);
943 
944 	if (kcage_desfree == 0)
945 		kcage_desfree = MAX(32, kcage_minfree);
946 
947 	if (kcage_throttlefree == 0)
948 		kcage_throttlefree = MAX(32, kcage_minfree / 2);
949 
950 	mutex_exit(&freemem_lock);
951 	mutex_exit(&kcage_cageout_mutex);
952 
953 	if (kcage_cageout_ready) {
954 		if (kcage_freemem < kcage_desfree)
955 			kcage_cageout_wakeup();
956 
957 		if (kcage_needfree) {
958 			mutex_enter(&kcage_throttle_mutex);
959 			cv_broadcast(&kcage_throttle_cv);
960 			mutex_exit(&kcage_throttle_mutex);
961 		}
962 	}
963 }
964 
965 /*
966  * Pageout interface:
967  * kcage_cageout_init()
968  */
969 void
970 kcage_cageout_init()
971 {
972 	if (kcage_on) {
973 		mutex_enter(&kcage_cageout_mutex);
974 
975 		kcage_cageout_thread = thread_create(NULL, 0, kcage_cageout,
976 		    NULL, 0, proc_pageout, TS_RUN, maxclsyspri - 1);
977 
978 		mutex_exit(&kcage_cageout_mutex);
979 	}
980 }
981 
982 
983 /*
984  * VM Interfaces:
985  * kcage_create_throttle()
986  * kcage_freemem_add()
987  * kcage_freemem_sub()
988  */
989 
/*
 * Wakeup cageout thread and throttle waiting for the number of pages
 * requested to become available.  For non-critical requests, a
 * timeout is added, since freemem accounting is separate from cage
 * freemem accounting: it's possible for us to get stuck and not make
 * forward progress even though there was sufficient freemem before
 * arriving here.
 *
 * Returns:
 *   KCT_CRIT     the caller is not throttled: it is the cageout thread,
 *                the system is panicking, it is a NOMEMWAIT() thread
 *                that qualifies for exemption, or its dispatch priority
 *                is above maxclsyspri.
 *   KCT_NONCRIT  the caller was subject to throttling and kcage_freemem
 *                is now at least kcage_throttlefree + npages.
 *   KCT_FAILURE  PG_WAIT was not set in flags and kcage_maxwait
 *                consecutive passes saw no increase in kcage_freemem.
 *
 * NOTE(review): the "timeout" is implemented as a count of passes
 * through the wait loop; when the cageout thread is ready each pass
 * blocks in an untimed cv_wait() -- confirm this is intended.
 */
int
kcage_create_throttle(pgcnt_t npages, int flags)
{
	int niter = 0;
	pgcnt_t lastfree;
	/* Sampled once, before the cageout thread is woken below. */
	int enough = kcage_freemem > kcage_throttlefree + npages;

	KCAGE_STAT_INCR(kct_calls);		/* unprotected incr. */

	kcage_cageout_wakeup();			/* just to be sure */
	KCAGE_STAT_INCR(kct_cagewake);		/* unprotected incr. */

	/*
	 * Obviously, we can't throttle the cageout thread since
	 * we depend on it.  We also can't throttle the panic thread.
	 */
	if (curthread == kcage_cageout_thread || panicstr) {
		KCAGE_STAT_INCR(kct_cageout);	/* unprotected incr. */
		return (KCT_CRIT);
	}

	/*
	 * Don't throttle threads which are critical for proper
	 * vm management if we're above kcage_throttlefree or
	 * if freemem is very low.
	 */
	if (NOMEMWAIT()) {
		if (enough) {
			KCAGE_STAT_INCR(kct_exempt);	/* unprotected incr. */
			return (KCT_CRIT);
		} else if (freemem < minfree) {
			KCAGE_STAT_INCR(kct_critical);  /* unprotected incr. */
			return (KCT_CRIT);
		}
	}

	/*
	 * Don't throttle real-time threads.
	 */
	if (DISP_PRIO(curthread) > maxclsyspri) {
		KCAGE_STAT_INCR(kct_exempt);	/* unprotected incr. */
		return (KCT_CRIT);
	}

	/*
	 * Cause all other threads (which are assumed to not be
	 * critical to cageout) to wait here until their request
	 * can be satisfied. Be a little paranoid and wake the
	 * kernel cage on each loop through this logic.
	 */
	while (kcage_freemem < kcage_throttlefree + npages) {
		ASSERT(kcage_on);

		/* Baseline for the progress check at the end of the pass. */
		lastfree = kcage_freemem;

		if (kcage_cageout_ready) {
			mutex_enter(&kcage_throttle_mutex);

			/* Advertise our demand while we sleep. */
			kcage_needfree += npages;
			KCAGE_STAT_INCR(kct_wait);

			kcage_cageout_wakeup();
			KCAGE_STAT_INCR(kct_cagewake);

			cv_wait(&kcage_throttle_cv, &kcage_throttle_mutex);

			kcage_needfree -= npages;

			mutex_exit(&kcage_throttle_mutex);
		} else {
			/*
			 * NOTE: atomics are used just in case we enter
			 * mp operation before the cageout thread is ready.
			 */
			atomic_add_long(&kcage_needfree, npages);

			kcage_cageout_wakeup();
			KCAGE_STAT_INCR(kct_cagewake);	/* unprotected incr. */

			atomic_add_long(&kcage_needfree, -npages);
		}

		/* Only callers that did not ask for PG_WAIT may give up. */
		if ((flags & PG_WAIT) == 0) {
			if (kcage_freemem > lastfree) {
				KCAGE_STAT_INCR(kct_progress);
				niter = 0;
			} else {
				KCAGE_STAT_INCR(kct_noprogress);
				if (++niter >= kcage_maxwait) {
					KCAGE_STAT_INCR(kct_timeout);
					return (KCT_FAILURE);
				}
			}
		}
	}
	return (KCT_NONCRIT);
}
1095 
1096 void
1097 kcage_freemem_add(pgcnt_t npages)
1098 {
1099 	extern void wakeup_pcgs(void);
1100 
1101 	atomic_add_long(&kcage_freemem, npages);
1102 
1103 	wakeup_pcgs();  /* wakeup threads in pcgs() */
1104 
1105 	if (kcage_needfree != 0 &&
1106 		kcage_freemem >= (kcage_throttlefree + kcage_needfree)) {
1107 
1108 		mutex_enter(&kcage_throttle_mutex);
1109 		cv_broadcast(&kcage_throttle_cv);
1110 		KCAGE_STAT_INCR(kfa_trottlewake);
1111 		mutex_exit(&kcage_throttle_mutex);
1112 	}
1113 }
1114 
1115 void
1116 kcage_freemem_sub(pgcnt_t npages)
1117 {
1118 	atomic_add_long(&kcage_freemem, -npages);
1119 
1120 	if (kcage_freemem < kcage_desfree) {
1121 		kcage_cageout_wakeup();
1122 		KCAGE_STAT_INCR(kfs_cagewake); /* unprotected incr. */
1123 	}
1124 }
1125 
1126 /*
1127  * return 0 on failure and 1 on success.
1128  */
1129 static int
1130 kcage_setnoreloc_pages(page_t *rootpp, se_t se)
1131 {
1132 	pgcnt_t npgs, i;
1133 	page_t *pp;
1134 	pfn_t rootpfn = page_pptonum(rootpp);
1135 	uint_t szc;
1136 
1137 	ASSERT(!PP_ISFREE(rootpp));
1138 	ASSERT(PAGE_LOCKED_SE(rootpp, se));
1139 	if (!group_page_trylock(rootpp, se)) {
1140 		return (0);
1141 	}
1142 	szc = rootpp->p_szc;
1143 	if (szc == 0) {
1144 		/*
1145 		 * The szc of a locked page can only change for pages that are
1146 		 * non-swapfs (i.e. anonymous memory) file system pages.
1147 		 */
1148 		ASSERT(rootpp->p_vnode != NULL &&
1149 		    rootpp->p_vnode != &kvp &&
1150 		    !IS_SWAPFSVP(rootpp->p_vnode));
1151 		PP_SETNORELOC(rootpp);
1152 		return (1);
1153 	}
1154 	npgs = page_get_pagecnt(szc);
1155 	ASSERT(IS_P2ALIGNED(rootpfn, npgs));
1156 	pp = rootpp;
1157 	for (i = 0; i < npgs; i++, pp = page_next(pp)) {
1158 		ASSERT(PAGE_LOCKED_SE(pp, se));
1159 		ASSERT(!PP_ISFREE(pp));
1160 		ASSERT(pp->p_szc == szc);
1161 		PP_SETNORELOC(pp);
1162 	}
1163 	group_page_unlock(rootpp);
1164 	return (1);
1165 }
1166 
1167 /*
1168  * Attempt to convert page to a caged page (set the P_NORELOC flag).
1169  * If successful and pages is free, move page to the tail of whichever
1170  * list it is on.
1171  * Returns:
1172  *   EBUSY  page already locked, assimilated but not free.
1173  *   ENOMEM page assimilated, but memory too low to relocate. Page not free.
1174  *   EAGAIN page not assimilated. Page not free.
1175  *   ERANGE page assimilated. Page not root.
1176  *   0      page assimilated. Page free.
1177  *   *nfreedp number of pages freed.
1178  * NOTE: With error codes ENOMEM, EBUSY, and 0 (zero), there is no way
1179  * to distinguish between a page that was already a NORELOC page from
1180  * those newly converted to NORELOC pages by this invocation of
1181  * kcage_assimilate_page.
1182  */
1183 static int
1184 kcage_assimilate_page(page_t *pp, pgcnt_t *nfreedp)
1185 {
1186 	if (page_trylock(pp, SE_EXCL)) {
1187 		if (PP_ISNORELOC(pp)) {
1188 check_free_and_return:
1189 			if (PP_ISFREE(pp)) {
1190 				page_unlock(pp);
1191 				*nfreedp = 0;
1192 				return (0);
1193 			} else {
1194 				page_unlock(pp);
1195 				return (EBUSY);
1196 			}
1197 			/*NOTREACHED*/
1198 		}
1199 	} else {
1200 		if (page_trylock(pp, SE_SHARED)) {
1201 			if (PP_ISNORELOC(pp))
1202 				goto check_free_and_return;
1203 		} else
1204 			return (EAGAIN);
1205 
1206 		if (!PP_ISFREE(pp)) {
1207 			page_unlock(pp);
1208 			return (EAGAIN);
1209 		}
1210 
1211 		/*
1212 		 * Need to upgrade the lock on it and set the NORELOC
1213 		 * bit. If it is free then remove it from the free
1214 		 * list so that the platform free list code can keep
1215 		 * NORELOC pages where they should be.
1216 		 */
1217 		/*
1218 		 * Before doing anything, get the exclusive lock.
1219 		 * This may fail (eg ISM pages are left shared locked).
1220 		 * If the page is free this will leave a hole in the
1221 		 * cage. There is no solution yet to this.
1222 		 */
1223 		if (!page_tryupgrade(pp)) {
1224 			page_unlock(pp);
1225 			return (EAGAIN);
1226 		}
1227 	}
1228 
1229 	ASSERT(PAGE_EXCL(pp));
1230 
1231 	if (PP_ISFREE(pp)) {
1232 		int which = PP_ISAGED(pp) ? PG_FREE_LIST : PG_CACHE_LIST;
1233 
1234 		page_list_sub(pp, which | PG_LIST_ISCAGE);
1235 		ASSERT(pp->p_szc == 0);
1236 		PP_SETNORELOC(pp);
1237 		page_list_add(pp, which | PG_LIST_TAIL | PG_LIST_ISCAGE);
1238 
1239 		page_unlock(pp);
1240 		*nfreedp = 1;
1241 		return (0);
1242 	} else {
1243 		if (pp->p_szc != 0) {
1244 			if (!kcage_setnoreloc_pages(pp, SE_EXCL)) {
1245 				page_unlock(pp);
1246 				return (EAGAIN);
1247 			}
1248 			ASSERT(PP_ISNORELOC(pp));
1249 		} else {
1250 			PP_SETNORELOC(pp);
1251 		}
1252 		return (kcage_invalidate_page(pp, nfreedp));
1253 	}
1254 	/*NOTREACHED*/
1255 }
1256 
1257 static int
1258 kcage_expand()
1259 {
1260 	int did_something = 0;
1261 
1262 	spgcnt_t wanted;
1263 	pfn_t pfn;
1264 	page_t *pp;
1265 	/* TODO: we don't really need n any more? */
1266 	pgcnt_t n;
1267 	pgcnt_t nf, nfreed;
1268 
1269 	/*
1270 	 * Expand the cage if available cage memory is really low. Calculate
1271 	 * the amount required to return kcage_freemem to the level of
1272 	 * kcage_lotsfree, or to satisfy throttled requests, whichever is
1273 	 * more.  It is rare for their sum to create an artificial threshold
1274 	 * above kcage_lotsfree, but it is possible.
1275 	 *
1276 	 * Exit early if expansion amount is equal to or less than zero.
1277 	 * (<0 is possible if kcage_freemem rises suddenly.)
1278 	 *
1279 	 * Exit early when the global page pool (apparently) does not
1280 	 * have enough free pages to page_relocate() even a single page.
1281 	 */
1282 	wanted = MAX(kcage_lotsfree, kcage_throttlefree + kcage_needfree)
1283 		- kcage_freemem;
1284 	if (wanted <= 0)
1285 		return (0);
1286 	else if (freemem < pageout_reserve + 1) {
1287 		KCAGE_STAT_INCR(ke_lowfreemem);
1288 		return (0);
1289 	}
1290 
1291 	/*
1292 	 * Try to get the range list lock. If the lock is already
1293 	 * held, then don't get stuck here waiting for it.
1294 	 */
1295 	if (!kcage_range_trylock())
1296 		return (0);
1297 
1298 	KCAGE_STAT_INCR(ke_calls);
1299 	KCAGE_STAT_SET_SCAN(ke_wanted, (uint_t)wanted);
1300 
1301 	/*
1302 	 * Assimilate more pages from the global page pool into the cage.
1303 	 */
1304 	n = 0;				/* number of pages PP_SETNORELOC'd */
1305 	nf = 0;				/* number of those actually free */
1306 	while (kcage_on && nf < wanted) {
1307 		pfn = kcage_get_pfn();
1308 		if (pfn == PFN_INVALID) {	/* eek! no where to grow */
1309 			KCAGE_STAT_INCR(ke_nopfn);
1310 			goto terminate;
1311 		}
1312 
1313 		KCAGE_STAT_INCR_SCAN(ke_examined);
1314 
1315 		if ((pp = page_numtopp_nolock(pfn)) == NULL) {
1316 			KCAGE_STAT_INCR(ke_nopaget);
1317 			continue;
1318 		}
1319 		KCAGEPAGETS_INC();
1320 		/*
1321 		 * Sanity check. Skip this pfn if it is
1322 		 * being deleted.
1323 		 */
1324 		if (pfn_is_being_deleted(pfn)) {
1325 			KCAGE_STAT_INCR(ke_deleting);
1326 			continue;
1327 		}
1328 
1329 		/*
1330 		 * NORELOC is only set at boot-time or by this routine
1331 		 * under the kcage_range_mutex lock which is currently
1332 		 * held. This means we can do a fast check here before
1333 		 * locking the page in kcage_assimilate_page.
1334 		 */
1335 		if (PP_ISNORELOC(pp)) {
1336 			KCAGE_STAT_INCR(ke_isnoreloc);
1337 			continue;
1338 		}
1339 
1340 		switch (kcage_assimilate_page(pp, &nfreed)) {
1341 			case 0:		/* assimilated, page is free */
1342 				KCAGE_STAT_NINCR_SCAN(ke_gotonefree, nfreed);
1343 				did_something = 1;
1344 				nf += nfreed;
1345 				n++;
1346 				break;
1347 
1348 			case EBUSY:	/* assimilated, page not free */
1349 			case ERANGE:	/* assimilated, page not root */
1350 				KCAGE_STAT_INCR_SCAN(ke_gotone);
1351 				did_something = 1;
1352 				n++;
1353 				break;
1354 
1355 			case ENOMEM:	/* assimilated, but no mem */
1356 				KCAGE_STAT_INCR(ke_terminate);
1357 				did_something = 1;
1358 				n++;
1359 				goto terminate;
1360 
1361 			case EAGAIN:	/* can't assimilate */
1362 				KCAGE_STAT_INCR_SCAN(ke_lefthole);
1363 				break;
1364 
1365 			default:	/* catch this with debug kernels */
1366 				ASSERT(0);
1367 				break;
1368 		}
1369 	}
1370 
1371 	/*
1372 	 * Realign cage edge with the nearest physical address
1373 	 * boundry for big pages. This is done to give us a
1374 	 * better chance of actually getting usable big pages
1375 	 * in the cage.
1376 	 */
1377 
1378 terminate:
1379 	kcage_range_unlock();
1380 
1381 	return (did_something);
1382 }
1383 
1384 /*
1385  * Relocate page opp (Original Page Pointer) from cage pool to page rpp
1386  * (Replacement Page Pointer) in the global pool. Page opp will be freed
1387  * if relocation is successful, otherwise it is only unlocked.
1388  * On entry, page opp must be exclusively locked and not free.
1389  * *nfreedp: number of pages freed.
1390  */
1391 static int
1392 kcage_relocate_page(page_t *pp, pgcnt_t *nfreedp)
1393 {
1394 	page_t *opp = pp;
1395 	page_t *rpp = NULL;
1396 	spgcnt_t npgs;
1397 	int result;
1398 
1399 	ASSERT(!PP_ISFREE(opp));
1400 	ASSERT(PAGE_EXCL(opp));
1401 
1402 	result = page_relocate(&opp, &rpp, 1, 1, &npgs, NULL);
1403 	*nfreedp = npgs;
1404 	if (result == 0) {
1405 		while (npgs-- > 0) {
1406 			page_t *tpp;
1407 
1408 			ASSERT(rpp != NULL);
1409 			tpp = rpp;
1410 			page_sub(&rpp, tpp);
1411 			page_unlock(tpp);
1412 		}
1413 
1414 		ASSERT(rpp == NULL);
1415 
1416 		return (0);		/* success */
1417 	}
1418 
1419 	page_unlock(opp);
1420 	return (result);
1421 }
1422 
1423 /*
1424  * Based on page_invalidate_pages()
1425  *
1426  * Kcage_invalidate_page() uses page_relocate() twice. Both instances
1427  * of use must be updated to match the new page_relocate() when it
1428  * becomes available.
1429  *
1430  * Return result of kcage_relocate_page or zero if page was directly freed.
1431  * *nfreedp: number of pages freed.
1432  */
1433 static int
1434 kcage_invalidate_page(page_t *pp, pgcnt_t *nfreedp)
1435 {
1436 	int result;
1437 
1438 #if defined(__sparc)
1439 	extern struct vnode prom_ppages;
1440 	ASSERT(pp->p_vnode != &prom_ppages);
1441 #endif /* __sparc */
1442 
1443 	ASSERT(!PP_ISFREE(pp));
1444 	ASSERT(PAGE_EXCL(pp));
1445 
1446 	/*
1447 	 * Is this page involved in some I/O? shared?
1448 	 * The page_struct_lock need not be acquired to
1449 	 * examine these fields since the page has an
1450 	 * "exclusive" lock.
1451 	 */
1452 	if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1453 		result = kcage_relocate_page(pp, nfreedp);
1454 #ifdef KCAGE_STATS
1455 		if (result == 0)
1456 			KCAGE_STAT_INCR_SCAN(kip_reloclocked);
1457 		else if (result == ENOMEM)
1458 			KCAGE_STAT_INCR_SCAN(kip_nomem);
1459 #endif
1460 		return (result);
1461 	}
1462 
1463 	ASSERT(pp->p_vnode->v_type != VCHR);
1464 
1465 	/*
1466 	 * Unload the mappings and check if mod bit is set.
1467 	 */
1468 	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1469 
1470 	if (hat_ismod(pp)) {
1471 		result = kcage_relocate_page(pp, nfreedp);
1472 #ifdef KCAGE_STATS
1473 		if (result == 0)
1474 			KCAGE_STAT_INCR_SCAN(kip_relocmod);
1475 		else if (result == ENOMEM)
1476 			KCAGE_STAT_INCR_SCAN(kip_nomem);
1477 #endif
1478 		return (result);
1479 	}
1480 
1481 	if (!page_try_demote_pages(pp)) {
1482 		KCAGE_STAT_INCR_SCAN(kip_demotefailed);
1483 		page_unlock(pp);
1484 		return (EAGAIN);
1485 	}
1486 
1487 	page_destroy(pp, 0);
1488 	KCAGE_STAT_INCR_SCAN(kip_destroy);
1489 	*nfreedp = 1;
1490 	return (0);
1491 }
1492 
1493 static void
1494 kcage_cageout()
1495 {
1496 	pfn_t pfn;
1497 	page_t *pp;
1498 	callb_cpr_t cprinfo;
1499 	int did_something;
1500 	int scan_again;
1501 	pfn_t start_pfn;
1502 	int pass;
1503 	int last_pass;
1504 	int pages_skipped;
1505 	int shared_skipped;
1506 	uint_t shared_level = 8;
1507 	pgcnt_t nfreed;
1508 #ifdef KCAGE_STATS
1509 	clock_t scan_start;
1510 #endif
1511 
1512 	CALLB_CPR_INIT(&cprinfo, &kcage_cageout_mutex,
1513 		callb_generic_cpr, "cageout");
1514 
1515 	mutex_enter(&kcage_cageout_mutex);
1516 
1517 	pfn = PFN_INVALID;		/* force scan reset */
1518 	start_pfn = PFN_INVALID;	/* force init with 1st cage pfn */
1519 	kcage_cageout_ready = 1;	/* switch kcage_cageout_wakeup mode */
1520 
1521 loop:
1522 	/*
1523 	 * Wait here. Sooner or later, kcage_freemem_sub() will notice
1524 	 * that kcage_freemem is less than kcage_desfree. When it does
1525 	 * notice, kcage_freemem_sub() will wake us up via call to
1526 	 * kcage_cageout_wakeup().
1527 	 */
1528 	CALLB_CPR_SAFE_BEGIN(&cprinfo);
1529 	cv_wait(&kcage_cageout_cv, &kcage_cageout_mutex);
1530 	CALLB_CPR_SAFE_END(&cprinfo, &kcage_cageout_mutex);
1531 
1532 	KCAGE_STAT_INCR(kt_wakeups);
1533 	KCAGE_STAT_SET_SCAN(kt_freemem_start, freemem);
1534 	KCAGE_STAT_SET_SCAN(kt_kcage_freemem_start, kcage_freemem);
1535 	pass = 0;
1536 	last_pass = 0;
1537 
1538 #ifdef KCAGE_STATS
1539 	scan_start = lbolt;
1540 #endif
1541 
1542 again:
1543 	if (!kcage_on)
1544 		goto loop;
1545 
1546 	KCAGE_STAT_INCR(kt_scans);
1547 	KCAGE_STAT_INCR_SCAN(kt_passes);
1548 
1549 	did_something = 0;
1550 	pages_skipped = 0;
1551 	shared_skipped = 0;
1552 	while ((kcage_freemem < kcage_lotsfree || kcage_needfree) &&
1553 		(pfn = kcage_walk_cage(pfn == PFN_INVALID)) != PFN_INVALID) {
1554 
1555 		if (start_pfn == PFN_INVALID)
1556 			start_pfn = pfn;
1557 		else if (start_pfn == pfn) {
1558 			last_pass = pass;
1559 			pass += 1;
1560 			/*
1561 			 * Did a complete walk of kernel cage, but didn't free
1562 			 * any pages.  If only one cpu is online then
1563 			 * stop kernel cage walk and try expanding.
1564 			 */
1565 			if (ncpus_online == 1 && did_something == 0) {
1566 				KCAGE_STAT_INCR(kt_cageout_break);
1567 				break;
1568 			}
1569 		}
1570 
1571 		pp = page_numtopp_nolock(pfn);
1572 		if (pp == NULL) {
1573 			continue;
1574 		}
1575 
1576 		KCAGE_STAT_INCR_SCAN(kt_examined);
1577 
1578 		/*
1579 		 * Do a quick PP_ISNORELOC() and PP_ISFREE test outside
1580 		 * of the lock. If one is missed it will be seen next
1581 		 * time through.
1582 		 *
1583 		 * Skip non-caged-pages. These pages can exist in the cage
1584 		 * because, if during cage expansion, a page is
1585 		 * encountered that is long-term locked the lock prevents the
1586 		 * expansion logic from setting the P_NORELOC flag. Hence,
1587 		 * non-caged-pages surrounded by caged-pages.
1588 		 */
1589 		if (!PP_ISNORELOC(pp)) {
1590 			switch (kcage_assimilate_page(pp, &nfreed)) {
1591 				case 0:
1592 					did_something = 1;
1593 					KCAGE_STAT_NINCR_SCAN(kt_gotonefree,
1594 					    nfreed);
1595 					break;
1596 
1597 				case EBUSY:
1598 				case ERANGE:
1599 					did_something = 1;
1600 					KCAGE_STAT_INCR_SCAN(kt_gotone);
1601 					break;
1602 
1603 				case EAGAIN:
1604 				case ENOMEM:
1605 					break;
1606 
1607 				default:
1608 					/* catch this with debug kernels */
1609 					ASSERT(0);
1610 					break;
1611 			}
1612 
1613 			continue;
1614 		} else {
1615 			int prm;
1616 
1617 			if (PP_ISFREE(pp)) {
1618 				continue;
1619 			}
1620 
1621 			if ((pp->p_vnode == &kvp && pp->p_lckcnt > 0) ||
1622 			    !page_trylock(pp, SE_EXCL)) {
1623 				KCAGE_STAT_INCR_SCAN(kt_cantlock);
1624 				continue;
1625 			}
1626 
1627 			/* P_NORELOC bit should not have gone away. */
1628 			ASSERT(PP_ISNORELOC(pp));
1629 			if (PP_ISFREE(pp) || (pp->p_vnode == &kvp &&
1630 			    pp->p_lckcnt > 0)) {
1631 				page_unlock(pp);
1632 				continue;
1633 			}
1634 
1635 			KCAGE_STAT_SET_SCAN(kt_skiplevel, shared_level);
1636 			if (hat_page_getshare(pp) > shared_level) {
1637 				page_unlock(pp);
1638 				pages_skipped = 1;
1639 				shared_skipped = 1;
1640 				KCAGE_STAT_INCR_SCAN(kt_skipshared);
1641 				continue;
1642 			}
1643 
1644 			/*
1645 			 * In pass {0, 1}, skip page if ref bit is set.
1646 			 * In pass {0, 1, 2}, skip page if mod bit is set.
1647 			 */
1648 			prm = hat_pagesync(pp,
1649 				HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD);
1650 
1651 			/* On first pass ignore ref'd pages */
1652 			if (pass <= 1 && (prm & P_REF)) {
1653 				KCAGE_STAT_INCR_SCAN(kt_skiprefd);
1654 				pages_skipped = 1;
1655 				page_unlock(pp);
1656 				continue;
1657 			}
1658 
1659 			/* On pass 2, page_destroy if mod bit is not set */
1660 			if (pass <= 2) {
1661 				if (pp->p_szc != 0 || (prm & P_MOD) ||
1662 					pp->p_lckcnt || pp->p_cowcnt) {
1663 					pages_skipped = 1;
1664 					page_unlock(pp);
1665 				} else {
1666 
1667 					/*
1668 					 * unload the mappings before
1669 					 * checking if mod bit is set
1670 					 */
1671 					(void) hat_pageunload(pp,
1672 						HAT_FORCE_PGUNLOAD);
1673 
1674 					/*
1675 					 * skip this page if modified
1676 					 */
1677 					if (hat_ismod(pp)) {
1678 						pages_skipped = 1;
1679 						page_unlock(pp);
1680 						continue;
1681 					}
1682 
1683 					KCAGE_STAT_INCR_SCAN(kt_destroy);
1684 					page_destroy(pp, 0);
1685 					did_something = 1;
1686 				}
1687 				continue;
1688 			}
1689 
1690 			if (kcage_invalidate_page(pp, &nfreed) == 0) {
1691 				did_something = 1;
1692 				KCAGE_STAT_NINCR_SCAN(kt_gotonefree, nfreed);
1693 			}
1694 
1695 			/*
1696 			 * No need to drop the page lock here.
1697 			 * Kcage_invalidate_page has done that for us
1698 			 * either explicitly or through a page_free.
1699 			 */
1700 		}
1701 	}
1702 
1703 	/*
1704 	 * Expand the cage only if available cage memory is really low.
1705 	 * This test is done only after a complete scan of the cage.
1706 	 * The reason for not checking and expanding more often is to
1707 	 * avoid rapid expansion of the cage. Naturally, scanning the
1708 	 * cage takes time. So by scanning first, we use that work as a
1709 	 * delay loop in between expand decisions.
1710 	 */
1711 
1712 	scan_again = 0;
1713 	if (kcage_freemem < kcage_minfree || kcage_needfree) {
1714 		/*
1715 		 * Kcage_expand() will return a non-zero value if it was
1716 		 * able to expand the cage -- whether or not the new
1717 		 * pages are free and immediately usable. If non-zero,
1718 		 * we do another scan of the cage. The pages might be
1719 		 * freed during that scan or by time we get back here.
1720 		 * If not, we will attempt another expansion.
1721 		 * However, if kcage_expand() returns zero, then it was
1722 		 * unable to expand the cage. This is the case when the
1723 		 * the growth list is exausted, therefore no work was done
1724 		 * and there is no reason to scan the cage again.
1725 		 * Note: Kernel cage scan is not repeated on single-cpu
1726 		 * system to avoid kernel cage thread hogging cpu.
1727 		 */
1728 		if (pass <= 3 && pages_skipped && ncpus_online > 1)
1729 			scan_again = 1;
1730 		else
1731 			(void) kcage_expand(); /* don't scan again */
1732 	} else if (kcage_freemem < kcage_lotsfree) {
1733 		/*
1734 		 * If available cage memory is less than abundant
1735 		 * and a full scan of the cage has not yet been completed,
1736 		 * or a scan has completed and some work was performed,
1737 		 * or pages were skipped because of sharing,
1738 		 * or we simply have not yet completed two passes,
1739 		 * then do another scan.
1740 		 */
1741 		if (pass <= 2 && pages_skipped)
1742 			scan_again = 1;
1743 		if (pass == last_pass || did_something)
1744 			scan_again = 1;
1745 		else if (shared_skipped && shared_level < (8<<24)) {
1746 			shared_level <<= 1;
1747 			scan_again = 1;
1748 		}
1749 	}
1750 
1751 	if (scan_again && ncpus_online > 1)
1752 		goto again;
1753 	else {
1754 		if (shared_level > 8)
1755 			shared_level >>= 1;
1756 
1757 		KCAGE_STAT_SET_SCAN(kt_freemem_end, freemem);
1758 		KCAGE_STAT_SET_SCAN(kt_kcage_freemem_end, kcage_freemem);
1759 		KCAGE_STAT_SET_SCAN(kt_ticks, lbolt - scan_start);
1760 		KCAGE_STAT_INC_SCAN_INDEX;
1761 		goto loop;
1762 	}
1763 
1764 	/*NOTREACHED*/
1765 }
1766 
1767 void
1768 kcage_cageout_wakeup()
1769 {
1770 	if (mutex_tryenter(&kcage_cageout_mutex)) {
1771 		if (kcage_cageout_ready) {
1772 			cv_signal(&kcage_cageout_cv);
1773 		} else if (kcage_freemem < kcage_minfree || kcage_needfree) {
1774 			/*
1775 			 * Available cage memory is really low. Time to
1776 			 * start expanding the cage. However, the
1777 			 * kernel cage thread is not yet ready to
1778 			 * do the work. Use *this* thread, which is
1779 			 * most likely to be t0, to do the work.
1780 			 */
1781 			KCAGE_STAT_INCR(kcw_expandearly);
1782 			(void) kcage_expand();
1783 			KCAGE_STAT_INC_SCAN_INDEX;
1784 		}
1785 
1786 		mutex_exit(&kcage_cageout_mutex);
1787 	}
1788 	/* else, kernel cage thread is already running */
1789 }
1790 
1791 void
1792 kcage_tick()
1793 {
1794 	/*
1795 	 * Once per second we wake up all the threads throttled
1796 	 * waiting for cage memory, in case we've become stuck
1797 	 * and haven't made forward progress expanding the cage.
1798 	 */
1799 	if (kcage_on && kcage_cageout_ready)
1800 		cv_broadcast(&kcage_throttle_cv);
1801 }
1802