xref: /titanic_44/usr/src/uts/common/os/mem_cage.c (revision 767b0abf70408797bf5ca4a8dac501bb1a90003d)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/param.h>
30 #include <sys/thread.h>
31 #include <sys/proc.h>
32 #include <sys/callb.h>
33 #include <sys/vnode.h>
34 #include <sys/debug.h>
35 #include <sys/systm.h>		/* for bzero */
36 #include <sys/memlist.h>
37 #include <sys/cmn_err.h>
38 #include <sys/sysmacros.h>
39 #include <sys/vmsystm.h>	/* for NOMEMWAIT() */
40 #include <sys/atomic.h>		/* used to update kcage_freemem */
41 #include <sys/kmem.h>		/* for kmem_reap */
42 #include <sys/errno.h>
43 #include <sys/mem_cage.h>
44 #include <vm/seg_kmem.h>
45 #include <vm/page.h>
46 #include <vm/hat.h>
47 #include <vm/vm_dep.h>
48 #include <sys/mem_config.h>
49 #include <sys/lgrp.h>
50 #include <sys/rwlock.h>
51 
/*
 * Defined elsewhere. Used in this file to derive the cageout thread's
 * priority and in the real-time exemption check of the throttle path.
 */
extern pri_t maxclsyspri;

/* Cage statistics (KCAGE_STATS) are compiled in only for DEBUG kernels. */
#ifdef DEBUG
#define	KCAGE_STATS
#endif
57 
58 #ifdef KCAGE_STATS
59 
60 #define	KCAGE_STATS_VERSION 9	/* can help report generators */
61 #define	KCAGE_STATS_NSCANS 256	/* depth of scan statistics buffer */
62 
/*
 * Statistics for one scan. Instances live in the kcage_stats.scans[]
 * ring and are addressed through kcage_stats.scan_index by the
 * KCAGE_STAT_*_SCAN macros.
 */
struct kcage_stats_scan {
	/* managed by KCAGE_STAT_* macros */
	clock_t	scan_lbolt;	/* lbolt when the slot was closed out */
	uint_t	scan_id;	/* ring index the slot occupied */

	/* set in kcage_cageout() */
	uint_t	kt_passes;
	clock_t	kt_ticks;
	pgcnt_t	kt_kcage_freemem_start;
	pgcnt_t	kt_kcage_freemem_end;
	pgcnt_t kt_freemem_start;
	pgcnt_t kt_freemem_end;
	uint_t	kt_examined;
	uint_t	kt_cantlock;
	uint_t	kt_gotone;
	uint_t	kt_gotonefree;
	uint_t	kt_skiplevel;
	uint_t	kt_skipshared;
	uint_t	kt_skiprefd;
	uint_t	kt_destroy;

	/* set in kcage_invalidate_page() */
	uint_t	kip_reloclocked;
	uint_t	kip_relocmod;
	uint_t	kip_destroy;
	uint_t	kip_nomem;
	uint_t	kip_demotefailed;

	/* set in kcage_expand() */
	uint_t	ke_wanted;
	uint_t	ke_examined;
	uint_t	ke_lefthole;
	uint_t	ke_gotone;
	uint_t	ke_gotonefree;
};
98 
/*
 * Global cage statistics: cumulative counters plus a ring of
 * KCAGE_STATS_NSCANS per-scan records. The version, size and
 * scan_array_size fields are filled in by KCAGE_STAT_INIT_SCAN_INDEX.
 */
struct kcage_stats {
	/* managed by KCAGE_STAT_* macros */
	uint_t	version;	/* KCAGE_STATS_VERSION */
	uint_t	size;		/* sizeof (kcage_stats) */

	/* set in kcage_cageout */
	uint_t	kt_wakeups;
	uint_t	kt_scans;
	uint_t	kt_cageout_break;

	/* set in kcage_expand */
	uint_t	ke_calls;
	uint_t	ke_nopfn;
	uint_t	ke_nopaget;
	uint_t	ke_isnoreloc;
	uint_t	ke_deleting;
	uint_t	ke_lowfreemem;
	uint_t	ke_terminate;

	/* set in kcage_freemem_add() */
	uint_t	kfa_trottlewake;	/* (sic) field name kept as is */

	/* set in kcage_freemem_sub() */
	uint_t	kfs_cagewake;

	/* set in kcage_create_throttle */
	uint_t	kct_calls;
	uint_t	kct_cageout;
	uint_t	kct_critical;
	uint_t	kct_exempt;
	uint_t	kct_cagewake;
	uint_t	kct_wait;
	uint_t	kct_progress;
	uint_t	kct_noprogress;
	uint_t	kct_timeout;

	/* set in kcage_cageout_wakeup */
	uint_t	kcw_expandearly;

	/* managed by KCAGE_STAT_* macros */
	uint_t	scan_array_size;	/* KCAGE_STATS_NSCANS */
	uint_t	scan_index;		/* slot currently being filled */
	struct kcage_stats_scan scans[KCAGE_STATS_NSCANS];
};
143 
/* The single statistics instance updated by the KCAGE_STAT_* macros. */
static struct kcage_stats kcage_stats;
/* All-zero record copied over a ring slot when the scan index advances. */
static struct kcage_stats_scan kcage_stats_scan_zero;
146 
/*
 * Statistics update macros (KCAGE_STATS builds).
 *
 * No real need for atomics here. For the most part the incs and sets are
 * done by the kernel cage thread. There are a few that are done by any
 * number of other threads. Those cases are noted by comments.
 *
 * Every macro is meant to be used as a statement followed by a semicolon.
 * Macros that contain a conditional or more than one statement are
 * wrapped in do { } while (0) so that they remain a single statement
 * when used as the body of an unbraced if/else.
 */
#define	KCAGE_STAT_INCR(m)	kcage_stats.m++

#define	KCAGE_STAT_NINCR(m, v) kcage_stats.m += (v)

#define	KCAGE_STAT_INCR_SCAN(m)	\
	KCAGE_STAT_INCR(scans[kcage_stats.scan_index].m)

#define	KCAGE_STAT_NINCR_SCAN(m, v) \
	KCAGE_STAT_NINCR(scans[kcage_stats.scan_index].m, v)

#define	KCAGE_STAT_SET(m, v)	kcage_stats.m = (v)

/* Set member m to v only if it is still zero. */
#define	KCAGE_STAT_SETZ(m, v)	\
	do { \
		if (kcage_stats.m == 0) \
			kcage_stats.m = (v); \
	} while (0)

#define	KCAGE_STAT_SET_SCAN(m, v)	\
	KCAGE_STAT_SET(scans[kcage_stats.scan_index].m, v)

#define	KCAGE_STAT_SETZ_SCAN(m, v)	\
	KCAGE_STAT_SETZ(scans[kcage_stats.scan_index].m, v)

/*
 * Close out the current scan slot (stamp it with lbolt and its index),
 * advance the ring index modulo KCAGE_STATS_NSCANS and zero the new slot.
 */
#define	KCAGE_STAT_INC_SCAN_INDEX \
	do { \
		KCAGE_STAT_SET_SCAN(scan_lbolt, lbolt); \
		KCAGE_STAT_SET_SCAN(scan_id, kcage_stats.scan_index); \
		kcage_stats.scan_index = \
		    (kcage_stats.scan_index + 1) % KCAGE_STATS_NSCANS; \
		kcage_stats.scans[kcage_stats.scan_index] = \
		    kcage_stats_scan_zero; \
	} while (0)

/* Record the layout of kcage_stats and start at ring slot 0. */
#define	KCAGE_STAT_INIT_SCAN_INDEX \
	do { \
		kcage_stats.version = KCAGE_STATS_VERSION; \
		kcage_stats.size = sizeof (kcage_stats); \
		kcage_stats.scan_array_size = KCAGE_STATS_NSCANS; \
		kcage_stats.scan_index = 0; \
	} while (0)
185 
186 #else /* KCAGE_STATS */
187 
/*
 * Statistics are compiled out: every macro expands to nothing, so the
 * call sites reduce to empty statements.
 */
#define	KCAGE_STAT_INCR(v)
#define	KCAGE_STAT_NINCR(m, v)
#define	KCAGE_STAT_INCR_SCAN(v)
#define	KCAGE_STAT_NINCR_SCAN(m, v)
#define	KCAGE_STAT_SET(m, v)
#define	KCAGE_STAT_SETZ(m, v)
#define	KCAGE_STAT_SET_SCAN(m, v)
#define	KCAGE_STAT_SETZ_SCAN(m, v)
#define	KCAGE_STAT_INC_SCAN_INDEX
#define	KCAGE_STAT_INIT_SCAN_INDEX
198 
199 #endif /* KCAGE_STATS */
200 
/* Throttled allocators are woken through this cv (see cv_broadcast users). */
static kmutex_t kcage_throttle_mutex;	/* protects kcage_throttle_cv */
static kcondvar_t kcage_throttle_cv;

static kmutex_t kcage_cageout_mutex;	/* protects cv and ready flag */
static kcondvar_t kcage_cageout_cv;	/* cageout thread naps here */
static int kcage_cageout_ready;		/* nonzero when cageout thread ready */
kthread_id_t kcage_cageout_thread;	/* to aid debugging */

/*
 * Taken as writer by kcage_range_lock() and as reader by
 * kcage_next_range().
 */
static krwlock_t kcage_range_rwlock;	/* protects kcage_glist elements */
210 
/*
 * Cage expansion happens within a range.
 *
 * Each element describes the half-open pfn range [base, lim) and the
 * position, curr, up to which pfns have been handed out. When decr is
 * nonzero pfns are handed out downwards from lim (curr starts at lim);
 * otherwise upwards from base (curr starts at base).
 */
struct kcage_glist {
	struct kcage_glist	*next;	/* next element on the list */
	pfn_t			base;	/* first pfn of the range */
	pfn_t			lim;	/* one past the last pfn of the range */
	pfn_t			curr;	/* allocation cursor within the range */
	int			decr;	/* nonzero: grow towards lower pfns */
};
221 
/* Head of the growth list. */
static struct kcage_glist *kcage_glist;
/* Element that kcage_get_pfn() is currently allocating pfns from. */
static struct kcage_glist *kcage_current_glist;

/*
 * The firstfree element is provided so that kmem_alloc can be avoided
 * until the cage has somewhere to go. This is not currently a problem
 * as early kmem_alloc's use BOP_ALLOC instead of page_create_va.
 *
 * The freelist is seeded with that one static element; elements
 * released by kcage_glist_free() are pushed onto it for reuse.
 */
static struct kcage_glist kcage_glist_firstfree;
static struct kcage_glist *kcage_glist_freelist = &kcage_glist_firstfree;
232 
/*
 * Miscellaneous forward references
 */
static struct kcage_glist *kcage_glist_alloc(void);
static int kcage_glist_delete(pfn_t, pfn_t, struct kcage_glist **);
static void kcage_cageout(void);
static int kcage_invalidate_page(page_t *, pgcnt_t *);
static int kcage_setnoreloc_pages(page_t *, se_t);

/*
 * Kernel Memory Cage counters and thresholds.
 *
 * A threshold left at zero is given a default by
 * kcage_recalc_thresholds(); a nonzero value set before its first call
 * is preserved across later recalculations.
 */
int kcage_on = 0;		/* set to 1 at the end of cage construction */
pgcnt_t kcage_freemem;
pgcnt_t kcage_needfree;
pgcnt_t kcage_lotsfree;
pgcnt_t kcage_desfree;
pgcnt_t kcage_minfree;
pgcnt_t kcage_throttlefree;
pgcnt_t	kcage_reserve;
int kcage_maxwait = 10;	/* in seconds */

/* when we use lp for kmem we start the cage at a higher initial value */
pgcnt_t kcage_kmemlp_mincage;

/* DEBUG only: counts pfns for which kcage_init() found a page_t. */
#ifdef DEBUG
pgcnt_t	kcage_pagets;
#define	KCAGEPAGETS_INC()	kcage_pagets++
#else
#define	KCAGEPAGETS_INC()
#endif

/* kstats to export what pages are currently caged */
kmutex_t kcage_kstat_lock;
static int kcage_kstat_update(kstat_t *ksp, int rw);
static int kcage_kstat_snapshot(kstat_t *ksp, void *buf, int rw);
269 
/*
 * Startup and Dynamic Reconfiguration interfaces.
 * kcage_range_lock()
 * kcage_range_unlock()
 * kcage_range_islocked()
 * kcage_range_init()
 * kcage_range_add()
 * kcage_range_delete()
 * kcage_range_delete_post_mem_del()
 * kcage_init()
 * kcage_recalc_thresholds()
 * kcage_set_thresholds()
 */
280 
/*
 * Acquire the growth-list lock.
 *
 * Called from outside this file to bracket additions to and removals
 * from the list; therefore it takes the lock as a writer.
 * Release with kcage_range_unlock().
 */
void
kcage_range_lock(void)
{
	rw_enter(&kcage_range_rwlock, RW_WRITER);
}
290 
/*
 * Release the growth-list lock taken by kcage_range_lock().
 */
void
kcage_range_unlock(void)
{
	rw_exit(&kcage_range_rwlock);
}
296 
/*
 * Report whether the growth-list lock is held, as determined by
 * rw_lock_held(). Used in this file only inside ASSERT()s.
 */
int
kcage_range_islocked(void)
{
	return (rw_lock_held(&kcage_range_rwlock));
}
302 
303 /*
304  * Called from page_get_contig_pages to get the approximate kcage pfn range
305  * for exclusion from search for contiguous pages. This routine is called
306  * without kcage_range lock (kcage routines can call page_get_contig_pages
307  * through page_relocate) and with the assumption, based on kcage_range_add,
308  * that kcage_current_glist always contain a valid pointer.
309  */
310 
311 int
312 kcage_current_pfn(pfn_t *pfncur)
313 {
314 	struct kcage_glist *lp = kcage_current_glist;
315 
316 	ASSERT(kcage_on);
317 
318 	ASSERT(lp != NULL);
319 
320 	*pfncur = lp->curr;
321 
322 	return (lp->decr);
323 }
324 
325 /*
326  * Called from vm_pagelist.c during coalesce to find kernel cage regions
327  * within an mnode. Looks for the lowest range between lo and hi.
328  *
329  * Kernel cage memory is defined between kcage_glist and kcage_current_glist.
330  * Non-cage memory is defined between kcage_current_glist and list end.
331  *
332  * If incage is set, returns the lowest kcage range. Otherwise returns lowest
333  * non-cage range.
334  *
335  * Returns zero on success and nlo, nhi:
336  * 	lo <= nlo < nhi <= hi
337  * Returns non-zero if no overlapping range is found.
338  */
339 int
340 kcage_next_range(int incage, pfn_t lo, pfn_t hi,
341     pfn_t *nlo, pfn_t *nhi)
342 {
343 	struct kcage_glist *lp;
344 	pfn_t tlo = hi;
345 	pfn_t thi = hi;
346 
347 	ASSERT(lo <= hi);
348 
349 	/*
350 	 * Reader lock protects the list, but kcage_get_pfn
351 	 * running concurrently may advance kcage_current_glist
352 	 * and also update kcage_current_glist->curr. Page
353 	 * coalesce can handle this race condition.
354 	 */
355 	rw_enter(&kcage_range_rwlock, RW_READER);
356 
357 	for (lp = incage ? kcage_glist : kcage_current_glist;
358 		lp != NULL; lp = lp->next) {
359 
360 		pfn_t klo, khi;
361 
362 		/* find the range limits in this element */
363 		if ((incage && lp->decr) || (!incage && !lp->decr)) {
364 			klo = lp->curr;
365 			khi = lp->lim;
366 		} else {
367 			klo = lp->base;
368 			khi = lp->curr;
369 		}
370 
371 		/* handle overlap */
372 		if (klo < tlo && klo < khi && lo < khi && klo < hi) {
373 			tlo = MAX(lo, klo);
374 			thi = MIN(hi, khi);
375 			if (tlo == lo)
376 				break;
377 		}
378 
379 		/* check end of kcage */
380 		if (incage && lp == kcage_current_glist) {
381 			break;
382 		}
383 	}
384 
385 	rw_exit(&kcage_range_rwlock);
386 
387 	/* return non-zero if no overlapping range found */
388 	if (tlo == thi)
389 		return (1);
390 
391 	ASSERT(lo <= tlo && tlo < thi && thi <= hi);
392 
393 	/* return overlapping range */
394 	*nlo = tlo;
395 	*nhi = thi;
396 	return (0);
397 }
398 
399 int
400 kcage_range_init(struct memlist *ml, int decr)
401 {
402 	int ret = 0;
403 
404 	ASSERT(kcage_range_islocked());
405 
406 	if (decr) {
407 		while (ml->next != NULL)
408 			ml = ml->next;
409 	}
410 
411 	while (ml != NULL) {
412 		ret = kcage_range_add(btop(ml->address), btop(ml->size), decr);
413 		if (ret)
414 			break;
415 
416 		ml = (decr ? ml->prev : ml->next);
417 	}
418 
419 	return (ret);
420 }
421 
422 /*
423  * Third arg controls direction of growth: 0: increasing pfns,
424  * 1: decreasing.
425  * Calls to add and delete must be protected by calls to
426  * kcage_range_lock() and kcage_range_unlock().
427  */
428 int
429 kcage_range_add(pfn_t base, pgcnt_t npgs, int decr)
430 {
431 	struct kcage_glist *new, **lpp;
432 	pfn_t lim;
433 
434 	ASSERT(kcage_range_islocked());
435 
436 	ASSERT(npgs != 0);
437 	if (npgs == 0)
438 		return (EINVAL);
439 
440 	lim = base + npgs;
441 
442 	ASSERT(lim > base);
443 	if (lim <= base)
444 		return (EINVAL);
445 
446 	new = kcage_glist_alloc();
447 	if (new == NULL) {
448 		return (ENOMEM);
449 	}
450 
451 	new->base = base;
452 	new->lim = lim;
453 	new->decr = decr;
454 	if (new->decr != 0)
455 		new->curr = new->lim;
456 	else
457 		new->curr = new->base;
458 	/*
459 	 * Any overlapping existing ranges are removed by deleting
460 	 * from the new list as we search for the tail.
461 	 */
462 	lpp = &kcage_glist;
463 	while (*lpp != NULL) {
464 		int ret;
465 		ret = kcage_glist_delete((*lpp)->base, (*lpp)->lim, &new);
466 		if (ret != 0)
467 			return (ret);
468 		lpp = &(*lpp)->next;
469 	}
470 
471 	*lpp = new;
472 
473 	if (kcage_current_glist == NULL) {
474 		kcage_current_glist = kcage_glist;
475 	}
476 
477 	return (0);
478 }
479 
480 /*
481  * Calls to add and delete must be protected by calls to
482  * kcage_range_lock() and kcage_range_unlock().
483  */
484 int
485 kcage_range_delete(pfn_t base, pgcnt_t npgs)
486 {
487 	struct kcage_glist *lp;
488 	pfn_t lim;
489 
490 	ASSERT(kcage_range_islocked());
491 
492 	ASSERT(npgs != 0);
493 	if (npgs == 0)
494 		return (EINVAL);
495 
496 	lim = base + npgs;
497 
498 	ASSERT(lim > base);
499 	if (lim <= base)
500 		return (EINVAL);
501 
502 	/*
503 	 * Check if the delete is OK first as a number of elements
504 	 * might be involved and it will be difficult to go
505 	 * back and undo (can't just add the range back in).
506 	 */
507 	for (lp = kcage_glist; lp != NULL; lp = lp->next) {
508 		/*
509 		 * If there have been no pages allocated from this
510 		 * element, we don't need to check it.
511 		 */
512 		if ((lp->decr == 0 && lp->curr == lp->base) ||
513 		    (lp->decr != 0 && lp->curr == lp->lim))
514 			continue;
515 		/*
516 		 * If the element does not overlap, its OK.
517 		 */
518 		if (base >= lp->lim || lim <= lp->base)
519 			continue;
520 		/*
521 		 * Overlapping element: Does the range to be deleted
522 		 * overlap the area already used? If so fail.
523 		 */
524 		if (lp->decr == 0 && base < lp->curr && lim >= lp->base) {
525 			return (EBUSY);
526 		}
527 		if (lp->decr != 0 && base < lp->lim && lim >= lp->curr) {
528 			return (EBUSY);
529 		}
530 	}
531 	return (kcage_glist_delete(base, lim, &kcage_glist));
532 }
533 
534 /*
535  * Calls to add and delete must be protected by calls to
536  * kcage_range_lock() and kcage_range_unlock().
537  * This routine gets called after successful Solaris memory
538  * delete operation from DR post memory delete routines.
539  */
540 int
541 kcage_range_delete_post_mem_del(pfn_t base, pgcnt_t npgs)
542 {
543 	pfn_t lim;
544 
545 	ASSERT(kcage_range_islocked());
546 
547 	ASSERT(npgs != 0);
548 	if (npgs == 0)
549 		return (EINVAL);
550 
551 	lim = base + npgs;
552 
553 	ASSERT(lim > base);
554 	if (lim <= base)
555 		return (EINVAL);
556 
557 	return (kcage_glist_delete(base, lim, &kcage_glist));
558 }
559 
560 /*
561  * No locking is required here as the whole operation is covered
562  * by the kcage_range_lock().
563  */
564 static struct kcage_glist *
565 kcage_glist_alloc(void)
566 {
567 	struct kcage_glist *new;
568 
569 	if ((new = kcage_glist_freelist) != NULL) {
570 		kcage_glist_freelist = new->next;
571 		bzero(new, sizeof (*new));
572 	} else {
573 		new = kmem_zalloc(sizeof (struct kcage_glist), KM_NOSLEEP);
574 	}
575 	return (new);
576 }
577 
/*
 * Release a growth-list element by pushing it onto the private
 * freelist, from which kcage_glist_alloc() will reuse it. The element
 * is never handed back to kmem.
 */
static void
kcage_glist_free(struct kcage_glist *lp)
{
	lp->next = kcage_glist_freelist;
	kcage_glist_freelist = lp;
}
584 
/*
 * Remove the pfn range [base, lim) from every element of the list whose
 * head pointer is *lpp.
 *
 *  - An element wholly covered by the range is unlinked and freed.
 *  - A range strictly inside an element splits the element in two,
 *    which requires allocating a new element.
 *  - Otherwise the overlapping end of the element is trimmed.
 *
 * Returns 0 on success or ENOMEM if a split could not allocate a new
 * element; elements processed before the failure remain modified.
 */
static int
kcage_glist_delete(pfn_t base, pfn_t lim, struct kcage_glist **lpp)
{
	/*
	 * prev tracks the element preceding lp. Note it starts out as the
	 * first element itself, not NULL.
	 */
	struct kcage_glist *lp, *prev = *lpp;

	while ((lp = *lpp) != NULL) {
		if (lim > lp->base && base < lp->lim) {
			/* The delete range overlaps this element. */
			if (base <= lp->base && lim >= lp->lim) {
				/* Delete whole element. */
				*lpp = lp->next;
				if (lp == kcage_current_glist) {
					/* This can never happen. */
					ASSERT(kcage_current_glist != prev);
					kcage_current_glist = prev;
				}
				kcage_glist_free(lp);
				/* *lpp already names the successor. */
				continue;
			}

			/* Partial delete. */
			if (base > lp->base && lim < lp->lim) {
				struct kcage_glist *new;

				/*
				 * Remove a section from the middle,
				 * need to allocate a new element.
				 */
				new = kcage_glist_alloc();
				if (new == NULL) {
					return (ENOMEM);
				}

				/*
				 * Transfer unused range to new.
				 * Edit lp in place to preserve
				 * kcage_current_glist.
				 */
				new->decr = lp->decr;
				if (new->decr != 0) {
					/* lp keeps the upper piece. */
					new->base = lp->base;
					new->lim = base;
					new->curr = base;

					lp->base = lim;
				} else {
					/* lp keeps the lower piece. */
					new->base = lim;
					new->lim = lp->lim;
					new->curr = new->base;

					lp->lim = base;
				}

				/* Insert new. */
				new->next = lp->next;
				lp->next = new;
				/*
				 * Step onto new so that the common advance
				 * below continues after it.
				 */
				lpp = &lp->next;
			} else {
				/* Delete part of current block. */
				if (base > lp->base) {
					/* Trim the upper end. */
					ASSERT(lim >= lp->lim);
					ASSERT(base < lp->lim);
					/* Untouched cursor follows the edge. */
					if (lp->decr != 0 &&
					    lp->curr == lp->lim)
						lp->curr = base;
					lp->lim = base;
				} else {
					/* Trim the lower end. */
					ASSERT(base <= lp->base);
					ASSERT(lim > lp->base);
					/* Untouched cursor follows the edge. */
					if (lp->decr == 0 &&
					    lp->curr == lp->base)
						lp->curr = lim;
					lp->base = lim;
				}
			}
		}
		prev = *lpp;
		lpp = &(*lpp)->next;
	}

	return (0);
}
667 
668 /*
669  * The caller of kcage_get_pfn must hold the kcage_range_lock to make
670  * sure that there are no concurrent calls. The same lock
671  * must be obtained for range add and delete by calling
672  * kcage_range_lock() and kcage_range_unlock().
673  */
674 static pfn_t
675 kcage_get_pfn(void)
676 {
677 	struct kcage_glist *lp;
678 	pfn_t pfn;
679 
680 	ASSERT(kcage_range_islocked());
681 
682 	lp = kcage_current_glist;
683 	while (lp != NULL) {
684 		if (lp->decr != 0) {
685 			if (lp->curr != lp->base) {
686 				pfn = --lp->curr;
687 				return (pfn);
688 			}
689 		} else {
690 			if (lp->curr != lp->lim) {
691 				pfn = lp->curr++;
692 				return (pfn);
693 			}
694 		}
695 
696 		lp = lp->next;
697 		if (lp)
698 			kcage_current_glist = lp;
699 	}
700 
701 	return (PFN_INVALID);
702 }
703 
/*
 * Walk the physical address space of the cage.
 * This routine does not guarantee to return PFNs in the order
 * in which they were allocated to the cage. Instead, it walks
 * each range as they appear on the growth list, returning the PFNs
 * of each range in ascending order.
 *
 * To begin scanning at lower edge of cage, reset should be nonzero.
 * To step through cage, reset should be zero.
 *
 * PFN_INVALID will be returned when the upper end of the cage is
 * reached -- indicating a full scan of the cage has been completed since
 * previous reset.
 *
 * NOTE(review): the end-of-cage state is recorded as lp == NULL, which is
 * also the condition that re-primes the walk from kcage_glist. The call
 * after one that returned PFN_INVALID therefore starts a new scan even
 * when reset is zero, rather than continuing to return PFN_INVALID.
 *
 * It is possible to receive a PFN_INVALID result on reset if a growth
 * list is not installed or if none of the PFNs in the installed list have
 * been allocated to the cage. In other words, there is no cage.
 *
 * Caller need not hold kcage_range_lock while calling this function
 * as the front part of the list is static - pages never come out of
 * the cage.
 *
 * The walk position is kept in function-static variables, so there is
 * only one walk in progress at a time.
 * The caller is expected to only be kcage_cageout().
 */
static pfn_t
kcage_walk_cage(int reset)
{
	static struct kcage_glist *lp = NULL;	/* element being walked */
	static pfn_t pfn;			/* next pfn to return */

	if (reset)
		lp = NULL;
	if (lp == NULL) {
		/* (Re)start from the head of the growth list. */
		lp = kcage_glist;
		pfn = PFN_INVALID;
	}
again:
	if (pfn == PFN_INVALID) {
		/* Entering a new element (or the end of the walk). */
		if (lp == NULL)
			return (PFN_INVALID);

		if (lp->decr != 0) {
			/*
			 * In this range the cage grows from the highest
			 * address towards the lowest.
			 * Arrange to return pfns from curr to lim-1,
			 * inclusive, in ascending order.
			 */

			pfn = lp->curr;
		} else {
			/*
			 * In this range the cage grows from the lowest
			 * address towards the highest.
			 * Arrange to return pfns from base to curr-1,
			 * inclusive, in ascending order.
			 */

			pfn = lp->base;
		}
	}

	if (lp->decr != 0) {		/* decrementing pfn */
		if (pfn == lp->lim) {
			/* Don't go beyond the static part of the glist. */
			if (lp == kcage_current_glist)
				lp = NULL;
			else
				lp = lp->next;
			pfn = PFN_INVALID;
			goto again;
		}

		ASSERT(pfn >= lp->curr && pfn < lp->lim);
	} else {			/* incrementing pfn */
		if (pfn == lp->curr) {
			/* Don't go beyond the static part of the glist. */
			if (lp == kcage_current_glist)
				lp = NULL;
			else
				lp = lp->next;
			pfn = PFN_INVALID;
			goto again;
		}

		ASSERT(pfn >= lp->base && pfn < lp->curr);
	}

	/* Return the current pfn and step the static cursor past it. */
	return (pfn++);
}
795 
/*
 * Callback functions used to recalculate the cage thresholds after
 * kphysm memory add/delete operations.
 */
/*
 * Post-add callback: recompute the cage thresholds. Both arguments are
 * ignored.
 */
/*ARGSUSED*/
static void
kcage_kphysm_postadd_cb(void *arg, pgcnt_t delta_pages)
{
	kcage_recalc_thresholds();
}
806 
/*
 * Pre-delete callback: always returns 0 and ignores both arguments.
 * Presumably a zero return lets the delete proceed -- confirm against
 * the kphysm_setup_vector_t contract.
 */
/*ARGSUSED*/
static int
kcage_kphysm_predel_cb(void *arg, pgcnt_t delta_pages)
{
	/* TODO: when should cage refuse memory delete requests? */
	return (0);
}
814 
/*
 * Post-delete callback: recompute the cage thresholds. All arguments,
 * including the cancelled flag, are ignored.
 */
/*ARGSUSED*/
static  void
kcage_kphysm_postdel_cb(void *arg, pgcnt_t delta_pages, int cancelled)
{
	kcage_recalc_thresholds();
}
821 
/*
 * Callback vector handed to kphysm_setup_func_register() by kcage_init():
 * version, then the post-add, pre-delete and post-delete callbacks.
 */
static kphysm_setup_vector_t kcage_kphysm_vectors = {
	KPHYSM_SETUP_VECTOR_VERSION,
	kcage_kphysm_postadd_cb,
	kcage_kphysm_predel_cb,
	kcage_kphysm_postdel_cb
};
828 
829 /*
830  * This is called before a CPR suspend and after a CPR resume.  We have to
831  * turn off kcage_cageout_ready before a suspend, and turn it back on after a
832  * restart.
833  */
834 /*ARGSUSED*/
835 static boolean_t
836 kcage_cageout_cpr(void *arg, int code)
837 {
838 	if (code == CB_CODE_CPR_CHKPT) {
839 		ASSERT(kcage_cageout_ready);
840 		kcage_cageout_ready = 0;
841 		return (B_TRUE);
842 	} else if (code == CB_CODE_CPR_RESUME) {
843 		ASSERT(kcage_cageout_ready == 0);
844 		kcage_cageout_ready = 1;
845 		return (B_TRUE);
846 	}
847 	return (B_FALSE);
848 }
849 
850 /*
851  * kcage_recalc_preferred_size() increases initial cage size to improve large
852  * page availability when lp for kmem is enabled and kpr is disabled
853  */
854 static pgcnt_t
855 kcage_recalc_preferred_size(pgcnt_t preferred_size)
856 {
857 	if (SEGKMEM_USE_LARGEPAGES && segkmem_reloc == 0) {
858 		pgcnt_t lpmincage = kcage_kmemlp_mincage;
859 		if (lpmincage == 0) {
860 			lpmincage = MIN(P2ROUNDUP(((physmem * PAGESIZE) / 8),
861 			    segkmem_heaplp_quantum), 0x40000000UL) / PAGESIZE;
862 		}
863 		kcage_kmemlp_mincage = MIN(lpmincage,
864 			    (segkmem_kmemlp_max / PAGESIZE));
865 		preferred_size = MAX(kcage_kmemlp_mincage, preferred_size);
866 	}
867 	return (preferred_size);
868 }
869 
/*
 * Kcage_init() builds the cage and initializes the cage thresholds.
 * The size of the cage is determined by the argument preferred_size
 * or the actual amount of memory, whichever is smaller.
 *
 * Must be called once, with the cage off and the kcage range lock held.
 * On return kcage_on is 1, unless the kphysm callback registration
 * failed, in which case the function returns early with the cage off.
 */
void
kcage_init(pgcnt_t preferred_size)
{
	pgcnt_t wanted;
	pfn_t pfn;
	page_t *pp;
	kstat_t *ksp;

	extern struct vnode kvp;
	extern void page_list_noreloc_startup(page_t *);

	ASSERT(!kcage_on);
	ASSERT(kcage_range_islocked());

	/* increase preferred cage size for lp for kmem */
	preferred_size = kcage_recalc_preferred_size(preferred_size);

	/* Debug note: initialize this now so early expansions can stat */
	KCAGE_STAT_INIT_SCAN_INDEX;

	/*
	 * Initialize cage thresholds and install kphysm callback.
	 * If we can't arrange to have the thresholds track with
	 * available physical memory, then the cage thresholds may
	 * end up over time at levels that adversely affect system
	 * performance; so, bail out.
	 */
	kcage_recalc_thresholds();
	if (kphysm_setup_func_register(&kcage_kphysm_vectors, NULL)) {
		ASSERT(0);		/* Catch this in DEBUG kernels. */
		return;
	}

	/*
	 * Limit startup cage size within the range of kcage_minfree
	 * and availrmem, inclusively.
	 */
	wanted = MIN(MAX(preferred_size, kcage_minfree), availrmem);

	/*
	 * Construct the cage. PFNs are allocated from the glist. It
	 * is assumed that the list has been properly ordered for the
	 * platform by the platform code. Typically, this is as simple
	 * as calling kcage_range_init(phys_avail, decr), where decr is
	 * 1 if the kernel has been loaded into upper end of physical
	 * memory, or 0 if the kernel has been loaded at the low end.
	 *
	 * Note: it is assumed that we are in the startup flow, so there
	 * is no reason to grab the page lock.
	 */
	kcage_freemem = 0;
	/*
	 * NOTE(review): no alignment test is visible in this function;
	 * this store is overwritten by the first kcage_get_pfn() call.
	 */
	pfn = PFN_INVALID;			/* prime for alignment test */
	while (wanted != 0) {
		/* Stop early if the growth list runs out of pfns. */
		if ((pfn = kcage_get_pfn()) == PFN_INVALID)
			break;

		if ((pp = page_numtopp_nolock(pfn)) != NULL) {
			KCAGEPAGETS_INC();
			/*
			 * Set the noreloc state on the page.
			 * If the page is free and not already
			 * on the noreloc list then move it.
			 */
			if (PP_ISFREE(pp)) {
				if (PP_ISNORELOC(pp) == 0)
					page_list_noreloc_startup(pp);
			} else {
				ASSERT(pp->p_szc == 0);
				PP_SETNORELOC(pp);
			}
		}
		/*
		 * NOTE(review): pp is NULL here when page_numtopp_nolock()
		 * found no page for pfn -- confirm that every platform's
		 * PLCNT_XFER_NORELOC() tolerates a NULL argument.
		 */
		PLCNT_XFER_NORELOC(pp);
		/* A pfn without a page_t still counts against wanted. */
		wanted -= 1;
	}

	/*
	 * Need to go through and find kernel allocated pages
	 * and capture them into the Cage.  These will primarily
	 * be pages gotten through boot_alloc().
	 */
	if (kvp.v_pages) {

		/* v_pages is a circular list linked through p_vpnext. */
		pp = kvp.v_pages;
		do {
			ASSERT(!PP_ISFREE(pp));
			ASSERT(pp->p_szc == 0);
			PP_SETNORELOC(pp);
		} while ((pp = pp->p_vpnext) != kvp.v_pages);

	}

	kcage_on = 1;

	/*
	 * CB_CL_CPR_POST_KERNEL is the class that executes from cpr_suspend()
	 * after the cageout thread is blocked, and executes from cpr_resume()
	 * before the cageout thread is restarted.  By executing in this class,
	 * we are assured that the kernel cage thread won't miss wakeup calls
	 * and also CPR's larger kmem_alloc requests will not fail after
	 * CPR shuts down the cageout kernel thread.
	 */
	(void) callb_add(kcage_cageout_cpr, NULL, CB_CL_CPR_POST_KERNEL,
	    "cageout");

	/*
	 * Coalesce pages to improve large page availability. A better fix
	 * would be to coalesce pages as they are included in the cage.
	 */
	if (SEGKMEM_USE_LARGEPAGES) {
		extern void page_freelist_coalesce_all(int mnode);
		extern int max_mem_nodes;
		int mnode, max_mnodes = max_mem_nodes;
		for (mnode = 0; mnode < max_mnodes; mnode++) {
			page_freelist_coalesce_all(mnode);
		}
	}

	/*
	 * Export the caged ranges as a variable-size raw kstat whose
	 * contents are produced on demand by the update/snapshot callbacks.
	 * Failure to create the kstat is not treated as an error.
	 */
	ksp = kstat_create("kcage", 0, "kcage_page_list", "misc",
	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
	if (ksp != NULL) {
		ksp->ks_update = kcage_kstat_update;
		ksp->ks_snapshot = kcage_kstat_snapshot;
		ksp->ks_lock = &kcage_kstat_lock; /* XXX - not really needed */
		kstat_install(ksp);
	}

}
1002 
1003 static int
1004 kcage_kstat_update(kstat_t *ksp, int rw)
1005 {
1006 	struct kcage_glist *lp;
1007 	uint_t count;
1008 
1009 	if (rw == KSTAT_WRITE)
1010 		return (EACCES);
1011 
1012 	count = 0;
1013 	kcage_range_lock();
1014 	for (lp = kcage_glist; lp != NULL; lp = lp->next) {
1015 		if (lp->decr) {
1016 			if (lp->curr != lp->lim) {
1017 				count++;
1018 			}
1019 		} else {
1020 			if (lp->curr != lp->base) {
1021 				count++;
1022 			}
1023 		}
1024 	}
1025 	kcage_range_unlock();
1026 
1027 	ksp->ks_ndata = count;
1028 	ksp->ks_data_size = count * 2 * sizeof (uint64_t);
1029 
1030 	return (0);
1031 }
1032 
1033 static int
1034 kcage_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
1035 {
1036 	struct kcage_glist *lp;
1037 	struct memunit {
1038 		uint64_t address;
1039 		uint64_t size;
1040 	} *kspmem;
1041 
1042 	if (rw == KSTAT_WRITE)
1043 		return (EACCES);
1044 
1045 	ksp->ks_snaptime = gethrtime();
1046 
1047 	kspmem = (struct memunit *)buf;
1048 	kcage_range_lock();
1049 	for (lp = kcage_glist; lp != NULL; lp = lp->next, kspmem++) {
1050 		if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
1051 			break;
1052 
1053 		if (lp->decr) {
1054 			if (lp->curr != lp->lim) {
1055 				kspmem->address = ptob(lp->curr);
1056 				kspmem->size = ptob(lp->lim - lp->curr);
1057 			}
1058 		} else {
1059 			if (lp->curr != lp->base) {
1060 				kspmem->address = ptob(lp->base);
1061 				kspmem->size = ptob(lp->curr - lp->base);
1062 			}
1063 		}
1064 	}
1065 	kcage_range_unlock();
1066 
1067 	return (0);
1068 }
1069 
1070 void
1071 kcage_recalc_thresholds()
1072 {
1073 	static int first = 1;
1074 	static pgcnt_t init_lotsfree;
1075 	static pgcnt_t init_desfree;
1076 	static pgcnt_t init_minfree;
1077 	static pgcnt_t init_throttlefree;
1078 	static pgcnt_t init_reserve;
1079 
1080 	/* TODO: any reason to take more care than this with live editing? */
1081 	mutex_enter(&kcage_cageout_mutex);
1082 	mutex_enter(&freemem_lock);
1083 
1084 	if (first) {
1085 		first = 0;
1086 		init_lotsfree = kcage_lotsfree;
1087 		init_desfree = kcage_desfree;
1088 		init_minfree = kcage_minfree;
1089 		init_throttlefree = kcage_throttlefree;
1090 		init_reserve = kcage_reserve;
1091 	} else {
1092 		kcage_lotsfree = init_lotsfree;
1093 		kcage_desfree = init_desfree;
1094 		kcage_minfree = init_minfree;
1095 		kcage_throttlefree = init_throttlefree;
1096 		kcage_reserve = init_reserve;
1097 	}
1098 
1099 	if (kcage_lotsfree == 0)
1100 		kcage_lotsfree = MAX(32, total_pages / 256);
1101 
1102 	if (kcage_minfree == 0)
1103 		kcage_minfree = MAX(32, kcage_lotsfree / 2);
1104 
1105 	if (kcage_desfree == 0)
1106 		kcage_desfree = MAX(32, kcage_minfree);
1107 
1108 	if (kcage_throttlefree == 0)
1109 		kcage_throttlefree = MAX(32, kcage_minfree / 2);
1110 
1111 	if (kcage_reserve == 0)
1112 		kcage_reserve = MIN(32, kcage_throttlefree / 2);
1113 
1114 	mutex_exit(&freemem_lock);
1115 	mutex_exit(&kcage_cageout_mutex);
1116 
1117 	if (kcage_cageout_ready) {
1118 		if (kcage_freemem < kcage_desfree)
1119 			kcage_cageout_wakeup();
1120 
1121 		if (kcage_needfree) {
1122 			mutex_enter(&kcage_throttle_mutex);
1123 			cv_broadcast(&kcage_throttle_cv);
1124 			mutex_exit(&kcage_throttle_mutex);
1125 		}
1126 	}
1127 }
1128 
1129 /*
1130  * Pageout interface:
1131  * kcage_cageout_init()
1132  */
1133 void
1134 kcage_cageout_init()
1135 {
1136 	if (kcage_on) {
1137 
1138 		(void) thread_create(NULL, 0, kcage_cageout,
1139 		    NULL, 0, proc_pageout, TS_RUN, maxclsyspri - 1);
1140 	}
1141 }
1142 
1143 
1144 /*
1145  * VM Interfaces:
1146  * kcage_create_throttle()
1147  * kcage_freemem_add()
1148  * kcage_freemem_sub()
1149  */
1150 
1151 /*
1152  * Wakeup cageout thread and throttle waiting for the number of pages
1153  * requested to become available.  For non-critical requests, a
1154  * timeout is added, since freemem accounting is separate from cage
1155  * freemem accounting: it's possible for us to get stuck and not make
1156  * forward progress even though there was sufficient freemem before
1157  * arriving here.
1158  */
1159 int
1160 kcage_create_throttle(pgcnt_t npages, int flags)
1161 {
1162 	int niter = 0;
1163 	pgcnt_t lastfree;
1164 	int enough = kcage_freemem > kcage_throttlefree + npages;
1165 
1166 	KCAGE_STAT_INCR(kct_calls);		/* unprotected incr. */
1167 
1168 	kcage_cageout_wakeup();			/* just to be sure */
1169 	KCAGE_STAT_INCR(kct_cagewake);		/* unprotected incr. */
1170 
1171 	/*
1172 	 * Obviously, we can't throttle the cageout thread since
1173 	 * we depend on it.  We also can't throttle the panic thread.
1174 	 */
1175 	if (curthread == kcage_cageout_thread || panicstr) {
1176 		KCAGE_STAT_INCR(kct_cageout);	/* unprotected incr. */
1177 		return (KCT_CRIT);
1178 	}
1179 
1180 	/*
1181 	 * Don't throttle threads which are critical for proper
1182 	 * vm management if we're above kcage_throttlefree or
1183 	 * if freemem is very low.
1184 	 */
1185 	if (NOMEMWAIT()) {
1186 		if (enough) {
1187 			KCAGE_STAT_INCR(kct_exempt);	/* unprotected incr. */
1188 			return (KCT_CRIT);
1189 		} else if (freemem < minfree) {
1190 			KCAGE_STAT_INCR(kct_critical);  /* unprotected incr. */
1191 			return (KCT_CRIT);
1192 		}
1193 	}
1194 
1195 	/*
1196 	 * Don't throttle real-time threads if kcage_freemem > kcage_reserve.
1197 	 */
1198 	if (DISP_PRIO(curthread) > maxclsyspri &&
1199 	    kcage_freemem > kcage_reserve) {
1200 		KCAGE_STAT_INCR(kct_exempt);	/* unprotected incr. */
1201 		return (KCT_CRIT);
1202 	}
1203 
1204 	/*
1205 	 * Cause all other threads (which are assumed to not be
1206 	 * critical to cageout) to wait here until their request
1207 	 * can be satisfied. Be a little paranoid and wake the
1208 	 * kernel cage on each loop through this logic.
1209 	 */
1210 	while (kcage_freemem < kcage_throttlefree + npages) {
1211 		ASSERT(kcage_on);
1212 
1213 		lastfree = kcage_freemem;
1214 
1215 		if (kcage_cageout_ready) {
1216 			mutex_enter(&kcage_throttle_mutex);
1217 
1218 			kcage_needfree += npages;
1219 			KCAGE_STAT_INCR(kct_wait);
1220 
1221 			kcage_cageout_wakeup();
1222 			KCAGE_STAT_INCR(kct_cagewake);
1223 
1224 			cv_wait(&kcage_throttle_cv, &kcage_throttle_mutex);
1225 
1226 			kcage_needfree -= npages;
1227 
1228 			mutex_exit(&kcage_throttle_mutex);
1229 		} else {
1230 			/*
1231 			 * NOTE: atomics are used just in case we enter
1232 			 * mp operation before the cageout thread is ready.
1233 			 */
1234 			atomic_add_long(&kcage_needfree, npages);
1235 
1236 			kcage_cageout_wakeup();
1237 			KCAGE_STAT_INCR(kct_cagewake);	/* unprotected incr. */
1238 
1239 			atomic_add_long(&kcage_needfree, -npages);
1240 		}
1241 
1242 		if ((flags & PG_WAIT) == 0) {
1243 			if (kcage_freemem > lastfree) {
1244 				KCAGE_STAT_INCR(kct_progress);
1245 				niter = 0;
1246 			} else {
1247 				KCAGE_STAT_INCR(kct_noprogress);
1248 				if (++niter >= kcage_maxwait) {
1249 					KCAGE_STAT_INCR(kct_timeout);
1250 					return (KCT_FAILURE);
1251 				}
1252 			}
1253 		}
1254 	}
1255 	return (KCT_NONCRIT);
1256 }
1257 
1258 void
1259 kcage_freemem_add(pgcnt_t npages)
1260 {
1261 	extern void wakeup_pcgs(void);
1262 
1263 	atomic_add_long(&kcage_freemem, npages);
1264 
1265 	wakeup_pcgs();  /* wakeup threads in pcgs() */
1266 
1267 	if (kcage_needfree != 0 &&
1268 		kcage_freemem >= (kcage_throttlefree + kcage_needfree)) {
1269 
1270 		mutex_enter(&kcage_throttle_mutex);
1271 		cv_broadcast(&kcage_throttle_cv);
1272 		KCAGE_STAT_INCR(kfa_trottlewake);
1273 		mutex_exit(&kcage_throttle_mutex);
1274 	}
1275 }
1276 
1277 void
1278 kcage_freemem_sub(pgcnt_t npages)
1279 {
1280 	atomic_add_long(&kcage_freemem, -npages);
1281 
1282 	if (kcage_freemem < kcage_desfree) {
1283 		kcage_cageout_wakeup();
1284 		KCAGE_STAT_INCR(kfs_cagewake); /* unprotected incr. */
1285 	}
1286 }
1287 
1288 /*
1289  * return 0 on failure and 1 on success.
1290  */
1291 static int
1292 kcage_setnoreloc_pages(page_t *rootpp, se_t se)
1293 {
1294 	pgcnt_t npgs, i;
1295 	page_t *pp;
1296 	pfn_t rootpfn = page_pptonum(rootpp);
1297 	uint_t szc;
1298 
1299 	ASSERT(!PP_ISFREE(rootpp));
1300 	ASSERT(PAGE_LOCKED_SE(rootpp, se));
1301 	if (!group_page_trylock(rootpp, se)) {
1302 		return (0);
1303 	}
1304 	szc = rootpp->p_szc;
1305 	if (szc == 0) {
1306 		/*
1307 		 * The szc of a locked page can only change for pages that are
1308 		 * non-swapfs (i.e. anonymous memory) file system pages.
1309 		 */
1310 		ASSERT(rootpp->p_vnode != NULL &&
1311 		    !PP_ISKAS(rootpp) &&
1312 		    !IS_SWAPFSVP(rootpp->p_vnode));
1313 		PP_SETNORELOC(rootpp);
1314 		return (1);
1315 	}
1316 	npgs = page_get_pagecnt(szc);
1317 	ASSERT(IS_P2ALIGNED(rootpfn, npgs));
1318 	pp = rootpp;
1319 	for (i = 0; i < npgs; i++, pp++) {
1320 		ASSERT(PAGE_LOCKED_SE(pp, se));
1321 		ASSERT(!PP_ISFREE(pp));
1322 		ASSERT(pp->p_szc == szc);
1323 		PP_SETNORELOC(pp);
1324 	}
1325 	group_page_unlock(rootpp);
1326 	return (1);
1327 }
1328 
1329 /*
1330  * Attempt to convert page to a caged page (set the P_NORELOC flag).
1331  * If successful and pages is free, move page to the tail of whichever
1332  * list it is on.
1333  * Returns:
1334  *   EBUSY  page already locked, assimilated but not free.
1335  *   ENOMEM page assimilated, but memory too low to relocate. Page not free.
1336  *   EAGAIN page not assimilated. Page not free.
1337  *   ERANGE page assimilated. Page not root.
1338  *   0      page assimilated. Page free.
1339  *   *nfreedp number of pages freed.
1340  * NOTE: With error codes ENOMEM, EBUSY, and 0 (zero), there is no way
1341  * to distinguish between a page that was already a NORELOC page from
1342  * those newly converted to NORELOC pages by this invocation of
1343  * kcage_assimilate_page.
1344  */
1345 static int
1346 kcage_assimilate_page(page_t *pp, pgcnt_t *nfreedp)
1347 {
1348 	if (page_trylock(pp, SE_EXCL)) {
1349 		if (PP_ISNORELOC(pp)) {
1350 check_free_and_return:
1351 			if (PP_ISFREE(pp)) {
1352 				page_unlock(pp);
1353 				*nfreedp = 0;
1354 				return (0);
1355 			} else {
1356 				page_unlock(pp);
1357 				return (EBUSY);
1358 			}
1359 			/*NOTREACHED*/
1360 		}
1361 	} else {
1362 		if (page_trylock(pp, SE_SHARED)) {
1363 			if (PP_ISNORELOC(pp))
1364 				goto check_free_and_return;
1365 		} else
1366 			return (EAGAIN);
1367 
1368 		if (!PP_ISFREE(pp)) {
1369 			page_unlock(pp);
1370 			return (EAGAIN);
1371 		}
1372 
1373 		/*
1374 		 * Need to upgrade the lock on it and set the NORELOC
1375 		 * bit. If it is free then remove it from the free
1376 		 * list so that the platform free list code can keep
1377 		 * NORELOC pages where they should be.
1378 		 */
1379 		/*
1380 		 * Before doing anything, get the exclusive lock.
1381 		 * This may fail (eg ISM pages are left shared locked).
1382 		 * If the page is free this will leave a hole in the
1383 		 * cage. There is no solution yet to this.
1384 		 */
1385 		if (!page_tryupgrade(pp)) {
1386 			page_unlock(pp);
1387 			return (EAGAIN);
1388 		}
1389 	}
1390 
1391 	ASSERT(PAGE_EXCL(pp));
1392 
1393 	if (PP_ISFREE(pp)) {
1394 		int which = PP_ISAGED(pp) ? PG_FREE_LIST : PG_CACHE_LIST;
1395 
1396 		page_list_sub(pp, which);
1397 		ASSERT(pp->p_szc == 0);
1398 		PP_SETNORELOC(pp);
1399 		PLCNT_XFER_NORELOC(pp);
1400 		page_list_add(pp, which | PG_LIST_TAIL);
1401 
1402 		page_unlock(pp);
1403 		*nfreedp = 1;
1404 		return (0);
1405 	} else {
1406 		if (pp->p_szc != 0) {
1407 			if (!kcage_setnoreloc_pages(pp, SE_EXCL)) {
1408 				page_unlock(pp);
1409 				return (EAGAIN);
1410 			}
1411 			ASSERT(PP_ISNORELOC(pp));
1412 		} else {
1413 			PP_SETNORELOC(pp);
1414 		}
1415 		PLCNT_XFER_NORELOC(pp);
1416 		return (kcage_invalidate_page(pp, nfreedp));
1417 	}
1418 	/*NOTREACHED*/
1419 }
1420 
1421 static int
1422 kcage_expand()
1423 {
1424 	int did_something = 0;
1425 
1426 	spgcnt_t wanted;
1427 	pfn_t pfn;
1428 	page_t *pp;
1429 	/* TODO: we don't really need n any more? */
1430 	pgcnt_t n;
1431 	pgcnt_t nf, nfreed;
1432 
1433 	/*
1434 	 * Expand the cage if available cage memory is really low. Calculate
1435 	 * the amount required to return kcage_freemem to the level of
1436 	 * kcage_lotsfree, or to satisfy throttled requests, whichever is
1437 	 * more.  It is rare for their sum to create an artificial threshold
1438 	 * above kcage_lotsfree, but it is possible.
1439 	 *
1440 	 * Exit early if expansion amount is equal to or less than zero.
1441 	 * (<0 is possible if kcage_freemem rises suddenly.)
1442 	 *
1443 	 * Exit early when the global page pool (apparently) does not
1444 	 * have enough free pages to page_relocate() even a single page.
1445 	 */
1446 	wanted = MAX(kcage_lotsfree, kcage_throttlefree + kcage_needfree)
1447 		- kcage_freemem;
1448 	if (wanted <= 0)
1449 		return (0);
1450 	else if (freemem < pageout_reserve + 1) {
1451 		KCAGE_STAT_INCR(ke_lowfreemem);
1452 		return (0);
1453 	}
1454 
1455 	/*
1456 	 * Try to get the range list reader lock. If the lock is already
1457 	 * held, then don't get stuck here waiting for it.
1458 	 */
1459 	if (!rw_tryenter(&kcage_range_rwlock, RW_READER))
1460 		return (0);
1461 
1462 	KCAGE_STAT_INCR(ke_calls);
1463 	KCAGE_STAT_SET_SCAN(ke_wanted, (uint_t)wanted);
1464 
1465 	/*
1466 	 * Assimilate more pages from the global page pool into the cage.
1467 	 */
1468 	n = 0;				/* number of pages PP_SETNORELOC'd */
1469 	nf = 0;				/* number of those actually free */
1470 	while (kcage_on && nf < wanted) {
1471 		pfn = kcage_get_pfn();
1472 		if (pfn == PFN_INVALID) {	/* eek! no where to grow */
1473 			KCAGE_STAT_INCR(ke_nopfn);
1474 			goto terminate;
1475 		}
1476 
1477 		KCAGE_STAT_INCR_SCAN(ke_examined);
1478 
1479 		if ((pp = page_numtopp_nolock(pfn)) == NULL) {
1480 			KCAGE_STAT_INCR(ke_nopaget);
1481 			continue;
1482 		}
1483 		KCAGEPAGETS_INC();
1484 		/*
1485 		 * Sanity check. Skip this pfn if it is
1486 		 * being deleted.
1487 		 */
1488 		if (pfn_is_being_deleted(pfn)) {
1489 			KCAGE_STAT_INCR(ke_deleting);
1490 			continue;
1491 		}
1492 
1493 		/*
1494 		 * NORELOC is only set at boot-time or by this routine
1495 		 * under the kcage_range_rwlock lock which is currently
1496 		 * held. This means we can do a fast check here before
1497 		 * locking the page in kcage_assimilate_page.
1498 		 */
1499 		if (PP_ISNORELOC(pp)) {
1500 			KCAGE_STAT_INCR(ke_isnoreloc);
1501 			continue;
1502 		}
1503 
1504 		switch (kcage_assimilate_page(pp, &nfreed)) {
1505 			case 0:		/* assimilated, page is free */
1506 				KCAGE_STAT_NINCR_SCAN(ke_gotonefree, nfreed);
1507 				did_something = 1;
1508 				nf += nfreed;
1509 				n++;
1510 				break;
1511 
1512 			case EBUSY:	/* assimilated, page not free */
1513 			case ERANGE:	/* assimilated, page not root */
1514 				KCAGE_STAT_INCR_SCAN(ke_gotone);
1515 				did_something = 1;
1516 				n++;
1517 				break;
1518 
1519 			case ENOMEM:	/* assimilated, but no mem */
1520 				KCAGE_STAT_INCR(ke_terminate);
1521 				did_something = 1;
1522 				n++;
1523 				goto terminate;
1524 
1525 			case EAGAIN:	/* can't assimilate */
1526 				KCAGE_STAT_INCR_SCAN(ke_lefthole);
1527 				break;
1528 
1529 			default:	/* catch this with debug kernels */
1530 				ASSERT(0);
1531 				break;
1532 		}
1533 	}
1534 
1535 	/*
1536 	 * Realign cage edge with the nearest physical address
1537 	 * boundry for big pages. This is done to give us a
1538 	 * better chance of actually getting usable big pages
1539 	 * in the cage.
1540 	 */
1541 
1542 terminate:
1543 	kcage_range_unlock();
1544 
1545 	return (did_something);
1546 }
1547 
1548 /*
1549  * Relocate page opp (Original Page Pointer) from cage pool to page rpp
1550  * (Replacement Page Pointer) in the global pool. Page opp will be freed
1551  * if relocation is successful, otherwise it is only unlocked.
1552  * On entry, page opp must be exclusively locked and not free.
1553  * *nfreedp: number of pages freed.
1554  */
1555 static int
1556 kcage_relocate_page(page_t *pp, pgcnt_t *nfreedp)
1557 {
1558 	page_t *opp = pp;
1559 	page_t *rpp = NULL;
1560 	spgcnt_t npgs;
1561 	int result;
1562 
1563 	ASSERT(!PP_ISFREE(opp));
1564 	ASSERT(PAGE_EXCL(opp));
1565 
1566 	result = page_relocate(&opp, &rpp, 1, 1, &npgs, NULL);
1567 	*nfreedp = npgs;
1568 	if (result == 0) {
1569 		while (npgs-- > 0) {
1570 			page_t *tpp;
1571 
1572 			ASSERT(rpp != NULL);
1573 			tpp = rpp;
1574 			page_sub(&rpp, tpp);
1575 			page_unlock(tpp);
1576 		}
1577 
1578 		ASSERT(rpp == NULL);
1579 
1580 		return (0);		/* success */
1581 	}
1582 
1583 	page_unlock(opp);
1584 	return (result);
1585 }
1586 
1587 /*
1588  * Based on page_invalidate_pages()
1589  *
1590  * Kcage_invalidate_page() uses page_relocate() twice. Both instances
1591  * of use must be updated to match the new page_relocate() when it
1592  * becomes available.
1593  *
1594  * Return result of kcage_relocate_page or zero if page was directly freed.
1595  * *nfreedp: number of pages freed.
1596  */
1597 static int
1598 kcage_invalidate_page(page_t *pp, pgcnt_t *nfreedp)
1599 {
1600 	int result;
1601 
1602 #if defined(__sparc)
1603 	extern struct vnode prom_ppages;
1604 	ASSERT(pp->p_vnode != &prom_ppages);
1605 #endif /* __sparc */
1606 
1607 	ASSERT(!PP_ISFREE(pp));
1608 	ASSERT(PAGE_EXCL(pp));
1609 
1610 	/*
1611 	 * Is this page involved in some I/O? shared?
1612 	 * The page_struct_lock need not be acquired to
1613 	 * examine these fields since the page has an
1614 	 * "exclusive" lock.
1615 	 */
1616 	if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1617 		result = kcage_relocate_page(pp, nfreedp);
1618 #ifdef KCAGE_STATS
1619 		if (result == 0)
1620 			KCAGE_STAT_INCR_SCAN(kip_reloclocked);
1621 		else if (result == ENOMEM)
1622 			KCAGE_STAT_INCR_SCAN(kip_nomem);
1623 #endif
1624 		return (result);
1625 	}
1626 
1627 	ASSERT(pp->p_vnode->v_type != VCHR);
1628 
1629 	/*
1630 	 * Unload the mappings and check if mod bit is set.
1631 	 */
1632 	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1633 
1634 	if (hat_ismod(pp)) {
1635 		result = kcage_relocate_page(pp, nfreedp);
1636 #ifdef KCAGE_STATS
1637 		if (result == 0)
1638 			KCAGE_STAT_INCR_SCAN(kip_relocmod);
1639 		else if (result == ENOMEM)
1640 			KCAGE_STAT_INCR_SCAN(kip_nomem);
1641 #endif
1642 		return (result);
1643 	}
1644 
1645 	if (!page_try_demote_pages(pp)) {
1646 		KCAGE_STAT_INCR_SCAN(kip_demotefailed);
1647 		page_unlock(pp);
1648 		return (EAGAIN);
1649 	}
1650 
1651 	page_destroy(pp, 0);
1652 	KCAGE_STAT_INCR_SCAN(kip_destroy);
1653 	*nfreedp = 1;
1654 	return (0);
1655 }
1656 
1657 static void
1658 kcage_cageout()
1659 {
1660 	pfn_t pfn;
1661 	page_t *pp;
1662 	callb_cpr_t cprinfo;
1663 	int did_something;
1664 	int scan_again;
1665 	pfn_t start_pfn;
1666 	int pass;
1667 	int last_pass;
1668 	int pages_skipped;
1669 	int shared_skipped;
1670 	uint_t shared_level = 8;
1671 	pgcnt_t nfreed;
1672 #ifdef KCAGE_STATS
1673 	clock_t scan_start;
1674 #endif
1675 
1676 	CALLB_CPR_INIT(&cprinfo, &kcage_cageout_mutex,
1677 		callb_generic_cpr, "cageout");
1678 
1679 	mutex_enter(&kcage_cageout_mutex);
1680 	kcage_cageout_thread = curthread;
1681 
1682 	pfn = PFN_INVALID;		/* force scan reset */
1683 	start_pfn = PFN_INVALID;	/* force init with 1st cage pfn */
1684 	kcage_cageout_ready = 1;	/* switch kcage_cageout_wakeup mode */
1685 
1686 loop:
1687 	/*
1688 	 * Wait here. Sooner or later, kcage_freemem_sub() will notice
1689 	 * that kcage_freemem is less than kcage_desfree. When it does
1690 	 * notice, kcage_freemem_sub() will wake us up via call to
1691 	 * kcage_cageout_wakeup().
1692 	 */
1693 	CALLB_CPR_SAFE_BEGIN(&cprinfo);
1694 	cv_wait(&kcage_cageout_cv, &kcage_cageout_mutex);
1695 	CALLB_CPR_SAFE_END(&cprinfo, &kcage_cageout_mutex);
1696 
1697 	KCAGE_STAT_INCR(kt_wakeups);
1698 	KCAGE_STAT_SET_SCAN(kt_freemem_start, freemem);
1699 	KCAGE_STAT_SET_SCAN(kt_kcage_freemem_start, kcage_freemem);
1700 	pass = 0;
1701 	last_pass = 0;
1702 
1703 #ifdef KCAGE_STATS
1704 	scan_start = lbolt;
1705 #endif
1706 
1707 again:
1708 	if (!kcage_on)
1709 		goto loop;
1710 
1711 	KCAGE_STAT_INCR(kt_scans);
1712 	KCAGE_STAT_INCR_SCAN(kt_passes);
1713 
1714 	did_something = 0;
1715 	pages_skipped = 0;
1716 	shared_skipped = 0;
1717 	while ((kcage_freemem < kcage_lotsfree || kcage_needfree) &&
1718 		(pfn = kcage_walk_cage(pfn == PFN_INVALID)) != PFN_INVALID) {
1719 
1720 		if (start_pfn == PFN_INVALID)
1721 			start_pfn = pfn;
1722 		else if (start_pfn == pfn) {
1723 			last_pass = pass;
1724 			pass += 1;
1725 			/*
1726 			 * Did a complete walk of kernel cage, but didn't free
1727 			 * any pages.  If only one cpu is online then
1728 			 * stop kernel cage walk and try expanding.
1729 			 */
1730 			if (ncpus_online == 1 && did_something == 0) {
1731 				KCAGE_STAT_INCR(kt_cageout_break);
1732 				break;
1733 			}
1734 		}
1735 
1736 		pp = page_numtopp_nolock(pfn);
1737 		if (pp == NULL) {
1738 			continue;
1739 		}
1740 
1741 		KCAGE_STAT_INCR_SCAN(kt_examined);
1742 
1743 		/*
1744 		 * Do a quick PP_ISNORELOC() and PP_ISFREE test outside
1745 		 * of the lock. If one is missed it will be seen next
1746 		 * time through.
1747 		 *
1748 		 * Skip non-caged-pages. These pages can exist in the cage
1749 		 * because, if during cage expansion, a page is
1750 		 * encountered that is long-term locked the lock prevents the
1751 		 * expansion logic from setting the P_NORELOC flag. Hence,
1752 		 * non-caged-pages surrounded by caged-pages.
1753 		 */
1754 		if (!PP_ISNORELOC(pp)) {
1755 			switch (kcage_assimilate_page(pp, &nfreed)) {
1756 				case 0:
1757 					did_something = 1;
1758 					KCAGE_STAT_NINCR_SCAN(kt_gotonefree,
1759 					    nfreed);
1760 					break;
1761 
1762 				case EBUSY:
1763 				case ERANGE:
1764 					did_something = 1;
1765 					KCAGE_STAT_INCR_SCAN(kt_gotone);
1766 					break;
1767 
1768 				case EAGAIN:
1769 				case ENOMEM:
1770 					break;
1771 
1772 				default:
1773 					/* catch this with debug kernels */
1774 					ASSERT(0);
1775 					break;
1776 			}
1777 
1778 			continue;
1779 		} else {
1780 			int prm;
1781 
1782 			if (PP_ISFREE(pp)) {
1783 				continue;
1784 			}
1785 
1786 			if ((PP_ISKAS(pp) && pp->p_lckcnt > 0) ||
1787 			    !page_trylock(pp, SE_EXCL)) {
1788 				KCAGE_STAT_INCR_SCAN(kt_cantlock);
1789 				continue;
1790 			}
1791 
1792 			/* P_NORELOC bit should not have gone away. */
1793 			ASSERT(PP_ISNORELOC(pp));
1794 			if (PP_ISFREE(pp) || (PP_ISKAS(pp) &&
1795 			    pp->p_lckcnt > 0)) {
1796 				page_unlock(pp);
1797 				continue;
1798 			}
1799 
1800 			KCAGE_STAT_SET_SCAN(kt_skiplevel, shared_level);
1801 			if (hat_page_getshare(pp) > shared_level) {
1802 				page_unlock(pp);
1803 				pages_skipped = 1;
1804 				shared_skipped = 1;
1805 				KCAGE_STAT_INCR_SCAN(kt_skipshared);
1806 				continue;
1807 			}
1808 
1809 			/*
1810 			 * In pass {0, 1}, skip page if ref bit is set.
1811 			 * In pass {0, 1, 2}, skip page if mod bit is set.
1812 			 */
1813 			prm = hat_pagesync(pp,
1814 				HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD);
1815 
1816 			/* On first pass ignore ref'd pages */
1817 			if (pass <= 1 && (prm & P_REF)) {
1818 				KCAGE_STAT_INCR_SCAN(kt_skiprefd);
1819 				pages_skipped = 1;
1820 				page_unlock(pp);
1821 				continue;
1822 			}
1823 
1824 			/* On pass 2, page_destroy if mod bit is not set */
1825 			if (pass <= 2) {
1826 				if (pp->p_szc != 0 || (prm & P_MOD) ||
1827 					pp->p_lckcnt || pp->p_cowcnt) {
1828 					pages_skipped = 1;
1829 					page_unlock(pp);
1830 				} else {
1831 
1832 					/*
1833 					 * unload the mappings before
1834 					 * checking if mod bit is set
1835 					 */
1836 					(void) hat_pageunload(pp,
1837 						HAT_FORCE_PGUNLOAD);
1838 
1839 					/*
1840 					 * skip this page if modified
1841 					 */
1842 					if (hat_ismod(pp)) {
1843 						pages_skipped = 1;
1844 						page_unlock(pp);
1845 						continue;
1846 					}
1847 
1848 					KCAGE_STAT_INCR_SCAN(kt_destroy);
1849 					page_destroy(pp, 0);
1850 					did_something = 1;
1851 				}
1852 				continue;
1853 			}
1854 
1855 			if (kcage_invalidate_page(pp, &nfreed) == 0) {
1856 				did_something = 1;
1857 				KCAGE_STAT_NINCR_SCAN(kt_gotonefree, nfreed);
1858 			}
1859 
1860 			/*
1861 			 * No need to drop the page lock here.
1862 			 * Kcage_invalidate_page has done that for us
1863 			 * either explicitly or through a page_free.
1864 			 */
1865 		}
1866 	}
1867 
1868 	/*
1869 	 * Expand the cage only if available cage memory is really low.
1870 	 * This test is done only after a complete scan of the cage.
1871 	 * The reason for not checking and expanding more often is to
1872 	 * avoid rapid expansion of the cage. Naturally, scanning the
1873 	 * cage takes time. So by scanning first, we use that work as a
1874 	 * delay loop in between expand decisions.
1875 	 */
1876 
1877 	scan_again = 0;
1878 	if (kcage_freemem < kcage_minfree || kcage_needfree) {
1879 		/*
1880 		 * Kcage_expand() will return a non-zero value if it was
1881 		 * able to expand the cage -- whether or not the new
1882 		 * pages are free and immediately usable. If non-zero,
1883 		 * we do another scan of the cage. The pages might be
1884 		 * freed during that scan or by time we get back here.
1885 		 * If not, we will attempt another expansion.
1886 		 * However, if kcage_expand() returns zero, then it was
1887 		 * unable to expand the cage. This is the case when the
1888 		 * the growth list is exausted, therefore no work was done
1889 		 * and there is no reason to scan the cage again.
1890 		 * Note: Kernel cage scan is not repeated on single-cpu
1891 		 * system to avoid kernel cage thread hogging cpu.
1892 		 */
1893 		if (pass <= 3 && pages_skipped && ncpus_online > 1)
1894 			scan_again = 1;
1895 		else
1896 			(void) kcage_expand(); /* don't scan again */
1897 	} else if (kcage_freemem < kcage_lotsfree) {
1898 		/*
1899 		 * If available cage memory is less than abundant
1900 		 * and a full scan of the cage has not yet been completed,
1901 		 * or a scan has completed and some work was performed,
1902 		 * or pages were skipped because of sharing,
1903 		 * or we simply have not yet completed two passes,
1904 		 * then do another scan.
1905 		 */
1906 		if (pass <= 2 && pages_skipped)
1907 			scan_again = 1;
1908 		if (pass == last_pass || did_something)
1909 			scan_again = 1;
1910 		else if (shared_skipped && shared_level < (8<<24)) {
1911 			shared_level <<= 1;
1912 			scan_again = 1;
1913 		}
1914 	}
1915 
1916 	if (scan_again && ncpus_online > 1)
1917 		goto again;
1918 	else {
1919 		if (shared_level > 8)
1920 			shared_level >>= 1;
1921 
1922 		KCAGE_STAT_SET_SCAN(kt_freemem_end, freemem);
1923 		KCAGE_STAT_SET_SCAN(kt_kcage_freemem_end, kcage_freemem);
1924 		KCAGE_STAT_SET_SCAN(kt_ticks, lbolt - scan_start);
1925 		KCAGE_STAT_INC_SCAN_INDEX;
1926 		goto loop;
1927 	}
1928 
1929 	/*NOTREACHED*/
1930 }
1931 
1932 void
1933 kcage_cageout_wakeup()
1934 {
1935 	if (mutex_tryenter(&kcage_cageout_mutex)) {
1936 		if (kcage_cageout_ready) {
1937 			cv_signal(&kcage_cageout_cv);
1938 		} else if (kcage_freemem < kcage_minfree || kcage_needfree) {
1939 			/*
1940 			 * Available cage memory is really low. Time to
1941 			 * start expanding the cage. However, the
1942 			 * kernel cage thread is not yet ready to
1943 			 * do the work. Use *this* thread, which is
1944 			 * most likely to be t0, to do the work.
1945 			 */
1946 			KCAGE_STAT_INCR(kcw_expandearly);
1947 			(void) kcage_expand();
1948 			KCAGE_STAT_INC_SCAN_INDEX;
1949 		}
1950 
1951 		mutex_exit(&kcage_cageout_mutex);
1952 	}
1953 	/* else, kernel cage thread is already running */
1954 }
1955 
1956 void
1957 kcage_tick()
1958 {
1959 	/*
1960 	 * Once per second we wake up all the threads throttled
1961 	 * waiting for cage memory, in case we've become stuck
1962 	 * and haven't made forward progress expanding the cage.
1963 	 */
1964 	if (kcage_on && kcage_cageout_ready)
1965 		cv_broadcast(&kcage_throttle_cv);
1966 }
1967