1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/param.h>
30 #include <sys/thread.h>
31 #include <sys/proc.h>
32 #include <sys/callb.h>
33 #include <sys/vnode.h>
34 #include <sys/debug.h>
35 #include <sys/systm.h>		/* for bzero */
36 #include <sys/memlist.h>
37 #include <sys/cmn_err.h>
38 #include <sys/sysmacros.h>
39 #include <sys/vmsystm.h>	/* for NOMEMWAIT() */
40 #include <sys/atomic.h>		/* used to update kcage_freemem */
41 #include <sys/kmem.h>		/* for kmem_reap */
42 #include <sys/errno.h>
43 #include <sys/mem_cage.h>
44 #include <vm/seg_kmem.h>
45 #include <vm/page.h>
46 #include <vm/hat.h>
47 #include <vm/vm_dep.h>
48 #include <sys/mem_config.h>
49 #include <sys/lgrp.h>
50 #include <sys/rwlock.h>
51 
52 extern pri_t maxclsyspri;
53 
54 #ifdef DEBUG
55 #define	KCAGE_STATS
56 #endif
57 
58 #ifdef KCAGE_STATS
59 
60 #define	KCAGE_STATS_VERSION 9	/* can help report generators */
61 #define	KCAGE_STATS_NSCANS 256	/* depth of scan statistics buffer */
62 
63 struct kcage_stats_scan {
64 	/* managed by KCAGE_STAT_* macros */
65 	clock_t	scan_lbolt;
66 	uint_t	scan_id;
67 
68 	/* set in kcage_cageout() */
69 	uint_t	kt_passes;
70 	clock_t	kt_ticks;
71 	pgcnt_t	kt_kcage_freemem_start;
72 	pgcnt_t	kt_kcage_freemem_end;
73 	pgcnt_t kt_freemem_start;
74 	pgcnt_t kt_freemem_end;
75 	uint_t	kt_examined;
76 	uint_t	kt_cantlock;
77 	uint_t	kt_gotone;
78 	uint_t	kt_gotonefree;
79 	uint_t	kt_skiplevel;
80 	uint_t	kt_skipshared;
81 	uint_t	kt_skiprefd;
82 	uint_t	kt_destroy;
83 
84 	/* set in kcage_invalidate_page() */
85 	uint_t	kip_reloclocked;
86 	uint_t	kip_relocmod;
87 	uint_t	kip_destroy;
88 	uint_t	kip_nomem;
89 	uint_t	kip_demotefailed;
90 
91 	/* set in kcage_expand() */
92 	uint_t	ke_wanted;
93 	uint_t	ke_examined;
94 	uint_t	ke_lefthole;
95 	uint_t	ke_gotone;
96 	uint_t	ke_gotonefree;
97 };
98 
99 struct kcage_stats {
100 	/* managed by KCAGE_STAT_* macros */
101 	uint_t	version;
102 	uint_t	size;
103 
104 	/* set in kcage_cageout */
105 	uint_t	kt_wakeups;
106 	uint_t	kt_scans;
107 	uint_t	kt_cageout_break;
108 
109 	/* set in kcage_expand */
110 	uint_t	ke_calls;
111 	uint_t	ke_nopfn;
112 	uint_t	ke_nopaget;
113 	uint_t	ke_isnoreloc;
114 	uint_t	ke_deleting;
115 	uint_t	ke_lowfreemem;
116 	uint_t	ke_terminate;
117 
118 	/* set in kcage_freemem_add() */
119 	uint_t	kfa_throttlewake;
120 
121 	/* set in kcage_freemem_sub() */
122 	uint_t	kfs_cagewake;
123 
124 	/* set in kcage_create_throttle */
125 	uint_t	kct_calls;
126 	uint_t	kct_cageout;
127 	uint_t	kct_critical;
128 	uint_t	kct_exempt;
129 	uint_t	kct_cagewake;
130 	uint_t	kct_wait;
131 	uint_t	kct_progress;
132 	uint_t	kct_noprogress;
133 	uint_t	kct_timeout;
134 
135 	/* set in kcage_cageout_wakeup */
136 	uint_t	kcw_expandearly;
137 
138 	/* managed by KCAGE_STAT_* macros */
139 	uint_t	scan_array_size;
140 	uint_t	scan_index;
141 	struct kcage_stats_scan scans[KCAGE_STATS_NSCANS];
142 };
143 
144 static struct kcage_stats kcage_stats;
145 static struct kcage_stats_scan kcage_stats_scan_zero;
146 
147 /*
148  * No real need for atomics here. For the most part the incs and sets are
149  * done by the kernel cage thread. There are a few that are done by any
150  * number of other threads. Those cases are noted by comments.
151  */
152 #define	KCAGE_STAT_INCR(m)	kcage_stats.m++
153 
154 #define	KCAGE_STAT_NINCR(m, v) kcage_stats.m += (v)
155 
156 #define	KCAGE_STAT_INCR_SCAN(m)	\
157 	KCAGE_STAT_INCR(scans[kcage_stats.scan_index].m)
158 
159 #define	KCAGE_STAT_NINCR_SCAN(m, v) \
160 	KCAGE_STAT_NINCR(scans[kcage_stats.scan_index].m, v)
161 
162 #define	KCAGE_STAT_SET(m, v)	kcage_stats.m = (v)
163 
164 #define	KCAGE_STAT_SETZ(m, v)	\
165 	if (kcage_stats.m == 0) kcage_stats.m = (v)
166 
167 #define	KCAGE_STAT_SET_SCAN(m, v)	\
168 	KCAGE_STAT_SET(scans[kcage_stats.scan_index].m, v)
169 
170 #define	KCAGE_STAT_SETZ_SCAN(m, v)	\
171 	KCAGE_STAT_SETZ(scans[kcage_stats.scan_index].m, v)
172 
173 #define	KCAGE_STAT_INC_SCAN_INDEX \
174 	KCAGE_STAT_SET_SCAN(scan_lbolt, lbolt); \
175 	KCAGE_STAT_SET_SCAN(scan_id, kcage_stats.scan_index); \
176 	kcage_stats.scan_index = \
177 	(kcage_stats.scan_index + 1) % KCAGE_STATS_NSCANS; \
178 	kcage_stats.scans[kcage_stats.scan_index] = kcage_stats_scan_zero
179 
180 #define	KCAGE_STAT_INIT_SCAN_INDEX \
181 	kcage_stats.version = KCAGE_STATS_VERSION; \
182 	kcage_stats.size = sizeof (kcage_stats); \
183 	kcage_stats.scan_array_size = KCAGE_STATS_NSCANS; \
184 	kcage_stats.scan_index = 0
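
/*
 * Illustrative use of the stat machinery (a sketch; the real call
 * sites are in kcage_cageout() and friends below):
 *
 *	KCAGE_STAT_INCR(kt_wakeups);		(global counter)
 *	KCAGE_STAT_INCR_SCAN(kt_examined);	(current scan record)
 *	KCAGE_STAT_INC_SCAN_INDEX;		(close out the scan)
 *
 * KCAGE_STAT_INC_SCAN_INDEX timestamps the current record and advances
 * scan_index around the KCAGE_STATS_NSCANS-entry ring of scan records.
 */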
185 
186 #else /* KCAGE_STATS */
187 
188 #define	KCAGE_STAT_INCR(v)
189 #define	KCAGE_STAT_NINCR(m, v)
190 #define	KCAGE_STAT_INCR_SCAN(v)
191 #define	KCAGE_STAT_NINCR_SCAN(m, v)
192 #define	KCAGE_STAT_SET(m, v)
193 #define	KCAGE_STAT_SETZ(m, v)
194 #define	KCAGE_STAT_SET_SCAN(m, v)
195 #define	KCAGE_STAT_SETZ_SCAN(m, v)
196 #define	KCAGE_STAT_INC_SCAN_INDEX
197 #define	KCAGE_STAT_INIT_SCAN_INDEX
198 
199 #endif /* KCAGE_STATS */
200 
201 static kmutex_t kcage_throttle_mutex;	/* protects kcage_throttle_cv */
202 static kcondvar_t kcage_throttle_cv;
203 
204 static kmutex_t kcage_cageout_mutex;	/* protects cv and ready flag */
205 static kcondvar_t kcage_cageout_cv;	/* cageout thread naps here */
206 static int kcage_cageout_ready;		/* nonzero when cageout thread ready */
207 kthread_id_t kcage_cageout_thread;	/* to aid debugging */
208 
209 static krwlock_t kcage_range_rwlock;	/* protects kcage_glist elements */
210 
211 /*
212  * Cage expansion happens within a range.
213  */
214 struct kcage_glist {
215 	struct kcage_glist	*next;
216 	pfn_t			base;
217 	pfn_t			lim;
218 	pfn_t			curr;
219 	int			decr;
220 };
221 
222 static struct kcage_glist *kcage_glist;
223 static struct kcage_glist *kcage_current_glist;
224 
225 /*
226  * The firstfree element is provided so that kmem_alloc can be avoided
227  * until the cage has somewhere to grow. This is not currently a problem,
228  * as early kmem_allocs use BOP_ALLOC instead of page_create_va.
229  */
230 static struct kcage_glist kcage_glist_firstfree;
231 static struct kcage_glist *kcage_glist_freelist = &kcage_glist_firstfree;
232 
233 /*
234  * Miscellaneous forward references
235  */
236 static struct kcage_glist *kcage_glist_alloc(void);
237 static int kcage_glist_delete(pfn_t, pfn_t, struct kcage_glist **);
238 static void kcage_cageout(void);
239 static int kcage_invalidate_page(page_t *, pgcnt_t *);
240 static int kcage_setnoreloc_pages(page_t *, se_t);
241 
242 /*
243  * Kernel Memory Cage counters and thresholds.
244  */
245 int kcage_on = 0;
246 pgcnt_t kcage_freemem;
247 pgcnt_t kcage_needfree;
248 pgcnt_t kcage_lotsfree;
249 pgcnt_t kcage_desfree;
250 pgcnt_t kcage_minfree;
251 pgcnt_t kcage_throttlefree;
252 pgcnt_t	kcage_reserve;
253 int kcage_maxwait = 10;	/* in seconds */
254 
255 /* when we use lp for kmem we start the cage at a higher initial value */
256 pgcnt_t kcage_kmemlp_mincage;
257 
258 #ifdef DEBUG
259 pgcnt_t	kcage_pagets;
260 #define	KCAGEPAGETS_INC()	kcage_pagets++
261 #else
262 #define	KCAGEPAGETS_INC()
263 #endif
264 
265 /*
266  * Startup and Dynamic Reconfiguration interfaces.
267  * kcage_range_lock()
268  * kcage_range_unlock()
269  * kcage_range_islocked()
270  * kcage_range_add()
271  * kcage_range_del()
272  * kcage_init()
273  * kcage_set_thresholds()
274  */
275 
276 /*
277  * Called outside of this file to add/remove from the list,
278  * therefore, it takes a writer lock
279  */
280 void
281 kcage_range_lock(void)
282 {
283 	rw_enter(&kcage_range_rwlock, RW_WRITER);
284 }
285 
286 void
287 kcage_range_unlock(void)
288 {
289 	rw_exit(&kcage_range_rwlock);
290 }
291 
292 int
293 kcage_range_islocked(void)
294 {
295 	return (rw_lock_held(&kcage_range_rwlock));
296 }
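
/*
 * A sketch of the expected bracketing for callers (illustrative; the
 * actual callers are platform startup and DR code):
 *
 *	kcage_range_lock();
 *	ret = kcage_range_add(base, npgs, decr);
 *	kcage_range_unlock();
 *
 * The same discipline applies to kcage_range_delete() and
 * kcage_range_delete_post_mem_del().
 */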
297 
298 /*
299  * Called from page_get_contig_pages to get the approximate kcage pfn range
300  * for exclusion from search for contiguous pages. This routine is called
301  * without kcage_range lock (kcage routines can call page_get_contig_pages
302  * through page_relocate) and with the assumption, based on kcage_range_add,
303  * that kcage_current_glist always contains a valid pointer.
304  */
305 
306 int
307 kcage_current_pfn(pfn_t *pfncur)
308 {
309 	struct kcage_glist *lp = kcage_current_glist;
310 
311 	ASSERT(kcage_on);
312 
313 	ASSERT(lp != NULL);
314 
315 	*pfncur = lp->curr;
316 
317 	return (lp->decr);
318 }
319 
320 /*
321  * Called from vm_pagelist.c during coalesce to find kernel cage regions
322  * within an mnode. Looks for the lowest range between lo and hi.
323  *
324  * Kernel cage memory is defined between kcage_glist and kcage_current_glist.
325  * Non-cage memory is defined between kcage_current_glist and list end.
326  *
327  * If incage is set, returns the lowest kcage range. Otherwise returns lowest
328  * non-cage range.
329  *
330  * Returns zero on success and nlo, nhi:
331  * 	lo <= nlo < nhi <= hi
332  * Returns non-zero if no overlapping range is found.
333  */
334 int
335 kcage_next_range(int incage, pfn_t lo, pfn_t hi,
336     pfn_t *nlo, pfn_t *nhi)
337 {
338 	struct kcage_glist *lp;
339 	pfn_t tlo = hi;
340 	pfn_t thi = hi;
341 
342 	ASSERT(lo <= hi);
343 
344 	/*
345 	 * Reader lock protects the list, but kcage_get_pfn
346 	 * running concurrently may advance kcage_current_glist
347 	 * and also update kcage_current_glist->curr. Page
348 	 * coalesce can handle this race condition.
349 	 */
350 	rw_enter(&kcage_range_rwlock, RW_READER);
351 
352 	for (lp = incage ? kcage_glist : kcage_current_glist;
353 		lp != NULL; lp = lp->next) {
354 
355 		pfn_t klo, khi;
356 
357 		/* find the range limits in this element */
358 		if ((incage && lp->decr) || (!incage && !lp->decr)) {
359 			klo = lp->curr;
360 			khi = lp->lim;
361 		} else {
362 			klo = lp->base;
363 			khi = lp->curr;
364 		}
365 
366 		/* handle overlap */
367 		if (klo < tlo && klo < khi && lo < khi && klo < hi) {
368 			tlo = MAX(lo, klo);
369 			thi = MIN(hi, khi);
370 			if (tlo == lo)
371 				break;
372 		}
373 
374 		/* check end of kcage */
375 		if (incage && lp == kcage_current_glist) {
376 			break;
377 		}
378 	}
379 
380 	rw_exit(&kcage_range_rwlock);
381 
382 	/* return non-zero if no overlapping range found */
383 	if (tlo == thi)
384 		return (1);
385 
386 	ASSERT(lo <= tlo && tlo < thi && thi <= hi);
387 
388 	/* return overlapping range */
389 	*nlo = tlo;
390 	*nhi = thi;
391 	return (0);
392 }
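
/*
 * An illustrative caller loop (a sketch of how the coalesce code might
 * walk the non-cage ranges of an mnode; lo and hi are assumed to bound
 * the mnode):
 *
 *	pfn_t nlo, nhi;
 *
 *	while (lo < hi && kcage_next_range(0, lo, hi, &nlo, &nhi) == 0) {
 *		(process pfns in [nlo, nhi))
 *		lo = nhi;
 *	}
 */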
393 
394 int
395 kcage_range_init(struct memlist *ml, int decr)
396 {
397 	int ret = 0;
398 
399 	ASSERT(kcage_range_islocked());
400 
401 	if (decr) {
402 		while (ml->next != NULL)
403 			ml = ml->next;
404 	}
405 
406 	while (ml != NULL) {
407 		ret = kcage_range_add(btop(ml->address), btop(ml->size), decr);
408 		if (ret)
409 			break;
410 
411 		ml = (decr ? ml->prev : ml->next);
412 	}
413 
414 	return (ret);
415 }
416 
417 /*
418  * Third arg controls direction of growth: 0: increasing pfns,
419  * 1: decreasing.
420  * Calls to add and delete must be protected by calls to
421  * kcage_range_lock() and kcage_range_unlock().
422  */
423 int
424 kcage_range_add(pfn_t base, pgcnt_t npgs, int decr)
425 {
426 	struct kcage_glist *new, **lpp;
427 	pfn_t lim;
428 
429 	ASSERT(kcage_range_islocked());
430 
431 	ASSERT(npgs != 0);
432 	if (npgs == 0)
433 		return (EINVAL);
434 
435 	lim = base + npgs;
436 
437 	ASSERT(lim > base);
438 	if (lim <= base)
439 		return (EINVAL);
440 
441 	new = kcage_glist_alloc();
442 	if (new == NULL) {
443 		return (ENOMEM);
444 	}
445 
446 	new->base = base;
447 	new->lim = lim;
448 	new->decr = decr;
449 	if (new->decr != 0)
450 		new->curr = new->lim;
451 	else
452 		new->curr = new->base;
453 	/*
454 	 * Any overlapping existing ranges are removed by deleting
455 	 * from the new list as we search for the tail.
456 	 */
457 	lpp = &kcage_glist;
458 	while (*lpp != NULL) {
459 		int ret;
460 		ret = kcage_glist_delete((*lpp)->base, (*lpp)->lim, &new);
461 		if (ret != 0)
462 			return (ret);
463 		lpp = &(*lpp)->next;
464 	}
465 
466 	*lpp = new;
467 
468 	if (kcage_current_glist == NULL) {
469 		kcage_current_glist = kcage_glist;
470 	}
471 
472 	return (0);
473 }
474 
475 /*
476  * Calls to add and delete must be protected by calls to
477  * kcage_range_lock() and kcage_range_unlock().
478  */
479 int
480 kcage_range_delete(pfn_t base, pgcnt_t npgs)
481 {
482 	struct kcage_glist *lp;
483 	pfn_t lim;
484 
485 	ASSERT(kcage_range_islocked());
486 
487 	ASSERT(npgs != 0);
488 	if (npgs == 0)
489 		return (EINVAL);
490 
491 	lim = base + npgs;
492 
493 	ASSERT(lim > base);
494 	if (lim <= base)
495 		return (EINVAL);
496 
497 	/*
498 	 * Check if the delete is OK first as a number of elements
499 	 * might be involved and it will be difficult to go
500 	 * back and undo (can't just add the range back in).
501 	 */
502 	for (lp = kcage_glist; lp != NULL; lp = lp->next) {
503 		/*
504 		 * If there have been no pages allocated from this
505 		 * element, we don't need to check it.
506 		 */
507 		if ((lp->decr == 0 && lp->curr == lp->base) ||
508 		    (lp->decr != 0 && lp->curr == lp->lim))
509 			continue;
510 		/*
511 		 * If the element does not overlap, it's OK.
512 		 */
513 		if (base >= lp->lim || lim <= lp->base)
514 			continue;
515 		/*
516 		 * Overlapping element: Does the range to be deleted
517 		 * overlap the area already used? If so fail.
518 		 */
519 		if (lp->decr == 0 && base < lp->curr && lim >= lp->base) {
520 			return (EBUSY);
521 		}
522 		if (lp->decr != 0 && base < lp->lim && lim >= lp->curr) {
523 			return (EBUSY);
524 		}
525 	}
526 	return (kcage_glist_delete(base, lim, &kcage_glist));
527 }
528 
529 /*
530  * Calls to add and delete must be protected by calls to
531  * kcage_range_lock() and kcage_range_unlock().
532  * This routine gets called after a successful Solaris memory
533  * delete operation, from the DR post-memory-delete routines.
534  */
535 int
536 kcage_range_delete_post_mem_del(pfn_t base, pgcnt_t npgs)
537 {
538 	pfn_t lim;
539 
540 	ASSERT(kcage_range_islocked());
541 
542 	ASSERT(npgs != 0);
543 	if (npgs == 0)
544 		return (EINVAL);
545 
546 	lim = base + npgs;
547 
548 	ASSERT(lim > base);
549 	if (lim <= base)
550 		return (EINVAL);
551 
552 	return (kcage_glist_delete(base, lim, &kcage_glist));
553 }
554 
555 /*
556  * No locking is required here as the whole operation is covered
557  * by the kcage_range_lock().
558  */
559 static struct kcage_glist *
560 kcage_glist_alloc(void)
561 {
562 	struct kcage_glist *new;
563 
564 	if ((new = kcage_glist_freelist) != NULL) {
565 		kcage_glist_freelist = new->next;
566 		bzero(new, sizeof (*new));
567 	} else {
568 		new = kmem_zalloc(sizeof (struct kcage_glist), KM_NOSLEEP);
569 	}
570 	return (new);
571 }
572 
573 static void
574 kcage_glist_free(struct kcage_glist *lp)
575 {
576 	lp->next = kcage_glist_freelist;
577 	kcage_glist_freelist = lp;
578 }
579 
580 static int
581 kcage_glist_delete(pfn_t base, pfn_t lim, struct kcage_glist **lpp)
582 {
583 	struct kcage_glist *lp, *prev = *lpp;
584 
585 	while ((lp = *lpp) != NULL) {
586 		if (lim > lp->base && base < lp->lim) {
587 			/* The delete range overlaps this element. */
588 			if (base <= lp->base && lim >= lp->lim) {
589 				/* Delete whole element. */
590 				*lpp = lp->next;
591 				if (lp == kcage_current_glist) {
592 					/* This can never happen. */
593 					ASSERT(kcage_current_glist != prev);
594 					kcage_current_glist = prev;
595 				}
596 				kcage_glist_free(lp);
597 				continue;
598 			}
599 
600 			/* Partial delete. */
601 			if (base > lp->base && lim < lp->lim) {
602 				struct kcage_glist *new;
603 
604 				/*
605 				 * Remove a section from the middle,
606 				 * need to allocate a new element.
607 				 */
608 				new = kcage_glist_alloc();
609 				if (new == NULL) {
610 					return (ENOMEM);
611 				}
612 
613 				/*
614 				 * Transfer the unused range to new.
615 				 * Edit lp in place to preserve
616 				 * kcage_current_glist.
617 				 */
618 				new->decr = lp->decr;
619 				if (new->decr != 0) {
620 					new->base = lp->base;
621 					new->lim = base;
622 					new->curr = base;
623 
624 					lp->base = lim;
625 				} else {
626 					new->base = lim;
627 					new->lim = lp->lim;
628 					new->curr = new->base;
629 
630 					lp->lim = base;
631 				}
632 
633 				/* Insert new. */
634 				new->next = lp->next;
635 				lp->next = new;
636 				lpp = &lp->next;
637 			} else {
638 				/* Delete part of current block. */
639 				if (base > lp->base) {
640 					ASSERT(lim >= lp->lim);
641 					ASSERT(base < lp->lim);
642 					if (lp->decr != 0 &&
643 					    lp->curr == lp->lim)
644 						lp->curr = base;
645 					lp->lim = base;
646 				} else {
647 					ASSERT(base <= lp->base);
648 					ASSERT(lim > lp->base);
649 					if (lp->decr == 0 &&
650 					    lp->curr == lp->base)
651 						lp->curr = lim;
652 					lp->base = lim;
653 				}
654 			}
655 		}
656 		prev = *lpp;
657 		lpp = &(*lpp)->next;
658 	}
659 
660 	return (0);
661 }
662 
663 /*
664  * The caller of kcage_get_pfn must hold the kcage_range_lock to make
665  * sure that there are no concurrent calls. The same lock
666  * must be obtained for range add and delete by calling
667  * kcage_range_lock() and kcage_range_unlock().
668  */
669 static pfn_t
670 kcage_get_pfn(void)
671 {
672 	struct kcage_glist *lp;
673 	pfn_t pfn;
674 
675 	ASSERT(kcage_range_islocked());
676 
677 	lp = kcage_current_glist;
678 	while (lp != NULL) {
679 		if (lp->decr != 0) {
680 			if (lp->curr != lp->base) {
681 				pfn = --lp->curr;
682 				return (pfn);
683 			}
684 		} else {
685 			if (lp->curr != lp->lim) {
686 				pfn = lp->curr++;
687 				return (pfn);
688 			}
689 		}
690 
691 		lp = lp->next;
692 		if (lp)
693 			kcage_current_glist = lp;
694 	}
695 
696 	return (PFN_INVALID);
697 }
698 
699 /*
700  * Walk the physical address space of the cage.
701  * This routine does not guarantee to return PFNs in the order
702  * in which they were allocated to the cage. Instead, it walks
703  * each range as it appears on the growth list, returning the PFNs
704  * of each range in ascending order.
705  *
706  * To begin scanning at lower edge of cage, reset should be nonzero.
707  * To step through cage, reset should be zero.
708  *
709  * PFN_INVALID will be returned when the upper end of the cage is
710  * reached -- indicating a full scan of the cage has been completed since
711  * previous reset. PFN_INVALID will continue to be returned until
712  * kcage_walk_cage is reset.
713  *
714  * It is possible to receive a PFN_INVALID result on reset if a growth
715  * list is not installed or if none of the PFNs in the installed list have
716  * been allocated to the cage. In other words, there is no cage.
717  *
718  * Caller need not hold kcage_range_lock while calling this function
719  * as the front part of the list is static - pages never come out of
720  * the cage.
721  *
722  * The caller is expected to only be kcage_cageout().
723  */
724 static pfn_t
725 kcage_walk_cage(int reset)
726 {
727 	static struct kcage_glist *lp = NULL;
728 	static pfn_t pfn;
729 
730 	if (reset)
731 		lp = NULL;
732 	if (lp == NULL) {
733 		lp = kcage_glist;
734 		pfn = PFN_INVALID;
735 	}
736 again:
737 	if (pfn == PFN_INVALID) {
738 		if (lp == NULL)
739 			return (PFN_INVALID);
740 
741 		if (lp->decr != 0) {
742 			/*
743 			 * In this range the cage grows from the highest
744 			 * address towards the lowest.
745 			 * Arrange to return pfns from curr to lim-1,
746 			 * inclusive, in ascending order.
747 			 */
748 
749 			pfn = lp->curr;
750 		} else {
751 			/*
752 			 * In this range the cage grows from the lowest
753 			 * address towards the highest.
754 			 * Arrange to return pfns from base to curr,
755 			 * inclusive, in ascending order.
756 			 */
757 
758 			pfn = lp->base;
759 		}
760 	}
761 
762 	if (lp->decr != 0) {		/* decrementing pfn */
763 		if (pfn == lp->lim) {
764 			/* Don't go beyond the static part of the glist. */
765 			if (lp == kcage_current_glist)
766 				lp = NULL;
767 			else
768 				lp = lp->next;
769 			pfn = PFN_INVALID;
770 			goto again;
771 		}
772 
773 		ASSERT(pfn >= lp->curr && pfn < lp->lim);
774 	} else {			/* incrementing pfn */
775 		if (pfn == lp->curr) {
776 			/* Don't go beyond the static part of the glist. */
777 			if (lp == kcage_current_glist)
778 				lp = NULL;
779 			else
780 				lp = lp->next;
781 			pfn = PFN_INVALID;
782 			goto again;
783 		}
784 
785 		ASSERT(pfn >= lp->base && pfn < lp->curr);
786 	}
787 
788 	return (pfn++);
789 }
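
/*
 * The expected scan pattern, as a sketch (kcage_cageout() below is the
 * only intended caller):
 *
 *	pfn = kcage_walk_cage(1);		(reset to cage lower edge)
 *	while (pfn != PFN_INVALID) {
 *		pp = page_numtopp_nolock(pfn);
 *		(examine pp)
 *		pfn = kcage_walk_cage(0);	(step to next cage pfn)
 *	}
 */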
790 
791 /*
792  * Callback functions to recalculate cage thresholds after
793  * kphysm memory add/delete operations.
794  */
795 /*ARGSUSED*/
796 static void
797 kcage_kphysm_postadd_cb(void *arg, pgcnt_t delta_pages)
798 {
799 	kcage_recalc_thresholds();
800 }
801 
802 /*ARGSUSED*/
803 static int
804 kcage_kphysm_predel_cb(void *arg, pgcnt_t delta_pages)
805 {
806 	/* TODO: when should cage refuse memory delete requests? */
807 	return (0);
808 }
809 
810 /*ARGSUSED*/
811 static  void
812 kcage_kphysm_postdel_cb(void *arg, pgcnt_t delta_pages, int cancelled)
813 {
814 	kcage_recalc_thresholds();
815 }
816 
817 static kphysm_setup_vector_t kcage_kphysm_vectors = {
818 	KPHYSM_SETUP_VECTOR_VERSION,
819 	kcage_kphysm_postadd_cb,
820 	kcage_kphysm_predel_cb,
821 	kcage_kphysm_postdel_cb
822 };
823 
824 /*
825  * This is called before a CPR suspend and after a CPR resume.  We have to
826  * turn off kcage_cageout_ready before a suspend, and turn it back on after a
827  * restart.
828  */
829 /*ARGSUSED*/
830 static boolean_t
831 kcage_cageout_cpr(void *arg, int code)
832 {
833 	if (code == CB_CODE_CPR_CHKPT) {
834 		ASSERT(kcage_cageout_ready);
835 		kcage_cageout_ready = 0;
836 		return (B_TRUE);
837 	} else if (code == CB_CODE_CPR_RESUME) {
838 		ASSERT(kcage_cageout_ready == 0);
839 		kcage_cageout_ready = 1;
840 		return (B_TRUE);
841 	}
842 	return (B_FALSE);
843 }
844 
845 /*
846  * kcage_recalc_preferred_size() increases the initial cage size to improve
847  * large page availability when lp for kmem is enabled and kpr is disabled.
848  */
849 static pgcnt_t
850 kcage_recalc_preferred_size(pgcnt_t preferred_size)
851 {
852 	if (SEGKMEM_USE_LARGEPAGES && segkmem_reloc == 0) {
853 		pgcnt_t lpmincage = kcage_kmemlp_mincage;
854 		if (lpmincage == 0) {
855 			lpmincage = MIN(P2ROUNDUP(((physmem * PAGESIZE) / 8),
856 			    segkmem_heaplp_quantum), 0x40000000UL) / PAGESIZE;
857 		}
858 		kcage_kmemlp_mincage = MIN(lpmincage,
859 			    (segkmem_kmemlp_max / PAGESIZE));
860 		preferred_size = MAX(kcage_kmemlp_mincage, preferred_size);
861 	}
862 	return (preferred_size);
863 }
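
/*
 * Worked example with illustrative numbers: on a 16 GB machine,
 * physmem * PAGESIZE / 8 is 2 GB; that is rounded up to
 * segkmem_heaplp_quantum and clamped to the 0x40000000 (1 GB)
 * ceiling, so lpmincage becomes 1 GB worth of pages, further capped
 * by segkmem_kmemlp_max before raising preferred_size.
 */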
864 
865 /*
866  * Kcage_init() builds the cage and initializes the cage thresholds.
867  * The size of the cage is determined by the argument preferred_size,
868  * or the actual amount of memory, whichever is smaller.
869  */
870 void
871 kcage_init(pgcnt_t preferred_size)
872 {
873 	pgcnt_t wanted;
874 	pfn_t pfn;
875 	page_t *pp;
876 	extern struct vnode kvp;
877 	extern void page_list_noreloc_startup(page_t *);
878 
879 	ASSERT(!kcage_on);
880 	ASSERT(kcage_range_islocked());
881 
882 	/* increase preferred cage size for lp for kmem */
883 	preferred_size = kcage_recalc_preferred_size(preferred_size);
884 
885 	/* Debug note: initialize this now so early expansions can stat */
886 	KCAGE_STAT_INIT_SCAN_INDEX;
887 
888 	/*
889 	 * Initialize cage thresholds and install kphysm callback.
890 	 * If we can't arrange to have the thresholds track with
891 	 * available physical memory, then the cage thresholds may
892 	 * end up over time at levels that adversely affect system
893 	 * performance; so, bail out.
894 	 */
895 	kcage_recalc_thresholds();
896 	if (kphysm_setup_func_register(&kcage_kphysm_vectors, NULL)) {
897 		ASSERT(0);		/* Catch this in DEBUG kernels. */
898 		return;
899 	}
900 
901 	/*
902 	 * Limit startup cage size within the range of kcage_minfree
903 	 * and availrmem, inclusive.
904 	 */
905 	wanted = MIN(MAX(preferred_size, kcage_minfree), availrmem);
906 
907 	/*
908 	 * Construct the cage. PFNs are allocated from the glist. It
909 	 * is assumed that the list has been properly ordered for the
910 	 * platform by the platform code. Typically, this is as simple
911 	 * as calling kcage_range_init(phys_avail, decr), where decr is
912 	 * 1 if the kernel has been loaded into the upper end of physical
913 	 * memory, or 0 if the kernel has been loaded at the low end.
914 	 *
915 	 * Note: it is assumed that we are in the startup flow, so there
916 	 * is no reason to grab the page lock.
917 	 */
918 	kcage_freemem = 0;
919 	pfn = PFN_INVALID;			/* prime for alignment test */
920 	while (wanted != 0) {
921 		if ((pfn = kcage_get_pfn()) == PFN_INVALID)
922 			break;
923 
924 		if ((pp = page_numtopp_nolock(pfn)) != NULL) {
925 			KCAGEPAGETS_INC();
926 			/*
927 			 * Set the noreloc state on the page.
928 			 * If the page is free and not already
929 			 * on the noreloc list then move it.
930 			 */
931 			if (PP_ISFREE(pp)) {
932 				if (PP_ISNORELOC(pp) == 0)
933 					page_list_noreloc_startup(pp);
934 			} else {
935 				ASSERT(pp->p_szc == 0);
936 				PP_SETNORELOC(pp);
937 			}
938 		}
939 		PLCNT_XFER_NORELOC(pp);
940 		wanted -= 1;
941 	}
942 
943 	/*
944 	 * Need to go through and find kernel allocated pages
945 	 * and capture them into the Cage.  These will primarily
946 	 * be pages gotten through boot_alloc().
947 	 */
948 	if (kvp.v_pages) {
949 
950 		pp = kvp.v_pages;
951 		do {
952 			ASSERT(!PP_ISFREE(pp));
953 			ASSERT(pp->p_szc == 0);
954 			PP_SETNORELOC(pp);
955 		} while ((pp = pp->p_vpnext) != kvp.v_pages);
956 
957 	}
958 
959 	kcage_on = 1;
960 
961 	/*
962 	 * CB_CL_CPR_POST_KERNEL is the class that executes from cpr_suspend()
963 	 * after the cageout thread is blocked, and executes from cpr_resume()
964 	 * before the cageout thread is restarted.  By executing in this class,
965 	 * we are assured that the kernel cage thread won't miss wakeup calls
966 	 * and also CPR's larger kmem_alloc requests will not fail after
967 	 * CPR shuts down the cageout kernel thread.
968 	 */
969 	(void) callb_add(kcage_cageout_cpr, NULL, CB_CL_CPR_POST_KERNEL,
970 	    "cageout");
971 
972 	/*
973 	 * Coalesce pages to improve large page availability. A better fix
974 	 * would be to coalesce pages as they are included in the cage.
975 	 */
976 	if (SEGKMEM_USE_LARGEPAGES) {
977 		extern void page_freelist_coalesce_all(int mnode);
978 		extern int max_mem_nodes;
979 		int mnode, max_mnodes = max_mem_nodes;
980 		for (mnode = 0; mnode < max_mnodes; mnode++) {
981 			page_freelist_coalesce_all(mnode);
982 		}
983 	}
984 }
985 
986 void
987 kcage_recalc_thresholds()
988 {
989 	static int first = 1;
990 	static pgcnt_t init_lotsfree;
991 	static pgcnt_t init_desfree;
992 	static pgcnt_t init_minfree;
993 	static pgcnt_t init_throttlefree;
994 	static pgcnt_t init_reserve;
995 
996 	/* TODO: any reason to take more care than this with live editing? */
997 	mutex_enter(&kcage_cageout_mutex);
998 	mutex_enter(&freemem_lock);
999 
1000 	if (first) {
1001 		first = 0;
1002 		init_lotsfree = kcage_lotsfree;
1003 		init_desfree = kcage_desfree;
1004 		init_minfree = kcage_minfree;
1005 		init_throttlefree = kcage_throttlefree;
1006 		init_reserve = kcage_reserve;
1007 	} else {
1008 		kcage_lotsfree = init_lotsfree;
1009 		kcage_desfree = init_desfree;
1010 		kcage_minfree = init_minfree;
1011 		kcage_throttlefree = init_throttlefree;
1012 		kcage_reserve = init_reserve;
1013 	}
1014 
1015 	if (kcage_lotsfree == 0)
1016 		kcage_lotsfree = MAX(32, total_pages / 256);
1017 
1018 	if (kcage_minfree == 0)
1019 		kcage_minfree = MAX(32, kcage_lotsfree / 2);
1020 
1021 	if (kcage_desfree == 0)
1022 		kcage_desfree = MAX(32, kcage_minfree);
1023 
1024 	if (kcage_throttlefree == 0)
1025 		kcage_throttlefree = MAX(32, kcage_minfree / 2);
1026 
1027 	if (kcage_reserve == 0)
1028 		kcage_reserve = MIN(32, kcage_throttlefree / 2);
1029 
1030 	mutex_exit(&freemem_lock);
1031 	mutex_exit(&kcage_cageout_mutex);
1032 
1033 	if (kcage_cageout_ready) {
1034 		if (kcage_freemem < kcage_desfree)
1035 			kcage_cageout_wakeup();
1036 
1037 		if (kcage_needfree) {
1038 			mutex_enter(&kcage_throttle_mutex);
1039 			cv_broadcast(&kcage_throttle_cv);
1040 			mutex_exit(&kcage_throttle_mutex);
1041 		}
1042 	}
1043 }
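
/*
 * Worked example of the untuned defaults: with total_pages = 1048576
 * (8 GB of 8 KB pages), kcage_lotsfree = 4096 pages, kcage_minfree =
 * 2048, kcage_desfree = 2048, kcage_throttlefree = 1024 and
 * kcage_reserve = MIN(32, 512) = 32.
 */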
1044 
1045 /*
1046  * Pageout interface:
1047  * kcage_cageout_init()
1048  */
1049 void
1050 kcage_cageout_init()
1051 {
1052 	if (kcage_on) {
1053 
1054 		(void) thread_create(NULL, 0, kcage_cageout,
1055 		    NULL, 0, proc_pageout, TS_RUN, maxclsyspri - 1);
1056 	}
1057 }
1058 
1059 
1060 /*
1061  * VM Interfaces:
1062  * kcage_create_throttle()
1063  * kcage_freemem_add()
1064  * kcage_freemem_sub()
1065  */
1066 
1067 /*
1068  * Wakeup cageout thread and throttle waiting for the number of pages
1069  * requested to become available.  For non-critical requests, a
1070  * timeout is added, since freemem accounting is separate from cage
1071  * freemem accounting: it's possible for us to get stuck and not make
1072  * forward progress even though there was sufficient freemem before
1073  * arriving here.
1074  */
1075 int
1076 kcage_create_throttle(pgcnt_t npages, int flags)
1077 {
1078 	int niter = 0;
1079 	pgcnt_t lastfree;
1080 	int enough = kcage_freemem > kcage_throttlefree + npages;
1081 
1082 	KCAGE_STAT_INCR(kct_calls);		/* unprotected incr. */
1083 
1084 	kcage_cageout_wakeup();			/* just to be sure */
1085 	KCAGE_STAT_INCR(kct_cagewake);		/* unprotected incr. */
1086 
1087 	/*
1088 	 * Obviously, we can't throttle the cageout thread since
1089 	 * we depend on it.  We also can't throttle the panic thread.
1090 	 */
1091 	if (curthread == kcage_cageout_thread || panicstr) {
1092 		KCAGE_STAT_INCR(kct_cageout);	/* unprotected incr. */
1093 		return (KCT_CRIT);
1094 	}
1095 
1096 	/*
1097 	 * Don't throttle threads which are critical for proper
1098 	 * vm management if we're above kcage_throttlefree or
1099 	 * if freemem is very low.
1100 	 */
1101 	if (NOMEMWAIT()) {
1102 		if (enough) {
1103 			KCAGE_STAT_INCR(kct_exempt);	/* unprotected incr. */
1104 			return (KCT_CRIT);
1105 		} else if (freemem < minfree) {
1106 			KCAGE_STAT_INCR(kct_critical);  /* unprotected incr. */
1107 			return (KCT_CRIT);
1108 		}
1109 	}
1110 
1111 	/*
1112 	 * Don't throttle real-time threads if kcage_freemem > kcage_reserve.
1113 	 */
1114 	if (DISP_PRIO(curthread) > maxclsyspri &&
1115 	    kcage_freemem > kcage_reserve) {
1116 		KCAGE_STAT_INCR(kct_exempt);	/* unprotected incr. */
1117 		return (KCT_CRIT);
1118 	}
1119 
1120 	/*
1121 	 * Cause all other threads (which are assumed to not be
1122 	 * critical to cageout) to wait here until their request
1123 	 * can be satisfied. Be a little paranoid and wake the
1124 	 * kernel cage on each loop through this logic.
1125 	 */
1126 	while (kcage_freemem < kcage_throttlefree + npages) {
1127 		ASSERT(kcage_on);
1128 
1129 		lastfree = kcage_freemem;
1130 
1131 		if (kcage_cageout_ready) {
1132 			mutex_enter(&kcage_throttle_mutex);
1133 
1134 			kcage_needfree += npages;
1135 			KCAGE_STAT_INCR(kct_wait);
1136 
1137 			kcage_cageout_wakeup();
1138 			KCAGE_STAT_INCR(kct_cagewake);
1139 
1140 			cv_wait(&kcage_throttle_cv, &kcage_throttle_mutex);
1141 
1142 			kcage_needfree -= npages;
1143 
1144 			mutex_exit(&kcage_throttle_mutex);
1145 		} else {
1146 			/*
1147 			 * NOTE: atomics are used just in case we enter
1148 			 * mp operation before the cageout thread is ready.
1149 			 */
1150 			atomic_add_long(&kcage_needfree, npages);
1151 
1152 			kcage_cageout_wakeup();
1153 			KCAGE_STAT_INCR(kct_cagewake);	/* unprotected incr. */
1154 
1155 			atomic_add_long(&kcage_needfree, -npages);
1156 		}
1157 
1158 		if ((flags & PG_WAIT) == 0) {
1159 			if (kcage_freemem > lastfree) {
1160 				KCAGE_STAT_INCR(kct_progress);
1161 				niter = 0;
1162 			} else {
1163 				KCAGE_STAT_INCR(kct_noprogress);
1164 				if (++niter >= kcage_maxwait) {
1165 					KCAGE_STAT_INCR(kct_timeout);
1166 					return (KCT_FAILURE);
1167 				}
1168 			}
1169 		}
1170 	}
1171 	return (KCT_NONCRIT);
1172 }
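
/*
 * A sketch of how an allocator is expected to interpret the return
 * value (illustrative; the consumers are the page_create paths):
 * KCT_CRIT means the caller was exempt from throttling and must
 * proceed; KCT_NONCRIT means enough cage memory became free for the
 * request; KCT_FAILURE means a non-PG_WAIT request made no progress
 * for kcage_maxwait iterations and should fail.
 */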
1173 
1174 void
1175 kcage_freemem_add(pgcnt_t npages)
1176 {
1177 	extern void wakeup_pcgs(void);
1178 
1179 	atomic_add_long(&kcage_freemem, npages);
1180 
1181 	wakeup_pcgs();  /* wakeup threads in pcgs() */
1182 
1183 	if (kcage_needfree != 0 &&
1184 		kcage_freemem >= (kcage_throttlefree + kcage_needfree)) {
1185 
1186 		mutex_enter(&kcage_throttle_mutex);
1187 		cv_broadcast(&kcage_throttle_cv);
1188 		KCAGE_STAT_INCR(kfa_throttlewake);
1189 		mutex_exit(&kcage_throttle_mutex);
1190 	}
1191 }
1192 
1193 void
1194 kcage_freemem_sub(pgcnt_t npages)
1195 {
1196 	atomic_add_long(&kcage_freemem, -npages);
1197 
1198 	if (kcage_freemem < kcage_desfree) {
1199 		kcage_cageout_wakeup();
1200 		KCAGE_STAT_INCR(kfs_cagewake); /* unprotected incr. */
1201 	}
1202 }
1203 
1204 /*
1205  * Return 0 on failure and 1 on success.
1206  */
1207 static int
1208 kcage_setnoreloc_pages(page_t *rootpp, se_t se)
1209 {
1210 	pgcnt_t npgs, i;
1211 	page_t *pp;
1212 	pfn_t rootpfn = page_pptonum(rootpp);
1213 	uint_t szc;
1214 
1215 	ASSERT(!PP_ISFREE(rootpp));
1216 	ASSERT(PAGE_LOCKED_SE(rootpp, se));
1217 	if (!group_page_trylock(rootpp, se)) {
1218 		return (0);
1219 	}
1220 	szc = rootpp->p_szc;
1221 	if (szc == 0) {
1222 		/*
1223 		 * The szc of a locked page can only change for pages that are
1224 		 * non-swapfs (i.e. anonymous memory) file system pages.
1225 		 */
1226 		ASSERT(rootpp->p_vnode != NULL &&
1227 		    rootpp->p_vnode != &kvp &&
1228 		    !IS_SWAPFSVP(rootpp->p_vnode));
1229 		PP_SETNORELOC(rootpp);
1230 		return (1);
1231 	}
1232 	npgs = page_get_pagecnt(szc);
1233 	ASSERT(IS_P2ALIGNED(rootpfn, npgs));
1234 	pp = rootpp;
1235 	for (i = 0; i < npgs; i++, pp++) {
1236 		ASSERT(PAGE_LOCKED_SE(pp, se));
1237 		ASSERT(!PP_ISFREE(pp));
1238 		ASSERT(pp->p_szc == szc);
1239 		PP_SETNORELOC(pp);
1240 	}
1241 	group_page_unlock(rootpp);
1242 	return (1);
1243 }
1244 
1245 /*
1246  * Attempt to convert page to a caged page (set the P_NORELOC flag).
1247  * If successful and the page is free, move the page to the tail of
1248  * whichever list it is on.
1249  * Returns:
1250  *   EBUSY  page already locked, assimilated but not free.
1251  *   ENOMEM page assimilated, but memory too low to relocate. Page not free.
1252  *   EAGAIN page not assimilated. Page not free.
1253  *   ERANGE page assimilated. Page not root.
1254  *   0      page assimilated. Page free.
1255  *   *nfreedp number of pages freed.
1256  * NOTE: With error codes ENOMEM, EBUSY, and 0 (zero), there is no way
1257  * to distinguish a page that was already a NORELOC page from
1258  * those newly converted to NORELOC pages by this invocation of
1259  * kcage_assimilate_page.
1260  */
1261 static int
1262 kcage_assimilate_page(page_t *pp, pgcnt_t *nfreedp)
1263 {
1264 	if (page_trylock(pp, SE_EXCL)) {
1265 		if (PP_ISNORELOC(pp)) {
1266 check_free_and_return:
1267 			if (PP_ISFREE(pp)) {
1268 				page_unlock(pp);
1269 				*nfreedp = 0;
1270 				return (0);
1271 			} else {
1272 				page_unlock(pp);
1273 				return (EBUSY);
1274 			}
1275 			/*NOTREACHED*/
1276 		}
1277 	} else {
1278 		if (page_trylock(pp, SE_SHARED)) {
1279 			if (PP_ISNORELOC(pp))
1280 				goto check_free_and_return;
1281 		} else
1282 			return (EAGAIN);
1283 
1284 		if (!PP_ISFREE(pp)) {
1285 			page_unlock(pp);
1286 			return (EAGAIN);
1287 		}
1288 
1289 		/*
1290 		 * Need to upgrade the lock on it and set the NORELOC
1291 		 * bit. If it is free then remove it from the free
1292 		 * list so that the platform free list code can keep
1293 		 * NORELOC pages where they should be.
1294 		 */
1295 		/*
1296 		 * Before doing anything, get the exclusive lock.
1297 		 * This may fail (e.g., ISM pages are left share-locked).
1298 		 * If the page is free this will leave a hole in the
1299 		 * cage. There is no solution yet to this.
1300 		 */
1301 		if (!page_tryupgrade(pp)) {
1302 			page_unlock(pp);
1303 			return (EAGAIN);
1304 		}
1305 	}
1306 
1307 	ASSERT(PAGE_EXCL(pp));
1308 
1309 	if (PP_ISFREE(pp)) {
1310 		int which = PP_ISAGED(pp) ? PG_FREE_LIST : PG_CACHE_LIST;
1311 
1312 		page_list_sub(pp, which);
1313 		ASSERT(pp->p_szc == 0);
1314 		PP_SETNORELOC(pp);
1315 		PLCNT_XFER_NORELOC(pp);
1316 		page_list_add(pp, which | PG_LIST_TAIL);
1317 
1318 		page_unlock(pp);
1319 		*nfreedp = 1;
1320 		return (0);
1321 	} else {
1322 		if (pp->p_szc != 0) {
1323 			if (!kcage_setnoreloc_pages(pp, SE_EXCL)) {
1324 				page_unlock(pp);
1325 				return (EAGAIN);
1326 			}
1327 			ASSERT(PP_ISNORELOC(pp));
1328 		} else {
1329 			PP_SETNORELOC(pp);
1330 		}
1331 		PLCNT_XFER_NORELOC(pp);
1332 		return (kcage_invalidate_page(pp, nfreedp));
1333 	}
1334 	/*NOTREACHED*/
1335 }
1336 
1337 static int
1338 kcage_expand()
1339 {
1340 	int did_something = 0;
1341 
1342 	spgcnt_t wanted;
1343 	pfn_t pfn;
1344 	page_t *pp;
1345 	/* TODO: we don't really need n any more? */
1346 	pgcnt_t n;
1347 	pgcnt_t nf, nfreed;
1348 
1349 	/*
1350 	 * Expand the cage if available cage memory is really low. Calculate
1351 	 * the amount required to return kcage_freemem to the level of
1352 	 * kcage_lotsfree, or to satisfy throttled requests, whichever is
1353 	 * more.  It is rare for their sum to create an artificial threshold
1354 	 * above kcage_lotsfree, but it is possible.
1355 	 *
1356 	 * Exit early if expansion amount is equal to or less than zero.
1357 	 * (<0 is possible if kcage_freemem rises suddenly.)
1358 	 *
1359 	 * Exit early when the global page pool (apparently) does not
1360 	 * have enough free pages to page_relocate() even a single page.
1361 	 */
1362 	wanted = MAX(kcage_lotsfree, kcage_throttlefree + kcage_needfree)
1363 		- kcage_freemem;
1364 	if (wanted <= 0)
1365 		return (0);
1366 	else if (freemem < pageout_reserve + 1) {
1367 		KCAGE_STAT_INCR(ke_lowfreemem);
1368 		return (0);
1369 	}
1370 
1371 	/*
1372 	 * Try to get the range list reader lock. If the lock is already
1373 	 * held, then don't get stuck here waiting for it.
1374 	 */
1375 	if (!rw_tryenter(&kcage_range_rwlock, RW_READER))
1376 		return (0);
1377 
1378 	KCAGE_STAT_INCR(ke_calls);
1379 	KCAGE_STAT_SET_SCAN(ke_wanted, (uint_t)wanted);
1380 
1381 	/*
1382 	 * Assimilate more pages from the global page pool into the cage.
1383 	 */
1384 	n = 0;				/* number of pages PP_SETNORELOC'd */
1385 	nf = 0;				/* number of those actually free */
1386 	while (kcage_on && nf < wanted) {
1387 		pfn = kcage_get_pfn();
1388 		if (pfn == PFN_INVALID) {	/* eek! nowhere to grow */
1389 			KCAGE_STAT_INCR(ke_nopfn);
1390 			goto terminate;
1391 		}
1392 
1393 		KCAGE_STAT_INCR_SCAN(ke_examined);
1394 
1395 		if ((pp = page_numtopp_nolock(pfn)) == NULL) {
1396 			KCAGE_STAT_INCR(ke_nopaget);
1397 			continue;
1398 		}
1399 		KCAGEPAGETS_INC();
1400 		/*
1401 		 * Sanity check. Skip this pfn if it is
1402 		 * being deleted.
1403 		 */
1404 		if (pfn_is_being_deleted(pfn)) {
1405 			KCAGE_STAT_INCR(ke_deleting);
1406 			continue;
1407 		}
1408 
1409 		/*
1410 		 * NORELOC is only set at boot-time or by this routine
1411 		 * under the kcage_range_rwlock lock which is currently
1412 		 * held. This means we can do a fast check here before
1413 		 * locking the page in kcage_assimilate_page.
1414 		 */
1415 		if (PP_ISNORELOC(pp)) {
1416 			KCAGE_STAT_INCR(ke_isnoreloc);
1417 			continue;
1418 		}
1419 
1420 		switch (kcage_assimilate_page(pp, &nfreed)) {
1421 			case 0:		/* assimilated, page is free */
1422 				KCAGE_STAT_NINCR_SCAN(ke_gotonefree, nfreed);
1423 				did_something = 1;
1424 				nf += nfreed;
1425 				n++;
1426 				break;
1427 
1428 			case EBUSY:	/* assimilated, page not free */
1429 			case ERANGE:	/* assimilated, page not root */
1430 				KCAGE_STAT_INCR_SCAN(ke_gotone);
1431 				did_something = 1;
1432 				n++;
1433 				break;
1434 
1435 			case ENOMEM:	/* assimilated, but no mem */
1436 				KCAGE_STAT_INCR(ke_terminate);
1437 				did_something = 1;
1438 				n++;
1439 				goto terminate;
1440 
1441 			case EAGAIN:	/* can't assimilate */
1442 				KCAGE_STAT_INCR_SCAN(ke_lefthole);
1443 				break;
1444 
1445 			default:	/* catch this with debug kernels */
1446 				ASSERT(0);
1447 				break;
1448 		}
1449 	}
1450 
1451 	/*
1452 	 * Realign cage edge with the nearest physical address
1453 	 * boundary for big pages. This is done to give us a
1454 	 * better chance of actually getting usable big pages
1455 	 * in the cage.
1456 	 */
1457 
1458 terminate:
1459 	kcage_range_unlock();
1460 
1461 	return (did_something);
1462 }
1463 
1464 /*
1465  * Relocate page opp (Original Page Pointer) from cage pool to page rpp
1466  * (Replacement Page Pointer) in the global pool. Page opp will be freed
1467  * if relocation is successful, otherwise it is only unlocked.
1468  * On entry, page opp must be exclusively locked and not free.
1469  * *nfreedp: number of pages freed.
1470  */
1471 static int
1472 kcage_relocate_page(page_t *pp, pgcnt_t *nfreedp)
1473 {
1474 	page_t *opp = pp;
1475 	page_t *rpp = NULL;
1476 	spgcnt_t npgs;
1477 	int result;
1478 
1479 	ASSERT(!PP_ISFREE(opp));
1480 	ASSERT(PAGE_EXCL(opp));
1481 
1482 	result = page_relocate(&opp, &rpp, 1, 1, &npgs, NULL);
1483 	*nfreedp = npgs;
1484 	if (result == 0) {
1485 		while (npgs-- > 0) {
1486 			page_t *tpp;
1487 
1488 			ASSERT(rpp != NULL);
1489 			tpp = rpp;
1490 			page_sub(&rpp, tpp);
1491 			page_unlock(tpp);
1492 		}
1493 
1494 		ASSERT(rpp == NULL);
1495 
1496 		return (0);		/* success */
1497 	}
1498 
1499 	page_unlock(opp);
1500 	return (result);
1501 }
1502 
1503 /*
1504  * Based on page_invalidate_pages()
1505  *
1506  * Kcage_invalidate_page() uses page_relocate() twice. Both instances
1507  * of use must be updated to match the new page_relocate() when it
1508  * becomes available.
1509  *
1510  * Return result of kcage_relocate_page or zero if page was directly freed.
1511  * *nfreedp: number of pages freed.
1512  */
1513 static int
1514 kcage_invalidate_page(page_t *pp, pgcnt_t *nfreedp)
1515 {
1516 	int result;
1517 
1518 #if defined(__sparc)
1519 	extern struct vnode prom_ppages;
1520 	ASSERT(pp->p_vnode != &prom_ppages);
1521 #endif /* __sparc */
1522 
1523 	ASSERT(!PP_ISFREE(pp));
1524 	ASSERT(PAGE_EXCL(pp));
1525 
1526 	/*
1527 	 * Is this page involved in some I/O? shared?
1528 	 * The page_struct_lock need not be acquired to
1529 	 * examine these fields since the page has an
1530 	 * "exclusive" lock.
1531 	 */
1532 	if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1533 		result = kcage_relocate_page(pp, nfreedp);
1534 #ifdef KCAGE_STATS
1535 		if (result == 0)
1536 			KCAGE_STAT_INCR_SCAN(kip_reloclocked);
1537 		else if (result == ENOMEM)
1538 			KCAGE_STAT_INCR_SCAN(kip_nomem);
1539 #endif
1540 		return (result);
1541 	}
1542 
1543 	ASSERT(pp->p_vnode->v_type != VCHR);
1544 
1545 	/*
1546 	 * Unload the mappings and check if mod bit is set.
1547 	 */
1548 	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1549 
1550 	if (hat_ismod(pp)) {
1551 		result = kcage_relocate_page(pp, nfreedp);
1552 #ifdef KCAGE_STATS
1553 		if (result == 0)
1554 			KCAGE_STAT_INCR_SCAN(kip_relocmod);
1555 		else if (result == ENOMEM)
1556 			KCAGE_STAT_INCR_SCAN(kip_nomem);
1557 #endif
1558 		return (result);
1559 	}
1560 
1561 	if (!page_try_demote_pages(pp)) {
1562 		KCAGE_STAT_INCR_SCAN(kip_demotefailed);
1563 		page_unlock(pp);
1564 		return (EAGAIN);
1565 	}
1566 
1567 	page_destroy(pp, 0);
1568 	KCAGE_STAT_INCR_SCAN(kip_destroy);
1569 	*nfreedp = 1;
1570 	return (0);
1571 }
1572 
1573 static void
1574 kcage_cageout()
1575 {
1576 	pfn_t pfn;
1577 	page_t *pp;
1578 	callb_cpr_t cprinfo;
1579 	int did_something;
1580 	int scan_again;
1581 	pfn_t start_pfn;
1582 	int pass;
1583 	int last_pass;
1584 	int pages_skipped;
1585 	int shared_skipped;
1586 	uint_t shared_level = 8;
1587 	pgcnt_t nfreed;
1588 #ifdef KCAGE_STATS
1589 	clock_t scan_start;
1590 #endif
1591 
1592 	CALLB_CPR_INIT(&cprinfo, &kcage_cageout_mutex,
1593 		callb_generic_cpr, "cageout");
1594 
1595 	mutex_enter(&kcage_cageout_mutex);
1596 	kcage_cageout_thread = curthread;
1597 
1598 	pfn = PFN_INVALID;		/* force scan reset */
1599 	start_pfn = PFN_INVALID;	/* force init with 1st cage pfn */
1600 	kcage_cageout_ready = 1;	/* switch kcage_cageout_wakeup mode */
1601 
1602 loop:
1603 	/*
1604 	 * Wait here. Sooner or later, kcage_freemem_sub() will notice
1605 	 * that kcage_freemem is less than kcage_desfree. When it does
1606 	 * notice, kcage_freemem_sub() will wake us up via call to
1607 	 * kcage_cageout_wakeup().
1608 	 */
1609 	CALLB_CPR_SAFE_BEGIN(&cprinfo);
1610 	cv_wait(&kcage_cageout_cv, &kcage_cageout_mutex);
1611 	CALLB_CPR_SAFE_END(&cprinfo, &kcage_cageout_mutex);
1612 
1613 	KCAGE_STAT_INCR(kt_wakeups);
1614 	KCAGE_STAT_SET_SCAN(kt_freemem_start, freemem);
1615 	KCAGE_STAT_SET_SCAN(kt_kcage_freemem_start, kcage_freemem);
1616 	pass = 0;
1617 	last_pass = 0;
1618 
1619 #ifdef KCAGE_STATS
1620 	scan_start = lbolt;
1621 #endif
1622 
1623 again:
1624 	if (!kcage_on)
1625 		goto loop;
1626 
1627 	KCAGE_STAT_INCR(kt_scans);
1628 	KCAGE_STAT_INCR_SCAN(kt_passes);
1629 
1630 	did_something = 0;
1631 	pages_skipped = 0;
1632 	shared_skipped = 0;
1633 	while ((kcage_freemem < kcage_lotsfree || kcage_needfree) &&
1634 		(pfn = kcage_walk_cage(pfn == PFN_INVALID)) != PFN_INVALID) {
1635 
1636 		if (start_pfn == PFN_INVALID)
1637 			start_pfn = pfn;
1638 		else if (start_pfn == pfn) {
1639 			last_pass = pass;
1640 			pass += 1;
1641 			/*
1642 			 * Did a complete walk of kernel cage, but didn't free
1643 			 * any pages.  If only one cpu is online then
1644 			 * stop kernel cage walk and try expanding.
1645 			 */
1646 			if (ncpus_online == 1 && did_something == 0) {
1647 				KCAGE_STAT_INCR(kt_cageout_break);
1648 				break;
1649 			}
1650 		}
1651 
1652 		pp = page_numtopp_nolock(pfn);
1653 		if (pp == NULL) {
1654 			continue;
1655 		}
1656 
1657 		KCAGE_STAT_INCR_SCAN(kt_examined);
1658 
1659 		/*
1660 		 * Do a quick PP_ISNORELOC() and PP_ISFREE test outside
1661 		 * of the lock. If one is missed it will be seen next
1662 		 * time through.
1663 		 *
1664 		 * Skip non-caged pages. These pages can exist in the cage
1665 		 * because, if a long-term locked page is encountered
1666 		 * during cage expansion, the lock prevents the expansion
1667 		 * logic from setting the P_NORELOC flag. Hence, non-caged
1668 		 * pages can end up surrounded by caged pages.
1669 		 */
1670 		if (!PP_ISNORELOC(pp)) {
1671 			switch (kcage_assimilate_page(pp, &nfreed)) {
1672 				case 0:
1673 					did_something = 1;
1674 					KCAGE_STAT_NINCR_SCAN(kt_gotonefree,
1675 					    nfreed);
1676 					break;
1677 
1678 				case EBUSY:
1679 				case ERANGE:
1680 					did_something = 1;
1681 					KCAGE_STAT_INCR_SCAN(kt_gotone);
1682 					break;
1683 
1684 				case EAGAIN:
1685 				case ENOMEM:
1686 					break;
1687 
1688 				default:
1689 					/* catch this with debug kernels */
1690 					ASSERT(0);
1691 					break;
1692 			}
1693 
1694 			continue;
1695 		} else {
1696 			int prm;
1697 
1698 			if (PP_ISFREE(pp)) {
1699 				continue;
1700 			}
1701 
1702 			if ((pp->p_vnode == &kvp && pp->p_lckcnt > 0) ||
1703 			    !page_trylock(pp, SE_EXCL)) {
1704 				KCAGE_STAT_INCR_SCAN(kt_cantlock);
1705 				continue;
1706 			}
1707 
1708 			/* P_NORELOC bit should not have gone away. */
1709 			ASSERT(PP_ISNORELOC(pp));
1710 			if (PP_ISFREE(pp) || (pp->p_vnode == &kvp &&
1711 			    pp->p_lckcnt > 0)) {
1712 				page_unlock(pp);
1713 				continue;
1714 			}
1715 
1716 			KCAGE_STAT_SET_SCAN(kt_skiplevel, shared_level);
1717 			if (hat_page_getshare(pp) > shared_level) {
1718 				page_unlock(pp);
1719 				pages_skipped = 1;
1720 				shared_skipped = 1;
1721 				KCAGE_STAT_INCR_SCAN(kt_skipshared);
1722 				continue;
1723 			}
1724 
1725 			/*
1726 			 * In pass {0, 1}, skip page if ref bit is set.
1727 			 * In pass {0, 1, 2}, skip page if mod bit is set.
1728 			 */
1729 			prm = hat_pagesync(pp,
1730 				HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD);
1731 
1732 			/* On first pass ignore ref'd pages */
1733 			if (pass <= 1 && (prm & P_REF)) {
1734 				KCAGE_STAT_INCR_SCAN(kt_skiprefd);
1735 				pages_skipped = 1;
1736 				page_unlock(pp);
1737 				continue;
1738 			}
1739 
1740 			/* On pass 2, page_destroy if mod bit is not set */
1741 			if (pass <= 2) {
1742 				if (pp->p_szc != 0 || (prm & P_MOD) ||
1743 					pp->p_lckcnt || pp->p_cowcnt) {
1744 					pages_skipped = 1;
1745 					page_unlock(pp);
1746 				} else {
1747 
1748 					/*
1749 					 * unload the mappings before
1750 					 * checking if mod bit is set
1751 					 */
1752 					(void) hat_pageunload(pp,
1753 						HAT_FORCE_PGUNLOAD);
1754 
1755 					/*
1756 					 * skip this page if modified
1757 					 */
1758 					if (hat_ismod(pp)) {
1759 						pages_skipped = 1;
1760 						page_unlock(pp);
1761 						continue;
1762 					}
1763 
1764 					KCAGE_STAT_INCR_SCAN(kt_destroy);
1765 					page_destroy(pp, 0);
1766 					did_something = 1;
1767 				}
1768 				continue;
1769 			}
1770 
1771 			if (kcage_invalidate_page(pp, &nfreed) == 0) {
1772 				did_something = 1;
1773 				KCAGE_STAT_NINCR_SCAN(kt_gotonefree, nfreed);
1774 			}
1775 
1776 			/*
1777 			 * No need to drop the page lock here.
1778 			 * Kcage_invalidate_page has done that for us
1779 			 * either explicitly or through a page_free.
1780 			 */
1781 		}
1782 	}
1783 
1784 	/*
1785 	 * Expand the cage only if available cage memory is really low.
1786 	 * This test is done only after a complete scan of the cage.
1787 	 * The reason for not checking and expanding more often is to
1788 	 * avoid rapid expansion of the cage. Naturally, scanning the
1789 	 * cage takes time. So by scanning first, we use that work as a
1790 	 * delay loop in between expand decisions.
1791 	 */
1792 
1793 	scan_again = 0;
1794 	if (kcage_freemem < kcage_minfree || kcage_needfree) {
1795 		/*
1796 		 * Kcage_expand() will return a non-zero value if it was
1797 		 * able to expand the cage -- whether or not the new
1798 		 * pages are free and immediately usable. If non-zero,
1799 		 * we do another scan of the cage. The pages might be
1800 		 * freed during that scan or by time we get back here.
1801 		 * If not, we will attempt another expansion.
1802 		 * However, if kcage_expand() returns zero, then it was
1803 		 * unable to expand the cage. This is the case when the
1804 		 * growth list is exhausted, therefore no work was done
1805 		 * and there is no reason to scan the cage again.
1806 		 * Note: Kernel cage scan is not repeated on single-cpu
1807 		 * system to avoid kernel cage thread hogging cpu.
1808 		 */
1809 		if (pass <= 3 && pages_skipped && ncpus_online > 1)
1810 			scan_again = 1;
1811 		else
1812 			(void) kcage_expand(); /* don't scan again */
1813 	} else if (kcage_freemem < kcage_lotsfree) {
1814 		/*
1815 		 * If available cage memory is less than abundant
1816 		 * and a full scan of the cage has not yet been completed,
1817 		 * or a scan has completed and some work was performed,
1818 		 * or pages were skipped because of sharing,
1819 		 * or we simply have not yet completed two passes,
1820 		 * then do another scan.
1821 		 */
1822 		if (pass <= 2 && pages_skipped)
1823 			scan_again = 1;
1824 		if (pass == last_pass || did_something)
1825 			scan_again = 1;
1826 		else if (shared_skipped && shared_level < (8<<24)) {
1827 			shared_level <<= 1;
1828 			scan_again = 1;
1829 		}
1830 	}
1831 
1832 	if (scan_again && ncpus_online > 1)
1833 		goto again;
1834 	else {
1835 		if (shared_level > 8)
1836 			shared_level >>= 1;
1837 
1838 		KCAGE_STAT_SET_SCAN(kt_freemem_end, freemem);
1839 		KCAGE_STAT_SET_SCAN(kt_kcage_freemem_end, kcage_freemem);
1840 		KCAGE_STAT_SET_SCAN(kt_ticks, lbolt - scan_start);
1841 		KCAGE_STAT_INC_SCAN_INDEX;
1842 		goto loop;
1843 	}
1844 
1845 	/*NOTREACHED*/
1846 }
1847 
1848 void
1849 kcage_cageout_wakeup()
1850 {
1851 	if (mutex_tryenter(&kcage_cageout_mutex)) {
1852 		if (kcage_cageout_ready) {
1853 			cv_signal(&kcage_cageout_cv);
1854 		} else if (kcage_freemem < kcage_minfree || kcage_needfree) {
1855 			/*
1856 			 * Available cage memory is really low. Time to
1857 			 * start expanding the cage. However, the
1858 			 * kernel cage thread is not yet ready to
1859 			 * do the work. Use *this* thread, which is
1860 			 * most likely to be t0, to do the work.
1861 			 */
1862 			KCAGE_STAT_INCR(kcw_expandearly);
1863 			(void) kcage_expand();
1864 			KCAGE_STAT_INC_SCAN_INDEX;
1865 		}
1866 
1867 		mutex_exit(&kcage_cageout_mutex);
1868 	}
1869 	/* else, kernel cage thread is already running */
1870 }
1871 
1872 void
1873 kcage_tick()
1874 {
1875 	/*
1876 	 * Once per second we wake up all the threads throttled
1877 	 * waiting for cage memory, in case we've become stuck
1878 	 * and haven't made forward progress expanding the cage.
1879 	 */
1880 	if (kcage_on && kcage_cageout_ready)
1881 		cv_broadcast(&kcage_throttle_cv);
1882 }
1883