xref: /titanic_51/usr/src/uts/common/os/mem_cage.c (revision 9126c5e5838606c81bd43b70bc80e5c85d879340)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/param.h>
31 #include <sys/thread.h>
32 #include <sys/proc.h>
33 #include <sys/callb.h>
34 #include <sys/vnode.h>
35 #include <sys/debug.h>
36 #include <sys/systm.h>		/* for bzero */
37 #include <sys/memlist.h>
38 #include <sys/cmn_err.h>
39 #include <sys/sysmacros.h>
40 #include <sys/vmsystm.h>	/* for NOMEMWAIT() */
41 #include <sys/atomic.h>		/* used to update kcage_freemem */
42 #include <sys/kmem.h>		/* for kmem_reap */
43 #include <sys/errno.h>
44 #include <sys/mem_cage.h>
45 #include <vm/seg_kmem.h>
46 #include <vm/page.h>
47 #include <vm/hat.h>
48 #include <vm/vm_dep.h>
49 #include <sys/mem_config.h>
50 #include <sys/lgrp.h>
51 
52 extern pri_t maxclsyspri;
53 
54 #ifdef DEBUG
55 #define	KCAGE_STATS
56 #endif
57 
58 #ifdef KCAGE_STATS
59 
60 #define	KCAGE_STATS_VERSION 9	/* can help report generators */
61 #define	KCAGE_STATS_NSCANS 256	/* depth of scan statistics buffer */
62 
/*
 * Statistics gathered for one scan. One record occupies each slot of the
 * kcage_stats.scans[] ring; KCAGE_STAT_INC_SCAN_INDEX stamps the current
 * slot, advances scan_index and clears the slot it advances to.
 */
struct kcage_stats_scan {
	/* managed by KCAGE_STAT_* macros */
	clock_t	scan_lbolt;	/* lbolt when the slot was stamped */
	uint_t	scan_id;	/* scan_index when the slot was stamped */

	/* set in kcage_cageout() */
	uint_t	kt_passes;
	clock_t	kt_ticks;
	pgcnt_t	kt_kcage_freemem_start;
	pgcnt_t	kt_kcage_freemem_end;
	pgcnt_t kt_freemem_start;
	pgcnt_t kt_freemem_end;
	uint_t	kt_examined;
	uint_t	kt_cantlock;
	uint_t	kt_gotone;
	uint_t	kt_gotonefree;
	uint_t	kt_skiplevel;
	uint_t	kt_skipshared;
	uint_t	kt_skiprefd;
	uint_t	kt_destroy;

	/* set in kcage_invalidate_page() */
	uint_t	kip_reloclocked;
	uint_t	kip_relocmod;
	uint_t	kip_destroy;
	uint_t	kip_nomem;
	uint_t	kip_demotefailed;

	/* set in kcage_expand() */
	uint_t	ke_wanted;
	uint_t	ke_examined;
	uint_t	ke_lefthole;
	uint_t	ke_gotone;
	uint_t	ke_gotonefree;
};
98 
/*
 * Cage-wide statistics. Holds the global counters plus a ring of
 * KCAGE_STATS_NSCANS per-scan records indexed by scan_index.
 */
struct kcage_stats {
	/* managed by KCAGE_STAT_* macros */
	uint_t	version;	/* KCAGE_STATS_VERSION */
	uint_t	size;		/* sizeof (kcage_stats) */

	/* set in kcage_cageout */
	uint_t	kt_wakeups;
	uint_t	kt_scans;
	uint_t	kt_cageout_break;

	/* set in kcage_expand */
	uint_t	ke_calls;
	uint_t	ke_nopfn;
	uint_t	ke_nopaget;
	uint_t	ke_isnoreloc;
	uint_t	ke_deleting;
	uint_t	ke_lowfreemem;
	uint_t	ke_terminate;

	/* set in kcage_freemem_add() */
	uint_t	kfa_trottlewake;	/* (sic) throttle cv broadcasts */

	/* set in kcage_freemem_sub() */
	uint_t	kfs_cagewake;

	/* set in kcage_create_throttle */
	uint_t	kct_calls;
	uint_t	kct_cageout;
	uint_t	kct_critical;
	uint_t	kct_exempt;
	uint_t	kct_cagewake;
	uint_t	kct_wait;
	uint_t	kct_progress;
	uint_t	kct_noprogress;
	uint_t	kct_timeout;

	/* set in kcage_cageout_wakeup */
	uint_t	kcw_expandearly;

	/* managed by KCAGE_STAT_* macros */
	uint_t	scan_array_size;	/* KCAGE_STATS_NSCANS */
	uint_t	scan_index;		/* slot currently being filled */
	struct kcage_stats_scan scans[KCAGE_STATS_NSCANS];
};

/* The single statistics instance updated by the KCAGE_STAT_* macros. */
static struct kcage_stats kcage_stats;
/* All-zero record copied over a ring slot to clear it. */
static struct kcage_stats_scan kcage_stats_scan_zero;
146 
/*
 * Statistics update macros (KCAGE_STATS enabled).
 *
 * No real need for atomics here. For the most part the incs and sets are
 * done by the kernel cage thread. There are a few that are done by any
 * number of other threads. Those cases are noted by comments.
 *
 * Every macro expands to exactly one statement (minus the trailing
 * semicolon supplied at the use site), so each is safe as the sole body
 * of an unbraced if/else: the single-expression macros are parenthesized
 * and the conditional and multi-statement ones are wrapped in
 * do { } while (0).
 *
 * The *_SCAN variants operate on the ring slot selected by
 * kcage_stats.scan_index.
 */

/* Add one to counter m. */
#define	KCAGE_STAT_INCR(m)	(kcage_stats.m++)

/* Add v to counter m. */
#define	KCAGE_STAT_NINCR(m, v)	(kcage_stats.m += (v))

#define	KCAGE_STAT_INCR_SCAN(m)	\
	KCAGE_STAT_INCR(scans[kcage_stats.scan_index].m)

#define	KCAGE_STAT_NINCR_SCAN(m, v) \
	KCAGE_STAT_NINCR(scans[kcage_stats.scan_index].m, v)

/* Store v in m unconditionally. */
#define	KCAGE_STAT_SET(m, v)	(kcage_stats.m = (v))

/* Store v in m only if m is still zero (first value wins). */
#define	KCAGE_STAT_SETZ(m, v)	\
	do { \
		if (kcage_stats.m == 0) \
			kcage_stats.m = (v); \
	} while (0)

#define	KCAGE_STAT_SET_SCAN(m, v)	\
	KCAGE_STAT_SET(scans[kcage_stats.scan_index].m, v)

#define	KCAGE_STAT_SETZ_SCAN(m, v)	\
	KCAGE_STAT_SETZ(scans[kcage_stats.scan_index].m, v)

/*
 * Close out the current scan slot (stamp it with lbolt and its own
 * index), step scan_index to the next slot modulo KCAGE_STATS_NSCANS,
 * and zero that slot ready for the next scan.
 */
#define	KCAGE_STAT_INC_SCAN_INDEX \
	do { \
		KCAGE_STAT_SET_SCAN(scan_lbolt, lbolt); \
		KCAGE_STAT_SET_SCAN(scan_id, kcage_stats.scan_index); \
		kcage_stats.scan_index = \
		    (kcage_stats.scan_index + 1) % KCAGE_STATS_NSCANS; \
		kcage_stats.scans[kcage_stats.scan_index] = \
		    kcage_stats_scan_zero; \
	} while (0)

/* Record the layout description fields and start at ring slot 0. */
#define	KCAGE_STAT_INIT_SCAN_INDEX \
	do { \
		kcage_stats.version = KCAGE_STATS_VERSION; \
		kcage_stats.size = sizeof (kcage_stats); \
		kcage_stats.scan_array_size = KCAGE_STATS_NSCANS; \
		kcage_stats.scan_index = 0; \
	} while (0)
185 
186 #else /* KCAGE_STATS */
187 
/*
 * KCAGE_STATS disabled: every statistics macro expands to nothing, so a
 * use site is left with just its trailing semicolon (an empty statement).
 */
#define	KCAGE_STAT_INCR(v)
#define	KCAGE_STAT_NINCR(m, v)
#define	KCAGE_STAT_INCR_SCAN(v)
#define	KCAGE_STAT_NINCR_SCAN(m, v)
#define	KCAGE_STAT_SET(m, v)
#define	KCAGE_STAT_SETZ(m, v)
#define	KCAGE_STAT_SET_SCAN(m, v)
#define	KCAGE_STAT_SETZ_SCAN(m, v)
#define	KCAGE_STAT_INC_SCAN_INDEX
#define	KCAGE_STAT_INIT_SCAN_INDEX
198 
199 #endif /* KCAGE_STATS */
200 
/* Threads throttled in kcage_create_throttle() sleep on this cv. */
static kmutex_t kcage_throttle_mutex;	/* protects kcage_throttle_cv */
static kcondvar_t kcage_throttle_cv;

static kmutex_t kcage_cageout_mutex;	/* protects cv and ready flag */
static kcondvar_t kcage_cageout_cv;	/* cageout thread naps here */
static int kcage_cageout_ready;		/* nonzero when cageout thread ready */
kthread_id_t kcage_cageout_thread;	/* to aid debugging */

static kmutex_t kcage_range_mutex;	/* protects kcage_glist elements */
210 
/*
 * Cage expansion happens within a range.
 *
 * Each element describes the half-open pfn range [base, lim) and the
 * position the cage has grown to inside it. A range that grows upward
 * (decr == 0) has handed out [base, curr); one that grows downward
 * (decr != 0) has handed out [curr, lim).
 */
struct kcage_glist {
	struct kcage_glist	*next;	/* next range on the list */
	pfn_t			base;	/* first pfn in the range */
	pfn_t			lim;	/* one past the last pfn */
	pfn_t			curr;	/* growth cursor, see above */
	int			decr;	/* nonzero: grow from lim downward */
};
221 
222 static struct kcage_glist *kcage_glist;
223 static struct kcage_glist *kcage_current_glist;
224 
225 /*
226  * The firstfree element is provided so that kmem_alloc can be avoided
227  * until that cage has somewhere to go. This is not currently a problem
228  * as early kmem_alloc's use BOP_ALLOC instead of page_create_va.
229  */
230 static struct kcage_glist kcage_glist_firstfree;
231 static struct kcage_glist *kcage_glist_freelist = &kcage_glist_firstfree;
232 
233 /*
234  * Miscellaneous forward references
235  */
236 static struct kcage_glist *kcage_glist_alloc(void);
237 static int kcage_glist_delete(pfn_t, pfn_t, struct kcage_glist **);
238 static void kcage_cageout(void);
239 static int kcage_invalidate_page(page_t *, pgcnt_t *);
240 static int kcage_setnoreloc_pages(page_t *, se_t);
241 
242 /*
243  * Kernel Memory Cage counters and thresholds.
244  */
245 int kcage_on = 0;
246 pgcnt_t kcage_freemem;
247 pgcnt_t kcage_needfree;
248 pgcnt_t kcage_lotsfree;
249 pgcnt_t kcage_desfree;
250 pgcnt_t kcage_minfree;
251 pgcnt_t kcage_throttlefree;
252 pgcnt_t	kcage_reserve;
253 int kcage_maxwait = 10;	/* in seconds */
254 
255 /* when we use lp for kmem we start the cage at a higher initial value */
256 pgcnt_t kcage_kmemlp_mincage;
257 
258 #ifdef DEBUG
259 pgcnt_t	kcage_pagets;
260 #define	KCAGEPAGETS_INC()	kcage_pagets++
261 #else
262 #define	KCAGEPAGETS_INC()
263 #endif
264 
/*
 * Startup and Dynamic Reconfiguration interfaces.
 * kcage_range_trylock()
 * kcage_range_lock()
 * kcage_range_unlock()
 * kcage_range_islocked()
 * kcage_range_init()
 * kcage_range_add()
 * kcage_range_delete()
 * kcage_range_delete_post_mem_del()
 * kcage_init()
 * kcage_recalc_thresholds()
 */
275 
276 int
277 kcage_range_trylock(void)
278 {
279 	return (mutex_tryenter(&kcage_range_mutex));
280 }
281 
/*
 * Take the cage range lock (kcage_range_mutex). The range add, delete
 * and pfn allocation routines in this file assert that it is held.
 */
void
kcage_range_lock(void)
{
	mutex_enter(&kcage_range_mutex);
}
287 
/*
 * Drop the cage range lock taken by kcage_range_lock() or a successful
 * kcage_range_trylock().
 */
void
kcage_range_unlock(void)
{
	mutex_exit(&kcage_range_mutex);
}
293 
294 int
295 kcage_range_islocked(void)
296 {
297 	return (MUTEX_HELD(&kcage_range_mutex));
298 }
299 
300 /*
301  * Called from page_get_contig_pages to get the approximate kcage pfn range
302  * for exclusion from search for contiguous pages. This routine is called
303  * without kcage_range lock (kcage routines can call page_get_contig_pages
304  * through page_relocate) and with the assumption, based on kcage_range_add,
305  * that kcage_current_glist always contain a valid pointer.
306  */
307 
308 int
309 kcage_current_pfn(pfn_t *pfncur)
310 {
311 	struct kcage_glist *lp = kcage_current_glist;
312 
313 	ASSERT(kcage_on);
314 
315 	ASSERT(lp != NULL);
316 
317 	*pfncur = lp->curr;
318 
319 	return (lp->decr);
320 }
321 
322 int
323 kcage_range_init(struct memlist *ml, int decr)
324 {
325 	int ret = 0;
326 
327 	ASSERT(kcage_range_islocked());
328 
329 	if (decr) {
330 		while (ml->next != NULL)
331 			ml = ml->next;
332 	}
333 
334 	while (ml != NULL) {
335 		ret = kcage_range_add(btop(ml->address), btop(ml->size), decr);
336 		if (ret)
337 			break;
338 
339 		ml = (decr ? ml->prev : ml->next);
340 	}
341 
342 	return (ret);
343 }
344 
345 /*
346  * Third arg controls direction of growth: 0: increasing pfns,
347  * 1: decreasing.
348  * Calls to add and delete must be protected by calls to
349  * kcage_range_lock() and kcage_range_unlock().
350  */
351 int
352 kcage_range_add(pfn_t base, pgcnt_t npgs, int decr)
353 {
354 	struct kcage_glist *new, **lpp;
355 	pfn_t lim;
356 
357 	ASSERT(kcage_range_islocked());
358 
359 	ASSERT(npgs != 0);
360 	if (npgs == 0)
361 		return (EINVAL);
362 
363 	lim = base + npgs;
364 
365 	ASSERT(lim > base);
366 	if (lim <= base)
367 		return (EINVAL);
368 
369 	new = kcage_glist_alloc();
370 	if (new == NULL) {
371 		return (ENOMEM);
372 	}
373 
374 	new->base = base;
375 	new->lim = lim;
376 	new->decr = decr;
377 	if (new->decr != 0)
378 		new->curr = new->lim;
379 	else
380 		new->curr = new->base;
381 	/*
382 	 * Any overlapping existing ranges are removed by deleting
383 	 * from the new list as we search for the tail.
384 	 */
385 	lpp = &kcage_glist;
386 	while (*lpp != NULL) {
387 		int ret;
388 		ret = kcage_glist_delete((*lpp)->base, (*lpp)->lim, &new);
389 		if (ret != 0)
390 			return (ret);
391 		lpp = &(*lpp)->next;
392 	}
393 
394 	*lpp = new;
395 
396 	if (kcage_current_glist == NULL) {
397 		kcage_current_glist = kcage_glist;
398 	}
399 
400 	return (0);
401 }
402 
403 /*
404  * Calls to add and delete must be protected by calls to
405  * kcage_range_lock() and kcage_range_unlock().
406  */
407 int
408 kcage_range_delete(pfn_t base, pgcnt_t npgs)
409 {
410 	struct kcage_glist *lp;
411 	pfn_t lim;
412 
413 	ASSERT(kcage_range_islocked());
414 
415 	ASSERT(npgs != 0);
416 	if (npgs == 0)
417 		return (EINVAL);
418 
419 	lim = base + npgs;
420 
421 	ASSERT(lim > base);
422 	if (lim <= base)
423 		return (EINVAL);
424 
425 	/*
426 	 * Check if the delete is OK first as a number of elements
427 	 * might be involved and it will be difficult to go
428 	 * back and undo (can't just add the range back in).
429 	 */
430 	for (lp = kcage_glist; lp != NULL; lp = lp->next) {
431 		/*
432 		 * If there have been no pages allocated from this
433 		 * element, we don't need to check it.
434 		 */
435 		if ((lp->decr == 0 && lp->curr == lp->base) ||
436 		    (lp->decr != 0 && lp->curr == lp->lim))
437 			continue;
438 		/*
439 		 * If the element does not overlap, its OK.
440 		 */
441 		if (base >= lp->lim || lim <= lp->base)
442 			continue;
443 		/*
444 		 * Overlapping element: Does the range to be deleted
445 		 * overlap the area already used? If so fail.
446 		 */
447 		if (lp->decr == 0 && base < lp->curr && lim >= lp->base) {
448 			return (EBUSY);
449 		}
450 		if (lp->decr != 0 && base < lp->lim && lim >= lp->curr) {
451 			return (EBUSY);
452 		}
453 	}
454 	return (kcage_glist_delete(base, lim, &kcage_glist));
455 }
456 
457 /*
458  * Calls to add and delete must be protected by calls to
459  * kcage_range_lock() and kcage_range_unlock().
460  * This routine gets called after successful Solaris memory
461  * delete operation from DR post memory delete routines.
462  */
463 int
464 kcage_range_delete_post_mem_del(pfn_t base, pgcnt_t npgs)
465 {
466 	pfn_t lim;
467 
468 	ASSERT(kcage_range_islocked());
469 
470 	ASSERT(npgs != 0);
471 	if (npgs == 0)
472 		return (EINVAL);
473 
474 	lim = base + npgs;
475 
476 	ASSERT(lim > base);
477 	if (lim <= base)
478 		return (EINVAL);
479 
480 	return (kcage_glist_delete(base, lim, &kcage_glist));
481 }
482 
483 /*
484  * No locking is required here as the whole operation is covered
485  * by the kcage_range_lock().
486  */
487 static struct kcage_glist *
488 kcage_glist_alloc(void)
489 {
490 	struct kcage_glist *new;
491 
492 	if ((new = kcage_glist_freelist) != NULL) {
493 		kcage_glist_freelist = new->next;
494 		bzero(new, sizeof (*new));
495 	} else {
496 		new = kmem_zalloc(sizeof (struct kcage_glist), KM_NOSLEEP);
497 	}
498 	return (new);
499 }
500 
/*
 * Push lp onto the head of the private free list for reuse by
 * kcage_glist_alloc(). The element is not returned to kmem and its other
 * fields are left as they were.
 */
static void
kcage_glist_free(struct kcage_glist *lp)
{
	lp->next = kcage_glist_freelist;
	kcage_glist_freelist = lp;
}
507 
/*
 * Remove the pfn range [base, lim) from the list whose head pointer is
 * *lpp. Each element that overlaps the range is handled in one of three
 * ways: freed outright when wholly covered, split in two when the range
 * falls strictly inside it, or trimmed at one end otherwise.
 *
 * Returns 0 on success, or ENOMEM if the element needed for a split
 * cannot be allocated; elements visited before the failure stay edited.
 */
static int
kcage_glist_delete(pfn_t base, pfn_t lim, struct kcage_glist **lpp)
{
	/*
	 * prev trails the walk by one element. It starts out as the head
	 * itself, so it is never "the element before" on the first pass.
	 */
	struct kcage_glist *lp, *prev = *lpp;

	while ((lp = *lpp) != NULL) {
		if (lim > lp->base && base < lp->lim) {
			/* The delete range overlaps this element. */
			if (base <= lp->base && lim >= lp->lim) {
				/* Delete whole element. */
				*lpp = lp->next;
				if (lp == kcage_current_glist) {
					/*
					 * This can never happen.
					 * NOTE(review): if it did with lp
					 * at the head, prev == lp and
					 * kcage_current_glist would be left
					 * pointing at the freed element.
					 */
					ASSERT(kcage_current_glist != prev);
					kcage_current_glist = prev;
				}
				kcage_glist_free(lp);
				/* *lpp already names the next element. */
				continue;
			}

			/* Partial delete. */
			if (base > lp->base && lim < lp->lim) {
				struct kcage_glist *new;

				/*
				 * Remove a section from the middle,
				 * need to allocate a new element.
				 */
				new = kcage_glist_alloc();
				if (new == NULL) {
					return (ENOMEM);
				}

				/*
				 * Transfer unused range to new.
				 * Edit lp in place to preserve
				 * kcage_current_glist.
				 */
				new->decr = lp->decr;
				if (new->decr != 0) {
					/* new takes the piece below the hole */
					new->base = lp->base;
					new->lim = base;
					new->curr = base;

					lp->base = lim;
				} else {
					/* new takes the piece above the hole */
					new->base = lim;
					new->lim = lp->lim;
					new->curr = new->base;

					lp->lim = base;
				}

				/* Insert new. */
				new->next = lp->next;
				lp->next = new;
				/* Step onto new so the walk resumes after it. */
				lpp = &lp->next;
			} else {
				/* Delete part of current block. */
				if (base > lp->base) {
					/* Trim the top: keep [lp->base, base). */
					ASSERT(lim >= lp->lim);
					ASSERT(base < lp->lim);
					if (lp->decr != 0 &&
					    lp->curr == lp->lim)
						lp->curr = base;
					lp->lim = base;
				} else {
					/* Trim the bottom: keep [lim, lp->lim). */
					ASSERT(base <= lp->base);
					ASSERT(lim > lp->base);
					if (lp->decr == 0 &&
					    lp->curr == lp->base)
						lp->curr = lim;
					lp->base = lim;
				}
			}
		}
		prev = *lpp;
		lpp = &(*lpp)->next;
	}

	return (0);
}
590 
591 /*
592  * The caller of kcage_get_pfn must hold the kcage_range_lock to make
593  * sure that there are no concurrent calls. The same lock
594  * must be obtained for range add and delete by calling
595  * kcage_range_lock() and kcage_range_unlock().
596  */
597 static pfn_t
598 kcage_get_pfn(void)
599 {
600 	struct kcage_glist *lp;
601 	pfn_t pfn;
602 
603 	ASSERT(kcage_range_islocked());
604 
605 	lp = kcage_current_glist;
606 	while (lp != NULL) {
607 		if (lp->decr != 0) {
608 			if (lp->curr != lp->base) {
609 				pfn = --lp->curr;
610 				return (pfn);
611 			}
612 		} else {
613 			if (lp->curr != lp->lim) {
614 				pfn = lp->curr++;
615 				return (pfn);
616 			}
617 		}
618 
619 		lp = lp->next;
620 		if (lp)
621 			kcage_current_glist = lp;
622 	}
623 
624 	return (PFN_INVALID);
625 }
626 
/*
 * Walk the physical address space of the cage.
 * This routine does not guarantee to return PFNs in the order
 * in which they were allocated to the cage. Instead, it walks
 * each range as it appears on the growth list, returning the PFNs
 * of that range in ascending order.
 *
 * To begin scanning at the lower edge of the cage, reset should be
 * nonzero. To step through the cage, reset should be zero.
 *
 * PFN_INVALID will be returned when the upper end of the cage is
 * reached -- indicating a full scan of the cage has been completed since
 * the previous reset. The walk stops after the range that
 * kcage_current_glist points at. Note that a call made after PFN_INVALID
 * has been returned finds the internal list pointer NULL and so begins a
 * new walk from the head of the list, just as a reset would.
 *
 * It is possible to receive a PFN_INVALID result on reset if a growth
 * list is not installed or if none of the PFNs in the installed list have
 * been allocated to the cage. In other words, there is no cage.
 *
 * Caller need not hold kcage_range_lock while calling this function
 * as the front part of the list is static - pages never come out of
 * the cage.
 *
 * The walk position lives in function-local statics, so there can only
 * be one walk in progress. The caller is expected to only be
 * kcage_cageout().
 */
static pfn_t
kcage_walk_cage(int reset)
{
	static struct kcage_glist *lp = NULL;	/* range being walked */
	static pfn_t pfn;			/* next pfn to return */

	if (reset)
		lp = NULL;
	if (lp == NULL) {
		lp = kcage_glist;
		pfn = PFN_INVALID;
	}
again:
	/* pfn == PFN_INVALID means "position at the start of range lp". */
	if (pfn == PFN_INVALID) {
		if (lp == NULL)
			return (PFN_INVALID);

		if (lp->decr != 0) {
			/*
			 * In this range the cage grows from the highest
			 * address towards the lowest.
			 * Arrange to return pfns from curr to lim-1,
			 * inclusive, in ascending order.
			 */

			pfn = lp->curr;
		} else {
			/*
			 * In this range the cage grows from the lowest
			 * address towards the highest.
			 * Arrange to return pfns from base to curr-1,
			 * inclusive, in ascending order.
			 */

			pfn = lp->base;
		}
	}

	if (lp->decr != 0) {		/* range grows downward */
		if (pfn == lp->lim) {
			/* Don't go beyond the static part of the glist. */
			if (lp == kcage_current_glist)
				lp = NULL;
			else
				lp = lp->next;
			pfn = PFN_INVALID;
			goto again;
		}

		ASSERT(pfn >= lp->curr && pfn < lp->lim);
	} else {			/* range grows upward */
		if (pfn == lp->curr) {
			/* Don't go beyond the static part of the glist. */
			if (lp == kcage_current_glist)
				lp = NULL;
			else
				lp = lp->next;
			pfn = PFN_INVALID;
			goto again;
		}

		ASSERT(pfn >= lp->base && pfn < lp->curr);
	}

	/* Return this pfn and leave the static cursor on the next one. */
	return (pfn++);
}
718 
/*
 * Callback functions to recalculate the cage thresholds after
 * Kphysm memory add/delete operations.
 */

/* Post-add callback: both arguments are ignored. */
/*ARGSUSED*/
static void
kcage_kphysm_postadd_cb(void *arg, pgcnt_t delta_pages)
{
	kcage_recalc_thresholds();
}
729 
/*
 * Pre-delete callback: ignores its arguments and always returns 0, i.e.
 * the cage currently never vetoes a delete here (see the TODO).
 */
/*ARGSUSED*/
static int
kcage_kphysm_predel_cb(void *arg, pgcnt_t delta_pages)
{
	/* TODO: when should cage refuse memory delete requests? */
	return (0);
}
737 
/*
 * Post-delete callback: recalculates the cage thresholds. All arguments,
 * including cancelled, are ignored.
 */
/*ARGSUSED*/
static  void
kcage_kphysm_postdel_cb(void *arg, pgcnt_t delta_pages, int cancelled)
{
	kcage_recalc_thresholds();
}
744 
/*
 * Callback vector handed to kphysm_setup_func_register() by kcage_init():
 * interface version followed by the post-add, pre-delete and post-delete
 * handlers defined above.
 */
static kphysm_setup_vector_t kcage_kphysm_vectors = {
	KPHYSM_SETUP_VECTOR_VERSION,
	kcage_kphysm_postadd_cb,
	kcage_kphysm_predel_cb,
	kcage_kphysm_postdel_cb
};
751 
752 /*
753  * This is called before a CPR suspend and after a CPR resume.  We have to
754  * turn off kcage_cageout_ready before a suspend, and turn it back on after a
755  * restart.
756  */
757 /*ARGSUSED*/
758 static boolean_t
759 kcage_cageout_cpr(void *arg, int code)
760 {
761 	if (code == CB_CODE_CPR_CHKPT) {
762 		ASSERT(kcage_cageout_ready);
763 		kcage_cageout_ready = 0;
764 		return (B_TRUE);
765 	} else if (code == CB_CODE_CPR_RESUME) {
766 		ASSERT(kcage_cageout_ready == 0);
767 		kcage_cageout_ready = 1;
768 		return (B_TRUE);
769 	}
770 	return (B_FALSE);
771 }
772 
773 /*
774  * kcage_recalc_preferred_size() increases initial cage size to improve large
775  * page availability when lp for kmem is enabled and kpr is disabled
776  */
777 static pgcnt_t
778 kcage_recalc_preferred_size(pgcnt_t preferred_size)
779 {
780 	if (SEGKMEM_USE_LARGEPAGES && segkmem_reloc == 0) {
781 		pgcnt_t lpmincage = kcage_kmemlp_mincage;
782 		if (lpmincage == 0) {
783 			lpmincage = MIN(P2ROUNDUP(((physmem * PAGESIZE) / 8),
784 			    segkmem_heaplp_quantum), 0x40000000UL) / PAGESIZE;
785 		}
786 		kcage_kmemlp_mincage = MIN(lpmincage,
787 			    (segkmem_kmemlp_max / PAGESIZE));
788 		preferred_size = MAX(kcage_kmemlp_mincage, preferred_size);
789 	}
790 	return (preferred_size);
791 }
792 
/*
 * Kcage_init() builds the cage and initializes the cage thresholds.
 * The size of the cage is determined by the argument preferred_size
 * or the actual amount of memory, whichever is smaller. It is never
 * less than kcage_minfree unless availrmem is smaller still.
 *
 * The caller must hold the kcage_range lock and the cage must not
 * already be on. If the kphysm callbacks cannot be registered the
 * function returns early and the cage stays off.
 */
void
kcage_init(pgcnt_t preferred_size)
{
	pgcnt_t wanted;		/* pfns still to be pulled from the glist */
	pfn_t pfn;
	page_t *pp;
	extern struct vnode kvp;
	extern void page_list_noreloc_startup(page_t *);

	ASSERT(!kcage_on);
	ASSERT(kcage_range_islocked());

	/* increase preferred cage size for lp for kmem */
	preferred_size = kcage_recalc_preferred_size(preferred_size);

	/* Debug note: initialize this now so early expansions can stat */
	KCAGE_STAT_INIT_SCAN_INDEX;

	/*
	 * Initialize cage thresholds and install kphysm callback.
	 * If we can't arrange to have the thresholds track with
	 * available physical memory, then the cage thresholds may
	 * end up over time at levels that adversely affect system
	 * performance; so, bail out.
	 */
	kcage_recalc_thresholds();
	if (kphysm_setup_func_register(&kcage_kphysm_vectors, NULL)) {
		ASSERT(0);		/* Catch this in DEBUG kernels. */
		return;
	}

	/*
	 * Limit startup cage size within the range of kcage_minfree
	 * and availrmem, inclusively.
	 */
	wanted = MIN(MAX(preferred_size, kcage_minfree), availrmem);

	/*
	 * Construct the cage. PFNs are allocated from the glist. It
	 * is assumed that the list has been properly ordered for the
	 * platform by the platform code. Typically, this is as simple
	 * as calling kcage_range_init(phys_avail, decr), where decr is
	 * 1 if the kernel has been loaded into upper end of physical
	 * memory, or 0 if the kernel has been loaded at the low end.
	 *
	 * Note: it is assumed that we are in the startup flow, so there
	 * is no reason to grab the page lock.
	 */
	kcage_freemem = 0;
	pfn = PFN_INVALID;			/* prime for alignment test */
	while (wanted != 0) {
		/* Stop early if the growth list runs dry. */
		if ((pfn = kcage_get_pfn()) == PFN_INVALID)
			break;

		if ((pp = page_numtopp_nolock(pfn)) != NULL) {
			KCAGEPAGETS_INC();
			/*
			 * Set the noreloc state on the page.
			 * If the page is free and not already
			 * on the noreloc list then move it.
			 */
			if (PP_ISFREE(pp)) {
				if (PP_ISNORELOC(pp) == 0)
					page_list_noreloc_startup(pp);
			} else {
				ASSERT(pp->p_szc == 0);
				PP_SETNORELOC(pp);
			}
		}

		/* A pfn with no page_t still counts against wanted. */
		wanted -= 1;
	}

	/*
	 * Need to go through and find kernel allocated pages
	 * and capture them into the Cage.  These will primarily
	 * be pages gotten through boot_alloc().
	 */
	if (kvp.v_pages) {

		/* v_pages is walked as a circular list via p_vpnext. */
		pp = kvp.v_pages;
		do {
			ASSERT(!PP_ISFREE(pp));
			ASSERT(pp->p_szc == 0);
			PP_SETNORELOC(pp);
		} while ((pp = pp->p_vpnext) != kvp.v_pages);

	}

	kcage_on = 1;

	/*
	 * CB_CL_CPR_POST_KERNEL is the class that executes from cpr_suspend()
	 * after the cageout thread is blocked, and executes from cpr_resume()
	 * before the cageout thread is restarted.  By executing in this class,
	 * we are assured that the kernel cage thread won't miss wakeup calls
	 * and also CPR's larger kmem_alloc requests will not fail after
	 * CPR shuts down the cageout kernel thread.
	 */
	(void) callb_add(kcage_cageout_cpr, NULL, CB_CL_CPR_POST_KERNEL,
	    "cageout");

	/*
	 * Coalesce pages to improve large page availability. A better fix
	 * would be to coalesce pages as they are included in the cage.
	 */
	if (SEGKMEM_USE_LARGEPAGES) {
		extern void page_freelist_coalesce_all(int mnode);
		extern int max_mem_nodes;
		int mnode, max_mnodes = max_mem_nodes;
		for (mnode = 0; mnode < max_mnodes; mnode++) {
			page_freelist_coalesce_all(mnode);
		}
	}
}
913 
914 void
915 kcage_recalc_thresholds()
916 {
917 	static int first = 1;
918 	static pgcnt_t init_lotsfree;
919 	static pgcnt_t init_desfree;
920 	static pgcnt_t init_minfree;
921 	static pgcnt_t init_throttlefree;
922 	static pgcnt_t init_reserve;
923 
924 	/* TODO: any reason to take more care than this with live editing? */
925 	mutex_enter(&kcage_cageout_mutex);
926 	mutex_enter(&freemem_lock);
927 
928 	if (first) {
929 		first = 0;
930 		init_lotsfree = kcage_lotsfree;
931 		init_desfree = kcage_desfree;
932 		init_minfree = kcage_minfree;
933 		init_throttlefree = kcage_throttlefree;
934 		init_reserve = kcage_reserve;
935 	} else {
936 		kcage_lotsfree = init_lotsfree;
937 		kcage_desfree = init_desfree;
938 		kcage_minfree = init_minfree;
939 		kcage_throttlefree = init_throttlefree;
940 		kcage_reserve = init_reserve;
941 	}
942 
943 	if (kcage_lotsfree == 0)
944 		kcage_lotsfree = MAX(32, total_pages / 256);
945 
946 	if (kcage_minfree == 0)
947 		kcage_minfree = MAX(32, kcage_lotsfree / 2);
948 
949 	if (kcage_desfree == 0)
950 		kcage_desfree = MAX(32, kcage_minfree);
951 
952 	if (kcage_throttlefree == 0)
953 		kcage_throttlefree = MAX(32, kcage_minfree / 2);
954 
955 	if (kcage_reserve == 0)
956 		kcage_reserve = MIN(32, kcage_throttlefree / 2);
957 
958 	mutex_exit(&freemem_lock);
959 	mutex_exit(&kcage_cageout_mutex);
960 
961 	if (kcage_cageout_ready) {
962 		if (kcage_freemem < kcage_desfree)
963 			kcage_cageout_wakeup();
964 
965 		if (kcage_needfree) {
966 			mutex_enter(&kcage_throttle_mutex);
967 			cv_broadcast(&kcage_throttle_cv);
968 			mutex_exit(&kcage_throttle_mutex);
969 		}
970 	}
971 }
972 
973 /*
974  * Pageout interface:
975  * kcage_cageout_init()
976  */
977 void
978 kcage_cageout_init()
979 {
980 	if (kcage_on) {
981 
982 		(void) thread_create(NULL, 0, kcage_cageout,
983 		    NULL, 0, proc_pageout, TS_RUN, maxclsyspri - 1);
984 	}
985 }
986 
987 
988 /*
989  * VM Interfaces:
990  * kcage_create_throttle()
991  * kcage_freemem_add()
992  * kcage_freemem_sub()
993  */
994 
/*
 * Wakeup cageout thread and throttle waiting for the number of pages
 * requested to become available.  For requests without PG_WAIT, a
 * give-up limit is applied, since freemem accounting is separate from
 * cage freemem accounting: it's possible for us to get stuck and not make
 * forward progress even though there was sufficient freemem before
 * arriving here.
 *
 * Returns:
 *   KCT_CRIT     caller is exempt from throttling (cageout thread, panic
 *                in progress, NOMEMWAIT() thread under the conditions
 *                below, or high-priority thread with reserve available).
 *   KCT_NONCRIT  caller was subject to throttling and kcage_freemem is
 *                now at least kcage_throttlefree + npages.
 *   KCT_FAILURE  PG_WAIT not set and kcage_maxwait consecutive passes of
 *                the wait loop saw no increase in kcage_freemem.
 */
int
kcage_create_throttle(pgcnt_t npages, int flags)
{
	int niter = 0;		/* consecutive passes without progress */
	pgcnt_t lastfree;
	/* Sampled once on entry; only consulted for NOMEMWAIT() threads. */
	int enough = kcage_freemem > kcage_throttlefree + npages;

	KCAGE_STAT_INCR(kct_calls);		/* unprotected incr. */

	kcage_cageout_wakeup();			/* just to be sure */
	KCAGE_STAT_INCR(kct_cagewake);		/* unprotected incr. */

	/*
	 * Obviously, we can't throttle the cageout thread since
	 * we depend on it.  We also can't throttle the panic thread.
	 */
	if (curthread == kcage_cageout_thread || panicstr) {
		KCAGE_STAT_INCR(kct_cageout);	/* unprotected incr. */
		return (KCT_CRIT);
	}

	/*
	 * Don't throttle threads which are critical for proper
	 * vm management if we're above kcage_throttlefree or
	 * if freemem is very low.
	 */
	if (NOMEMWAIT()) {
		if (enough) {
			KCAGE_STAT_INCR(kct_exempt);	/* unprotected incr. */
			return (KCT_CRIT);
		} else if (freemem < minfree) {
			KCAGE_STAT_INCR(kct_critical);  /* unprotected incr. */
			return (KCT_CRIT);
		}
	}

	/*
	 * Don't throttle real-time threads if kcage_freemem > kcage_reserve.
	 */
	if (DISP_PRIO(curthread) > maxclsyspri &&
	    kcage_freemem > kcage_reserve) {
		KCAGE_STAT_INCR(kct_exempt);	/* unprotected incr. */
		return (KCT_CRIT);
	}

	/*
	 * Cause all other threads (which are assumed to not be
	 * critical to cageout) to wait here until their request
	 * can be satisfied. Be a little paranoid and wake the
	 * kernel cage on each loop through this logic.
	 */
	while (kcage_freemem < kcage_throttlefree + npages) {
		ASSERT(kcage_on);

		/* Baseline for the progress check at the bottom. */
		lastfree = kcage_freemem;

		if (kcage_cageout_ready) {
			mutex_enter(&kcage_throttle_mutex);

			/* Advertise our demand while we sleep. */
			kcage_needfree += npages;
			KCAGE_STAT_INCR(kct_wait);

			kcage_cageout_wakeup();
			KCAGE_STAT_INCR(kct_cagewake);

			/* Untimed wait for a kcage_throttle_cv broadcast. */
			cv_wait(&kcage_throttle_cv, &kcage_throttle_mutex);

			kcage_needfree -= npages;

			mutex_exit(&kcage_throttle_mutex);
		} else {
			/*
			 * NOTE: atomics are used just in case we enter
			 * mp operation before the cageout thread is ready.
			 *
			 * There is no sleep on this path; the demand is
			 * posted only for the duration of the wakeup call.
			 */
			atomic_add_long(&kcage_needfree, npages);

			kcage_cageout_wakeup();
			KCAGE_STAT_INCR(kct_cagewake);	/* unprotected incr. */

			atomic_add_long(&kcage_needfree, -npages);
		}

		/*
		 * Callers that did not pass PG_WAIT give up after
		 * kcage_maxwait passes in a row without kcage_freemem
		 * rising; any progress resets the count.
		 */
		if ((flags & PG_WAIT) == 0) {
			if (kcage_freemem > lastfree) {
				KCAGE_STAT_INCR(kct_progress);
				niter = 0;
			} else {
				KCAGE_STAT_INCR(kct_noprogress);
				if (++niter >= kcage_maxwait) {
					KCAGE_STAT_INCR(kct_timeout);
					return (KCT_FAILURE);
				}
			}
		}
	}
	return (KCT_NONCRIT);
}
1101 
1102 void
1103 kcage_freemem_add(pgcnt_t npages)
1104 {
1105 	extern void wakeup_pcgs(void);
1106 
1107 	atomic_add_long(&kcage_freemem, npages);
1108 
1109 	wakeup_pcgs();  /* wakeup threads in pcgs() */
1110 
1111 	if (kcage_needfree != 0 &&
1112 		kcage_freemem >= (kcage_throttlefree + kcage_needfree)) {
1113 
1114 		mutex_enter(&kcage_throttle_mutex);
1115 		cv_broadcast(&kcage_throttle_cv);
1116 		KCAGE_STAT_INCR(kfa_trottlewake);
1117 		mutex_exit(&kcage_throttle_mutex);
1118 	}
1119 }
1120 
1121 void
1122 kcage_freemem_sub(pgcnt_t npages)
1123 {
1124 	atomic_add_long(&kcage_freemem, -npages);
1125 
1126 	if (kcage_freemem < kcage_desfree) {
1127 		kcage_cageout_wakeup();
1128 		KCAGE_STAT_INCR(kfs_cagewake); /* unprotected incr. */
1129 	}
1130 }
1131 
1132 /*
1133  * return 0 on failure and 1 on success.
1134  */
1135 static int
1136 kcage_setnoreloc_pages(page_t *rootpp, se_t se)
1137 {
1138 	pgcnt_t npgs, i;
1139 	page_t *pp;
1140 	pfn_t rootpfn = page_pptonum(rootpp);
1141 	uint_t szc;
1142 
1143 	ASSERT(!PP_ISFREE(rootpp));
1144 	ASSERT(PAGE_LOCKED_SE(rootpp, se));
1145 	if (!group_page_trylock(rootpp, se)) {
1146 		return (0);
1147 	}
1148 	szc = rootpp->p_szc;
1149 	if (szc == 0) {
1150 		/*
1151 		 * The szc of a locked page can only change for pages that are
1152 		 * non-swapfs (i.e. anonymous memory) file system pages.
1153 		 */
1154 		ASSERT(rootpp->p_vnode != NULL &&
1155 		    rootpp->p_vnode != &kvp &&
1156 		    !IS_SWAPFSVP(rootpp->p_vnode));
1157 		PP_SETNORELOC(rootpp);
1158 		return (1);
1159 	}
1160 	npgs = page_get_pagecnt(szc);
1161 	ASSERT(IS_P2ALIGNED(rootpfn, npgs));
1162 	pp = rootpp;
1163 	for (i = 0; i < npgs; i++, pp++) {
1164 		ASSERT(PAGE_LOCKED_SE(pp, se));
1165 		ASSERT(!PP_ISFREE(pp));
1166 		ASSERT(pp->p_szc == szc);
1167 		PP_SETNORELOC(pp);
1168 	}
1169 	group_page_unlock(rootpp);
1170 	return (1);
1171 }
1172 
1173 /*
1174  * Attempt to convert page to a caged page (set the P_NORELOC flag).
1175  * If successful and pages is free, move page to the tail of whichever
1176  * list it is on.
1177  * Returns:
1178  *   EBUSY  page already locked, assimilated but not free.
1179  *   ENOMEM page assimilated, but memory too low to relocate. Page not free.
1180  *   EAGAIN page not assimilated. Page not free.
1181  *   ERANGE page assimilated. Page not root.
1182  *   0      page assimilated. Page free.
1183  *   *nfreedp number of pages freed.
1184  * NOTE: With error codes ENOMEM, EBUSY, and 0 (zero), there is no way
1185  * to distinguish between a page that was already a NORELOC page from
1186  * those newly converted to NORELOC pages by this invocation of
1187  * kcage_assimilate_page.
1188  */
1189 static int
1190 kcage_assimilate_page(page_t *pp, pgcnt_t *nfreedp)
1191 {
1192 	if (page_trylock(pp, SE_EXCL)) {
1193 		if (PP_ISNORELOC(pp)) {
1194 check_free_and_return:
1195 			if (PP_ISFREE(pp)) {
1196 				page_unlock(pp);
1197 				*nfreedp = 0;
1198 				return (0);
1199 			} else {
1200 				page_unlock(pp);
1201 				return (EBUSY);
1202 			}
1203 			/*NOTREACHED*/
1204 		}
1205 	} else {
1206 		if (page_trylock(pp, SE_SHARED)) {
1207 			if (PP_ISNORELOC(pp))
1208 				goto check_free_and_return;
1209 		} else
1210 			return (EAGAIN);
1211 
1212 		if (!PP_ISFREE(pp)) {
1213 			page_unlock(pp);
1214 			return (EAGAIN);
1215 		}
1216 
1217 		/*
1218 		 * Need to upgrade the lock on it and set the NORELOC
1219 		 * bit. If it is free then remove it from the free
1220 		 * list so that the platform free list code can keep
1221 		 * NORELOC pages where they should be.
1222 		 */
1223 		/*
1224 		 * Before doing anything, get the exclusive lock.
1225 		 * This may fail (eg ISM pages are left shared locked).
1226 		 * If the page is free this will leave a hole in the
1227 		 * cage. There is no solution yet to this.
1228 		 */
1229 		if (!page_tryupgrade(pp)) {
1230 			page_unlock(pp);
1231 			return (EAGAIN);
1232 		}
1233 	}
1234 
1235 	ASSERT(PAGE_EXCL(pp));
1236 
1237 	if (PP_ISFREE(pp)) {
1238 		int which = PP_ISAGED(pp) ? PG_FREE_LIST : PG_CACHE_LIST;
1239 
1240 		page_list_sub(pp, which | PG_LIST_ISCAGE);
1241 		ASSERT(pp->p_szc == 0);
1242 		PP_SETNORELOC(pp);
1243 		page_list_add(pp, which | PG_LIST_TAIL | PG_LIST_ISCAGE);
1244 
1245 		page_unlock(pp);
1246 		*nfreedp = 1;
1247 		return (0);
1248 	} else {
1249 		if (pp->p_szc != 0) {
1250 			if (!kcage_setnoreloc_pages(pp, SE_EXCL)) {
1251 				page_unlock(pp);
1252 				return (EAGAIN);
1253 			}
1254 			ASSERT(PP_ISNORELOC(pp));
1255 		} else {
1256 			PP_SETNORELOC(pp);
1257 		}
1258 		page_list_xfer(pp, MTYPE_NORELOC, MTYPE_RELOC);
1259 		return (kcage_invalidate_page(pp, nfreedp));
1260 	}
1261 	/*NOTREACHED*/
1262 }
1263 
1264 static int
1265 kcage_expand()
1266 {
1267 	int did_something = 0;
1268 
1269 	spgcnt_t wanted;
1270 	pfn_t pfn;
1271 	page_t *pp;
1272 	/* TODO: we don't really need n any more? */
1273 	pgcnt_t n;
1274 	pgcnt_t nf, nfreed;
1275 
1276 	/*
1277 	 * Expand the cage if available cage memory is really low. Calculate
1278 	 * the amount required to return kcage_freemem to the level of
1279 	 * kcage_lotsfree, or to satisfy throttled requests, whichever is
1280 	 * more.  It is rare for their sum to create an artificial threshold
1281 	 * above kcage_lotsfree, but it is possible.
1282 	 *
1283 	 * Exit early if expansion amount is equal to or less than zero.
1284 	 * (<0 is possible if kcage_freemem rises suddenly.)
1285 	 *
1286 	 * Exit early when the global page pool (apparently) does not
1287 	 * have enough free pages to page_relocate() even a single page.
1288 	 */
1289 	wanted = MAX(kcage_lotsfree, kcage_throttlefree + kcage_needfree)
1290 		- kcage_freemem;
1291 	if (wanted <= 0)
1292 		return (0);
1293 	else if (freemem < pageout_reserve + 1) {
1294 		KCAGE_STAT_INCR(ke_lowfreemem);
1295 		return (0);
1296 	}
1297 
1298 	/*
1299 	 * Try to get the range list lock. If the lock is already
1300 	 * held, then don't get stuck here waiting for it.
1301 	 */
1302 	if (!kcage_range_trylock())
1303 		return (0);
1304 
1305 	KCAGE_STAT_INCR(ke_calls);
1306 	KCAGE_STAT_SET_SCAN(ke_wanted, (uint_t)wanted);
1307 
1308 	/*
1309 	 * Assimilate more pages from the global page pool into the cage.
1310 	 */
1311 	n = 0;				/* number of pages PP_SETNORELOC'd */
1312 	nf = 0;				/* number of those actually free */
1313 	while (kcage_on && nf < wanted) {
1314 		pfn = kcage_get_pfn();
1315 		if (pfn == PFN_INVALID) {	/* eek! no where to grow */
1316 			KCAGE_STAT_INCR(ke_nopfn);
1317 			goto terminate;
1318 		}
1319 
1320 		KCAGE_STAT_INCR_SCAN(ke_examined);
1321 
1322 		if ((pp = page_numtopp_nolock(pfn)) == NULL) {
1323 			KCAGE_STAT_INCR(ke_nopaget);
1324 			continue;
1325 		}
1326 		KCAGEPAGETS_INC();
1327 		/*
1328 		 * Sanity check. Skip this pfn if it is
1329 		 * being deleted.
1330 		 */
1331 		if (pfn_is_being_deleted(pfn)) {
1332 			KCAGE_STAT_INCR(ke_deleting);
1333 			continue;
1334 		}
1335 
1336 		/*
1337 		 * NORELOC is only set at boot-time or by this routine
1338 		 * under the kcage_range_mutex lock which is currently
1339 		 * held. This means we can do a fast check here before
1340 		 * locking the page in kcage_assimilate_page.
1341 		 */
1342 		if (PP_ISNORELOC(pp)) {
1343 			KCAGE_STAT_INCR(ke_isnoreloc);
1344 			continue;
1345 		}
1346 
1347 		switch (kcage_assimilate_page(pp, &nfreed)) {
1348 			case 0:		/* assimilated, page is free */
1349 				KCAGE_STAT_NINCR_SCAN(ke_gotonefree, nfreed);
1350 				did_something = 1;
1351 				nf += nfreed;
1352 				n++;
1353 				break;
1354 
1355 			case EBUSY:	/* assimilated, page not free */
1356 			case ERANGE:	/* assimilated, page not root */
1357 				KCAGE_STAT_INCR_SCAN(ke_gotone);
1358 				did_something = 1;
1359 				n++;
1360 				break;
1361 
1362 			case ENOMEM:	/* assimilated, but no mem */
1363 				KCAGE_STAT_INCR(ke_terminate);
1364 				did_something = 1;
1365 				n++;
1366 				goto terminate;
1367 
1368 			case EAGAIN:	/* can't assimilate */
1369 				KCAGE_STAT_INCR_SCAN(ke_lefthole);
1370 				break;
1371 
1372 			default:	/* catch this with debug kernels */
1373 				ASSERT(0);
1374 				break;
1375 		}
1376 	}
1377 
1378 	/*
1379 	 * Realign cage edge with the nearest physical address
1380 	 * boundry for big pages. This is done to give us a
1381 	 * better chance of actually getting usable big pages
1382 	 * in the cage.
1383 	 */
1384 
1385 terminate:
1386 	kcage_range_unlock();
1387 
1388 	return (did_something);
1389 }
1390 
1391 /*
1392  * Relocate page opp (Original Page Pointer) from cage pool to page rpp
1393  * (Replacement Page Pointer) in the global pool. Page opp will be freed
1394  * if relocation is successful, otherwise it is only unlocked.
1395  * On entry, page opp must be exclusively locked and not free.
1396  * *nfreedp: number of pages freed.
1397  */
1398 static int
1399 kcage_relocate_page(page_t *pp, pgcnt_t *nfreedp)
1400 {
1401 	page_t *opp = pp;
1402 	page_t *rpp = NULL;
1403 	spgcnt_t npgs;
1404 	int result;
1405 
1406 	ASSERT(!PP_ISFREE(opp));
1407 	ASSERT(PAGE_EXCL(opp));
1408 
1409 	result = page_relocate(&opp, &rpp, 1, 1, &npgs, NULL);
1410 	*nfreedp = npgs;
1411 	if (result == 0) {
1412 		while (npgs-- > 0) {
1413 			page_t *tpp;
1414 
1415 			ASSERT(rpp != NULL);
1416 			tpp = rpp;
1417 			page_sub(&rpp, tpp);
1418 			page_unlock(tpp);
1419 		}
1420 
1421 		ASSERT(rpp == NULL);
1422 
1423 		return (0);		/* success */
1424 	}
1425 
1426 	page_unlock(opp);
1427 	return (result);
1428 }
1429 
1430 /*
1431  * Based on page_invalidate_pages()
1432  *
1433  * Kcage_invalidate_page() uses page_relocate() twice. Both instances
1434  * of use must be updated to match the new page_relocate() when it
1435  * becomes available.
1436  *
1437  * Return result of kcage_relocate_page or zero if page was directly freed.
1438  * *nfreedp: number of pages freed.
1439  */
1440 static int
1441 kcage_invalidate_page(page_t *pp, pgcnt_t *nfreedp)
1442 {
1443 	int result;
1444 
1445 #if defined(__sparc)
1446 	extern struct vnode prom_ppages;
1447 	ASSERT(pp->p_vnode != &prom_ppages);
1448 #endif /* __sparc */
1449 
1450 	ASSERT(!PP_ISFREE(pp));
1451 	ASSERT(PAGE_EXCL(pp));
1452 
1453 	/*
1454 	 * Is this page involved in some I/O? shared?
1455 	 * The page_struct_lock need not be acquired to
1456 	 * examine these fields since the page has an
1457 	 * "exclusive" lock.
1458 	 */
1459 	if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1460 		result = kcage_relocate_page(pp, nfreedp);
1461 #ifdef KCAGE_STATS
1462 		if (result == 0)
1463 			KCAGE_STAT_INCR_SCAN(kip_reloclocked);
1464 		else if (result == ENOMEM)
1465 			KCAGE_STAT_INCR_SCAN(kip_nomem);
1466 #endif
1467 		return (result);
1468 	}
1469 
1470 	ASSERT(pp->p_vnode->v_type != VCHR);
1471 
1472 	/*
1473 	 * Unload the mappings and check if mod bit is set.
1474 	 */
1475 	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1476 
1477 	if (hat_ismod(pp)) {
1478 		result = kcage_relocate_page(pp, nfreedp);
1479 #ifdef KCAGE_STATS
1480 		if (result == 0)
1481 			KCAGE_STAT_INCR_SCAN(kip_relocmod);
1482 		else if (result == ENOMEM)
1483 			KCAGE_STAT_INCR_SCAN(kip_nomem);
1484 #endif
1485 		return (result);
1486 	}
1487 
1488 	if (!page_try_demote_pages(pp)) {
1489 		KCAGE_STAT_INCR_SCAN(kip_demotefailed);
1490 		page_unlock(pp);
1491 		return (EAGAIN);
1492 	}
1493 
1494 	page_destroy(pp, 0);
1495 	KCAGE_STAT_INCR_SCAN(kip_destroy);
1496 	*nfreedp = 1;
1497 	return (0);
1498 }
1499 
1500 static void
1501 kcage_cageout()
1502 {
1503 	pfn_t pfn;
1504 	page_t *pp;
1505 	callb_cpr_t cprinfo;
1506 	int did_something;
1507 	int scan_again;
1508 	pfn_t start_pfn;
1509 	int pass;
1510 	int last_pass;
1511 	int pages_skipped;
1512 	int shared_skipped;
1513 	uint_t shared_level = 8;
1514 	pgcnt_t nfreed;
1515 #ifdef KCAGE_STATS
1516 	clock_t scan_start;
1517 #endif
1518 
1519 	CALLB_CPR_INIT(&cprinfo, &kcage_cageout_mutex,
1520 		callb_generic_cpr, "cageout");
1521 
1522 	mutex_enter(&kcage_cageout_mutex);
1523 	kcage_cageout_thread = curthread;
1524 
1525 	pfn = PFN_INVALID;		/* force scan reset */
1526 	start_pfn = PFN_INVALID;	/* force init with 1st cage pfn */
1527 	kcage_cageout_ready = 1;	/* switch kcage_cageout_wakeup mode */
1528 
1529 loop:
1530 	/*
1531 	 * Wait here. Sooner or later, kcage_freemem_sub() will notice
1532 	 * that kcage_freemem is less than kcage_desfree. When it does
1533 	 * notice, kcage_freemem_sub() will wake us up via call to
1534 	 * kcage_cageout_wakeup().
1535 	 */
1536 	CALLB_CPR_SAFE_BEGIN(&cprinfo);
1537 	cv_wait(&kcage_cageout_cv, &kcage_cageout_mutex);
1538 	CALLB_CPR_SAFE_END(&cprinfo, &kcage_cageout_mutex);
1539 
1540 	KCAGE_STAT_INCR(kt_wakeups);
1541 	KCAGE_STAT_SET_SCAN(kt_freemem_start, freemem);
1542 	KCAGE_STAT_SET_SCAN(kt_kcage_freemem_start, kcage_freemem);
1543 	pass = 0;
1544 	last_pass = 0;
1545 
1546 #ifdef KCAGE_STATS
1547 	scan_start = lbolt;
1548 #endif
1549 
1550 again:
1551 	if (!kcage_on)
1552 		goto loop;
1553 
1554 	KCAGE_STAT_INCR(kt_scans);
1555 	KCAGE_STAT_INCR_SCAN(kt_passes);
1556 
1557 	did_something = 0;
1558 	pages_skipped = 0;
1559 	shared_skipped = 0;
1560 	while ((kcage_freemem < kcage_lotsfree || kcage_needfree) &&
1561 		(pfn = kcage_walk_cage(pfn == PFN_INVALID)) != PFN_INVALID) {
1562 
1563 		if (start_pfn == PFN_INVALID)
1564 			start_pfn = pfn;
1565 		else if (start_pfn == pfn) {
1566 			last_pass = pass;
1567 			pass += 1;
1568 			/*
1569 			 * Did a complete walk of kernel cage, but didn't free
1570 			 * any pages.  If only one cpu is online then
1571 			 * stop kernel cage walk and try expanding.
1572 			 */
1573 			if (ncpus_online == 1 && did_something == 0) {
1574 				KCAGE_STAT_INCR(kt_cageout_break);
1575 				break;
1576 			}
1577 		}
1578 
1579 		pp = page_numtopp_nolock(pfn);
1580 		if (pp == NULL) {
1581 			continue;
1582 		}
1583 
1584 		KCAGE_STAT_INCR_SCAN(kt_examined);
1585 
1586 		/*
1587 		 * Do a quick PP_ISNORELOC() and PP_ISFREE test outside
1588 		 * of the lock. If one is missed it will be seen next
1589 		 * time through.
1590 		 *
1591 		 * Skip non-caged-pages. These pages can exist in the cage
1592 		 * because, if during cage expansion, a page is
1593 		 * encountered that is long-term locked the lock prevents the
1594 		 * expansion logic from setting the P_NORELOC flag. Hence,
1595 		 * non-caged-pages surrounded by caged-pages.
1596 		 */
1597 		if (!PP_ISNORELOC(pp)) {
1598 			switch (kcage_assimilate_page(pp, &nfreed)) {
1599 				case 0:
1600 					did_something = 1;
1601 					KCAGE_STAT_NINCR_SCAN(kt_gotonefree,
1602 					    nfreed);
1603 					break;
1604 
1605 				case EBUSY:
1606 				case ERANGE:
1607 					did_something = 1;
1608 					KCAGE_STAT_INCR_SCAN(kt_gotone);
1609 					break;
1610 
1611 				case EAGAIN:
1612 				case ENOMEM:
1613 					break;
1614 
1615 				default:
1616 					/* catch this with debug kernels */
1617 					ASSERT(0);
1618 					break;
1619 			}
1620 
1621 			continue;
1622 		} else {
1623 			int prm;
1624 
1625 			if (PP_ISFREE(pp)) {
1626 				continue;
1627 			}
1628 
1629 			if ((pp->p_vnode == &kvp && pp->p_lckcnt > 0) ||
1630 			    !page_trylock(pp, SE_EXCL)) {
1631 				KCAGE_STAT_INCR_SCAN(kt_cantlock);
1632 				continue;
1633 			}
1634 
1635 			/* P_NORELOC bit should not have gone away. */
1636 			ASSERT(PP_ISNORELOC(pp));
1637 			if (PP_ISFREE(pp) || (pp->p_vnode == &kvp &&
1638 			    pp->p_lckcnt > 0)) {
1639 				page_unlock(pp);
1640 				continue;
1641 			}
1642 
1643 			KCAGE_STAT_SET_SCAN(kt_skiplevel, shared_level);
1644 			if (hat_page_getshare(pp) > shared_level) {
1645 				page_unlock(pp);
1646 				pages_skipped = 1;
1647 				shared_skipped = 1;
1648 				KCAGE_STAT_INCR_SCAN(kt_skipshared);
1649 				continue;
1650 			}
1651 
1652 			/*
1653 			 * In pass {0, 1}, skip page if ref bit is set.
1654 			 * In pass {0, 1, 2}, skip page if mod bit is set.
1655 			 */
1656 			prm = hat_pagesync(pp,
1657 				HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD);
1658 
1659 			/* On first pass ignore ref'd pages */
1660 			if (pass <= 1 && (prm & P_REF)) {
1661 				KCAGE_STAT_INCR_SCAN(kt_skiprefd);
1662 				pages_skipped = 1;
1663 				page_unlock(pp);
1664 				continue;
1665 			}
1666 
1667 			/* On pass 2, page_destroy if mod bit is not set */
1668 			if (pass <= 2) {
1669 				if (pp->p_szc != 0 || (prm & P_MOD) ||
1670 					pp->p_lckcnt || pp->p_cowcnt) {
1671 					pages_skipped = 1;
1672 					page_unlock(pp);
1673 				} else {
1674 
1675 					/*
1676 					 * unload the mappings before
1677 					 * checking if mod bit is set
1678 					 */
1679 					(void) hat_pageunload(pp,
1680 						HAT_FORCE_PGUNLOAD);
1681 
1682 					/*
1683 					 * skip this page if modified
1684 					 */
1685 					if (hat_ismod(pp)) {
1686 						pages_skipped = 1;
1687 						page_unlock(pp);
1688 						continue;
1689 					}
1690 
1691 					KCAGE_STAT_INCR_SCAN(kt_destroy);
1692 					page_destroy(pp, 0);
1693 					did_something = 1;
1694 				}
1695 				continue;
1696 			}
1697 
1698 			if (kcage_invalidate_page(pp, &nfreed) == 0) {
1699 				did_something = 1;
1700 				KCAGE_STAT_NINCR_SCAN(kt_gotonefree, nfreed);
1701 			}
1702 
1703 			/*
1704 			 * No need to drop the page lock here.
1705 			 * Kcage_invalidate_page has done that for us
1706 			 * either explicitly or through a page_free.
1707 			 */
1708 		}
1709 	}
1710 
1711 	/*
1712 	 * Expand the cage only if available cage memory is really low.
1713 	 * This test is done only after a complete scan of the cage.
1714 	 * The reason for not checking and expanding more often is to
1715 	 * avoid rapid expansion of the cage. Naturally, scanning the
1716 	 * cage takes time. So by scanning first, we use that work as a
1717 	 * delay loop in between expand decisions.
1718 	 */
1719 
1720 	scan_again = 0;
1721 	if (kcage_freemem < kcage_minfree || kcage_needfree) {
1722 		/*
1723 		 * Kcage_expand() will return a non-zero value if it was
1724 		 * able to expand the cage -- whether or not the new
1725 		 * pages are free and immediately usable. If non-zero,
1726 		 * we do another scan of the cage. The pages might be
1727 		 * freed during that scan or by time we get back here.
1728 		 * If not, we will attempt another expansion.
1729 		 * However, if kcage_expand() returns zero, then it was
1730 		 * unable to expand the cage. This is the case when the
1731 		 * the growth list is exausted, therefore no work was done
1732 		 * and there is no reason to scan the cage again.
1733 		 * Note: Kernel cage scan is not repeated on single-cpu
1734 		 * system to avoid kernel cage thread hogging cpu.
1735 		 */
1736 		if (pass <= 3 && pages_skipped && ncpus_online > 1)
1737 			scan_again = 1;
1738 		else
1739 			(void) kcage_expand(); /* don't scan again */
1740 	} else if (kcage_freemem < kcage_lotsfree) {
1741 		/*
1742 		 * If available cage memory is less than abundant
1743 		 * and a full scan of the cage has not yet been completed,
1744 		 * or a scan has completed and some work was performed,
1745 		 * or pages were skipped because of sharing,
1746 		 * or we simply have not yet completed two passes,
1747 		 * then do another scan.
1748 		 */
1749 		if (pass <= 2 && pages_skipped)
1750 			scan_again = 1;
1751 		if (pass == last_pass || did_something)
1752 			scan_again = 1;
1753 		else if (shared_skipped && shared_level < (8<<24)) {
1754 			shared_level <<= 1;
1755 			scan_again = 1;
1756 		}
1757 	}
1758 
1759 	if (scan_again && ncpus_online > 1)
1760 		goto again;
1761 	else {
1762 		if (shared_level > 8)
1763 			shared_level >>= 1;
1764 
1765 		KCAGE_STAT_SET_SCAN(kt_freemem_end, freemem);
1766 		KCAGE_STAT_SET_SCAN(kt_kcage_freemem_end, kcage_freemem);
1767 		KCAGE_STAT_SET_SCAN(kt_ticks, lbolt - scan_start);
1768 		KCAGE_STAT_INC_SCAN_INDEX;
1769 		goto loop;
1770 	}
1771 
1772 	/*NOTREACHED*/
1773 }
1774 
1775 void
1776 kcage_cageout_wakeup()
1777 {
1778 	if (mutex_tryenter(&kcage_cageout_mutex)) {
1779 		if (kcage_cageout_ready) {
1780 			cv_signal(&kcage_cageout_cv);
1781 		} else if (kcage_freemem < kcage_minfree || kcage_needfree) {
1782 			/*
1783 			 * Available cage memory is really low. Time to
1784 			 * start expanding the cage. However, the
1785 			 * kernel cage thread is not yet ready to
1786 			 * do the work. Use *this* thread, which is
1787 			 * most likely to be t0, to do the work.
1788 			 */
1789 			KCAGE_STAT_INCR(kcw_expandearly);
1790 			(void) kcage_expand();
1791 			KCAGE_STAT_INC_SCAN_INDEX;
1792 		}
1793 
1794 		mutex_exit(&kcage_cageout_mutex);
1795 	}
1796 	/* else, kernel cage thread is already running */
1797 }
1798 
1799 void
1800 kcage_tick()
1801 {
1802 	/*
1803 	 * Once per second we wake up all the threads throttled
1804 	 * waiting for cage memory, in case we've become stuck
1805 	 * and haven't made forward progress expanding the cage.
1806 	 */
1807 	if (kcage_on && kcage_cageout_ready)
1808 		cv_broadcast(&kcage_throttle_cv);
1809 }
1810