xref: /titanic_50/usr/src/uts/common/os/mem_cage.c (revision 911106dfb16696472af8c1b7b4c554a829354fa8)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/param.h>
30 #include <sys/thread.h>
31 #include <sys/proc.h>
32 #include <sys/callb.h>
33 #include <sys/vnode.h>
34 #include <sys/debug.h>
35 #include <sys/systm.h>		/* for bzero */
36 #include <sys/memlist.h>
37 #include <sys/cmn_err.h>
38 #include <sys/sysmacros.h>
39 #include <sys/vmsystm.h>	/* for NOMEMWAIT() */
40 #include <sys/atomic.h>		/* used to update kcage_freemem */
41 #include <sys/kmem.h>		/* for kmem_reap */
42 #include <sys/errno.h>
43 #include <sys/mem_cage.h>
44 #include <vm/seg_kmem.h>
45 #include <vm/page.h>
46 #include <vm/hat.h>
47 #include <vm/vm_dep.h>
48 #include <sys/mem_config.h>
49 #include <sys/lgrp.h>
50 #include <sys/rwlock.h>
51 
52 extern pri_t maxclsyspri;
53 
54 #ifdef DEBUG
55 #define	KCAGE_STATS
56 #endif
57 
58 #ifdef KCAGE_STATS
59 
60 #define	KCAGE_STATS_VERSION 9	/* can help report generators */
61 #define	KCAGE_STATS_NSCANS 256	/* depth of scan statistics buffer */
62 
63 struct kcage_stats_scan {
64 	/* managed by KCAGE_STAT_* macros */
65 	clock_t	scan_lbolt;
66 	uint_t	scan_id;
67 
68 	/* set in kcage_cageout() */
69 	uint_t	kt_passes;
70 	clock_t	kt_ticks;
71 	pgcnt_t	kt_kcage_freemem_start;
72 	pgcnt_t	kt_kcage_freemem_end;
73 	pgcnt_t kt_freemem_start;
74 	pgcnt_t kt_freemem_end;
75 	uint_t	kt_examined;
76 	uint_t	kt_cantlock;
77 	uint_t	kt_gotone;
78 	uint_t	kt_gotonefree;
79 	uint_t	kt_skiplevel;
80 	uint_t	kt_skipshared;
81 	uint_t	kt_skiprefd;
82 	uint_t	kt_destroy;
83 
84 	/* set in kcage_invalidate_page() */
85 	uint_t	kip_reloclocked;
86 	uint_t	kip_relocmod;
87 	uint_t	kip_destroy;
88 	uint_t	kip_nomem;
89 	uint_t	kip_demotefailed;
90 
91 	/* set in kcage_expand() */
92 	uint_t	ke_wanted;
93 	uint_t	ke_examined;
94 	uint_t	ke_lefthole;
95 	uint_t	ke_gotone;
96 	uint_t	ke_gotonefree;
97 };
98 
99 struct kcage_stats {
100 	/* managed by KCAGE_STAT_* macros */
101 	uint_t	version;
102 	uint_t	size;
103 
104 	/* set in kcage_cageout */
105 	uint_t	kt_wakeups;
106 	uint_t	kt_scans;
107 	uint_t	kt_cageout_break;
108 
109 	/* set in kcage_expand */
110 	uint_t	ke_calls;
111 	uint_t	ke_nopfn;
112 	uint_t	ke_nopaget;
113 	uint_t	ke_isnoreloc;
114 	uint_t	ke_deleting;
115 	uint_t	ke_lowfreemem;
116 	uint_t	ke_terminate;
117 
118 	/* set in kcage_freemem_add() */
119 	uint_t	kfa_trottlewake;
120 
121 	/* set in kcage_freemem_sub() */
122 	uint_t	kfs_cagewake;
123 
124 	/* set in kcage_create_throttle */
125 	uint_t	kct_calls;
126 	uint_t	kct_cageout;
127 	uint_t	kct_critical;
128 	uint_t	kct_exempt;
129 	uint_t	kct_cagewake;
130 	uint_t	kct_wait;
131 	uint_t	kct_progress;
132 	uint_t	kct_noprogress;
133 	uint_t	kct_timeout;
134 
135 	/* set in kcage_cageout_wakeup */
136 	uint_t	kcw_expandearly;
137 
138 	/* managed by KCAGE_STAT_* macros */
139 	uint_t	scan_array_size;
140 	uint_t	scan_index;
141 	struct kcage_stats_scan scans[KCAGE_STATS_NSCANS];
142 };
143 
144 static struct kcage_stats kcage_stats;
145 static struct kcage_stats_scan kcage_stats_scan_zero;
146 
/*
 * Statistics update macros.
 *
 * No real need for atomics here. For the most part the incs and sets are
 * done by the kernel cage thread. There are a few that are done by any
 * number of other threads. Those cases are noted by comments.
 *
 * Every macro below expands either to one parenthesized expression or to
 * a single do { } while (0) statement, so an invocation followed by ';'
 * is always exactly one statement.  That keeps the macros safe as the
 * body of an unbraced if/else.
 *
 * The *_SCAN variants operate on the scan record currently selected by
 * kcage_stats.scan_index.
 */
#define	KCAGE_STAT_INCR(m)	(kcage_stats.m++)

#define	KCAGE_STAT_NINCR(m, v)	(kcage_stats.m += (v))

#define	KCAGE_STAT_INCR_SCAN(m)	\
	KCAGE_STAT_INCR(scans[kcage_stats.scan_index].m)

#define	KCAGE_STAT_NINCR_SCAN(m, v) \
	KCAGE_STAT_NINCR(scans[kcage_stats.scan_index].m, v)

#define	KCAGE_STAT_SET(m, v)	(kcage_stats.m = (v))

/* Set m to v only if m is still zero. */
#define	KCAGE_STAT_SETZ(m, v)	\
	do { \
		if (kcage_stats.m == 0) \
			kcage_stats.m = (v); \
	} while (0)

#define	KCAGE_STAT_SET_SCAN(m, v)	\
	KCAGE_STAT_SET(scans[kcage_stats.scan_index].m, v)

#define	KCAGE_STAT_SETZ_SCAN(m, v)	\
	KCAGE_STAT_SETZ(scans[kcage_stats.scan_index].m, v)

/*
 * Stamp the current scan record with lbolt and its index, advance
 * scan_index (wrapping at KCAGE_STATS_NSCANS) and clear the record that
 * becomes current.
 */
#define	KCAGE_STAT_INC_SCAN_INDEX \
	do { \
		KCAGE_STAT_SET_SCAN(scan_lbolt, lbolt); \
		KCAGE_STAT_SET_SCAN(scan_id, kcage_stats.scan_index); \
		kcage_stats.scan_index = \
		    (kcage_stats.scan_index + 1) % KCAGE_STATS_NSCANS; \
		kcage_stats.scans[kcage_stats.scan_index] = \
		    kcage_stats_scan_zero; \
	} while (0)

/* Fill in the self-describing header fields and start at record 0. */
#define	KCAGE_STAT_INIT_SCAN_INDEX \
	do { \
		kcage_stats.version = KCAGE_STATS_VERSION; \
		kcage_stats.size = sizeof (kcage_stats); \
		kcage_stats.scan_array_size = KCAGE_STATS_NSCANS; \
		kcage_stats.scan_index = 0; \
	} while (0)
185 
186 #else /* KCAGE_STATS */
187 
188 #define	KCAGE_STAT_INCR(v)
189 #define	KCAGE_STAT_NINCR(m, v)
190 #define	KCAGE_STAT_INCR_SCAN(v)
191 #define	KCAGE_STAT_NINCR_SCAN(m, v)
192 #define	KCAGE_STAT_SET(m, v)
193 #define	KCAGE_STAT_SETZ(m, v)
194 #define	KCAGE_STAT_SET_SCAN(m, v)
195 #define	KCAGE_STAT_SETZ_SCAN(m, v)
196 #define	KCAGE_STAT_INC_SCAN_INDEX
197 #define	KCAGE_STAT_INIT_SCAN_INDEX
198 
199 #endif /* KCAGE_STATS */
200 
201 static kmutex_t kcage_throttle_mutex;	/* protects kcage_throttle_cv */
202 static kcondvar_t kcage_throttle_cv;
203 
204 static kmutex_t kcage_cageout_mutex;	/* protects cv and ready flag */
205 static kcondvar_t kcage_cageout_cv;	/* cageout thread naps here */
206 static int kcage_cageout_ready;		/* nonzero when cageout thread ready */
207 kthread_id_t kcage_cageout_thread;	/* to aid debugging */
208 
209 static krwlock_t kcage_range_rwlock;	/* protects kcage_glist elements */
210 
211 /*
212  * Cage expansion happens within a range.
213  */
214 struct kcage_glist {
215 	struct kcage_glist	*next;
216 	pfn_t			base;
217 	pfn_t			lim;
218 	pfn_t			curr;
219 	int			decr;
220 };
221 
222 static struct kcage_glist *kcage_glist;
223 static struct kcage_glist *kcage_current_glist;
224 
225 /*
226  * The firstfree element is provided so that kmem_alloc can be avoided
227  * until that cage has somewhere to go. This is not currently a problem
228  * as early kmem_alloc's use BOP_ALLOC instead of page_create_va.
229  */
230 static vmem_t *kcage_arena;
231 static struct kcage_glist kcage_glist_firstfree;
232 static struct kcage_glist *kcage_glist_freelist = &kcage_glist_firstfree;
233 
234 /*
235  * Miscellaneous forward references
236  */
237 static struct kcage_glist *kcage_glist_alloc(void);
238 static int kcage_glist_delete(pfn_t, pfn_t, struct kcage_glist **);
239 static void kcage_cageout(void);
240 static int kcage_invalidate_page(page_t *, pgcnt_t *);
241 static int kcage_setnoreloc_pages(page_t *, se_t);
242 static int kcage_range_add_internal(pfn_t base, pgcnt_t npgs, kcage_dir_t);
243 static void kcage_init(pgcnt_t preferred_size);
244 static int kcage_range_delete_internal(pfn_t base, pgcnt_t npgs);
245 
246 /*
247  * Kernel Memory Cage counters and thresholds.
248  */
249 int kcage_on = 0;
250 pgcnt_t kcage_freemem;
251 pgcnt_t kcage_needfree;
252 pgcnt_t kcage_lotsfree;
253 pgcnt_t kcage_desfree;
254 pgcnt_t kcage_minfree;
255 pgcnt_t kcage_throttlefree;
256 pgcnt_t	kcage_reserve;
257 int kcage_maxwait = 10;	/* in seconds */
258 
259 /* when we use lp for kmem we start the cage at a higher initial value */
260 pgcnt_t kcage_kmemlp_mincage;
261 
262 #ifdef DEBUG
263 pgcnt_t	kcage_pagets;
264 #define	KCAGEPAGETS_INC()	kcage_pagets++
265 #else
266 #define	KCAGEPAGETS_INC()
267 #endif
268 
269 /* kstats to export what pages are currently caged */
270 kmutex_t kcage_kstat_lock;
271 static int kcage_kstat_update(kstat_t *ksp, int rw);
272 static int kcage_kstat_snapshot(kstat_t *ksp, void *buf, int rw);
273 
/*
 * Startup and Dynamic Reconfiguration interfaces.
 * kcage_range_add()
 * kcage_range_delete()
 * kcage_range_delete_post_mem_del()
 * kcage_range_init()
 * kcage_set_thresholds()
 */
282 
283 /*
284  * Called from page_get_contig_pages to get the approximate kcage pfn range
285  * for exclusion from search for contiguous pages. This routine is called
286  * without kcage_range lock (kcage routines can call page_get_contig_pages
287  * through page_relocate) and with the assumption, based on kcage_range_add,
288  * that kcage_current_glist always contain a valid pointer.
289  */
290 
291 int
292 kcage_current_pfn(pfn_t *pfncur)
293 {
294 	struct kcage_glist *lp = kcage_current_glist;
295 
296 	ASSERT(kcage_on);
297 
298 	ASSERT(lp != NULL);
299 
300 	*pfncur = lp->curr;
301 
302 	return (lp->decr);
303 }
304 
305 /*
306  * Called from vm_pagelist.c during coalesce to find kernel cage regions
307  * within an mnode. Looks for the lowest range between lo and hi.
308  *
309  * Kernel cage memory is defined between kcage_glist and kcage_current_glist.
310  * Non-cage memory is defined between kcage_current_glist and list end.
311  *
312  * If incage is set, returns the lowest kcage range. Otherwise returns lowest
313  * non-cage range.
314  *
315  * Returns zero on success and nlo, nhi:
316  * 	lo <= nlo < nhi <= hi
317  * Returns non-zero if no overlapping range is found.
318  */
319 int
320 kcage_next_range(int incage, pfn_t lo, pfn_t hi,
321     pfn_t *nlo, pfn_t *nhi)
322 {
323 	struct kcage_glist *lp;
324 	pfn_t tlo = hi;
325 	pfn_t thi = hi;
326 
327 	ASSERT(lo <= hi);
328 
329 	/*
330 	 * Reader lock protects the list, but kcage_get_pfn
331 	 * running concurrently may advance kcage_current_glist
332 	 * and also update kcage_current_glist->curr. Page
333 	 * coalesce can handle this race condition.
334 	 */
335 	rw_enter(&kcage_range_rwlock, RW_READER);
336 
337 	for (lp = incage ? kcage_glist : kcage_current_glist;
338 	    lp != NULL; lp = lp->next) {
339 
340 		pfn_t klo, khi;
341 
342 		/* find the range limits in this element */
343 		if ((incage && lp->decr) || (!incage && !lp->decr)) {
344 			klo = lp->curr;
345 			khi = lp->lim;
346 		} else {
347 			klo = lp->base;
348 			khi = lp->curr;
349 		}
350 
351 		/* handle overlap */
352 		if (klo < tlo && klo < khi && lo < khi && klo < hi) {
353 			tlo = MAX(lo, klo);
354 			thi = MIN(hi, khi);
355 			if (tlo == lo)
356 				break;
357 		}
358 
359 		/* check end of kcage */
360 		if (incage && lp == kcage_current_glist) {
361 			break;
362 		}
363 	}
364 
365 	rw_exit(&kcage_range_rwlock);
366 
367 	/* return non-zero if no overlapping range found */
368 	if (tlo == thi)
369 		return (1);
370 
371 	ASSERT(lo <= tlo && tlo < thi && thi <= hi);
372 
373 	/* return overlapping range */
374 	*nlo = tlo;
375 	*nhi = thi;
376 	return (0);
377 }
378 
379 void
380 kcage_range_init(struct memlist *ml, kcage_dir_t d, pgcnt_t preferred_size)
381 {
382 	int ret = 0;
383 
384 	ASSERT(kcage_arena == NULL);
385 	kcage_arena = vmem_create("kcage_arena", NULL, 0, sizeof (uint64_t),
386 	    segkmem_alloc, segkmem_free, heap_arena, 0, VM_SLEEP);
387 	ASSERT(kcage_arena != NULL);
388 
389 	if (d == KCAGE_DOWN) {
390 		while (ml->next != NULL)
391 			ml = ml->next;
392 	}
393 
394 	rw_enter(&kcage_range_rwlock, RW_WRITER);
395 
396 	while (ml != NULL) {
397 		ret = kcage_range_add_internal(btop(ml->address),
398 		    btop(ml->size), d);
399 		if (ret)
400 			panic("kcage_range_add_internal failed: "
401 			    "ml=%p, ret=0x%x\n", ml, ret);
402 
403 		ml = (d == KCAGE_DOWN ? ml->prev : ml->next);
404 	}
405 
406 	rw_exit(&kcage_range_rwlock);
407 
408 	if (ret == 0)
409 		kcage_init(preferred_size);
410 }
411 
412 /*
413  * Third arg controls direction of growth: 0: increasing pfns,
414  * 1: decreasing.
415  */
416 static int
417 kcage_range_add_internal(pfn_t base, pgcnt_t npgs, kcage_dir_t d)
418 {
419 	struct kcage_glist *new, **lpp;
420 	pfn_t lim;
421 
422 	ASSERT(rw_write_held(&kcage_range_rwlock));
423 
424 	ASSERT(npgs != 0);
425 	if (npgs == 0)
426 		return (EINVAL);
427 
428 	lim = base + npgs;
429 
430 	ASSERT(lim > base);
431 	if (lim <= base)
432 		return (EINVAL);
433 
434 	new = kcage_glist_alloc();
435 	if (new == NULL) {
436 		return (ENOMEM);
437 	}
438 
439 	new->base = base;
440 	new->lim = lim;
441 	new->decr = (d == KCAGE_DOWN);
442 	if (new->decr != 0)
443 		new->curr = new->lim;
444 	else
445 		new->curr = new->base;
446 	/*
447 	 * Any overlapping existing ranges are removed by deleting
448 	 * from the new list as we search for the tail.
449 	 */
450 	lpp = &kcage_glist;
451 	while (*lpp != NULL) {
452 		int ret;
453 		ret = kcage_glist_delete((*lpp)->base, (*lpp)->lim, &new);
454 		if (ret != 0)
455 			return (ret);
456 		lpp = &(*lpp)->next;
457 	}
458 
459 	*lpp = new;
460 
461 	if (kcage_current_glist == NULL) {
462 		kcage_current_glist = kcage_glist;
463 	}
464 
465 	return (0);
466 }
467 
468 int
469 kcage_range_add(pfn_t base, pgcnt_t npgs, kcage_dir_t d)
470 {
471 	int ret;
472 
473 	rw_enter(&kcage_range_rwlock, RW_WRITER);
474 	ret = kcage_range_add_internal(base, npgs, d);
475 	rw_exit(&kcage_range_rwlock);
476 	return (ret);
477 }
478 
479 /*
480  * Calls to add and delete must be protected by kcage_range_rwlock
481  */
482 static int
483 kcage_range_delete_internal(pfn_t base, pgcnt_t npgs)
484 {
485 	struct kcage_glist *lp;
486 	pfn_t lim;
487 
488 	ASSERT(rw_write_held(&kcage_range_rwlock));
489 
490 	ASSERT(npgs != 0);
491 	if (npgs == 0)
492 		return (EINVAL);
493 
494 	lim = base + npgs;
495 
496 	ASSERT(lim > base);
497 	if (lim <= base)
498 		return (EINVAL);
499 
500 	/*
501 	 * Check if the delete is OK first as a number of elements
502 	 * might be involved and it will be difficult to go
503 	 * back and undo (can't just add the range back in).
504 	 */
505 	for (lp = kcage_glist; lp != NULL; lp = lp->next) {
506 		/*
507 		 * If there have been no pages allocated from this
508 		 * element, we don't need to check it.
509 		 */
510 		if ((lp->decr == 0 && lp->curr == lp->base) ||
511 		    (lp->decr != 0 && lp->curr == lp->lim))
512 			continue;
513 		/*
514 		 * If the element does not overlap, its OK.
515 		 */
516 		if (base >= lp->lim || lim <= lp->base)
517 			continue;
518 		/*
519 		 * Overlapping element: Does the range to be deleted
520 		 * overlap the area already used? If so fail.
521 		 */
522 		if (lp->decr == 0 && base < lp->curr && lim >= lp->base) {
523 			return (EBUSY);
524 		}
525 		if (lp->decr != 0 && base < lp->lim && lim >= lp->curr) {
526 			return (EBUSY);
527 		}
528 	}
529 	return (kcage_glist_delete(base, lim, &kcage_glist));
530 }
531 
532 int
533 kcage_range_delete(pfn_t base, pgcnt_t npgs)
534 {
535 	int ret;
536 
537 	rw_enter(&kcage_range_rwlock, RW_WRITER);
538 	ret = kcage_range_delete_internal(base, npgs);
539 	rw_exit(&kcage_range_rwlock);
540 	return (ret);
541 }
542 
543 /*
544  * Calls to add and delete must be protected by kcage_range_rwlock.
545  * This routine gets called after successful Solaris memory
546  * delete operation from DR post memory delete routines.
547  */
548 static int
549 kcage_range_delete_post_mem_del_internal(pfn_t base, pgcnt_t npgs)
550 {
551 	pfn_t lim;
552 
553 	ASSERT(rw_write_held(&kcage_range_rwlock));
554 
555 	ASSERT(npgs != 0);
556 	if (npgs == 0)
557 		return (EINVAL);
558 
559 	lim = base + npgs;
560 
561 	ASSERT(lim > base);
562 	if (lim <= base)
563 		return (EINVAL);
564 
565 	return (kcage_glist_delete(base, lim, &kcage_glist));
566 }
567 
568 int
569 kcage_range_delete_post_mem_del(pfn_t base, pgcnt_t npgs)
570 {
571 	int ret;
572 
573 	rw_enter(&kcage_range_rwlock, RW_WRITER);
574 	ret = kcage_range_delete_post_mem_del_internal(base, npgs);
575 	rw_exit(&kcage_range_rwlock);
576 	return (ret);
577 }
578 
579 /*
580  * No locking is required here as the whole operation is covered
581  * by kcage_range_rwlock writer lock.
582  */
583 static struct kcage_glist *
584 kcage_glist_alloc(void)
585 {
586 	struct kcage_glist *new;
587 
588 	if ((new = kcage_glist_freelist) != NULL) {
589 		kcage_glist_freelist = new->next;
590 	} else {
591 		new = vmem_alloc(kcage_arena, sizeof (*new), VM_NOSLEEP);
592 	}
593 
594 	if (new != NULL)
595 		bzero(new, sizeof (*new));
596 
597 	return (new);
598 }
599 
600 static void
601 kcage_glist_free(struct kcage_glist *lp)
602 {
603 	lp->next = kcage_glist_freelist;
604 	kcage_glist_freelist = lp;
605 }
606 
607 static int
608 kcage_glist_delete(pfn_t base, pfn_t lim, struct kcage_glist **lpp)
609 {
610 	struct kcage_glist *lp, *prev = *lpp;
611 
612 	while ((lp = *lpp) != NULL) {
613 		if (lim > lp->base && base < lp->lim) {
614 			/* The delete range overlaps this element. */
615 			if (base <= lp->base && lim >= lp->lim) {
616 				/* Delete whole element. */
617 				*lpp = lp->next;
618 				if (lp == kcage_current_glist) {
619 					/* This can never happen. */
620 					ASSERT(kcage_current_glist != prev);
621 					kcage_current_glist = prev;
622 				}
623 				kcage_glist_free(lp);
624 				continue;
625 			}
626 
627 			/* Partial delete. */
628 			if (base > lp->base && lim < lp->lim) {
629 				struct kcage_glist *new;
630 
631 				/*
632 				 * Remove a section from the middle,
633 				 * need to allocate a new element.
634 				 */
635 				new = kcage_glist_alloc();
636 				if (new == NULL) {
637 					return (ENOMEM);
638 				}
639 
640 				/*
641 				 * Tranfser unused range to new.
642 				 * Edit lp in place to preserve
643 				 * kcage_current_glist.
644 				 */
645 				new->decr = lp->decr;
646 				if (new->decr != 0) {
647 					new->base = lp->base;
648 					new->lim = base;
649 					new->curr = base;
650 
651 					lp->base = lim;
652 				} else {
653 					new->base = lim;
654 					new->lim = lp->lim;
655 					new->curr = new->base;
656 
657 					lp->lim = base;
658 				}
659 
660 				/* Insert new. */
661 				new->next = lp->next;
662 				lp->next = new;
663 				lpp = &lp->next;
664 			} else {
665 				/* Delete part of current block. */
666 				if (base > lp->base) {
667 					ASSERT(lim >= lp->lim);
668 					ASSERT(base < lp->lim);
669 					if (lp->decr != 0 &&
670 					    lp->curr == lp->lim)
671 						lp->curr = base;
672 					lp->lim = base;
673 				} else {
674 					ASSERT(base <= lp->base);
675 					ASSERT(lim > lp->base);
676 					if (lp->decr == 0 &&
677 					    lp->curr == lp->base)
678 						lp->curr = lim;
679 					lp->base = lim;
680 				}
681 			}
682 		}
683 		prev = *lpp;
684 		lpp = &(*lpp)->next;
685 	}
686 
687 	return (0);
688 }
689 
690 /*
691  * If lockit is 1, kcage_get_pfn holds the
692  * reader lock for kcage_range_rwlock.
693  * Changes to lp->curr can cause race conditions, but
694  * they are handled by higher level code (see kcage_next_range.)
695  */
696 static pfn_t
697 kcage_get_pfn(int lockit)
698 {
699 	struct kcage_glist *lp;
700 	pfn_t pfn = PFN_INVALID;
701 
702 	if (lockit && !rw_tryenter(&kcage_range_rwlock, RW_READER))
703 		return (pfn);
704 
705 	lp = kcage_current_glist;
706 	while (lp != NULL) {
707 		if (lp->decr != 0) {
708 			if (lp->curr != lp->base) {
709 				pfn = --lp->curr;
710 				break;
711 			}
712 		} else {
713 			if (lp->curr != lp->lim) {
714 				pfn = lp->curr++;
715 				break;
716 			}
717 		}
718 
719 		lp = lp->next;
720 		if (lp)
721 			kcage_current_glist = lp;
722 	}
723 
724 	if (lockit)
725 		rw_exit(&kcage_range_rwlock);
726 	return (pfn);
727 }
728 
729 /*
730  * Walk the physical address space of the cage.
731  * This routine does not guarantee to return PFNs in the order
732  * in which they were allocated to the cage. Instead, it walks
733  * each range as they appear on the growth list returning the PFNs
734  * range in ascending order.
735  *
736  * To begin scanning at lower edge of cage, reset should be nonzero.
737  * To step through cage, reset should be zero.
738  *
739  * PFN_INVALID will be returned when the upper end of the cage is
740  * reached -- indicating a full scan of the cage has been completed since
741  * previous reset. PFN_INVALID will continue to be returned until
742  * kcage_walk_cage is reset.
743  *
744  * It is possible to receive a PFN_INVALID result on reset if a growth
745  * list is not installed or if none of the PFNs in the installed list have
746  * been allocated to the cage. In otherwords, there is no cage.
747  *
748  * Caller need not hold kcage_range_rwlock while calling this function
749  * as the front part of the list is static - pages never come out of
750  * the cage.
751  *
752  * The caller is expected to only be kcage_cageout().
753  */
754 static pfn_t
755 kcage_walk_cage(int reset)
756 {
757 	static struct kcage_glist *lp = NULL;
758 	static pfn_t pfn;
759 
760 	if (reset)
761 		lp = NULL;
762 	if (lp == NULL) {
763 		lp = kcage_glist;
764 		pfn = PFN_INVALID;
765 	}
766 again:
767 	if (pfn == PFN_INVALID) {
768 		if (lp == NULL)
769 			return (PFN_INVALID);
770 
771 		if (lp->decr != 0) {
772 			/*
773 			 * In this range the cage grows from the highest
774 			 * address towards the lowest.
775 			 * Arrange to return pfns from curr to lim-1,
776 			 * inclusive, in ascending order.
777 			 */
778 
779 			pfn = lp->curr;
780 		} else {
781 			/*
782 			 * In this range the cage grows from the lowest
783 			 * address towards the highest.
784 			 * Arrange to return pfns from base to curr,
785 			 * inclusive, in ascending order.
786 			 */
787 
788 			pfn = lp->base;
789 		}
790 	}
791 
792 	if (lp->decr != 0) {		/* decrementing pfn */
793 		if (pfn == lp->lim) {
794 			/* Don't go beyond the static part of the glist. */
795 			if (lp == kcage_current_glist)
796 				lp = NULL;
797 			else
798 				lp = lp->next;
799 			pfn = PFN_INVALID;
800 			goto again;
801 		}
802 
803 		ASSERT(pfn >= lp->curr && pfn < lp->lim);
804 	} else {			/* incrementing pfn */
805 		if (pfn == lp->curr) {
806 			/* Don't go beyond the static part of the glist. */
807 			if (lp == kcage_current_glist)
808 				lp = NULL;
809 			else
810 				lp = lp->next;
811 			pfn = PFN_INVALID;
812 			goto again;
813 		}
814 
815 		ASSERT(pfn >= lp->base && pfn < lp->curr);
816 	}
817 
818 	return (pfn++);
819 }
820 
821 /*
822  * Callback functions for to recalc cage thresholds after
823  * Kphysm memory add/delete operations.
824  */
825 /*ARGSUSED*/
826 static void
827 kcage_kphysm_postadd_cb(void *arg, pgcnt_t delta_pages)
828 {
829 	kcage_recalc_thresholds();
830 }
831 
832 /*ARGSUSED*/
833 static int
834 kcage_kphysm_predel_cb(void *arg, pgcnt_t delta_pages)
835 {
836 	/* TODO: when should cage refuse memory delete requests? */
837 	return (0);
838 }
839 
840 /*ARGSUSED*/
841 static  void
842 kcage_kphysm_postdel_cb(void *arg, pgcnt_t delta_pages, int cancelled)
843 {
844 	kcage_recalc_thresholds();
845 }
846 
847 static kphysm_setup_vector_t kcage_kphysm_vectors = {
848 	KPHYSM_SETUP_VECTOR_VERSION,
849 	kcage_kphysm_postadd_cb,
850 	kcage_kphysm_predel_cb,
851 	kcage_kphysm_postdel_cb
852 };
853 
854 /*
855  * This is called before a CPR suspend and after a CPR resume.  We have to
856  * turn off kcage_cageout_ready before a suspend, and turn it back on after a
857  * restart.
858  */
859 /*ARGSUSED*/
860 static boolean_t
861 kcage_cageout_cpr(void *arg, int code)
862 {
863 	if (code == CB_CODE_CPR_CHKPT) {
864 		ASSERT(kcage_cageout_ready);
865 		kcage_cageout_ready = 0;
866 		return (B_TRUE);
867 	} else if (code == CB_CODE_CPR_RESUME) {
868 		ASSERT(kcage_cageout_ready == 0);
869 		kcage_cageout_ready = 1;
870 		return (B_TRUE);
871 	}
872 	return (B_FALSE);
873 }
874 
875 /*
876  * kcage_recalc_preferred_size() increases initial cage size to improve large
877  * page availability when lp for kmem is enabled and kpr is disabled
878  */
879 static pgcnt_t
880 kcage_recalc_preferred_size(pgcnt_t preferred_size)
881 {
882 	if (SEGKMEM_USE_LARGEPAGES && segkmem_reloc == 0) {
883 		pgcnt_t lpmincage = kcage_kmemlp_mincage;
884 		if (lpmincage == 0) {
885 			lpmincage = MIN(P2ROUNDUP(((physmem * PAGESIZE) / 8),
886 			    segkmem_heaplp_quantum), 0x40000000UL) / PAGESIZE;
887 		}
888 		kcage_kmemlp_mincage = MIN(lpmincage,
889 		    (segkmem_kmemlp_max / PAGESIZE));
890 		preferred_size = MAX(kcage_kmemlp_mincage, preferred_size);
891 	}
892 	return (preferred_size);
893 }
894 
895 /*
896  * Kcage_init() builds the cage and initializes the cage thresholds.
897  * The size of the cage is determined by the argument preferred_size.
898  * or the actual amount of memory, whichever is smaller.
899  */
900 static void
901 kcage_init(pgcnt_t preferred_size)
902 {
903 	pgcnt_t wanted;
904 	pfn_t pfn;
905 	page_t *pp;
906 	kstat_t *ksp;
907 
908 	extern struct vnode kvp;
909 	extern void page_list_noreloc_startup(page_t *);
910 
911 	ASSERT(!kcage_on);
912 
913 	/* increase preferred cage size for lp for kmem */
914 	preferred_size = kcage_recalc_preferred_size(preferred_size);
915 
916 	/* Debug note: initialize this now so early expansions can stat */
917 	KCAGE_STAT_INIT_SCAN_INDEX;
918 
919 	/*
920 	 * Initialize cage thresholds and install kphysm callback.
921 	 * If we can't arrange to have the thresholds track with
922 	 * available physical memory, then the cage thresholds may
923 	 * end up over time at levels that adversly effect system
924 	 * performance; so, bail out.
925 	 */
926 	kcage_recalc_thresholds();
927 	if (kphysm_setup_func_register(&kcage_kphysm_vectors, NULL)) {
928 		ASSERT(0);		/* Catch this in DEBUG kernels. */
929 		return;
930 	}
931 
932 	/*
933 	 * Limit startup cage size within the range of kcage_minfree
934 	 * and availrmem, inclusively.
935 	 */
936 	wanted = MIN(MAX(preferred_size, kcage_minfree), availrmem);
937 
938 	/*
939 	 * Construct the cage. PFNs are allocated from the glist. It
940 	 * is assumed that the list has been properly ordered for the
941 	 * platform by the platform code. Typically, this is as simple
942 	 * as calling kcage_range_init(phys_avail, decr), where decr is
943 	 * 1 if the kernel has been loaded into upper end of physical
944 	 * memory, or 0 if the kernel has been loaded at the low end.
945 	 *
946 	 * Note: it is assumed that we are in the startup flow, so there
947 	 * is no reason to grab the page lock.
948 	 */
949 	kcage_freemem = 0;
950 	pfn = PFN_INVALID;			/* prime for alignment test */
951 	while (wanted != 0) {
952 		if ((pfn = kcage_get_pfn(0)) == PFN_INVALID)
953 			break;
954 
955 		if ((pp = page_numtopp_nolock(pfn)) != NULL) {
956 			KCAGEPAGETS_INC();
957 			/*
958 			 * Set the noreloc state on the page.
959 			 * If the page is free and not already
960 			 * on the noreloc list then move it.
961 			 */
962 			if (PP_ISFREE(pp)) {
963 				if (PP_ISNORELOC(pp) == 0)
964 					page_list_noreloc_startup(pp);
965 			} else {
966 				ASSERT(pp->p_szc == 0);
967 				PP_SETNORELOC(pp);
968 			}
969 		}
970 		PLCNT_XFER_NORELOC(pp);
971 		wanted -= 1;
972 	}
973 
974 	/*
975 	 * Need to go through and find kernel allocated pages
976 	 * and capture them into the Cage.  These will primarily
977 	 * be pages gotten through boot_alloc().
978 	 */
979 	if (kvp.v_pages) {
980 
981 		pp = kvp.v_pages;
982 		do {
983 			ASSERT(!PP_ISFREE(pp));
984 			ASSERT(pp->p_szc == 0);
985 			PP_SETNORELOC(pp);
986 		} while ((pp = pp->p_vpnext) != kvp.v_pages);
987 
988 	}
989 
990 	kcage_on = 1;
991 
992 	/*
993 	 * CB_CL_CPR_POST_KERNEL is the class that executes from cpr_suspend()
994 	 * after the cageout thread is blocked, and executes from cpr_resume()
995 	 * before the cageout thread is restarted.  By executing in this class,
996 	 * we are assured that the kernel cage thread won't miss wakeup calls
997 	 * and also CPR's larger kmem_alloc requests will not fail after
998 	 * CPR shuts down the cageout kernel thread.
999 	 */
1000 	(void) callb_add(kcage_cageout_cpr, NULL, CB_CL_CPR_POST_KERNEL,
1001 	    "cageout");
1002 
1003 	/*
1004 	 * Coalesce pages to improve large page availability. A better fix
1005 	 * would to coalesce pages as they are included in the cage
1006 	 */
1007 	if (SEGKMEM_USE_LARGEPAGES) {
1008 		extern void page_freelist_coalesce_all(int mnode);
1009 		page_freelist_coalesce_all(-1);	/* do all mnodes */
1010 	}
1011 
1012 	ksp = kstat_create("kcage", 0, "kcage_page_list", "misc",
1013 	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
1014 	if (ksp != NULL) {
1015 		ksp->ks_update = kcage_kstat_update;
1016 		ksp->ks_snapshot = kcage_kstat_snapshot;
1017 		ksp->ks_lock = &kcage_kstat_lock; /* XXX - not really needed */
1018 		kstat_install(ksp);
1019 	}
1020 }
1021 
1022 static int
1023 kcage_kstat_update(kstat_t *ksp, int rw)
1024 {
1025 	struct kcage_glist *lp;
1026 	uint_t count;
1027 
1028 	if (rw == KSTAT_WRITE)
1029 		return (EACCES);
1030 
1031 	count = 0;
1032 	rw_enter(&kcage_range_rwlock, RW_WRITER);
1033 	for (lp = kcage_glist; lp != NULL; lp = lp->next) {
1034 		if (lp->decr) {
1035 			if (lp->curr != lp->lim) {
1036 				count++;
1037 			}
1038 		} else {
1039 			if (lp->curr != lp->base) {
1040 				count++;
1041 			}
1042 		}
1043 	}
1044 	rw_exit(&kcage_range_rwlock);
1045 
1046 	ksp->ks_ndata = count;
1047 	ksp->ks_data_size = count * 2 * sizeof (uint64_t);
1048 
1049 	return (0);
1050 }
1051 
1052 static int
1053 kcage_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
1054 {
1055 	struct kcage_glist *lp;
1056 	struct memunit {
1057 		uint64_t address;
1058 		uint64_t size;
1059 	} *kspmem;
1060 
1061 	if (rw == KSTAT_WRITE)
1062 		return (EACCES);
1063 
1064 	ksp->ks_snaptime = gethrtime();
1065 
1066 	kspmem = (struct memunit *)buf;
1067 	rw_enter(&kcage_range_rwlock, RW_WRITER);
1068 	for (lp = kcage_glist; lp != NULL; lp = lp->next, kspmem++) {
1069 		if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
1070 			break;
1071 
1072 		if (lp->decr) {
1073 			if (lp->curr != lp->lim) {
1074 				kspmem->address = ptob(lp->curr);
1075 				kspmem->size = ptob(lp->lim - lp->curr);
1076 			}
1077 		} else {
1078 			if (lp->curr != lp->base) {
1079 				kspmem->address = ptob(lp->base);
1080 				kspmem->size = ptob(lp->curr - lp->base);
1081 			}
1082 		}
1083 	}
1084 	rw_exit(&kcage_range_rwlock);
1085 
1086 	return (0);
1087 }
1088 
1089 void
1090 kcage_recalc_thresholds()
1091 {
1092 	static int first = 1;
1093 	static pgcnt_t init_lotsfree;
1094 	static pgcnt_t init_desfree;
1095 	static pgcnt_t init_minfree;
1096 	static pgcnt_t init_throttlefree;
1097 	static pgcnt_t init_reserve;
1098 
1099 	/* TODO: any reason to take more care than this with live editing? */
1100 	mutex_enter(&kcage_cageout_mutex);
1101 	mutex_enter(&freemem_lock);
1102 
1103 	if (first) {
1104 		first = 0;
1105 		init_lotsfree = kcage_lotsfree;
1106 		init_desfree = kcage_desfree;
1107 		init_minfree = kcage_minfree;
1108 		init_throttlefree = kcage_throttlefree;
1109 		init_reserve = kcage_reserve;
1110 	} else {
1111 		kcage_lotsfree = init_lotsfree;
1112 		kcage_desfree = init_desfree;
1113 		kcage_minfree = init_minfree;
1114 		kcage_throttlefree = init_throttlefree;
1115 		kcage_reserve = init_reserve;
1116 	}
1117 
1118 	if (kcage_lotsfree == 0)
1119 		kcage_lotsfree = MAX(32, total_pages / 256);
1120 
1121 	if (kcage_minfree == 0)
1122 		kcage_minfree = MAX(32, kcage_lotsfree / 2);
1123 
1124 	if (kcage_desfree == 0)
1125 		kcage_desfree = MAX(32, kcage_minfree);
1126 
1127 	if (kcage_throttlefree == 0)
1128 		kcage_throttlefree = MAX(32, kcage_minfree / 2);
1129 
1130 	if (kcage_reserve == 0)
1131 		kcage_reserve = MIN(32, kcage_throttlefree / 2);
1132 
1133 	mutex_exit(&freemem_lock);
1134 	mutex_exit(&kcage_cageout_mutex);
1135 
1136 	if (kcage_cageout_ready) {
1137 		if (kcage_freemem < kcage_desfree)
1138 			kcage_cageout_wakeup();
1139 
1140 		if (kcage_needfree) {
1141 			mutex_enter(&kcage_throttle_mutex);
1142 			cv_broadcast(&kcage_throttle_cv);
1143 			mutex_exit(&kcage_throttle_mutex);
1144 		}
1145 	}
1146 }
1147 
1148 /*
1149  * Pageout interface:
1150  * kcage_cageout_init()
1151  */
1152 void
1153 kcage_cageout_init()
1154 {
1155 	if (kcage_on) {
1156 
1157 		(void) thread_create(NULL, 0, kcage_cageout,
1158 		    NULL, 0, proc_pageout, TS_RUN, maxclsyspri - 1);
1159 	}
1160 }
1161 
1162 
1163 /*
1164  * VM Interfaces:
1165  * kcage_create_throttle()
1166  * kcage_freemem_add()
1167  * kcage_freemem_sub()
1168  */
1169 
1170 /*
1171  * Wakeup cageout thread and throttle waiting for the number of pages
1172  * requested to become available.  For non-critical requests, a
1173  * timeout is added, since freemem accounting is separate from cage
1174  * freemem accounting: it's possible for us to get stuck and not make
1175  * forward progress even though there was sufficient freemem before
1176  * arriving here.
1177  */
1178 int
1179 kcage_create_throttle(pgcnt_t npages, int flags)
1180 {
1181 	int niter = 0;
1182 	pgcnt_t lastfree;
1183 	int enough = kcage_freemem > kcage_throttlefree + npages;
1184 
1185 	KCAGE_STAT_INCR(kct_calls);		/* unprotected incr. */
1186 
1187 	kcage_cageout_wakeup();			/* just to be sure */
1188 	KCAGE_STAT_INCR(kct_cagewake);		/* unprotected incr. */
1189 
1190 	/*
1191 	 * Obviously, we can't throttle the cageout thread since
1192 	 * we depend on it.  We also can't throttle the panic thread.
1193 	 */
1194 	if (curthread == kcage_cageout_thread || panicstr) {
1195 		KCAGE_STAT_INCR(kct_cageout);	/* unprotected incr. */
1196 		return (KCT_CRIT);
1197 	}
1198 
1199 	/*
1200 	 * Don't throttle threads which are critical for proper
1201 	 * vm management if we're above kcage_throttlefree or
1202 	 * if freemem is very low.
1203 	 */
1204 	if (NOMEMWAIT()) {
1205 		if (enough) {
1206 			KCAGE_STAT_INCR(kct_exempt);	/* unprotected incr. */
1207 			return (KCT_CRIT);
1208 		} else if (freemem < minfree) {
1209 			KCAGE_STAT_INCR(kct_critical);  /* unprotected incr. */
1210 			return (KCT_CRIT);
1211 		}
1212 	}
1213 
1214 	/*
1215 	 * Don't throttle real-time threads if kcage_freemem > kcage_reserve.
1216 	 */
1217 	if (DISP_PRIO(curthread) > maxclsyspri &&
1218 	    kcage_freemem > kcage_reserve) {
1219 		KCAGE_STAT_INCR(kct_exempt);	/* unprotected incr. */
1220 		return (KCT_CRIT);
1221 	}
1222 
1223 	/*
1224 	 * Cause all other threads (which are assumed to not be
1225 	 * critical to cageout) to wait here until their request
1226 	 * can be satisfied. Be a little paranoid and wake the
1227 	 * kernel cage on each loop through this logic.
1228 	 */
1229 	while (kcage_freemem < kcage_throttlefree + npages) {
1230 		ASSERT(kcage_on);
1231 
1232 		lastfree = kcage_freemem;
1233 
1234 		if (kcage_cageout_ready) {
1235 			mutex_enter(&kcage_throttle_mutex);
1236 
1237 			kcage_needfree += npages;
1238 			KCAGE_STAT_INCR(kct_wait);
1239 
1240 			kcage_cageout_wakeup();
1241 			KCAGE_STAT_INCR(kct_cagewake);
1242 
1243 			cv_wait(&kcage_throttle_cv, &kcage_throttle_mutex);
1244 
1245 			kcage_needfree -= npages;
1246 
1247 			mutex_exit(&kcage_throttle_mutex);
1248 		} else {
1249 			/*
1250 			 * NOTE: atomics are used just in case we enter
1251 			 * mp operation before the cageout thread is ready.
1252 			 */
1253 			atomic_add_long(&kcage_needfree, npages);
1254 
1255 			kcage_cageout_wakeup();
1256 			KCAGE_STAT_INCR(kct_cagewake);	/* unprotected incr. */
1257 
1258 			atomic_add_long(&kcage_needfree, -npages);
1259 		}
1260 
1261 		if ((flags & PG_WAIT) == 0) {
1262 			if (kcage_freemem > lastfree) {
1263 				KCAGE_STAT_INCR(kct_progress);
1264 				niter = 0;
1265 			} else {
1266 				KCAGE_STAT_INCR(kct_noprogress);
1267 				if (++niter >= kcage_maxwait) {
1268 					KCAGE_STAT_INCR(kct_timeout);
1269 					return (KCT_FAILURE);
1270 				}
1271 			}
1272 		}
1273 	}
1274 	return (KCT_NONCRIT);
1275 }
1276 
1277 void
1278 kcage_freemem_add(pgcnt_t npages)
1279 {
1280 	extern void wakeup_pcgs(void);
1281 
1282 	atomic_add_long(&kcage_freemem, npages);
1283 
1284 	wakeup_pcgs();  /* wakeup threads in pcgs() */
1285 
1286 	if (kcage_needfree != 0 &&
1287 	    kcage_freemem >= (kcage_throttlefree + kcage_needfree)) {
1288 
1289 		mutex_enter(&kcage_throttle_mutex);
1290 		cv_broadcast(&kcage_throttle_cv);
1291 		KCAGE_STAT_INCR(kfa_trottlewake);
1292 		mutex_exit(&kcage_throttle_mutex);
1293 	}
1294 }
1295 
1296 void
1297 kcage_freemem_sub(pgcnt_t npages)
1298 {
1299 	atomic_add_long(&kcage_freemem, -npages);
1300 
1301 	if (kcage_freemem < kcage_desfree) {
1302 		kcage_cageout_wakeup();
1303 		KCAGE_STAT_INCR(kfs_cagewake); /* unprotected incr. */
1304 	}
1305 }
1306 
1307 /*
1308  * return 0 on failure and 1 on success.
1309  */
1310 static int
1311 kcage_setnoreloc_pages(page_t *rootpp, se_t se)
1312 {
1313 	pgcnt_t npgs, i;
1314 	page_t *pp;
1315 	pfn_t rootpfn = page_pptonum(rootpp);
1316 	uint_t szc;
1317 
1318 	ASSERT(!PP_ISFREE(rootpp));
1319 	ASSERT(PAGE_LOCKED_SE(rootpp, se));
1320 	if (!group_page_trylock(rootpp, se)) {
1321 		return (0);
1322 	}
1323 	szc = rootpp->p_szc;
1324 	if (szc == 0) {
1325 		/*
1326 		 * The szc of a locked page can only change for pages that are
1327 		 * non-swapfs (i.e. anonymous memory) file system pages.
1328 		 */
1329 		ASSERT(rootpp->p_vnode != NULL &&
1330 		    !PP_ISKAS(rootpp) &&
1331 		    !IS_SWAPFSVP(rootpp->p_vnode));
1332 		PP_SETNORELOC(rootpp);
1333 		return (1);
1334 	}
1335 	npgs = page_get_pagecnt(szc);
1336 	ASSERT(IS_P2ALIGNED(rootpfn, npgs));
1337 	pp = rootpp;
1338 	for (i = 0; i < npgs; i++, pp++) {
1339 		ASSERT(PAGE_LOCKED_SE(pp, se));
1340 		ASSERT(!PP_ISFREE(pp));
1341 		ASSERT(pp->p_szc == szc);
1342 		PP_SETNORELOC(pp);
1343 	}
1344 	group_page_unlock(rootpp);
1345 	return (1);
1346 }
1347 
1348 /*
1349  * Attempt to convert page to a caged page (set the P_NORELOC flag).
1350  * If successful and pages is free, move page to the tail of whichever
1351  * list it is on.
1352  * Returns:
1353  *   EBUSY  page already locked, assimilated but not free.
1354  *   ENOMEM page assimilated, but memory too low to relocate. Page not free.
1355  *   EAGAIN page not assimilated. Page not free.
1356  *   ERANGE page assimilated. Page not root.
1357  *   0      page assimilated. Page free.
1358  *   *nfreedp number of pages freed.
1359  * NOTE: With error codes ENOMEM, EBUSY, and 0 (zero), there is no way
1360  * to distinguish between a page that was already a NORELOC page from
1361  * those newly converted to NORELOC pages by this invocation of
1362  * kcage_assimilate_page.
1363  */
1364 static int
1365 kcage_assimilate_page(page_t *pp, pgcnt_t *nfreedp)
1366 {
1367 	if (page_trylock(pp, SE_EXCL)) {
1368 		if (PP_ISNORELOC(pp)) {
1369 check_free_and_return:
1370 			if (PP_ISFREE(pp)) {
1371 				page_unlock(pp);
1372 				*nfreedp = 0;
1373 				return (0);
1374 			} else {
1375 				page_unlock(pp);
1376 				return (EBUSY);
1377 			}
1378 			/*NOTREACHED*/
1379 		}
1380 	} else {
1381 		if (page_trylock(pp, SE_SHARED)) {
1382 			if (PP_ISNORELOC(pp))
1383 				goto check_free_and_return;
1384 		} else
1385 			return (EAGAIN);
1386 
1387 		if (!PP_ISFREE(pp)) {
1388 			page_unlock(pp);
1389 			return (EAGAIN);
1390 		}
1391 
1392 		/*
1393 		 * Need to upgrade the lock on it and set the NORELOC
1394 		 * bit. If it is free then remove it from the free
1395 		 * list so that the platform free list code can keep
1396 		 * NORELOC pages where they should be.
1397 		 */
1398 		/*
1399 		 * Before doing anything, get the exclusive lock.
1400 		 * This may fail (eg ISM pages are left shared locked).
1401 		 * If the page is free this will leave a hole in the
1402 		 * cage. There is no solution yet to this.
1403 		 */
1404 		if (!page_tryupgrade(pp)) {
1405 			page_unlock(pp);
1406 			return (EAGAIN);
1407 		}
1408 	}
1409 
1410 	ASSERT(PAGE_EXCL(pp));
1411 
1412 	if (PP_ISFREE(pp)) {
1413 		int which = PP_ISAGED(pp) ? PG_FREE_LIST : PG_CACHE_LIST;
1414 
1415 		page_list_sub(pp, which);
1416 		ASSERT(pp->p_szc == 0);
1417 		PP_SETNORELOC(pp);
1418 		PLCNT_XFER_NORELOC(pp);
1419 		page_list_add(pp, which | PG_LIST_TAIL);
1420 
1421 		page_unlock(pp);
1422 		*nfreedp = 1;
1423 		return (0);
1424 	} else {
1425 		if (pp->p_szc != 0) {
1426 			if (!kcage_setnoreloc_pages(pp, SE_EXCL)) {
1427 				page_unlock(pp);
1428 				return (EAGAIN);
1429 			}
1430 			ASSERT(PP_ISNORELOC(pp));
1431 		} else {
1432 			PP_SETNORELOC(pp);
1433 		}
1434 		PLCNT_XFER_NORELOC(pp);
1435 		return (kcage_invalidate_page(pp, nfreedp));
1436 	}
1437 	/*NOTREACHED*/
1438 }
1439 
1440 static int
1441 kcage_expand()
1442 {
1443 	int did_something = 0;
1444 
1445 	spgcnt_t wanted;
1446 	pfn_t pfn;
1447 	page_t *pp;
1448 	/* TODO: we don't really need n any more? */
1449 	pgcnt_t n;
1450 	pgcnt_t nf, nfreed;
1451 
1452 	/*
1453 	 * Expand the cage if available cage memory is really low. Calculate
1454 	 * the amount required to return kcage_freemem to the level of
1455 	 * kcage_lotsfree, or to satisfy throttled requests, whichever is
1456 	 * more.  It is rare for their sum to create an artificial threshold
1457 	 * above kcage_lotsfree, but it is possible.
1458 	 *
1459 	 * Exit early if expansion amount is equal to or less than zero.
1460 	 * (<0 is possible if kcage_freemem rises suddenly.)
1461 	 *
1462 	 * Exit early when the global page pool (apparently) does not
1463 	 * have enough free pages to page_relocate() even a single page.
1464 	 */
1465 	wanted = MAX(kcage_lotsfree, kcage_throttlefree + kcage_needfree)
1466 	    - kcage_freemem;
1467 	if (wanted <= 0)
1468 		return (0);
1469 	else if (freemem < pageout_reserve + 1) {
1470 		KCAGE_STAT_INCR(ke_lowfreemem);
1471 		return (0);
1472 	}
1473 
1474 	KCAGE_STAT_INCR(ke_calls);
1475 	KCAGE_STAT_SET_SCAN(ke_wanted, (uint_t)wanted);
1476 
1477 	/*
1478 	 * Assimilate more pages from the global page pool into the cage.
1479 	 */
1480 	n = 0;				/* number of pages PP_SETNORELOC'd */
1481 	nf = 0;				/* number of those actually free */
1482 	while (kcage_on && nf < wanted) {
1483 		pfn = kcage_get_pfn(1);
1484 		if (pfn == PFN_INVALID) {	/* eek! no where to grow */
1485 			KCAGE_STAT_INCR(ke_nopfn);
1486 			goto terminate;
1487 		}
1488 
1489 		KCAGE_STAT_INCR_SCAN(ke_examined);
1490 
1491 		if ((pp = page_numtopp_nolock(pfn)) == NULL) {
1492 			KCAGE_STAT_INCR(ke_nopaget);
1493 			continue;
1494 		}
1495 		KCAGEPAGETS_INC();
1496 		/*
1497 		 * Sanity check. Skip this pfn if it is
1498 		 * being deleted.
1499 		 */
1500 		if (pfn_is_being_deleted(pfn)) {
1501 			KCAGE_STAT_INCR(ke_deleting);
1502 			continue;
1503 		}
1504 
1505 		if (PP_ISNORELOC(pp)) {
1506 			KCAGE_STAT_INCR(ke_isnoreloc);
1507 			continue;
1508 		}
1509 
1510 		switch (kcage_assimilate_page(pp, &nfreed)) {
1511 			case 0:		/* assimilated, page is free */
1512 				KCAGE_STAT_NINCR_SCAN(ke_gotonefree, nfreed);
1513 				did_something = 1;
1514 				nf += nfreed;
1515 				n++;
1516 				break;
1517 
1518 			case EBUSY:	/* assimilated, page not free */
1519 			case ERANGE:	/* assimilated, page not root */
1520 				KCAGE_STAT_INCR_SCAN(ke_gotone);
1521 				did_something = 1;
1522 				n++;
1523 				break;
1524 
1525 			case ENOMEM:	/* assimilated, but no mem */
1526 				KCAGE_STAT_INCR(ke_terminate);
1527 				did_something = 1;
1528 				n++;
1529 				goto terminate;
1530 
1531 			case EAGAIN:	/* can't assimilate */
1532 				KCAGE_STAT_INCR_SCAN(ke_lefthole);
1533 				break;
1534 
1535 			default:	/* catch this with debug kernels */
1536 				ASSERT(0);
1537 				break;
1538 		}
1539 	}
1540 
1541 	/*
1542 	 * Realign cage edge with the nearest physical address
1543 	 * boundry for big pages. This is done to give us a
1544 	 * better chance of actually getting usable big pages
1545 	 * in the cage.
1546 	 */
1547 
1548 terminate:
1549 
1550 	return (did_something);
1551 }
1552 
1553 /*
1554  * Relocate page opp (Original Page Pointer) from cage pool to page rpp
1555  * (Replacement Page Pointer) in the global pool. Page opp will be freed
1556  * if relocation is successful, otherwise it is only unlocked.
1557  * On entry, page opp must be exclusively locked and not free.
1558  * *nfreedp: number of pages freed.
1559  */
1560 static int
1561 kcage_relocate_page(page_t *pp, pgcnt_t *nfreedp)
1562 {
1563 	page_t *opp = pp;
1564 	page_t *rpp = NULL;
1565 	spgcnt_t npgs;
1566 	int result;
1567 
1568 	ASSERT(!PP_ISFREE(opp));
1569 	ASSERT(PAGE_EXCL(opp));
1570 
1571 	result = page_relocate(&opp, &rpp, 1, 1, &npgs, NULL);
1572 	*nfreedp = npgs;
1573 	if (result == 0) {
1574 		while (npgs-- > 0) {
1575 			page_t *tpp;
1576 
1577 			ASSERT(rpp != NULL);
1578 			tpp = rpp;
1579 			page_sub(&rpp, tpp);
1580 			page_unlock(tpp);
1581 		}
1582 
1583 		ASSERT(rpp == NULL);
1584 
1585 		return (0);		/* success */
1586 	}
1587 
1588 	page_unlock(opp);
1589 	return (result);
1590 }
1591 
1592 /*
1593  * Based on page_invalidate_pages()
1594  *
1595  * Kcage_invalidate_page() uses page_relocate() twice. Both instances
1596  * of use must be updated to match the new page_relocate() when it
1597  * becomes available.
1598  *
1599  * Return result of kcage_relocate_page or zero if page was directly freed.
1600  * *nfreedp: number of pages freed.
1601  */
1602 static int
1603 kcage_invalidate_page(page_t *pp, pgcnt_t *nfreedp)
1604 {
1605 	int result;
1606 
1607 #if defined(__sparc)
1608 	extern struct vnode prom_ppages;
1609 	ASSERT(pp->p_vnode != &prom_ppages);
1610 #endif /* __sparc */
1611 
1612 	ASSERT(!PP_ISFREE(pp));
1613 	ASSERT(PAGE_EXCL(pp));
1614 
1615 	/*
1616 	 * Is this page involved in some I/O? shared?
1617 	 * The page_struct_lock need not be acquired to
1618 	 * examine these fields since the page has an
1619 	 * "exclusive" lock.
1620 	 */
1621 	if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1622 		result = kcage_relocate_page(pp, nfreedp);
1623 #ifdef KCAGE_STATS
1624 		if (result == 0)
1625 			KCAGE_STAT_INCR_SCAN(kip_reloclocked);
1626 		else if (result == ENOMEM)
1627 			KCAGE_STAT_INCR_SCAN(kip_nomem);
1628 #endif
1629 		return (result);
1630 	}
1631 
1632 	ASSERT(pp->p_vnode->v_type != VCHR);
1633 
1634 	/*
1635 	 * Unload the mappings and check if mod bit is set.
1636 	 */
1637 	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1638 
1639 	if (hat_ismod(pp)) {
1640 		result = kcage_relocate_page(pp, nfreedp);
1641 #ifdef KCAGE_STATS
1642 		if (result == 0)
1643 			KCAGE_STAT_INCR_SCAN(kip_relocmod);
1644 		else if (result == ENOMEM)
1645 			KCAGE_STAT_INCR_SCAN(kip_nomem);
1646 #endif
1647 		return (result);
1648 	}
1649 
1650 	if (!page_try_demote_pages(pp)) {
1651 		KCAGE_STAT_INCR_SCAN(kip_demotefailed);
1652 		page_unlock(pp);
1653 		return (EAGAIN);
1654 	}
1655 
1656 	page_destroy(pp, 0);
1657 	KCAGE_STAT_INCR_SCAN(kip_destroy);
1658 	*nfreedp = 1;
1659 	return (0);
1660 }
1661 
/*
 * Body of the cageout thread; it never returns.
 *
 * After registering for CPR and announcing itself (kcage_cageout_thread,
 * kcage_cageout_ready), the thread repeats the following cycle while
 * holding kcage_cageout_mutex:
 *
 *   loop:   sleep on kcage_cageout_cv until signalled.
 *   again:  walk the cage with kcage_walk_cage() for as long as
 *           kcage_freemem is below kcage_lotsfree or kcage_needfree is
 *           non-zero.  Pages without P_NORELOC are offered to
 *           kcage_assimilate_page(); P_NORELOC pages that are in use are
 *           destroyed or passed to kcage_invalidate_page(), with fewer
 *           kinds of page being skipped as `pass' grows.
 *   then:   decide whether to call kcage_expand(), scan again, or go back
 *           to sleep.
 *
 * `pass' is bumped each time the walk comes back around to start_pfn.
 * `shared_level' is the threshold handed to hat_page_checkshare(); it is
 * doubled when shared pages were skipped and nothing else justified
 * another scan, and halved again (down to 8) before going back to sleep.
 */
static void
kcage_cageout()
{
	pfn_t pfn;
	page_t *pp;
	callb_cpr_t cprinfo;
	int did_something;
	int scan_again;
	pfn_t start_pfn;
	int pass;
	int last_pass;
	int pages_skipped;
	int shared_skipped;
	ulong_t shared_level = 8;
	pgcnt_t nfreed;
#ifdef KCAGE_STATS
	clock_t scan_start;
#endif

	CALLB_CPR_INIT(&cprinfo, &kcage_cageout_mutex,
	    callb_generic_cpr, "cageout");

	mutex_enter(&kcage_cageout_mutex);
	kcage_cageout_thread = curthread;

	pfn = PFN_INVALID;		/* force scan reset */
	start_pfn = PFN_INVALID;	/* force init with 1st cage pfn */
	kcage_cageout_ready = 1;	/* switch kcage_cageout_wakeup mode */

loop:
	/*
	 * Wait here. Sooner or later, kcage_freemem_sub() will notice
	 * that kcage_freemem is less than kcage_desfree. When it does
	 * notice, kcage_freemem_sub() will wake us up via call to
	 * kcage_cageout_wakeup().
	 */
	CALLB_CPR_SAFE_BEGIN(&cprinfo);
	cv_wait(&kcage_cageout_cv, &kcage_cageout_mutex);
	CALLB_CPR_SAFE_END(&cprinfo, &kcage_cageout_mutex);

	KCAGE_STAT_INCR(kt_wakeups);
	KCAGE_STAT_SET_SCAN(kt_freemem_start, freemem);
	KCAGE_STAT_SET_SCAN(kt_kcage_freemem_start, kcage_freemem);
	/* Each wakeup starts counting passes from zero. */
	pass = 0;
	last_pass = 0;

#ifdef KCAGE_STATS
	scan_start = lbolt;
#endif

again:
	/* With the cage disabled there is nothing to scan; sleep again. */
	if (!kcage_on)
		goto loop;

	KCAGE_STAT_INCR(kt_scans);
	KCAGE_STAT_INCR_SCAN(kt_passes);

	/* Per-scan bookkeeping, consulted by the decisions after the walk. */
	did_something = 0;
	pages_skipped = 0;
	shared_skipped = 0;
	while ((kcage_freemem < kcage_lotsfree || kcage_needfree) &&
	    (pfn = kcage_walk_cage(pfn == PFN_INVALID)) != PFN_INVALID) {

		if (start_pfn == PFN_INVALID)
			start_pfn = pfn;
		else if (start_pfn == pfn) {
			/* The walk has come back around to where it began. */
			last_pass = pass;
			pass += 1;
			/*
			 * Did a complete walk of kernel cage, but didn't free
			 * any pages.  If only one cpu is online then
			 * stop kernel cage walk and try expanding.
			 */
			if (ncpus_online == 1 && did_something == 0) {
				KCAGE_STAT_INCR(kt_cageout_break);
				break;
			}
		}

		pp = page_numtopp_nolock(pfn);
		if (pp == NULL) {
			continue;
		}

		KCAGE_STAT_INCR_SCAN(kt_examined);

		/*
		 * Do a quick PP_ISNORELOC() and PP_ISFREE test outside
		 * of the lock. If one is missed it will be seen next
		 * time through.
		 *
		 * Skip non-caged-pages. These pages can exist in the cage
		 * because, if during cage expansion, a page is
		 * encountered that is long-term locked the lock prevents the
		 * expansion logic from setting the P_NORELOC flag. Hence,
		 * non-caged-pages surrounded by caged-pages.
		 */
		if (!PP_ISNORELOC(pp)) {
			/*
			 * Not caged yet: try to assimilate it now.  Only
			 * the 0, EBUSY and ERANGE results count as work
			 * done.
			 */
			switch (kcage_assimilate_page(pp, &nfreed)) {
				case 0:
					did_something = 1;
					KCAGE_STAT_NINCR_SCAN(kt_gotonefree,
					    nfreed);
					break;

				case EBUSY:
				case ERANGE:
					did_something = 1;
					KCAGE_STAT_INCR_SCAN(kt_gotone);
					break;

				case EAGAIN:
				case ENOMEM:
					break;

				default:
					/* catch this with debug kernels */
					ASSERT(0);
					break;
			}

			continue;
		} else {
			int prm;

			/* Already free: nothing to reclaim from this page. */
			if (PP_ISFREE(pp)) {
				continue;
			}

			/*
			 * Leave locked kernel (KAS) pages alone, and skip
			 * anything we cannot lock exclusively right now.
			 */
			if ((PP_ISKAS(pp) && pp->p_lckcnt > 0) ||
			    !page_trylock(pp, SE_EXCL)) {
				KCAGE_STAT_INCR_SCAN(kt_cantlock);
				continue;
			}

			/* P_NORELOC bit should not have gone away. */
			ASSERT(PP_ISNORELOC(pp));
			/*
			 * Repeat the unlocked checks now that the page
			 * lock is held.
			 */
			if (PP_ISFREE(pp) || (PP_ISKAS(pp) &&
			    pp->p_lckcnt > 0)) {
				page_unlock(pp);
				continue;
			}

			KCAGE_STAT_SET_SCAN(kt_skiplevel, shared_level);
			if (hat_page_checkshare(pp, shared_level)) {
				page_unlock(pp);
				pages_skipped = 1;
				shared_skipped = 1;
				KCAGE_STAT_INCR_SCAN(kt_skipshared);
				continue;
			}

			/*
			 * In pass {0, 1}, skip page if ref bit is set.
			 * In pass {0, 1, 2}, skip page if mod bit is set.
			 */
			prm = hat_pagesync(pp,
			    HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD);

			/* In passes 0 and 1, ignore ref'd pages */
			if (pass <= 1 && (prm & P_REF)) {
				KCAGE_STAT_INCR_SCAN(kt_skiprefd);
				pages_skipped = 1;
				page_unlock(pp);
				continue;
			}

			/*
			 * Through pass 2, only page_destroy small pages
			 * that are unmodified and have no lock or cow
			 * counts; everything else is skipped for now.
			 */
			if (pass <= 2) {
				if (pp->p_szc != 0 || (prm & P_MOD) ||
				    pp->p_lckcnt || pp->p_cowcnt) {
					pages_skipped = 1;
					page_unlock(pp);
				} else {

					/*
					 * unload the mappings before
					 * checking if mod bit is set
					 */
					(void) hat_pageunload(pp,
					    HAT_FORCE_PGUNLOAD);

					/*
					 * skip this page if modified
					 */
					if (hat_ismod(pp)) {
						pages_skipped = 1;
						page_unlock(pp);
						continue;
					}

					KCAGE_STAT_INCR_SCAN(kt_destroy);
					page_destroy(pp, 0);
					did_something = 1;
				}
				continue;
			}

			/*
			 * Pass 3 and later: whatever is left is handed to
			 * kcage_invalidate_page().
			 */
			if (kcage_invalidate_page(pp, &nfreed) == 0) {
				did_something = 1;
				KCAGE_STAT_NINCR_SCAN(kt_gotonefree, nfreed);
			}

			/*
			 * No need to drop the page lock here.
			 * Kcage_invalidate_page has done that for us
			 * either explicitly or through a page_free.
			 */
		}
	}

	/*
	 * Expand the cage only if available cage memory is really low.
	 * This test is done only after a complete scan of the cage.
	 * The reason for not checking and expanding more often is to
	 * avoid rapid expansion of the cage. Naturally, scanning the
	 * cage takes time. So by scanning first, we use that work as a
	 * delay loop in between expand decisions.
	 */

	scan_again = 0;
	if (kcage_freemem < kcage_minfree || kcage_needfree) {
		/*
		 * Kcage_expand() will return a non-zero value if it was
		 * able to expand the cage -- whether or not the new
		 * pages are free and immediately usable. If non-zero,
		 * we do another scan of the cage. The pages might be
		 * freed during that scan or by time we get back here.
		 * If not, we will attempt another expansion.
		 * However, if kcage_expand() returns zero, then it was
		 * unable to expand the cage. This is the case when the
		 * growth list is exhausted, therefore no work was done
		 * and there is no reason to scan the cage again.
		 * Note: Kernel cage scan is not repeated on single-cpu
		 * system to avoid kernel cage thread hogging cpu.
		 */
		if (pass <= 3 && pages_skipped && ncpus_online > 1)
			scan_again = 1;
		else
			(void) kcage_expand(); /* don't scan again */
	} else if (kcage_freemem < kcage_lotsfree) {
		/*
		 * If available cage memory is less than abundant
		 * and a full scan of the cage has not yet been completed,
		 * or a scan has completed and some work was performed,
		 * or pages were skipped because of sharing,
		 * or we simply have not yet completed two passes,
		 * then do another scan.
		 */
		if (pass <= 2 && pages_skipped)
			scan_again = 1;
		if (pass == last_pass || did_something)
			scan_again = 1;
		else if (shared_skipped && shared_level < (8<<24)) {
			/* Tolerate more sharing on the next scan. */
			shared_level <<= 1;
			scan_again = 1;
		}
	}

	if (scan_again && ncpus_online > 1)
		goto again;
	else {
		/* Ease the sharing threshold back down toward 8. */
		if (shared_level > 8)
			shared_level >>= 1;

		KCAGE_STAT_SET_SCAN(kt_freemem_end, freemem);
		KCAGE_STAT_SET_SCAN(kt_kcage_freemem_end, kcage_freemem);
		KCAGE_STAT_SET_SCAN(kt_ticks, lbolt - scan_start);
		KCAGE_STAT_INC_SCAN_INDEX;
		goto loop;
	}

	/*NOTREACHED*/
}
1936 
1937 void
1938 kcage_cageout_wakeup()
1939 {
1940 	if (mutex_tryenter(&kcage_cageout_mutex)) {
1941 		if (kcage_cageout_ready) {
1942 			cv_signal(&kcage_cageout_cv);
1943 		} else if (kcage_freemem < kcage_minfree || kcage_needfree) {
1944 			/*
1945 			 * Available cage memory is really low. Time to
1946 			 * start expanding the cage. However, the
1947 			 * kernel cage thread is not yet ready to
1948 			 * do the work. Use *this* thread, which is
1949 			 * most likely to be t0, to do the work.
1950 			 */
1951 			KCAGE_STAT_INCR(kcw_expandearly);
1952 			(void) kcage_expand();
1953 			KCAGE_STAT_INC_SCAN_INDEX;
1954 		}
1955 
1956 		mutex_exit(&kcage_cageout_mutex);
1957 	}
1958 	/* else, kernel cage thread is already running */
1959 }
1960 
1961 void
1962 kcage_tick()
1963 {
1964 	/*
1965 	 * Once per second we wake up all the threads throttled
1966 	 * waiting for cage memory, in case we've become stuck
1967 	 * and haven't made forward progress expanding the cage.
1968 	 */
1969 	if (kcage_on && kcage_cageout_ready)
1970 		cv_broadcast(&kcage_throttle_cv);
1971 }
1972