xref: /titanic_50/usr/src/uts/common/os/mem_cage.c (revision ff17c8bf86c3e567734be83f90267edee20f580f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/types.h>
29 #include <sys/param.h>
30 #include <sys/thread.h>
31 #include <sys/proc.h>
32 #include <sys/callb.h>
33 #include <sys/vnode.h>
34 #include <sys/debug.h>
35 #include <sys/systm.h>		/* for bzero */
36 #include <sys/memlist.h>
37 #include <sys/cmn_err.h>
38 #include <sys/sysmacros.h>
39 #include <sys/vmsystm.h>	/* for NOMEMWAIT() */
40 #include <sys/atomic.h>		/* used to update kcage_freemem */
41 #include <sys/kmem.h>		/* for kmem_reap */
42 #include <sys/errno.h>
43 #include <sys/mem_cage.h>
44 #include <vm/seg_kmem.h>
45 #include <vm/page.h>
46 #include <vm/hat.h>
47 #include <vm/vm_dep.h>
48 #include <sys/mem_config.h>
49 #include <sys/lgrp.h>
50 #include <sys/rwlock.h>
51 #include <sys/cpupart.h>
52 
53 extern pri_t maxclsyspri;
54 
55 #ifdef DEBUG
56 #define	KCAGE_STATS
57 #endif
58 
59 #ifdef KCAGE_STATS
60 
61 #define	KCAGE_STATS_VERSION 9	/* can help report generators */
62 #define	KCAGE_STATS_NSCANS 256	/* depth of scan statistics buffer */
63 
/*
 * Statistics gathered for a single scan. Instances live in the
 * kcage_stats.scans[] ring buffer, which is indexed by
 * kcage_stats.scan_index. Fields are grouped by the function that
 * updates them.
 */
64 struct kcage_stats_scan {
65 	/* managed by KCAGE_STAT_* macros */
66 	clock_t	scan_lbolt;
67 	uint_t	scan_id;
68 
69 	/* set in kcage_cageout() */
70 	uint_t	kt_passes;
71 	clock_t	kt_ticks;
72 	pgcnt_t	kt_kcage_freemem_start;
73 	pgcnt_t	kt_kcage_freemem_end;
74 	pgcnt_t kt_freemem_start;
75 	pgcnt_t kt_freemem_end;
76 	uint_t	kt_examined;
77 	uint_t	kt_cantlock;
78 	uint_t	kt_gotone;
79 	uint_t	kt_gotonefree;
80 	uint_t	kt_skiplevel;
81 	uint_t	kt_skipshared;
82 	uint_t	kt_skiprefd;
83 	uint_t	kt_destroy;
84 
85 	/* set in kcage_invalidate_page() */
86 	uint_t	kip_reloclocked;
87 	uint_t	kip_relocmod;
88 	uint_t	kip_destroy;
89 	uint_t	kip_nomem;
90 	uint_t	kip_demotefailed;
91 
92 	/* set in kcage_expand() */
93 	uint_t	ke_wanted;
94 	uint_t	ke_examined;
95 	uint_t	ke_lefthole;
96 	uint_t	ke_gotone;
97 	uint_t	ke_gotonefree;
98 };
99 
/*
 * Cage-wide statistics, only compiled in when KCAGE_STATS is defined.
 * Holds global counters plus a ring of KCAGE_STATS_NSCANS per-scan
 * records; scan_index selects the record currently being filled in.
 */
100 struct kcage_stats {
101 	/* managed by KCAGE_STAT_* macros */
102 	uint_t	version;
103 	uint_t	size;
104 
105 	/* set in kcage_cageout */
106 	uint_t	kt_wakeups;
107 	uint_t	kt_scans;
108 	uint_t	kt_cageout_break;
109 
110 	/* set in kcage_expand */
111 	uint_t	ke_calls;
112 	uint_t	ke_nopfn;
113 	uint_t	ke_nopaget;
114 	uint_t	ke_isnoreloc;
115 	uint_t	ke_deleting;
116 	uint_t	ke_lowfreemem;
117 	uint_t	ke_terminate;
118 
119 	/* set in kcage_freemem_add() */
120 	uint_t	kfa_trottlewake;
121 
122 	/* set in kcage_freemem_sub() */
123 	uint_t	kfs_cagewake;
124 
125 	/* set in kcage_create_throttle */
126 	uint_t	kct_calls;
127 	uint_t	kct_cageout;
128 	uint_t	kct_critical;
129 	uint_t	kct_exempt;
130 	uint_t	kct_cagewake;
131 	uint_t	kct_wait;
132 	uint_t	kct_progress;
133 	uint_t	kct_noprogress;
134 	uint_t	kct_timeout;
135 
136 	/* set in kcage_cageout_wakeup */
137 	uint_t	kcw_expandearly;
138 
139 	/* managed by KCAGE_STAT_* macros */
140 	uint_t	scan_array_size;
141 	uint_t	scan_index;
142 	struct kcage_stats_scan scans[KCAGE_STATS_NSCANS];
143 };
144 
145 static struct kcage_stats kcage_stats;
146 static struct kcage_stats_scan kcage_stats_scan_zero;
147 
148 /*
149  * No real need for atomics here. For the most part the incs and sets are
150  * done by the kernel cage thread. There are a few that are done by any
151  * number of other threads. Those cases are noted by comments.
152  */
/*
 * All of these macros are meant to be used as a single statement
 * followed by a semicolon. Macros that expand to a conditional or to
 * several statements are wrapped in do { } while (0) so they remain
 * one statement inside an unbraced if/else or loop body.
 */

/* Increment global counter m. */
#define	KCAGE_STAT_INCR(m)	(kcage_stats.m++)

/* Add v to global counter m. */
#define	KCAGE_STAT_NINCR(m, v)	(kcage_stats.m += (v))

/* Increment counter m in the current scan record. */
#define	KCAGE_STAT_INCR_SCAN(m)	\
	KCAGE_STAT_INCR(scans[kcage_stats.scan_index].m)

/* Add v to counter m in the current scan record. */
#define	KCAGE_STAT_NINCR_SCAN(m, v) \
	KCAGE_STAT_NINCR(scans[kcage_stats.scan_index].m, v)

/* Store v into global field m. */
#define	KCAGE_STAT_SET(m, v)	(kcage_stats.m = (v))

/* Store v into global field m only if m is still zero. */
#define	KCAGE_STAT_SETZ(m, v)	\
	do { \
		if (kcage_stats.m == 0) \
			kcage_stats.m = (v); \
	} while (0)

/* Store v into field m of the current scan record. */
#define	KCAGE_STAT_SET_SCAN(m, v)	\
	KCAGE_STAT_SET(scans[kcage_stats.scan_index].m, v)

/* Store v into field m of the current scan record if m is still zero. */
#define	KCAGE_STAT_SETZ_SCAN(m, v)	\
	KCAGE_STAT_SETZ(scans[kcage_stats.scan_index].m, v)

/*
 * Close out the current scan record (stamp it with lbolt and its
 * index), advance to the next slot of the ring and clear that slot.
 */
#define	KCAGE_STAT_INC_SCAN_INDEX \
	do { \
		KCAGE_STAT_SET_SCAN(scan_lbolt, lbolt); \
		KCAGE_STAT_SET_SCAN(scan_id, kcage_stats.scan_index); \
		kcage_stats.scan_index = \
		    (kcage_stats.scan_index + 1) % KCAGE_STATS_NSCANS; \
		kcage_stats.scans[kcage_stats.scan_index] = \
		    kcage_stats_scan_zero; \
	} while (0)

/* Fill in the self-describing header fields and rewind the ring. */
#define	KCAGE_STAT_INIT_SCAN_INDEX \
	do { \
		kcage_stats.version = KCAGE_STATS_VERSION; \
		kcage_stats.size = sizeof (kcage_stats); \
		kcage_stats.scan_array_size = KCAGE_STATS_NSCANS; \
		kcage_stats.scan_index = 0; \
	} while (0)
186 
187 #else /* KCAGE_STATS */
188 
189 #define	KCAGE_STAT_INCR(v)
190 #define	KCAGE_STAT_NINCR(m, v)
191 #define	KCAGE_STAT_INCR_SCAN(v)
192 #define	KCAGE_STAT_NINCR_SCAN(m, v)
193 #define	KCAGE_STAT_SET(m, v)
194 #define	KCAGE_STAT_SETZ(m, v)
195 #define	KCAGE_STAT_SET_SCAN(m, v)
196 #define	KCAGE_STAT_SETZ_SCAN(m, v)
197 #define	KCAGE_STAT_INC_SCAN_INDEX
198 #define	KCAGE_STAT_INIT_SCAN_INDEX
199 
200 #endif /* KCAGE_STATS */
201 
202 static kmutex_t kcage_throttle_mutex;	/* protects kcage_throttle_cv */
203 static kcondvar_t kcage_throttle_cv;
204 
205 static kmutex_t kcage_cageout_mutex;	/* protects cv and ready flag */
206 static kcondvar_t kcage_cageout_cv;	/* cageout thread naps here */
207 static int kcage_cageout_ready;		/* nonzero when cageout thread ready */
208 kthread_id_t kcage_cageout_thread;	/* to aid debugging */
209 
210 static krwlock_t kcage_range_rwlock;	/* protects kcage_glist elements */
211 
212 /*
213  * Cage expansion happens within a range.
214  */
215 struct kcage_glist {
216 	struct kcage_glist	*next;	/* next range on the growth list */
217 	pfn_t			base;	/* first pfn of the range */
218 	pfn_t			lim;	/* one past the last pfn (base + npgs) */
219 	pfn_t			curr;	/* growth point; moved by kcage_get_pfn() */
220 	int			decr;	/* nonzero: grows down from lim to base */
221 };
222 
223 static struct kcage_glist *kcage_glist;
224 static struct kcage_glist *kcage_current_glist;
225 
226 /*
227  * The firstfree element is provided so that kmem_alloc can be avoided
228  * until that cage has somewhere to go. This is not currently a problem
229  * as early kmem_alloc's use BOP_ALLOC instead of page_create_va.
230  */
231 static vmem_t *kcage_arena;
232 static struct kcage_glist kcage_glist_firstfree;
233 static struct kcage_glist *kcage_glist_freelist = &kcage_glist_firstfree;
234 
235 /*
236  * Miscellaneous forward references
237  */
238 static struct kcage_glist *kcage_glist_alloc(void);
239 static int kcage_glist_delete(pfn_t, pfn_t, struct kcage_glist **);
240 static void kcage_cageout(void);
241 static int kcage_invalidate_page(page_t *, pgcnt_t *);
242 static int kcage_setnoreloc_pages(page_t *, se_t);
243 static int kcage_range_add_internal(pfn_t base, pgcnt_t npgs, kcage_dir_t);
244 static void kcage_init(pgcnt_t preferred_size);
245 static int kcage_range_delete_internal(pfn_t base, pgcnt_t npgs);
246 
247 /*
248  * Kernel Memory Cage counters and thresholds.
249  */
250 int kcage_on = 0;
251 pgcnt_t kcage_freemem;
252 pgcnt_t kcage_needfree;
253 pgcnt_t kcage_lotsfree;
254 pgcnt_t kcage_desfree;
255 pgcnt_t kcage_minfree;
256 pgcnt_t kcage_throttlefree;
257 pgcnt_t	kcage_reserve;
258 int kcage_maxwait = 10;	/* in seconds */
259 
260 /* when we use lp for kmem we start the cage at a higher initial value */
261 pgcnt_t kcage_kmemlp_mincage;
262 
263 #ifdef DEBUG
264 pgcnt_t	kcage_pagets;
265 #define	KCAGEPAGETS_INC()	kcage_pagets++
266 #else
267 #define	KCAGEPAGETS_INC()
268 #endif
269 
270 /* kstats to export what pages are currently caged */
271 kmutex_t kcage_kstat_lock;
272 static int kcage_kstat_update(kstat_t *ksp, int rw);
273 static int kcage_kstat_snapshot(kstat_t *ksp, void *buf, int rw);
274 
275 /*
276  * Startup and Dynamic Reconfiguration interfaces.
277  * kcage_range_add()
278  * kcage_range_del()
279  * kcage_range_delete_post_mem_del()
280  * kcage_range_init()
281  * kcage_set_thresholds()
282  */
283 
284 /*
285  * Called from page_get_contig_pages to get the approximate kcage pfn range
286  * for exclusion from search for contiguous pages. This routine is called
287  * without kcage_range lock (kcage routines can call page_get_contig_pages
288  * through page_relocate) and with the assumption, based on kcage_range_add,
289  * that kcage_current_glist always contain a valid pointer.
290  */
291 
292 int
293 kcage_current_pfn(pfn_t *pfncur)
294 {
295 	struct kcage_glist *lp = kcage_current_glist;
296 
297 	ASSERT(kcage_on);
298 
299 	ASSERT(lp != NULL);
300 
301 	*pfncur = lp->curr;
302 
303 	return (lp->decr);
304 }
305 
306 /*
307  * Called from vm_pagelist.c during coalesce to find kernel cage regions
308  * within an mnode. Looks for the lowest range between lo and hi.
309  *
310  * Kernel cage memory is defined between kcage_glist and kcage_current_glist.
311  * Non-cage memory is defined between kcage_current_glist and list end.
312  *
313  * If incage is set, returns the lowest kcage range. Otherwise returns lowest
314  * non-cage range.
315  *
316  * Returns zero on success and nlo, nhi:
317  * 	lo <= nlo < nhi <= hi
318  * Returns non-zero if no overlapping range is found.
319  */
320 int
321 kcage_next_range(int incage, pfn_t lo, pfn_t hi,
322     pfn_t *nlo, pfn_t *nhi)
323 {
324 	struct kcage_glist *lp;
325 	pfn_t tlo = hi;
326 	pfn_t thi = hi;
327 
328 	ASSERT(lo <= hi);
329 
330 	/*
331 	 * Reader lock protects the list, but kcage_get_pfn
332 	 * running concurrently may advance kcage_current_glist
333 	 * and also update kcage_current_glist->curr. Page
334 	 * coalesce can handle this race condition.
335 	 */
336 	rw_enter(&kcage_range_rwlock, RW_READER);
337 
338 	for (lp = incage ? kcage_glist : kcage_current_glist;
339 	    lp != NULL; lp = lp->next) {
340 
341 		pfn_t klo, khi;
342 
343 		/* find the range limits in this element */
344 		if ((incage && lp->decr) || (!incage && !lp->decr)) {
345 			klo = lp->curr;
346 			khi = lp->lim;
347 		} else {
348 			klo = lp->base;
349 			khi = lp->curr;
350 		}
351 
352 		/* handle overlap */
353 		if (klo < tlo && klo < khi && lo < khi && klo < hi) {
354 			tlo = MAX(lo, klo);
355 			thi = MIN(hi, khi);
356 			if (tlo == lo)
357 				break;
358 		}
359 
360 		/* check end of kcage */
361 		if (incage && lp == kcage_current_glist) {
362 			break;
363 		}
364 	}
365 
366 	rw_exit(&kcage_range_rwlock);
367 
368 	/* return non-zero if no overlapping range found */
369 	if (tlo == thi)
370 		return (1);
371 
372 	ASSERT(lo <= tlo && tlo < thi && thi <= hi);
373 
374 	/* return overlapping range */
375 	*nlo = tlo;
376 	*nhi = thi;
377 	return (0);
378 }
379 
380 void
381 kcage_range_init(struct memlist *ml, kcage_dir_t d, pgcnt_t preferred_size)
382 {
383 	int ret = 0;
384 
385 	ASSERT(kcage_arena == NULL);
386 	kcage_arena = vmem_create("kcage_arena", NULL, 0, sizeof (uint64_t),
387 	    segkmem_alloc, segkmem_free, heap_arena, 0, VM_SLEEP);
388 	ASSERT(kcage_arena != NULL);
389 
390 	if (d == KCAGE_DOWN) {
391 		while (ml->next != NULL)
392 			ml = ml->next;
393 	}
394 
395 	rw_enter(&kcage_range_rwlock, RW_WRITER);
396 
397 	while (ml != NULL) {
398 		ret = kcage_range_add_internal(btop(ml->address),
399 		    btop(ml->size), d);
400 		if (ret)
401 			panic("kcage_range_add_internal failed: "
402 			    "ml=%p, ret=0x%x\n", ml, ret);
403 
404 		ml = (d == KCAGE_DOWN ? ml->prev : ml->next);
405 	}
406 
407 	rw_exit(&kcage_range_rwlock);
408 
409 	if (ret == 0)
410 		kcage_init(preferred_size);
411 }
412 
413 /*
414  * Third arg controls direction of growth: 0: increasing pfns,
415  * 1: decreasing.
416  */
417 static int
418 kcage_range_add_internal(pfn_t base, pgcnt_t npgs, kcage_dir_t d)
419 {
420 	struct kcage_glist *new, **lpp;
421 	pfn_t lim;
422 
423 	ASSERT(rw_write_held(&kcage_range_rwlock));
424 
425 	ASSERT(npgs != 0);
426 	if (npgs == 0)
427 		return (EINVAL);
428 
429 	lim = base + npgs;
430 
431 	ASSERT(lim > base);
432 	if (lim <= base)
433 		return (EINVAL);
434 
435 	new = kcage_glist_alloc();
436 	if (new == NULL) {
437 		return (ENOMEM);
438 	}
439 
440 	new->base = base;
441 	new->lim = lim;
442 	new->decr = (d == KCAGE_DOWN);
443 	if (new->decr != 0)
444 		new->curr = new->lim;
445 	else
446 		new->curr = new->base;
447 	/*
448 	 * Any overlapping existing ranges are removed by deleting
449 	 * from the new list as we search for the tail.
450 	 */
451 	lpp = &kcage_glist;
452 	while (*lpp != NULL) {
453 		int ret;
454 		ret = kcage_glist_delete((*lpp)->base, (*lpp)->lim, &new);
455 		if (ret != 0)
456 			return (ret);
457 		lpp = &(*lpp)->next;
458 	}
459 
460 	*lpp = new;
461 
462 	if (kcage_current_glist == NULL) {
463 		kcage_current_glist = kcage_glist;
464 	}
465 
466 	return (0);
467 }
468 
469 int
470 kcage_range_add(pfn_t base, pgcnt_t npgs, kcage_dir_t d)
471 {
472 	int ret;
473 
474 	rw_enter(&kcage_range_rwlock, RW_WRITER);
475 	ret = kcage_range_add_internal(base, npgs, d);
476 	rw_exit(&kcage_range_rwlock);
477 	return (ret);
478 }
479 
480 /*
481  * Calls to add and delete must be protected by kcage_range_rwlock
482  */
483 static int
484 kcage_range_delete_internal(pfn_t base, pgcnt_t npgs)
485 {
486 	struct kcage_glist *lp;
487 	pfn_t lim;
488 
489 	ASSERT(rw_write_held(&kcage_range_rwlock));
490 
491 	ASSERT(npgs != 0);
492 	if (npgs == 0)
493 		return (EINVAL);
494 
495 	lim = base + npgs;
496 
497 	ASSERT(lim > base);
498 	if (lim <= base)
499 		return (EINVAL);
500 
501 	/*
502 	 * Check if the delete is OK first as a number of elements
503 	 * might be involved and it will be difficult to go
504 	 * back and undo (can't just add the range back in).
505 	 */
506 	for (lp = kcage_glist; lp != NULL; lp = lp->next) {
507 		/*
508 		 * If there have been no pages allocated from this
509 		 * element, we don't need to check it.
510 		 */
511 		if ((lp->decr == 0 && lp->curr == lp->base) ||
512 		    (lp->decr != 0 && lp->curr == lp->lim))
513 			continue;
514 		/*
515 		 * If the element does not overlap, its OK.
516 		 */
517 		if (base >= lp->lim || lim <= lp->base)
518 			continue;
519 		/*
520 		 * Overlapping element: Does the range to be deleted
521 		 * overlap the area already used? If so fail.
522 		 */
523 		if (lp->decr == 0 && base < lp->curr && lim >= lp->base) {
524 			return (EBUSY);
525 		}
526 		if (lp->decr != 0 && base < lp->lim && lim >= lp->curr) {
527 			return (EBUSY);
528 		}
529 	}
530 	return (kcage_glist_delete(base, lim, &kcage_glist));
531 }
532 
533 int
534 kcage_range_delete(pfn_t base, pgcnt_t npgs)
535 {
536 	int ret;
537 
538 	rw_enter(&kcage_range_rwlock, RW_WRITER);
539 	ret = kcage_range_delete_internal(base, npgs);
540 	rw_exit(&kcage_range_rwlock);
541 	return (ret);
542 }
543 
544 /*
545  * Calls to add and delete must be protected by kcage_range_rwlock.
546  * This routine gets called after successful Solaris memory
547  * delete operation from DR post memory delete routines.
548  */
549 static int
550 kcage_range_delete_post_mem_del_internal(pfn_t base, pgcnt_t npgs)
551 {
552 	pfn_t lim;
553 
554 	ASSERT(rw_write_held(&kcage_range_rwlock));
555 
556 	ASSERT(npgs != 0);
557 	if (npgs == 0)
558 		return (EINVAL);
559 
560 	lim = base + npgs;
561 
562 	ASSERT(lim > base);
563 	if (lim <= base)
564 		return (EINVAL);
565 
566 	return (kcage_glist_delete(base, lim, &kcage_glist));
567 }
568 
569 int
570 kcage_range_delete_post_mem_del(pfn_t base, pgcnt_t npgs)
571 {
572 	int ret;
573 
574 	rw_enter(&kcage_range_rwlock, RW_WRITER);
575 	ret = kcage_range_delete_post_mem_del_internal(base, npgs);
576 	rw_exit(&kcage_range_rwlock);
577 	return (ret);
578 }
579 
580 /*
581  * No locking is required here as the whole operation is covered
582  * by kcage_range_rwlock writer lock.
583  */
584 static struct kcage_glist *
585 kcage_glist_alloc(void)
586 {
587 	struct kcage_glist *new;
588 
589 	if ((new = kcage_glist_freelist) != NULL) {
590 		kcage_glist_freelist = new->next;
591 	} else {
592 		new = vmem_alloc(kcage_arena, sizeof (*new), VM_NOSLEEP);
593 	}
594 
595 	if (new != NULL)
596 		bzero(new, sizeof (*new));
597 
598 	return (new);
599 }
600 
601 static void
602 kcage_glist_free(struct kcage_glist *lp)
603 {
604 	lp->next = kcage_glist_freelist;
605 	kcage_glist_freelist = lp;
606 }
607 
608 static int
609 kcage_glist_delete(pfn_t base, pfn_t lim, struct kcage_glist **lpp)
610 {
611 	struct kcage_glist *lp, *prev = *lpp;
612 
613 	while ((lp = *lpp) != NULL) {
614 		if (lim > lp->base && base < lp->lim) {
615 			/* The delete range overlaps this element. */
616 			if (base <= lp->base && lim >= lp->lim) {
617 				/* Delete whole element. */
618 				*lpp = lp->next;
619 				if (lp == kcage_current_glist) {
620 					/* This can never happen. */
621 					ASSERT(kcage_current_glist != prev);
622 					kcage_current_glist = prev;
623 				}
624 				kcage_glist_free(lp);
625 				continue;
626 			}
627 
628 			/* Partial delete. */
629 			if (base > lp->base && lim < lp->lim) {
630 				struct kcage_glist *new;
631 
632 				/*
633 				 * Remove a section from the middle,
634 				 * need to allocate a new element.
635 				 */
636 				new = kcage_glist_alloc();
637 				if (new == NULL) {
638 					return (ENOMEM);
639 				}
640 
641 				/*
642 				 * Tranfser unused range to new.
643 				 * Edit lp in place to preserve
644 				 * kcage_current_glist.
645 				 */
646 				new->decr = lp->decr;
647 				if (new->decr != 0) {
648 					new->base = lp->base;
649 					new->lim = base;
650 					new->curr = base;
651 
652 					lp->base = lim;
653 				} else {
654 					new->base = lim;
655 					new->lim = lp->lim;
656 					new->curr = new->base;
657 
658 					lp->lim = base;
659 				}
660 
661 				/* Insert new. */
662 				new->next = lp->next;
663 				lp->next = new;
664 				lpp = &lp->next;
665 			} else {
666 				/* Delete part of current block. */
667 				if (base > lp->base) {
668 					ASSERT(lim >= lp->lim);
669 					ASSERT(base < lp->lim);
670 					if (lp->decr != 0 &&
671 					    lp->curr == lp->lim)
672 						lp->curr = base;
673 					lp->lim = base;
674 				} else {
675 					ASSERT(base <= lp->base);
676 					ASSERT(lim > lp->base);
677 					if (lp->decr == 0 &&
678 					    lp->curr == lp->base)
679 						lp->curr = lim;
680 					lp->base = lim;
681 				}
682 			}
683 		}
684 		prev = *lpp;
685 		lpp = &(*lpp)->next;
686 	}
687 
688 	return (0);
689 }
690 
691 /*
692  * If lockit is 1, kcage_get_pfn holds the
693  * reader lock for kcage_range_rwlock.
694  * Changes to lp->curr can cause race conditions, but
695  * they are handled by higher level code (see kcage_next_range.)
696  */
697 static pfn_t
698 kcage_get_pfn(int lockit)
699 {
700 	struct kcage_glist *lp;
701 	pfn_t pfn = PFN_INVALID;
702 
703 	if (lockit && !rw_tryenter(&kcage_range_rwlock, RW_READER))
704 		return (pfn);
705 
706 	lp = kcage_current_glist;
707 	while (lp != NULL) {
708 		if (lp->decr != 0) {
709 			if (lp->curr != lp->base) {
710 				pfn = --lp->curr;
711 				break;
712 			}
713 		} else {
714 			if (lp->curr != lp->lim) {
715 				pfn = lp->curr++;
716 				break;
717 			}
718 		}
719 
720 		lp = lp->next;
721 		if (lp)
722 			kcage_current_glist = lp;
723 	}
724 
725 	if (lockit)
726 		rw_exit(&kcage_range_rwlock);
727 	return (pfn);
728 }
729 
730 /*
731  * Walk the physical address space of the cage.
732  * This routine does not guarantee to return PFNs in the order
733  * in which they were allocated to the cage. Instead, it walks
734  * each range as they appear on the growth list returning the PFNs
735  * range in ascending order.
736  *
737  * To begin scanning at lower edge of cage, reset should be nonzero.
738  * To step through cage, reset should be zero.
739  *
740  * PFN_INVALID will be returned when the upper end of the cage is
741  * reached -- indicating a full scan of the cage has been completed since
742  * previous reset. PFN_INVALID will continue to be returned until
743  * kcage_walk_cage is reset.
744  *
745  * It is possible to receive a PFN_INVALID result on reset if a growth
746  * list is not installed or if none of the PFNs in the installed list have
747  * been allocated to the cage. In other words, there is no cage.
748  *
749  * Caller need not hold kcage_range_rwlock while calling this function
750  * as the front part of the list is static - pages never come out of
751  * the cage.
752  *
753  * The caller is expected to only be kcage_cageout().
754  */
755 static pfn_t
756 kcage_walk_cage(int reset)
757 {
758 	static struct kcage_glist *lp = NULL;
759 	static pfn_t pfn;
760 
761 	if (reset)
762 		lp = NULL;
763 	if (lp == NULL) {
764 		lp = kcage_glist;
765 		pfn = PFN_INVALID;
766 	}
767 again:
768 	if (pfn == PFN_INVALID) {
769 		if (lp == NULL)
770 			return (PFN_INVALID);
771 
772 		if (lp->decr != 0) {
773 			/*
774 			 * In this range the cage grows from the highest
775 			 * address towards the lowest.
776 			 * Arrange to return pfns from curr to lim-1,
777 			 * inclusive, in ascending order.
778 			 */
779 
780 			pfn = lp->curr;
781 		} else {
782 			/*
783 			 * In this range the cage grows from the lowest
784 			 * address towards the highest.
785 			 * Arrange to return pfns from base to curr,
786 			 * inclusive, in ascending order.
787 			 */
788 
789 			pfn = lp->base;
790 		}
791 	}
792 
793 	if (lp->decr != 0) {		/* decrementing pfn */
794 		if (pfn == lp->lim) {
795 			/* Don't go beyond the static part of the glist. */
796 			if (lp == kcage_current_glist)
797 				lp = NULL;
798 			else
799 				lp = lp->next;
800 			pfn = PFN_INVALID;
801 			goto again;
802 		}
803 
804 		ASSERT(pfn >= lp->curr && pfn < lp->lim);
805 	} else {			/* incrementing pfn */
806 		if (pfn == lp->curr) {
807 			/* Don't go beyond the static part of the glist. */
808 			if (lp == kcage_current_glist)
809 				lp = NULL;
810 			else
811 				lp = lp->next;
812 			pfn = PFN_INVALID;
813 			goto again;
814 		}
815 
816 		ASSERT(pfn >= lp->base && pfn < lp->curr);
817 	}
818 
819 	return (pfn++);
820 }
821 
822 /*
823  * Callback functions to recalculate cage thresholds after
824  * Kphysm memory add/delete operations.
825  */
826 /*ARGSUSED*/
827 static void
828 kcage_kphysm_postadd_cb(void *arg, pgcnt_t delta_pages)
829 {
830 	kcage_recalc_thresholds();
831 }
832 
833 /*ARGSUSED*/
834 static int
835 kcage_kphysm_predel_cb(void *arg, pgcnt_t delta_pages)
836 {
837 	/* TODO: when should cage refuse memory delete requests? */
838 	return (0);
839 }
840 
841 /*ARGSUSED*/
842 static  void
843 kcage_kphysm_postdel_cb(void *arg, pgcnt_t delta_pages, int cancelled)
844 {
845 	kcage_recalc_thresholds();
846 }
847 
/*
 * Callback vector handed to kphysm_setup_func_register() by
 * kcage_init(); it bundles the three kphysm callbacks defined above.
 */
848 static kphysm_setup_vector_t kcage_kphysm_vectors = {
849 	KPHYSM_SETUP_VECTOR_VERSION,
850 	kcage_kphysm_postadd_cb,
851 	kcage_kphysm_predel_cb,
852 	kcage_kphysm_postdel_cb
853 };
854 
855 /*
856  * This is called before a CPR suspend and after a CPR resume.  We have to
857  * turn off kcage_cageout_ready before a suspend, and turn it back on after a
858  * restart.
859  */
860 /*ARGSUSED*/
861 static boolean_t
862 kcage_cageout_cpr(void *arg, int code)
863 {
864 	if (code == CB_CODE_CPR_CHKPT) {
865 		ASSERT(kcage_cageout_ready);
866 		kcage_cageout_ready = 0;
867 		return (B_TRUE);
868 	} else if (code == CB_CODE_CPR_RESUME) {
869 		ASSERT(kcage_cageout_ready == 0);
870 		kcage_cageout_ready = 1;
871 		return (B_TRUE);
872 	}
873 	return (B_FALSE);
874 }
875 
876 /*
877  * kcage_recalc_preferred_size() increases initial cage size to improve large
878  * page availability when lp for kmem is enabled and kpr is disabled
879  */
880 static pgcnt_t
881 kcage_recalc_preferred_size(pgcnt_t preferred_size)
882 {
883 	if (SEGKMEM_USE_LARGEPAGES && segkmem_reloc == 0) {
884 		pgcnt_t lpmincage = kcage_kmemlp_mincage;
885 		if (lpmincage == 0) {
886 			lpmincage = MIN(P2ROUNDUP(((physmem * PAGESIZE) / 8),
887 			    segkmem_heaplp_quantum), 0x40000000UL) / PAGESIZE;
888 		}
889 		kcage_kmemlp_mincage = MIN(lpmincage,
890 		    (segkmem_kmemlp_max / PAGESIZE));
891 		preferred_size = MAX(kcage_kmemlp_mincage, preferred_size);
892 	}
893 	return (preferred_size);
894 }
895 
896 /*
897  * Kcage_init() builds the cage and initializes the cage thresholds.
898  * The size of the cage is determined by the argument preferred_size
899  * or the actual amount of memory, whichever is smaller.
900  */
901 static void
902 kcage_init(pgcnt_t preferred_size)
903 {
904 	pgcnt_t wanted;
905 	pfn_t pfn;
906 	page_t *pp;
907 	kstat_t *ksp;
908 
909 	extern struct vnode kvp;
910 	extern void page_list_noreloc_startup(page_t *);
911 
912 	ASSERT(!kcage_on);
913 
914 	/* increase preferred cage size for lp for kmem */
915 	preferred_size = kcage_recalc_preferred_size(preferred_size);
916 
917 	/* Debug note: initialize this now so early expansions can stat */
918 	KCAGE_STAT_INIT_SCAN_INDEX;
919 
920 	/*
921 	 * Initialize cage thresholds and install kphysm callback.
922 	 * If we can't arrange to have the thresholds track with
923 	 * available physical memory, then the cage thresholds may
924 	 * end up over time at levels that adversly effect system
925 	 * performance; so, bail out.
926 	 */
927 	kcage_recalc_thresholds();
928 	if (kphysm_setup_func_register(&kcage_kphysm_vectors, NULL)) {
929 		ASSERT(0);		/* Catch this in DEBUG kernels. */
930 		return;
931 	}
932 
933 	/*
934 	 * Limit startup cage size within the range of kcage_minfree
935 	 * and availrmem, inclusively.
936 	 */
937 	wanted = MIN(MAX(preferred_size, kcage_minfree), availrmem);
938 
939 	/*
940 	 * Construct the cage. PFNs are allocated from the glist. It
941 	 * is assumed that the list has been properly ordered for the
942 	 * platform by the platform code. Typically, this is as simple
943 	 * as calling kcage_range_init(phys_avail, decr), where decr is
944 	 * 1 if the kernel has been loaded into upper end of physical
945 	 * memory, or 0 if the kernel has been loaded at the low end.
946 	 *
947 	 * Note: it is assumed that we are in the startup flow, so there
948 	 * is no reason to grab the page lock.
949 	 */
950 	kcage_freemem = 0;
951 	pfn = PFN_INVALID;			/* prime for alignment test */
952 	while (wanted != 0) {
953 		if ((pfn = kcage_get_pfn(0)) == PFN_INVALID)
954 			break;
955 
956 		if ((pp = page_numtopp_nolock(pfn)) != NULL) {
957 			KCAGEPAGETS_INC();
958 			/*
959 			 * Set the noreloc state on the page.
960 			 * If the page is free and not already
961 			 * on the noreloc list then move it.
962 			 */
963 			if (PP_ISFREE(pp)) {
964 				if (PP_ISNORELOC(pp) == 0)
965 					page_list_noreloc_startup(pp);
966 			} else {
967 				ASSERT(pp->p_szc == 0);
968 				PP_SETNORELOC(pp);
969 			}
970 		}
971 		PLCNT_XFER_NORELOC(pp);
972 		wanted -= 1;
973 	}
974 
975 	/*
976 	 * Need to go through and find kernel allocated pages
977 	 * and capture them into the Cage.  These will primarily
978 	 * be pages gotten through boot_alloc().
979 	 */
980 	if (kvp.v_pages) {
981 
982 		pp = kvp.v_pages;
983 		do {
984 			ASSERT(!PP_ISFREE(pp));
985 			ASSERT(pp->p_szc == 0);
986 			PP_SETNORELOC(pp);
987 		} while ((pp = pp->p_vpnext) != kvp.v_pages);
988 
989 	}
990 
991 	kcage_on = 1;
992 
993 	/*
994 	 * CB_CL_CPR_POST_KERNEL is the class that executes from cpr_suspend()
995 	 * after the cageout thread is blocked, and executes from cpr_resume()
996 	 * before the cageout thread is restarted.  By executing in this class,
997 	 * we are assured that the kernel cage thread won't miss wakeup calls
998 	 * and also CPR's larger kmem_alloc requests will not fail after
999 	 * CPR shuts down the cageout kernel thread.
1000 	 */
1001 	(void) callb_add(kcage_cageout_cpr, NULL, CB_CL_CPR_POST_KERNEL,
1002 	    "cageout");
1003 
1004 	/*
1005 	 * Coalesce pages to improve large page availability. A better fix
1006 	 * would to coalesce pages as they are included in the cage
1007 	 */
1008 	if (SEGKMEM_USE_LARGEPAGES) {
1009 		extern void page_freelist_coalesce_all(int mnode);
1010 		page_freelist_coalesce_all(-1);	/* do all mnodes */
1011 	}
1012 
1013 	ksp = kstat_create("kcage", 0, "kcage_page_list", "misc",
1014 	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
1015 	if (ksp != NULL) {
1016 		ksp->ks_update = kcage_kstat_update;
1017 		ksp->ks_snapshot = kcage_kstat_snapshot;
1018 		ksp->ks_lock = &kcage_kstat_lock; /* XXX - not really needed */
1019 		kstat_install(ksp);
1020 	}
1021 }
1022 
1023 static int
1024 kcage_kstat_update(kstat_t *ksp, int rw)
1025 {
1026 	struct kcage_glist *lp;
1027 	uint_t count;
1028 
1029 	if (rw == KSTAT_WRITE)
1030 		return (EACCES);
1031 
1032 	count = 0;
1033 	rw_enter(&kcage_range_rwlock, RW_WRITER);
1034 	for (lp = kcage_glist; lp != NULL; lp = lp->next) {
1035 		if (lp->decr) {
1036 			if (lp->curr != lp->lim) {
1037 				count++;
1038 			}
1039 		} else {
1040 			if (lp->curr != lp->base) {
1041 				count++;
1042 			}
1043 		}
1044 	}
1045 	rw_exit(&kcage_range_rwlock);
1046 
1047 	ksp->ks_ndata = count;
1048 	ksp->ks_data_size = count * 2 * sizeof (uint64_t);
1049 
1050 	return (0);
1051 }
1052 
1053 static int
1054 kcage_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
1055 {
1056 	struct kcage_glist *lp;
1057 	struct memunit {
1058 		uint64_t address;
1059 		uint64_t size;
1060 	} *kspmem;
1061 
1062 	if (rw == KSTAT_WRITE)
1063 		return (EACCES);
1064 
1065 	ksp->ks_snaptime = gethrtime();
1066 
1067 	kspmem = (struct memunit *)buf;
1068 	rw_enter(&kcage_range_rwlock, RW_WRITER);
1069 	for (lp = kcage_glist; lp != NULL; lp = lp->next, kspmem++) {
1070 		if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
1071 			break;
1072 
1073 		if (lp->decr) {
1074 			if (lp->curr != lp->lim) {
1075 				kspmem->address = ptob(lp->curr);
1076 				kspmem->size = ptob(lp->lim - lp->curr);
1077 			}
1078 		} else {
1079 			if (lp->curr != lp->base) {
1080 				kspmem->address = ptob(lp->base);
1081 				kspmem->size = ptob(lp->curr - lp->base);
1082 			}
1083 		}
1084 	}
1085 	rw_exit(&kcage_range_rwlock);
1086 
1087 	return (0);
1088 }
1089 
1090 void
1091 kcage_recalc_thresholds()
1092 {
1093 	static int first = 1;
1094 	static pgcnt_t init_lotsfree;
1095 	static pgcnt_t init_desfree;
1096 	static pgcnt_t init_minfree;
1097 	static pgcnt_t init_throttlefree;
1098 	static pgcnt_t init_reserve;
1099 
1100 	/* TODO: any reason to take more care than this with live editing? */
1101 	mutex_enter(&kcage_cageout_mutex);
1102 	mutex_enter(&freemem_lock);
1103 
1104 	if (first) {
1105 		first = 0;
1106 		init_lotsfree = kcage_lotsfree;
1107 		init_desfree = kcage_desfree;
1108 		init_minfree = kcage_minfree;
1109 		init_throttlefree = kcage_throttlefree;
1110 		init_reserve = kcage_reserve;
1111 	} else {
1112 		kcage_lotsfree = init_lotsfree;
1113 		kcage_desfree = init_desfree;
1114 		kcage_minfree = init_minfree;
1115 		kcage_throttlefree = init_throttlefree;
1116 		kcage_reserve = init_reserve;
1117 	}
1118 
1119 	if (kcage_lotsfree == 0)
1120 		kcage_lotsfree = MAX(32, total_pages / 256);
1121 
1122 	if (kcage_minfree == 0)
1123 		kcage_minfree = MAX(32, kcage_lotsfree / 2);
1124 
1125 	if (kcage_desfree == 0)
1126 		kcage_desfree = MAX(32, kcage_minfree);
1127 
1128 	if (kcage_throttlefree == 0)
1129 		kcage_throttlefree = MAX(32, kcage_minfree / 2);
1130 
1131 	if (kcage_reserve == 0)
1132 		kcage_reserve = MIN(32, kcage_throttlefree / 2);
1133 
1134 	mutex_exit(&freemem_lock);
1135 	mutex_exit(&kcage_cageout_mutex);
1136 
1137 	if (kcage_cageout_ready) {
1138 		if (kcage_freemem < kcage_desfree)
1139 			kcage_cageout_wakeup();
1140 
1141 		if (kcage_needfree) {
1142 			mutex_enter(&kcage_throttle_mutex);
1143 			cv_broadcast(&kcage_throttle_cv);
1144 			mutex_exit(&kcage_throttle_mutex);
1145 		}
1146 	}
1147 }
1148 
1149 /*
1150  * Pageout interface:
1151  * kcage_cageout_init()
1152  */
1153 void
1154 kcage_cageout_init()
1155 {
1156 	if (kcage_on) {
1157 
1158 		(void) thread_create(NULL, 0, kcage_cageout,
1159 		    NULL, 0, proc_pageout, TS_RUN, maxclsyspri - 1);
1160 	}
1161 }
1162 
1163 
1164 /*
1165  * VM Interfaces:
1166  * kcage_create_throttle()
1167  * kcage_freemem_add()
1168  * kcage_freemem_sub()
1169  */
1170 
1171 /*
1172  * Wakeup cageout thread and throttle waiting for the number of pages
1173  * requested to become available.  For non-critical requests, a
1174  * timeout is added, since freemem accounting is separate from cage
1175  * freemem accounting: it's possible for us to get stuck and not make
1176  * forward progress even though there was sufficient freemem before
1177  * arriving here.
1178  */
1179 int
1180 kcage_create_throttle(pgcnt_t npages, int flags)
1181 {
1182 	int niter = 0;
1183 	pgcnt_t lastfree;
1184 	int enough = kcage_freemem > kcage_throttlefree + npages;
1185 
1186 	KCAGE_STAT_INCR(kct_calls);		/* unprotected incr. */
1187 
1188 	kcage_cageout_wakeup();			/* just to be sure */
1189 	KCAGE_STAT_INCR(kct_cagewake);		/* unprotected incr. */
1190 
1191 	/*
1192 	 * Obviously, we can't throttle the cageout thread since
1193 	 * we depend on it.  We also can't throttle the panic thread.
1194 	 */
1195 	if (curthread == kcage_cageout_thread || panicstr) {
1196 		KCAGE_STAT_INCR(kct_cageout);	/* unprotected incr. */
1197 		return (KCT_CRIT);
1198 	}
1199 
1200 	/*
1201 	 * Don't throttle threads which are critical for proper
1202 	 * vm management if we're above kcage_throttlefree or
1203 	 * if freemem is very low.
1204 	 */
1205 	if (NOMEMWAIT()) {
1206 		if (enough) {
1207 			KCAGE_STAT_INCR(kct_exempt);	/* unprotected incr. */
1208 			return (KCT_CRIT);
1209 		} else if (freemem < minfree) {
1210 			KCAGE_STAT_INCR(kct_critical);  /* unprotected incr. */
1211 			return (KCT_CRIT);
1212 		}
1213 	}
1214 
1215 	/*
1216 	 * Don't throttle real-time threads if kcage_freemem > kcage_reserve.
1217 	 */
1218 	if (DISP_PRIO(curthread) > maxclsyspri &&
1219 	    kcage_freemem > kcage_reserve) {
1220 		KCAGE_STAT_INCR(kct_exempt);	/* unprotected incr. */
1221 		return (KCT_CRIT);
1222 	}
1223 
1224 	/*
1225 	 * Cause all other threads (which are assumed to not be
1226 	 * critical to cageout) to wait here until their request
1227 	 * can be satisfied. Be a little paranoid and wake the
1228 	 * kernel cage on each loop through this logic.
1229 	 */
1230 	while (kcage_freemem < kcage_throttlefree + npages) {
1231 		ASSERT(kcage_on);
1232 
1233 		lastfree = kcage_freemem;
1234 
1235 		if (kcage_cageout_ready) {
1236 			mutex_enter(&kcage_throttle_mutex);
1237 
1238 			kcage_needfree += npages;
1239 			KCAGE_STAT_INCR(kct_wait);
1240 
1241 			kcage_cageout_wakeup();
1242 			KCAGE_STAT_INCR(kct_cagewake);
1243 
1244 			cv_wait(&kcage_throttle_cv, &kcage_throttle_mutex);
1245 
1246 			kcage_needfree -= npages;
1247 
1248 			mutex_exit(&kcage_throttle_mutex);
1249 		} else {
1250 			/*
1251 			 * NOTE: atomics are used just in case we enter
1252 			 * mp operation before the cageout thread is ready.
1253 			 */
1254 			atomic_add_long(&kcage_needfree, npages);
1255 
1256 			kcage_cageout_wakeup();
1257 			KCAGE_STAT_INCR(kct_cagewake);	/* unprotected incr. */
1258 
1259 			atomic_add_long(&kcage_needfree, -npages);
1260 		}
1261 
1262 		if ((flags & PG_WAIT) == 0) {
1263 			if (kcage_freemem > lastfree) {
1264 				KCAGE_STAT_INCR(kct_progress);
1265 				niter = 0;
1266 			} else {
1267 				KCAGE_STAT_INCR(kct_noprogress);
1268 				if (++niter >= kcage_maxwait) {
1269 					KCAGE_STAT_INCR(kct_timeout);
1270 					return (KCT_FAILURE);
1271 				}
1272 			}
1273 		}
1274 
1275 		if (NOMEMWAIT() && freemem < minfree) {
1276 			return (KCT_CRIT);
1277 		}
1278 
1279 	}
1280 	return (KCT_NONCRIT);
1281 }
1282 
1283 void
1284 kcage_freemem_add(pgcnt_t npages)
1285 {
1286 	extern void wakeup_pcgs(void);
1287 
1288 	atomic_add_long(&kcage_freemem, npages);
1289 
1290 	wakeup_pcgs();  /* wakeup threads in pcgs() */
1291 
1292 	if (kcage_needfree != 0 &&
1293 	    kcage_freemem >= (kcage_throttlefree + kcage_needfree)) {
1294 
1295 		mutex_enter(&kcage_throttle_mutex);
1296 		cv_broadcast(&kcage_throttle_cv);
1297 		KCAGE_STAT_INCR(kfa_trottlewake);
1298 		mutex_exit(&kcage_throttle_mutex);
1299 	}
1300 }
1301 
1302 void
1303 kcage_freemem_sub(pgcnt_t npages)
1304 {
1305 	atomic_add_long(&kcage_freemem, -npages);
1306 
1307 	if (kcage_freemem < kcage_desfree) {
1308 		kcage_cageout_wakeup();
1309 		KCAGE_STAT_INCR(kfs_cagewake); /* unprotected incr. */
1310 	}
1311 }
1312 
1313 /*
1314  * return 0 on failure and 1 on success.
1315  */
1316 static int
1317 kcage_setnoreloc_pages(page_t *rootpp, se_t se)
1318 {
1319 	pgcnt_t npgs, i;
1320 	page_t *pp;
1321 	pfn_t rootpfn = page_pptonum(rootpp);
1322 	uint_t szc;
1323 
1324 	ASSERT(!PP_ISFREE(rootpp));
1325 	ASSERT(PAGE_LOCKED_SE(rootpp, se));
1326 	if (!group_page_trylock(rootpp, se)) {
1327 		return (0);
1328 	}
1329 	szc = rootpp->p_szc;
1330 	if (szc == 0) {
1331 		/*
1332 		 * The szc of a locked page can only change for pages that are
1333 		 * non-swapfs (i.e. anonymous memory) file system pages.
1334 		 */
1335 		ASSERT(rootpp->p_vnode != NULL &&
1336 		    !PP_ISKAS(rootpp) &&
1337 		    !IS_SWAPFSVP(rootpp->p_vnode));
1338 		PP_SETNORELOC(rootpp);
1339 		return (1);
1340 	}
1341 	npgs = page_get_pagecnt(szc);
1342 	ASSERT(IS_P2ALIGNED(rootpfn, npgs));
1343 	pp = rootpp;
1344 	for (i = 0; i < npgs; i++, pp++) {
1345 		ASSERT(PAGE_LOCKED_SE(pp, se));
1346 		ASSERT(!PP_ISFREE(pp));
1347 		ASSERT(pp->p_szc == szc);
1348 		PP_SETNORELOC(pp);
1349 	}
1350 	group_page_unlock(rootpp);
1351 	return (1);
1352 }
1353 
1354 /*
1355  * Attempt to convert page to a caged page (set the P_NORELOC flag).
1356  * If successful and pages is free, move page to the tail of whichever
1357  * list it is on.
1358  * Returns:
1359  *   EBUSY  page already locked, assimilated but not free.
1360  *   ENOMEM page assimilated, but memory too low to relocate. Page not free.
1361  *   EAGAIN page not assimilated. Page not free.
1362  *   ERANGE page assimilated. Page not root.
1363  *   0      page assimilated. Page free.
1364  *   *nfreedp number of pages freed.
1365  * NOTE: With error codes ENOMEM, EBUSY, and 0 (zero), there is no way
1366  * to distinguish between a page that was already a NORELOC page from
1367  * those newly converted to NORELOC pages by this invocation of
1368  * kcage_assimilate_page.
1369  */
1370 static int
1371 kcage_assimilate_page(page_t *pp, pgcnt_t *nfreedp)
1372 {
1373 	if (page_trylock(pp, SE_EXCL)) {
1374 		if (PP_ISNORELOC(pp)) {
1375 check_free_and_return:
1376 			if (PP_ISFREE(pp)) {
1377 				page_unlock(pp);
1378 				*nfreedp = 0;
1379 				return (0);
1380 			} else {
1381 				page_unlock(pp);
1382 				return (EBUSY);
1383 			}
1384 			/*NOTREACHED*/
1385 		}
1386 	} else {
1387 		if (page_trylock(pp, SE_SHARED)) {
1388 			if (PP_ISNORELOC(pp))
1389 				goto check_free_and_return;
1390 		} else
1391 			return (EAGAIN);
1392 
1393 		if (!PP_ISFREE(pp)) {
1394 			page_unlock(pp);
1395 			return (EAGAIN);
1396 		}
1397 
1398 		/*
1399 		 * Need to upgrade the lock on it and set the NORELOC
1400 		 * bit. If it is free then remove it from the free
1401 		 * list so that the platform free list code can keep
1402 		 * NORELOC pages where they should be.
1403 		 */
1404 		/*
1405 		 * Before doing anything, get the exclusive lock.
1406 		 * This may fail (eg ISM pages are left shared locked).
1407 		 * If the page is free this will leave a hole in the
1408 		 * cage. There is no solution yet to this.
1409 		 */
1410 		if (!page_tryupgrade(pp)) {
1411 			page_unlock(pp);
1412 			return (EAGAIN);
1413 		}
1414 	}
1415 
1416 	ASSERT(PAGE_EXCL(pp));
1417 
1418 	if (PP_ISFREE(pp)) {
1419 		int which = PP_ISAGED(pp) ? PG_FREE_LIST : PG_CACHE_LIST;
1420 
1421 		page_list_sub(pp, which);
1422 		ASSERT(pp->p_szc == 0);
1423 		PP_SETNORELOC(pp);
1424 		PLCNT_XFER_NORELOC(pp);
1425 		page_list_add(pp, which | PG_LIST_TAIL);
1426 
1427 		page_unlock(pp);
1428 		*nfreedp = 1;
1429 		return (0);
1430 	} else {
1431 		if (pp->p_szc != 0) {
1432 			if (!kcage_setnoreloc_pages(pp, SE_EXCL)) {
1433 				page_unlock(pp);
1434 				return (EAGAIN);
1435 			}
1436 			ASSERT(PP_ISNORELOC(pp));
1437 		} else {
1438 			PP_SETNORELOC(pp);
1439 		}
1440 		PLCNT_XFER_NORELOC(pp);
1441 		return (kcage_invalidate_page(pp, nfreedp));
1442 	}
1443 	/*NOTREACHED*/
1444 }
1445 
1446 static int
1447 kcage_expand()
1448 {
1449 	int did_something = 0;
1450 
1451 	spgcnt_t wanted;
1452 	pfn_t pfn;
1453 	page_t *pp;
1454 	/* TODO: we don't really need n any more? */
1455 	pgcnt_t n;
1456 	pgcnt_t nf, nfreed;
1457 
1458 	/*
1459 	 * Expand the cage if available cage memory is really low. Calculate
1460 	 * the amount required to return kcage_freemem to the level of
1461 	 * kcage_lotsfree, or to satisfy throttled requests, whichever is
1462 	 * more.  It is rare for their sum to create an artificial threshold
1463 	 * above kcage_lotsfree, but it is possible.
1464 	 *
1465 	 * Exit early if expansion amount is equal to or less than zero.
1466 	 * (<0 is possible if kcage_freemem rises suddenly.)
1467 	 *
1468 	 * Exit early when the global page pool (apparently) does not
1469 	 * have enough free pages to page_relocate() even a single page.
1470 	 */
1471 	wanted = MAX(kcage_lotsfree, kcage_throttlefree + kcage_needfree)
1472 	    - kcage_freemem;
1473 	if (wanted <= 0)
1474 		return (0);
1475 	else if (freemem < pageout_reserve + 1) {
1476 		KCAGE_STAT_INCR(ke_lowfreemem);
1477 		return (0);
1478 	}
1479 
1480 	KCAGE_STAT_INCR(ke_calls);
1481 	KCAGE_STAT_SET_SCAN(ke_wanted, (uint_t)wanted);
1482 
1483 	/*
1484 	 * Assimilate more pages from the global page pool into the cage.
1485 	 */
1486 	n = 0;				/* number of pages PP_SETNORELOC'd */
1487 	nf = 0;				/* number of those actually free */
1488 	while (kcage_on && nf < wanted) {
1489 		pfn = kcage_get_pfn(1);
1490 		if (pfn == PFN_INVALID) {	/* eek! no where to grow */
1491 			KCAGE_STAT_INCR(ke_nopfn);
1492 			goto terminate;
1493 		}
1494 
1495 		KCAGE_STAT_INCR_SCAN(ke_examined);
1496 
1497 		if ((pp = page_numtopp_nolock(pfn)) == NULL) {
1498 			KCAGE_STAT_INCR(ke_nopaget);
1499 			continue;
1500 		}
1501 		KCAGEPAGETS_INC();
1502 		/*
1503 		 * Sanity check. Skip this pfn if it is
1504 		 * being deleted.
1505 		 */
1506 		if (pfn_is_being_deleted(pfn)) {
1507 			KCAGE_STAT_INCR(ke_deleting);
1508 			continue;
1509 		}
1510 
1511 		if (PP_ISNORELOC(pp)) {
1512 			KCAGE_STAT_INCR(ke_isnoreloc);
1513 			continue;
1514 		}
1515 
1516 		switch (kcage_assimilate_page(pp, &nfreed)) {
1517 			case 0:		/* assimilated, page is free */
1518 				KCAGE_STAT_NINCR_SCAN(ke_gotonefree, nfreed);
1519 				did_something = 1;
1520 				nf += nfreed;
1521 				n++;
1522 				break;
1523 
1524 			case EBUSY:	/* assimilated, page not free */
1525 			case ERANGE:	/* assimilated, page not root */
1526 				KCAGE_STAT_INCR_SCAN(ke_gotone);
1527 				did_something = 1;
1528 				n++;
1529 				break;
1530 
1531 			case ENOMEM:	/* assimilated, but no mem */
1532 				KCAGE_STAT_INCR(ke_terminate);
1533 				did_something = 1;
1534 				n++;
1535 				goto terminate;
1536 
1537 			case EAGAIN:	/* can't assimilate */
1538 				KCAGE_STAT_INCR_SCAN(ke_lefthole);
1539 				break;
1540 
1541 			default:	/* catch this with debug kernels */
1542 				ASSERT(0);
1543 				break;
1544 		}
1545 	}
1546 
1547 	/*
1548 	 * Realign cage edge with the nearest physical address
1549 	 * boundry for big pages. This is done to give us a
1550 	 * better chance of actually getting usable big pages
1551 	 * in the cage.
1552 	 */
1553 
1554 terminate:
1555 
1556 	return (did_something);
1557 }
1558 
1559 /*
1560  * Relocate page opp (Original Page Pointer) from cage pool to page rpp
1561  * (Replacement Page Pointer) in the global pool. Page opp will be freed
1562  * if relocation is successful, otherwise it is only unlocked.
1563  * On entry, page opp must be exclusively locked and not free.
1564  * *nfreedp: number of pages freed.
1565  */
1566 static int
1567 kcage_relocate_page(page_t *pp, pgcnt_t *nfreedp)
1568 {
1569 	page_t *opp = pp;
1570 	page_t *rpp = NULL;
1571 	spgcnt_t npgs;
1572 	int result;
1573 
1574 	ASSERT(!PP_ISFREE(opp));
1575 	ASSERT(PAGE_EXCL(opp));
1576 
1577 	result = page_relocate(&opp, &rpp, 1, 1, &npgs, NULL);
1578 	*nfreedp = npgs;
1579 	if (result == 0) {
1580 		while (npgs-- > 0) {
1581 			page_t *tpp;
1582 
1583 			ASSERT(rpp != NULL);
1584 			tpp = rpp;
1585 			page_sub(&rpp, tpp);
1586 			page_unlock(tpp);
1587 		}
1588 
1589 		ASSERT(rpp == NULL);
1590 
1591 		return (0);		/* success */
1592 	}
1593 
1594 	page_unlock(opp);
1595 	return (result);
1596 }
1597 
1598 /*
1599  * Based on page_invalidate_pages()
1600  *
1601  * Kcage_invalidate_page() uses page_relocate() twice. Both instances
1602  * of use must be updated to match the new page_relocate() when it
1603  * becomes available.
1604  *
1605  * Return result of kcage_relocate_page or zero if page was directly freed.
1606  * *nfreedp: number of pages freed.
1607  */
1608 static int
1609 kcage_invalidate_page(page_t *pp, pgcnt_t *nfreedp)
1610 {
1611 	int result;
1612 
1613 #if defined(__sparc)
1614 	extern struct vnode prom_ppages;
1615 	ASSERT(pp->p_vnode != &prom_ppages);
1616 #endif /* __sparc */
1617 
1618 	ASSERT(!PP_ISFREE(pp));
1619 	ASSERT(PAGE_EXCL(pp));
1620 
1621 	/*
1622 	 * Is this page involved in some I/O? shared?
1623 	 * The page_struct_lock need not be acquired to
1624 	 * examine these fields since the page has an
1625 	 * "exclusive" lock.
1626 	 */
1627 	if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1628 		result = kcage_relocate_page(pp, nfreedp);
1629 #ifdef KCAGE_STATS
1630 		if (result == 0)
1631 			KCAGE_STAT_INCR_SCAN(kip_reloclocked);
1632 		else if (result == ENOMEM)
1633 			KCAGE_STAT_INCR_SCAN(kip_nomem);
1634 #endif
1635 		return (result);
1636 	}
1637 
1638 	ASSERT(pp->p_vnode->v_type != VCHR);
1639 
1640 	/*
1641 	 * Unload the mappings and check if mod bit is set.
1642 	 */
1643 	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1644 
1645 	if (hat_ismod(pp)) {
1646 		result = kcage_relocate_page(pp, nfreedp);
1647 #ifdef KCAGE_STATS
1648 		if (result == 0)
1649 			KCAGE_STAT_INCR_SCAN(kip_relocmod);
1650 		else if (result == ENOMEM)
1651 			KCAGE_STAT_INCR_SCAN(kip_nomem);
1652 #endif
1653 		return (result);
1654 	}
1655 
1656 	if (!page_try_demote_pages(pp)) {
1657 		KCAGE_STAT_INCR_SCAN(kip_demotefailed);
1658 		page_unlock(pp);
1659 		return (EAGAIN);
1660 	}
1661 
1662 	page_destroy(pp, 0);
1663 	KCAGE_STAT_INCR_SCAN(kip_destroy);
1664 	*nfreedp = 1;
1665 	return (0);
1666 }
1667 
/*
 * Body of the kernel cage scanner ("cageout") thread; never returns.
 *
 * After registering a CPR callback and publishing itself through
 * kcage_cageout_thread / kcage_cageout_ready, the thread repeats:
 *
 *   loop:   sleep on kcage_cageout_cv until woken.
 *   again:  walk the cage with kcage_walk_cage() for as long as
 *           kcage_freemem is below kcage_lotsfree or kcage_needfree is
 *           set.  Pages without P_NORELOC are handed to
 *           kcage_assimilate_page(); caged pages that are in use and can
 *           be locked are either destroyed directly (through pass 2) or
 *           given to kcage_invalidate_page() (later passes).
 *           "pass" is bumped every time the walk returns to start_pfn.
 *   then:   decide whether to rescan (goto again), to grow the cage with
 *           kcage_expand(), or simply to go back to sleep (goto loop).
 *
 * kcage_cageout_mutex is held throughout, except while sleeping in
 * cv_wait().
 */
1668 static void
1669 kcage_cageout()
1670 {
1671 	pfn_t pfn;
1672 	page_t *pp;
1673 	callb_cpr_t cprinfo;
1674 	int did_something;
1675 	int scan_again;
1676 	pfn_t start_pfn;
1677 	int pass;
1678 	int last_pass;
1679 	int pages_skipped;
1680 	int shared_skipped;
1681 	ulong_t shared_level = 8;
1682 	pgcnt_t nfreed;
1683 #ifdef KCAGE_STATS
1684 	clock_t scan_start;
1685 #endif
1686 
1687 	CALLB_CPR_INIT(&cprinfo, &kcage_cageout_mutex,
1688 	    callb_generic_cpr, "cageout");
1689 
1690 	mutex_enter(&kcage_cageout_mutex);
1691 	kcage_cageout_thread = curthread;
1692 
1693 	pfn = PFN_INVALID;		/* force scan reset */
1694 	start_pfn = PFN_INVALID;	/* force init with 1st cage pfn */
1695 	kcage_cageout_ready = 1;	/* switch kcage_cageout_wakeup mode */
1696 
1697 loop:
1698 	/*
1699 	 * Wait here. Sooner or later, kcage_freemem_sub() will notice
1700 	 * that kcage_freemem is less than kcage_desfree. When it does
1701 	 * notice, kcage_freemem_sub() will wake us up via call to
1702 	 * kcage_cageout_wakeup().
1703 	 */
1704 	CALLB_CPR_SAFE_BEGIN(&cprinfo);
1705 	cv_wait(&kcage_cageout_cv, &kcage_cageout_mutex);
1706 	CALLB_CPR_SAFE_END(&cprinfo, &kcage_cageout_mutex);
1707 
1708 	KCAGE_STAT_INCR(kt_wakeups);
1709 	KCAGE_STAT_SET_SCAN(kt_freemem_start, freemem);
1710 	KCAGE_STAT_SET_SCAN(kt_kcage_freemem_start, kcage_freemem);
1711 	pass = 0;
1712 	last_pass = 0;
1713 
1714 #ifdef KCAGE_STATS
1715 	scan_start = lbolt;
1716 #endif
1717 
1718 again:
1719 	if (!kcage_on)
1720 		goto loop;
1721 
1722 	KCAGE_STAT_INCR(kt_scans);
1723 	KCAGE_STAT_INCR_SCAN(kt_passes);
1724 
1725 	did_something = 0;
1726 	pages_skipped = 0;
1727 	shared_skipped = 0;
1728 	while ((kcage_freemem < kcage_lotsfree || kcage_needfree) &&
1729 	    (pfn = kcage_walk_cage(pfn == PFN_INVALID)) != PFN_INVALID) {
1730 
1731 		if (start_pfn == PFN_INVALID)
1732 			start_pfn = pfn;
1733 		else if (start_pfn == pfn) {
1734 			last_pass = pass;
1735 			pass += 1;
1736 			/*
1737 			 * Did a complete walk of kernel cage, but didn't free
1738 			 * any pages.  If only one cpu is active then
1739 			 * stop kernel cage walk and try expanding.
1740 			 */
1741 			if (cp_default.cp_ncpus == 1 && did_something == 0) {
1742 				KCAGE_STAT_INCR(kt_cageout_break);
1743 				break;
1744 			}
1745 		}
1746 
1747 		pp = page_numtopp_nolock(pfn);
1748 		if (pp == NULL) {
1749 			continue;
1750 		}
1751 
1752 		KCAGE_STAT_INCR_SCAN(kt_examined);
1753 
1754 		/*
1755 		 * Do a quick PP_ISNORELOC() and PP_ISFREE test outside
1756 		 * of the lock. If one is missed it will be seen next
1757 		 * time through.
1758 		 *
1759 		 * Skip non-caged-pages. These pages can exist in the cage
1760 		 * because, if during cage expansion, a page is
1761 		 * encountered that is long-term locked the lock prevents the
1762 		 * expansion logic from setting the P_NORELOC flag. Hence,
1763 		 * non-caged-pages surrounded by caged-pages.
1764 		 */
1765 		if (!PP_ISNORELOC(pp)) {
1766 			switch (kcage_assimilate_page(pp, &nfreed)) {
1767 				case 0:
1768 					did_something = 1;
1769 					KCAGE_STAT_NINCR_SCAN(kt_gotonefree,
1770 					    nfreed);
1771 					break;
1772 
1773 				case EBUSY:
1774 				case ERANGE:
1775 					did_something = 1;
1776 					KCAGE_STAT_INCR_SCAN(kt_gotone);
1777 					break;
1778 
1779 				case EAGAIN:
1780 				case ENOMEM:
1781 					break;
1782 
1783 				default:
1784 					/* catch this with debug kernels */
1785 					ASSERT(0);
1786 					break;
1787 			}
1788 
1789 			continue;
1790 		} else {
1791 			int prm;
1792 
1793 			if (PP_ISFREE(pp)) {
1794 				continue;
1795 			}
1796 
1797 			if ((PP_ISKAS(pp) && pp->p_lckcnt > 0) ||
1798 			    !page_trylock(pp, SE_EXCL)) {
1799 				KCAGE_STAT_INCR_SCAN(kt_cantlock);
1800 				continue;
1801 			}
1802 
1803 			/* P_NORELOC bit should not have gone away. */
1804 			ASSERT(PP_ISNORELOC(pp));
1805 			if (PP_ISFREE(pp) || (PP_ISKAS(pp) &&
1806 			    pp->p_lckcnt > 0)) {
1807 				page_unlock(pp);
1808 				continue;
1809 			}
1810 
1811 			KCAGE_STAT_SET_SCAN(kt_skiplevel, shared_level);
1812 			if (hat_page_checkshare(pp, shared_level)) {
1813 				page_unlock(pp);
1814 				pages_skipped = 1;
1815 				shared_skipped = 1;
1816 				KCAGE_STAT_INCR_SCAN(kt_skipshared);
1817 				continue;
1818 			}
1819 
1820 			/*
1821 			 * In pass {0, 1}, skip page if ref bit is set.
1822 			 * In pass {0, 1, 2}, skip page if mod bit is set.
1823 			 */
1824 			prm = hat_pagesync(pp,
1825 			    HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD);
1826 
1827 			/* In passes 0 and 1, skip referenced pages */
1828 			if (pass <= 1 && (prm & P_REF)) {
1829 				KCAGE_STAT_INCR_SCAN(kt_skiprefd);
1830 				pages_skipped = 1;
1831 				page_unlock(pp);
1832 				continue;
1833 			}
1834 
1835 			/*
			 * Through pass 2, page_destroy only pages that are
			 * small, unmodified, unlocked and without COW
			 * references; every other page is skipped.
			 */
1836 			if (pass <= 2) {
1837 				if (pp->p_szc != 0 || (prm & P_MOD) ||
1838 				    pp->p_lckcnt || pp->p_cowcnt) {
1839 					pages_skipped = 1;
1840 					page_unlock(pp);
1841 				} else {
1842 
1843 					/*
1844 					 * unload the mappings before
1845 					 * checking if mod bit is set
1846 					 */
1847 					(void) hat_pageunload(pp,
1848 					    HAT_FORCE_PGUNLOAD);
1849 
1850 					/*
1851 					 * skip this page if modified
1852 					 */
1853 					if (hat_ismod(pp)) {
1854 						pages_skipped = 1;
1855 						page_unlock(pp);
1856 						continue;
1857 					}
1858 
1859 					KCAGE_STAT_INCR_SCAN(kt_destroy);
1860 					page_destroy(pp, 0);
1861 					did_something = 1;
1862 				}
1863 				continue;
1864 			}
1865 
			/* Pass 3 and later: relocate or destroy the page. */
1866 			if (kcage_invalidate_page(pp, &nfreed) == 0) {
1867 				did_something = 1;
1868 				KCAGE_STAT_NINCR_SCAN(kt_gotonefree, nfreed);
1869 			}
1870 
1871 			/*
1872 			 * No need to drop the page lock here.
1873 			 * Kcage_invalidate_page has done that for us
1874 			 * either explicitly or through a page_free.
1875 			 */
1876 		}
1877 	}
1878 
1879 	/*
1880 	 * Expand the cage only if available cage memory is really low.
1881 	 * This test is done only after a complete scan of the cage.
1882 	 * The reason for not checking and expanding more often is to
1883 	 * avoid rapid expansion of the cage. Naturally, scanning the
1884 	 * cage takes time. So by scanning first, we use that work as a
1885 	 * delay loop in between expand decisions.
1886 	 */
1887 
1888 	scan_again = 0;
1889 	if (kcage_freemem < kcage_minfree || kcage_needfree) {
1890 		/*
1891 		 * Kcage_expand() will return a non-zero value if it was
1892 		 * able to expand the cage -- whether or not the new
1893 		 * pages are free and immediately usable. If non-zero,
1894 		 * we do another scan of the cage. The pages might be
1895 		 * freed during that scan or by time we get back here.
1896 		 * If not, we will attempt another expansion.
1897 		 * However, if kcage_expand() returns zero, then it was
1898 		 * unable to expand the cage. This is the case when
1899 		 * the growth list is exhausted, therefore no work was done
1900 		 * and there is no reason to scan the cage again.
1901 		 * Note: Kernel cage scan is not repeated when only one
1902 		 * cpu is active to avoid kernel cage thread hogging cpu.
1903 		 */
1904 		if (pass <= 3 && pages_skipped && cp_default.cp_ncpus > 1)
1905 			scan_again = 1;
1906 		else
1907 			(void) kcage_expand(); /* don't scan again */
1908 	} else if (kcage_freemem < kcage_lotsfree) {
1909 		/*
1910 		 * If available cage memory is less than abundant
1911 		 * and a full scan of the cage has not yet been completed,
1912 		 * or a scan has completed and some work was performed,
1913 		 * or pages were skipped because of sharing,
1914 		 * or we simply have not yet completed two passes,
1915 		 * then do another scan.
1916 		 */
1917 		if (pass <= 2 && pages_skipped)
1918 			scan_again = 1;
1919 		if (pass == last_pass || did_something)
1920 			scan_again = 1;
1921 		else if (shared_skipped && shared_level < (8<<24)) {
			/* Tolerate twice as much sharing on the next scan. */
1922 			shared_level <<= 1;
1923 			scan_again = 1;
1924 		}
1925 	}
1926 
1927 	if (scan_again && cp_default.cp_ncpus > 1)
1928 		goto again;
1929 	else {
		/* Done for now: relax the sharing threshold and sleep. */
1930 		if (shared_level > 8)
1931 			shared_level >>= 1;
1932 
1933 		KCAGE_STAT_SET_SCAN(kt_freemem_end, freemem);
1934 		KCAGE_STAT_SET_SCAN(kt_kcage_freemem_end, kcage_freemem);
1935 		KCAGE_STAT_SET_SCAN(kt_ticks, lbolt - scan_start);
1936 		KCAGE_STAT_INC_SCAN_INDEX;
1937 		goto loop;
1938 	}
1939 
1940 	/*NOTREACHED*/
1941 }
1942 
1943 void
1944 kcage_cageout_wakeup()
1945 {
1946 	if (mutex_tryenter(&kcage_cageout_mutex)) {
1947 		if (kcage_cageout_ready) {
1948 			cv_signal(&kcage_cageout_cv);
1949 		} else if (kcage_freemem < kcage_minfree || kcage_needfree) {
1950 			/*
1951 			 * Available cage memory is really low. Time to
1952 			 * start expanding the cage. However, the
1953 			 * kernel cage thread is not yet ready to
1954 			 * do the work. Use *this* thread, which is
1955 			 * most likely to be t0, to do the work.
1956 			 */
1957 			KCAGE_STAT_INCR(kcw_expandearly);
1958 			(void) kcage_expand();
1959 			KCAGE_STAT_INC_SCAN_INDEX;
1960 		}
1961 
1962 		mutex_exit(&kcage_cageout_mutex);
1963 	}
1964 	/* else, kernel cage thread is already running */
1965 }
1966 
1967 void
1968 kcage_tick()
1969 {
1970 	/*
1971 	 * Once per second we wake up all the threads throttled
1972 	 * waiting for cage memory, in case we've become stuck
1973 	 * and haven't made forward progress expanding the cage.
1974 	 */
1975 	if (kcage_on && kcage_cageout_ready)
1976 		cv_broadcast(&kcage_throttle_cv);
1977 }
1978