xref: /illumos-gate/usr/src/uts/common/os/mem_cage.c (revision 16f0fd39d0c84c014919d701f87f5fc48be58d31)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/param.h>
28 #include <sys/thread.h>
29 #include <sys/proc.h>
30 #include <sys/callb.h>
31 #include <sys/vnode.h>
32 #include <sys/debug.h>
33 #include <sys/systm.h>		/* for bzero */
34 #include <sys/memlist.h>
35 #include <sys/cmn_err.h>
36 #include <sys/sysmacros.h>
37 #include <sys/vmsystm.h>	/* for NOMEMWAIT() */
38 #include <sys/atomic.h>		/* used to update kcage_freemem */
39 #include <sys/kmem.h>		/* for kmem_reap */
40 #include <sys/errno.h>
41 #include <sys/mem_cage.h>
42 #include <vm/seg_kmem.h>
43 #include <vm/page.h>
44 #include <vm/hat.h>
45 #include <vm/vm_dep.h>
46 #include <sys/mem_config.h>
47 #include <sys/lgrp.h>
48 #include <sys/rwlock.h>
49 #include <sys/cpupart.h>
50 
51 extern pri_t maxclsyspri;
52 
53 #ifdef DEBUG
54 #define	KCAGE_STATS
55 #endif
56 
57 #ifdef KCAGE_STATS
58 
59 #define	KCAGE_STATS_VERSION 9	/* can help report generators */
60 #define	KCAGE_STATS_NSCANS 256	/* depth of scan statistics buffer */
61 
62 struct kcage_stats_scan {
63 	/* managed by KCAGE_STAT_* macros */
64 	clock_t	scan_lbolt;
65 	uint_t	scan_id;
66 
67 	/* set in kcage_cageout() */
68 	uint_t	kt_passes;
69 	clock_t	kt_ticks;
70 	pgcnt_t	kt_kcage_freemem_start;
71 	pgcnt_t	kt_kcage_freemem_end;
72 	pgcnt_t kt_freemem_start;
73 	pgcnt_t kt_freemem_end;
74 	uint_t	kt_examined;
75 	uint_t	kt_cantlock;
76 	uint_t	kt_gotone;
77 	uint_t	kt_gotonefree;
78 	uint_t	kt_skiplevel;
79 	uint_t	kt_skipshared;
80 	uint_t	kt_skiprefd;
81 	uint_t	kt_destroy;
82 
83 	/* set in kcage_invalidate_page() */
84 	uint_t	kip_reloclocked;
85 	uint_t	kip_relocmod;
86 	uint_t	kip_destroy;
87 	uint_t	kip_nomem;
88 	uint_t	kip_demotefailed;
89 
90 	/* set in kcage_expand() */
91 	uint_t	ke_wanted;
92 	uint_t	ke_examined;
93 	uint_t	ke_lefthole;
94 	uint_t	ke_gotone;
95 	uint_t	ke_gotonefree;
96 };
97 
98 struct kcage_stats {
99 	/* managed by KCAGE_STAT_* macros */
100 	uint_t	version;
101 	uint_t	size;
102 
103 	/* set in kcage_cageout */
104 	uint_t	kt_wakeups;
105 	uint_t	kt_scans;
106 	uint_t	kt_cageout_break;
107 
108 	/* set in kcage_expand */
109 	uint_t	ke_calls;
110 	uint_t	ke_nopfn;
111 	uint_t	ke_nopaget;
112 	uint_t	ke_isnoreloc;
113 	uint_t	ke_deleting;
114 	uint_t	ke_lowfreemem;
115 	uint_t	ke_terminate;
116 
117 	/* set in kcage_freemem_add() */
118 	uint_t	kfa_trottlewake;
119 
120 	/* set in kcage_freemem_sub() */
121 	uint_t	kfs_cagewake;
122 
123 	/* set in kcage_create_throttle */
124 	uint_t	kct_calls;
125 	uint_t	kct_cageout;
126 	uint_t	kct_critical;
127 	uint_t	kct_exempt;
128 	uint_t	kct_cagewake;
129 	uint_t	kct_wait;
130 	uint_t	kct_progress;
131 	uint_t	kct_noprogress;
132 	uint_t	kct_timeout;
133 
134 	/* set in kcage_cageout_wakeup */
135 	uint_t	kcw_expandearly;
136 
137 	/* managed by KCAGE_STAT_* macros */
138 	uint_t	scan_array_size;
139 	uint_t	scan_index;
140 	struct kcage_stats_scan scans[KCAGE_STATS_NSCANS];
141 };
142 
143 static struct kcage_stats kcage_stats;
144 static struct kcage_stats_scan kcage_stats_scan_zero;
145 
/*
 * Statistics update macros (KCAGE_STATS builds).
 *
 * No real need for atomics here. For the most part the incs and sets are
 * done by the kernel cage thread. There are a few that are done by any
 * number of other threads. Those cases are noted by comments.
 *
 * The single-expression macros are fully parenthesized.  Macros that need
 * a conditional or several statements are wrapped in do { } while (0) so
 * that each expands to exactly one statement; this keeps them safe when
 * used as the unbraced body of an if/else or loop.  All of them must be
 * followed by a semicolon at the point of use.
 *
 * The *_SCAN variants operate on the ring slot selected by
 * kcage_stats.scan_index.
 */
#define	KCAGE_STAT_INCR(m)	(kcage_stats.m++)

#define	KCAGE_STAT_NINCR(m, v)	(kcage_stats.m += (v))

#define	KCAGE_STAT_INCR_SCAN(m)	\
	KCAGE_STAT_INCR(scans[kcage_stats.scan_index].m)

#define	KCAGE_STAT_NINCR_SCAN(m, v) \
	KCAGE_STAT_NINCR(scans[kcage_stats.scan_index].m, v)

#define	KCAGE_STAT_SET(m, v)	(kcage_stats.m = (v))

/* Store v only if the member is still zero. */
#define	KCAGE_STAT_SETZ(m, v)	\
	do { \
		if (kcage_stats.m == 0) \
			kcage_stats.m = (v); \
	} while (0)

#define	KCAGE_STAT_SET_SCAN(m, v)	\
	KCAGE_STAT_SET(scans[kcage_stats.scan_index].m, v)

#define	KCAGE_STAT_SETZ_SCAN(m, v)	\
	KCAGE_STAT_SETZ(scans[kcage_stats.scan_index].m, v)

/*
 * Retire the current scan record: stamp it with the current lbolt and
 * its slot index, advance scan_index around the ring, and clear the slot
 * that becomes current.
 */
#define	KCAGE_STAT_INC_SCAN_INDEX \
	do { \
		KCAGE_STAT_SET_SCAN(scan_lbolt, ddi_get_lbolt()); \
		KCAGE_STAT_SET_SCAN(scan_id, kcage_stats.scan_index); \
		kcage_stats.scan_index = \
		    (kcage_stats.scan_index + 1) % KCAGE_STATS_NSCANS; \
		kcage_stats.scans[kcage_stats.scan_index] = \
		    kcage_stats_scan_zero; \
	} while (0)

/* Fill in the self-describing header and start at ring slot 0. */
#define	KCAGE_STAT_INIT_SCAN_INDEX \
	do { \
		kcage_stats.version = KCAGE_STATS_VERSION; \
		kcage_stats.size = sizeof (kcage_stats); \
		kcage_stats.scan_array_size = KCAGE_STATS_NSCANS; \
		kcage_stats.scan_index = 0; \
	} while (0)
184 
185 #else /* KCAGE_STATS */
186 
/*
 * Statistics are compiled out: every KCAGE_STAT_* macro expands to
 * nothing, so its arguments are never evaluated.
 */
#define	KCAGE_STAT_INCR(m)
#define	KCAGE_STAT_NINCR(m, v)
#define	KCAGE_STAT_INCR_SCAN(m)
#define	KCAGE_STAT_NINCR_SCAN(m, v)
#define	KCAGE_STAT_SET(m, v)
#define	KCAGE_STAT_SETZ(m, v)
#define	KCAGE_STAT_SET_SCAN(m, v)
#define	KCAGE_STAT_SETZ_SCAN(m, v)
#define	KCAGE_STAT_INC_SCAN_INDEX
#define	KCAGE_STAT_INIT_SCAN_INDEX
197 
198 #endif /* KCAGE_STATS */
199 
200 static kmutex_t kcage_throttle_mutex;	/* protects kcage_throttle_cv */
201 static kcondvar_t kcage_throttle_cv;
202 
203 static kmutex_t kcage_cageout_mutex;	/* protects cv and ready flag */
204 static kcondvar_t kcage_cageout_cv;	/* cageout thread naps here */
205 static int kcage_cageout_ready;		/* nonzero when cageout thread ready */
206 kthread_id_t kcage_cageout_thread;	/* to aid debugging */
207 
208 static krwlock_t kcage_range_rwlock;	/* protects kcage_glist elements */
209 
210 /*
211  * Cage expansion happens within a range.
212  */
213 struct kcage_glist {
214 	struct kcage_glist	*next;
215 	pfn_t			base;
216 	pfn_t			lim;
217 	pfn_t			curr;
218 	int			decr;
219 };
220 
221 static struct kcage_glist *kcage_glist;
222 static struct kcage_glist *kcage_current_glist;
223 
224 /*
225  * The firstfree element is provided so that kmem_alloc can be avoided
226  * until that cage has somewhere to go. This is not currently a problem
227  * as early kmem_alloc's use BOP_ALLOC instead of page_create_va.
228  */
229 static vmem_t *kcage_arena;
230 static struct kcage_glist kcage_glist_firstfree;
231 static struct kcage_glist *kcage_glist_freelist = &kcage_glist_firstfree;
232 
233 /*
234  * Miscellaneous forward references
235  */
236 static struct kcage_glist *kcage_glist_alloc(void);
237 static int kcage_glist_delete(pfn_t, pfn_t, struct kcage_glist **);
238 static void kcage_cageout(void);
239 static int kcage_invalidate_page(page_t *, pgcnt_t *);
240 static int kcage_setnoreloc_pages(page_t *, se_t);
241 static int kcage_range_add_internal(pfn_t base, pgcnt_t npgs, kcage_dir_t);
242 static void kcage_init(pgcnt_t preferred_size);
243 static int kcage_range_delete_internal(pfn_t base, pgcnt_t npgs);
244 
245 /*
246  * Kernel Memory Cage counters and thresholds.
247  */
248 int kcage_on = 0;
249 pgcnt_t kcage_freemem;
250 pgcnt_t kcage_needfree;
251 pgcnt_t kcage_lotsfree;
252 pgcnt_t kcage_desfree;
253 pgcnt_t kcage_minfree;
254 pgcnt_t kcage_throttlefree;
255 pgcnt_t	kcage_reserve;
256 int kcage_maxwait = 10;	/* in seconds */
257 
258 /* when we use lp for kmem we start the cage at a higher initial value */
259 pgcnt_t kcage_kmemlp_mincage;
260 
261 #ifdef DEBUG
262 pgcnt_t	kcage_pagets;
263 #define	KCAGEPAGETS_INC()	kcage_pagets++
264 #else
265 #define	KCAGEPAGETS_INC()
266 #endif
267 
268 /* kstats to export what pages are currently caged */
269 kmutex_t kcage_kstat_lock;
270 static int kcage_kstat_update(kstat_t *ksp, int rw);
271 static int kcage_kstat_snapshot(kstat_t *ksp, void *buf, int rw);
272 
273 /*
274  * Startup and Dynamic Reconfiguration interfaces.
275  * kcage_range_add()
 * kcage_range_delete()
277  * kcage_range_delete_post_mem_del()
278  * kcage_range_init()
279  * kcage_set_thresholds()
280  */
281 
282 /*
283  * Called from page_get_contig_pages to get the approximate kcage pfn range
284  * for exclusion from search for contiguous pages. This routine is called
285  * without kcage_range lock (kcage routines can call page_get_contig_pages
286  * through page_relocate) and with the assumption, based on kcage_range_add,
287  * that kcage_current_glist always contain a valid pointer.
288  */
289 
290 int
291 kcage_current_pfn(pfn_t *pfncur)
292 {
293 	struct kcage_glist *lp = kcage_current_glist;
294 
295 	ASSERT(kcage_on);
296 
297 	ASSERT(lp != NULL);
298 
299 	*pfncur = lp->curr;
300 
301 	return (lp->decr);
302 }
303 
304 /*
305  * Called from vm_pagelist.c during coalesce to find kernel cage regions
306  * within an mnode. Looks for the lowest range between lo and hi.
307  *
308  * Kernel cage memory is defined between kcage_glist and kcage_current_glist.
309  * Non-cage memory is defined between kcage_current_glist and list end.
310  *
311  * If incage is set, returns the lowest kcage range. Otherwise returns lowest
312  * non-cage range.
313  *
314  * Returns zero on success and nlo, nhi:
315  * 	lo <= nlo < nhi <= hi
316  * Returns non-zero if no overlapping range is found.
317  */
318 int
319 kcage_next_range(int incage, pfn_t lo, pfn_t hi,
320     pfn_t *nlo, pfn_t *nhi)
321 {
322 	struct kcage_glist *lp;
323 	pfn_t tlo = hi;
324 	pfn_t thi = hi;
325 
326 	ASSERT(lo <= hi);
327 
328 	/*
329 	 * Reader lock protects the list, but kcage_get_pfn
330 	 * running concurrently may advance kcage_current_glist
331 	 * and also update kcage_current_glist->curr. Page
332 	 * coalesce can handle this race condition.
333 	 */
334 	rw_enter(&kcage_range_rwlock, RW_READER);
335 
336 	for (lp = incage ? kcage_glist : kcage_current_glist;
337 	    lp != NULL; lp = lp->next) {
338 
339 		pfn_t klo, khi;
340 
341 		/* find the range limits in this element */
342 		if ((incage && lp->decr) || (!incage && !lp->decr)) {
343 			klo = lp->curr;
344 			khi = lp->lim;
345 		} else {
346 			klo = lp->base;
347 			khi = lp->curr;
348 		}
349 
350 		/* handle overlap */
351 		if (klo < tlo && klo < khi && lo < khi && klo < hi) {
352 			tlo = MAX(lo, klo);
353 			thi = MIN(hi, khi);
354 			if (tlo == lo)
355 				break;
356 		}
357 
358 		/* check end of kcage */
359 		if (incage && lp == kcage_current_glist) {
360 			break;
361 		}
362 	}
363 
364 	rw_exit(&kcage_range_rwlock);
365 
366 	/* return non-zero if no overlapping range found */
367 	if (tlo == thi)
368 		return (1);
369 
370 	ASSERT(lo <= tlo && tlo < thi && thi <= hi);
371 
372 	/* return overlapping range */
373 	*nlo = tlo;
374 	*nhi = thi;
375 	return (0);
376 }
377 
378 void
379 kcage_range_init(struct memlist *ml, kcage_dir_t d, pgcnt_t preferred_size)
380 {
381 	int ret = 0;
382 
383 	ASSERT(kcage_arena == NULL);
384 	kcage_arena = vmem_create("kcage_arena", NULL, 0, sizeof (uint64_t),
385 	    segkmem_alloc, segkmem_free, heap_arena, 0, VM_SLEEP);
386 	ASSERT(kcage_arena != NULL);
387 
388 	if (d == KCAGE_DOWN) {
389 		while (ml->next != NULL)
390 			ml = ml->next;
391 	}
392 
393 	rw_enter(&kcage_range_rwlock, RW_WRITER);
394 
395 	while (ml != NULL) {
396 		ret = kcage_range_add_internal(btop(ml->address),
397 		    btop(ml->size), d);
398 		if (ret)
399 			panic("kcage_range_add_internal failed: "
400 			    "ml=%p, ret=0x%x\n", (void *)ml, ret);
401 
402 		ml = (d == KCAGE_DOWN ? ml->prev : ml->next);
403 	}
404 
405 	rw_exit(&kcage_range_rwlock);
406 
407 	if (ret == 0)
408 		kcage_init(preferred_size);
409 }
410 
411 /*
412  * Third arg controls direction of growth: 0: increasing pfns,
413  * 1: decreasing.
414  */
415 static int
416 kcage_range_add_internal(pfn_t base, pgcnt_t npgs, kcage_dir_t d)
417 {
418 	struct kcage_glist *new, **lpp;
419 	pfn_t lim;
420 
421 	ASSERT(rw_write_held(&kcage_range_rwlock));
422 
423 	ASSERT(npgs != 0);
424 	if (npgs == 0)
425 		return (EINVAL);
426 
427 	lim = base + npgs;
428 
429 	ASSERT(lim > base);
430 	if (lim <= base)
431 		return (EINVAL);
432 
433 	new = kcage_glist_alloc();
434 	if (new == NULL) {
435 		return (ENOMEM);
436 	}
437 
438 	new->base = base;
439 	new->lim = lim;
440 	new->decr = (d == KCAGE_DOWN);
441 	if (new->decr != 0)
442 		new->curr = new->lim;
443 	else
444 		new->curr = new->base;
445 	/*
446 	 * Any overlapping existing ranges are removed by deleting
447 	 * from the new list as we search for the tail.
448 	 */
449 	lpp = &kcage_glist;
450 	while (*lpp != NULL) {
451 		int ret;
452 		ret = kcage_glist_delete((*lpp)->base, (*lpp)->lim, &new);
453 		if (ret != 0)
454 			return (ret);
455 		lpp = &(*lpp)->next;
456 	}
457 
458 	*lpp = new;
459 
460 	if (kcage_current_glist == NULL) {
461 		kcage_current_glist = kcage_glist;
462 	}
463 
464 	return (0);
465 }
466 
467 int
468 kcage_range_add(pfn_t base, pgcnt_t npgs, kcage_dir_t d)
469 {
470 	int ret;
471 
472 	rw_enter(&kcage_range_rwlock, RW_WRITER);
473 	ret = kcage_range_add_internal(base, npgs, d);
474 	rw_exit(&kcage_range_rwlock);
475 	return (ret);
476 }
477 
478 /*
479  * Calls to add and delete must be protected by kcage_range_rwlock
480  */
481 static int
482 kcage_range_delete_internal(pfn_t base, pgcnt_t npgs)
483 {
484 	struct kcage_glist *lp;
485 	pfn_t lim;
486 
487 	ASSERT(rw_write_held(&kcage_range_rwlock));
488 
489 	ASSERT(npgs != 0);
490 	if (npgs == 0)
491 		return (EINVAL);
492 
493 	lim = base + npgs;
494 
495 	ASSERT(lim > base);
496 	if (lim <= base)
497 		return (EINVAL);
498 
499 	/*
500 	 * Check if the delete is OK first as a number of elements
501 	 * might be involved and it will be difficult to go
502 	 * back and undo (can't just add the range back in).
503 	 */
504 	for (lp = kcage_glist; lp != NULL; lp = lp->next) {
505 		/*
506 		 * If there have been no pages allocated from this
507 		 * element, we don't need to check it.
508 		 */
509 		if ((lp->decr == 0 && lp->curr == lp->base) ||
510 		    (lp->decr != 0 && lp->curr == lp->lim))
511 			continue;
512 		/*
513 		 * If the element does not overlap, its OK.
514 		 */
515 		if (base >= lp->lim || lim <= lp->base)
516 			continue;
517 		/*
518 		 * Overlapping element: Does the range to be deleted
519 		 * overlap the area already used? If so fail.
520 		 */
521 		if (lp->decr == 0 && base < lp->curr && lim >= lp->base) {
522 			return (EBUSY);
523 		}
524 		if (lp->decr != 0 && base < lp->lim && lim >= lp->curr) {
525 			return (EBUSY);
526 		}
527 	}
528 	return (kcage_glist_delete(base, lim, &kcage_glist));
529 }
530 
531 int
532 kcage_range_delete(pfn_t base, pgcnt_t npgs)
533 {
534 	int ret;
535 
536 	rw_enter(&kcage_range_rwlock, RW_WRITER);
537 	ret = kcage_range_delete_internal(base, npgs);
538 	rw_exit(&kcage_range_rwlock);
539 	return (ret);
540 }
541 
542 /*
543  * Calls to add and delete must be protected by kcage_range_rwlock.
544  * This routine gets called after successful Solaris memory
545  * delete operation from DR post memory delete routines.
546  */
547 static int
548 kcage_range_delete_post_mem_del_internal(pfn_t base, pgcnt_t npgs)
549 {
550 	pfn_t lim;
551 
552 	ASSERT(rw_write_held(&kcage_range_rwlock));
553 
554 	ASSERT(npgs != 0);
555 	if (npgs == 0)
556 		return (EINVAL);
557 
558 	lim = base + npgs;
559 
560 	ASSERT(lim > base);
561 	if (lim <= base)
562 		return (EINVAL);
563 
564 	return (kcage_glist_delete(base, lim, &kcage_glist));
565 }
566 
567 int
568 kcage_range_delete_post_mem_del(pfn_t base, pgcnt_t npgs)
569 {
570 	int ret;
571 
572 	rw_enter(&kcage_range_rwlock, RW_WRITER);
573 	ret = kcage_range_delete_post_mem_del_internal(base, npgs);
574 	rw_exit(&kcage_range_rwlock);
575 	return (ret);
576 }
577 
578 /*
579  * No locking is required here as the whole operation is covered
580  * by kcage_range_rwlock writer lock.
581  */
582 static struct kcage_glist *
583 kcage_glist_alloc(void)
584 {
585 	struct kcage_glist *new;
586 
587 	if ((new = kcage_glist_freelist) != NULL) {
588 		kcage_glist_freelist = new->next;
589 	} else if (kernel_cage_enable) {
590 		new = vmem_alloc(kcage_arena, sizeof (*new), VM_NOSLEEP);
591 	} else {
592 		/*
593 		 * On DR supported platforms we allow memory add
594 		 * even when kernel cage is disabled. "kcage_arena" is
595 		 * created only when kernel cage is enabled.
596 		 */
597 		new = kmem_zalloc(sizeof (*new), KM_NOSLEEP);
598 	}
599 
600 	if (new != NULL)
601 		bzero(new, sizeof (*new));
602 
603 	return (new);
604 }
605 
606 static void
607 kcage_glist_free(struct kcage_glist *lp)
608 {
609 	lp->next = kcage_glist_freelist;
610 	kcage_glist_freelist = lp;
611 }
612 
613 static int
614 kcage_glist_delete(pfn_t base, pfn_t lim, struct kcage_glist **lpp)
615 {
616 	struct kcage_glist *lp, *prev = *lpp;
617 
618 	while ((lp = *lpp) != NULL) {
619 		if (lim > lp->base && base < lp->lim) {
620 			/* The delete range overlaps this element. */
621 			if (base <= lp->base && lim >= lp->lim) {
622 				/* Delete whole element. */
623 				*lpp = lp->next;
624 				if (lp == kcage_current_glist) {
625 					/* This can never happen. */
626 					ASSERT(kcage_current_glist != prev);
627 					kcage_current_glist = prev;
628 				}
629 				kcage_glist_free(lp);
630 				continue;
631 			}
632 
633 			/* Partial delete. */
634 			if (base > lp->base && lim < lp->lim) {
635 				struct kcage_glist *new;
636 
637 				/*
638 				 * Remove a section from the middle,
639 				 * need to allocate a new element.
640 				 */
641 				new = kcage_glist_alloc();
642 				if (new == NULL) {
643 					return (ENOMEM);
644 				}
645 
646 				/*
647 				 * Tranfser unused range to new.
648 				 * Edit lp in place to preserve
649 				 * kcage_current_glist.
650 				 */
651 				new->decr = lp->decr;
652 				if (new->decr != 0) {
653 					new->base = lp->base;
654 					new->lim = base;
655 					new->curr = base;
656 
657 					lp->base = lim;
658 				} else {
659 					new->base = lim;
660 					new->lim = lp->lim;
661 					new->curr = new->base;
662 
663 					lp->lim = base;
664 				}
665 
666 				/* Insert new. */
667 				new->next = lp->next;
668 				lp->next = new;
669 				lpp = &lp->next;
670 			} else {
671 				/* Delete part of current block. */
672 				if (base > lp->base) {
673 					ASSERT(lim >= lp->lim);
674 					ASSERT(base < lp->lim);
675 					if (lp->decr != 0 &&
676 					    lp->curr == lp->lim)
677 						lp->curr = base;
678 					lp->lim = base;
679 				} else {
680 					ASSERT(base <= lp->base);
681 					ASSERT(lim > lp->base);
682 					if (lp->decr == 0 &&
683 					    lp->curr == lp->base)
684 						lp->curr = lim;
685 					lp->base = lim;
686 				}
687 			}
688 		}
689 		prev = *lpp;
690 		lpp = &(*lpp)->next;
691 	}
692 
693 	return (0);
694 }
695 
696 /*
697  * If lockit is 1, kcage_get_pfn holds the
698  * reader lock for kcage_range_rwlock.
699  * Changes to lp->curr can cause race conditions, but
700  * they are handled by higher level code (see kcage_next_range.)
701  */
702 static pfn_t
703 kcage_get_pfn(int lockit)
704 {
705 	struct kcage_glist *lp;
706 	pfn_t pfn = PFN_INVALID;
707 
708 	if (lockit && !rw_tryenter(&kcage_range_rwlock, RW_READER))
709 		return (pfn);
710 
711 	lp = kcage_current_glist;
712 	while (lp != NULL) {
713 		if (lp->decr != 0) {
714 			if (lp->curr != lp->base) {
715 				pfn = --lp->curr;
716 				break;
717 			}
718 		} else {
719 			if (lp->curr != lp->lim) {
720 				pfn = lp->curr++;
721 				break;
722 			}
723 		}
724 
725 		lp = lp->next;
726 		if (lp)
727 			kcage_current_glist = lp;
728 	}
729 
730 	if (lockit)
731 		rw_exit(&kcage_range_rwlock);
732 	return (pfn);
733 }
734 
735 /*
736  * Walk the physical address space of the cage.
737  * This routine does not guarantee to return PFNs in the order
738  * in which they were allocated to the cage. Instead, it walks
739  * each range as they appear on the growth list returning the PFNs
740  * range in ascending order.
741  *
742  * To begin scanning at lower edge of cage, reset should be nonzero.
743  * To step through cage, reset should be zero.
744  *
745  * PFN_INVALID will be returned when the upper end of the cage is
746  * reached -- indicating a full scan of the cage has been completed since
747  * previous reset. PFN_INVALID will continue to be returned until
748  * kcage_walk_cage is reset.
749  *
750  * It is possible to receive a PFN_INVALID result on reset if a growth
751  * list is not installed or if none of the PFNs in the installed list have
752  * been allocated to the cage. In otherwords, there is no cage.
753  *
754  * Caller need not hold kcage_range_rwlock while calling this function
755  * as the front part of the list is static - pages never come out of
756  * the cage.
757  *
758  * The caller is expected to only be kcage_cageout().
759  */
760 static pfn_t
761 kcage_walk_cage(int reset)
762 {
763 	static struct kcage_glist *lp = NULL;
764 	static pfn_t pfn;
765 
766 	if (reset)
767 		lp = NULL;
768 	if (lp == NULL) {
769 		lp = kcage_glist;
770 		pfn = PFN_INVALID;
771 	}
772 again:
773 	if (pfn == PFN_INVALID) {
774 		if (lp == NULL)
775 			return (PFN_INVALID);
776 
777 		if (lp->decr != 0) {
778 			/*
779 			 * In this range the cage grows from the highest
780 			 * address towards the lowest.
781 			 * Arrange to return pfns from curr to lim-1,
782 			 * inclusive, in ascending order.
783 			 */
784 
785 			pfn = lp->curr;
786 		} else {
787 			/*
788 			 * In this range the cage grows from the lowest
789 			 * address towards the highest.
790 			 * Arrange to return pfns from base to curr,
791 			 * inclusive, in ascending order.
792 			 */
793 
794 			pfn = lp->base;
795 		}
796 	}
797 
798 	if (lp->decr != 0) {		/* decrementing pfn */
799 		if (pfn == lp->lim) {
800 			/* Don't go beyond the static part of the glist. */
801 			if (lp == kcage_current_glist)
802 				lp = NULL;
803 			else
804 				lp = lp->next;
805 			pfn = PFN_INVALID;
806 			goto again;
807 		}
808 
809 		ASSERT(pfn >= lp->curr && pfn < lp->lim);
810 	} else {			/* incrementing pfn */
811 		if (pfn == lp->curr) {
812 			/* Don't go beyond the static part of the glist. */
813 			if (lp == kcage_current_glist)
814 				lp = NULL;
815 			else
816 				lp = lp->next;
817 			pfn = PFN_INVALID;
818 			goto again;
819 		}
820 
821 		ASSERT(pfn >= lp->base && pfn < lp->curr);
822 	}
823 
824 	return (pfn++);
825 }
826 
827 /*
 * Callback functions to recalculate the cage thresholds after
 * kphysm memory add/delete operations.
830  */
831 /*ARGSUSED*/
832 static void
833 kcage_kphysm_postadd_cb(void *arg, pgcnt_t delta_pages)
834 {
835 	kcage_recalc_thresholds();
836 }
837 
838 /*ARGSUSED*/
839 static int
840 kcage_kphysm_predel_cb(void *arg, pgcnt_t delta_pages)
841 {
842 	/* TODO: when should cage refuse memory delete requests? */
843 	return (0);
844 }
845 
846 /*ARGSUSED*/
847 static  void
848 kcage_kphysm_postdel_cb(void *arg, pgcnt_t delta_pages, int cancelled)
849 {
850 	kcage_recalc_thresholds();
851 }
852 
853 static kphysm_setup_vector_t kcage_kphysm_vectors = {
854 	KPHYSM_SETUP_VECTOR_VERSION,
855 	kcage_kphysm_postadd_cb,
856 	kcage_kphysm_predel_cb,
857 	kcage_kphysm_postdel_cb
858 };
859 
860 /*
861  * This is called before a CPR suspend and after a CPR resume.  We have to
862  * turn off kcage_cageout_ready before a suspend, and turn it back on after a
863  * restart.
864  */
865 /*ARGSUSED*/
866 static boolean_t
867 kcage_cageout_cpr(void *arg, int code)
868 {
869 	if (code == CB_CODE_CPR_CHKPT) {
870 		ASSERT(kcage_cageout_ready);
871 		kcage_cageout_ready = 0;
872 		return (B_TRUE);
873 	} else if (code == CB_CODE_CPR_RESUME) {
874 		ASSERT(kcage_cageout_ready == 0);
875 		kcage_cageout_ready = 1;
876 		return (B_TRUE);
877 	}
878 	return (B_FALSE);
879 }
880 
881 /*
882  * kcage_recalc_preferred_size() increases initial cage size to improve large
883  * page availability when lp for kmem is enabled and kpr is disabled
884  */
885 static pgcnt_t
886 kcage_recalc_preferred_size(pgcnt_t preferred_size)
887 {
888 	if (SEGKMEM_USE_LARGEPAGES && segkmem_reloc == 0) {
889 		pgcnt_t lpmincage = kcage_kmemlp_mincage;
890 		if (lpmincage == 0) {
891 			lpmincage = MIN(P2ROUNDUP(((physmem * PAGESIZE) / 8),
892 			    segkmem_heaplp_quantum), 0x40000000UL) / PAGESIZE;
893 		}
894 		kcage_kmemlp_mincage = MIN(lpmincage,
895 		    (segkmem_kmemlp_max / PAGESIZE));
896 		preferred_size = MAX(kcage_kmemlp_mincage, preferred_size);
897 	}
898 	return (preferred_size);
899 }
900 
901 /*
902  * Kcage_init() builds the cage and initializes the cage thresholds.
903  * The size of the cage is determined by the argument preferred_size.
904  * or the actual amount of memory, whichever is smaller.
905  */
906 static void
907 kcage_init(pgcnt_t preferred_size)
908 {
909 	pgcnt_t wanted;
910 	pfn_t pfn;
911 	page_t *pp;
912 	kstat_t *ksp;
913 
914 	extern struct vnode kvp;
915 	extern void page_list_noreloc_startup(page_t *);
916 
917 	ASSERT(!kcage_on);
918 
919 	/* increase preferred cage size for lp for kmem */
920 	preferred_size = kcage_recalc_preferred_size(preferred_size);
921 
922 	/* Debug note: initialize this now so early expansions can stat */
923 	KCAGE_STAT_INIT_SCAN_INDEX;
924 
925 	/*
926 	 * Initialize cage thresholds and install kphysm callback.
927 	 * If we can't arrange to have the thresholds track with
928 	 * available physical memory, then the cage thresholds may
929 	 * end up over time at levels that adversly effect system
930 	 * performance; so, bail out.
931 	 */
932 	kcage_recalc_thresholds();
933 	if (kphysm_setup_func_register(&kcage_kphysm_vectors, NULL)) {
934 		ASSERT(0);		/* Catch this in DEBUG kernels. */
935 		return;
936 	}
937 
938 	/*
939 	 * Limit startup cage size within the range of kcage_minfree
940 	 * and availrmem, inclusively.
941 	 */
942 	wanted = MIN(MAX(preferred_size, kcage_minfree), availrmem);
943 
944 	/*
945 	 * Construct the cage. PFNs are allocated from the glist. It
946 	 * is assumed that the list has been properly ordered for the
947 	 * platform by the platform code. Typically, this is as simple
948 	 * as calling kcage_range_init(phys_avail, decr), where decr is
949 	 * 1 if the kernel has been loaded into upper end of physical
950 	 * memory, or 0 if the kernel has been loaded at the low end.
951 	 *
952 	 * Note: it is assumed that we are in the startup flow, so there
953 	 * is no reason to grab the page lock.
954 	 */
955 	kcage_freemem = 0;
956 	pfn = PFN_INVALID;			/* prime for alignment test */
957 	while (wanted != 0) {
958 		if ((pfn = kcage_get_pfn(0)) == PFN_INVALID)
959 			break;
960 
961 		if ((pp = page_numtopp_nolock(pfn)) != NULL) {
962 			KCAGEPAGETS_INC();
963 			/*
964 			 * Set the noreloc state on the page.
965 			 * If the page is free and not already
966 			 * on the noreloc list then move it.
967 			 */
968 			if (PP_ISFREE(pp)) {
969 				if (PP_ISNORELOC(pp) == 0)
970 					page_list_noreloc_startup(pp);
971 			} else {
972 				ASSERT(pp->p_szc == 0);
973 				PP_SETNORELOC(pp);
974 			}
975 		}
976 		PLCNT_XFER_NORELOC(pp);
977 		wanted -= 1;
978 	}
979 
980 	/*
981 	 * Need to go through and find kernel allocated pages
982 	 * and capture them into the Cage.  These will primarily
983 	 * be pages gotten through boot_alloc().
984 	 */
985 	if (kvp.v_pages) {
986 
987 		pp = kvp.v_pages;
988 		do {
989 			ASSERT(!PP_ISFREE(pp));
990 			ASSERT(pp->p_szc == 0);
991 			if (PP_ISNORELOC(pp) == 0) {
992 				PP_SETNORELOC(pp);
993 				PLCNT_XFER_NORELOC(pp);
994 			}
995 		} while ((pp = pp->p_vpnext) != kvp.v_pages);
996 
997 	}
998 
999 	kcage_on = 1;
1000 
1001 	/*
1002 	 * CB_CL_CPR_POST_KERNEL is the class that executes from cpr_suspend()
1003 	 * after the cageout thread is blocked, and executes from cpr_resume()
1004 	 * before the cageout thread is restarted.  By executing in this class,
1005 	 * we are assured that the kernel cage thread won't miss wakeup calls
1006 	 * and also CPR's larger kmem_alloc requests will not fail after
1007 	 * CPR shuts down the cageout kernel thread.
1008 	 */
1009 	(void) callb_add(kcage_cageout_cpr, NULL, CB_CL_CPR_POST_KERNEL,
1010 	    "cageout");
1011 
1012 	/*
1013 	 * Coalesce pages to improve large page availability. A better fix
1014 	 * would to coalesce pages as they are included in the cage
1015 	 */
1016 	if (SEGKMEM_USE_LARGEPAGES) {
1017 		extern void page_freelist_coalesce_all(int mnode);
1018 		page_freelist_coalesce_all(-1);	/* do all mnodes */
1019 	}
1020 
1021 	ksp = kstat_create("kcage", 0, "kcage_page_list", "misc",
1022 	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
1023 	if (ksp != NULL) {
1024 		ksp->ks_update = kcage_kstat_update;
1025 		ksp->ks_snapshot = kcage_kstat_snapshot;
1026 		ksp->ks_lock = &kcage_kstat_lock; /* XXX - not really needed */
1027 		kstat_install(ksp);
1028 	}
1029 }
1030 
1031 static int
1032 kcage_kstat_update(kstat_t *ksp, int rw)
1033 {
1034 	struct kcage_glist *lp;
1035 	uint_t count;
1036 
1037 	if (rw == KSTAT_WRITE)
1038 		return (EACCES);
1039 
1040 	count = 0;
1041 	rw_enter(&kcage_range_rwlock, RW_WRITER);
1042 	for (lp = kcage_glist; lp != NULL; lp = lp->next) {
1043 		if (lp->decr) {
1044 			if (lp->curr != lp->lim) {
1045 				count++;
1046 			}
1047 		} else {
1048 			if (lp->curr != lp->base) {
1049 				count++;
1050 			}
1051 		}
1052 	}
1053 	rw_exit(&kcage_range_rwlock);
1054 
1055 	ksp->ks_ndata = count;
1056 	ksp->ks_data_size = count * 2 * sizeof (uint64_t);
1057 
1058 	return (0);
1059 }
1060 
1061 static int
1062 kcage_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
1063 {
1064 	struct kcage_glist *lp;
1065 	struct memunit {
1066 		uint64_t address;
1067 		uint64_t size;
1068 	} *kspmem;
1069 
1070 	if (rw == KSTAT_WRITE)
1071 		return (EACCES);
1072 
1073 	ksp->ks_snaptime = gethrtime();
1074 
1075 	kspmem = (struct memunit *)buf;
1076 	rw_enter(&kcage_range_rwlock, RW_WRITER);
1077 	for (lp = kcage_glist; lp != NULL; lp = lp->next, kspmem++) {
1078 		if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
1079 			break;
1080 
1081 		if (lp->decr) {
1082 			if (lp->curr != lp->lim) {
1083 				kspmem->address = ptob(lp->curr);
1084 				kspmem->size = ptob(lp->lim - lp->curr);
1085 			}
1086 		} else {
1087 			if (lp->curr != lp->base) {
1088 				kspmem->address = ptob(lp->base);
1089 				kspmem->size = ptob(lp->curr - lp->base);
1090 			}
1091 		}
1092 	}
1093 	rw_exit(&kcage_range_rwlock);
1094 
1095 	return (0);
1096 }
1097 
1098 void
1099 kcage_recalc_thresholds()
1100 {
1101 	static int first = 1;
1102 	static pgcnt_t init_lotsfree;
1103 	static pgcnt_t init_desfree;
1104 	static pgcnt_t init_minfree;
1105 	static pgcnt_t init_throttlefree;
1106 	static pgcnt_t init_reserve;
1107 
1108 	/* TODO: any reason to take more care than this with live editing? */
1109 	mutex_enter(&kcage_cageout_mutex);
1110 	mutex_enter(&freemem_lock);
1111 
1112 	if (first) {
1113 		first = 0;
1114 		init_lotsfree = kcage_lotsfree;
1115 		init_desfree = kcage_desfree;
1116 		init_minfree = kcage_minfree;
1117 		init_throttlefree = kcage_throttlefree;
1118 		init_reserve = kcage_reserve;
1119 	} else {
1120 		kcage_lotsfree = init_lotsfree;
1121 		kcage_desfree = init_desfree;
1122 		kcage_minfree = init_minfree;
1123 		kcage_throttlefree = init_throttlefree;
1124 		kcage_reserve = init_reserve;
1125 	}
1126 
1127 	if (kcage_lotsfree == 0)
1128 		kcage_lotsfree = MAX(32, total_pages / 256);
1129 
1130 	if (kcage_minfree == 0)
1131 		kcage_minfree = MAX(32, kcage_lotsfree / 2);
1132 
1133 	if (kcage_desfree == 0)
1134 		kcage_desfree = MAX(32, kcage_minfree);
1135 
1136 	if (kcage_throttlefree == 0)
1137 		kcage_throttlefree = MAX(32, kcage_minfree / 2);
1138 
1139 	if (kcage_reserve == 0)
1140 		kcage_reserve = MIN(32, kcage_throttlefree / 2);
1141 
1142 	mutex_exit(&freemem_lock);
1143 	mutex_exit(&kcage_cageout_mutex);
1144 
1145 	if (kcage_cageout_ready) {
1146 		if (kcage_freemem < kcage_desfree)
1147 			kcage_cageout_wakeup();
1148 
1149 		if (kcage_needfree) {
1150 			mutex_enter(&kcage_throttle_mutex);
1151 			cv_broadcast(&kcage_throttle_cv);
1152 			mutex_exit(&kcage_throttle_mutex);
1153 		}
1154 	}
1155 }
1156 
1157 /*
1158  * Pageout interface:
1159  * kcage_cageout_init()
1160  */
1161 void
1162 kcage_cageout_init()
1163 {
1164 	if (kcage_on) {
1165 
1166 		(void) thread_create(NULL, 0, kcage_cageout,
1167 		    NULL, 0, proc_pageout, TS_RUN, maxclsyspri - 1);
1168 	}
1169 }
1170 
1171 
1172 /*
1173  * VM Interfaces:
1174  * kcage_create_throttle()
1175  * kcage_freemem_add()
1176  * kcage_freemem_sub()
1177  */
1178 
1179 /*
1180  * Wakeup cageout thread and throttle waiting for the number of pages
1181  * requested to become available.  For non-critical requests, a
1182  * timeout is added, since freemem accounting is separate from cage
1183  * freemem accounting: it's possible for us to get stuck and not make
1184  * forward progress even though there was sufficient freemem before
1185  * arriving here.
1186  */
1187 int
1188 kcage_create_throttle(pgcnt_t npages, int flags)
1189 {
1190 	int niter = 0;
1191 	pgcnt_t lastfree;
1192 	int enough = kcage_freemem > kcage_throttlefree + npages;
1193 
1194 	KCAGE_STAT_INCR(kct_calls);		/* unprotected incr. */
1195 
1196 	kcage_cageout_wakeup();			/* just to be sure */
1197 	KCAGE_STAT_INCR(kct_cagewake);		/* unprotected incr. */
1198 
1199 	/*
1200 	 * Obviously, we can't throttle the cageout thread since
1201 	 * we depend on it.  We also can't throttle the panic thread.
1202 	 */
1203 	if (curthread == kcage_cageout_thread || panicstr) {
1204 		KCAGE_STAT_INCR(kct_cageout);	/* unprotected incr. */
1205 		return (KCT_CRIT);
1206 	}
1207 
1208 	/*
1209 	 * Don't throttle threads which are critical for proper
1210 	 * vm management if we're above kcage_throttlefree or
1211 	 * if freemem is very low.
1212 	 */
1213 	if (NOMEMWAIT()) {
1214 		if (enough) {
1215 			KCAGE_STAT_INCR(kct_exempt);	/* unprotected incr. */
1216 			return (KCT_CRIT);
1217 		} else if (freemem < minfree) {
1218 			KCAGE_STAT_INCR(kct_critical);  /* unprotected incr. */
1219 			return (KCT_CRIT);
1220 		}
1221 	}
1222 
1223 	/*
1224 	 * Don't throttle real-time threads if kcage_freemem > kcage_reserve.
1225 	 */
1226 	if (DISP_PRIO(curthread) > maxclsyspri &&
1227 	    kcage_freemem > kcage_reserve) {
1228 		KCAGE_STAT_INCR(kct_exempt);	/* unprotected incr. */
1229 		return (KCT_CRIT);
1230 	}
1231 
1232 	/*
1233 	 * Cause all other threads (which are assumed to not be
1234 	 * critical to cageout) to wait here until their request
1235 	 * can be satisfied. Be a little paranoid and wake the
1236 	 * kernel cage on each loop through this logic.
1237 	 */
1238 	while (kcage_freemem < kcage_throttlefree + npages) {
1239 		ASSERT(kcage_on);
1240 
1241 		lastfree = kcage_freemem;
1242 
1243 		if (kcage_cageout_ready) {
1244 			mutex_enter(&kcage_throttle_mutex);
1245 
1246 			kcage_needfree += npages;
1247 			KCAGE_STAT_INCR(kct_wait);
1248 
1249 			kcage_cageout_wakeup();
1250 			KCAGE_STAT_INCR(kct_cagewake);
1251 
1252 			cv_wait(&kcage_throttle_cv, &kcage_throttle_mutex);
1253 
1254 			kcage_needfree -= npages;
1255 
1256 			mutex_exit(&kcage_throttle_mutex);
1257 		} else {
1258 			/*
1259 			 * NOTE: atomics are used just in case we enter
1260 			 * mp operation before the cageout thread is ready.
1261 			 */
1262 			atomic_add_long(&kcage_needfree, npages);
1263 
1264 			kcage_cageout_wakeup();
1265 			KCAGE_STAT_INCR(kct_cagewake);	/* unprotected incr. */
1266 
1267 			atomic_add_long(&kcage_needfree, -npages);
1268 		}
1269 
1270 		if ((flags & PG_WAIT) == 0) {
1271 			if (kcage_freemem > lastfree) {
1272 				KCAGE_STAT_INCR(kct_progress);
1273 				niter = 0;
1274 			} else {
1275 				KCAGE_STAT_INCR(kct_noprogress);
1276 				if (++niter >= kcage_maxwait) {
1277 					KCAGE_STAT_INCR(kct_timeout);
1278 					return (KCT_FAILURE);
1279 				}
1280 			}
1281 		}
1282 
1283 		if (NOMEMWAIT() && freemem < minfree) {
1284 			return (KCT_CRIT);
1285 		}
1286 
1287 	}
1288 	return (KCT_NONCRIT);
1289 }
1290 
1291 void
1292 kcage_freemem_add(pgcnt_t npages)
1293 {
1294 	extern void wakeup_pcgs(void);
1295 
1296 	atomic_add_long(&kcage_freemem, npages);
1297 
1298 	wakeup_pcgs();  /* wakeup threads in pcgs() */
1299 
1300 	if (kcage_needfree != 0 &&
1301 	    kcage_freemem >= (kcage_throttlefree + kcage_needfree)) {
1302 
1303 		mutex_enter(&kcage_throttle_mutex);
1304 		cv_broadcast(&kcage_throttle_cv);
1305 		KCAGE_STAT_INCR(kfa_trottlewake);
1306 		mutex_exit(&kcage_throttle_mutex);
1307 	}
1308 }
1309 
1310 void
1311 kcage_freemem_sub(pgcnt_t npages)
1312 {
1313 	atomic_add_long(&kcage_freemem, -npages);
1314 
1315 	if (kcage_freemem < kcage_desfree) {
1316 		kcage_cageout_wakeup();
1317 		KCAGE_STAT_INCR(kfs_cagewake); /* unprotected incr. */
1318 	}
1319 }
1320 
1321 /*
1322  * return 0 on failure and 1 on success.
1323  */
1324 static int
1325 kcage_setnoreloc_pages(page_t *rootpp, se_t se)
1326 {
1327 	pgcnt_t npgs, i;
1328 	page_t *pp;
1329 	pfn_t rootpfn = page_pptonum(rootpp);
1330 	uint_t szc;
1331 
1332 	ASSERT(!PP_ISFREE(rootpp));
1333 	ASSERT(PAGE_LOCKED_SE(rootpp, se));
1334 	if (!group_page_trylock(rootpp, se)) {
1335 		return (0);
1336 	}
1337 	szc = rootpp->p_szc;
1338 	if (szc == 0) {
1339 		/*
1340 		 * The szc of a locked page can only change for pages that are
1341 		 * non-swapfs (i.e. anonymous memory) file system pages.
1342 		 */
1343 		ASSERT(rootpp->p_vnode != NULL &&
1344 		    !PP_ISKAS(rootpp) &&
1345 		    !IS_SWAPFSVP(rootpp->p_vnode));
1346 		PP_SETNORELOC(rootpp);
1347 		return (1);
1348 	}
1349 	npgs = page_get_pagecnt(szc);
1350 	ASSERT(IS_P2ALIGNED(rootpfn, npgs));
1351 	pp = rootpp;
1352 	for (i = 0; i < npgs; i++, pp++) {
1353 		ASSERT(PAGE_LOCKED_SE(pp, se));
1354 		ASSERT(!PP_ISFREE(pp));
1355 		ASSERT(pp->p_szc == szc);
1356 		PP_SETNORELOC(pp);
1357 	}
1358 	group_page_unlock(rootpp);
1359 	return (1);
1360 }
1361 
1362 /*
1363  * Attempt to convert page to a caged page (set the P_NORELOC flag).
1364  * If successful and pages is free, move page to the tail of whichever
1365  * list it is on.
1366  * Returns:
1367  *   EBUSY  page already locked, assimilated but not free.
1368  *   ENOMEM page assimilated, but memory too low to relocate. Page not free.
1369  *   EAGAIN page not assimilated. Page not free.
1370  *   ERANGE page assimilated. Page not root.
1371  *   0      page assimilated. Page free.
1372  *   *nfreedp number of pages freed.
1373  * NOTE: With error codes ENOMEM, EBUSY, and 0 (zero), there is no way
1374  * to distinguish between a page that was already a NORELOC page from
1375  * those newly converted to NORELOC pages by this invocation of
1376  * kcage_assimilate_page.
1377  */
1378 static int
1379 kcage_assimilate_page(page_t *pp, pgcnt_t *nfreedp)
1380 {
1381 	if (page_trylock(pp, SE_EXCL)) {
1382 		if (PP_ISNORELOC(pp)) {
1383 check_free_and_return:
1384 			if (PP_ISFREE(pp)) {
1385 				page_unlock(pp);
1386 				*nfreedp = 0;
1387 				return (0);
1388 			} else {
1389 				page_unlock(pp);
1390 				return (EBUSY);
1391 			}
1392 			/*NOTREACHED*/
1393 		}
1394 	} else {
1395 		if (page_trylock(pp, SE_SHARED)) {
1396 			if (PP_ISNORELOC(pp))
1397 				goto check_free_and_return;
1398 		} else
1399 			return (EAGAIN);
1400 
1401 		if (!PP_ISFREE(pp)) {
1402 			page_unlock(pp);
1403 			return (EAGAIN);
1404 		}
1405 
1406 		/*
1407 		 * Need to upgrade the lock on it and set the NORELOC
1408 		 * bit. If it is free then remove it from the free
1409 		 * list so that the platform free list code can keep
1410 		 * NORELOC pages where they should be.
1411 		 */
1412 		/*
1413 		 * Before doing anything, get the exclusive lock.
1414 		 * This may fail (eg ISM pages are left shared locked).
1415 		 * If the page is free this will leave a hole in the
1416 		 * cage. There is no solution yet to this.
1417 		 */
1418 		if (!page_tryupgrade(pp)) {
1419 			page_unlock(pp);
1420 			return (EAGAIN);
1421 		}
1422 	}
1423 
1424 	ASSERT(PAGE_EXCL(pp));
1425 
1426 	if (PP_ISFREE(pp)) {
1427 		int which = PP_ISAGED(pp) ? PG_FREE_LIST : PG_CACHE_LIST;
1428 
1429 		page_list_sub(pp, which);
1430 		ASSERT(pp->p_szc == 0);
1431 		PP_SETNORELOC(pp);
1432 		PLCNT_XFER_NORELOC(pp);
1433 		page_list_add(pp, which | PG_LIST_TAIL);
1434 
1435 		page_unlock(pp);
1436 		*nfreedp = 1;
1437 		return (0);
1438 	} else {
1439 		if (pp->p_szc != 0) {
1440 			if (!kcage_setnoreloc_pages(pp, SE_EXCL)) {
1441 				page_unlock(pp);
1442 				return (EAGAIN);
1443 			}
1444 			ASSERT(PP_ISNORELOC(pp));
1445 		} else {
1446 			PP_SETNORELOC(pp);
1447 		}
1448 		PLCNT_XFER_NORELOC(pp);
1449 		return (kcage_invalidate_page(pp, nfreedp));
1450 	}
1451 	/*NOTREACHED*/
1452 }
1453 
1454 static int
1455 kcage_expand()
1456 {
1457 	int did_something = 0;
1458 
1459 	spgcnt_t wanted;
1460 	pfn_t pfn;
1461 	page_t *pp;
1462 	/* TODO: we don't really need n any more? */
1463 	pgcnt_t n;
1464 	pgcnt_t nf, nfreed;
1465 
1466 	/*
1467 	 * Expand the cage if available cage memory is really low. Calculate
1468 	 * the amount required to return kcage_freemem to the level of
1469 	 * kcage_lotsfree, or to satisfy throttled requests, whichever is
1470 	 * more.  It is rare for their sum to create an artificial threshold
1471 	 * above kcage_lotsfree, but it is possible.
1472 	 *
1473 	 * Exit early if expansion amount is equal to or less than zero.
1474 	 * (<0 is possible if kcage_freemem rises suddenly.)
1475 	 *
1476 	 * Exit early when the global page pool (apparently) does not
1477 	 * have enough free pages to page_relocate() even a single page.
1478 	 */
1479 	wanted = MAX(kcage_lotsfree, kcage_throttlefree + kcage_needfree)
1480 	    - kcage_freemem;
1481 	if (wanted <= 0)
1482 		return (0);
1483 	else if (freemem < pageout_reserve + 1) {
1484 		KCAGE_STAT_INCR(ke_lowfreemem);
1485 		return (0);
1486 	}
1487 
1488 	KCAGE_STAT_INCR(ke_calls);
1489 	KCAGE_STAT_SET_SCAN(ke_wanted, (uint_t)wanted);
1490 
1491 	/*
1492 	 * Assimilate more pages from the global page pool into the cage.
1493 	 */
1494 	n = 0;				/* number of pages PP_SETNORELOC'd */
1495 	nf = 0;				/* number of those actually free */
1496 	while (kcage_on && nf < wanted) {
1497 		pfn = kcage_get_pfn(1);
1498 		if (pfn == PFN_INVALID) {	/* eek! no where to grow */
1499 			KCAGE_STAT_INCR(ke_nopfn);
1500 			goto terminate;
1501 		}
1502 
1503 		KCAGE_STAT_INCR_SCAN(ke_examined);
1504 
1505 		if ((pp = page_numtopp_nolock(pfn)) == NULL) {
1506 			KCAGE_STAT_INCR(ke_nopaget);
1507 			continue;
1508 		}
1509 		KCAGEPAGETS_INC();
1510 		/*
1511 		 * Sanity check. Skip this pfn if it is
1512 		 * being deleted.
1513 		 */
1514 		if (pfn_is_being_deleted(pfn)) {
1515 			KCAGE_STAT_INCR(ke_deleting);
1516 			continue;
1517 		}
1518 
1519 		if (PP_ISNORELOC(pp)) {
1520 			KCAGE_STAT_INCR(ke_isnoreloc);
1521 			continue;
1522 		}
1523 
1524 		switch (kcage_assimilate_page(pp, &nfreed)) {
1525 			case 0:		/* assimilated, page is free */
1526 				KCAGE_STAT_NINCR_SCAN(ke_gotonefree, nfreed);
1527 				did_something = 1;
1528 				nf += nfreed;
1529 				n++;
1530 				break;
1531 
1532 			case EBUSY:	/* assimilated, page not free */
1533 			case ERANGE:	/* assimilated, page not root */
1534 				KCAGE_STAT_INCR_SCAN(ke_gotone);
1535 				did_something = 1;
1536 				n++;
1537 				break;
1538 
1539 			case ENOMEM:	/* assimilated, but no mem */
1540 				KCAGE_STAT_INCR(ke_terminate);
1541 				did_something = 1;
1542 				n++;
1543 				goto terminate;
1544 
1545 			case EAGAIN:	/* can't assimilate */
1546 				KCAGE_STAT_INCR_SCAN(ke_lefthole);
1547 				break;
1548 
1549 			default:	/* catch this with debug kernels */
1550 				ASSERT(0);
1551 				break;
1552 		}
1553 	}
1554 
1555 	/*
1556 	 * Realign cage edge with the nearest physical address
1557 	 * boundry for big pages. This is done to give us a
1558 	 * better chance of actually getting usable big pages
1559 	 * in the cage.
1560 	 */
1561 
1562 terminate:
1563 
1564 	return (did_something);
1565 }
1566 
1567 /*
1568  * Relocate page opp (Original Page Pointer) from cage pool to page rpp
1569  * (Replacement Page Pointer) in the global pool. Page opp will be freed
1570  * if relocation is successful, otherwise it is only unlocked.
1571  * On entry, page opp must be exclusively locked and not free.
1572  * *nfreedp: number of pages freed.
1573  */
1574 static int
1575 kcage_relocate_page(page_t *pp, pgcnt_t *nfreedp)
1576 {
1577 	page_t *opp = pp;
1578 	page_t *rpp = NULL;
1579 	spgcnt_t npgs;
1580 	int result;
1581 
1582 	ASSERT(!PP_ISFREE(opp));
1583 	ASSERT(PAGE_EXCL(opp));
1584 
1585 	result = page_relocate(&opp, &rpp, 1, 1, &npgs, NULL);
1586 	*nfreedp = npgs;
1587 	if (result == 0) {
1588 		while (npgs-- > 0) {
1589 			page_t *tpp;
1590 
1591 			ASSERT(rpp != NULL);
1592 			tpp = rpp;
1593 			page_sub(&rpp, tpp);
1594 			page_unlock(tpp);
1595 		}
1596 
1597 		ASSERT(rpp == NULL);
1598 
1599 		return (0);		/* success */
1600 	}
1601 
1602 	page_unlock(opp);
1603 	return (result);
1604 }
1605 
1606 /*
1607  * Based on page_invalidate_pages()
1608  *
1609  * Kcage_invalidate_page() uses page_relocate() twice. Both instances
1610  * of use must be updated to match the new page_relocate() when it
1611  * becomes available.
1612  *
1613  * Return result of kcage_relocate_page or zero if page was directly freed.
1614  * *nfreedp: number of pages freed.
1615  */
1616 static int
1617 kcage_invalidate_page(page_t *pp, pgcnt_t *nfreedp)
1618 {
1619 	int result;
1620 
1621 #if defined(__sparc)
1622 	extern struct vnode prom_ppages;
1623 	ASSERT(pp->p_vnode != &prom_ppages);
1624 #endif /* __sparc */
1625 
1626 	ASSERT(!PP_ISFREE(pp));
1627 	ASSERT(PAGE_EXCL(pp));
1628 
1629 	/*
1630 	 * Is this page involved in some I/O? shared?
1631 	 * The page_struct_lock need not be acquired to
1632 	 * examine these fields since the page has an
1633 	 * "exclusive" lock.
1634 	 */
1635 	if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1636 		result = kcage_relocate_page(pp, nfreedp);
1637 #ifdef KCAGE_STATS
1638 		if (result == 0)
1639 			KCAGE_STAT_INCR_SCAN(kip_reloclocked);
1640 		else if (result == ENOMEM)
1641 			KCAGE_STAT_INCR_SCAN(kip_nomem);
1642 #endif
1643 		return (result);
1644 	}
1645 
1646 	ASSERT(pp->p_vnode->v_type != VCHR);
1647 
1648 	/*
1649 	 * Unload the mappings and check if mod bit is set.
1650 	 */
1651 	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1652 
1653 	if (hat_ismod(pp)) {
1654 		result = kcage_relocate_page(pp, nfreedp);
1655 #ifdef KCAGE_STATS
1656 		if (result == 0)
1657 			KCAGE_STAT_INCR_SCAN(kip_relocmod);
1658 		else if (result == ENOMEM)
1659 			KCAGE_STAT_INCR_SCAN(kip_nomem);
1660 #endif
1661 		return (result);
1662 	}
1663 
1664 	if (!page_try_demote_pages(pp)) {
1665 		KCAGE_STAT_INCR_SCAN(kip_demotefailed);
1666 		page_unlock(pp);
1667 		return (EAGAIN);
1668 	}
1669 
1670 	/* LINTED: constant in conditional context */
1671 	VN_DISPOSE(pp, B_INVAL, 0, kcred);
1672 	KCAGE_STAT_INCR_SCAN(kip_destroy);
1673 	*nfreedp = 1;
1674 	return (0);
1675 }
1676 
1677 static void
1678 kcage_cageout()
1679 {
1680 	pfn_t pfn;
1681 	page_t *pp;
1682 	callb_cpr_t cprinfo;
1683 	int did_something;
1684 	int scan_again;
1685 	pfn_t start_pfn;
1686 	int pass;
1687 	int last_pass;
1688 	int pages_skipped;
1689 	int shared_skipped;
1690 	ulong_t shared_level = 8;
1691 	pgcnt_t nfreed;
1692 #ifdef KCAGE_STATS
1693 	clock_t scan_start;
1694 #endif
1695 
1696 	CALLB_CPR_INIT(&cprinfo, &kcage_cageout_mutex,
1697 	    callb_generic_cpr, "cageout");
1698 
1699 	mutex_enter(&kcage_cageout_mutex);
1700 	kcage_cageout_thread = curthread;
1701 
1702 	pfn = PFN_INVALID;		/* force scan reset */
1703 	start_pfn = PFN_INVALID;	/* force init with 1st cage pfn */
1704 	kcage_cageout_ready = 1;	/* switch kcage_cageout_wakeup mode */
1705 
1706 loop:
1707 	/*
1708 	 * Wait here. Sooner or later, kcage_freemem_sub() will notice
1709 	 * that kcage_freemem is less than kcage_desfree. When it does
1710 	 * notice, kcage_freemem_sub() will wake us up via call to
1711 	 * kcage_cageout_wakeup().
1712 	 */
1713 	CALLB_CPR_SAFE_BEGIN(&cprinfo);
1714 	cv_wait(&kcage_cageout_cv, &kcage_cageout_mutex);
1715 	CALLB_CPR_SAFE_END(&cprinfo, &kcage_cageout_mutex);
1716 
1717 	KCAGE_STAT_INCR(kt_wakeups);
1718 	KCAGE_STAT_SET_SCAN(kt_freemem_start, freemem);
1719 	KCAGE_STAT_SET_SCAN(kt_kcage_freemem_start, kcage_freemem);
1720 	pass = 0;
1721 	last_pass = 0;
1722 
1723 #ifdef KCAGE_STATS
1724 	scan_start = ddi_get_lbolt();
1725 #endif
1726 
1727 again:
1728 	if (!kcage_on)
1729 		goto loop;
1730 
1731 	KCAGE_STAT_INCR(kt_scans);
1732 	KCAGE_STAT_INCR_SCAN(kt_passes);
1733 
1734 	did_something = 0;
1735 	pages_skipped = 0;
1736 	shared_skipped = 0;
1737 	while ((kcage_freemem < kcage_lotsfree || kcage_needfree) &&
1738 	    (pfn = kcage_walk_cage(pfn == PFN_INVALID)) != PFN_INVALID) {
1739 
1740 		if (start_pfn == PFN_INVALID)
1741 			start_pfn = pfn;
1742 		else if (start_pfn == pfn) {
1743 			last_pass = pass;
1744 			pass += 1;
1745 			/*
1746 			 * Did a complete walk of kernel cage, but didn't free
1747 			 * any pages.  If only one cpu is active then
1748 			 * stop kernel cage walk and try expanding.
1749 			 */
1750 			if (cp_default.cp_ncpus == 1 && did_something == 0) {
1751 				KCAGE_STAT_INCR(kt_cageout_break);
1752 				break;
1753 			}
1754 		}
1755 
1756 		pp = page_numtopp_nolock(pfn);
1757 		if (pp == NULL) {
1758 			continue;
1759 		}
1760 
1761 		KCAGE_STAT_INCR_SCAN(kt_examined);
1762 
1763 		/*
1764 		 * Do a quick PP_ISNORELOC() and PP_ISFREE test outside
1765 		 * of the lock. If one is missed it will be seen next
1766 		 * time through.
1767 		 *
1768 		 * Skip non-caged-pages. These pages can exist in the cage
1769 		 * because, if during cage expansion, a page is
1770 		 * encountered that is long-term locked the lock prevents the
1771 		 * expansion logic from setting the P_NORELOC flag. Hence,
1772 		 * non-caged-pages surrounded by caged-pages.
1773 		 */
1774 		if (!PP_ISNORELOC(pp)) {
1775 			switch (kcage_assimilate_page(pp, &nfreed)) {
1776 				case 0:
1777 					did_something = 1;
1778 					KCAGE_STAT_NINCR_SCAN(kt_gotonefree,
1779 					    nfreed);
1780 					break;
1781 
1782 				case EBUSY:
1783 				case ERANGE:
1784 					did_something = 1;
1785 					KCAGE_STAT_INCR_SCAN(kt_gotone);
1786 					break;
1787 
1788 				case EAGAIN:
1789 				case ENOMEM:
1790 					break;
1791 
1792 				default:
1793 					/* catch this with debug kernels */
1794 					ASSERT(0);
1795 					break;
1796 			}
1797 
1798 			continue;
1799 		} else {
1800 			int prm;
1801 
1802 			if (PP_ISFREE(pp)) {
1803 				continue;
1804 			}
1805 
1806 			if ((PP_ISKAS(pp) && pp->p_lckcnt > 0) ||
1807 			    !page_trylock(pp, SE_EXCL)) {
1808 				KCAGE_STAT_INCR_SCAN(kt_cantlock);
1809 				continue;
1810 			}
1811 
1812 			/* P_NORELOC bit should not have gone away. */
1813 			ASSERT(PP_ISNORELOC(pp));
1814 			if (PP_ISFREE(pp) || (PP_ISKAS(pp) &&
1815 			    pp->p_lckcnt > 0)) {
1816 				page_unlock(pp);
1817 				continue;
1818 			}
1819 
1820 			KCAGE_STAT_SET_SCAN(kt_skiplevel, shared_level);
1821 			if (hat_page_checkshare(pp, shared_level)) {
1822 				page_unlock(pp);
1823 				pages_skipped = 1;
1824 				shared_skipped = 1;
1825 				KCAGE_STAT_INCR_SCAN(kt_skipshared);
1826 				continue;
1827 			}
1828 
1829 			/*
1830 			 * In pass {0, 1}, skip page if ref bit is set.
1831 			 * In pass {0, 1, 2}, skip page if mod bit is set.
1832 			 */
1833 			prm = hat_pagesync(pp,
1834 			    HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD);
1835 
1836 			/* On first pass ignore ref'd pages */
1837 			if (pass <= 1 && (prm & P_REF)) {
1838 				KCAGE_STAT_INCR_SCAN(kt_skiprefd);
1839 				pages_skipped = 1;
1840 				page_unlock(pp);
1841 				continue;
1842 			}
1843 
1844 			/* On pass 2, VN_DISPOSE if mod bit is not set */
1845 			if (pass <= 2) {
1846 				if (pp->p_szc != 0 || (prm & P_MOD) ||
1847 				    pp->p_lckcnt || pp->p_cowcnt) {
1848 					pages_skipped = 1;
1849 					page_unlock(pp);
1850 				} else {
1851 
1852 					/*
1853 					 * unload the mappings before
1854 					 * checking if mod bit is set
1855 					 */
1856 					(void) hat_pageunload(pp,
1857 					    HAT_FORCE_PGUNLOAD);
1858 
1859 					/*
1860 					 * skip this page if modified
1861 					 */
1862 					if (hat_ismod(pp)) {
1863 						pages_skipped = 1;
1864 						page_unlock(pp);
1865 						continue;
1866 					}
1867 
1868 					KCAGE_STAT_INCR_SCAN(kt_destroy);
1869 					/* constant in conditional context */
1870 					/* LINTED */
1871 					VN_DISPOSE(pp, B_INVAL, 0, kcred);
1872 					did_something = 1;
1873 				}
1874 				continue;
1875 			}
1876 
1877 			if (kcage_invalidate_page(pp, &nfreed) == 0) {
1878 				did_something = 1;
1879 				KCAGE_STAT_NINCR_SCAN(kt_gotonefree, nfreed);
1880 			}
1881 
1882 			/*
1883 			 * No need to drop the page lock here.
1884 			 * Kcage_invalidate_page has done that for us
1885 			 * either explicitly or through a page_free.
1886 			 */
1887 		}
1888 	}
1889 
1890 	/*
1891 	 * Expand the cage only if available cage memory is really low.
1892 	 * This test is done only after a complete scan of the cage.
1893 	 * The reason for not checking and expanding more often is to
1894 	 * avoid rapid expansion of the cage. Naturally, scanning the
1895 	 * cage takes time. So by scanning first, we use that work as a
1896 	 * delay loop in between expand decisions.
1897 	 */
1898 
1899 	scan_again = 0;
1900 	if (kcage_freemem < kcage_minfree || kcage_needfree) {
1901 		/*
1902 		 * Kcage_expand() will return a non-zero value if it was
1903 		 * able to expand the cage -- whether or not the new
1904 		 * pages are free and immediately usable. If non-zero,
1905 		 * we do another scan of the cage. The pages might be
1906 		 * freed during that scan or by time we get back here.
1907 		 * If not, we will attempt another expansion.
1908 		 * However, if kcage_expand() returns zero, then it was
1909 		 * unable to expand the cage. This is the case when the
1910 		 * the growth list is exausted, therefore no work was done
1911 		 * and there is no reason to scan the cage again.
1912 		 * Note: Kernel cage scan is not repeated when only one
1913 		 * cpu is active to avoid kernel cage thread hogging cpu.
1914 		 */
1915 		if (pass <= 3 && pages_skipped && cp_default.cp_ncpus > 1)
1916 			scan_again = 1;
1917 		else
1918 			(void) kcage_expand(); /* don't scan again */
1919 	} else if (kcage_freemem < kcage_lotsfree) {
1920 		/*
1921 		 * If available cage memory is less than abundant
1922 		 * and a full scan of the cage has not yet been completed,
1923 		 * or a scan has completed and some work was performed,
1924 		 * or pages were skipped because of sharing,
1925 		 * or we simply have not yet completed two passes,
1926 		 * then do another scan.
1927 		 */
1928 		if (pass <= 2 && pages_skipped)
1929 			scan_again = 1;
1930 		if (pass == last_pass || did_something)
1931 			scan_again = 1;
1932 		else if (shared_skipped && shared_level < (8<<24)) {
1933 			shared_level <<= 1;
1934 			scan_again = 1;
1935 		}
1936 	}
1937 
1938 	if (scan_again && cp_default.cp_ncpus > 1)
1939 		goto again;
1940 	else {
1941 		if (shared_level > 8)
1942 			shared_level >>= 1;
1943 
1944 		KCAGE_STAT_SET_SCAN(kt_freemem_end, freemem);
1945 		KCAGE_STAT_SET_SCAN(kt_kcage_freemem_end, kcage_freemem);
1946 		KCAGE_STAT_SET_SCAN(kt_ticks, ddi_get_lbolt() - scan_start);
1947 		KCAGE_STAT_INC_SCAN_INDEX;
1948 		goto loop;
1949 	}
1950 
1951 	/*NOTREACHED*/
1952 }
1953 
1954 void
1955 kcage_cageout_wakeup()
1956 {
1957 	if (mutex_tryenter(&kcage_cageout_mutex)) {
1958 		if (kcage_cageout_ready) {
1959 			cv_signal(&kcage_cageout_cv);
1960 		} else if (kcage_freemem < kcage_minfree || kcage_needfree) {
1961 			/*
1962 			 * Available cage memory is really low. Time to
1963 			 * start expanding the cage. However, the
1964 			 * kernel cage thread is not yet ready to
1965 			 * do the work. Use *this* thread, which is
1966 			 * most likely to be t0, to do the work.
1967 			 */
1968 			KCAGE_STAT_INCR(kcw_expandearly);
1969 			(void) kcage_expand();
1970 			KCAGE_STAT_INC_SCAN_INDEX;
1971 		}
1972 
1973 		mutex_exit(&kcage_cageout_mutex);
1974 	}
1975 	/* else, kernel cage thread is already running */
1976 }
1977 
1978 void
1979 kcage_tick()
1980 {
1981 	/*
1982 	 * Once per second we wake up all the threads throttled
1983 	 * waiting for cage memory, in case we've become stuck
1984 	 * and haven't made forward progress expanding the cage.
1985 	 */
1986 	if (kcage_on && kcage_cageout_ready)
1987 		cv_broadcast(&kcage_throttle_cv);
1988 }
1989