xref: /titanic_50/usr/src/uts/common/os/mem_cage.c (revision 6899cf3fadefdc866b7d37382f3631ea45a8f6fb)
17c478bd9Sstevel@tonic-gate /*
27c478bd9Sstevel@tonic-gate  * CDDL HEADER START
37c478bd9Sstevel@tonic-gate  *
47c478bd9Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
5ee88d2b9Skchow  * Common Development and Distribution License (the "License").
6ee88d2b9Skchow  * You may not use this file except in compliance with the License.
77c478bd9Sstevel@tonic-gate  *
87c478bd9Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
97c478bd9Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
107c478bd9Sstevel@tonic-gate  * See the License for the specific language governing permissions
117c478bd9Sstevel@tonic-gate  * and limitations under the License.
127c478bd9Sstevel@tonic-gate  *
137c478bd9Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
147c478bd9Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
157c478bd9Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
167c478bd9Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
177c478bd9Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
187c478bd9Sstevel@tonic-gate  *
197c478bd9Sstevel@tonic-gate  * CDDL HEADER END
207c478bd9Sstevel@tonic-gate  */
217c478bd9Sstevel@tonic-gate /*
22*23a80de1SStan Studzinski  * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
237c478bd9Sstevel@tonic-gate  */
247c478bd9Sstevel@tonic-gate 
257c478bd9Sstevel@tonic-gate #include <sys/types.h>
267c478bd9Sstevel@tonic-gate #include <sys/param.h>
277c478bd9Sstevel@tonic-gate #include <sys/thread.h>
287c478bd9Sstevel@tonic-gate #include <sys/proc.h>
297c478bd9Sstevel@tonic-gate #include <sys/callb.h>
307c478bd9Sstevel@tonic-gate #include <sys/vnode.h>
317c478bd9Sstevel@tonic-gate #include <sys/debug.h>
327c478bd9Sstevel@tonic-gate #include <sys/systm.h>		/* for bzero */
337c478bd9Sstevel@tonic-gate #include <sys/memlist.h>
347c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h>
357c478bd9Sstevel@tonic-gate #include <sys/sysmacros.h>
367c478bd9Sstevel@tonic-gate #include <sys/vmsystm.h>	/* for NOMEMWAIT() */
377c478bd9Sstevel@tonic-gate #include <sys/atomic.h>		/* used to update kcage_freemem */
387c478bd9Sstevel@tonic-gate #include <sys/kmem.h>		/* for kmem_reap */
397c478bd9Sstevel@tonic-gate #include <sys/errno.h>
407c478bd9Sstevel@tonic-gate #include <sys/mem_cage.h>
417c478bd9Sstevel@tonic-gate #include <vm/seg_kmem.h>
427c478bd9Sstevel@tonic-gate #include <vm/page.h>
437c478bd9Sstevel@tonic-gate #include <vm/hat.h>
44affbd3ccSkchow #include <vm/vm_dep.h>
457c478bd9Sstevel@tonic-gate #include <sys/mem_config.h>
467c478bd9Sstevel@tonic-gate #include <sys/lgrp.h>
475d07b933Sdp78419 #include <sys/rwlock.h>
48dc84a327Svb70745 #include <sys/cpupart.h>
497c478bd9Sstevel@tonic-gate 
507c478bd9Sstevel@tonic-gate extern pri_t maxclsyspri;
517c478bd9Sstevel@tonic-gate 
527c478bd9Sstevel@tonic-gate #ifdef DEBUG
537c478bd9Sstevel@tonic-gate #define	KCAGE_STATS
547c478bd9Sstevel@tonic-gate #endif
557c478bd9Sstevel@tonic-gate 
567c478bd9Sstevel@tonic-gate #ifdef KCAGE_STATS
577c478bd9Sstevel@tonic-gate 
587c478bd9Sstevel@tonic-gate #define	KCAGE_STATS_VERSION 9	/* can help report generators */
597c478bd9Sstevel@tonic-gate #define	KCAGE_STATS_NSCANS 256	/* depth of scan statistics buffer */
607c478bd9Sstevel@tonic-gate 
/*
 * Per-scan statistics record. One record is filled in per cageout pass;
 * a ring of KCAGE_STATS_NSCANS of these lives in struct kcage_stats.
 * Field prefixes identify the writer: kt_ = kcage_cageout(),
 * kip_ = kcage_invalidate_page(), ke_ = kcage_expand().
 */
struct kcage_stats_scan {
	/* managed by KCAGE_STAT_* macros */
	clock_t	scan_lbolt;	/* lbolt when this scan record was closed */
	uint_t	scan_id;	/* index of this record at close time */

	/* set in kcage_cageout() */
	uint_t	kt_passes;
	clock_t	kt_ticks;
	pgcnt_t	kt_kcage_freemem_start;
	pgcnt_t	kt_kcage_freemem_end;
	pgcnt_t kt_freemem_start;
	pgcnt_t kt_freemem_end;
	uint_t	kt_examined;
	uint_t	kt_cantlock;
	uint_t	kt_gotone;
	uint_t	kt_gotonefree;
	uint_t	kt_skipshared;
	uint_t	kt_skiprefd;
	uint_t	kt_destroy;

	/* set in kcage_invalidate_page() */
	uint_t	kip_reloclocked;
	uint_t	kip_relocmod;
	uint_t	kip_destroy;
	uint_t	kip_nomem;
	uint_t	kip_demotefailed;

	/* set in kcage_expand() */
	uint_t	ke_wanted;
	uint_t	ke_examined;
	uint_t	ke_lefthole;
	uint_t	ke_gotone;
	uint_t	ke_gotonefree;
};
957c478bd9Sstevel@tonic-gate 
/*
 * Cumulative cage statistics plus the ring buffer of per-scan records.
 * Only compiled in when KCAGE_STATS (i.e. DEBUG) is defined. Updated
 * mostly by the cage thread via the KCAGE_STAT_* macros; see the
 * comment above those macros regarding the deliberate lack of atomics.
 */
struct kcage_stats {
	/* managed by KCAGE_STAT_* macros */
	uint_t	version;	/* KCAGE_STATS_VERSION, for report generators */
	uint_t	size;		/* sizeof (kcage_stats) */

	/* set in kcage_cageout */
	uint_t	kt_wakeups;
	uint_t	kt_scans;
	uint_t	kt_cageout_break;

	/* set in kcage_expand */
	uint_t	ke_calls;
	uint_t	ke_nopfn;
	uint_t	ke_nopaget;
	uint_t	ke_isnoreloc;
	uint_t	ke_deleting;
	uint_t	ke_lowfreemem;
	uint_t	ke_terminate;

	/* set in kcage_freemem_add() */
	uint_t	kfa_trottlewake;	/* throttle-waiter wakeups (sic) */

	/* set in kcage_freemem_sub() */
	uint_t	kfs_cagewake;

	/* set in kcage_create_throttle */
	uint_t	kct_calls;
	uint_t	kct_cageout;
	uint_t	kct_critical;
	uint_t	kct_exempt;
	uint_t	kct_cagewake;
	uint_t	kct_wait;
	uint_t	kct_progress;
	uint_t	kct_noprogress;
	uint_t	kct_timeout;

	/* set in kcage_cageout_wakeup */
	uint_t	kcw_expandearly;

	/* managed by KCAGE_STAT_* macros */
	uint_t	scan_array_size;	/* KCAGE_STATS_NSCANS */
	uint_t	scan_index;		/* current slot in scans[] ring */
	struct kcage_stats_scan scans[KCAGE_STATS_NSCANS];
};
1407c478bd9Sstevel@tonic-gate 
1417c478bd9Sstevel@tonic-gate static struct kcage_stats kcage_stats;
1427c478bd9Sstevel@tonic-gate static struct kcage_stats_scan kcage_stats_scan_zero;
1437c478bd9Sstevel@tonic-gate 
1447c478bd9Sstevel@tonic-gate /*
1457c478bd9Sstevel@tonic-gate  * No real need for atomics here. For the most part the incs and sets are
1467c478bd9Sstevel@tonic-gate  * done by the kernel cage thread. There are a few that are done by any
1477c478bd9Sstevel@tonic-gate  * number of other threads. Those cases are noted by comments.
1487c478bd9Sstevel@tonic-gate  */
1497c478bd9Sstevel@tonic-gate #define	KCAGE_STAT_INCR(m)	kcage_stats.m++
1507c478bd9Sstevel@tonic-gate 
1517c478bd9Sstevel@tonic-gate #define	KCAGE_STAT_NINCR(m, v) kcage_stats.m += (v)
1527c478bd9Sstevel@tonic-gate 
1537c478bd9Sstevel@tonic-gate #define	KCAGE_STAT_INCR_SCAN(m)	\
1547c478bd9Sstevel@tonic-gate 	KCAGE_STAT_INCR(scans[kcage_stats.scan_index].m)
1557c478bd9Sstevel@tonic-gate 
1567c478bd9Sstevel@tonic-gate #define	KCAGE_STAT_NINCR_SCAN(m, v) \
1577c478bd9Sstevel@tonic-gate 	KCAGE_STAT_NINCR(scans[kcage_stats.scan_index].m, v)
1587c478bd9Sstevel@tonic-gate 
1597c478bd9Sstevel@tonic-gate #define	KCAGE_STAT_SET(m, v)	kcage_stats.m = (v)
1607c478bd9Sstevel@tonic-gate 
1617c478bd9Sstevel@tonic-gate #define	KCAGE_STAT_SETZ(m, v)	\
1627c478bd9Sstevel@tonic-gate 	if (kcage_stats.m == 0) kcage_stats.m = (v)
1637c478bd9Sstevel@tonic-gate 
1647c478bd9Sstevel@tonic-gate #define	KCAGE_STAT_SET_SCAN(m, v)	\
1657c478bd9Sstevel@tonic-gate 	KCAGE_STAT_SET(scans[kcage_stats.scan_index].m, v)
1667c478bd9Sstevel@tonic-gate 
1677c478bd9Sstevel@tonic-gate #define	KCAGE_STAT_SETZ_SCAN(m, v)	\
1687c478bd9Sstevel@tonic-gate 	KCAGE_STAT_SETZ(scans[kcage_stats.scan_index].m, v)
1697c478bd9Sstevel@tonic-gate 
1707c478bd9Sstevel@tonic-gate #define	KCAGE_STAT_INC_SCAN_INDEX \
171d3d50737SRafael Vanoni 	KCAGE_STAT_SET_SCAN(scan_lbolt, ddi_get_lbolt()); \
1727c478bd9Sstevel@tonic-gate 	KCAGE_STAT_SET_SCAN(scan_id, kcage_stats.scan_index); \
1737c478bd9Sstevel@tonic-gate 	kcage_stats.scan_index = \
1747c478bd9Sstevel@tonic-gate 	(kcage_stats.scan_index + 1) % KCAGE_STATS_NSCANS; \
1757c478bd9Sstevel@tonic-gate 	kcage_stats.scans[kcage_stats.scan_index] = kcage_stats_scan_zero
1767c478bd9Sstevel@tonic-gate 
1777c478bd9Sstevel@tonic-gate #define	KCAGE_STAT_INIT_SCAN_INDEX \
1787c478bd9Sstevel@tonic-gate 	kcage_stats.version = KCAGE_STATS_VERSION; \
1797c478bd9Sstevel@tonic-gate 	kcage_stats.size = sizeof (kcage_stats); \
1807c478bd9Sstevel@tonic-gate 	kcage_stats.scan_array_size = KCAGE_STATS_NSCANS; \
1817c478bd9Sstevel@tonic-gate 	kcage_stats.scan_index = 0
1827c478bd9Sstevel@tonic-gate 
1837c478bd9Sstevel@tonic-gate #else /* KCAGE_STATS */
1847c478bd9Sstevel@tonic-gate 
1857c478bd9Sstevel@tonic-gate #define	KCAGE_STAT_INCR(v)
1867c478bd9Sstevel@tonic-gate #define	KCAGE_STAT_NINCR(m, v)
1877c478bd9Sstevel@tonic-gate #define	KCAGE_STAT_INCR_SCAN(v)
1887c478bd9Sstevel@tonic-gate #define	KCAGE_STAT_NINCR_SCAN(m, v)
1897c478bd9Sstevel@tonic-gate #define	KCAGE_STAT_SET(m, v)
1907c478bd9Sstevel@tonic-gate #define	KCAGE_STAT_SETZ(m, v)
1917c478bd9Sstevel@tonic-gate #define	KCAGE_STAT_SET_SCAN(m, v)
1927c478bd9Sstevel@tonic-gate #define	KCAGE_STAT_SETZ_SCAN(m, v)
1937c478bd9Sstevel@tonic-gate #define	KCAGE_STAT_INC_SCAN_INDEX
1947c478bd9Sstevel@tonic-gate #define	KCAGE_STAT_INIT_SCAN_INDEX
1957c478bd9Sstevel@tonic-gate 
1967c478bd9Sstevel@tonic-gate #endif /* KCAGE_STATS */
1977c478bd9Sstevel@tonic-gate 
1987c478bd9Sstevel@tonic-gate static kmutex_t kcage_throttle_mutex;	/* protects kcage_throttle_cv */
1997c478bd9Sstevel@tonic-gate static kcondvar_t kcage_throttle_cv;
2007c478bd9Sstevel@tonic-gate 
2017c478bd9Sstevel@tonic-gate static kmutex_t kcage_cageout_mutex;	/* protects cv and ready flag */
2027c478bd9Sstevel@tonic-gate static kcondvar_t kcage_cageout_cv;	/* cageout thread naps here */
2037c478bd9Sstevel@tonic-gate static int kcage_cageout_ready;		/* nonzero when cageout thread ready */
2047c478bd9Sstevel@tonic-gate kthread_id_t kcage_cageout_thread;	/* to aid debugging */
2057c478bd9Sstevel@tonic-gate 
2065d07b933Sdp78419 static krwlock_t kcage_range_rwlock;	/* protects kcage_glist elements */
2077c478bd9Sstevel@tonic-gate 
2087c478bd9Sstevel@tonic-gate /*
2097c478bd9Sstevel@tonic-gate  * Cage expansion happens within a range.
2107c478bd9Sstevel@tonic-gate  */
struct kcage_glist {
	struct kcage_glist	*next;	/* next range on the growth list */
	pfn_t			base;	/* first pfn of the range */
	pfn_t			lim;	/* one past the last pfn */
	pfn_t			curr;	/* current growth position in [base, lim] */
	int			decr;	/* nonzero: cage grows downward from lim */
};
2187c478bd9Sstevel@tonic-gate 
2197c478bd9Sstevel@tonic-gate static struct kcage_glist *kcage_glist;
2207c478bd9Sstevel@tonic-gate static struct kcage_glist *kcage_current_glist;
2217c478bd9Sstevel@tonic-gate 
2227c478bd9Sstevel@tonic-gate /*
2237c478bd9Sstevel@tonic-gate  * The firstfree element is provided so that kmem_alloc can be avoided
2247c478bd9Sstevel@tonic-gate  * until that cage has somewhere to go. This is not currently a problem
2257c478bd9Sstevel@tonic-gate  * as early kmem_alloc's use BOP_ALLOC instead of page_create_va.
2267c478bd9Sstevel@tonic-gate  */
22785f58038Sdp78419 static vmem_t *kcage_arena;
2287c478bd9Sstevel@tonic-gate static struct kcage_glist kcage_glist_firstfree;
2297c478bd9Sstevel@tonic-gate static struct kcage_glist *kcage_glist_freelist = &kcage_glist_firstfree;
2307c478bd9Sstevel@tonic-gate 
2317c478bd9Sstevel@tonic-gate /*
2327c478bd9Sstevel@tonic-gate  * Miscellaneous forward references
2337c478bd9Sstevel@tonic-gate  */
2347c478bd9Sstevel@tonic-gate static struct kcage_glist *kcage_glist_alloc(void);
2357c478bd9Sstevel@tonic-gate static int kcage_glist_delete(pfn_t, pfn_t, struct kcage_glist **);
2367c478bd9Sstevel@tonic-gate static void kcage_cageout(void);
2377c478bd9Sstevel@tonic-gate static int kcage_invalidate_page(page_t *, pgcnt_t *);
2387c478bd9Sstevel@tonic-gate static int kcage_setnoreloc_pages(page_t *, se_t);
23985f58038Sdp78419 static int kcage_range_add_internal(pfn_t base, pgcnt_t npgs, kcage_dir_t);
24085f58038Sdp78419 static void kcage_init(pgcnt_t preferred_size);
24185f58038Sdp78419 static int kcage_range_delete_internal(pfn_t base, pgcnt_t npgs);
2427c478bd9Sstevel@tonic-gate 
2437c478bd9Sstevel@tonic-gate /*
2447c478bd9Sstevel@tonic-gate  * Kernel Memory Cage counters and thresholds.
2457c478bd9Sstevel@tonic-gate  */
2467c478bd9Sstevel@tonic-gate int kcage_on = 0;
2477c478bd9Sstevel@tonic-gate pgcnt_t kcage_freemem;
2487c478bd9Sstevel@tonic-gate pgcnt_t kcage_needfree;
2497c478bd9Sstevel@tonic-gate pgcnt_t kcage_lotsfree;
2507c478bd9Sstevel@tonic-gate pgcnt_t kcage_desfree;
2517c478bd9Sstevel@tonic-gate pgcnt_t kcage_minfree;
2527c478bd9Sstevel@tonic-gate pgcnt_t kcage_throttlefree;
253bc203165Svb70745 pgcnt_t	kcage_reserve;
2547c478bd9Sstevel@tonic-gate int kcage_maxwait = 10;	/* in seconds */
2557c478bd9Sstevel@tonic-gate 
2567c478bd9Sstevel@tonic-gate /* when we use lp for kmem we start the cage at a higher initial value */
2577c478bd9Sstevel@tonic-gate pgcnt_t kcage_kmemlp_mincage;
2587c478bd9Sstevel@tonic-gate 
2597c478bd9Sstevel@tonic-gate #ifdef DEBUG
2607c478bd9Sstevel@tonic-gate pgcnt_t	kcage_pagets;
2617c478bd9Sstevel@tonic-gate #define	KCAGEPAGETS_INC()	kcage_pagets++
2627c478bd9Sstevel@tonic-gate #else
2637c478bd9Sstevel@tonic-gate #define	KCAGEPAGETS_INC()
2647c478bd9Sstevel@tonic-gate #endif
2657c478bd9Sstevel@tonic-gate 
2668b464eb8Smec /* kstats to export what pages are currently caged */
2678b464eb8Smec kmutex_t kcage_kstat_lock;
2688b464eb8Smec static int kcage_kstat_update(kstat_t *ksp, int rw);
2698b464eb8Smec static int kcage_kstat_snapshot(kstat_t *ksp, void *buf, int rw);
2708b464eb8Smec 
2717c478bd9Sstevel@tonic-gate /*
2727c478bd9Sstevel@tonic-gate  * Startup and Dynamic Reconfiguration interfaces.
2737c478bd9Sstevel@tonic-gate  * kcage_range_add()
2747c478bd9Sstevel@tonic-gate  * kcage_range_del()
27585f58038Sdp78419  * kcage_range_delete_post_mem_del()
27685f58038Sdp78419  * kcage_range_init()
2777c478bd9Sstevel@tonic-gate  * kcage_set_thresholds()
2787c478bd9Sstevel@tonic-gate  */
2797c478bd9Sstevel@tonic-gate 
2805d07b933Sdp78419 /*
2817c478bd9Sstevel@tonic-gate  * Called from page_get_contig_pages to get the approximate kcage pfn range
2827c478bd9Sstevel@tonic-gate  * for exclusion from search for contiguous pages. This routine is called
2837c478bd9Sstevel@tonic-gate  * without kcage_range lock (kcage routines can call page_get_contig_pages
2847c478bd9Sstevel@tonic-gate  * through page_relocate) and with the assumption, based on kcage_range_add,
2857c478bd9Sstevel@tonic-gate  * that kcage_current_glist always contain a valid pointer.
2867c478bd9Sstevel@tonic-gate  */
2877c478bd9Sstevel@tonic-gate 
2887c478bd9Sstevel@tonic-gate int
kcage_current_pfn(pfn_t * pfncur)2897c478bd9Sstevel@tonic-gate kcage_current_pfn(pfn_t *pfncur)
2907c478bd9Sstevel@tonic-gate {
2917c478bd9Sstevel@tonic-gate 	struct kcage_glist *lp = kcage_current_glist;
2927c478bd9Sstevel@tonic-gate 
2937c478bd9Sstevel@tonic-gate 	ASSERT(kcage_on);
2947c478bd9Sstevel@tonic-gate 
2957c478bd9Sstevel@tonic-gate 	ASSERT(lp != NULL);
2967c478bd9Sstevel@tonic-gate 
2977c478bd9Sstevel@tonic-gate 	*pfncur = lp->curr;
2987c478bd9Sstevel@tonic-gate 
2997c478bd9Sstevel@tonic-gate 	return (lp->decr);
3007c478bd9Sstevel@tonic-gate }
3017c478bd9Sstevel@tonic-gate 
3025d07b933Sdp78419 /*
3035d07b933Sdp78419  * Called from vm_pagelist.c during coalesce to find kernel cage regions
3045d07b933Sdp78419  * within an mnode. Looks for the lowest range between lo and hi.
3055d07b933Sdp78419  *
3065d07b933Sdp78419  * Kernel cage memory is defined between kcage_glist and kcage_current_glist.
3075d07b933Sdp78419  * Non-cage memory is defined between kcage_current_glist and list end.
3085d07b933Sdp78419  *
3095d07b933Sdp78419  * If incage is set, returns the lowest kcage range. Otherwise returns lowest
3105d07b933Sdp78419  * non-cage range.
3115d07b933Sdp78419  *
3125d07b933Sdp78419  * Returns zero on success and nlo, nhi:
3135d07b933Sdp78419  * 	lo <= nlo < nhi <= hi
3145d07b933Sdp78419  * Returns non-zero if no overlapping range is found.
3155d07b933Sdp78419  */
int
kcage_next_range(int incage, pfn_t lo, pfn_t hi,
    pfn_t *nlo, pfn_t *nhi)
{
	struct kcage_glist *lp;
	pfn_t tlo = hi;		/* start of lowest overlap found so far */
	pfn_t thi = hi;		/* end of that overlap; tlo == thi => none */

	ASSERT(lo <= hi);

	/*
	 * Reader lock protects the list, but kcage_get_pfn
	 * running concurrently may advance kcage_current_glist
	 * and also update kcage_current_glist->curr. Page
	 * coalesce can handle this race condition.
	 */
	rw_enter(&kcage_range_rwlock, RW_READER);

	/*
	 * Cage memory lives between kcage_glist and kcage_current_glist;
	 * non-cage memory from kcage_current_glist to the list end.
	 */
	for (lp = incage ? kcage_glist : kcage_current_glist;
	    lp != NULL; lp = lp->next) {

		pfn_t klo, khi;

		/* find the range limits in this element */
		if ((incage && lp->decr) || (!incage && !lp->decr)) {
			klo = lp->curr;
			khi = lp->lim;
		} else {
			klo = lp->base;
			khi = lp->curr;
		}

		/* handle overlap */
		if (klo < tlo && klo < khi && lo < khi && klo < hi) {
			tlo = MAX(lo, klo);
			thi = MIN(hi, khi);
			if (tlo == lo)
				break;	/* can't find a lower start than lo */
		}

		/* check end of kcage */
		if (incage && lp == kcage_current_glist) {
			break;
		}
	}

	rw_exit(&kcage_range_rwlock);

	/* return non-zero if no overlapping range found */
	if (tlo == thi)
		return (1);

	ASSERT(lo <= tlo && tlo < thi && thi <= hi);

	/* return overlapping range */
	*nlo = tlo;
	*nhi = thi;
	return (0);
}
3755d07b933Sdp78419 
37685f58038Sdp78419 void
kcage_range_init(struct memlist * ml,kcage_dir_t d,pgcnt_t preferred_size)37785f58038Sdp78419 kcage_range_init(struct memlist *ml, kcage_dir_t d, pgcnt_t preferred_size)
3787c478bd9Sstevel@tonic-gate {
3797c478bd9Sstevel@tonic-gate 	int ret = 0;
3807c478bd9Sstevel@tonic-gate 
38185f58038Sdp78419 	ASSERT(kcage_arena == NULL);
38285f58038Sdp78419 	kcage_arena = vmem_create("kcage_arena", NULL, 0, sizeof (uint64_t),
38385f58038Sdp78419 	    segkmem_alloc, segkmem_free, heap_arena, 0, VM_SLEEP);
38485f58038Sdp78419 	ASSERT(kcage_arena != NULL);
3857c478bd9Sstevel@tonic-gate 
38685f58038Sdp78419 	if (d == KCAGE_DOWN) {
38756f33205SJonathan Adams 		while (ml->ml_next != NULL)
38856f33205SJonathan Adams 			ml = ml->ml_next;
3897c478bd9Sstevel@tonic-gate 	}
3907c478bd9Sstevel@tonic-gate 
39185f58038Sdp78419 	rw_enter(&kcage_range_rwlock, RW_WRITER);
3928c754b1bSdp78419 
39385f58038Sdp78419 	while (ml != NULL) {
39456f33205SJonathan Adams 		ret = kcage_range_add_internal(btop(ml->ml_address),
39556f33205SJonathan Adams 		    btop(ml->ml_size), d);
39685f58038Sdp78419 		if (ret)
39785f58038Sdp78419 			panic("kcage_range_add_internal failed: "
398903a11ebSrh87107 			    "ml=%p, ret=0x%x\n", (void *)ml, ret);
39985f58038Sdp78419 
40056f33205SJonathan Adams 		ml = (d == KCAGE_DOWN ? ml->ml_prev : ml->ml_next);
4017c478bd9Sstevel@tonic-gate 	}
4027c478bd9Sstevel@tonic-gate 
40385f58038Sdp78419 	rw_exit(&kcage_range_rwlock);
40485f58038Sdp78419 
40585f58038Sdp78419 	if (ret == 0)
40685f58038Sdp78419 		kcage_init(preferred_size);
4077c478bd9Sstevel@tonic-gate }
4087c478bd9Sstevel@tonic-gate 
4097c478bd9Sstevel@tonic-gate /*
4107c478bd9Sstevel@tonic-gate  * Third arg controls direction of growth: 0: increasing pfns,
4117c478bd9Sstevel@tonic-gate  * 1: decreasing.
4127c478bd9Sstevel@tonic-gate  */
/*
 * Append the pfn range [base, base + npgs) to the cage growth list,
 * growing in direction d. Any existing list elements that overlap the
 * new range are trimmed out of the NEW element while we walk to the
 * list tail, so the list never contains duplicate pfns.
 * Caller must hold kcage_range_rwlock as writer.
 * Returns 0, EINVAL (bad range), or ENOMEM.
 */
static int
kcage_range_add_internal(pfn_t base, pgcnt_t npgs, kcage_dir_t d)
{
	struct kcage_glist *new, **lpp;
	pfn_t lim;

	ASSERT(rw_write_held(&kcage_range_rwlock));

	ASSERT(npgs != 0);
	if (npgs == 0)
		return (EINVAL);

	lim = base + npgs;

	ASSERT(lim > base);	/* also rejects pfn wraparound */
	if (lim <= base)
		return (EINVAL);

	new = kcage_glist_alloc();
	if (new == NULL) {
		return (ENOMEM);
	}

	new->base = base;
	new->lim = lim;
	new->decr = (d == KCAGE_DOWN);
	/* allocation cursor starts at the end we grow away from */
	if (new->decr != 0)
		new->curr = new->lim;
	else
		new->curr = new->base;
	/*
	 * Any overlapping existing ranges are removed by deleting
	 * from the new list as we search for the tail.
	 */
	lpp = &kcage_glist;
	while (*lpp != NULL) {
		int ret;
		ret = kcage_glist_delete((*lpp)->base, (*lpp)->lim, &new);
		if (ret != 0)
			return (ret);
		lpp = &(*lpp)->next;
	}

	*lpp = new;

	if (kcage_current_glist == NULL) {
		kcage_current_glist = kcage_glist;
	}

	return (0);
}
4647c478bd9Sstevel@tonic-gate 
4656b990117Sdm120769 int
kcage_range_add(pfn_t base,pgcnt_t npgs,kcage_dir_t d)46685f58038Sdp78419 kcage_range_add(pfn_t base, pgcnt_t npgs, kcage_dir_t d)
46785f58038Sdp78419 {
46885f58038Sdp78419 	int ret;
46985f58038Sdp78419 
47085f58038Sdp78419 	rw_enter(&kcage_range_rwlock, RW_WRITER);
47185f58038Sdp78419 	ret = kcage_range_add_internal(base, npgs, d);
47285f58038Sdp78419 	rw_exit(&kcage_range_rwlock);
47385f58038Sdp78419 	return (ret);
47485f58038Sdp78419 }
47585f58038Sdp78419 
47685f58038Sdp78419 /*
47785f58038Sdp78419  * Calls to add and delete must be protected by kcage_range_rwlock
47885f58038Sdp78419  */
/*
 * Remove the pfn range [base, base + npgs) from the cage growth list.
 * Fails with EBUSY if any overlapping element has already handed out
 * pages from the area to be deleted, since that cannot be undone.
 * Caller must hold kcage_range_rwlock as writer.
 * Returns 0, EINVAL (bad range), EBUSY, or an error from
 * kcage_glist_delete().
 */
static int
kcage_range_delete_internal(pfn_t base, pgcnt_t npgs)
{
	struct kcage_glist *lp;
	pfn_t lim;

	ASSERT(rw_write_held(&kcage_range_rwlock));

	ASSERT(npgs != 0);
	if (npgs == 0)
		return (EINVAL);

	lim = base + npgs;

	ASSERT(lim > base);
	if (lim <= base)
		return (EINVAL);

	/*
	 * Check if the delete is OK first as a number of elements
	 * might be involved and it will be difficult to go
	 * back and undo (can't just add the range back in).
	 */
	for (lp = kcage_glist; lp != NULL; lp = lp->next) {
		/*
		 * If there have been no pages allocated from this
		 * element, we don't need to check it.
		 */
		if ((lp->decr == 0 && lp->curr == lp->base) ||
		    (lp->decr != 0 && lp->curr == lp->lim))
			continue;
		/*
		 * If the element does not overlap, its OK.
		 */
		if (base >= lp->lim || lim <= lp->base)
			continue;
		/*
		 * Overlapping element: Does the range to be deleted
		 * overlap the area already used? If so fail.
		 */
		if (lp->decr == 0 && base < lp->curr && lim >= lp->base) {
			return (EBUSY);
		}
		if (lp->decr != 0 && base < lp->lim && lim >= lp->curr) {
			return (EBUSY);
		}
	}
	return (kcage_glist_delete(base, lim, &kcage_glist));
}
5287c478bd9Sstevel@tonic-gate 
52985f58038Sdp78419 int
kcage_range_delete(pfn_t base,pgcnt_t npgs)53085f58038Sdp78419 kcage_range_delete(pfn_t base, pgcnt_t npgs)
53185f58038Sdp78419 {
53285f58038Sdp78419 	int ret;
53385f58038Sdp78419 
53485f58038Sdp78419 	rw_enter(&kcage_range_rwlock, RW_WRITER);
53585f58038Sdp78419 	ret = kcage_range_delete_internal(base, npgs);
53685f58038Sdp78419 	rw_exit(&kcage_range_rwlock);
53785f58038Sdp78419 	return (ret);
53885f58038Sdp78419 }
53985f58038Sdp78419 
5407c478bd9Sstevel@tonic-gate /*
54185f58038Sdp78419  * Calls to add and delete must be protected by kcage_range_rwlock.
5427c478bd9Sstevel@tonic-gate  * This routine gets called after successful Solaris memory
5437c478bd9Sstevel@tonic-gate  * delete operation from DR post memory delete routines.
5447c478bd9Sstevel@tonic-gate  */
54585f58038Sdp78419 static int
kcage_range_delete_post_mem_del_internal(pfn_t base,pgcnt_t npgs)54685f58038Sdp78419 kcage_range_delete_post_mem_del_internal(pfn_t base, pgcnt_t npgs)
5477c478bd9Sstevel@tonic-gate {
5487c478bd9Sstevel@tonic-gate 	pfn_t lim;
5497c478bd9Sstevel@tonic-gate 
55085f58038Sdp78419 	ASSERT(rw_write_held(&kcage_range_rwlock));
5517c478bd9Sstevel@tonic-gate 
5527c478bd9Sstevel@tonic-gate 	ASSERT(npgs != 0);
5537c478bd9Sstevel@tonic-gate 	if (npgs == 0)
5547c478bd9Sstevel@tonic-gate 		return (EINVAL);
5557c478bd9Sstevel@tonic-gate 
5567c478bd9Sstevel@tonic-gate 	lim = base + npgs;
5577c478bd9Sstevel@tonic-gate 
5587c478bd9Sstevel@tonic-gate 	ASSERT(lim > base);
5597c478bd9Sstevel@tonic-gate 	if (lim <= base)
5607c478bd9Sstevel@tonic-gate 		return (EINVAL);
5617c478bd9Sstevel@tonic-gate 
5627c478bd9Sstevel@tonic-gate 	return (kcage_glist_delete(base, lim, &kcage_glist));
5637c478bd9Sstevel@tonic-gate }
5647c478bd9Sstevel@tonic-gate 
56585f58038Sdp78419 int
kcage_range_delete_post_mem_del(pfn_t base,pgcnt_t npgs)56685f58038Sdp78419 kcage_range_delete_post_mem_del(pfn_t base, pgcnt_t npgs)
56785f58038Sdp78419 {
56885f58038Sdp78419 	int ret;
56985f58038Sdp78419 
57085f58038Sdp78419 	rw_enter(&kcage_range_rwlock, RW_WRITER);
57185f58038Sdp78419 	ret = kcage_range_delete_post_mem_del_internal(base, npgs);
57285f58038Sdp78419 	rw_exit(&kcage_range_rwlock);
57385f58038Sdp78419 	return (ret);
57485f58038Sdp78419 }
57585f58038Sdp78419 
5767c478bd9Sstevel@tonic-gate /*
5777c478bd9Sstevel@tonic-gate  * No locking is required here as the whole operation is covered
57885f58038Sdp78419  * by kcage_range_rwlock writer lock.
5797c478bd9Sstevel@tonic-gate  */
5807c478bd9Sstevel@tonic-gate static struct kcage_glist *
kcage_glist_alloc(void)5817c478bd9Sstevel@tonic-gate kcage_glist_alloc(void)
5827c478bd9Sstevel@tonic-gate {
5837c478bd9Sstevel@tonic-gate 	struct kcage_glist *new;
5847c478bd9Sstevel@tonic-gate 
5857c478bd9Sstevel@tonic-gate 	if ((new = kcage_glist_freelist) != NULL) {
5867c478bd9Sstevel@tonic-gate 		kcage_glist_freelist = new->next;
5875cc9da9eSVijay Balakrishna, SG-RPE 	} else if (kernel_cage_enable) {
58885f58038Sdp78419 		new = vmem_alloc(kcage_arena, sizeof (*new), VM_NOSLEEP);
5895cc9da9eSVijay Balakrishna, SG-RPE 	} else {
5905cc9da9eSVijay Balakrishna, SG-RPE 		/*
5915cc9da9eSVijay Balakrishna, SG-RPE 		 * On DR supported platforms we allow memory add
5925cc9da9eSVijay Balakrishna, SG-RPE 		 * even when kernel cage is disabled. "kcage_arena" is
5935cc9da9eSVijay Balakrishna, SG-RPE 		 * created only when kernel cage is enabled.
5945cc9da9eSVijay Balakrishna, SG-RPE 		 */
5955cc9da9eSVijay Balakrishna, SG-RPE 		new = kmem_zalloc(sizeof (*new), KM_NOSLEEP);
5966b990117Sdm120769 	}
59785f58038Sdp78419 
59885f58038Sdp78419 	if (new != NULL)
59985f58038Sdp78419 		bzero(new, sizeof (*new));
60085f58038Sdp78419 
6017c478bd9Sstevel@tonic-gate 	return (new);
6027c478bd9Sstevel@tonic-gate }
6037c478bd9Sstevel@tonic-gate 
6047c478bd9Sstevel@tonic-gate static void
kcage_glist_free(struct kcage_glist * lp)6057c478bd9Sstevel@tonic-gate kcage_glist_free(struct kcage_glist *lp)
6067c478bd9Sstevel@tonic-gate {
6077c478bd9Sstevel@tonic-gate 	lp->next = kcage_glist_freelist;
6087c478bd9Sstevel@tonic-gate 	kcage_glist_freelist = lp;
6097c478bd9Sstevel@tonic-gate }
6107c478bd9Sstevel@tonic-gate 
/*
 * Remove the pfn range [base, lim) from the growth list rooted at *lpp.
 * Elements fully covered by the range are unlinked and returned to the
 * freelist; a delete that punches a hole in the middle of an element
 * needs a new element and so can fail with ENOMEM.  Returns 0 on
 * success.  Caller holds kcage_range_rwlock as writer (see
 * kcage_glist_alloc).
 */
static int
kcage_glist_delete(pfn_t base, pfn_t lim, struct kcage_glist **lpp)
{
	struct kcage_glist *lp, *prev = *lpp;

	while ((lp = *lpp) != NULL) {
		if (lim > lp->base && base < lp->lim) {
			/* The delete range overlaps this element. */
			if (base <= lp->base && lim >= lp->lim) {
				/* Delete whole element. */
				*lpp = lp->next;
				if (lp == kcage_current_glist) {
					/* This can never happen. */
					ASSERT(kcage_current_glist != prev);
					kcage_current_glist = prev;
				}
				kcage_glist_free(lp);
				/* *lpp already advanced by the unlink. */
				continue;
			}

			/* Partial delete. */
			if (base > lp->base && lim < lp->lim) {
				struct kcage_glist *new;

				/*
				 * Remove a section from the middle,
				 * need to allocate a new element.
				 */
				new = kcage_glist_alloc();
				if (new == NULL) {
					return (ENOMEM);
				}

				/*
				 * Transfer unused range to new.
				 * Edit lp in place to preserve
				 * kcage_current_glist.
				 */
				new->decr = lp->decr;
				if (new->decr != 0) {
					/* Downward-growing: new keeps low half. */
					new->base = lp->base;
					new->lim = base;
					new->curr = base;

					lp->base = lim;
				} else {
					/* Upward-growing: new keeps high half. */
					new->base = lim;
					new->lim = lp->lim;
					new->curr = new->base;

					lp->lim = base;
				}

				/* Insert new. */
				new->next = lp->next;
				lp->next = new;
				lpp = &lp->next;
			} else {
				/* Delete part of current block. */
				if (base > lp->base) {
					/* Trim the upper end of the element. */
					ASSERT(lim >= lp->lim);
					ASSERT(base < lp->lim);
					if (lp->decr != 0 &&
					    lp->curr == lp->lim)
						lp->curr = base;
					lp->lim = base;
				} else {
					/* Trim the lower end of the element. */
					ASSERT(base <= lp->base);
					ASSERT(lim > lp->base);
					if (lp->decr == 0 &&
					    lp->curr == lp->base)
						lp->curr = lim;
					lp->base = lim;
				}
			}
		}
		prev = *lpp;
		lpp = &(*lpp)->next;
	}

	return (0);
}
6937c478bd9Sstevel@tonic-gate 
/*
 * Hand out the next pfn from the growth list to expand the cage,
 * advancing kcage_current_glist past exhausted ranges.  Returns
 * PFN_INVALID when no pfn is available (or, with lockit, when the
 * lock cannot be taken without blocking).
 *
 * If lockit is 1, kcage_get_pfn holds the
 * reader lock for kcage_range_rwlock.
 * Changes to lp->curr can cause race conditions, but
 * they are handled by higher level code (see kcage_next_range.)
 */
static pfn_t
kcage_get_pfn(int lockit)
{
	struct kcage_glist *lp;
	pfn_t pfn = PFN_INVALID;

	/* tryenter: fail fast rather than block on a contended lock. */
	if (lockit && !rw_tryenter(&kcage_range_rwlock, RW_READER))
		return (pfn);

	lp = kcage_current_glist;
	while (lp != NULL) {
		if (lp->decr != 0) {
			/* Downward-growing range: take the next lower pfn. */
			if (lp->curr != lp->base) {
				pfn = --lp->curr;
				break;
			}
		} else {
			/* Upward-growing range: take the next higher pfn. */
			if (lp->curr != lp->lim) {
				pfn = lp->curr++;
				break;
			}
		}

		/* Range exhausted; move the cursor to the next one. */
		lp = lp->next;
		if (lp)
			kcage_current_glist = lp;
	}

	if (lockit)
		rw_exit(&kcage_range_rwlock);
	return (pfn);
}
7327c478bd9Sstevel@tonic-gate 
/*
 * Walk the physical address space of the cage.
 * This routine does not guarantee to return PFNs in the order
 * in which they were allocated to the cage. Instead, it walks
 * each range as they appear on the growth list returning the PFNs
 * range in ascending order.
 *
 * To begin scanning at lower edge of cage, reset should be nonzero.
 * To step through cage, reset should be zero.
 *
 * PFN_INVALID will be returned when the upper end of the cage is
 * reached -- indicating a full scan of the cage has been completed since
 * previous reset. PFN_INVALID will continue to be returned until
 * kcage_walk_cage is reset.
 *
 * It is possible to receive a PFN_INVALID result on reset if a growth
 * list is not installed or if none of the PFNs in the installed list have
 * been allocated to the cage. In other words, there is no cage.
 *
 * Caller need not hold kcage_range_rwlock while calling this function
 * as the front part of the list is static - pages never come out of
 * the cage.
 *
 * The caller is expected to only be kcage_cageout().
 */
static pfn_t
kcage_walk_cage(int reset)
{
	/* Walk cursor; static because state persists between calls. */
	static struct kcage_glist *lp = NULL;
	static pfn_t pfn;

	if (reset)
		lp = NULL;
	if (lp == NULL) {
		/* Start (or restart) at the head of the growth list. */
		lp = kcage_glist;
		pfn = PFN_INVALID;
	}
again:
	if (pfn == PFN_INVALID) {
		if (lp == NULL)
			return (PFN_INVALID);

		if (lp->decr != 0) {
			/*
			 * In this range the cage grows from the highest
			 * address towards the lowest.
			 * Arrange to return pfns from curr to lim-1,
			 * inclusive, in ascending order.
			 */

			pfn = lp->curr;
		} else {
			/*
			 * In this range the cage grows from the lowest
			 * address towards the highest.
			 * Arrange to return pfns from base to curr,
			 * inclusive, in ascending order.
			 */

			pfn = lp->base;
		}
	}

	if (lp->decr != 0) {		/* decrementing pfn */
		if (pfn == lp->lim) {
			/* Don't go beyond the static part of the glist. */
			if (lp == kcage_current_glist)
				lp = NULL;
			else
				lp = lp->next;
			pfn = PFN_INVALID;
			goto again;
		}

		ASSERT(pfn >= lp->curr && pfn < lp->lim);
	} else {			/* incrementing pfn */
		if (pfn == lp->curr) {
			/* Don't go beyond the static part of the glist. */
			if (lp == kcage_current_glist)
				lp = NULL;
			else
				lp = lp->next;
			pfn = PFN_INVALID;
			goto again;
		}

		ASSERT(pfn >= lp->base && pfn < lp->curr);
	}

	/* Return the current pfn and advance the cursor for next call. */
	return (pfn++);
}
8247c478bd9Sstevel@tonic-gate 
/*
 * Callback functions to recalc cage thresholds after
 * Kphysm memory add/delete operations.
 */
/*ARGSUSED*/
static void
kcage_kphysm_postadd_cb(void *arg, pgcnt_t delta_pages)
{
	/* Memory was added; rescale the cage watermarks. */
	kcage_recalc_thresholds();
}
8357c478bd9Sstevel@tonic-gate 
/*
 * Pre-delete callback: returning 0 always permits the memory delete.
 */
/*ARGSUSED*/
static int
kcage_kphysm_predel_cb(void *arg, pgcnt_t delta_pages)
{
	/* TODO: when should cage refuse memory delete requests? */
	return (0);
}
8437c478bd9Sstevel@tonic-gate 
/*
 * Post-delete callback: memory was removed (or the delete cancelled);
 * rescale the cage watermarks either way.
 */
/*ARGSUSED*/
static  void
kcage_kphysm_postdel_cb(void *arg, pgcnt_t delta_pages, int cancelled)
{
	kcage_recalc_thresholds();
}
8507c478bd9Sstevel@tonic-gate 
/*
 * Callback vector handed to kphysm_setup_func_register() (see
 * kcage_init) so the cage thresholds track DR memory add/delete.
 */
static kphysm_setup_vector_t kcage_kphysm_vectors = {
	KPHYSM_SETUP_VECTOR_VERSION,
	kcage_kphysm_postadd_cb,
	kcage_kphysm_predel_cb,
	kcage_kphysm_postdel_cb
};
8577c478bd9Sstevel@tonic-gate 
8587c478bd9Sstevel@tonic-gate /*
8597c478bd9Sstevel@tonic-gate  * This is called before a CPR suspend and after a CPR resume.  We have to
8607c478bd9Sstevel@tonic-gate  * turn off kcage_cageout_ready before a suspend, and turn it back on after a
8617c478bd9Sstevel@tonic-gate  * restart.
8627c478bd9Sstevel@tonic-gate  */
8637c478bd9Sstevel@tonic-gate /*ARGSUSED*/
8647c478bd9Sstevel@tonic-gate static boolean_t
kcage_cageout_cpr(void * arg,int code)8657c478bd9Sstevel@tonic-gate kcage_cageout_cpr(void *arg, int code)
8667c478bd9Sstevel@tonic-gate {
8677c478bd9Sstevel@tonic-gate 	if (code == CB_CODE_CPR_CHKPT) {
8687c478bd9Sstevel@tonic-gate 		ASSERT(kcage_cageout_ready);
8697c478bd9Sstevel@tonic-gate 		kcage_cageout_ready = 0;
8707c478bd9Sstevel@tonic-gate 		return (B_TRUE);
8717c478bd9Sstevel@tonic-gate 	} else if (code == CB_CODE_CPR_RESUME) {
8727c478bd9Sstevel@tonic-gate 		ASSERT(kcage_cageout_ready == 0);
8737c478bd9Sstevel@tonic-gate 		kcage_cageout_ready = 1;
8747c478bd9Sstevel@tonic-gate 		return (B_TRUE);
8757c478bd9Sstevel@tonic-gate 	}
8767c478bd9Sstevel@tonic-gate 	return (B_FALSE);
8777c478bd9Sstevel@tonic-gate }
8787c478bd9Sstevel@tonic-gate 
8797c478bd9Sstevel@tonic-gate /*
8807c478bd9Sstevel@tonic-gate  * kcage_recalc_preferred_size() increases initial cage size to improve large
8817c478bd9Sstevel@tonic-gate  * page availability when lp for kmem is enabled and kpr is disabled
8827c478bd9Sstevel@tonic-gate  */
8837c478bd9Sstevel@tonic-gate static pgcnt_t
kcage_recalc_preferred_size(pgcnt_t preferred_size)8847c478bd9Sstevel@tonic-gate kcage_recalc_preferred_size(pgcnt_t preferred_size)
8857c478bd9Sstevel@tonic-gate {
8867c478bd9Sstevel@tonic-gate 	if (SEGKMEM_USE_LARGEPAGES && segkmem_reloc == 0) {
8877c478bd9Sstevel@tonic-gate 		pgcnt_t lpmincage = kcage_kmemlp_mincage;
8887c478bd9Sstevel@tonic-gate 		if (lpmincage == 0) {
8897c478bd9Sstevel@tonic-gate 			lpmincage = MIN(P2ROUNDUP(((physmem * PAGESIZE) / 8),
8907c478bd9Sstevel@tonic-gate 			    segkmem_heaplp_quantum), 0x40000000UL) / PAGESIZE;
8917c478bd9Sstevel@tonic-gate 		}
8927c478bd9Sstevel@tonic-gate 		kcage_kmemlp_mincage = MIN(lpmincage,
8937c478bd9Sstevel@tonic-gate 		    (segkmem_kmemlp_max / PAGESIZE));
8947c478bd9Sstevel@tonic-gate 		preferred_size = MAX(kcage_kmemlp_mincage, preferred_size);
8957c478bd9Sstevel@tonic-gate 	}
8967c478bd9Sstevel@tonic-gate 	return (preferred_size);
8977c478bd9Sstevel@tonic-gate }
8987c478bd9Sstevel@tonic-gate 
/*
 * Kcage_init() builds the cage and initializes the cage thresholds.
 * The size of the cage is determined by the argument preferred_size,
 * or the actual amount of memory, whichever is smaller.
 */
static void
kcage_init(pgcnt_t preferred_size)
{
	pgcnt_t wanted;
	pfn_t pfn;
	page_t *pp;
	kstat_t *ksp;

	extern void page_list_noreloc_startup(page_t *);

	ASSERT(!kcage_on);

	/* increase preferred cage size for lp for kmem */
	preferred_size = kcage_recalc_preferred_size(preferred_size);

	/* Debug note: initialize this now so early expansions can stat */
	KCAGE_STAT_INIT_SCAN_INDEX;

	/*
	 * Initialize cage thresholds and install kphysm callback.
	 * If we can't arrange to have the thresholds track with
	 * available physical memory, then the cage thresholds may
	 * end up over time at levels that adversely affect system
	 * performance; so, bail out.
	 */
	kcage_recalc_thresholds();
	if (kphysm_setup_func_register(&kcage_kphysm_vectors, NULL)) {
		ASSERT(0);		/* Catch this in DEBUG kernels. */
		return;
	}

	/*
	 * Limit startup cage size within the range of kcage_minfree
	 * and availrmem, inclusively.
	 */
	wanted = MIN(MAX(preferred_size, kcage_minfree), availrmem);

	/*
	 * Construct the cage. PFNs are allocated from the glist. It
	 * is assumed that the list has been properly ordered for the
	 * platform by the platform code. Typically, this is as simple
	 * as calling kcage_range_init(phys_avail, decr), where decr is
	 * 1 if the kernel has been loaded into upper end of physical
	 * memory, or 0 if the kernel has been loaded at the low end.
	 *
	 * Note: it is assumed that we are in the startup flow, so there
	 * is no reason to grab the page lock.
	 */
	kcage_freemem = 0;
	pfn = PFN_INVALID;			/* prime for alignment test */
	while (wanted != 0) {
		/* lockit == 0: startup, no contention on the range lock. */
		if ((pfn = kcage_get_pfn(0)) == PFN_INVALID)
			break;

		if ((pp = page_numtopp_nolock(pfn)) != NULL) {
			KCAGEPAGETS_INC();
			/*
			 * Set the noreloc state on the page.
			 * If the page is free and not already
			 * on the noreloc list then move it.
			 */
			if (PP_ISFREE(pp)) {
				if (PP_ISNORELOC(pp) == 0)
					page_list_noreloc_startup(pp);
			} else {
				ASSERT(pp->p_szc == 0);
				PP_SETNORELOC(pp);
			}
		}
		PLCNT_XFER_NORELOC(pp);
		wanted -= 1;
	}

	/*
	 * Need to go through and find kernel allocated pages
	 * and capture them into the Cage.  These will primarily
	 * be pages gotten through boot_alloc().
	 */
	if (kvp.v_pages) {

		pp = kvp.v_pages;
		do {
			ASSERT(!PP_ISFREE(pp));
			ASSERT(pp->p_szc == 0);
			if (PP_ISNORELOC(pp) == 0) {
				PP_SETNORELOC(pp);
				PLCNT_XFER_NORELOC(pp);
			}
		} while ((pp = pp->p_vpnext) != kvp.v_pages);

	}

	kcage_on = 1;

	/*
	 * CB_CL_CPR_POST_KERNEL is the class that executes from cpr_suspend()
	 * after the cageout thread is blocked, and executes from cpr_resume()
	 * before the cageout thread is restarted.  By executing in this class,
	 * we are assured that the kernel cage thread won't miss wakeup calls
	 * and also CPR's larger kmem_alloc requests will not fail after
	 * CPR shuts down the cageout kernel thread.
	 */
	(void) callb_add(kcage_cageout_cpr, NULL, CB_CL_CPR_POST_KERNEL,
	    "cageout");

	/*
	 * Coalesce pages to improve large page availability. A better fix
	 * would be to coalesce pages as they are included in the cage.
	 */
	if (SEGKMEM_USE_LARGEPAGES) {
		extern void page_freelist_coalesce_all(int mnode);
		page_freelist_coalesce_all(-1);	/* do all mnodes */
	}

	/* Export the cage page ranges via a raw kstat (see the snapshot op). */
	ksp = kstat_create("kcage", 0, "kcage_page_list", "misc",
	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VAR_SIZE | KSTAT_FLAG_VIRTUAL);
	if (ksp != NULL) {
		ksp->ks_update = kcage_kstat_update;
		ksp->ks_snapshot = kcage_kstat_snapshot;
		ksp->ks_lock = &kcage_kstat_lock; /* XXX - not really needed */
		kstat_install(ksp);
	}
}
10278b464eb8Smec 
10288b464eb8Smec static int
kcage_kstat_update(kstat_t * ksp,int rw)10298b464eb8Smec kcage_kstat_update(kstat_t *ksp, int rw)
10308b464eb8Smec {
10318b464eb8Smec 	struct kcage_glist *lp;
10328b464eb8Smec 	uint_t count;
10338b464eb8Smec 
10348b464eb8Smec 	if (rw == KSTAT_WRITE)
10358b464eb8Smec 		return (EACCES);
10368b464eb8Smec 
10378b464eb8Smec 	count = 0;
103885f58038Sdp78419 	rw_enter(&kcage_range_rwlock, RW_WRITER);
10398b464eb8Smec 	for (lp = kcage_glist; lp != NULL; lp = lp->next) {
10408b464eb8Smec 		if (lp->decr) {
10418b464eb8Smec 			if (lp->curr != lp->lim) {
10428b464eb8Smec 				count++;
10438b464eb8Smec 			}
10448b464eb8Smec 		} else {
10458b464eb8Smec 			if (lp->curr != lp->base) {
10468b464eb8Smec 				count++;
10478b464eb8Smec 			}
10488b464eb8Smec 		}
10498b464eb8Smec 	}
105085f58038Sdp78419 	rw_exit(&kcage_range_rwlock);
10518b464eb8Smec 
10528b464eb8Smec 	ksp->ks_ndata = count;
10538b464eb8Smec 	ksp->ks_data_size = count * 2 * sizeof (uint64_t);
10548b464eb8Smec 
10558b464eb8Smec 	return (0);
10568b464eb8Smec }
10578b464eb8Smec 
10588b464eb8Smec static int
kcage_kstat_snapshot(kstat_t * ksp,void * buf,int rw)10598b464eb8Smec kcage_kstat_snapshot(kstat_t *ksp, void *buf, int rw)
10608b464eb8Smec {
10618b464eb8Smec 	struct kcage_glist *lp;
10628b464eb8Smec 	struct memunit {
10638b464eb8Smec 		uint64_t address;
10648b464eb8Smec 		uint64_t size;
10658b464eb8Smec 	} *kspmem;
10668b464eb8Smec 
10678b464eb8Smec 	if (rw == KSTAT_WRITE)
10688b464eb8Smec 		return (EACCES);
10698b464eb8Smec 
10708b464eb8Smec 	ksp->ks_snaptime = gethrtime();
10718b464eb8Smec 
10728b464eb8Smec 	kspmem = (struct memunit *)buf;
107385f58038Sdp78419 	rw_enter(&kcage_range_rwlock, RW_WRITER);
10748b464eb8Smec 	for (lp = kcage_glist; lp != NULL; lp = lp->next, kspmem++) {
10758b464eb8Smec 		if ((caddr_t)kspmem >= (caddr_t)buf + ksp->ks_data_size)
10768b464eb8Smec 			break;
10778b464eb8Smec 
10788b464eb8Smec 		if (lp->decr) {
10798b464eb8Smec 			if (lp->curr != lp->lim) {
10808b464eb8Smec 				kspmem->address = ptob(lp->curr);
10818b464eb8Smec 				kspmem->size = ptob(lp->lim - lp->curr);
10828b464eb8Smec 			}
10838b464eb8Smec 		} else {
10848b464eb8Smec 			if (lp->curr != lp->base) {
10858b464eb8Smec 				kspmem->address = ptob(lp->base);
10868b464eb8Smec 				kspmem->size = ptob(lp->curr - lp->base);
10878b464eb8Smec 			}
10888b464eb8Smec 		}
10898b464eb8Smec 	}
109085f58038Sdp78419 	rw_exit(&kcage_range_rwlock);
10918b464eb8Smec 
10928b464eb8Smec 	return (0);
10937c478bd9Sstevel@tonic-gate }
10947c478bd9Sstevel@tonic-gate 
/*
 * Recompute the cage watermarks (kcage_lotsfree/desfree/minfree/
 * throttlefree/reserve).  The tunable values seen on the first call are
 * captured and restored on every later call, so a tunable left at 0
 * ("auto") is re-derived from the current total_pages after DR memory
 * add/delete (this is invoked from the kphysm callbacks above).
 */
void
kcage_recalc_thresholds()
{
	static int first = 1;
	static pgcnt_t init_lotsfree;
	static pgcnt_t init_desfree;
	static pgcnt_t init_minfree;
	static pgcnt_t init_throttlefree;
	static pgcnt_t init_reserve;

	/* TODO: any reason to take more care than this with live editing? */
	mutex_enter(&kcage_cageout_mutex);
	mutex_enter(&freemem_lock);

	if (first) {
		/* Capture the boot-time tunable settings once. */
		first = 0;
		init_lotsfree = kcage_lotsfree;
		init_desfree = kcage_desfree;
		init_minfree = kcage_minfree;
		init_throttlefree = kcage_throttlefree;
		init_reserve = kcage_reserve;
	} else {
		/* Start each recalculation from the captured settings. */
		kcage_lotsfree = init_lotsfree;
		kcage_desfree = init_desfree;
		kcage_minfree = init_minfree;
		kcage_throttlefree = init_throttlefree;
		kcage_reserve = init_reserve;
	}

	/* A zero tunable means "auto": derive it from total_pages. */
	if (kcage_lotsfree == 0)
		kcage_lotsfree = MAX(32, total_pages / 256);

	if (kcage_minfree == 0)
		kcage_minfree = MAX(32, kcage_lotsfree / 2);

	if (kcage_desfree == 0)
		kcage_desfree = MAX(32, kcage_minfree);

	if (kcage_throttlefree == 0)
		kcage_throttlefree = MAX(32, kcage_minfree / 2);

	if (kcage_reserve == 0)
		kcage_reserve = MIN(32, kcage_throttlefree / 2);

	mutex_exit(&freemem_lock);
	mutex_exit(&kcage_cageout_mutex);

	/* Wake cageout and throttled allocators against the new thresholds. */
	if (kcage_cageout_ready) {
		if (kcage_freemem < kcage_desfree)
			kcage_cageout_wakeup();

		if (kcage_needfree) {
			mutex_enter(&kcage_throttle_mutex);
			cv_broadcast(&kcage_throttle_cv);
			mutex_exit(&kcage_throttle_mutex);
		}
	}
}
11537c478bd9Sstevel@tonic-gate 
11547c478bd9Sstevel@tonic-gate /*
11557c478bd9Sstevel@tonic-gate  * Pageout interface:
11567c478bd9Sstevel@tonic-gate  * kcage_cageout_init()
11577c478bd9Sstevel@tonic-gate  */
11587c478bd9Sstevel@tonic-gate void
kcage_cageout_init()11597c478bd9Sstevel@tonic-gate kcage_cageout_init()
11607c478bd9Sstevel@tonic-gate {
11617c478bd9Sstevel@tonic-gate 	if (kcage_on) {
116235a5a358SJonathan Adams 		(void) lwp_kernel_create(proc_pageout, kcage_cageout, NULL,
116335a5a358SJonathan Adams 		    TS_RUN, maxclsyspri - 1);
11647c478bd9Sstevel@tonic-gate 	}
11657c478bd9Sstevel@tonic-gate }
11667c478bd9Sstevel@tonic-gate 
11677c478bd9Sstevel@tonic-gate 
11687c478bd9Sstevel@tonic-gate /*
11697c478bd9Sstevel@tonic-gate  * VM Interfaces:
11707c478bd9Sstevel@tonic-gate  * kcage_create_throttle()
11717c478bd9Sstevel@tonic-gate  * kcage_freemem_add()
11727c478bd9Sstevel@tonic-gate  * kcage_freemem_sub()
11737c478bd9Sstevel@tonic-gate  */
11747c478bd9Sstevel@tonic-gate 
11757c478bd9Sstevel@tonic-gate /*
11767c478bd9Sstevel@tonic-gate  * Wakeup cageout thread and throttle waiting for the number of pages
11777c478bd9Sstevel@tonic-gate  * requested to become available.  For non-critical requests, a
11787c478bd9Sstevel@tonic-gate  * timeout is added, since freemem accounting is separate from cage
11797c478bd9Sstevel@tonic-gate  * freemem accounting: it's possible for us to get stuck and not make
11807c478bd9Sstevel@tonic-gate  * forward progress even though there was sufficient freemem before
11817c478bd9Sstevel@tonic-gate  * arriving here.
11827c478bd9Sstevel@tonic-gate  */
int
kcage_create_throttle(pgcnt_t npages, int flags)
{

	KCAGE_STAT_INCR(kct_calls);		/* unprotected incr. */

	/*
	 * Obviously, we can't throttle the cageout thread since
	 * we depend on it.  We also can't throttle the panic thread.
	 */
	if (curthread == kcage_cageout_thread || panicstr) {
		KCAGE_STAT_INCR(kct_cageout);	/* unprotected incr. */
		return (KCT_CRIT);
	}

	/*
	 * Don't throttle threads which are critical for proper
	 * vm management if we're above kcage_throttlefree or
	 * if freemem is very low.
	 */
	if (NOMEMWAIT()) {
		if (kcage_freemem > kcage_throttlefree + npages) {
			KCAGE_STAT_INCR(kct_exempt);	/* unprotected incr. */
			return (KCT_CRIT);
		} else if (freemem < minfree) {
			KCAGE_STAT_INCR(kct_critical);  /* unprotected incr. */
			return (KCT_CRIT);
		}
	}

	/*
	 * Don't throttle real-time threads if kcage_freemem > kcage_reserve.
	 */
	if (DISP_PRIO(curthread) > maxclsyspri &&
	    kcage_freemem > kcage_reserve) {
		KCAGE_STAT_INCR(kct_exempt);	/* unprotected incr. */
		return (KCT_CRIT);
	}

	/*
	 * Cause all other threads (which are assumed to not be
	 * critical to cageout) to wait here until their request
	 * can be satisfied. Be a little paranoid and wake the
	 * kernel cage on each loop through this logic.
	 */
	while (kcage_freemem < kcage_throttlefree + npages) {
		ASSERT(kcage_on);
		if (kcage_cageout_ready) {
			mutex_enter(&kcage_throttle_mutex);

			kcage_needfree += npages;
			KCAGE_STAT_INCR(kct_wait);

			kcage_cageout_wakeup();
			KCAGE_STAT_INCR(kct_cagewake);

			/*
			 * Sleep until kcage_freemem_add() sees enough cage
			 * freemem to cover kcage_needfree and broadcasts
			 * on kcage_throttle_cv.
			 */
			cv_wait(&kcage_throttle_cv, &kcage_throttle_mutex);

			kcage_needfree -= npages;

			mutex_exit(&kcage_throttle_mutex);
		} else {
			/*
			 * NOTE: atomics are used just in case we enter
			 * mp operation before the cageout thread is ready.
			 */
			atomic_add_long(&kcage_needfree, npages);

			kcage_cageout_wakeup();
			KCAGE_STAT_INCR(kct_cagewake);	/* unprotected incr. */

			atomic_add_long(&kcage_needfree, -npages);
		}

		/*
		 * VM-critical threads must not block here once global
		 * freemem is critically low; let them proceed.
		 */
		if (NOMEMWAIT() && freemem < minfree) {
			return (KCT_CRIT);
		}
		/*
		 * Non-blocking requests (PG_WAIT not set) never loop: fail
		 * only when both cage freemem and global freemem are below
		 * their limits, otherwise return non-critical.  PG_NORMALPRI
		 * callers are held to throttlefree rather than the
		 * (presumably lower) pageout_reserve limit.
		 */
		if ((flags & PG_WAIT) == 0) {
			pgcnt_t limit = (flags & PG_NORMALPRI) ?
			    throttlefree : pageout_reserve;

			if ((kcage_freemem < kcage_throttlefree + npages) &&
			    (freemem < limit + npages)) {
				return (KCT_FAILURE);
			} else {
				return (KCT_NONCRIT);
			}
		}
	}
	return (KCT_NONCRIT);
}
12747c478bd9Sstevel@tonic-gate 
12757c478bd9Sstevel@tonic-gate void
kcage_freemem_add(pgcnt_t npages)12767c478bd9Sstevel@tonic-gate kcage_freemem_add(pgcnt_t npages)
12777c478bd9Sstevel@tonic-gate {
12787c478bd9Sstevel@tonic-gate 	extern void wakeup_pcgs(void);
12797c478bd9Sstevel@tonic-gate 
12807c478bd9Sstevel@tonic-gate 	atomic_add_long(&kcage_freemem, npages);
12817c478bd9Sstevel@tonic-gate 
12827c478bd9Sstevel@tonic-gate 	wakeup_pcgs();  /* wakeup threads in pcgs() */
12837c478bd9Sstevel@tonic-gate 
12847c478bd9Sstevel@tonic-gate 	if (kcage_needfree != 0 &&
12857c478bd9Sstevel@tonic-gate 	    kcage_freemem >= (kcage_throttlefree + kcage_needfree)) {
12867c478bd9Sstevel@tonic-gate 
12877c478bd9Sstevel@tonic-gate 		mutex_enter(&kcage_throttle_mutex);
12887c478bd9Sstevel@tonic-gate 		cv_broadcast(&kcage_throttle_cv);
12897c478bd9Sstevel@tonic-gate 		KCAGE_STAT_INCR(kfa_trottlewake);
12907c478bd9Sstevel@tonic-gate 		mutex_exit(&kcage_throttle_mutex);
12917c478bd9Sstevel@tonic-gate 	}
12927c478bd9Sstevel@tonic-gate }
12937c478bd9Sstevel@tonic-gate 
12947c478bd9Sstevel@tonic-gate void
kcage_freemem_sub(pgcnt_t npages)12957c478bd9Sstevel@tonic-gate kcage_freemem_sub(pgcnt_t npages)
12967c478bd9Sstevel@tonic-gate {
12977c478bd9Sstevel@tonic-gate 	atomic_add_long(&kcage_freemem, -npages);
12987c478bd9Sstevel@tonic-gate 
12997c478bd9Sstevel@tonic-gate 	if (kcage_freemem < kcage_desfree) {
13007c478bd9Sstevel@tonic-gate 		kcage_cageout_wakeup();
13017c478bd9Sstevel@tonic-gate 		KCAGE_STAT_INCR(kfs_cagewake); /* unprotected incr. */
13027c478bd9Sstevel@tonic-gate 	}
13037c478bd9Sstevel@tonic-gate }
13047c478bd9Sstevel@tonic-gate 
13057c478bd9Sstevel@tonic-gate /*
13067c478bd9Sstevel@tonic-gate  * return 0 on failure and 1 on success.
13077c478bd9Sstevel@tonic-gate  */
static int
kcage_setnoreloc_pages(page_t *rootpp, se_t se)
{
	pgcnt_t npgs, i;
	page_t *pp;
	pfn_t rootpfn = page_pptonum(rootpp);
	uint_t szc;

	ASSERT(!PP_ISFREE(rootpp));
	ASSERT(PAGE_LOCKED_SE(rootpp, se));
	/*
	 * Lock the whole large-page group; if any constituent page
	 * can't be locked, give up.
	 */
	if (!group_page_trylock(rootpp, se)) {
		return (0);
	}
	szc = rootpp->p_szc;
	if (szc == 0) {
		/*
		 * The szc of a locked page can only change for pages that are
		 * non-swapfs (i.e. anonymous memory) file system pages.
		 */
		ASSERT(rootpp->p_vnode != NULL &&
		    !PP_ISKAS(rootpp) &&
		    !IS_SWAPFSVP(rootpp->p_vnode));
		/*
		 * NOTE(review): no group_page_unlock() on this path;
		 * presumably a szc==0 "group" is only rootpp itself and
		 * group_page_trylock() locked nothing extra — confirm
		 * against group_page_trylock() semantics.
		 */
		PP_SETNORELOC(rootpp);
		return (1);
	}
	/* Large page: mark every constituent page P_NORELOC. */
	npgs = page_get_pagecnt(szc);
	ASSERT(IS_P2ALIGNED(rootpfn, npgs));
	pp = rootpp;
	for (i = 0; i < npgs; i++, pp++) {
		ASSERT(PAGE_LOCKED_SE(pp, se));
		ASSERT(!PP_ISFREE(pp));
		ASSERT(pp->p_szc == szc);
		PP_SETNORELOC(pp);
	}
	group_page_unlock(rootpp);
	return (1);
}
13457c478bd9Sstevel@tonic-gate 
13467c478bd9Sstevel@tonic-gate /*
13477c478bd9Sstevel@tonic-gate  * Attempt to convert page to a caged page (set the P_NORELOC flag).
13487c478bd9Sstevel@tonic-gate  * If successful and page is free, move page to the tail of whichever
13497c478bd9Sstevel@tonic-gate  * list it is on.
13507c478bd9Sstevel@tonic-gate  * Returns:
13517c478bd9Sstevel@tonic-gate  *   EBUSY  page already locked, assimilated but not free.
13527c478bd9Sstevel@tonic-gate  *   ENOMEM page assimilated, but memory too low to relocate. Page not free.
13537c478bd9Sstevel@tonic-gate  *   EAGAIN page not assimilated. Page not free.
13547c478bd9Sstevel@tonic-gate  *   ERANGE page assimilated. Page not root.
13557c478bd9Sstevel@tonic-gate  *   0      page assimilated. Page free.
13567c478bd9Sstevel@tonic-gate  *   *nfreedp number of pages freed.
13577c478bd9Sstevel@tonic-gate  * NOTE: With error codes ENOMEM, EBUSY, and 0 (zero), there is no way
13587c478bd9Sstevel@tonic-gate  * to distinguish between a page that was already a NORELOC page from
13597c478bd9Sstevel@tonic-gate  * those newly converted to NORELOC pages by this invocation of
13607c478bd9Sstevel@tonic-gate  * kcage_assimilate_page.
13617c478bd9Sstevel@tonic-gate  */
static int
kcage_assimilate_page(page_t *pp, pgcnt_t *nfreedp)
{
	if (page_trylock(pp, SE_EXCL)) {
		if (PP_ISNORELOC(pp)) {
check_free_and_return:
			/*
			 * Already a cage page: nothing to convert.
			 * Report 0 if it is free, EBUSY if in use.
			 */
			if (PP_ISFREE(pp)) {
				page_unlock(pp);
				*nfreedp = 0;
				return (0);
			} else {
				page_unlock(pp);
				return (EBUSY);
			}
			/*NOTREACHED*/
		}
	} else {
		if (page_trylock(pp, SE_SHARED)) {
			if (PP_ISNORELOC(pp))
				goto check_free_and_return;
		} else {
			return (EAGAIN);
		}
		/*
		 * Only the shared lock could be taken; an in-use page
		 * can't be assimilated through that.
		 */
		if (!PP_ISFREE(pp)) {
			page_unlock(pp);
			return (EAGAIN);
		}

		/*
		 * Need to upgrade the lock on it and set the NORELOC
		 * bit. If it is free then remove it from the free
		 * list so that the platform free list code can keep
		 * NORELOC pages where they should be.
		 */
		/*
		 * Before doing anything, get the exclusive lock.
		 * This may fail (eg ISM pages are left shared locked).
		 * If the page is free this will leave a hole in the
		 * cage. There is no solution yet to this.
		 */
		if (!page_tryupgrade(pp)) {
			page_unlock(pp);
			return (EAGAIN);
		}
	}

	ASSERT(PAGE_EXCL(pp));

	if (PP_ISFREE(pp)) {
		/* Free page: move it to the tail of its own free list. */
		int which = PP_ISAGED(pp) ? PG_FREE_LIST : PG_CACHE_LIST;

		page_list_sub(pp, which);
		ASSERT(pp->p_szc == 0);
		PP_SETNORELOC(pp);
		PLCNT_XFER_NORELOC(pp);
		page_list_add(pp, which | PG_LIST_TAIL);

		page_unlock(pp);
		*nfreedp = 1;
		return (0);
	} else {
		/*
		 * In-use page: mark it (and its whole large-page group,
		 * if any) NORELOC, then try to free it via relocation
		 * or invalidation.
		 */
		if (pp->p_szc != 0) {
			if (!kcage_setnoreloc_pages(pp, SE_EXCL)) {
				page_unlock(pp);
				return (EAGAIN);
			}
			ASSERT(PP_ISNORELOC(pp));
		} else {
			PP_SETNORELOC(pp);
		}
		PLCNT_XFER_NORELOC(pp);
		return (kcage_invalidate_page(pp, nfreedp));
	}
	/*NOTREACHED*/
}
14377c478bd9Sstevel@tonic-gate 
/*
 * Grow the cage by assimilating pages from the global pool until the
 * shortfall ("wanted") is covered or no more candidate pfns exist.
 * Returns nonzero if at least one page was assimilated.
 */
static int
kcage_expand()
{
	int did_something = 0;

	spgcnt_t wanted;
	pfn_t pfn;
	page_t *pp;
	/* TODO: we don't really need n any more? */
	pgcnt_t n;
	pgcnt_t nf, nfreed;

	/*
	 * Expand the cage if available cage memory is really low. Calculate
	 * the amount required to return kcage_freemem to the level of
	 * kcage_lotsfree, or to satisfy throttled requests, whichever is
	 * more.  It is rare for their sum to create an artificial threshold
	 * above kcage_lotsfree, but it is possible.
	 *
	 * Exit early if expansion amount is equal to or less than zero.
	 * (<0 is possible if kcage_freemem rises suddenly.)
	 *
	 * Exit early when freemem drops below pageout_reserve plus the request.
	 */
	wanted = MAX(kcage_lotsfree, kcage_throttlefree + kcage_needfree)
	    - kcage_freemem;
	if (wanted <= 0) {
		return (0);
	} else if (freemem < pageout_reserve + wanted) {
		KCAGE_STAT_INCR(ke_lowfreemem);
		return (0);
	}

	KCAGE_STAT_INCR(ke_calls);
	KCAGE_STAT_SET_SCAN(ke_wanted, (uint_t)wanted);

	/*
	 * Assimilate more pages from the global page pool into the cage.
	 */
	n = 0;				/* number of pages PP_SETNORELOC'd */
	nf = 0;				/* number of those actually free */
	while (kcage_on && nf < wanted) {
		pfn = kcage_get_pfn(1);
		if (pfn == PFN_INVALID) {	/* eek! nowhere to grow */
			KCAGE_STAT_INCR(ke_nopfn);
			goto terminate;
		}

		KCAGE_STAT_INCR_SCAN(ke_examined);

		if ((pp = page_numtopp_nolock(pfn)) == NULL) {
			KCAGE_STAT_INCR(ke_nopaget);
			continue;
		}
		KCAGEPAGETS_INC();
		/*
		 * Sanity check. Skip this pfn if it is
		 * being deleted.
		 */
		if (pfn_is_being_deleted(pfn)) {
			KCAGE_STAT_INCR(ke_deleting);
			continue;
		}

		/* Already caged: nothing to do for this pfn. */
		if (PP_ISNORELOC(pp)) {
			KCAGE_STAT_INCR(ke_isnoreloc);
			continue;
		}

		switch (kcage_assimilate_page(pp, &nfreed)) {
			case 0:		/* assimilated, page is free */
				KCAGE_STAT_NINCR_SCAN(ke_gotonefree, nfreed);
				did_something = 1;
				nf += nfreed;
				n++;
				break;

			case EBUSY:	/* assimilated, page not free */
			case ERANGE:	/* assimilated, page not root */
				KCAGE_STAT_INCR_SCAN(ke_gotone);
				did_something = 1;
				n++;
				break;

			case ENOMEM:	/* assimilated, but no mem */
				KCAGE_STAT_INCR(ke_terminate);
				did_something = 1;
				n++;
				goto terminate;

			case EAGAIN:	/* can't assimilate */
				KCAGE_STAT_INCR_SCAN(ke_lefthole);
				break;

			default:	/* catch this with debug kernels */
				ASSERT(0);
				break;
		}
	}

	/*
	 * Realign cage edge with the nearest physical address
	 * boundary for big pages. This is done to give us a
	 * better chance of actually getting usable big pages
	 * in the cage.
	 */

terminate:

	return (did_something);
}
15497c478bd9Sstevel@tonic-gate 
15507c478bd9Sstevel@tonic-gate /*
15517c478bd9Sstevel@tonic-gate  * Relocate page opp (Original Page Pointer) from cage pool to page rpp
15527c478bd9Sstevel@tonic-gate  * (Replacement Page Pointer) in the global pool. Page opp will be freed
15537c478bd9Sstevel@tonic-gate  * if relocation is successful, otherwise it is only unlocked.
15547c478bd9Sstevel@tonic-gate  * On entry, page opp must be exclusively locked and not free.
15557c478bd9Sstevel@tonic-gate  * *nfreedp: number of pages freed.
15567c478bd9Sstevel@tonic-gate  */
15577c478bd9Sstevel@tonic-gate static int
kcage_relocate_page(page_t * pp,pgcnt_t * nfreedp)15587c478bd9Sstevel@tonic-gate kcage_relocate_page(page_t *pp, pgcnt_t *nfreedp)
15597c478bd9Sstevel@tonic-gate {
15607c478bd9Sstevel@tonic-gate 	page_t *opp = pp;
15617c478bd9Sstevel@tonic-gate 	page_t *rpp = NULL;
15627c478bd9Sstevel@tonic-gate 	spgcnt_t npgs;
15637c478bd9Sstevel@tonic-gate 	int result;
15647c478bd9Sstevel@tonic-gate 
15657c478bd9Sstevel@tonic-gate 	ASSERT(!PP_ISFREE(opp));
15667c478bd9Sstevel@tonic-gate 	ASSERT(PAGE_EXCL(opp));
15677c478bd9Sstevel@tonic-gate 
15687c478bd9Sstevel@tonic-gate 	result = page_relocate(&opp, &rpp, 1, 1, &npgs, NULL);
15697c478bd9Sstevel@tonic-gate 	*nfreedp = npgs;
15707c478bd9Sstevel@tonic-gate 	if (result == 0) {
15717c478bd9Sstevel@tonic-gate 		while (npgs-- > 0) {
15727c478bd9Sstevel@tonic-gate 			page_t *tpp;
15737c478bd9Sstevel@tonic-gate 
15747c478bd9Sstevel@tonic-gate 			ASSERT(rpp != NULL);
15757c478bd9Sstevel@tonic-gate 			tpp = rpp;
15767c478bd9Sstevel@tonic-gate 			page_sub(&rpp, tpp);
15777c478bd9Sstevel@tonic-gate 			page_unlock(tpp);
15787c478bd9Sstevel@tonic-gate 		}
15797c478bd9Sstevel@tonic-gate 
15807c478bd9Sstevel@tonic-gate 		ASSERT(rpp == NULL);
15817c478bd9Sstevel@tonic-gate 
15827c478bd9Sstevel@tonic-gate 		return (0);		/* success */
15837c478bd9Sstevel@tonic-gate 	}
15847c478bd9Sstevel@tonic-gate 
15857c478bd9Sstevel@tonic-gate 	page_unlock(opp);
15867c478bd9Sstevel@tonic-gate 	return (result);
15877c478bd9Sstevel@tonic-gate }
15887c478bd9Sstevel@tonic-gate 
15897c478bd9Sstevel@tonic-gate /*
15907c478bd9Sstevel@tonic-gate  * Based on page_invalidate_pages()
15917c478bd9Sstevel@tonic-gate  *
15927c478bd9Sstevel@tonic-gate  * Kcage_invalidate_page() uses page_relocate() twice. Both instances
15937c478bd9Sstevel@tonic-gate  * of use must be updated to match the new page_relocate() when it
15947c478bd9Sstevel@tonic-gate  * becomes available.
15957c478bd9Sstevel@tonic-gate  *
15967c478bd9Sstevel@tonic-gate  * Return result of kcage_relocate_page or zero if page was directly freed.
15977c478bd9Sstevel@tonic-gate  * *nfreedp: number of pages freed.
15987c478bd9Sstevel@tonic-gate  */
static int
kcage_invalidate_page(page_t *pp, pgcnt_t *nfreedp)
{
	int result;

#if defined(__sparc)
	ASSERT(pp->p_vnode != &promvp);
#endif /* __sparc */
	ASSERT(!PP_ISFREE(pp));
	ASSERT(PAGE_EXCL(pp));

	/*
	 * Is this page involved in some I/O? shared?
	 * The page_struct_lock need not be acquired to
	 * examine these fields since the page has an
	 * "exclusive" lock.
	 */
	if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
		/* Locked/COW pages can't be destroyed; relocate instead. */
		result = kcage_relocate_page(pp, nfreedp);
#ifdef KCAGE_STATS
		if (result == 0)
			KCAGE_STAT_INCR_SCAN(kip_reloclocked);
		else if (result == ENOMEM)
			KCAGE_STAT_INCR_SCAN(kip_nomem);
#endif
		return (result);
	}

	ASSERT(pp->p_vnode->v_type != VCHR);

	/*
	 * Unload the mappings and check if mod bit is set.
	 */
	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);

	if (hat_ismod(pp)) {
		/* Dirty page: relocate it rather than dropping the data. */
		result = kcage_relocate_page(pp, nfreedp);
#ifdef KCAGE_STATS
		if (result == 0)
			KCAGE_STAT_INCR_SCAN(kip_relocmod);
		else if (result == ENOMEM)
			KCAGE_STAT_INCR_SCAN(kip_nomem);
#endif
		return (result);
	}

	/* Must be a base page before it can be destroyed below. */
	if (!page_try_demote_pages(pp)) {
		KCAGE_STAT_INCR_SCAN(kip_demotefailed);
		page_unlock(pp);
		return (EAGAIN);
	}

	/* LINTED: constant in conditional context */
	VN_DISPOSE(pp, B_INVAL, 0, kcred);
	KCAGE_STAT_INCR_SCAN(kip_destroy);
	*nfreedp = 1;
	return (0);
}
16577c478bd9Sstevel@tonic-gate 
1658*23a80de1SStan Studzinski /*
1659*23a80de1SStan Studzinski  * Expand cage only if there is not enough memory to satisfy
1660*23a80de1SStan Studzinski  * current request. We only do one (complete) scan of the cage.
1661*23a80de1SStan Studzinski  * Dirty pages and pages with shared mappings are skipped;
1662*23a80de1SStan Studzinski  * Locked pages (p_lckcnt and p_cowcnt) are also skipped.
1663*23a80de1SStan Studzinski  * All other pages are freed (if they can be locked).
1664*23a80de1SStan Studzinski  * This may affect caching of user pages which are in cage by freeing/
1665*23a80de1SStan Studzinski  * reclaiming them more often. However cage is mainly for kernel (heap)
1666*23a80de1SStan Studzinski  * pages and we want to keep user pages outside of cage. The above policy
1667*23a80de1SStan Studzinski  * should also reduce cage expansion plus it should speed up cage mem
1668*23a80de1SStan Studzinski  * allocations.
1669*23a80de1SStan Studzinski  */
16707c478bd9Sstevel@tonic-gate static void
kcage_cageout()16717c478bd9Sstevel@tonic-gate kcage_cageout()
16727c478bd9Sstevel@tonic-gate {
16737c478bd9Sstevel@tonic-gate 	pfn_t pfn;
16747c478bd9Sstevel@tonic-gate 	page_t *pp;
16757c478bd9Sstevel@tonic-gate 	callb_cpr_t cprinfo;
16767c478bd9Sstevel@tonic-gate 	int did_something;
16777c478bd9Sstevel@tonic-gate 	pfn_t start_pfn;
167805d3dc4bSpaulsan 	ulong_t shared_level = 8;
16797c478bd9Sstevel@tonic-gate 	pgcnt_t nfreed;
16807c478bd9Sstevel@tonic-gate #ifdef KCAGE_STATS
16817c478bd9Sstevel@tonic-gate 	clock_t scan_start;
16827c478bd9Sstevel@tonic-gate #endif
16837c478bd9Sstevel@tonic-gate 
16847c478bd9Sstevel@tonic-gate 	CALLB_CPR_INIT(&cprinfo, &kcage_cageout_mutex,
16857c478bd9Sstevel@tonic-gate 	    callb_generic_cpr, "cageout");
16867c478bd9Sstevel@tonic-gate 
16877c478bd9Sstevel@tonic-gate 	mutex_enter(&kcage_cageout_mutex);
1688dd2069f5Sggredvig 	kcage_cageout_thread = curthread;
16897c478bd9Sstevel@tonic-gate 
16907c478bd9Sstevel@tonic-gate 	pfn = PFN_INVALID;		/* force scan reset */
16917c478bd9Sstevel@tonic-gate 	start_pfn = PFN_INVALID;	/* force init with 1st cage pfn */
16927c478bd9Sstevel@tonic-gate 	kcage_cageout_ready = 1;	/* switch kcage_cageout_wakeup mode */
16937c478bd9Sstevel@tonic-gate 
16947c478bd9Sstevel@tonic-gate loop:
16957c478bd9Sstevel@tonic-gate 	/*
16967c478bd9Sstevel@tonic-gate 	 * Wait here. Sooner or later, kcage_freemem_sub() will notice
16977c478bd9Sstevel@tonic-gate 	 * that kcage_freemem is less than kcage_desfree. When it does
16987c478bd9Sstevel@tonic-gate 	 * notice, kcage_freemem_sub() will wake us up via call to
16997c478bd9Sstevel@tonic-gate 	 * kcage_cageout_wakeup().
17007c478bd9Sstevel@tonic-gate 	 */
17017c478bd9Sstevel@tonic-gate 	CALLB_CPR_SAFE_BEGIN(&cprinfo);
17027c478bd9Sstevel@tonic-gate 	cv_wait(&kcage_cageout_cv, &kcage_cageout_mutex);
17037c478bd9Sstevel@tonic-gate 	CALLB_CPR_SAFE_END(&cprinfo, &kcage_cageout_mutex);
17047c478bd9Sstevel@tonic-gate 
17057c478bd9Sstevel@tonic-gate 	KCAGE_STAT_INCR(kt_wakeups);
17067c478bd9Sstevel@tonic-gate 	KCAGE_STAT_SET_SCAN(kt_freemem_start, freemem);
17077c478bd9Sstevel@tonic-gate 	KCAGE_STAT_SET_SCAN(kt_kcage_freemem_start, kcage_freemem);
17087c478bd9Sstevel@tonic-gate #ifdef KCAGE_STATS
1709d3d50737SRafael Vanoni 	scan_start = ddi_get_lbolt();
17107c478bd9Sstevel@tonic-gate #endif
17117c478bd9Sstevel@tonic-gate 	if (!kcage_on)
17127c478bd9Sstevel@tonic-gate 		goto loop;
17137c478bd9Sstevel@tonic-gate 
17147c478bd9Sstevel@tonic-gate 	KCAGE_STAT_INCR(kt_scans);
17157c478bd9Sstevel@tonic-gate 	KCAGE_STAT_INCR_SCAN(kt_passes);
17167c478bd9Sstevel@tonic-gate 
17177c478bd9Sstevel@tonic-gate 	did_something = 0;
1718*23a80de1SStan Studzinski 	while (kcage_freemem < kcage_lotsfree + kcage_needfree) {
1719*23a80de1SStan Studzinski 
1720*23a80de1SStan Studzinski 		if ((pfn = kcage_walk_cage(pfn == PFN_INVALID)) ==
1721*23a80de1SStan Studzinski 		    PFN_INVALID) {
1722*23a80de1SStan Studzinski 			break;
1723*23a80de1SStan Studzinski 		}
17247c478bd9Sstevel@tonic-gate 
17257c478bd9Sstevel@tonic-gate 		if (start_pfn == PFN_INVALID)
17267c478bd9Sstevel@tonic-gate 			start_pfn = pfn;
17277c478bd9Sstevel@tonic-gate 		else if (start_pfn == pfn) {
17287c478bd9Sstevel@tonic-gate 			/*
17297c478bd9Sstevel@tonic-gate 			 * Did a complete walk of kernel cage, but didn't free
1730dc84a327Svb70745 			 * any pages.  If only one cpu is active then
17317c478bd9Sstevel@tonic-gate 			 * stop kernel cage walk and try expanding.
17327c478bd9Sstevel@tonic-gate 			 */
1733dc84a327Svb70745 			if (cp_default.cp_ncpus == 1 && did_something == 0) {
17347c478bd9Sstevel@tonic-gate 				KCAGE_STAT_INCR(kt_cageout_break);
17357c478bd9Sstevel@tonic-gate 				break;
17367c478bd9Sstevel@tonic-gate 			}
17377c478bd9Sstevel@tonic-gate 		}
17387c478bd9Sstevel@tonic-gate 
17397c478bd9Sstevel@tonic-gate 		pp = page_numtopp_nolock(pfn);
17407c478bd9Sstevel@tonic-gate 		if (pp == NULL) {
17417c478bd9Sstevel@tonic-gate 			continue;
17427c478bd9Sstevel@tonic-gate 		}
17437c478bd9Sstevel@tonic-gate 
17447c478bd9Sstevel@tonic-gate 		KCAGE_STAT_INCR_SCAN(kt_examined);
17457c478bd9Sstevel@tonic-gate 
17467c478bd9Sstevel@tonic-gate 		/*
17477c478bd9Sstevel@tonic-gate 		 * Do a quick PP_ISNORELOC() and PP_ISFREE test outside
17487c478bd9Sstevel@tonic-gate 		 * of the lock. If one is missed it will be seen next
17497c478bd9Sstevel@tonic-gate 		 * time through.
17507c478bd9Sstevel@tonic-gate 		 *
17517c478bd9Sstevel@tonic-gate 		 * Skip non-caged-pages. These pages can exist in the cage
17527c478bd9Sstevel@tonic-gate 		 * because, if during cage expansion, a page is
17537c478bd9Sstevel@tonic-gate 		 * encountered that is long-term locked the lock prevents the
17547c478bd9Sstevel@tonic-gate 		 * expansion logic from setting the P_NORELOC flag. Hence,
17557c478bd9Sstevel@tonic-gate 		 * non-caged-pages surrounded by caged-pages.
17567c478bd9Sstevel@tonic-gate 		 */
17577c478bd9Sstevel@tonic-gate 		if (!PP_ISNORELOC(pp)) {
17587c478bd9Sstevel@tonic-gate 			switch (kcage_assimilate_page(pp, &nfreed)) {
17597c478bd9Sstevel@tonic-gate 				case 0:
17607c478bd9Sstevel@tonic-gate 					did_something = 1;
17617c478bd9Sstevel@tonic-gate 					KCAGE_STAT_NINCR_SCAN(kt_gotonefree,
17627c478bd9Sstevel@tonic-gate 					    nfreed);
17637c478bd9Sstevel@tonic-gate 					break;
17647c478bd9Sstevel@tonic-gate 
17657c478bd9Sstevel@tonic-gate 				case EBUSY:
17667c478bd9Sstevel@tonic-gate 				case ERANGE:
17677c478bd9Sstevel@tonic-gate 					did_something = 1;
17687c478bd9Sstevel@tonic-gate 					KCAGE_STAT_INCR_SCAN(kt_gotone);
17697c478bd9Sstevel@tonic-gate 					break;
17707c478bd9Sstevel@tonic-gate 
17717c478bd9Sstevel@tonic-gate 				case EAGAIN:
17727c478bd9Sstevel@tonic-gate 				case ENOMEM:
17737c478bd9Sstevel@tonic-gate 					break;
17747c478bd9Sstevel@tonic-gate 
17757c478bd9Sstevel@tonic-gate 				default:
17767c478bd9Sstevel@tonic-gate 					/* catch this with debug kernels */
17777c478bd9Sstevel@tonic-gate 					ASSERT(0);
17787c478bd9Sstevel@tonic-gate 					break;
17797c478bd9Sstevel@tonic-gate 			}
17807c478bd9Sstevel@tonic-gate 
17817c478bd9Sstevel@tonic-gate 			continue;
17827c478bd9Sstevel@tonic-gate 		} else {
17837c478bd9Sstevel@tonic-gate 			if (PP_ISFREE(pp)) {
17847c478bd9Sstevel@tonic-gate 				continue;
17857c478bd9Sstevel@tonic-gate 			}
17867c478bd9Sstevel@tonic-gate 
1787ad23a2dbSjohansen 			if ((PP_ISKAS(pp) && pp->p_lckcnt > 0) ||
17887c478bd9Sstevel@tonic-gate 			    !page_trylock(pp, SE_EXCL)) {
17897c478bd9Sstevel@tonic-gate 				KCAGE_STAT_INCR_SCAN(kt_cantlock);
17907c478bd9Sstevel@tonic-gate 				continue;
17917c478bd9Sstevel@tonic-gate 			}
17927c478bd9Sstevel@tonic-gate 
17937c478bd9Sstevel@tonic-gate 			/* P_NORELOC bit should not have gone away. */
17947c478bd9Sstevel@tonic-gate 			ASSERT(PP_ISNORELOC(pp));
1795ad23a2dbSjohansen 			if (PP_ISFREE(pp) || (PP_ISKAS(pp) &&
17967c478bd9Sstevel@tonic-gate 			    pp->p_lckcnt > 0)) {
17977c478bd9Sstevel@tonic-gate 				page_unlock(pp);
17987c478bd9Sstevel@tonic-gate 				continue;
17997c478bd9Sstevel@tonic-gate 			}
18007c478bd9Sstevel@tonic-gate 
180105d3dc4bSpaulsan 			if (hat_page_checkshare(pp, shared_level)) {
18027c478bd9Sstevel@tonic-gate 				page_unlock(pp);
18037c478bd9Sstevel@tonic-gate 				KCAGE_STAT_INCR_SCAN(kt_skipshared);
18047c478bd9Sstevel@tonic-gate 				continue;
18057c478bd9Sstevel@tonic-gate 			}
18067c478bd9Sstevel@tonic-gate 
18077c478bd9Sstevel@tonic-gate 			if (kcage_invalidate_page(pp, &nfreed) == 0) {
18087c478bd9Sstevel@tonic-gate 				did_something = 1;
18097c478bd9Sstevel@tonic-gate 				KCAGE_STAT_NINCR_SCAN(kt_gotonefree, nfreed);
18107c478bd9Sstevel@tonic-gate 			}
18117c478bd9Sstevel@tonic-gate 
18127c478bd9Sstevel@tonic-gate 			/*
18137c478bd9Sstevel@tonic-gate 			 * No need to drop the page lock here.
18147c478bd9Sstevel@tonic-gate 			 * Kcage_invalidate_page has done that for us
18157c478bd9Sstevel@tonic-gate 			 * either explicitly or through a page_free.
18167c478bd9Sstevel@tonic-gate 			 */
18177c478bd9Sstevel@tonic-gate 		}
18187c478bd9Sstevel@tonic-gate 	}
18197c478bd9Sstevel@tonic-gate 
1820*23a80de1SStan Studzinski 	if (kcage_freemem < kcage_throttlefree + kcage_needfree)
1821*23a80de1SStan Studzinski 		(void) kcage_expand();
18227c478bd9Sstevel@tonic-gate 
1823*23a80de1SStan Studzinski 	if (kcage_on && kcage_cageout_ready)
1824*23a80de1SStan Studzinski 		cv_broadcast(&kcage_throttle_cv);
18257c478bd9Sstevel@tonic-gate 
18267c478bd9Sstevel@tonic-gate 	KCAGE_STAT_SET_SCAN(kt_freemem_end, freemem);
18277c478bd9Sstevel@tonic-gate 	KCAGE_STAT_SET_SCAN(kt_kcage_freemem_end, kcage_freemem);
1828d3d50737SRafael Vanoni 	KCAGE_STAT_SET_SCAN(kt_ticks, ddi_get_lbolt() - scan_start);
18297c478bd9Sstevel@tonic-gate 	KCAGE_STAT_INC_SCAN_INDEX;
18307c478bd9Sstevel@tonic-gate 	goto loop;
18317c478bd9Sstevel@tonic-gate 
18327c478bd9Sstevel@tonic-gate 	/*NOTREACHED*/
18337c478bd9Sstevel@tonic-gate }
18347c478bd9Sstevel@tonic-gate 
18357c478bd9Sstevel@tonic-gate void
kcage_cageout_wakeup()18367c478bd9Sstevel@tonic-gate kcage_cageout_wakeup()
18377c478bd9Sstevel@tonic-gate {
18387c478bd9Sstevel@tonic-gate 	if (mutex_tryenter(&kcage_cageout_mutex)) {
18397c478bd9Sstevel@tonic-gate 		if (kcage_cageout_ready) {
18407c478bd9Sstevel@tonic-gate 			cv_signal(&kcage_cageout_cv);
18417c478bd9Sstevel@tonic-gate 		} else if (kcage_freemem < kcage_minfree || kcage_needfree) {
18427c478bd9Sstevel@tonic-gate 			/*
18437c478bd9Sstevel@tonic-gate 			 * Available cage memory is really low. Time to
18447c478bd9Sstevel@tonic-gate 			 * start expanding the cage. However, the
18457c478bd9Sstevel@tonic-gate 			 * kernel cage thread is not yet ready to
18467c478bd9Sstevel@tonic-gate 			 * do the work. Use *this* thread, which is
18477c478bd9Sstevel@tonic-gate 			 * most likely to be t0, to do the work.
18487c478bd9Sstevel@tonic-gate 			 */
18497c478bd9Sstevel@tonic-gate 			KCAGE_STAT_INCR(kcw_expandearly);
18507c478bd9Sstevel@tonic-gate 			(void) kcage_expand();
18517c478bd9Sstevel@tonic-gate 			KCAGE_STAT_INC_SCAN_INDEX;
18527c478bd9Sstevel@tonic-gate 		}
18537c478bd9Sstevel@tonic-gate 
18547c478bd9Sstevel@tonic-gate 		mutex_exit(&kcage_cageout_mutex);
18557c478bd9Sstevel@tonic-gate 	}
18567c478bd9Sstevel@tonic-gate 	/* else, kernel cage thread is already running */
18577c478bd9Sstevel@tonic-gate }
18587c478bd9Sstevel@tonic-gate 
18597c478bd9Sstevel@tonic-gate void
kcage_tick()18607c478bd9Sstevel@tonic-gate kcage_tick()
18617c478bd9Sstevel@tonic-gate {
18627c478bd9Sstevel@tonic-gate 	/*
18637c478bd9Sstevel@tonic-gate 	 * Once per second we wake up all the threads throttled
18647c478bd9Sstevel@tonic-gate 	 * waiting for cage memory, in case we've become stuck
18657c478bd9Sstevel@tonic-gate 	 * and haven't made forward progress expanding the cage.
18667c478bd9Sstevel@tonic-gate 	 */
18677c478bd9Sstevel@tonic-gate 	if (kcage_on && kcage_cageout_ready)
18687c478bd9Sstevel@tonic-gate 		cv_broadcast(&kcage_throttle_cv);
18697c478bd9Sstevel@tonic-gate }
1870