xref: /titanic_52/usr/src/uts/common/vm/vm_seg.c (revision d0fa49b78d1f40d84ec76c363cdc38cf128511dd)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 /*
30  * University Copyright- Copyright (c) 1982, 1986, 1988
31  * The Regents of the University of California
32  * All Rights Reserved
33  *
34  * University Acknowledgment- Portions of this document are derived from
35  * software developed by the University of California, Berkeley, and its
36  * contributors.
37  */
38 
39 #pragma ident	"%Z%%M%	%I%	%E% SMI"
40 
41 /*
42  * VM - segment management.
43  */
44 
45 #include <sys/types.h>
46 #include <sys/inttypes.h>
47 #include <sys/t_lock.h>
48 #include <sys/param.h>
49 #include <sys/systm.h>
50 #include <sys/kmem.h>
51 #include <sys/sysmacros.h>
52 #include <sys/vmsystm.h>
53 #include <sys/tuneable.h>
54 #include <sys/debug.h>
55 #include <sys/fs/swapnode.h>
56 #include <sys/cmn_err.h>
57 #include <sys/callb.h>
58 #include <sys/mem_config.h>
59 #include <sys/mman.h>
60 
61 #include <vm/hat.h>
62 #include <vm/as.h>
63 #include <vm/seg.h>
64 #include <vm/seg_kmem.h>
65 #include <vm/seg_spt.h>
66 #include <vm/seg_vn.h>
67 #include <vm/anon.h>
68 
/*
 * kstats for segment advise
 *
 * segadvstat holds two named counters (MADV_FREE_hit and MADV_FREE_miss).
 * segadvstat_ptr and segadvstat_ndata describe the same storage as a flat
 * array of kstat_named_t entries together with its element count.
 */
segadvstat_t segadvstat = {
	{ "MADV_FREE_hit",	KSTAT_DATA_ULONG },
	{ "MADV_FREE_miss",	KSTAT_DATA_ULONG },
};

/* segadvstat viewed as an array of kstat_named_t entries */
kstat_named_t *segadvstat_ptr = (kstat_named_t *)&segadvstat;
/* number of kstat_named_t sized entries in segadvstat */
uint_t segadvstat_ndata = sizeof (segadvstat) / sizeof (kstat_named_t);
79 
/*
 * entry in the segment page cache
 *
 * p_hnext/p_hprev must stay the first two members: hash chains are circular
 * and the bucket header (struct seg_phash / struct seg_phash_wired) is cast
 * to struct seg_pcache * to act as the list sentinel.
 *
 * Once an entry has been unlinked from its hash chain by
 * seg_plookup_checkdup(), p_hprev is reused as the link of a singly linked
 * list of entries awaiting their reclaim callback.
 */
struct seg_pcache {
	struct seg_pcache	*p_hnext;	/* list for hashed blocks */
	struct seg_pcache	*p_hprev;
	pcache_link_t		p_plink;	/* per segment/amp list */
	void 			*p_htag0;	/* segment/amp pointer */
	caddr_t			p_addr;		/* base address/anon_idx */
	size_t			p_len;		/* total bytes */
	size_t			p_wlen;		/* writable bytes at p_addr */
	struct page		**p_pp;		/* pp shadow list */
	seg_preclaim_cbfunc_t	p_callback;	/* reclaim callback function */
	clock_t			p_lbolt;	/* lbolt from last use */
	struct seg_phash	*p_hashp;	/* our pcache hash bucket */
	uint_t			p_active;	/* active count */
	uchar_t			p_write;	/* true if S_WRITE */
	uchar_t			p_ref;		/* reference byte */
	ushort_t		p_flags;	/* bit flags */
};
100 
/*
 * Hash bucket for non wired entries.  The bucket itself is the sentinel of
 * its circular chain: an empty bucket has p_hnext and p_hprev pointing back
 * at the bucket (cast to struct seg_pcache *).  p_halink[] holds one linkage
 * per active bucket list; a NULL p_lnext/p_lprev pair means the bucket is
 * not on that list.
 */
struct seg_phash {
	struct seg_pcache	*p_hnext;	/* list for hashed blocks */
	struct seg_pcache	*p_hprev;
	kmutex_t		p_hmutex;	/* protects hash bucket */
	pcache_link_t		p_halink[2];	/* active bucket linkages */
};
107 
/*
 * Hash bucket for wired entries.  It has no active list linkages.
 * P_HASHBP() casts a pointer to this structure to struct seg_phash *, so
 * the members below must keep the same order and types as the leading
 * members of struct seg_phash.
 */
struct seg_phash_wired {
	struct seg_pcache	*p_hnext;	/* list for hashed blocks */
	struct seg_pcache	*p_hprev;
	kmutex_t		p_hmutex;	/* protects hash bucket */
};
113 
/*
 * A parameter to control a maximum number of bytes that can be
 * purged from pcache at a time.
 */
#define	P_MAX_APURGE_BYTES	(1024 * 1024 * 1024)	/* 1 Gbyte */

/*
 * log2(fraction of pcache to reclaim at a time).
 * A shift of 5 corresponds to 1/32 of the cached window.
 */
#define	P_SHRINK_SHFT		(5)

/*
 * The following variables can be tuned via /etc/system.
 *
 * NOTE(review): the code that copies these values into the pctrl
 * structures is not in this part of the file; a value of 0 presumably
 * means "choose a default at initialization" -- confirm against the init
 * routine.
 * NOTE(review): segpcache_maxapurge_bytes is declared pgcnt_t although it
 * holds a byte count.
 */

int	segpcache_enabled = 1;		/* if 1, shadow lists are cached */
pgcnt_t	segpcache_maxwindow = 0;	/* max # of pages that can be cached */
ulong_t	segpcache_hashsize_win = 0;	/* # of non wired buckets */
ulong_t	segpcache_hashsize_wired = 0;	/* # of wired buckets */
int	segpcache_reap_sec = 1;		/* reap check rate in secs */
clock_t	segpcache_reap_ticks = 0;	/* reap interval in ticks */
int	segpcache_pcp_maxage_sec = 1;	/* pcp max age in secs */
clock_t	segpcache_pcp_maxage_ticks = 0;	/* pcp max age in ticks */
int	segpcache_shrink_shift = P_SHRINK_SHFT;	/* log2 reap fraction */
pgcnt_t	segpcache_maxapurge_bytes = P_MAX_APURGE_BYTES;	/* max purge bytes */
139 
static kmutex_t seg_pcache_mtx;	/* protects seg_pdisabled counter */
static kmutex_t seg_pasync_mtx;	/* protects async thread scheduling */
/* NOTE(review): presumably waited on under seg_pasync_mtx -- confirm */
static kcondvar_t seg_pasync_cv;

/* Request 64 byte alignment for each of the control structures below. */
#pragma align 64(pctrl1)
#pragma align 64(pctrl2)
#pragma align 64(pctrl3)
147 
/*
 * Keep frequently used variables together in one cache line.
 *
 * pctrl1 holds the hash table description and the disable counter.  The
 * members are accessed through the seg_p* macros defined below.
 * With 8 byte pointers and longs the members occupy 56 bytes, so the single
 * pad word under _LP64 brings the structure to 64 bytes.
 */
static struct p_ctrl1 {
	uint_t p_disabled;		/* if not 0, caching temporarily off */
	pgcnt_t p_maxwin;		/* max # of pages that can be cached */
	size_t p_hashwin_sz;		/* # of non wired buckets */
	struct seg_phash *p_htabwin;	/* hash table for non wired entries */
	size_t p_hashwired_sz;		/* # of wired buckets */
	struct seg_phash_wired *p_htabwired; /* hash table for wired entries */
	kmem_cache_t *p_kmcache;	/* kmem cache for seg_pcache structs */
#ifdef _LP64
	ulong_t pad[1];
#endif /* _LP64 */
} pctrl1;
163 
/*
 * pctrl2 groups the page accounting counters and the heads of the two
 * active bucket lists with the mutex that protects them.
 * p_ahcur (0 or 1) selects the list used for bucket insertion/deletion;
 * the other list belongs to the asynchronous purge thread while it runs.
 */
static struct p_ctrl2 {
	kmutex_t p_mem_mtx;	/* protects window counter and p_halinks */
	pgcnt_t  p_locked_win;	/* # pages from window */
	pgcnt_t  p_locked;	/* # of pages cached by pagelock */
	uchar_t	 p_ahcur;	/* current active links for insert/delete */
	uchar_t  p_athr_on;	/* async reclaim thread is running. */
	pcache_link_t p_ahhead[2]; /* active buckets linkages */
} pctrl2;
172 
/*
 * pctrl3 holds the parameters and statistics of asynchronous purging.
 * With 8 byte longs the members (including alignment after the int) occupy
 * 40 bytes, so the three pad words under _LP64 bring the structure to
 * 64 bytes.
 */
static struct p_ctrl3 {
	clock_t	p_pcp_maxage;		/* max pcp age in ticks */
	ulong_t	p_athr_empty_ahb;	/* athread walk stats */
	ulong_t p_athr_full_ahb;	/* athread walk stats */
	pgcnt_t	p_maxapurge_npages;	/* max pages to purge at a time */
	int	p_shrink_shft;		/* reap shift factor */
#ifdef _LP64
	ulong_t pad[3];
#endif /* _LP64 */
} pctrl3;
183 
/* Short names for the members of pctrl1 (hash tables, disable counter). */
#define	seg_pdisabled			pctrl1.p_disabled
#define	seg_pmaxwindow			pctrl1.p_maxwin
#define	seg_phashsize_win		pctrl1.p_hashwin_sz
#define	seg_phashtab_win		pctrl1.p_htabwin
#define	seg_phashsize_wired		pctrl1.p_hashwired_sz
#define	seg_phashtab_wired		pctrl1.p_htabwired
#define	seg_pkmcache			pctrl1.p_kmcache
/* Short names for the members of pctrl2 (accounting, active lists). */
#define	seg_pmem_mtx			pctrl2.p_mem_mtx
#define	seg_plocked_window		pctrl2.p_locked_win
#define	seg_plocked			pctrl2.p_locked
#define	seg_pahcur			pctrl2.p_ahcur
#define	seg_pathr_on			pctrl2.p_athr_on
#define	seg_pahhead			pctrl2.p_ahhead
/* Short names for the members of pctrl3 (async purge parameters/stats). */
#define	seg_pmax_pcpage			pctrl3.p_pcp_maxage
#define	seg_pathr_empty_ahb		pctrl3.p_athr_empty_ahb
#define	seg_pathr_full_ahb		pctrl3.p_athr_full_ahb
#define	seg_pshrink_shift		pctrl3.p_shrink_shft
#define	seg_pmaxapurge_npages		pctrl3.p_maxapurge_npages

/*
 * Masks used to reduce a hash value to a bucket index.
 * NOTE(review): masking with (size - 1) only covers every bucket when the
 * table sizes are powers of two; presumably the initialization code
 * guarantees that -- confirm.
 */
#define	P_HASHWIN_MASK			(seg_phashsize_win - 1)
#define	P_HASHWIRED_MASK		(seg_phashsize_wired - 1)
/* right shift applied to htag0 when hashing wired entries */
#define	P_BASESHIFT			(6)
206 
/* NOTE(review): presumably the asynchronous purge thread -- confirm */
kthread_t *seg_pasync_thr;

extern struct seg_ops segvn_ops;
extern struct seg_ops segspt_shmops;

/* Non zero when the SEGP_FORCE_WIRED bit is set in flags. */
#define	IS_PFLAGS_WIRED(flags) ((flags) & SEGP_FORCE_WIRED)
/* Non zero when the pcp entry was inserted with SEGP_FORCE_WIRED. */
#define	IS_PCP_WIRED(pcp) IS_PFLAGS_WIRED((pcp)->p_flags)

/* Difference between the current lbolt and t, as an unsigned value. */
#define	LBOLT_DELTA(t)	((ulong_t)(lbolt - (t)))

/* Ticks since the pcp entry's p_lbolt stamp was last updated. */
#define	PCP_AGE(pcp)	LBOLT_DELTA((pcp)->p_lbolt)
218 
/*
 * Return the hash bucket for a pagelock cache request.
 * htag0 argument can be a seg or amp pointer.
 *
 * Wired requests (SEGP_FORCE_WIRED set in flags) hash on htag0 only and use
 * the wired table; the wired bucket is returned as a struct seg_phash *.
 * Non wired requests hash on htag0 xor'ed with the address shifted right by
 * either the shift encoded in the upper 16 bits of flags (when SEGP_PSHIFT
 * is set) or the page shift of the segment's page size code.
 *
 * Every macro argument is parenthesized so that callers may pass arbitrary
 * expressions.
 */
#define	P_HASHBP(seg, htag0, addr, flags)				\
	(IS_PFLAGS_WIRED((flags)) ?					\
	    ((struct seg_phash *)&seg_phashtab_wired[P_HASHWIRED_MASK &	\
	    ((uintptr_t)(htag0) >> P_BASESHIFT)]) :			\
	    (&seg_phashtab_win[P_HASHWIN_MASK &				\
	    (((uintptr_t)(htag0) >> 3) ^				\
	    ((uintptr_t)(addr) >> (((flags) & SEGP_PSHIFT) ?		\
	    ((flags) >> 16) : page_get_shift((seg)->s_szc))))]))
230 
/*
 * Entry matching predicates.  htag0 argument can be a seg or amp pointer.
 *
 * P_MATCH is true when pcp carries the same tag, starts at addr and covers
 * at least len bytes.
 */
#define	P_MATCH(pcp, htag0, addr, len)					\
	(((pcp)->p_htag0 == (htag0)) &&					\
	((pcp)->p_addr == (addr)) &&					\
	((pcp)->p_len >= (len)))

/*
 * P_MATCH_PP additionally requires that pcp holds exactly the shadow
 * list pp.
 */
#define	P_MATCH_PP(pcp, htag0, addr, len, pp)				\
	(((pcp)->p_pp == (pp)) && P_MATCH(pcp, htag0, addr, len))
244 
/*
 * Convert a pointer to the p_plink member back to the seg_pcache entry
 * that contains it.
 */
#define	plink2pcache(pl)	((struct seg_pcache *)((uintptr_t)(pl) - \
    offsetof(struct seg_pcache, p_plink)))

/*
 * Convert a pointer to the p_halink[l] member back to the seg_phash bucket
 * that contains it; l is the index of the active list the link belongs to.
 */
#define	hlink2phash(hl, l)	((struct seg_phash *)((uintptr_t)(hl) -	\
    offsetof(struct seg_phash, p_halink[l])))
250 
251 /*
252  * seg_padd_abuck()/seg_premove_abuck() link and unlink hash buckets from
253  * active hash bucket lists. We maintain active bucket lists to reduce the
254  * overhead of finding active buckets during asynchronous purging since there
255  * can be 10s of millions of buckets on a large system but only a small subset
256  * of them in actual use.
257  *
258  * There're 2 active bucket lists. Current active list (as per seg_pahcur) is
259  * used by seg_pinsert()/seg_pinactive()/seg_ppurge() to add and delete
260  * buckets. The other list is used by asynchronous purge thread. This allows
261  * the purge thread to walk its active list without holding seg_pmem_mtx for a
 * long time. When the asynchronous thread is done with its list it switches
 * to the current active list and makes the list it just finished processing
 * the new current active list.
265  *
266  * seg_padd_abuck() only adds the bucket to current list if the bucket is not
267  * yet on any list.  seg_premove_abuck() may remove the bucket from either
268  * list. If the bucket is on current list it will be always removed. Otherwise
269  * the bucket is only removed if asynchronous purge thread is not currently
270  * running or seg_premove_abuck() is called by asynchronous purge thread
271  * itself. A given bucket can only be on one of active lists at a time. These
272  * routines should be called with per bucket lock held.  The routines use
273  * seg_pmem_mtx to protect list updates. seg_padd_abuck() must be called after
274  * the first entry is added to the bucket chain and seg_premove_abuck() must
275  * be called after the last pcp entry is deleted from its chain. Per bucket
276  * lock should be held by the callers.  This avoids a potential race condition
277  * when seg_premove_abuck() removes a bucket after pcp entries are added to
278  * its list after the caller checked that the bucket has no entries. (this
279  * race would cause a loss of an active bucket from the active lists).
280  *
281  * Both lists are circular doubly linked lists anchored at seg_pahhead heads.
282  * New entries are added to the end of the list since LRU is used as the
283  * purging policy.
284  */
285 static void
286 seg_padd_abuck(struct seg_phash *hp)
287 {
288 	int lix;
289 
290 	ASSERT(MUTEX_HELD(&hp->p_hmutex));
291 	ASSERT((struct seg_phash *)hp->p_hnext != hp);
292 	ASSERT((struct seg_phash *)hp->p_hprev != hp);
293 	ASSERT(hp->p_hnext == hp->p_hprev);
294 	ASSERT(!IS_PCP_WIRED(hp->p_hnext));
295 	ASSERT(hp->p_hnext->p_hnext == (struct seg_pcache *)hp);
296 	ASSERT(hp->p_hprev->p_hprev == (struct seg_pcache *)hp);
297 	ASSERT(hp >= seg_phashtab_win &&
298 	    hp < &seg_phashtab_win[seg_phashsize_win]);
299 
300 	/*
301 	 * This bucket can already be on one of active lists
302 	 * since seg_premove_abuck() may have failed to remove it
303 	 * before.
304 	 */
305 	mutex_enter(&seg_pmem_mtx);
306 	lix = seg_pahcur;
307 	ASSERT(lix >= 0 && lix <= 1);
308 	if (hp->p_halink[lix].p_lnext != NULL) {
309 		ASSERT(hp->p_halink[lix].p_lprev != NULL);
310 		ASSERT(hp->p_halink[!lix].p_lnext == NULL);
311 		ASSERT(hp->p_halink[!lix].p_lprev == NULL);
312 		mutex_exit(&seg_pmem_mtx);
313 		return;
314 	}
315 	ASSERT(hp->p_halink[lix].p_lprev == NULL);
316 
317 	/*
318 	 * If this bucket is still on list !lix async thread can't yet remove
319 	 * it since we hold here per bucket lock. In this case just return
320 	 * since async thread will eventually find and process this bucket.
321 	 */
322 	if (hp->p_halink[!lix].p_lnext != NULL) {
323 		ASSERT(hp->p_halink[!lix].p_lprev != NULL);
324 		mutex_exit(&seg_pmem_mtx);
325 		return;
326 	}
327 	ASSERT(hp->p_halink[!lix].p_lprev == NULL);
328 	/*
329 	 * This bucket is not on any active bucket list yet.
330 	 * Add the bucket to the tail of current active list.
331 	 */
332 	hp->p_halink[lix].p_lnext = &seg_pahhead[lix];
333 	hp->p_halink[lix].p_lprev = seg_pahhead[lix].p_lprev;
334 	seg_pahhead[lix].p_lprev->p_lnext = &hp->p_halink[lix];
335 	seg_pahhead[lix].p_lprev = &hp->p_halink[lix];
336 	mutex_exit(&seg_pmem_mtx);
337 }
338 
339 static void
340 seg_premove_abuck(struct seg_phash *hp, int athr)
341 {
342 	int lix;
343 
344 	ASSERT(MUTEX_HELD(&hp->p_hmutex));
345 	ASSERT((struct seg_phash *)hp->p_hnext == hp);
346 	ASSERT((struct seg_phash *)hp->p_hprev == hp);
347 	ASSERT(hp >= seg_phashtab_win &&
348 	    hp < &seg_phashtab_win[seg_phashsize_win]);
349 
350 	if (athr) {
351 		ASSERT(seg_pathr_on);
352 		ASSERT(seg_pahcur <= 1);
353 		/*
354 		 * We are called by asynchronous thread that found this bucket
355 		 * on not currently active (i.e. !seg_pahcur) list. Remove it
356 		 * from there.  Per bucket lock we are holding makes sure
357 		 * seg_pinsert() can't sneak in and add pcp entries to this
358 		 * bucket right before we remove the bucket from its list.
359 		 */
360 		lix = !seg_pahcur;
361 		ASSERT(hp->p_halink[lix].p_lnext != NULL);
362 		ASSERT(hp->p_halink[lix].p_lprev != NULL);
363 		ASSERT(hp->p_halink[!lix].p_lnext == NULL);
364 		ASSERT(hp->p_halink[!lix].p_lprev == NULL);
365 		hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
366 		hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
367 		hp->p_halink[lix].p_lnext = NULL;
368 		hp->p_halink[lix].p_lprev = NULL;
369 		return;
370 	}
371 
372 	mutex_enter(&seg_pmem_mtx);
373 	lix = seg_pahcur;
374 	ASSERT(lix >= 0 && lix <= 1);
375 
376 	/*
377 	 * If the bucket is on currently active list just remove it from
378 	 * there.
379 	 */
380 	if (hp->p_halink[lix].p_lnext != NULL) {
381 		ASSERT(hp->p_halink[lix].p_lprev != NULL);
382 		ASSERT(hp->p_halink[!lix].p_lnext == NULL);
383 		ASSERT(hp->p_halink[!lix].p_lprev == NULL);
384 		hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
385 		hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
386 		hp->p_halink[lix].p_lnext = NULL;
387 		hp->p_halink[lix].p_lprev = NULL;
388 		mutex_exit(&seg_pmem_mtx);
389 		return;
390 	}
391 	ASSERT(hp->p_halink[lix].p_lprev == NULL);
392 
393 	/*
394 	 * If asynchronous thread is not running we can remove the bucket from
395 	 * not currently active list. The bucket must be on this list since we
396 	 * already checked that it's not on the other list and the bucket from
397 	 * which we just deleted the last pcp entry must be still on one of the
398 	 * active bucket lists.
399 	 */
400 	lix = !lix;
401 	ASSERT(hp->p_halink[lix].p_lnext != NULL);
402 	ASSERT(hp->p_halink[lix].p_lprev != NULL);
403 
404 	if (!seg_pathr_on) {
405 		hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
406 		hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
407 		hp->p_halink[lix].p_lnext = NULL;
408 		hp->p_halink[lix].p_lprev = NULL;
409 	}
410 	mutex_exit(&seg_pmem_mtx);
411 }
412 
413 /*
414  * Check if bucket pointed by hp already has a pcp entry that matches request
415  * htag0, addr and len. Set *found to 1 if match is found and to 0 otherwise.
416  * Also delete matching entries that cover smaller address range but start
417  * at the same address as addr argument. Return the list of deleted entries if
418  * any. This is an internal helper function called from seg_pinsert() only
419  * for non wired shadow lists. The caller already holds a per seg/amp list
420  * lock.
421  */
422 static struct seg_pcache *
423 seg_plookup_checkdup(struct seg_phash *hp, void *htag0,
424     caddr_t addr, size_t len, int *found)
425 {
426 	struct seg_pcache *pcp;
427 	struct seg_pcache *delcallb_list = NULL;
428 
429 	ASSERT(MUTEX_HELD(&hp->p_hmutex));
430 
431 	*found = 0;
432 	for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
433 	    pcp = pcp->p_hnext) {
434 		ASSERT(pcp->p_hashp == hp);
435 		if (pcp->p_htag0 == htag0 && pcp->p_addr == addr) {
436 			ASSERT(!IS_PCP_WIRED(pcp));
437 			if (pcp->p_len < len) {
438 				pcache_link_t *plinkp;
439 				if (pcp->p_active) {
440 					continue;
441 				}
442 				plinkp = &pcp->p_plink;
443 				plinkp->p_lprev->p_lnext = plinkp->p_lnext;
444 				plinkp->p_lnext->p_lprev = plinkp->p_lprev;
445 				pcp->p_hprev->p_hnext = pcp->p_hnext;
446 				pcp->p_hnext->p_hprev = pcp->p_hprev;
447 				pcp->p_hprev = delcallb_list;
448 				delcallb_list = pcp;
449 			} else {
450 				*found = 1;
451 				break;
452 			}
453 		}
454 	}
455 	return (delcallb_list);
456 }
457 
458 /*
459  * lookup an address range in pagelock cache. Return shadow list and bump up
460  * active count. If amp is not NULL use amp as a lookup tag otherwise use seg
461  * as a lookup tag.
462  */
463 struct page **
464 seg_plookup(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
465     enum seg_rw rw, uint_t flags)
466 {
467 	struct seg_pcache *pcp;
468 	struct seg_phash *hp;
469 	void *htag0;
470 
471 	ASSERT(seg != NULL);
472 	ASSERT(rw == S_READ || rw == S_WRITE);
473 
474 	/*
475 	 * Skip pagelock cache, while DR is in progress or
476 	 * seg_pcache is off.
477 	 */
478 	if (seg_pdisabled) {
479 		return (NULL);
480 	}
481 	ASSERT(seg_phashsize_win != 0);
482 
483 	htag0 = (amp == NULL ? (void *)seg : (void *)amp);
484 	hp = P_HASHBP(seg, htag0, addr, flags);
485 	mutex_enter(&hp->p_hmutex);
486 	for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
487 	    pcp = pcp->p_hnext) {
488 		ASSERT(pcp->p_hashp == hp);
489 		if (P_MATCH(pcp, htag0, addr, len)) {
490 			ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
491 			/*
492 			 * If this request wants to write pages
493 			 * but write permissions starting from
494 			 * addr don't cover the entire length len
495 			 * return lookup failure back to the caller.
496 			 * It will check protections and fail this
497 			 * pagelock operation with EACCESS error.
498 			 */
499 			if (rw == S_WRITE && pcp->p_wlen < len) {
500 				break;
501 			}
502 			if (pcp->p_active == UINT_MAX) {
503 				break;
504 			}
505 			pcp->p_active++;
506 			if (rw == S_WRITE && !pcp->p_write) {
507 				pcp->p_write = 1;
508 			}
509 			mutex_exit(&hp->p_hmutex);
510 			return (pcp->p_pp);
511 		}
512 	}
513 	mutex_exit(&hp->p_hmutex);
514 	return (NULL);
515 }
516 
/*
 * mark address range inactive. If the cache is off or the address range is
 * not in the cache or another shadow list that covers bigger range is found
 * we call the segment driver to reclaim the pages. Otherwise just decrement
 * active count and set ref bit.  If amp is not NULL use amp as a lookup tag
 * otherwise use seg as a lookup tag.
 *
 * pp is the shadow list the caller is done with; the caller's own entry is
 * recognized by matching pp in addition to tag, address and length.
 * callback is invoked (with the hash and list locks dropped) whenever the
 * pages are not left in the cache.
 *
 * Lock order: per seg/amp list lock (pmtx) first, then the hash bucket lock.
 * When pmtx turns out to be needed while the bucket lock is already held it
 * is taken with mutex_tryenter(); on failure both locks are retaken in the
 * correct order and the chain is searched again with keep set.
 */
void
seg_pinactive(struct seg *seg, struct anon_map *amp, caddr_t addr,
    size_t len, struct page **pp, enum seg_rw rw, uint_t flags,
    seg_preclaim_cbfunc_t callback)
{
	struct seg_pcache *pcp;
	struct seg_phash *hp;
	kmutex_t *pmtx = NULL;	/* non NULL while per seg/amp lock is held */
	pcache_link_t *pheadp;	/* only set (and only used) with pmtx */
	void *htag0;
	pgcnt_t npages = 0;	/* pages to uncharge; set when entry is freed */
	int keep = 0;		/* set when the chain is being re-searched */

	ASSERT(seg != NULL);
	ASSERT(rw == S_READ || rw == S_WRITE);

	htag0 = (amp == NULL ? (void *)seg : (void *)amp);

	/*
	 * Skip lookup if pcache is not configured.
	 */
	if (seg_phashsize_win == 0) {
		goto out;
	}

	/*
	 * Grab per seg/amp lock before hash lock if we are going to remove
	 * inactive entry from pcache.
	 */
	if (!IS_PFLAGS_WIRED(flags) && seg_pdisabled) {
		if (amp == NULL) {
			pheadp = &seg->s_phead;
			pmtx = &seg->s_pmtx;
		} else {
			pheadp = &amp->a_phead;
			pmtx = &amp->a_pmtx;
		}
		mutex_enter(pmtx);
	}

	hp = P_HASHBP(seg, htag0, addr, flags);
	mutex_enter(&hp->p_hmutex);
again:
	for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
	    pcp = pcp->p_hnext) {
		ASSERT(pcp->p_hashp == hp);
		if (P_MATCH_PP(pcp, htag0, addr, len, pp)) {
			/* this is the entry that owns the caller's pp */
			ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
			ASSERT(pcp->p_active);
			if (keep) {
				/*
				 * Don't remove this pcp entry
				 * if we didn't find duplicate
				 * shadow lists on second search.
				 * Somebody removed those duplicates
				 * since we dropped hash lock after first
				 * search.
				 */
				ASSERT(pmtx != NULL);
				ASSERT(!IS_PFLAGS_WIRED(flags));
				mutex_exit(pmtx);
				pmtx = NULL;
			}
			pcp->p_active--;
			if (pcp->p_active == 0 && (pmtx != NULL ||
			    (seg_pdisabled && IS_PFLAGS_WIRED(flags)))) {

				/*
				 * This entry is no longer active.  Remove it
				 * now either because pcaching is temporarily
				 * disabled or there're other pcp entries that
				 * can match this pagelock request (i.e. this
				 * entry is a duplicate).
				 */

				ASSERT(callback == pcp->p_callback);
				if (pmtx != NULL) {
					/* unlink from per seg/amp list */
					pcache_link_t *plinkp = &pcp->p_plink;
					ASSERT(!IS_PCP_WIRED(pcp));
					ASSERT(pheadp->p_lnext != pheadp);
					ASSERT(pheadp->p_lprev != pheadp);
					plinkp->p_lprev->p_lnext =
					    plinkp->p_lnext;
					plinkp->p_lnext->p_lprev =
					    plinkp->p_lprev;
				}
				/* unlink from the hash chain */
				pcp->p_hprev->p_hnext = pcp->p_hnext;
				pcp->p_hnext->p_hprev = pcp->p_hprev;
				if (!IS_PCP_WIRED(pcp) &&
				    hp->p_hnext == (struct seg_pcache *)hp) {
					/*
					 * We removed the last entry from this
					 * bucket.  Now remove the bucket from
					 * its active list.
					 */
					seg_premove_abuck(hp, 0);
				}
				mutex_exit(&hp->p_hmutex);
				if (pmtx != NULL) {
					mutex_exit(pmtx);
				}
				/*
				 * The callback below reclaims the whole
				 * cached range, and as a write if any user
				 * of this entry asked for write access.
				 */
				len = pcp->p_len;
				npages = btop(len);
				if (rw != S_WRITE && pcp->p_write) {
					rw = S_WRITE;
				}
				kmem_cache_free(seg_pkmcache, pcp);
				goto out;
			} else {
				/*
				 * We found a matching pcp entry but will not
				 * free it right away even if it's no longer
				 * active.
				 */
				if (!pcp->p_active && !IS_PCP_WIRED(pcp)) {
					/*
					 * Set the reference bit and mark the
					 * time of last access to this pcp
					 * so that asynchronous thread doesn't
					 * free it immediately since
					 * it may be reactivated very soon.
					 */
					pcp->p_lbolt = lbolt;
					pcp->p_ref = 1;
				}
				mutex_exit(&hp->p_hmutex);
				if (pmtx != NULL) {
					mutex_exit(pmtx);
				}
				return;
			}
		} else if (!IS_PFLAGS_WIRED(flags) &&
		    P_MATCH(pcp, htag0, addr, len)) {
			/*
			 * This is a duplicate pcp entry.  This situation may
			 * happen if a bigger shadow list that covers our
			 * range was added while our entry was still active.
			 * Now we can free our pcp entry if it becomes
			 * inactive.
			 */
			if (!pcp->p_active) {
				/*
				 * Mark this entry as referenced just in case
				 * we'll free our own pcp entry soon.
				 */
				pcp->p_lbolt = lbolt;
				pcp->p_ref = 1;
			}
			if (pmtx != NULL) {
				/*
				 * we are already holding pmtx and found a
				 * duplicate.  Don't keep our own pcp entry.
				 */
				keep = 0;
				continue;
			}
			/*
			 * We have to use mutex_tryenter to attempt to lock
			 * seg/amp list lock since we already hold hash lock
			 * and seg/amp list lock is above hash lock in lock
			 * order.  If mutex_tryenter fails drop hash lock and
			 * retake both locks in correct order and research
			 * this hash chain.
			 */
			ASSERT(keep == 0);
			if (amp == NULL) {
				pheadp = &seg->s_phead;
				pmtx = &seg->s_pmtx;
			} else {
				pheadp = &amp->a_phead;
				pmtx = &amp->a_pmtx;
			}
			if (!mutex_tryenter(pmtx)) {
				mutex_exit(&hp->p_hmutex);
				mutex_enter(pmtx);
				mutex_enter(&hp->p_hmutex);
				/*
				 * If we don't find bigger shadow list on
				 * second search (it may happen since we
				 * dropped bucket lock) keep the entry that
				 * matches our own shadow list.
				 */
				keep = 1;
				goto again;
			}
		}
	}
	/* the caller's shadow list is not in the cache */
	mutex_exit(&hp->p_hmutex);
	if (pmtx != NULL) {
		mutex_exit(pmtx);
	}
out:
	/*
	 * Hand the pages back to the segment driver.  npages is non zero
	 * only when a cache entry was freed above, in which case the page
	 * accounting is reduced by the size of that entry.
	 */
	(*callback)(htag0, addr, len, pp, rw, 0);
	if (npages) {
		mutex_enter(&seg_pmem_mtx);
		ASSERT(seg_plocked >= npages);
		seg_plocked -= npages;
		if (!IS_PFLAGS_WIRED(flags)) {
			ASSERT(seg_plocked_window >= npages);
			seg_plocked_window -= npages;
		}
		mutex_exit(&seg_pmem_mtx);
	}

}
729 
730 #ifdef DEBUG
731 static uint32_t p_insert_chk_mtbf = 0;
732 #endif
733 
734 /*
735  * The seg_pinsert_check() is used by segment drivers to predict whether
736  * a call to seg_pinsert will fail and thereby avoid wasteful pre-processing.
737  */
738 /*ARGSUSED*/
739 int
740 seg_pinsert_check(struct seg *seg, struct anon_map *amp, caddr_t addr,
741     size_t len, uint_t flags)
742 {
743 	ASSERT(seg != NULL);
744 
745 #ifdef DEBUG
746 	if (p_insert_chk_mtbf && !(gethrtime() % p_insert_chk_mtbf)) {
747 		return (SEGP_FAIL);
748 	}
749 #endif
750 
751 	if (seg_pdisabled) {
752 		return (SEGP_FAIL);
753 	}
754 	ASSERT(seg_phashsize_win != 0);
755 
756 	if (IS_PFLAGS_WIRED(flags)) {
757 		return (SEGP_SUCCESS);
758 	}
759 
760 	if (seg_plocked_window + btop(len) > seg_pmaxwindow) {
761 		return (SEGP_FAIL);
762 	}
763 
764 	if (freemem < desfree) {
765 		return (SEGP_FAIL);
766 	}
767 
768 	return (SEGP_SUCCESS);
769 }
770 
771 #ifdef DEBUG
772 static uint32_t p_insert_mtbf = 0;
773 #endif
774 
775 /*
776  * Insert address range with shadow list into pagelock cache if there's no
777  * shadow list already cached for this address range. If the cache is off or
778  * caching is temporarily disabled or the allowed 'window' is exceeded return
779  * SEGP_FAIL. Otherwise return SEGP_SUCCESS.
780  *
781  * For non wired shadow lists (segvn case) include address in the hashing
782  * function to avoid linking all the entries from the same segment or amp on
783  * the same bucket.  amp is used instead of seg if amp is not NULL. Non wired
784  * pcache entries are also linked on a per segment/amp list so that all
785  * entries can be found quickly during seg/amp purge without walking the
786  * entire pcache hash table.  For wired shadow lists (segspt case) we
787  * don't use address hashing and per segment linking because the caller
788  * currently inserts only one entry per segment that covers the entire
789  * segment. If we used per segment linking even for segspt it would complicate
790  * seg_ppurge_wiredpp() locking.
791  *
792  * Both hash bucket and per seg/amp locks need to be held before adding a non
793  * wired entry to hash and per seg/amp lists. per seg/amp lock should be taken
794  * first.
795  *
796  * This function will also remove from pcache old inactive shadow lists that
797  * overlap with this request but cover smaller range for the same start
798  * address.
799  */
800 int
801 seg_pinsert(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
802     size_t wlen, struct page **pp, enum seg_rw rw, uint_t flags,
803     seg_preclaim_cbfunc_t callback)
804 {
805 	struct seg_pcache *pcp;
806 	struct seg_phash *hp;
807 	pgcnt_t npages;
808 	pcache_link_t *pheadp;
809 	kmutex_t *pmtx;
810 	struct seg_pcache *delcallb_list = NULL;
811 
812 	ASSERT(seg != NULL);
813 	ASSERT(rw == S_READ || rw == S_WRITE);
814 	ASSERT(rw == S_READ || wlen == len);
815 	ASSERT(rw == S_WRITE || wlen <= len);
816 	ASSERT(amp == NULL || wlen == len);
817 
818 #ifdef DEBUG
819 	if (p_insert_mtbf && !(gethrtime() % p_insert_mtbf)) {
820 		return (SEGP_FAIL);
821 	}
822 #endif
823 
824 	if (seg_pdisabled) {
825 		return (SEGP_FAIL);
826 	}
827 	ASSERT(seg_phashsize_win != 0);
828 
829 	ASSERT((len & PAGEOFFSET) == 0);
830 	npages = btop(len);
831 	mutex_enter(&seg_pmem_mtx);
832 	if (!IS_PFLAGS_WIRED(flags)) {
833 		if (seg_plocked_window + npages > seg_pmaxwindow) {
834 			mutex_exit(&seg_pmem_mtx);
835 			return (SEGP_FAIL);
836 		}
837 		seg_plocked_window += npages;
838 	}
839 	seg_plocked += npages;
840 	mutex_exit(&seg_pmem_mtx);
841 
842 	pcp = kmem_cache_alloc(seg_pkmcache, KM_SLEEP);
843 	/*
844 	 * If amp is not NULL set htag0 to amp otherwise set it to seg.
845 	 */
846 	if (amp == NULL) {
847 		pcp->p_htag0 = (void *)seg;
848 		pcp->p_flags = flags & 0xffff;
849 	} else {
850 		pcp->p_htag0 = (void *)amp;
851 		pcp->p_flags = (flags & 0xffff) | SEGP_AMP;
852 	}
853 	pcp->p_addr = addr;
854 	pcp->p_len = len;
855 	pcp->p_wlen = wlen;
856 	pcp->p_pp = pp;
857 	pcp->p_write = (rw == S_WRITE);
858 	pcp->p_callback = callback;
859 	pcp->p_active = 1;
860 
861 	hp = P_HASHBP(seg, pcp->p_htag0, addr, flags);
862 	if (!IS_PFLAGS_WIRED(flags)) {
863 		int found;
864 		void *htag0;
865 		if (amp == NULL) {
866 			pheadp = &seg->s_phead;
867 			pmtx = &seg->s_pmtx;
868 			htag0 = (void *)seg;
869 		} else {
870 			pheadp = &amp->a_phead;
871 			pmtx = &amp->a_pmtx;
872 			htag0 = (void *)amp;
873 		}
874 		mutex_enter(pmtx);
875 		mutex_enter(&hp->p_hmutex);
876 		delcallb_list = seg_plookup_checkdup(hp, htag0, addr,
877 		    len, &found);
878 		if (found) {
879 			mutex_exit(&hp->p_hmutex);
880 			mutex_exit(pmtx);
881 			mutex_enter(&seg_pmem_mtx);
882 			seg_plocked -= npages;
883 			seg_plocked_window -= npages;
884 			mutex_exit(&seg_pmem_mtx);
885 			kmem_cache_free(seg_pkmcache, pcp);
886 			goto out;
887 		}
888 		pcp->p_plink.p_lnext = pheadp->p_lnext;
889 		pcp->p_plink.p_lprev = pheadp;
890 		pheadp->p_lnext->p_lprev = &pcp->p_plink;
891 		pheadp->p_lnext = &pcp->p_plink;
892 	} else {
893 		mutex_enter(&hp->p_hmutex);
894 	}
895 	pcp->p_hashp = hp;
896 	pcp->p_hnext = hp->p_hnext;
897 	pcp->p_hprev = (struct seg_pcache *)hp;
898 	hp->p_hnext->p_hprev = pcp;
899 	hp->p_hnext = pcp;
900 	if (!IS_PFLAGS_WIRED(flags) &&
901 	    hp->p_hprev == pcp) {
902 		seg_padd_abuck(hp);
903 	}
904 	mutex_exit(&hp->p_hmutex);
905 	if (!IS_PFLAGS_WIRED(flags)) {
906 		mutex_exit(pmtx);
907 	}
908 
909 out:
910 	npages = 0;
911 	while (delcallb_list != NULL) {
912 		pcp = delcallb_list;
913 		delcallb_list = pcp->p_hprev;
914 		ASSERT(!IS_PCP_WIRED(pcp) && !pcp->p_active);
915 		(void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
916 		    pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
917 		npages += btop(pcp->p_len);
918 		kmem_cache_free(seg_pkmcache, pcp);
919 	}
920 	if (npages) {
921 		ASSERT(!IS_PFLAGS_WIRED(flags));
922 		mutex_enter(&seg_pmem_mtx);
923 		ASSERT(seg_plocked >= npages);
924 		ASSERT(seg_plocked_window >= npages);
925 		seg_plocked -= npages;
926 		seg_plocked_window -= npages;
927 		mutex_exit(&seg_pmem_mtx);
928 	}
929 
930 	return (SEGP_SUCCESS);
931 }
932 
933 /*
934  * purge entries from the pagelock cache if not active
935  * and not recently used.
936  */
937 static void
938 seg_ppurge_async(int force)
939 {
940 	struct seg_pcache *delcallb_list = NULL;
941 	struct seg_pcache *pcp;
942 	struct seg_phash *hp;
943 	pgcnt_t npages = 0;
944 	pgcnt_t npages_window = 0;
945 	pgcnt_t	npgs_to_purge;
946 	pgcnt_t npgs_purged = 0;
947 	int hlinks = 0;
948 	int hlix;
949 	pcache_link_t *hlinkp;
950 	pcache_link_t *hlnextp = NULL;
951 	int lowmem;
952 	int trim;
953 
954 	ASSERT(seg_phashsize_win != 0);
955 
956 	/*
957 	 * if the cache is off or empty, return
958 	 */
959 	if (seg_plocked == 0 || (!force && seg_plocked_window == 0)) {
960 		return;
961 	}
962 
963 	if (!force) {
964 		lowmem = 0;
965 		trim = 0;
966 		if (freemem < lotsfree + needfree) {
967 			spgcnt_t fmem = MAX((spgcnt_t)(freemem - needfree), 0);
968 			if (fmem <= 5 * (desfree >> 2)) {
969 				lowmem = 1;
970 			} else if (fmem <= 7 * (lotsfree >> 3)) {
971 				if (seg_plocked_window >=
972 				    (availrmem_initial >> 1)) {
973 					lowmem = 1;
974 				}
975 			} else if (fmem < lotsfree) {
976 				if (seg_plocked_window >=
977 				    3 * (availrmem_initial >> 2)) {
978 					lowmem = 1;
979 				}
980 			}
981 		}
982 		if (seg_plocked_window >= 7 * (seg_pmaxwindow >> 3)) {
983 			trim = 1;
984 		}
985 		if (!lowmem && !trim) {
986 			return;
987 		}
988 		npgs_to_purge = seg_plocked_window >>
989 		    seg_pshrink_shift;
990 		if (lowmem) {
991 			npgs_to_purge = MIN(npgs_to_purge,
992 			    MAX(seg_pmaxapurge_npages, desfree));
993 		} else {
994 			npgs_to_purge = MIN(npgs_to_purge,
995 			    seg_pmaxapurge_npages);
996 		}
997 		if (npgs_to_purge == 0) {
998 			return;
999 		}
1000 	} else {
1001 		struct seg_phash_wired *hpw;
1002 
1003 		ASSERT(seg_phashsize_wired != 0);
1004 
1005 		for (hpw = seg_phashtab_wired;
1006 		    hpw < &seg_phashtab_wired[seg_phashsize_wired]; hpw++) {
1007 
1008 			if (hpw->p_hnext == (struct seg_pcache *)hpw) {
1009 				continue;
1010 			}
1011 
1012 			mutex_enter(&hpw->p_hmutex);
1013 
1014 			for (pcp = hpw->p_hnext;
1015 			    pcp != (struct seg_pcache *)hpw;
1016 			    pcp = pcp->p_hnext) {
1017 
1018 				ASSERT(IS_PCP_WIRED(pcp));
1019 				ASSERT(pcp->p_hashp ==
1020 				    (struct seg_phash *)hpw);
1021 
1022 				if (pcp->p_active) {
1023 					continue;
1024 				}
1025 				pcp->p_hprev->p_hnext = pcp->p_hnext;
1026 				pcp->p_hnext->p_hprev = pcp->p_hprev;
1027 				pcp->p_hprev = delcallb_list;
1028 				delcallb_list = pcp;
1029 			}
1030 			mutex_exit(&hpw->p_hmutex);
1031 		}
1032 	}
1033 
1034 	mutex_enter(&seg_pmem_mtx);
1035 	if (seg_pathr_on) {
1036 		mutex_exit(&seg_pmem_mtx);
1037 		goto runcb;
1038 	}
1039 	seg_pathr_on = 1;
1040 	mutex_exit(&seg_pmem_mtx);
1041 	ASSERT(seg_pahcur <= 1);
1042 	hlix = !seg_pahcur;
1043 
1044 again:
1045 	for (hlinkp = seg_pahhead[hlix].p_lnext; hlinkp != &seg_pahhead[hlix];
1046 	    hlinkp = hlnextp) {
1047 
1048 		hlnextp = hlinkp->p_lnext;
1049 		ASSERT(hlnextp != NULL);
1050 
1051 		hp = hlink2phash(hlinkp, hlix);
1052 		if (hp->p_hnext == (struct seg_pcache *)hp) {
1053 			seg_pathr_empty_ahb++;
1054 			continue;
1055 		}
1056 		seg_pathr_full_ahb++;
1057 		mutex_enter(&hp->p_hmutex);
1058 
1059 		for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
1060 		    pcp = pcp->p_hnext) {
1061 			pcache_link_t *pheadp;
1062 			pcache_link_t *plinkp;
1063 			void *htag0;
1064 			kmutex_t *pmtx;
1065 
1066 			ASSERT(!IS_PCP_WIRED(pcp));
1067 			ASSERT(pcp->p_hashp == hp);
1068 
1069 			if (pcp->p_active) {
1070 				continue;
1071 			}
1072 			if (!force && pcp->p_ref &&
1073 			    PCP_AGE(pcp) < seg_pmax_pcpage) {
1074 				pcp->p_ref = 0;
1075 				continue;
1076 			}
1077 			plinkp = &pcp->p_plink;
1078 			htag0 = pcp->p_htag0;
1079 			if (pcp->p_flags & SEGP_AMP) {
1080 				pheadp = &((amp_t *)htag0)->a_phead;
1081 				pmtx = &((amp_t *)htag0)->a_pmtx;
1082 			} else {
1083 				pheadp = &((seg_t *)htag0)->s_phead;
1084 				pmtx = &((seg_t *)htag0)->s_pmtx;
1085 			}
1086 			if (!mutex_tryenter(pmtx)) {
1087 				continue;
1088 			}
1089 			ASSERT(pheadp->p_lnext != pheadp);
1090 			ASSERT(pheadp->p_lprev != pheadp);
1091 			plinkp->p_lprev->p_lnext =
1092 			    plinkp->p_lnext;
1093 			plinkp->p_lnext->p_lprev =
1094 			    plinkp->p_lprev;
1095 			pcp->p_hprev->p_hnext = pcp->p_hnext;
1096 			pcp->p_hnext->p_hprev = pcp->p_hprev;
1097 			mutex_exit(pmtx);
1098 			pcp->p_hprev = delcallb_list;
1099 			delcallb_list = pcp;
1100 			npgs_purged += btop(pcp->p_len);
1101 		}
1102 		if (hp->p_hnext == (struct seg_pcache *)hp) {
1103 			seg_premove_abuck(hp, 1);
1104 		}
1105 		mutex_exit(&hp->p_hmutex);
1106 		if (npgs_purged >= seg_plocked_window) {
1107 			break;
1108 		}
1109 		if (!force) {
1110 			if (npgs_purged >= npgs_to_purge) {
1111 				break;
1112 			}
1113 			if (!trim && !(seg_pathr_full_ahb & 15)) {
1114 				ASSERT(lowmem);
1115 				if (freemem >= lotsfree + needfree) {
1116 					break;
1117 				}
1118 			}
1119 		}
1120 	}
1121 
1122 	if (hlinkp == &seg_pahhead[hlix]) {
1123 		/*
1124 		 * We processed the entire hlix active bucket list
1125 		 * but didn't find enough pages to reclaim.
1126 		 * Switch the lists and walk the other list
1127 		 * if we haven't done it yet.
1128 		 */
1129 		mutex_enter(&seg_pmem_mtx);
1130 		ASSERT(seg_pathr_on);
1131 		ASSERT(seg_pahcur == !hlix);
1132 		seg_pahcur = hlix;
1133 		mutex_exit(&seg_pmem_mtx);
1134 		if (++hlinks < 2) {
1135 			hlix = !hlix;
1136 			goto again;
1137 		}
1138 	} else if ((hlinkp = hlnextp) != &seg_pahhead[hlix] &&
1139 	    seg_pahhead[hlix].p_lnext != hlinkp) {
1140 		ASSERT(hlinkp != NULL);
1141 		ASSERT(hlinkp->p_lprev != &seg_pahhead[hlix]);
1142 		ASSERT(seg_pahhead[hlix].p_lnext != &seg_pahhead[hlix]);
1143 		ASSERT(seg_pahhead[hlix].p_lprev != &seg_pahhead[hlix]);
1144 
1145 		/*
1146 		 * Reinsert the header to point to hlinkp
1147 		 * so that we start from hlinkp bucket next time around.
1148 		 */
1149 		seg_pahhead[hlix].p_lnext->p_lprev = seg_pahhead[hlix].p_lprev;
1150 		seg_pahhead[hlix].p_lprev->p_lnext = seg_pahhead[hlix].p_lnext;
1151 		seg_pahhead[hlix].p_lnext = hlinkp;
1152 		seg_pahhead[hlix].p_lprev = hlinkp->p_lprev;
1153 		hlinkp->p_lprev->p_lnext = &seg_pahhead[hlix];
1154 		hlinkp->p_lprev = &seg_pahhead[hlix];
1155 	}
1156 
1157 	mutex_enter(&seg_pmem_mtx);
1158 	ASSERT(seg_pathr_on);
1159 	seg_pathr_on = 0;
1160 	mutex_exit(&seg_pmem_mtx);
1161 
1162 runcb:
1163 	/*
1164 	 * Run the delayed callback list. segments/amps can't go away until
1165 	 * callback is executed since they must have non 0 softlockcnt. That's
1166 	 * why we don't need to hold as/seg/amp locks to execute the callback.
1167 	 */
1168 	while (delcallb_list != NULL) {
1169 		pcp = delcallb_list;
1170 		delcallb_list = pcp->p_hprev;
1171 		ASSERT(!pcp->p_active);
1172 		(void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
1173 		    pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 1);
1174 		npages += btop(pcp->p_len);
1175 		if (!IS_PCP_WIRED(pcp)) {
1176 			npages_window += btop(pcp->p_len);
1177 		}
1178 		kmem_cache_free(seg_pkmcache, pcp);
1179 	}
1180 	if (npages) {
1181 		mutex_enter(&seg_pmem_mtx);
1182 		ASSERT(seg_plocked >= npages);
1183 		ASSERT(seg_plocked_window >= npages_window);
1184 		seg_plocked -= npages;
1185 		seg_plocked_window -= npages_window;
1186 		mutex_exit(&seg_pmem_mtx);
1187 	}
1188 }
1189 
1190 /*
1191  * Remove cached pages for segment(s) entries from hashtable.  The segments
1192  * are identified by pp array. This is useful for multiple seg's cached on
1193  * behalf of dummy segment (ISM/DISM) with common pp array.
1194  */
1195 void
1196 seg_ppurge_wiredpp(struct page **pp)
1197 {
1198 	struct seg_pcache *pcp;
1199 	struct seg_phash_wired *hp;
1200 	pgcnt_t npages = 0;
1201 	struct	seg_pcache *delcallb_list = NULL;
1202 
1203 	/*
1204 	 * if the cache is empty, return
1205 	 */
1206 	if (seg_plocked == 0) {
1207 		return;
1208 	}
1209 	ASSERT(seg_phashsize_wired != 0);
1210 
1211 	for (hp = seg_phashtab_wired;
1212 	    hp < &seg_phashtab_wired[seg_phashsize_wired]; hp++) {
1213 		if (hp->p_hnext == (struct seg_pcache *)hp) {
1214 			continue;
1215 		}
1216 		mutex_enter(&hp->p_hmutex);
1217 		pcp = hp->p_hnext;
1218 		while (pcp != (struct seg_pcache *)hp) {
1219 			ASSERT(pcp->p_hashp == (struct seg_phash *)hp);
1220 			ASSERT(IS_PCP_WIRED(pcp));
1221 			/*
1222 			 * purge entries which are not active
1223 			 */
1224 			if (!pcp->p_active && pcp->p_pp == pp) {
1225 				ASSERT(pcp->p_htag0 != NULL);
1226 				pcp->p_hprev->p_hnext = pcp->p_hnext;
1227 				pcp->p_hnext->p_hprev = pcp->p_hprev;
1228 				pcp->p_hprev = delcallb_list;
1229 				delcallb_list = pcp;
1230 			}
1231 			pcp = pcp->p_hnext;
1232 		}
1233 		mutex_exit(&hp->p_hmutex);
1234 		/*
1235 		 * segments can't go away until callback is executed since
1236 		 * they must have non 0 softlockcnt. That's why we don't
1237 		 * need to hold as/seg locks to execute the callback.
1238 		 */
1239 		while (delcallb_list != NULL) {
1240 			int done;
1241 			pcp = delcallb_list;
1242 			delcallb_list = pcp->p_hprev;
1243 			ASSERT(!pcp->p_active);
1244 			done = (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
1245 			    pcp->p_len, pcp->p_pp,
1246 			    pcp->p_write ? S_WRITE : S_READ, 1);
1247 			npages += btop(pcp->p_len);
1248 			ASSERT(IS_PCP_WIRED(pcp));
1249 			kmem_cache_free(seg_pkmcache, pcp);
1250 			if (done) {
1251 				ASSERT(delcallb_list == NULL);
1252 				goto out;
1253 			}
1254 		}
1255 	}
1256 
1257 out:
1258 	mutex_enter(&seg_pmem_mtx);
1259 	ASSERT(seg_plocked >= npages);
1260 	seg_plocked -= npages;
1261 	mutex_exit(&seg_pmem_mtx);
1262 }
1263 
1264 /*
1265  * purge all entries for a given segment. Since we
1266  * callback into the segment driver directly for page
1267  * reclaim the caller needs to hold the right locks.
1268  */
1269 void
1270 seg_ppurge(struct seg *seg, struct anon_map *amp, uint_t flags)
1271 {
1272 	struct seg_pcache *delcallb_list = NULL;
1273 	struct seg_pcache *pcp;
1274 	struct seg_phash *hp;
1275 	pgcnt_t npages = 0;
1276 	void *htag0;
1277 
1278 	if (seg_plocked == 0) {
1279 		return;
1280 	}
1281 	ASSERT(seg_phashsize_win != 0);
1282 
1283 	/*
1284 	 * If amp is not NULL use amp as a lookup tag otherwise use seg
1285 	 * as a lookup tag.
1286 	 */
1287 	htag0 = (amp == NULL ? (void *)seg : (void *)amp);
1288 	ASSERT(htag0 != NULL);
1289 	if (IS_PFLAGS_WIRED(flags)) {
1290 		hp = P_HASHBP(seg, htag0, 0, flags);
1291 		mutex_enter(&hp->p_hmutex);
1292 		pcp = hp->p_hnext;
1293 		while (pcp != (struct seg_pcache *)hp) {
1294 			ASSERT(pcp->p_hashp == hp);
1295 			ASSERT(IS_PCP_WIRED(pcp));
1296 			if (pcp->p_htag0 == htag0) {
1297 				if (pcp->p_active) {
1298 					break;
1299 				}
1300 				pcp->p_hprev->p_hnext = pcp->p_hnext;
1301 				pcp->p_hnext->p_hprev = pcp->p_hprev;
1302 				pcp->p_hprev = delcallb_list;
1303 				delcallb_list = pcp;
1304 			}
1305 			pcp = pcp->p_hnext;
1306 		}
1307 		mutex_exit(&hp->p_hmutex);
1308 	} else {
1309 		pcache_link_t *plinkp;
1310 		pcache_link_t *pheadp;
1311 		kmutex_t *pmtx;
1312 
1313 		if (amp == NULL) {
1314 			ASSERT(seg != NULL);
1315 			pheadp = &seg->s_phead;
1316 			pmtx = &seg->s_pmtx;
1317 		} else {
1318 			pheadp = &amp->a_phead;
1319 			pmtx = &amp->a_pmtx;
1320 		}
1321 		mutex_enter(pmtx);
1322 		while ((plinkp = pheadp->p_lnext) != pheadp) {
1323 			pcp = plink2pcache(plinkp);
1324 			ASSERT(!IS_PCP_WIRED(pcp));
1325 			ASSERT(pcp->p_htag0 == htag0);
1326 			hp = pcp->p_hashp;
1327 			mutex_enter(&hp->p_hmutex);
1328 			if (pcp->p_active) {
1329 				mutex_exit(&hp->p_hmutex);
1330 				break;
1331 			}
1332 			ASSERT(plinkp->p_lprev == pheadp);
1333 			pheadp->p_lnext = plinkp->p_lnext;
1334 			plinkp->p_lnext->p_lprev = pheadp;
1335 			pcp->p_hprev->p_hnext = pcp->p_hnext;
1336 			pcp->p_hnext->p_hprev = pcp->p_hprev;
1337 			pcp->p_hprev = delcallb_list;
1338 			delcallb_list = pcp;
1339 			if (hp->p_hnext == (struct seg_pcache *)hp) {
1340 				seg_premove_abuck(hp, 0);
1341 			}
1342 			mutex_exit(&hp->p_hmutex);
1343 		}
1344 		mutex_exit(pmtx);
1345 	}
1346 	while (delcallb_list != NULL) {
1347 		pcp = delcallb_list;
1348 		delcallb_list = pcp->p_hprev;
1349 		ASSERT(!pcp->p_active);
1350 		(void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, pcp->p_len,
1351 		    pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
1352 		npages += btop(pcp->p_len);
1353 		kmem_cache_free(seg_pkmcache, pcp);
1354 	}
1355 	mutex_enter(&seg_pmem_mtx);
1356 	ASSERT(seg_plocked >= npages);
1357 	seg_plocked -= npages;
1358 	if (!IS_PFLAGS_WIRED(flags)) {
1359 		ASSERT(seg_plocked_window >= npages);
1360 		seg_plocked_window -= npages;
1361 	}
1362 	mutex_exit(&seg_pmem_mtx);
1363 }
1364 
1365 static void seg_pinit_mem_config(void);
1366 
1367 /*
1368  * setup the pagelock cache
1369  */
1370 static void
1371 seg_pinit(void)
1372 {
1373 	struct seg_phash *hp;
1374 	ulong_t i;
1375 	pgcnt_t physmegs;
1376 
1377 	seg_plocked = 0;
1378 	seg_plocked_window = 0;
1379 
1380 	if (segpcache_enabled == 0) {
1381 		seg_phashsize_win = 0;
1382 		seg_phashsize_wired = 0;
1383 		seg_pdisabled = 1;
1384 		return;
1385 	}
1386 
1387 	seg_pdisabled = 0;
1388 	seg_pkmcache = kmem_cache_create("seg_pcache",
1389 	    sizeof (struct seg_pcache), 0, NULL, NULL, NULL, NULL, NULL, 0);
1390 	if (segpcache_pcp_maxage_ticks <= 0) {
1391 		segpcache_pcp_maxage_ticks = segpcache_pcp_maxage_sec * hz;
1392 	}
1393 	seg_pmax_pcpage = segpcache_pcp_maxage_ticks;
1394 	seg_pathr_empty_ahb = 0;
1395 	seg_pathr_full_ahb = 0;
1396 	seg_pshrink_shift = segpcache_shrink_shift;
1397 	seg_pmaxapurge_npages = btop(segpcache_maxapurge_bytes);
1398 
1399 	mutex_init(&seg_pcache_mtx, NULL, MUTEX_DEFAULT, NULL);
1400 	mutex_init(&seg_pmem_mtx, NULL, MUTEX_DEFAULT, NULL);
1401 	mutex_init(&seg_pasync_mtx, NULL, MUTEX_DEFAULT, NULL);
1402 	cv_init(&seg_pasync_cv, NULL, CV_DEFAULT, NULL);
1403 
1404 	physmegs = physmem >> (20 - PAGESHIFT);
1405 
1406 	/*
1407 	 * If segpcache_hashsize_win was not set in /etc/system or it has
1408 	 * absurd value set it to a default.
1409 	 */
1410 	if (segpcache_hashsize_win == 0 || segpcache_hashsize_win > physmem) {
1411 		/*
1412 		 * Create one bucket per 32K (or at least per 8 pages) of
1413 		 * available memory.
1414 		 */
1415 		pgcnt_t pages_per_bucket = MAX(btop(32 * 1024), 8);
1416 		segpcache_hashsize_win = MAX(1024, physmem / pages_per_bucket);
1417 	}
1418 	if (!ISP2(segpcache_hashsize_win)) {
1419 		ulong_t rndfac = ~(1UL <<
1420 		    (highbit(segpcache_hashsize_win) - 1));
1421 		rndfac &= segpcache_hashsize_win;
1422 		segpcache_hashsize_win += rndfac;
1423 		segpcache_hashsize_win = 1 <<
1424 		    (highbit(segpcache_hashsize_win) - 1);
1425 	}
1426 	seg_phashsize_win = segpcache_hashsize_win;
1427 	seg_phashtab_win = kmem_zalloc(
1428 	    seg_phashsize_win * sizeof (struct seg_phash),
1429 	    KM_SLEEP);
1430 	for (i = 0; i < seg_phashsize_win; i++) {
1431 		hp = &seg_phashtab_win[i];
1432 		hp->p_hnext = (struct seg_pcache *)hp;
1433 		hp->p_hprev = (struct seg_pcache *)hp;
1434 		mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
1435 	}
1436 
1437 	seg_pahcur = 0;
1438 	seg_pathr_on = 0;
1439 	seg_pahhead[0].p_lnext = &seg_pahhead[0];
1440 	seg_pahhead[0].p_lprev = &seg_pahhead[0];
1441 	seg_pahhead[1].p_lnext = &seg_pahhead[1];
1442 	seg_pahhead[1].p_lprev = &seg_pahhead[1];
1443 
1444 	/*
1445 	 * If segpcache_hashsize_wired was not set in /etc/system or it has
1446 	 * absurd value set it to a default.
1447 	 */
1448 	if (segpcache_hashsize_wired == 0 ||
1449 	    segpcache_hashsize_wired > physmem / 4) {
1450 		/*
1451 		 * Choose segpcache_hashsize_wired based on physmem.
1452 		 * Create a bucket per 128K bytes upto 256K buckets.
1453 		 */
1454 		if (physmegs < 20 * 1024) {
1455 			segpcache_hashsize_wired = MAX(1024, physmegs << 3);
1456 		} else {
1457 			segpcache_hashsize_wired = 256 * 1024;
1458 		}
1459 	}
1460 	if (!ISP2(segpcache_hashsize_wired)) {
1461 		segpcache_hashsize_wired = 1 <<
1462 		    highbit(segpcache_hashsize_wired);
1463 	}
1464 	seg_phashsize_wired = segpcache_hashsize_wired;
1465 	seg_phashtab_wired = kmem_zalloc(
1466 	    seg_phashsize_wired * sizeof (struct seg_phash_wired), KM_SLEEP);
1467 	for (i = 0; i < seg_phashsize_wired; i++) {
1468 		hp = (struct seg_phash *)&seg_phashtab_wired[i];
1469 		hp->p_hnext = (struct seg_pcache *)hp;
1470 		hp->p_hprev = (struct seg_pcache *)hp;
1471 		mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
1472 	}
1473 
1474 	if (segpcache_maxwindow == 0) {
1475 		if (physmegs < 64) {
1476 			/* 3% of memory */
1477 			segpcache_maxwindow = availrmem >> 5;
1478 		} else if (physmegs < 512) {
1479 			/* 12% of memory */
1480 			segpcache_maxwindow = availrmem >> 3;
1481 		} else if (physmegs < 1024) {
1482 			/* 25% of memory */
1483 			segpcache_maxwindow = availrmem >> 2;
1484 		} else if (physmegs < 2048) {
1485 			/* 50% of memory */
1486 			segpcache_maxwindow = availrmem >> 1;
1487 		} else {
1488 			/* no limit */
1489 			segpcache_maxwindow = (pgcnt_t)-1;
1490 		}
1491 	}
1492 	seg_pmaxwindow = segpcache_maxwindow;
1493 	seg_pinit_mem_config();
1494 }
1495 
1496 /*
1497  * called by pageout if memory is low
1498  */
1499 void
1500 seg_preap(void)
1501 {
1502 	/*
1503 	 * if the cache is off or empty, return
1504 	 */
1505 	if (seg_plocked_window == 0) {
1506 		return;
1507 	}
1508 	ASSERT(seg_phashsize_win != 0);
1509 
1510 	/*
1511 	 * If somebody is already purging pcache
1512 	 * just return.
1513 	 */
1514 	if (seg_pdisabled) {
1515 		return;
1516 	}
1517 
1518 	cv_signal(&seg_pasync_cv);
1519 }
1520 
1521 /*
1522  * run as a backgroud thread and reclaim pagelock
1523  * pages which have not been used recently
1524  */
1525 void
1526 seg_pasync_thread(void)
1527 {
1528 	callb_cpr_t cpr_info;
1529 
1530 	if (seg_phashsize_win == 0) {
1531 		thread_exit();
1532 		/*NOTREACHED*/
1533 	}
1534 
1535 	seg_pasync_thr = curthread;
1536 
1537 	CALLB_CPR_INIT(&cpr_info, &seg_pasync_mtx,
1538 	    callb_generic_cpr, "seg_pasync");
1539 
1540 	if (segpcache_reap_ticks <= 0) {
1541 		segpcache_reap_ticks = segpcache_reap_sec * hz;
1542 	}
1543 
1544 	mutex_enter(&seg_pasync_mtx);
1545 	for (;;) {
1546 		CALLB_CPR_SAFE_BEGIN(&cpr_info);
1547 		(void) cv_timedwait(&seg_pasync_cv, &seg_pasync_mtx,
1548 		    lbolt + segpcache_reap_ticks);
1549 		CALLB_CPR_SAFE_END(&cpr_info, &seg_pasync_mtx);
1550 		if (seg_pdisabled == 0) {
1551 			seg_ppurge_async(0);
1552 		}
1553 	}
1554 }
1555 
1556 static struct kmem_cache *seg_cache;
1557 
1558 /*
1559  * Initialize segment management data structures.
1560  */
1561 void
1562 seg_init(void)
1563 {
1564 	kstat_t *ksp;
1565 
1566 	seg_cache = kmem_cache_create("seg_cache", sizeof (struct seg),
1567 	    0, NULL, NULL, NULL, NULL, NULL, 0);
1568 
1569 	ksp = kstat_create("unix", 0, "segadvstat", "vm", KSTAT_TYPE_NAMED,
1570 	    segadvstat_ndata, KSTAT_FLAG_VIRTUAL);
1571 	if (ksp) {
1572 		ksp->ks_data = (void *)segadvstat_ptr;
1573 		kstat_install(ksp);
1574 	}
1575 
1576 	seg_pinit();
1577 }
1578 
1579 /*
1580  * Allocate a segment to cover [base, base+size]
1581  * and attach it to the specified address space.
1582  */
1583 struct seg *
1584 seg_alloc(struct as *as, caddr_t base, size_t size)
1585 {
1586 	struct seg *new;
1587 	caddr_t segbase;
1588 	size_t segsize;
1589 
1590 	segbase = (caddr_t)((uintptr_t)base & (uintptr_t)PAGEMASK);
1591 	segsize = (((uintptr_t)(base + size) + PAGEOFFSET) & PAGEMASK) -
1592 	    (uintptr_t)segbase;
1593 
1594 	if (!valid_va_range(&segbase, &segsize, segsize, AH_LO))
1595 		return ((struct seg *)NULL);	/* bad virtual addr range */
1596 
1597 	if (as != &kas &&
1598 	    valid_usr_range(segbase, segsize, 0, as,
1599 	    as->a_userlimit) != RANGE_OKAY)
1600 		return ((struct seg *)NULL);	/* bad virtual addr range */
1601 
1602 	new = kmem_cache_alloc(seg_cache, KM_SLEEP);
1603 	new->s_ops = NULL;
1604 	new->s_data = NULL;
1605 	new->s_szc = 0;
1606 	new->s_flags = 0;
1607 	mutex_init(&new->s_pmtx, NULL, MUTEX_DEFAULT, NULL);
1608 	new->s_phead.p_lnext = &new->s_phead;
1609 	new->s_phead.p_lprev = &new->s_phead;
1610 	if (seg_attach(as, segbase, segsize, new) < 0) {
1611 		kmem_cache_free(seg_cache, new);
1612 		return ((struct seg *)NULL);
1613 	}
1614 	/* caller must fill in ops, data */
1615 	return (new);
1616 }
1617 
1618 /*
1619  * Attach a segment to the address space.  Used by seg_alloc()
1620  * and for kernel startup to attach to static segments.
1621  */
1622 int
1623 seg_attach(struct as *as, caddr_t base, size_t size, struct seg *seg)
1624 {
1625 	seg->s_as = as;
1626 	seg->s_base = base;
1627 	seg->s_size = size;
1628 
1629 	/*
1630 	 * as_addseg() will add the segment at the appropraite point
1631 	 * in the list. It will return -1 if there is overlap with
1632 	 * an already existing segment.
1633 	 */
1634 	return (as_addseg(as, seg));
1635 }
1636 
1637 /*
1638  * Unmap a segment and free it from its associated address space.
1639  * This should be called by anybody who's finished with a whole segment's
1640  * mapping.  Just calls SEGOP_UNMAP() on the whole mapping .  It is the
1641  * responsibility of the segment driver to unlink the the segment
1642  * from the address space, and to free public and private data structures
1643  * associated with the segment.  (This is typically done by a call to
1644  * seg_free()).
1645  */
1646 void
1647 seg_unmap(struct seg *seg)
1648 {
1649 #ifdef DEBUG
1650 	int ret;
1651 #endif /* DEBUG */
1652 
1653 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
1654 
1655 	/* Shouldn't have called seg_unmap if mapping isn't yet established */
1656 	ASSERT(seg->s_data != NULL);
1657 
1658 	/* Unmap the whole mapping */
1659 #ifdef DEBUG
1660 	ret = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
1661 	ASSERT(ret == 0);
1662 #else
1663 	SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
1664 #endif /* DEBUG */
1665 }
1666 
1667 /*
1668  * Free the segment from its associated as. This should only be called
1669  * if a mapping to the segment has not yet been established (e.g., if
1670  * an error occurs in the middle of doing an as_map when the segment
1671  * has already been partially set up) or if it has already been deleted
1672  * (e.g., from a segment driver unmap routine if the unmap applies to the
1673  * entire segment). If the mapping is currently set up then seg_unmap() should
1674  * be called instead.
1675  */
1676 void
1677 seg_free(struct seg *seg)
1678 {
1679 	register struct as *as = seg->s_as;
1680 	struct seg *tseg = as_removeseg(as, seg);
1681 
1682 	ASSERT(tseg == seg);
1683 
1684 	/*
1685 	 * If the segment private data field is NULL,
1686 	 * then segment driver is not attached yet.
1687 	 */
1688 	if (seg->s_data != NULL)
1689 		SEGOP_FREE(seg);
1690 
1691 	mutex_destroy(&seg->s_pmtx);
1692 	ASSERT(seg->s_phead.p_lnext == &seg->s_phead);
1693 	ASSERT(seg->s_phead.p_lprev == &seg->s_phead);
1694 	kmem_cache_free(seg_cache, seg);
1695 }
1696 
1697 /*ARGSUSED*/
1698 static void
1699 seg_p_mem_config_post_add(
1700 	void *arg,
1701 	pgcnt_t delta_pages)
1702 {
1703 	/* Nothing to do. */
1704 }
1705 
1706 void
1707 seg_p_enable(void)
1708 {
1709 	mutex_enter(&seg_pcache_mtx);
1710 	ASSERT(seg_pdisabled != 0);
1711 	seg_pdisabled--;
1712 	mutex_exit(&seg_pcache_mtx);
1713 }
1714 
1715 /*
1716  * seg_p_disable - disables seg_pcache, and then attempts to empty the
1717  * cache.
1718  * Returns SEGP_SUCCESS if the cache was successfully emptied, or
1719  * SEGP_FAIL if the cache could not be emptied.
1720  */
1721 int
1722 seg_p_disable(void)
1723 {
1724 	pgcnt_t	old_plocked;
1725 	int stall_count = 0;
1726 
1727 	mutex_enter(&seg_pcache_mtx);
1728 	seg_pdisabled++;
1729 	ASSERT(seg_pdisabled != 0);
1730 	mutex_exit(&seg_pcache_mtx);
1731 
1732 	/*
1733 	 * Attempt to empty the cache. Terminate if seg_plocked does not
1734 	 * diminish with SEGP_STALL_THRESHOLD consecutive attempts.
1735 	 */
1736 	while (seg_plocked != 0) {
1737 		ASSERT(seg_phashsize_win != 0);
1738 		old_plocked = seg_plocked;
1739 		seg_ppurge_async(1);
1740 		if (seg_plocked == old_plocked) {
1741 			if (stall_count++ > SEGP_STALL_THRESHOLD) {
1742 				return (SEGP_FAIL);
1743 			}
1744 		} else
1745 			stall_count = 0;
1746 		if (seg_plocked != 0)
1747 			delay(hz/SEGP_PREDEL_DELAY_FACTOR);
1748 	}
1749 	return (SEGP_SUCCESS);
1750 }
1751 
1752 /*
1753  * Attempt to purge seg_pcache.  May need to return before this has
1754  * completed to allow other pre_del callbacks to unlock pages. This is
1755  * ok because:
1756  *	1) The seg_pdisabled flag has been set so at least we won't
1757  *	cache anymore locks and the locks we couldn't purge
1758  *	will not be held if they do get released by a subsequent
1759  *	pre-delete callback.
1760  *
1761  *	2) The rest of the memory delete thread processing does not
1762  *	depend on the changes made in this pre-delete callback. No
1763  *	panics will result, the worst that will happen is that the
1764  *	DR code will timeout and cancel the delete.
1765  */
1766 /*ARGSUSED*/
1767 static int
1768 seg_p_mem_config_pre_del(
1769 	void *arg,
1770 	pgcnt_t delta_pages)
1771 {
1772 	if (seg_phashsize_win == 0) {
1773 		return (0);
1774 	}
1775 	if (seg_p_disable() != SEGP_SUCCESS)
1776 		cmn_err(CE_NOTE,
1777 		    "!Pre-delete couldn't purge"" pagelock cache - continuing");
1778 	return (0);
1779 }
1780 
1781 /*ARGSUSED*/
1782 static void
1783 seg_p_mem_config_post_del(
1784 	void *arg,
1785 	pgcnt_t delta_pages,
1786 	int cancelled)
1787 {
1788 	if (seg_phashsize_win == 0) {
1789 		return;
1790 	}
1791 	seg_p_enable();
1792 }
1793 
/*
 * Memory configuration callback vector for the pagelock cache; it is
 * passed to kphysm_setup_func_register() by seg_pinit_mem_config().
 */
static kphysm_setup_vector_t seg_p_mem_config_vec = {
	KPHYSM_SETUP_VECTOR_VERSION,
	seg_p_mem_config_post_add,
	seg_p_mem_config_pre_del,
	seg_p_mem_config_post_del,
};
1800 
1801 static void
1802 seg_pinit_mem_config(void)
1803 {
1804 	int ret;
1805 
1806 	ret = kphysm_setup_func_register(&seg_p_mem_config_vec, (void *)NULL);
1807 	/*
1808 	 * Want to catch this in the debug kernel. At run time, if the
1809 	 * callbacks don't get run all will be OK as the disable just makes
1810 	 * it more likely that the pages can be collected.
1811 	 */
1812 	ASSERT(ret == 0);
1813 }
1814 
1815 /*
1816  * Verify that segment is not a shared anonymous segment which reserves
1817  * swap.  zone.max-swap accounting (zone->zone_max_swap) cannot be transfered
1818  * from one zone to another if any segments are shared.  This is because the
1819  * last process to exit will credit the swap reservation.  This could lead
1820  * to the swap being reserved by one zone, and credited to another.
1821  */
1822 boolean_t
1823 seg_can_change_zones(struct seg *seg)
1824 {
1825 	struct segvn_data *svd;
1826 
1827 	if (seg->s_ops == &segspt_shmops)
1828 		return (B_FALSE);
1829 
1830 	if (seg->s_ops == &segvn_ops) {
1831 		svd = (struct segvn_data *)seg->s_data;
1832 		if (svd->type == MAP_SHARED &&
1833 		    svd->amp != NULL &&
1834 		    svd->amp->swresv > 0)
1835 		return (B_FALSE);
1836 	}
1837 	return (B_TRUE);
1838 }
1839 
1840 /*
1841  * Return swap reserved by a segment backing a private mapping.
1842  */
1843 size_t
1844 seg_swresv(struct seg *seg)
1845 {
1846 	struct segvn_data *svd;
1847 	size_t swap = 0;
1848 
1849 	if (seg->s_ops == &segvn_ops) {
1850 		svd = (struct segvn_data *)seg->s_data;
1851 		if (svd->type == MAP_PRIVATE && svd->swresv > 0)
1852 			swap = svd->swresv;
1853 	}
1854 	return (swap);
1855 }
1856