xref: /titanic_52/usr/src/uts/common/vm/vm_seg.c (revision 55f5292c612446ce6f93ddd248c0019b5974618b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
27 /*	  All Rights Reserved  	*/
28 
29 /*
30  * University Copyright- Copyright (c) 1982, 1986, 1988
31  * The Regents of the University of California
32  * All Rights Reserved
33  *
34  * University Acknowledgment- Portions of this document are derived from
35  * software developed by the University of California, Berkeley, and its
36  * contributors.
37  */
38 
39 /*
40  * VM - segment management.
41  */
42 
43 #include <sys/types.h>
44 #include <sys/inttypes.h>
45 #include <sys/t_lock.h>
46 #include <sys/param.h>
47 #include <sys/systm.h>
48 #include <sys/kmem.h>
49 #include <sys/sysmacros.h>
50 #include <sys/vmsystm.h>
51 #include <sys/tuneable.h>
52 #include <sys/debug.h>
53 #include <sys/fs/swapnode.h>
54 #include <sys/cmn_err.h>
55 #include <sys/callb.h>
56 #include <sys/mem_config.h>
57 #include <sys/mman.h>
58 
59 #include <vm/hat.h>
60 #include <vm/as.h>
61 #include <vm/seg.h>
62 #include <vm/seg_kmem.h>
63 #include <vm/seg_spt.h>
64 #include <vm/seg_vn.h>
65 #include <vm/anon.h>
66 
67 /*
68  * kstats for segment advise
 *
 * segadvstat holds two named counters, "MADV_FREE_hit" and "MADV_FREE_miss",
 * both declared as KSTAT_DATA_ULONG.  segadvstat_ptr views the same storage
 * as a flat array of kstat_named_t and segadvstat_ndata is the number of
 * kstat_named_t elements in it.  The count is derived from the structure
 * size, so it follows any counters added to the initializer.
69  */
70 segadvstat_t segadvstat = {
71 	{ "MADV_FREE_hit",	KSTAT_DATA_ULONG },
72 	{ "MADV_FREE_miss",	KSTAT_DATA_ULONG },
73 };
74 
75 kstat_named_t *segadvstat_ptr = (kstat_named_t *)&segadvstat;
76 uint_t segadvstat_ndata = sizeof (segadvstat) / sizeof (kstat_named_t);
77 
78 /*
79  * entry in the segment page cache
 *
 * p_hnext/p_hprev must remain the first two members, in this order, and
 * must line up with the start of struct seg_phash: bucket headers are cast
 * to struct seg_pcache * and serve as the sentinel of the circular hash
 * chain, so chain updates write through these two fields of the header.
 *
 * Once an entry has been unhooked from its hash chain, p_hprev is reused as
 * the "next" pointer of a singly linked list of entries waiting for their
 * reclaim callback.
80  */
81 struct seg_pcache {
82 	struct seg_pcache	*p_hnext;	/* list for hashed blocks */
83 	struct seg_pcache	*p_hprev;
84 	pcache_link_t		p_plink;	/* per segment/amp list */
85 	void 			*p_htag0;	/* segment/amp pointer */
86 	caddr_t			p_addr;		/* base address/anon_idx */
87 	size_t			p_len;		/* total bytes */
88 	size_t			p_wlen;		/* writable bytes at p_addr */
89 	struct page		**p_pp;		/* pp shadow list */
90 	seg_preclaim_cbfunc_t	p_callback;	/* reclaim callback function */
91 	clock_t			p_lbolt;	/* lbolt from last use */
92 	struct seg_phash	*p_hashp;	/* our pcache hash bucket */
93 	uint_t			p_active;	/* active count */
94 	uchar_t			p_write;	/* true if S_WRITE */
95 	uchar_t			p_ref;		/* reference byte */
96 	ushort_t		p_flags;	/* bit flags */
97 };
98 
/*
 * Hash bucket header for non wired entries.  The header doubles as the
 * sentinel of a circular doubly linked chain of seg_pcache entries: an empty
 * bucket has p_hnext and p_hprev pointing back at the header itself.
 * p_halink[i] links the bucket on active bucket list i (anchored at
 * seg_pahhead[i]); both of its pointers are NULL while the bucket is not on
 * that list.
 */
99 struct seg_phash {
100 	struct seg_pcache	*p_hnext;	/* list for hashed blocks */
101 	struct seg_pcache	*p_hprev;
102 	kmutex_t		p_hmutex;	/* protects hash bucket */
103 	pcache_link_t		p_halink[2];	/* active bucket linkages */
104 };
105 
/*
 * Hash bucket header for wired entries.  Its leading members match
 * struct seg_phash so that P_HASHBP() can hand a wired bucket back as a
 * struct seg_phash *.  There are no active list linkages here, so only the
 * chain pointers and the mutex may be touched through such a pointer.
 */
106 struct seg_phash_wired {
107 	struct seg_pcache	*p_hnext;	/* list for hashed blocks */
108 	struct seg_pcache	*p_hprev;
109 	kmutex_t		p_hmutex;	/* protects hash bucket */
110 };
111 
112 /*
113  * A parameter to control a maximum number of bytes that can be
114  * purged from pcache at a time.
 * This value (1 GB) is the initial setting of the
 * segpcache_maxapurge_bytes tunable.
115  */
116 #define	P_MAX_APURGE_BYTES	(1024 * 1024 * 1024)
117 
118 /*
119  * log2(fraction of pcache to reclaim at a time).
 * This is the initial setting of the segpcache_shrink_shift tunable; the
 * purge code right shifts the cached window page count by the shift value,
 * so 5 selects 1/32 of the window.
120  */
121 #define	P_SHRINK_SHFT		(5)
122 
123 /*
124  * The following variables can be tuned via /etc/system.
 *
 * NOTE(review): the tunables initialized to 0 are presumably given their
 * working values by initialization code outside this view -- confirm.
 * NOTE(review): segpcache_maxapurge_bytes is declared pgcnt_t (a page count
 * type) although its name and initializer describe a byte count.
125  */
126 
127 int	segpcache_enabled = 1;		/* if 1, shadow lists are cached */
128 pgcnt_t	segpcache_maxwindow = 0;	/* max # of pages that can be cached */
129 ulong_t	segpcache_hashsize_win = 0;	/* # of non wired buckets */
130 ulong_t	segpcache_hashsize_wired = 0;	/* # of wired buckets */
131 int	segpcache_reap_sec = 1;		/* reap check rate in secs */
132 clock_t	segpcache_reap_ticks = 0;	/* reap interval in ticks */
133 int	segpcache_pcp_maxage_sec = 1;	/* pcp max age in secs */
134 clock_t	segpcache_pcp_maxage_ticks = 0;	/* pcp max age in ticks */
135 int	segpcache_shrink_shift = P_SHRINK_SHFT;	/* log2 reap fraction */
136 pgcnt_t	segpcache_maxapurge_bytes = P_MAX_APURGE_BYTES;	/* max purge bytes */
137 
138 static kmutex_t seg_pcache_mtx;	/* protects seg_pdisabled counter */
139 static kmutex_t seg_pasync_mtx;	/* protects async thread scheduling */
/* NOTE(review): presumably paired with seg_pasync_mtx -- confirm */
140 static kcondvar_t seg_pasync_cv;
141 
/*
 * Each of the three control structures below is placed on a 64 byte
 * boundary.
 */
142 #pragma align 64(pctrl1)
143 #pragma align 64(pctrl2)
144 #pragma align 64(pctrl3)
145 
146 /*
147  * Keep frequently used variables together in one cache line.
 *
 * pctrl1 holds the hash table geometry and the caching on/off counter.
 * pctrl2 holds the page counters and active bucket list state together with
 * the mutex that protects them.  pctrl3 holds the async purge parameters
 * and statistics.
 *
 * NOTE(review): the pad[] members exist only in _LP64 builds and presumably
 * round the structure up to a full 64 byte line -- confirm against the
 * member sizes before adding or removing fields.
148  */
149 static struct p_ctrl1 {
150 	uint_t p_disabled;		/* if not 0, caching temporarily off */
151 	pgcnt_t p_maxwin;		/* max # of pages that can be cached */
152 	size_t p_hashwin_sz;		/* # of non wired buckets */
153 	struct seg_phash *p_htabwin;	/* hash table for non wired entries */
154 	size_t p_hashwired_sz;		/* # of wired buckets */
155 	struct seg_phash_wired *p_htabwired; /* hash table for wired entries */
156 	kmem_cache_t *p_kmcache;	/* kmem cache for seg_pcache structs */
157 #ifdef _LP64
158 	ulong_t pad[1];
159 #endif /* _LP64 */
160 } pctrl1;
161 
162 static struct p_ctrl2 {
163 	kmutex_t p_mem_mtx;	/* protects window counter and p_halinks */
164 	pgcnt_t  p_locked_win;	/* # pages from window */
165 	pgcnt_t  p_locked;	/* # of pages cached by pagelock */
166 	uchar_t	 p_ahcur;	/* current active links for insert/delete */
167 	uchar_t  p_athr_on;	/* async reclaim thread is running. */
168 	pcache_link_t p_ahhead[2]; /* active buckets linkages */
169 } pctrl2;
170 
171 static struct p_ctrl3 {
172 	clock_t	p_pcp_maxage;		/* max pcp age in ticks */
173 	ulong_t	p_athr_empty_ahb;	/* athread walk stats */
174 	ulong_t p_athr_full_ahb;	/* athread walk stats */
175 	pgcnt_t	p_maxapurge_npages;	/* max pages to purge at a time */
176 	int	p_shrink_shft;		/* reap shift factor */
177 #ifdef _LP64
178 	ulong_t pad[3];
179 #endif /* _LP64 */
180 } pctrl3;
181 
/*
 * Short names for the pctrl1/pctrl2/pctrl3 members.  Code in this file
 * refers to the control structure members through these aliases.
 */
182 #define	seg_pdisabled			pctrl1.p_disabled
183 #define	seg_pmaxwindow			pctrl1.p_maxwin
184 #define	seg_phashsize_win		pctrl1.p_hashwin_sz
185 #define	seg_phashtab_win		pctrl1.p_htabwin
186 #define	seg_phashsize_wired		pctrl1.p_hashwired_sz
187 #define	seg_phashtab_wired		pctrl1.p_htabwired
188 #define	seg_pkmcache			pctrl1.p_kmcache
189 #define	seg_pmem_mtx			pctrl2.p_mem_mtx
190 #define	seg_plocked_window		pctrl2.p_locked_win
191 #define	seg_plocked			pctrl2.p_locked
192 #define	seg_pahcur			pctrl2.p_ahcur
193 #define	seg_pathr_on			pctrl2.p_athr_on
194 #define	seg_pahhead			pctrl2.p_ahhead
195 #define	seg_pmax_pcpage			pctrl3.p_pcp_maxage
196 #define	seg_pathr_empty_ahb		pctrl3.p_athr_empty_ahb
197 #define	seg_pathr_full_ahb		pctrl3.p_athr_full_ahb
198 #define	seg_pshrink_shift		pctrl3.p_shrink_shft
199 #define	seg_pmaxapurge_npages		pctrl3.p_maxapurge_npages
200 
/*
 * Index masks for the two hash tables and the right shift applied to the
 * tag pointer when hashing wired entries.
 * NOTE(review): "size - 1" is a usable index mask only when the table size
 * is a power of two -- confirm at the site that sets the sizes.
 */
201 #define	P_HASHWIN_MASK			(seg_phashsize_win - 1)
202 #define	P_HASHWIRED_MASK		(seg_phashsize_wired - 1)
203 #define	P_BASESHIFT			(6)
204 
/* NOTE(review): presumably the asynchronous purge thread -- confirm */
205 kthread_t *seg_pasync_thr;
206 
207 extern struct seg_ops segvn_ops;
208 extern struct seg_ops segspt_shmops;
209 
/*
 * Non zero when SEGP_FORCE_WIRED is set in a flags word / in an entry's
 * p_flags.
 */
210 #define	IS_PFLAGS_WIRED(flags) ((flags) & SEGP_FORCE_WIRED)
211 #define	IS_PCP_WIRED(pcp) IS_PFLAGS_WIRED((pcp)->p_flags)
212 
/* Ticks elapsed between lbolt value t and now, as an unsigned long. */
213 #define	LBOLT_DELTA(t)	((ulong_t)(ddi_get_lbolt() - (t)))
214 
/* Ticks elapsed since the entry's p_lbolt was last stamped. */
215 #define	PCP_AGE(pcp)	LBOLT_DELTA((pcp)->p_lbolt)
216 
/*
 * Map a pagelock cache tag to its hash bucket.
 *
 * htag0 argument can be a seg or amp pointer.
 *
 * Requests with SEGP_FORCE_WIRED set in flags hash on htag0 alone into
 * seg_phashtab_wired.  The wired bucket is returned as a struct seg_phash *,
 * so only its chain pointers and mutex may be used through the result.
 * All other requests hash on htag0 xor'ed with addr shifted right by the
 * page shift into seg_phashtab_win.  The page shift comes from flags >> 16
 * when SEGP_PSHIFT is set in flags and from page_get_shift() of the
 * segment's page size code otherwise.
 *
 * Every use of a macro argument is parenthesized so that a compound
 * expression passed as flags (for example "f | SEGP_PSHIFT") is evaluated
 * as a unit.  Arguments may be evaluated more than once; do not pass
 * expressions with side effects.
 */
#define	P_HASHBP(seg, htag0, addr, flags)				\
	(IS_PFLAGS_WIRED((flags)) ?					\
	    ((struct seg_phash *)&seg_phashtab_wired[P_HASHWIRED_MASK &	\
	    ((uintptr_t)(htag0) >> P_BASESHIFT)]) :			\
	    (&seg_phashtab_win[P_HASHWIN_MASK &				\
	    (((uintptr_t)(htag0) >> 3) ^				\
	    ((uintptr_t)(addr) >> (((flags) & SEGP_PSHIFT) ?		\
	    ((flags) >> 16) : page_get_shift((seg)->s_szc))))]))
228 
229 /*
230  * htag0 argument can be a seg or amp pointer.
 *
 * P_MATCH() is true when the entry has the same tag and start address as
 * the request and covers at least len bytes, so an entry spanning a larger
 * range than requested also matches.
231  */
232 #define	P_MATCH(pcp, htag0, addr, len)					\
233 	((pcp)->p_htag0 == (htag0) &&					\
234 	(pcp)->p_addr == (addr) &&					\
235 	(pcp)->p_len >= (len))
236 
/*
 * Same as P_MATCH() but the entry must also carry exactly the shadow list
 * pp, which identifies one particular entry among several that cover the
 * range.
 */
237 #define	P_MATCH_PP(pcp, htag0, addr, len, pp)				\
238 	((pcp)->p_pp == (pp) &&						\
239 	(pcp)->p_htag0 == (htag0) &&					\
240 	(pcp)->p_addr == (addr) &&					\
241 	(pcp)->p_len >= (len))
242 
/* Recover the seg_pcache entry that embeds per seg/amp link pl. */
243 #define	plink2pcache(pl)	((struct seg_pcache *)((uintptr_t)(pl) - \
244     offsetof(struct seg_pcache, p_plink)))
245 
/* Recover the hash bucket that embeds active list link hl at index l. */
246 #define	hlink2phash(hl, l)	((struct seg_phash *)((uintptr_t)(hl) -	\
247     offsetof(struct seg_phash, p_halink[l])))
248 
249 /*
250  * seg_padd_abuck()/seg_premove_abuck() link and unlink hash buckets from
251  * active hash bucket lists. We maintain active bucket lists to reduce the
252  * overhead of finding active buckets during asynchronous purging since there
253  * can be 10s of millions of buckets on a large system but only a small subset
254  * of them in actual use.
255  *
256  * There're 2 active bucket lists. Current active list (as per seg_pahcur) is
257  * used by seg_pinsert()/seg_pinactive()/seg_ppurge() to add and delete
258  * buckets. The other list is used by asynchronous purge thread. This allows
259  * the purge thread to walk its active list without holding seg_pmem_mtx for a
260  * long time. When asynchronous thread is done with its list it switches to
261  * current active list and makes the list it just finished processing as
262  * current active list.
263  *
264  * seg_padd_abuck() only adds the bucket to current list if the bucket is not
265  * yet on any list.  seg_premove_abuck() may remove the bucket from either
266  * list. If the bucket is on current list it will be always removed. Otherwise
267  * the bucket is only removed if asynchronous purge thread is not currently
268  * running or seg_premove_abuck() is called by asynchronous purge thread
269  * itself. A given bucket can only be on one of active lists at a time. These
270  * routines should be called with per bucket lock held.  The routines use
271  * seg_pmem_mtx to protect list updates. seg_padd_abuck() must be called after
272  * the first entry is added to the bucket chain and seg_premove_abuck() must
273  * be called after the last pcp entry is deleted from its chain. Per bucket
274  * lock should be held by the callers.  This avoids a potential race condition
275  * when seg_premove_abuck() removes a bucket after pcp entries are added to
276  * its list after the caller checked that the bucket has no entries. (this
277  * race would cause a loss of an active bucket from the active lists).
278  *
279  * Both lists are circular doubly linked lists anchored at seg_pahhead heads.
280  * New entries are added to the end of the list since LRU is used as the
281  * purging policy.
282  */
283 static void
284 seg_padd_abuck(struct seg_phash *hp)
285 {
286 	int lix;
287 
288 	ASSERT(MUTEX_HELD(&hp->p_hmutex));
289 	ASSERT((struct seg_phash *)hp->p_hnext != hp);
290 	ASSERT((struct seg_phash *)hp->p_hprev != hp);
291 	ASSERT(hp->p_hnext == hp->p_hprev);
292 	ASSERT(!IS_PCP_WIRED(hp->p_hnext));
293 	ASSERT(hp->p_hnext->p_hnext == (struct seg_pcache *)hp);
294 	ASSERT(hp->p_hprev->p_hprev == (struct seg_pcache *)hp);
295 	ASSERT(hp >= seg_phashtab_win &&
296 	    hp < &seg_phashtab_win[seg_phashsize_win]);
297 
298 	/*
299 	 * This bucket can already be on one of active lists
300 	 * since seg_premove_abuck() may have failed to remove it
301 	 * before.
302 	 */
303 	mutex_enter(&seg_pmem_mtx);
304 	lix = seg_pahcur;
305 	ASSERT(lix >= 0 && lix <= 1);
306 	if (hp->p_halink[lix].p_lnext != NULL) {
307 		ASSERT(hp->p_halink[lix].p_lprev != NULL);
308 		ASSERT(hp->p_halink[!lix].p_lnext == NULL);
309 		ASSERT(hp->p_halink[!lix].p_lprev == NULL);
310 		mutex_exit(&seg_pmem_mtx);
311 		return;
312 	}
313 	ASSERT(hp->p_halink[lix].p_lprev == NULL);
314 
315 	/*
316 	 * If this bucket is still on list !lix async thread can't yet remove
317 	 * it since we hold here per bucket lock. In this case just return
318 	 * since async thread will eventually find and process this bucket.
319 	 */
320 	if (hp->p_halink[!lix].p_lnext != NULL) {
321 		ASSERT(hp->p_halink[!lix].p_lprev != NULL);
322 		mutex_exit(&seg_pmem_mtx);
323 		return;
324 	}
325 	ASSERT(hp->p_halink[!lix].p_lprev == NULL);
326 	/*
327 	 * This bucket is not on any active bucket list yet.
328 	 * Add the bucket to the tail of current active list.
329 	 */
330 	hp->p_halink[lix].p_lnext = &seg_pahhead[lix];
331 	hp->p_halink[lix].p_lprev = seg_pahhead[lix].p_lprev;
332 	seg_pahhead[lix].p_lprev->p_lnext = &hp->p_halink[lix];
333 	seg_pahhead[lix].p_lprev = &hp->p_halink[lix];
334 	mutex_exit(&seg_pmem_mtx);
335 }
336 
337 static void
338 seg_premove_abuck(struct seg_phash *hp, int athr)
339 {
340 	int lix;
341 
342 	ASSERT(MUTEX_HELD(&hp->p_hmutex));
343 	ASSERT((struct seg_phash *)hp->p_hnext == hp);
344 	ASSERT((struct seg_phash *)hp->p_hprev == hp);
345 	ASSERT(hp >= seg_phashtab_win &&
346 	    hp < &seg_phashtab_win[seg_phashsize_win]);
347 
348 	if (athr) {
349 		ASSERT(seg_pathr_on);
350 		ASSERT(seg_pahcur <= 1);
351 		/*
352 		 * We are called by asynchronous thread that found this bucket
353 		 * on not currently active (i.e. !seg_pahcur) list. Remove it
354 		 * from there.  Per bucket lock we are holding makes sure
355 		 * seg_pinsert() can't sneak in and add pcp entries to this
356 		 * bucket right before we remove the bucket from its list.
357 		 */
358 		lix = !seg_pahcur;
359 		ASSERT(hp->p_halink[lix].p_lnext != NULL);
360 		ASSERT(hp->p_halink[lix].p_lprev != NULL);
361 		ASSERT(hp->p_halink[!lix].p_lnext == NULL);
362 		ASSERT(hp->p_halink[!lix].p_lprev == NULL);
363 		hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
364 		hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
365 		hp->p_halink[lix].p_lnext = NULL;
366 		hp->p_halink[lix].p_lprev = NULL;
367 		return;
368 	}
369 
370 	mutex_enter(&seg_pmem_mtx);
371 	lix = seg_pahcur;
372 	ASSERT(lix >= 0 && lix <= 1);
373 
374 	/*
375 	 * If the bucket is on currently active list just remove it from
376 	 * there.
377 	 */
378 	if (hp->p_halink[lix].p_lnext != NULL) {
379 		ASSERT(hp->p_halink[lix].p_lprev != NULL);
380 		ASSERT(hp->p_halink[!lix].p_lnext == NULL);
381 		ASSERT(hp->p_halink[!lix].p_lprev == NULL);
382 		hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
383 		hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
384 		hp->p_halink[lix].p_lnext = NULL;
385 		hp->p_halink[lix].p_lprev = NULL;
386 		mutex_exit(&seg_pmem_mtx);
387 		return;
388 	}
389 	ASSERT(hp->p_halink[lix].p_lprev == NULL);
390 
391 	/*
392 	 * If asynchronous thread is not running we can remove the bucket from
393 	 * not currently active list. The bucket must be on this list since we
394 	 * already checked that it's not on the other list and the bucket from
395 	 * which we just deleted the last pcp entry must be still on one of the
396 	 * active bucket lists.
397 	 */
398 	lix = !lix;
399 	ASSERT(hp->p_halink[lix].p_lnext != NULL);
400 	ASSERT(hp->p_halink[lix].p_lprev != NULL);
401 
402 	if (!seg_pathr_on) {
403 		hp->p_halink[lix].p_lnext->p_lprev = hp->p_halink[lix].p_lprev;
404 		hp->p_halink[lix].p_lprev->p_lnext = hp->p_halink[lix].p_lnext;
405 		hp->p_halink[lix].p_lnext = NULL;
406 		hp->p_halink[lix].p_lprev = NULL;
407 	}
408 	mutex_exit(&seg_pmem_mtx);
409 }
410 
411 /*
412  * Check if bucket pointed by hp already has a pcp entry that matches request
413  * htag0, addr and len. Set *found to 1 if match is found and to 0 otherwise.
414  * Also delete matching entries that cover smaller address range but start
415  * at the same address as addr argument. Return the list of deleted entries if
416  * any. This is an internal helper function called from seg_pinsert() only
417  * for non wired shadow lists. The caller already holds a per seg/amp list
418  * lock.
419  */
420 static struct seg_pcache *
421 seg_plookup_checkdup(struct seg_phash *hp, void *htag0,
422     caddr_t addr, size_t len, int *found)
423 {
424 	struct seg_pcache *pcp;
425 	struct seg_pcache *delcallb_list = NULL;
426 
427 	ASSERT(MUTEX_HELD(&hp->p_hmutex));
428 
429 	*found = 0;
430 	for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
431 	    pcp = pcp->p_hnext) {
432 		ASSERT(pcp->p_hashp == hp);
433 		if (pcp->p_htag0 == htag0 && pcp->p_addr == addr) {
434 			ASSERT(!IS_PCP_WIRED(pcp));
435 			if (pcp->p_len < len) {
436 				pcache_link_t *plinkp;
437 				if (pcp->p_active) {
438 					continue;
439 				}
440 				plinkp = &pcp->p_plink;
441 				plinkp->p_lprev->p_lnext = plinkp->p_lnext;
442 				plinkp->p_lnext->p_lprev = plinkp->p_lprev;
443 				pcp->p_hprev->p_hnext = pcp->p_hnext;
444 				pcp->p_hnext->p_hprev = pcp->p_hprev;
445 				pcp->p_hprev = delcallb_list;
446 				delcallb_list = pcp;
447 			} else {
448 				*found = 1;
449 				break;
450 			}
451 		}
452 	}
453 	return (delcallb_list);
454 }
455 
456 /*
457  * lookup an address range in pagelock cache. Return shadow list and bump up
458  * active count. If amp is not NULL use amp as a lookup tag otherwise use seg
459  * as a lookup tag.
460  */
461 struct page **
462 seg_plookup(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
463     enum seg_rw rw, uint_t flags)
464 {
465 	struct seg_pcache *pcp;
466 	struct seg_phash *hp;
467 	void *htag0;
468 
469 	ASSERT(seg != NULL);
470 	ASSERT(rw == S_READ || rw == S_WRITE);
471 
472 	/*
473 	 * Skip pagelock cache, while DR is in progress or
474 	 * seg_pcache is off.
475 	 */
476 	if (seg_pdisabled) {
477 		return (NULL);
478 	}
479 	ASSERT(seg_phashsize_win != 0);
480 
481 	htag0 = (amp == NULL ? (void *)seg : (void *)amp);
482 	hp = P_HASHBP(seg, htag0, addr, flags);
483 	mutex_enter(&hp->p_hmutex);
484 	for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
485 	    pcp = pcp->p_hnext) {
486 		ASSERT(pcp->p_hashp == hp);
487 		if (P_MATCH(pcp, htag0, addr, len)) {
488 			ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
489 			/*
490 			 * If this request wants to write pages
491 			 * but write permissions starting from
492 			 * addr don't cover the entire length len
493 			 * return lookup failure back to the caller.
494 			 * It will check protections and fail this
495 			 * pagelock operation with EACCESS error.
496 			 */
497 			if (rw == S_WRITE && pcp->p_wlen < len) {
498 				break;
499 			}
500 			if (pcp->p_active == UINT_MAX) {
501 				break;
502 			}
503 			pcp->p_active++;
504 			if (rw == S_WRITE && !pcp->p_write) {
505 				pcp->p_write = 1;
506 			}
507 			mutex_exit(&hp->p_hmutex);
508 			return (pcp->p_pp);
509 		}
510 	}
511 	mutex_exit(&hp->p_hmutex);
512 	return (NULL);
513 }
514 
515 /*
516  * mark address range inactive. If the cache is off or the address range is
517  * not in the cache or another shadow list that covers bigger range is found
518  * we call the segment driver to reclaim the pages. Otherwise just decrement
519  * active count and set ref bit.  If amp is not NULL use amp as a lookup tag
520  * otherwise use seg as a lookup tag.
 *
 * The caller's own entry is identified with P_MATCH_PP(), i.e. by tag,
 * address, length and the exact shadow list pp.  Other entries that merely
 * satisfy P_MATCH() are treated as duplicates covering the same request.
 *
 * When the caller's entry is freed here, the callback is invoked with the
 * entry's full p_len instead of the caller's len, and with S_WRITE if the
 * entry was ever locked for write, and the global page counters are reduced
 * by the entry's page count.  When no entry is found the callback is invoked
 * with the caller's own arguments and the counters are left alone.
 *
 * Locks: the hash bucket lock is always taken.  For non wired entries that
 * may be removed, the per seg/amp list lock is taken as well; it ranks above
 * the hash lock.  seg_pmem_mtx is taken last to adjust the counters.
521  */
522 void
523 seg_pinactive(struct seg *seg, struct anon_map *amp, caddr_t addr,
524     size_t len, struct page **pp, enum seg_rw rw, uint_t flags,
525     seg_preclaim_cbfunc_t callback)
526 {
527 	struct seg_pcache *pcp;
528 	struct seg_phash *hp;
529 	kmutex_t *pmtx = NULL;
530 	pcache_link_t *pheadp;
531 	void *htag0;
532 	pgcnt_t npages = 0;
533 	int keep = 0;
534 
535 	ASSERT(seg != NULL);
536 	ASSERT(rw == S_READ || rw == S_WRITE);
537 
538 	htag0 = (amp == NULL ? (void *)seg : (void *)amp);
539 
540 	/*
541 	 * Skip lookup if pcache is not configured.
542 	 */
543 	if (seg_phashsize_win == 0) {
544 		goto out;
545 	}
546 
547 	/*
548 	 * Grab per seg/amp lock before hash lock if we are going to remove
549 	 * inactive entry from pcache.
550 	 */
551 	if (!IS_PFLAGS_WIRED(flags) && seg_pdisabled) {
552 		if (amp == NULL) {
553 			pheadp = &seg->s_phead;
554 			pmtx = &seg->s_pmtx;
555 		} else {
556 			pheadp = &amp->a_phead;
557 			pmtx = &amp->a_pmtx;
558 		}
559 		mutex_enter(pmtx);
560 	}
561 
562 	hp = P_HASHBP(seg, htag0, addr, flags);
563 	mutex_enter(&hp->p_hmutex);
564 again:
565 	for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
566 	    pcp = pcp->p_hnext) {
567 		ASSERT(pcp->p_hashp == hp);
568 		if (P_MATCH_PP(pcp, htag0, addr, len, pp)) {
569 			ASSERT(IS_PFLAGS_WIRED(flags) == IS_PCP_WIRED(pcp));
570 			ASSERT(pcp->p_active);
571 			if (keep) {
572 				/*
573 				 * Don't remove this pcp entry
574 				 * if we didn't find duplicate
575 				 * shadow lists on second search.
576 				 * Somebody removed those duplicates
577 				 * since we dropped hash lock after first
578 				 * search.
579 				 */
580 				ASSERT(pmtx != NULL);
581 				ASSERT(!IS_PFLAGS_WIRED(flags));
582 				mutex_exit(pmtx);
583 				pmtx = NULL;
584 			}
585 			pcp->p_active--;
			/*
			 * A held pmtx at this point means removal is wanted:
			 * either caching is disabled (non wired case) or a
			 * duplicate was seen earlier in this chain.
			 */
586 			if (pcp->p_active == 0 && (pmtx != NULL ||
587 			    (seg_pdisabled && IS_PFLAGS_WIRED(flags)))) {
588 
589 				/*
590 				 * This entry is no longer active.  Remove it
591 				 * now either because pcaching is temporarily
592 				 * disabled or there're other pcp entries that
593 				 * can match this pagelock request (i.e. this
594 				 * entry is a duplicate).
595 				 */
596 
597 				ASSERT(callback == pcp->p_callback);
598 				if (pmtx != NULL) {
599 					pcache_link_t *plinkp = &pcp->p_plink;
600 					ASSERT(!IS_PCP_WIRED(pcp));
601 					ASSERT(pheadp->p_lnext != pheadp);
602 					ASSERT(pheadp->p_lprev != pheadp);
603 					plinkp->p_lprev->p_lnext =
604 					    plinkp->p_lnext;
605 					plinkp->p_lnext->p_lprev =
606 					    plinkp->p_lprev;
607 				}
608 				pcp->p_hprev->p_hnext = pcp->p_hnext;
609 				pcp->p_hnext->p_hprev = pcp->p_hprev;
610 				if (!IS_PCP_WIRED(pcp) &&
611 				    hp->p_hnext == (struct seg_pcache *)hp) {
612 					/*
613 					 * We removed the last entry from this
614 					 * bucket.  Now remove the bucket from
615 					 * its active list.
616 					 */
617 					seg_premove_abuck(hp, 0);
618 				}
619 				mutex_exit(&hp->p_hmutex);
620 				if (pmtx != NULL) {
621 					mutex_exit(pmtx);
622 				}
				/*
				 * The entry is unhooked and private to us
				 * now.  Hand its full length and strongest
				 * access mode to the callback at out: and
				 * remember its page count for the counter
				 * adjustment.
				 */
623 				len = pcp->p_len;
624 				npages = btop(len);
625 				if (rw != S_WRITE && pcp->p_write) {
626 					rw = S_WRITE;
627 				}
628 				kmem_cache_free(seg_pkmcache, pcp);
629 				goto out;
630 			} else {
631 				/*
632 				 * We found a matching pcp entry but will not
633 				 * free it right away even if it's no longer
634 				 * active.
635 				 */
636 				if (!pcp->p_active && !IS_PCP_WIRED(pcp)) {
637 					/*
638 					 * Set the reference bit and mark the
639 					 * time of last access to this pcp
640 					 * so that asynchronous thread doesn't
641 					 * free it immediately since
642 					 * it may be reactivated very soon.
643 					 */
644 					pcp->p_lbolt = ddi_get_lbolt();
645 					pcp->p_ref = 1;
646 				}
647 				mutex_exit(&hp->p_hmutex);
648 				if (pmtx != NULL) {
649 					mutex_exit(pmtx);
650 				}
651 				return;
652 			}
653 		} else if (!IS_PFLAGS_WIRED(flags) &&
654 		    P_MATCH(pcp, htag0, addr, len)) {
655 			/*
656 			 * This is a duplicate pcp entry.  This situation may
657 			 * happen if a bigger shadow list that covers our
658 			 * range was added while our entry was still active.
659 			 * Now we can free our pcp entry if it becomes
660 			 * inactive.
661 			 */
662 			if (!pcp->p_active) {
663 				/*
664 				 * Mark this entry as referenced just in case
665 				 * we'll free our own pcp entry soon.
666 				 */
667 				pcp->p_lbolt = ddi_get_lbolt();
668 				pcp->p_ref = 1;
669 			}
670 			if (pmtx != NULL) {
671 				/*
672 				 * we are already holding pmtx and found a
673 				 * duplicate.  Don't keep our own pcp entry.
674 				 */
675 				keep = 0;
676 				continue;
677 			}
678 			/*
679 			 * We have to use mutex_tryenter to attempt to lock
680 			 * seg/amp list lock since we already hold hash lock
681 			 * and seg/amp list lock is above hash lock in lock
682 			 * order.  If mutex_tryenter fails drop hash lock and
683 			 * retake both locks in correct order and research
684 			 * this hash chain.
685 			 */
686 			ASSERT(keep == 0);
687 			if (amp == NULL) {
688 				pheadp = &seg->s_phead;
689 				pmtx = &seg->s_pmtx;
690 			} else {
691 				pheadp = &amp->a_phead;
692 				pmtx = &amp->a_pmtx;
693 			}
694 			if (!mutex_tryenter(pmtx)) {
695 				mutex_exit(&hp->p_hmutex);
696 				mutex_enter(pmtx);
697 				mutex_enter(&hp->p_hmutex);
698 				/*
699 				 * If we don't find bigger shadow list on
700 				 * second search (it may happen since we
701 				 * dropped bucket lock) keep the entry that
702 				 * matches our own shadow list.
703 				 */
704 				keep = 1;
705 				goto again;
706 			}
707 		}
708 	}
	/*
	 * No entry carries the caller's shadow list: fall through and let
	 * the callback reclaim the pages with the caller's own arguments.
	 */
709 	mutex_exit(&hp->p_hmutex);
710 	if (pmtx != NULL) {
711 		mutex_exit(pmtx);
712 	}
713 out:
714 	(*callback)(htag0, addr, len, pp, rw, 0);
	/* npages is non zero only when an entry was freed above. */
715 	if (npages) {
716 		mutex_enter(&seg_pmem_mtx);
717 		ASSERT(seg_plocked >= npages);
718 		seg_plocked -= npages;
719 		if (!IS_PFLAGS_WIRED(flags)) {
720 			ASSERT(seg_plocked_window >= npages);
721 			seg_plocked_window -= npages;
722 		}
723 		mutex_exit(&seg_pmem_mtx);
724 	}
725 
726 }
727 
728 #ifdef DEBUG
729 static uint32_t p_insert_chk_mtbf = 0;
730 #endif
731 
732 /*
733  * The seg_pinsert_check() is used by segment drivers to predict whether
734  * a call to seg_pinsert will fail and thereby avoid wasteful pre-processing.
735  */
736 /*ARGSUSED*/
737 int
738 seg_pinsert_check(struct seg *seg, struct anon_map *amp, caddr_t addr,
739     size_t len, uint_t flags)
740 {
741 	ASSERT(seg != NULL);
742 
743 #ifdef DEBUG
744 	if (p_insert_chk_mtbf && !(gethrtime() % p_insert_chk_mtbf)) {
745 		return (SEGP_FAIL);
746 	}
747 #endif
748 
749 	if (seg_pdisabled) {
750 		return (SEGP_FAIL);
751 	}
752 	ASSERT(seg_phashsize_win != 0);
753 
754 	if (IS_PFLAGS_WIRED(flags)) {
755 		return (SEGP_SUCCESS);
756 	}
757 
758 	if (seg_plocked_window + btop(len) > seg_pmaxwindow) {
759 		return (SEGP_FAIL);
760 	}
761 
762 	if (freemem < desfree) {
763 		return (SEGP_FAIL);
764 	}
765 
766 	return (SEGP_SUCCESS);
767 }
768 
/*
 * Fault injection knob for seg_pinsert() (DEBUG only): when non zero, the
 * insert fails whenever gethrtime() is a multiple of it.
 */
769 #ifdef DEBUG
770 static uint32_t p_insert_mtbf = 0;
771 #endif
772 
773 /*
774  * Insert address range with shadow list into pagelock cache if there's no
775  * shadow list already cached for this address range. If the cache is off or
776  * caching is temporarily disabled or the allowed 'window' is exceeded return
777  * SEGP_FAIL. Otherwise return SEGP_SUCCESS.
778  *
779  * For non wired shadow lists (segvn case) include address in the hashing
780  * function to avoid linking all the entries from the same segment or amp on
781  * the same bucket.  amp is used instead of seg if amp is not NULL. Non wired
782  * pcache entries are also linked on a per segment/amp list so that all
783  * entries can be found quickly during seg/amp purge without walking the
784  * entire pcache hash table.  For wired shadow lists (segspt case) we
785  * don't use address hashing and per segment linking because the caller
786  * currently inserts only one entry per segment that covers the entire
787  * segment. If we used per segment linking even for segspt it would complicate
788  * seg_ppurge_wiredpp() locking.
789  *
790  * Both hash bucket and per seg/amp locks need to be held before adding a non
791  * wired entry to hash and per seg/amp lists. per seg/amp lock should be taken
792  * first.
793  *
794  * This function will also remove from pcache old inactive shadow lists that
795  * overlap with this request but cover smaller range for the same start
796  * address.
 *
 * len must be a multiple of the page size.  wlen is the number of bytes
 * starting at addr that are writable; it must equal len for S_WRITE requests
 * and for amp tagged requests.  The new entry starts out with an active
 * count of 1.  The page counters are charged before the entry is allocated
 * and rolled back if the insert turns out to be unnecessary.
 *
 * NOTE(review): when a cached entry that already covers the range is found,
 * the new entry is discarded and pp is not cached, yet SEGP_SUCCESS is still
 * returned.
 * NOTE(review): p_lbolt and p_ref are not set here; in the code visible in
 * this file they are first stamped by seg_pinactive().
797  */
798 int
799 seg_pinsert(struct seg *seg, struct anon_map *amp, caddr_t addr, size_t len,
800     size_t wlen, struct page **pp, enum seg_rw rw, uint_t flags,
801     seg_preclaim_cbfunc_t callback)
802 {
803 	struct seg_pcache *pcp;
804 	struct seg_phash *hp;
805 	pgcnt_t npages;
806 	pcache_link_t *pheadp;
807 	kmutex_t *pmtx;
808 	struct seg_pcache *delcallb_list = NULL;
809 
810 	ASSERT(seg != NULL);
811 	ASSERT(rw == S_READ || rw == S_WRITE);
812 	ASSERT(rw == S_READ || wlen == len);
813 	ASSERT(rw == S_WRITE || wlen <= len);
814 	ASSERT(amp == NULL || wlen == len);
815 
816 #ifdef DEBUG
817 	if (p_insert_mtbf && !(gethrtime() % p_insert_mtbf)) {
818 		return (SEGP_FAIL);
819 	}
820 #endif
821 
822 	if (seg_pdisabled) {
823 		return (SEGP_FAIL);
824 	}
825 	ASSERT(seg_phashsize_win != 0);
826 
827 	ASSERT((len & PAGEOFFSET) == 0);
828 	npages = btop(len);
	/*
	 * Charge the counters up front.  Only non wired requests count
	 * against the window limit.
	 */
829 	mutex_enter(&seg_pmem_mtx);
830 	if (!IS_PFLAGS_WIRED(flags)) {
831 		if (seg_plocked_window + npages > seg_pmaxwindow) {
832 			mutex_exit(&seg_pmem_mtx);
833 			return (SEGP_FAIL);
834 		}
835 		seg_plocked_window += npages;
836 	}
837 	seg_plocked += npages;
838 	mutex_exit(&seg_pmem_mtx);
839 
840 	pcp = kmem_cache_alloc(seg_pkmcache, KM_SLEEP);
841 	/*
842 	 * If amp is not NULL set htag0 to amp otherwise set it to seg.
	 * Only the low 16 bits of flags are kept in p_flags.
843 	 */
844 	if (amp == NULL) {
845 		pcp->p_htag0 = (void *)seg;
846 		pcp->p_flags = flags & 0xffff;
847 	} else {
848 		pcp->p_htag0 = (void *)amp;
849 		pcp->p_flags = (flags & 0xffff) | SEGP_AMP;
850 	}
851 	pcp->p_addr = addr;
852 	pcp->p_len = len;
853 	pcp->p_wlen = wlen;
854 	pcp->p_pp = pp;
855 	pcp->p_write = (rw == S_WRITE);
856 	pcp->p_callback = callback;
857 	pcp->p_active = 1;
858 
859 	hp = P_HASHBP(seg, pcp->p_htag0, addr, flags);
860 	if (!IS_PFLAGS_WIRED(flags)) {
861 		int found;
862 		void *htag0;
863 		if (amp == NULL) {
864 			pheadp = &seg->s_phead;
865 			pmtx = &seg->s_pmtx;
866 			htag0 = (void *)seg;
867 		} else {
868 			pheadp = &amp->a_phead;
869 			pmtx = &amp->a_pmtx;
870 			htag0 = (void *)amp;
871 		}
872 		mutex_enter(pmtx);
873 		mutex_enter(&hp->p_hmutex);
874 		delcallb_list = seg_plookup_checkdup(hp, htag0, addr,
875 		    len, &found);
876 		if (found) {
			/*
			 * An entry covering this range is already cached.
			 * Undo the charge taken above and drop the new
			 * entry; shorter entries unhooked by the lookup are
			 * still reclaimed at out:.
			 */
877 			mutex_exit(&hp->p_hmutex);
878 			mutex_exit(pmtx);
879 			mutex_enter(&seg_pmem_mtx);
880 			seg_plocked -= npages;
881 			seg_plocked_window -= npages;
882 			mutex_exit(&seg_pmem_mtx);
883 			kmem_cache_free(seg_pkmcache, pcp);
884 			goto out;
885 		}
		/* Link the new entry at the head of the per seg/amp list. */
886 		pcp->p_plink.p_lnext = pheadp->p_lnext;
887 		pcp->p_plink.p_lprev = pheadp;
888 		pheadp->p_lnext->p_lprev = &pcp->p_plink;
889 		pheadp->p_lnext = &pcp->p_plink;
890 	} else {
891 		mutex_enter(&hp->p_hmutex);
892 	}
	/* Link the new entry at the head of the hash chain. */
893 	pcp->p_hashp = hp;
894 	pcp->p_hnext = hp->p_hnext;
895 	pcp->p_hprev = (struct seg_pcache *)hp;
896 	hp->p_hnext->p_hprev = pcp;
897 	hp->p_hnext = pcp;
	/*
	 * The entry went in at the head; if it is also the tail the bucket
	 * was empty before, so the bucket has to be put on an active list.
	 */
898 	if (!IS_PFLAGS_WIRED(flags) &&
899 	    hp->p_hprev == pcp) {
900 		seg_padd_abuck(hp);
901 	}
902 	mutex_exit(&hp->p_hmutex);
903 	if (!IS_PFLAGS_WIRED(flags)) {
904 		mutex_exit(pmtx);
905 	}
906 
907 out:
	/*
	 * Reclaim the shorter inactive entries unhooked by
	 * seg_plookup_checkdup(), which are chained through p_hprev.  This
	 * is done with no pcache locks held.  npages is reused to total
	 * their pages.
	 */
908 	npages = 0;
909 	while (delcallb_list != NULL) {
910 		pcp = delcallb_list;
911 		delcallb_list = pcp->p_hprev;
912 		ASSERT(!IS_PCP_WIRED(pcp) && !pcp->p_active);
913 		(void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
914 		    pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
915 		npages += btop(pcp->p_len);
916 		kmem_cache_free(seg_pkmcache, pcp);
917 	}
918 	if (npages) {
919 		ASSERT(!IS_PFLAGS_WIRED(flags));
920 		mutex_enter(&seg_pmem_mtx);
921 		ASSERT(seg_plocked >= npages);
922 		ASSERT(seg_plocked_window >= npages);
923 		seg_plocked -= npages;
924 		seg_plocked_window -= npages;
925 		mutex_exit(&seg_pmem_mtx);
926 	}
927 
928 	return (SEGP_SUCCESS);
929 }
930 
/*
 * Purge entries from the pagelock cache if not active
 * and not recently used.
 *
 * force == 0 (used by seg_pasync_thread()): purge only when memory is low
 * or the window cache is close to seg_pmaxwindow, and stop once roughly
 * npgs_to_purge pages worth of entries have been collected.
 *
 * force != 0 (used by seg_p_disable()): additionally unlink every inactive
 * entry in the wired hash table and ignore the reference/age check for the
 * non-wired entries.
 *
 * Entries are unlinked while the relevant hash bucket mutex is held and are
 * chained on a private list through p_hprev; their callbacks are invoked
 * and the entries freed only after all bucket locks have been dropped.
 */
static void
seg_ppurge_async(int force)
{
	struct seg_pcache *delcallb_list = NULL;
	struct seg_pcache *pcp;
	struct seg_phash *hp;
	pgcnt_t npages = 0;
	pgcnt_t npages_window = 0;
	pgcnt_t	npgs_to_purge;
	pgcnt_t npgs_purged = 0;
	int hlinks = 0;
	int hlix;
	pcache_link_t *hlinkp;
	pcache_link_t *hlnextp = NULL;
	int lowmem;
	int trim;

	ASSERT(seg_phashsize_win != 0);

	/*
	 * if the cache is off or empty, return
	 */
	if (seg_plocked == 0 || (!force && seg_plocked_window == 0)) {
		return;
	}

	if (!force) {
		/*
		 * Decide whether a non-forced purge is warranted.
		 * lowmem is set when free memory (less needfree) is at or
		 * below a threshold derived from desfree/lotsfree; the two
		 * higher thresholds additionally require the window cache
		 * to hold at least 1/2 or 3/4 of availrmem_initial.
		 */
		lowmem = 0;
		trim = 0;
		if (freemem < lotsfree + needfree) {
			spgcnt_t fmem = MAX((spgcnt_t)(freemem - needfree), 0);
			if (fmem <= 5 * (desfree >> 2)) {
				lowmem = 1;
			} else if (fmem <= 7 * (lotsfree >> 3)) {
				if (seg_plocked_window >=
				    (availrmem_initial >> 1)) {
					lowmem = 1;
				}
			} else if (fmem < lotsfree) {
				if (seg_plocked_window >=
				    3 * (availrmem_initial >> 2)) {
					lowmem = 1;
				}
			}
		}
		/* trim once the window cache reaches 7/8 of its maximum */
		if (seg_plocked_window >= 7 * (seg_pmaxwindow >> 3)) {
			trim = 1;
		}
		if (!lowmem && !trim) {
			return;
		}
		/*
		 * Target a fraction of the window cache, capped by
		 * seg_pmaxapurge_npages (or by the larger of that and
		 * desfree when memory is low).
		 */
		npgs_to_purge = seg_plocked_window >>
		    seg_pshrink_shift;
		if (lowmem) {
			npgs_to_purge = MIN(npgs_to_purge,
			    MAX(seg_pmaxapurge_npages, desfree));
		} else {
			npgs_to_purge = MIN(npgs_to_purge,
			    seg_pmaxapurge_npages);
		}
		if (npgs_to_purge == 0) {
			return;
		}
	} else {
		struct seg_phash_wired *hpw;

		ASSERT(seg_phashsize_wired != 0);

		/*
		 * Forced purge: sweep the whole wired hash table and
		 * unlink every entry that is not active.
		 */
		for (hpw = seg_phashtab_wired;
		    hpw < &seg_phashtab_wired[seg_phashsize_wired]; hpw++) {

			/* unlocked peek; skip buckets that look empty */
			if (hpw->p_hnext == (struct seg_pcache *)hpw) {
				continue;
			}

			mutex_enter(&hpw->p_hmutex);

			for (pcp = hpw->p_hnext;
			    pcp != (struct seg_pcache *)hpw;
			    pcp = pcp->p_hnext) {

				ASSERT(IS_PCP_WIRED(pcp));
				ASSERT(pcp->p_hashp ==
				    (struct seg_phash *)hpw);

				if (pcp->p_active) {
					continue;
				}
				/*
				 * Unlink from the hash chain and push onto
				 * delcallb_list, reusing p_hprev as the
				 * list link. p_hnext is left intact so the
				 * enclosing loop can still advance.
				 */
				pcp->p_hprev->p_hnext = pcp->p_hnext;
				pcp->p_hnext->p_hprev = pcp->p_hprev;
				pcp->p_hprev = delcallb_list;
				delcallb_list = pcp;
			}
			mutex_exit(&hpw->p_hmutex);
		}
	}

	/*
	 * seg_pathr_on marks a walk of the active bucket lists as being in
	 * progress. If it is already set, skip the walk and just run the
	 * callbacks collected so far.
	 */
	mutex_enter(&seg_pmem_mtx);
	if (seg_pathr_on) {
		mutex_exit(&seg_pmem_mtx);
		goto runcb;
	}
	seg_pathr_on = 1;
	mutex_exit(&seg_pmem_mtx);
	ASSERT(seg_pahcur <= 1);
	/* walk the list that is not the one selected by seg_pahcur */
	hlix = !seg_pahcur;

again:
	for (hlinkp = seg_pahhead[hlix].p_lnext; hlinkp != &seg_pahhead[hlix];
	    hlinkp = hlnextp) {

		/*
		 * Fetch the successor before processing this bucket.
		 * NOTE(review): presumably because seg_premove_abuck()
		 * below can take hlinkp off the list - confirm.
		 */
		hlnextp = hlinkp->p_lnext;
		ASSERT(hlnextp != NULL);

		hp = hlink2phash(hlinkp, hlix);
		/* unlocked peek; only count and skip empty buckets */
		if (hp->p_hnext == (struct seg_pcache *)hp) {
			seg_pathr_empty_ahb++;
			continue;
		}
		seg_pathr_full_ahb++;
		mutex_enter(&hp->p_hmutex);

		for (pcp = hp->p_hnext; pcp != (struct seg_pcache *)hp;
		    pcp = pcp->p_hnext) {
			pcache_link_t *pheadp;
			pcache_link_t *plinkp;
			void *htag0;
			kmutex_t *pmtx;

			ASSERT(!IS_PCP_WIRED(pcp));
			ASSERT(pcp->p_hashp == hp);

			if (pcp->p_active) {
				continue;
			}
			/*
			 * Unless forced, an entry that has its reference
			 * bit set and is younger than seg_pmax_pcpage is
			 * kept for this pass; only its reference bit is
			 * cleared.
			 */
			if (!force && pcp->p_ref &&
			    PCP_AGE(pcp) < seg_pmax_pcpage) {
				pcp->p_ref = 0;
				continue;
			}
			/*
			 * Locate the per-amp or per-seg list this entry is
			 * on and the mutex protecting it. pheadp is only
			 * consulted by the ASSERTs below.
			 */
			plinkp = &pcp->p_plink;
			htag0 = pcp->p_htag0;
			if (pcp->p_flags & SEGP_AMP) {
				pheadp = &((amp_t *)htag0)->a_phead;
				pmtx = &((amp_t *)htag0)->a_pmtx;
			} else {
				pheadp = &((seg_t *)htag0)->s_phead;
				pmtx = &((seg_t *)htag0)->s_pmtx;
			}
			/*
			 * The bucket mutex is already held, whereas other
			 * paths in this file (e.g. seg_ppurge()) take pmtx
			 * before the bucket mutex. Only a non-blocking
			 * attempt is made here; on failure the entry is
			 * left alone for this pass.
			 */
			if (!mutex_tryenter(pmtx)) {
				continue;
			}
			ASSERT(pheadp->p_lnext != pheadp);
			ASSERT(pheadp->p_lprev != pheadp);
			/* unlink from the seg/amp list and the hash chain */
			plinkp->p_lprev->p_lnext =
			    plinkp->p_lnext;
			plinkp->p_lnext->p_lprev =
			    plinkp->p_lprev;
			pcp->p_hprev->p_hnext = pcp->p_hnext;
			pcp->p_hnext->p_hprev = pcp->p_hprev;
			mutex_exit(pmtx);
			/* queue for the deferred callback pass */
			pcp->p_hprev = delcallb_list;
			delcallb_list = pcp;
			npgs_purged += btop(pcp->p_len);
		}
		/* bucket drained: take it off the active bucket list */
		if (hp->p_hnext == (struct seg_pcache *)hp) {
			seg_premove_abuck(hp, 1);
		}
		mutex_exit(&hp->p_hmutex);
		/* everything currently counted in the window is collected */
		if (npgs_purged >= seg_plocked_window) {
			break;
		}
		if (!force) {
			if (npgs_purged >= npgs_to_purge) {
				break;
			}
			/*
			 * When purging only because of low memory, recheck
			 * free memory each time the count of non-empty
			 * buckets visited is a multiple of 16 and stop
			 * early if the shortage is over.
			 */
			if (!trim && !(seg_pathr_full_ahb & 15)) {
				ASSERT(lowmem);
				if (freemem >= lotsfree + needfree) {
					break;
				}
			}
		}
	}

	if (hlinkp == &seg_pahhead[hlix]) {
		/*
		 * We processed the entire hlix active bucket list
		 * but didn't find enough pages to reclaim.
		 * Switch the lists and walk the other list
		 * if we haven't done it yet.
		 */
		mutex_enter(&seg_pmem_mtx);
		ASSERT(seg_pathr_on);
		ASSERT(seg_pahcur == !hlix);
		seg_pahcur = hlix;
		mutex_exit(&seg_pmem_mtx);
		if (++hlinks < 2) {
			hlix = !hlix;
			goto again;
		}
	} else if ((hlinkp = hlnextp) != &seg_pahhead[hlix] &&
	    seg_pahhead[hlix].p_lnext != hlinkp) {
		ASSERT(hlinkp != NULL);
		ASSERT(hlinkp->p_lprev != &seg_pahhead[hlix]);
		ASSERT(seg_pahhead[hlix].p_lnext != &seg_pahhead[hlix]);
		ASSERT(seg_pahhead[hlix].p_lprev != &seg_pahhead[hlix]);

		/*
		 * Reinsert the header to point to hlinkp
		 * so that we start from hlinkp bucket next time around.
		 */
		seg_pahhead[hlix].p_lnext->p_lprev = seg_pahhead[hlix].p_lprev;
		seg_pahhead[hlix].p_lprev->p_lnext = seg_pahhead[hlix].p_lnext;
		seg_pahhead[hlix].p_lnext = hlinkp;
		seg_pahhead[hlix].p_lprev = hlinkp->p_lprev;
		hlinkp->p_lprev->p_lnext = &seg_pahhead[hlix];
		hlinkp->p_lprev = &seg_pahhead[hlix];
	}

	/* the active bucket list walk is finished */
	mutex_enter(&seg_pmem_mtx);
	ASSERT(seg_pathr_on);
	seg_pathr_on = 0;
	mutex_exit(&seg_pmem_mtx);

runcb:
	/*
	 * Run the delayed callback list. segments/amps can't go away until
	 * callback is executed since they must have non 0 softlockcnt. That's
	 * why we don't need to hold as/seg/amp locks to execute the callback.
	 */
	while (delcallb_list != NULL) {
		pcp = delcallb_list;
		delcallb_list = pcp->p_hprev;
		ASSERT(!pcp->p_active);
		(void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
		    pcp->p_len, pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 1);
		npages += btop(pcp->p_len);
		/* only non-wired entries are counted in seg_plocked_window */
		if (!IS_PCP_WIRED(pcp)) {
			npages_window += btop(pcp->p_len);
		}
		kmem_cache_free(seg_pkmcache, pcp);
	}
	/* give the purged pages back to the global accounting */
	if (npages) {
		mutex_enter(&seg_pmem_mtx);
		ASSERT(seg_plocked >= npages);
		ASSERT(seg_plocked_window >= npages_window);
		seg_plocked -= npages;
		seg_plocked_window -= npages_window;
		mutex_exit(&seg_pmem_mtx);
	}
}
1187 
1188 /*
1189  * Remove cached pages for segment(s) entries from hashtable.  The segments
1190  * are identified by pp array. This is useful for multiple seg's cached on
1191  * behalf of dummy segment (ISM/DISM) with common pp array.
1192  */
1193 void
1194 seg_ppurge_wiredpp(struct page **pp)
1195 {
1196 	struct seg_pcache *pcp;
1197 	struct seg_phash_wired *hp;
1198 	pgcnt_t npages = 0;
1199 	struct	seg_pcache *delcallb_list = NULL;
1200 
1201 	/*
1202 	 * if the cache is empty, return
1203 	 */
1204 	if (seg_plocked == 0) {
1205 		return;
1206 	}
1207 	ASSERT(seg_phashsize_wired != 0);
1208 
1209 	for (hp = seg_phashtab_wired;
1210 	    hp < &seg_phashtab_wired[seg_phashsize_wired]; hp++) {
1211 		if (hp->p_hnext == (struct seg_pcache *)hp) {
1212 			continue;
1213 		}
1214 		mutex_enter(&hp->p_hmutex);
1215 		pcp = hp->p_hnext;
1216 		while (pcp != (struct seg_pcache *)hp) {
1217 			ASSERT(pcp->p_hashp == (struct seg_phash *)hp);
1218 			ASSERT(IS_PCP_WIRED(pcp));
1219 			/*
1220 			 * purge entries which are not active
1221 			 */
1222 			if (!pcp->p_active && pcp->p_pp == pp) {
1223 				ASSERT(pcp->p_htag0 != NULL);
1224 				pcp->p_hprev->p_hnext = pcp->p_hnext;
1225 				pcp->p_hnext->p_hprev = pcp->p_hprev;
1226 				pcp->p_hprev = delcallb_list;
1227 				delcallb_list = pcp;
1228 			}
1229 			pcp = pcp->p_hnext;
1230 		}
1231 		mutex_exit(&hp->p_hmutex);
1232 		/*
1233 		 * segments can't go away until callback is executed since
1234 		 * they must have non 0 softlockcnt. That's why we don't
1235 		 * need to hold as/seg locks to execute the callback.
1236 		 */
1237 		while (delcallb_list != NULL) {
1238 			int done;
1239 			pcp = delcallb_list;
1240 			delcallb_list = pcp->p_hprev;
1241 			ASSERT(!pcp->p_active);
1242 			done = (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr,
1243 			    pcp->p_len, pcp->p_pp,
1244 			    pcp->p_write ? S_WRITE : S_READ, 1);
1245 			npages += btop(pcp->p_len);
1246 			ASSERT(IS_PCP_WIRED(pcp));
1247 			kmem_cache_free(seg_pkmcache, pcp);
1248 			if (done) {
1249 				ASSERT(delcallb_list == NULL);
1250 				goto out;
1251 			}
1252 		}
1253 	}
1254 
1255 out:
1256 	mutex_enter(&seg_pmem_mtx);
1257 	ASSERT(seg_plocked >= npages);
1258 	seg_plocked -= npages;
1259 	mutex_exit(&seg_pmem_mtx);
1260 }
1261 
1262 /*
1263  * purge all entries for a given segment. Since we
1264  * callback into the segment driver directly for page
1265  * reclaim the caller needs to hold the right locks.
1266  */
1267 void
1268 seg_ppurge(struct seg *seg, struct anon_map *amp, uint_t flags)
1269 {
1270 	struct seg_pcache *delcallb_list = NULL;
1271 	struct seg_pcache *pcp;
1272 	struct seg_phash *hp;
1273 	pgcnt_t npages = 0;
1274 	void *htag0;
1275 
1276 	if (seg_plocked == 0) {
1277 		return;
1278 	}
1279 	ASSERT(seg_phashsize_win != 0);
1280 
1281 	/*
1282 	 * If amp is not NULL use amp as a lookup tag otherwise use seg
1283 	 * as a lookup tag.
1284 	 */
1285 	htag0 = (amp == NULL ? (void *)seg : (void *)amp);
1286 	ASSERT(htag0 != NULL);
1287 	if (IS_PFLAGS_WIRED(flags)) {
1288 		hp = P_HASHBP(seg, htag0, 0, flags);
1289 		mutex_enter(&hp->p_hmutex);
1290 		pcp = hp->p_hnext;
1291 		while (pcp != (struct seg_pcache *)hp) {
1292 			ASSERT(pcp->p_hashp == hp);
1293 			ASSERT(IS_PCP_WIRED(pcp));
1294 			if (pcp->p_htag0 == htag0) {
1295 				if (pcp->p_active) {
1296 					break;
1297 				}
1298 				pcp->p_hprev->p_hnext = pcp->p_hnext;
1299 				pcp->p_hnext->p_hprev = pcp->p_hprev;
1300 				pcp->p_hprev = delcallb_list;
1301 				delcallb_list = pcp;
1302 			}
1303 			pcp = pcp->p_hnext;
1304 		}
1305 		mutex_exit(&hp->p_hmutex);
1306 	} else {
1307 		pcache_link_t *plinkp;
1308 		pcache_link_t *pheadp;
1309 		kmutex_t *pmtx;
1310 
1311 		if (amp == NULL) {
1312 			ASSERT(seg != NULL);
1313 			pheadp = &seg->s_phead;
1314 			pmtx = &seg->s_pmtx;
1315 		} else {
1316 			pheadp = &amp->a_phead;
1317 			pmtx = &amp->a_pmtx;
1318 		}
1319 		mutex_enter(pmtx);
1320 		while ((plinkp = pheadp->p_lnext) != pheadp) {
1321 			pcp = plink2pcache(plinkp);
1322 			ASSERT(!IS_PCP_WIRED(pcp));
1323 			ASSERT(pcp->p_htag0 == htag0);
1324 			hp = pcp->p_hashp;
1325 			mutex_enter(&hp->p_hmutex);
1326 			if (pcp->p_active) {
1327 				mutex_exit(&hp->p_hmutex);
1328 				break;
1329 			}
1330 			ASSERT(plinkp->p_lprev == pheadp);
1331 			pheadp->p_lnext = plinkp->p_lnext;
1332 			plinkp->p_lnext->p_lprev = pheadp;
1333 			pcp->p_hprev->p_hnext = pcp->p_hnext;
1334 			pcp->p_hnext->p_hprev = pcp->p_hprev;
1335 			pcp->p_hprev = delcallb_list;
1336 			delcallb_list = pcp;
1337 			if (hp->p_hnext == (struct seg_pcache *)hp) {
1338 				seg_premove_abuck(hp, 0);
1339 			}
1340 			mutex_exit(&hp->p_hmutex);
1341 		}
1342 		mutex_exit(pmtx);
1343 	}
1344 	while (delcallb_list != NULL) {
1345 		pcp = delcallb_list;
1346 		delcallb_list = pcp->p_hprev;
1347 		ASSERT(!pcp->p_active);
1348 		(void) (*pcp->p_callback)(pcp->p_htag0, pcp->p_addr, pcp->p_len,
1349 		    pcp->p_pp, pcp->p_write ? S_WRITE : S_READ, 0);
1350 		npages += btop(pcp->p_len);
1351 		kmem_cache_free(seg_pkmcache, pcp);
1352 	}
1353 	mutex_enter(&seg_pmem_mtx);
1354 	ASSERT(seg_plocked >= npages);
1355 	seg_plocked -= npages;
1356 	if (!IS_PFLAGS_WIRED(flags)) {
1357 		ASSERT(seg_plocked_window >= npages);
1358 		seg_plocked_window -= npages;
1359 	}
1360 	mutex_exit(&seg_pmem_mtx);
1361 }
1362 
1363 static void seg_pinit_mem_config(void);
1364 
1365 /*
1366  * setup the pagelock cache
1367  */
1368 static void
1369 seg_pinit(void)
1370 {
1371 	struct seg_phash *hp;
1372 	ulong_t i;
1373 	pgcnt_t physmegs;
1374 
1375 	seg_plocked = 0;
1376 	seg_plocked_window = 0;
1377 
1378 	if (segpcache_enabled == 0) {
1379 		seg_phashsize_win = 0;
1380 		seg_phashsize_wired = 0;
1381 		seg_pdisabled = 1;
1382 		return;
1383 	}
1384 
1385 	seg_pdisabled = 0;
1386 	seg_pkmcache = kmem_cache_create("seg_pcache",
1387 	    sizeof (struct seg_pcache), 0, NULL, NULL, NULL, NULL, NULL, 0);
1388 	if (segpcache_pcp_maxage_ticks <= 0) {
1389 		segpcache_pcp_maxage_ticks = segpcache_pcp_maxage_sec * hz;
1390 	}
1391 	seg_pmax_pcpage = segpcache_pcp_maxage_ticks;
1392 	seg_pathr_empty_ahb = 0;
1393 	seg_pathr_full_ahb = 0;
1394 	seg_pshrink_shift = segpcache_shrink_shift;
1395 	seg_pmaxapurge_npages = btop(segpcache_maxapurge_bytes);
1396 
1397 	mutex_init(&seg_pcache_mtx, NULL, MUTEX_DEFAULT, NULL);
1398 	mutex_init(&seg_pmem_mtx, NULL, MUTEX_DEFAULT, NULL);
1399 	mutex_init(&seg_pasync_mtx, NULL, MUTEX_DEFAULT, NULL);
1400 	cv_init(&seg_pasync_cv, NULL, CV_DEFAULT, NULL);
1401 
1402 	physmegs = physmem >> (20 - PAGESHIFT);
1403 
1404 	/*
1405 	 * If segpcache_hashsize_win was not set in /etc/system or it has
1406 	 * absurd value set it to a default.
1407 	 */
1408 	if (segpcache_hashsize_win == 0 || segpcache_hashsize_win > physmem) {
1409 		/*
1410 		 * Create one bucket per 32K (or at least per 8 pages) of
1411 		 * available memory.
1412 		 */
1413 		pgcnt_t pages_per_bucket = MAX(btop(32 * 1024), 8);
1414 		segpcache_hashsize_win = MAX(1024, physmem / pages_per_bucket);
1415 	}
1416 	if (!ISP2(segpcache_hashsize_win)) {
1417 		ulong_t rndfac = ~(1UL <<
1418 		    (highbit(segpcache_hashsize_win) - 1));
1419 		rndfac &= segpcache_hashsize_win;
1420 		segpcache_hashsize_win += rndfac;
1421 		segpcache_hashsize_win = 1 <<
1422 		    (highbit(segpcache_hashsize_win) - 1);
1423 	}
1424 	seg_phashsize_win = segpcache_hashsize_win;
1425 	seg_phashtab_win = kmem_zalloc(
1426 	    seg_phashsize_win * sizeof (struct seg_phash),
1427 	    KM_SLEEP);
1428 	for (i = 0; i < seg_phashsize_win; i++) {
1429 		hp = &seg_phashtab_win[i];
1430 		hp->p_hnext = (struct seg_pcache *)hp;
1431 		hp->p_hprev = (struct seg_pcache *)hp;
1432 		mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
1433 	}
1434 
1435 	seg_pahcur = 0;
1436 	seg_pathr_on = 0;
1437 	seg_pahhead[0].p_lnext = &seg_pahhead[0];
1438 	seg_pahhead[0].p_lprev = &seg_pahhead[0];
1439 	seg_pahhead[1].p_lnext = &seg_pahhead[1];
1440 	seg_pahhead[1].p_lprev = &seg_pahhead[1];
1441 
1442 	/*
1443 	 * If segpcache_hashsize_wired was not set in /etc/system or it has
1444 	 * absurd value set it to a default.
1445 	 */
1446 	if (segpcache_hashsize_wired == 0 ||
1447 	    segpcache_hashsize_wired > physmem / 4) {
1448 		/*
1449 		 * Choose segpcache_hashsize_wired based on physmem.
1450 		 * Create a bucket per 128K bytes upto 256K buckets.
1451 		 */
1452 		if (physmegs < 20 * 1024) {
1453 			segpcache_hashsize_wired = MAX(1024, physmegs << 3);
1454 		} else {
1455 			segpcache_hashsize_wired = 256 * 1024;
1456 		}
1457 	}
1458 	if (!ISP2(segpcache_hashsize_wired)) {
1459 		segpcache_hashsize_wired = 1 <<
1460 		    highbit(segpcache_hashsize_wired);
1461 	}
1462 	seg_phashsize_wired = segpcache_hashsize_wired;
1463 	seg_phashtab_wired = kmem_zalloc(
1464 	    seg_phashsize_wired * sizeof (struct seg_phash_wired), KM_SLEEP);
1465 	for (i = 0; i < seg_phashsize_wired; i++) {
1466 		hp = (struct seg_phash *)&seg_phashtab_wired[i];
1467 		hp->p_hnext = (struct seg_pcache *)hp;
1468 		hp->p_hprev = (struct seg_pcache *)hp;
1469 		mutex_init(&hp->p_hmutex, NULL, MUTEX_DEFAULT, NULL);
1470 	}
1471 
1472 	if (segpcache_maxwindow == 0) {
1473 		if (physmegs < 64) {
1474 			/* 3% of memory */
1475 			segpcache_maxwindow = availrmem >> 5;
1476 		} else if (physmegs < 512) {
1477 			/* 12% of memory */
1478 			segpcache_maxwindow = availrmem >> 3;
1479 		} else if (physmegs < 1024) {
1480 			/* 25% of memory */
1481 			segpcache_maxwindow = availrmem >> 2;
1482 		} else if (physmegs < 2048) {
1483 			/* 50% of memory */
1484 			segpcache_maxwindow = availrmem >> 1;
1485 		} else {
1486 			/* no limit */
1487 			segpcache_maxwindow = (pgcnt_t)-1;
1488 		}
1489 	}
1490 	seg_pmaxwindow = segpcache_maxwindow;
1491 	seg_pinit_mem_config();
1492 }
1493 
1494 /*
1495  * called by pageout if memory is low
1496  */
1497 void
1498 seg_preap(void)
1499 {
1500 	/*
1501 	 * if the cache is off or empty, return
1502 	 */
1503 	if (seg_plocked_window == 0) {
1504 		return;
1505 	}
1506 	ASSERT(seg_phashsize_win != 0);
1507 
1508 	/*
1509 	 * If somebody is already purging pcache
1510 	 * just return.
1511 	 */
1512 	if (seg_pdisabled) {
1513 		return;
1514 	}
1515 
1516 	cv_signal(&seg_pasync_cv);
1517 }
1518 
1519 /*
1520  * run as a backgroud thread and reclaim pagelock
1521  * pages which have not been used recently
1522  */
1523 void
1524 seg_pasync_thread(void)
1525 {
1526 	callb_cpr_t cpr_info;
1527 
1528 	if (seg_phashsize_win == 0) {
1529 		thread_exit();
1530 		/*NOTREACHED*/
1531 	}
1532 
1533 	seg_pasync_thr = curthread;
1534 
1535 	CALLB_CPR_INIT(&cpr_info, &seg_pasync_mtx,
1536 	    callb_generic_cpr, "seg_pasync");
1537 
1538 	if (segpcache_reap_ticks <= 0) {
1539 		segpcache_reap_ticks = segpcache_reap_sec * hz;
1540 	}
1541 
1542 	mutex_enter(&seg_pasync_mtx);
1543 	for (;;) {
1544 		CALLB_CPR_SAFE_BEGIN(&cpr_info);
1545 		(void) cv_reltimedwait(&seg_pasync_cv, &seg_pasync_mtx,
1546 		    segpcache_reap_ticks, TR_CLOCK_TICK);
1547 		CALLB_CPR_SAFE_END(&cpr_info, &seg_pasync_mtx);
1548 		if (seg_pdisabled == 0) {
1549 			seg_ppurge_async(0);
1550 		}
1551 	}
1552 }
1553 
1554 static struct kmem_cache *seg_cache;
1555 
1556 /*
1557  * Initialize segment management data structures.
1558  */
1559 void
1560 seg_init(void)
1561 {
1562 	kstat_t *ksp;
1563 
1564 	seg_cache = kmem_cache_create("seg_cache", sizeof (struct seg),
1565 	    0, NULL, NULL, NULL, NULL, NULL, 0);
1566 
1567 	ksp = kstat_create("unix", 0, "segadvstat", "vm", KSTAT_TYPE_NAMED,
1568 	    segadvstat_ndata, KSTAT_FLAG_VIRTUAL);
1569 	if (ksp) {
1570 		ksp->ks_data = (void *)segadvstat_ptr;
1571 		kstat_install(ksp);
1572 	}
1573 
1574 	seg_pinit();
1575 }
1576 
1577 /*
1578  * Allocate a segment to cover [base, base+size]
1579  * and attach it to the specified address space.
1580  */
1581 struct seg *
1582 seg_alloc(struct as *as, caddr_t base, size_t size)
1583 {
1584 	struct seg *new;
1585 	caddr_t segbase;
1586 	size_t segsize;
1587 
1588 	segbase = (caddr_t)((uintptr_t)base & (uintptr_t)PAGEMASK);
1589 	segsize = (((uintptr_t)(base + size) + PAGEOFFSET) & PAGEMASK) -
1590 	    (uintptr_t)segbase;
1591 
1592 	if (!valid_va_range(&segbase, &segsize, segsize, AH_LO))
1593 		return ((struct seg *)NULL);	/* bad virtual addr range */
1594 
1595 	if (as != &kas &&
1596 	    valid_usr_range(segbase, segsize, 0, as,
1597 	    as->a_userlimit) != RANGE_OKAY)
1598 		return ((struct seg *)NULL);	/* bad virtual addr range */
1599 
1600 	new = kmem_cache_alloc(seg_cache, KM_SLEEP);
1601 	new->s_ops = NULL;
1602 	new->s_data = NULL;
1603 	new->s_szc = 0;
1604 	new->s_flags = 0;
1605 	mutex_init(&new->s_pmtx, NULL, MUTEX_DEFAULT, NULL);
1606 	new->s_phead.p_lnext = &new->s_phead;
1607 	new->s_phead.p_lprev = &new->s_phead;
1608 	if (seg_attach(as, segbase, segsize, new) < 0) {
1609 		kmem_cache_free(seg_cache, new);
1610 		return ((struct seg *)NULL);
1611 	}
1612 	/* caller must fill in ops, data */
1613 	return (new);
1614 }
1615 
1616 /*
1617  * Attach a segment to the address space.  Used by seg_alloc()
1618  * and for kernel startup to attach to static segments.
1619  */
1620 int
1621 seg_attach(struct as *as, caddr_t base, size_t size, struct seg *seg)
1622 {
1623 	seg->s_as = as;
1624 	seg->s_base = base;
1625 	seg->s_size = size;
1626 
1627 	/*
1628 	 * as_addseg() will add the segment at the appropraite point
1629 	 * in the list. It will return -1 if there is overlap with
1630 	 * an already existing segment.
1631 	 */
1632 	return (as_addseg(as, seg));
1633 }
1634 
1635 /*
1636  * Unmap a segment and free it from its associated address space.
1637  * This should be called by anybody who's finished with a whole segment's
1638  * mapping.  Just calls SEGOP_UNMAP() on the whole mapping .  It is the
1639  * responsibility of the segment driver to unlink the the segment
1640  * from the address space, and to free public and private data structures
1641  * associated with the segment.  (This is typically done by a call to
1642  * seg_free()).
1643  */
1644 void
1645 seg_unmap(struct seg *seg)
1646 {
1647 #ifdef DEBUG
1648 	int ret;
1649 #endif /* DEBUG */
1650 
1651 	ASSERT(seg->s_as && AS_WRITE_HELD(seg->s_as, &seg->s_as->a_lock));
1652 
1653 	/* Shouldn't have called seg_unmap if mapping isn't yet established */
1654 	ASSERT(seg->s_data != NULL);
1655 
1656 	/* Unmap the whole mapping */
1657 #ifdef DEBUG
1658 	ret = SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
1659 	ASSERT(ret == 0);
1660 #else
1661 	SEGOP_UNMAP(seg, seg->s_base, seg->s_size);
1662 #endif /* DEBUG */
1663 }
1664 
1665 /*
1666  * Free the segment from its associated as. This should only be called
1667  * if a mapping to the segment has not yet been established (e.g., if
1668  * an error occurs in the middle of doing an as_map when the segment
1669  * has already been partially set up) or if it has already been deleted
1670  * (e.g., from a segment driver unmap routine if the unmap applies to the
1671  * entire segment). If the mapping is currently set up then seg_unmap() should
1672  * be called instead.
1673  */
1674 void
1675 seg_free(struct seg *seg)
1676 {
1677 	register struct as *as = seg->s_as;
1678 	struct seg *tseg = as_removeseg(as, seg);
1679 
1680 	ASSERT(tseg == seg);
1681 
1682 	/*
1683 	 * If the segment private data field is NULL,
1684 	 * then segment driver is not attached yet.
1685 	 */
1686 	if (seg->s_data != NULL)
1687 		SEGOP_FREE(seg);
1688 
1689 	mutex_destroy(&seg->s_pmtx);
1690 	ASSERT(seg->s_phead.p_lnext == &seg->s_phead);
1691 	ASSERT(seg->s_phead.p_lprev == &seg->s_phead);
1692 	kmem_cache_free(seg_cache, seg);
1693 }
1694 
1695 /*ARGSUSED*/
1696 static void
1697 seg_p_mem_config_post_add(
1698 	void *arg,
1699 	pgcnt_t delta_pages)
1700 {
1701 	/* Nothing to do. */
1702 }
1703 
1704 void
1705 seg_p_enable(void)
1706 {
1707 	mutex_enter(&seg_pcache_mtx);
1708 	ASSERT(seg_pdisabled != 0);
1709 	seg_pdisabled--;
1710 	mutex_exit(&seg_pcache_mtx);
1711 }
1712 
1713 /*
1714  * seg_p_disable - disables seg_pcache, and then attempts to empty the
1715  * cache.
1716  * Returns SEGP_SUCCESS if the cache was successfully emptied, or
1717  * SEGP_FAIL if the cache could not be emptied.
1718  */
1719 int
1720 seg_p_disable(void)
1721 {
1722 	pgcnt_t	old_plocked;
1723 	int stall_count = 0;
1724 
1725 	mutex_enter(&seg_pcache_mtx);
1726 	seg_pdisabled++;
1727 	ASSERT(seg_pdisabled != 0);
1728 	mutex_exit(&seg_pcache_mtx);
1729 
1730 	/*
1731 	 * Attempt to empty the cache. Terminate if seg_plocked does not
1732 	 * diminish with SEGP_STALL_THRESHOLD consecutive attempts.
1733 	 */
1734 	while (seg_plocked != 0) {
1735 		ASSERT(seg_phashsize_win != 0);
1736 		old_plocked = seg_plocked;
1737 		seg_ppurge_async(1);
1738 		if (seg_plocked == old_plocked) {
1739 			if (stall_count++ > SEGP_STALL_THRESHOLD) {
1740 				return (SEGP_FAIL);
1741 			}
1742 		} else
1743 			stall_count = 0;
1744 		if (seg_plocked != 0)
1745 			delay(hz/SEGP_PREDEL_DELAY_FACTOR);
1746 	}
1747 	return (SEGP_SUCCESS);
1748 }
1749 
1750 /*
1751  * Attempt to purge seg_pcache.  May need to return before this has
1752  * completed to allow other pre_del callbacks to unlock pages. This is
1753  * ok because:
1754  *	1) The seg_pdisabled flag has been set so at least we won't
1755  *	cache anymore locks and the locks we couldn't purge
1756  *	will not be held if they do get released by a subsequent
1757  *	pre-delete callback.
1758  *
1759  *	2) The rest of the memory delete thread processing does not
1760  *	depend on the changes made in this pre-delete callback. No
1761  *	panics will result, the worst that will happen is that the
1762  *	DR code will timeout and cancel the delete.
1763  */
1764 /*ARGSUSED*/
1765 static int
1766 seg_p_mem_config_pre_del(
1767 	void *arg,
1768 	pgcnt_t delta_pages)
1769 {
1770 	if (seg_phashsize_win == 0) {
1771 		return (0);
1772 	}
1773 	if (seg_p_disable() != SEGP_SUCCESS)
1774 		cmn_err(CE_NOTE,
1775 		    "!Pre-delete couldn't purge"" pagelock cache - continuing");
1776 	return (0);
1777 }
1778 
1779 /*ARGSUSED*/
1780 static void
1781 seg_p_mem_config_post_del(
1782 	void *arg,
1783 	pgcnt_t delta_pages,
1784 	int cancelled)
1785 {
1786 	if (seg_phashsize_win == 0) {
1787 		return;
1788 	}
1789 	seg_p_enable();
1790 }
1791 
1792 static kphysm_setup_vector_t seg_p_mem_config_vec = {
1793 	KPHYSM_SETUP_VECTOR_VERSION,
1794 	seg_p_mem_config_post_add,
1795 	seg_p_mem_config_pre_del,
1796 	seg_p_mem_config_post_del,
1797 };
1798 
1799 static void
1800 seg_pinit_mem_config(void)
1801 {
1802 	int ret;
1803 
1804 	ret = kphysm_setup_func_register(&seg_p_mem_config_vec, (void *)NULL);
1805 	/*
1806 	 * Want to catch this in the debug kernel. At run time, if the
1807 	 * callbacks don't get run all will be OK as the disable just makes
1808 	 * it more likely that the pages can be collected.
1809 	 */
1810 	ASSERT(ret == 0);
1811 }
1812 
1813 /*
1814  * Verify that segment is not a shared anonymous segment which reserves
1815  * swap.  zone.max-swap accounting (zone->zone_max_swap) cannot be transfered
1816  * from one zone to another if any segments are shared.  This is because the
1817  * last process to exit will credit the swap reservation.  This could lead
1818  * to the swap being reserved by one zone, and credited to another.
1819  */
1820 boolean_t
1821 seg_can_change_zones(struct seg *seg)
1822 {
1823 	struct segvn_data *svd;
1824 
1825 	if (seg->s_ops == &segspt_shmops)
1826 		return (B_FALSE);
1827 
1828 	if (seg->s_ops == &segvn_ops) {
1829 		svd = (struct segvn_data *)seg->s_data;
1830 		if (svd->type == MAP_SHARED &&
1831 		    svd->amp != NULL &&
1832 		    svd->amp->swresv > 0)
1833 		return (B_FALSE);
1834 	}
1835 	return (B_TRUE);
1836 }
1837 
1838 /*
1839  * Return swap reserved by a segment backing a private mapping.
1840  */
1841 size_t
1842 seg_swresv(struct seg *seg)
1843 {
1844 	struct segvn_data *svd;
1845 	size_t swap = 0;
1846 
1847 	if (seg->s_ops == &segvn_ops) {
1848 		svd = (struct segvn_data *)seg->s_data;
1849 		if (svd->type == MAP_PRIVATE && svd->swresv > 0)
1850 			swap = svd->swresv;
1851 	}
1852 	return (swap);
1853 }
1854