/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989  AT&T	*/
/*	  All Rights Reserved  	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * VM - physical page management.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/vm.h>
#include <sys/vtrace.h>
#include <sys/swap.h>
#include <sys/cmn_err.h>
#include <sys/tuneable.h>
#include <sys/sysmacros.h>
#include <sys/cpuvar.h>
#include <sys/callb.h>
#include <sys/debug.h>
#include <sys/tnf_probe.h>
#include <sys/condvar_impl.h>
#include <sys/mem_config.h>
#include <sys/mem_cage.h>
#include <sys/kmem.h>
#include <sys/atomic.h>
#include <sys/strlog.h>
#include <sys/mman.h>
#include <sys/ontrap.h>
#include <sys/lgrp.h>
#include <sys/vfs.h>

#include <vm/hat.h>
#include <vm/anon.h>
#include <vm/page.h>
#include <vm/seg.h>
#include <vm/pvn.h>
#include <vm/seg_kmem.h>
#include <vm/vm_dep.h>
#include <sys/vm_usage.h>
#include <fs/fs_subr.h>
#include <sys/ddi.h>
#include <sys/modctl.h>

static int nopageage = 0;

static pgcnt_t max_page_get;	/* max page_get request size in pages */
pgcnt_t total_pages = 0;	/* total number of pages (used by /proc) */

/*
 * freemem_lock protects all freemem variables, including availrmem.
 * It also protects the globals that track availrmem changes for
 * accurate kernel footprint calculation; see below for an explanation
 * of these globals.
 */
kmutex_t freemem_lock;
pgcnt_t availrmem;
pgcnt_t availrmem_initial;

/*
 * These globals track availrmem changes to get a more accurate
 * estimate of the kernel size. Historically pp_kernel is used for
 * kernel size and is based on availrmem. But availrmem is adjusted for
 * locked pages in the system, not just for kernel locked pages.
 * These new counters will track the pages locked through segvn and
 * by explicit user locking.
 *
 * pages_locked : How many pages are locked because of user specified
 * locking through mlock or plock.
 *
 * pages_useclaim, pages_claimed : These two variables track the
 * claim adjustments caused by protection changes on a segvn segment.
 *
 * All these globals are protected by the same lock which protects availrmem.
 */
pgcnt_t pages_locked = 0;
pgcnt_t pages_useclaim = 0;
pgcnt_t pages_claimed = 0;

/*
 * new_freemem_lock protects freemem, freemem_wait & freemem_cv.
 */
static kmutex_t	new_freemem_lock;
static uint_t	freemem_wait;	/* someone waiting for freemem */
static kcondvar_t freemem_cv;

/*
 * The logical page free list is maintained as two lists, the 'free'
 * and the 'cache' lists.
 * The free list contains those pages that should be reused first.
 *
 * The implementation of the lists is machine dependent.
 * page_get_freelist(), page_get_cachelist(),
 * page_list_sub(), and page_list_add()
 * form the interface to the machine dependent implementation.
 *
 * Pages with p_free set are on the cache list.
 * Pages with p_free and p_age set are on the free list.
 *
 * A page may be locked while on either list.
 */

/*
 * Free list accounting.
 *
 * Spread out the value for the number of pages on the
 * page free and page cache lists.  If there is just one
 * value, then it must be under just one lock.
 * The lock contention and cache traffic are a real bother.
 *
 * When we acquire and then drop a single pcf lock
 * we can start in the middle of the array of pcf structures.
 * If we acquire more than one pcf lock at a time, we need to
 * start at the front to avoid deadlocking.
 *
 * pcf_count holds the number of pages in each pool.
 *
 * pcf_block is set when page_create_get_something() has asked the
 * PSM page freelist and page cachelist routines without specifying
 * a color and nothing came back.  This is used to block anything
 * else from moving pages from one list to the other while the
 * lists are searched again.  If a page is freed while pcf_block is
 * set, then pcf_reserve is incremented.  pcgs_unblock() takes care
 * of clearing pcf_block, doing the wakeups, etc.
 */

#define	MAX_PCF_FANOUT NCPU
static uint_t pcf_fanout = 1; /* Will get changed at boot time */
static uint_t pcf_fanout_mask = 0;

struct pcf {
	kmutex_t	pcf_lock;	/* protects the structure */
	uint_t		pcf_count;	/* page count */
	uint_t		pcf_wait;	/* number of waiters */
	uint_t		pcf_block;	/* pcgs flag to page_free() */
	uint_t		pcf_reserve;	/* pages freed after pcf_block set */
	uint_t		pcf_fill[10];	/* to line up on the caches */
};

/*
 * The PCF_INDEX hash needs to be dynamic (every so often the hash changes
 * which bucket a given cpu hashes to).  This is done to prevent a drain
 * condition, which occurs when the pcf_count decrement always happens on
 * cpu A while the pcf_count increment always happens on cpu B.  An example
 * of this shows up with device interrupts.  The dma buffer is allocated by
 * the cpu requesting the IO, so pcf_count is decremented based on that cpu.
 * When the memory is returned by the interrupt thread, pcf_count is
 * incremented based on the cpu servicing the interrupt.
 */
static struct pcf pcf[MAX_PCF_FANOUT];
#define	PCF_INDEX() ((int)(((long)CPU->cpu_seqid) + \
	(randtick() >> 24)) & (pcf_fanout_mask))
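
/*
 * As a minimal sketch (the real consumers are pcf_decrement_bucket()
 * and pcf_decrement_multiple(), declared below), a hypothetical caller
 * taking a single page out of pcf accounting would look like:
 *
 *	struct pcf *p = &pcf[PCF_INDEX()];
 *
 *	mutex_enter(&p->pcf_lock);
 *	if (p->pcf_count > 0)
 *		p->pcf_count--;
 *	mutex_exit(&p->pcf_lock);
 *
 * Only one pcf lock is held at a time here, so the caller may start
 * anywhere in the array; taking several pcf locks at once requires
 * starting at pcf[0], per the deadlock rule above.
 */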

static int pcf_decrement_bucket(pgcnt_t);
static int pcf_decrement_multiple(pgcnt_t *, pgcnt_t, int);

kmutex_t	pcgs_lock;		/* serializes page_create_get_ */
kmutex_t	pcgs_cagelock;		/* serializes NOSLEEP cage allocs */
kmutex_t	pcgs_wait_lock;		/* used for delay in pcgs */
static kcondvar_t	pcgs_cv;	/* cv for delay in pcgs */

#ifdef VM_STATS

/*
 * No locks, but so what, they are only statistics.
 */

static struct page_tcnt {
	int	pc_free_cache;		/* free's into cache list */
	int	pc_free_dontneed;	/* free's with dontneed */
	int	pc_free_pageout;	/* free's from pageout */
	int	pc_free_free;		/* free's into free list */
	int	pc_free_pages;		/* free's into large page free list */
	int	pc_destroy_pages;	/* large page destroy's */
	int	pc_get_cache;		/* get's from cache list */
	int	pc_get_free;		/* get's from free list */
	int	pc_reclaim;		/* reclaim's */
	int	pc_abortfree;		/* abort's of free pages */
	int	pc_find_hit;		/* find's that find page */
	int	pc_find_miss;		/* find's that don't find page */
	int	pc_destroy_free;	/* # of free pages destroyed */
#define	PC_HASH_CNT	(4*PAGE_HASHAVELEN)
	int	pc_find_hashlen[PC_HASH_CNT+1];
	int	pc_addclaim_pages;
	int	pc_subclaim_pages;
	int	pc_free_replacement_page[2];
	int	pc_try_demote_pages[6];
	int	pc_demote_pages[2];
} pagecnt;

uint_t	hashin_count;
uint_t	hashin_not_held;
uint_t	hashin_already;

uint_t	hashout_count;
uint_t	hashout_not_held;

uint_t	page_create_count;
uint_t	page_create_not_enough;
uint_t	page_create_not_enough_again;
uint_t	page_create_zero;
uint_t	page_create_hashout;
uint_t	page_create_page_lock_failed;
uint_t	page_create_trylock_failed;
uint_t	page_create_found_one;
uint_t	page_create_hashin_failed;
uint_t	page_create_dropped_phm;

uint_t	page_create_new;
uint_t	page_create_exists;
uint_t	page_create_putbacks;
uint_t	page_create_overshoot;

uint_t	page_reclaim_zero;
uint_t	page_reclaim_zero_locked;

uint_t	page_rename_exists;
uint_t	page_rename_count;

uint_t	page_lookup_cnt[20];
uint_t	page_lookup_nowait_cnt[10];
uint_t	page_find_cnt;
uint_t	page_exists_cnt;
uint_t	page_exists_forreal_cnt;
uint_t	page_lookup_dev_cnt;
uint_t	get_cachelist_cnt;
uint_t	page_create_cnt[10];
uint_t	alloc_pages[9];
uint_t	page_exphcontg[19];
uint_t	page_create_large_cnt[10];

/*
 * Collects statistics.
 */
#define	PAGE_HASH_SEARCH(index, pp, vp, off) { \
	uint_t	mylen = 0; \
			\
	for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash, mylen++) { \
		if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
			break; \
	} \
	if ((pp) != NULL) \
		pagecnt.pc_find_hit++; \
	else \
		pagecnt.pc_find_miss++; \
	if (mylen > PC_HASH_CNT) \
		mylen = PC_HASH_CNT; \
	pagecnt.pc_find_hashlen[mylen]++; \
}

#else	/* VM_STATS */

/*
 * Don't collect statistics.
 */
#define	PAGE_HASH_SEARCH(index, pp, vp, off) { \
	for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
		if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
			break; \
	} \
}

#endif	/* VM_STATS */

#ifdef DEBUG
#define	MEMSEG_SEARCH_STATS
#endif

#ifdef MEMSEG_SEARCH_STATS
struct memseg_stats {
	uint_t nsearch;
	uint_t nlastwon;
	uint_t nhashwon;
	uint_t nnotfound;
} memseg_stats;

#define	MEMSEG_STAT_INCR(v) \
	atomic_add_32(&memseg_stats.v, 1)
#else
#define	MEMSEG_STAT_INCR(x)
#endif

struct memseg *memsegs;		/* list of memory segments */

/*
 * /etc/system tunable to control the large page allocation heuristic.
 *
 * Setting to LPAP_LOCAL will heavily prefer the local lgroup over remote lgroup
 * for large page allocation requests.  If a large page is not readily
 * available on the local freelists we will go through additional effort
 * to create a large page, potentially moving smaller pages around to coalesce
 * larger pages in the local lgroup.
 * The default value of LPAP_DEFAULT will go to remote freelists if large pages
 * are not readily available in the local lgroup.
 */
enum lpap {
	LPAP_DEFAULT,	/* default large page allocation policy */
	LPAP_LOCAL	/* local large page allocation policy */
};

enum lpap lpg_alloc_prefer = LPAP_DEFAULT;
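
/*
 * For example, to select the local allocation policy one would add a
 * line like the following to /etc/system (a sketch only; the enum is
 * not visible to /etc/system, so the numeric value of LPAP_LOCAL is
 * used):
 *
 *	set lpg_alloc_prefer = 1
 */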

static void page_init_mem_config(void);
static int page_do_hashin(page_t *, vnode_t *, u_offset_t);
static void page_do_hashout(page_t *);
static void page_capture_init();
int page_capture_take_action(page_t *, uint_t, void *);

static void page_demote_vp_pages(page_t *);

void
pcf_init(void)
{
	int i;

	if (boot_ncpus != -1) {
		pcf_fanout = boot_ncpus;
	} else {
		pcf_fanout = max_ncpus;
	}
#ifdef sun4v
	/*
	 * Force at least 4 buckets if possible for sun4v.
	 */
	pcf_fanout = MAX(pcf_fanout, 4);
#endif /* sun4v */

	/*
	 * Round up to the nearest power of 2.
	 */
	pcf_fanout = MIN(pcf_fanout, MAX_PCF_FANOUT);
	if (!ISP2(pcf_fanout)) {
		pcf_fanout = 1 << highbit(pcf_fanout);

		if (pcf_fanout > MAX_PCF_FANOUT) {
			pcf_fanout = 1 << (highbit(MAX_PCF_FANOUT) - 1);
		}
	}
	pcf_fanout_mask = pcf_fanout - 1;
}


/*
 * vm subsystem related initialization
 */
void
vm_init(void)
{
	boolean_t callb_vm_cpr(void *, int);

	(void) callb_add(callb_vm_cpr, 0, CB_CL_CPR_VM, "vm");
	page_init_mem_config();
	page_retire_init();
	vm_usage_init();
	page_capture_init();
}

/*
 * This function is called at startup and when memory is added or deleted.
 */
void
init_pages_pp_maximum()
{
	static pgcnt_t p_min;
	static pgcnt_t pages_pp_maximum_startup;
	static pgcnt_t avrmem_delta;
	static int init_done;
	static int user_set;	/* true if set in /etc/system */

	if (init_done == 0) {

		/* If the user specified a value, save it */
		if (pages_pp_maximum != 0) {
			user_set = 1;
			pages_pp_maximum_startup = pages_pp_maximum;
		}

		/*
		 * pages_pp_maximum is initially based on the value of
		 * availrmem just after the start-up allocations.  To
		 * preserve this relationship at run time, use a delta
		 * from availrmem_initial.
		 */
		ASSERT(availrmem_initial >= availrmem);
		avrmem_delta = availrmem_initial - availrmem;

		/* The allowable floor of pages_pp_maximum */
		p_min = tune.t_minarmem + 100;

		/* Make sure we don't come through here again. */
		init_done = 1;
	}
	/*
	 * Determine pages_pp_maximum, the number of currently available
	 * pages (availrmem) that can't be `locked'. If not set by
	 * the user, we set it to 4% of the currently available memory
	 * plus 4MB.
	 * But we also insist that it be greater than tune.t_minarmem;
	 * otherwise a process could lock down a lot of memory, get swapped
	 * out, and never have enough to get swapped back in.
	 */
	if (user_set)
		pages_pp_maximum = pages_pp_maximum_startup;
	else
		pages_pp_maximum = ((availrmem_initial - avrmem_delta) / 25)
		    + btop(4 * 1024 * 1024);

	if (pages_pp_maximum <= p_min) {
		pages_pp_maximum = p_min;
	}
}

void
set_max_page_get(pgcnt_t target_total_pages)
{
	max_page_get = target_total_pages / 2;
}

static pgcnt_t pending_delete;

/*ARGSUSED*/
static void
page_mem_config_post_add(
	void *arg,
	pgcnt_t delta_pages)
{
	set_max_page_get(total_pages - pending_delete);
	init_pages_pp_maximum();
}

/*ARGSUSED*/
static int
page_mem_config_pre_del(
	void *arg,
	pgcnt_t delta_pages)
{
	pgcnt_t nv;

	nv = atomic_add_long_nv(&pending_delete, (spgcnt_t)delta_pages);
	set_max_page_get(total_pages - nv);
	return (0);
}

/*ARGSUSED*/
static void
page_mem_config_post_del(
	void *arg,
	pgcnt_t delta_pages,
	int cancelled)
{
	pgcnt_t nv;

	nv = atomic_add_long_nv(&pending_delete, -(spgcnt_t)delta_pages);
	set_max_page_get(total_pages - nv);
	if (!cancelled)
		init_pages_pp_maximum();
}

static kphysm_setup_vector_t page_mem_config_vec = {
	KPHYSM_SETUP_VECTOR_VERSION,
	page_mem_config_post_add,
	page_mem_config_pre_del,
	page_mem_config_post_del,
};

static void
page_init_mem_config(void)
{
	int ret;

	ret = kphysm_setup_func_register(&page_mem_config_vec, (void *)NULL);
	ASSERT(ret == 0);
}

/*
 * Evenly spread out the PCF counters for large free pages
 */
static void
page_free_large_ctr(pgcnt_t npages)
{
	static struct pcf	*p = pcf;
	pgcnt_t			lump;

	freemem += npages;

	lump = roundup(npages, pcf_fanout) / pcf_fanout;

	while (npages > 0) {

		ASSERT(!p->pcf_block);

		if (lump < npages) {
			p->pcf_count += (uint_t)lump;
			npages -= lump;
		} else {
			p->pcf_count += (uint_t)npages;
			npages = 0;
		}

		ASSERT(!p->pcf_wait);

		if (++p > &pcf[pcf_fanout - 1])
			p = pcf;
	}

	ASSERT(npages == 0);
}

/*
 * Add a physical chunk of memory to the system free lists during startup.
 * Platform specific startup() allocates the memory for the page structs.
 *
 * num	- number of page structures
 * pnum	- page number (pfn) to be associated with the first page.
 *
 * Since we are doing this during startup (i.e. single threaded), we will
 * use shortcut routines to avoid any locking overhead while putting all
 * these pages on the freelists.
 *
 * NOTE: Any changes performed to page_free() must also be performed in
 *	 add_physmem() since this is how we initialize all page_t's at
 *	 boot time.
 */
void
add_physmem(
	page_t	*pp,
	pgcnt_t	num,
	pfn_t	pnum)
{
	page_t	*root = NULL;
	uint_t	szc = page_num_pagesizes() - 1;
	pgcnt_t	large = page_get_pagecnt(szc);
	pgcnt_t	cnt = 0;

	TRACE_2(TR_FAC_VM, TR_PAGE_INIT,
	    "add_physmem:pp %p num %lu", pp, num);

	/*
	 * Arbitrarily limit the max page_get request
	 * to 1/2 of the page structs we have.
	 */
	total_pages += num;
	set_max_page_get(total_pages);

	PLCNT_MODIFY_MAX(pnum, (long)num);

	/*
	 * The physical space for the pages array
	 * representing ram pages has already been
	 * allocated.  Here we initialize each lock
	 * in the page structure, and put each on
	 * the free list.
	 */
	for (; num; pp++, pnum++, num--) {

		/*
		 * this needs to fill in the page number
		 * and do any other arch specific initialization
		 */
		add_physmem_cb(pp, pnum);

		pp->p_lckcnt = 0;
		pp->p_cowcnt = 0;
		pp->p_slckcnt = 0;

		/*
		 * Initialize the page lock as unlocked, since nobody
		 * can see or access this page yet.
		 */
		pp->p_selock = 0;

		/*
		 * Initialize IO lock
		 */
		page_iolock_init(pp);

		/*
		 * initialize other fields in the page_t
		 */
		PP_SETFREE(pp);
		page_clr_all_props(pp);
		PP_SETAGED(pp);
		pp->p_offset = (u_offset_t)-1;
		pp->p_next = pp;
		pp->p_prev = pp;

		/*
		 * Simple case: System doesn't support large pages.
		 */
		if (szc == 0) {
			pp->p_szc = 0;
			page_free_at_startup(pp);
			continue;
		}

		/*
		 * Handle unaligned pages: we collect them up onto
		 * the root page until we have a full large page.
		 */
		if (!IS_P2ALIGNED(pnum, large)) {

			/*
			 * If not in a large page,
			 * just free as small page.
			 */
			if (root == NULL) {
				pp->p_szc = 0;
				page_free_at_startup(pp);
				continue;
			}

			/*
			 * Link a constituent page into the large page.
			 */
			pp->p_szc = szc;
			page_list_concat(&root, &pp);

			/*
			 * When large page is fully formed, free it.
			 */
			if (++cnt == large) {
				page_free_large_ctr(cnt);
				page_list_add_pages(root, PG_LIST_ISINIT);
				root = NULL;
				cnt = 0;
			}
			continue;
		}

		/*
		 * At this point we have a page number which
		 * is aligned. We assert that we aren't already
		 * in a different large page.
		 */
		ASSERT(IS_P2ALIGNED(pnum, large));
		ASSERT(root == NULL && cnt == 0);

		/*
		 * If insufficient number of pages left to form
		 * a large page, just free the small page.
		 */
		if (num < large) {
			pp->p_szc = 0;
			page_free_at_startup(pp);
			continue;
		}

		/*
		 * Otherwise start a new large page.
		 */
		pp->p_szc = szc;
		cnt++;
		root = pp;
	}
	ASSERT(root == NULL && cnt == 0);
}

/*
 * Find a page representing the specified [vp, offset].
 * If we find the page but it is in transit coming in,
 * it will have an "exclusive" lock and we wait for
 * the i/o to complete.  A page found on the free list
 * is always reclaimed and then locked.  On success, the page
 * is locked, its data is valid and it isn't on the free
 * list, while a NULL is returned if the page doesn't exist.
 */
page_t *
page_lookup(vnode_t *vp, u_offset_t off, se_t se)
{
	return (page_lookup_create(vp, off, se, NULL, NULL, 0));
}
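
/*
 * A minimal sketch of the usual calling pattern (hypothetical caller;
 * vp and off are assumed to name a page that may or may not exist):
 *
 *	page_t *pp;
 *
 *	pp = page_lookup(vp, off, SE_SHARED);
 *	if (pp != NULL) {
 *		... use the page: it is locked SE_SHARED, off the
 *		... free list, and its identity cannot change ...
 *		page_unlock(pp);
 *	}
 */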

/*
 * Find a page representing the specified [vp, offset].
 * We either return the one we found or, if passed in,
 * create one with identity of [vp, offset] of the
 * pre-allocated page. If we find an existing page but it is
 * in transit coming in, it will have an "exclusive" lock
 * and we wait for the i/o to complete.  A page found on
 * the free list is always reclaimed and then locked.
 * On success, the page is locked, its data is valid and
 * it isn't on the free list, while a NULL is returned
 * if the page doesn't exist and newpp is NULL.
 */
page_t *
page_lookup_create(
	vnode_t *vp,
	u_offset_t off,
	se_t se,
	page_t *newpp,
	spgcnt_t *nrelocp,
	int flags)
{
	page_t		*pp;
	kmutex_t	*phm;
	ulong_t		index;
	uint_t		hash_locked;
	uint_t		es;

	ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
	VM_STAT_ADD(page_lookup_cnt[0]);
	ASSERT(newpp ? PAGE_EXCL(newpp) : 1);

	/*
	 * Acquire the appropriate page hash lock since
	 * we have to search the hash list.  Pages that
	 * hash to this list can't change identity while
	 * this lock is held.
	 */
	hash_locked = 0;
	index = PAGE_HASH_FUNC(vp, off);
	phm = NULL;
top:
	PAGE_HASH_SEARCH(index, pp, vp, off);
	if (pp != NULL) {
		VM_STAT_ADD(page_lookup_cnt[1]);
		es = (newpp != NULL) ? 1 : 0;
		es |= flags;
		if (!hash_locked) {
			VM_STAT_ADD(page_lookup_cnt[2]);
			if (!page_try_reclaim_lock(pp, se, es)) {
				/*
				 * On a miss, acquire the phm.  Then
				 * next time, page_lock() will be called,
				 * causing a wait if the page is busy.
				 * Just looping with page_trylock() would
				 * get pretty boring.
				 */
				VM_STAT_ADD(page_lookup_cnt[3]);
				phm = PAGE_HASH_MUTEX(index);
				mutex_enter(phm);
				hash_locked = 1;
				goto top;
			}
		} else {
			VM_STAT_ADD(page_lookup_cnt[4]);
			if (!page_lock_es(pp, se, phm, P_RECLAIM, es)) {
				VM_STAT_ADD(page_lookup_cnt[5]);
				goto top;
			}
		}

		/*
		 * Since `pp' is locked it cannot change identity now.
		 * Reconfirm we locked the correct page.
		 *
		 * Both the p_vnode and p_offset *must* be cast volatile
		 * to force a reload of their values: The PAGE_HASH_SEARCH
		 * macro will have stuffed p_vnode and p_offset into
		 * registers before calling page_trylock(); another thread,
		 * actually holding the hash lock, could have changed the
		 * page's identity in memory, but our registers would not
		 * be changed, fooling the reconfirmation.  If the hash
		 * lock was held during the search, the casting would
		 * not be needed.
		 */
		VM_STAT_ADD(page_lookup_cnt[6]);
		if (((volatile struct vnode *)(pp->p_vnode) != vp) ||
		    ((volatile u_offset_t)(pp->p_offset) != off)) {
			VM_STAT_ADD(page_lookup_cnt[7]);
			if (hash_locked) {
				panic("page_lookup_create: lost page %p",
				    (void *)pp);
				/*NOTREACHED*/
			}
			page_unlock(pp);
			phm = PAGE_HASH_MUTEX(index);
			mutex_enter(phm);
			hash_locked = 1;
			goto top;
		}

		/*
		 * If page_trylock() was called, then pp may still be on
		 * the cachelist (can't be on the free list, it would not
		 * have been found in the search).  If it is on the
		 * cachelist it must be pulled now. To pull the page from
		 * the cachelist, it must be exclusively locked.
		 *
		 * The other big difference between page_trylock() and
		 * page_lock(), is that page_lock() will pull the
		 * page from whatever free list (the cache list in this
		 * case) the page is on.  If page_trylock() was used
		 * above, then we have to do the reclaim ourselves.
		 */
		if ((!hash_locked) && (PP_ISFREE(pp))) {
			ASSERT(PP_ISAGED(pp) == 0);
			VM_STAT_ADD(page_lookup_cnt[8]);

			/*
			 * page_reclaim will ensure that we
			 * have this page exclusively.
			 */

			if (!page_reclaim(pp, NULL)) {
				/*
				 * Page_reclaim dropped whatever lock
				 * we held.
				 */
				VM_STAT_ADD(page_lookup_cnt[9]);
				phm = PAGE_HASH_MUTEX(index);
				mutex_enter(phm);
				hash_locked = 1;
				goto top;
			} else if (se == SE_SHARED && newpp == NULL) {
				VM_STAT_ADD(page_lookup_cnt[10]);
				page_downgrade(pp);
			}
		}

		if (hash_locked) {
			mutex_exit(phm);
		}

		if (newpp != NULL && pp->p_szc < newpp->p_szc &&
		    PAGE_EXCL(pp) && nrelocp != NULL) {
			ASSERT(nrelocp != NULL);
			(void) page_relocate(&pp, &newpp, 1, 1, nrelocp,
			    NULL);
			if (*nrelocp > 0) {
				VM_STAT_COND_ADD(*nrelocp == 1,
				    page_lookup_cnt[11]);
				VM_STAT_COND_ADD(*nrelocp > 1,
				    page_lookup_cnt[12]);
				pp = newpp;
				se = SE_EXCL;
			} else {
				if (se == SE_SHARED) {
					page_downgrade(pp);
				}
				VM_STAT_ADD(page_lookup_cnt[13]);
			}
		} else if (newpp != NULL && nrelocp != NULL) {
			if (PAGE_EXCL(pp) && se == SE_SHARED) {
				page_downgrade(pp);
			}
			VM_STAT_COND_ADD(pp->p_szc < newpp->p_szc,
			    page_lookup_cnt[14]);
			VM_STAT_COND_ADD(pp->p_szc == newpp->p_szc,
			    page_lookup_cnt[15]);
			VM_STAT_COND_ADD(pp->p_szc > newpp->p_szc,
			    page_lookup_cnt[16]);
		} else if (newpp != NULL && PAGE_EXCL(pp)) {
			se = SE_EXCL;
		}
	} else if (!hash_locked) {
		VM_STAT_ADD(page_lookup_cnt[17]);
		phm = PAGE_HASH_MUTEX(index);
		mutex_enter(phm);
		hash_locked = 1;
		goto top;
	} else if (newpp != NULL) {
		/*
		 * If we have a preallocated page then
		 * insert it now and basically behave like
		 * page_create.
		 */
		VM_STAT_ADD(page_lookup_cnt[18]);
		/*
		 * Since we hold the page hash mutex and
		 * just searched for this page, page_hashin
		 * had better not fail.  If it does, that
		 * means some thread did not follow the
		 * page hash mutex rules.  Panic now and
		 * get it over with.  As usual, go down
		 * holding all the locks.
		 */
		ASSERT(MUTEX_HELD(phm));
		if (!page_hashin(newpp, vp, off, phm)) {
			ASSERT(MUTEX_HELD(phm));
			panic("page_lookup_create: hashin failed %p %p %llx %p",
			    (void *)newpp, (void *)vp, off, (void *)phm);
			/*NOTREACHED*/
		}
		ASSERT(MUTEX_HELD(phm));
		mutex_exit(phm);
		phm = NULL;
		page_set_props(newpp, P_REF);
		page_io_lock(newpp);
		pp = newpp;
		se = SE_EXCL;
	} else {
		VM_STAT_ADD(page_lookup_cnt[19]);
		mutex_exit(phm);
	}

	ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1);

	ASSERT(pp ? ((PP_ISFREE(pp) == 0) && (PP_ISAGED(pp) == 0)) : 1);

	return (pp);
}

/*
 * Search the hash list for the page representing the
 * specified [vp, offset] and return it locked.  Skip
 * free pages and pages that cannot be locked as requested.
 * Used while attempting to kluster pages.
 */
page_t *
page_lookup_nowait(vnode_t *vp, u_offset_t off, se_t se)
{
	page_t		*pp;
	kmutex_t	*phm;
	ulong_t		index;
	uint_t		locked;

	ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
	VM_STAT_ADD(page_lookup_nowait_cnt[0]);

	index = PAGE_HASH_FUNC(vp, off);
	PAGE_HASH_SEARCH(index, pp, vp, off);
	locked = 0;
	if (pp == NULL) {
top:
		VM_STAT_ADD(page_lookup_nowait_cnt[1]);
		locked = 1;
		phm = PAGE_HASH_MUTEX(index);
		mutex_enter(phm);
		PAGE_HASH_SEARCH(index, pp, vp, off);
	}

	if (pp == NULL || PP_ISFREE(pp)) {
		VM_STAT_ADD(page_lookup_nowait_cnt[2]);
		pp = NULL;
	} else {
		if (!page_trylock(pp, se)) {
			VM_STAT_ADD(page_lookup_nowait_cnt[3]);
			pp = NULL;
		} else {
			VM_STAT_ADD(page_lookup_nowait_cnt[4]);
			/*
			 * See the comment in page_lookup()
			 */
			if (((volatile struct vnode *)(pp->p_vnode) != vp) ||
			    ((u_offset_t)(pp->p_offset) != off)) {
				VM_STAT_ADD(page_lookup_nowait_cnt[5]);
				if (locked) {
					panic("page_lookup_nowait %p",
					    (void *)pp);
					/*NOTREACHED*/
				}
				page_unlock(pp);
				goto top;
			}
			if (PP_ISFREE(pp)) {
				VM_STAT_ADD(page_lookup_nowait_cnt[6]);
				page_unlock(pp);
				pp = NULL;
			}
		}
	}
	if (locked) {
		VM_STAT_ADD(page_lookup_nowait_cnt[7]);
		mutex_exit(phm);
	}

	ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1);

	return (pp);
}

/*
 * Search the hash list for a page with the specified [vp, off]
 * that is known to exist and is already locked.  This routine
 * is typically used by segment SOFTUNLOCK routines.
 */
page_t *
page_find(vnode_t *vp, u_offset_t off)
{
	page_t		*pp;
	kmutex_t	*phm;
	ulong_t		index;

	ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
	VM_STAT_ADD(page_find_cnt);

	index = PAGE_HASH_FUNC(vp, off);
	phm = PAGE_HASH_MUTEX(index);

	mutex_enter(phm);
	PAGE_HASH_SEARCH(index, pp, vp, off);
	mutex_exit(phm);

	ASSERT(pp == NULL || PAGE_LOCKED(pp) || panicstr);
	return (pp);
}

1036  * Determine whether a page with the specified [vp, off]
1037  * currently exists in the system.  Obviously this should
1038  * only be considered as a hint since nothing prevents the
1039  * page from disappearing or appearing immediately after
1040  * the return from this routine. Subsequently, we don't
1041  * even bother to lock the list.
1042  */
1043 page_t *
1044 page_exists(vnode_t *vp, u_offset_t off)
1045 {
1046 	page_t	*pp;
1047 	ulong_t		index;
1048 
1049 	ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
1050 	VM_STAT_ADD(page_exists_cnt);
1051 
1052 	index = PAGE_HASH_FUNC(vp, off);
1053 	PAGE_HASH_SEARCH(index, pp, vp, off);
1054 
1055 	return (pp);
1056 }
1057 
/*
 * Determine if physically contiguous pages exist for [vp, off] - [vp, off +
 * page_size(szc)) range.  If they exist and ppa is not NULL, fill the ppa
 * array with these pages locked SHARED. If necessary reclaim pages from the
 * freelist. Return 1 if contiguous pages exist and 0 otherwise.
 *
 * If we fail to lock the pages, we still return 1 if the pages exist and
 * are contiguous.  But in this case the return value is just a hint and
 * the ppa array won't be filled.  The caller should initialize ppa[0] as
 * NULL to distinguish the two cases.
 *
 * Returns 0 if the pages don't exist or are not physically contiguous.
 *
 * This routine doesn't work for anonymous (swapfs) pages.
 */
1073 page_exists_physcontig(vnode_t *vp, u_offset_t off, uint_t szc, page_t *ppa[])
1074 {
1075 	pgcnt_t pages;
1076 	pfn_t pfn;
1077 	page_t *rootpp;
1078 	pgcnt_t i;
1079 	pgcnt_t j;
1080 	u_offset_t save_off = off;
1081 	ulong_t index;
1082 	kmutex_t *phm;
1083 	page_t *pp;
1084 	uint_t pszc;
1085 	int loopcnt = 0;
1086 
1087 	ASSERT(szc != 0);
1088 	ASSERT(vp != NULL);
1089 	ASSERT(!IS_SWAPFSVP(vp));
1090 	ASSERT(!VN_ISKAS(vp));
1091 
1092 again:
1093 	if (++loopcnt > 3) {
1094 		VM_STAT_ADD(page_exphcontg[0]);
1095 		return (0);
1096 	}
1097 
1098 	index = PAGE_HASH_FUNC(vp, off);
1099 	phm = PAGE_HASH_MUTEX(index);
1100 
1101 	mutex_enter(phm);
1102 	PAGE_HASH_SEARCH(index, pp, vp, off);
1103 	mutex_exit(phm);
1104 
1105 	VM_STAT_ADD(page_exphcontg[1]);
1106 
1107 	if (pp == NULL) {
1108 		VM_STAT_ADD(page_exphcontg[2]);
1109 		return (0);
1110 	}
1111 
1112 	pages = page_get_pagecnt(szc);
1113 	rootpp = pp;
1114 	pfn = rootpp->p_pagenum;
1115 
1116 	if ((pszc = pp->p_szc) >= szc && ppa != NULL) {
1117 		VM_STAT_ADD(page_exphcontg[3]);
1118 		if (!page_trylock(pp, SE_SHARED)) {
1119 			VM_STAT_ADD(page_exphcontg[4]);
1120 			return (1);
1121 		}
1122 		if (pp->p_szc != pszc || pp->p_vnode != vp ||
1123 		    pp->p_offset != off) {
1124 			VM_STAT_ADD(page_exphcontg[5]);
1125 			page_unlock(pp);
1126 			off = save_off;
1127 			goto again;
1128 		}
		/*
		 * Since szc was non-zero and the vnode and offset still
		 * matched after we locked the page, the page can't
		 * become free on us.
		 */
		ASSERT(!PP_ISFREE(pp));
		if (!IS_P2ALIGNED(pfn, pages)) {
			page_unlock(pp);
			return (0);
		}
		ppa[0] = pp;
		pp++;
		off += PAGESIZE;
		pfn++;
		for (i = 1; i < pages; i++, pp++, off += PAGESIZE, pfn++) {
			if (!page_trylock(pp, SE_SHARED)) {
				VM_STAT_ADD(page_exphcontg[6]);
				pp--;
				while (i-- > 0) {
					page_unlock(pp);
					pp--;
				}
				ppa[0] = NULL;
				return (1);
			}
			if (pp->p_szc != pszc) {
				VM_STAT_ADD(page_exphcontg[7]);
				page_unlock(pp);
				pp--;
				while (i-- > 0) {
					page_unlock(pp);
					pp--;
				}
				ppa[0] = NULL;
				off = save_off;
				goto again;
			}
			/*
			 * szc is the same as for the previous already
			 * locked pages with the right identity.  Since
			 * this page had the correct szc after we locked
			 * it, it can't get freed or destroyed and
			 * therefore must have the expected identity.
			 */
			ASSERT(!PP_ISFREE(pp));
			if (pp->p_vnode != vp ||
			    pp->p_offset != off) {
				panic("page_exists_physcontig: "
				    "large page identity doesn't match");
			}
			ppa[i] = pp;
			ASSERT(pp->p_pagenum == pfn);
		}
		VM_STAT_ADD(page_exphcontg[8]);
		ppa[pages] = NULL;
		return (1);
	} else if (pszc >= szc) {
		VM_STAT_ADD(page_exphcontg[9]);
		if (!IS_P2ALIGNED(pfn, pages)) {
			return (0);
		}
		return (1);
	}

	if (!IS_P2ALIGNED(pfn, pages)) {
		VM_STAT_ADD(page_exphcontg[10]);
		return (0);
	}

	if (page_numtomemseg_nolock(pfn) !=
	    page_numtomemseg_nolock(pfn + pages - 1)) {
		VM_STAT_ADD(page_exphcontg[11]);
		return (0);
	}

	/*
	 * We loop over the pages up to 4 times to promote the page size.
	 * We're extra cautious to promote the page size atomically with
	 * respect to everybody else.  But we can probably optimize this
	 * into 1 loop if it ever becomes an issue.
	 */

	for (i = 0; i < pages; i++, pp++, off += PAGESIZE, pfn++) {
		ASSERT(pp->p_pagenum == pfn);
		if (!page_trylock(pp, SE_EXCL)) {
			VM_STAT_ADD(page_exphcontg[12]);
			break;
		}
		if (pp->p_vnode != vp ||
		    pp->p_offset != off) {
			VM_STAT_ADD(page_exphcontg[13]);
			page_unlock(pp);
			break;
		}
		if (pp->p_szc >= szc) {
			ASSERT(i == 0);
			page_unlock(pp);
			off = save_off;
			goto again;
		}
	}

	if (i != pages) {
		VM_STAT_ADD(page_exphcontg[14]);
		--pp;
		while (i-- > 0) {
			page_unlock(pp);
			--pp;
		}
		return (0);
	}

	pp = rootpp;
	for (i = 0; i < pages; i++, pp++) {
		if (PP_ISFREE(pp)) {
			VM_STAT_ADD(page_exphcontg[15]);
			ASSERT(!PP_ISAGED(pp));
			ASSERT(pp->p_szc == 0);
			if (!page_reclaim(pp, NULL)) {
				break;
			}
		} else {
			ASSERT(pp->p_szc < szc);
			VM_STAT_ADD(page_exphcontg[16]);
			(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
		}
	}
	if (i < pages) {
		VM_STAT_ADD(page_exphcontg[17]);
		/*
		 * page_reclaim failed because we were out of memory.
		 * Drop the rest of the locks and return; this page
		 * must already have been reallocated anyway.
		 */
		pp = rootpp;
		for (j = 0; j < pages; j++, pp++) {
			if (j != i) {
				page_unlock(pp);
			}
		}
		return (0);
	}

	off = save_off;
	pp = rootpp;
	for (i = 0; i < pages; i++, pp++, off += PAGESIZE) {
		ASSERT(PAGE_EXCL(pp));
		ASSERT(!PP_ISFREE(pp));
		ASSERT(!hat_page_is_mapped(pp));
		ASSERT(pp->p_vnode == vp);
		ASSERT(pp->p_offset == off);
		pp->p_szc = szc;
	}
	pp = rootpp;
	for (i = 0; i < pages; i++, pp++) {
		if (ppa == NULL) {
			page_unlock(pp);
		} else {
			ppa[i] = pp;
			page_downgrade(ppa[i]);
		}
	}
	if (ppa != NULL) {
		ppa[pages] = NULL;
	}
	VM_STAT_ADD(page_exphcontg[18]);
	ASSERT(vp->v_pages != NULL);
	return (1);
}
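
/*
 * A sketch of a hypothetical caller, illustrating the ppa[0] convention
 * described above (the array must have room for page_get_pagecnt(szc)
 * entries plus a NULL terminator; LARGE_PAGE_CNT is a made-up name for
 * that count, sized by the caller):
 *
 *	page_t *ppa[LARGE_PAGE_CNT + 1];
 *
 *	ppa[0] = NULL;
 *	if (page_exists_physcontig(vp, off, szc, ppa)) {
 *		if (ppa[0] != NULL) {
 *			... the pages exist, are contiguous, and are
 *			... held SE_SHARED; page_unlock() each entry
 *			... of ppa[] when done ...
 *		} else {
 *			... the pages exist but could not be locked;
 *			... treat the result as a hint only ...
 *		}
 *	}
 */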

/*
 * Determine whether a page with the specified [vp, off]
 * currently exists in the system and if so return its
 * size code. Obviously this should only be considered as
 * a hint since nothing prevents the page from disappearing
 * or appearing immediately after the return from this routine.
 */
int
page_exists_forreal(vnode_t *vp, u_offset_t off, uint_t *szc)
{
	page_t		*pp;
	kmutex_t	*phm;
	ulong_t		index;
	int		rc = 0;

	ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
	ASSERT(szc != NULL);
	VM_STAT_ADD(page_exists_forreal_cnt);

	index = PAGE_HASH_FUNC(vp, off);
	phm = PAGE_HASH_MUTEX(index);

	mutex_enter(phm);
	PAGE_HASH_SEARCH(index, pp, vp, off);
	if (pp != NULL) {
		*szc = pp->p_szc;
		rc = 1;
	}
	mutex_exit(phm);
	return (rc);
}

/* wakeup threads waiting for pages in page_create_get_something() */
void
wakeup_pcgs(void)
{
	if (!CV_HAS_WAITERS(&pcgs_cv))
		return;
	cv_broadcast(&pcgs_cv);
}

/*
 * 'freemem' is used all over the kernel as an indication of how many
 * pages are free (either on the cache list or on the free page list)
 * in the system.  In very few places is a really accurate 'freemem'
 * needed.  To avoid contention on the lock protecting a single freemem,
 * it was spread out into NCPU buckets.  set_freemem() sets freemem to
 * the total of all NCPU buckets.  It is called from clock() on each tick.
 */
void
set_freemem()
{
	struct pcf	*p;
	ulong_t		t;
	uint_t		i;

	t = 0;
	p = pcf;
	for (i = 0; i < pcf_fanout; i++) {
		t += p->pcf_count;
		p++;
	}
	freemem = t;

	/*
	 * Don't worry about grabbing mutex.  It's not that
	 * critical if we miss a tick or two.  This is
	 * where we wakeup possible delayers in
	 * page_create_get_something().
	 */
	wakeup_pcgs();
}

ulong_t
get_freemem()
{
	struct pcf	*p;
	ulong_t		t;
	uint_t		i;

	t = 0;
	p = pcf;
	for (i = 0; i < pcf_fanout; i++) {
		t += p->pcf_count;
		p++;
	}
	/*
	 * We just calculated it, might as well set it.
	 */
	freemem = t;
	return (t);
}

/*
 * Acquire all of the page cache & free (pcf) locks.
 */
void
pcf_acquire_all()
{
	struct pcf	*p;
	uint_t		i;

	p = pcf;
	for (i = 0; i < pcf_fanout; i++) {
		mutex_enter(&p->pcf_lock);
		p++;
	}
}

/*
 * Release all the pcf_locks.
 */
void
pcf_release_all()
{
	struct pcf	*p;
	uint_t		i;

	p = pcf;
	for (i = 0; i < pcf_fanout; i++) {
		mutex_exit(&p->pcf_lock);
		p++;
	}
}

/*
 * Inform the VM system that we need some pages freed up.
 * Calls must be symmetric, e.g.:
 *
 *	page_needfree(100);
 *	wait a bit;
 *	page_needfree(-100);
 */
void
page_needfree(spgcnt_t npages)
{
	mutex_enter(&new_freemem_lock);
	needfree += npages;
	mutex_exit(&new_freemem_lock);
}

/*
 * Throttle for page_create(): try to prevent freemem from dropping
 * below throttlefree.  We can't provide a 100% guarantee because
 * KM_NOSLEEP allocations, page_reclaim(), and various other things
 * nibble away at the freelist.  However, we can block all PG_WAIT
 * allocations until memory becomes available.  The motivation is
 * that several things can fall apart when there's no free memory:
 *
 * (1) If pageout() needs memory to push a page, the system deadlocks.
 *
 * (2) By (broken) specification, timeout(9F) can neither fail nor
 *     block, so it has no choice but to panic the system if it
 *     cannot allocate a callout structure.
 *
 * (3) Like timeout(), ddi_set_callback() cannot fail and cannot block;
 *     it panics if it cannot allocate a callback structure.
 *
 * (4) Untold numbers of third-party drivers have not yet been hardened
 *     against KM_NOSLEEP and/or allocb() failures; they simply assume
 *     success and panic the system with a data fault on failure.
 *     (The long-term solution to this particular problem is to ship
 *     hostile fault-injecting DEBUG kernels with the DDK.)
 *
 * It is theoretically impossible to guarantee success of non-blocking
 * allocations, but in practice, this throttle is very hard to break.
 */
static int
page_create_throttle(pgcnt_t npages, int flags)
{
	ulong_t	fm;
	uint_t	i;
	pgcnt_t tf;	/* effective value of throttlefree */

	/*
	 * Never deny pages when:
	 * - it's a thread that cannot block [NOMEMWAIT()]
	 * - the allocation cannot block and must not fail
	 * - the allocation cannot block and has a pageout dispensation
	 *   (PG_PUSHPAGE)
	 */
	if (NOMEMWAIT() ||
	    ((flags & (PG_WAIT | PG_PANIC)) == PG_PANIC) ||
	    ((flags & (PG_WAIT | PG_PUSHPAGE)) == PG_PUSHPAGE))
		return (1);

	/*
	 * If the allocation can't block, we look favorably upon it
	 * unless we're below pageout_reserve.  In that case we fail
	 * the allocation because we want to make sure there are a few
	 * pages available for pageout.
	 */
	if ((flags & PG_WAIT) == 0)
		return (freemem >= npages + pageout_reserve);

	/* Calculate the effective throttlefree value */
	tf = throttlefree -
	    ((flags & PG_PUSHPAGE) ? pageout_reserve : 0);

	cv_signal(&proc_pageout->p_cv);

	for (;;) {
		fm = 0;
		pcf_acquire_all();
		mutex_enter(&new_freemem_lock);
		for (i = 0; i < pcf_fanout; i++) {
			fm += pcf[i].pcf_count;
			pcf[i].pcf_wait++;
			mutex_exit(&pcf[i].pcf_lock);
		}
		freemem = fm;
		if (freemem >= npages + tf) {
			mutex_exit(&new_freemem_lock);
			break;
		}
		needfree += npages;
		freemem_wait++;
		cv_wait(&freemem_cv, &new_freemem_lock);
		freemem_wait--;
		needfree -= npages;
		mutex_exit(&new_freemem_lock);
	}
	return (1);
}

/*
 * page_create_wait() is called to either coalesce pages from the
 * different pcf buckets or to wait because there simply are not
 * enough pages to satisfy the caller's request.
 *
 * Sadly, this is called from platform/vm/vm_machdep.c
 */
int
page_create_wait(pgcnt_t npages, uint_t flags)
{
	pgcnt_t		total;
	uint_t		i;
	struct pcf	*p;

	/*
	 * Wait until there are enough free pages to satisfy our
	 * entire request.
	 * We set needfree += npages before prodding pageout, to make sure
	 * it does real work when npages > lotsfree > freemem.
	 */
	VM_STAT_ADD(page_create_not_enough);

	ASSERT(!kcage_on ? !(flags & PG_NORELOC) : 1);
checkagain:
	if ((flags & PG_NORELOC) &&
	    kcage_freemem < kcage_throttlefree + npages)
		(void) kcage_create_throttle(npages, flags);

	if (freemem < npages + throttlefree)
		if (!page_create_throttle(npages, flags))
			return (0);

	if (pcf_decrement_bucket(npages) ||
	    pcf_decrement_multiple(&total, npages, 0))
		return (1);

	/*
	 * All of the pcf locks are held, and there are not enough pages
	 * to satisfy the request (total < npages).
	 * Be sure to acquire the new_freemem_lock before dropping
	 * the pcf locks.  This prevents dropping wakeups in page_free().
	 * The order is always pcf_lock then new_freemem_lock.
	 *
	 * Since we hold all the pcf locks, it is a good time to set freemem.
	 *
	 * If the caller does not want to wait, return now.
	 * Else turn the pageout daemon loose to find something
	 * and wait till it does.
	 */
	freemem = total;

	if ((flags & PG_WAIT) == 0) {
		pcf_release_all();

		TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_NOMEM,
		    "page_create_nomem:npages %ld freemem %ld", npages, freemem);
		return (0);
	}

	ASSERT(proc_pageout != NULL);
	cv_signal(&proc_pageout->p_cv);

	TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_START,
	    "page_create_sleep_start: freemem %ld needfree %ld",
	    freemem, needfree);

	/*
	 * We are going to wait.
	 * We currently hold all of the pcf_locks,
	 * get the new_freemem_lock (it protects freemem_wait),
	 * before dropping the pcf_locks.
	 */
	mutex_enter(&new_freemem_lock);

	p = pcf;
	for (i = 0; i < pcf_fanout; i++) {
		p->pcf_wait++;
		mutex_exit(&p->pcf_lock);
		p++;
	}

	needfree += npages;
	freemem_wait++;

	cv_wait(&freemem_cv, &new_freemem_lock);

	freemem_wait--;
	needfree -= npages;

	mutex_exit(&new_freemem_lock);

	TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_END,
	    "page_create_sleep_end: freemem %ld needfree %ld",
	    freemem, needfree);

	VM_STAT_ADD(page_create_not_enough_again);
	goto checkagain;
}

/*
 * A routine to do the opposite of page_create_wait().
 */
void
page_create_putback(spgcnt_t npages)
{
	struct pcf	*p;
	pgcnt_t		lump;
	uint_t		*which;

	/*
	 * When a contiguous lump is broken up, we have to
	 * deal with lots of pages (min 64) so let's spread
	 * the wealth around.
	 */
	lump = roundup(npages, pcf_fanout) / pcf_fanout;
	freemem += npages;

	for (p = pcf; (npages > 0) && (p < &pcf[pcf_fanout]); p++) {
		which = &p->pcf_count;

		mutex_enter(&p->pcf_lock);

		if (p->pcf_block) {
			which = &p->pcf_reserve;
		}

		if (lump < npages) {
			*which += (uint_t)lump;
			npages -= lump;
		} else {
			*which += (uint_t)npages;
			npages = 0;
		}

		if (p->pcf_wait) {
			mutex_enter(&new_freemem_lock);
			/*
			 * Check to see if some other thread
			 * is actually waiting.  Another bucket
			 * may have woken it up by now.  If there
			 * are no waiters, then set our pcf_wait
			 * count to zero to avoid coming in here
			 * next time.
			 */
			if (freemem_wait) {
				if (npages > 1) {
					cv_broadcast(&freemem_cv);
				} else {
					cv_signal(&freemem_cv);
				}
				p->pcf_wait--;
			} else {
				p->pcf_wait = 0;
			}
			mutex_exit(&new_freemem_lock);
		}
		mutex_exit(&p->pcf_lock);
	}
	ASSERT(npages == 0);
}

/*
 * A helper routine for page_create_get_something().
 * The indenting got too deep down there.
 * Unblock the pcf counters.  Any pages freed after
 * pcf_block got set are moved to pcf_count and
 * wakeups (cv_broadcast() or cv_signal()) are done as needed.
 */
static void
pcgs_unblock(void)
{
	int		i;
	struct pcf	*p;

	/* Update freemem while we're here. */
	freemem = 0;
	p = pcf;
	for (i = 0; i < pcf_fanout; i++) {
		mutex_enter(&p->pcf_lock);
		ASSERT(p->pcf_count == 0);
		p->pcf_count = p->pcf_reserve;
		p->pcf_block = 0;
		freemem += p->pcf_count;
		if (p->pcf_wait) {
			mutex_enter(&new_freemem_lock);
			if (freemem_wait) {
				if (p->pcf_reserve > 1) {
					cv_broadcast(&freemem_cv);
					p->pcf_wait = 0;
				} else {
					cv_signal(&freemem_cv);
					p->pcf_wait--;
				}
			} else {
				p->pcf_wait = 0;
			}
			mutex_exit(&new_freemem_lock);
		}
		p->pcf_reserve = 0;
		mutex_exit(&p->pcf_lock);
		p++;
	}
}

/*
 * Called from page_create_va() when both the cache and free lists
 * have been checked once.
 *
 * Either returns a page or panics since the accounting was done
 * way before we got here.
 *
 * We don't come here often, so leave the accounting on permanently.
 */

#define	MAX_PCGS	100

#ifdef	DEBUG
#define	PCGS_TRIES	100
#else	/* DEBUG */
#define	PCGS_TRIES	10
#endif	/* DEBUG */

#ifdef	VM_STATS
uint_t	pcgs_counts[PCGS_TRIES];
uint_t	pcgs_too_many;
uint_t	pcgs_entered;
uint_t	pcgs_entered_noreloc;
uint_t	pcgs_locked;
uint_t	pcgs_cagelocked;
#endif	/* VM_STATS */

static page_t *
page_create_get_something(vnode_t *vp, u_offset_t off, struct seg *seg,
    caddr_t vaddr, uint_t flags)
{
	uint_t		count;
	page_t		*pp;
	uint_t		locked, i;
	struct	pcf	*p;
	lgrp_t		*lgrp;
	int		cagelocked = 0;

	VM_STAT_ADD(pcgs_entered);

	/*
	 * Tap any reserve freelists: if we fail now, we'll die
	 * since the page(s) we're looking for have already been
	 * accounted for.
	 */
	flags |= PG_PANIC;

	if ((flags & PG_NORELOC) != 0) {
		VM_STAT_ADD(pcgs_entered_noreloc);
		/*
		 * Requests for free pages from critical threads
		 * such as pageout still won't throttle here, but
		 * we must try again, to give the cageout thread
		 * another chance to catch up. Since we already
		 * accounted for the pages, we had better get them
		 * this time.
		 *
		 * N.B. All non-critical threads acquire the pcgs_cagelock
		 * to serialize access to the freelists. This implements a
		 * turnstile-type synchronization to avoid starvation of
		 * critical requests for PG_NORELOC memory by non-critical
		 * threads: all non-critical threads must acquire a 'ticket'
		 * before passing through, which entails making sure
		 * kcage_freemem won't fall below minfree prior to grabbing
		 * pages from the freelists.
		 */
		if (kcage_create_throttle(1, flags) == KCT_NONCRIT) {
			mutex_enter(&pcgs_cagelock);
			cagelocked = 1;
			VM_STAT_ADD(pcgs_cagelocked);
		}
	}

	/*
	 * Time to get serious.
	 * We failed to get a `correctly colored' page from both the
	 * free and cache lists.
	 * We escalate in stages.
	 *
	 * First try both lists without worrying about color.
	 *
	 * Then, grab all page accounting locks (i.e. pcf[]) and
	 * steal any pages that they have and set the pcf_block flag to
	 * stop deletions from the lists.  This will help because
	 * a page can get added to the free list while we are looking
	 * at the cache list, then another page could be added to the cache
	 * list allowing the page on the free list to be removed as we
	 * move from looking at the cache list to the free list. This
	 * could happen over and over. We would never find the page
	 * we have accounted for.
	 *
	 * Noreloc pages are a subset of the global (relocatable) page pool.
	 * They are not tracked separately in the pcf bins, so it is
	 * impossible to know when doing pcf accounting if the available
	 * page(s) are noreloc pages or not. When looking for a noreloc page
	 * it is quite easy to end up here even if the global (relocatable)
	 * page pool has plenty of free pages but the noreloc pool is empty.
	 *
	 * When the noreloc pool is empty (or low), additional noreloc pages
	 * are created by converting pages from the global page pool. This
	 * process will stall during pcf accounting if the pcf bins are
	 * already locked. Such is the case when a noreloc allocation is
	 * looping here in page_create_get_something waiting for more noreloc
	 * pages to appear.
	 *
	 * Short of adding a new field to the pcf bins to accurately track
	 * the number of free noreloc pages, we instead do not grab the
	 * pcgs_lock, do not set the pcf blocks and do not timeout when
	 * allocating a noreloc page. This allows noreloc allocations to
	 * loop without blocking global page pool allocations.
	 *
	 * NOTE: the behaviour of page_create_get_something has not changed
	 * for the case of global page pool allocations.
	 */

	flags &= ~PG_MATCH_COLOR;
	locked = 0;
#if defined(__i386) || defined(__amd64)
	flags = page_create_update_flags_x86(flags);
#endif

	lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE);

	for (count = 0; kcage_on || count < MAX_PCGS; count++) {
		pp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE,
		    flags, lgrp);
		if (pp == NULL) {
			pp = page_get_cachelist(vp, off, seg, vaddr,
			    flags, lgrp);
		}
		if (pp == NULL) {
			/*
			 * Serialize.  Don't fight with other pcgs().
			 */
			if (!locked && (!kcage_on || !(flags & PG_NORELOC))) {
				mutex_enter(&pcgs_lock);
				VM_STAT_ADD(pcgs_locked);
				locked = 1;
				p = pcf;
				for (i = 0; i < pcf_fanout; i++) {
					mutex_enter(&p->pcf_lock);
					ASSERT(p->pcf_block == 0);
					p->pcf_block = 1;
					p->pcf_reserve = p->pcf_count;
					p->pcf_count = 0;
					mutex_exit(&p->pcf_lock);
					p++;
				}
				freemem = 0;
			}

			if (count) {
				/*
				 * Since page_free() puts pages on
				 * a list then accounts for it, we
				 * just have to wait for page_free()
				 * to unlock any page it was working
				 * with. The page_lock()-page_reclaim()
				 * path falls in the same boat.
				 *
				 * We don't need to check on the
				 * PG_WAIT flag, we have already
				 * accounted for the page we are
				 * looking for in page_create_va().
				 *
				 * We just wait a moment to let any
				 * locked pages on the lists free up,
				 * then continue around and try again.
				 *
				 * Will be awakened by set_freemem().
				 */
				mutex_enter(&pcgs_wait_lock);
				cv_wait(&pcgs_cv, &pcgs_wait_lock);
				mutex_exit(&pcgs_wait_lock);
			}
		} else {
#ifdef VM_STATS
			if (count >= PCGS_TRIES) {
				VM_STAT_ADD(pcgs_too_many);
			} else {
				VM_STAT_ADD(pcgs_counts[count]);
			}
#endif
			if (locked) {
				pcgs_unblock();
				mutex_exit(&pcgs_lock);
			}
			if (cagelocked)
				mutex_exit(&pcgs_cagelock);
			return (pp);
		}
	}
	/*
	 * We go down holding the pcf locks.
	 */
	panic("no %spage found %d",
	    ((flags & PG_NORELOC) ? "non-reloc " : ""), count);
	/*NOTREACHED*/
}
1925 
1926 /*
1927  * Create enough pages for "bytes" worth of data starting at
1928  * "off" in "vp".
1929  *
1930  *	Where flags must be one of:
1931  *
1932  *		PG_EXCL:	Exclusive create (fail if any page already
1933  *				exists in the page cache) which does not
1934  *				wait for memory to become available.
1935  *
1936  *		PG_WAIT:	Non-exclusive create which can wait for
1937  *				memory to become available.
1938  *
1939  *		PG_PHYSCONTIG:	Allocate physically contiguous pages.
1940  *				(Not Supported)
1941  *
1942  * A doubly linked list of pages is returned to the caller.  Each page
1943  * on the list is held with the "exclusive" (p_selock) lock and the
1944  * "iolock" (p_iolock) lock.
1945  *
1946  * Unable to change the parameters to page_create() in a minor release,
1947  * we renamed page_create() to page_create_va(), changed all known calls
1948  * from page_create() to page_create_va(), and created this wrapper.
1949  *
1950  * Upon a major release, we should break compatibility by deleting this
1951  * wrapper, and replacing all the strings "page_create_va", with "page_create".
1952  *
1953  * NOTE: There is a copy of this interface as page_create_io() in
1954  *	 i86/vm/vm_machdep.c. Any bugs fixed here should be applied
1955  *	 there.
1956  */
1957 page_t *
1958 page_create(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags)
1959 {
1960 	caddr_t random_vaddr;
1961 	struct seg kseg;
1962 
1963 #ifdef DEBUG
1964 	cmn_err(CE_WARN, "Using deprecated interface page_create: caller %p",
1965 	    (void *)caller());
1966 #endif
1967 
1968 	random_vaddr = (caddr_t)(((uintptr_t)vp >> 7) ^
1969 	    (uintptr_t)(off >> PAGESHIFT));
1970 	kseg.s_as = &kas;
1971 
1972 	return (page_create_va(vp, off, bytes, flags, &kseg, random_vaddr));
1973 }
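
/*
 * Illustrative sketch (not from the original source): a hypothetical
 * caller of the preferred interface, page_create_va(), supplying a
 * real seg/vaddr pair instead of the hashed address this deprecated
 * wrapper fabricates.  Error handling is elided.
 *
 *	page_t *plist;
 *
 *	plist = page_create_va(vp, off, PAGESIZE, PG_WAIT | PG_EXCL,
 *	    seg, addr);
 *	if (plist == NULL) {
 *		... PG_EXCL failed: a page with this identity exists ...
 *	}
 *	... each page on plist holds SE_EXCL and the io lock ...
 */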
1974 
1975 #ifdef DEBUG
1976 uint32_t pg_alloc_pgs_mtbf = 0;
1977 #endif
1978 
1979 /*
1980  * Used for large page support. It will attempt to allocate
1981  * a large page(s) off the freelist.
1982  *
1983  * Returns non-zero on failure.
1984  */
1985 int
1986 page_alloc_pages(struct vnode *vp, struct seg *seg, caddr_t addr,
1987     page_t **basepp, page_t *ppa[], uint_t szc, int anypgsz, int pgflags)
1988 {
1989 	pgcnt_t		npgs, curnpgs, totpgs;
1990 	size_t		pgsz;
1991 	page_t		*pplist = NULL, *pp;
1992 	int		err = 0;
1993 	lgrp_t		*lgrp;
1994 
1995 	ASSERT(szc != 0 && szc <= (page_num_pagesizes() - 1));
1996 	ASSERT(pgflags == 0 || pgflags == PG_LOCAL);
1997 
1998 	/*
1999 	 * Check if system heavily prefers local large pages over remote
2000 	 * on systems with multiple lgroups.
2001 	 */
2002 	if (lpg_alloc_prefer == LPAP_LOCAL && nlgrps > 1) {
2003 		pgflags = PG_LOCAL;
2004 	}
2005 
2006 	VM_STAT_ADD(alloc_pages[0]);
2007 
2008 #ifdef DEBUG
2009 	if (pg_alloc_pgs_mtbf && !(gethrtime() % pg_alloc_pgs_mtbf)) {
2010 		return (ENOMEM);
2011 	}
2012 #endif
2013 
2014 	/*
2015 	 * Exactly one of basepp and ppa must be non-NULL:
2016 	 * not both, and not neither.
2017 	 */
2018 	ASSERT(basepp != NULL || ppa != NULL);
2019 	ASSERT(basepp == NULL || ppa == NULL);
2020 
2021 #if defined(__i386) || defined(__amd64)
2022 	while (page_chk_freelist(szc) == 0) {
2023 		VM_STAT_ADD(alloc_pages[8]);
2024 		if (anypgsz == 0 || --szc == 0)
2025 			return (ENOMEM);
2026 	}
2027 #endif
2028 
2029 	pgsz = page_get_pagesize(szc);
2030 	totpgs = curnpgs = npgs = pgsz >> PAGESHIFT;
2031 
2032 	ASSERT(((uintptr_t)addr & (pgsz - 1)) == 0);
2033 
2034 	(void) page_create_wait(npgs, PG_WAIT);
2035 
2036 	while (npgs && szc) {
2037 		lgrp = lgrp_mem_choose(seg, addr, pgsz);
2038 		if (pgflags == PG_LOCAL) {
2039 			pp = page_get_freelist(vp, 0, seg, addr, pgsz,
2040 			    pgflags, lgrp);
2041 			if (pp == NULL) {
2042 				pp = page_get_freelist(vp, 0, seg, addr, pgsz,
2043 				    0, lgrp);
2044 			}
2045 		} else {
2046 			pp = page_get_freelist(vp, 0, seg, addr, pgsz,
2047 			    0, lgrp);
2048 		}
2049 		if (pp != NULL) {
2050 			VM_STAT_ADD(alloc_pages[1]);
2051 			page_list_concat(&pplist, &pp);
2052 			ASSERT(npgs >= curnpgs);
2053 			npgs -= curnpgs;
2054 		} else if (anypgsz) {
2055 			VM_STAT_ADD(alloc_pages[2]);
2056 			szc--;
2057 			pgsz = page_get_pagesize(szc);
2058 			curnpgs = pgsz >> PAGESHIFT;
2059 		} else {
2060 			VM_STAT_ADD(alloc_pages[3]);
2061 			ASSERT(npgs == totpgs);
2062 			page_create_putback(npgs);
2063 			return (ENOMEM);
2064 		}
2065 	}
2066 	if (szc == 0) {
2067 		VM_STAT_ADD(alloc_pages[4]);
2068 		ASSERT(npgs != 0);
2069 		page_create_putback(npgs);
2070 		err = ENOMEM;
2071 	} else if (basepp != NULL) {
2072 		ASSERT(npgs == 0);
2073 		ASSERT(ppa == NULL);
2074 		*basepp = pplist;
2075 	}
2076 
2077 	npgs = totpgs - npgs;
2078 	pp = pplist;
2079 
2080 	/*
2081 	 * Clear the free and age bits. Also if we were passed in a ppa then
2082 	 * fill it in with all the constituent pages from the large page. But
2083 	 * if we failed to allocate all the pages just free what we got.
2084 	 */
2085 	while (npgs != 0) {
2086 		ASSERT(PP_ISFREE(pp));
2087 		ASSERT(PP_ISAGED(pp));
2088 		if (ppa != NULL || err != 0) {
2089 			if (err == 0) {
2090 				VM_STAT_ADD(alloc_pages[5]);
2091 				PP_CLRFREE(pp);
2092 				PP_CLRAGED(pp);
2093 				page_sub(&pplist, pp);
2094 				*ppa++ = pp;
2095 				npgs--;
2096 			} else {
2097 				VM_STAT_ADD(alloc_pages[6]);
2098 				ASSERT(pp->p_szc != 0);
2099 				curnpgs = page_get_pagecnt(pp->p_szc);
2100 				page_list_break(&pp, &pplist, curnpgs);
2101 				page_list_add_pages(pp, 0);
2102 				page_create_putback(curnpgs);
2103 				ASSERT(npgs >= curnpgs);
2104 				npgs -= curnpgs;
2105 			}
2106 			pp = pplist;
2107 		} else {
2108 			VM_STAT_ADD(alloc_pages[7]);
2109 			PP_CLRFREE(pp);
2110 			PP_CLRAGED(pp);
2111 			pp = pp->p_next;
2112 			npgs--;
2113 		}
2114 	}
2115 	return (err);
2116 }
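
/*
 * Illustrative sketch (hypothetical caller, not from the original
 * source): requesting one large page of size code szc and collecting
 * its constituent pages through a ppa array, per the basepp/ppa
 * contract asserted above.
 *
 *	pgcnt_t npgs = page_get_pagecnt(szc);
 *	page_t **ppa = kmem_zalloc(npgs * sizeof (page_t *), KM_SLEEP);
 *
 *	if (page_alloc_pages(vp, seg, addr, NULL, ppa, szc, 0, 0) == 0) {
 *		... ppa[0 .. npgs - 1] hold the locked constituent pages ...
 *	}
 *	kmem_free(ppa, npgs * sizeof (page_t *));
 */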
2117 
2118 /*
2119  * Get a single large page off of the freelists, and set it up for use.
2120  * Number of bytes requested must be a supported page size.
2121  *
2122  * Note that this call may fail even if there is sufficient
2123  * memory available or PG_WAIT is set, so the caller must
2124  * be willing to fallback on page_create_va(), block and retry,
2125  * or fail the requester.
2126  */
2127 page_t *
2128 page_create_va_large(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags,
2129     struct seg *seg, caddr_t vaddr, void *arg)
2130 {
2131 	pgcnt_t		npages;
2132 	page_t		*pp;
2133 	page_t		*rootpp;
2134 	lgrp_t		*lgrp;
2135 	lgrp_id_t	*lgrpid = (lgrp_id_t *)arg;
2136 
2137 	ASSERT(vp != NULL);
2138 
2139 	ASSERT((flags & ~(PG_EXCL | PG_WAIT |
2140 	    PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == 0);
2141 	/* but no others */
2142 
2143 	ASSERT((flags & PG_EXCL) == PG_EXCL);
2144 
2145 	npages = btop(bytes);
2146 
2147 	if (!kcage_on || panicstr) {
2148 		/*
2149 		 * Cage is OFF, or we are single threaded in
2150 		 * panic, so make everything a RELOC request.
2151 		 */
2152 		flags &= ~PG_NORELOC;
2153 	}
2154 
2155 	/*
2156 	 * Make sure there's adequate physical memory available.
2157 	 * Note: PG_WAIT is ignored here.
2158 	 */
2159 	if (freemem <= throttlefree + npages) {
2160 		VM_STAT_ADD(page_create_large_cnt[1]);
2161 		return (NULL);
2162 	}
2163 
2164 	/*
2165 	 * If cage is on, dampen draw from cage when available
2166 	 * cage space is low.
2167 	 */
2168 	if ((flags & (PG_NORELOC | PG_WAIT)) ==  (PG_NORELOC | PG_WAIT) &&
2169 	    kcage_freemem < kcage_throttlefree + npages) {
2170 
2171 		/*
2172 		 * The cage is on, the caller wants PG_NORELOC
2173 		 * pages and available cage memory is very low.
2174 		 * Call kcage_create_throttle() to attempt to
2175 		 * control demand on the cage.
2176 		 */
2177 		if (kcage_create_throttle(npages, flags) == KCT_FAILURE) {
2178 			VM_STAT_ADD(page_create_large_cnt[2]);
2179 			return (NULL);
2180 		}
2181 	}
2182 
2183 	if (!pcf_decrement_bucket(npages) &&
2184 	    !pcf_decrement_multiple(NULL, npages, 1)) {
2185 		VM_STAT_ADD(page_create_large_cnt[4]);
2186 		return (NULL);
2187 	}
2188 
2189 	/*
2190 	 * This is where this function behaves fundamentally differently
2191 	 * than page_create_va(); since we're intending to map the page
2192 	 * with a single TTE, we have to get it as a physically contiguous
2193 	 * hardware pagesize chunk.  If we can't, we fail.
2194 	 */
2195 	if (lgrpid != NULL && *lgrpid >= 0 && *lgrpid <= lgrp_alloc_max &&
2196 	    LGRP_EXISTS(lgrp_table[*lgrpid]))
2197 		lgrp = lgrp_table[*lgrpid];
2198 	else
2199 		lgrp = lgrp_mem_choose(seg, vaddr, bytes);
2200 
2201 	if ((rootpp = page_get_freelist(&kvp, off, seg, vaddr,
2202 	    bytes, flags & ~PG_MATCH_COLOR, lgrp)) == NULL) {
2203 		page_create_putback(npages);
2204 		VM_STAT_ADD(page_create_large_cnt[5]);
2205 		return (NULL);
2206 	}
2207 
2208 	/*
2209 	 * If we got the page with the wrong mtype, give it back; this is a
2210 	 * workaround for CR 6249718.  Once CR 6249718 is fixed, we will never
2211 	 * get inside this "if" and the workaround becomes just a nop.
2212 	 */
2213 	if (kcage_on && (flags & PG_NORELOC) && !PP_ISNORELOC(rootpp)) {
2214 		page_list_add_pages(rootpp, 0);
2215 		page_create_putback(npages);
2216 		VM_STAT_ADD(page_create_large_cnt[6]);
2217 		return (NULL);
2218 	}
2219 
2220 	/*
2221 	 * If satisfying this request has left us with too little
2222 	 * memory, start the wheels turning to get some back.  The
2223 	 * first clause of the test prevents waking up the pageout
2224 	 * daemon in situations where it would decide that there's
2225 	 * nothing to do.
2226 	 */
2227 	if (nscan < desscan && freemem < minfree) {
2228 		TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
2229 		    "pageout_cv_signal:freemem %ld", freemem);
2230 		cv_signal(&proc_pageout->p_cv);
2231 	}
2232 
2233 	pp = rootpp;
2234 	while (npages--) {
2235 		ASSERT(PAGE_EXCL(pp));
2236 		ASSERT(pp->p_vnode == NULL);
2237 		ASSERT(!hat_page_is_mapped(pp));
2238 		PP_CLRFREE(pp);
2239 		PP_CLRAGED(pp);
2240 		if (!page_hashin(pp, vp, off, NULL))
2241 			panic("page_create_large: hashin failed: page %p",
2242 			    (void *)pp);
2243 		page_io_lock(pp);
2244 		off += PAGESIZE;
2245 		pp = pp->p_next;
2246 	}
2247 
2248 	VM_STAT_ADD(page_create_large_cnt[0]);
2249 	return (rootpp);
2250 }
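
/*
 * Illustrative sketch (hypothetical caller, not from the original
 * source): since this routine may fail even when memory is available,
 * callers typically fall back to base pages as the comment above
 * suggests.
 *
 *	pp = page_create_va_large(vp, off, pgsz, flags, seg, va, NULL);
 *	if (pp == NULL) {
 *		... fall back to page_create_va(), block and retry,
 *		    or fail the request ...
 *	}
 */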
2251 
2252 page_t *
2253 page_create_va(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags,
2254     struct seg *seg, caddr_t vaddr)
2255 {
2256 	page_t		*plist = NULL;
2257 	pgcnt_t		npages;
2258 	pgcnt_t		found_on_free = 0;
2259 	pgcnt_t		pages_req;
2260 	page_t		*npp = NULL;
2261 	struct pcf	*p;
2262 	lgrp_t		*lgrp;
2263 
2264 	TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
2265 	    "page_create_start:vp %p off %llx bytes %lu flags %x",
2266 	    vp, off, bytes, flags);
2267 
2268 	ASSERT(bytes != 0 && vp != NULL);
2269 
2270 	if ((flags & PG_EXCL) == 0 && (flags & PG_WAIT) == 0) {
2271 		panic("page_create: invalid flags");
2272 		/*NOTREACHED*/
2273 	}
2274 	ASSERT((flags & ~(PG_EXCL | PG_WAIT |
2275 	    PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == 0);
2276 	    /* but no others */
2277 
2278 	pages_req = npages = btopr(bytes);
2279 	/*
2280 	 * Try to see whether the request is too large to *ever* be
2281 	 * satisfied, in order to prevent deadlock.  We arbitrarily
2282 	 * decide to limit maximum size requests to max_page_get.
2283 	 */
2284 	if (npages >= max_page_get) {
2285 		if ((flags & PG_WAIT) == 0) {
2286 			TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_TOOBIG,
2287 			    "page_create_toobig:vp %p off %llx npages "
2288 			    "%lu max_page_get %lu",
2289 			    vp, off, npages, max_page_get);
2290 			return (NULL);
2291 		} else {
2292 			cmn_err(CE_WARN,
2293 			    "Request for too much kernel memory "
2294 			    "(%lu bytes), will hang forever", bytes);
2295 			for (;;)
2296 				delay(1000000000);
2297 		}
2298 	}
2299 
2300 	if (!kcage_on || panicstr) {
2301 		/*
2302 		 * Cage is OFF, or we are single threaded in
2303 		 * panic, so make everything a RELOC request.
2304 		 */
2305 		flags &= ~PG_NORELOC;
2306 	}
2307 
2308 	if (freemem <= throttlefree + npages)
2309 		if (!page_create_throttle(npages, flags))
2310 			return (NULL);
2311 
2312 	/*
2313 	 * If cage is on, dampen draw from cage when available
2314 	 * cage space is low.
2315 	 */
2316 	if ((flags & PG_NORELOC) &&
2317 	    kcage_freemem < kcage_throttlefree + npages) {
2318 
2319 		/*
2320 		 * The cage is on, the caller wants PG_NORELOC
2321 		 * pages and available cage memory is very low.
2322 		 * Call kcage_create_throttle() to attempt to
2323 		 * control demand on the cage.
2324 		 */
2325 		if (kcage_create_throttle(npages, flags) == KCT_FAILURE)
2326 			return (NULL);
2327 	}
2328 
2329 	VM_STAT_ADD(page_create_cnt[0]);
2330 
2331 	if (!pcf_decrement_bucket(npages)) {
2332 		/*
2333 		 * Have to look harder.  If npages is greater than
2334 		 * one, then we might have to coalesce the counters.
2335 		 *
2336 		 * Go wait.  We come back having accounted
2337 		 * for the memory.
2338 		 */
2339 		VM_STAT_ADD(page_create_cnt[1]);
2340 		if (!page_create_wait(npages, flags)) {
2341 			VM_STAT_ADD(page_create_cnt[2]);
2342 			return (NULL);
2343 		}
2344 	}
2345 
2346 	TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
2347 	    "page_create_success:vp %p off %llx", vp, off);
2348 
2349 	/*
2350 	 * If satisfying this request has left us with too little
2351 	 * memory, start the wheels turning to get some back.  The
2352 	 * first clause of the test prevents waking up the pageout
2353 	 * daemon in situations where it would decide that there's
2354 	 * nothing to do.
2355 	 */
2356 	if (nscan < desscan && freemem < minfree) {
2357 		TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
2358 		    "pageout_cv_signal:freemem %ld", freemem);
2359 		cv_signal(&proc_pageout->p_cv);
2360 	}
2361 
2362 	/*
2363 	 * Loop around collecting the requested number of pages.
2364 	 * Most of the time, we have to `create' a new page. With
2365 	 * this in mind, pull the page off the free list before
2366 	 * getting the hash lock.  This will minimize the hash
2367 	 * lock hold time, nesting, and the like.  If it turns
2368 	 * out we don't need the page, we put it back at the end.
2369 	 */
2370 	while (npages--) {
2371 		page_t		*pp;
2372 		kmutex_t	*phm = NULL;
2373 		ulong_t		index;
2374 
2375 		index = PAGE_HASH_FUNC(vp, off);
2376 top:
2377 		ASSERT(phm == NULL);
2378 		ASSERT(index == PAGE_HASH_FUNC(vp, off));
2379 		ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
2380 
2381 		if (npp == NULL) {
2382 			/*
2383 			 * Try to get a page from the freelist (ie,
2384 			 * a page with no [vp, off] tag).  If that
2385 			 * fails, use the cachelist.
2386 			 *
2387 			 * During the first attempt at both the free
2388 			 * and cache lists we try for the correct color.
2389 			 */
2390 			/*
2391 			 * XXXX-how do we deal with virtually indexed
2392 			 * caches and colors?
2393 			 */
2394 			VM_STAT_ADD(page_create_cnt[4]);
2395 			/*
2396 			 * Get lgroup to allocate next page of shared memory
2397 			 * from and use it to specify where to allocate
2398 			 * the physical memory
2399 			 */
2400 			lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE);
2401 			npp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE,
2402 			    flags | PG_MATCH_COLOR, lgrp);
2403 			if (npp == NULL) {
2404 				npp = page_get_cachelist(vp, off, seg,
2405 				    vaddr, flags | PG_MATCH_COLOR, lgrp);
2406 				if (npp == NULL) {
2407 					npp = page_create_get_something(vp,
2408 					    off, seg, vaddr,
2409 					    flags & ~PG_MATCH_COLOR);
2410 				}
2411 
2412 				if (PP_ISAGED(npp) == 0) {
2413 					/*
2414 					 * Since this page came from the
2415 					 * cachelist, we must destroy the
2416 					 * old vnode association.
2417 					 */
2418 					page_hashout(npp, NULL);
2419 				}
2420 			}
2421 		}
2422 
2423 		/*
2424 		 * We own this page!
2425 		 */
2426 		ASSERT(PAGE_EXCL(npp));
2427 		ASSERT(npp->p_vnode == NULL);
2428 		ASSERT(!hat_page_is_mapped(npp));
2429 		PP_CLRFREE(npp);
2430 		PP_CLRAGED(npp);
2431 
2432 		/*
2433 		 * Here we have a page in our hot little mitts and are
2434 		 * just waiting to stuff it on the appropriate lists.
2435 		 * Get the mutex and check to see if it really does
2436 		 * not exist.
2437 		 */
2438 		phm = PAGE_HASH_MUTEX(index);
2439 		mutex_enter(phm);
2440 		PAGE_HASH_SEARCH(index, pp, vp, off);
2441 		if (pp == NULL) {
2442 			VM_STAT_ADD(page_create_new);
2443 			pp = npp;
2444 			npp = NULL;
2445 			if (!page_hashin(pp, vp, off, phm)) {
2446 				/*
2447 				 * Since we hold the page hash mutex and
2448 				 * just searched for this page, page_hashin
2449 				 * had better not fail.  If it does, that
2450 				 * means some thread did not follow the
2451 				 * page hash mutex rules.  Panic now and
2452 				 * get it over with.  As usual, go down
2453 				 * holding all the locks.
2454 				 */
2455 				ASSERT(MUTEX_HELD(phm));
2456 				panic("page_create: "
2457 				    "hashin failed %p %p %llx %p",
2458 				    (void *)pp, (void *)vp, off, (void *)phm);
2459 				/*NOTREACHED*/
2460 			}
2461 			ASSERT(MUTEX_HELD(phm));
2462 			mutex_exit(phm);
2463 			phm = NULL;
2464 
2465 			/*
2466 			 * Hat layer locking need not be done to set
2467 			 * the following bits since the page is not hashed
2468 			 * and was on the free list (i.e., had no mappings).
2469 			 *
2470 			 * Set the reference bit to protect
2471 			 * against immediate pageout
2472 			 *
2473 			 * XXXmh modify freelist code to set reference
2474 			 * bit so we don't have to do it here.
2475 			 */
2476 			page_set_props(pp, P_REF);
2477 			found_on_free++;
2478 		} else {
2479 			VM_STAT_ADD(page_create_exists);
2480 			if (flags & PG_EXCL) {
2481 				/*
2482 				 * Found an existing page, and the caller
2483 				 * wanted all new pages.  Undo all of the work
2484 				 * we have done.
2485 				 */
2486 				mutex_exit(phm);
2487 				phm = NULL;
2488 				while (plist != NULL) {
2489 					pp = plist;
2490 					page_sub(&plist, pp);
2491 					page_io_unlock(pp);
2492 					/* large pages should not end up here */
2493 					ASSERT(pp->p_szc == 0);
2494 					/*LINTED: constant in conditional ctx*/
2495 					VN_DISPOSE(pp, B_INVAL, 0, kcred);
2496 				}
2497 				VM_STAT_ADD(page_create_found_one);
2498 				goto fail;
2499 			}
2500 			ASSERT(flags & PG_WAIT);
2501 			if (!page_lock(pp, SE_EXCL, phm, P_NO_RECLAIM)) {
2502 				/*
2503 				 * Start all over again if we blocked trying
2504 				 * to lock the page.
2505 				 */
2506 				mutex_exit(phm);
2507 				VM_STAT_ADD(page_create_page_lock_failed);
2508 				phm = NULL;
2509 				goto top;
2510 			}
2511 			mutex_exit(phm);
2512 			phm = NULL;
2513 
2514 			if (PP_ISFREE(pp)) {
2515 				ASSERT(PP_ISAGED(pp) == 0);
2516 				VM_STAT_ADD(pagecnt.pc_get_cache);
2517 				page_list_sub(pp, PG_CACHE_LIST);
2518 				PP_CLRFREE(pp);
2519 				found_on_free++;
2520 			}
2521 		}
2522 
2523 		/*
2524 		 * Got a page!  It is locked.  Acquire the i/o
2525 		 * lock since we are going to use the p_next and
2526 		 * p_prev fields to link the requested pages together.
2527 		 */
2528 		page_io_lock(pp);
2529 		page_add(&plist, pp);
2530 		plist = plist->p_next;
2531 		off += PAGESIZE;
2532 		vaddr += PAGESIZE;
2533 	}
2534 
2535 	ASSERT((flags & PG_EXCL) ? (found_on_free == pages_req) : 1);
2536 fail:
2537 	if (npp != NULL) {
2538 		/*
2539 		 * Did not need this page after all.
2540 		 * Put it back on the free list.
2541 		 */
2542 		VM_STAT_ADD(page_create_putbacks);
2543 		PP_SETFREE(npp);
2544 		PP_SETAGED(npp);
2545 		npp->p_offset = (u_offset_t)-1;
2546 		page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
2547 		page_unlock(npp);
2548 
2549 	}
2550 
2551 	ASSERT(pages_req >= found_on_free);
2552 
2553 	{
2554 		uint_t overshoot = (uint_t)(pages_req - found_on_free);
2555 
2556 		if (overshoot) {
2557 			VM_STAT_ADD(page_create_overshoot);
2558 			p = &pcf[PCF_INDEX()];
2559 			mutex_enter(&p->pcf_lock);
2560 			if (p->pcf_block) {
2561 				p->pcf_reserve += overshoot;
2562 			} else {
2563 				p->pcf_count += overshoot;
2564 				if (p->pcf_wait) {
2565 					mutex_enter(&new_freemem_lock);
2566 					if (freemem_wait) {
2567 						cv_signal(&freemem_cv);
2568 						p->pcf_wait--;
2569 					} else {
2570 						p->pcf_wait = 0;
2571 					}
2572 					mutex_exit(&new_freemem_lock);
2573 				}
2574 			}
2575 			mutex_exit(&p->pcf_lock);
2576 			/* freemem is approximate, so this test OK */
2577 			if (!p->pcf_block)
2578 				freemem += overshoot;
2579 		}
2580 	}
2581 
2582 	return (plist);
2583 }
2584 
2585 /*
2586  * One or more constituent pages of this large page has been marked
2587  * toxic. Simply demote the large page to PAGESIZE pages and let
2588  * page_free() handle it. This routine should only be called by
2589  * large page free routines (page_free_pages() and page_destroy_pages()).
2590  * All pages are locked SE_EXCL and have already been marked free.
2591  */
2592 static void
2593 page_free_toxic_pages(page_t *rootpp)
2594 {
2595 	page_t	*tpp;
2596 	pgcnt_t	i, pgcnt = page_get_pagecnt(rootpp->p_szc);
2597 	uint_t	szc = rootpp->p_szc;
2598 
2599 	for (i = 0, tpp = rootpp; i < pgcnt; i++, tpp = tpp->p_next) {
2600 		ASSERT(tpp->p_szc == szc);
2601 		ASSERT((PAGE_EXCL(tpp) &&
2602 		    !page_iolock_assert(tpp)) || panicstr);
2603 		tpp->p_szc = 0;
2604 	}
2605 
2606 	while (rootpp != NULL) {
2607 		tpp = rootpp;
2608 		page_sub(&rootpp, tpp);
2609 		ASSERT(PP_ISFREE(tpp));
2610 		PP_CLRFREE(tpp);
2611 		page_free(tpp, 1);
2612 	}
2613 }
2614 
2615 /*
2616  * Put page on the "free" list.
2617  * The free list is really two lists maintained by
2618  * the PSM of whatever machine we happen to be on.
2619  */
2620 void
2621 page_free(page_t *pp, int dontneed)
2622 {
2623 	struct pcf	*p;
2624 	uint_t		pcf_index;
2625 
2626 	ASSERT((PAGE_EXCL(pp) &&
2627 	    !page_iolock_assert(pp)) || panicstr);
2628 
2629 	if (PP_ISFREE(pp)) {
2630 		panic("page_free: page %p is free", (void *)pp);
2631 	}
2632 
2633 	if (pp->p_szc != 0) {
2634 		if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) ||
2635 		    PP_ISKAS(pp)) {
2636 			panic("page_free: anon or kernel "
2637 			    "or no vnode large page %p", (void *)pp);
2638 		}
2639 		page_demote_vp_pages(pp);
2640 		ASSERT(pp->p_szc == 0);
2641 	}
2642 
2643 	/*
2644 	 * The page_struct_lock need not be acquired to examine these
2645 	 * fields since the page has an "exclusive" lock.
2646 	 */
2647 	if (hat_page_is_mapped(pp) || pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
2648 	    pp->p_slckcnt != 0) {
2649 		panic("page_free pp=%p, pfn=%lx, lckcnt=%d, cowcnt=%d "
2650 		    "slckcnt = %d", pp, page_pptonum(pp), pp->p_lckcnt,
2651 		    pp->p_cowcnt, pp->p_slckcnt);
2652 		/*NOTREACHED*/
2653 	}
2654 
2655 	ASSERT(!hat_page_getshare(pp));
2656 
2657 	PP_SETFREE(pp);
2658 	ASSERT(pp->p_vnode == NULL || !IS_VMODSORT(pp->p_vnode) ||
2659 	    !hat_ismod(pp));
2660 	page_clr_all_props(pp);
2661 	ASSERT(!hat_page_getshare(pp));
2662 
2663 	/*
2664 	 * Now we add the page to the head of the free list.
2665 	 * But if this page is associated with a paged vnode
2666 	 * then we adjust the head forward so that the page is
2667 	 * effectively at the end of the list.
2668 	 */
2669 	if (pp->p_vnode == NULL) {
2670 		/*
2671 		 * Page has no identity, put it on the free list.
2672 		 */
2673 		PP_SETAGED(pp);
2674 		pp->p_offset = (u_offset_t)-1;
2675 		page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
2676 		VM_STAT_ADD(pagecnt.pc_free_free);
2677 		TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE,
2678 		    "page_free_free:pp %p", pp);
2679 	} else {
2680 		PP_CLRAGED(pp);
2681 
2682 		if (!dontneed || nopageage) {
2683 			/* move it to the tail of the list */
2684 			page_list_add(pp, PG_CACHE_LIST | PG_LIST_TAIL);
2685 
2686 			VM_STAT_ADD(pagecnt.pc_free_cache);
2687 			TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_TAIL,
2688 			    "page_free_cache_tail:pp %p", pp);
2689 		} else {
2690 			page_list_add(pp, PG_CACHE_LIST | PG_LIST_HEAD);
2691 
2692 			VM_STAT_ADD(pagecnt.pc_free_dontneed);
2693 			TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_HEAD,
2694 			    "page_free_cache_head:pp %p", pp);
2695 		}
2696 	}
2697 	page_unlock(pp);
2698 
2699 	/*
2700 	 * Now do the `freemem' accounting.
2701 	 */
2702 	pcf_index = PCF_INDEX();
2703 	p = &pcf[pcf_index];
2704 
2705 	mutex_enter(&p->pcf_lock);
2706 	if (p->pcf_block) {
2707 		p->pcf_reserve += 1;
2708 	} else {
2709 		p->pcf_count += 1;
2710 		if (p->pcf_wait) {
2711 			mutex_enter(&new_freemem_lock);
2712 			/*
2713 			 * Check to see if some other thread
2714 			 * is actually waiting.  Another bucket
2715 			 * may have woken it up by now.  If there
2716 			 * are no waiters, then set our pcf_wait
2717 			 * count to zero to avoid coming in here
2718 			 * next time.  Also, since only one page
2719 			 * was put on the free list, just wake
2720 			 * up one waiter.
2721 			 */
2722 			if (freemem_wait) {
2723 				cv_signal(&freemem_cv);
2724 				p->pcf_wait--;
2725 			} else {
2726 				p->pcf_wait = 0;
2727 			}
2728 			mutex_exit(&new_freemem_lock);
2729 		}
2730 	}
2731 	mutex_exit(&p->pcf_lock);
2732 
2733 	/* freemem is approximate, so this test OK */
2734 	if (!p->pcf_block)
2735 		freemem += 1;
2736 }
2737 
2738 /*
2739  * Put page on the "free" list during initial startup.
2740  * This happens during initial single threaded execution.
2741  */
2742 void
2743 page_free_at_startup(page_t *pp)
2744 {
2745 	struct pcf	*p;
2746 	uint_t		pcf_index;
2747 
2748 	page_list_add(pp, PG_FREE_LIST | PG_LIST_HEAD | PG_LIST_ISINIT);
2749 	VM_STAT_ADD(pagecnt.pc_free_free);
2750 
2751 	/*
2752 	 * Now do the `freemem' accounting.
2753 	 */
2754 	pcf_index = PCF_INDEX();
2755 	p = &pcf[pcf_index];
2756 
2757 	ASSERT(p->pcf_block == 0);
2758 	ASSERT(p->pcf_wait == 0);
2759 	p->pcf_count += 1;
2760 
2761 	/* freemem is approximate, so this is OK */
2762 	freemem += 1;
2763 }
2764 
2765 void
2766 page_free_pages(page_t *pp)
2767 {
2768 	page_t	*tpp, *rootpp = NULL;
2769 	pgcnt_t	pgcnt = page_get_pagecnt(pp->p_szc);
2770 	pgcnt_t	i;
2771 	uint_t	szc = pp->p_szc;
2772 
2773 	VM_STAT_ADD(pagecnt.pc_free_pages);
2774 	TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE,
2775 	    "page_free_free:pp %p", pp);
2776 
2777 	ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes());
2778 	if ((page_pptonum(pp) & (pgcnt - 1)) != 0) {
2779 		panic("page_free_pages: not root page %p", (void *)pp);
2780 		/*NOTREACHED*/
2781 	}
2782 
2783 	for (i = 0, tpp = pp; i < pgcnt; i++, tpp++) {
2784 		ASSERT((PAGE_EXCL(tpp) &&
2785 		    !page_iolock_assert(tpp)) || panicstr);
2786 		if (PP_ISFREE(tpp)) {
2787 			panic("page_free_pages: page %p is free", (void *)tpp);
2788 			/*NOTREACHED*/
2789 		}
2790 		if (hat_page_is_mapped(tpp) || tpp->p_lckcnt != 0 ||
2791 		    tpp->p_cowcnt != 0 || tpp->p_slckcnt != 0) {
2792 			panic("page_free_pages %p", (void *)tpp);
2793 			/*NOTREACHED*/
2794 		}
2795 
2796 		ASSERT(!hat_page_getshare(tpp));
2797 		ASSERT(tpp->p_vnode == NULL);
2798 		ASSERT(tpp->p_szc == szc);
2799 
2800 		PP_SETFREE(tpp);
2801 		page_clr_all_props(tpp);
2802 		PP_SETAGED(tpp);
2803 		tpp->p_offset = (u_offset_t)-1;
2804 		ASSERT(tpp->p_next == tpp);
2805 		ASSERT(tpp->p_prev == tpp);
2806 		page_list_concat(&rootpp, &tpp);
2807 	}
2808 	ASSERT(rootpp == pp);
2809 
2810 	page_list_add_pages(rootpp, 0);
2811 	page_create_putback(pgcnt);
2812 }
2813 
2814 int free_pages = 1;
2815 
2816 /*
2817  * This routine attempts to return pages to the cachelist via page_release().
2818  * It does not *have* to be successful in all cases, since the pageout scanner
2819  * will catch any pages it misses.  It does need to be fast and not introduce
2820  * too much overhead.
2821  *
2822  * If a page isn't found on the unlocked sweep of the page_hash bucket, we
2823  * don't lock and retry.  This is ok, since the page scanner will eventually
2824  * find any page we miss in free_vp_pages().
2825  */
2826 void
2827 free_vp_pages(vnode_t *vp, u_offset_t off, size_t len)
2828 {
2829 	page_t *pp;
2830 	u_offset_t eoff;
2831 	extern int swap_in_range(vnode_t *, u_offset_t, size_t);
2832 
2833 	eoff = off + len;
2834 
2835 	if (free_pages == 0)
2836 		return;
2837 	if (swap_in_range(vp, off, len))
2838 		return;
2839 
2840 	for (; off < eoff; off += PAGESIZE) {
2841 
2842 		/*
2843 		 * find the page using a fast, but inexact search. It'll be OK
2844 		 * if a few pages slip through the cracks here.
2845 		 */
2846 		pp = page_exists(vp, off);
2847 
2848 		/*
2849 		 * If we didn't find the page (it may not exist), if it is
2850 		 * free, if it still looks to be in use (shared), or if we
2851 		 * can't lock it, just give up.
2852 		 */
2853 		if (pp == NULL ||
2854 		    PP_ISFREE(pp) ||
2855 		    page_share_cnt(pp) > 0 ||
2856 		    !page_trylock(pp, SE_EXCL))
2857 			continue;
2858 
2859 		/*
2860 		 * Once we have locked pp, verify that it's still the
2861 		 * correct page and not already free.
2862 		 */
2863 		ASSERT(PAGE_LOCKED_SE(pp, SE_EXCL));
2864 		if (pp->p_vnode != vp || pp->p_offset != off || PP_ISFREE(pp)) {
2865 			page_unlock(pp);
2866 			continue;
2867 		}
2868 
2869 		/*
2870 		 * try to release the page...
2871 		 */
2872 		(void) page_release(pp, 1);
2873 	}
2874 }
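
/*
 * Illustrative sketch (hypothetical caller, not from the original
 * source): a file system that has finished copying out a range it
 * does not expect to be reused could hand those pages back:
 *
 *	free_vp_pages(vp, off, len);
 *
 * Any page the unlocked search misses is simply left for the pageout
 * scanner, per the comment above.
 */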
2875 
2876 /*
2877  * Reclaim the given page from the free list.
2878  * If pp is part of a large page, only the given constituent page is reclaimed
2879  * and the large page it belonged to will be demoted.  This can only happen
2880  * if the page is not on the cachelist.
2881  *
2882  * Returns 1 on success or 0 on failure.
2883  *
2884  * The page is unlocked if it can't be reclaimed (when freemem == 0).
2885  * If `lock' is non-null, it will be dropped and re-acquired if
2886  * the routine must wait while freemem is 0.
2887  *
2888  * As it turns out, boot_getpages() does this.  It picks a page,
2889  * based on where OBP mapped in some address, gets its pfn, searches
2890  * the memsegs, locks the page, then pulls it off the free list!
2891  */
2892 int
2893 page_reclaim(page_t *pp, kmutex_t *lock)
2894 {
2895 	struct pcf	*p;
2896 	struct cpu	*cpup;
2897 	int		enough;
2898 	uint_t		i;
2899 
2900 	ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);
2901 	ASSERT(PAGE_EXCL(pp) && PP_ISFREE(pp));
2902 
2903 	/*
2904 	 * If `freemem' is 0, we cannot reclaim this page from the
2905 	 * freelist, so release every lock we might hold: the page,
2906 	 * and the `lock' before blocking.
2907 	 *
2908 	 * The only way `freemem' can become 0 while there are pages
2909 	 * marked free (have their p->p_free bit set) is when the
2910 	 * system is low on memory and doing a page_create().  Once
2911 	 * page_create() starts acquiring pages, it must be able to get
2912 	 * all that it needs, since `freemem' was already decreased by
2913 	 * the requested amount.  So, we need to release this page, and
2914 	 * let page_create() have it.
2915 	 *
2916 	 * Since `freemem' being zero is not supposed to happen, just
2917 	 * use the usual hash stuff as a starting point.  If that bucket
2918 	 * is empty, then assume the worst, and start at the beginning
2919 	 * of the pcf array.  If we always start at the beginning
2920 	 * when acquiring more than one pcf lock, there won't be any
2921 	 * deadlock problems.
2922 	 */
2923 
2924 	/* TODO: Do we need to test kcage_freemem if PG_NORELOC(pp)? */
2925 
2926 	if (freemem <= throttlefree && !page_create_throttle(1l, 0)) {
2927 		pcf_acquire_all();
2928 		goto page_reclaim_nomem;
2929 	}
2930 
2931 	enough = pcf_decrement_bucket(1);
2932 
2933 	if (!enough) {
2934 		VM_STAT_ADD(page_reclaim_zero);
2935 		/*
2936 		 * Check again.  It's possible that some other thread
2937 		 * could have been right behind us, and added one
2938 		 * to a list somewhere.  Acquire each of the pcf locks
2939 		 * until we find a page.
2940 		 */
2941 		p = pcf;
2942 		for (i = 0; i < pcf_fanout; i++) {
2943 			mutex_enter(&p->pcf_lock);
2944 			if (p->pcf_count >= 1) {
2945 				p->pcf_count -= 1;
2946 				enough = 1;
2947 				break;
2948 			}
2949 			p++;
2950 		}
2951 
2952 		if (!enough) {
2953 page_reclaim_nomem:
2954 			/*
2955 			 * We really can't have page `pp'.
2956 			 * Time for the no-memory dance with
2957 			 * page_free().  This is just like
2958 			 * page_create_wait().  Plus the added
2959 			 * attraction of releasing whatever mutex
2960 			 * we were called with in `lock'.
2961 			 * Page_unlock() will wake up any thread
2962 			 * waiting around for this page.
2963 			 */
2964 			if (lock) {
2965 				VM_STAT_ADD(page_reclaim_zero_locked);
2966 				mutex_exit(lock);
2967 			}
2968 			page_unlock(pp);
2969 
2970 			/*
2971 			 * get this before we drop all the pcf locks.
2972 			 */
2973 			mutex_enter(&new_freemem_lock);
2974 
2975 			p = pcf;
2976 			for (i = 0; i < pcf_fanout; i++) {
2977 				p->pcf_wait++;
2978 				mutex_exit(&p->pcf_lock);
2979 				p++;
2980 			}
2981 
2982 			freemem_wait++;
2983 			cv_wait(&freemem_cv, &new_freemem_lock);
2984 			freemem_wait--;
2985 
2986 			mutex_exit(&new_freemem_lock);
2987 
2988 			if (lock) {
2989 				mutex_enter(lock);
2990 			}
2991 			return (0);
2992 		}
2993 
2994 		/*
2995 		 * The pcf accounting has been done and none of the
2996 		 * pcf_wait flags were set, so drop the locks and
2997 		 * continue on.
2998 		 */
2999 		while (p >= pcf) {
3000 			mutex_exit(&p->pcf_lock);
3001 			p--;
3002 		}
3003 	}
3004 
3005 	/*
3006 	 * freemem is not protected by any lock. Thus, we cannot
3007 	 * have any assertion containing freemem here.
3008 	 */
3009 	freemem -= 1;
3010 
3011 	VM_STAT_ADD(pagecnt.pc_reclaim);
3012 
3013 	/*
3014 	 * page_list_sub will handle the case where pp is a large page.
3015 	 * It's possible that the page was promoted while on the freelist.
3016 	 */
3017 	if (PP_ISAGED(pp)) {
3018 		page_list_sub(pp, PG_FREE_LIST);
3019 		TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_FREE,
3020 		    "page_reclaim_free:pp %p", pp);
3021 	} else {
3022 		page_list_sub(pp, PG_CACHE_LIST);
3023 		TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_CACHE,
3024 		    "page_reclaim_cache:pp %p", pp);
3025 	}
3026 
3027 	/*
3028 	 * clear the p_free & p_age bits since this page is no longer
3029 	 * on the free list.  Notice that there was a brief time where
3030 	 * a page is marked as free, but is not on the list.
3031 	 *
3032 	 * Set the reference bit to protect against immediate pageout.
3033 	 */
3034 	PP_CLRFREE(pp);
3035 	PP_CLRAGED(pp);
3036 	page_set_props(pp, P_REF);
3037 
3038 	CPU_STATS_ENTER_K();
3039 	cpup = CPU;	/* get cpup now that CPU cannot change */
3040 	CPU_STATS_ADDQ(cpup, vm, pgrec, 1);
3041 	CPU_STATS_ADDQ(cpup, vm, pgfrec, 1);
3042 	CPU_STATS_EXIT_K();
3043 	ASSERT(pp->p_szc == 0);
3044 
3045 	return (1);
3046 }
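
/*
 * Illustrative sketch (hypothetical caller, not from the original
 * source): the usual pattern is to find a page, lock it, and, if it
 * turns out to be free, pull it off the free list.
 *
 *	if (page_trylock(pp, SE_EXCL)) {
 *		if (!PP_ISFREE(pp))
 *			page_unlock(pp);
 *		else if (page_reclaim(pp, NULL)) {
 *			... pp is SE_EXCL locked and off the free list ...
 *		}
 *		... on page_reclaim() failure the page lock was dropped ...
 *	}
 */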
3047 
3048 /*
3049  * Destroy identity of the page and put it back on
3050  * the page free list.  Assumes that the caller has
3051  * acquired the "exclusive" lock on the page.
3052  */
3053 void
3054 page_destroy(page_t *pp, int dontfree)
3055 {
3056 	ASSERT((PAGE_EXCL(pp) &&
3057 	    !page_iolock_assert(pp)) || panicstr);
3058 	ASSERT(pp->p_slckcnt == 0 || panicstr);
3059 
3060 	if (pp->p_szc != 0) {
3061 		if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) ||
3062 		    PP_ISKAS(pp)) {
3063 			panic("page_destroy: anon or kernel or no vnode "
3064 			    "large page %p", (void *)pp);
3065 		}
3066 		page_demote_vp_pages(pp);
3067 		ASSERT(pp->p_szc == 0);
3068 	}
3069 
3070 	TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy:pp %p", pp);
3071 
3072 	/*
3073 	 * Unload translations, if any, then hash out the
3074 	 * page to erase its identity.
3075 	 */
3076 	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
3077 	page_hashout(pp, NULL);
3078 
3079 	if (!dontfree) {
3080 		/*
3081 		 * Acquire the "freemem_lock" for availrmem.
3082 		 * The page_struct_lock need not be acquired for lckcnt
3083 		 * and cowcnt since the page has an "exclusive" lock.
3084 		 */
3085 		if ((pp->p_lckcnt != 0) || (pp->p_cowcnt != 0)) {
3086 			mutex_enter(&freemem_lock);
3087 			if (pp->p_lckcnt != 0) {
3088 				availrmem++;
3089 				pp->p_lckcnt = 0;
3090 			}
3091 			if (pp->p_cowcnt != 0) {
3092 				availrmem += pp->p_cowcnt;
3093 				pp->p_cowcnt = 0;
3094 			}
3095 			mutex_exit(&freemem_lock);
3096 		}
3097 		/*
3098 		 * Put the page on the "free" list.
3099 		 */
3100 		page_free(pp, 0);
3101 	}
3102 }
3103 
3104 void
3105 page_destroy_pages(page_t *pp)
3106 {
3107 
3108 	page_t	*tpp, *rootpp = NULL;
3109 	pgcnt_t	pgcnt = page_get_pagecnt(pp->p_szc);
3110 	pgcnt_t	i, pglcks = 0;
3111 	uint_t	szc = pp->p_szc;
3112 
3113 	ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes());
3114 
3115 	VM_STAT_ADD(pagecnt.pc_destroy_pages);
3116 
3117 	TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy_pages:pp %p", pp);
3118 
3119 	if ((page_pptonum(pp) & (pgcnt - 1)) != 0) {
3120 		panic("page_destroy_pages: not root page %p", (void *)pp);
3121 		/*NOTREACHED*/
3122 	}
3123 
3124 	for (i = 0, tpp = pp; i < pgcnt; i++, tpp++) {
3125 		ASSERT((PAGE_EXCL(tpp) &&
3126 		    !page_iolock_assert(tpp)) || panicstr);
3127 		ASSERT(tpp->p_slckcnt == 0 || panicstr);
3128 		(void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD);
3129 		page_hashout(tpp, NULL);
3130 		ASSERT(tpp->p_offset == (u_offset_t)-1);
3131 		if (tpp->p_lckcnt != 0) {
3132 			pglcks++;
3133 			tpp->p_lckcnt = 0;
3134 		} else if (tpp->p_cowcnt != 0) {
3135 			pglcks += tpp->p_cowcnt;
3136 			tpp->p_cowcnt = 0;
3137 		}
3138 		ASSERT(!hat_page_getshare(tpp));
3139 		ASSERT(tpp->p_vnode == NULL);
3140 		ASSERT(tpp->p_szc == szc);
3141 
3142 		PP_SETFREE(tpp);
3143 		page_clr_all_props(tpp);
3144 		PP_SETAGED(tpp);
3145 		ASSERT(tpp->p_next == tpp);
3146 		ASSERT(tpp->p_prev == tpp);
3147 		page_list_concat(&rootpp, &tpp);
3148 	}
3149 
3150 	ASSERT(rootpp == pp);
3151 	if (pglcks != 0) {
3152 		mutex_enter(&freemem_lock);
3153 		availrmem += pglcks;
3154 		mutex_exit(&freemem_lock);
3155 	}
3156 
3157 	page_list_add_pages(rootpp, 0);
3158 	page_create_putback(pgcnt);
3159 }
3160 
3161 /*
3162  * Similar to page_destroy(), but destroys pages which are
3163  * locked and known to be on the page free list.  Since
3164  * the page is known to be free and locked, no one can access
3165  * it.
3166  *
3167  * Also, the number of free pages does not change.
3168  */
3169 void
3170 page_destroy_free(page_t *pp)
3171 {
3172 	ASSERT(PAGE_EXCL(pp));
3173 	ASSERT(PP_ISFREE(pp));
3174 	ASSERT(pp->p_vnode);
3175 	ASSERT(hat_page_getattr(pp, P_MOD | P_REF | P_RO) == 0);
3176 	ASSERT(!hat_page_is_mapped(pp));
3177 	ASSERT(PP_ISAGED(pp) == 0);
3178 	ASSERT(pp->p_szc == 0);
3179 
3180 	VM_STAT_ADD(pagecnt.pc_destroy_free);
3181 	page_list_sub(pp, PG_CACHE_LIST);
3182 
3183 	page_hashout(pp, NULL);
3184 	ASSERT(pp->p_vnode == NULL);
3185 	ASSERT(pp->p_offset == (u_offset_t)-1);
3186 	ASSERT(pp->p_hash == NULL);
3187 
3188 	PP_SETAGED(pp);
3189 	page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
3190 	page_unlock(pp);
3191 
3192 	mutex_enter(&new_freemem_lock);
3193 	if (freemem_wait) {
3194 		cv_signal(&freemem_cv);
3195 	}
3196 	mutex_exit(&new_freemem_lock);
3197 }
3198 
3199 /*
3200  * Rename the page "opp" to have an identity specified
3201  * by [vp, off].  If a page already exists with this name
3202  * it is locked and destroyed.  Note that the page's
3203  * translations are not unloaded during the rename.
3204  *
3205  * This routine is used by the anon layer to "steal" the
3206  * original page and is not unlike destroying a page and
3207  * creating a new page using the same page frame.
3208  *
3209  * XXX -- Could deadlock if caller 1 tries to rename A to B while
3210  * caller 2 tries to rename B to A.
3211  */
3212 void
3213 page_rename(page_t *opp, vnode_t *vp, u_offset_t off)
3214 {
3215 	page_t		*pp;
3216 	int		olckcnt = 0;
3217 	int		ocowcnt = 0;
3218 	kmutex_t	*phm;
3219 	ulong_t		index;
3220 
3221 	ASSERT(PAGE_EXCL(opp) && !page_iolock_assert(opp));
3222 	ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
3223 	ASSERT(PP_ISFREE(opp) == 0);
3224 
3225 	VM_STAT_ADD(page_rename_count);
3226 
3227 	TRACE_3(TR_FAC_VM, TR_PAGE_RENAME,
3228 	    "page rename:pp %p vp %p off %llx", opp, vp, off);
3229 
3230 	/*
3231 	 * CacheFS may call page_rename for a large NFS page
3232 	 * when both CacheFS and NFS mount points are used
3233 	 * by applications. Demote this large page before
3234 	 * renaming it, to ensure that there are no "partial"
3235 	 * large pages left lying around.
3236 	 */
3237 	if (opp->p_szc != 0) {
3238 		vnode_t *ovp = opp->p_vnode;
3239 		ASSERT(ovp != NULL);
3240 		ASSERT(!IS_SWAPFSVP(ovp));
3241 		ASSERT(!VN_ISKAS(ovp));
3242 		page_demote_vp_pages(opp);
3243 		ASSERT(opp->p_szc == 0);
3244 	}
3245 
3246 	page_hashout(opp, NULL);
3247 	PP_CLRAGED(opp);
3248 
3249 	/*
3250 	 * Acquire the appropriate page hash lock, since
3251 	 * we're going to rename the page.
3252 	 */
3253 	index = PAGE_HASH_FUNC(vp, off);
3254 	phm = PAGE_HASH_MUTEX(index);
3255 	mutex_enter(phm);
3256 top:
3257 	/*
3258 	 * Look for an existing page with this name and destroy it if found.
3259 	 * By holding the page hash lock all the way to the page_hashin()
3260 	 * call, we are assured that no page can be created with this
3261 	 * identity.  In the case when the phm lock is dropped to undo any
3262 	 * hat layer mappings, the existing page is held with an "exclusive"
3263 	 * lock, again preventing another page from being created with
3264 	 * this identity.
3265 	 */
3266 	PAGE_HASH_SEARCH(index, pp, vp, off);
3267 	if (pp != NULL) {
3268 		VM_STAT_ADD(page_rename_exists);
3269 
3270 		/*
3271 		 * As it turns out, this is one of only two places where
3272 		 * page_lock() needs to hold the passed in lock in the
3273 		 * successful case.  In all of the others, the lock could
3274 		 * be dropped as soon as the attempt is made to lock
3275 		 * the page.  It is tempting to add yet another argument,
3276 		 * PL_KEEP or PL_DROP, to let page_lock know what to do.
3277 		 */
3278 		if (!page_lock(pp, SE_EXCL, phm, P_RECLAIM)) {
3279 			/*
3280 			 * Went to sleep because the page could not
3281 			 * be locked.  We were woken up when the page
3282 			 * was unlocked, or when the page was destroyed.
3283 			 * In either case, `phm' was dropped while we
3284 			 * slept.  Hence we should not just roar through
3285 			 * this loop.
3286 			 */
3287 			goto top;
3288 		}
3289 
3290 		/*
3291 		 * If an existing page is a large page, then demote
3292 		 * it to ensure that no "partial" large pages are
3293 		 * "created" after page_rename. An existing page
3294 		 * can be a CacheFS page, and can't belong to swapfs.
3295 		 */
3296 		if (hat_page_is_mapped(pp)) {
3297 			/*
3298 			 * Unload translations.  Since we hold the
3299 			 * exclusive lock on this page, the page
3300 			 * cannot be changed while we drop phm.
3301 			 * This is also not a lock protocol violation,
3302 			 * but rather the proper way to do things.
3303 			 */
3304 			mutex_exit(phm);
3305 			(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
3306 			if (pp->p_szc != 0) {
3307 				ASSERT(!IS_SWAPFSVP(vp));
3308 				ASSERT(!VN_ISKAS(vp));
3309 				page_demote_vp_pages(pp);
3310 				ASSERT(pp->p_szc == 0);
3311 			}
3312 			mutex_enter(phm);
3313 		} else if (pp->p_szc != 0) {
3314 			ASSERT(!IS_SWAPFSVP(vp));
3315 			ASSERT(!VN_ISKAS(vp));
3316 			mutex_exit(phm);
3317 			page_demote_vp_pages(pp);
3318 			ASSERT(pp->p_szc == 0);
3319 			mutex_enter(phm);
3320 		}
3321 		page_hashout(pp, phm);
3322 	}
3323 	/*
3324 	 * Hash in the page with the new identity.
3325 	 */
3326 	if (!page_hashin(opp, vp, off, phm)) {
3327 		/*
3328 		 * We were holding phm while we searched for [vp, off]
3329 		 * and only dropped phm if we found and locked a page.
3330 		 * If we can't create this page now, then something
3331 		 * is really broken.
3332 		 */
3333 		panic("page_rename: Can't hash in page: %p", (void *)pp);
3334 		/*NOTREACHED*/
3335 	}
3336 
3337 	ASSERT(MUTEX_HELD(phm));
3338 	mutex_exit(phm);
3339 
3340 	/*
3341 	 * Now that we have dropped phm, lets get around to finishing up
3342 	 * with pp.
3343 	 */
3344 	if (pp != NULL) {
3345 		ASSERT(!hat_page_is_mapped(pp));
3346 		/* for now large pages should not end up here */
3347 		ASSERT(pp->p_szc == 0);
3348 		/*
3349 		 * Save the locks for transfer to the new page and then
3350 		 * clear them so page_free doesn't think they're important.
3351 		 * The page_struct_lock need not be acquired for lckcnt and
3352 		 * cowcnt since the page has an "exclusive" lock.
3353 		 */
3354 		olckcnt = pp->p_lckcnt;
3355 		ocowcnt = pp->p_cowcnt;
3356 		pp->p_lckcnt = pp->p_cowcnt = 0;
3357 
3358 		/*
3359 		 * Put the page on the "free" list after we drop
3360 		 * the lock.  The less work under the lock the better.
3361 		 */
3362 		/*LINTED: constant in conditional context*/
3363 		VN_DISPOSE(pp, B_FREE, 0, kcred);
3364 	}
3365 
3366 	/*
3367 	 * Transfer the lock count from the old page (if any).
3368 	 * The page_struct_lock need not be acquired for lckcnt and
3369 	 * cowcnt since the page has an "exclusive" lock.
3370 	 */
3371 	opp->p_lckcnt += olckcnt;
3372 	opp->p_cowcnt += ocowcnt;
3373 }
3374 
3375 /*
3376  * low level routine to add page `pp' to the hash and vp chains for [vp, offset]
3377  *
3378  * Pages are normally inserted at the start of a vnode's v_pages list.
3379  * If the vnode is VMODSORT and the page is modified, it goes at the end.
3380  * This can happen when a modified page is relocated for DR.
3381  *
3382  * Returns 1 on success and 0 on failure.
3383  */
3384 static int
3385 page_do_hashin(page_t *pp, vnode_t *vp, u_offset_t offset)
3386 {
3387 	page_t		**listp;
3388 	page_t		*tp;
3389 	ulong_t		index;
3390 
3391 	ASSERT(PAGE_EXCL(pp));
3392 	ASSERT(vp != NULL);
3393 	ASSERT(MUTEX_HELD(page_vnode_mutex(vp)));
3394 
3395 	/*
3396 	 * Be sure to set these up before the page is inserted on the hash
3397 	 * list.  As soon as the page is placed on the list some other
3398 	 * thread might get confused and wonder how this page could
3399 	 * possibly hash to this list.
3400 	 */
3401 	pp->p_vnode = vp;
3402 	pp->p_offset = offset;
3403 
3404 	/*
3405 	 * record if this page is on a swap vnode
3406 	 */
3407 	if ((vp->v_flag & VISSWAP) != 0)
3408 		PP_SETSWAP(pp);
3409 
3410 	index = PAGE_HASH_FUNC(vp, offset);
3411 	ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(index)));
3412 	listp = &page_hash[index];
3413 
3414 	/*
3415 	 * If this page is already hashed in, fail this attempt to add it.
3416 	 */
3417 	for (tp = *listp; tp != NULL; tp = tp->p_hash) {
3418 		if (tp->p_vnode == vp && tp->p_offset == offset) {
3419 			pp->p_vnode = NULL;
3420 			pp->p_offset = (u_offset_t)(-1);
3421 			return (0);
3422 		}
3423 	}
3424 	pp->p_hash = *listp;
3425 	*listp = pp;
3426 
3427 	/*
3428 	 * Add the page to the vnode's list of pages
3429 	 */
3430 	if (vp->v_pages != NULL && IS_VMODSORT(vp) && hat_ismod(pp))
3431 		listp = &vp->v_pages->p_vpprev->p_vpnext;
3432 	else
3433 		listp = &vp->v_pages;
3434 
3435 	page_vpadd(listp, pp);
3436 
3437 	return (1);
3438 }
3439 
3440 /*
3441  * Add page `pp' to both the hash and vp chains for [vp, offset].
3442  *
3443  * Returns 1 on success and 0 on failure.
3444  * If hold is passed in, it is not dropped.
3445  */
3446 int
3447 page_hashin(page_t *pp, vnode_t *vp, u_offset_t offset, kmutex_t *hold)
3448 {
3449 	kmutex_t	*phm = NULL;
3450 	kmutex_t	*vphm;
3451 	int		rc;
3452 
3453 	ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
3454 
3455 	TRACE_3(TR_FAC_VM, TR_PAGE_HASHIN,
3456 	    "page_hashin:pp %p vp %p offset %llx",
3457 	    pp, vp, offset);
3458 
3459 	VM_STAT_ADD(hashin_count);
3460 
3461 	if (hold != NULL)
3462 		phm = hold;
3463 	else {
3464 		VM_STAT_ADD(hashin_not_held);
3465 		phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, offset));
3466 		mutex_enter(phm);
3467 	}
3468 
3469 	vphm = page_vnode_mutex(vp);
3470 	mutex_enter(vphm);
3471 	rc = page_do_hashin(pp, vp, offset);
3472 	mutex_exit(vphm);
3473 	if (hold == NULL)
3474 		mutex_exit(phm);
3475 	if (rc == 0)
3476 		VM_STAT_ADD(hashin_already);
3477 	return (rc);
3478 }
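
/*
 * Illustrative sketch (hypothetical caller, not from the original
 * source): giving a freshly allocated, SE_EXCL locked page its
 * [vp, off] identity.  Passing hold == NULL lets page_hashin()
 * acquire and drop the hash bucket mutex itself.
 *
 *	if (!page_hashin(pp, vp, off, NULL)) {
 *		... another page already carries [vp, off] ...
 *	}
 */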
3479 
3480 /*
3481  * Remove page ``pp'' from the hash and vp chains and remove vp association.
3482  * All mutexes must be held.
3483  */
3484 static void
3485 page_do_hashout(page_t *pp)
3486 {
3487 	page_t	**hpp;
3488 	page_t	*hp;
3489 	vnode_t	*vp = pp->p_vnode;
3490 
3491 	ASSERT(vp != NULL);
3492 	ASSERT(MUTEX_HELD(page_vnode_mutex(vp)));
3493 
3494 	/*
3495 	 * First, take pp off of its hash chain.
3496 	 */
3497 	hpp = &page_hash[PAGE_HASH_FUNC(vp, pp->p_offset)];
3498 
3499 	for (;;) {
3500 		hp = *hpp;
3501 		if (hp == pp)
3502 			break;
3503 		if (hp == NULL) {
3504 			panic("page_do_hashout");
3505 			/*NOTREACHED*/
3506 		}
3507 		hpp = &hp->p_hash;
3508 	}
3509 	*hpp = pp->p_hash;
3510 
3511 	/*
3512 	 * Now remove it from its associated vnode.
3513 	 */
3514 	if (vp->v_pages)
3515 		page_vpsub(&vp->v_pages, pp);
3516 
3517 	pp->p_hash = NULL;
3518 	page_clr_all_props(pp);
3519 	PP_CLRSWAP(pp);
3520 	pp->p_vnode = NULL;
3521 	pp->p_offset = (u_offset_t)-1;
3522 }
3523 
3524 /*
3525  * Remove page ``pp'' from the hash and vp chains and remove vp association.
3526  *
3527  * When `phm' is non-NULL it contains the address of the mutex protecting the
3528  * hash list pp is on.  It is not dropped.
3529  */
3530 void
3531 page_hashout(page_t *pp, kmutex_t *phm)
3532 {
3533 	vnode_t		*vp;
3534 	ulong_t		index;
3535 	kmutex_t	*nphm;
3536 	kmutex_t	*vphm;
3537 	kmutex_t	*sep;
3538 
3539 	ASSERT(phm != NULL ? MUTEX_HELD(phm) : 1);
3540 	ASSERT(pp->p_vnode != NULL);
3541 	ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr);
3542 	ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(pp->p_vnode)));
3543 
3544 	vp = pp->p_vnode;
3545 
3546 	TRACE_2(TR_FAC_VM, TR_PAGE_HASHOUT,
3547 	    "page_hashout:pp %p vp %p", pp, vp);
3548 
3549 	/* Kernel probe */
3550 	TNF_PROBE_2(page_unmap, "vm pagefault", /* CSTYLED */,
3551 	    tnf_opaque, vnode, vp,
3552 	    tnf_offset, offset, pp->p_offset);
3553 
3554 	/*
3555 	 * Acquire the hash bucket mutex if the caller did not pass it in.
3556 	 */
3557 	VM_STAT_ADD(hashout_count);
3558 	index = PAGE_HASH_FUNC(vp, pp->p_offset);
3559 	if (phm == NULL) {
3560 		VM_STAT_ADD(hashout_not_held);
3561 		nphm = PAGE_HASH_MUTEX(index);
3562 		mutex_enter(nphm);
3563 	}
3564 	ASSERT(phm ? phm == PAGE_HASH_MUTEX(index) : 1);
3565 
3566 
3567 	/*
3568 	 * grab page vnode mutex and remove it...
3569 	 */
3570 	vphm = page_vnode_mutex(vp);
3571 	mutex_enter(vphm);
3572 
3573 	page_do_hashout(pp);
3574 
3575 	mutex_exit(vphm);
3576 	if (phm == NULL)
3577 		mutex_exit(nphm);
3578 
3579 	/*
3580 	 * Wake up processes waiting for this page.  The page's
3581 	 * identity has been changed, and is probably not the
3582 	 * desired page any longer.
3583 	 */
3584 	sep = page_se_mutex(pp);
3585 	mutex_enter(sep);
3586 	pp->p_selock &= ~SE_EWANTED;
3587 	if (CV_HAS_WAITERS(&pp->p_cv))
3588 		cv_broadcast(&pp->p_cv);
3589 	mutex_exit(sep);
3590 }
3591 
3592 /*
3593  * Add the page to the front of a linked list of pages
3594  * using the p_next & p_prev pointers for the list.
3595  * The caller is responsible for protecting the list pointers.
3596  */
3597 void
3598 page_add(page_t **ppp, page_t *pp)
3599 {
3600 	ASSERT(PAGE_EXCL(pp) || (PAGE_SHARED(pp) && page_iolock_assert(pp)));
3601 
3602 	page_add_common(ppp, pp);
3603 }
3604 
3605 
3606 
3607 /*
3608  *  Common code for page_add() and mach_page_add()
3609  */
3610 void
3611 page_add_common(page_t **ppp, page_t *pp)
3612 {
3613 	if (*ppp == NULL) {
3614 		pp->p_next = pp->p_prev = pp;
3615 	} else {
3616 		pp->p_next = *ppp;
3617 		pp->p_prev = (*ppp)->p_prev;
3618 		(*ppp)->p_prev = pp;
3619 		pp->p_prev->p_next = pp;
3620 	}
3621 	*ppp = pp;
3622 }
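
/*
 * Illustrative sketch (not from the original source): these lists are
 * circular and doubly linked, so a single page is its own neighbor
 * and a walk must stop when it wraps back to the head:
 *
 *	page_t *tp = plist;
 *
 *	do {
 *		... visit tp ...
 *		tp = tp->p_next;
 *	} while (tp != plist);
 */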
3623 
3624 
3625 /*
3626  * Remove this page from a linked list of pages
3627  * using the p_next & p_prev pointers for the list.
3628  *
3629  * The caller is responsible for protecting the list pointers.
3630  */
3631 void
3632 page_sub(page_t **ppp, page_t *pp)
3633 {
3634 	ASSERT((PP_ISFREE(pp)) ? 1 :
3635 	    (PAGE_EXCL(pp)) || (PAGE_SHARED(pp) && page_iolock_assert(pp)));
3636 
3637 	if (*ppp == NULL || pp == NULL) {
3638 		panic("page_sub: bad arg(s): pp %p, *ppp %p",
3639 		    (void *)pp, (void *)(*ppp));
3640 		/*NOTREACHED*/
3641 	}
3642 
3643 	page_sub_common(ppp, pp);
3644 }
3645 
3646 
3647 /*
3648  *  Common code for page_sub() and mach_page_sub()
3649  */
3650 void
3651 page_sub_common(page_t **ppp, page_t *pp)
3652 {
3653 	if (*ppp == pp)
3654 		*ppp = pp->p_next;		/* go to next page */
3655 
3656 	if (*ppp == pp)
3657 		*ppp = NULL;			/* page list is gone */
3658 	else {
3659 		pp->p_prev->p_next = pp->p_next;
3660 		pp->p_next->p_prev = pp->p_prev;
3661 	}
3662 	pp->p_prev = pp->p_next = pp;		/* make pp a list of one */
3663 }
3664 
3665 
3666 /*
3667  * Break page list cppp into two lists with npages in the first list.
3668  * The tail is returned in nppp.
3669  */
3670 void
3671 page_list_break(page_t **oppp, page_t **nppp, pgcnt_t npages)
3672 {
3673 	page_t *s1pp = *oppp;
3674 	page_t *s2pp;
3675 	page_t *e1pp, *e2pp;
3676 	long n = 0;
3677 
3678 	if (s1pp == NULL) {
3679 		*nppp = NULL;
3680 		return;
3681 	}
3682 	if (npages == 0) {
3683 		*nppp = s1pp;
3684 		*oppp = NULL;
3685 		return;
3686 	}
3687 	for (n = 0, s2pp = *oppp; n < npages; n++) {
3688 		s2pp = s2pp->p_next;
3689 	}
3690 	/* Fix head and tail of new lists */
3691 	e1pp = s2pp->p_prev;
3692 	e2pp = s1pp->p_prev;
3693 	s1pp->p_prev = e1pp;
3694 	e1pp->p_next = s1pp;
3695 	s2pp->p_prev = e2pp;
3696 	e2pp->p_next = s2pp;
3697 
3698 	/* second list empty */
3699 	if (s2pp == s1pp) {
3700 		*oppp = s1pp;
3701 		*nppp = NULL;
3702 	} else {
3703 		*oppp = s1pp;
3704 		*nppp = s2pp;
3705 	}
3706 }
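
/*
 * Illustrative sketch (not from the original source): breaking two
 * pages off the front of a circular list of four, p1..p4, leaves two
 * circular lists of two.
 *
 *	page_t *head = p1;	(p1 p2 p3 p4)
 *	page_t *tail;
 *
 *	page_list_break(&head, &tail, 2);
 *	... head is (p1 p2), tail is (p3 p4) ...
 */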
3707 
3708 /*
3709  * Concatenate page list nppp onto the end of list ppp.
3710  */
3711 void
3712 page_list_concat(page_t **ppp, page_t **nppp)
3713 {
3714 	page_t *s1pp, *s2pp, *e1pp, *e2pp;
3715 
3716 	if (*nppp == NULL) {
3717 		return;
3718 	}
3719 	if (*ppp == NULL) {
3720 		*ppp = *nppp;
3721 		return;
3722 	}
3723 	s1pp = *ppp;
3724 	e1pp =  s1pp->p_prev;
3725 	s2pp = *nppp;
3726 	e2pp = s2pp->p_prev;
3727 	s1pp->p_prev = e2pp;
3728 	e2pp->p_next = s1pp;
3729 	e1pp->p_next = s2pp;
3730 	s2pp->p_prev = e1pp;
3731 }
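
/*
 * Illustrative sketch (not from the original source): the inverse of
 * the break above; rejoining the two sublists yields the original
 * circular list, with `tail' still pointing at its old head.
 *
 *	page_list_concat(&head, &tail);
 *	... head is (p1 p2 p3 p4) again ...
 */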
3732 
3733 /*
3734  * return the next page in the page list
3735  */
3736 page_t *
3737 page_list_next(page_t *pp)
3738 {
3739 	return (pp->p_next);
3740 }
3741 
3742 
3743 /*
3744  * Add the page to the front of the linked list of pages
3745  * using p_vpnext/p_vpprev pointers for the list.
3746  *
3747  * The caller is responsible for protecting the lists.
3748  */
3749 void
3750 page_vpadd(page_t **ppp, page_t *pp)
3751 {
3752 	if (*ppp == NULL) {
3753 		pp->p_vpnext = pp->p_vpprev = pp;
3754 	} else {
3755 		pp->p_vpnext = *ppp;
3756 		pp->p_vpprev = (*ppp)->p_vpprev;
3757 		(*ppp)->p_vpprev = pp;
3758 		pp->p_vpprev->p_vpnext = pp;
3759 	}
3760 	*ppp = pp;
3761 }
3762 
3763 /*
3764  * Remove this page from the linked list of pages
3765  * using p_vpnext/p_vpprev pointers for the list.
3766  *
3767  * The caller is responsible for protecting the lists.
3768  */
3769 void
3770 page_vpsub(page_t **ppp, page_t *pp)
3771 {
3772 	if (*ppp == NULL || pp == NULL) {
3773 		panic("page_vpsub: bad arg(s): pp %p, *ppp %p",
3774 		    (void *)pp, (void *)(*ppp));
3775 		/*NOTREACHED*/
3776 	}
3777 
3778 	if (*ppp == pp)
3779 		*ppp = pp->p_vpnext;		/* go to next page */
3780 
3781 	if (*ppp == pp)
3782 		*ppp = NULL;			/* page list is gone */
3783 	else {
3784 		pp->p_vpprev->p_vpnext = pp->p_vpnext;
3785 		pp->p_vpnext->p_vpprev = pp->p_vpprev;
3786 	}
3787 	pp->p_vpprev = pp->p_vpnext = pp;	/* make pp a list of one */
3788 }
3789 
3790 /*
3791  * Lock a physical page into memory "long term".  Used to support "lock
3792  * in memory" functions.  Accepts the page to be locked, and a cow variable
3793  * to indicate whether the lock will travel to the new page during
3794  * a potential copy-on-write.
3795  */
3796 int
3797 page_pp_lock(
3798 	page_t *pp,			/* page to be locked */
3799 	int cow,			/* cow lock */
3800 	int kernel)			/* must succeed -- ignore checking */
3801 {
3802 	int r = 0;			/* result -- assume failure */
3803 
3804 	ASSERT(PAGE_LOCKED(pp));
3805 
3806 	page_struct_lock(pp);
3807 	/*
3808 	 * Acquire the "freemem_lock" for availrmem.
3809 	 */
3810 	if (cow) {
3811 		mutex_enter(&freemem_lock);
3812 		if ((availrmem > pages_pp_maximum) &&
3813 		    (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) {
3814 			availrmem--;
3815 			pages_locked++;
3816 			mutex_exit(&freemem_lock);
3817 			r = 1;
3818 			if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
3819 				cmn_err(CE_WARN,
3820 				    "COW lock limit reached on pfn 0x%lx",
3821 				    page_pptonum(pp));
3822 			}
3823 		} else
3824 			mutex_exit(&freemem_lock);
3825 	} else {
3826 		if (pp->p_lckcnt) {
3827 			if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
3828 				r = 1;
3829 				if (++pp->p_lckcnt ==
3830 				    (ushort_t)PAGE_LOCK_MAXIMUM) {
3831 					cmn_err(CE_WARN, "Page lock limit "
3832 					    "reached on pfn 0x%lx",
3833 					    page_pptonum(pp));
3834 				}
3835 			}
3836 		} else {
3837 			if (kernel) {
3838 				/* availrmem accounting done by caller */
3839 				++pp->p_lckcnt;
3840 				r = 1;
3841 			} else {
3842 				mutex_enter(&freemem_lock);
3843 				if (availrmem > pages_pp_maximum) {
3844 					availrmem--;
3845 					pages_locked++;
3846 					++pp->p_lckcnt;
3847 					r = 1;
3848 				}
3849 				mutex_exit(&freemem_lock);
3850 			}
3851 		}
3852 	}
3853 	page_struct_unlock(pp);
3854 	return (r);
3855 }
3856 
3857 /*
3858  * Decommit a lock on a physical page frame.  Account for cow locks if
3859  * appropriate.
3860  */
3861 void
3862 page_pp_unlock(
3863 	page_t *pp,			/* page to be unlocked */
3864 	int cow,			/* expect cow lock */
3865 	int kernel)			/* this was a kernel lock */
3866 {
3867 	ASSERT(PAGE_LOCKED(pp));
3868 
3869 	page_struct_lock(pp);
3870 	/*
3871 	 * Acquire the "freemem_lock" for availrmem.
3872 	 * If cowcnt or lckcnt is already 0, do nothing; i.e., we
3873 	 * could be called to unlock even if nothing is locked. This could
3874 	 * happen if locked file pages were truncated (removing the lock)
3875 	 * and the file was grown again and new pages faulted in; the new
3876 	 * pages are unlocked but the segment still thinks they're locked.
3877 	 */
3878 	if (cow) {
3879 		if (pp->p_cowcnt) {
3880 			mutex_enter(&freemem_lock);
3881 			pp->p_cowcnt--;
3882 			availrmem++;
3883 			pages_locked--;
3884 			mutex_exit(&freemem_lock);
3885 		}
3886 	} else {
3887 		if (pp->p_lckcnt && --pp->p_lckcnt == 0) {
3888 			if (!kernel) {
3889 				mutex_enter(&freemem_lock);
3890 				availrmem++;
3891 				pages_locked--;
3892 				mutex_exit(&freemem_lock);
3893 			}
3894 		}
3895 	}
3896 	page_struct_unlock(pp);
3897 }
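
/*
 * Example (editor's sketch, not part of the original source): the two
 * routines above are used as a pair, with matching cow/kernel flags so
 * that the availrmem accounting stays balanced. A hypothetical caller
 * locking a user page for the duration of an I/O might do:
 *
 *	if (page_pp_lock(pp, 0, 0) == 0)
 *		return (EAGAIN);
 *	... perform the I/O ...
 *	page_pp_unlock(pp, 0, 0);
 *
 * A zero return from page_pp_lock() means either a count reached
 * PAGE_LOCK_MAXIMUM or availrmem dropped to pages_pp_maximum.
 */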
3898 
3899 /*
3900  * This routine reserves availrmem for npages.
3901  * 	flags: KM_NOSLEEP or KM_SLEEP
3902  * 	returns: 1 on success, 0 on failure
3903  */
3904 int
3905 page_resv(pgcnt_t npages, uint_t flags)
3906 {
3907 	mutex_enter(&freemem_lock);
3908 	while (availrmem < tune.t_minarmem + npages) {
3909 		if (flags & KM_NOSLEEP) {
3910 			mutex_exit(&freemem_lock);
3911 			return (0);
3912 		}
3913 		mutex_exit(&freemem_lock);
3914 		page_needfree(npages);
3915 		kmem_reap();
3916 		delay(hz >> 2);
3917 		page_needfree(-(spgcnt_t)npages);
3918 		mutex_enter(&freemem_lock);
3919 	}
3920 	availrmem -= npages;
3921 	mutex_exit(&freemem_lock);
3922 	return (1);
3923 }
3924 
3925 /*
3926  * This routine unreserves availrmem for npages;
3927  */
3928 void
3929 page_unresv(pgcnt_t npages)
3930 {
3931 	mutex_enter(&freemem_lock);
3932 	availrmem += npages;
3933 	mutex_exit(&freemem_lock);
3934 }
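
/*
 * Example (editor's sketch, not part of the original source): a
 * hypothetical caller reserving availrmem up front and releasing the
 * reservation when done:
 *
 *	if (page_resv(npages, KM_NOSLEEP) == 0)
 *		return (ENOMEM);
 *	... consume the reservation ...
 *	page_unresv(npages);
 *
 * With KM_SLEEP the routine instead loops, reaping kmem caches until
 * enough availrmem is free, and always returns 1.
 */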
3935 
3936 /*
3937  * See Statement at the beginning of segvn_lockop() regarding
3938  * the way we handle cowcnts and lckcnts.
3939  *
3940  * Transfer cowcnt on 'opp' to cowcnt on 'npp' if the vpage
3941  * that breaks COW has PROT_WRITE.
3942  *
3943  * Note that we may also break COW when we are softlocking
3944  * on read access during physio;
3945  * in this softlock case, the vpage may not have PROT_WRITE.
3946  * So, we need to transfer lckcnt on 'opp' to lckcnt on 'npp'
3947  * if the vpage doesn't have PROT_WRITE.
3948  *
3949  * This routine is never called if we are stealing a page
3950  * in anon_private.
3951  *
3952  * The caller subtracted from availrmem for a read-only mapping;
3953  * if lckcnt is 1, increment availrmem.
3954  */
3955 void
3956 page_pp_useclaim(
3957 	page_t *opp,		/* original page frame losing lock */
3958 	page_t *npp,		/* new page frame gaining lock */
3959 	uint_t	write_perm) 	/* set if vpage has PROT_WRITE */
3960 {
3961 	int payback = 0;
3962 
3963 	ASSERT(PAGE_LOCKED(opp));
3964 	ASSERT(PAGE_LOCKED(npp));
3965 
3966 	page_struct_lock(opp);
3967 
3968 	ASSERT(npp->p_cowcnt == 0);
3969 	ASSERT(npp->p_lckcnt == 0);
3970 
3971 	/* Don't use claim if nothing is locked (see page_pp_unlock above) */
3972 	if ((write_perm && opp->p_cowcnt != 0) ||
3973 	    (!write_perm && opp->p_lckcnt != 0)) {
3974 
3975 		if (write_perm) {
3976 			npp->p_cowcnt++;
3977 			ASSERT(opp->p_cowcnt != 0);
3978 			opp->p_cowcnt--;
3979 		} else {
3980 
3981 			ASSERT(opp->p_lckcnt != 0);
3982 
3983 			/*
3984 			 * We didn't need availrmem decremented if p_lckcnt on
3985 			 * original page is 1. Here, we are unlocking
3986 			 * read-only copy belonging to original page and
3987 			 * are locking a copy belonging to new page.
3988 			 */
3989 			if (opp->p_lckcnt == 1)
3990 				payback = 1;
3991 
3992 			npp->p_lckcnt++;
3993 			opp->p_lckcnt--;
3994 		}
3995 	}
3996 	if (payback) {
3997 		mutex_enter(&freemem_lock);
3998 		availrmem++;
3999 		pages_useclaim--;
4000 		mutex_exit(&freemem_lock);
4001 	}
4002 	page_struct_unlock(opp);
4003 }
4004 
4005 /*
4006  * Simple claim adjust functions -- used to support changes in
4007  * claims due to changes in access permissions.  Used by segvn_setprot().
4008  */
4009 int
4010 page_addclaim(page_t *pp)
4011 {
4012 	int r = 0;			/* result */
4013 
4014 	ASSERT(PAGE_LOCKED(pp));
4015 
4016 	page_struct_lock(pp);
4017 	ASSERT(pp->p_lckcnt != 0);
4018 
4019 	if (pp->p_lckcnt == 1) {
4020 		if (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
4021 			--pp->p_lckcnt;
4022 			r = 1;
4023 			if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4024 				cmn_err(CE_WARN,
4025 				    "COW lock limit reached on pfn 0x%lx",
4026 				    page_pptonum(pp));
4027 			}
4028 		}
4029 	} else {
4030 		mutex_enter(&freemem_lock);
4031 		if ((availrmem > pages_pp_maximum) &&
4032 		    (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) {
4033 			--availrmem;
4034 			++pages_claimed;
4035 			mutex_exit(&freemem_lock);
4036 			--pp->p_lckcnt;
4037 			r = 1;
4038 			if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4039 				cmn_err(CE_WARN,
4040 				    "COW lock limit reached on pfn 0x%lx",
4041 				    page_pptonum(pp));
4042 			}
4043 		} else
4044 			mutex_exit(&freemem_lock);
4045 	}
4046 	page_struct_unlock(pp);
4047 	return (r);
4048 }
4049 
4050 int
4051 page_subclaim(page_t *pp)
4052 {
4053 	int r = 0;
4054 
4055 	ASSERT(PAGE_LOCKED(pp));
4056 
4057 	page_struct_lock(pp);
4058 	ASSERT(pp->p_cowcnt != 0);
4059 
4060 	if (pp->p_lckcnt) {
4061 		if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
4062 			r = 1;
4063 			/*
4064 			 * for availrmem
4065 			 */
4066 			mutex_enter(&freemem_lock);
4067 			availrmem++;
4068 			pages_claimed--;
4069 			mutex_exit(&freemem_lock);
4070 
4071 			pp->p_cowcnt--;
4072 
4073 			if (++pp->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4074 				cmn_err(CE_WARN,
4075 				    "Page lock limit reached on pfn 0x%lx",
4076 				    page_pptonum(pp));
4077 			}
4078 		}
4079 	} else {
4080 		r = 1;
4081 		pp->p_cowcnt--;
4082 		pp->p_lckcnt++;
4083 	}
4084 	page_struct_unlock(pp);
4085 	return (r);
4086 }
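
/*
 * Example (editor's sketch, not part of the original source): a
 * hypothetical segment driver changing the protection of a locked
 * page converts between the two claim types:
 *
 *	upgrading a mapping to PROT_WRITE:
 *		if (page_addclaim(pp) == 0)
 *			return (EAGAIN);
 *
 *	downgrading a mapping from PROT_WRITE:
 *		if (page_subclaim(pp) == 0)
 *			return (EAGAIN);
 *
 * A zero return means the claim could not be transferred, e.g. because
 * a count hit PAGE_LOCK_MAXIMUM or availrmem is at pages_pp_maximum.
 */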
4087 
4088 int
4089 page_addclaim_pages(page_t  **ppa)
4090 {
4091 
4092 	pgcnt_t	lckpgs = 0, pg_idx;
4093 
4094 	VM_STAT_ADD(pagecnt.pc_addclaim_pages);
4095 
4096 	mutex_enter(&page_llock);
4097 	for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4098 
4099 		ASSERT(PAGE_LOCKED(ppa[pg_idx]));
4100 		ASSERT(ppa[pg_idx]->p_lckcnt != 0);
4101 		if (ppa[pg_idx]->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4102 			mutex_exit(&page_llock);
4103 			return (0);
4104 		}
4105 		if (ppa[pg_idx]->p_lckcnt > 1)
4106 			lckpgs++;
4107 	}
4108 
4109 	if (lckpgs != 0) {
4110 		mutex_enter(&freemem_lock);
4111 		if (availrmem >= pages_pp_maximum + lckpgs) {
4112 			availrmem -= lckpgs;
4113 			pages_claimed += lckpgs;
4114 		} else {
4115 			mutex_exit(&freemem_lock);
4116 			mutex_exit(&page_llock);
4117 			return (0);
4118 		}
4119 		mutex_exit(&freemem_lock);
4120 	}
4121 
4122 	for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4123 		ppa[pg_idx]->p_lckcnt--;
4124 		ppa[pg_idx]->p_cowcnt++;
4125 	}
4126 	mutex_exit(&page_llock);
4127 	return (1);
4128 }
4129 
4130 int
4131 page_subclaim_pages(page_t  **ppa)
4132 {
4133 	pgcnt_t	ulckpgs = 0, pg_idx;
4134 
4135 	VM_STAT_ADD(pagecnt.pc_subclaim_pages);
4136 
4137 	mutex_enter(&page_llock);
4138 	for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4139 
4140 		ASSERT(PAGE_LOCKED(ppa[pg_idx]));
4141 		ASSERT(ppa[pg_idx]->p_cowcnt != 0);
4142 		if (ppa[pg_idx]->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4143 			mutex_exit(&page_llock);
4144 			return (0);
4145 		}
4146 		if (ppa[pg_idx]->p_lckcnt != 0)
4147 			ulckpgs++;
4148 	}
4149 
4150 	if (ulckpgs != 0) {
4151 		mutex_enter(&freemem_lock);
4152 		availrmem += ulckpgs;
4153 		pages_claimed -= ulckpgs;
4154 		mutex_exit(&freemem_lock);
4155 	}
4156 
4157 	for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4158 		ppa[pg_idx]->p_cowcnt--;
4159 		ppa[pg_idx]->p_lckcnt++;
4161 	}
4162 	mutex_exit(&page_llock);
4163 	return (1);
4164 }
4165 
4166 page_t *
4167 page_numtopp(pfn_t pfnum, se_t se)
4168 {
4169 	page_t *pp;
4170 
4171 retry:
4172 	pp = page_numtopp_nolock(pfnum);
4173 	if (pp == NULL) {
4174 		return ((page_t *)NULL);
4175 	}
4176 
4177 	/*
4178 	 * Acquire the appropriate lock on the page.
4179 	 */
4180 	while (!page_lock(pp, se, (kmutex_t *)NULL, P_RECLAIM)) {
4181 		if (page_pptonum(pp) != pfnum)
4182 			goto retry;
4183 		continue;
4184 	}
4185 
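	/*
	 * Editor's note: re-verify that this page_t still corresponds to
	 * the requested pfn now that the lock is held; the identity can
	 * change while we block, in which case the lookup must restart.
	 */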
4186 	if (page_pptonum(pp) != pfnum) {
4187 		page_unlock(pp);
4188 		goto retry;
4189 	}
4190 
4191 	return (pp);
4192 }
4193 
4194 page_t *
4195 page_numtopp_noreclaim(pfn_t pfnum, se_t se)
4196 {
4197 	page_t *pp;
4198 
4199 retry:
4200 	pp = page_numtopp_nolock(pfnum);
4201 	if (pp == NULL) {
4202 		return ((page_t *)NULL);
4203 	}
4204 
4205 	/*
4206 	 * Acquire the appropriate lock on the page.
4207 	 */
4208 	while (!page_lock(pp, se, (kmutex_t *)NULL, P_NO_RECLAIM)) {
4209 		if (page_pptonum(pp) != pfnum)
4210 			goto retry;
4211 		continue;
4212 	}
4213 
4214 	if (page_pptonum(pp) != pfnum) {
4215 		page_unlock(pp);
4216 		goto retry;
4217 	}
4218 
4219 	return (pp);
4220 }
4221 
4222 /*
4223  * This routine is like page_numtopp, but it only returns page structs
4224  * for pages which are ok for loading into hardware, and it never waits.
4225  */
4226 page_t *
4227 page_numtopp_nowait(pfn_t pfnum, se_t se)
4228 {
4229 	page_t *pp;
4230 
4231 retry:
4232 	pp = page_numtopp_nolock(pfnum);
4233 	if (pp == NULL) {
4234 		return ((page_t *)NULL);
4235 	}
4236 
4237 	/*
4238 	 * Try to acquire the appropriate lock on the page.
4239 	 */
4240 	if (PP_ISFREE(pp))
4241 		pp = NULL;
4242 	else {
4243 		if (!page_trylock(pp, se))
4244 			pp = NULL;
4245 		else {
4246 			if (page_pptonum(pp) != pfnum) {
4247 				page_unlock(pp);
4248 				goto retry;
4249 			}
4250 			if (PP_ISFREE(pp)) {
4251 				page_unlock(pp);
4252 				pp = NULL;
4253 			}
4254 		}
4255 	}
4256 	return (pp);
4257 }
4258 
4259 /*
4260  * Returns a count of dirty pages that are in the process
4261  * of being written out.  If 'cleanit' is set, try to push the page.
4262  */
4263 pgcnt_t
4264 page_busy(int cleanit)
4265 {
4266 	page_t *page0 = page_first();
4267 	page_t *pp = page0;
4268 	pgcnt_t nppbusy = 0;
4269 	u_offset_t off;
4270 
4271 	do {
4272 		vnode_t *vp = pp->p_vnode;
4273 
4274 		/*
4275 		 * A page is a candidate for syncing if it is:
4276 		 *
4277 		 * (a)	On neither the freelist nor the cachelist
4278 		 * (b)	Hashed onto a vnode
4279 		 * (c)	Not a kernel page
4280 		 * (d)	Dirty
4281 		 * (e)	Not part of a swapfile
4282 		 * (f)	Belonging to a real vnode; i.e., one with a non-null
4283 		 *	v_vfsp pointer
4284 		 * (g)	Backed by a filesystem which doesn't have a
4285 		 *	stubbed-out sync operation
4286 		 */
4287 		if (!PP_ISFREE(pp) && vp != NULL && !VN_ISKAS(vp) &&
4288 		    hat_ismod(pp) && !IS_SWAPVP(vp) && vp->v_vfsp != NULL &&
4289 		    vfs_can_sync(vp->v_vfsp)) {
4290 			nppbusy++;
4291 			vfs_syncprogress();
4292 
4293 			if (!cleanit)
4294 				continue;
4295 			if (!page_trylock(pp, SE_EXCL))
4296 				continue;
4297 
4298 			if (PP_ISFREE(pp) || vp == NULL || IS_SWAPVP(vp) ||
4299 			    pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
4300 			    !(hat_pagesync(pp,
4301 			    HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD)) {
4302 				page_unlock(pp);
4303 				continue;
4304 			}
4305 			off = pp->p_offset;
4306 			VN_HOLD(vp);
4307 			page_unlock(pp);
4308 			(void) VOP_PUTPAGE(vp, off, PAGESIZE,
4309 			    B_ASYNC | B_FREE, kcred, NULL);
4310 			VN_RELE(vp);
4311 		}
4312 	} while ((pp = page_next(pp)) != page0);
4313 
4314 	return (nppbusy);
4315 }
4316 
4317 void page_invalidate_pages(void);
4318 
4319 /*
4320  * Callback handler to the VM subsystem.
4321  *
4322  * Callers must make sure there are no recursive entries into this function.
4323  */
4324 /*ARGSUSED*/
4325 boolean_t
4326 callb_vm_cpr(void *arg, int code)
4327 {
4328 	if (code == CB_CODE_CPR_CHKPT)
4329 		page_invalidate_pages();
4330 	return (B_TRUE);
4331 }
4332 
4333 /*
4334  * Invalidate all pages of the system.
4335  * It shouldn't be called until all user page activity has stopped.
4336  */
4337 void
4338 page_invalidate_pages()
4339 {
4340 	page_t *pp;
4341 	page_t *page0;
4342 	pgcnt_t nbusypages;
4343 	int retry = 0;
4344 	const int MAXRETRIES = 4;
4345 #if defined(__sparc)
4346 	extern struct vnode prom_ppages;
4347 #endif /* __sparc */
4348 
4349 top:
4350 	/*
4351 	 * Flush dirty pages and destroy the clean ones.
4352 	 */
4353 	nbusypages = 0;
4354 
4355 	pp = page0 = page_first();
4356 	do {
4357 		struct vnode	*vp;
4358 		u_offset_t	offset;
4359 		int		mod;
4360 
4361 		/*
4362 		 * Skip the page if it has no vnode, or if it is associated
4363 		 * with the kernel vnode or with prom-allocated kernel memory.
4364 		 */
4365 #if defined(__sparc)
4366 		if ((vp = pp->p_vnode) == NULL || VN_ISKAS(vp) ||
4367 		    vp == &prom_ppages)
4368 #else /* x86 doesn't have prom or prom_ppages */
4369 		if ((vp = pp->p_vnode) == NULL || VN_ISKAS(vp))
4370 #endif /* __sparc */
4371 			continue;
4372 
4373 		/*
4374 		 * Skip the page if it has already been freed and invalidated.
4375 		 */
4376 		if (PP_ISFREE(pp) && PP_ISAGED(pp))
4377 			continue;
4378 
4379 		/*
4380 		 * Skip pages that are already locked, can't be locked
4381 		 * "exclusively", or are already free.  After we lock the
4382 		 * page, check the free and age bits again to be sure it
4383 		 * hasn't been destroyed yet.
4384 		 * To achieve maximum parallelism, we use page_trylock
4385 		 * instead of page_lock so that we don't block on individual
4386 		 * pages while we have thousands of other pages to process.
4387 		 */
4388 		if (!page_trylock(pp, SE_EXCL)) {
4389 			nbusypages++;
4390 			continue;
4391 		} else if (PP_ISFREE(pp)) {
4392 			if (!PP_ISAGED(pp)) {
4393 				page_destroy_free(pp);
4394 			} else {
4395 				page_unlock(pp);
4396 			}
4397 			continue;
4398 		}
4399 		/*
4400 		 * Is this page involved in some I/O? shared?
4401 		 *
4402 		 * The page_struct_lock need not be acquired to
4403 		 * examine these fields since the page has an
4404 		 * "exclusive" lock.
4405 		 */
4406 		if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
4407 			page_unlock(pp);
4408 			continue;
4409 		}
4410 
4411 		if (vp->v_type == VCHR) {
4412 			panic("vp->v_type == VCHR");
4413 			/*NOTREACHED*/
4414 		}
4415 
4416 		if (!page_try_demote_pages(pp)) {
4417 			page_unlock(pp);
4418 			continue;
4419 		}
4420 
4421 		/*
4422 		 * Check the modified bit. Leave the bits alone in hardware
4423 		 * (they will be modified if we do the putpage).
4424 		 */
4425 		mod = (hat_pagesync(pp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD)
4426 		    & P_MOD);
4427 		if (mod) {
4428 			offset = pp->p_offset;
4429 			/*
4430 			 * Hold the vnode before releasing the page lock
4431 			 * to prevent it from being freed and re-used by
4432 			 * some other thread.
4433 			 */
4434 			VN_HOLD(vp);
4435 			page_unlock(pp);
4436 			/*
4437 			 * No error return is checked here. Callers such as
4438 			 * cpr deal with the dirty pages at dump time
4439 			 * if this putpage fails.
4440 			 */
4441 			(void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_INVAL,
4442 			    kcred, NULL);
4443 			VN_RELE(vp);
4444 		} else {
4445 			page_destroy(pp, 0);
4446 		}
4447 	} while ((pp = page_next(pp)) != page0);
4448 	if (nbusypages && retry++ < MAXRETRIES) {
4449 		delay(1);
4450 		goto top;
4451 	}
4452 }
4453 
4454 /*
4455  * Replace the page "old" with the page "new" on the page hash and vnode lists
4456  *
4457  * The replacement must be done in place, i.e., the equivalent sequence:
4458  *
4459  *	vp = old->p_vnode;
4460  *	off = old->p_offset;
4461  *	page_do_hashout(old)
4462  *	page_do_hashin(new, vp, off)
4463  *
4464  * doesn't work, since
4465  *  1) if old is the only page on the vnode, the v_pages list has a window
4466  *     where it looks empty. This will break file system assumptions.
4467  * and
4468  *  2) pvn_vplist_dirty() can't deal with pages moving on the v_pages list.
4469  */
4470 static void
4471 page_do_relocate_hash(page_t *new, page_t *old)
4472 {
4473 	page_t	**hash_list;
4474 	vnode_t	*vp = old->p_vnode;
4475 	kmutex_t *sep;
4476 
4477 	ASSERT(PAGE_EXCL(old));
4478 	ASSERT(PAGE_EXCL(new));
4479 	ASSERT(vp != NULL);
4480 	ASSERT(MUTEX_HELD(page_vnode_mutex(vp)));
4481 	ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, old->p_offset))));
4482 
4483 	/*
4484 	 * First find old page on the page hash list
4485 	 */
4486 	hash_list = &page_hash[PAGE_HASH_FUNC(vp, old->p_offset)];
4487 
4488 	for (;;) {
4489 		if (*hash_list == old)
4490 			break;
4491 		if (*hash_list == NULL) {
4492 			panic("page_do_hashout");
4493 			/*NOTREACHED*/
4494 		}
4495 		hash_list = &(*hash_list)->p_hash;
4496 	}
4497 
4498 	/*
4499 	 * update new and replace old with new on the page hash list
4500 	 */
4501 	new->p_vnode = old->p_vnode;
4502 	new->p_offset = old->p_offset;
4503 	new->p_hash = old->p_hash;
4504 	*hash_list = new;
4505 
4506 	if ((new->p_vnode->v_flag & VISSWAP) != 0)
4507 		PP_SETSWAP(new);
4508 
4509 	/*
4510 	 * replace old with new on the vnode's page list
4511 	 */
4512 	if (old->p_vpnext == old) {
4513 		new->p_vpnext = new;
4514 		new->p_vpprev = new;
4515 	} else {
4516 		new->p_vpnext = old->p_vpnext;
4517 		new->p_vpprev = old->p_vpprev;
4518 		new->p_vpnext->p_vpprev = new;
4519 		new->p_vpprev->p_vpnext = new;
4520 	}
4521 	if (vp->v_pages == old)
4522 		vp->v_pages = new;
4523 
4524 	/*
4525 	 * clear out the old page
4526 	 */
4527 	old->p_hash = NULL;
4528 	old->p_vpnext = NULL;
4529 	old->p_vpprev = NULL;
4530 	old->p_vnode = NULL;
4531 	PP_CLRSWAP(old);
4532 	old->p_offset = (u_offset_t)-1;
4533 	page_clr_all_props(old);
4534 
4535 	/*
4536 	 * Wake up processes waiting for this page.  The page's
4537 	 * identity has been changed, and is probably not the
4538 	 * identity has been changed, and it is probably not the
4539 	 */
4540 	sep = page_se_mutex(old);
4541 	mutex_enter(sep);
4542 	old->p_selock &= ~SE_EWANTED;
4543 	if (CV_HAS_WAITERS(&old->p_cv))
4544 		cv_broadcast(&old->p_cv);
4545 	mutex_exit(sep);
4546 }
4547 
4548 /*
4549  * This function moves the identity of page "pp_old" to page "pp_new".
4550  * Both pages must be locked on entry.  "pp_new" is free, has no identity,
4551  * and need not be hashed out from anywhere.
4552  */
4553 void
4554 page_relocate_hash(page_t *pp_new, page_t *pp_old)
4555 {
4556 	vnode_t *vp = pp_old->p_vnode;
4557 	u_offset_t off = pp_old->p_offset;
4558 	kmutex_t *phm, *vphm;
4559 
4560 	/*
4561 	 * Rehash two pages
4562 	 */
4563 	ASSERT(PAGE_EXCL(pp_old));
4564 	ASSERT(PAGE_EXCL(pp_new));
4565 	ASSERT(vp != NULL);
4566 	ASSERT(pp_new->p_vnode == NULL);
4567 
4568 	/*
4569 	 * hashout then hashin while holding the mutexes
4570 	 */
4571 	phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, off));
4572 	mutex_enter(phm);
4573 	vphm = page_vnode_mutex(vp);
4574 	mutex_enter(vphm);
4575 
4576 	page_do_relocate_hash(pp_new, pp_old);
4577 
4578 	mutex_exit(vphm);
4579 	mutex_exit(phm);
4580 
4581 	/*
4582 	 * The page_struct_lock need not be acquired for lckcnt and
4583 	 * cowcnt since the page has an "exclusive" lock.
4584 	 */
4585 	ASSERT(pp_new->p_lckcnt == 0);
4586 	ASSERT(pp_new->p_cowcnt == 0);
4587 	pp_new->p_lckcnt = pp_old->p_lckcnt;
4588 	pp_new->p_cowcnt = pp_old->p_cowcnt;
4589 	pp_old->p_lckcnt = pp_old->p_cowcnt = 0;
4590 
4591 	/* The following comment preserved from page_flip(). */
4592 	/* XXX - Do we need to protect fsdata? */
4593 	pp_new->p_fsdata = pp_old->p_fsdata;
4594 }
4595 
4596 /*
4597  * Helper routine used to lock all remaining members of a
4598  * large page. The caller is responsible for passing in a locked
4599  * pp. If pp is a large page, then it succeeds in locking all the
4600  * remaining constituent pages or it returns with only the
4601  * original page locked.
4602  *
4603  * Returns 1 on success, 0 on failure.
4604  *
4605  * If success is returned, this routine guarantees that p_szc can't change
4606  * for any constituent page of the large page pp belongs to. To achieve
4607  * this we recheck pp's szc after locking all constituent pages and retry
4608  * if it changed (it could only decrease). Since hat_page_demote() needs
4609  * an EXCL lock on one of the constituent pages, it can't be running once
4610  * all constituent pages are locked.  A hat_page_demote() that locked a
4611  * constituent page outside of this large page (i.e. pp belonged to a
4612  * larger large page) is already done with all constituent pages of pp,
4613  * since the root's p_szc is changed last. Therefore there is no need to
4614  * synchronize with a hat_page_demote() that locked such an outside page.
4615  */
4616 #ifdef DEBUG
4617 uint32_t gpg_trylock_mtbf = 0;
4618 #endif
4619 
4620 int
4621 group_page_trylock(page_t *pp, se_t se)
4622 {
4623 	page_t  *tpp;
4624 	pgcnt_t	npgs, i, j;
4625 	uint_t pszc = pp->p_szc;
4626 
4627 #ifdef DEBUG
4628 	if (gpg_trylock_mtbf && !(gethrtime() % gpg_trylock_mtbf)) {
4629 		return (0);
4630 	}
4631 #endif
4632 
4633 	if (pp != PP_GROUPLEADER(pp, pszc)) {
4634 		return (0);
4635 	}
4636 
4637 retry:
4638 	ASSERT(PAGE_LOCKED_SE(pp, se));
4639 	ASSERT(!PP_ISFREE(pp));
4640 	if (pszc == 0) {
4641 		return (1);
4642 	}
4643 	npgs = page_get_pagecnt(pszc);
4644 	tpp = pp + 1;
4645 	for (i = 1; i < npgs; i++, tpp++) {
4646 		if (!page_trylock(tpp, se)) {
4647 			tpp = pp + 1;
4648 			for (j = 1; j < i; j++, tpp++) {
4649 				page_unlock(tpp);
4650 			}
4651 			return (0);
4652 		}
4653 	}
4654 	if (pp->p_szc != pszc) {
4655 		ASSERT(pp->p_szc < pszc);
4656 		ASSERT(pp->p_vnode != NULL && !PP_ISKAS(pp) &&
4657 		    !IS_SWAPFSVP(pp->p_vnode));
4658 		tpp = pp + 1;
4659 		for (i = 1; i < npgs; i++, tpp++) {
4660 			page_unlock(tpp);
4661 		}
4662 		pszc = pp->p_szc;
4663 		goto retry;
4664 	}
4665 	return (1);
4666 }
4667 
4668 void
4669 group_page_unlock(page_t *pp)
4670 {
4671 	page_t *tpp;
4672 	pgcnt_t	npgs, i;
4673 
4674 	ASSERT(PAGE_LOCKED(pp));
4675 	ASSERT(!PP_ISFREE(pp));
4676 	ASSERT(pp == PP_PAGEROOT(pp));
4677 	npgs = page_get_pagecnt(pp->p_szc);
4678 	for (i = 1, tpp = pp + 1; i < npgs; i++, tpp++) {
4679 		page_unlock(tpp);
4680 	}
4681 }
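
/*
 * Example (editor's sketch, not part of the original source):
 * do_page_relocate() below pairs these routines around work on a
 * large page. The group lock covers the constituent pages only, so
 * the root page is locked (and unlocked) separately by the caller:
 *
 *	ASSERT(PAGE_EXCL(pp));
 *	if (!group_page_trylock(pp, SE_EXCL))
 *		return (EBUSY);
 *	... operate on all constituent pages ...
 *	group_page_unlock(pp);
 *	page_unlock(pp);
 */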
4682 
4683 /*
4684  * returns
4685  * 0 		: on success and *nrelocp is number of relocated PAGESIZE pages
4686  * ERANGE	: this is not a base page
4687  * EBUSY	: failure to get locks on the page/pages
4688  * ENOMEM	: failure to obtain replacement pages
4689  * EAGAIN	: OBP has not yet completed its boot-time handoff to the kernel
4690  * EIO		: An error occurred while trying to copy the page data
4691  *
4692  * Return with all constituent members of target and replacement
4693  * SE_EXCL locked. It is the caller's responsibility to drop the
4694  * locks.
4695  */
4696 int
4697 do_page_relocate(
4698 	page_t **target,
4699 	page_t **replacement,
4700 	int grouplock,
4701 	spgcnt_t *nrelocp,
4702 	lgrp_t *lgrp)
4703 {
4704 	page_t *first_repl;
4705 	page_t *repl;
4706 	page_t *targ;
4707 	page_t *pl = NULL;
4708 	uint_t ppattr;
4709 	pfn_t   pfn, repl_pfn;
4710 	uint_t	szc;
4711 	spgcnt_t npgs, i;
4712 	int repl_contig = 0;
4713 	uint_t flags = 0;
4714 	spgcnt_t dofree = 0;
4715 
4716 	*nrelocp = 0;
4717 
4718 #if defined(__sparc)
4719 	/*
4720 	 * We need to wait till OBP has completed
4721 	 * its boot-time handoff of its resources to the kernel
4722 	 * before we allow page relocation
4723 	 */
4724 	if (page_relocate_ready == 0) {
4725 		return (EAGAIN);
4726 	}
4727 #endif
4728 
4729 	/*
4730 	 * If this is not a base page,
4731 	 * just return with 0 pages relocated.
4732 	 */
4733 	targ = *target;
4734 	ASSERT(PAGE_EXCL(targ));
4735 	ASSERT(!PP_ISFREE(targ));
4736 	szc = targ->p_szc;
4737 	ASSERT(szc < mmu_page_sizes);
4738 	VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]);
4739 	pfn = targ->p_pagenum;
4740 	if (pfn != PFN_BASE(pfn, szc)) {
4741 		VM_STAT_ADD(vmm_vmstats.ppr_relocnoroot[szc]);
4742 		return (ERANGE);
4743 	}
4744 
4745 	if ((repl = *replacement) != NULL && repl->p_szc >= szc) {
4746 		repl_pfn = repl->p_pagenum;
4747 		if (repl_pfn != PFN_BASE(repl_pfn, szc)) {
4748 			VM_STAT_ADD(vmm_vmstats.ppr_reloc_replnoroot[szc]);
4749 			return (ERANGE);
4750 		}
4751 		repl_contig = 1;
4752 	}
4753 
4754 	/*
4755 	 * We must lock all members of this large page or we cannot
4756 	 * relocate any part of it.
4757 	 */
4758 	if (grouplock != 0 && !group_page_trylock(targ, SE_EXCL)) {
4759 		VM_STAT_ADD(vmm_vmstats.ppr_relocnolock[targ->p_szc]);
4760 		return (EBUSY);
4761 	}
4762 
4763 	/*
4764 	 * Reread szc; it could have been decreased before
4765 	 * group_page_trylock() was done.
4766 	 */
4767 	szc = targ->p_szc;
4768 	ASSERT(szc < mmu_page_sizes);
4769 	VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]);
4770 	ASSERT(pfn == PFN_BASE(pfn, szc));
4771 
4772 	npgs = page_get_pagecnt(targ->p_szc);
4773 
4774 	if (repl == NULL) {
4775 		dofree = npgs;		/* Size of target page in MMU pages */
4776 		if (!page_create_wait(dofree, 0)) {
4777 			if (grouplock != 0) {
4778 				group_page_unlock(targ);
4779 			}
4780 			VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]);
4781 			return (ENOMEM);
4782 		}
4783 
4784 		/*
4785 		 * seg kmem pages require that the target and replacement
4786 		 * page be of the same pagesize.
4787 		 */
4788 		flags = (VN_ISKAS(targ->p_vnode)) ? PGR_SAMESZC : 0;
4789 		repl = page_get_replacement_page(targ, lgrp, flags);
4790 		if (repl == NULL) {
4791 			if (grouplock != 0) {
4792 				group_page_unlock(targ);
4793 			}
4794 			page_create_putback(dofree);
4795 			VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]);
4796 			return (ENOMEM);
4797 		}
4798 	}
4799 #ifdef DEBUG
4800 	else {
4801 		ASSERT(PAGE_LOCKED(repl));
4802 	}
4803 #endif /* DEBUG */
4804 
4805 #if defined(__sparc)
4806 	/*
4807 	 * Let hat_page_relocate() complete the relocation if it's a kernel page.
4808 	 */
4809 	if (VN_ISKAS(targ->p_vnode)) {
4810 		*replacement = repl;
4811 		if (hat_page_relocate(target, replacement, nrelocp) != 0) {
4812 			if (grouplock != 0) {
4813 				group_page_unlock(targ);
4814 			}
4815 			if (dofree) {
4816 				*replacement = NULL;
4817 				page_free_replacement_page(repl);
4818 				page_create_putback(dofree);
4819 			}
4820 			VM_STAT_ADD(vmm_vmstats.ppr_krelocfail[szc]);
4821 			return (EAGAIN);
4822 		}
4823 		VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]);
4824 		return (0);
4825 	}
4826 #else
4827 #if defined(lint)
4828 	dofree = dofree;
4829 #endif
4830 #endif
4831 
4832 	first_repl = repl;
4833 
4834 	for (i = 0; i < npgs; i++) {
4835 		ASSERT(PAGE_EXCL(targ));
4836 		ASSERT(targ->p_slckcnt == 0);
4837 		ASSERT(repl->p_slckcnt == 0);
4838 
4839 		(void) hat_pageunload(targ, HAT_FORCE_PGUNLOAD);
4840 
4841 		ASSERT(hat_page_getshare(targ) == 0);
4842 		ASSERT(!PP_ISFREE(targ));
4843 		ASSERT(targ->p_pagenum == (pfn + i));
4844 		ASSERT(repl_contig == 0 ||
4845 		    repl->p_pagenum == (repl_pfn + i));
4846 
4847 		/*
4848 		 * Copy the page contents and attributes then
4849 		 * relocate the page in the page hash.
4850 		 */
4851 		if (ppcopy(targ, repl) == 0) {
4852 			targ = *target;
4853 			repl = first_repl;
4854 			VM_STAT_ADD(vmm_vmstats.ppr_copyfail);
4855 			if (grouplock != 0) {
4856 				group_page_unlock(targ);
4857 			}
4858 			if (dofree) {
4859 				*replacement = NULL;
4860 				page_free_replacement_page(repl);
4861 				page_create_putback(dofree);
4862 			}
4863 			return (EIO);
4864 		}
4865 
4866 		targ++;
4867 		if (repl_contig != 0) {
4868 			repl++;
4869 		} else {
4870 			repl = repl->p_next;
4871 		}
4872 	}
4873 
4874 	repl = first_repl;
4875 	targ = *target;
4876 
4877 	for (i = 0; i < npgs; i++) {
4878 		ppattr = hat_page_getattr(targ, (P_MOD | P_REF | P_RO));
4879 		page_clr_all_props(repl);
4880 		page_set_props(repl, ppattr);
4881 		page_relocate_hash(repl, targ);
4882 
4883 		ASSERT(hat_page_getshare(targ) == 0);
4884 		ASSERT(hat_page_getshare(repl) == 0);
4885 		/*
4886 		 * Now clear the props on targ, after the
4887 		 * page_relocate_hash(), they no longer
4888 		 * have any meaning.
4889 		 */
4890 		page_clr_all_props(targ);
4891 		ASSERT(targ->p_next == targ);
4892 		ASSERT(targ->p_prev == targ);
4893 		page_list_concat(&pl, &targ);
4894 
4895 		targ++;
4896 		if (repl_contig != 0) {
4897 			repl++;
4898 		} else {
4899 			repl = repl->p_next;
4900 		}
4901 	}
4902 	/* assert that we have come full circle with repl */
4903 	ASSERT(repl_contig == 1 || first_repl == repl);
4904 
4905 	*target = pl;
4906 	if (*replacement == NULL) {
4907 		ASSERT(first_repl == repl);
4908 		*replacement = repl;
4909 	}
4910 	VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]);
4911 	*nrelocp = npgs;
4912 	return (0);
4913 }
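
/*
 * Example (editor's sketch, not part of the original source): a
 * hypothetical caller that lets do_page_relocate() allocate the
 * replacement list itself and separates transient failures from
 * permanent ones:
 *
 *	page_t *repl = NULL;
 *	spgcnt_t nreloc;
 *	int err;
 *
 *	err = do_page_relocate(&targ, &repl, 1, &nreloc, NULL);
 *	if (err == 0) {
 *		... targ and repl now hold nreloc relocated pages ...
 *	} else if (err == EBUSY || err == EAGAIN) {
 *		... transient; drop locks and retry later ...
 *	} else {
 *		... ERANGE, ENOMEM or EIO; give up ...
 *	}
 */
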
4914 /*
4915  * On success, returns 0 and sets *nrelocp to the number of relocated pages.
4916  */
4917 int
4918 page_relocate(
4919 	page_t **target,
4920 	page_t **replacement,
4921 	int grouplock,
4922 	int freetarget,
4923 	spgcnt_t *nrelocp,
4924 	lgrp_t *lgrp)
4925 {
4926 	spgcnt_t ret;
4927 
4928 	/* do_page_relocate returns 0 on success or errno value */
4929 	ret = do_page_relocate(target, replacement, grouplock, nrelocp, lgrp);
4930 
4931 	if (ret != 0 || freetarget == 0) {
4932 		return (ret);
4933 	}
4934 	if (*nrelocp == 1) {
4935 		ASSERT(*target != NULL);
4936 		page_free(*target, 1);
4937 	} else {
4938 		page_t *tpp = *target;
4939 		uint_t szc = tpp->p_szc;
4940 		pgcnt_t npgs = page_get_pagecnt(szc);
4941 		ASSERT(npgs > 1);
4942 		ASSERT(szc != 0);
4943 		do {
4944 			ASSERT(PAGE_EXCL(tpp));
4945 			ASSERT(!hat_page_is_mapped(tpp));
4946 			ASSERT(tpp->p_szc == szc);
4947 			PP_SETFREE(tpp);
4948 			PP_SETAGED(tpp);
4949 			npgs--;
4950 		} while ((tpp = tpp->p_next) != *target);
4951 		ASSERT(npgs == 0);
4952 		page_list_add_pages(*target, 0);
4953 		npgs = page_get_pagecnt(szc);
4954 		page_create_putback(npgs);
4955 	}
4956 	return (ret);
4957 }
4958 
4959 /*
4960  * It is up to the caller to deal with pcf accounting.
4961  */
4962 void
4963 page_free_replacement_page(page_t *pplist)
4964 {
4965 	page_t *pp;
4966 
4967 	while (pplist != NULL) {
4968 		/*
4969 		 * pplist is a linked list of pages.
4970 		 */
4971 		pp = pplist;
4972 		if (pp->p_szc == 0) {
4973 			page_sub(&pplist, pp);
4974 			page_clr_all_props(pp);
4975 			PP_SETFREE(pp);
4976 			PP_SETAGED(pp);
4977 			page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
4978 			page_unlock(pp);
4979 			VM_STAT_ADD(pagecnt.pc_free_replacement_page[0]);
4980 		} else {
4981 			spgcnt_t curnpgs = page_get_pagecnt(pp->p_szc);
4982 			page_t *tpp;
4983 			page_list_break(&pp, &pplist, curnpgs);
4984 			tpp = pp;
4985 			do {
4986 				ASSERT(PAGE_EXCL(tpp));
4987 				ASSERT(!hat_page_is_mapped(tpp));
4988 				page_clr_all_props(tpp);
4989 				PP_SETFREE(tpp);
4990 				PP_SETAGED(tpp);
4991 			} while ((tpp = tpp->p_next) != pp);
4992 			page_list_add_pages(pp, 0);
4993 			VM_STAT_ADD(pagecnt.pc_free_replacement_page[1]);
4994 		}
4995 	}
4996 }
4997 
4998 /*
4999  * Relocate target to non-relocatable replacement page.
5000  */
5001 int
5002 page_relocate_cage(page_t **target, page_t **replacement)
5003 {
5004 	page_t *tpp, *rpp;
5005 	spgcnt_t pgcnt, npgs;
5006 	int result;
5007 
5008 	tpp = *target;
5009 
5010 	ASSERT(PAGE_EXCL(tpp));
5011 	ASSERT(tpp->p_szc == 0);
5012 
5013 	pgcnt = btop(page_get_pagesize(tpp->p_szc));
5014 
5015 	do {
5016 		(void) page_create_wait(pgcnt, PG_WAIT | PG_NORELOC);
5017 		rpp = page_get_replacement_page(tpp, NULL, PGR_NORELOC);
5018 		if (rpp == NULL) {
5019 			page_create_putback(pgcnt);
5020 			kcage_cageout_wakeup();
5021 		}
5022 	} while (rpp == NULL);
5023 
5024 	ASSERT(PP_ISNORELOC(rpp));
5025 
5026 	result = page_relocate(&tpp, &rpp, 0, 1, &npgs, NULL);
5027 
5028 	if (result == 0) {
5029 		*replacement = rpp;
5030 		if (pgcnt != npgs)
5031 			panic("page_relocate_cage: partial relocation");
5032 	}
5033 
5034 	return (result);
5035 }
5036 
5037 /*
5038  * Release the page lock on a page, place on cachelist
5039  * tail if no longer mapped. Caller can let us know if
5040  * the page is known to be clean.
5041  */
5042 int
5043 page_release(page_t *pp, int checkmod)
5044 {
5045 	int status;
5046 
5047 	ASSERT(PAGE_LOCKED(pp) && !PP_ISFREE(pp) &&
5048 	    (pp->p_vnode != NULL));
5049 
5050 	if (!hat_page_is_mapped(pp) && !IS_SWAPVP(pp->p_vnode) &&
5051 	    ((PAGE_SHARED(pp) && page_tryupgrade(pp)) || PAGE_EXCL(pp)) &&
5052 	    pp->p_lckcnt == 0 && pp->p_cowcnt == 0 &&
5053 	    !hat_page_is_mapped(pp)) {
5054 
5055 		/*
5056 		 * If page is modified, unlock it
5057 		 *
5058 		 * The (p_nrm & P_MOD) bit is up to date because:
5059 		 * (1) We found that this page doesn't have any mappings
5060 		 *	_after_ holding SE_EXCL and
5061 		 * (2) We didn't drop SE_EXCL lock after the check in (1)
5062 		 */
5063 		if (checkmod && hat_ismod(pp)) {
5064 			page_unlock(pp);
5065 			status = PGREL_MOD;
5066 		} else {
5067 			/*LINTED: constant in conditional context*/
5068 			VN_DISPOSE(pp, B_FREE, 0, kcred);
5069 			status = PGREL_CLEAN;
5070 		}
5071 	} else {
5072 		page_unlock(pp);
5073 		status = PGREL_NOTREL;
5074 	}
5075 	return (status);
5076 }
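
/*
 * Example (editor's sketch, not part of the original source): a
 * hypothetical caller acting on the three possible outcomes:
 *
 *	switch (page_release(pp, 1)) {
 *	case PGREL_CLEAN:
 *		... page was freed to the cachelist ...
 *		break;
 *	case PGREL_MOD:
 *		... page is dirty; it was only unlocked ...
 *		break;
 *	case PGREL_NOTREL:
 *		... page still mapped or locked; only unlocked ...
 *		break;
 *	}
 */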
5077 
5078 /*
5079  * Given a constituent page, try to demote the large page on the freelist.
5080  *
5081  * Returns nonzero if the page could be demoted successfully. Returns with
5082  * the constituent page still locked.
5083  */
5084 int
5085 page_try_demote_free_pages(page_t *pp)
5086 {
5087 	page_t *rootpp = pp;
5088 	pfn_t	pfn = page_pptonum(pp);
5089 	spgcnt_t npgs;
5090 	uint_t	szc = pp->p_szc;
5091 
5092 	ASSERT(PP_ISFREE(pp));
5093 	ASSERT(PAGE_EXCL(pp));
5094 
5095 	/*
5096 	 * Adjust rootpp and lock it, if `pp' is not the base
5097 	 * constituent page.
5098 	 */
5099 	npgs = page_get_pagecnt(pp->p_szc);
5100 	if (npgs == 1) {
5101 		return (0);
5102 	}
5103 
5104 	if (!IS_P2ALIGNED(pfn, npgs)) {
5105 		pfn = P2ALIGN(pfn, npgs);
5106 		rootpp = page_numtopp_nolock(pfn);
5107 	}
5108 
5109 	if (pp != rootpp && !page_trylock(rootpp, SE_EXCL)) {
5110 		return (0);
5111 	}
5112 
5113 	if (rootpp->p_szc != szc) {
5114 		if (pp != rootpp)
5115 			page_unlock(rootpp);
5116 		return (0);
5117 	}
5118 
5119 	page_demote_free_pages(rootpp);
5120 
5121 	if (pp != rootpp)
5122 		page_unlock(rootpp);
5123 
5124 	ASSERT(PP_ISFREE(pp));
5125 	ASSERT(PAGE_EXCL(pp));
5126 	return (1);
5127 }
5128 
5129 /*
5130  * Given a constituent page, try to demote the large page.
5131  *
5132  * Returns nonzero if the page could be demoted successfully. Returns with
5133  * the constituent page still locked.
5134  */
5135 int
5136 page_try_demote_pages(page_t *pp)
5137 {
5138 	page_t *tpp, *rootpp = pp;
5139 	pfn_t	pfn = page_pptonum(pp);
5140 	spgcnt_t i, npgs;
5141 	uint_t	szc = pp->p_szc;
5142 	vnode_t *vp = pp->p_vnode;
5143 
5144 	ASSERT(PAGE_EXCL(pp));
5145 
5146 	VM_STAT_ADD(pagecnt.pc_try_demote_pages[0]);
5147 
5148 	if (pp->p_szc == 0) {
5149 		VM_STAT_ADD(pagecnt.pc_try_demote_pages[1]);
5150 		return (1);
5151 	}
5152 
5153 	if (vp != NULL && !IS_SWAPFSVP(vp) && !VN_ISKAS(vp)) {
5154 		VM_STAT_ADD(pagecnt.pc_try_demote_pages[2]);
5155 		page_demote_vp_pages(pp);
5156 		ASSERT(pp->p_szc == 0);
5157 		return (1);
5158 	}
5159 
5160 	/*
5161 	 * Adjust rootpp if the page passed in is not the base
5162 	 * constituent page.
5163 	 */
5164 	npgs = page_get_pagecnt(pp->p_szc);
5165 	ASSERT(npgs > 1);
5166 	if (!IS_P2ALIGNED(pfn, npgs)) {
5167 		pfn = P2ALIGN(pfn, npgs);
5168 		rootpp = page_numtopp_nolock(pfn);
5169 		VM_STAT_ADD(pagecnt.pc_try_demote_pages[3]);
5170 		ASSERT(rootpp->p_vnode != NULL);
5171 		ASSERT(rootpp->p_szc == szc);
5172 	}
5173 
5174 	/*
5175 	 * We can't demote kernel pages since we can't hat_unload()
5176 	 * the mappings.
5177 	 */
5178 	if (VN_ISKAS(rootpp->p_vnode))
5179 		return (0);
5180 
5181 	/*
5182 	 * Attempt to lock all constituent pages except the page passed
5183 	 * in since it's already locked.
5184 	 */
5185 	for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) {
5186 		ASSERT(!PP_ISFREE(tpp));
5187 		ASSERT(tpp->p_vnode != NULL);
5188 
5189 		if (tpp != pp && !page_trylock(tpp, SE_EXCL))
5190 			break;
5191 		ASSERT(tpp->p_szc == rootpp->p_szc);
5192 		ASSERT(page_pptonum(tpp) == page_pptonum(rootpp) + i);
5193 	}
5194 
5195 	/*
5196 	 * If we failed to lock them all then unlock what we have
5197 	 * locked so far and bail.
5198 	 */
5199 	if (i < npgs) {
5200 		tpp = rootpp;
5201 		while (i-- > 0) {
5202 			if (tpp != pp)
5203 				page_unlock(tpp);
5204 			tpp++;
5205 		}
5206 		VM_STAT_ADD(pagecnt.pc_try_demote_pages[4]);
5207 		return (0);
5208 	}
5209 
5210 	for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) {
5211 		ASSERT(PAGE_EXCL(tpp));
5212 		ASSERT(tpp->p_slckcnt == 0);
5213 		(void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD);
5214 		tpp->p_szc = 0;
5215 	}
5216 
5217 	/*
5218 	 * Unlock all pages except the page passed in.
5219 	 */
5220 	for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) {
5221 		ASSERT(!hat_page_is_mapped(tpp));
5222 		if (tpp != pp)
5223 			page_unlock(tpp);
5224 	}
5225 
5226 	VM_STAT_ADD(pagecnt.pc_try_demote_pages[5]);
5227 	return (1);
5228 }
5229 
5230 /*
5231  * Called by page_free() and page_destroy() to demote the page size code
5232  * (p_szc) to 0 (since we can't just put a single PAGESIZE page with nonzero
5233  * p_szc on the free list, nor can we just clear p_szc of a single page_t
5234  * within a large page since it will break other code that relies on p_szc
5235  * being the same for all page_t's of a large page). Anonymous pages should
5236  * never end up here because anon_map_getpages() cannot deal with p_szc
5237  * changes after a single constituent page is locked.  While anonymous or
5238  * kernel large pages are demoted or freed an entire large page at a time,
5239  * with all constituent pages locked EXCL, for file system pages we must
5240  * be able to demote a large page (i.e. decrease the p_szc of all its
5241  * constituent pages) with just an EXCL lock on one constituent page.
5242  * We can easily demote anonymous pages an entire large page at a time
5243  * because those operations originate at the address space level, concern
5244  * the whole large page region, and demote only when the pages are not
5245  * shared with any other processes (so we can always get an EXCL lock on
5246  * all anonymous constituent pages after clearing the segment page
5247  * cache). However, file system pages can be truncated or invalidated at
5248  * a PAGESIZE level from the file system side and end up in page_free()
5249  * or page_destroy() (we also allow only part of a large page to be
5250  * SOFTLOCKed, and therefore pageout should be able to demote a large
5251  * page by EXCL locking any constituent page not under SOFTLOCK). In
5252  * those cases we cannot rely on being able to lock all pages EXCL.
5253  *
5254  * To prevent szc changes on file system pages one has to lock all constituent
5255  * pages at least SHARED (or call page_szc_lock()). The only subsystem that
5256  * doesn't rely on locking all constituent pages (or using page_szc_lock()) to
5257  * prevent szc changes is the hat layer, which uses its own page-level mlist
5258  * locks. The hat assumes szc doesn't change after the mlist lock for a page
5259  * is taken. Therefore we need to change szc under hat-level locks if we only
5260  * have an EXCL lock on a single constituent page and hat still references any
5261  * of the constituent pages.  (Note we can't "ignore" the hat layer by simply
5262  * hat_pageunload()'ing all constituent pages without having EXCL locks on
5263  * all of the constituent pages). We use the hat_page_demote() call to safely
5264  * demote the szc of all constituent pages under hat locks when we only have
5265  * an EXCL lock on one of the constituent pages.
5266  *
5267  * This routine calls page_szc_lock() before calling hat_page_demote() to
5268  * allow segvn in one special case not to lock all constituent pages SHARED
5269  * before calling hat_memload_array() that relies on p_szc not changing even
5270  * before hat level mlist lock is taken.  In that case segvn uses
5271  * page_szc_lock() to prevent hat_page_demote() changing p_szc values.
5272  *
5273  * Anonymous or kernel page demotion still has to lock all pages exclusively
5274  * and do hat_pageunload() on all constituent pages before demoting the page
5275  * therefore there's no need for anonymous or kernel page demotion to use
5276  * hat_page_demote() mechanism.
5277  *
5278  * hat_page_demote() removes all large mappings that map pp and then decreases
5279  * p_szc starting from the last constituent page of the large page. By working
5280  * p_szc starting from the last constituent page of the large page. Working
5281  * from the tail of a large page in decreasing pfn order allows one looking at
5282  * e.g. if a root page has szc 1 one knows it only has to lock all constituent
5283  * pages within szc 1 area to prevent szc changes because hat_page_demote()
5284  * that started on this page when it had szc > 1 is done for this szc 1 area.
5285  *
5286  * We are guaranteed that all constituent pages of pp's large page belong to
5287  * the same vnode with the consecutive offsets increasing in the direction of
5288  * the pfn i.e. the identity of constituent pages can't change until their
5289  * p_szc is decreased. Therefore it's safe for hat_page_demote() to remove
5290  * large mappings to pp even though we don't lock any constituent page except
5291  * pp (i.e. we won't unload e.g. kernel locked page).
5292  */
5293 static void
5294 page_demote_vp_pages(page_t *pp)
5295 {
5296 	kmutex_t *mtx;
5297 
5298 	ASSERT(PAGE_EXCL(pp));
5299 	ASSERT(!PP_ISFREE(pp));
5300 	ASSERT(pp->p_vnode != NULL);
5301 	ASSERT(!IS_SWAPFSVP(pp->p_vnode));
5302 	ASSERT(!PP_ISKAS(pp));
5303 
5304 	VM_STAT_ADD(pagecnt.pc_demote_pages[0]);
5305 
5306 	mtx = page_szc_lock(pp);
5307 	if (mtx != NULL) {
5308 		hat_page_demote(pp);
5309 		mutex_exit(mtx);
5310 	}
5311 	ASSERT(pp->p_szc == 0);
5312 }
5313 
5314 /*
5315  * Mark any existing pages for migration in the given range
5316  */
5317 void
5318 page_mark_migrate(struct seg *seg, caddr_t addr, size_t len,
5319     struct anon_map *amp, ulong_t anon_index, vnode_t *vp,
5320     u_offset_t vnoff, int rflag)
5321 {
5322 	struct anon	*ap;
5323 	vnode_t		*curvp;
5324 	lgrp_t		*from;
5325 	pgcnt_t		i;
5326 	pgcnt_t		nlocked;
5327 	u_offset_t	off;
5328 	pfn_t		pfn;
5329 	size_t		pgsz;
5330 	size_t		segpgsz;
5331 	pgcnt_t		pages;
5332 	uint_t		pszc;
5333 	page_t		**ppa;
5334 	pgcnt_t		ppa_nentries;
5335 	page_t		*pp;
5336 	caddr_t		va;
5337 	ulong_t		an_idx;
5338 	anon_sync_obj_t	cookie;
5339 
5340 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
5341 
5342 	/*
5343 	 * Don't do anything if we don't need to do lgroup optimizations
5344 	 * on this system
5345 	 */
5346 	if (!lgrp_optimizations())
5347 		return;
5348 
5349 	/*
5350 	 * Align address and length to (potentially large) page boundary
5351 	 */
5352 	segpgsz = page_get_pagesize(seg->s_szc);
5353 	addr = (caddr_t)P2ALIGN((uintptr_t)addr, segpgsz);
5354 	if (rflag)
5355 		len = P2ROUNDUP(len, segpgsz);
5356 
5357 	/*
5358 	 * Allocate page array to accommodate largest page size
5359 	 */
5360 	pgsz = page_get_pagesize(page_num_pagesizes() - 1);
5361 	ppa_nentries = btop(pgsz);
5362 	ppa = kmem_zalloc(ppa_nentries * sizeof (page_t *), KM_SLEEP);
5363 
5364 	/*
5365 	 * Do one (large) page at a time
5366 	 */
5367 	va = addr;
5368 	while (va < addr + len) {
5369 		/*
5370 		 * Look up the (root) page for the vnode and offset
5371 		 * corresponding to this virtual address.
5372 		 * Try the anonmap first since there may be copy-on-write
5373 		 * pages, but initialize vnode pointer and offset using
5374 		 * vnode arguments just in case there isn't an amp.
5375 		 */
5376 		curvp = vp;
5377 		off = vnoff + va - seg->s_base;
5378 		if (amp) {
5379 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
5380 			an_idx = anon_index + seg_page(seg, va);
5381 			anon_array_enter(amp, an_idx, &cookie);
5382 			ap = anon_get_ptr(amp->ahp, an_idx);
5383 			if (ap)
5384 				swap_xlate(ap, &curvp, &off);
5385 			anon_array_exit(&cookie);
5386 			ANON_LOCK_EXIT(&amp->a_rwlock);
5387 		}
5388 
5389 		pp = NULL;
5390 		if (curvp)
5391 			pp = page_lookup(curvp, off, SE_SHARED);
5392 
5393 		/*
5394 		 * If there isn't a page at this virtual address,
5395 		 * skip to next page
5396 		 */
5397 		if (pp == NULL) {
5398 			va += PAGESIZE;
5399 			continue;
5400 		}
5401 
5402 		/*
5403 		 * Figure out which lgroup this page is in for kstats
5404 		 */
5405 		pfn = page_pptonum(pp);
5406 		from = lgrp_pfn_to_lgrp(pfn);
5407 
5408 		/*
5409 		 * Get the page size; if the address is unaligned, round up
5410 		 * and skip to the next page boundary.
5411 		 */
5412 		pszc = pp->p_szc;
5413 		pgsz = page_get_pagesize(pszc);
5414 		pages = btop(pgsz);
5415 		if (!IS_P2ALIGNED(va, pgsz) ||
5416 		    !IS_P2ALIGNED(pfn, pages) ||
5417 		    pgsz > segpgsz) {
5418 			pgsz = MIN(pgsz, segpgsz);
5419 			page_unlock(pp);
5420 			i = btop(P2END((uintptr_t)va, pgsz) -
5421 			    (uintptr_t)va);
5422 			va = (caddr_t)P2END((uintptr_t)va, pgsz);
5423 			lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS, i);
5424 			continue;
5425 		}
5426 
5427 		/*
5428 		 * Upgrade to exclusive lock on page
5429 		 */
5430 		if (!page_tryupgrade(pp)) {
5431 			page_unlock(pp);
5432 			va += pgsz;
5433 			lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS,
5434 			    btop(pgsz));
5435 			continue;
5436 		}
5437 
5438 		/*
5439 		 * Remember pages locked exclusively and how many
5440 		 */
5441 		ppa[0] = pp;
5442 		nlocked = 1;
5443 
5444 		/*
5445 		 * Lock constituent pages if this is a large page
5446 		 */
5447 		if (pages > 1) {
5448 			/*
5449 			 * Lock all constituents except root page, since it
5450 			 * should be locked already.
5451 			 */
5452 			for (i = 1; i < pages; i++) {
5453 				pp++;
5454 				if (!page_trylock(pp, SE_EXCL)) {
5455 					break;
5456 				}
5457 				if (PP_ISFREE(pp) ||
5458 				    pp->p_szc != pszc) {
5459 					/*
5460 					 * hat_page_demote() raced in with us.
5461 					 */
5462 					ASSERT(!IS_SWAPFSVP(curvp));
5463 					page_unlock(pp);
5464 					break;
5465 				}
5466 				ppa[nlocked] = pp;
5467 				nlocked++;
5468 			}
5469 		}
5470 
5471 		/*
5472 		 * If all constituent pages couldn't be locked,
5473 		 * unlock pages locked so far and skip to next page.
5474 		 */
5475 		if (nlocked != pages) {
5476 			for (i = 0; i < nlocked; i++)
5477 				page_unlock(ppa[i]);
5478 			va += pgsz;
5479 			lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS,
5480 			    btop(pgsz));
5481 			continue;
5482 		}
5483 
5484 		/*
5485 		 * hat_page_demote() can no longer happen,
5486 		 * since the last constituent page had the right p_szc
5487 		 * after all constituent pages were locked. All constituent
5488 		 * pages should now have the same p_szc.
5489 		 */
5490 
5491 		/*
5492 		 * All constituent pages locked successfully, so mark
5493 		 * large page for migration and unload the mappings of
5494 		 * constituent pages, so a fault will occur on any part of the
5495 		 * large page
5496 		 */
5497 		PP_SETMIGRATE(ppa[0]);
5498 		for (i = 0; i < nlocked; i++) {
5499 			pp = ppa[i];
5500 			(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
5501 			ASSERT(hat_page_getshare(pp) == 0);
5502 			page_unlock(pp);
5503 		}
5504 		lgrp_stat_add(from->lgrp_id, LGRP_PMM_PGS, nlocked);
5505 
5506 		va += pgsz;
5507 	}
5508 	kmem_free(ppa, ppa_nentries * sizeof (page_t *));
5509 }
5510 
5511 /*
5512  * Migrate any pages that have been marked for migration in the given range
5513  */
5514 void
5515 page_migrate(
5516 	struct seg	*seg,
5517 	caddr_t		addr,
5518 	page_t		**ppa,
5519 	pgcnt_t		npages)
5520 {
5521 	lgrp_t		*from;
5522 	lgrp_t		*to;
5523 	page_t		*newpp;
5524 	page_t		*pp;
5525 	pfn_t		pfn;
5526 	size_t		pgsz;
5527 	spgcnt_t	page_cnt;
5528 	spgcnt_t	i;
5529 	uint_t		pszc;
5530 
5531 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
5532 
5533 	while (npages > 0) {
5534 		pp = *ppa;
5535 		pszc = pp->p_szc;
5536 		pgsz = page_get_pagesize(pszc);
5537 		page_cnt = btop(pgsz);
5538 
5539 		/*
5540 		 * Check to see whether this page is marked for migration
5541 		 *
5542 		 * Assume that root page of large page is marked for
5543 		 * migration and none of the other constituent pages
5544 		 * are marked.  This really simplifies clearing the
5545 		 * migrate bit by not having to clear it from each
5546 		 * constituent page.
5547 		 *
5548 		 * Note we don't want to relocate an entire large page if
5549 		 * someone is only using one subpage.
5550 		 */
5551 		if (npages < page_cnt)
5552 			break;
5553 
5554 		/*
5555 		 * Is it marked for migration?
5556 		 */
5557 		if (!PP_ISMIGRATE(pp))
5558 			goto next;
5559 
5560 		/*
5561 		 * Determine lgroups that page is being migrated between
5562 		 */
5563 		pfn = page_pptonum(pp);
5564 		if (!IS_P2ALIGNED(pfn, page_cnt)) {
5565 			break;
5566 		}
5567 		from = lgrp_pfn_to_lgrp(pfn);
5568 		to = lgrp_mem_choose(seg, addr, pgsz);
5569 
5570 		/*
5571 		 * Need to get exclusive locks to migrate
5572 		 */
5573 		for (i = 0; i < page_cnt; i++) {
5574 			ASSERT(PAGE_LOCKED(ppa[i]));
5575 			if (page_pptonum(ppa[i]) != pfn + i ||
5576 			    ppa[i]->p_szc != pszc) {
5577 				break;
5578 			}
5579 			if (!page_tryupgrade(ppa[i])) {
5580 				lgrp_stat_add(from->lgrp_id,
5581 				    LGRP_PM_FAIL_LOCK_PGS,
5582 				    page_cnt);
5583 				break;
5584 			}
5585 
5586 			/*
5587 			 * Check to see whether we are trying to migrate
5588 			 * the page to the lgroup where it is already allocated.
5589 			 * If so, clear the migrate bit and skip to next
5590 			 * page.
5591 			 */
5592 			if (i == 0 && to == from) {
5593 				PP_CLRMIGRATE(ppa[0]);
5594 				page_downgrade(ppa[0]);
5595 				goto next;
5596 			}
5597 		}
5598 
5599 		/*
5600 		 * If all constituent pages couldn't be locked,
5601 		 * unlock pages locked so far and skip to next page.
5602 		 */
5603 		if (i != page_cnt) {
5604 			while (--i != -1) {
5605 				page_downgrade(ppa[i]);
5606 			}
5607 			goto next;
5608 		}
5609 
5610 		(void) page_create_wait(page_cnt, PG_WAIT);
5611 		newpp = page_get_replacement_page(pp, to, PGR_SAMESZC);
5612 		if (newpp == NULL) {
5613 			page_create_putback(page_cnt);
5614 			for (i = 0; i < page_cnt; i++) {
5615 				page_downgrade(ppa[i]);
5616 			}
5617 			lgrp_stat_add(to->lgrp_id, LGRP_PM_FAIL_ALLOC_PGS,
5618 			    page_cnt);
5619 			goto next;
5620 		}
5621 		ASSERT(newpp->p_szc == pszc);
5622 		/*
5623 		 * Clear migrate bit and relocate page
5624 		 */
5625 		PP_CLRMIGRATE(pp);
5626 		if (page_relocate(&pp, &newpp, 0, 1, &page_cnt, to)) {
5627 			panic("page_migrate: page_relocate failed");
5628 		}
5629 		ASSERT(page_cnt * PAGESIZE == pgsz);
5630 
5631 		/*
5632 		 * Keep stats for number of pages migrated from and to
5633 		 * each lgroup
5634 		 */
5635 		lgrp_stat_add(from->lgrp_id, LGRP_PM_SRC_PGS, page_cnt);
5636 		lgrp_stat_add(to->lgrp_id, LGRP_PM_DEST_PGS, page_cnt);
5637 		/*
5638 		 * Update the page_t array we were passed in and
5639 		 * unlink constituent pages of a large page.
5640 		 */
5641 		for (i = 0; i < page_cnt; ++i, ++pp) {
5642 			ASSERT(PAGE_EXCL(newpp));
5643 			ASSERT(newpp->p_szc == pszc);
5644 			ppa[i] = newpp;
5645 			pp = newpp;
5646 			page_sub(&newpp, pp);
5647 			page_downgrade(pp);
5648 		}
5649 		ASSERT(newpp == NULL);
5650 next:
5651 		addr += pgsz;
5652 		ppa += page_cnt;
5653 		npages -= page_cnt;
5654 	}
5655 }
5656 
5657 ulong_t mem_waiters 	= 0;
5658 ulong_t	max_count 	= 20;
5659 #define	MAX_DELAY	0x1ff
5660 
5661 /*
5662  * Check if enough memory is available to proceed.
5663  * Depending on the system configuration and how much memory is
5664  * reserved for swap, we need to check against two variables.
5665  * E.g., on systems with little physical swap, availrmem can be
5666  * a more reliable indicator of how much memory is available.
5667  * On systems with a large physical swap, freemem can be the better one.
5668  * If freemem drops below the threshold level, don't return an error
5669  * immediately but wake up pageout to free memory and block.
5670  * This is done a number of times. If pageout is not able to free
5671  * memory within a certain time, return an error.
5672  * The same applies to availrmem, but kmem_reap is used to
5673  * free memory.
5674  */
5675 int
5676 page_mem_avail(pgcnt_t npages)
5677 {
5678 	ulong_t count;
5679 
5680 #if defined(__i386)
5681 	if (freemem > desfree + npages &&
5682 	    availrmem > swapfs_reserve + npages &&
5683 	    btop(vmem_size(heap_arena, VMEM_FREE)) > tune.t_minarmem +
5684 	    npages)
5685 		return (1);
5686 #else
5687 	if (freemem > desfree + npages &&
5688 	    availrmem > swapfs_reserve + npages)
5689 		return (1);
5690 #endif
5691 
5692 	count = max_count;
5693 	atomic_add_long(&mem_waiters, 1);
5694 
5695 	while (freemem < desfree + npages && --count) {
5696 		cv_signal(&proc_pageout->p_cv);
5697 		if (delay_sig(hz + (mem_waiters & MAX_DELAY))) {
5698 			atomic_add_long(&mem_waiters, -1);
5699 			return (0);
5700 		}
5701 	}
5702 	if (count == 0) {
5703 		atomic_add_long(&mem_waiters, -1);
5704 		return (0);
5705 	}
5706 
5707 	count = max_count;
5708 	while (availrmem < swapfs_reserve + npages && --count) {
5709 		kmem_reap();
5710 		if (delay_sig(hz + (mem_waiters & MAX_DELAY))) {
5711 			atomic_add_long(&mem_waiters, -1);
5712 			return (0);
5713 		}
5714 	}
5715 	atomic_add_long(&mem_waiters, -1);
5716 	if (count == 0)
5717 		return (0);
5718 
5719 #if defined(__i386)
5720 	if (btop(vmem_size(heap_arena, VMEM_FREE)) <
5721 	    tune.t_minarmem + npages)
5722 		return (0);
5723 #endif
5724 	return (1);
5725 }
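
/*
 * Example (editor's sketch, not part of the original source): a
 * hypothetical allocation path backing off when memory is short:
 *
 *	if (!page_mem_avail(npages))
 *		return (EAGAIN);
 *	... proceed with the allocation ...
 *
 * Note the routine may block for up to max_count delays per check and
 * returns 0 early if interrupted by a signal (delay_sig).
 */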
5726 
5727 #define	MAX_CNT	60	/* max num of iterations */
5728 /*
5729  * Reclaim/reserve availrmem for npages.
5730  * If there is not enough memory, start reaping the seg and kmem caches.
5731  * Start the pageout scanner (via page_needfree()).
5732  * Exit after ~MAX_CNT seconds regardless of how much memory was released.
5733  * Note: There is no guarantee that any availrmem will be freed, as
5734  * this memory typically is locked (kernel heap) or reserved for swap.
5735  * Also, due to memory fragmentation, the kmem allocator may not be able
5736  * to free any memory (a single user-allocated buffer can prevent a
5737  * slab or a page from being freed).
5738  */
5739 int
5740 page_reclaim_mem(pgcnt_t npages, pgcnt_t epages, int adjust)
5741 {
5742 	int	i = 0;
5743 	int	ret = 0;
5744 	pgcnt_t	deficit;
5745 	pgcnt_t old_availrmem;
5746 
5747 	mutex_enter(&freemem_lock);
5748 	old_availrmem = availrmem - 1;
5749 	while ((availrmem < tune.t_minarmem + npages + epages) &&
5750 	    (old_availrmem < availrmem) && (i++ < MAX_CNT)) {
5751 		old_availrmem = availrmem;
5752 		deficit = tune.t_minarmem + npages + epages - availrmem;
5753 		mutex_exit(&freemem_lock);
5754 		page_needfree(deficit);
5755 		kmem_reap();
5756 		delay(hz);
5757 		page_needfree(-(spgcnt_t)deficit);
5758 		mutex_enter(&freemem_lock);
5759 	}
5760 
5761 	if (adjust && (availrmem >= tune.t_minarmem + npages + epages)) {
5762 		availrmem -= npages;
5763 		ret = 1;
5764 	}
5765 
5766 	mutex_exit(&freemem_lock);
5767 
5768 	return (ret);
5769 }
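
/*
 * Example (editor's sketch, not part of the original source): a
 * hypothetical caller reserving npages (with epages of extra headroom)
 * and undoing the reservation later:
 *
 *	if (!page_reclaim_mem(npages, epages, 1))
 *		return (ENOMEM);
 *	... availrmem has been reduced by npages ...
 *	page_unresv(npages);
 *
 * With adjust == 0 the routine only attempts to free up memory;
 * availrmem is left untouched and 0 is returned.
 */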
5770 
5771 /*
5772  * Search the memory segments to locate the desired page.  Within a
5773  * segment, pages increase linearly with one page structure per
5774  * physical page frame (size PAGESIZE).  The search begins
5775  * with the segment that was accessed last, to take advantage of locality.
5776  * If the hint misses, we start from the beginning of the sorted memseg list.
5777  */
5778 
5779 
5780 /*
5781  * Some data structures for pfn to pp lookup.
5782  */
5783 ulong_t mhash_per_slot;
5784 struct memseg *memseg_hash[N_MEM_SLOTS];
5785 
5786 page_t *
5787 page_numtopp_nolock(pfn_t pfnum)
5788 {
5789 	struct memseg *seg;
5790 	page_t *pp;
5791 	vm_cpu_data_t *vc = CPU->cpu_vm_data;
5792 
5793 	ASSERT(vc != NULL);
5794 
5795 	MEMSEG_STAT_INCR(nsearch);
5796 
5797 	/* Try last winner first */
5798 	if (((seg = vc->vc_pnum_memseg) != NULL) &&
5799 	    (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
5800 		MEMSEG_STAT_INCR(nlastwon);
5801 		pp = seg->pages + (pfnum - seg->pages_base);
5802 		if (pp->p_pagenum == pfnum)
5803 			return ((page_t *)pp);
5804 	}
5805 
5806 	/* Else try the hash */
5807 	if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) &&
5808 	    (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
5809 		MEMSEG_STAT_INCR(nhashwon);
5810 		vc->vc_pnum_memseg = seg;
5811 		pp = seg->pages + (pfnum - seg->pages_base);
5812 		if (pp->p_pagenum == pfnum)
5813 			return ((page_t *)pp);
5814 	}
5815 
5816 	/* Else brute force */
5817 	for (seg = memsegs; seg != NULL; seg = seg->next) {
5818 		if (pfnum >= seg->pages_base && pfnum < seg->pages_end) {
5819 			vc->vc_pnum_memseg = seg;
5820 			pp = seg->pages + (pfnum - seg->pages_base);
5821 			return ((page_t *)pp);
5822 		}
5823 	}
5824 	vc->vc_pnum_memseg = NULL;
5825 	MEMSEG_STAT_INCR(nnotfound);
5826 	return ((page_t *)NULL);
5827 
5828 }
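
#ifdef	PAGE_EXAMPLES	/* illustrative sketch only; never defined */
/*
 * Hypothetical use of page_numtopp_nolock(): translate a pfn to its
 * page_t.  The _nolock variant gives no stability guarantee, so a
 * real consumer must tolerate a NULL return (pfn in no memseg) and
 * must lock the page before relying on it.
 */
static page_t *
example_pfn_lookup(pfn_t pfn)
{
	page_t *pp;

	if ((pp = page_numtopp_nolock(pfn)) == NULL)
		return (NULL);		/* not managed by the VM system */
	if (!page_trylock(pp, SE_SHARED))
		return (NULL);		/* caller would retry or bail */
	return (pp);			/* caller must page_unlock(pp) */
}
#endif	/* PAGE_EXAMPLES */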
5829 
5830 struct memseg *
5831 page_numtomemseg_nolock(pfn_t pfnum)
5832 {
5833 	struct memseg *seg;
5834 	page_t *pp;
5835 
5836 	/* Try hash */
5837 	if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) &&
5838 	    (pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
5839 		pp = seg->pages + (pfnum - seg->pages_base);
5840 		if (pp->p_pagenum == pfnum)
5841 			return (seg);
5842 	}
5843 
5844 	/* Else Brute force */
5845 	for (seg = memsegs; seg != NULL; seg = seg->next) {
5846 		if (pfnum >= seg->pages_base && pfnum < seg->pages_end) {
5847 			return (seg);
5848 		}
5849 	}
5850 	return ((struct memseg *)NULL);
5851 }
5852 
5853 /*
5854  * Given a page and a count, return the page struct that is
5855  * n structs away from the current one in the global page
5856  * list.
5857  *
5858  * This function wraps to the first page upon
5859  * reaching the end of the memseg list.
5860  */
5861 page_t *
5862 page_nextn(page_t *pp, ulong_t n)
5863 {
5864 	struct memseg *seg;
5865 	page_t *ppn;
5866 	vm_cpu_data_t *vc = (vm_cpu_data_t *)CPU->cpu_vm_data;
5867 
5868 	ASSERT(vc != NULL);
5869 
5870 	if (((seg = vc->vc_pnext_memseg) == NULL) ||
5871 	    (seg->pages_base == seg->pages_end) ||
5872 	    !(pp >= seg->pages && pp < seg->epages)) {
5873 
5874 		for (seg = memsegs; seg; seg = seg->next) {
5875 			if (pp >= seg->pages && pp < seg->epages)
5876 				break;
5877 		}
5878 
5879 		if (seg == NULL) {
5880 			/* Memory delete got in, return something valid. */
5881 			/* TODO: fix me. */
5882 			seg = memsegs;
5883 			pp = seg->pages;
5884 		}
5885 	}
5886 
5887 	/* check for wraparound - possible if n is large */
5888 	while ((ppn = (pp + n)) >= seg->epages || ppn < pp) {
5889 		n -= seg->epages - pp;
5890 		seg = seg->next;
5891 		if (seg == NULL)
5892 			seg = memsegs;
5893 		pp = seg->pages;
5894 	}
5895 	vc->vc_pnext_memseg = seg;
5896 	return (ppn);
5897 }
5898 
5899 /*
5900  * Initialize for a loop using page_next_scan_large().
5901  */
5902 page_t *
5903 page_next_scan_init(void **cookie)
5904 {
5905 	ASSERT(cookie != NULL);
5906 	*cookie = (void *)memsegs;
5907 	return ((page_t *)memsegs->pages);
5908 }
5909 
5910 /*
5911  * Return the next page in a scan of page_t's, assuming we want
5912  * to skip over sub-pages within larger page sizes.
5913  *
5914  * The cookie is used to keep track of the current memseg.
5915  */
5916 page_t *
5917 page_next_scan_large(
5918 	page_t		*pp,
5919 	ulong_t		*n,
5920 	void		**cookie)
5921 {
5922 	struct memseg	*seg = (struct memseg *)*cookie;
5923 	page_t		*new_pp;
5924 	ulong_t		cnt;
5925 	pfn_t		pfn;
5926 
5927 
5928 	/*
5929 	 * get the count of page_t's to skip based on the page size
5930 	 */
5931 	ASSERT(pp != NULL);
5932 	if (pp->p_szc == 0) {
5933 		cnt = 1;
5934 	} else {
5935 		pfn = page_pptonum(pp);
5936 		cnt = page_get_pagecnt(pp->p_szc);
5937 		cnt -= pfn & (cnt - 1);
5938 	}
5939 	*n += cnt;
5940 	new_pp = pp + cnt;
5941 
5942 	/*
5943 	 * Catch if we went past the end of the current memory segment. If so,
5944 	 * just move to the next segment with pages.
5945 	 */
5946 	if (new_pp >= seg->epages) {
5947 		do {
5948 			seg = seg->next;
5949 			if (seg == NULL)
5950 				seg = memsegs;
5951 		} while (seg->pages == seg->epages);
5952 		new_pp = seg->pages;
5953 		*cookie = (void *)seg;
5954 	}
5955 
5956 	return (new_pp);
5957 }
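
#ifdef	PAGE_EXAMPLES	/* illustrative sketch only; never defined */
/*
 * Hypothetical scan loop showing how page_next_scan_init() and
 * page_next_scan_large() are meant to be driven together: one page_t
 * is visited per large page, and the pass ends once the skip counts
 * accumulated in n cover total_pages.
 */
static void
example_scan_pass(void)
{
	void	*cookie;
	page_t	*pp;
	ulong_t	n = 0;

	pp = page_next_scan_init(&cookie);
	while (n < total_pages) {
		/* ... examine pp, the root page of this large page ... */
		pp = page_next_scan_large(pp, &n, &cookie);
	}
}
#endif	/* PAGE_EXAMPLES */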
5958 
5959 
5960 /*
5961  * Returns the next page in the list.  Note: this function wraps
5962  * to the first page in the list upon reaching the end
5963  * of the list.  Callers should be aware of this fact.
5964  */
5965 
5966 /* We should change this to be a #define */
5967 
5968 page_t *
5969 page_next(page_t *pp)
5970 {
5971 	return (page_nextn(pp, 1));
5972 }
5973 
5974 page_t *
5975 page_first()
5976 {
5977 	return ((page_t *)memsegs->pages);
5978 }
5979 
5980 
5981 /*
5982  * This routine is called at boot with the initial memory configuration
5983  * and when memory is added or removed.
5984  */
5985 void
5986 build_pfn_hash()
5987 {
5988 	pfn_t cur;
5989 	pgcnt_t index;
5990 	struct memseg *pseg;
5991 	int	i;
5992 
5993 	/*
5994 	 * Clear memseg_hash array.
5995 	 * Since memory add/delete is designed to operate concurrently
5996 	 * with normal operation, the hash rebuild must be able to run
5997 	 * concurrently with page_numtopp_nolock(). To support this
5998 	 * functionality, assignments to memseg_hash array members must
5999 	 * be done atomically.
6000 	 *
6001 	 * NOTE: bzero() does not currently guarantee this for kernel
6002 	 * threads, and cannot be used here.
6003 	 */
6004 	for (i = 0; i < N_MEM_SLOTS; i++)
6005 		memseg_hash[i] = NULL;
6006 
6007 	hat_kpm_mseghash_clear(N_MEM_SLOTS);
6008 
6009 	/*
6010 	 * Physmax is the last valid pfn.
6011 	 */
6012 	mhash_per_slot = (physmax + 1) >> MEM_HASH_SHIFT;
6013 	for (pseg = memsegs; pseg != NULL; pseg = pseg->next) {
6014 		index = MEMSEG_PFN_HASH(pseg->pages_base);
6015 		cur = pseg->pages_base;
6016 		do {
6017 			if (index >= N_MEM_SLOTS)
6018 				index = MEMSEG_PFN_HASH(cur);
6019 
6020 			if (memseg_hash[index] == NULL ||
6021 			    memseg_hash[index]->pages_base > pseg->pages_base) {
6022 				memseg_hash[index] = pseg;
6023 				hat_kpm_mseghash_update(index, pseg);
6024 			}
6025 			cur += mhash_per_slot;
6026 			index++;
6027 		} while (cur < pseg->pages_end);
6028 	}
6029 }
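
/*
 * Worked example of the slot arithmetic above (the actual values of
 * MEM_HASH_SHIFT/N_MEM_SLOTS live in the header that defines
 * MEMSEG_PFN_HASH; a shift of 9 and 512 slots are assumed here):
 * with physmax = 0x7ffff (2GB of 4K pages), mhash_per_slot is
 * 0x80000 >> 9 = 1024 pfns per slot, so a pfn of 0x12345 (74565)
 * lands in slot 74565 / 1024 = 72.  Each memseg is entered into every
 * slot its pfn range spans, and when two memsegs cover the same slot
 * the one with the lower pages_base is kept.
 */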
6030 
6031 /*
6032  * Return the pagenum for the pp
6033  */
6034 pfn_t
6035 page_pptonum(page_t *pp)
6036 {
6037 	return (pp->p_pagenum);
6038 }
6039 
6040 /*
6041  * Interface to the referenced, modified, etc. bits
6042  * in the PSM part of the page struct
6043  * when no locking is desired.
6044  */
6045 void
6046 page_set_props(page_t *pp, uint_t flags)
6047 {
6048 	ASSERT((flags & ~(P_MOD | P_REF | P_RO)) == 0);
6049 	pp->p_nrm |= (uchar_t)flags;
6050 }
6051 
6052 void
6053 page_clr_all_props(page_t *pp)
6054 {
6055 	pp->p_nrm = 0;
6056 }
6057 
6058 /*
6059  * Clear p_lckcnt and p_cowcnt, adjusting freemem if required.
6060  */
6061 int
6062 page_clear_lck_cow(page_t *pp, int adjust)
6063 {
6064 	int	f_amount;
6065 
6066 	ASSERT(PAGE_EXCL(pp));
6067 
6068 	/*
6069 	 * The page_struct_lock need not be acquired here since
6070 	 * we require that the caller hold the page exclusively locked.
6071 	 */
6072 	f_amount = 0;
6073 	if (pp->p_lckcnt) {
6074 		f_amount = 1;
6075 		pp->p_lckcnt = 0;
6076 	}
6077 	if (pp->p_cowcnt) {
6078 		f_amount += pp->p_cowcnt;
6079 		pp->p_cowcnt = 0;
6080 	}
6081 
6082 	if (adjust && f_amount) {
6083 		mutex_enter(&freemem_lock);
6084 		availrmem += f_amount;
6085 		mutex_exit(&freemem_lock);
6086 	}
6087 
6088 	return (f_amount);
6089 }
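
/*
 * Worked example of the accounting above: a page with p_lckcnt == 3
 * and p_cowcnt == 2 yields f_amount = 1 + 2 = 3, since availrmem is
 * only charged once for any number of plain locks but once per COW
 * reservation.  With adjust set, those 3 pages go back to availrmem.
 */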
6090 
6091 /*
6092  * The following function is called from free_vp_pages()
6093  * for an inexact estimate of the share count of a newly freed page.
6094  */
6095 ulong_t
6096 page_share_cnt(page_t *pp)
6097 {
6098 	return (hat_page_getshare(pp));
6099 }
6100 
6101 int
6102 page_isshared(page_t *pp)
6103 {
6104 	return (hat_page_checkshare(pp, 1));
6105 }
6106 
6107 int
6108 page_isfree(page_t *pp)
6109 {
6110 	return (PP_ISFREE(pp));
6111 }
6112 
6113 int
6114 page_isref(page_t *pp)
6115 {
6116 	return (hat_page_getattr(pp, P_REF));
6117 }
6118 
6119 int
6120 page_ismod(page_t *pp)
6121 {
6122 	return (hat_page_getattr(pp, P_MOD));
6123 }
6124 
6125 /*
6126  * The following code all currently relates to the page capture logic:
6127  *
6128  * This logic is used for cases where there is a desire to claim a certain
6129  * physical page in the system for the caller.  As it may not be possible
6130  * to capture the page immediately, the p_toxic bits are used in the page
6131  * structure to indicate that someone wants to capture this page.  When the
6132  * page gets unlocked, the toxic flag will be noted and an attempt to capture
6133  * the page will be made.  If it is successful, the original caller's
6134  * callback will be called with the page to do with it what they please.
6135  *
6136  * There is also an async thread which wakes up occasionally to attempt
6137  * to capture pages which have the capture bit set.  All of the pages which
6138  * need to be captured asynchronously have been inserted into the
6139  * page_capture_hash and thus this thread walks that hash list.  Items in
6140  * the hash have an expiration time, so this thread also removes items
6141  * from the hash once they have expired.
6142  *
6143  * Some important things to note are:
6144  * - if the PR_CAPTURE bit is set on a page, then the page is in the
6145  *   page_capture_hash.  The page_capture_hash_head.pchh_mutex is needed
6146  *   to set and clear this bit, and entries can be added to or removed
6147  *   from the hash only while this lock is held.
6148  * - the PR_CAPTURE bit can only be set and cleared while holding the
6149  *   page_capture_hash_head.pchh_mutex.
6150  * - the t_flag field of the thread struct is used with the T_CAPTURING
6151  *   flag to prevent recursion while dealing with large pages.
6152  * - pages which need to be retired never expire on the page_capture_hash.
6153  */
6154 
6155 static void page_capture_thread(void);
6156 static kthread_t *pc_thread_id;
6157 kcondvar_t pc_cv;
6158 static kmutex_t pc_thread_mutex;
6159 static clock_t pc_thread_shortwait;
6160 static clock_t pc_thread_longwait;
6161 static int pc_thread_retry;
6162 
6163 struct page_capture_callback pc_cb[PC_NUM_CALLBACKS];
6164 
6165 /* Note that this is a circular linked list */
6166 typedef struct page_capture_hash_bucket {
6167 	page_t *pp;
6168 	uint_t szc;
6169 	uint_t flags;
6170 	clock_t expires;	/* lbolt at which this request expires. */
6171 	void *datap;		/* Cached data passed in for callback */
6172 	struct page_capture_hash_bucket *next;
6173 	struct page_capture_hash_bucket *prev;
6174 } page_capture_hash_bucket_t;
6175 
6176 /*
6177  * Each hash bucket has its own mutex and two lists, which are:
6178  * active (0):	represents requests which have not been processed by
6179  *		the page_capture async thread yet.
6180  * walked (1):	represents requests which have been processed by the
6181  *		page_capture async thread within its given walk of this bucket.
6182  *
6183  * These are all needed so that we can synchronize all async page_capture
6184  * events.  When the async thread moves to a new bucket, it will append the
6185  * walked list to the active list and walk each item one at a time, moving it
6186  * from the active list to the walked list.  Thus if there is an async request
6187  * outstanding for a given page, it will always be in one of the two lists.
6188  * New requests will always be added to the active list.
6189  * If we are not able to capture a page before the request expires, we free
6190  * up the request structure, which indicates to page_capture that there is
6191  * no longer a need for the given page, and we clear the PR_CAPTURE flag if
6192  * possible.
6193  */
6194 typedef struct page_capture_hash_head {
6195 	kmutex_t pchh_mutex;
6196 	uint_t num_pages;
6197 	page_capture_hash_bucket_t lists[2]; /* sentinel nodes */
6198 } page_capture_hash_head_t;
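
/*
 * Sketch of one async pass over a bucket, under the scheme described
 * above:
 *
 *	pass start:	lists[1] (walked) is appended onto lists[0]
 *			(active), leaving lists[1] empty
 *	per request:	on a successful capture the entry is freed;
 *			otherwise it is parked on lists[1], so the walk
 *			makes progress even though it restarts from the
 *			head of lists[0] after dropping pchh_mutex
 *	new arrivals:	always prepended to lists[0]
 */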
6199 
6200 #ifdef DEBUG
6201 #define	NUM_PAGE_CAPTURE_BUCKETS 4
6202 #else
6203 #define	NUM_PAGE_CAPTURE_BUCKETS 64
6204 #endif
6205 
6206 page_capture_hash_head_t page_capture_hash[NUM_PAGE_CAPTURE_BUCKETS];
6207 
6208 /* for now, a simple hash of the page_t address (>> 7 ~ sizeof (page_t)) */
6209 #define	PAGE_CAPTURE_HASH(pp)	\
6210 	((int)(((uintptr_t)pp >> 7) & (NUM_PAGE_CAPTURE_BUCKETS - 1)))
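
/*
 * e.g. with the non-DEBUG value of 64 buckets, a (made up) page_t
 * address of 0x70001234500 hashes to ((0x70001234500 >> 7) & 0x3f),
 * i.e. (0xe0002468a & 0x3f) = bucket 10.  The shift of 7 roughly
 * matches sizeof (page_t), so neighboring page_t's tend to spread
 * across different buckets.
 */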
6211 
6212 extern pgcnt_t swapfs_minfree;
6213 
6214 int page_trycapture(page_t *pp, uint_t szc, uint_t flags, void *datap);
6215 
6216 /*
6217  * A callback function is required for page capture requests.
6218  */
6219 void
6220 page_capture_register_callback(uint_t index, clock_t duration,
6221     int (*cb_func)(page_t *, void *, uint_t))
6222 {
6223 	ASSERT(pc_cb[index].cb_active == 0);
6224 	ASSERT(cb_func != NULL);
6225 	rw_enter(&pc_cb[index].cb_rwlock, RW_WRITER);
6226 	pc_cb[index].duration = duration;
6227 	pc_cb[index].cb_func = cb_func;
6228 	pc_cb[index].cb_active = 1;
6229 	rw_exit(&pc_cb[index].cb_rwlock);
6230 }
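
#ifdef	PAGE_EXAMPLES	/* illustrative sketch only; never defined */
/*
 * Hypothetical consumer flow for these capture interfaces.  The
 * callback slot and the 10 second expiration are made up; real
 * consumers have fixed slots assigned in the header.  Note that the
 * flag bit is 1 << index, which is how the code below recovers the
 * callback index from the flags.
 */
#define	EXAMPLE_CB_INDEX	1	/* hypothetical, < PC_NUM_CALLBACKS */

/*ARGSUSED*/
static int
example_capture_cb(page_t *pp, void *datap, uint_t flags)
{
	/* the page arrives EXCL locked and cleaned; consume or free it */
	page_free(pp, 1);
	return (0);
}

static void
example_capture(page_t *pp)
{
	/* done once at setup time */
	page_capture_register_callback(EXAMPLE_CB_INDEX, 10 * hz,
	    example_capture_cb);

	/*
	 * Either captures pp now (invoking example_capture_cb) or queues
	 * the request on page_capture_hash for the async thread.
	 */
	(void) page_trycapture(pp, 0, 1 << EXAMPLE_CB_INDEX, NULL);
}
#endif	/* PAGE_EXAMPLES */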
6231 
6232 void
6233 page_capture_unregister_callback(uint_t index)
6234 {
6235 	int i, j;
6236 	struct page_capture_hash_bucket *bp1;
6237 	struct page_capture_hash_bucket *bp2;
6238 	struct page_capture_hash_bucket *head = NULL;
6239 	uint_t flags = (1 << index);
6240 
6241 	rw_enter(&pc_cb[index].cb_rwlock, RW_WRITER);
6242 	ASSERT(pc_cb[index].cb_active == 1);
6243 	pc_cb[index].duration = 0;	/* Paranoia */
6244 	pc_cb[index].cb_func = NULL;	/* Paranoia */
6245 	pc_cb[index].cb_active = 0;
6246 	rw_exit(&pc_cb[index].cb_rwlock);
6247 
6248 	/*
6249 	 * Just move all the entries to a private list which we can walk
6250 	 * through without the need to hold any locks.
6251 	 * No more requests can get added to the hash lists for this consumer
6252 	 * as the cb_active field for the callback has been cleared.
6253 	 */
6254 	for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
6255 		mutex_enter(&page_capture_hash[i].pchh_mutex);
6256 		for (j = 0; j < 2; j++) {
6257 			bp1 = page_capture_hash[i].lists[j].next;
6258 			/* walk through all but first (sentinel) element */
6259 			while (bp1 != &page_capture_hash[i].lists[j]) {
6260 				bp2 = bp1;
6261 				if (bp2->flags & flags) {
6262 					bp1 = bp2->next;
6263 					bp1->prev = bp2->prev;
6264 					bp2->prev->next = bp1;
6265 					bp2->next = head;
6266 					head = bp2;
6267 					/*
6268 					 * Clear the PR_CAPTURE bit as we
6269 					 * hold appropriate locks here.
6270 					 */
6271 					page_clrtoxic(head->pp, PR_CAPTURE);
6272 					page_capture_hash[i].num_pages--;
6273 					continue;
6274 				}
6275 				bp1 = bp1->next;
6276 			}
6277 		}
6278 		mutex_exit(&page_capture_hash[i].pchh_mutex);
6279 	}
6280 
6281 	while (head != NULL) {
6282 		bp1 = head;
6283 		head = head->next;
6284 		kmem_free(bp1, sizeof (*bp1));
6285 	}
6286 }
6287 
6288 
6289 /*
6290  * Find pp in the active list and move it to the walked list if it
6291  * exists.
6292  * Note that most often pp should be at the front of the active list,
6293  * as that is how it is currently used, so no further optimization is
6294  * attempted on this linked list data structure.
6295  * Returns 1 on a successful move or 0 if the page could not be found.
6296  */
6297 static int
6298 page_capture_move_to_walked(page_t *pp)
6299 {
6300 	page_capture_hash_bucket_t *bp;
6301 	int index;
6302 
6303 	index = PAGE_CAPTURE_HASH(pp);
6304 
6305 	mutex_enter(&page_capture_hash[index].pchh_mutex);
6306 	bp = page_capture_hash[index].lists[0].next;
6307 	while (bp != &page_capture_hash[index].lists[0]) {
6308 		if (bp->pp == pp) {
6309 			/* Remove from old list */
6310 			bp->next->prev = bp->prev;
6311 			bp->prev->next = bp->next;
6312 
6313 			/* Add to new list */
6314 			bp->next = page_capture_hash[index].lists[1].next;
6315 			bp->prev = &page_capture_hash[index].lists[1];
6316 			page_capture_hash[index].lists[1].next = bp;
6317 			bp->next->prev = bp;
6318 			mutex_exit(&page_capture_hash[index].pchh_mutex);
6319 
6320 			return (1);
6321 		}
6322 		bp = bp->next;
6323 	}
6324 	mutex_exit(&page_capture_hash[index].pchh_mutex);
6325 	return (0);
6326 }
6327 
6328 /*
6329  * Add a new entry to the page capture hash.  The only case where a new
6330  * entry is not added is when the page capture consumer is no longer registered.
6331  * In this case, we'll silently not add the page to the hash.  We know that
6332  * page retire will always be registered for the case where we are currently
6333  * unretiring a page and thus there are no conflicts.
6334  */
6335 static void
6336 page_capture_add_hash(page_t *pp, uint_t szc, uint_t flags, void *datap)
6337 {
6338 	page_capture_hash_bucket_t *bp1;
6339 	page_capture_hash_bucket_t *bp2;
6340 	int index;
6341 	int cb_index;
6342 	int i;
6343 #ifdef DEBUG
6344 	page_capture_hash_bucket_t *tp1;
6345 	int l;
6346 #endif
6347 
6348 	ASSERT(!(flags & CAPTURE_ASYNC));
6349 
6350 	bp1 = kmem_alloc(sizeof (struct page_capture_hash_bucket), KM_SLEEP);
6351 
6352 	bp1->pp = pp;
6353 	bp1->szc = szc;
6354 	bp1->flags = flags;
6355 	bp1->datap = datap;
6356 
6357 	for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) {
6358 		if ((flags >> cb_index) & 1) {
6359 			break;
6360 		}
6361 	}
6362 
6363 	ASSERT(cb_index != PC_NUM_CALLBACKS);
6364 
6365 	rw_enter(&pc_cb[cb_index].cb_rwlock, RW_READER);
6366 	if (pc_cb[cb_index].cb_active) {
6367 		if (pc_cb[cb_index].duration == -1) {
6368 			bp1->expires = (clock_t)-1;
6369 		} else {
6370 			bp1->expires = lbolt + pc_cb[cb_index].duration;
6371 		}
6372 	} else {
6373 		/* There's no callback registered so don't add to the hash */
6374 		rw_exit(&pc_cb[cb_index].cb_rwlock);
6375 		kmem_free(bp1, sizeof (*bp1));
6376 		return;
6377 	}
6378 
6379 	index = PAGE_CAPTURE_HASH(pp);
6380 
6381 	/*
6382 	 * Only allow capture flag to be modified under this mutex.
6383 	 * Prevents multiple entries for same page getting added.
6384 	 */
6385 	mutex_enter(&page_capture_hash[index].pchh_mutex);
6386 
6387 	/*
6388 	 * if not already on the hash, set capture bit and add to the hash
6389 	 */
6390 	if (!(pp->p_toxic & PR_CAPTURE)) {
6391 #ifdef DEBUG
6392 		/* Check for duplicate entries */
6393 		for (l = 0; l < 2; l++) {
6394 			tp1 = page_capture_hash[index].lists[l].next;
6395 			while (tp1 != &page_capture_hash[index].lists[l]) {
6396 				if (tp1->pp == pp) {
6397 					panic("page pp 0x%p already on hash "
6398 					    "at 0x%p\n", pp, tp1);
6399 				}
6400 				tp1 = tp1->next;
6401 			}
6402 		}
6403 
6404 #endif
6405 		page_settoxic(pp, PR_CAPTURE);
6406 		bp1->next = page_capture_hash[index].lists[0].next;
6407 		bp1->prev = &page_capture_hash[index].lists[0];
6408 		bp1->next->prev = bp1;
6409 		page_capture_hash[index].lists[0].next = bp1;
6410 		page_capture_hash[index].num_pages++;
6411 		if (flags & CAPTURE_RETIRE) {
6412 			page_retire_incr_pend_count();
6413 		}
6414 		mutex_exit(&page_capture_hash[index].pchh_mutex);
6415 		rw_exit(&pc_cb[cb_index].cb_rwlock);
6416 		cv_signal(&pc_cv);
6417 		return;
6418 	}
6419 
6420 	/*
6421 	 * A page retire request will replace any other request.
6422 	 * A second physmem request which is for a different process than
6423 	 * the currently registered one will be dropped as there is
6424 	 * no way to hold the private data for both calls.
6425 	 * In the future, once there are more callers, this will have to
6426  * be worked out better, as there needs to be private storage for
6427  * at least each type of caller (maybe have datap be an array of
6428  * void *'s so that we can index based upon the caller's index).
6429 	 */
6430 
6431 	/* walk hash list to update expire time */
6432 	for (i = 0; i < 2; i++) {
6433 		bp2 = page_capture_hash[index].lists[i].next;
6434 		while (bp2 != &page_capture_hash[index].lists[i]) {
6435 			if (bp2->pp == pp) {
6436 				if (flags & CAPTURE_RETIRE) {
6437 					if (!(bp2->flags & CAPTURE_RETIRE)) {
6438 						page_retire_incr_pend_count();
6439 						bp2->flags = flags;
6440 						bp2->expires = bp1->expires;
6441 						bp2->datap = datap;
6442 					}
6443 				} else {
6444 					ASSERT(flags & CAPTURE_PHYSMEM);
6445 					if (!(bp2->flags & CAPTURE_RETIRE) &&
6446 					    (datap == bp2->datap)) {
6447 						bp2->expires = bp1->expires;
6448 					}
6449 				}
6450 				mutex_exit(&page_capture_hash[index].
6451 				    pchh_mutex);
6452 				rw_exit(&pc_cb[cb_index].cb_rwlock);
6453 				kmem_free(bp1, sizeof (*bp1));
6454 				return;
6455 			}
6456 			bp2 = bp2->next;
6457 		}
6458 	}
6459 
6460 	/*
6461 	 * The PR_CAPTURE flag is protected by the page_capture_hash mutexes,
6462 	 * and thus it is either set or not set and cannot change
6463 	 * while the mutex above is held.
6464 	 */
6465 	panic("page_capture_add_hash, PR_CAPTURE flag set on pp %p\n", pp);
6466 }
6467 
6468 /*
6469  * We have a page in our hands, so let's try to make it ours by turning
6470  * it into a clean page, as if it had just come off the freelists.
6471  *
6472  * Returns 0 on success, with the page still EXCL locked.
6473  * On failure, the page will be unlocked and EAGAIN is returned.
6474  */
6475 static int
6476 page_capture_clean_page(page_t *pp)
6477 {
6478 	page_t *newpp;
6479 	int skip_unlock = 0;
6480 	spgcnt_t count;
6481 	page_t *tpp;
6482 	int ret = 0;
6483 	int extra;
6484 
6485 	ASSERT(PAGE_EXCL(pp));
6486 	ASSERT(!PP_RETIRED(pp));
6487 	ASSERT(curthread->t_flag & T_CAPTURING);
6488 
6489 	if (PP_ISFREE(pp)) {
6490 		if (!page_reclaim(pp, NULL)) {
6491 			skip_unlock = 1;
6492 			ret = EAGAIN;
6493 			goto cleanup;
6494 		}
6495 		ASSERT(pp->p_szc == 0);
6496 		if (pp->p_vnode != NULL) {
6497 			/*
6498 			 * Since this page came from the
6499 			 * cachelist, we must destroy the
6500 			 * old vnode association.
6501 			 */
6502 			page_hashout(pp, NULL);
6503 		}
6504 		goto cleanup;
6505 	}
6506 
6507 	/*
6508 	 * If we know page_relocate() will fail, skip it.
6509 	 * It could still fail due to a UE on another page, but we
6510 	 * can't do anything about that.
6511 	 */
6512 	if (pp->p_toxic & PR_UE) {
6513 		goto skip_relocate;
6514 	}
6515 
6516 	/*
6517 	 * It's possible for a page to have no vnode, as fsflush comes
6518 	 * through and cleans up these pages.  It's ugly, but that's how it is.
6519 	 */
6520 	if (pp->p_vnode == NULL) {
6521 		goto skip_relocate;
6522 	}
6523 
6524 	/*
6525 	 * The page was not free, so let's try to relocate it.
6526 	 * page_relocate() only works with root pages, so if this is not a
6527 	 * root page, we need to demote it in order to try to relocate it.
6528 	 * Unfortunately this is the best we can do right now.
6529 	 */
6530 	newpp = NULL;
6531 	if ((pp->p_szc > 0) && (pp != PP_PAGEROOT(pp))) {
6532 		if (page_try_demote_pages(pp) == 0) {
6533 			ret = EAGAIN;
6534 			goto cleanup;
6535 		}
6536 	}
6537 	ret = page_relocate(&pp, &newpp, 1, 0, &count, NULL);
6538 	if (ret == 0) {
6539 		page_t *npp;
6540 		/* unlock the new page(s) */
6541 		while (count-- > 0) {
6542 			ASSERT(newpp != NULL);
6543 			npp = newpp;
6544 			page_sub(&newpp, npp);
6545 			page_unlock(npp);
6546 		}
6547 		ASSERT(newpp == NULL);
6548 		/*
6549 		 * Check to see if the page we have is too large.
6550 		 * If so, demote it, freeing up the extra pages.
6551 		 */
6552 		if (pp->p_szc > 0) {
6553 			/* For now demote extra pages to szc == 0 */
6554 			extra = page_get_pagecnt(pp->p_szc) - 1;
6555 			while (extra > 0) {
6556 				tpp = pp->p_next;
6557 				page_sub(&pp, tpp);
6558 				tpp->p_szc = 0;
6559 				page_free(tpp, 1);
6560 				extra--;
6561 			}
6562 			/* Make sure to set our page to szc 0 as well */
6563 			ASSERT(pp->p_next == pp && pp->p_prev == pp);
6564 			pp->p_szc = 0;
6565 		}
6566 		goto cleanup;
6567 	} else if (ret == EIO) {
6568 		ret = EAGAIN;
6569 		goto cleanup;
6570 	} else {
6571 		/*
6572 		 * Need to reset the return value, as we failed to relocate
6573 		 * the page, but that does not mean that some of the next
6574 		 * steps will not work.
6575 		 */
6576 		ret = 0;
6577 	}
6578 
6579 skip_relocate:
6580 
6581 	if (pp->p_szc > 0) {
6582 		if (page_try_demote_pages(pp) == 0) {
6583 			ret = EAGAIN;
6584 			goto cleanup;
6585 		}
6586 	}
6587 
6588 	ASSERT(pp->p_szc == 0);
6589 
6590 	if (hat_ismod(pp)) {
6591 		ret = EAGAIN;
6592 		goto cleanup;
6593 	}
6594 	if (PP_ISKAS(pp)) {
6595 		ret = EAGAIN;
6596 		goto cleanup;
6597 	}
6598 	if (pp->p_lckcnt || pp->p_cowcnt) {
6599 		ret = EAGAIN;
6600 		goto cleanup;
6601 	}
6602 
6603 	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
6604 	ASSERT(!hat_page_is_mapped(pp));
6605 
6606 	if (hat_ismod(pp)) {
6607 		/*
6608 		 * This is a semi-odd case: the page is now modified but not
6609 		 * mapped, since we just unloaded the mappings above.
6610 		 */
6611 		ret = EAGAIN;
6612 		goto cleanup;
6613 	}
6614 	if (pp->p_vnode != NULL) {
6615 		page_hashout(pp, NULL);
6616 	}
6617 
6618 	/*
6619 	 * At this point, the page should be in a clean state and
6620 	 * we can do whatever we want with it.
6621 	 */
6622 
6623 cleanup:
6624 	if (ret != 0) {
6625 		if (!skip_unlock) {
6626 			page_unlock(pp);
6627 		}
6628 	} else {
6629 		ASSERT(pp->p_szc == 0);
6630 		ASSERT(PAGE_EXCL(pp));
6631 
6632 		pp->p_next = pp;
6633 		pp->p_prev = pp;
6634 	}
6635 	return (ret);
6636 }
6637 
6638 /*
6639  * Various callers of page_trycapture() can have different restrictions upon
6640  * what memory they have access to.
6641  * Returns 0 on success, with the following error codes on failure:
6642  *      EPERM - The requested page is long term locked, and thus repeated
6643  *              requests to capture this page will likely fail.
6644  *      ENOMEM - There was not enough free memory in the system to safely
6645  *              map the requested page.
6646  *      ENOENT - The requested page was inside the kernel cage, and the
6647  *              CAPTURE_GET_CAGE flag was not set.
6648  */
6649 int
6650 page_capture_pre_checks(page_t *pp, uint_t flags)
6651 {
6652 #if defined(__sparc)
6653 	extern struct vnode prom_ppages;
6654 #endif /* __sparc */
6655 
6656 	ASSERT(pp != NULL);
6657 
6658 #if defined(__sparc)
6659 	if (pp->p_vnode == &prom_ppages) {
6660 		return (EPERM);
6661 	}
6662 
6663 	if (PP_ISNORELOC(pp) && !(flags & CAPTURE_GET_CAGE) &&
6664 	    (flags & CAPTURE_PHYSMEM)) {
6665 		return (ENOENT);
6666 	}
6667 
6668 	if (PP_ISNORELOCKERNEL(pp)) {
6669 		return (EPERM);
6670 	}
6671 #else
6672 	if (PP_ISKAS(pp)) {
6673 		return (EPERM);
6674 	}
6675 #endif /* __sparc */
6676 
6677 	/* only physmem currently has the restrictions checked below */
6678 	if (!(flags & CAPTURE_PHYSMEM)) {
6679 		return (0);
6680 	}
6681 
6682 	if (availrmem < swapfs_minfree) {
6683 		/*
6684 		 * We won't try to capture this page as we are
6685 		 * running low on memory.
6686 		 */
6687 		return (ENOMEM);
6688 	}
6689 	return (0);
6690 }
6691 
6692 /*
6693  * Once we have a page in our mitts, go ahead and complete the capture
6694  * operation.
6695  * Returns 1 on failure where the page is no longer needed.
6696  * Returns 0 on success
6697  * Returns -1 if there was a transient failure.
6698  * Failure cases must release the SE_EXCL lock on pp (usually via page_free).
6699  */
6700 int
6701 page_capture_take_action(page_t *pp, uint_t flags, void *datap)
6702 {
6703 	int cb_index;
6704 	int ret = 0;
6705 	page_capture_hash_bucket_t *bp1;
6706 	page_capture_hash_bucket_t *bp2;
6707 	int index;
6708 	int found = 0;
6709 	int i;
6710 
6711 	ASSERT(PAGE_EXCL(pp));
6712 	ASSERT(curthread->t_flag & T_CAPTURING);
6713 
6714 	for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) {
6715 		if ((flags >> cb_index) & 1) {
6716 			break;
6717 		}
6718 	}
6719 	ASSERT(cb_index < PC_NUM_CALLBACKS);
6720 
6721 	/*
6722 	 * Remove the entry from the page_capture hash, but don't free it yet
6723 	 * as we may need to put it back.
6724 	 * Since we own the page at this point in time, we should find it
6725 	 * in the hash if this is an ASYNC call.  If we don't, it's likely
6726 	 * that the page_capture_async() thread decided that this request
6727 	 * had expired, in which case we just continue on.
6728 	 */
6729 	if (flags & CAPTURE_ASYNC) {
6730 
6731 		index = PAGE_CAPTURE_HASH(pp);
6732 
6733 		mutex_enter(&page_capture_hash[index].pchh_mutex);
6734 		for (i = 0; i < 2 && !found; i++) {
6735 			bp1 = page_capture_hash[index].lists[i].next;
6736 			while (bp1 != &page_capture_hash[index].lists[i]) {
6737 				if (bp1->pp == pp) {
6738 					bp1->next->prev = bp1->prev;
6739 					bp1->prev->next = bp1->next;
6740 					page_capture_hash[index].num_pages--;
6741 					page_clrtoxic(pp, PR_CAPTURE);
6742 					found = 1;
6743 					break;
6744 				}
6745 				bp1 = bp1->next;
6746 			}
6747 		}
6748 		mutex_exit(&page_capture_hash[index].pchh_mutex);
6749 	}
6750 
6751 	/* Synchronize with the unregister func. */
6752 	rw_enter(&pc_cb[cb_index].cb_rwlock, RW_READER);
6753 	if (!pc_cb[cb_index].cb_active) {
6754 		page_free(pp, 1);
6755 		rw_exit(&pc_cb[cb_index].cb_rwlock);
6756 		if (found) {
6757 			kmem_free(bp1, sizeof (*bp1));
6758 		}
6759 		return (1);
6760 	}
6761 
6762 	/*
6763 	 * We need to remove the entry from the page capture hash and turn off
6764 	 * the PR_CAPTURE bit before calling the callback.  We'll need to cache
6765 	 * the entry here and then, based upon the return value, clean up
6766 	 * appropriately or re-add it to the hash, making sure that someone else
6767 	 * hasn't already done so.
6768 	 * It should be rare for the callback to fail and thus it's ok for
6769 	 * the failure path to be a bit complicated as the success path is
6770 	 * cleaner and the locking rules are easier to follow.
6771 	 */
6772 
6773 	ret = pc_cb[cb_index].cb_func(pp, datap, flags);
6774 
6775 	rw_exit(&pc_cb[cb_index].cb_rwlock);
6776 
6777 	/*
6778 	 * If this was an ASYNC request, we need to cleanup the hash if the
6779 	 * callback was successful or if the request was no longer valid.
6780 	 * For non-ASYNC requests, we return failure to map and the caller
6781 	 * will take care of adding the request to the hash.
6782 	 * Note also that the callback itself is responsible for the page
6783 	 * at this point in time in terms of locking ...  The most common
6784 	 * case for the failure path should just be a page_free.
6785 	 */
6786 	if (ret >= 0) {
6787 		if (found) {
6788 			if (bp1->flags & CAPTURE_RETIRE) {
6789 				page_retire_decr_pend_count();
6790 			}
6791 			kmem_free(bp1, sizeof (*bp1));
6792 		}
6793 		return (ret);
6794 	}
6795 	if (!found) {
6796 		return (ret);
6797 	}
6798 
6799 	ASSERT(flags & CAPTURE_ASYNC);
6800 
6801 	/*
6802 	 * Check for expiration time first as we can just free it up if it's
6803 	 * expired.
6804 	 */
6805 	if (lbolt > bp1->expires && bp1->expires != -1) {
6806 		kmem_free(bp1, sizeof (*bp1));
6807 		return (ret);
6808 	}
6809 
6810 	/*
6811 	 * The callback failed and there used to be an entry in the hash for
6812 	 * this page, so we need to add it back to the hash.
6813 	 */
6814 	mutex_enter(&page_capture_hash[index].pchh_mutex);
6815 	if (!(pp->p_toxic & PR_CAPTURE)) {
6816 		/* just add bp1 back to head of walked list */
6817 		page_settoxic(pp, PR_CAPTURE);
6818 		bp1->next = page_capture_hash[index].lists[1].next;
6819 		bp1->prev = &page_capture_hash[index].lists[1];
6820 		bp1->next->prev = bp1;
6821 		page_capture_hash[index].lists[1].next = bp1;
6822 		page_capture_hash[index].num_pages++;
6823 		mutex_exit(&page_capture_hash[index].pchh_mutex);
6824 		return (ret);
6825 	}
6826 
6827 	/*
6828 	 * Otherwise there was a new capture request added to list
6829 	 * Need to make sure that our original data is represented if
6830 	 * appropriate.
6831 	 */
6832 	for (i = 0; i < 2; i++) {
6833 		bp2 = page_capture_hash[index].lists[i].next;
6834 		while (bp2 != &page_capture_hash[index].lists[i]) {
6835 			if (bp2->pp == pp) {
6836 				if (bp1->flags & CAPTURE_RETIRE) {
6837 					if (!(bp2->flags & CAPTURE_RETIRE)) {
6838 						bp2->szc = bp1->szc;
6839 						bp2->flags = bp1->flags;
6840 						bp2->expires = bp1->expires;
6841 						bp2->datap = bp1->datap;
6842 					}
6843 				} else {
6844 					ASSERT(bp1->flags & CAPTURE_PHYSMEM);
6845 					if (!(bp2->flags & CAPTURE_RETIRE)) {
6846 						bp2->szc = bp1->szc;
6847 						bp2->flags = bp1->flags;
6848 						bp2->expires = bp1->expires;
6849 						bp2->datap = bp1->datap;
6850 					}
6851 				}
6852 				mutex_exit(&page_capture_hash[index].
6853 				    pchh_mutex);
6854 				kmem_free(bp1, sizeof (*bp1));
6855 				return (ret);
6856 			}
6857 			bp2 = bp2->next;
6858 		}
6859 	}
6860 	panic("PR_CAPTURE set but not on hash for pp 0x%p\n", pp);
6861 	/*NOTREACHED*/
6862 }
6863 
6864 /*
6865  * Try to capture the given page for the caller specified in the flags
6866  * parameter.  The page will either be captured and handed over to the
6867  * appropriate callback, or will be queued up in the page capture hash
6868  * to be captured asynchronously.
6869  * If the current request is due to an async capture, the page must be
6870  * exclusively locked before calling this function.
6871  * Currently szc must be 0 but in the future this should be expandable to
6872  * other page sizes.
6873  * Returns 0 on success, with the following error codes on failure:
6874  *      EPERM - The requested page is long term locked, and thus repeated
6875  *              requests to capture this page will likely fail.
6876  *      ENOMEM - There was not enough free memory in the system to safely
6877  *              map the requested page.
6878  *      ENOENT - The requested page was inside the kernel cage, and the
6879  *              CAPTURE_GET_CAGE flag was not set.
6880  *	EAGAIN - The requested page could not be captured at this point in
6881  *		time, but future requests will likely work.
6882  *	EBUSY - The requested page is retired and the CAPTURE_GET_RETIRED flag
6883  *		was not set.
6884  */
6885 int
6886 page_itrycapture(page_t *pp, uint_t szc, uint_t flags, void *datap)
6887 {
6888 	int ret;
6889 	int cb_index;
6890 
6891 	if (flags & CAPTURE_ASYNC) {
6892 		ASSERT(PAGE_EXCL(pp));
6893 		goto async;
6894 	}
6895 
6896 	/* Make sure there's enough availrmem ... */
6897 	ret = page_capture_pre_checks(pp, flags);
6898 	if (ret != 0) {
6899 		return (ret);
6900 	}
6901 
6902 	if (!page_trylock(pp, SE_EXCL)) {
6903 		for (cb_index = 0; cb_index < PC_NUM_CALLBACKS; cb_index++) {
6904 			if ((flags >> cb_index) & 1) {
6905 				break;
6906 			}
6907 		}
6908 		ASSERT(cb_index < PC_NUM_CALLBACKS);
6909 		ret = EAGAIN;
6910 		/* Special case for retired pages */
6911 		if (PP_RETIRED(pp)) {
6912 			if (flags & CAPTURE_GET_RETIRED) {
6913 				if (!page_unretire_pp(pp, PR_UNR_TEMP)) {
6914 					/*
6915 					 * Need to set capture bit and add to
6916 					 * hash so that the page will be
6917 					 * retired when freed.
6918 					 */
6919 					page_capture_add_hash(pp, szc,
6920 					    CAPTURE_RETIRE, NULL);
6921 					ret = 0;
6922 					goto own_page;
6923 				}
6924 			} else {
6925 				return (EBUSY);
6926 			}
6927 		}
6928 		page_capture_add_hash(pp, szc, flags, datap);
6929 		return (ret);
6930 	}
6931 
6932 async:
6933 	ASSERT(PAGE_EXCL(pp));
6934 
6935 	/* For physmem async requests, check that availrmem is sane */
6936 	if ((flags & (CAPTURE_ASYNC | CAPTURE_PHYSMEM)) ==
6937 	    (CAPTURE_ASYNC | CAPTURE_PHYSMEM) &&
6938 	    (availrmem < swapfs_minfree)) {
6939 		page_unlock(pp);
6940 		return (ENOMEM);
6941 	}
6942 
6943 	ret = page_capture_clean_page(pp);
6944 
6945 	if (ret != 0) {
6946 		/* We failed to get the page, so let's add it to the hash */
6947 		if (!(flags & CAPTURE_ASYNC)) {
6948 			page_capture_add_hash(pp, szc, flags, datap);
6949 		}
6950 		return (ret);
6951 	}
6952 
6953 own_page:
6954 	ASSERT(PAGE_EXCL(pp));
6955 	ASSERT(pp->p_szc == 0);
6956 
6957 	/* Call the callback */
6958 	ret = page_capture_take_action(pp, flags, datap);
6959 
6960 	if (ret == 0) {
6961 		return (0);
6962 	}
6963 
6964 	/*
6965 	 * Note that in the failure cases from page_capture_take_action, the
6966 	 * EXCL lock will have already been dropped.
6967 	 */
6968 	if ((ret == -1) && (!(flags & CAPTURE_ASYNC))) {
6969 		page_capture_add_hash(pp, szc, flags, datap);
6970 	}
6971 	return (EAGAIN);
6972 }
6973 
6974 int
6975 page_trycapture(page_t *pp, uint_t szc, uint_t flags, void *datap)
6976 {
6977 	int ret;
6978 
6979 	curthread->t_flag |= T_CAPTURING;
6980 	ret = page_itrycapture(pp, szc, flags, datap);
6981 	curthread->t_flag &= ~T_CAPTURING; /* the bit is known to be set */
6982 	return (ret);
6983 }
6984 
6985 /*
6986  * When unlocking a page which has the PR_CAPTURE bit set, this routine
6987  * gets called to try and capture the page.
6988  */
6989 void
6990 page_unlock_capture(page_t *pp)
6991 {
6992 	page_capture_hash_bucket_t *bp;
6993 	int index;
6994 	int i;
6995 	uint_t szc;
6996 	uint_t flags = 0;
6997 	void *datap;
6998 	kmutex_t *mp;
6999 	extern vnode_t retired_pages;
7000 
7001 	/*
7002 	 * We need to protect against a possible deadlock here where we own
7003 	 * the vnode page hash mutex and want to acquire it again, as there
7004 	 * are places in the code where we unlock a page while holding
7005 	 * the mutex, which can lead to the page being captured and eventually
7006 	 * ending up here.  As we may be hashing out the old page and hashing
7007 	 * into the retire vnode, we need to make sure we don't own them.
7008 	 * Other callbacks that do hash operations also need to make sure
7009 	 * that before they hash in to a vnode they do not currently own the
7010 	 * vphm mutex; otherwise there will be a panic.
7011 	 */
7012 	if (mutex_owned(page_vnode_mutex(&retired_pages))) {
7013 		page_unlock_nocapture(pp);
7014 		return;
7015 	}
7016 	if (pp->p_vnode != NULL && mutex_owned(page_vnode_mutex(pp->p_vnode))) {
7017 		page_unlock_nocapture(pp);
7018 		return;
7019 	}
7020 
7021 	index = PAGE_CAPTURE_HASH(pp);
7022 
7023 	mp = &page_capture_hash[index].pchh_mutex;
7024 	mutex_enter(mp);
7025 	for (i = 0; i < 2; i++) {
7026 		bp = page_capture_hash[index].lists[i].next;
7027 		while (bp != &page_capture_hash[index].lists[i]) {
7028 			if (bp->pp == pp) {
7029 				szc = bp->szc;
7030 				flags = bp->flags | CAPTURE_ASYNC;
7031 				datap = bp->datap;
7032 				mutex_exit(mp);
7033 				(void) page_trycapture(pp, szc, flags, datap);
7034 				return;
7035 			}
7036 			bp = bp->next;
7037 		}
7038 	}
7039 
7040 	/* Failed to find page in hash so clear flags and unlock it. */
7041 	page_clrtoxic(pp, PR_CAPTURE);
7042 	page_unlock(pp);
7043 
7044 	mutex_exit(mp);
7045 }
7046 
7047 void
7048 page_capture_init()
7049 {
7050 	int i;
7051 	for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
7052 		page_capture_hash[i].lists[0].next =
7053 		    &page_capture_hash[i].lists[0];
7054 		page_capture_hash[i].lists[0].prev =
7055 		    &page_capture_hash[i].lists[0];
7056 		page_capture_hash[i].lists[1].next =
7057 		    &page_capture_hash[i].lists[1];
7058 		page_capture_hash[i].lists[1].prev =
7059 		    &page_capture_hash[i].lists[1];
7060 	}
7061 
7062 	pc_thread_shortwait = 23 * hz;
7063 	pc_thread_longwait = 1201 * hz;
7064 	pc_thread_retry = 3;
7065 	mutex_init(&pc_thread_mutex, NULL, MUTEX_DEFAULT, NULL);
7066 	cv_init(&pc_cv, NULL, CV_DEFAULT, NULL);
7067 	pc_thread_id = thread_create(NULL, 0, page_capture_thread, NULL, 0, &p0,
7068 	    TS_RUN, minclsyspri);
7069 }
7070 
7071 /*
7072  * It is necessary to scrub any failing pages prior to reboot in order to
7073  * prevent a latent error trap from occurring on the next boot.
7074  */
7075 void
7076 page_retire_mdboot()
7077 {
7078 	page_t *pp;
7079 	int i, j;
7080 	page_capture_hash_bucket_t *bp;
7081 
7082 	/* walk lists looking for pages to scrub */
7083 	for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
7084 		if (page_capture_hash[i].num_pages == 0)
7085 			continue;
7086 
7087 		mutex_enter(&page_capture_hash[i].pchh_mutex);
7088 
7089 		for (j = 0; j < 2; j++) {
7090 			bp = page_capture_hash[i].lists[j].next;
7091 			while (bp != &page_capture_hash[i].lists[j]) {
7092 				pp = bp->pp;
7093 				if (!PP_ISKAS(pp) && PP_TOXIC(pp)) {
7094 					pp->p_selock = -1;  /* pacify ASSERTs */
7095 					PP_CLRFREE(pp);
7096 					pagescrub(pp, 0, PAGESIZE);
7097 					pp->p_selock = 0;
7098 				}
7099 				bp = bp->next;
7100 			}
7101 		}
7102 		mutex_exit(&page_capture_hash[i].pchh_mutex);
7103 	}
7104 }
7105 
7106 /*
7107  * Walk the page_capture_hash, trying to capture pages and clean up old
7108  * entries which have expired.
7109  */
7110 void
7111 page_capture_async()
7112 {
7113 	page_t *pp;
7114 	int i;
7115 	int ret;
7116 	page_capture_hash_bucket_t *bp1, *bp2;
7117 	uint_t szc;
7118 	uint_t flags;
7119 	void *datap;
7120 
7121 	/* If there are outstanding pages to be captured, get to work */
7122 	for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++) {
7123 		if (page_capture_hash[i].num_pages == 0)
7124 			continue;
7125 		/* Append list 1 to list 0 and then walk through list 0 */
7126 		mutex_enter(&page_capture_hash[i].pchh_mutex);
7127 		bp1 = &page_capture_hash[i].lists[1];
7128 		bp2 = bp1->next;
7129 		if (bp1 != bp2) {
7130 			bp1->prev->next = page_capture_hash[i].lists[0].next;
7131 			bp2->prev = &page_capture_hash[i].lists[0];
7132 			page_capture_hash[i].lists[0].next->prev = bp1->prev;
7133 			page_capture_hash[i].lists[0].next = bp2;
7134 			bp1->next = bp1;
7135 			bp1->prev = bp1;
7136 		}
7137 
7138 		/* list[1] will be empty now */
7139 
7140 		bp1 = page_capture_hash[i].lists[0].next;
7141 		while (bp1 != &page_capture_hash[i].lists[0]) {
7142 			/* Check expiration time */
7143 			if ((lbolt > bp1->expires && bp1->expires != -1) ||
7144 			    page_deleted(bp1->pp)) {
7145 				page_capture_hash[i].lists[0].next = bp1->next;
7146 				bp1->next->prev =
7147 				    &page_capture_hash[i].lists[0];
7148 				page_capture_hash[i].num_pages--;
7149 
7150 				/*
7151 				 * We can safely remove the PR_CAPTURE bit
7152 				 * without holding the EXCL lock on the page
7153 				 * as the PR_CAPTURE bit requires that the
7154 				 * page_capture_hash[].pchh_mutex be held
7155 				 * to modify it.
7156 				 */
7157 				page_clrtoxic(bp1->pp, PR_CAPTURE);
7158 				mutex_exit(&page_capture_hash[i].pchh_mutex);
7159 				kmem_free(bp1, sizeof (*bp1));
7160 				mutex_enter(&page_capture_hash[i].pchh_mutex);
7161 				bp1 = page_capture_hash[i].lists[0].next;
7162 				continue;
7163 			}
7164 			pp = bp1->pp;
7165 			szc = bp1->szc;
7166 			flags = bp1->flags;
7167 			datap = bp1->datap;
7168 			mutex_exit(&page_capture_hash[i].pchh_mutex);
7169 			if (page_trylock(pp, SE_EXCL)) {
7170 				ret = page_trycapture(pp, szc,
7171 				    flags | CAPTURE_ASYNC, datap);
7172 			} else {
7173 				ret = 1;	/* move to walked hash */
7174 			}
7175 
7176 			if (ret != 0) {
7177 				/* Move to walked hash */
7178 				(void) page_capture_move_to_walked(pp);
7179 			}
7180 			mutex_enter(&page_capture_hash[i].pchh_mutex);
7181 			bp1 = page_capture_hash[i].lists[0].next;
7182 		}
7183 
7184 		mutex_exit(&page_capture_hash[i].pchh_mutex);
7185 	}
7186 }
7187 
7188 /*
7189  * This function is called by the page_capture_thread, and is needed in
7190  * in order to initiate aio cleanup, so that pages used in aio
7191  * will be unlocked and subsequently retired by page_capture_thread.
7192  */
7193 static int
7194 do_aio_cleanup(void)
7195 {
7196 	proc_t *procp;
7197 	int (*aio_cleanup_dr_delete_memory)(proc_t *);
7198 	int cleaned = 0;
7199 
7200 	if (modload("sys", "kaio") == -1) {
7201 		cmn_err(CE_WARN, "do_aio_cleanup: cannot load kaio");
7202 		return (0);
7203 	}
7204 	/*
7205 	 * We use the aio_cleanup_dr_delete_memory function to
7206 	 * initiate the actual clean up; this function will wake
7207 	 * up the per-process aio_cleanup_thread.
7208 	 */
7209 	aio_cleanup_dr_delete_memory = (int (*)(proc_t *))
7210 	    modgetsymvalue("aio_cleanup_dr_delete_memory", 0);
7211 	if (aio_cleanup_dr_delete_memory == NULL) {
7212 		cmn_err(CE_WARN,
7213 	    "aio_cleanup_dr_delete_memory not found in kaio");
7214 		return (0);
7215 	}
7216 	mutex_enter(&pidlock);
7217 	for (procp = practive; (procp != NULL); procp = procp->p_next) {
7218 		mutex_enter(&procp->p_lock);
7219 		if (procp->p_aio != NULL) {
7220 			/* cleanup proc's outstanding kaio */
7221 			cleaned += (*aio_cleanup_dr_delete_memory)(procp);
7222 		}
7223 		mutex_exit(&procp->p_lock);
7224 	}
7225 	mutex_exit(&pidlock);
7226 	return (cleaned);
7227 }
7228 
7229 /*
7230  * helper function for page_capture_thread
7231  */
7232 static void
7233 page_capture_handle_outstanding(void)
7234 {
7235 	int ntry;
7236 
7237 	if (!page_retire_pend_count()) {
7238 		/*
7239 		 * Do we really want to be this aggressive
7240 		 * for things other than page_retire?
7241 		 * Maybe have a counter for each callback
7242 		 * type to guide how aggressive we should
7243 		 * be here.  Thus if there's at least one
7244 			 * page for page_retire, we go ahead and reap
7245 		 * like this.
7246 		 */
7247 		kmem_reap();
7248 		seg_preap();
7249 		page_capture_async();
7250 	} else {
7251 		/*
7252 		 * There are pages pending retirement, so
7253 		 * we reap prior to attempting to capture.
7254 		 */
7255 		kmem_reap();
7256 
7257 		/* disable and purge seg_pcache */
7258 		(void) seg_p_disable();
7259 		for (ntry = 0; ntry < pc_thread_retry; ntry++) {
7260 			if (!page_retire_pend_count())
7261 				break;
7262 			if (do_aio_cleanup()) {
7263 				/*
7264 				 * allow the apps' cleanup threads
7265 				 * to run
7266 				 */
7267 				delay(pc_thread_shortwait);
7268 			}
7269 			page_capture_async();
7270 		}
7271 		/* reenable seg_pcache */
7272 		seg_p_enable();
7273 	}
7274 }
7275 
7276 /*
7277  * The page_capture_thread loops forever, looking to see if there are
7278  * pages still waiting to be captured.
7279  */
7280 static void
7281 page_capture_thread(void)
7282 {
7283 	callb_cpr_t c;
7284 	int outstanding;
7285 	int i;
7286 
7287 	CALLB_CPR_INIT(&c, &pc_thread_mutex, callb_generic_cpr, "page_capture");
7288 
7289 	mutex_enter(&pc_thread_mutex);
7290 	for (;;) {
7291 		outstanding = 0;
7292 		for (i = 0; i < NUM_PAGE_CAPTURE_BUCKETS; i++)
7293 			outstanding += page_capture_hash[i].num_pages;
7294 		if (outstanding) {
7295 			page_capture_handle_outstanding();
7296 			CALLB_CPR_SAFE_BEGIN(&c);
7297 			(void) cv_timedwait(&pc_cv, &pc_thread_mutex,
7298 			    lbolt + pc_thread_shortwait);
7299 			CALLB_CPR_SAFE_END(&c, &pc_thread_mutex);
7300 		} else {
7301 			CALLB_CPR_SAFE_BEGIN(&c);
7302 			(void) cv_timedwait(&pc_cv, &pc_thread_mutex,
7303 			    lbolt + pc_thread_longwait);
7304 			CALLB_CPR_SAFE_END(&c, &pc_thread_mutex);
7305 		}
7306 	}
7307 	/*NOTREACHED*/
7308 }
7309 /*
7310  * Attempt to locate a bucket that has enough pages to satisfy the request.
7311  * The initial check is done without the lock to avoid unneeded contention.
7312  * The function returns 1 if enough pages were found, else 0 if it could not
7313  * find enough pages in a bucket.
7314  */
7315 static int
7316 pcf_decrement_bucket(pgcnt_t npages)
7317 {
7318 	struct pcf	*p;
7319 	struct pcf	*q;
7320 	int i;
7321 
7322 	p = &pcf[PCF_INDEX()];
7323 	q = &pcf[pcf_fanout];
7324 	for (i = 0; i < pcf_fanout; i++) {
7325 		if (p->pcf_count > npages) {
7326 			/*
7327 			 * a good one to try.
7328 			 */
7329 			mutex_enter(&p->pcf_lock);
7330 			if (p->pcf_count > npages) {
7331 				p->pcf_count -= (uint_t)npages;
7332 				/*
7333 				 * freemem is not protected by any lock.
7334 				 * Thus, we cannot have any assertion
7335 				 * containing freemem here.
7336 				 */
7337 				freemem -= npages;
7338 				mutex_exit(&p->pcf_lock);
7339 				return (1);
7340 			}
7341 			mutex_exit(&p->pcf_lock);
7342 		}
7343 		p++;
7344 		if (p >= q) {
7345 			p = pcf;
7346 		}
7347 	}
7348 	return (0);
7349 }
7350 
7351 /*
7352  * Arguments:
7353  *	pcftotal_ret:	If the value is not NULL and we have walked all the
7354  *			buckets but did not find enough pages then it will
7355  *			be set to the total number of pages in all the pcf
7356  *			buckets.
7357  *	npages:		Is the number of pages we have been requested to
7358  *			find.
7359  *	unlock:		If set to 0 we will leave the buckets locked if the
7360  *			requested number of pages are not found.
7361  *
7362  * Go and try to satisfy the page request  from any number of buckets.
7363  * This can be a very expensive operation as we have to lock the buckets
7364  * we are checking (and keep them locked), starting at bucket 0.
7365  *
7366  * The function returns 1 if enough pages were found, else 0 if it could not
7367  * find enough pages in the buckets.
7368  *
7369  */
7370 static int
7371 pcf_decrement_multiple(pgcnt_t *pcftotal_ret, pgcnt_t npages, int unlock)
7372 {
7373 	struct pcf	*p;
7374 	pgcnt_t pcftotal;
7375 	int i;
7376 
7377 	p = pcf;
7378 	/* try to collect pages from several pcf bins */
7379 	for (pcftotal = 0, i = 0; i < pcf_fanout; i++) {
7380 		mutex_enter(&p->pcf_lock);
7381 		pcftotal += p->pcf_count;
7382 		if (pcftotal >= npages) {
7383 			/*
7384 			 * Wow!  There are enough pages lying around
7385 			 * to satisfy the request.  Do the accounting,
7386 			 * drop the locks we acquired, and go back.
7387 			 *
7388 			 * freemem is not protected by any lock. So,
7389 			 * we cannot have any assertion containing
7390 			 * freemem.
7391 			 */
7392 			freemem -= npages;
7393 			while (p >= pcf) {
7394 				if (p->pcf_count <= npages) {
7395 					npages -= p->pcf_count;
7396 					p->pcf_count = 0;
7397 				} else {
7398 					p->pcf_count -= (uint_t)npages;
7399 					npages = 0;
7400 				}
7401 				mutex_exit(&p->pcf_lock);
7402 				p--;
7403 			}
7404 			ASSERT(npages == 0);
7405 			return (1);
7406 		}
7407 		p++;
7408 	}
7409 	if (unlock) {
7410 		/* failed to collect pages - release the locks */
7411 		while (--p >= pcf) {
7412 			mutex_exit(&p->pcf_lock);
7413 		}
7414 	}
7415 	if (pcftotal_ret != NULL)
7416 		*pcftotal_ret = pcftotal;
7417 	return (0);
7418 }
7419
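#ifdef	PAGE_EXAMPLES	/* illustrative sketch only; never defined */
/*
 * Hypothetical allocation path showing how the two pcf helpers above
 * compose: try a single bucket first (one lock, cheap), then fall back
 * to sweeping all the buckets, unlocking them on failure.
 */
static int
example_take_pages(pgcnt_t npages)
{
	pgcnt_t pcftotal;

	if (pcf_decrement_bucket(npages))
		return (1);		/* one bucket was enough */
	if (pcf_decrement_multiple(&pcftotal, npages, 1))
		return (1);		/* swept several buckets */
	/* pcftotal holds what was available; caller would wait and retry */
	return (0);
}
#endif	/* PAGE_EXAMPLES */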