xref: /illumos-gate/usr/src/uts/common/vm/vm_page.c (revision 94bc75770001bfdc49b11467deff2235fc9927f9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*	Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989  AT&T	*/
28 /*	  All Rights Reserved  	*/
29 
30 /*
31  * University Copyright- Copyright (c) 1982, 1986, 1988
32  * The Regents of the University of California
33  * All Rights Reserved
34  *
35  * University Acknowledgment- Portions of this document are derived from
36  * software developed by the University of California, Berkeley, and its
37  * contributors.
38  */
39 
40 #pragma ident	"%Z%%M%	%I%	%E% SMI"
41 
42 /*
43  * VM - physical page management.
44  */
45 
46 #include <sys/types.h>
47 #include <sys/t_lock.h>
48 #include <sys/param.h>
49 #include <sys/systm.h>
50 #include <sys/errno.h>
51 #include <sys/time.h>
52 #include <sys/vnode.h>
53 #include <sys/vm.h>
54 #include <sys/vtrace.h>
55 #include <sys/swap.h>
56 #include <sys/cmn_err.h>
57 #include <sys/tuneable.h>
58 #include <sys/sysmacros.h>
59 #include <sys/cpuvar.h>
60 #include <sys/callb.h>
61 #include <sys/debug.h>
62 #include <sys/tnf_probe.h>
63 #include <sys/condvar_impl.h>
64 #include <sys/mem_config.h>
65 #include <sys/mem_cage.h>
66 #include <sys/kmem.h>
67 #include <sys/atomic.h>
68 #include <sys/strlog.h>
69 #include <sys/mman.h>
70 #include <sys/ontrap.h>
71 #include <sys/lgrp.h>
72 #include <sys/vfs.h>
73 
74 #include <vm/hat.h>
75 #include <vm/anon.h>
76 #include <vm/page.h>
77 #include <vm/seg.h>
78 #include <vm/pvn.h>
79 #include <vm/seg_kmem.h>
80 #include <vm/vm_dep.h>
81 
82 #include <fs/fs_subr.h>
83 
/* NOTE(review): not referenced in the part of the file reviewed here. */
static int nopageage = 0;

/*
 * max_page_get is written by set_max_page_get(), which stores half of
 * the page total it is handed.
 */
static pgcnt_t max_page_get;	/* max page_get request size in pages */
/* total_pages grows by `num' on every add_physmem() call. */
pgcnt_t total_pages = 0;	/* total number of pages (used by /proc) */
88 
/*
 * freemem_lock protects availrmem (freemem itself is covered by
 * new_freemem_lock, declared further down).  It also protects the
 * globals which track the availrmem changes for accurate kernel
 * footprint calculation.  See below for an explanation of these
 * globals.
 *
 * availrmem_initial is expected to be >= availrmem when
 * init_pages_pp_maximum() first runs; the difference is remembered there.
 */
kmutex_t freemem_lock;
pgcnt_t availrmem;
pgcnt_t availrmem_initial;
99 
/*
 * These globals track availrmem changes to get a more accurate
 * estimate of the kernel size. Historically pp_kernel is used for
 * kernel size and is based on availrmem. But availrmem is adjusted for
 * locked pages in the system, not just for kernel locked pages.
 * These new counters will track the pages locked through segvn and
 * by explicit user locking.
 *
 * segvn_pages_locked : This keeps track, on a global basis, of how many
 * pages are currently locked because of I/O.
 *
 * pages_locked : How many pages are locked because of user specified
 * locking through mlock or plock.
 *
 * pages_useclaim,pages_claimed : These two variables track the
 * claim adjustments because of the protection changes on a segvn segment.
 *
 * All these globals are protected by the same lock which protects availrmem.
 */
pgcnt_t segvn_pages_locked;
pgcnt_t pages_locked;
pgcnt_t pages_useclaim;
pgcnt_t pages_claimed;
123 
124 
/*
 * new_freemem_lock protects freemem, freemem_wait & freemem_cv.
 * This is a different lock from freemem_lock, which covers availrmem.
 */
static kmutex_t	new_freemem_lock;
static uint_t	freemem_wait;	/* someone waiting for freemem */
static kcondvar_t freemem_cv;
131 
132 /*
133  * The logical page free list is maintained as two lists, the 'free'
134  * and the 'cache' lists.
135  * The free list contains those pages that should be reused first.
136  *
137  * The implementation of the lists is machine dependent.
138  * page_get_freelist(), page_get_cachelist(),
139  * page_list_sub(), and page_list_add()
140  * form the interface to the machine dependent implementation.
141  *
142  * Pages with p_free set are on the cache list.
143  * Pages with p_free and p_age set are on the free list,
144  *
145  * A page may be locked while on either list.
146  */
147 
/*
 * free list accounting stuff.
 *
 *
 * Spread out the value for the number of pages on the
 * page free and page cache lists.  If there is just one
 * value, then it must be under just one lock.
 * The lock contention and cache traffic are a real bother.
 *
 * When we acquire and then drop a single pcf lock
 * we can start in the middle of the array of pcf structures.
 * If we acquire more than one pcf lock at a time, we need to
 * start at the front to avoid deadlocking.
 *
 * pcf_count holds the number of pages in each pool.
 *
 * pcf_block is set when page_create_get_something() has asked the
 * PSM page freelist and page cachelist routines without specifying
 * a color and nothing came back.  This is used to block anything
 * else from moving pages from one list to the other while the
 * lists are searched again.  If a page is freed while pcf_block is
 * set, then pcf_reserve is incremented.  pcgs_unblock() takes care
 * of clearing pcf_block, doing the wakeups, etc.
 */
172 
173 #if NCPU <= 4
174 #define	PAD	1
175 #define	PCF_FANOUT	4
176 static	uint_t	pcf_mask = PCF_FANOUT - 1;
177 #else
178 #define	PAD	9
179 #ifdef sun4v
180 #define	PCF_FANOUT	32
181 #else
182 #define	PCF_FANOUT	128
183 #endif
184 static	uint_t	pcf_mask = PCF_FANOUT - 1;
185 #endif
186 
/*
 * One bucket of the spread-out free page count.  There are PCF_FANOUT
 * of these in pcf[]; each has its own lock so that the page count does
 * not have to live under a single mutex.
 */
struct pcf {
	uint_t		pcf_touch;	/* just to help the cache */
	uint_t		pcf_count;	/* page count */
	kmutex_t	pcf_lock;	/* protects the structure */
	uint_t		pcf_wait;	/* number of waiters */
	uint_t		pcf_block; 	/* pcgs flag to page_free() */
	uint_t		pcf_reserve; 	/* pages freed after pcf_block set */
	uint_t		pcf_fill[PAD];	/* to line up on the caches */
};

static struct	pcf	pcf[PCF_FANOUT];
/* Index of the pcf bucket for the current CPU: its cpu_id masked by pcf_mask. */
#define	PCF_INDEX()	((CPU->cpu_id) & (pcf_mask))
199 
/*
 * Synchronization objects for the page_create_get_*() ("pcgs") paths.
 * NOTE(review): their users lie outside the part of the file reviewed
 * here; the per-variable notes are the original author's.
 */
kmutex_t	pcgs_lock;		/* serializes page_create_get_ */
kmutex_t	pcgs_cagelock;		/* serializes NOSLEEP cage allocs */
kmutex_t	pcgs_wait_lock;		/* used for delay in pcgs */
static kcondvar_t	pcgs_cv;	/* cv for delay in pcgs */
204 
/*
 * Largest value the p_lckcnt field of a page_t can hold: a mask with
 * every bit of that field set, derived from the field's size.
 * NOTE(review): the shift is performed on an int constant, so this
 * presumes p_lckcnt is narrower than int -- confirm against the page_t
 * declaration before widening that field.
 */
#define	PAGE_LOCK_MAXIMUM \
	((1 << (sizeof (((page_t *)0)->p_lckcnt) * NBBY)) - 1)
207 
208 #ifdef VM_STATS
209 
210 /*
211  * No locks, but so what, they are only statistics.
212  */
213 
/*
 * Event counters for the page layer, compiled in only under VM_STATS.
 * pc_find_hit, pc_find_miss and pc_find_hashlen[] are maintained by the
 * statistics-collecting PAGE_HASH_SEARCH() macro below; pc_find_hashlen[]
 * is a histogram of how many hash chain entries each search stepped past,
 * with everything longer than PC_HASH_CNT lumped into the final slot.
 */
static struct page_tcnt {
	int	pc_free_cache;		/* free's into cache list */
	int	pc_free_dontneed;	/* free's with dontneed */
	int	pc_free_pageout;	/* free's from pageout */
	int	pc_free_free;		/* free's into free list */
	int	pc_free_pages;		/* free's into large page free list */
	int	pc_destroy_pages;	/* large page destroy's */
	int	pc_get_cache;		/* get's from cache list */
	int	pc_get_free;		/* get's from free list */
	int	pc_reclaim;		/* reclaim's */
	int	pc_abortfree;		/* abort's of free pages */
	int	pc_find_hit;		/* find's that find page */
	int	pc_find_miss;		/* find's that don't find page */
	int	pc_destroy_free;	/* # of free pages destroyed */
#define	PC_HASH_CNT	(4*PAGE_HASHAVELEN)
	int	pc_find_hashlen[PC_HASH_CNT+1];
	int	pc_addclaim_pages;
	int	pc_subclaim_pages;
	int	pc_free_replacement_page[2];
	int	pc_try_demote_pages[6];
	int	pc_demote_pages[2];
} pagecnt;
236 
/*
 * Free-standing VM_STATS counters, bumped through VM_STAT_ADD() and
 * VM_STAT_COND_ADD().  The array-valued ones are indexed by code path:
 * page_lookup_cnt[] by page_lookup_create(), page_lookup_nowait_cnt[] by
 * page_lookup_nowait() and page_exphcontg[] by page_exists_physcontig().
 * page_find_cnt and page_exists_cnt count calls to page_find() and
 * page_exists().
 */
uint_t	hashin_count;
uint_t	hashin_not_held;
uint_t	hashin_already;

uint_t	hashout_count;
uint_t	hashout_not_held;

uint_t	page_create_count;
uint_t	page_create_not_enough;
uint_t	page_create_not_enough_again;
uint_t	page_create_zero;
uint_t	page_create_hashout;
uint_t	page_create_page_lock_failed;
uint_t	page_create_trylock_failed;
uint_t	page_create_found_one;
uint_t	page_create_hashin_failed;
uint_t	page_create_dropped_phm;

uint_t	page_create_new;
uint_t	page_create_exists;
uint_t	page_create_putbacks;
uint_t	page_create_overshoot;

uint_t	page_reclaim_zero;
uint_t	page_reclaim_zero_locked;

uint_t	page_rename_exists;
uint_t	page_rename_count;

uint_t	page_lookup_cnt[20];
uint_t	page_lookup_nowait_cnt[10];
uint_t	page_find_cnt;
uint_t	page_exists_cnt;
uint_t	page_exists_forreal_cnt;
uint_t	page_lookup_dev_cnt;
uint_t	get_cachelist_cnt;
uint_t	page_create_cnt[10];
uint_t	alloc_pages[8];
uint_t	page_exphcontg[19];
uint_t  page_create_large_cnt[10];
277 
/*
 * PAGE_HASH_SEARCH(index, pp, vp, off) -- statistics-collecting variant.
 *
 * Walks the chain starting at page_hash[index], following p_hash, until
 * it reaches a page whose p_vnode equals vp and whose p_offset equals
 * off.  On completion pp points at that page, or is NULL if the chain
 * was exhausted.
 *
 * Also records the outcome in pagecnt: pc_find_hit or pc_find_miss, plus
 * the number of entries stepped past in the pc_find_hashlen[] histogram
 * (clamped to PC_HASH_CNT).
 *
 * The macro takes no lock itself.  Its arguments are evaluated more than
 * once, so they must be free of side effects, and pp must be an lvalue.
 * It expands to a bare brace block rather than do { } while (0).
 */
#define	PAGE_HASH_SEARCH(index, pp, vp, off) { \
	uint_t	mylen = 0; \
			\
	for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash, mylen++) { \
		if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
			break; \
	} \
	if ((pp) != NULL) \
		pagecnt.pc_find_hit++; \
	else \
		pagecnt.pc_find_miss++; \
	if (mylen > PC_HASH_CNT) \
		mylen = PC_HASH_CNT; \
	pagecnt.pc_find_hashlen[mylen]++; \
}
296 
297 #else	/* VM_STATS */
298 
/*
 * PAGE_HASH_SEARCH(index, pp, vp, off) -- variant that collects no
 * statistics.
 *
 * Walks the chain starting at page_hash[index], following p_hash, until
 * it reaches a page whose p_vnode equals vp and whose p_offset equals
 * off.  On completion pp points at that page, or is NULL if the chain
 * was exhausted.
 *
 * The macro takes no lock itself.  Its arguments are evaluated more than
 * once, so they must be free of side effects, and pp must be an lvalue.
 * It expands to a bare brace block rather than do { } while (0).
 */
#define	PAGE_HASH_SEARCH(index, pp, vp, off) { \
	for ((pp) = page_hash[(index)]; (pp); (pp) = (pp)->p_hash) { \
		if ((pp)->p_vnode == (vp) && (pp)->p_offset == (off)) \
			break; \
	} \
}
308 
309 #endif	/* VM_STATS */
310 
311 
312 
/*
 * Memseg search statistics are gathered on DEBUG kernels only.
 */
#if defined(DEBUG)
#define	MEMSEG_SEARCH_STATS
#endif

#if defined(MEMSEG_SEARCH_STATS)
/*
 * Counters bumped through MEMSEG_STAT_INCR().
 * NOTE(review): the code doing the counting is outside the part of the
 * file reviewed here; the field names suggest searches performed, hits on
 * the last-used entry, hits via the hash, and misses -- confirm.
 */
struct memseg_stats {
	uint_t	nsearch;
	uint_t	nlastwon;
	uint_t	nhashwon;
	uint_t	nnotfound;
} memseg_stats;

/* Atomically add one to the named member of memseg_stats. */
#define	MEMSEG_STAT_INCR(v) \
	atomic_add_32(&memseg_stats.v, 1)
#else
/* Statistics disabled: expands to nothing. */
#define	MEMSEG_STAT_INCR(v)
#endif
330 
struct memseg *memsegs;		/* list of memory segments */


/*
 * Forward declarations of file-local helpers.  page_init_mem_config()
 * registers this file's memory add/delete callbacks and is called from
 * vm_init().  NOTE(review): the hashin/hashout/demote helpers are defined
 * outside the part of the file reviewed here.
 */
static void page_init_mem_config(void);
static int page_do_hashin(page_t *, vnode_t *, u_offset_t);
static void page_do_hashout(page_t *);

static void page_demote_vp_pages(page_t *);
339 
340 /*
341  * vm subsystem related initialization
342  */
343 void
344 vm_init(void)
345 {
346 	boolean_t callb_vm_cpr(void *, int);
347 
348 	(void) callb_add(callb_vm_cpr, 0, CB_CL_CPR_VM, "vm");
349 	page_init_mem_config();
350 	page_retire_init();
351 }
352 
353 /*
354  * This function is called at startup and when memory is added or deleted.
355  */
356 void
357 init_pages_pp_maximum()
358 {
359 	static pgcnt_t p_min;
360 	static pgcnt_t pages_pp_maximum_startup;
361 	static pgcnt_t avrmem_delta;
362 	static int init_done;
363 	static int user_set;	/* true if set in /etc/system */
364 
365 	if (init_done == 0) {
366 
367 		/* If the user specified a value, save it */
368 		if (pages_pp_maximum != 0) {
369 			user_set = 1;
370 			pages_pp_maximum_startup = pages_pp_maximum;
371 		}
372 
373 		/*
374 		 * Setting of pages_pp_maximum is based first time
375 		 * on the value of availrmem just after the start-up
376 		 * allocations. To preserve this relationship at run
377 		 * time, use a delta from availrmem_initial.
378 		 */
379 		ASSERT(availrmem_initial >= availrmem);
380 		avrmem_delta = availrmem_initial - availrmem;
381 
382 		/* The allowable floor of pages_pp_maximum */
383 		p_min = tune.t_minarmem + 100;
384 
385 		/* Make sure we don't come through here again. */
386 		init_done = 1;
387 	}
388 	/*
389 	 * Determine pages_pp_maximum, the number of currently available
390 	 * pages (availrmem) that can't be `locked'. If not set by
391 	 * the user, we set it to 4% of the currently available memory
392 	 * plus 4MB.
393 	 * But we also insist that it be greater than tune.t_minarmem;
394 	 * otherwise a process could lock down a lot of memory, get swapped
395 	 * out, and never have enough to get swapped back in.
396 	 */
397 	if (user_set)
398 		pages_pp_maximum = pages_pp_maximum_startup;
399 	else
400 		pages_pp_maximum = ((availrmem_initial - avrmem_delta) / 25)
401 		    + btop(4 * 1024 * 1024);
402 
403 	if (pages_pp_maximum <= p_min) {
404 		pages_pp_maximum = p_min;
405 	}
406 }
407 
408 void
409 set_max_page_get(pgcnt_t target_total_pages)
410 {
411 	max_page_get = target_total_pages / 2;
412 }
413 
414 static pgcnt_t pending_delete;
415 
416 /*ARGSUSED*/
417 static void
418 page_mem_config_post_add(
419 	void *arg,
420 	pgcnt_t delta_pages)
421 {
422 	set_max_page_get(total_pages - pending_delete);
423 	init_pages_pp_maximum();
424 }
425 
426 /*ARGSUSED*/
427 static int
428 page_mem_config_pre_del(
429 	void *arg,
430 	pgcnt_t delta_pages)
431 {
432 	pgcnt_t nv;
433 
434 	nv = atomic_add_long_nv(&pending_delete, (spgcnt_t)delta_pages);
435 	set_max_page_get(total_pages - nv);
436 	return (0);
437 }
438 
439 /*ARGSUSED*/
440 static void
441 page_mem_config_post_del(
442 	void *arg,
443 	pgcnt_t delta_pages,
444 	int cancelled)
445 {
446 	pgcnt_t nv;
447 
448 	nv = atomic_add_long_nv(&pending_delete, -(spgcnt_t)delta_pages);
449 	set_max_page_get(total_pages - nv);
450 	if (!cancelled)
451 		init_pages_pp_maximum();
452 }
453 
/*
 * Callback vector handed to kphysm_setup_func_register() by
 * page_init_mem_config(): the vector version followed by this file's
 * post-add, pre-delete and post-delete handlers, in that order.
 */
static kphysm_setup_vector_t page_mem_config_vec = {
	KPHYSM_SETUP_VECTOR_VERSION,
	page_mem_config_post_add,
	page_mem_config_pre_del,
	page_mem_config_post_del,
};
460 
461 static void
462 page_init_mem_config(void)
463 {
464 	int ret;
465 
466 	ret = kphysm_setup_func_register(&page_mem_config_vec, (void *)NULL);
467 	ASSERT(ret == 0);
468 }
469 
470 /*
471  * Evenly spread out the PCF counters for large free pages
472  */
473 static void
474 page_free_large_ctr(pgcnt_t npages)
475 {
476 	static struct pcf	*p = pcf;
477 	pgcnt_t			lump;
478 
479 	freemem += npages;
480 
481 	lump = roundup(npages, PCF_FANOUT) / PCF_FANOUT;
482 
483 	while (npages > 0) {
484 
485 		ASSERT(!p->pcf_block);
486 
487 		if (lump < npages) {
488 			p->pcf_count += (uint_t)lump;
489 			npages -= lump;
490 		} else {
491 			p->pcf_count += (uint_t)npages;
492 			npages = 0;
493 		}
494 
495 		ASSERT(!p->pcf_wait);
496 
497 		if (++p > &pcf[PCF_FANOUT - 1])
498 			p = pcf;
499 	}
500 
501 	ASSERT(npages == 0);
502 }
503 
504 /*
505  * Add a physical chunk of memory to the system freee lists during startup.
506  * Platform specific startup() allocates the memory for the page structs.
507  *
508  * num	- number of page structures
509  * base - page number (pfn) to be associated with the first page.
510  *
511  * Since we are doing this during startup (ie. single threaded), we will
512  * use shortcut routines to avoid any locking overhead while putting all
513  * these pages on the freelists.
514  *
515  * NOTE: Any changes performed to page_free(), must also be performed to
516  *	 add_physmem() since this is how we initialize all page_t's at
517  *	 boot time.
518  */
519 void
520 add_physmem(
521 	page_t	*pp,
522 	pgcnt_t	num,
523 	pfn_t	pnum)
524 {
525 	page_t	*root = NULL;
526 	uint_t	szc = page_num_pagesizes() - 1;
527 	pgcnt_t	large = page_get_pagecnt(szc);
528 	pgcnt_t	cnt = 0;
529 
530 	TRACE_2(TR_FAC_VM, TR_PAGE_INIT,
531 		"add_physmem:pp %p num %lu", pp, num);
532 
533 	/*
534 	 * Arbitrarily limit the max page_get request
535 	 * to 1/2 of the page structs we have.
536 	 */
537 	total_pages += num;
538 	set_max_page_get(total_pages);
539 
540 	PLCNT_MODIFY_MAX(pnum, (long)num);
541 
542 	/*
543 	 * The physical space for the pages array
544 	 * representing ram pages has already been
545 	 * allocated.  Here we initialize each lock
546 	 * in the page structure, and put each on
547 	 * the free list
548 	 */
549 	for (; num; pp++, pnum++, num--) {
550 
551 		/*
552 		 * this needs to fill in the page number
553 		 * and do any other arch specific initialization
554 		 */
555 		add_physmem_cb(pp, pnum);
556 
557 		/*
558 		 * Initialize the page lock as unlocked, since nobody
559 		 * can see or access this page yet.
560 		 */
561 		pp->p_selock = 0;
562 
563 		/*
564 		 * Initialize IO lock
565 		 */
566 		page_iolock_init(pp);
567 
568 		/*
569 		 * initialize other fields in the page_t
570 		 */
571 		PP_SETFREE(pp);
572 		page_clr_all_props(pp);
573 		PP_SETAGED(pp);
574 		pp->p_offset = (u_offset_t)-1;
575 		pp->p_next = pp;
576 		pp->p_prev = pp;
577 
578 		/*
579 		 * Simple case: System doesn't support large pages.
580 		 */
581 		if (szc == 0) {
582 			pp->p_szc = 0;
583 			page_free_at_startup(pp);
584 			continue;
585 		}
586 
587 		/*
588 		 * Handle unaligned pages, we collect them up onto
589 		 * the root page until we have a full large page.
590 		 */
591 		if (!IS_P2ALIGNED(pnum, large)) {
592 
593 			/*
594 			 * If not in a large page,
595 			 * just free as small page.
596 			 */
597 			if (root == NULL) {
598 				pp->p_szc = 0;
599 				page_free_at_startup(pp);
600 				continue;
601 			}
602 
603 			/*
604 			 * Link a constituent page into the large page.
605 			 */
606 			pp->p_szc = szc;
607 			page_list_concat(&root, &pp);
608 
609 			/*
610 			 * When large page is fully formed, free it.
611 			 */
612 			if (++cnt == large) {
613 				page_free_large_ctr(cnt);
614 				page_list_add_pages(root, PG_LIST_ISINIT);
615 				root = NULL;
616 				cnt = 0;
617 			}
618 			continue;
619 		}
620 
621 		/*
622 		 * At this point we have a page number which
623 		 * is aligned. We assert that we aren't already
624 		 * in a different large page.
625 		 */
626 		ASSERT(IS_P2ALIGNED(pnum, large));
627 		ASSERT(root == NULL && cnt == 0);
628 
629 		/*
630 		 * If insufficient number of pages left to form
631 		 * a large page, just free the small page.
632 		 */
633 		if (num < large) {
634 			pp->p_szc = 0;
635 			page_free_at_startup(pp);
636 			continue;
637 		}
638 
639 		/*
640 		 * Otherwise start a new large page.
641 		 */
642 		pp->p_szc = szc;
643 		cnt++;
644 		root = pp;
645 	}
646 	ASSERT(root == NULL && cnt == 0);
647 }
648 
649 /*
650  * Find a page representing the specified [vp, offset].
651  * If we find the page but it is intransit coming in,
652  * it will have an "exclusive" lock and we wait for
653  * the i/o to complete.  A page found on the free list
654  * is always reclaimed and then locked.  On success, the page
655  * is locked, its data is valid and it isn't on the free
656  * list, while a NULL is returned if the page doesn't exist.
657  */
658 page_t *
659 page_lookup(vnode_t *vp, u_offset_t off, se_t se)
660 {
661 	return (page_lookup_create(vp, off, se, NULL, NULL, 0));
662 }
663 
664 /*
665  * Find a page representing the specified [vp, offset].
666  * We either return the one we found or, if passed in,
667  * create one with identity of [vp, offset] of the
668  * pre-allocated page. If we find exsisting page but it is
669  * intransit coming in, it will have an "exclusive" lock
670  * and we wait for the i/o to complete.  A page found on
671  * the free list is always reclaimed and then locked.
672  * On success, the page is locked, its data is valid and
673  * it isn't on the free list, while a NULL is returned
674  * if the page doesn't exist and newpp is NULL;
675  */
676 page_t *
677 page_lookup_create(
678 	vnode_t *vp,
679 	u_offset_t off,
680 	se_t se,
681 	page_t *newpp,
682 	spgcnt_t *nrelocp,
683 	int flags)
684 {
685 	page_t		*pp;
686 	kmutex_t	*phm;
687 	ulong_t		index;
688 	uint_t		hash_locked;
689 	uint_t		es;
690 
691 	ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
692 	VM_STAT_ADD(page_lookup_cnt[0]);
693 	ASSERT(newpp ? PAGE_EXCL(newpp) : 1);
694 
695 	/*
696 	 * Acquire the appropriate page hash lock since
697 	 * we have to search the hash list.  Pages that
698 	 * hash to this list can't change identity while
699 	 * this lock is held.
700 	 */
701 	hash_locked = 0;
702 	index = PAGE_HASH_FUNC(vp, off);
703 	phm = NULL;
704 top:
705 	PAGE_HASH_SEARCH(index, pp, vp, off);
706 	if (pp != NULL) {
707 		VM_STAT_ADD(page_lookup_cnt[1]);
708 		es = (newpp != NULL) ? 1 : 0;
709 		es |= flags;
710 		if (!hash_locked) {
711 			VM_STAT_ADD(page_lookup_cnt[2]);
712 			if (!page_try_reclaim_lock(pp, se, es)) {
713 				/*
714 				 * On a miss, acquire the phm.  Then
715 				 * next time, page_lock() will be called,
716 				 * causing a wait if the page is busy.
717 				 * just looping with page_trylock() would
718 				 * get pretty boring.
719 				 */
720 				VM_STAT_ADD(page_lookup_cnt[3]);
721 				phm = PAGE_HASH_MUTEX(index);
722 				mutex_enter(phm);
723 				hash_locked = 1;
724 				goto top;
725 			}
726 		} else {
727 			VM_STAT_ADD(page_lookup_cnt[4]);
728 			if (!page_lock_es(pp, se, phm, P_RECLAIM, es)) {
729 				VM_STAT_ADD(page_lookup_cnt[5]);
730 				goto top;
731 			}
732 		}
733 
734 		/*
735 		 * Since `pp' is locked it can not change identity now.
736 		 * Reconfirm we locked the correct page.
737 		 *
738 		 * Both the p_vnode and p_offset *must* be cast volatile
739 		 * to force a reload of their values: The PAGE_HASH_SEARCH
740 		 * macro will have stuffed p_vnode and p_offset into
741 		 * registers before calling page_trylock(); another thread,
742 		 * actually holding the hash lock, could have changed the
743 		 * page's identity in memory, but our registers would not
744 		 * be changed, fooling the reconfirmation.  If the hash
745 		 * lock was held during the search, the casting would
746 		 * not be needed.
747 		 */
748 		VM_STAT_ADD(page_lookup_cnt[6]);
749 		if (((volatile struct vnode *)(pp->p_vnode) != vp) ||
750 		    ((volatile u_offset_t)(pp->p_offset) != off)) {
751 			VM_STAT_ADD(page_lookup_cnt[7]);
752 			if (hash_locked) {
753 				panic("page_lookup_create: lost page %p",
754 				    (void *)pp);
755 				/*NOTREACHED*/
756 			}
757 			page_unlock(pp);
758 			phm = PAGE_HASH_MUTEX(index);
759 			mutex_enter(phm);
760 			hash_locked = 1;
761 			goto top;
762 		}
763 
764 		/*
765 		 * If page_trylock() was called, then pp may still be on
766 		 * the cachelist (can't be on the free list, it would not
767 		 * have been found in the search).  If it is on the
768 		 * cachelist it must be pulled now. To pull the page from
769 		 * the cachelist, it must be exclusively locked.
770 		 *
771 		 * The other big difference between page_trylock() and
772 		 * page_lock(), is that page_lock() will pull the
773 		 * page from whatever free list (the cache list in this
774 		 * case) the page is on.  If page_trylock() was used
775 		 * above, then we have to do the reclaim ourselves.
776 		 */
777 		if ((!hash_locked) && (PP_ISFREE(pp))) {
778 			ASSERT(PP_ISAGED(pp) == 0);
779 			VM_STAT_ADD(page_lookup_cnt[8]);
780 
781 			/*
782 			 * page_relcaim will insure that we
783 			 * have this page exclusively
784 			 */
785 
786 			if (!page_reclaim(pp, NULL)) {
787 				/*
788 				 * Page_reclaim dropped whatever lock
789 				 * we held.
790 				 */
791 				VM_STAT_ADD(page_lookup_cnt[9]);
792 				phm = PAGE_HASH_MUTEX(index);
793 				mutex_enter(phm);
794 				hash_locked = 1;
795 				goto top;
796 			} else if (se == SE_SHARED && newpp == NULL) {
797 				VM_STAT_ADD(page_lookup_cnt[10]);
798 				page_downgrade(pp);
799 			}
800 		}
801 
802 		if (hash_locked) {
803 			mutex_exit(phm);
804 		}
805 
806 		if (newpp != NULL && pp->p_szc < newpp->p_szc &&
807 		    PAGE_EXCL(pp) && nrelocp != NULL) {
808 			ASSERT(nrelocp != NULL);
809 			(void) page_relocate(&pp, &newpp, 1, 1, nrelocp,
810 			    NULL);
811 			if (*nrelocp > 0) {
812 				VM_STAT_COND_ADD(*nrelocp == 1,
813 				    page_lookup_cnt[11]);
814 				VM_STAT_COND_ADD(*nrelocp > 1,
815 				    page_lookup_cnt[12]);
816 				pp = newpp;
817 				se = SE_EXCL;
818 			} else {
819 				if (se == SE_SHARED) {
820 					page_downgrade(pp);
821 				}
822 				VM_STAT_ADD(page_lookup_cnt[13]);
823 			}
824 		} else if (newpp != NULL && nrelocp != NULL) {
825 			if (PAGE_EXCL(pp) && se == SE_SHARED) {
826 				page_downgrade(pp);
827 			}
828 			VM_STAT_COND_ADD(pp->p_szc < newpp->p_szc,
829 			    page_lookup_cnt[14]);
830 			VM_STAT_COND_ADD(pp->p_szc == newpp->p_szc,
831 			    page_lookup_cnt[15]);
832 			VM_STAT_COND_ADD(pp->p_szc > newpp->p_szc,
833 			    page_lookup_cnt[16]);
834 		} else if (newpp != NULL && PAGE_EXCL(pp)) {
835 			se = SE_EXCL;
836 		}
837 	} else if (!hash_locked) {
838 		VM_STAT_ADD(page_lookup_cnt[17]);
839 		phm = PAGE_HASH_MUTEX(index);
840 		mutex_enter(phm);
841 		hash_locked = 1;
842 		goto top;
843 	} else if (newpp != NULL) {
844 		/*
845 		 * If we have a preallocated page then
846 		 * insert it now and basically behave like
847 		 * page_create.
848 		 */
849 		VM_STAT_ADD(page_lookup_cnt[18]);
850 		/*
851 		 * Since we hold the page hash mutex and
852 		 * just searched for this page, page_hashin
853 		 * had better not fail.  If it does, that
854 		 * means some thread did not follow the
855 		 * page hash mutex rules.  Panic now and
856 		 * get it over with.  As usual, go down
857 		 * holding all the locks.
858 		 */
859 		ASSERT(MUTEX_HELD(phm));
860 		if (!page_hashin(newpp, vp, off, phm)) {
861 			ASSERT(MUTEX_HELD(phm));
862 			panic("page_lookup_create: hashin failed %p %p %llx %p",
863 			    (void *)newpp, (void *)vp, off, (void *)phm);
864 			/*NOTREACHED*/
865 		}
866 		ASSERT(MUTEX_HELD(phm));
867 		mutex_exit(phm);
868 		phm = NULL;
869 		page_set_props(newpp, P_REF);
870 		page_io_lock(newpp);
871 		pp = newpp;
872 		se = SE_EXCL;
873 	} else {
874 		VM_STAT_ADD(page_lookup_cnt[19]);
875 		mutex_exit(phm);
876 	}
877 
878 	ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1);
879 
880 	ASSERT(pp ? ((PP_ISFREE(pp) == 0) && (PP_ISAGED(pp) == 0)) : 1);
881 
882 	return (pp);
883 }
884 
885 /*
886  * Search the hash list for the page representing the
887  * specified [vp, offset] and return it locked.  Skip
888  * free pages and pages that cannot be locked as requested.
889  * Used while attempting to kluster pages.
890  */
891 page_t *
892 page_lookup_nowait(vnode_t *vp, u_offset_t off, se_t se)
893 {
894 	page_t		*pp;
895 	kmutex_t	*phm;
896 	ulong_t		index;
897 	uint_t		locked;
898 
899 	ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
900 	VM_STAT_ADD(page_lookup_nowait_cnt[0]);
901 
902 	index = PAGE_HASH_FUNC(vp, off);
903 	PAGE_HASH_SEARCH(index, pp, vp, off);
904 	locked = 0;
905 	if (pp == NULL) {
906 top:
907 		VM_STAT_ADD(page_lookup_nowait_cnt[1]);
908 		locked = 1;
909 		phm = PAGE_HASH_MUTEX(index);
910 		mutex_enter(phm);
911 		PAGE_HASH_SEARCH(index, pp, vp, off);
912 	}
913 
914 	if (pp == NULL || PP_ISFREE(pp)) {
915 		VM_STAT_ADD(page_lookup_nowait_cnt[2]);
916 		pp = NULL;
917 	} else {
918 		if (!page_trylock(pp, se)) {
919 			VM_STAT_ADD(page_lookup_nowait_cnt[3]);
920 			pp = NULL;
921 		} else {
922 			VM_STAT_ADD(page_lookup_nowait_cnt[4]);
923 			/*
924 			 * See the comment in page_lookup()
925 			 */
926 			if (((volatile struct vnode *)(pp->p_vnode) != vp) ||
927 			    ((u_offset_t)(pp->p_offset) != off)) {
928 				VM_STAT_ADD(page_lookup_nowait_cnt[5]);
929 				if (locked) {
930 					panic("page_lookup_nowait %p",
931 					    (void *)pp);
932 					/*NOTREACHED*/
933 				}
934 				page_unlock(pp);
935 				goto top;
936 			}
937 			if (PP_ISFREE(pp)) {
938 				VM_STAT_ADD(page_lookup_nowait_cnt[6]);
939 				page_unlock(pp);
940 				pp = NULL;
941 			}
942 		}
943 	}
944 	if (locked) {
945 		VM_STAT_ADD(page_lookup_nowait_cnt[7]);
946 		mutex_exit(phm);
947 	}
948 
949 	ASSERT(pp ? PAGE_LOCKED_SE(pp, se) : 1);
950 
951 	return (pp);
952 }
953 
954 /*
955  * Search the hash list for a page with the specified [vp, off]
956  * that is known to exist and is already locked.  This routine
957  * is typically used by segment SOFTUNLOCK routines.
958  */
959 page_t *
960 page_find(vnode_t *vp, u_offset_t off)
961 {
962 	page_t		*pp;
963 	kmutex_t	*phm;
964 	ulong_t		index;
965 
966 	ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
967 	VM_STAT_ADD(page_find_cnt);
968 
969 	index = PAGE_HASH_FUNC(vp, off);
970 	phm = PAGE_HASH_MUTEX(index);
971 
972 	mutex_enter(phm);
973 	PAGE_HASH_SEARCH(index, pp, vp, off);
974 	mutex_exit(phm);
975 
976 	ASSERT(pp == NULL || PAGE_LOCKED(pp) || panicstr);
977 	return (pp);
978 }
979 
980 /*
981  * Determine whether a page with the specified [vp, off]
982  * currently exists in the system.  Obviously this should
983  * only be considered as a hint since nothing prevents the
984  * page from disappearing or appearing immediately after
985  * the return from this routine. Subsequently, we don't
986  * even bother to lock the list.
987  */
988 page_t *
989 page_exists(vnode_t *vp, u_offset_t off)
990 {
991 	page_t	*pp;
992 	ulong_t		index;
993 
994 	ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
995 	VM_STAT_ADD(page_exists_cnt);
996 
997 	index = PAGE_HASH_FUNC(vp, off);
998 	PAGE_HASH_SEARCH(index, pp, vp, off);
999 
1000 	return (pp);
1001 }
1002 
1003 /*
1004  * Determine if physically contiguous pages exist for [vp, off] - [vp, off +
1005  * page_size(szc)) range.  if they exist and ppa is not NULL fill ppa array
1006  * with these pages locked SHARED. If necessary reclaim pages from
1007  * freelist. Return 1 if contiguous pages exist and 0 otherwise.
1008  *
1009  * If we fail to lock pages still return 1 if pages exist and contiguous.
1010  * But in this case return value is just a hint. ppa array won't be filled.
1011  * Caller should initialize ppa[0] as NULL to distinguish return value.
1012  *
1013  * Returns 0 if pages don't exist or not physically contiguous.
1014  *
1015  * This routine doesn't work for anonymous(swapfs) pages.
1016  */
1017 int
1018 page_exists_physcontig(vnode_t *vp, u_offset_t off, uint_t szc, page_t *ppa[])
1019 {
1020 	pgcnt_t pages;
1021 	pfn_t pfn;
1022 	page_t *rootpp;
1023 	pgcnt_t i;
1024 	pgcnt_t j;
1025 	u_offset_t save_off = off;
1026 	ulong_t index;
1027 	kmutex_t *phm;
1028 	page_t *pp;
1029 	uint_t pszc;
1030 	int loopcnt = 0;
1031 
1032 	ASSERT(szc != 0);
1033 	ASSERT(vp != NULL);
1034 	ASSERT(!IS_SWAPFSVP(vp));
1035 	ASSERT(vp != &kvp);
1036 
1037 again:
1038 	if (++loopcnt > 3) {
1039 		VM_STAT_ADD(page_exphcontg[0]);
1040 		return (0);
1041 	}
1042 
1043 	index = PAGE_HASH_FUNC(vp, off);
1044 	phm = PAGE_HASH_MUTEX(index);
1045 
1046 	mutex_enter(phm);
1047 	PAGE_HASH_SEARCH(index, pp, vp, off);
1048 	mutex_exit(phm);
1049 
1050 	VM_STAT_ADD(page_exphcontg[1]);
1051 
1052 	if (pp == NULL) {
1053 		VM_STAT_ADD(page_exphcontg[2]);
1054 		return (0);
1055 	}
1056 
1057 	pages = page_get_pagecnt(szc);
1058 	rootpp = pp;
1059 	pfn = rootpp->p_pagenum;
1060 
1061 	if ((pszc = pp->p_szc) >= szc && ppa != NULL) {
1062 		VM_STAT_ADD(page_exphcontg[3]);
1063 		if (!page_trylock(pp, SE_SHARED)) {
1064 			VM_STAT_ADD(page_exphcontg[4]);
1065 			return (1);
1066 		}
1067 		if (pp->p_szc != pszc || pp->p_vnode != vp ||
1068 		    pp->p_offset != off) {
1069 			VM_STAT_ADD(page_exphcontg[5]);
1070 			page_unlock(pp);
1071 			off = save_off;
1072 			goto again;
1073 		}
1074 		/*
1075 		 * szc was non zero and vnode and offset matched after we
1076 		 * locked the page it means it can't become free on us.
1077 		 */
1078 		ASSERT(!PP_ISFREE(pp));
1079 		if (!IS_P2ALIGNED(pfn, pages)) {
1080 			page_unlock(pp);
1081 			return (0);
1082 		}
1083 		ppa[0] = pp;
1084 		pp++;
1085 		off += PAGESIZE;
1086 		pfn++;
1087 		for (i = 1; i < pages; i++, pp++, off += PAGESIZE, pfn++) {
1088 			if (!page_trylock(pp, SE_SHARED)) {
1089 				VM_STAT_ADD(page_exphcontg[6]);
1090 				pp--;
1091 				while (i-- > 0) {
1092 					page_unlock(pp);
1093 					pp--;
1094 				}
1095 				ppa[0] = NULL;
1096 				return (1);
1097 			}
1098 			if (pp->p_szc != pszc) {
1099 				VM_STAT_ADD(page_exphcontg[7]);
1100 				page_unlock(pp);
1101 				pp--;
1102 				while (i-- > 0) {
1103 					page_unlock(pp);
1104 					pp--;
1105 				}
1106 				ppa[0] = NULL;
1107 				off = save_off;
1108 				goto again;
1109 			}
1110 			/*
1111 			 * szc the same as for previous already locked pages
1112 			 * with right identity. Since this page had correct
1113 			 * szc after we locked it can't get freed or destroyed
1114 			 * and therefore must have the expected identity.
1115 			 */
1116 			ASSERT(!PP_ISFREE(pp));
1117 			if (pp->p_vnode != vp ||
1118 			    pp->p_offset != off) {
1119 				panic("page_exists_physcontig: "
1120 				    "large page identity doesn't match");
1121 			}
1122 			ppa[i] = pp;
1123 			ASSERT(pp->p_pagenum == pfn);
1124 		}
1125 		VM_STAT_ADD(page_exphcontg[8]);
1126 		ppa[pages] = NULL;
1127 		return (1);
1128 	} else if (pszc >= szc) {
1129 		VM_STAT_ADD(page_exphcontg[9]);
1130 		if (!IS_P2ALIGNED(pfn, pages)) {
1131 			return (0);
1132 		}
1133 		return (1);
1134 	}
1135 
1136 	if (!IS_P2ALIGNED(pfn, pages)) {
1137 		VM_STAT_ADD(page_exphcontg[10]);
1138 		return (0);
1139 	}
1140 
1141 	if (page_numtomemseg_nolock(pfn) !=
1142 	    page_numtomemseg_nolock(pfn + pages - 1)) {
1143 		VM_STAT_ADD(page_exphcontg[11]);
1144 		return (0);
1145 	}
1146 
1147 	/*
1148 	 * We loop up 4 times across pages to promote page size.
1149 	 * We're extra cautious to promote page size atomically with respect
1150 	 * to everybody else.  But we can probably optimize into 1 loop if
1151 	 * this becomes an issue.
1152 	 */
1153 
1154 	for (i = 0; i < pages; i++, pp++, off += PAGESIZE, pfn++) {
1155 		ASSERT(pp->p_pagenum == pfn);
1156 		if (!page_trylock(pp, SE_EXCL)) {
1157 			VM_STAT_ADD(page_exphcontg[12]);
1158 			break;
1159 		}
1160 		if (pp->p_vnode != vp ||
1161 		    pp->p_offset != off) {
1162 			VM_STAT_ADD(page_exphcontg[13]);
1163 			page_unlock(pp);
1164 			break;
1165 		}
1166 		if (pp->p_szc >= szc) {
1167 			ASSERT(i == 0);
1168 			page_unlock(pp);
1169 			off = save_off;
1170 			goto again;
1171 		}
1172 	}
1173 
1174 	if (i != pages) {
1175 		VM_STAT_ADD(page_exphcontg[14]);
1176 		--pp;
1177 		while (i-- > 0) {
1178 			page_unlock(pp);
1179 			--pp;
1180 		}
1181 		return (0);
1182 	}
1183 
1184 	pp = rootpp;
1185 	for (i = 0; i < pages; i++, pp++) {
1186 		if (PP_ISFREE(pp)) {
1187 			VM_STAT_ADD(page_exphcontg[15]);
1188 			ASSERT(!PP_ISAGED(pp));
1189 			ASSERT(pp->p_szc == 0);
1190 			if (!page_reclaim(pp, NULL)) {
1191 				break;
1192 			}
1193 		} else {
1194 			ASSERT(pp->p_szc < szc);
1195 			VM_STAT_ADD(page_exphcontg[16]);
1196 			(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1197 		}
1198 	}
1199 	if (i < pages) {
1200 		VM_STAT_ADD(page_exphcontg[17]);
1201 		/*
1202 		 * page_reclaim failed because we were out of memory.
1203 		 * drop the rest of the locks and return because this page
1204 		 * must be already reallocated anyway.
1205 		 */
1206 		pp = rootpp;
1207 		for (j = 0; j < pages; j++, pp++) {
1208 			if (j != i) {
1209 				page_unlock(pp);
1210 			}
1211 		}
1212 		return (0);
1213 	}
1214 
1215 	off = save_off;
1216 	pp = rootpp;
1217 	for (i = 0; i < pages; i++, pp++, off += PAGESIZE) {
1218 		ASSERT(PAGE_EXCL(pp));
1219 		ASSERT(!PP_ISFREE(pp));
1220 		ASSERT(!hat_page_is_mapped(pp));
1221 		ASSERT(pp->p_vnode == vp);
1222 		ASSERT(pp->p_offset == off);
1223 		pp->p_szc = szc;
1224 	}
1225 	pp = rootpp;
1226 	for (i = 0; i < pages; i++, pp++) {
1227 		if (ppa == NULL) {
1228 			page_unlock(pp);
1229 		} else {
1230 			ppa[i] = pp;
1231 			page_downgrade(ppa[i]);
1232 		}
1233 	}
1234 	if (ppa != NULL) {
1235 		ppa[pages] = NULL;
1236 	}
1237 	VM_STAT_ADD(page_exphcontg[18]);
1238 	ASSERT(vp->v_pages != NULL);
1239 	return (1);
1240 }
1241 
1242 /*
1243  * Determine whether a page with the specified [vp, off]
1244  * currently exists in the system and if so return its
1245  * size code. Obviously this should only be considered as
1246  * a hint since nothing prevents the page from disappearing
1247  * or appearing immediately after the return from this routine.
1248  */
1249 int
1250 page_exists_forreal(vnode_t *vp, u_offset_t off, uint_t *szc)
1251 {
1252 	page_t		*pp;
1253 	kmutex_t	*phm;
1254 	ulong_t		index;
1255 	int		rc = 0;
1256 
1257 	ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
1258 	ASSERT(szc != NULL);
1259 	VM_STAT_ADD(page_exists_forreal_cnt);
1260 
1261 	index = PAGE_HASH_FUNC(vp, off);
1262 	phm = PAGE_HASH_MUTEX(index);
1263 
1264 	mutex_enter(phm);
1265 	PAGE_HASH_SEARCH(index, pp, vp, off);
1266 	if (pp != NULL) {
1267 		*szc = pp->p_szc;
1268 		rc = 1;
1269 	}
1270 	mutex_exit(phm);
1271 	return (rc);
1272 }
1273 
1274 /* wakeup threads waiting for pages in page_create_get_something() */
1275 void
1276 wakeup_pcgs(void)
1277 {
1278 	if (!CV_HAS_WAITERS(&pcgs_cv))
1279 		return;
1280 	cv_broadcast(&pcgs_cv);
1281 }
1282 
1283 /*
1284  * 'freemem' is used all over the kernel as an indication of how many
1285  * pages are free (either on the cache list or on the free page list)
1286  * in the system.  In very few places is a really accurate 'freemem'
1287  * needed.  To avoid contention of the lock protecting a the
1288  * single freemem, it was spread out into NCPU buckets.  Set_freemem
1289  * sets freemem to the total of all NCPU buckets.  It is called from
1290  * clock() on each TICK.
1291  */
1292 void
1293 set_freemem()
1294 {
1295 	struct pcf	*p;
1296 	ulong_t		t;
1297 	uint_t		i;
1298 
1299 	t = 0;
1300 	p = pcf;
1301 	for (i = 0;  i < PCF_FANOUT; i++) {
1302 		t += p->pcf_count;
1303 		p++;
1304 	}
1305 	freemem = t;
1306 
1307 	/*
1308 	 * Don't worry about grabbing mutex.  It's not that
1309 	 * critical if we miss a tick or two.  This is
1310 	 * where we wakeup possible delayers in
1311 	 * page_create_get_something().
1312 	 */
1313 	wakeup_pcgs();
1314 }
1315 
1316 ulong_t
1317 get_freemem()
1318 {
1319 	struct pcf	*p;
1320 	ulong_t		t;
1321 	uint_t		i;
1322 
1323 	t = 0;
1324 	p = pcf;
1325 	for (i = 0; i < PCF_FANOUT; i++) {
1326 		t += p->pcf_count;
1327 		p++;
1328 	}
1329 	/*
1330 	 * We just calculated it, might as well set it.
1331 	 */
1332 	freemem = t;
1333 	return (t);
1334 }
1335 
1336 /*
1337  * Acquire all of the page cache & free (pcf) locks.
1338  */
1339 void
1340 pcf_acquire_all()
1341 {
1342 	struct pcf	*p;
1343 	uint_t		i;
1344 
1345 	p = pcf;
1346 	for (i = 0; i < PCF_FANOUT; i++) {
1347 		p->pcf_touch = 1;
1348 		mutex_enter(&p->pcf_lock);
1349 		p++;
1350 	}
1351 }
1352 
1353 /*
1354  * Release all the pcf_locks.
1355  */
1356 void
1357 pcf_release_all()
1358 {
1359 	struct pcf	*p;
1360 	uint_t		i;
1361 
1362 	p = pcf;
1363 	for (i = 0; i < PCF_FANOUT; i++) {
1364 		mutex_exit(&p->pcf_lock);
1365 		p++;
1366 	}
1367 }
1368 
1369 /*
1370  * Inform the VM system that we need some pages freed up.
1371  * Calls must be symmetric, e.g.:
1372  *
1373  *	page_needfree(100);
1374  *	wait a bit;
1375  *	page_needfree(-100);
1376  */
1377 void
1378 page_needfree(spgcnt_t npages)
1379 {
1380 	mutex_enter(&new_freemem_lock);
1381 	needfree += npages;
1382 	mutex_exit(&new_freemem_lock);
1383 }
1384 
1385 /*
1386  * Throttle for page_create(): try to prevent freemem from dropping
1387  * below throttlefree.  We can't provide a 100% guarantee because
1388  * KM_NOSLEEP allocations, page_reclaim(), and various other things
1389  * nibble away at the freelist.  However, we can block all PG_WAIT
1390  * allocations until memory becomes available.  The motivation is
1391  * that several things can fall apart when there's no free memory:
1392  *
1393  * (1) If pageout() needs memory to push a page, the system deadlocks.
1394  *
1395  * (2) By (broken) specification, timeout(9F) can neither fail nor
1396  *     block, so it has no choice but to panic the system if it
1397  *     cannot allocate a callout structure.
1398  *
1399  * (3) Like timeout(), ddi_set_callback() cannot fail and cannot block;
1400  *     it panics if it cannot allocate a callback structure.
1401  *
1402  * (4) Untold numbers of third-party drivers have not yet been hardened
1403  *     against KM_NOSLEEP and/or allocb() failures; they simply assume
1404  *     success and panic the system with a data fault on failure.
1405  *     (The long-term solution to this particular problem is to ship
1406  *     hostile fault-injecting DEBUG kernels with the DDK.)
1407  *
1408  * It is theoretically impossible to guarantee success of non-blocking
1409  * allocations, but in practice, this throttle is very hard to break.
1410  */
1411 static int
1412 page_create_throttle(pgcnt_t npages, int flags)
1413 {
1414 	ulong_t	fm;
1415 	uint_t	i;
1416 	pgcnt_t tf;	/* effective value of throttlefree */
1417 
1418 	/*
1419 	 * Never deny pages when:
1420 	 * - it's a thread that cannot block [NOMEMWAIT()]
1421 	 * - the allocation cannot block and must not fail
1422 	 * - the allocation cannot block and is pageout dispensated
1423 	 */
1424 	if (NOMEMWAIT() ||
1425 	    ((flags & (PG_WAIT | PG_PANIC)) == PG_PANIC) ||
1426 	    ((flags & (PG_WAIT | PG_PUSHPAGE)) == PG_PUSHPAGE))
1427 		return (1);
1428 
1429 	/*
1430 	 * If the allocation can't block, we look favorably upon it
1431 	 * unless we're below pageout_reserve.  In that case we fail
1432 	 * the allocation because we want to make sure there are a few
1433 	 * pages available for pageout.
1434 	 */
1435 	if ((flags & PG_WAIT) == 0)
1436 		return (freemem >= npages + pageout_reserve);
1437 
1438 	/* Calculate the effective throttlefree value */
1439 	tf = throttlefree -
1440 	    ((flags & PG_PUSHPAGE) ? pageout_reserve : 0);
1441 
1442 	cv_signal(&proc_pageout->p_cv);
1443 
1444 	while (freemem < npages + tf) {
1445 		pcf_acquire_all();
1446 		mutex_enter(&new_freemem_lock);
1447 		fm = 0;
1448 		for (i = 0; i < PCF_FANOUT; i++) {
1449 			fm += pcf[i].pcf_count;
1450 			pcf[i].pcf_wait++;
1451 			mutex_exit(&pcf[i].pcf_lock);
1452 		}
1453 		freemem = fm;
1454 		needfree += npages;
1455 		freemem_wait++;
1456 		cv_wait(&freemem_cv, &new_freemem_lock);
1457 		freemem_wait--;
1458 		needfree -= npages;
1459 		mutex_exit(&new_freemem_lock);
1460 	}
1461 	return (1);
1462 }
1463 
1464 /*
1465  * page_create_wait() is called to either coalecse pages from the
1466  * different pcf buckets or to wait because there simply are not
1467  * enough pages to satisfy the caller's request.
1468  *
1469  * Sadly, this is called from platform/vm/vm_machdep.c
1470  */
1471 int
1472 page_create_wait(size_t npages, uint_t flags)
1473 {
1474 	pgcnt_t		total;
1475 	uint_t		i;
1476 	struct pcf	*p;
1477 
1478 	/*
1479 	 * Wait until there are enough free pages to satisfy our
1480 	 * entire request.
1481 	 * We set needfree += npages before prodding pageout, to make sure
1482 	 * it does real work when npages > lotsfree > freemem.
1483 	 */
1484 	VM_STAT_ADD(page_create_not_enough);
1485 
1486 	ASSERT(!kcage_on ? !(flags & PG_NORELOC) : 1);
1487 checkagain:
1488 	if ((flags & PG_NORELOC) &&
1489 	    kcage_freemem < kcage_throttlefree + npages)
1490 		(void) kcage_create_throttle(npages, flags);
1491 
1492 	if (freemem < npages + throttlefree)
1493 		if (!page_create_throttle(npages, flags))
1494 			return (0);
1495 
1496 	/*
1497 	 * Since page_create_va() looked at every
1498 	 * bucket, assume we are going to have to wait.
1499 	 * Get all of the pcf locks.
1500 	 */
1501 	total = 0;
1502 	p = pcf;
1503 	for (i = 0; i < PCF_FANOUT; i++) {
1504 		p->pcf_touch = 1;
1505 		mutex_enter(&p->pcf_lock);
1506 		total += p->pcf_count;
1507 		if (total >= npages) {
1508 			/*
1509 			 * Wow!  There are enough pages laying around
1510 			 * to satisfy the request.  Do the accounting,
1511 			 * drop the locks we acquired, and go back.
1512 			 *
1513 			 * freemem is not protected by any lock. So,
1514 			 * we cannot have any assertion containing
1515 			 * freemem.
1516 			 */
1517 			freemem -= npages;
1518 
1519 			while (p >= pcf) {
1520 				if (p->pcf_count <= npages) {
1521 					npages -= p->pcf_count;
1522 					p->pcf_count = 0;
1523 				} else {
1524 					p->pcf_count -= (uint_t)npages;
1525 					npages = 0;
1526 				}
1527 				mutex_exit(&p->pcf_lock);
1528 				p--;
1529 			}
1530 			ASSERT(npages == 0);
1531 			return (1);
1532 		}
1533 		p++;
1534 	}
1535 
1536 	/*
1537 	 * All of the pcf locks are held, there are not enough pages
1538 	 * to satisfy the request (npages < total).
1539 	 * Be sure to acquire the new_freemem_lock before dropping
1540 	 * the pcf locks.  This prevents dropping wakeups in page_free().
1541 	 * The order is always pcf_lock then new_freemem_lock.
1542 	 *
1543 	 * Since we hold all the pcf locks, it is a good time to set freemem.
1544 	 *
1545 	 * If the caller does not want to wait, return now.
1546 	 * Else turn the pageout daemon loose to find something
1547 	 * and wait till it does.
1548 	 *
1549 	 */
1550 	freemem = total;
1551 
1552 	if ((flags & PG_WAIT) == 0) {
1553 		pcf_release_all();
1554 
1555 		TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_NOMEM,
1556 		"page_create_nomem:npages %ld freemem %ld", npages, freemem);
1557 		return (0);
1558 	}
1559 
1560 	ASSERT(proc_pageout != NULL);
1561 	cv_signal(&proc_pageout->p_cv);
1562 
1563 	TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_START,
1564 	    "page_create_sleep_start: freemem %ld needfree %ld",
1565 	    freemem, needfree);
1566 
1567 	/*
1568 	 * We are going to wait.
1569 	 * We currently hold all of the pcf_locks,
1570 	 * get the new_freemem_lock (it protects freemem_wait),
1571 	 * before dropping the pcf_locks.
1572 	 */
1573 	mutex_enter(&new_freemem_lock);
1574 
1575 	p = pcf;
1576 	for (i = 0; i < PCF_FANOUT; i++) {
1577 		p->pcf_wait++;
1578 		mutex_exit(&p->pcf_lock);
1579 		p++;
1580 	}
1581 
1582 	needfree += npages;
1583 	freemem_wait++;
1584 
1585 	cv_wait(&freemem_cv, &new_freemem_lock);
1586 
1587 	freemem_wait--;
1588 	needfree -= npages;
1589 
1590 	mutex_exit(&new_freemem_lock);
1591 
1592 	TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SLEEP_END,
1593 	    "page_create_sleep_end: freemem %ld needfree %ld",
1594 	    freemem, needfree);
1595 
1596 	VM_STAT_ADD(page_create_not_enough_again);
1597 	goto checkagain;
1598 }
1599 
1600 /*
1601  * A routine to do the opposite of page_create_wait().
1602  */
1603 void
1604 page_create_putback(spgcnt_t npages)
1605 {
1606 	struct pcf	*p;
1607 	pgcnt_t		lump;
1608 	uint_t		*which;
1609 
1610 	/*
1611 	 * When a contiguous lump is broken up, we have to
1612 	 * deal with lots of pages (min 64) so lets spread
1613 	 * the wealth around.
1614 	 */
1615 	lump = roundup(npages, PCF_FANOUT) / PCF_FANOUT;
1616 	freemem += npages;
1617 
1618 	for (p = pcf; (npages > 0) && (p < &pcf[PCF_FANOUT]); p++) {
1619 		which = &p->pcf_count;
1620 
1621 		mutex_enter(&p->pcf_lock);
1622 
1623 		if (p->pcf_block) {
1624 			which = &p->pcf_reserve;
1625 		}
1626 
1627 		if (lump < npages) {
1628 			*which += (uint_t)lump;
1629 			npages -= lump;
1630 		} else {
1631 			*which += (uint_t)npages;
1632 			npages = 0;
1633 		}
1634 
1635 		if (p->pcf_wait) {
1636 			mutex_enter(&new_freemem_lock);
1637 			/*
1638 			 * Check to see if some other thread
1639 			 * is actually waiting.  Another bucket
1640 			 * may have woken it up by now.  If there
1641 			 * are no waiters, then set our pcf_wait
1642 			 * count to zero to avoid coming in here
1643 			 * next time.
1644 			 */
1645 			if (freemem_wait) {
1646 				if (npages > 1) {
1647 					cv_broadcast(&freemem_cv);
1648 				} else {
1649 					cv_signal(&freemem_cv);
1650 				}
1651 				p->pcf_wait--;
1652 			} else {
1653 				p->pcf_wait = 0;
1654 			}
1655 			mutex_exit(&new_freemem_lock);
1656 		}
1657 		mutex_exit(&p->pcf_lock);
1658 	}
1659 	ASSERT(npages == 0);
1660 }
1661 
1662 /*
1663  * A helper routine for page_create_get_something.
1664  * The indenting got to deep down there.
1665  * Unblock the pcf counters.  Any pages freed after
1666  * pcf_block got set are moved to pcf_count and
1667  * wakeups (cv_broadcast() or cv_signal()) are done as needed.
1668  */
1669 static void
1670 pcgs_unblock(void)
1671 {
1672 	int		i;
1673 	struct pcf	*p;
1674 
1675 	/* Update freemem while we're here. */
1676 	freemem = 0;
1677 	p = pcf;
1678 	for (i = 0; i < PCF_FANOUT; i++) {
1679 		mutex_enter(&p->pcf_lock);
1680 		ASSERT(p->pcf_count == 0);
1681 		p->pcf_count = p->pcf_reserve;
1682 		p->pcf_block = 0;
1683 		freemem += p->pcf_count;
1684 		if (p->pcf_wait) {
1685 			mutex_enter(&new_freemem_lock);
1686 			if (freemem_wait) {
1687 				if (p->pcf_reserve > 1) {
1688 					cv_broadcast(&freemem_cv);
1689 					p->pcf_wait = 0;
1690 				} else {
1691 					cv_signal(&freemem_cv);
1692 					p->pcf_wait--;
1693 				}
1694 			} else {
1695 				p->pcf_wait = 0;
1696 			}
1697 			mutex_exit(&new_freemem_lock);
1698 		}
1699 		p->pcf_reserve = 0;
1700 		mutex_exit(&p->pcf_lock);
1701 		p++;
1702 	}
1703 }
1704 
1705 /*
1706  * Called from page_create_va() when both the cache and free lists
1707  * have been checked once.
1708  *
1709  * Either returns a page or panics since the accounting was done
1710  * way before we got here.
1711  *
1712  * We don't come here often, so leave the accounting on permanently.
1713  */
1714 
/*
 * MAX_PCGS bounds the retry loop in page_create_get_something() when
 * the kernel cage is off.
 */
#define	MAX_PCGS	100

/*
 * PCGS_TRIES sizes the pcgs_counts[] histogram; attempts at or beyond
 * it are tallied in pcgs_too_many instead.
 */
#if defined(DEBUG)
#define	PCGS_TRIES	100
#else	/* !DEBUG */
#define	PCGS_TRIES	10
#endif	/* DEBUG */

#if defined(VM_STATS)
/* Statistics bumped by page_create_get_something(). */
uint_t	pcgs_counts[PCGS_TRIES];
uint_t	pcgs_too_many;
uint_t	pcgs_entered;
uint_t	pcgs_entered_noreloc;
uint_t	pcgs_locked;
uint_t	pcgs_cagelocked;
#endif	/* VM_STATS */
1731 
1732 static page_t *
1733 page_create_get_something(vnode_t *vp, u_offset_t off, struct seg *seg,
1734     caddr_t vaddr, uint_t flags)
1735 {
1736 	uint_t		count;
1737 	page_t		*pp;
1738 	uint_t		locked, i;
1739 	struct	pcf	*p;
1740 	lgrp_t		*lgrp;
1741 	int		cagelocked = 0;
1742 
1743 	VM_STAT_ADD(pcgs_entered);
1744 
1745 	/*
1746 	 * Tap any reserve freelists: if we fail now, we'll die
1747 	 * since the page(s) we're looking for have already been
1748 	 * accounted for.
1749 	 */
1750 	flags |= PG_PANIC;
1751 
1752 	if ((flags & PG_NORELOC) != 0) {
1753 		VM_STAT_ADD(pcgs_entered_noreloc);
1754 		/*
1755 		 * Requests for free pages from critical threads
1756 		 * such as pageout still won't throttle here, but
1757 		 * we must try again, to give the cageout thread
1758 		 * another chance to catch up. Since we already
1759 		 * accounted for the pages, we had better get them
1760 		 * this time.
1761 		 *
1762 		 * N.B. All non-critical threads acquire the pcgs_cagelock
1763 		 * to serialize access to the freelists. This implements a
1764 		 * turnstile-type synchornization to avoid starvation of
1765 		 * critical requests for PG_NORELOC memory by non-critical
1766 		 * threads: all non-critical threads must acquire a 'ticket'
1767 		 * before passing through, which entails making sure
1768 		 * kcage_freemem won't fall below minfree prior to grabbing
1769 		 * pages from the freelists.
1770 		 */
1771 		if (kcage_create_throttle(1, flags) == KCT_NONCRIT) {
1772 			mutex_enter(&pcgs_cagelock);
1773 			cagelocked = 1;
1774 			VM_STAT_ADD(pcgs_cagelocked);
1775 		}
1776 	}
1777 
1778 	/*
1779 	 * Time to get serious.
1780 	 * We failed to get a `correctly colored' page from both the
1781 	 * free and cache lists.
1782 	 * We escalate in stage.
1783 	 *
1784 	 * First try both lists without worring about color.
1785 	 *
1786 	 * Then, grab all page accounting locks (ie. pcf[]) and
1787 	 * steal any pages that they have and set the pcf_block flag to
1788 	 * stop deletions from the lists.  This will help because
1789 	 * a page can get added to the free list while we are looking
1790 	 * at the cache list, then another page could be added to the cache
1791 	 * list allowing the page on the free list to be removed as we
1792 	 * move from looking at the cache list to the free list. This
1793 	 * could happen over and over. We would never find the page
1794 	 * we have accounted for.
1795 	 *
1796 	 * Noreloc pages are a subset of the global (relocatable) page pool.
1797 	 * They are not tracked separately in the pcf bins, so it is
1798 	 * impossible to know when doing pcf accounting if the available
1799 	 * page(s) are noreloc pages or not. When looking for a noreloc page
1800 	 * it is quite easy to end up here even if the global (relocatable)
1801 	 * page pool has plenty of free pages but the noreloc pool is empty.
1802 	 *
1803 	 * When the noreloc pool is empty (or low), additional noreloc pages
1804 	 * are created by converting pages from the global page pool. This
1805 	 * process will stall during pcf accounting if the pcf bins are
1806 	 * already locked. Such is the case when a noreloc allocation is
1807 	 * looping here in page_create_get_something waiting for more noreloc
1808 	 * pages to appear.
1809 	 *
1810 	 * Short of adding a new field to the pcf bins to accurately track
1811 	 * the number of free noreloc pages, we instead do not grab the
1812 	 * pcgs_lock, do not set the pcf blocks and do not timeout when
1813 	 * allocating a noreloc page. This allows noreloc allocations to
1814 	 * loop without blocking global page pool allocations.
1815 	 *
1816 	 * NOTE: the behaviour of page_create_get_something has not changed
1817 	 * for the case of global page pool allocations.
1818 	 */
1819 
1820 	flags &= ~PG_MATCH_COLOR;
1821 	locked = 0;
1822 #if defined(__i386) || defined(__amd64)
1823 	/*
1824 	 * page_create_get_something may be called because 4g memory may be
1825 	 * depleted. Set flags to allow for relocation of base page below
1826 	 * 4g if necessary.
1827 	 */
1828 	if (physmax4g)
1829 		flags |= (PGI_PGCPSZC0 | PGI_PGCPHIPRI);
1830 #endif
1831 
1832 	lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE);
1833 
1834 	for (count = 0; kcage_on || count < MAX_PCGS; count++) {
1835 		pp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE,
1836 		    flags, lgrp);
1837 		if (pp == NULL) {
1838 			pp = page_get_cachelist(vp, off, seg, vaddr,
1839 				flags, lgrp);
1840 		}
1841 		if (pp == NULL) {
1842 			/*
1843 			 * Serialize.  Don't fight with other pcgs().
1844 			 */
1845 			if (!locked && (!kcage_on || !(flags & PG_NORELOC))) {
1846 				mutex_enter(&pcgs_lock);
1847 				VM_STAT_ADD(pcgs_locked);
1848 				locked = 1;
1849 				p = pcf;
1850 				for (i = 0; i < PCF_FANOUT; i++) {
1851 					mutex_enter(&p->pcf_lock);
1852 					ASSERT(p->pcf_block == 0);
1853 					p->pcf_block = 1;
1854 					p->pcf_reserve = p->pcf_count;
1855 					p->pcf_count = 0;
1856 					mutex_exit(&p->pcf_lock);
1857 					p++;
1858 				}
1859 				freemem = 0;
1860 			}
1861 
1862 			if (count) {
1863 				/*
1864 				 * Since page_free() puts pages on
1865 				 * a list then accounts for it, we
1866 				 * just have to wait for page_free()
1867 				 * to unlock any page it was working
1868 				 * with. The page_lock()-page_reclaim()
1869 				 * path falls in the same boat.
1870 				 *
1871 				 * We don't need to check on the
1872 				 * PG_WAIT flag, we have already
1873 				 * accounted for the page we are
1874 				 * looking for in page_create_va().
1875 				 *
1876 				 * We just wait a moment to let any
1877 				 * locked pages on the lists free up,
1878 				 * then continue around and try again.
1879 				 *
1880 				 * Will be awakened by set_freemem().
1881 				 */
1882 				mutex_enter(&pcgs_wait_lock);
1883 				cv_wait(&pcgs_cv, &pcgs_wait_lock);
1884 				mutex_exit(&pcgs_wait_lock);
1885 			}
1886 		} else {
1887 #ifdef VM_STATS
1888 			if (count >= PCGS_TRIES) {
1889 				VM_STAT_ADD(pcgs_too_many);
1890 			} else {
1891 				VM_STAT_ADD(pcgs_counts[count]);
1892 			}
1893 #endif
1894 			if (locked) {
1895 				pcgs_unblock();
1896 				mutex_exit(&pcgs_lock);
1897 			}
1898 			if (cagelocked)
1899 				mutex_exit(&pcgs_cagelock);
1900 			return (pp);
1901 		}
1902 	}
1903 	/*
1904 	 * we go down holding the pcf locks.
1905 	 */
1906 	panic("no %spage found %d",
1907 	    ((flags & PG_NORELOC) ? "non-reloc " : ""), count);
1908 	/*NOTREACHED*/
1909 }
1910 
1911 /*
1912  * Create enough pages for "bytes" worth of data starting at
1913  * "off" in "vp".
1914  *
1915  *	Where flag must be one of:
1916  *
1917  *		PG_EXCL:	Exclusive create (fail if any page already
1918  *				exists in the page cache) which does not
1919  *				wait for memory to become available.
1920  *
1921  *		PG_WAIT:	Non-exclusive create which can wait for
1922  *				memory to become available.
1923  *
1924  *		PG_PHYSCONTIG:	Allocate physically contiguous pages.
1925  *				(Not Supported)
1926  *
1927  * A doubly linked list of pages is returned to the caller.  Each page
1928  * on the list has the "exclusive" (p_selock) lock and "iolock" (p_iolock)
1929  * lock.
1930  *
1931  * Unable to change the parameters to page_create() in a minor release,
1932  * we renamed page_create() to page_create_va(), changed all known calls
1933  * from page_create() to page_create_va(), and created this wrapper.
1934  *
1935  * Upon a major release, we should break compatibility by deleting this
1936  * wrapper, and replacing all the strings "page_create_va", with "page_create".
1937  *
1938  * NOTE: There is a copy of this interface as page_create_io() in
1939  *	 i86/vm/vm_machdep.c. Any bugs fixed here should be applied
1940  *	 there.
1941  */
1942 page_t *
1943 page_create(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags)
1944 {
1945 	caddr_t random_vaddr;
1946 	struct seg kseg;
1947 
1948 #ifdef DEBUG
1949 	cmn_err(CE_WARN, "Using deprecated interface page_create: caller %p",
1950 	    (void *)caller());
1951 #endif
1952 
1953 	random_vaddr = (caddr_t)(((uintptr_t)vp >> 7) ^
1954 	    (uintptr_t)(off >> PAGESHIFT));
1955 	kseg.s_as = &kas;
1956 
1957 	return (page_create_va(vp, off, bytes, flags, &kseg, random_vaddr));
1958 }
1959 
#if defined(DEBUG)
/*
 * Fault injection knob for page_alloc_pages(): when non-zero, the call
 * fails with ENOMEM whenever gethrtime() is a multiple of this value.
 */
uint32_t pg_alloc_pgs_mtbf = 0;
#endif	/* DEBUG */
1963 
1964 /*
1965  * Used for large page support. It will attempt to allocate
1966  * a large page(s) off the freelist.
1967  *
1968  * Returns non zero on failure.
1969  */
1970 int
1971 page_alloc_pages(struct vnode *vp, struct seg *seg, caddr_t addr,
1972     page_t **basepp, page_t *ppa[], uint_t szc, int anypgsz)
1973 {
1974 	pgcnt_t		npgs, curnpgs, totpgs;
1975 	size_t		pgsz;
1976 	page_t		*pplist = NULL, *pp;
1977 	int		err = 0;
1978 	lgrp_t		*lgrp;
1979 
1980 	ASSERT(szc != 0 && szc <= (page_num_pagesizes() - 1));
1981 
1982 	VM_STAT_ADD(alloc_pages[0]);
1983 
1984 #ifdef DEBUG
1985 	if (pg_alloc_pgs_mtbf && !(gethrtime() % pg_alloc_pgs_mtbf)) {
1986 		return (ENOMEM);
1987 	}
1988 #endif
1989 
1990 	pgsz = page_get_pagesize(szc);
1991 	totpgs = curnpgs = npgs = pgsz >> PAGESHIFT;
1992 
1993 	ASSERT(((uintptr_t)addr & (pgsz - 1)) == 0);
1994 	/*
1995 	 * One must be NULL but not both.
1996 	 * And one must be non NULL but not both.
1997 	 */
1998 	ASSERT(basepp != NULL || ppa != NULL);
1999 	ASSERT(basepp == NULL || ppa == NULL);
2000 
2001 	(void) page_create_wait(npgs, PG_WAIT);
2002 
2003 	while (npgs && szc) {
2004 		lgrp = lgrp_mem_choose(seg, addr, pgsz);
2005 		pp = page_get_freelist(vp, 0, seg, addr, pgsz, 0, lgrp);
2006 		if (pp != NULL) {
2007 			VM_STAT_ADD(alloc_pages[1]);
2008 			page_list_concat(&pplist, &pp);
2009 			ASSERT(npgs >= curnpgs);
2010 			npgs -= curnpgs;
2011 		} else if (anypgsz) {
2012 			VM_STAT_ADD(alloc_pages[2]);
2013 			szc--;
2014 			pgsz = page_get_pagesize(szc);
2015 			curnpgs = pgsz >> PAGESHIFT;
2016 		} else {
2017 			VM_STAT_ADD(alloc_pages[3]);
2018 			ASSERT(npgs == totpgs);
2019 			page_create_putback(npgs);
2020 			return (ENOMEM);
2021 		}
2022 	}
2023 	if (szc == 0) {
2024 		VM_STAT_ADD(alloc_pages[4]);
2025 		ASSERT(npgs != 0);
2026 		page_create_putback(npgs);
2027 		err = ENOMEM;
2028 	} else if (basepp != NULL) {
2029 		ASSERT(npgs == 0);
2030 		ASSERT(ppa == NULL);
2031 		*basepp = pplist;
2032 	}
2033 
2034 	npgs = totpgs - npgs;
2035 	pp = pplist;
2036 
2037 	/*
2038 	 * Clear the free and age bits. Also if we were passed in a ppa then
2039 	 * fill it in with all the constituent pages from the large page. But
2040 	 * if we failed to allocate all the pages just free what we got.
2041 	 */
2042 	while (npgs != 0) {
2043 		ASSERT(PP_ISFREE(pp));
2044 		ASSERT(PP_ISAGED(pp));
2045 		if (ppa != NULL || err != 0) {
2046 			if (err == 0) {
2047 				VM_STAT_ADD(alloc_pages[5]);
2048 				PP_CLRFREE(pp);
2049 				PP_CLRAGED(pp);
2050 				page_sub(&pplist, pp);
2051 				*ppa++ = pp;
2052 				npgs--;
2053 			} else {
2054 				VM_STAT_ADD(alloc_pages[6]);
2055 				ASSERT(pp->p_szc != 0);
2056 				curnpgs = page_get_pagecnt(pp->p_szc);
2057 				page_list_break(&pp, &pplist, curnpgs);
2058 				page_list_add_pages(pp, 0);
2059 				page_create_putback(curnpgs);
2060 				ASSERT(npgs >= curnpgs);
2061 				npgs -= curnpgs;
2062 			}
2063 			pp = pplist;
2064 		} else {
2065 			VM_STAT_ADD(alloc_pages[7]);
2066 			PP_CLRFREE(pp);
2067 			PP_CLRAGED(pp);
2068 			pp = pp->p_next;
2069 			npgs--;
2070 		}
2071 	}
2072 	return (err);
2073 }
2074 
2075 /*
2076  * Get a single large page off of the freelists, and set it up for use.
2077  * Number of bytes requested must be a supported page size.
2078  *
2079  * Note that this call may fail even if there is sufficient
2080  * memory available or PG_WAIT is set, so the caller must
2081  * be willing to fallback on page_create_va(), block and retry,
2082  * or fail the requester.
2083  */
2084 page_t *
2085 page_create_va_large(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags,
2086     struct seg *seg, caddr_t vaddr, void *arg)
2087 {
2088 	pgcnt_t		npages, pcftotal;
2089 	page_t		*pp;
2090 	page_t		*rootpp;
2091 	lgrp_t		*lgrp;
2092 	uint_t		enough;
2093 	uint_t		pcf_index;
2094 	uint_t		i;
2095 	struct pcf	*p;
2096 	struct pcf	*q;
2097 	lgrp_id_t	*lgrpid = (lgrp_id_t *)arg;
2098 
2099 	ASSERT(vp != NULL);
2100 
2101 	ASSERT((flags & ~(PG_EXCL | PG_WAIT |
2102 		    PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == 0);
2103 	/* but no others */
2104 
2105 	ASSERT((flags & PG_EXCL) == PG_EXCL);
2106 
2107 	npages = btop(bytes);
2108 
2109 	if (!kcage_on || panicstr) {
2110 		/*
2111 		 * Cage is OFF, or we are single threaded in
2112 		 * panic, so make everything a RELOC request.
2113 		 */
2114 		flags &= ~PG_NORELOC;
2115 	}
2116 
2117 	/*
2118 	 * Make sure there's adequate physical memory available.
2119 	 * Note: PG_WAIT is ignored here.
2120 	 */
2121 	if (freemem <= throttlefree + npages) {
2122 		VM_STAT_ADD(page_create_large_cnt[1]);
2123 		return (NULL);
2124 	}
2125 
2126 	/*
2127 	 * If cage is on, dampen draw from cage when available
2128 	 * cage space is low.
2129 	 */
2130 	if ((flags & (PG_NORELOC | PG_WAIT)) ==  (PG_NORELOC | PG_WAIT) &&
2131 	    kcage_freemem < kcage_throttlefree + npages) {
2132 
2133 		/*
2134 		 * The cage is on, the caller wants PG_NORELOC
2135 		 * pages and available cage memory is very low.
2136 		 * Call kcage_create_throttle() to attempt to
2137 		 * control demand on the cage.
2138 		 */
2139 		if (kcage_create_throttle(npages, flags) == KCT_FAILURE) {
2140 			VM_STAT_ADD(page_create_large_cnt[2]);
2141 			return (NULL);
2142 		}
2143 	}
2144 
2145 	enough = 0;
2146 	pcf_index = PCF_INDEX();
2147 	p = &pcf[pcf_index];
2148 	p->pcf_touch = 1;
2149 	q = &pcf[PCF_FANOUT];
2150 	for (pcftotal = 0, i = 0; i < PCF_FANOUT; i++) {
2151 		if (p->pcf_count > npages) {
2152 			/*
2153 			 * a good one to try.
2154 			 */
2155 			mutex_enter(&p->pcf_lock);
2156 			if (p->pcf_count > npages) {
2157 				p->pcf_count -= (uint_t)npages;
2158 				/*
2159 				 * freemem is not protected by any lock.
2160 				 * Thus, we cannot have any assertion
2161 				 * containing freemem here.
2162 				 */
2163 				freemem -= npages;
2164 				enough = 1;
2165 				mutex_exit(&p->pcf_lock);
2166 				break;
2167 			}
2168 			mutex_exit(&p->pcf_lock);
2169 		}
2170 		pcftotal += p->pcf_count;
2171 		p++;
2172 		if (p >= q) {
2173 			p = pcf;
2174 		}
2175 		p->pcf_touch = 1;
2176 	}
2177 
2178 	if (!enough) {
2179 		/* If there isn't enough memory available, give up. */
2180 		if (pcftotal < npages) {
2181 			VM_STAT_ADD(page_create_large_cnt[3]);
2182 			return (NULL);
2183 		}
2184 
2185 		/* try to collect pages from several pcf bins */
2186 		for (p = pcf, pcftotal = 0, i = 0; i < PCF_FANOUT; i++) {
2187 			p->pcf_touch = 1;
2188 			mutex_enter(&p->pcf_lock);
2189 			pcftotal += p->pcf_count;
2190 			if (pcftotal >= npages) {
2191 				/*
2192 				 * Wow!  There are enough pages laying around
2193 				 * to satisfy the request.  Do the accounting,
2194 				 * drop the locks we acquired, and go back.
2195 				 *
2196 				 * freemem is not protected by any lock. So,
2197 				 * we cannot have any assertion containing
2198 				 * freemem.
2199 				 */
2200 				pgcnt_t	tpages = npages;
2201 				freemem -= npages;
2202 				while (p >= pcf) {
2203 					if (p->pcf_count <= tpages) {
2204 						tpages -= p->pcf_count;
2205 						p->pcf_count = 0;
2206 					} else {
2207 						p->pcf_count -= (uint_t)tpages;
2208 						tpages = 0;
2209 					}
2210 					mutex_exit(&p->pcf_lock);
2211 					p--;
2212 				}
2213 				ASSERT(tpages == 0);
2214 				break;
2215 			}
2216 			p++;
2217 		}
2218 		if (i == PCF_FANOUT) {
2219 			/* failed to collect pages - release the locks */
2220 			while (--p >= pcf) {
2221 				mutex_exit(&p->pcf_lock);
2222 			}
2223 			VM_STAT_ADD(page_create_large_cnt[4]);
2224 			return (NULL);
2225 		}
2226 	}
2227 
2228 	/*
2229 	 * This is where this function behaves fundamentally differently
2230 	 * than page_create_va(); since we're intending to map the page
2231 	 * with a single TTE, we have to get it as a physically contiguous
2232 	 * hardware pagesize chunk.  If we can't, we fail.
2233 	 */
2234 	if (lgrpid != NULL && *lgrpid >= 0 && *lgrpid <= lgrp_alloc_max &&
2235 		LGRP_EXISTS(lgrp_table[*lgrpid]))
2236 		lgrp = lgrp_table[*lgrpid];
2237 	else
2238 		lgrp = lgrp_mem_choose(seg, vaddr, bytes);
2239 
2240 	if ((rootpp = page_get_freelist(&kvp, off, seg, vaddr,
2241 	    bytes, flags & ~PG_MATCH_COLOR, lgrp)) == NULL) {
2242 		page_create_putback(npages);
2243 		VM_STAT_ADD(page_create_large_cnt[5]);
2244 		return (NULL);
2245 	}
2246 
2247 	/*
2248 	 * if we got the page with the wrong mtype give it back this is a
2249 	 * workaround for CR 6249718. When CR 6249718 is fixed we never get
2250 	 * inside "if" and the workaround becomes just a nop
2251 	 */
2252 	if (kcage_on && (flags & PG_NORELOC) && !PP_ISNORELOC(rootpp)) {
2253 		page_list_add_pages(rootpp, 0);
2254 		page_create_putback(npages);
2255 		VM_STAT_ADD(page_create_large_cnt[6]);
2256 		return (NULL);
2257 	}
2258 
2259 	/*
2260 	 * If satisfying this request has left us with too little
2261 	 * memory, start the wheels turning to get some back.  The
2262 	 * first clause of the test prevents waking up the pageout
2263 	 * daemon in situations where it would decide that there's
2264 	 * nothing to do.
2265 	 */
2266 	if (nscan < desscan && freemem < minfree) {
2267 		TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
2268 		    "pageout_cv_signal:freemem %ld", freemem);
2269 		cv_signal(&proc_pageout->p_cv);
2270 	}
2271 
2272 	pp = rootpp;
2273 	while (npages--) {
2274 		ASSERT(PAGE_EXCL(pp));
2275 		ASSERT(pp->p_vnode == NULL);
2276 		ASSERT(!hat_page_is_mapped(pp));
2277 		PP_CLRFREE(pp);
2278 		PP_CLRAGED(pp);
2279 		if (!page_hashin(pp, vp, off, NULL))
2280 			panic("page_create_large: hashin failed: page %p",
2281 			    (void *)pp);
2282 		page_io_lock(pp);
2283 		off += PAGESIZE;
2284 		pp = pp->p_next;
2285 	}
2286 
2287 	VM_STAT_ADD(page_create_large_cnt[0]);
2288 	return (rootpp);
2289 }
2290 
2291 page_t *
2292 page_create_va(vnode_t *vp, u_offset_t off, size_t bytes, uint_t flags,
2293     struct seg *seg, caddr_t vaddr)
2294 {
2295 	page_t		*plist = NULL;
2296 	pgcnt_t		npages;
2297 	pgcnt_t		found_on_free = 0;
2298 	pgcnt_t		pages_req;
2299 	page_t		*npp = NULL;
2300 	uint_t		enough;
2301 	uint_t		i;
2302 	uint_t		pcf_index;
2303 	struct pcf	*p;
2304 	struct pcf	*q;
2305 	lgrp_t		*lgrp;
2306 
2307 	TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_START,
2308 		"page_create_start:vp %p off %llx bytes %lu flags %x",
2309 		vp, off, bytes, flags);
2310 
2311 	ASSERT(bytes != 0 && vp != NULL);
2312 
2313 	if ((flags & PG_EXCL) == 0 && (flags & PG_WAIT) == 0) {
2314 		panic("page_create: invalid flags");
2315 		/*NOTREACHED*/
2316 	}
2317 	ASSERT((flags & ~(PG_EXCL | PG_WAIT |
2318 	    PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == 0);
2319 	    /* but no others */
2320 
2321 	pages_req = npages = btopr(bytes);
2322 	/*
2323 	 * Try to see whether request is too large to *ever* be
2324 	 * satisfied, in order to prevent deadlock.  We arbitrarily
2325 	 * decide to limit maximum size requests to max_page_get.
2326 	 */
2327 	if (npages >= max_page_get) {
2328 		if ((flags & PG_WAIT) == 0) {
2329 			TRACE_4(TR_FAC_VM, TR_PAGE_CREATE_TOOBIG,
2330 			    "page_create_toobig:vp %p off %llx npages "
2331 			    "%lu max_page_get %lu",
2332 			    vp, off, npages, max_page_get);
2333 			return (NULL);
2334 		} else {
2335 			cmn_err(CE_WARN,
2336 			    "Request for too much kernel memory "
2337 			    "(%lu bytes), will hang forever", bytes);
2338 			for (;;)
2339 				delay(1000000000);
2340 		}
2341 	}
2342 
2343 	if (!kcage_on || panicstr) {
2344 		/*
2345 		 * Cage is OFF, or we are single threaded in
2346 		 * panic, so make everything a RELOC request.
2347 		 */
2348 		flags &= ~PG_NORELOC;
2349 	}
2350 
2351 	if (freemem <= throttlefree + npages)
2352 		if (!page_create_throttle(npages, flags))
2353 			return (NULL);
2354 
2355 	/*
2356 	 * If cage is on, dampen draw from cage when available
2357 	 * cage space is low.
2358 	 */
2359 	if ((flags & PG_NORELOC) &&
2360 		kcage_freemem < kcage_throttlefree + npages) {
2361 
2362 		/*
2363 		 * The cage is on, the caller wants PG_NORELOC
2364 		 * pages and available cage memory is very low.
2365 		 * Call kcage_create_throttle() to attempt to
2366 		 * control demand on the cage.
2367 		 */
2368 		if (kcage_create_throttle(npages, flags) == KCT_FAILURE)
2369 			return (NULL);
2370 	}
2371 
2372 	VM_STAT_ADD(page_create_cnt[0]);
2373 
2374 	enough = 0;
2375 	pcf_index = PCF_INDEX();
2376 
2377 	p = &pcf[pcf_index];
2378 	p->pcf_touch = 1;
2379 	q = &pcf[PCF_FANOUT];
2380 	for (i = 0; i < PCF_FANOUT; i++) {
2381 		if (p->pcf_count > npages) {
2382 			/*
2383 			 * a good one to try.
2384 			 */
2385 			mutex_enter(&p->pcf_lock);
2386 			if (p->pcf_count > npages) {
2387 				p->pcf_count -= (uint_t)npages;
2388 				/*
2389 				 * freemem is not protected by any lock.
2390 				 * Thus, we cannot have any assertion
2391 				 * containing freemem here.
2392 				 */
2393 				freemem -= npages;
2394 				enough = 1;
2395 				mutex_exit(&p->pcf_lock);
2396 				break;
2397 			}
2398 			mutex_exit(&p->pcf_lock);
2399 		}
2400 		p++;
2401 		if (p >= q) {
2402 			p = pcf;
2403 		}
2404 		p->pcf_touch = 1;
2405 	}
2406 
2407 	if (!enough) {
2408 		/*
2409 		 * Have to look harder.  If npages is greater than
2410 		 * one, then we might have to coalecse the counters.
2411 		 *
2412 		 * Go wait.  We come back having accounted
2413 		 * for the memory.
2414 		 */
2415 		VM_STAT_ADD(page_create_cnt[1]);
2416 		if (!page_create_wait(npages, flags)) {
2417 			VM_STAT_ADD(page_create_cnt[2]);
2418 			return (NULL);
2419 		}
2420 	}
2421 
2422 	TRACE_2(TR_FAC_VM, TR_PAGE_CREATE_SUCCESS,
2423 		"page_create_success:vp %p off %llx", vp, off);
2424 
2425 	/*
2426 	 * If satisfying this request has left us with too little
2427 	 * memory, start the wheels turning to get some back.  The
2428 	 * first clause of the test prevents waking up the pageout
2429 	 * daemon in situations where it would decide that there's
2430 	 * nothing to do.
2431 	 */
2432 	if (nscan < desscan && freemem < minfree) {
2433 		TRACE_1(TR_FAC_VM, TR_PAGEOUT_CV_SIGNAL,
2434 			"pageout_cv_signal:freemem %ld", freemem);
2435 		cv_signal(&proc_pageout->p_cv);
2436 	}
2437 
2438 	/*
2439 	 * Loop around collecting the requested number of pages.
2440 	 * Most of the time, we have to `create' a new page. With
2441 	 * this in mind, pull the page off the free list before
2442 	 * getting the hash lock.  This will minimize the hash
2443 	 * lock hold time, nesting, and the like.  If it turns
2444 	 * out we don't need the page, we put it back at the end.
2445 	 */
2446 	while (npages--) {
2447 		page_t		*pp;
2448 		kmutex_t	*phm = NULL;
2449 		ulong_t		index;
2450 
2451 		index = PAGE_HASH_FUNC(vp, off);
2452 top:
2453 		ASSERT(phm == NULL);
2454 		ASSERT(index == PAGE_HASH_FUNC(vp, off));
2455 		ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
2456 
2457 		if (npp == NULL) {
2458 			/*
2459 			 * Try to get a page from the freelist (ie,
2460 			 * a page with no [vp, off] tag).  If that
2461 			 * fails, use the cachelist.
2462 			 *
2463 			 * During the first attempt at both the free
2464 			 * and cache lists we try for the correct color.
2465 			 */
2466 			/*
2467 			 * XXXX-how do we deal with virtual indexed
2468 			 * caches and and colors?
2469 			 */
2470 			VM_STAT_ADD(page_create_cnt[4]);
2471 			/*
2472 			 * Get lgroup to allocate next page of shared memory
2473 			 * from and use it to specify where to allocate
2474 			 * the physical memory
2475 			 */
2476 			lgrp = lgrp_mem_choose(seg, vaddr, PAGESIZE);
2477 			npp = page_get_freelist(vp, off, seg, vaddr, PAGESIZE,
2478 			    flags | PG_MATCH_COLOR, lgrp);
2479 			if (npp == NULL) {
2480 				npp = page_get_cachelist(vp, off, seg,
2481 				    vaddr, flags | PG_MATCH_COLOR, lgrp);
2482 				if (npp == NULL) {
2483 					npp = page_create_get_something(vp,
2484 					    off, seg, vaddr,
2485 					    flags & ~PG_MATCH_COLOR);
2486 				}
2487 
2488 				if (PP_ISAGED(npp) == 0) {
2489 					/*
2490 					 * Since this page came from the
2491 					 * cachelist, we must destroy the
2492 					 * old vnode association.
2493 					 */
2494 					page_hashout(npp, NULL);
2495 				}
2496 			}
2497 		}
2498 
2499 		/*
2500 		 * We own this page!
2501 		 */
2502 		ASSERT(PAGE_EXCL(npp));
2503 		ASSERT(npp->p_vnode == NULL);
2504 		ASSERT(!hat_page_is_mapped(npp));
2505 		PP_CLRFREE(npp);
2506 		PP_CLRAGED(npp);
2507 
2508 		/*
2509 		 * Here we have a page in our hot little mits and are
2510 		 * just waiting to stuff it on the appropriate lists.
2511 		 * Get the mutex and check to see if it really does
2512 		 * not exist.
2513 		 */
2514 		phm = PAGE_HASH_MUTEX(index);
2515 		mutex_enter(phm);
2516 		PAGE_HASH_SEARCH(index, pp, vp, off);
2517 		if (pp == NULL) {
2518 			VM_STAT_ADD(page_create_new);
2519 			pp = npp;
2520 			npp = NULL;
2521 			if (!page_hashin(pp, vp, off, phm)) {
2522 				/*
2523 				 * Since we hold the page hash mutex and
2524 				 * just searched for this page, page_hashin
2525 				 * had better not fail.  If it does, that
2526 				 * means somethread did not follow the
2527 				 * page hash mutex rules.  Panic now and
2528 				 * get it over with.  As usual, go down
2529 				 * holding all the locks.
2530 				 */
2531 				ASSERT(MUTEX_HELD(phm));
2532 				panic("page_create: "
2533 				    "hashin failed %p %p %llx %p",
2534 				    (void *)pp, (void *)vp, off, (void *)phm);
2535 				/*NOTREACHED*/
2536 			}
2537 			ASSERT(MUTEX_HELD(phm));
2538 			mutex_exit(phm);
2539 			phm = NULL;
2540 
2541 			/*
2542 			 * Hat layer locking need not be done to set
2543 			 * the following bits since the page is not hashed
2544 			 * and was on the free list (i.e., had no mappings).
2545 			 *
2546 			 * Set the reference bit to protect
2547 			 * against immediate pageout
2548 			 *
2549 			 * XXXmh modify freelist code to set reference
2550 			 * bit so we don't have to do it here.
2551 			 */
2552 			page_set_props(pp, P_REF);
2553 			found_on_free++;
2554 		} else {
2555 			VM_STAT_ADD(page_create_exists);
2556 			if (flags & PG_EXCL) {
2557 				/*
2558 				 * Found an existing page, and the caller
2559 				 * wanted all new pages.  Undo all of the work
2560 				 * we have done.
2561 				 */
2562 				mutex_exit(phm);
2563 				phm = NULL;
2564 				while (plist != NULL) {
2565 					pp = plist;
2566 					page_sub(&plist, pp);
2567 					page_io_unlock(pp);
2568 					/* large pages should not end up here */
2569 					ASSERT(pp->p_szc == 0);
2570 					/*LINTED: constant in conditional ctx*/
2571 					VN_DISPOSE(pp, B_INVAL, 0, kcred);
2572 				}
2573 				VM_STAT_ADD(page_create_found_one);
2574 				goto fail;
2575 			}
2576 			ASSERT(flags & PG_WAIT);
2577 			if (!page_lock(pp, SE_EXCL, phm, P_NO_RECLAIM)) {
2578 				/*
2579 				 * Start all over again if we blocked trying
2580 				 * to lock the page.
2581 				 */
2582 				mutex_exit(phm);
2583 				VM_STAT_ADD(page_create_page_lock_failed);
2584 				phm = NULL;
2585 				goto top;
2586 			}
2587 			mutex_exit(phm);
2588 			phm = NULL;
2589 
2590 			if (PP_ISFREE(pp)) {
2591 				ASSERT(PP_ISAGED(pp) == 0);
2592 				VM_STAT_ADD(pagecnt.pc_get_cache);
2593 				page_list_sub(pp, PG_CACHE_LIST);
2594 				PP_CLRFREE(pp);
2595 				found_on_free++;
2596 			}
2597 		}
2598 
2599 		/*
2600 		 * Got a page!  It is locked.  Acquire the i/o
2601 		 * lock since we are going to use the p_next and
2602 		 * p_prev fields to link the requested pages together.
2603 		 */
2604 		page_io_lock(pp);
2605 		page_add(&plist, pp);
2606 		plist = plist->p_next;
2607 		off += PAGESIZE;
2608 		vaddr += PAGESIZE;
2609 	}
2610 
2611 	ASSERT((flags & PG_EXCL) ? (found_on_free == pages_req) : 1);
2612 fail:
2613 	if (npp != NULL) {
2614 		/*
2615 		 * Did not need this page after all.
2616 		 * Put it back on the free list.
2617 		 */
2618 		VM_STAT_ADD(page_create_putbacks);
2619 		PP_SETFREE(npp);
2620 		PP_SETAGED(npp);
2621 		npp->p_offset = (u_offset_t)-1;
2622 		page_list_add(npp, PG_FREE_LIST | PG_LIST_TAIL);
2623 		page_unlock(npp);
2624 
2625 	}
2626 
2627 	ASSERT(pages_req >= found_on_free);
2628 
2629 	{
2630 		uint_t overshoot = (uint_t)(pages_req - found_on_free);
2631 
2632 		if (overshoot) {
2633 			VM_STAT_ADD(page_create_overshoot);
2634 			p = &pcf[pcf_index];
2635 			p->pcf_touch = 1;
2636 			mutex_enter(&p->pcf_lock);
2637 			if (p->pcf_block) {
2638 				p->pcf_reserve += overshoot;
2639 			} else {
2640 				p->pcf_count += overshoot;
2641 				if (p->pcf_wait) {
2642 					mutex_enter(&new_freemem_lock);
2643 					if (freemem_wait) {
2644 						cv_signal(&freemem_cv);
2645 						p->pcf_wait--;
2646 					} else {
2647 						p->pcf_wait = 0;
2648 					}
2649 					mutex_exit(&new_freemem_lock);
2650 				}
2651 			}
2652 			mutex_exit(&p->pcf_lock);
2653 			/* freemem is approximate, so this test OK */
2654 			if (!p->pcf_block)
2655 				freemem += overshoot;
2656 		}
2657 	}
2658 
2659 	return (plist);
2660 }
2661 
2662 /*
2663  * One or more constituent pages of this large page has been marked
2664  * toxic. Simply demote the large page to PAGESIZE pages and let
2665  * page_free() handle it. This routine should only be called by
2666  * large page free routines (page_free_pages() and page_destroy_pages().
2667  * All pages are locked SE_EXCL and have already been marked free.
2668  */
2669 static void
2670 page_free_toxic_pages(page_t *rootpp)
2671 {
2672 	page_t	*tpp;
2673 	pgcnt_t	i, pgcnt = page_get_pagecnt(rootpp->p_szc);
2674 	uint_t	szc = rootpp->p_szc;
2675 
2676 	for (i = 0, tpp = rootpp; i < pgcnt; i++, tpp = tpp->p_next) {
2677 		ASSERT(tpp->p_szc == szc);
2678 		ASSERT((PAGE_EXCL(tpp) &&
2679 		    !page_iolock_assert(tpp)) || panicstr);
2680 		tpp->p_szc = 0;
2681 	}
2682 
2683 	while (rootpp != NULL) {
2684 		tpp = rootpp;
2685 		page_sub(&rootpp, tpp);
2686 		ASSERT(PP_ISFREE(tpp));
2687 		PP_CLRFREE(tpp);
2688 		page_free(tpp, 1);
2689 	}
2690 }
2691 
2692 /*
2693  * Put page on the "free" list.
2694  * The free list is really two lists maintained by
2695  * the PSM of whatever machine we happen to be on.
2696  */
2697 void
2698 page_free(page_t *pp, int dontneed)
2699 {
2700 	struct pcf	*p;
2701 	uint_t		pcf_index;
2702 
2703 	ASSERT((PAGE_EXCL(pp) &&
2704 	    !page_iolock_assert(pp)) || panicstr);
2705 
2706 	if (PP_ISFREE(pp)) {
2707 		panic("page_free: page %p is free", (void *)pp);
2708 	}
2709 
2710 	if (pp->p_szc != 0) {
2711 		if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) ||
2712 		    pp->p_vnode == &kvp) {
2713 			panic("page_free: anon or kernel "
2714 			    "or no vnode large page %p", (void *)pp);
2715 		}
2716 		page_demote_vp_pages(pp);
2717 		ASSERT(pp->p_szc == 0);
2718 	}
2719 
2720 	/*
2721 	 * The page_struct_lock need not be acquired to examine these
2722 	 * fields since the page has an "exclusive" lock.
2723 	 */
2724 	if (hat_page_is_mapped(pp) || pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
2725 		panic("page_free pp=%p, pfn=%lx, lckcnt=%d, cowcnt=%d",
2726 		    pp, page_pptonum(pp), pp->p_lckcnt, pp->p_cowcnt);
2727 		/*NOTREACHED*/
2728 	}
2729 
2730 	ASSERT(!hat_page_getshare(pp));
2731 
2732 	PP_SETFREE(pp);
2733 	ASSERT(pp->p_vnode == NULL || !IS_VMODSORT(pp->p_vnode) ||
2734 	    !hat_ismod(pp));
2735 	page_clr_all_props(pp);
2736 	ASSERT(!hat_page_getshare(pp));
2737 
2738 	/*
2739 	 * Now we add the page to the head of the free list.
2740 	 * But if this page is associated with a paged vnode
2741 	 * then we adjust the head forward so that the page is
2742 	 * effectively at the end of the list.
2743 	 */
2744 	if (pp->p_vnode == NULL) {
2745 		/*
2746 		 * Page has no identity, put it on the free list.
2747 		 */
2748 		PP_SETAGED(pp);
2749 		pp->p_offset = (u_offset_t)-1;
2750 		page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
2751 		VM_STAT_ADD(pagecnt.pc_free_free);
2752 		TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE,
2753 		    "page_free_free:pp %p", pp);
2754 	} else {
2755 		PP_CLRAGED(pp);
2756 
2757 		if (!dontneed || nopageage) {
2758 			/* move it to the tail of the list */
2759 			page_list_add(pp, PG_CACHE_LIST | PG_LIST_TAIL);
2760 
2761 			VM_STAT_ADD(pagecnt.pc_free_cache);
2762 			TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_TAIL,
2763 			    "page_free_cache_tail:pp %p", pp);
2764 		} else {
2765 			page_list_add(pp, PG_CACHE_LIST | PG_LIST_HEAD);
2766 
2767 			VM_STAT_ADD(pagecnt.pc_free_dontneed);
2768 			TRACE_1(TR_FAC_VM, TR_PAGE_FREE_CACHE_HEAD,
2769 			    "page_free_cache_head:pp %p", pp);
2770 		}
2771 	}
2772 	page_unlock(pp);
2773 
2774 	/*
2775 	 * Now do the `freemem' accounting.
2776 	 */
2777 	pcf_index = PCF_INDEX();
2778 	p = &pcf[pcf_index];
2779 	p->pcf_touch = 1;
2780 
2781 	mutex_enter(&p->pcf_lock);
2782 	if (p->pcf_block) {
2783 		p->pcf_reserve += 1;
2784 	} else {
2785 		p->pcf_count += 1;
2786 		if (p->pcf_wait) {
2787 			mutex_enter(&new_freemem_lock);
2788 			/*
2789 			 * Check to see if some other thread
2790 			 * is actually waiting.  Another bucket
2791 			 * may have woken it up by now.  If there
2792 			 * are no waiters, then set our pcf_wait
2793 			 * count to zero to avoid coming in here
2794 			 * next time.  Also, since only one page
2795 			 * was put on the free list, just wake
2796 			 * up one waiter.
2797 			 */
2798 			if (freemem_wait) {
2799 				cv_signal(&freemem_cv);
2800 				p->pcf_wait--;
2801 			} else {
2802 				p->pcf_wait = 0;
2803 			}
2804 			mutex_exit(&new_freemem_lock);
2805 		}
2806 	}
2807 	mutex_exit(&p->pcf_lock);
2808 
2809 	/* freemem is approximate, so this test OK */
2810 	if (!p->pcf_block)
2811 		freemem += 1;
2812 }
2813 
2814 /*
2815  * Put page on the "free" list during intial startup.
2816  * This happens during initial single threaded execution.
2817  */
2818 void
2819 page_free_at_startup(page_t *pp)
2820 {
2821 	struct pcf	*p;
2822 	uint_t		pcf_index;
2823 
2824 	page_list_add(pp, PG_FREE_LIST | PG_LIST_HEAD | PG_LIST_ISINIT);
2825 	VM_STAT_ADD(pagecnt.pc_free_free);
2826 
2827 	/*
2828 	 * Now do the `freemem' accounting.
2829 	 */
2830 	pcf_index = PCF_INDEX();
2831 	p = &pcf[pcf_index];
2832 	p->pcf_touch = 1;
2833 
2834 	ASSERT(p->pcf_block == 0);
2835 	ASSERT(p->pcf_wait == 0);
2836 	p->pcf_count += 1;
2837 
2838 	/* freemem is approximate, so this is OK */
2839 	freemem += 1;
2840 }
2841 
2842 void
2843 page_free_pages(page_t *pp)
2844 {
2845 	page_t	*tpp, *rootpp = NULL;
2846 	pgcnt_t	pgcnt = page_get_pagecnt(pp->p_szc);
2847 	pgcnt_t	i;
2848 	uint_t	szc = pp->p_szc;
2849 
2850 	VM_STAT_ADD(pagecnt.pc_free_pages);
2851 	TRACE_1(TR_FAC_VM, TR_PAGE_FREE_FREE,
2852 	    "page_free_free:pp %p", pp);
2853 
2854 	ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes());
2855 	if ((page_pptonum(pp) & (pgcnt - 1)) != 0) {
2856 		panic("page_free_pages: not root page %p", (void *)pp);
2857 		/*NOTREACHED*/
2858 	}
2859 
2860 	for (i = 0, tpp = pp; i < pgcnt; i++, tpp++) {
2861 		ASSERT((PAGE_EXCL(tpp) &&
2862 		    !page_iolock_assert(tpp)) || panicstr);
2863 		if (PP_ISFREE(tpp)) {
2864 			panic("page_free_pages: page %p is free", (void *)tpp);
2865 			/*NOTREACHED*/
2866 		}
2867 		if (hat_page_is_mapped(tpp) || tpp->p_lckcnt != 0 ||
2868 		    tpp->p_cowcnt != 0) {
2869 			panic("page_free_pages %p", (void *)tpp);
2870 			/*NOTREACHED*/
2871 		}
2872 
2873 		ASSERT(!hat_page_getshare(tpp));
2874 		ASSERT(tpp->p_vnode == NULL);
2875 		ASSERT(tpp->p_szc == szc);
2876 
2877 		PP_SETFREE(tpp);
2878 		page_clr_all_props(tpp);
2879 		PP_SETAGED(tpp);
2880 		tpp->p_offset = (u_offset_t)-1;
2881 		ASSERT(tpp->p_next == tpp);
2882 		ASSERT(tpp->p_prev == tpp);
2883 		page_list_concat(&rootpp, &tpp);
2884 	}
2885 	ASSERT(rootpp == pp);
2886 
2887 	page_list_add_pages(rootpp, 0);
2888 	page_create_putback(pgcnt);
2889 }
2890 
2891 int free_pages = 1;
2892 
2893 /*
2894  * This routine attempts to return pages to the cachelist via page_release().
2895  * It does not *have* to be successful in all cases, since the pageout scanner
2896  * will catch any pages it misses.  It does need to be fast and not introduce
2897  * too much overhead.
2898  *
2899  * If a page isn't found on the unlocked sweep of the page_hash bucket, we
2900  * don't lock and retry.  This is ok, since the page scanner will eventually
2901  * find any page we miss in free_vp_pages().
2902  */
2903 void
2904 free_vp_pages(vnode_t *vp, u_offset_t off, size_t len)
2905 {
2906 	page_t *pp;
2907 	u_offset_t eoff;
2908 	extern int swap_in_range(vnode_t *, u_offset_t, size_t);
2909 
2910 	eoff = off + len;
2911 
2912 	if (free_pages == 0)
2913 		return;
2914 	if (swap_in_range(vp, off, len))
2915 		return;
2916 
2917 	for (; off < eoff; off += PAGESIZE) {
2918 
2919 		/*
2920 		 * find the page using a fast, but inexact search. It'll be OK
2921 		 * if a few pages slip through the cracks here.
2922 		 */
2923 		pp = page_exists(vp, off);
2924 
2925 		/*
2926 		 * If we didn't find the page (it may not exist), the page
2927 		 * is free, looks still in use (shared), or we can't lock it,
2928 		 * just give up.
2929 		 */
2930 		if (pp == NULL ||
2931 		    PP_ISFREE(pp) ||
2932 		    page_share_cnt(pp) > 0 ||
2933 		    !page_trylock(pp, SE_EXCL))
2934 			continue;
2935 
2936 		/*
2937 		 * Once we have locked pp, verify that it's still the
2938 		 * correct page and not already free
2939 		 */
2940 		ASSERT(PAGE_LOCKED_SE(pp, SE_EXCL));
2941 		if (pp->p_vnode != vp || pp->p_offset != off || PP_ISFREE(pp)) {
2942 			page_unlock(pp);
2943 			continue;
2944 		}
2945 
2946 		/*
2947 		 * try to release the page...
2948 		 */
2949 		(void) page_release(pp, 1);
2950 	}
2951 }
2952 
2953 /*
2954  * Reclaim the given page from the free list.
2955  * Returns 1 on success or 0 on failure.
2956  *
2957  * The page is unlocked if it can't be reclaimed (when freemem == 0).
2958  * If `lock' is non-null, it will be dropped and re-acquired if
2959  * the routine must wait while freemem is 0.
2960  *
2961  * As it turns out, boot_getpages() does this.  It picks a page,
2962  * based on where OBP mapped in some address, gets its pfn, searches
2963  * the memsegs, locks the page, then pulls it off the free list!
2964  */
2965 int
2966 page_reclaim(page_t *pp, kmutex_t *lock)
2967 {
2968 	struct pcf	*p;
2969 	uint_t		pcf_index;
2970 	struct cpu	*cpup;
2971 	uint_t		i;
2972 	pgcnt_t		npgs, need;
2973 	pgcnt_t		collected = 0;
2974 
2975 	ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);
2976 	ASSERT(PAGE_EXCL(pp) && PP_ISFREE(pp));
2977 
2978 	npgs = page_get_pagecnt(pp->p_szc);
2979 
2980 	/*
2981 	 * If `freemem' is 0, we cannot reclaim this page from the
2982 	 * freelist, so release every lock we might hold: the page,
2983 	 * and the `lock' before blocking.
2984 	 *
2985 	 * The only way `freemem' can become 0 while there are pages
2986 	 * marked free (have their p->p_free bit set) is when the
2987 	 * system is low on memory and doing a page_create().  In
2988 	 * order to guarantee that once page_create() starts acquiring
2989 	 * pages it will be able to get all that it needs since `freemem'
2990 	 * was decreased by the requested amount.  So, we need to release
2991 	 * this page, and let page_create() have it.
2992 	 *
2993 	 * Since `freemem' being zero is not supposed to happen, just
2994 	 * use the usual hash stuff as a starting point.  If that bucket
2995 	 * is empty, then assume the worst, and start at the beginning
2996 	 * of the pcf array.  If we always start at the beginning
2997 	 * when acquiring more than one pcf lock, there won't be any
2998 	 * deadlock problems.
2999 	 */
3000 
3001 	/* TODO: Do we need to test kcage_freemem if PG_NORELOC(pp)? */
3002 
3003 	if (freemem <= throttlefree && !page_create_throttle(npgs, 0)) {
3004 		pcf_acquire_all();
3005 		goto page_reclaim_nomem;
3006 	}
3007 
3008 	pcf_index = PCF_INDEX();
3009 	p = &pcf[pcf_index];
3010 	p->pcf_touch = 1;
3011 	mutex_enter(&p->pcf_lock);
3012 	if (p->pcf_count >= npgs) {
3013 		collected = npgs;
3014 		p->pcf_count -= npgs;
3015 	}
3016 	mutex_exit(&p->pcf_lock);
3017 	need = npgs - collected;
3018 
3019 	if (need > 0) {
3020 		VM_STAT_ADD(page_reclaim_zero);
3021 		/*
3022 		 * Check again. Its possible that some other thread
3023 		 * could have been right behind us, and added one
3024 		 * to a list somewhere.  Acquire each of the pcf locks
3025 		 * until we find a page.
3026 		 */
3027 		p = pcf;
3028 		for (i = 0; i < PCF_FANOUT; i++) {
3029 			p->pcf_touch = 1;
3030 			mutex_enter(&p->pcf_lock);
3031 			if (p->pcf_count) {
3032 				if (p->pcf_count >= need) {
3033 					p->pcf_count -= need;
3034 					collected += need;
3035 					need = 0;
3036 					break;
3037 				} else if (p->pcf_count) {
3038 					collected += p->pcf_count;
3039 					need -= p->pcf_count;
3040 					p->pcf_count = 0;
3041 				}
3042 			}
3043 			p++;
3044 		}
3045 
3046 		if (need > 0) {
3047 page_reclaim_nomem:
3048 			/*
3049 			 * We really can't have page `pp'.
3050 			 * Time for the no-memory dance with
3051 			 * page_free().  This is just like
3052 			 * page_create_wait().  Plus the added
3053 			 * attraction of releasing whatever mutex
3054 			 * we held when we were called with in `lock'.
3055 			 * Page_unlock() will wakeup any thread
3056 			 * waiting around for this page.
3057 			 */
3058 			if (lock) {
3059 				VM_STAT_ADD(page_reclaim_zero_locked);
3060 				mutex_exit(lock);
3061 			}
3062 			page_unlock(pp);
3063 
3064 			/*
3065 			 * get this before we drop all the pcf locks.
3066 			 */
3067 			mutex_enter(&new_freemem_lock);
3068 
3069 			p = pcf;
3070 			p->pcf_count += collected;
3071 			for (i = 0; i < PCF_FANOUT; i++) {
3072 				p->pcf_wait++;
3073 				mutex_exit(&p->pcf_lock);
3074 				p++;
3075 			}
3076 
3077 			freemem_wait++;
3078 			cv_wait(&freemem_cv, &new_freemem_lock);
3079 			freemem_wait--;
3080 
3081 			mutex_exit(&new_freemem_lock);
3082 
3083 			if (lock) {
3084 				mutex_enter(lock);
3085 			}
3086 			return (0);
3087 		}
3088 
3089 		/*
3090 		 * We beat the PCF bins over the head until
3091 		 * we got the memory that we wanted.
3092 		 * The pcf accounting has been done,
3093 		 * though none of the pcf_wait flags have been set,
3094 		 * drop the locks and continue on.
3095 		 */
3096 		ASSERT(collected == npgs);
3097 		while (p >= pcf) {
3098 			mutex_exit(&p->pcf_lock);
3099 			p--;
3100 		}
3101 	}
3102 
3103 	/*
3104 	 * freemem is not protected by any lock. Thus, we cannot
3105 	 * have any assertion containing freemem here.
3106 	 */
3107 	freemem -= npgs;
3108 
3109 	VM_STAT_ADD(pagecnt.pc_reclaim);
3110 	if (PP_ISAGED(pp)) {
3111 		if (npgs > 1) {
3112 			page_list_sub_pages(pp, pp->p_szc);
3113 		} else {
3114 			page_list_sub(pp, PG_FREE_LIST);
3115 		}
3116 		TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_FREE,
3117 		    "page_reclaim_free:pp %p", pp);
3118 	} else {
3119 		ASSERT(npgs == 1);
3120 		page_list_sub(pp, PG_CACHE_LIST);
3121 		TRACE_1(TR_FAC_VM, TR_PAGE_UNFREE_CACHE,
3122 		    "page_reclaim_cache:pp %p", pp);
3123 	}
3124 
3125 	/*
3126 	 * clear the p_free & p_age bits since this page is no longer
3127 	 * on the free list.  Notice that there was a brief time where
3128 	 * a page is marked as free, but is not on the list.
3129 	 *
3130 	 * Set the reference bit to protect against immediate pageout.
3131 	 */
3132 	for (i = 0; i < npgs; i++, pp++) {
3133 		PP_CLRFREE(pp);
3134 		PP_CLRAGED(pp);
3135 		page_set_props(pp, P_REF);
3136 	}
3137 
3138 	CPU_STATS_ENTER_K();
3139 	cpup = CPU;	/* get cpup now that CPU cannot change */
3140 	CPU_STATS_ADDQ(cpup, vm, pgrec, 1);
3141 	CPU_STATS_ADDQ(cpup, vm, pgfrec, 1);
3142 	CPU_STATS_EXIT_K();
3143 
3144 	return (1);
3145 }
3146 
3147 
3148 
3149 /*
3150  * Destroy identity of the page and put it back on
3151  * the page free list.  Assumes that the caller has
3152  * acquired the "exclusive" lock on the page.
3153  */
3154 void
3155 page_destroy(page_t *pp, int dontfree)
3156 {
3157 	ASSERT((PAGE_EXCL(pp) &&
3158 	    !page_iolock_assert(pp)) || panicstr);
3159 
3160 	if (pp->p_szc != 0) {
3161 		if (pp->p_vnode == NULL || IS_SWAPFSVP(pp->p_vnode) ||
3162 		    pp->p_vnode == &kvp) {
3163 			panic("page_destroy: anon or kernel or no vnode "
3164 			    "large page %p", (void *)pp);
3165 		}
3166 		page_demote_vp_pages(pp);
3167 		ASSERT(pp->p_szc == 0);
3168 	}
3169 
3170 	TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy:pp %p", pp);
3171 
3172 	/*
3173 	 * Unload translations, if any, then hash out the
3174 	 * page to erase its identity.
3175 	 */
3176 	(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
3177 	page_hashout(pp, NULL);
3178 
3179 	if (!dontfree) {
3180 		/*
3181 		 * Acquire the "freemem_lock" for availrmem.
3182 		 * The page_struct_lock need not be acquired for lckcnt
3183 		 * and cowcnt since the page has an "exclusive" lock.
3184 		 */
3185 		if ((pp->p_lckcnt != 0) || (pp->p_cowcnt != 0)) {
3186 			mutex_enter(&freemem_lock);
3187 			if (pp->p_lckcnt != 0) {
3188 				availrmem++;
3189 				pp->p_lckcnt = 0;
3190 			}
3191 			if (pp->p_cowcnt != 0) {
3192 				availrmem += pp->p_cowcnt;
3193 				pp->p_cowcnt = 0;
3194 			}
3195 			mutex_exit(&freemem_lock);
3196 		}
3197 		/*
3198 		 * Put the page on the "free" list.
3199 		 */
3200 		page_free(pp, 0);
3201 	}
3202 }
3203 
3204 void
3205 page_destroy_pages(page_t *pp)
3206 {
3207 
3208 	page_t	*tpp, *rootpp = NULL;
3209 	pgcnt_t	pgcnt = page_get_pagecnt(pp->p_szc);
3210 	pgcnt_t	i, pglcks = 0;
3211 	uint_t	szc = pp->p_szc;
3212 
3213 	ASSERT(pp->p_szc != 0 && pp->p_szc < page_num_pagesizes());
3214 
3215 	VM_STAT_ADD(pagecnt.pc_destroy_pages);
3216 
3217 	TRACE_1(TR_FAC_VM, TR_PAGE_DESTROY, "page_destroy_pages:pp %p", pp);
3218 
3219 	if ((page_pptonum(pp) & (pgcnt - 1)) != 0) {
3220 		panic("page_destroy_pages: not root page %p", (void *)pp);
3221 		/*NOTREACHED*/
3222 	}
3223 
3224 	for (i = 0, tpp = pp; i < pgcnt; i++, tpp++) {
3225 		ASSERT((PAGE_EXCL(tpp) &&
3226 		    !page_iolock_assert(tpp)) || panicstr);
3227 		(void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD);
3228 		page_hashout(tpp, NULL);
3229 		ASSERT(tpp->p_offset == (u_offset_t)-1);
3230 		if (tpp->p_lckcnt != 0) {
3231 			pglcks++;
3232 			tpp->p_lckcnt = 0;
3233 		} else if (tpp->p_cowcnt != 0) {
3234 			pglcks += tpp->p_cowcnt;
3235 			tpp->p_cowcnt = 0;
3236 		}
3237 		ASSERT(!hat_page_getshare(tpp));
3238 		ASSERT(tpp->p_vnode == NULL);
3239 		ASSERT(tpp->p_szc == szc);
3240 
3241 		PP_SETFREE(tpp);
3242 		page_clr_all_props(tpp);
3243 		PP_SETAGED(tpp);
3244 		ASSERT(tpp->p_next == tpp);
3245 		ASSERT(tpp->p_prev == tpp);
3246 		page_list_concat(&rootpp, &tpp);
3247 	}
3248 
3249 	ASSERT(rootpp == pp);
3250 	if (pglcks != 0) {
3251 		mutex_enter(&freemem_lock);
3252 		availrmem += pglcks;
3253 		mutex_exit(&freemem_lock);
3254 	}
3255 
3256 	page_list_add_pages(rootpp, 0);
3257 	page_create_putback(pgcnt);
3258 }
3259 
/*
 * Similar to page_destroy(), but destroys pages which are
 * locked and known to be on the page free list.  Since
 * the page is known to be free and locked, no one can access
 * it.
 *
 * The page must be held "exclusive", be free but not "aged" (it is
 * taken off PG_CACHE_LIST below), still have a vnode, be unmapped with
 * no mod/ref/ro attributes set, and be a small (p_szc == 0) page.
 * On return the page has no identity, sits at the tail of the
 * PG_FREE_LIST and has been unlocked.
 *
 * Also, the number of free pages does not change.
 */
void
page_destroy_free(page_t *pp)
{
	ASSERT(PAGE_EXCL(pp));
	ASSERT(PP_ISFREE(pp));
	ASSERT(pp->p_vnode);
	ASSERT(hat_page_getattr(pp, P_MOD | P_REF | P_RO) == 0);
	ASSERT(!hat_page_is_mapped(pp));
	ASSERT(PP_ISAGED(pp) == 0);
	ASSERT(pp->p_szc == 0);

	VM_STAT_ADD(pagecnt.pc_destroy_free);

	/* Take the page off the cache list before its identity goes away. */
	page_list_sub(pp, PG_CACHE_LIST);

	/* Erase the [vnode, offset] identity and verify it is really gone. */
	page_hashout(pp, NULL);
	ASSERT(pp->p_vnode == NULL);
	ASSERT(pp->p_offset == (u_offset_t)-1);
	ASSERT(pp->p_hash == NULL);

	/*
	 * The page no longer names anything: mark it aged, queue it at
	 * the tail of the free list and drop our lock on it.
	 */
	PP_SETAGED(pp);
	page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
	page_unlock(pp);

	/* Wake one thread waiting on freemem_cv, if any thread is waiting. */
	mutex_enter(&new_freemem_lock);
	if (freemem_wait) {
		cv_signal(&freemem_cv);
	}
	mutex_exit(&new_freemem_lock);
}
3297 
3298 /*
3299  * Rename the page "opp" to have an identity specified
3300  * by [vp, off].  If a page already exists with this name
3301  * it is locked and destroyed.  Note that the page's
3302  * translations are not unloaded during the rename.
3303  *
3304  * This routine is used by the anon layer to "steal" the
3305  * original page and is not unlike destroying a page and
3306  * creating a new page using the same page frame.
3307  *
3308  * XXX -- Could deadlock if caller 1 tries to rename A to B while
3309  * caller 2 tries to rename B to A.
3310  */
3311 void
3312 page_rename(page_t *opp, vnode_t *vp, u_offset_t off)
3313 {
3314 	page_t		*pp;
3315 	int		olckcnt = 0;
3316 	int		ocowcnt = 0;
3317 	kmutex_t	*phm;
3318 	ulong_t		index;
3319 
3320 	ASSERT(PAGE_EXCL(opp) && !page_iolock_assert(opp));
3321 	ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
3322 	ASSERT(PP_ISFREE(opp) == 0);
3323 
3324 	VM_STAT_ADD(page_rename_count);
3325 
3326 	TRACE_3(TR_FAC_VM, TR_PAGE_RENAME,
3327 		"page rename:pp %p vp %p off %llx", opp, vp, off);
3328 
3329 	/*
3330 	 * CacheFS may call page_rename for a large NFS page
3331 	 * when both CacheFS and NFS mount points are used
3332 	 * by applications. Demote this large page before
3333 	 * renaming it, to ensure that there are no "partial"
3334 	 * large pages left lying around.
3335 	 */
3336 	if (opp->p_szc != 0) {
3337 		vnode_t *ovp = opp->p_vnode;
3338 		ASSERT(ovp != NULL);
3339 		ASSERT(!IS_SWAPFSVP(ovp));
3340 		ASSERT(ovp != &kvp);
3341 		page_demote_vp_pages(opp);
3342 		ASSERT(opp->p_szc == 0);
3343 	}
3344 
3345 	page_hashout(opp, NULL);
3346 	PP_CLRAGED(opp);
3347 
3348 	/*
3349 	 * Acquire the appropriate page hash lock, since
3350 	 * we're going to rename the page.
3351 	 */
3352 	index = PAGE_HASH_FUNC(vp, off);
3353 	phm = PAGE_HASH_MUTEX(index);
3354 	mutex_enter(phm);
3355 top:
3356 	/*
3357 	 * Look for an existing page with this name and destroy it if found.
3358 	 * By holding the page hash lock all the way to the page_hashin()
3359 	 * call, we are assured that no page can be created with this
3360 	 * identity.  In the case when the phm lock is dropped to undo any
3361 	 * hat layer mappings, the existing page is held with an "exclusive"
3362 	 * lock, again preventing another page from being created with
3363 	 * this identity.
3364 	 */
3365 	PAGE_HASH_SEARCH(index, pp, vp, off);
3366 	if (pp != NULL) {
3367 		VM_STAT_ADD(page_rename_exists);
3368 
3369 		/*
3370 		 * As it turns out, this is one of only two places where
3371 		 * page_lock() needs to hold the passed in lock in the
3372 		 * successful case.  In all of the others, the lock could
3373 		 * be dropped as soon as the attempt is made to lock
3374 		 * the page.  It is tempting to add yet another arguement,
3375 		 * PL_KEEP or PL_DROP, to let page_lock know what to do.
3376 		 */
3377 		if (!page_lock(pp, SE_EXCL, phm, P_RECLAIM)) {
3378 			/*
3379 			 * Went to sleep because the page could not
3380 			 * be locked.  We were woken up when the page
3381 			 * was unlocked, or when the page was destroyed.
3382 			 * In either case, `phm' was dropped while we
3383 			 * slept.  Hence we should not just roar through
3384 			 * this loop.
3385 			 */
3386 			goto top;
3387 		}
3388 
3389 		/*
3390 		 * If an existing page is a large page, then demote
3391 		 * it to ensure that no "partial" large pages are
3392 		 * "created" after page_rename. An existing page
3393 		 * can be a CacheFS page, and can't belong to swapfs.
3394 		 */
3395 		if (hat_page_is_mapped(pp)) {
3396 			/*
3397 			 * Unload translations.  Since we hold the
3398 			 * exclusive lock on this page, the page
3399 			 * can not be changed while we drop phm.
3400 			 * This is also not a lock protocol violation,
3401 			 * but rather the proper way to do things.
3402 			 */
3403 			mutex_exit(phm);
3404 			(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
3405 			if (pp->p_szc != 0) {
3406 				ASSERT(!IS_SWAPFSVP(vp));
3407 				ASSERT(vp != &kvp);
3408 				page_demote_vp_pages(pp);
3409 				ASSERT(pp->p_szc == 0);
3410 			}
3411 			mutex_enter(phm);
3412 		} else if (pp->p_szc != 0) {
3413 			ASSERT(!IS_SWAPFSVP(vp));
3414 			ASSERT(vp != &kvp);
3415 			mutex_exit(phm);
3416 			page_demote_vp_pages(pp);
3417 			ASSERT(pp->p_szc == 0);
3418 			mutex_enter(phm);
3419 		}
3420 		page_hashout(pp, phm);
3421 	}
3422 	/*
3423 	 * Hash in the page with the new identity.
3424 	 */
3425 	if (!page_hashin(opp, vp, off, phm)) {
3426 		/*
3427 		 * We were holding phm while we searched for [vp, off]
3428 		 * and only dropped phm if we found and locked a page.
3429 		 * If we can't create this page now, then some thing
3430 		 * is really broken.
3431 		 */
3432 		panic("page_rename: Can't hash in page: %p", (void *)pp);
3433 		/*NOTREACHED*/
3434 	}
3435 
3436 	ASSERT(MUTEX_HELD(phm));
3437 	mutex_exit(phm);
3438 
3439 	/*
3440 	 * Now that we have dropped phm, lets get around to finishing up
3441 	 * with pp.
3442 	 */
3443 	if (pp != NULL) {
3444 		ASSERT(!hat_page_is_mapped(pp));
3445 		/* for now large pages should not end up here */
3446 		ASSERT(pp->p_szc == 0);
3447 		/*
3448 		 * Save the locks for transfer to the new page and then
3449 		 * clear them so page_free doesn't think they're important.
3450 		 * The page_struct_lock need not be acquired for lckcnt and
3451 		 * cowcnt since the page has an "exclusive" lock.
3452 		 */
3453 		olckcnt = pp->p_lckcnt;
3454 		ocowcnt = pp->p_cowcnt;
3455 		pp->p_lckcnt = pp->p_cowcnt = 0;
3456 
3457 		/*
3458 		 * Put the page on the "free" list after we drop
3459 		 * the lock.  The less work under the lock the better.
3460 		 */
3461 		/*LINTED: constant in conditional context*/
3462 		VN_DISPOSE(pp, B_FREE, 0, kcred);
3463 	}
3464 
3465 	/*
3466 	 * Transfer the lock count from the old page (if any).
3467 	 * The page_struct_lock need not be acquired for lckcnt and
3468 	 * cowcnt since the page has an "exclusive" lock.
3469 	 */
3470 	opp->p_lckcnt += olckcnt;
3471 	opp->p_cowcnt += ocowcnt;
3472 }
3473 
3474 /*
3475  * low level routine to add page `pp' to the hash and vp chains for [vp, offset]
3476  *
3477  * Pages are normally inserted at the start of a vnode's v_pages list.
3478  * If the vnode is VMODSORT and the page is modified, it goes at the end.
3479  * This can happen when a modified page is relocated for DR.
3480  *
3481  * Returns 1 on success and 0 on failure.
3482  */
3483 static int
3484 page_do_hashin(page_t *pp, vnode_t *vp, u_offset_t offset)
3485 {
3486 	page_t		**listp;
3487 	page_t		*tp;
3488 	ulong_t		index;
3489 
3490 	ASSERT(PAGE_EXCL(pp));
3491 	ASSERT(vp != NULL);
3492 	ASSERT(MUTEX_HELD(page_vnode_mutex(vp)));
3493 
3494 	/*
3495 	 * Be sure to set these up before the page is inserted on the hash
3496 	 * list.  As soon as the page is placed on the list some other
3497 	 * thread might get confused and wonder how this page could
3498 	 * possibly hash to this list.
3499 	 */
3500 	pp->p_vnode = vp;
3501 	pp->p_offset = offset;
3502 
3503 	/*
3504 	 * record if this page is on a swap vnode
3505 	 */
3506 	if ((vp->v_flag & VISSWAP) != 0)
3507 		PP_SETSWAP(pp);
3508 
3509 	index = PAGE_HASH_FUNC(vp, offset);
3510 	ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(index)));
3511 	listp = &page_hash[index];
3512 
3513 	/*
3514 	 * If this page is already hashed in, fail this attempt to add it.
3515 	 */
3516 	for (tp = *listp; tp != NULL; tp = tp->p_hash) {
3517 		if (tp->p_vnode == vp && tp->p_offset == offset) {
3518 			pp->p_vnode = NULL;
3519 			pp->p_offset = (u_offset_t)(-1);
3520 			return (0);
3521 		}
3522 	}
3523 	pp->p_hash = *listp;
3524 	*listp = pp;
3525 
3526 	/*
3527 	 * Add the page to the vnode's list of pages
3528 	 */
3529 	if (vp->v_pages != NULL && IS_VMODSORT(vp) && hat_ismod(pp))
3530 		listp = &vp->v_pages->p_vpprev->p_vpnext;
3531 	else
3532 		listp = &vp->v_pages;
3533 
3534 	page_vpadd(listp, pp);
3535 
3536 	return (1);
3537 }
3538 
3539 /*
3540  * Add page `pp' to both the hash and vp chains for [vp, offset].
3541  *
3542  * Returns 1 on success and 0 on failure.
3543  * If hold is passed in, it is not dropped.
3544  */
3545 int
3546 page_hashin(page_t *pp, vnode_t *vp, u_offset_t offset, kmutex_t *hold)
3547 {
3548 	kmutex_t	*phm = NULL;
3549 	kmutex_t	*vphm;
3550 	int		rc;
3551 
3552 	ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(vp)));
3553 
3554 	TRACE_3(TR_FAC_VM, TR_PAGE_HASHIN,
3555 		"page_hashin:pp %p vp %p offset %llx",
3556 		pp, vp, offset);
3557 
3558 	VM_STAT_ADD(hashin_count);
3559 
3560 	if (hold != NULL)
3561 		phm = hold;
3562 	else {
3563 		VM_STAT_ADD(hashin_not_held);
3564 		phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, offset));
3565 		mutex_enter(phm);
3566 	}
3567 
3568 	vphm = page_vnode_mutex(vp);
3569 	mutex_enter(vphm);
3570 	rc = page_do_hashin(pp, vp, offset);
3571 	mutex_exit(vphm);
3572 	if (hold == NULL)
3573 		mutex_exit(phm);
3574 	if (rc == 0)
3575 		VM_STAT_ADD(hashin_already);
3576 	return (rc);
3577 }
3578 
3579 /*
3580  * Remove page ``pp'' from the hash and vp chains and remove vp association.
3581  * All mutexes must be held
3582  */
3583 static void
3584 page_do_hashout(page_t *pp)
3585 {
3586 	page_t	**hpp;
3587 	page_t	*hp;
3588 	vnode_t	*vp = pp->p_vnode;
3589 
3590 	ASSERT(vp != NULL);
3591 	ASSERT(MUTEX_HELD(page_vnode_mutex(vp)));
3592 
3593 	/*
3594 	 * First, take pp off of its hash chain.
3595 	 */
3596 	hpp = &page_hash[PAGE_HASH_FUNC(vp, pp->p_offset)];
3597 
3598 	for (;;) {
3599 		hp = *hpp;
3600 		if (hp == pp)
3601 			break;
3602 		if (hp == NULL) {
3603 			panic("page_do_hashout");
3604 			/*NOTREACHED*/
3605 		}
3606 		hpp = &hp->p_hash;
3607 	}
3608 	*hpp = pp->p_hash;
3609 
3610 	/*
3611 	 * Now remove it from its associated vnode.
3612 	 */
3613 	if (vp->v_pages)
3614 		page_vpsub(&vp->v_pages, pp);
3615 
3616 	pp->p_hash = NULL;
3617 	page_clr_all_props(pp);
3618 	PP_CLRSWAP(pp);
3619 	pp->p_vnode = NULL;
3620 	pp->p_offset = (u_offset_t)-1;
3621 }
3622 
/*
 * Remove page ``pp'' from the hash and vp chains and remove vp association.
 *
 * When `phm' is non-NULL it contains the address of the mutex protecting the
 * hash list pp is on.  It is not dropped.  When `phm' is NULL the hash
 * mutex is looked up, taken and released in here.
 *
 * The page must have a vnode, must be held "exclusive" with no i/o lock
 * (unless the system is panicking), and the caller must not hold the
 * vnode page mutex.  Threads sleeping on the page's condition variable
 * are woken once the identity is gone.
 */
void
page_hashout(page_t *pp, kmutex_t *phm)
{
	vnode_t		*vp;
	ulong_t		index;
	kmutex_t	*nphm;		/* only set and used when phm == NULL */
	kmutex_t	*vphm;
	kmutex_t	*sep;

	ASSERT(phm != NULL ? MUTEX_HELD(phm) : 1);
	ASSERT(pp->p_vnode != NULL);
	ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr);
	ASSERT(MUTEX_NOT_HELD(page_vnode_mutex(pp->p_vnode)));

	vp = pp->p_vnode;

	TRACE_2(TR_FAC_VM, TR_PAGE_HASHOUT,
		"page_hashout:pp %p vp %p", pp, vp);

	/* Kernel probe */
	TNF_PROBE_2(page_unmap, "vm pagefault", /* CSTYLED */,
	    tnf_opaque, vnode, vp,
	    tnf_offset, offset, pp->p_offset);

	/*
	 * Find the hash chain for the page's current [vp, offset] and
	 * take its mutex, unless the caller passed it in already held.
	 */
	VM_STAT_ADD(hashout_count);
	index = PAGE_HASH_FUNC(vp, pp->p_offset);
	if (phm == NULL) {
		VM_STAT_ADD(hashout_not_held);
		nphm = PAGE_HASH_MUTEX(index);
		mutex_enter(nphm);
	}
	/* A mutex handed in by the caller must be the one for this chain. */
	ASSERT(phm ? phm == PAGE_HASH_MUTEX(index) : 1);


	/*
	 * grab page vnode mutex and remove it...
	 */
	vphm = page_vnode_mutex(vp);
	mutex_enter(vphm);

	page_do_hashout(pp);

	mutex_exit(vphm);
	/* Release the hash mutex only if it was acquired above. */
	if (phm == NULL)
		mutex_exit(nphm);

	/*
	 * Wake up processes waiting for this page.  The page's
	 * identity has been changed, and is probably not the
	 * desired page any longer.
	 */
	sep = page_se_mutex(pp);
	mutex_enter(sep);
	pp->p_selock &= ~SE_EWANTED;
	if (CV_HAS_WAITERS(&pp->p_cv))
		cv_broadcast(&pp->p_cv);
	mutex_exit(sep);
}
3690 
/*
 * Add the page to the front of a linked list of pages
 * using the p_next & p_prev pointers for the list.
 * The caller is responsible for protecting the list pointers.
 *
 * `ppp' points at the list head pointer and is updated to refer to `pp'.
 * The page must be held "exclusive", or held "shared" together with its
 * i/o lock (asserted); the list manipulation itself is done by
 * page_add_common().
 */
void
page_add(page_t **ppp, page_t *pp)
{
	ASSERT(PAGE_EXCL(pp) || (PAGE_SHARED(pp) && page_iolock_assert(pp)));

	page_add_common(ppp, pp);
}
3703 
3704 
3705 
3706 /*
3707  *  Common code for page_add() and mach_page_add()
3708  */
3709 void
3710 page_add_common(page_t **ppp, page_t *pp)
3711 {
3712 	if (*ppp == NULL) {
3713 		pp->p_next = pp->p_prev = pp;
3714 	} else {
3715 		pp->p_next = *ppp;
3716 		pp->p_prev = (*ppp)->p_prev;
3717 		(*ppp)->p_prev = pp;
3718 		pp->p_prev->p_next = pp;
3719 	}
3720 	*ppp = pp;
3721 }
3722 
3723 
/*
 * Remove this page from a linked list of pages
 * using the p_next & p_prev pointers for the list.
 *
 * `ppp' points at the list head pointer, which is updated if `pp' was
 * the head.  Unless the page is free, it must be held "exclusive", or
 * held "shared" together with its i/o lock (asserted).  An empty list
 * or a NULL `pp' panics; the unlinking itself is done by
 * page_sub_common().
 *
 * The caller is responsible for protecting the list pointers.
 */
void
page_sub(page_t **ppp, page_t *pp)
{
	ASSERT((PP_ISFREE(pp)) ? 1 :
	    (PAGE_EXCL(pp)) || (PAGE_SHARED(pp) && page_iolock_assert(pp)));

	/* Nothing sensible can be removed from an empty list. */
	if (*ppp == NULL || pp == NULL) {
		panic("page_sub: bad arg(s): pp %p, *ppp %p",
		    (void *)pp, (void *)(*ppp));
		/*NOTREACHED*/
	}

	page_sub_common(ppp, pp);
}
3744 
3745 
3746 /*
3747  *  Common code for page_sub() and mach_page_sub()
3748  */
3749 void
3750 page_sub_common(page_t **ppp, page_t *pp)
3751 {
3752 	if (*ppp == pp)
3753 		*ppp = pp->p_next;		/* go to next page */
3754 
3755 	if (*ppp == pp)
3756 		*ppp = NULL;			/* page list is gone */
3757 	else {
3758 		pp->p_prev->p_next = pp->p_next;
3759 		pp->p_next->p_prev = pp->p_prev;
3760 	}
3761 	pp->p_prev = pp->p_next = pp;		/* make pp a list of one */
3762 }
3763 
3764 
3765 /*
3766  * Break page list cppp into two lists with npages in the first list.
3767  * The tail is returned in nppp.
3768  */
3769 void
3770 page_list_break(page_t **oppp, page_t **nppp, pgcnt_t npages)
3771 {
3772 	page_t *s1pp = *oppp;
3773 	page_t *s2pp;
3774 	page_t *e1pp, *e2pp;
3775 	long n = 0;
3776 
3777 	if (s1pp == NULL) {
3778 		*nppp = NULL;
3779 		return;
3780 	}
3781 	if (npages == 0) {
3782 		*nppp = s1pp;
3783 		*oppp = NULL;
3784 		return;
3785 	}
3786 	for (n = 0, s2pp = *oppp; n < npages; n++) {
3787 		s2pp = s2pp->p_next;
3788 	}
3789 	/* Fix head and tail of new lists */
3790 	e1pp = s2pp->p_prev;
3791 	e2pp = s1pp->p_prev;
3792 	s1pp->p_prev = e1pp;
3793 	e1pp->p_next = s1pp;
3794 	s2pp->p_prev = e2pp;
3795 	e2pp->p_next = s2pp;
3796 
3797 	/* second list empty */
3798 	if (s2pp == s1pp) {
3799 		*oppp = s1pp;
3800 		*nppp = NULL;
3801 	} else {
3802 		*oppp = s1pp;
3803 		*nppp = s2pp;
3804 	}
3805 }
3806 
3807 /*
3808  * Concatenate page list nppp onto the end of list ppp.
3809  */
3810 void
3811 page_list_concat(page_t **ppp, page_t **nppp)
3812 {
3813 	page_t *s1pp, *s2pp, *e1pp, *e2pp;
3814 
3815 	if (*nppp == NULL) {
3816 		return;
3817 	}
3818 	if (*ppp == NULL) {
3819 		*ppp = *nppp;
3820 		return;
3821 	}
3822 	s1pp = *ppp;
3823 	e1pp =  s1pp->p_prev;
3824 	s2pp = *nppp;
3825 	e2pp = s2pp->p_prev;
3826 	s1pp->p_prev = e2pp;
3827 	e2pp->p_next = s1pp;
3828 	e1pp->p_next = s2pp;
3829 	s2pp->p_prev = e1pp;
3830 }
3831 
/*
 * return the next page in the page list
 *
 * This is simply the p_next link of `pp'; no locking or checking is
 * done here.
 */
page_t *
page_list_next(page_t *pp)
{
	return (pp->p_next);
}
3840 
3841 
3842 /*
3843  * Add the page to the front of the linked list of pages
3844  * using p_vpnext/p_vpprev pointers for the list.
3845  *
3846  * The caller is responsible for protecting the lists.
3847  */
3848 void
3849 page_vpadd(page_t **ppp, page_t *pp)
3850 {
3851 	if (*ppp == NULL) {
3852 		pp->p_vpnext = pp->p_vpprev = pp;
3853 	} else {
3854 		pp->p_vpnext = *ppp;
3855 		pp->p_vpprev = (*ppp)->p_vpprev;
3856 		(*ppp)->p_vpprev = pp;
3857 		pp->p_vpprev->p_vpnext = pp;
3858 	}
3859 	*ppp = pp;
3860 }
3861 
3862 /*
3863  * Remove this page from the linked list of pages
3864  * using p_vpnext/p_vpprev pointers for the list.
3865  *
3866  * The caller is responsible for protecting the lists.
3867  */
3868 void
3869 page_vpsub(page_t **ppp, page_t *pp)
3870 {
3871 	if (*ppp == NULL || pp == NULL) {
3872 		panic("page_vpsub: bad arg(s): pp %p, *ppp %p",
3873 		    (void *)pp, (void *)(*ppp));
3874 		/*NOTREACHED*/
3875 	}
3876 
3877 	if (*ppp == pp)
3878 		*ppp = pp->p_vpnext;		/* go to next page */
3879 
3880 	if (*ppp == pp)
3881 		*ppp = NULL;			/* page list is gone */
3882 	else {
3883 		pp->p_vpprev->p_vpnext = pp->p_vpnext;
3884 		pp->p_vpnext->p_vpprev = pp->p_vpprev;
3885 	}
3886 	pp->p_vpprev = pp->p_vpnext = pp;	/* make pp a list of one */
3887 }
3888 
/*
 * Lock a physical page into memory "long term".  Used to support "lock
 * in memory" functions.  Accepts the page to be locked, and a cow variable
 * to indicate whether the lock will travel to the new page during
 * a potential copy-on-write.
 *
 * The page must already be locked by the caller (asserted).
 *
 * Returns 1 if the lock was taken and 0 if it was refused, either
 * because availrmem is not above pages_pp_maximum or because the
 * relevant counter has already reached PAGE_LOCK_MAXIMUM.
 */
int
page_pp_lock(
	page_t *pp,			/* page to be locked */
	int cow,			/* cow lock */
	int kernel)			/* must succeed -- ignore checking */
{
	int r = 0;			/* result -- assume failure */

	ASSERT(PAGE_LOCKED(pp));

	page_struct_lock(pp);
	/*
	 * Acquire the "freemem_lock" for availrmem.
	 */
	if (cow) {
		/*
		 * Every cow lock takes one page of availrmem, so each
		 * one needs headroom above pages_pp_maximum.
		 */
		mutex_enter(&freemem_lock);
		if ((availrmem > pages_pp_maximum) &&
		    (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) {
			availrmem--;
			pages_locked++;
			mutex_exit(&freemem_lock);
			r = 1;
			if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
				cmn_err(CE_WARN,
				    "COW lock limit reached on pfn 0x%lx",
				    page_pptonum(pp));
			}
		} else
			mutex_exit(&freemem_lock);
	} else {
		if (pp->p_lckcnt) {
			/*
			 * The page is already locked: only the count is
			 * bumped, availrmem is left alone.
			 */
			if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
				r = 1;
				if (++pp->p_lckcnt ==
				    (ushort_t)PAGE_LOCK_MAXIMUM) {
					cmn_err(CE_WARN, "Page lock limit "
					    "reached on pfn 0x%lx",
					    page_pptonum(pp));
				}
			}
		} else {
			/* First lock on this page. */
			if (kernel) {
				/* availrmem accounting done by caller */
				++pp->p_lckcnt;
				r = 1;
			} else {
				mutex_enter(&freemem_lock);
				if (availrmem > pages_pp_maximum) {
					availrmem--;
					pages_locked++;
					++pp->p_lckcnt;
					r = 1;
				}
				mutex_exit(&freemem_lock);
			}
		}
	}
	page_struct_unlock(pp);
	return (r);
}
3955 
3956 /*
3957  * Decommit a lock on a physical page frame.  Account for cow locks if
3958  * appropriate.
3959  */
3960 void
3961 page_pp_unlock(
3962 	page_t *pp,			/* page to be unlocked */
3963 	int cow,			/* expect cow lock */
3964 	int kernel)			/* this was a kernel lock */
3965 {
3966 	ASSERT(PAGE_LOCKED(pp));
3967 
3968 	page_struct_lock(pp);
3969 	/*
3970 	 * Acquire the "freemem_lock" for availrmem.
3971 	 * If cowcnt or lcknt is already 0 do nothing; i.e., we
3972 	 * could be called to unlock even if nothing is locked. This could
3973 	 * happen if locked file pages were truncated (removing the lock)
3974 	 * and the file was grown again and new pages faulted in; the new
3975 	 * pages are unlocked but the segment still thinks they're locked.
3976 	 */
3977 	if (cow) {
3978 		if (pp->p_cowcnt) {
3979 			mutex_enter(&freemem_lock);
3980 			pp->p_cowcnt--;
3981 			availrmem++;
3982 			pages_locked--;
3983 			mutex_exit(&freemem_lock);
3984 		}
3985 	} else {
3986 		if (pp->p_lckcnt && --pp->p_lckcnt == 0) {
3987 			if (!kernel) {
3988 				mutex_enter(&freemem_lock);
3989 				availrmem++;
3990 				pages_locked--;
3991 				mutex_exit(&freemem_lock);
3992 			}
3993 		}
3994 	}
3995 	page_struct_unlock(pp);
3996 }
3997 
3998 /*
3999  * This routine reserves availrmem for npages;
4000  * 	flags: KM_NOSLEEP or KM_SLEEP
4001  * 	returns 1 on success or 0 on failure
4002  */
4003 int
4004 page_resv(pgcnt_t npages, uint_t flags)
4005 {
4006 	mutex_enter(&freemem_lock);
4007 	while (availrmem < tune.t_minarmem + npages) {
4008 		if (flags & KM_NOSLEEP) {
4009 			mutex_exit(&freemem_lock);
4010 			return (0);
4011 		}
4012 		mutex_exit(&freemem_lock);
4013 		page_needfree(npages);
4014 		kmem_reap();
4015 		delay(hz >> 2);
4016 		page_needfree(-(spgcnt_t)npages);
4017 		mutex_enter(&freemem_lock);
4018 	}
4019 	availrmem -= npages;
4020 	mutex_exit(&freemem_lock);
4021 	return (1);
4022 }
4023 
/*
 * This routine unreserves availrmem for npages;
 * i.e. it adds `npages' back to availrmem under freemem_lock, undoing
 * an earlier page_resv() of the same size.
 */
void
page_unresv(pgcnt_t npages)
{
	mutex_enter(&freemem_lock);
	availrmem += npages;
	mutex_exit(&freemem_lock);
}
4034 
/*
 * See Statement at the beginning of segvn_lockop() regarding
 * the way we handle cowcnts and lckcnts.
 *
 * Transfer cowcnt on 'opp' to cowcnt on 'npp' if the vpage
 * that breaks COW has PROT_WRITE.
 *
 * Note that, we may also break COW in case we are softlocking
 * on read access during physio;
 * in this softlock case, the vpage may not have PROT_WRITE.
 * So, we need to transfer lckcnt on 'opp' to lckcnt on 'npp'
 * if the vpage doesn't have PROT_WRITE.
 *
 * This routine is never called if we are stealing a page
 * in anon_private.
 *
 * The caller subtracted from availrmem for read only mapping.
 * if lckcnt is 1 increment availrmem.
 *
 * Both pages must be locked by the caller and 'npp' must not carry any
 * lckcnt or cowcnt yet (asserted).  Exactly one count is moved per call;
 * nothing is moved when the relevant count on 'opp' is already 0.
 */
void
page_pp_useclaim(
	page_t *opp,		/* original page frame losing lock */
	page_t *npp,		/* new page frame gaining lock */
	uint_t	write_perm) 	/* set if vpage has PROT_WRITE */
{
	int payback = 0;	/* set when availrmem must be given back */

	ASSERT(PAGE_LOCKED(opp));
	ASSERT(PAGE_LOCKED(npp));

	/* Only the page struct lock of the original page is taken. */
	page_struct_lock(opp);

	ASSERT(npp->p_cowcnt == 0);
	ASSERT(npp->p_lckcnt == 0);

	/* Don't use claim if nothing is locked (see page_pp_unlock above) */
	if ((write_perm && opp->p_cowcnt != 0) ||
	    (!write_perm && opp->p_lckcnt != 0)) {

		if (write_perm) {
			/* writable vpage: move one cow claim across */
			npp->p_cowcnt++;
			ASSERT(opp->p_cowcnt != 0);
			opp->p_cowcnt--;
		} else {

			ASSERT(opp->p_lckcnt != 0);

			/*
			 * We didn't need availrmem decremented if p_lckcnt on
			 * original page is 1. Here, we are unlocking
			 * read-only copy belonging to original page and
			 * are locking a copy belonging to new page.
			 */
			if (opp->p_lckcnt == 1)
				payback = 1;

			npp->p_lckcnt++;
			opp->p_lckcnt--;
		}
	}
	if (payback) {
		mutex_enter(&freemem_lock);
		availrmem++;
		pages_useclaim--;
		mutex_exit(&freemem_lock);
	}
	page_struct_unlock(opp);
}
4103 
/*
 * Simple claim adjust functions -- used to support changes in
 * claims due to changes in access permissions.  Used by segvn_setprot().
 *
 * page_addclaim() turns one lckcnt on the page into one cowcnt.
 * The page must be locked and have a non-zero lckcnt (asserted).
 *
 * Returns 1 on success and 0 if the conversion was refused, either
 * because cowcnt is already at PAGE_LOCK_MAXIMUM or, when more than one
 * lckcnt is present, because availrmem is not above pages_pp_maximum.
 */
int
page_addclaim(page_t *pp)
{
	int r = 0;			/* result */

	ASSERT(PAGE_LOCKED(pp));

	page_struct_lock(pp);
	ASSERT(pp->p_lckcnt != 0);

	if (pp->p_lckcnt == 1) {
		/*
		 * The only lock becomes the cow claim; availrmem is
		 * not touched in this case.
		 */
		if (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
			--pp->p_lckcnt;
			r = 1;
			if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
				cmn_err(CE_WARN,
				    "COW lock limit reached on pfn 0x%lx",
				    page_pptonum(pp));
			}
		}
	} else {
		/*
		 * Other locks remain on the page, so the new cow claim
		 * takes a page of availrmem of its own.
		 */
		mutex_enter(&freemem_lock);
		if ((availrmem > pages_pp_maximum) &&
		    (pp->p_cowcnt < (ushort_t)PAGE_LOCK_MAXIMUM)) {
			--availrmem;
			++pages_claimed;
			mutex_exit(&freemem_lock);
			--pp->p_lckcnt;
			r = 1;
			if (++pp->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
				cmn_err(CE_WARN,
				    "COW lock limit reached on pfn 0x%lx",
				    page_pptonum(pp));
			}
		} else
			mutex_exit(&freemem_lock);
	}
	page_struct_unlock(pp);
	return (r);
}
4148 
/*
 * Counterpart of page_addclaim(): turn one cowcnt on the page back into
 * one lckcnt.  The page must be locked and have a non-zero cowcnt
 * (asserted).
 *
 * Returns 1 on success and 0 if lckcnt is already at PAGE_LOCK_MAXIMUM.
 */
int
page_subclaim(page_t *pp)
{
	int r = 0;

	ASSERT(PAGE_LOCKED(pp));

	page_struct_lock(pp);
	ASSERT(pp->p_cowcnt != 0);

	if (pp->p_lckcnt) {
		/*
		 * The page is locked already, so the page of availrmem
		 * held by the cow claim is handed back.
		 */
		if (pp->p_lckcnt < (ushort_t)PAGE_LOCK_MAXIMUM) {
			r = 1;
			/*
			 * for availrmem
			 */
			mutex_enter(&freemem_lock);
			availrmem++;
			pages_claimed--;
			mutex_exit(&freemem_lock);

			pp->p_cowcnt--;

			if (++pp->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
				cmn_err(CE_WARN,
				    "Page lock limit reached on pfn 0x%lx",
				    page_pptonum(pp));
			}
		}
	} else {
		/*
		 * No lock on the page yet: the cow claim simply becomes
		 * the first lock and availrmem stays as it is.
		 */
		r = 1;
		pp->p_cowcnt--;
		pp->p_lckcnt++;
	}
	page_struct_unlock(pp);
	return (r);
}
4186 
4187 int
4188 page_addclaim_pages(page_t  **ppa)
4189 {
4190 
4191 	pgcnt_t	lckpgs = 0, pg_idx;
4192 
4193 	VM_STAT_ADD(pagecnt.pc_addclaim_pages);
4194 
4195 	mutex_enter(&page_llock);
4196 	for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4197 
4198 		ASSERT(PAGE_LOCKED(ppa[pg_idx]));
4199 		ASSERT(ppa[pg_idx]->p_lckcnt != 0);
4200 		if (ppa[pg_idx]->p_cowcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4201 			mutex_exit(&page_llock);
4202 			return (0);
4203 		}
4204 		if (ppa[pg_idx]->p_lckcnt > 1)
4205 			lckpgs++;
4206 	}
4207 
4208 	if (lckpgs != 0) {
4209 		mutex_enter(&freemem_lock);
4210 		if (availrmem >= pages_pp_maximum + lckpgs) {
4211 			availrmem -= lckpgs;
4212 			pages_claimed += lckpgs;
4213 		} else {
4214 			mutex_exit(&freemem_lock);
4215 			mutex_exit(&page_llock);
4216 			return (0);
4217 		}
4218 		mutex_exit(&freemem_lock);
4219 	}
4220 
4221 	for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4222 		ppa[pg_idx]->p_lckcnt--;
4223 		ppa[pg_idx]->p_cowcnt++;
4224 	}
4225 	mutex_exit(&page_llock);
4226 	return (1);
4227 }
4228 
4229 int
4230 page_subclaim_pages(page_t  **ppa)
4231 {
4232 	pgcnt_t	ulckpgs = 0, pg_idx;
4233 
4234 	VM_STAT_ADD(pagecnt.pc_subclaim_pages);
4235 
4236 	mutex_enter(&page_llock);
4237 	for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4238 
4239 		ASSERT(PAGE_LOCKED(ppa[pg_idx]));
4240 		ASSERT(ppa[pg_idx]->p_cowcnt != 0);
4241 		if (ppa[pg_idx]->p_lckcnt == (ushort_t)PAGE_LOCK_MAXIMUM) {
4242 			mutex_exit(&page_llock);
4243 			return (0);
4244 		}
4245 		if (ppa[pg_idx]->p_lckcnt != 0)
4246 			ulckpgs++;
4247 	}
4248 
4249 	if (ulckpgs != 0) {
4250 		mutex_enter(&freemem_lock);
4251 		availrmem += ulckpgs;
4252 		pages_claimed -= ulckpgs;
4253 		mutex_exit(&freemem_lock);
4254 	}
4255 
4256 	for (pg_idx = 0; ppa[pg_idx] != NULL; pg_idx++) {
4257 		ppa[pg_idx]->p_cowcnt--;
4258 		ppa[pg_idx]->p_lckcnt++;
4259 
4260 	}
4261 	mutex_exit(&page_llock);
4262 	return (1);
4263 }
4264 
4265 page_t *
4266 page_numtopp(pfn_t pfnum, se_t se)
4267 {
4268 	page_t *pp;
4269 
4270 retry:
4271 	pp = page_numtopp_nolock(pfnum);
4272 	if (pp == NULL) {
4273 		return ((page_t *)NULL);
4274 	}
4275 
4276 	/*
4277 	 * Acquire the appropriate lock on the page.
4278 	 */
4279 	while (!page_lock(pp, se, (kmutex_t *)NULL, P_RECLAIM)) {
4280 		if (page_pptonum(pp) != pfnum)
4281 			goto retry;
4282 		continue;
4283 	}
4284 
4285 	if (page_pptonum(pp) != pfnum) {
4286 		page_unlock(pp);
4287 		goto retry;
4288 	}
4289 
4290 	return (pp);
4291 }
4292 
4293 page_t *
4294 page_numtopp_noreclaim(pfn_t pfnum, se_t se)
4295 {
4296 	page_t *pp;
4297 
4298 retry:
4299 	pp = page_numtopp_nolock(pfnum);
4300 	if (pp == NULL) {
4301 		return ((page_t *)NULL);
4302 	}
4303 
4304 	/*
4305 	 * Acquire the appropriate lock on the page.
4306 	 */
4307 	while (!page_lock(pp, se, (kmutex_t *)NULL, P_NO_RECLAIM)) {
4308 		if (page_pptonum(pp) != pfnum)
4309 			goto retry;
4310 		continue;
4311 	}
4312 
4313 	if (page_pptonum(pp) != pfnum) {
4314 		page_unlock(pp);
4315 		goto retry;
4316 	}
4317 
4318 	return (pp);
4319 }
4320 
4321 /*
4322  * This routine is like page_numtopp, but will only return page structs
4323  * for pages which are ok for loading into hardware using the page struct.
4324  */
4325 page_t *
4326 page_numtopp_nowait(pfn_t pfnum, se_t se)
4327 {
4328 	page_t *pp;
4329 
4330 retry:
4331 	pp = page_numtopp_nolock(pfnum);
4332 	if (pp == NULL) {
4333 		return ((page_t *)NULL);
4334 	}
4335 
4336 	/*
4337 	 * Try to acquire the appropriate lock on the page.
4338 	 */
4339 	if (PP_ISFREE(pp))
4340 		pp = NULL;
4341 	else {
4342 		if (!page_trylock(pp, se))
4343 			pp = NULL;
4344 		else {
4345 			if (page_pptonum(pp) != pfnum) {
4346 				page_unlock(pp);
4347 				goto retry;
4348 			}
4349 			if (PP_ISFREE(pp)) {
4350 				page_unlock(pp);
4351 				pp = NULL;
4352 			}
4353 		}
4354 	}
4355 	return (pp);
4356 }
4357 
/*
 * Returns a count of dirty pages that are in the process
 * of being written out.  If 'cleanit' is set, try to push the page.
 *
 * All pages of the system are visited once, starting from page_first().
 * The count is taken without holding any page lock, so it is only a
 * snapshot.  Pages that cannot be locked right away are counted but not
 * pushed.
 */
pgcnt_t
page_busy(int cleanit)
{
	page_t *page0 = page_first();
	page_t *pp = page0;
	pgcnt_t nppbusy = 0;
	u_offset_t off;

	/*
	 * Note that the `continue' statements below jump to the loop
	 * condition, which is where we advance to the next page.
	 */
	do {
		vnode_t *vp = pp->p_vnode;

		/*
		 * A page is a candidate for syncing if it is:
		 *
		 * (a)	On neither the freelist nor the cachelist
		 * (b)	Hashed onto a vnode
		 * (c)	Not a kernel page
		 * (d)	Dirty
		 * (e)	Not part of a swapfile
		 * (f)	a page which belongs to a real vnode; eg has a non-null
		 *	v_vfsp pointer.
		 * (g)	Backed by a filesystem which doesn't have a
		 *	stubbed-out sync operation
		 */
		if (!PP_ISFREE(pp) && vp != NULL && vp != &kvp &&
		    hat_ismod(pp) && !IS_SWAPVP(vp) && vp->v_vfsp != NULL &&
		    vfs_can_sync(vp->v_vfsp)) {
			nppbusy++;
			vfs_syncprogress();

			if (!cleanit)
				continue;
			if (!page_trylock(pp, SE_EXCL))
				continue;

			/*
			 * Re-check under the page lock, and leave pages
			 * with long term locks or no modified bit alone.
			 *
			 * NOTE(review): `vp' was sampled before the page
			 * was locked and pp->p_vnode is not read again
			 * here -- confirm that a stale vnode pointer is
			 * acceptable at this point.
			 */
			if (PP_ISFREE(pp) || vp == NULL || IS_SWAPVP(vp) ||
			    pp->p_lckcnt != 0 || pp->p_cowcnt != 0 ||
			    !(hat_pagesync(pp,
			    HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD) & P_MOD)) {
				page_unlock(pp);
				continue;
			}
			/*
			 * Hold the vnode across the unlock, then ask for
			 * the page to be written out asynchronously and
			 * freed; the result of the request is ignored.
			 */
			off = pp->p_offset;
			VN_HOLD(vp);
			page_unlock(pp);
			(void) VOP_PUTPAGE(vp, off, PAGESIZE,
			    B_ASYNC | B_FREE, kcred);
			VN_RELE(vp);
		}
	} while ((pp = page_next(pp)) != page0);

	return (nppbusy);
}
4415 
4416 void page_invalidate_pages(void);
4417 
4418 /*
4419  * callback handler to vm sub-system
4420  *
4421  * callers make sure no recursive entries to this func.
4422  */
4423 /*ARGSUSED*/
4424 boolean_t
4425 callb_vm_cpr(void *arg, int code)
4426 {
4427 	if (code == CB_CODE_CPR_CHKPT)
4428 		page_invalidate_pages();
4429 	return (B_TRUE);
4430 }
4431 
4432 /*
4433  * Invalidate all pages of the system.
4434  * It shouldn't be called until all user page activities are all stopped.
4435  */
4436 void
4437 page_invalidate_pages()
4438 {
4439 	page_t *pp;
4440 	page_t *page0;
4441 	pgcnt_t nbusypages;
4442 	int retry = 0;
4443 	const int MAXRETRIES = 4;
4444 #if defined(__sparc)
4445 	extern struct vnode prom_ppages;
4446 #endif /* __sparc */
4447 
4448 top:
4449 	/*
4450 	 * Flush dirty pages and destory the clean ones.
4451 	 */
4452 	nbusypages = 0;
4453 
4454 	pp = page0 = page_first();
4455 	do {
4456 		struct vnode	*vp;
4457 		u_offset_t	offset;
4458 		int		mod;
4459 
4460 		/*
4461 		 * skip the page if it has no vnode or the page associated
4462 		 * with the kernel vnode or prom allocated kernel mem.
4463 		 */
4464 #if defined(__sparc)
4465 		if ((vp = pp->p_vnode) == NULL || vp == &kvp ||
4466 		    vp == &prom_ppages)
4467 #else /* x86 doesn't have prom or prom_ppage */
4468 		if ((vp = pp->p_vnode) == NULL || vp == &kvp)
4469 #endif /* __sparc */
4470 			continue;
4471 
4472 		/*
4473 		 * skip the page which is already free invalidated.
4474 		 */
4475 		if (PP_ISFREE(pp) && PP_ISAGED(pp))
4476 			continue;
4477 
4478 		/*
4479 		 * skip pages that are already locked or can't be "exclusively"
4480 		 * locked or are already free.  After we lock the page, check
4481 		 * the free and age bits again to be sure it's not destroied
4482 		 * yet.
4483 		 * To achieve max. parallelization, we use page_trylock instead
4484 		 * of page_lock so that we don't get block on individual pages
4485 		 * while we have thousands of other pages to process.
4486 		 */
4487 		if (!page_trylock(pp, SE_EXCL)) {
4488 			nbusypages++;
4489 			continue;
4490 		} else if (PP_ISFREE(pp)) {
4491 			if (!PP_ISAGED(pp)) {
4492 				page_destroy_free(pp);
4493 			} else {
4494 				page_unlock(pp);
4495 			}
4496 			continue;
4497 		}
4498 		/*
4499 		 * Is this page involved in some I/O? shared?
4500 		 *
4501 		 * The page_struct_lock need not be acquired to
4502 		 * examine these fields since the page has an
4503 		 * "exclusive" lock.
4504 		 */
4505 		if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
4506 			page_unlock(pp);
4507 			continue;
4508 		}
4509 
4510 		if (vp->v_type == VCHR) {
4511 			panic("vp->v_type == VCHR");
4512 			/*NOTREACHED*/
4513 		}
4514 
4515 		if (!page_try_demote_pages(pp)) {
4516 			page_unlock(pp);
4517 			continue;
4518 		}
4519 
4520 		/*
4521 		 * Check the modified bit. Leave the bits alone in hardware
4522 		 * (they will be modified if we do the putpage).
4523 		 */
4524 		mod = (hat_pagesync(pp, HAT_SYNC_DONTZERO | HAT_SYNC_STOPON_MOD)
4525 			& P_MOD);
4526 		if (mod) {
4527 			offset = pp->p_offset;
4528 			/*
4529 			 * Hold the vnode before releasing the page lock
4530 			 * to prevent it from being freed and re-used by
4531 			 * some other thread.
4532 			 */
4533 			VN_HOLD(vp);
4534 			page_unlock(pp);
4535 			/*
4536 			 * No error return is checked here. Callers such as
4537 			 * cpr deals with the dirty pages at the dump time
4538 			 * if this putpage fails.
4539 			 */
4540 			(void) VOP_PUTPAGE(vp, offset, PAGESIZE, B_INVAL,
4541 			    kcred);
4542 			VN_RELE(vp);
4543 		} else {
4544 			page_destroy(pp, 0);
4545 		}
4546 	} while ((pp = page_next(pp)) != page0);
4547 	if (nbusypages && retry++ < MAXRETRIES) {
4548 		delay(1);
4549 		goto top;
4550 	}
4551 }
4552 
4553 /*
4554  * Replace the page "old" with the page "new" on the page hash and vnode lists
4555  *
4556  * the replacemnt must be done in place, ie the equivalent sequence:
4557  *
4558  *	vp = old->p_vnode;
4559  *	off = old->p_offset;
4560  *	page_do_hashout(old)
4561  *	page_do_hashin(new, vp, off)
4562  *
4563  * doesn't work, since
4564  *  1) if old is the only page on the vnode, the v_pages list has a window
4565  *     where it looks empty. This will break file system assumptions.
4566  * and
4567  *  2) pvn_vplist_dirty() can't deal with pages moving on the v_pages list.
4568  */
4569 static void
4570 page_do_relocate_hash(page_t *new, page_t *old)
4571 {
4572 	page_t	**hash_list;
4573 	vnode_t	*vp = old->p_vnode;
4574 	kmutex_t *sep;
4575 
4576 	ASSERT(PAGE_EXCL(old));
4577 	ASSERT(PAGE_EXCL(new));
4578 	ASSERT(vp != NULL);
4579 	ASSERT(MUTEX_HELD(page_vnode_mutex(vp)));
4580 	ASSERT(MUTEX_HELD(PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, old->p_offset))));
4581 
4582 	/*
4583 	 * First find old page on the page hash list
4584 	 */
4585 	hash_list = &page_hash[PAGE_HASH_FUNC(vp, old->p_offset)];
4586 
4587 	for (;;) {
4588 		if (*hash_list == old)
4589 			break;
4590 		if (*hash_list == NULL) {
4591 			panic("page_do_hashout");
4592 			/*NOTREACHED*/
4593 		}
4594 		hash_list = &(*hash_list)->p_hash;
4595 	}
4596 
4597 	/*
4598 	 * update new and replace old with new on the page hash list
4599 	 */
4600 	new->p_vnode = old->p_vnode;
4601 	new->p_offset = old->p_offset;
4602 	new->p_hash = old->p_hash;
4603 	*hash_list = new;
4604 
4605 	if ((new->p_vnode->v_flag & VISSWAP) != 0)
4606 		PP_SETSWAP(new);
4607 
4608 	/*
4609 	 * replace old with new on the vnode's page list
4610 	 */
4611 	if (old->p_vpnext == old) {
4612 		new->p_vpnext = new;
4613 		new->p_vpprev = new;
4614 	} else {
4615 		new->p_vpnext = old->p_vpnext;
4616 		new->p_vpprev = old->p_vpprev;
4617 		new->p_vpnext->p_vpprev = new;
4618 		new->p_vpprev->p_vpnext = new;
4619 	}
4620 	if (vp->v_pages == old)
4621 		vp->v_pages = new;
4622 
4623 	/*
4624 	 * clear out the old page
4625 	 */
4626 	old->p_hash = NULL;
4627 	old->p_vpnext = NULL;
4628 	old->p_vpprev = NULL;
4629 	old->p_vnode = NULL;
4630 	PP_CLRSWAP(old);
4631 	old->p_offset = (u_offset_t)-1;
4632 	page_clr_all_props(old);
4633 
4634 	/*
4635 	 * Wake up processes waiting for this page.  The page's
4636 	 * identity has been changed, and is probably not the
4637 	 * desired page any longer.
4638 	 */
4639 	sep = page_se_mutex(old);
4640 	mutex_enter(sep);
4641 	old->p_selock &= ~SE_EWANTED;
4642 	if (CV_HAS_WAITERS(&old->p_cv))
4643 		cv_broadcast(&old->p_cv);
4644 	mutex_exit(sep);
4645 }
4646 
4647 /*
4648  * This function moves the identity of page "pp_old" to page "pp_new".
4649  * Both pages must be locked on entry.  "pp_new" is free, has no identity,
4650  * and need not be hashed out from anywhere.
4651  */
4652 void
4653 page_relocate_hash(page_t *pp_new, page_t *pp_old)
4654 {
4655 	vnode_t *vp = pp_old->p_vnode;
4656 	u_offset_t off = pp_old->p_offset;
4657 	kmutex_t *phm, *vphm;
4658 
4659 	/*
4660 	 * Rehash two pages
4661 	 */
4662 	ASSERT(PAGE_EXCL(pp_old));
4663 	ASSERT(PAGE_EXCL(pp_new));
4664 	ASSERT(vp != NULL);
4665 	ASSERT(pp_new->p_vnode == NULL);
4666 
4667 	/*
4668 	 * hashout then hashin while holding the mutexes
4669 	 */
4670 	phm = PAGE_HASH_MUTEX(PAGE_HASH_FUNC(vp, off));
4671 	mutex_enter(phm);
4672 	vphm = page_vnode_mutex(vp);
4673 	mutex_enter(vphm);
4674 
4675 	page_do_relocate_hash(pp_new, pp_old);
4676 
4677 	mutex_exit(vphm);
4678 	mutex_exit(phm);
4679 
4680 	/*
4681 	 * The page_struct_lock need not be acquired for lckcnt and
4682 	 * cowcnt since the page has an "exclusive" lock.
4683 	 */
4684 	ASSERT(pp_new->p_lckcnt == 0);
4685 	ASSERT(pp_new->p_cowcnt == 0);
4686 	pp_new->p_lckcnt = pp_old->p_lckcnt;
4687 	pp_new->p_cowcnt = pp_old->p_cowcnt;
4688 	pp_old->p_lckcnt = pp_old->p_cowcnt = 0;
4689 
4690 	/* The following comment preserved from page_flip(). */
4691 	/* XXX - Do we need to protect fsdata? */
4692 	pp_new->p_fsdata = pp_old->p_fsdata;
4693 }
4694 
4695 /*
4696  * Helper routine used to lock all remaining members of a
4697  * large page. The caller is responsible for passing in a locked
4698  * pp. If pp is a large page, then it succeeds in locking all the
4699  * remaining constituent pages or it returns with only the
4700  * original page locked.
4701  *
4702  * Returns 1 on success, 0 on failure.
4703  *
4704  * If success is returned this routine gurantees p_szc for all constituent
4705  * pages of a large page pp belongs to can't change. To achieve this we
4706  * recheck szc of pp after locking all constituent pages and retry if szc
4707  * changed (it could only decrease). Since hat_page_demote() needs an EXCL
4708  * lock on one of constituent pages it can't be running after all constituent
4709  * pages are locked.  hat_page_demote() with a lock on a constituent page
4710  * outside of this large page (i.e. pp belonged to a larger large page) is
4711  * already done with all constituent pages of pp since the root's p_szc is
4712  * changed last. Thefore no need to synchronize with hat_page_demote() that
4713  * locked a constituent page outside of pp's current large page.
4714  */
4715 #ifdef DEBUG
4716 uint32_t gpg_trylock_mtbf = 0;
4717 #endif
4718 
4719 int
4720 group_page_trylock(page_t *pp, se_t se)
4721 {
4722 	page_t  *tpp;
4723 	pgcnt_t	npgs, i, j;
4724 	uint_t pszc = pp->p_szc;
4725 
4726 #ifdef DEBUG
4727 	if (gpg_trylock_mtbf && !(gethrtime() % gpg_trylock_mtbf)) {
4728 		return (0);
4729 	}
4730 #endif
4731 
4732 	if (pp != PP_GROUPLEADER(pp, pszc)) {
4733 		return (0);
4734 	}
4735 
4736 retry:
4737 	ASSERT(PAGE_LOCKED_SE(pp, se));
4738 	ASSERT(!PP_ISFREE(pp));
4739 	if (pszc == 0) {
4740 		return (1);
4741 	}
4742 	npgs = page_get_pagecnt(pszc);
4743 	tpp = pp + 1;
4744 	for (i = 1; i < npgs; i++, tpp++) {
4745 		if (!page_trylock(tpp, se)) {
4746 			tpp = pp + 1;
4747 			for (j = 1; j < i; j++, tpp++) {
4748 				page_unlock(tpp);
4749 			}
4750 			return (0);
4751 		}
4752 	}
4753 	if (pp->p_szc != pszc) {
4754 		ASSERT(pp->p_szc < pszc);
4755 		ASSERT(pp->p_vnode != NULL && pp->p_vnode != &kvp &&
4756 		    !IS_SWAPFSVP(pp->p_vnode));
4757 		tpp = pp + 1;
4758 		for (i = 1; i < npgs; i++, tpp++) {
4759 			page_unlock(tpp);
4760 		}
4761 		pszc = pp->p_szc;
4762 		goto retry;
4763 	}
4764 	return (1);
4765 }
4766 
4767 void
4768 group_page_unlock(page_t *pp)
4769 {
4770 	page_t *tpp;
4771 	pgcnt_t	npgs, i;
4772 
4773 	ASSERT(PAGE_LOCKED(pp));
4774 	ASSERT(!PP_ISFREE(pp));
4775 	ASSERT(pp == PP_PAGEROOT(pp));
4776 	npgs = page_get_pagecnt(pp->p_szc);
4777 	for (i = 1, tpp = pp + 1; i < npgs; i++, tpp++) {
4778 		page_unlock(tpp);
4779 	}
4780 }
4781 
4782 /*
4783  * returns
4784  * 0 		: on success and *nrelocp is number of relocated PAGESIZE pages
4785  * ERANGE	: this is not a base page
4786  * EBUSY	: failure to get locks on the page/pages
4787  * ENOMEM	: failure to obtain replacement pages
4788  * EAGAIN	: OBP has not yet completed its boot-time handoff to the kernel
4789  *
4790  * Return with all constituent members of target and replacement
4791  * SE_EXCL locked. It is the callers responsibility to drop the
4792  * locks.
4793  */
4794 int
4795 do_page_relocate(
4796 	page_t **target,
4797 	page_t **replacement,
4798 	int grouplock,
4799 	spgcnt_t *nrelocp,
4800 	lgrp_t *lgrp)
4801 {
4802 #ifdef DEBUG
4803 	page_t *first_repl;
4804 #endif /* DEBUG */
4805 	page_t *repl;
4806 	page_t *targ;
4807 	page_t *pl = NULL;
4808 	uint_t ppattr;
4809 	pfn_t   pfn, repl_pfn;
4810 	uint_t	szc;
4811 	spgcnt_t npgs, i;
4812 	int repl_contig = 0;
4813 	uint_t flags = 0;
4814 	spgcnt_t dofree = 0;
4815 
4816 	*nrelocp = 0;
4817 
4818 #if defined(__sparc)
4819 	/*
4820 	 * We need to wait till OBP has completed
4821 	 * its boot-time handoff of its resources to the kernel
4822 	 * before we allow page relocation
4823 	 */
4824 	if (page_relocate_ready == 0) {
4825 		return (EAGAIN);
4826 	}
4827 #endif
4828 
4829 	/*
4830 	 * If this is not a base page,
4831 	 * just return with 0x0 pages relocated.
4832 	 */
4833 	targ = *target;
4834 	ASSERT(PAGE_EXCL(targ));
4835 	ASSERT(!PP_ISFREE(targ));
4836 	szc = targ->p_szc;
4837 	ASSERT(szc < mmu_page_sizes);
4838 	VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]);
4839 	pfn = targ->p_pagenum;
4840 	if (pfn != PFN_BASE(pfn, szc)) {
4841 		VM_STAT_ADD(vmm_vmstats.ppr_relocnoroot[szc]);
4842 		return (ERANGE);
4843 	}
4844 
4845 	if ((repl = *replacement) != NULL && repl->p_szc >= szc) {
4846 		repl_pfn = repl->p_pagenum;
4847 		if (repl_pfn != PFN_BASE(repl_pfn, szc)) {
4848 			VM_STAT_ADD(vmm_vmstats.ppr_reloc_replnoroot[szc]);
4849 			return (ERANGE);
4850 		}
4851 		repl_contig = 1;
4852 	}
4853 
4854 	/*
4855 	 * We must lock all members of this large page or we cannot
4856 	 * relocate any part of it.
4857 	 */
4858 	if (grouplock != 0 && !group_page_trylock(targ, SE_EXCL)) {
4859 		VM_STAT_ADD(vmm_vmstats.ppr_relocnolock[targ->p_szc]);
4860 		return (EBUSY);
4861 	}
4862 
4863 	/*
4864 	 * reread szc it could have been decreased before
4865 	 * group_page_trylock() was done.
4866 	 */
4867 	szc = targ->p_szc;
4868 	ASSERT(szc < mmu_page_sizes);
4869 	VM_STAT_ADD(vmm_vmstats.ppr_reloc[szc]);
4870 	ASSERT(pfn == PFN_BASE(pfn, szc));
4871 
4872 	npgs = page_get_pagecnt(targ->p_szc);
4873 
4874 	if (repl == NULL) {
4875 		dofree = npgs;		/* Size of target page in MMU pages */
4876 		if (!page_create_wait(dofree, 0)) {
4877 			if (grouplock != 0) {
4878 				group_page_unlock(targ);
4879 			}
4880 			VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]);
4881 			return (ENOMEM);
4882 		}
4883 
4884 		/*
4885 		 * seg kmem pages require that the target and replacement
4886 		 * page be the same pagesize.
4887 		 */
4888 		flags = (targ->p_vnode == &kvp) ? PGR_SAMESZC : 0;
4889 		repl = page_get_replacement_page(targ, lgrp, flags);
4890 		if (repl == NULL) {
4891 			if (grouplock != 0) {
4892 				group_page_unlock(targ);
4893 			}
4894 			page_create_putback(dofree);
4895 			VM_STAT_ADD(vmm_vmstats.ppr_relocnomem[szc]);
4896 			return (ENOMEM);
4897 		}
4898 	}
4899 #ifdef DEBUG
4900 	else {
4901 		ASSERT(PAGE_LOCKED(repl));
4902 	}
4903 #endif /* DEBUG */
4904 
4905 #if defined(__sparc)
4906 	/*
4907 	 * Let hat_page_relocate() complete the relocation if it's kernel page
4908 	 */
4909 	if (targ->p_vnode == &kvp) {
4910 		*replacement = repl;
4911 		if (hat_page_relocate(target, replacement, nrelocp) != 0) {
4912 			if (grouplock != 0) {
4913 				group_page_unlock(targ);
4914 			}
4915 			if (dofree) {
4916 				*replacement = NULL;
4917 				page_free_replacement_page(repl);
4918 				page_create_putback(dofree);
4919 			}
4920 			VM_STAT_ADD(vmm_vmstats.ppr_krelocfail[szc]);
4921 			return (EAGAIN);
4922 		}
4923 		VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]);
4924 		return (0);
4925 	}
4926 #else
4927 #if defined(lint)
4928 	dofree = dofree;
4929 #endif
4930 #endif
4931 
4932 #ifdef DEBUG
4933 	first_repl = repl;
4934 #endif /* DEBUG */
4935 
4936 	for (i = 0; i < npgs; i++) {
4937 		ASSERT(PAGE_EXCL(targ));
4938 
4939 		(void) hat_pageunload(targ, HAT_FORCE_PGUNLOAD);
4940 
4941 		ASSERT(hat_page_getshare(targ) == 0);
4942 		ASSERT(!PP_ISFREE(targ));
4943 		ASSERT(targ->p_pagenum == (pfn + i));
4944 		ASSERT(repl_contig == 0 ||
4945 		    repl->p_pagenum == (repl_pfn + i));
4946 
4947 		/*
4948 		 * Copy the page contents and attributes then
4949 		 * relocate the page in the page hash.
4950 		 */
4951 		ppcopy(targ, repl);
4952 		ppattr = hat_page_getattr(targ, (P_MOD | P_REF | P_RO));
4953 		page_clr_all_props(repl);
4954 		page_set_props(repl, ppattr);
4955 		page_relocate_hash(repl, targ);
4956 
4957 		ASSERT(hat_page_getshare(targ) == 0);
4958 		ASSERT(hat_page_getshare(repl) == 0);
4959 		/*
4960 		 * Now clear the props on targ, after the
4961 		 * page_relocate_hash(), they no longer
4962 		 * have any meaning.
4963 		 */
4964 		page_clr_all_props(targ);
4965 		ASSERT(targ->p_next == targ);
4966 		ASSERT(targ->p_prev == targ);
4967 		page_list_concat(&pl, &targ);
4968 
4969 		targ++;
4970 		if (repl_contig != 0) {
4971 			repl++;
4972 		} else {
4973 			repl = repl->p_next;
4974 		}
4975 	}
4976 	/* assert that we have come full circle with repl */
4977 	ASSERT(repl_contig == 1 || first_repl == repl);
4978 
4979 	*target = pl;
4980 	if (*replacement == NULL) {
4981 		ASSERT(first_repl == repl);
4982 		*replacement = repl;
4983 	}
4984 	VM_STAT_ADD(vmm_vmstats.ppr_relocok[szc]);
4985 	*nrelocp = npgs;
4986 	return (0);
4987 }
4988 /*
4989  * On success returns 0 and *nrelocp the number of PAGESIZE pages relocated.
4990  */
4991 int
4992 page_relocate(
4993 	page_t **target,
4994 	page_t **replacement,
4995 	int grouplock,
4996 	int freetarget,
4997 	spgcnt_t *nrelocp,
4998 	lgrp_t *lgrp)
4999 {
5000 	spgcnt_t ret;
5001 
5002 	/* do_page_relocate returns 0 on success or errno value */
5003 	ret = do_page_relocate(target, replacement, grouplock, nrelocp, lgrp);
5004 
5005 	if (ret != 0 || freetarget == 0) {
5006 		return (ret);
5007 	}
5008 	if (*nrelocp == 1) {
5009 		ASSERT(*target != NULL);
5010 		page_free(*target, 1);
5011 	} else {
5012 		page_t *tpp = *target;
5013 		uint_t szc = tpp->p_szc;
5014 		pgcnt_t npgs = page_get_pagecnt(szc);
5015 		ASSERT(npgs > 1);
5016 		ASSERT(szc != 0);
5017 		do {
5018 			ASSERT(PAGE_EXCL(tpp));
5019 			ASSERT(!hat_page_is_mapped(tpp));
5020 			ASSERT(tpp->p_szc == szc);
5021 			PP_SETFREE(tpp);
5022 			PP_SETAGED(tpp);
5023 			npgs--;
5024 		} while ((tpp = tpp->p_next) != *target);
5025 		ASSERT(npgs == 0);
5026 		page_list_add_pages(*target, 0);
5027 		npgs = page_get_pagecnt(szc);
5028 		page_create_putback(npgs);
5029 	}
5030 	return (ret);
5031 }
5032 
5033 /*
5034  * it is up to the caller to deal with pcf accounting.
5035  */
5036 void
5037 page_free_replacement_page(page_t *pplist)
5038 {
5039 	page_t *pp;
5040 
5041 	while (pplist != NULL) {
5042 		/*
5043 		 * pp_targ is a linked list.
5044 		 */
5045 		pp = pplist;
5046 		if (pp->p_szc == 0) {
5047 			page_sub(&pplist, pp);
5048 			page_clr_all_props(pp);
5049 			PP_SETFREE(pp);
5050 			PP_SETAGED(pp);
5051 			page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
5052 			page_unlock(pp);
5053 			VM_STAT_ADD(pagecnt.pc_free_replacement_page[0]);
5054 		} else {
5055 			spgcnt_t curnpgs = page_get_pagecnt(pp->p_szc);
5056 			page_t *tpp;
5057 			page_list_break(&pp, &pplist, curnpgs);
5058 			tpp = pp;
5059 			do {
5060 				ASSERT(PAGE_EXCL(tpp));
5061 				ASSERT(!hat_page_is_mapped(tpp));
5062 				page_clr_all_props(pp);
5063 				PP_SETFREE(tpp);
5064 				PP_SETAGED(tpp);
5065 			} while ((tpp = tpp->p_next) != pp);
5066 			page_list_add_pages(pp, 0);
5067 			VM_STAT_ADD(pagecnt.pc_free_replacement_page[1]);
5068 		}
5069 	}
5070 }
5071 
5072 /*
5073  * Relocate target to non-relocatable replacement page.
5074  */
5075 int
5076 page_relocate_cage(page_t **target, page_t **replacement)
5077 {
5078 	page_t *tpp, *rpp;
5079 	spgcnt_t pgcnt, npgs;
5080 	int result;
5081 
5082 	tpp = *target;
5083 
5084 	ASSERT(PAGE_EXCL(tpp));
5085 	ASSERT(tpp->p_szc == 0);
5086 
5087 	pgcnt = btop(page_get_pagesize(tpp->p_szc));
5088 
5089 	do {
5090 		(void) page_create_wait(pgcnt, PG_WAIT | PG_NORELOC);
5091 		rpp = page_get_replacement_page(tpp, NULL, PGR_NORELOC);
5092 		if (rpp == NULL) {
5093 			page_create_putback(pgcnt);
5094 			kcage_cageout_wakeup();
5095 		}
5096 	} while (rpp == NULL);
5097 
5098 	ASSERT(PP_ISNORELOC(rpp));
5099 
5100 	result = page_relocate(&tpp, &rpp, 0, 1, &npgs, NULL);
5101 
5102 	if (result == 0) {
5103 		*replacement = rpp;
5104 		if (pgcnt != npgs)
5105 			panic("page_relocate_cage: partial relocation");
5106 	}
5107 
5108 	return (result);
5109 }
5110 
5111 /*
5112  * Release the page lock on a page, place on cachelist
5113  * tail if no longer mapped. Caller can let us know if
5114  * the page is known to be clean.
5115  */
5116 int
5117 page_release(page_t *pp, int checkmod)
5118 {
5119 	int status;
5120 
5121 	ASSERT(PAGE_LOCKED(pp) && !PP_ISFREE(pp) &&
5122 		(pp->p_vnode != NULL));
5123 
5124 	if (!hat_page_is_mapped(pp) && !IS_SWAPVP(pp->p_vnode) &&
5125 	    ((PAGE_SHARED(pp) && page_tryupgrade(pp)) || PAGE_EXCL(pp)) &&
5126 	    pp->p_lckcnt == 0 && pp->p_cowcnt == 0 &&
5127 	    !hat_page_is_mapped(pp)) {
5128 
5129 		/*
5130 		 * If page is modified, unlock it
5131 		 *
5132 		 * (p_nrm & P_MOD) bit has the latest stuff because:
5133 		 * (1) We found that this page doesn't have any mappings
5134 		 *	_after_ holding SE_EXCL and
5135 		 * (2) We didn't drop SE_EXCL lock after the check in (1)
5136 		 */
5137 		if (checkmod && hat_ismod(pp)) {
5138 			page_unlock(pp);
5139 			status = PGREL_MOD;
5140 		} else {
5141 			/*LINTED: constant in conditional context*/
5142 			VN_DISPOSE(pp, B_FREE, 0, kcred);
5143 			status = PGREL_CLEAN;
5144 		}
5145 	} else {
5146 		page_unlock(pp);
5147 		status = PGREL_NOTREL;
5148 	}
5149 	return (status);
5150 }
5151 
5152 /*
5153  * Given a constituent page, try to demote the large page on the freelist.
5154  *
5155  * Returns nonzero if the page could be demoted successfully. Returns with
5156  * the constituent page still locked.
5157  */
5158 int
5159 page_try_demote_free_pages(page_t *pp)
5160 {
5161 	page_t *rootpp = pp;
5162 	pfn_t	pfn = page_pptonum(pp);
5163 	spgcnt_t npgs;
5164 	uint_t	szc = pp->p_szc;
5165 
5166 	ASSERT(PP_ISFREE(pp));
5167 	ASSERT(PAGE_EXCL(pp));
5168 
5169 	/*
5170 	 * Adjust rootpp and lock it, if `pp' is not the base
5171 	 * constituent page.
5172 	 */
5173 	npgs = page_get_pagecnt(pp->p_szc);
5174 	if (npgs == 1) {
5175 		return (0);
5176 	}
5177 
5178 	if (!IS_P2ALIGNED(pfn, npgs)) {
5179 		pfn = P2ALIGN(pfn, npgs);
5180 		rootpp = page_numtopp_nolock(pfn);
5181 	}
5182 
5183 	if (pp != rootpp && !page_trylock(rootpp, SE_EXCL)) {
5184 		return (0);
5185 	}
5186 
5187 	if (rootpp->p_szc != szc) {
5188 		if (pp != rootpp)
5189 			page_unlock(rootpp);
5190 		return (0);
5191 	}
5192 
5193 	page_demote_free_pages(rootpp);
5194 
5195 	if (pp != rootpp)
5196 		page_unlock(rootpp);
5197 
5198 	ASSERT(PP_ISFREE(pp));
5199 	ASSERT(PAGE_EXCL(pp));
5200 	return (1);
5201 }
5202 
5203 /*
5204  * Given a constituent page, try to demote the large page.
5205  *
5206  * Returns nonzero if the page could be demoted successfully. Returns with
5207  * the constituent page still locked.
5208  */
5209 int
5210 page_try_demote_pages(page_t *pp)
5211 {
5212 	page_t *tpp, *rootpp = pp;
5213 	pfn_t	pfn = page_pptonum(pp);
5214 	spgcnt_t i, npgs;
5215 	uint_t	szc = pp->p_szc;
5216 	vnode_t *vp = pp->p_vnode;
5217 
5218 	ASSERT(PAGE_EXCL(pp));
5219 
5220 	VM_STAT_ADD(pagecnt.pc_try_demote_pages[0]);
5221 
5222 	if (pp->p_szc == 0) {
5223 		VM_STAT_ADD(pagecnt.pc_try_demote_pages[1]);
5224 		return (1);
5225 	}
5226 
5227 	if (vp != NULL && !IS_SWAPFSVP(vp) && vp != &kvp) {
5228 		VM_STAT_ADD(pagecnt.pc_try_demote_pages[2]);
5229 		page_demote_vp_pages(pp);
5230 		ASSERT(pp->p_szc == 0);
5231 		return (1);
5232 	}
5233 
5234 	/*
5235 	 * Adjust rootpp if passed in is not the base
5236 	 * constituent page.
5237 	 */
5238 	npgs = page_get_pagecnt(pp->p_szc);
5239 	ASSERT(npgs > 1);
5240 	if (!IS_P2ALIGNED(pfn, npgs)) {
5241 		pfn = P2ALIGN(pfn, npgs);
5242 		rootpp = page_numtopp_nolock(pfn);
5243 		VM_STAT_ADD(pagecnt.pc_try_demote_pages[3]);
5244 		ASSERT(rootpp->p_vnode != NULL);
5245 		ASSERT(rootpp->p_szc == szc);
5246 	}
5247 
5248 	/*
5249 	 * We can't demote kernel pages since we can't hat_unload()
5250 	 * the mappings.
5251 	 */
5252 	if (rootpp->p_vnode == &kvp)
5253 		return (0);
5254 
5255 	/*
5256 	 * Attempt to lock all constituent pages except the page passed
5257 	 * in since it's already locked.
5258 	 */
5259 	for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) {
5260 		ASSERT(!PP_ISFREE(tpp));
5261 		ASSERT(tpp->p_vnode != NULL);
5262 
5263 		if (tpp != pp && !page_trylock(tpp, SE_EXCL))
5264 			break;
5265 		ASSERT(tpp->p_szc == rootpp->p_szc);
5266 		ASSERT(page_pptonum(tpp) == page_pptonum(rootpp) + i);
5267 	}
5268 
5269 	/*
5270 	 * If we failed to lock them all then unlock what we have
5271 	 * locked so far and bail.
5272 	 */
5273 	if (i < npgs) {
5274 		tpp = rootpp;
5275 		while (i-- > 0) {
5276 			if (tpp != pp)
5277 				page_unlock(tpp);
5278 			tpp++;
5279 		}
5280 		VM_STAT_ADD(pagecnt.pc_try_demote_pages[4]);
5281 		return (0);
5282 	}
5283 
5284 	for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) {
5285 		ASSERT(PAGE_EXCL(tpp));
5286 		(void) hat_pageunload(tpp, HAT_FORCE_PGUNLOAD);
5287 		tpp->p_szc = 0;
5288 	}
5289 
5290 	/*
5291 	 * Unlock all pages except the page passed in.
5292 	 */
5293 	for (tpp = rootpp, i = 0; i < npgs; i++, tpp++) {
5294 		ASSERT(!hat_page_is_mapped(tpp));
5295 		if (tpp != pp)
5296 			page_unlock(tpp);
5297 	}
5298 
5299 	VM_STAT_ADD(pagecnt.pc_try_demote_pages[5]);
5300 	return (1);
5301 }
5302 
5303 /*
5304  * Called by page_free() and page_destroy() to demote the page size code
5305  * (p_szc) to 0 (since we can't just put a single PAGESIZE page with non zero
5306  * p_szc on free list, neither can we just clear p_szc of a single page_t
5307  * within a large page since it will break other code that relies on p_szc
5308  * being the same for all page_t's of a large page). Anonymous pages should
5309  * never end up here because anon_map_getpages() cannot deal with p_szc
5310  * changes after a single constituent page is locked.  While anonymous or
5311  * kernel large pages are demoted or freed the entire large page at a time
5312  * with all constituent pages locked EXCL for the file system pages we
5313  * have to be able to demote a large page (i.e. decrease all constituent pages
5314  * p_szc) with only just an EXCL lock on one of constituent pages. The reason
5315  * we can easily deal with anonymous page demotion the entire large page at a
5316  * time is that those operation originate at address space level and concern
5317  * the entire large page region with actual demotion only done when pages are
5318  * not shared with any other processes (therefore we can always get EXCL lock
5319  * on all anonymous constituent pages after clearing segment page
5320  * cache). However file system pages can be truncated or invalidated at a
5321  * PAGESIZE level from the file system side and end up in page_free() or
5322  * page_destroy() (we also allow only part of the large page to be SOFTLOCKed
5323  * and therfore pageout should be able to demote a large page by EXCL locking
5324  * any constituent page that is not under SOFTLOCK). In those cases we cannot
5325  * rely on being able to lock EXCL all constituent pages.
5326  *
5327  * To prevent szc changes on file system pages one has to lock all constituent
5328  * pages at least SHARED (or call page_szc_lock()). The only subsystem that
5329  * doesn't rely on locking all constituent pages (or using page_szc_lock()) to
5330  * prevent szc changes is hat layer that uses its own page level mlist
5331  * locks. hat assumes that szc doesn't change after mlist lock for a page is
5332  * taken. Therefore we need to change szc under hat level locks if we only
5333  * have an EXCL lock on a single constituent page and hat still references any
5334  * of constituent pages.  (Note we can't "ignore" hat layer by simply
5335  * hat_pageunload() all constituent pages without having EXCL locks on all of
5336  * constituent pages). We use hat_page_demote() call to safely demote szc of
5337  * all constituent pages under hat locks when we only have an EXCL lock on one
5338  * of constituent pages.
5339  *
5340  * This routine calls page_szc_lock() before calling hat_page_demote() to
5341  * allow segvn in one special case not to lock all constituent pages SHARED
5342  * before calling hat_memload_array() that relies on p_szc not changeing even
5343  * before hat level mlist lock is taken.  In that case segvn uses
5344  * page_szc_lock() to prevent hat_page_demote() changeing p_szc values.
5345  *
5346  * Anonymous or kernel page demotion still has to lock all pages exclusively
5347  * and do hat_pageunload() on all constituent pages before demoting the page
5348  * therefore there's no need for anonymous or kernel page demotion to use
5349  * hat_page_demote() mechanism.
5350  *
5351  * hat_page_demote() removes all large mappings that map pp and then decreases
5352  * p_szc starting from the last constituent page of the large page. By working
5353  * from the tail of a large page in pfn decreasing order allows one looking at
5354  * the root page to know that hat_page_demote() is done for root's szc area.
5355  * e.g. if a root page has szc 1 one knows it only has to lock all constituent
5356  * pages within szc 1 area to prevent szc changes because hat_page_demote()
5357  * that started on this page when it had szc > 1 is done for this szc 1 area.
5358  *
5359  * We are guranteed that all constituent pages of pp's large page belong to
5360  * the same vnode with the consecutive offsets increasing in the direction of
5361  * the pfn i.e. the identity of constituent pages can't change until their
5362  * p_szc is decreased. Therefore it's safe for hat_page_demote() to remove
5363  * large mappings to pp even though we don't lock any constituent page except
5364  * pp (i.e. we won't unload e.g. kernel locked page).
5365  */
5366 static void
5367 page_demote_vp_pages(page_t *pp)
5368 {
5369 	kmutex_t *mtx;
5370 
5371 	ASSERT(PAGE_EXCL(pp));
5372 	ASSERT(!PP_ISFREE(pp));
5373 	ASSERT(pp->p_vnode != NULL);
5374 	ASSERT(!IS_SWAPFSVP(pp->p_vnode));
5375 	ASSERT(pp->p_vnode != &kvp);
5376 
5377 	VM_STAT_ADD(pagecnt.pc_demote_pages[0]);
5378 
5379 	mtx = page_szc_lock(pp);
5380 	if (mtx != NULL) {
5381 		hat_page_demote(pp);
5382 		mutex_exit(mtx);
5383 	}
5384 	ASSERT(pp->p_szc == 0);
5385 }
5386 
5387 /*
5388  * Mark any existing pages for migration in the given range
5389  */
5390 void
5391 page_mark_migrate(struct seg *seg, caddr_t addr, size_t len,
5392     struct anon_map *amp, ulong_t anon_index, vnode_t *vp,
5393     u_offset_t vnoff, int rflag)
5394 {
5395 	struct anon	*ap;
5396 	vnode_t		*curvp;
5397 	lgrp_t		*from;
5398 	pgcnt_t		i;
5399 	pgcnt_t		nlocked;
5400 	u_offset_t	off;
5401 	pfn_t		pfn;
5402 	size_t		pgsz;
5403 	size_t		segpgsz;
5404 	pgcnt_t		pages;
5405 	uint_t		pszc;
5406 	page_t		**ppa;
5407 	pgcnt_t		ppa_nentries;
5408 	page_t		*pp;
5409 	caddr_t		va;
5410 	ulong_t		an_idx;
5411 	anon_sync_obj_t	cookie;
5412 
5413 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
5414 
5415 	/*
5416 	 * Don't do anything if don't need to do lgroup optimizations
5417 	 * on this system
5418 	 */
5419 	if (!lgrp_optimizations())
5420 		return;
5421 
5422 	/*
5423 	 * Align address and length to (potentially large) page boundary
5424 	 */
5425 	segpgsz = page_get_pagesize(seg->s_szc);
5426 	addr = (caddr_t)P2ALIGN((uintptr_t)addr, segpgsz);
5427 	if (rflag)
5428 		len = P2ROUNDUP(len, segpgsz);
5429 
5430 	/*
5431 	 * Allocate page array to accomodate largest page size
5432 	 */
5433 	pgsz = page_get_pagesize(page_num_pagesizes() - 1);
5434 	ppa_nentries = btop(pgsz);
5435 	ppa = kmem_zalloc(ppa_nentries * sizeof (page_t *), KM_SLEEP);
5436 
5437 	/*
5438 	 * Do one (large) page at a time
5439 	 */
5440 	va = addr;
5441 	while (va < addr + len) {
5442 		/*
5443 		 * Lookup (root) page for vnode and offset corresponding to
5444 		 * this virtual address
5445 		 * Try anonmap first since there may be copy-on-write
5446 		 * pages, but initialize vnode pointer and offset using
5447 		 * vnode arguments just in case there isn't an amp.
5448 		 */
5449 		curvp = vp;
5450 		off = vnoff + va - seg->s_base;
5451 		if (amp) {
5452 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
5453 			an_idx = anon_index + seg_page(seg, va);
5454 			anon_array_enter(amp, an_idx, &cookie);
5455 			ap = anon_get_ptr(amp->ahp, an_idx);
5456 			if (ap)
5457 				swap_xlate(ap, &curvp, &off);
5458 			anon_array_exit(&cookie);
5459 			ANON_LOCK_EXIT(&amp->a_rwlock);
5460 		}
5461 
5462 		pp = NULL;
5463 		if (curvp)
5464 			pp = page_lookup(curvp, off, SE_SHARED);
5465 
5466 		/*
5467 		 * If there isn't a page at this virtual address,
5468 		 * skip to next page
5469 		 */
5470 		if (pp == NULL) {
5471 			va += PAGESIZE;
5472 			continue;
5473 		}
5474 
5475 		/*
5476 		 * Figure out which lgroup this page is in for kstats
5477 		 */
5478 		pfn = page_pptonum(pp);
5479 		from = lgrp_pfn_to_lgrp(pfn);
5480 
5481 		/*
5482 		 * Get page size, and round up and skip to next page boundary
5483 		 * if unaligned address
5484 		 */
5485 		pszc = pp->p_szc;
5486 		pgsz = page_get_pagesize(pszc);
5487 		pages = btop(pgsz);
5488 		if (!IS_P2ALIGNED(va, pgsz) ||
5489 		    !IS_P2ALIGNED(pfn, pages) ||
5490 		    pgsz > segpgsz) {
5491 			pgsz = MIN(pgsz, segpgsz);
5492 			page_unlock(pp);
5493 			i = btop(P2END((uintptr_t)va, pgsz) -
5494 			    (uintptr_t)va);
5495 			va = (caddr_t)P2END((uintptr_t)va, pgsz);
5496 			lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS, i);
5497 			continue;
5498 		}
5499 
5500 		/*
5501 		 * Upgrade to exclusive lock on page
5502 		 */
5503 		if (!page_tryupgrade(pp)) {
5504 			page_unlock(pp);
5505 			va += pgsz;
5506 			lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS,
5507 			    btop(pgsz));
5508 			continue;
5509 		}
5510 
5511 		/*
5512 		 * Remember pages locked exclusively and how many
5513 		 */
5514 		ppa[0] = pp;
5515 		nlocked = 1;
5516 
5517 		/*
5518 		 * Lock constituent pages if this is large page
5519 		 */
5520 		if (pages > 1) {
5521 			/*
5522 			 * Lock all constituents except root page, since it
5523 			 * should be locked already.
5524 			 */
5525 			for (i = 1; i < pages; i++) {
5526 				pp++;
5527 				if (!page_trylock(pp, SE_EXCL)) {
5528 					break;
5529 				}
5530 				if (PP_ISFREE(pp) ||
5531 				    pp->p_szc != pszc) {
5532 					/*
5533 					 * hat_page_demote() raced in with us.
5534 					 */
5535 					ASSERT(!IS_SWAPFSVP(curvp));
5536 					page_unlock(pp);
5537 					break;
5538 				}
5539 				ppa[nlocked] = pp;
5540 				nlocked++;
5541 			}
5542 		}
5543 
5544 		/*
5545 		 * If all constituent pages couldn't be locked,
5546 		 * unlock pages locked so far and skip to next page.
5547 		 */
5548 		if (nlocked != pages) {
5549 			for (i = 0; i < nlocked; i++)
5550 				page_unlock(ppa[i]);
5551 			va += pgsz;
5552 			lgrp_stat_add(from->lgrp_id, LGRP_PMM_FAIL_PGS,
5553 			    btop(pgsz));
5554 			continue;
5555 		}
5556 
5557 		/*
5558 		 * hat_page_demote() can no longer happen
5559 		 * since last cons page had the right p_szc after
5560 		 * all cons pages were locked. all cons pages
5561 		 * should now have the same p_szc.
5562 		 */
5563 
5564 		/*
5565 		 * All constituent pages locked successfully, so mark
5566 		 * large page for migration and unload the mappings of
5567 		 * constituent pages, so a fault will occur on any part of the
5568 		 * large page
5569 		 */
5570 		PP_SETMIGRATE(ppa[0]);
5571 		for (i = 0; i < nlocked; i++) {
5572 			pp = ppa[i];
5573 			(void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
5574 			ASSERT(hat_page_getshare(pp) == 0);
5575 			page_unlock(pp);
5576 		}
5577 		lgrp_stat_add(from->lgrp_id, LGRP_PMM_PGS, nlocked);
5578 
5579 		va += pgsz;
5580 	}
5581 	kmem_free(ppa, ppa_nentries * sizeof (page_t *));
5582 }
5583 
5584 /*
5585  * Migrate any pages that have been marked for migration in the given range
5586  */
5587 void
5588 page_migrate(
5589 	struct seg	*seg,
5590 	caddr_t		addr,
5591 	page_t		**ppa,
5592 	pgcnt_t		npages)
5593 {
5594 	lgrp_t		*from;
5595 	lgrp_t		*to;
5596 	page_t		*newpp;
5597 	page_t		*pp;
5598 	pfn_t		pfn;
5599 	size_t		pgsz;
5600 	spgcnt_t	page_cnt;
5601 	spgcnt_t	i;
5602 	uint_t		pszc;
5603 
5604 	ASSERT(seg->s_as && AS_LOCK_HELD(seg->s_as, &seg->s_as->a_lock));
5605 
5606 	while (npages > 0) {
5607 		pp = *ppa;
5608 		pszc = pp->p_szc;
5609 		pgsz = page_get_pagesize(pszc);
5610 		page_cnt = btop(pgsz);
5611 
5612 		/*
5613 		 * Check to see whether this page is marked for migration
5614 		 *
5615 		 * Assume that root page of large page is marked for
5616 		 * migration and none of the other constituent pages
5617 		 * are marked.  This really simplifies clearing the
5618 		 * migrate bit by not having to clear it from each
5619 		 * constituent page.
5620 		 *
5621 		 * note we don't want to relocate an entire large page if
5622 		 * someone is only using one subpage.
5623 		 */
5624 		if (npages < page_cnt)
5625 			break;
5626 
5627 		/*
5628 		 * Is it marked for migration?
5629 		 */
5630 		if (!PP_ISMIGRATE(pp))
5631 			goto next;
5632 
5633 		/*
5634 		 * Determine lgroups that page is being migrated between
5635 		 */
5636 		pfn = page_pptonum(pp);
5637 		if (!IS_P2ALIGNED(pfn, page_cnt)) {
5638 			break;
5639 		}
5640 		from = lgrp_pfn_to_lgrp(pfn);
5641 		to = lgrp_mem_choose(seg, addr, pgsz);
5642 
5643 		/*
5644 		 * Check to see whether we are trying to migrate page to lgroup
5645 		 * where it is allocated already
5646 		 */
5647 		if (to == from) {
5648 			PP_CLRMIGRATE(pp);
5649 			goto next;
5650 		}
5651 
5652 		/*
5653 		 * Need to get exclusive lock's to migrate
5654 		 */
5655 		for (i = 0; i < page_cnt; i++) {
5656 			ASSERT(PAGE_LOCKED(ppa[i]));
5657 			if (page_pptonum(ppa[i]) != pfn + i ||
5658 			    ppa[i]->p_szc != pszc) {
5659 				break;
5660 			}
5661 			if (!page_tryupgrade(ppa[i])) {
5662 				lgrp_stat_add(from->lgrp_id,
5663 				    LGRP_PM_FAIL_LOCK_PGS,
5664 				    page_cnt);
5665 				break;
5666 			}
5667 		}
5668 		if (i != page_cnt) {
5669 			while (--i != -1) {
5670 				page_downgrade(ppa[i]);
5671 			}
5672 			goto next;
5673 		}
5674 
5675 		(void) page_create_wait(page_cnt, PG_WAIT);
5676 		newpp = page_get_replacement_page(pp, to, PGR_SAMESZC);
5677 		if (newpp == NULL) {
5678 			page_create_putback(page_cnt);
5679 			for (i = 0; i < page_cnt; i++) {
5680 				page_downgrade(ppa[i]);
5681 			}
5682 			lgrp_stat_add(to->lgrp_id, LGRP_PM_FAIL_ALLOC_PGS,
5683 			    page_cnt);
5684 			goto next;
5685 		}
5686 		ASSERT(newpp->p_szc == pszc);
5687 		/*
5688 		 * Clear migrate bit and relocate page
5689 		 */
5690 		PP_CLRMIGRATE(pp);
5691 		if (page_relocate(&pp, &newpp, 0, 1, &page_cnt, to)) {
5692 			panic("page_migrate: page_relocate failed");
5693 		}
5694 		ASSERT(page_cnt * PAGESIZE == pgsz);
5695 
5696 		/*
5697 		 * Keep stats for number of pages migrated from and to
5698 		 * each lgroup
5699 		 */
5700 		lgrp_stat_add(from->lgrp_id, LGRP_PM_SRC_PGS, page_cnt);
5701 		lgrp_stat_add(to->lgrp_id, LGRP_PM_DEST_PGS, page_cnt);
5702 		/*
5703 		 * update the page_t array we were passed in and
5704 		 * unlink constituent pages of a large page.
5705 		 */
5706 		for (i = 0; i < page_cnt; ++i, ++pp) {
5707 			ASSERT(PAGE_EXCL(newpp));
5708 			ASSERT(newpp->p_szc == pszc);
5709 			ppa[i] = newpp;
5710 			pp = newpp;
5711 			page_sub(&newpp, pp);
5712 			page_downgrade(pp);
5713 		}
5714 		ASSERT(newpp == NULL);
5715 next:
5716 		addr += pgsz;
5717 		ppa += page_cnt;
5718 		npages -= page_cnt;
5719 	}
5720 }
5721 
5722 ulong_t mem_waiters 	= 0;
5723 ulong_t	max_count 	= 20;
5724 #define	MAX_DELAY	0x1ff
5725 
5726 /*
5727  * Check if enough memory is available to proceed.
5728  * Depending on system configuration and how much memory is
5729  * reserved for swap we need to check against two variables.
5730  * e.g. on systems with little physical swap availrmem can be
5731  * more reliable indicator of how much memory is available.
5732  * On systems with large phys swap freemem can be better indicator.
5733  * If freemem drops below threshold level don't return an error
5734  * immediately but wake up pageout to free memory and block.
5735  * This is done number of times. If pageout is not able to free
5736  * memory within certain time return an error.
5737  * The same applies for availrmem but kmem_reap is used to
5738  * free memory.
5739  */
5740 int
5741 page_mem_avail(pgcnt_t npages)
5742 {
5743 	ulong_t count;
5744 
5745 #if defined(__i386)
5746 	if (freemem > desfree + npages &&
5747 	    availrmem > swapfs_reserve + npages &&
5748 	    btop(vmem_size(heap_arena, VMEM_FREE)) > tune.t_minarmem +
5749 	    npages)
5750 		return (1);
5751 #else
5752 	if (freemem > desfree + npages &&
5753 	    availrmem > swapfs_reserve + npages)
5754 		return (1);
5755 #endif
5756 
5757 	count = max_count;
5758 	atomic_add_long(&mem_waiters, 1);
5759 
5760 	while (freemem < desfree + npages && --count) {
5761 		cv_signal(&proc_pageout->p_cv);
5762 		if (delay_sig(hz + (mem_waiters & MAX_DELAY))) {
5763 			atomic_add_long(&mem_waiters, -1);
5764 			return (0);
5765 		}
5766 	}
5767 	if (count == 0) {
5768 		atomic_add_long(&mem_waiters, -1);
5769 		return (0);
5770 	}
5771 
5772 	count = max_count;
5773 	while (availrmem < swapfs_reserve + npages && --count) {
5774 		kmem_reap();
5775 		if (delay_sig(hz + (mem_waiters & MAX_DELAY))) {
5776 			atomic_add_long(&mem_waiters, -1);
5777 			return (0);
5778 		}
5779 	}
5780 	atomic_add_long(&mem_waiters, -1);
5781 	if (count == 0)
5782 		return (0);
5783 
5784 #if defined(__i386)
5785 	if (btop(vmem_size(heap_arena, VMEM_FREE)) <
5786 	    tune.t_minarmem + npages)
5787 		return (0);
5788 #endif
5789 	return (1);
5790 }
5791 
5792 
5793 /*
5794  * Search the memory segments to locate the desired page.  Within a
5795  * segment, pages increase linearly with one page structure per
5796  * physical page frame (size PAGESIZE).  The search begins
5797  * with the segment that was accessed last, to take advantage of locality.
5798  * If the hint misses, we start from the beginning of the sorted memseg list
5799  */
5800 
5801 
5802 /*
5803  * Some data structures for pfn to pp lookup.
5804  */
5805 ulong_t mhash_per_slot;
5806 struct memseg *memseg_hash[N_MEM_SLOTS];
5807 
5808 page_t *
5809 page_numtopp_nolock(pfn_t pfnum)
5810 {
5811 	struct memseg *seg;
5812 	page_t *pp;
5813 	vm_cpu_data_t *vc = CPU->cpu_vm_data;
5814 
5815 	ASSERT(vc != NULL);
5816 
5817 	MEMSEG_STAT_INCR(nsearch);
5818 
5819 	/* Try last winner first */
5820 	if (((seg = vc->vc_pnum_memseg) != NULL) &&
5821 		(pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
5822 		MEMSEG_STAT_INCR(nlastwon);
5823 		pp = seg->pages + (pfnum - seg->pages_base);
5824 		if (pp->p_pagenum == pfnum)
5825 			return ((page_t *)pp);
5826 	}
5827 
5828 	/* Else Try hash */
5829 	if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) &&
5830 		(pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
5831 		MEMSEG_STAT_INCR(nhashwon);
5832 		vc->vc_pnum_memseg = seg;
5833 		pp = seg->pages + (pfnum - seg->pages_base);
5834 		if (pp->p_pagenum == pfnum)
5835 			return ((page_t *)pp);
5836 	}
5837 
5838 	/* Else Brute force */
5839 	for (seg = memsegs; seg != NULL; seg = seg->next) {
5840 		if (pfnum >= seg->pages_base && pfnum < seg->pages_end) {
5841 			vc->vc_pnum_memseg = seg;
5842 			pp = seg->pages + (pfnum - seg->pages_base);
5843 			return ((page_t *)pp);
5844 		}
5845 	}
5846 	vc->vc_pnum_memseg = NULL;
5847 	MEMSEG_STAT_INCR(nnotfound);
5848 	return ((page_t *)NULL);
5849 
5850 }
5851 
5852 struct memseg *
5853 page_numtomemseg_nolock(pfn_t pfnum)
5854 {
5855 	struct memseg *seg;
5856 	page_t *pp;
5857 
5858 	/* Try hash */
5859 	if (((seg = memseg_hash[MEMSEG_PFN_HASH(pfnum)]) != NULL) &&
5860 		(pfnum >= seg->pages_base) && (pfnum < seg->pages_end)) {
5861 		pp = seg->pages + (pfnum - seg->pages_base);
5862 		if (pp->p_pagenum == pfnum)
5863 			return (seg);
5864 	}
5865 
5866 	/* Else Brute force */
5867 	for (seg = memsegs; seg != NULL; seg = seg->next) {
5868 		if (pfnum >= seg->pages_base && pfnum < seg->pages_end) {
5869 			return (seg);
5870 		}
5871 	}
5872 	return ((struct memseg *)NULL);
5873 }
5874 
5875 /*
5876  * Given a page and a count return the page struct that is
5877  * n structs away from the current one in the global page
5878  * list.
5879  *
5880  * This function wraps to the first page upon
5881  * reaching the end of the memseg list.
5882  */
5883 page_t *
5884 page_nextn(page_t *pp, ulong_t n)
5885 {
5886 	struct memseg *seg;
5887 	page_t *ppn;
5888 	vm_cpu_data_t *vc = (vm_cpu_data_t *)CPU->cpu_vm_data;
5889 
5890 	ASSERT(vc != NULL);
5891 
5892 	if (((seg = vc->vc_pnext_memseg) == NULL) ||
5893 	    (seg->pages_base == seg->pages_end) ||
5894 	    !(pp >= seg->pages && pp < seg->epages)) {
5895 
5896 		for (seg = memsegs; seg; seg = seg->next) {
5897 			if (pp >= seg->pages && pp < seg->epages)
5898 				break;
5899 		}
5900 
5901 		if (seg == NULL) {
5902 			/* Memory delete got in, return something valid. */
5903 			/* TODO: fix me. */
5904 			seg = memsegs;
5905 			pp = seg->pages;
5906 		}
5907 	}
5908 
5909 	/* check for wraparound - possible if n is large */
5910 	while ((ppn = (pp + n)) >= seg->epages || ppn < pp) {
5911 		n -= seg->epages - pp;
5912 		seg = seg->next;
5913 		if (seg == NULL)
5914 			seg = memsegs;
5915 		pp = seg->pages;
5916 	}
5917 	vc->vc_pnext_memseg = seg;
5918 	return (ppn);
5919 }
5920 
5921 /*
5922  * Initialize for a loop using page_next_scan_large().
5923  */
5924 page_t *
5925 page_next_scan_init(void **cookie)
5926 {
5927 	ASSERT(cookie != NULL);
5928 	*cookie = (void *)memsegs;
5929 	return ((page_t *)memsegs->pages);
5930 }
5931 
5932 /*
5933  * Return the next page in a scan of page_t's, assuming we want
5934  * to skip over sub-pages within larger page sizes.
5935  *
5936  * The cookie is used to keep track of the current memseg.
5937  */
5938 page_t *
5939 page_next_scan_large(
5940 	page_t		*pp,
5941 	ulong_t		*n,
5942 	void		**cookie)
5943 {
5944 	struct memseg	*seg = (struct memseg *)*cookie;
5945 	page_t		*new_pp;
5946 	ulong_t		cnt;
5947 	pfn_t		pfn;
5948 
5949 
5950 	/*
5951 	 * get the count of page_t's to skip based on the page size
5952 	 */
5953 	ASSERT(pp != NULL);
5954 	if (pp->p_szc == 0) {
5955 		cnt = 1;
5956 	} else {
5957 		pfn = page_pptonum(pp);
5958 		cnt = page_get_pagecnt(pp->p_szc);
5959 		cnt -= pfn & (cnt - 1);
5960 	}
5961 	*n += cnt;
5962 	new_pp = pp + cnt;
5963 
5964 	/*
5965 	 * Catch if we went past the end of the current memory segment. If so,
5966 	 * just move to the next segment with pages.
5967 	 */
5968 	if (new_pp >= seg->epages) {
5969 		do {
5970 			seg = seg->next;
5971 			if (seg == NULL)
5972 				seg = memsegs;
5973 		} while (seg->pages == seg->epages);
5974 		new_pp = seg->pages;
5975 		*cookie = (void *)seg;
5976 	}
5977 
5978 	return (new_pp);
5979 }
5980 
5981 
5982 /*
5983  * Returns next page in list. Note: this function wraps
5984  * to the first page in the list upon reaching the end
5985  * of the list. Callers should be aware of this fact.
5986  */
5987 
5988 /* We should change this be a #define */
5989 
5990 page_t *
5991 page_next(page_t *pp)
5992 {
5993 	return (page_nextn(pp, 1));
5994 }
5995 
5996 page_t *
5997 page_first()
5998 {
5999 	return ((page_t *)memsegs->pages);
6000 }
6001 
6002 
6003 /*
6004  * This routine is called at boot with the initial memory configuration
6005  * and when memory is added or removed.
6006  */
6007 void
6008 build_pfn_hash()
6009 {
6010 	pfn_t cur;
6011 	pgcnt_t index;
6012 	struct memseg *pseg;
6013 	int	i;
6014 
6015 	/*
6016 	 * Clear memseg_hash array.
6017 	 * Since memory add/delete is designed to operate concurrently
6018 	 * with normal operation, the hash rebuild must be able to run
6019 	 * concurrently with page_numtopp_nolock(). To support this
6020 	 * functionality, assignments to memseg_hash array members must
6021 	 * be done atomically.
6022 	 *
6023 	 * NOTE: bzero() does not currently guarantee this for kernel
6024 	 * threads, and cannot be used here.
6025 	 */
6026 	for (i = 0; i < N_MEM_SLOTS; i++)
6027 		memseg_hash[i] = NULL;
6028 
6029 	hat_kpm_mseghash_clear(N_MEM_SLOTS);
6030 
6031 	/*
6032 	 * Physmax is the last valid pfn.
6033 	 */
6034 	mhash_per_slot = (physmax + 1) >> MEM_HASH_SHIFT;
6035 	for (pseg = memsegs; pseg != NULL; pseg = pseg->next) {
6036 		index = MEMSEG_PFN_HASH(pseg->pages_base);
6037 		cur = pseg->pages_base;
6038 		do {
6039 			if (index >= N_MEM_SLOTS)
6040 				index = MEMSEG_PFN_HASH(cur);
6041 
6042 			if (memseg_hash[index] == NULL ||
6043 			    memseg_hash[index]->pages_base > pseg->pages_base) {
6044 				memseg_hash[index] = pseg;
6045 				hat_kpm_mseghash_update(index, pseg);
6046 			}
6047 			cur += mhash_per_slot;
6048 			index++;
6049 		} while (cur < pseg->pages_end);
6050 	}
6051 }
6052 
6053 /*
6054  * Return the pagenum for the pp
6055  */
6056 pfn_t
6057 page_pptonum(page_t *pp)
6058 {
6059 	return (pp->p_pagenum);
6060 }
6061 
6062 /*
6063  * interface to the referenced and modified etc bits
6064  * in the PSM part of the page struct
6065  * when no locking is desired.
6066  */
6067 void
6068 page_set_props(page_t *pp, uint_t flags)
6069 {
6070 	ASSERT((flags & ~(P_MOD | P_REF | P_RO)) == 0);
6071 	pp->p_nrm |= (uchar_t)flags;
6072 }
6073 
6074 void
6075 page_clr_all_props(page_t *pp)
6076 {
6077 	pp->p_nrm = 0;
6078 }
6079 
6080 /*
6081  * Clear p_lckcnt and p_cowcnt, adjusting freemem if required.
6082  */
6083 int
6084 page_clear_lck_cow(page_t *pp, int adjust)
6085 {
6086 	int	f_amount;
6087 
6088 	ASSERT(PAGE_EXCL(pp));
6089 
6090 	/*
6091 	 * The page_struct_lock need not be acquired here since
6092 	 * we require the caller hold the page exclusively locked.
6093 	 */
6094 	f_amount = 0;
6095 	if (pp->p_lckcnt) {
6096 		f_amount = 1;
6097 		pp->p_lckcnt = 0;
6098 	}
6099 	if (pp->p_cowcnt) {
6100 		f_amount += pp->p_cowcnt;
6101 		pp->p_cowcnt = 0;
6102 	}
6103 
6104 	if (adjust && f_amount) {
6105 		mutex_enter(&freemem_lock);
6106 		availrmem += f_amount;
6107 		mutex_exit(&freemem_lock);
6108 	}
6109 
6110 	return (f_amount);
6111 }
6112 
6113 /*
6114  * The following functions is called from free_vp_pages()
6115  * for an inexact estimate of a newly free'd page...
6116  */
6117 ulong_t
6118 page_share_cnt(page_t *pp)
6119 {
6120 	return (hat_page_getshare(pp));
6121 }
6122 
6123 int
6124 page_isshared(page_t *pp)
6125 {
6126 	return (hat_page_getshare(pp) > 1);
6127 }
6128 
6129 int
6130 page_isfree(page_t *pp)
6131 {
6132 	return (PP_ISFREE(pp));
6133 }
6134 
6135 int
6136 page_isref(page_t *pp)
6137 {
6138 	return (hat_page_getattr(pp, P_REF));
6139 }
6140 
6141 int
6142 page_ismod(page_t *pp)
6143 {
6144 	return (hat_page_getattr(pp, P_MOD));
6145 }
6146