xref: /titanic_50/usr/src/uts/common/vm/page_lock.c (revision 770915ebe81263e14c9bdd49d7d24aac978ef725)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 
26 /*
27  * VM - page locking primitives
28  */
29 #include <sys/param.h>
30 #include <sys/t_lock.h>
31 #include <sys/vtrace.h>
32 #include <sys/debug.h>
33 #include <sys/cmn_err.h>
34 #include <sys/bitmap.h>
35 #include <sys/lockstat.h>
36 #include <sys/sysmacros.h>
37 #include <sys/condvar_impl.h>
38 #include <vm/page.h>
39 #include <vm/seg_enum.h>
40 #include <vm/vm_dep.h>
41 #include <vm/seg_kmem.h>
42 
/*
 * This global mutex array is for logical page locking.
 * The following fields in the page structure are protected
 * by this lock:
 *
 *	p_lckcnt
 *	p_cowcnt
 *
 * The array holds 8 * NCPU_P2 pad_mutex_t entries.  The mapping from a
 * page to one of these entries is not defined in this file.
 */
pad_mutex_t page_llocks[8 * NCPU_P2];
52 
/*
 * This is a global lock for the logical page free list.  The
 * logical free list, in this implementation, is maintained as two
 * separate physical lists - the cache list and the free list.
 *
 * The lock is only defined here; none of the routines visible in this
 * file acquire it.
 */
kmutex_t  page_freelock;
59 
60 /*
61  * The hash table, page_hash[], the p_selock fields, and the
62  * list of pages associated with vnodes are protected by arrays of mutexes.
63  *
64  * Unless the hashes are changed radically, the table sizes must be
65  * a power of two.  Also, we typically need more mutexes for the
66  * vnodes since these locks are occasionally held for long periods.
67  * And since there seem to be two special vnodes (kvp and swapvp),
68  * we make room for private mutexes for them.
69  *
70  * The pse_mutex[] array holds the mutexes to protect the p_selock
71  * fields of all page_t structures.
72  *
73  * PAGE_SE_MUTEX(pp) returns the address of the appropriate mutex
74  * when given a pointer to a page_t.
75  *
76  * PIO_TABLE_SIZE must be a power of two.  One could argue that we
77  * should go to the trouble of setting it up at run time and base it
78  * on memory size rather than the number of compile time CPUs.
79  *
80  * XX64	We should be using physmem size to calculate PIO_SHIFT.
81  *
82  *	These might break in 64 bit world.
83  */
84 #define	PIO_SHIFT	7	/* log2(sizeof(page_t)) */
85 #define	PIO_TABLE_SIZE	128	/* number of io mutexes to have */
86 
87 pad_mutex_t	ph_mutex[PH_TABLE_SIZE];
88 kmutex_t	pio_mutex[PIO_TABLE_SIZE];
89 
90 #define	PAGE_IO_MUTEX(pp) \
91 	    &pio_mutex[(((uintptr_t)pp) >> PIO_SHIFT) & (PIO_TABLE_SIZE - 1)]
92 
93 /*
94  * The pse_mutex[] array is allocated in the platform startup code
95  * based on the size of the machine at startup.
96  */
97 extern pad_mutex_t *pse_mutex;		/* Locks protecting pp->p_selock */
98 extern size_t pse_table_size;		/* Number of mutexes in pse_mutex[] */
99 extern int pse_shift;			/* log2(pse_table_size) */
100 #define	PAGE_SE_MUTEX(pp)	&pse_mutex[				\
101 	((((uintptr_t)(pp) >> pse_shift) ^ ((uintptr_t)(pp))) >> 7) &	\
102 	(pse_table_size - 1)].pad_mutex
103 
104 #define	PSZC_MTX_TABLE_SIZE	128
105 #define	PSZC_MTX_TABLE_SHIFT	7
106 
107 static pad_mutex_t	pszc_mutex[PSZC_MTX_TABLE_SIZE];
108 
109 #define	PAGE_SZC_MUTEX(_pp) \
110 	    &pszc_mutex[((((uintptr_t)(_pp) >> PSZC_MTX_TABLE_SHIFT) ^ \
111 		((uintptr_t)(_pp) >> (PSZC_MTX_TABLE_SHIFT << 1)) ^ \
112 		((uintptr_t)(_pp) >> (3 * PSZC_MTX_TABLE_SHIFT))) & \
113 		(PSZC_MTX_TABLE_SIZE - 1))].pad_mutex
114 
115 /*
116  * The vph_mutex[] array  holds the mutexes to protect the vnode chains,
117  * (i.e., the list of pages anchored by v_pages and connected via p_vpprev
118  * and p_vpnext).
119  *
 * The page_vnode_mutex(vp) function returns the address of the appropriate
 * mutex from this array given a pointer to a vnode.  It is complicated
 * by the fact that the kernel's vnode and the swapfs vnode are referenced
 * frequently enough to warrant their own mutexes.
124  *
125  * The VP_HASH_FUNC returns the index into the vph_mutex array given
126  * an address of a vnode.
127  */
128 
#if defined(_LP64)
#define	VPH_TABLE_SIZE  (8 * NCPU_P2)
#else	/* 32 bits */
#define	VPH_TABLE_SIZE	(2 * NCPU_P2)
#endif

/*
 * Add four right-shifted copies of the vnode address (shifts of 6, 8, 10
 * and 12 bits) and mask the sum with (VPH_TABLE_SIZE - 1) to obtain an
 * index in the range [0, VPH_TABLE_SIZE).  The mask is only a valid
 * modulus when VPH_TABLE_SIZE is a power of two.
 */
#define	VP_HASH_FUNC(vp) \
	((((uintptr_t)(vp) >> 6) + \
	    ((uintptr_t)(vp) >> 8) + \
	    ((uintptr_t)(vp) >> 10) + \
	    ((uintptr_t)(vp) >> 12)) \
	    & (VPH_TABLE_SIZE - 1))

/*
 * Two slots after VPH_TABLE_SIZE are reserved in vph_mutex for kernel vnodes.
 * The lock for kvp is VPH_TABLE_SIZE + 0, and the lock for zvp is
 * VPH_TABLE_SIZE + 1.
 */

kmutex_t	vph_mutex[VPH_TABLE_SIZE + 2];
149 
/*
 * Initialize the locks used by the Virtual Memory Management system.
 *
 * Currently there is nothing to do: the routine is an empty hook kept for
 * its callers.  It is declared with a (void) parameter list so that it is
 * a proper prototype rather than an old-style declaration.
 */
void
page_lock_init(void)
{
}
157 
158 /*
159  * Return a value for pse_shift based on npg (the number of physical pages)
160  * and ncpu (the maximum number of CPUs).  This is called by platform startup
161  * code.
162  *
163  * Lockstat data from TPC-H runs showed that contention on the pse_mutex[]
164  * locks grew approximately as the square of the number of threads executing.
165  * So the primary scaling factor used is NCPU^2.  The size of the machine in
166  * megabytes is used as an upper bound, particularly for sun4v machines which
167  * all claim to have 256 CPUs maximum, and the old value of PSE_TABLE_SIZE
168  * (128) is used as a minimum.  Since the size of the table has to be a power
169  * of two, the calculated size is rounded up to the next power of two.
170  */
171 /*ARGSUSED*/
172 int
173 size_pse_array(pgcnt_t npg, int ncpu)
174 {
175 	size_t size;
176 	pgcnt_t pp_per_mb = (1024 * 1024) / PAGESIZE;
177 
178 	size = MAX(128, MIN(npg / pp_per_mb, 2 * ncpu * ncpu));
179 	size += (1 << (highbit(size) - 1)) - 1;
180 	return (highbit(size) - 1);
181 }
182 
183 /*
184  * At present we only use page ownership to aid debugging, so it's
185  * OK if the owner field isn't exact.  In the 32-bit world two thread ids
186  * can map to the same owner because we just 'or' in 0x80000000 and
187  * then clear the second highest bit, so that (for example) 0x2faced00
188  * and 0xafaced00 both map to 0xafaced00.
189  * In the 64-bit world, p_selock may not be large enough to hold a full
190  * thread pointer.  If we ever need precise ownership (e.g. if we implement
191  * priority inheritance for page locks) then p_selock should become a
192  * uintptr_t and SE_WRITER should be -((uintptr_t)curthread >> 2).
193  */
/*
 * SE_WRITER is the value stored in p_selock by an exclusive holder: the
 * current thread pointer converted to selock_t, with the INT_MIN (sign)
 * bit forced on and the SE_EWANTED bit forced off.
 * SE_READER is the amount added to p_selock for each shared holder.
 */
#define	SE_WRITER	(((selock_t)(ulong_t)curthread | INT_MIN) & ~SE_EWANTED)
#define	SE_READER	1

/*
 * A page that is deleted must be marked as such using the
 * page_lock_delete() function. The page must be exclusively locked.
 * The SE_DELETED marker is put in p_selock when this function is called.
 * SE_DELETED must be distinct from any SE_WRITER value.
 */
#define	SE_DELETED	(1 | INT_MIN)
204 
/*
 * Statistics counters, only compiled in when VM_STATS is defined.
 * NOTE(review): the vph_* counters are not updated by any routine visible
 * in this file - confirm whether they are still used elsewhere.
 */
#ifdef VM_STATS
uint_t	vph_kvp_count;
uint_t	vph_swapfsvp_count;
uint_t	vph_other;
#endif /* VM_STATS */

/*
 * Counters bumped through VM_STAT_ADD() by the page lock routines below
 * (page_lock_es(), page_try_reclaim_lock() and page_trylock()).
 */
#ifdef VM_STATS
uint_t	page_lock_count;
uint_t	page_lock_miss;
uint_t	page_lock_miss_lock;
uint_t	page_lock_reclaim;
uint_t	page_lock_bad_reclaim;
uint_t	page_lock_same_page;
uint_t	page_lock_upgrade;
uint_t	page_lock_retired;
uint_t	page_lock_upgrade_failed;
uint_t	page_lock_deleted;

uint_t	page_trylock_locked;
uint_t	page_trylock_failed;
uint_t	page_trylock_missed;

uint_t	page_try_reclaim_upgrade;
#endif /* VM_STATS */
229 
230 /*
231  * Acquire the "shared/exclusive" lock on a page.
232  *
233  * Returns 1 on success and locks the page appropriately.
234  *	   0 on failure and does not lock the page.
235  *
236  * If `lock' is non-NULL, it will be dropped and reacquired in the
237  * failure case.  This routine can block, and if it does
238  * it will always return a failure since the page identity [vp, off]
239  * or state may have changed.
240  */
241 
242 int
243 page_lock(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim)
244 {
245 	return (page_lock_es(pp, se, lock, reclaim, 0));
246 }
247 
248 /*
 * With the addition of reader-writer lock semantics to page_lock_es,
 * callers wanting an exclusive (writer) lock may avoid being starved by
 * shared-lock (reader) requests by setting the es parameter to
 * SE_EXCL_WANTED.  In this case, when an exclusive lock cannot be
 * acquired, p_selock's SE_EWANTED bit is set.  Shared-lock (reader)
 * requests are also denied if the page is slated for retirement.
255  *
256  * The se and es parameters determine if the lock should be granted
257  * based on the following decision table:
258  *
259  * Lock wanted   es flags     p_selock/SE_EWANTED  Action
260  * ----------- -------------- -------------------  ---------
261  * SE_EXCL        any [1][2]   unlocked/any        grant lock, clear SE_EWANTED
262  * SE_EXCL        SE_EWANTED   any lock/any        deny, set SE_EWANTED
263  * SE_EXCL        none         any lock/any        deny
264  * SE_SHARED      n/a [2]        shared/0          grant
265  * SE_SHARED      n/a [2]      unlocked/0          grant
266  * SE_SHARED      n/a            shared/1          deny
267  * SE_SHARED      n/a          unlocked/1          deny
268  * SE_SHARED      n/a              excl/any        deny
269  *
270  * Notes:
271  * [1] The code grants an exclusive lock to the caller and clears the bit
272  *   SE_EWANTED whenever p_selock is unlocked, regardless of the SE_EWANTED
273  *   bit's value.  This was deemed acceptable as we are not concerned about
274  *   exclusive-lock starvation. If this ever becomes an issue, a priority or
275  *   fifo mechanism should also be implemented. Meantime, the thread that
276  *   set SE_EWANTED should be prepared to catch this condition and reset it
277  *
 * [2] Retired pages may not be locked at any time, regardless of the
 *   disposition of se, unless the es parameter has SE_RETIRED flag set.
280  *
281  * Notes on values of "es":
282  *
283  *   es & 1: page_lookup_create will attempt page relocation
284  *   es & SE_EXCL_WANTED: caller wants SE_EWANTED set (eg. delete
285  *       memory thread); this prevents reader-starvation of waiting
286  *       writer thread(s) by giving priority to writers over readers.
287  *   es & SE_RETIRED: caller wants to lock pages even if they are
288  *       retired.  Default is to deny the lock if the page is retired.
289  *
290  * And yes, we know, the semantics of this function are too complicated.
291  * It's on the list to be cleaned up.
292  */
/*
 * pp:      page to lock.
 * se:      SE_SHARED or SE_EXCL.
 * lock:    optional mutex held by the caller; dropped while this routine
 *          blocks and reacquired before it returns.
 * reclaim: P_RECLAIM asks for a free page to be reclaimed once locked.
 * es:      flag word described in the block comment preceding this one.
 *
 * Returns 1 with the page locked, or 0 with the page not locked.
 */
int
page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
{
	int		retval;
	kmutex_t	*pse = PAGE_SE_MUTEX(pp);
	int		upgraded;
	int		reclaim_it;

	ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);

	VM_STAT_ADD(page_lock_count);

	upgraded = 0;
	reclaim_it = 0;

	mutex_enter(pse);

	/* SE_EXCL_WANTED is only meaningful together with SE_EXCL */
	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));

	/* retired pages are refused outright unless SE_RETIRED was passed */
	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
		mutex_exit(pse);
		VM_STAT_ADD(page_lock_retired);
		return (0);
	}

	/*
	 * A shared request with es == 1 on a page that is completely
	 * unlocked (and has no SE_EWANTED bit) is promoted to exclusive.
	 */
	if (se == SE_SHARED && es == 1 && pp->p_selock == 0) {
		se = SE_EXCL;
	}

	if ((reclaim == P_RECLAIM) && (PP_ISFREE(pp))) {

		reclaim_it = 1;
		if (se == SE_SHARED) {
			/*
			 * This is an interesting situation.
			 *
			 * Remember that p_free can only change if
			 * p_selock < 0.
			 * p_free does not depend on our holding `pse'.
			 * And, since we hold `pse', p_selock can not change.
			 * So, if p_free changes on us, the page is already
			 * exclusively held, and we would fail to get p_selock
			 * regardless.
			 *
			 * We want to avoid getting the share
			 * lock on a free page that needs to be reclaimed.
			 * It is possible that some other thread has the share
			 * lock and has left the free page on the cache list.
			 * pvn_vplist_dirty() does this for brief periods.
			 * If the se_share is currently SE_EXCL, we will fail
			 * to acquire p_selock anyway.  Blocking is the
			 * right thing to do.
			 * If we need to reclaim this page, we must get
			 * exclusive access to it, force the upgrade now.
			 * Again, we will fail to acquire p_selock if the
			 * page is not free and block.
			 */
			upgraded = 1;
			se = SE_EXCL;
			VM_STAT_ADD(page_lock_upgrade);
		}
	}

	if (se == SE_EXCL) {
		if (!(es & SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) {
			/*
			 * if the caller wants a writer lock (but did not
			 * specify exclusive access), and there is a pending
			 * writer that wants exclusive access, return failure
			 */
			retval = 0;
		} else if ((pp->p_selock & ~SE_EWANTED) == 0) {
			/* no reader/writer lock held */
			THREAD_KPRI_REQUEST();
			/* this clears our setting of the SE_EWANTED bit */
			pp->p_selock = SE_WRITER;
			retval = 1;
		} else {
			/* page is locked */
			if (es & SE_EXCL_WANTED) {
				/* set the SE_EWANTED bit */
				pp->p_selock |= SE_EWANTED;
			}
			retval = 0;
		}
	} else {
		/*
		 * Shared request: granted only when no writer holds the
		 * page (p_selock >= 0) and no writer has SE_EWANTED set.
		 */
		retval = 0;
		if (pp->p_selock >= 0) {
			if ((pp->p_selock & SE_EWANTED) == 0) {
				pp->p_selock += SE_READER;
				retval = 1;
			}
		}
	}

	if (retval == 0) {
		/*
		 * Deleted pages fail immediately: no wait is done and
		 * `lock' is left held.
		 */
		if ((pp->p_selock & ~SE_EWANTED) == SE_DELETED) {
			VM_STAT_ADD(page_lock_deleted);
			mutex_exit(pse);
			return (retval);
		}

#ifdef VM_STATS
		VM_STAT_ADD(page_lock_miss);
		if (upgraded) {
			VM_STAT_ADD(page_lock_upgrade_failed);
		}
#endif
		if (lock) {
			VM_STAT_ADD(page_lock_miss_lock);
			mutex_exit(lock);
		}

		/*
		 * Now, wait for the page to be unlocked and
		 * release the lock protecting p_cv and p_selock.
		 */
		cv_wait(&pp->p_cv, pse);
		mutex_exit(pse);

		/*
		 * The page identity may have changed while we were
		 * blocked.  If we are willing to depend on "pp"
		 * still pointing to a valid page structure (i.e.,
		 * assuming page structures are not dynamically allocated
		 * or freed), we could try to lock the page if its
		 * identity hasn't changed.
		 *
		 * This needs to be measured, since we come back from
		 * cv_wait holding pse (the expensive part of this
		 * operation) we might as well try the cheap part.
		 * Though we would also have to confirm that dropping
		 * `lock' did not cause any grief to the callers.
		 */
		/* no retry here: the caller gets 0 and must start over */
		if (lock) {
			mutex_enter(lock);
		}
	} else {
		/*
		 * We have the page lock.
		 * If we needed to reclaim the page, and the page
		 * needed reclaiming (ie, it was free), then we
		 * have the page exclusively locked.  We may need
		 * to downgrade the page.
		 */
		ASSERT((upgraded) ?
		    ((PP_ISFREE(pp)) && PAGE_EXCL(pp)) : 1);
		mutex_exit(pse);

		/*
		 * We now hold this page's lock, either shared or
		 * exclusive.  This will prevent its identity from changing.
		 * The page, however, may or may not be free.  If the caller
		 * requested, and it is free, go reclaim it from the
		 * free list.  If the page can't be reclaimed, return failure
		 * so that the caller can start all over again.
		 *
		 * NOTE:page_reclaim() releases the page lock (p_selock)
		 *	if it can't be reclaimed.
		 */
		if (reclaim_it) {
			if (!page_reclaim(pp, lock)) {
				VM_STAT_ADD(page_lock_bad_reclaim);
				retval = 0;
			} else {
				VM_STAT_ADD(page_lock_reclaim);
				/* give back the shared lock originally asked for */
				if (upgraded) {
					page_downgrade(pp);
				}
			}
		}
	}
	return (retval);
}
468 
469 /*
470  * Clear the SE_EWANTED bit from p_selock.  This function allows
471  * callers of page_lock_es and page_try_reclaim_lock to clear
472  * their setting of this bit if they decide they no longer wish
473  * to gain exclusive access to the page.  Currently only
474  * delete_memory_thread uses this when the delete memory
475  * operation is cancelled.
476  */
477 void
478 page_lock_clr_exclwanted(page_t *pp)
479 {
480 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
481 
482 	mutex_enter(pse);
483 	pp->p_selock &= ~SE_EWANTED;
484 	if (CV_HAS_WAITERS(&pp->p_cv))
485 		cv_broadcast(&pp->p_cv);
486 	mutex_exit(pse);
487 }
488 
489 /*
490  * Read the comments inside of page_lock_es() carefully.
491  *
492  * SE_EXCL callers specifying es == SE_EXCL_WANTED will cause the
493  * SE_EWANTED bit of p_selock to be set when the lock cannot be obtained.
494  * This is used by threads subject to reader-starvation (eg. memory delete).
495  *
496  * When a thread using SE_EXCL_WANTED does not obtain the SE_EXCL lock,
497  * it is expected that it will retry at a later time.  Threads that will
498  * not retry the lock *must* call page_lock_clr_exclwanted to clear the
499  * SE_EWANTED bit.  (When a thread using SE_EXCL_WANTED obtains the lock,
500  * the bit is cleared.)
501  */
502 int
503 page_try_reclaim_lock(page_t *pp, se_t se, int es)
504 {
505 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
506 	selock_t old;
507 
508 	mutex_enter(pse);
509 
510 	old = pp->p_selock;
511 
512 	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
513 	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));
514 
515 	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
516 		mutex_exit(pse);
517 		VM_STAT_ADD(page_trylock_failed);
518 		return (0);
519 	}
520 
521 	if (se == SE_SHARED && es == 1 && old == 0) {
522 		se = SE_EXCL;
523 	}
524 
525 	if (se == SE_SHARED) {
526 		if (!PP_ISFREE(pp)) {
527 			if (old >= 0) {
528 				/*
529 				 * Readers are not allowed when excl wanted
530 				 */
531 				if ((old & SE_EWANTED) == 0) {
532 					pp->p_selock = old + SE_READER;
533 					mutex_exit(pse);
534 					return (1);
535 				}
536 			}
537 			mutex_exit(pse);
538 			return (0);
539 		}
540 		/*
541 		 * The page is free, so we really want SE_EXCL (below)
542 		 */
543 		VM_STAT_ADD(page_try_reclaim_upgrade);
544 	}
545 
546 	/*
547 	 * The caller wants a writer lock.  We try for it only if
548 	 * SE_EWANTED is not set, or if the caller specified
549 	 * SE_EXCL_WANTED.
550 	 */
551 	if (!(old & SE_EWANTED) || (es & SE_EXCL_WANTED)) {
552 		if ((old & ~SE_EWANTED) == 0) {
553 			/* no reader/writer lock held */
554 			THREAD_KPRI_REQUEST();
555 			/* this clears out our setting of the SE_EWANTED bit */
556 			pp->p_selock = SE_WRITER;
557 			mutex_exit(pse);
558 			return (1);
559 		}
560 	}
561 	if (es & SE_EXCL_WANTED) {
562 		/* page is locked, set the SE_EWANTED bit */
563 		pp->p_selock |= SE_EWANTED;
564 	}
565 	mutex_exit(pse);
566 	return (0);
567 }
568 
569 /*
570  * Acquire a page's "shared/exclusive" lock, but never block.
571  * Returns 1 on success, 0 on failure.
572  */
573 int
574 page_trylock(page_t *pp, se_t se)
575 {
576 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
577 
578 	mutex_enter(pse);
579 	if (pp->p_selock & SE_EWANTED || PP_RETIRED(pp) ||
580 	    (se == SE_SHARED && PP_PR_NOSHARE(pp))) {
581 		/*
582 		 * Fail if a thread wants exclusive access and page is
583 		 * retired, if the page is slated for retirement, or a
584 		 * share lock is requested.
585 		 */
586 		mutex_exit(pse);
587 		VM_STAT_ADD(page_trylock_failed);
588 		return (0);
589 	}
590 
591 	if (se == SE_EXCL) {
592 		if (pp->p_selock == 0) {
593 			THREAD_KPRI_REQUEST();
594 			pp->p_selock = SE_WRITER;
595 			mutex_exit(pse);
596 			return (1);
597 		}
598 	} else {
599 		if (pp->p_selock >= 0) {
600 			pp->p_selock += SE_READER;
601 			mutex_exit(pse);
602 			return (1);
603 		}
604 	}
605 	mutex_exit(pse);
606 	return (0);
607 }
608 
609 /*
610  * Variant of page_unlock() specifically for the page freelist
611  * code. The mere existence of this code is a vile hack that
612  * has resulted due to the backwards locking order of the page
613  * freelist manager; please don't call it.
614  */
615 void
616 page_unlock_nocapture(page_t *pp)
617 {
618 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
619 	selock_t old;
620 
621 	mutex_enter(pse);
622 
623 	old = pp->p_selock;
624 	if ((old & ~SE_EWANTED) == SE_READER) {
625 		pp->p_selock = old & ~SE_READER;
626 		if (CV_HAS_WAITERS(&pp->p_cv))
627 			cv_broadcast(&pp->p_cv);
628 	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
629 		panic("page_unlock_nocapture: page %p is deleted", (void *)pp);
630 	} else if (old < 0) {
631 		THREAD_KPRI_RELEASE();
632 		pp->p_selock &= SE_EWANTED;
633 		if (CV_HAS_WAITERS(&pp->p_cv))
634 			cv_broadcast(&pp->p_cv);
635 	} else if ((old & ~SE_EWANTED) > SE_READER) {
636 		pp->p_selock = old - SE_READER;
637 	} else {
638 		panic("page_unlock_nocapture: page %p is not locked",
639 		    (void *)pp);
640 	}
641 
642 	mutex_exit(pse);
643 }
644 
645 /*
646  * Release the page's "shared/exclusive" lock and wake up anyone
647  * who might be waiting for it.
648  */
649 void
650 page_unlock(page_t *pp)
651 {
652 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
653 	selock_t old;
654 
655 	mutex_enter(pse);
656 
657 	old = pp->p_selock;
658 	if ((old & ~SE_EWANTED) == SE_READER) {
659 		pp->p_selock = old & ~SE_READER;
660 		if (CV_HAS_WAITERS(&pp->p_cv))
661 			cv_broadcast(&pp->p_cv);
662 	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
663 		panic("page_unlock: page %p is deleted", (void *)pp);
664 	} else if (old < 0) {
665 		THREAD_KPRI_RELEASE();
666 		pp->p_selock &= SE_EWANTED;
667 		if (CV_HAS_WAITERS(&pp->p_cv))
668 			cv_broadcast(&pp->p_cv);
669 	} else if ((old & ~SE_EWANTED) > SE_READER) {
670 		pp->p_selock = old - SE_READER;
671 	} else {
672 		panic("page_unlock: page %p is not locked", (void *)pp);
673 	}
674 
675 	if (pp->p_selock == 0) {
676 		/*
677 		 * If the T_CAPTURING bit is set, that means that we should
678 		 * not try and capture the page again as we could recurse
679 		 * which could lead to a stack overflow panic or spending a
680 		 * relatively long time in the kernel making no progress.
681 		 */
682 		if ((pp->p_toxic & PR_CAPTURE) &&
683 		    !(curthread->t_flag & T_CAPTURING) &&
684 		    !PP_RETIRED(pp)) {
685 			THREAD_KPRI_REQUEST();
686 			pp->p_selock = SE_WRITER;
687 			mutex_exit(pse);
688 			page_unlock_capture(pp);
689 		} else {
690 			mutex_exit(pse);
691 		}
692 	} else {
693 		mutex_exit(pse);
694 	}
695 }
696 
697 /*
698  * Try to upgrade the lock on the page from a "shared" to an
699  * "exclusive" lock.  Since this upgrade operation is done while
700  * holding the mutex protecting this page, no one else can acquire this page's
701  * lock and change the page. Thus, it is safe to drop the "shared"
702  * lock and attempt to acquire the "exclusive" lock.
703  *
704  * Returns 1 on success, 0 on failure.
705  */
706 int
707 page_tryupgrade(page_t *pp)
708 {
709 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
710 
711 	mutex_enter(pse);
712 	if (!(pp->p_selock & SE_EWANTED)) {
713 		/* no threads want exclusive access, try upgrade */
714 		if (pp->p_selock == SE_READER) {
715 			THREAD_KPRI_REQUEST();
716 			/* convert to exclusive lock */
717 			pp->p_selock = SE_WRITER;
718 			mutex_exit(pse);
719 			return (1);
720 		}
721 	}
722 	mutex_exit(pse);
723 	return (0);
724 }
725 
726 /*
727  * Downgrade the "exclusive" lock on the page to a "shared" lock
728  * while holding the mutex protecting this page's p_selock field.
729  */
730 void
731 page_downgrade(page_t *pp)
732 {
733 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
734 	int excl_waiting;
735 
736 	ASSERT((pp->p_selock & ~SE_EWANTED) != SE_DELETED);
737 	ASSERT(PAGE_EXCL(pp));
738 
739 	mutex_enter(pse);
740 	excl_waiting =  pp->p_selock & SE_EWANTED;
741 	THREAD_KPRI_RELEASE();
742 	pp->p_selock = SE_READER | excl_waiting;
743 	if (CV_HAS_WAITERS(&pp->p_cv))
744 		cv_broadcast(&pp->p_cv);
745 	mutex_exit(pse);
746 }
747 
748 void
749 page_lock_delete(page_t *pp)
750 {
751 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
752 
753 	ASSERT(PAGE_EXCL(pp));
754 	ASSERT(pp->p_vnode == NULL);
755 	ASSERT(pp->p_offset == (u_offset_t)-1);
756 	ASSERT(!PP_ISFREE(pp));
757 
758 	mutex_enter(pse);
759 	THREAD_KPRI_RELEASE();
760 	pp->p_selock = SE_DELETED;
761 	if (CV_HAS_WAITERS(&pp->p_cv))
762 		cv_broadcast(&pp->p_cv);
763 	mutex_exit(pse);
764 }
765 
766 int
767 page_deleted(page_t *pp)
768 {
769 	return (pp->p_selock == SE_DELETED);
770 }
771 
772 /*
773  * Implement the io lock for pages
774  */
775 void
776 page_iolock_init(page_t *pp)
777 {
778 	pp->p_iolock_state = 0;
779 	cv_init(&pp->p_io_cv, NULL, CV_DEFAULT, NULL);
780 }
781 
782 /*
783  * Acquire the i/o lock on a page.
784  */
785 void
786 page_io_lock(page_t *pp)
787 {
788 	kmutex_t *pio;
789 
790 	pio = PAGE_IO_MUTEX(pp);
791 	mutex_enter(pio);
792 	while (pp->p_iolock_state & PAGE_IO_INUSE) {
793 		cv_wait(&(pp->p_io_cv), pio);
794 	}
795 	pp->p_iolock_state |= PAGE_IO_INUSE;
796 	mutex_exit(pio);
797 }
798 
799 /*
800  * Release the i/o lock on a page.
801  */
802 void
803 page_io_unlock(page_t *pp)
804 {
805 	kmutex_t *pio;
806 
807 	pio = PAGE_IO_MUTEX(pp);
808 	mutex_enter(pio);
809 	cv_broadcast(&pp->p_io_cv);
810 	pp->p_iolock_state &= ~PAGE_IO_INUSE;
811 	mutex_exit(pio);
812 }
813 
814 /*
815  * Try to acquire the i/o lock on a page without blocking.
816  * Returns 1 on success, 0 on failure.
817  */
818 int
819 page_io_trylock(page_t *pp)
820 {
821 	kmutex_t *pio;
822 
823 	if (pp->p_iolock_state & PAGE_IO_INUSE)
824 		return (0);
825 
826 	pio = PAGE_IO_MUTEX(pp);
827 	mutex_enter(pio);
828 
829 	if (pp->p_iolock_state & PAGE_IO_INUSE) {
830 		mutex_exit(pio);
831 		return (0);
832 	}
833 	pp->p_iolock_state |= PAGE_IO_INUSE;
834 	mutex_exit(pio);
835 
836 	return (1);
837 }
838 
839 /*
840  * Wait until the i/o lock is not held.
841  */
842 void
843 page_io_wait(page_t *pp)
844 {
845 	kmutex_t *pio;
846 
847 	pio = PAGE_IO_MUTEX(pp);
848 	mutex_enter(pio);
849 	while (pp->p_iolock_state & PAGE_IO_INUSE) {
850 		cv_wait(&(pp->p_io_cv), pio);
851 	}
852 	mutex_exit(pio);
853 }
854 
855 /*
856  * Returns 1 on success, 0 on failure.
857  */
858 int
859 page_io_locked(page_t *pp)
860 {
861 	return (pp->p_iolock_state & PAGE_IO_INUSE);
862 }
863 
864 /*
865  * Assert that the i/o lock on a page is held.
866  * Returns 1 on success, 0 on failure.
867  */
868 int
869 page_iolock_assert(page_t *pp)
870 {
871 	return (page_io_locked(pp));
872 }
873 
874 /*
875  * Wrapper exported to kernel routines that are built
876  * platform-independent (the macro is platform-dependent;
877  * the size of vph_mutex[] is based on NCPU).
878  *
879  * Note that you can do stress testing on this by setting the
880  * variable page_vnode_mutex_stress to something other than
881  * zero in a DEBUG kernel in a debugger after loading the kernel.
882  * Setting it after the kernel is running may not work correctly.
883  */
884 #ifdef DEBUG
885 static int page_vnode_mutex_stress = 0;
886 #endif
887 
888 kmutex_t *
889 page_vnode_mutex(vnode_t *vp)
890 {
891 	if (vp == &kvp)
892 		return (&vph_mutex[VPH_TABLE_SIZE + 0]);
893 
894 	if (vp == &zvp)
895 		return (&vph_mutex[VPH_TABLE_SIZE + 1]);
896 #ifdef DEBUG
897 	if (page_vnode_mutex_stress != 0)
898 		return (&vph_mutex[0]);
899 #endif
900 
901 	return (&vph_mutex[VP_HASH_FUNC(vp)]);
902 }
903 
904 kmutex_t *
905 page_se_mutex(page_t *pp)
906 {
907 	return (PAGE_SE_MUTEX(pp));
908 }
909 
910 #ifdef VM_STATS
911 uint_t pszclck_stat[4];
912 #endif
913 /*
914  * Find, take and return a mutex held by hat_page_demote().
915  * Called by page_demote_vp_pages() before hat_page_demote() call and by
916  * routines that want to block hat_page_demote() but can't do it
917  * via locking all constituent pages.
918  *
919  * Return NULL if p_szc is 0.
920  *
921  * It should only be used for pages that can be demoted by hat_page_demote()
922  * i.e. non swapfs file system pages.  The logic here is lifted from
923  * sfmmu_mlspl_enter() except there's no need to worry about p_szc increase
924  * since the page is locked and not free.
925  *
926  * Hash of the root page is used to find the lock.
927  * To find the root in the presense of hat_page_demote() chageing the location
928  * of the root this routine relies on the fact that hat_page_demote() changes
929  * root last.
930  *
931  * If NULL is returned pp's p_szc is guaranteed to be 0. If non NULL is
932  * returned pp's p_szc may be any value.
933  */
934 kmutex_t *
935 page_szc_lock(page_t *pp)
936 {
937 	kmutex_t	*mtx;
938 	page_t		*rootpp;
939 	uint_t		szc;
940 	uint_t		rszc;
941 	uint_t		pszc = pp->p_szc;
942 
943 	ASSERT(pp != NULL);
944 	ASSERT(PAGE_LOCKED(pp));
945 	ASSERT(!PP_ISFREE(pp));
946 	ASSERT(pp->p_vnode != NULL);
947 	ASSERT(!IS_SWAPFSVP(pp->p_vnode));
948 	ASSERT(!PP_ISKAS(pp));
949 
950 again:
951 	if (pszc == 0) {
952 		VM_STAT_ADD(pszclck_stat[0]);
953 		return (NULL);
954 	}
955 
956 	/* The lock lives in the root page */
957 
958 	rootpp = PP_GROUPLEADER(pp, pszc);
959 	mtx = PAGE_SZC_MUTEX(rootpp);
960 	mutex_enter(mtx);
961 
962 	/*
963 	 * since p_szc can only decrease if pp == rootpp
964 	 * rootpp will be always the same i.e we have the right root
965 	 * regardless of rootpp->p_szc.
966 	 * If location of pp's root didn't change after we took
967 	 * the lock we have the right root. return mutex hashed off it.
968 	 */
969 	if (pp == rootpp || (rszc = rootpp->p_szc) == pszc) {
970 		VM_STAT_ADD(pszclck_stat[1]);
971 		return (mtx);
972 	}
973 
974 	/*
975 	 * root location changed because page got demoted.
976 	 * locate the new root.
977 	 */
978 	if (rszc < pszc) {
979 		szc = pp->p_szc;
980 		ASSERT(szc < pszc);
981 		mutex_exit(mtx);
982 		pszc = szc;
983 		VM_STAT_ADD(pszclck_stat[2]);
984 		goto again;
985 	}
986 
987 	VM_STAT_ADD(pszclck_stat[3]);
988 	/*
989 	 * current hat_page_demote not done yet.
990 	 * wait for it to finish.
991 	 */
992 	mutex_exit(mtx);
993 	rootpp = PP_GROUPLEADER(rootpp, rszc);
994 	mtx = PAGE_SZC_MUTEX(rootpp);
995 	mutex_enter(mtx);
996 	mutex_exit(mtx);
997 	ASSERT(rootpp->p_szc < rszc);
998 	goto again;
999 }
1000 
1001 int
1002 page_szc_lock_assert(page_t *pp)
1003 {
1004 	page_t *rootpp = PP_PAGEROOT(pp);
1005 	kmutex_t *mtx = PAGE_SZC_MUTEX(rootpp);
1006 
1007 	return (MUTEX_HELD(mtx));
1008 }
1009 
/*
 * memseg locking
 *
 * Reader/writer lock, private to this file, that is manipulated through
 * memsegs_trylock(), memsegs_lock(), memsegs_unlock() and queried with
 * memsegs_lock_held().
 */
static krwlock_t memsegslock;

/*
 * memlist (phys_install, phys_avail) locking.
 *
 * Reader/writer lock, private to this file, that is manipulated through
 * the memlist_read_lock()/memlist_read_unlock() and
 * memlist_write_lock()/memlist_write_unlock() routines.
 */
static krwlock_t memlists_lock;
1019 
1020 int
1021 memsegs_trylock(int writer)
1022 {
1023 	return (rw_tryenter(&memsegslock, writer ? RW_WRITER : RW_READER));
1024 }
1025 
1026 void
1027 memsegs_lock(int writer)
1028 {
1029 	rw_enter(&memsegslock, writer ? RW_WRITER : RW_READER);
1030 }
1031 
1032 /*ARGSUSED*/
1033 void
1034 memsegs_unlock(int writer)
1035 {
1036 	rw_exit(&memsegslock);
1037 }
1038 
1039 int
1040 memsegs_lock_held(void)
1041 {
1042 	return (RW_LOCK_HELD(&memsegslock));
1043 }
1044 
1045 void
1046 memlist_read_lock(void)
1047 {
1048 	rw_enter(&memlists_lock, RW_READER);
1049 }
1050 
1051 void
1052 memlist_read_unlock(void)
1053 {
1054 	rw_exit(&memlists_lock);
1055 }
1056 
1057 void
1058 memlist_write_lock(void)
1059 {
1060 	rw_enter(&memlists_lock, RW_WRITER);
1061 }
1062 
1063 void
1064 memlist_write_unlock(void)
1065 {
1066 	rw_exit(&memlists_lock);
1067 }
1068