xref: /titanic_50/usr/src/uts/common/vm/page_lock.c (revision e65e5c2d2f32a99e8c5f740cabae9075dab03ce7)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 
27 /*
28  * VM - page locking primitives
29  */
30 #include <sys/param.h>
31 #include <sys/t_lock.h>
32 #include <sys/vtrace.h>
33 #include <sys/debug.h>
34 #include <sys/cmn_err.h>
35 #include <sys/bitmap.h>
36 #include <sys/lockstat.h>
37 #include <sys/sysmacros.h>
38 #include <sys/condvar_impl.h>
39 #include <vm/page.h>
40 #include <vm/seg_enum.h>
41 #include <vm/vm_dep.h>
42 #include <vm/seg_kmem.h>
43 
/*
 * This global mutex is for logical page locking.
 * The following fields in the page structure are protected
 * by this lock:
 *
 *	p_lckcnt
 *	p_cowcnt
 *
 * The mutex is only defined here; none of the routines visible in this
 * file acquire it.
 */
kmutex_t page_llock;
53 
/*
 * This is a global lock for the logical page free list.  The
 * logical free list, in this implementation, is maintained as two
 * separate physical lists - the cache list and the free list.
 *
 * The mutex is only defined here; none of the routines visible in this
 * file acquire it.
 */
kmutex_t  page_freelock;
60 
61 /*
62  * The hash table, page_hash[], the p_selock fields, and the
63  * list of pages associated with vnodes are protected by arrays of mutexes.
64  *
65  * Unless the hashes are changed radically, the table sizes must be
66  * a power of two.  Also, we typically need more mutexes for the
67  * vnodes since these locks are occasionally held for long periods.
68  * And since there seem to be two special vnodes (kvp and swapvp),
69  * we make room for private mutexes for them.
70  *
71  * The pse_mutex[] array holds the mutexes to protect the p_selock
72  * fields of all page_t structures.
73  *
74  * PAGE_SE_MUTEX(pp) returns the address of the appropriate mutex
75  * when given a pointer to a page_t.
76  *
77  * PIO_TABLE_SIZE must be a power of two.  One could argue that we
78  * should go to the trouble of setting it up at run time and base it
79  * on memory size rather than the number of compile time CPUs.
80  *
81  * XX64	We should be using physmem size to calculate PIO_SHIFT.
82  *
83  *	These might break in 64 bit world.
84  */
/*
 * PIO_SHIFT is the number of low-order page_t address bits that
 * PAGE_IO_MUTEX() discards before indexing pio_mutex[].  PIO_TABLE_SIZE
 * is the number of entries in pio_mutex[]; the index is formed by masking
 * with (PIO_TABLE_SIZE - 1), which is why it must be a power of two.
 */
#define	PIO_SHIFT	7	/* log2(sizeof(page_t)) */
#define	PIO_TABLE_SIZE	128	/* number of io mutexes to have */

/*
 * ph_mutex[] is defined here but not referenced elsewhere in this file
 * (presumably it is the page_hash[] lock array -- confirm against its
 * users).  pio_mutex[] backs the per-page i/o lock routines below.
 */
pad_mutex_t	ph_mutex[PH_TABLE_SIZE];
kmutex_t	pio_mutex[PIO_TABLE_SIZE];
90 
/*
 * PAGE_IO_MUTEX(pp) returns the address of the pio_mutex[] entry that
 * guards the i/o lock state (p_iolock_state) of the page `pp'.  The entry
 * is picked by dropping the low PIO_SHIFT bits of the page_t address and
 * masking the result to the table size.
 *
 * Both the argument and the whole expansion are parenthesized so that the
 * macro behaves like a function call for any pointer expression, e.g.
 * PAGE_IO_MUTEX(pp + 1) hashes the address of the next page_t rather than
 * the address of `pp' plus one byte.
 */
#define	PAGE_IO_MUTEX(pp)						\
	(&pio_mutex[((uintptr_t)(pp) >> PIO_SHIFT) & (PIO_TABLE_SIZE - 1)])
93 
94 /*
95  * The pse_mutex[] array is allocated in the platform startup code
96  * based on the size of the machine at startup.
97  */
98 extern pad_mutex_t *pse_mutex;		/* Locks protecting pp->p_selock */
99 extern size_t pse_table_size;		/* Number of mutexes in pse_mutex[] */
100 extern int pse_shift;			/* log2(pse_table_size) */
101 #define	PAGE_SE_MUTEX(pp)	&pse_mutex[				\
102 	((((uintptr_t)(pp) >> pse_shift) ^ ((uintptr_t)(pp))) >> 7) &	\
103 	(pse_table_size - 1)].pad_mutex
104 
105 #define	PSZC_MTX_TABLE_SIZE	128
106 #define	PSZC_MTX_TABLE_SHIFT	7
107 
108 static pad_mutex_t	pszc_mutex[PSZC_MTX_TABLE_SIZE];
109 
110 #define	PAGE_SZC_MUTEX(_pp) \
111 	    &pszc_mutex[((((uintptr_t)(_pp) >> PSZC_MTX_TABLE_SHIFT) ^ \
112 		((uintptr_t)(_pp) >> (PSZC_MTX_TABLE_SHIFT << 1)) ^ \
113 		((uintptr_t)(_pp) >> (3 * PSZC_MTX_TABLE_SHIFT))) & \
114 		(PSZC_MTX_TABLE_SIZE - 1))].pad_mutex
115 
116 /*
117  * The vph_mutex[] array  holds the mutexes to protect the vnode chains,
118  * (i.e., the list of pages anchored by v_pages and connected via p_vpprev
119  * and p_vpnext).
120  *
121  * The page_vnode_mutex(vp) function returns the address of the appropriate
122  * mutex from this array given a pointer to a vnode.  It is complicated
123  * by the fact that the kernel vnodes kvp and zvp are referenced
124  * frequently enough to warrant their own mutexes.
125  *
126  * The VP_HASH_FUNC returns the index into the vph_mutex array given
127  * an address of a vnode.
128  */
129 
/*
 * XX64	VPH_TABLE_SIZE and VP_HASH_FUNC might break in 64 bit world.
 *	Need to review again.
 *
 * VPH_TABLE_SIZE is the number of hashed entries in vph_mutex[]: eight
 * times (1 << VP_SHIFT) on LP64 kernels and twice that value otherwise.
 * It is always a power of two, which VP_HASH_FUNC relies on for its mask.
 */
#ifdef _LP64
#define	VPH_TABLE_SIZE	(1 << (VP_SHIFT + 3))
#else	/* 32 bits */
#define	VPH_TABLE_SIZE	(1 << (VP_SHIFT + 1))
#endif

/*
 * Hash a vnode address to an index in [0, VPH_TABLE_SIZE): sum the
 * address shifted right by 6, 8, 10 and 12 bits and mask to the table.
 */
#define	VP_HASH_FUNC(vp)						\
	((((uintptr_t)(vp) >> 6) + ((uintptr_t)(vp) >> 8) +		\
	((uintptr_t)(vp) >> 10) + ((uintptr_t)(vp) >> 12)) &		\
	(VPH_TABLE_SIZE - 1))
146 
/*
 * Two slots after VPH_TABLE_SIZE are reserved in vph_mutex for kernel vnodes.
 * The lock for kvp is VPH_TABLE_SIZE + 0, and the lock for zvp is
 * VPH_TABLE_SIZE + 1.
 *
 * Slots [0, VPH_TABLE_SIZE) are selected by VP_HASH_FUNC(); callers obtain
 * the right slot for a vnode through page_vnode_mutex().
 */

kmutex_t	vph_mutex[VPH_TABLE_SIZE + 2];
154 
/*
 * Initialize the locks used by the Virtual Memory Management system.
 *
 * There is currently nothing to do here; the function is kept so that
 * existing callers continue to link.  It takes no arguments, which is now
 * stated explicitly with a (void) parameter list so that the definition
 * is a proper prototype.
 */
void
page_lock_init(void)
{
}
162 
163 /*
164  * Return a value for pse_shift based on npg (the number of physical pages)
165  * and ncpu (the maximum number of CPUs).  This is called by platform startup
166  * code.
167  *
168  * Lockstat data from TPC-H runs showed that contention on the pse_mutex[]
169  * locks grew approximately as the square of the number of threads executing.
170  * So the primary scaling factor used is NCPU^2.  The size of the machine in
171  * megabytes is used as an upper bound, particularly for sun4v machines which
172  * all claim to have 256 CPUs maximum, and the old value of PSE_TABLE_SIZE
173  * (128) is used as a minimum.  Since the size of the table has to be a power
174  * of two, the calculated size is rounded up to the next power of two.
175  */
176 /*ARGSUSED*/
177 int
178 size_pse_array(pgcnt_t npg, int ncpu)
179 {
180 	size_t size;
181 	pgcnt_t pp_per_mb = (1024 * 1024) / PAGESIZE;
182 
183 	size = MAX(128, MIN(npg / pp_per_mb, 2 * ncpu * ncpu));
184 	size += (1 << (highbit(size) - 1)) - 1;
185 	return (highbit(size) - 1);
186 }
187 
/*
 * At present we only use page ownership to aid debugging, so it's
 * OK if the owner field isn't exact.  In the 32-bit world two thread ids
 * can map to the same owner because we just 'or' in 0x80000000 and
 * then clear the second highest bit, so that (for example) 0x2faced00
 * and 0xafaced00 both map to 0xafaced00.
 * In the 64-bit world, p_selock may not be large enough to hold a full
 * thread pointer.  If we ever need precise ownership (e.g. if we implement
 * priority inheritance for page locks) then p_selock should become a
 * uintptr_t and SE_WRITER should be -((uintptr_t)curthread >> 2).
 *
 * SE_WRITER is the p_selock value stored for an exclusive holder: it is
 * derived from curthread, always has INT_MIN or'ed in (so the routines
 * below can detect a writer with a "p_selock < 0" test) and never has
 * SE_EWANTED set.  SE_READER is the amount added to p_selock for each
 * shared holder.
 */
#define	SE_WRITER	(((selock_t)(ulong_t)curthread | INT_MIN) & ~SE_EWANTED)
#define	SE_READER	1
201 
/*
 * A page that is deleted must be marked as such using the
 * page_lock_delete() function. The page must be exclusively locked.
 * The SE_DELETED marker is put in p_selock when this function is called.
 * SE_DELETED must be distinct from any SE_WRITER value.
 *
 * The lock and unlock routines compare (p_selock & ~SE_EWANTED) against
 * this value to recognize a deleted page.
 */
#define	SE_DELETED	(1 | INT_MIN)
209 
#ifdef VM_STATS
/*
 * Statistics counters, only present in VM_STATS kernels.
 */

/* vnode mutex counters; not updated by any code in this file */
uint_t	vph_kvp_count;
uint_t	vph_swapfsvp_count;
uint_t	vph_other;

/* page_lock_es() outcomes */
uint_t	page_lock_count;
uint_t	page_lock_miss;
uint_t	page_lock_miss_lock;
uint_t	page_lock_reclaim;
uint_t	page_lock_bad_reclaim;
uint_t	page_lock_same_page;
uint_t	page_lock_upgrade;
uint_t	page_lock_retired;
uint_t	page_lock_upgrade_failed;
uint_t	page_lock_deleted;

/* page_trylock() and page_try_reclaim_lock() outcomes */
uint_t	page_trylock_locked;
uint_t	page_trylock_failed;
uint_t	page_trylock_missed;

uint_t	page_try_reclaim_upgrade;
#endif /* VM_STATS */
234 
235 /*
236  * Acquire the "shared/exclusive" lock on a page.
237  *
238  * Returns 1 on success and locks the page appropriately.
239  *	   0 on failure and does not lock the page.
240  *
241  * If `lock' is non-NULL, it will be dropped and reacquired in the
242  * failure case.  This routine can block, and if it does
243  * it will always return a failure since the page identity [vp, off]
244  * or state may have changed.
245  */
246 
247 int
248 page_lock(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim)
249 {
250 	return (page_lock_es(pp, se, lock, reclaim, 0));
251 }
252 
253 /*
254  * With the addition of reader-writer lock semantics to page_lock_es,
255  * callers wanting an exclusive (writer) lock may prevent shared-lock
256  * (reader) starvation by setting the es parameter to SE_EXCL_WANTED.
257  * In this case, when an exclusive lock cannot be acquired, p_selock's
258  * SE_EWANTED bit is set. Shared-lock (reader) requests are also denied
259  * if the page is slated for retirement.
260  *
261  * The se and es parameters determine if the lock should be granted
262  * based on the following decision table:
263  *
264  * Lock wanted   es flags     p_selock/SE_EWANTED  Action
265  * ----------- -------------- -------------------  ---------
266  * SE_EXCL        any [1][2]   unlocked/any        grant lock, clear SE_EWANTED
267  * SE_EXCL        SE_EWANTED   any lock/any        deny, set SE_EWANTED
268  * SE_EXCL        none         any lock/any        deny
269  * SE_SHARED      n/a [2]        shared/0          grant
270  * SE_SHARED      n/a [2]      unlocked/0          grant
271  * SE_SHARED      n/a            shared/1          deny
272  * SE_SHARED      n/a          unlocked/1          deny
273  * SE_SHARED      n/a              excl/any        deny
274  *
275  * Notes:
276  * [1] The code grants an exclusive lock to the caller and clears the bit
277  *   SE_EWANTED whenever p_selock is unlocked, regardless of the SE_EWANTED
278  *   bit's value.  This was deemed acceptable as we are not concerned about
279  *   exclusive-lock starvation. If this ever becomes an issue, a priority or
280  *   fifo mechanism should also be implemented. Meantime, the thread that
281  *   set SE_EWANTED should be prepared to catch this condition and reset it
282  *
283  * [2] Retired pages may not be locked at any time, regardless of the
284  *   disposition of se, unless the es parameter has SE_RETIRED flag set.
285  *
286  * Notes on values of "es":
287  *
288  *   es & 1: page_lookup_create will attempt page relocation
289  *   es & SE_EXCL_WANTED: caller wants SE_EWANTED set (eg. delete
290  *       memory thread); this prevents reader-starvation of waiting
291  *       writer thread(s) by giving priority to writers over readers.
292  *   es & SE_RETIRED: caller wants to lock pages even if they are
293  *       retired.  Default is to deny the lock if the page is retired.
294  *
295  * And yes, we know, the semantics of this function are too complicated.
296  * It's on the list to be cleaned up.
297  */
298 int
299 page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
300 {
301 	int		retval;
302 	kmutex_t	*pse = PAGE_SE_MUTEX(pp);
303 	int		upgraded;
304 	int		reclaim_it;
305 
306 	ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);
307 
308 	VM_STAT_ADD(page_lock_count);
309 
310 	upgraded = 0;
311 	reclaim_it = 0;
312 
313 	mutex_enter(pse);
314 
315 	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
316 	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));
317 
318 	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
319 		mutex_exit(pse);
320 		VM_STAT_ADD(page_lock_retired);
321 		return (0);
322 	}
323 
324 	if (se == SE_SHARED && es == 1 && pp->p_selock == 0) {
325 		se = SE_EXCL;
326 	}
327 
328 	if ((reclaim == P_RECLAIM) && (PP_ISFREE(pp))) {
329 
330 		reclaim_it = 1;
331 		if (se == SE_SHARED) {
332 			/*
333 			 * This is an interesting situation.
334 			 *
335 			 * Remember that p_free can only change if
336 			 * p_selock < 0.
337 			 * p_free does not depend on our holding `pse'.
338 			 * And, since we hold `pse', p_selock can not change.
339 			 * So, if p_free changes on us, the page is already
340 			 * exclusively held, and we would fail to get p_selock
341 			 * regardless.
342 			 *
343 			 * We want to avoid getting the share
344 			 * lock on a free page that needs to be reclaimed.
345 			 * It is possible that some other thread has the share
346 			 * lock and has left the free page on the cache list.
347 			 * pvn_vplist_dirty() does this for brief periods.
348 			 * If the se_share is currently SE_EXCL, we will fail
349 			 * to acquire p_selock anyway.  Blocking is the
350 			 * right thing to do.
351 			 * If we need to reclaim this page, we must get
352 			 * exclusive access to it, force the upgrade now.
353 			 * Again, we will fail to acquire p_selock if the
354 			 * page is not free and block.
355 			 */
356 			upgraded = 1;
357 			se = SE_EXCL;
358 			VM_STAT_ADD(page_lock_upgrade);
359 		}
360 	}
361 
362 	if (se == SE_EXCL) {
363 		if (!(es & SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) {
364 			/*
365 			 * if the caller wants a writer lock (but did not
366 			 * specify exclusive access), and there is a pending
367 			 * writer that wants exclusive access, return failure
368 			 */
369 			retval = 0;
370 		} else if ((pp->p_selock & ~SE_EWANTED) == 0) {
371 			/* no reader/writer lock held */
372 			THREAD_KPRI_REQUEST();
373 			/* this clears our setting of the SE_EWANTED bit */
374 			pp->p_selock = SE_WRITER;
375 			retval = 1;
376 		} else {
377 			/* page is locked */
378 			if (es & SE_EXCL_WANTED) {
379 				/* set the SE_EWANTED bit */
380 				pp->p_selock |= SE_EWANTED;
381 			}
382 			retval = 0;
383 		}
384 	} else {
385 		retval = 0;
386 		if (pp->p_selock >= 0) {
387 			if ((pp->p_selock & SE_EWANTED) == 0) {
388 				pp->p_selock += SE_READER;
389 				retval = 1;
390 			}
391 		}
392 	}
393 
394 	if (retval == 0) {
395 		if ((pp->p_selock & ~SE_EWANTED) == SE_DELETED) {
396 			VM_STAT_ADD(page_lock_deleted);
397 			mutex_exit(pse);
398 			return (retval);
399 		}
400 
401 #ifdef VM_STATS
402 		VM_STAT_ADD(page_lock_miss);
403 		if (upgraded) {
404 			VM_STAT_ADD(page_lock_upgrade_failed);
405 		}
406 #endif
407 		if (lock) {
408 			VM_STAT_ADD(page_lock_miss_lock);
409 			mutex_exit(lock);
410 		}
411 
412 		/*
413 		 * Now, wait for the page to be unlocked and
414 		 * release the lock protecting p_cv and p_selock.
415 		 */
416 		cv_wait(&pp->p_cv, pse);
417 		mutex_exit(pse);
418 
419 		/*
420 		 * The page identity may have changed while we were
421 		 * blocked.  If we are willing to depend on "pp"
422 		 * still pointing to a valid page structure (i.e.,
423 		 * assuming page structures are not dynamically allocated
424 		 * or freed), we could try to lock the page if its
425 		 * identity hasn't changed.
426 		 *
427 		 * This needs to be measured, since we come back from
428 		 * cv_wait holding pse (the expensive part of this
429 		 * operation) we might as well try the cheap part.
430 		 * Though we would also have to confirm that dropping
431 		 * `lock' did not cause any grief to the callers.
432 		 */
433 		if (lock) {
434 			mutex_enter(lock);
435 		}
436 	} else {
437 		/*
438 		 * We have the page lock.
439 		 * If we needed to reclaim the page, and the page
440 		 * needed reclaiming (ie, it was free), then we
441 		 * have the page exclusively locked.  We may need
442 		 * to downgrade the page.
443 		 */
444 		ASSERT((upgraded) ?
445 		    ((PP_ISFREE(pp)) && PAGE_EXCL(pp)) : 1);
446 		mutex_exit(pse);
447 
448 		/*
449 		 * We now hold this page's lock, either shared or
450 		 * exclusive.  This will prevent its identity from changing.
451 		 * The page, however, may or may not be free.  If the caller
452 		 * requested, and it is free, go reclaim it from the
453 		 * free list.  If the page can't be reclaimed, return failure
454 		 * so that the caller can start all over again.
455 		 *
456 		 * NOTE:page_reclaim() releases the page lock (p_selock)
457 		 *	if it can't be reclaimed.
458 		 */
459 		if (reclaim_it) {
460 			if (!page_reclaim(pp, lock)) {
461 				VM_STAT_ADD(page_lock_bad_reclaim);
462 				retval = 0;
463 			} else {
464 				VM_STAT_ADD(page_lock_reclaim);
465 				if (upgraded) {
466 					page_downgrade(pp);
467 				}
468 			}
469 		}
470 	}
471 	return (retval);
472 }
473 
474 /*
475  * Clear the SE_EWANTED bit from p_selock.  This function allows
476  * callers of page_lock_es and page_try_reclaim_lock to clear
477  * their setting of this bit if they decide they no longer wish
478  * to gain exclusive access to the page.  Currently only
479  * delete_memory_thread uses this when the delete memory
480  * operation is cancelled.
481  */
482 void
483 page_lock_clr_exclwanted(page_t *pp)
484 {
485 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
486 
487 	mutex_enter(pse);
488 	pp->p_selock &= ~SE_EWANTED;
489 	if (CV_HAS_WAITERS(&pp->p_cv))
490 		cv_broadcast(&pp->p_cv);
491 	mutex_exit(pse);
492 }
493 
494 /*
495  * Read the comments inside of page_lock_es() carefully.
496  *
497  * SE_EXCL callers specifying es == SE_EXCL_WANTED will cause the
498  * SE_EWANTED bit of p_selock to be set when the lock cannot be obtained.
499  * This is used by threads subject to reader-starvation (eg. memory delete).
500  *
501  * When a thread using SE_EXCL_WANTED does not obtain the SE_EXCL lock,
502  * it is expected that it will retry at a later time.  Threads that will
503  * not retry the lock *must* call page_lock_clr_exclwanted to clear the
504  * SE_EWANTED bit.  (When a thread using SE_EXCL_WANTED obtains the lock,
505  * the bit is cleared.)
506  */
507 int
508 page_try_reclaim_lock(page_t *pp, se_t se, int es)
509 {
510 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
511 	selock_t old;
512 
513 	mutex_enter(pse);
514 
515 	old = pp->p_selock;
516 
517 	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
518 	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));
519 
520 	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
521 		mutex_exit(pse);
522 		VM_STAT_ADD(page_trylock_failed);
523 		return (0);
524 	}
525 
526 	if (se == SE_SHARED && es == 1 && old == 0) {
527 		se = SE_EXCL;
528 	}
529 
530 	if (se == SE_SHARED) {
531 		if (!PP_ISFREE(pp)) {
532 			if (old >= 0) {
533 				/*
534 				 * Readers are not allowed when excl wanted
535 				 */
536 				if ((old & SE_EWANTED) == 0) {
537 					pp->p_selock = old + SE_READER;
538 					mutex_exit(pse);
539 					return (1);
540 				}
541 			}
542 			mutex_exit(pse);
543 			return (0);
544 		}
545 		/*
546 		 * The page is free, so we really want SE_EXCL (below)
547 		 */
548 		VM_STAT_ADD(page_try_reclaim_upgrade);
549 	}
550 
551 	/*
552 	 * The caller wants a writer lock.  We try for it only if
553 	 * SE_EWANTED is not set, or if the caller specified
554 	 * SE_EXCL_WANTED.
555 	 */
556 	if (!(old & SE_EWANTED) || (es & SE_EXCL_WANTED)) {
557 		if ((old & ~SE_EWANTED) == 0) {
558 			/* no reader/writer lock held */
559 			THREAD_KPRI_REQUEST();
560 			/* this clears out our setting of the SE_EWANTED bit */
561 			pp->p_selock = SE_WRITER;
562 			mutex_exit(pse);
563 			return (1);
564 		}
565 	}
566 	if (es & SE_EXCL_WANTED) {
567 		/* page is locked, set the SE_EWANTED bit */
568 		pp->p_selock |= SE_EWANTED;
569 	}
570 	mutex_exit(pse);
571 	return (0);
572 }
573 
574 /*
575  * Acquire a page's "shared/exclusive" lock, but never block.
576  * Returns 1 on success, 0 on failure.
577  */
578 int
579 page_trylock(page_t *pp, se_t se)
580 {
581 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
582 
583 	mutex_enter(pse);
584 	if (pp->p_selock & SE_EWANTED || PP_RETIRED(pp) ||
585 	    (se == SE_SHARED && PP_PR_NOSHARE(pp))) {
586 		/*
587 		 * Fail if a thread wants exclusive access and page is
588 		 * retired, if the page is slated for retirement, or a
589 		 * share lock is requested.
590 		 */
591 		mutex_exit(pse);
592 		VM_STAT_ADD(page_trylock_failed);
593 		return (0);
594 	}
595 
596 	if (se == SE_EXCL) {
597 		if (pp->p_selock == 0) {
598 			THREAD_KPRI_REQUEST();
599 			pp->p_selock = SE_WRITER;
600 			mutex_exit(pse);
601 			return (1);
602 		}
603 	} else {
604 		if (pp->p_selock >= 0) {
605 			pp->p_selock += SE_READER;
606 			mutex_exit(pse);
607 			return (1);
608 		}
609 	}
610 	mutex_exit(pse);
611 	return (0);
612 }
613 
614 /*
615  * Variant of page_unlock() specifically for the page freelist
616  * code. The mere existence of this code is a vile hack that
617  * has resulted due to the backwards locking order of the page
618  * freelist manager; please don't call it.
619  */
620 void
621 page_unlock_nocapture(page_t *pp)
622 {
623 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
624 	selock_t old;
625 
626 	mutex_enter(pse);
627 
628 	old = pp->p_selock;
629 	if ((old & ~SE_EWANTED) == SE_READER) {
630 		pp->p_selock = old & ~SE_READER;
631 		if (CV_HAS_WAITERS(&pp->p_cv))
632 			cv_broadcast(&pp->p_cv);
633 	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
634 		panic("page_unlock_nocapture: page %p is deleted", (void *)pp);
635 	} else if (old < 0) {
636 		THREAD_KPRI_RELEASE();
637 		pp->p_selock &= SE_EWANTED;
638 		if (CV_HAS_WAITERS(&pp->p_cv))
639 			cv_broadcast(&pp->p_cv);
640 	} else if ((old & ~SE_EWANTED) > SE_READER) {
641 		pp->p_selock = old - SE_READER;
642 	} else {
643 		panic("page_unlock_nocapture: page %p is not locked",
644 		    (void *)pp);
645 	}
646 
647 	mutex_exit(pse);
648 }
649 
650 /*
651  * Release the page's "shared/exclusive" lock and wake up anyone
652  * who might be waiting for it.
653  */
654 void
655 page_unlock(page_t *pp)
656 {
657 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
658 	selock_t old;
659 
660 	mutex_enter(pse);
661 
662 	old = pp->p_selock;
663 	if ((old & ~SE_EWANTED) == SE_READER) {
664 		pp->p_selock = old & ~SE_READER;
665 		if (CV_HAS_WAITERS(&pp->p_cv))
666 			cv_broadcast(&pp->p_cv);
667 	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
668 		panic("page_unlock: page %p is deleted", (void *)pp);
669 	} else if (old < 0) {
670 		THREAD_KPRI_RELEASE();
671 		pp->p_selock &= SE_EWANTED;
672 		if (CV_HAS_WAITERS(&pp->p_cv))
673 			cv_broadcast(&pp->p_cv);
674 	} else if ((old & ~SE_EWANTED) > SE_READER) {
675 		pp->p_selock = old - SE_READER;
676 	} else {
677 		panic("page_unlock: page %p is not locked", (void *)pp);
678 	}
679 
680 	if (pp->p_selock == 0) {
681 		/*
682 		 * If the T_CAPTURING bit is set, that means that we should
683 		 * not try and capture the page again as we could recurse
684 		 * which could lead to a stack overflow panic or spending a
685 		 * relatively long time in the kernel making no progress.
686 		 */
687 		if ((pp->p_toxic & PR_CAPTURE) &&
688 		    !(curthread->t_flag & T_CAPTURING) &&
689 		    !PP_RETIRED(pp)) {
690 			THREAD_KPRI_REQUEST();
691 			pp->p_selock = SE_WRITER;
692 			mutex_exit(pse);
693 			page_unlock_capture(pp);
694 		} else {
695 			mutex_exit(pse);
696 		}
697 	} else {
698 		mutex_exit(pse);
699 	}
700 }
701 
702 /*
703  * Try to upgrade the lock on the page from a "shared" to an
704  * "exclusive" lock.  Since this upgrade operation is done while
705  * holding the mutex protecting this page, no one else can acquire this page's
706  * lock and change the page. Thus, it is safe to drop the "shared"
707  * lock and attempt to acquire the "exclusive" lock.
708  *
709  * Returns 1 on success, 0 on failure.
710  */
711 int
712 page_tryupgrade(page_t *pp)
713 {
714 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
715 
716 	mutex_enter(pse);
717 	if (!(pp->p_selock & SE_EWANTED)) {
718 		/* no threads want exclusive access, try upgrade */
719 		if (pp->p_selock == SE_READER) {
720 			THREAD_KPRI_REQUEST();
721 			/* convert to exclusive lock */
722 			pp->p_selock = SE_WRITER;
723 			mutex_exit(pse);
724 			return (1);
725 		}
726 	}
727 	mutex_exit(pse);
728 	return (0);
729 }
730 
731 /*
732  * Downgrade the "exclusive" lock on the page to a "shared" lock
733  * while holding the mutex protecting this page's p_selock field.
734  */
735 void
736 page_downgrade(page_t *pp)
737 {
738 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
739 	int excl_waiting;
740 
741 	ASSERT((pp->p_selock & ~SE_EWANTED) != SE_DELETED);
742 	ASSERT(PAGE_EXCL(pp));
743 
744 	mutex_enter(pse);
745 	excl_waiting =  pp->p_selock & SE_EWANTED;
746 	THREAD_KPRI_RELEASE();
747 	pp->p_selock = SE_READER | excl_waiting;
748 	if (CV_HAS_WAITERS(&pp->p_cv))
749 		cv_broadcast(&pp->p_cv);
750 	mutex_exit(pse);
751 }
752 
753 void
754 page_lock_delete(page_t *pp)
755 {
756 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
757 
758 	ASSERT(PAGE_EXCL(pp));
759 	ASSERT(pp->p_vnode == NULL);
760 	ASSERT(pp->p_offset == (u_offset_t)-1);
761 	ASSERT(!PP_ISFREE(pp));
762 
763 	mutex_enter(pse);
764 	THREAD_KPRI_RELEASE();
765 	pp->p_selock = SE_DELETED;
766 	if (CV_HAS_WAITERS(&pp->p_cv))
767 		cv_broadcast(&pp->p_cv);
768 	mutex_exit(pse);
769 }
770 
771 int
772 page_deleted(page_t *pp)
773 {
774 	return (pp->p_selock == SE_DELETED);
775 }
776 
777 /*
778  * Implement the io lock for pages
779  */
780 void
781 page_iolock_init(page_t *pp)
782 {
783 	pp->p_iolock_state = 0;
784 	cv_init(&pp->p_io_cv, NULL, CV_DEFAULT, NULL);
785 }
786 
787 /*
788  * Acquire the i/o lock on a page.
789  */
790 void
791 page_io_lock(page_t *pp)
792 {
793 	kmutex_t *pio;
794 
795 	pio = PAGE_IO_MUTEX(pp);
796 	mutex_enter(pio);
797 	while (pp->p_iolock_state & PAGE_IO_INUSE) {
798 		cv_wait(&(pp->p_io_cv), pio);
799 	}
800 	pp->p_iolock_state |= PAGE_IO_INUSE;
801 	mutex_exit(pio);
802 }
803 
804 /*
805  * Release the i/o lock on a page.
806  */
807 void
808 page_io_unlock(page_t *pp)
809 {
810 	kmutex_t *pio;
811 
812 	pio = PAGE_IO_MUTEX(pp);
813 	mutex_enter(pio);
814 	cv_broadcast(&pp->p_io_cv);
815 	pp->p_iolock_state &= ~PAGE_IO_INUSE;
816 	mutex_exit(pio);
817 }
818 
819 /*
820  * Try to acquire the i/o lock on a page without blocking.
821  * Returns 1 on success, 0 on failure.
822  */
823 int
824 page_io_trylock(page_t *pp)
825 {
826 	kmutex_t *pio;
827 
828 	if (pp->p_iolock_state & PAGE_IO_INUSE)
829 		return (0);
830 
831 	pio = PAGE_IO_MUTEX(pp);
832 	mutex_enter(pio);
833 
834 	if (pp->p_iolock_state & PAGE_IO_INUSE) {
835 		mutex_exit(pio);
836 		return (0);
837 	}
838 	pp->p_iolock_state |= PAGE_IO_INUSE;
839 	mutex_exit(pio);
840 
841 	return (1);
842 }
843 
844 /*
845  * Wait until the i/o lock is not held.
846  */
847 void
848 page_io_wait(page_t *pp)
849 {
850 	kmutex_t *pio;
851 
852 	pio = PAGE_IO_MUTEX(pp);
853 	mutex_enter(pio);
854 	while (pp->p_iolock_state & PAGE_IO_INUSE) {
855 		cv_wait(&(pp->p_io_cv), pio);
856 	}
857 	mutex_exit(pio);
858 }
859 
/*
 * Report whether the i/o lock on a page is currently held.
 *
 * Returns the PAGE_IO_INUSE bit of p_iolock_state: non-zero if the lock
 * is held, 0 if it is not.  (The original comment promises exactly 1 for
 * "held"; that is only true if PAGE_IO_INUSE is defined as 1 -- confirm
 * against the header.)  The state is read without taking the i/o mutex.
 */
int
page_io_locked(page_t *pp)
{
	return (pp->p_iolock_state & PAGE_IO_INUSE);
}
868 
/*
 * Assert that the i/o lock on a page is held.
 * Returns non-zero if the lock is held, 0 if it is not; this is simply
 * the result of page_io_locked(), so it reports that some holder has
 * the lock, not that the calling thread does.
 */
int
page_iolock_assert(page_t *pp)
{
	return (page_io_locked(pp));
}
878 
879 /*
880  * Wrapper exported to kernel routines that are built
881  * platform-independent (the macro is platform-dependent;
882  * the size of vph_mutex[] is based on NCPU).
883  *
884  * Note that you can do stress testing on this by setting the
885  * variable page_vnode_mutex_stress to something other than
886  * zero in a DEBUG kernel in a debugger after loading the kernel.
887  * Setting it after the kernel is running may not work correctly.
888  */
889 #ifdef DEBUG
890 static int page_vnode_mutex_stress = 0;
891 #endif
892 
893 kmutex_t *
894 page_vnode_mutex(vnode_t *vp)
895 {
896 	if (vp == &kvp)
897 		return (&vph_mutex[VPH_TABLE_SIZE + 0]);
898 
899 	if (vp == &zvp)
900 		return (&vph_mutex[VPH_TABLE_SIZE + 1]);
901 #ifdef DEBUG
902 	if (page_vnode_mutex_stress != 0)
903 		return (&vph_mutex[0]);
904 #endif
905 
906 	return (&vph_mutex[VP_HASH_FUNC(vp)]);
907 }
908 
909 kmutex_t *
910 page_se_mutex(page_t *pp)
911 {
912 	return (PAGE_SE_MUTEX(pp));
913 }
914 
#ifdef VM_STATS
/*
 * page_szc_lock() statistics:
 *	[0] calls that returned NULL because the page size code was 0
 *	[1] calls that returned a held mutex
 *	[2] retries after the page was found to have been demoted
 *	[3] waits for a hat_page_demote() still in progress
 */
uint_t pszclck_stat[4];
#endif
918 /*
919  * Find, take and return a mutex held by hat_page_demote().
920  * Called by page_demote_vp_pages() before hat_page_demote() call and by
921  * routines that want to block hat_page_demote() but can't do it
922  * via locking all constituent pages.
923  *
924  * Return NULL if p_szc is 0.
925  *
926  * It should only be used for pages that can be demoted by hat_page_demote()
927  * i.e. non swapfs file system pages.  The logic here is lifted from
928  * sfmmu_mlspl_enter() except there's no need to worry about p_szc increase
929  * since the page is locked and not free.
930  *
931  * Hash of the root page is used to find the lock.
932  * To find the root in the presense of hat_page_demote() chageing the location
933  * of the root this routine relies on the fact that hat_page_demote() changes
934  * root last.
935  *
936  * If NULL is returned pp's p_szc is guaranteed to be 0. If non NULL is
937  * returned pp's p_szc may be any value.
938  */
939 kmutex_t *
940 page_szc_lock(page_t *pp)
941 {
942 	kmutex_t	*mtx;
943 	page_t		*rootpp;
944 	uint_t		szc;
945 	uint_t		rszc;
946 	uint_t		pszc = pp->p_szc;
947 
948 	ASSERT(pp != NULL);
949 	ASSERT(PAGE_LOCKED(pp));
950 	ASSERT(!PP_ISFREE(pp));
951 	ASSERT(pp->p_vnode != NULL);
952 	ASSERT(!IS_SWAPFSVP(pp->p_vnode));
953 	ASSERT(!PP_ISKAS(pp));
954 
955 again:
956 	if (pszc == 0) {
957 		VM_STAT_ADD(pszclck_stat[0]);
958 		return (NULL);
959 	}
960 
961 	/* The lock lives in the root page */
962 
963 	rootpp = PP_GROUPLEADER(pp, pszc);
964 	mtx = PAGE_SZC_MUTEX(rootpp);
965 	mutex_enter(mtx);
966 
967 	/*
968 	 * since p_szc can only decrease if pp == rootpp
969 	 * rootpp will be always the same i.e we have the right root
970 	 * regardless of rootpp->p_szc.
971 	 * If location of pp's root didn't change after we took
972 	 * the lock we have the right root. return mutex hashed off it.
973 	 */
974 	if (pp == rootpp || (rszc = rootpp->p_szc) == pszc) {
975 		VM_STAT_ADD(pszclck_stat[1]);
976 		return (mtx);
977 	}
978 
979 	/*
980 	 * root location changed because page got demoted.
981 	 * locate the new root.
982 	 */
983 	if (rszc < pszc) {
984 		szc = pp->p_szc;
985 		ASSERT(szc < pszc);
986 		mutex_exit(mtx);
987 		pszc = szc;
988 		VM_STAT_ADD(pszclck_stat[2]);
989 		goto again;
990 	}
991 
992 	VM_STAT_ADD(pszclck_stat[3]);
993 	/*
994 	 * current hat_page_demote not done yet.
995 	 * wait for it to finish.
996 	 */
997 	mutex_exit(mtx);
998 	rootpp = PP_GROUPLEADER(rootpp, rszc);
999 	mtx = PAGE_SZC_MUTEX(rootpp);
1000 	mutex_enter(mtx);
1001 	mutex_exit(mtx);
1002 	ASSERT(rootpp->p_szc < rszc);
1003 	goto again;
1004 }
1005 
1006 int
1007 page_szc_lock_assert(page_t *pp)
1008 {
1009 	page_t *rootpp = PP_PAGEROOT(pp);
1010 	kmutex_t *mtx = PAGE_SZC_MUTEX(rootpp);
1011 
1012 	return (MUTEX_HELD(mtx));
1013 }
1014 
/*
 * memseg locking
 *
 * Reader/writer lock taken through memsegs_lock() or memsegs_trylock()
 * and dropped through memsegs_unlock().
 */
static krwlock_t memsegslock;
1019 
/*
 * memlist (phys_install, phys_avail) locking.
 *
 * Reader/writer lock taken and dropped through the memlist_read_*() and
 * memlist_write_*() routines below.
 */
static krwlock_t memlists_lock;
1024 
1025 int
1026 memsegs_trylock(int writer)
1027 {
1028 	return (rw_tryenter(&memsegslock, writer ? RW_WRITER : RW_READER));
1029 }
1030 
1031 void
1032 memsegs_lock(int writer)
1033 {
1034 	rw_enter(&memsegslock, writer ? RW_WRITER : RW_READER);
1035 }
1036 
/*
 * Release the memsegs lock.  The `writer' argument is accepted for
 * symmetry with memsegs_lock() but is not used; rw_exit() releases the
 * lock in whichever mode it is held.
 */
/*ARGSUSED*/
void
memsegs_unlock(int writer)
{
	rw_exit(&memsegslock);
}
1043 
/*
 * Return the result of RW_LOCK_HELD() on the memsegs lock, i.e. non-zero
 * when the lock is held.
 */
int
memsegs_lock_held(void)
{
	return (RW_LOCK_HELD(&memsegslock));
}
1049 
/*
 * Take the memlists lock as a reader.
 */
void
memlist_read_lock(void)
{
	rw_enter(&memlists_lock, RW_READER);
}
1055 
/*
 * Release the memlists lock taken by memlist_read_lock().
 */
void
memlist_read_unlock(void)
{
	rw_exit(&memlists_lock);
}
1061 
/*
 * Take the memlists lock as a writer.
 */
void
memlist_write_lock(void)
{
	rw_enter(&memlists_lock, RW_WRITER);
}
1067 
/*
 * Release the memlists lock taken by memlist_write_lock().
 */
void
memlist_write_unlock(void)
{
	rw_exit(&memlists_lock);
}
1073