xref: /titanic_51/usr/src/uts/common/vm/page_lock.c (revision 672986541be54a7a471bb088e60780c37e371d7e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * VM - page locking primitives
30  */
31 #include <sys/param.h>
32 #include <sys/t_lock.h>
33 #include <sys/vtrace.h>
34 #include <sys/debug.h>
35 #include <sys/cmn_err.h>
36 #include <sys/vnode.h>
37 #include <sys/bitmap.h>
38 #include <sys/lockstat.h>
39 #include <sys/condvar_impl.h>
40 #include <vm/page.h>
41 #include <vm/seg_enum.h>
42 #include <vm/vm_dep.h>
43 
44 /*
45  * This global mutex is for logical page locking.
46  * The following fields in the page structure are protected
47  * by this lock:
48  *
49  *	p_lckcnt
50  *	p_cowcnt
51  */
52 kmutex_t page_llock;
53 
54 /*
55  * This is a global lock for the logical page free list.  The
56  * logical free list, in this implementation, is maintained as two
57  * separate physical lists - the cache list and the free list.
58  */
59 kmutex_t  page_freelock;
60 
61 /*
62  * The hash table, page_hash[], the p_selock fields, and the
63  * list of pages associated with vnodes are protected by arrays of mutexes.
64  *
65  * Unless the hashes are changed radically, the table sizes must be
66  * a power of two.  Also, we typically need more mutexes for the
67  * vnodes since these locks are occasionally held for long periods.
68  * And since there seem to be two special vnodes (kvp and zvp),
69  * we make room for private mutexes for them.
70  *
71  * The pse_mutex[] array holds the mutexes to protect the p_selock
72  * fields of all page_t structures.
73  *
74  * PAGE_SE_MUTEX(pp) returns the address of the appropriate mutex
75  * when given a pointer to a page_t.
76  *
77  * PSE_TABLE_SIZE must be a power of two.  One could argue that we
78  * should go to the trouble of setting it up at run time and base it
79  * on memory size rather than the number of compile-time CPUs.
80  *
81  * XX64	We should be using physmem size to calculate PSE_TABLE_SIZE,
82  *	PSE_SHIFT, PIO_SHIFT.
83  *
84  *	These might break in a 64-bit world.
85  */
86 #define	PSE_SHIFT	7		/* log2(PSE_TABLE_SIZE) */
87 
88 #define	PSE_TABLE_SIZE	128		/* number of mutexes to have */
89 
90 #define	PIO_SHIFT	PSE_SHIFT	/* next power of 2 bigger than page_t */
91 #define	PIO_TABLE_SIZE	PSE_TABLE_SIZE	/* number of io mutexes to have */
92 
93 pad_mutex_t	ph_mutex[PH_TABLE_SIZE];
94 pad_mutex_t	pse_mutex[PSE_TABLE_SIZE];
95 kmutex_t	pio_mutex[PIO_TABLE_SIZE];
96 
97 #define	PAGE_SE_MUTEX(pp) \
98 	    &pse_mutex[((((uintptr_t)(pp) >> PSE_SHIFT) ^ \
99 		((uintptr_t)(pp) >> (PSE_SHIFT << 1))) & \
100 		(PSE_TABLE_SIZE - 1))].pad_mutex
101 
102 #define	PAGE_IO_MUTEX(pp) \
103 	    &pio_mutex[(((uintptr_t)pp) >> PIO_SHIFT) & (PIO_TABLE_SIZE - 1)]
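
/*
 * Worked example (editor's illustration, not from the original source): with
 * PSE_SHIFT == 7 and PSE_TABLE_SIZE == 128, PAGE_SE_MUTEX() xors the page_t
 * address shifted right by 7 and by 14 bits, then masks the result to select
 * one of the 128 pad_mutex_t entries.  For a hypothetical page_t at address
 * 0x12345680:
 *
 *	(0x12345680 >> 7)  == 0x2468ad
 *	(0x12345680 >> 14) == 0x48d1
 *	0x2468ad ^ 0x48d1  == 0x24207c
 *	0x24207c & 127     == 124	==> &pse_mutex[124].pad_mutex
 */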
104 
105 #define	PSZC_MTX_TABLE_SIZE	128
106 #define	PSZC_MTX_TABLE_SHIFT	7
107 
108 static pad_mutex_t	pszc_mutex[PSZC_MTX_TABLE_SIZE];
109 
110 #define	PAGE_SZC_MUTEX(_pp) \
111 	    &pszc_mutex[((((uintptr_t)(_pp) >> PSZC_MTX_TABLE_SHIFT) ^ \
112 		((uintptr_t)(_pp) >> (PSZC_MTX_TABLE_SHIFT << 1)) ^ \
113 		((uintptr_t)(_pp) >> (3 * PSZC_MTX_TABLE_SHIFT))) & \
114 		(PSZC_MTX_TABLE_SIZE - 1))].pad_mutex
115 
116 /*
117  * The vph_mutex[] array holds the mutexes that protect the vnode chains
118  * (i.e., the lists of pages anchored by v_pages and connected via p_vpprev
119  * and p_vpnext).
120  *
121  * The page_vnode_mutex(vp) function returns the address of the appropriate
122  * mutex from this array given a pointer to a vnode.  It is complicated
123  * by the fact that the kernel's vnode (kvp) and the zvp vnode are referenced
124  * frequently enough to warrant their own mutexes.
125  *
126  * The VP_HASH_FUNC macro returns the index into the vph_mutex array given
127  * the address of a vnode.
128  */
129 
130 /*
131  * XX64	VPH_TABLE_SIZE and VP_HASH_FUNC might break in a 64-bit world.
132  *	Need to review again.
133  */
134 #if defined(_LP64)
135 #define	VPH_TABLE_SIZE  (1 << (VP_SHIFT + 3))
136 #else	/* 32 bits */
137 #define	VPH_TABLE_SIZE	(2 << VP_SHIFT)
138 #endif
139 
140 #define	VP_HASH_FUNC(vp) \
141 	((((uintptr_t)(vp) >> 6) + \
142 	    ((uintptr_t)(vp) >> 8) + \
143 	    ((uintptr_t)(vp) >> 10) + \
144 	    ((uintptr_t)(vp) >> 12)) \
145 	    & (VPH_TABLE_SIZE - 1))
146 
147 extern	struct vnode	kvp;
148 
149 /*
150  * Two slots after VPH_TABLE_SIZE are reserved in vph_mutex for kernel vnodes.
151  * The lock for kvp is VPH_TABLE_SIZE + 0, and the lock for zvp is
152  * VPH_TABLE_SIZE + 1.
153  */
154 
155 kmutex_t	vph_mutex[VPH_TABLE_SIZE + 2];
156 
157 /*
158  * Initialize the locks used by the Virtual Memory Management system.
159  */
160 void
161 page_lock_init()
162 {
163 }
164 
165 /*
166  * At present we only use page ownership to aid debugging, so it's
167  * OK if the owner field isn't exact.  In the 32-bit world two thread ids
168  * can map to the same owner because we just 'or' in 0x80000000 and
169  * then clear the second highest bit, so that (for example) 0x2faced00
170  * and 0xafaced00 both map to 0xafaced00.
171  * In the 64-bit world, p_selock may not be large enough to hold a full
172  * thread pointer.  If we ever need precise ownership (e.g. if we implement
173  * priority inheritance for page locks) then p_selock should become a
174  * uintptr_t and SE_WRITER should be -((uintptr_t)curthread >> 2).
175  */
176 #define	SE_WRITER	(((selock_t)(ulong_t)curthread | INT_MIN) & ~SE_EWANTED)
177 #define	SE_READER	1
178 
179 /*
180  * A page that is deleted must be marked as such using the
181  * page_lock_delete() function. The page must be exclusively locked.
182  * The SE_DELETED marker is put in p_selock when this function is called.
183  * SE_DELETED must be distinct from any SE_WRITER value.
184  */
185 #define	SE_DELETED	(1 | INT_MIN)
186 
187 #ifdef VM_STATS
188 uint_t	vph_kvp_count;
189 uint_t	vph_swapfsvp_count;
190 uint_t	vph_other;
191 #endif /* VM_STATS */
192 
193 #ifdef VM_STATS
194 uint_t	page_lock_count;
195 uint_t	page_lock_miss;
196 uint_t	page_lock_miss_lock;
197 uint_t	page_lock_reclaim;
198 uint_t	page_lock_bad_reclaim;
199 uint_t	page_lock_same_page;
200 uint_t	page_lock_upgrade;
201 uint_t	page_lock_retired;
202 uint_t	page_lock_upgrade_failed;
203 uint_t	page_lock_deleted;
204 
205 uint_t	page_trylock_locked;
206 uint_t	page_trylock_failed;
207 uint_t	page_trylock_missed;
208 
209 uint_t	page_try_reclaim_upgrade;
210 #endif /* VM_STATS */
211 
212 /*
213  * Acquire the "shared/exclusive" lock on a page.
214  *
215  * Returns 1 on success and locks the page appropriately.
216  *	   0 on failure and does not lock the page.
217  *
218  * If `lock' is non-NULL, it will be dropped and reacquired in the
219  * failure case.  This routine can block, and if it does
220  * it will always return a failure since the page identity [vp, off]
221  * or state may have changed.
222  */
223 
224 int
225 page_lock(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim)
226 {
227 	return (page_lock_es(pp, se, lock, reclaim, 0));
228 }
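
/*
 * Usage sketch (editor's illustration, not from the original source): a
 * hypothetical caller holds a mutex `phm' protecting its lookup structures
 * and wants the page shared locked.  Because page_lock() drops and reacquires
 * `phm' when it blocks, a failure means the [vp, off] identity may be stale
 * and the lookup must be redone.  my_page_lookup() is a hypothetical routine.
 *
 *	top:
 *		mutex_enter(phm);
 *		pp = my_page_lookup(vp, off);
 *		if (pp != NULL && !page_lock(pp, SE_SHARED, phm, P_RECLAIM)) {
 *			mutex_exit(phm);
 *			goto top;
 *		}
 *		mutex_exit(phm);
 *		if (pp != NULL) {
 *			... use the shared-locked page ...
 *			page_unlock(pp);
 *		}
 */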
229 
230 /*
231  * With the addition of reader-writer lock semantics to page_lock_es,
232  * callers wanting an exclusive (writer) lock may prevent being starved by
233  * shared-lock (reader) holders by setting the es parameter to SE_EXCL_WANTED.
234  * In this case, when an exclusive lock cannot be acquired, p_selock's
235  * SE_EWANTED bit is set. Shared-lock (reader) requests are also denied
236  * if the page is slated for retirement.
237  *
238  * The se and es parameters determine if the lock should be granted
239  * based on the following decision table:
240  *
241  * Lock wanted   es flags        p_selock/SE_EWANTED  Action
242  * -----------   --------------  -------------------  ----------------------------
243  * SE_EXCL       any [1][2]      unlocked/any         grant lock, clear SE_EWANTED
244  * SE_EXCL       SE_EXCL_WANTED  any lock/any         deny, set SE_EWANTED
245  * SE_EXCL       none            any lock/any         deny
246  * SE_SHARED     n/a [2]         shared/0             grant
247  * SE_SHARED     n/a [2]         unlocked/0           grant
248  * SE_SHARED     n/a             shared/1             deny
249  * SE_SHARED     n/a             unlocked/1           deny
250  * SE_SHARED     n/a             excl/any             deny
251  *
252  * Notes:
253  * [1] The code grants an exclusive lock to the caller and clears the
254  *   SE_EWANTED bit whenever p_selock is unlocked, regardless of the SE_EWANTED
255  *   bit's value.  This was deemed acceptable as we are not concerned about
256  *   exclusive-lock starvation.  If this ever becomes an issue, a priority or
257  *   FIFO mechanism should also be implemented.  In the meantime, the thread
258  *   that set SE_EWANTED should be prepared to catch this condition and reset it.
259  *
260  * [2] Retired pages may not be locked at any time, regardless of the
261  *   disposition of se, unless the es parameter has the SE_RETIRED flag set.
262  *
263  * Notes on values of "es":
264  *
265  *   es & 1: page_lookup_create will attempt page relocation
266  *   es & SE_EXCL_WANTED: caller wants SE_EWANTED set (e.g. the delete
267  *       memory thread); this prevents waiting writer thread(s) from being
268  *       starved by readers, by giving priority to writers over readers.
269  *   es & SE_RETIRED: caller wants to lock pages even if they are
270  *       retired.  Default is to deny the lock if the page is retired.
271  *
272  * And yes, we know, the semantics of this function are too complicated.
273  * It's on the list to be cleaned up.
274  */
275 int
276 page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
277 {
278 	int		retval;
279 	kmutex_t	*pse = PAGE_SE_MUTEX(pp);
280 	int		upgraded;
281 	int		reclaim_it;
282 
283 	ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);
284 
285 	VM_STAT_ADD(page_lock_count);
286 
287 	upgraded = 0;
288 	reclaim_it = 0;
289 
290 	mutex_enter(pse);
291 
292 	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
293 	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));
294 
295 	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
296 		mutex_exit(pse);
297 		VM_STAT_ADD(page_lock_retired);
298 		return (0);
299 	}
300 
301 	if (se == SE_SHARED && es == 1 && pp->p_selock == 0) {
302 		se = SE_EXCL;
303 	}
304 
305 	if ((reclaim == P_RECLAIM) && (PP_ISFREE(pp))) {
306 
307 		reclaim_it = 1;
308 		if (se == SE_SHARED) {
309 			/*
310 			 * This is an interesting situation.
311 			 *
312 			 * Remember that p_free can only change if
313 			 * p_selock < 0.
314 			 * p_free does not depend on our holding `pse'.
315 			 * And, since we hold `pse', p_selock cannot change.
316 			 * So, if p_free changes on us, the page is already
317 			 * exclusively held, and we would fail to get p_selock
318 			 * regardless.
319 			 *
320 			 * We want to avoid getting the share
321 			 * lock on a free page that needs to be reclaimed.
322 			 * It is possible that some other thread has the share
323 			 * lock and has left the free page on the cache list.
324 			 * pvn_vplist_dirty() does this for brief periods.
325 			 * If p_selock is currently held SE_EXCL, we will
326 			 * fail to acquire it anyway.  Blocking is the
327 			 * right thing to do.
328 			 * If we need to reclaim this page, we must get
329 			 * exclusive access to it, so force the upgrade now.
330 			 * Again, if the page is not free, we will fail to
331 			 * acquire p_selock and will block.
332 			 */
333 			upgraded = 1;
334 			se = SE_EXCL;
335 			VM_STAT_ADD(page_lock_upgrade);
336 		}
337 	}
338 
339 	if (se == SE_EXCL) {
340 		if (!(es & SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) {
341 			/*
342 			 * If the caller wants a writer lock but did not
343 			 * specify SE_EXCL_WANTED, and a pending writer has
344 			 * already requested exclusive access, return failure.
345 			 */
346 			retval = 0;
347 		} else if ((pp->p_selock & ~SE_EWANTED) == 0) {
348 			/* no reader/writer lock held */
349 			THREAD_KPRI_REQUEST();
350 			/* this clears our setting of the SE_EWANTED bit */
351 			pp->p_selock = SE_WRITER;
352 			retval = 1;
353 		} else {
354 			/* page is locked */
355 			if (es & SE_EXCL_WANTED) {
356 				/* set the SE_EWANTED bit */
357 				pp->p_selock |= SE_EWANTED;
358 			}
359 			retval = 0;
360 		}
361 	} else {
362 		retval = 0;
363 		if (pp->p_selock >= 0) {
364 			if ((pp->p_selock & SE_EWANTED) == 0) {
365 				pp->p_selock += SE_READER;
366 				retval = 1;
367 			}
368 		}
369 	}
370 
371 	if (retval == 0) {
372 		if ((pp->p_selock & ~SE_EWANTED) == SE_DELETED) {
373 			VM_STAT_ADD(page_lock_deleted);
374 			mutex_exit(pse);
375 			return (retval);
376 		}
377 
378 #ifdef VM_STATS
379 		VM_STAT_ADD(page_lock_miss);
380 		if (upgraded) {
381 			VM_STAT_ADD(page_lock_upgrade_failed);
382 		}
383 #endif
384 		if (lock) {
385 			VM_STAT_ADD(page_lock_miss_lock);
386 			mutex_exit(lock);
387 		}
388 
389 		/*
390 		 * Now, wait for the page to be unlocked and
391 		 * release the lock protecting p_cv and p_selock.
392 		 */
393 		cv_wait(&pp->p_cv, pse);
394 		mutex_exit(pse);
395 
396 		/*
397 		 * The page identity may have changed while we were
398 		 * blocked.  If we are willing to depend on "pp"
399 		 * still pointing to a valid page structure (i.e.,
400 		 * assuming page structures are not dynamically allocated
401 		 * or freed), we could try to lock the page if its
402 		 * identity hasn't changed.
403 		 *
404 		 * This needs to be measured, since we come back from
405 		 * cv_wait holding pse (the expensive part of this
406 		 * operation) we might as well try the cheap part.
407 		 * Though we would also have to confirm that dropping
408 		 * `lock' did not cause any grief to the callers.
409 		 */
410 		if (lock) {
411 			mutex_enter(lock);
412 		}
413 	} else {
414 		/*
415 		 * We have the page lock.
416 		 * If we needed to reclaim the page, and the page
417 		 * needed reclaiming (i.e., it was free), then we
418 		 * have the page exclusively locked.  We may need
419 		 * to downgrade the page.
420 		 */
421 		ASSERT((upgraded) ?
422 		    ((PP_ISFREE(pp)) && PAGE_EXCL(pp)) : 1);
423 		mutex_exit(pse);
424 
425 		/*
426 		 * We now hold this page's lock, either shared or
427 		 * exclusive.  This will prevent its identity from changing.
428 		 * The page, however, may or may not be free.  If the caller
429 		 * requested, and it is free, go reclaim it from the
430 		 * free list.  If the page can't be reclaimed, return failure
431 		 * so that the caller can start all over again.
432 		 *
433 		 * NOTE: page_reclaim() releases the page lock (p_selock)
434 		 *	if it can't be reclaimed.
435 		 */
436 		if (reclaim_it) {
437 			if (!page_reclaim(pp, lock)) {
438 				VM_STAT_ADD(page_lock_bad_reclaim);
439 				retval = 0;
440 			} else {
441 				VM_STAT_ADD(page_lock_reclaim);
442 				if (upgraded) {
443 					page_downgrade(pp);
444 				}
445 			}
446 		}
447 	}
448 	return (retval);
449 }
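
/*
 * Illustrative sketch of the `es' flags (editor's addition, not from the
 * original source): a hypothetical caller that must exclusively lock a page
 * even if it has been retired, and that wants priority over readers, can
 * combine the flags:
 *
 *	if (page_lock_es(pp, SE_EXCL, NULL, P_NO_RECLAIM,
 *	    SE_EXCL_WANTED | SE_RETIRED)) {
 *		... the page is exclusively locked, retired or not ...
 *		page_unlock(pp);
 *	} else {
 *		... SE_EWANTED may now be set; either retry later or
 *		... call page_lock_clr_exclwanted(pp) to give up.
 *	}
 */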
450 
451 /*
452  * Clear the SE_EWANTED bit from p_selock.  This function allows
453  * callers of page_lock_es and page_try_reclaim_lock to clear
454  * their setting of this bit if they decide they no longer wish
455  * to gain exclusive access to the page.  Currently only
456  * delete_memory_thread uses this when the delete memory
457  * operation is cancelled.
458  */
459 void
460 page_lock_clr_exclwanted(page_t *pp)
461 {
462 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
463 
464 	mutex_enter(pse);
465 	pp->p_selock &= ~SE_EWANTED;
466 	if (CV_HAS_WAITERS(&pp->p_cv))
467 		cv_broadcast(&pp->p_cv);
468 	mutex_exit(pse);
469 }
470 
471 /*
472  * Read the comments inside of page_lock_es() carefully.
473  *
474  * SE_EXCL callers specifying es == SE_EXCL_WANTED will cause the
475  * SE_EWANTED bit of p_selock to be set when the lock cannot be obtained.
476  * This is used by threads subject to reader-starvation (e.g. memory delete).
477  *
478  * When a thread using SE_EXCL_WANTED does not obtain the SE_EXCL lock,
479  * it is expected that it will retry at a later time.  Threads that will
480  * not retry the lock *must* call page_lock_clr_exclwanted to clear the
481  * SE_EWANTED bit.  (When a thread using SE_EXCL_WANTED obtains the lock,
482  * the bit is cleared.)
483  */
484 int
485 page_try_reclaim_lock(page_t *pp, se_t se, int es)
486 {
487 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
488 	selock_t old;
489 
490 	mutex_enter(pse);
491 
492 	old = pp->p_selock;
493 
494 	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
495 	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));
496 
497 	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
498 		mutex_exit(pse);
499 		VM_STAT_ADD(page_trylock_failed);
500 		return (0);
501 	}
502 
503 	if (se == SE_SHARED && es == 1 && old == 0) {
504 		se = SE_EXCL;
505 	}
506 
507 	if (se == SE_SHARED) {
508 		if (!PP_ISFREE(pp)) {
509 			if (old >= 0) {
510 				/*
511 				 * Readers are not allowed when excl wanted
512 				 */
513 				if ((old & SE_EWANTED) == 0) {
514 					pp->p_selock = old + SE_READER;
515 					mutex_exit(pse);
516 					return (1);
517 				}
518 			}
519 			mutex_exit(pse);
520 			return (0);
521 		}
522 		/*
523 		 * The page is free, so we really want SE_EXCL (below)
524 		 */
525 		VM_STAT_ADD(page_try_reclaim_upgrade);
526 	}
527 
528 	/*
529 	 * The caller wants a writer lock.  We try for it only if
530 	 * SE_EWANTED is not set, or if the caller specified
531 	 * SE_EXCL_WANTED.
532 	 */
533 	if (!(old & SE_EWANTED) || (es & SE_EXCL_WANTED)) {
534 		if ((old & ~SE_EWANTED) == 0) {
535 			/* no reader/writer lock held */
536 			THREAD_KPRI_REQUEST();
537 			/* this clears out our setting of the SE_EWANTED bit */
538 			pp->p_selock = SE_WRITER;
539 			mutex_exit(pse);
540 			return (1);
541 		}
542 	}
543 	if (es & SE_EXCL_WANTED) {
544 		/* page is locked, set the SE_EWANTED bit */
545 		pp->p_selock |= SE_EWANTED;
546 	}
547 	mutex_exit(pse);
548 	return (0);
549 }
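
/*
 * Illustrative retry pattern (editor's addition, not from the original
 * source), following the comment above: a hypothetical thread that must
 * eventually win the exclusive lock keeps SE_EWANTED set across retries and
 * clears it if it gives up.  must_give_up() is a hypothetical predicate.
 *
 *	while (!page_try_reclaim_lock(pp, SE_EXCL, SE_EXCL_WANTED)) {
 *		if (must_give_up()) {
 *			page_lock_clr_exclwanted(pp);
 *			return;
 *		}
 *		delay(1);	(back off for a tick, then retry)
 *	}
 *	... the page is now exclusively locked; SE_EWANTED was cleared ...
 */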
550 
551 /*
552  * Acquire a page's "shared/exclusive" lock, but never block.
553  * Returns 1 on success, 0 on failure.
554  */
555 int
556 page_trylock(page_t *pp, se_t se)
557 {
558 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
559 
560 	mutex_enter(pse);
561 	if (pp->p_selock & SE_EWANTED || PP_RETIRED(pp) ||
562 	    (se == SE_SHARED && PP_PR_NOSHARE(pp))) {
563 		/*
564 		 * Fail if another thread wants exclusive access (SE_EWANTED
565 		 * is set), if the page is retired, or if a share lock is
566 		 * requested on a page that is slated for retirement.
567 		 */
568 		mutex_exit(pse);
569 		VM_STAT_ADD(page_trylock_failed);
570 		return (0);
571 	}
572 
573 	if (se == SE_EXCL) {
574 		if (pp->p_selock == 0) {
575 			THREAD_KPRI_REQUEST();
576 			pp->p_selock = SE_WRITER;
577 			mutex_exit(pse);
578 			return (1);
579 		}
580 	} else {
581 		if (pp->p_selock >= 0) {
582 			pp->p_selock += SE_READER;
583 			mutex_exit(pse);
584 			return (1);
585 		}
586 	}
587 	mutex_exit(pse);
588 	return (0);
589 }
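
/*
 * Usage sketch (editor's illustration, not from the original source):
 * page_trylock() is for contexts where blocking on p_selock is not
 * acceptable, e.g. when the caller already holds locks that a sleeping
 * page_lock() could deadlock against.
 *
 *	if (page_trylock(pp, SE_EXCL)) {
 *		... operate on the exclusively locked page ...
 *		page_unlock(pp);
 *	} else {
 *		... must not block here; back off and try again later ...
 *	}
 */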
590 
591 /*
592  * Variant of page_unlock() specifically for the page freelist
593  * code. The mere existence of this code is a vile hack that
594  * results from the backwards locking order of the page
595  * freelist manager; please don't call it.
596  */
597 void
598 page_unlock_nocapture(page_t *pp)
599 {
600 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
601 	selock_t old;
602 
603 	mutex_enter(pse);
604 
605 	old = pp->p_selock;
606 	if ((old & ~SE_EWANTED) == SE_READER) {
607 		pp->p_selock = old & ~SE_READER;
608 		if (CV_HAS_WAITERS(&pp->p_cv))
609 			cv_broadcast(&pp->p_cv);
610 	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
611 		panic("page_unlock_nocapture: page %p is deleted", pp);
612 	} else if (old < 0) {
613 		THREAD_KPRI_RELEASE();
614 		pp->p_selock &= SE_EWANTED;
615 		if (CV_HAS_WAITERS(&pp->p_cv))
616 			cv_broadcast(&pp->p_cv);
617 	} else if ((old & ~SE_EWANTED) > SE_READER) {
618 		pp->p_selock = old - SE_READER;
619 	} else {
620 		panic("page_unlock_nocapture: page %p is not locked", pp);
621 	}
622 
623 	mutex_exit(pse);
624 }
625 
626 /*
627  * Release the page's "shared/exclusive" lock and wake up anyone
628  * who might be waiting for it.
629  */
630 void
631 page_unlock(page_t *pp)
632 {
633 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
634 	selock_t old;
635 
636 	mutex_enter(pse);
637 
638 	old = pp->p_selock;
639 	if ((old & ~SE_EWANTED) == SE_READER) {
640 		pp->p_selock = old & ~SE_READER;
641 		if (CV_HAS_WAITERS(&pp->p_cv))
642 			cv_broadcast(&pp->p_cv);
643 	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
644 		panic("page_unlock: page %p is deleted", pp);
645 	} else if (old < 0) {
646 		THREAD_KPRI_RELEASE();
647 		pp->p_selock &= SE_EWANTED;
648 		if (CV_HAS_WAITERS(&pp->p_cv))
649 			cv_broadcast(&pp->p_cv);
650 	} else if ((old & ~SE_EWANTED) > SE_READER) {
651 		pp->p_selock = old - SE_READER;
652 	} else {
653 		panic("page_unlock: page %p is not locked", pp);
654 	}
655 
656 	if (pp->p_selock == 0) {
657 		/*
658 		 * not try to capture the page again, as we could recurse,
659 		 * which could lead to a stack overflow panic or spending a
660 		 * which could lead to a stack overflow panic or spending a
661 		 * relatively long time in the kernel making no progress.
662 		 */
663 		if ((pp->p_toxic & PR_CAPTURE) &&
664 		    !(curthread->t_flag & T_CAPTURING) &&
665 		    !PP_RETIRED(pp)) {
666 			THREAD_KPRI_REQUEST();
667 			pp->p_selock = SE_WRITER;
668 			mutex_exit(pse);
669 			page_unlock_capture(pp);
670 		} else {
671 			mutex_exit(pse);
672 		}
673 	} else {
674 		mutex_exit(pse);
675 	}
676 }
677 
678 /*
679  * Try to upgrade the lock on the page from a "shared" to an
680  * "exclusive" lock.  Since this upgrade operation is done while
681  * holding the mutex protecting this page, no one else can acquire this page's
682  * lock and change the page. Thus, it is safe to drop the "shared"
683  * lock and attempt to acquire the "exclusive" lock.
684  *
685  * Returns 1 on success, 0 on failure.
686  */
687 int
688 page_tryupgrade(page_t *pp)
689 {
690 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
691 
692 	mutex_enter(pse);
693 	if (!(pp->p_selock & SE_EWANTED)) {
694 		/* no threads want exclusive access, try upgrade */
695 		if (pp->p_selock == SE_READER) {
696 			THREAD_KPRI_REQUEST();
697 			/* convert to exclusive lock */
698 			pp->p_selock = SE_WRITER;
699 			mutex_exit(pse);
700 			return (1);
701 		}
702 	}
703 	mutex_exit(pse);
704 	return (0);
705 }
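
/*
 * Usage sketch (editor's illustration, not from the original source): a
 * hypothetical caller holding the shared lock discovers it needs exclusive
 * access.  If the upgrade fails, the usual fallback is to drop the shared
 * lock and reacquire exclusively, after which the page identity must be
 * revalidated because the page was briefly unlocked.
 *
 *	if (!page_tryupgrade(pp)) {
 *		page_unlock(pp);
 *		if (!page_lock(pp, SE_EXCL, NULL, P_NO_RECLAIM)) {
 *			... lost the race; start over ...
 *		}
 *		... revalidate [vp, off] before relying on pp ...
 *	}
 */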
706 
707 /*
708  * Downgrade the "exclusive" lock on the page to a "shared" lock
709  * while holding the mutex protecting this page's p_selock field.
710  */
711 void
712 page_downgrade(page_t *pp)
713 {
714 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
715 	int excl_waiting;
716 
717 	ASSERT((pp->p_selock & ~SE_EWANTED) != SE_DELETED);
718 	ASSERT(PAGE_EXCL(pp));
719 
720 	mutex_enter(pse);
721 	excl_waiting =  pp->p_selock & SE_EWANTED;
722 	THREAD_KPRI_RELEASE();
723 	pp->p_selock = SE_READER | excl_waiting;
724 	if (CV_HAS_WAITERS(&pp->p_cv))
725 		cv_broadcast(&pp->p_cv);
726 	mutex_exit(pse);
727 }
728 
729 void
730 page_lock_delete(page_t *pp)
731 {
732 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
733 
734 	ASSERT(PAGE_EXCL(pp));
735 	ASSERT(pp->p_vnode == NULL);
736 	ASSERT(pp->p_offset == (u_offset_t)-1);
737 	ASSERT(!PP_ISFREE(pp));
738 
739 	mutex_enter(pse);
740 	THREAD_KPRI_RELEASE();
741 	pp->p_selock = SE_DELETED;
742 	if (CV_HAS_WAITERS(&pp->p_cv))
743 		cv_broadcast(&pp->p_cv);
744 	mutex_exit(pse);
745 }
746 
747 int
748 page_deleted(page_t *pp)
749 {
750 	return (pp->p_selock == SE_DELETED);
751 }
752 
753 /*
754  * Implement the io lock for pages
755  */
756 void
757 page_iolock_init(page_t *pp)
758 {
759 	pp->p_iolock_state = 0;
760 	cv_init(&pp->p_io_cv, NULL, CV_DEFAULT, NULL);
761 }
762 
763 /*
764  * Acquire the i/o lock on a page.
765  */
766 void
767 page_io_lock(page_t *pp)
768 {
769 	kmutex_t *pio;
770 
771 	pio = PAGE_IO_MUTEX(pp);
772 	mutex_enter(pio);
773 	while (pp->p_iolock_state & PAGE_IO_INUSE) {
774 		cv_wait(&(pp->p_io_cv), pio);
775 	}
776 	pp->p_iolock_state |= PAGE_IO_INUSE;
777 	mutex_exit(pio);
778 }
779 
780 /*
781  * Release the i/o lock on a page.
782  */
783 void
784 page_io_unlock(page_t *pp)
785 {
786 	kmutex_t *pio;
787 
788 	pio = PAGE_IO_MUTEX(pp);
789 	mutex_enter(pio);
790 	cv_broadcast(&pp->p_io_cv);
791 	pp->p_iolock_state &= ~PAGE_IO_INUSE;
792 	mutex_exit(pio);
793 }
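
/*
 * Usage sketch (editor's illustration, not from the original source): the
 * i/o lock serializes i/o on a page.  A hypothetical caller brackets the
 * transfer with page_io_lock()/page_io_unlock(); waiters are woken by the
 * cv_broadcast() in page_io_unlock().
 *
 *	page_io_lock(pp);
 *	... issue the i/o on pp and wait for it to complete ...
 *	page_io_unlock(pp);
 */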
794 
795 /*
796  * Try to acquire the i/o lock on a page without blocking.
797  * Returns 1 on success, 0 on failure.
798  */
799 int
800 page_io_trylock(page_t *pp)
801 {
802 	kmutex_t *pio;
803 
804 	if (pp->p_iolock_state & PAGE_IO_INUSE)
805 		return (0);
806 
807 	pio = PAGE_IO_MUTEX(pp);
808 	mutex_enter(pio);
809 
810 	if (pp->p_iolock_state & PAGE_IO_INUSE) {
811 		mutex_exit(pio);
812 		return (0);
813 	}
814 	pp->p_iolock_state |= PAGE_IO_INUSE;
815 	mutex_exit(pio);
816 
817 	return (1);
818 }
819 
820 /*
821  * Wait until the i/o lock is not held.
822  */
823 void
824 page_io_wait(page_t *pp)
825 {
826 	kmutex_t *pio;
827 
828 	pio = PAGE_IO_MUTEX(pp);
829 	mutex_enter(pio);
830 	while (pp->p_iolock_state & PAGE_IO_INUSE) {
831 		cv_wait(&(pp->p_io_cv), pio);
832 	}
833 	mutex_exit(pio);
834 }
835 
836 /*
837  * Returns non-zero if the page's i/o lock is held, 0 otherwise.
838  */
839 int
840 page_io_locked(page_t *pp)
841 {
842 	return (pp->p_iolock_state & PAGE_IO_INUSE);
843 }
844 
845 /*
846  * Assert that the i/o lock on a page is held.
847  * Returns non-zero if the i/o lock is held, 0 otherwise.
848  */
849 int
850 page_iolock_assert(page_t *pp)
851 {
852 	return (page_io_locked(pp));
853 }
854 
855 /*
856  * Wrapper exported to kernel routines that are built
857  * platform-independent (the hashing is platform-dependent;
858  * the size of vph_mutex[] is based on VP_SHIFT).
859  *
860  * Note that you can do stress testing on this by setting the
861  * variable page_vnode_mutex_stress to something other than
862  * zero in a DEBUG kernel in a debugger after loading the kernel.
863  * Setting it after the kernel is running may not work correctly.
864  */
865 #ifdef DEBUG
866 static int page_vnode_mutex_stress = 0;
867 #endif
868 
869 kmutex_t *
870 page_vnode_mutex(vnode_t *vp)
871 {
872 	if (vp == &kvp)
873 		return (&vph_mutex[VPH_TABLE_SIZE + 0]);
874 
875 	if (vp == &zvp)
876 		return (&vph_mutex[VPH_TABLE_SIZE + 1]);
877 #ifdef DEBUG
878 	if (page_vnode_mutex_stress != 0)
879 		return (&vph_mutex[0]);
880 #endif
881 
882 	return (&vph_mutex[VP_HASH_FUNC(vp)]);
883 }
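
/*
 * Usage sketch (editor's illustration, not from the original source): the
 * vnode chain mutex protects v_pages and the p_vpnext/p_vpprev links, so a
 * hypothetical walker of a vnode's page list holds it for the whole walk.
 * This assumes, as elsewhere in the VM code, that v_pages is a circular list.
 *
 *	kmutex_t *vphm = page_vnode_mutex(vp);
 *	page_t *pp;
 *
 *	mutex_enter(vphm);
 *	if ((pp = vp->v_pages) != NULL) {
 *		do {
 *			... examine pp; do not block while holding vphm ...
 *		} while ((pp = pp->p_vpnext) != vp->v_pages);
 *	}
 *	mutex_exit(vphm);
 */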
884 
885 kmutex_t *
886 page_se_mutex(page_t *pp)
887 {
888 	return (PAGE_SE_MUTEX(pp));
889 }
890 
891 #ifdef VM_STATS
892 uint_t pszclck_stat[4];
893 #endif
894 /*
895  * Find, take and return a mutex held by hat_page_demote().
896  * Called by page_demote_vp_pages() before hat_page_demote() call and by
897  * routines that want to block hat_page_demote() but can't do it
898  * via locking all constituent pages.
899  *
900  * Return NULL if p_szc is 0.
901  *
902  * It should only be used for pages that can be demoted by hat_page_demote()
903  * i.e. non swapfs file system pages.  The logic here is lifted from
904  * sfmmu_mlspl_enter() except there's no need to worry about p_szc increase
905  * since the page is locked and not free.
906  *
907  * Hash of the root page is used to find the lock.
908  * To find the root in the presense of hat_page_demote() chageing the location
909  * of the root this routine relies on the fact that hat_page_demote() changes
910  * root last.
911  *
912  * If NULL is returned pp's p_szc is guaranteed to be 0. If non NULL is
913  * returned pp's p_szc may be any value.
914  */
915 kmutex_t *
916 page_szc_lock(page_t *pp)
917 {
918 	kmutex_t	*mtx;
919 	page_t		*rootpp;
920 	uint_t		szc;
921 	uint_t		rszc;
922 	uint_t		pszc = pp->p_szc;
923 
924 	ASSERT(pp != NULL);
925 	ASSERT(PAGE_LOCKED(pp));
926 	ASSERT(!PP_ISFREE(pp));
927 	ASSERT(pp->p_vnode != NULL);
928 	ASSERT(!IS_SWAPFSVP(pp->p_vnode));
929 	ASSERT(!PP_ISKAS(pp));
930 
931 again:
932 	if (pszc == 0) {
933 		VM_STAT_ADD(pszclck_stat[0]);
934 		return (NULL);
935 	}
936 
937 	/* The lock lives in the root page */
938 
939 	rootpp = PP_GROUPLEADER(pp, pszc);
940 	mtx = PAGE_SZC_MUTEX(rootpp);
941 	mutex_enter(mtx);
942 
943 	/*
944 	 * Since p_szc can only decrease, if pp == rootpp then rootpp
945 	 * will always be the same, i.e., we have the right root
946 	 * regardless of rootpp->p_szc.
947 	 * If the location of pp's root didn't change after we took
948 	 * the lock, we have the right root; return the mutex hashed off it.
949 	 */
950 	if (pp == rootpp || (rszc = rootpp->p_szc) == pszc) {
951 		VM_STAT_ADD(pszclck_stat[1]);
952 		return (mtx);
953 	}
954 
955 	/*
956 	 * root location changed because page got demoted.
957 	 * locate the new root.
958 	 */
959 	if (rszc < pszc) {
960 		szc = pp->p_szc;
961 		ASSERT(szc < pszc);
962 		mutex_exit(mtx);
963 		pszc = szc;
964 		VM_STAT_ADD(pszclck_stat[2]);
965 		goto again;
966 	}
967 
968 	VM_STAT_ADD(pszclck_stat[3]);
969 	/*
970 	 * current hat_page_demote not done yet.
971 	 * wait for it to finish.
972 	 */
973 	mutex_exit(mtx);
974 	rootpp = PP_GROUPLEADER(rootpp, rszc);
975 	mtx = PAGE_SZC_MUTEX(rootpp);
976 	mutex_enter(mtx);
977 	mutex_exit(mtx);
978 	ASSERT(rootpp->p_szc < rszc);
979 	goto again;
980 }
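
/*
 * Usage sketch (editor's illustration, not from the original source): a
 * hypothetical caller that wants to hold off hat_page_demote() on a large
 * page without locking every constituent page.
 *
 *	kmutex_t *szcmtx = page_szc_lock(pp);
 *
 *	if (szcmtx != NULL) {
 *		... further hat_page_demote() of this large page is
 *		... blocked while szcmtx is held ...
 *		mutex_exit(szcmtx);
 *	} else {
 *		... pp->p_szc was already 0; there is nothing to hold off ...
 *	}
 */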
981 
982 int
983 page_szc_lock_assert(page_t *pp)
984 {
985 	page_t *rootpp = PP_PAGEROOT(pp);
986 	kmutex_t *mtx = PAGE_SZC_MUTEX(rootpp);
987 
988 	return (MUTEX_HELD(mtx));
989 }
990 
991 /*
992  * memseg locking
993  */
994 static krwlock_t memsegslock;
995 
996 /*
997  * memlist (phys_install, phys_avail) locking.
998  */
999 static krwlock_t memlists_lock;
1000 
1001 void
1002 memsegs_lock(int writer)
1003 {
1004 	rw_enter(&memsegslock, writer ? RW_WRITER : RW_READER);
1005 }
1006 
1007 /*ARGSUSED*/
1008 void
1009 memsegs_unlock(int writer)
1010 {
1011 	rw_exit(&memsegslock);
1012 }
1013 
1014 int
1015 memsegs_lock_held(void)
1016 {
1017 	return (RW_LOCK_HELD(&memsegslock));
1018 }
1019 
1020 void
1021 memlist_read_lock(void)
1022 {
1023 	rw_enter(&memlists_lock, RW_READER);
1024 }
1025 
1026 void
1027 memlist_read_unlock(void)
1028 {
1029 	rw_exit(&memlists_lock);
1030 }
1031 
1032 void
1033 memlist_write_lock(void)
1034 {
1035 	rw_enter(&memlists_lock, RW_WRITER);
1036 }
1037 
1038 void
1039 memlist_write_unlock(void)
1040 {
1041 	rw_exit(&memlists_lock);
1042 }
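
/*
 * Usage sketch (editor's illustration, not from the original source):
 * readers of the phys_install/phys_avail memlists bracket their walk with
 * the read lock; code that edits the lists (e.g. DR) takes the write lock.
 *
 *	memlist_read_lock();
 *	... walk phys_install or phys_avail ...
 *	memlist_read_unlock();
 */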
1043