xref: /titanic_50/usr/src/uts/common/vm/page_lock.c (revision 8461248208fabd3a8230615f8615e5bf1b4dcdcb)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * VM - page locking primitives
31  */
32 #include <sys/param.h>
33 #include <sys/t_lock.h>
34 #include <sys/vtrace.h>
35 #include <sys/debug.h>
36 #include <sys/cmn_err.h>
37 #include <sys/vnode.h>
38 #include <sys/bitmap.h>
39 #include <sys/lockstat.h>
40 #include <sys/condvar_impl.h>
41 #include <vm/page.h>
42 #include <vm/seg_enum.h>
43 #include <vm/vm_dep.h>
44 
45 /*
46  * Global mutex for logical page locking.  Per this note it protects
47  * the following page structure fields (this file only defines the
48  * mutex; no code visible here acquires it):
49  *
50  *	p_lckcnt
51  *	p_cowcnt
52  */
53 kmutex_t page_llock;
54 
55 /*
56  * Global lock for the logical page free list.  The logical free
57  * list, in this implementation, is maintained as two separate
58  * physical lists - the cache list and the free list.
59  */
60 kmutex_t  page_freelock;
61 
62 /*
63  * The hash table, page_hash[], the p_selock fields, and the
64  * list of pages associated with vnodes are protected by arrays of mutexes.
65  *
66  * Unless the hashes are changed radically, the table sizes must be
67  * a power of two.  Also, we typically need more mutexes for the
68  * vnodes since these locks are occasionally held for long periods.
69  * And since there seem to be two special vnodes (kvp and swapvp),
70  * we make room for private mutexes for them.
71  *
72  * The pse_mutex[] array holds the mutexes to protect the p_selock
73  * fields of all page_t structures.
74  *
75  * PAGE_SE_MUTEX(pp) returns the address of the appropriate mutex
76  * when given a pointer to a page_t.
77  *
78  * PSE_TABLE_SIZE must be a power of two.  One could argue that we
79  * should go to the trouble of setting it up at run time and base it
80  * on memory size rather than the number of compile time CPUs.
81  *
82  * XX64	We should be using physmem size to calculate PSE_TABLE_SIZE,
83  *	PSE_SHIFT, PIO_SHIFT.
84  *
85  *	These might break in 64 bit world.
86  */
87 #define	PSE_SHIFT	7		/* log2(PSE_TABLE_SIZE) */
88 
89 #define	PSE_TABLE_SIZE	128		/* number of mutexes to have */
90 
91 #define	PIO_SHIFT	PSE_SHIFT	/* next power of 2 bigger than page_t */
92 #define	PIO_TABLE_SIZE	PSE_TABLE_SIZE	/* number of io mutexes to have */
93 
94 pad_mutex_t	ph_mutex[PH_TABLE_SIZE];
95 pad_mutex_t	pse_mutex[PSE_TABLE_SIZE];
96 kmutex_t	pio_mutex[PIO_TABLE_SIZE];
97 
98 #define	PAGE_SE_MUTEX(pp) \
99 	    &pse_mutex[((((uintptr_t)(pp) >> PSE_SHIFT) ^ \
100 		((uintptr_t)(pp) >> (PSE_SHIFT << 1))) & \
101 		(PSE_TABLE_SIZE - 1))].pad_mutex
102 
103 #define	PAGE_IO_MUTEX(pp) \
104 	    &pio_mutex[(((uintptr_t)pp) >> PIO_SHIFT) & (PIO_TABLE_SIZE - 1)]
105 
106 #define	PSZC_MTX_TABLE_SIZE	128
107 #define	PSZC_MTX_TABLE_SHIFT	7
108 
109 static pad_mutex_t	pszc_mutex[PSZC_MTX_TABLE_SIZE];
110 
111 #define	PAGE_SZC_MUTEX(_pp) \
112 	    &pszc_mutex[((((uintptr_t)(_pp) >> PSZC_MTX_TABLE_SHIFT) ^ \
113 		((uintptr_t)(_pp) >> (PSZC_MTX_TABLE_SHIFT << 1)) ^ \
114 		((uintptr_t)(_pp) >> (3 * PSZC_MTX_TABLE_SHIFT))) & \
115 		(PSZC_MTX_TABLE_SIZE - 1))].pad_mutex
116 
117 /*
118  * The vph_mutex[] array  holds the mutexes to protect the vnode chains,
119  * (i.e., the list of pages anchored by v_pages and connected via p_vpprev
120  * and p_vpnext).
121  *
122  * The page_vnode_mutex(vp) function returns the address of the appropriate
123  * mutex from this array given a pointer to a vnode.  It is complicated
124  * by the fact that the kernel's vnode and the swapfs vnode are referenced
125  * frequently enough to warrant their own mutexes.
126  *
127  * The VP_HASH_FUNC returns the index into the vph_mutex array given
128  * an address of a vnode.
129  */
130 
131 /*
132  * XX64	VPH_TABLE_SIZE and VP_HASH_FUNC might break in 64 bit world.
133  *	Need to review again.
134  */
/*
 * Number of hashed entries in vph_mutex[].  VP_SHIFT is not defined in
 * this file.  The value is used as a mask below, so it must be a power
 * of two.
 */
135 #define	VPH_TABLE_SIZE	(2 << VP_SHIFT)
136 
/*
 * Hash a vnode address to an index in [0, VPH_TABLE_SIZE): the address
 * shifted right by 6, 8, 10 and 12 bits is summed and then masked.
 */
137 #define	VP_HASH_FUNC(vp) \
138 	((((uintptr_t)(vp) >> 6) + \
139 	    ((uintptr_t)(vp) >> 8) + \
140 	    ((uintptr_t)(vp) >> 10) + \
141 	    ((uintptr_t)(vp) >> 12)) \
142 	    & (VPH_TABLE_SIZE - 1))
143 
/* The kernel's vnode; page_vnode_mutex() gives it a private mutex. */
144 extern	struct vnode	kvp;
145 
/*
 * VPH_TABLE_SIZE hashed mutexes plus two extra slots past the hashed
 * range.  Slot VPH_TABLE_SIZE + 0 is the one page_vnode_mutex() returns
 * for kvp; the second extra slot is not referenced in this file.
 */
146 kmutex_t	vph_mutex[VPH_TABLE_SIZE + 2];
147 
/*
 * Initialize the locks used by the Virtual Memory Management system.
 *
 * Currently a no-op: the body is empty.  The function takes no
 * arguments and returns nothing.  It is defined with a (void)
 * parameter list so that the definition is a real prototype rather
 * than an old-style declarator with unspecified arguments.
 */
void
page_lock_init(void)
{
}
155 
156 /*
157  * At present we only use page ownership to aid debugging, so it's
158  * OK if the owner field isn't exact.  In the 32-bit world two thread ids
159  * can map to the same owner because we just 'or' in 0x80000000 and
160  * then clear the second highest bit, so that (for example) 0x2faced00
161  * and 0xafaced00 both map to 0xafaced00.
162  * In the 64-bit world, p_selock may not be large enough to hold a full
163  * thread pointer.  If we ever need precise ownership (e.g. if we implement
164  * priority inheritance for page locks) then p_selock should become a
165  * uintptr_t and SE_WRITER should be -((uintptr_t)curthread >> 2).
 *
 * SE_WRITER always has INT_MIN set and SE_EWANTED clear.  The lock
 * routines in this file test p_selock < 0 / p_selock >= 0 to tell an
 * exclusively held (or deleted) page from a shared or unlocked one.
 * SE_READER is the amount added to p_selock for each shared holder.
166  */
167 #define	SE_WRITER	(((selock_t)(ulong_t)curthread | INT_MIN) & ~SE_EWANTED)
168 #define	SE_READER	1
169 
170 /*
171  * A page that is deleted must be marked as such using the
172  * page_lock_delete() function. The page must be exclusively locked.
173  * The SE_DELETED marker is put in p_selock when this function is called.
174  * SE_DELETED must be distinct from any SE_WRITER value.
 * page_lock_es() returns failure without waiting when it finds this
 * marker, and page_unlock() panics on it.
175  */
176 #define	SE_DELETED	(1 | INT_MIN)
177 
#ifdef VM_STATS
/*
 * Statistics counters, compiled in only when VM_STATS is defined.
 * The page_lock_*, page_trylock_* and page_try_reclaim_* counters that
 * are bumped in this file are updated through VM_STAT_ADD().
 */

/* vnode page hash counters (not updated anywhere in this file) */
uint_t	vph_kvp_count;
uint_t	vph_swapfsvp_count;
uint_t	vph_other;

/* page_lock_es() counters */
uint_t	page_lock_count;
uint_t	page_lock_miss;
uint_t	page_lock_miss_lock;
uint_t	page_lock_reclaim;
uint_t	page_lock_bad_reclaim;
uint_t	page_lock_same_page;
uint_t	page_lock_upgrade;
uint_t	page_lock_upgrade_failed;
uint_t	page_lock_deleted;

/* page_trylock counters (not updated anywhere in this file) */
uint_t	page_trylock_locked;
uint_t	page_trylock_missed;

/* page_try_reclaim_lock() counter */
uint_t	page_try_reclaim_upgrade;
#endif /* VM_STATS */
200 
201 
202 /*
203  * Acquire the "shared/exclusive" lock on a page.
204  *
205  * Returns 1 on success and locks the page appropriately.
206  *	   0 on failure and does not lock the page.
207  *
208  * If `lock' is non-NULL, it will be dropped and reacquired in the
209  * failure case.  This routine can block, and if it does
210  * it will always return a failure since the page identity [vp, off]
211  * or state may have changed.
212  */
213 
214 int
215 page_lock(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim)
216 {
217 	return (page_lock_es(pp, se, lock, reclaim, 0));
218 }
219 
220 /*
221  * With the addition of reader-writer lock semantics to page_lock_es,
222  * callers wanting an exclusive (writer) lock may prevent shared-lock
223  * (reader) starvation by setting the es parameter to SE_EXCL_WANTED.
224  * In this case, when an exclusive lock cannot be acquired, p_selock's
225  * SE_EWANTED bit is set.
226  * This bit, along with the se and es parameters, are used to decide
227  * if the requested lock should be granted:
228  *
229  * Lock wanted SE_EXCL_WANTED p_selock/SE_EWANTED  Action
230  * ----------  -------------- -------------------  ---------
231  * SE_EXCL        no           dont-care/1         deny lock
232  * SE_EXCL     any(see note)   unlocked/any        grant lock, clear SE_EWANTED
233  * SE_EXCL        yes          any lock/any        deny, set SE_EWANTED
234  * SE_EXCL        no           any lock/any        deny
235  * SE_SHARED   not applicable    shared/0          grant
236  * SE_SHARED   not applicable  unlocked/0          grant
237  * SE_SHARED   not applicable    shared/1          deny
238  * SE_SHARED   not applicable  unlocked/1          deny
239  * SE_SHARED   not applicable      excl/any        deny
240  *
241  * Note: the code grants an exclusive lock to the caller and clears
242  * SE_EWANTED whenever p_selock is unlocked, regardless of the SE_EWANTED
243  * bit's value.  This was deemed acceptable as we are not concerned about
244  * exclusive-lock starvation. If this ever becomes an issue, a priority or
245  * fifo mechanism should also be implemented.
246  */
247 int
248 page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
249 {
250 	int		retval;
251 	kmutex_t	*pse = PAGE_SE_MUTEX(pp);
252 	int		upgraded;
253 	int		reclaim_it;
254 
255 	ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);
256 
257 	VM_STAT_ADD(page_lock_count);
258 
259 	upgraded = 0;
260 	reclaim_it = 0;
261 
262 	mutex_enter(pse);
263 
264 	/*
265 	 * Current uses of 'es':
266 	 * es == 1 page_lookup_create will attempt page relocation
267 	 * es == SE_EXCL_WANTED caller wants SE_EWANTED set (eg. delete
268 	 * memory thread); this prevents reader-starvation of waiting
269 	 * writer thread(s).
270 	 */
271 
272 
273 	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
274 	    ((es == SE_EXCL_WANTED) && (se == SE_EXCL)));
275 
276 	if (se == SE_SHARED && es == 1 && pp->p_selock == 0) {
277 		se = SE_EXCL;
278 	}
279 
280 	if ((reclaim == P_RECLAIM) && (PP_ISFREE(pp))) {
281 
282 		reclaim_it = 1;
283 		if (se == SE_SHARED) {
284 			/*
285 			 * This is an interesting situation.
286 			 *
287 			 * Remember that p_free can only change if
288 			 * p_selock < 0.
289 			 * p_free does not depend on our holding `pse'.
290 			 * And, since we hold `pse', p_selock can not change.
291 			 * So, if p_free changes on us, the page is already
292 			 * exclusively held, and we would fail to get p_selock
293 			 * regardless.
294 			 *
295 			 * We want to avoid getting the share
296 			 * lock on a free page that needs to be reclaimed.
297 			 * It is possible that some other thread has the share
298 			 * lock and has left the free page on the cache list.
299 			 * pvn_vplist_dirty() does this for brief periods.
300 			 * If the se_share is currently SE_EXCL, we will fail
301 			 * to acquire p_selock anyway.  Blocking is the
302 			 * right thing to do.
303 			 * If we need to reclaim this page, we must get
304 			 * exclusive access to it, force the upgrade now.
305 			 * Again, we will fail to acquire p_selock if the
306 			 * page is not free and block.
307 			 */
308 			upgraded = 1;
309 			se = SE_EXCL;
310 			VM_STAT_ADD(page_lock_upgrade);
311 		}
312 	}
313 
314 	if (se == SE_EXCL) {
315 		if ((es != SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) {
316 			/*
317 			 * if the caller wants a writer lock (but did not
318 			 * specify exclusive access), and there is a pending
319 			 * writer that wants exclusive access, return failure
320 			 */
321 			retval = 0;
322 		} else if ((pp->p_selock & ~SE_EWANTED) == 0) {
323 			/* no reader/writer lock held */
324 			THREAD_KPRI_REQUEST();
325 			/* this clears our setting of the SE_EWANTED bit */
326 			pp->p_selock = SE_WRITER;
327 			retval = 1;
328 		} else {
329 			/* page is locked */
330 			if (es == SE_EXCL_WANTED) {
331 				/* set the SE_EWANTED bit */
332 				pp->p_selock |= SE_EWANTED;
333 			}
334 			retval = 0;
335 		}
336 	} else {
337 		retval = 0;
338 		if (pp->p_selock >= 0) {
339 			/* readers are not allowed when excl wanted */
340 			if (!(pp->p_selock & SE_EWANTED)) {
341 				pp->p_selock += SE_READER;
342 				retval = 1;
343 			}
344 		}
345 	}
346 
347 	if (retval == 0) {
348 		if ((pp->p_selock & ~SE_EWANTED) == SE_DELETED) {
349 			VM_STAT_ADD(page_lock_deleted);
350 			mutex_exit(pse);
351 			return (retval);
352 		}
353 
354 #ifdef VM_STATS
355 		VM_STAT_ADD(page_lock_miss);
356 		if (upgraded) {
357 			VM_STAT_ADD(page_lock_upgrade_failed);
358 		}
359 #endif
360 		if (lock) {
361 			VM_STAT_ADD(page_lock_miss_lock);
362 			mutex_exit(lock);
363 		}
364 
365 		/*
366 		 * Now, wait for the page to be unlocked and
367 		 * release the lock protecting p_cv and p_selock.
368 		 */
369 		cv_wait(&pp->p_cv, pse);
370 		mutex_exit(pse);
371 
372 		/*
373 		 * The page identity may have changed while we were
374 		 * blocked.  If we are willing to depend on "pp"
375 		 * still pointing to a valid page structure (i.e.,
376 		 * assuming page structures are not dynamically allocated
377 		 * or freed), we could try to lock the page if its
378 		 * identity hasn't changed.
379 		 *
380 		 * This needs to be measured, since we come back from
381 		 * cv_wait holding pse (the expensive part of this
382 		 * operation) we might as well try the cheap part.
383 		 * Though we would also have to confirm that dropping
384 		 * `lock' did not cause any grief to the callers.
385 		 */
386 		if (lock) {
387 			mutex_enter(lock);
388 		}
389 	} else {
390 		/*
391 		 * We have the page lock.
392 		 * If we needed to reclaim the page, and the page
393 		 * needed reclaiming (ie, it was free), then we
394 		 * have the page exclusively locked.  We may need
395 		 * to downgrade the page.
396 		 */
397 		ASSERT((upgraded) ?
398 		    ((PP_ISFREE(pp)) && PAGE_EXCL(pp)) : 1);
399 		mutex_exit(pse);
400 
401 		/*
402 		 * We now hold this page's lock, either shared or
403 		 * exclusive.  This will prevent its identity from changing.
404 		 * The page, however, may or may not be free.  If the caller
405 		 * requested, and it is free, go reclaim it from the
406 		 * free list.  If the page can't be reclaimed, return failure
407 		 * so that the caller can start all over again.
408 		 *
409 		 * NOTE:page_reclaim() releases the page lock (p_selock)
410 		 *	if it can't be reclaimed.
411 		 */
412 		if (reclaim_it) {
413 			if (!page_reclaim(pp, lock)) {
414 				VM_STAT_ADD(page_lock_bad_reclaim);
415 				retval = 0;
416 			} else {
417 				VM_STAT_ADD(page_lock_reclaim);
418 				if (upgraded) {
419 					page_downgrade(pp);
420 				}
421 			}
422 		}
423 	}
424 	return (retval);
425 }
426 
427 /*
428  * Clear the SE_EWANTED bit from p_selock.  This function allows
429  * callers of page_lock_es and page_try_reclaim_lock to clear
430  * their setting of this bit if they decide they no longer wish
431  * to gain exclusive access to the page.  Currently only
432  * delete_memory_thread uses this when the delete memory
433  * operation is cancelled.
434  */
435 void
436 page_lock_clr_exclwanted(page_t *pp)
437 {
438 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
439 
440 	mutex_enter(pse);
441 	pp->p_selock &= ~SE_EWANTED;
442 	if (CV_HAS_WAITERS(&pp->p_cv))
443 		cv_broadcast(&pp->p_cv);
444 	mutex_exit(pse);
445 }
446 
447 /*
448  * Read the comments inside of page_lock_es() carefully.
449  *
450  * SE_EXCL callers specifying es == SE_EXCL_WANTED will cause the
451  * SE_EWANTED bit of p_selock to be set when the lock cannot be obtained.
452  * This is used by threads subject to reader-starvation (eg. memory delete).
453  *
454  * When a thread using SE_EXCL_WANTED does not obtain the SE_EXCL lock,
455  * it is expected that it will retry at a later time.  Threads that will
456  * not retry the lock *must* call page_lock_clr_exclwanted to clear the
457  * SE_EWANTED bit.  (When a thread using SE_EXCL_WANTED obtains the lock,
458  * the bit is cleared.)
459  */
460 int
461 page_try_reclaim_lock(page_t *pp, se_t se, int es)
462 {
463 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
464 	selock_t old;
465 
466 	mutex_enter(pse);
467 
468 	old = pp->p_selock;
469 
470 	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
471 	    ((es == SE_EXCL_WANTED) && (se == SE_EXCL)));
472 
473 	if (se == SE_SHARED && es == 1 && old == 0) {
474 		se = SE_EXCL;
475 	}
476 
477 	if (se == SE_SHARED) {
478 		if (!PP_ISFREE(pp)) {
479 			if (old >= 0) {
480 				/* readers are not allowed when excl wanted */
481 				if (!(old & SE_EWANTED)) {
482 					pp->p_selock = old + SE_READER;
483 					mutex_exit(pse);
484 					return (1);
485 				}
486 			}
487 			mutex_exit(pse);
488 			return (0);
489 		}
490 		/*
491 		 * The page is free, so we really want SE_EXCL (below)
492 		 */
493 		VM_STAT_ADD(page_try_reclaim_upgrade);
494 	}
495 
496 	/*
497 	 * The caller wants a writer lock.  We try for it only if
498 	 * SE_EWANTED is not set, or if the caller specified
499 	 * SE_EXCL_WANTED.
500 	 */
501 	if (!(old & SE_EWANTED) || (es == SE_EXCL_WANTED)) {
502 		if ((old & ~SE_EWANTED) == 0) {
503 			/* no reader/writer lock held */
504 			THREAD_KPRI_REQUEST();
505 			/* this clears out our setting of the SE_EWANTED bit */
506 			pp->p_selock = SE_WRITER;
507 			mutex_exit(pse);
508 			return (1);
509 		}
510 	}
511 	if (es == SE_EXCL_WANTED) {
512 		/* page is locked, set the SE_EWANTED bit */
513 		pp->p_selock |= SE_EWANTED;
514 	}
515 	mutex_exit(pse);
516 	return (0);
517 }
518 
519 /*
520  * Acquire a page's "shared/exclusive" lock, but never block.
521  * Returns 1 on success, 0 on failure.
522  */
523 int
524 page_trylock(page_t *pp, se_t se)
525 {
526 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
527 
528 	mutex_enter(pse);
529 	if (pp->p_selock & SE_EWANTED) {
530 		/* fail if a thread wants exclusive access */
531 		mutex_exit(pse);
532 		return (0);
533 	}
534 
535 	if (se == SE_EXCL) {
536 		if (pp->p_selock == 0) {
537 			THREAD_KPRI_REQUEST();
538 			pp->p_selock = SE_WRITER;
539 			mutex_exit(pse);
540 			return (1);
541 		}
542 	} else {
543 		if (pp->p_selock >= 0) {
544 			pp->p_selock += SE_READER;
545 			mutex_exit(pse);
546 			return (1);
547 		}
548 	}
549 	mutex_exit(pse);
550 	return (0);
551 }
552 
553 /*
554  * Release the page's "shared/exclusive" lock and wake up anyone
555  * who might be waiting for it.
556  */
557 void
558 page_unlock(page_t *pp)
559 {
560 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
561 	selock_t old;
562 
563 	mutex_enter(pse);
564 	old = pp->p_selock;
565 	if ((old & ~SE_EWANTED) == SE_READER) {
566 		pp->p_selock = old & ~SE_READER;
567 		if (CV_HAS_WAITERS(&pp->p_cv))
568 			cv_broadcast(&pp->p_cv);
569 	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
570 		panic("page_unlock: page %p is deleted", pp);
571 	} else if (old < 0) {
572 		THREAD_KPRI_RELEASE();
573 		pp->p_selock &= SE_EWANTED;
574 		if (CV_HAS_WAITERS(&pp->p_cv))
575 			cv_broadcast(&pp->p_cv);
576 	} else if ((old & ~SE_EWANTED) > SE_READER) {
577 		pp->p_selock = old - SE_READER;
578 	} else {
579 		panic("page_unlock: page %p is not locked", pp);
580 	}
581 	mutex_exit(pse);
582 }
583 
584 /*
585  * Try to upgrade the lock on the page from a "shared" to an
586  * "exclusive" lock.  Since this upgrade operation is done while
587  * holding the mutex protecting this page, no one else can acquire this page's
588  * lock and change the page. Thus, it is safe to drop the "shared"
589  * lock and attempt to acquire the "exclusive" lock.
590  *
591  * Returns 1 on success, 0 on failure.
592  */
593 int
594 page_tryupgrade(page_t *pp)
595 {
596 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
597 
598 	mutex_enter(pse);
599 	if (!(pp->p_selock & SE_EWANTED)) {
600 		/* no threads want exclusive access, try upgrade */
601 		if (pp->p_selock == SE_READER) {
602 			THREAD_KPRI_REQUEST();
603 			/* convert to exclusive lock */
604 			pp->p_selock = SE_WRITER;
605 			mutex_exit(pse);
606 			return (1);
607 		}
608 	}
609 	mutex_exit(pse);
610 	return (0);
611 }
612 
613 /*
614  * Downgrade the "exclusive" lock on the page to a "shared" lock
615  * while holding the mutex protecting this page's p_selock field.
616  */
617 void
618 page_downgrade(page_t *pp)
619 {
620 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
621 	int excl_waiting;
622 
623 	ASSERT((pp->p_selock & ~SE_EWANTED) != SE_DELETED);
624 	ASSERT(PAGE_EXCL(pp));
625 
626 	mutex_enter(pse);
627 	excl_waiting =  pp->p_selock & SE_EWANTED;
628 	THREAD_KPRI_RELEASE();
629 	pp->p_selock = SE_READER | excl_waiting;
630 	if (CV_HAS_WAITERS(&pp->p_cv))
631 		cv_broadcast(&pp->p_cv);
632 	mutex_exit(pse);
633 }
634 
635 void
636 page_lock_delete(page_t *pp)
637 {
638 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
639 
640 	ASSERT(PAGE_EXCL(pp));
641 	ASSERT(pp->p_vnode == NULL);
642 	ASSERT(pp->p_offset == (u_offset_t)-1);
643 	ASSERT(!PP_ISFREE(pp));
644 
645 	mutex_enter(pse);
646 	THREAD_KPRI_RELEASE();
647 	pp->p_selock = SE_DELETED;
648 	if (CV_HAS_WAITERS(&pp->p_cv))
649 		cv_broadcast(&pp->p_cv);
650 	mutex_exit(pse);
651 }
652 
653 /*
654  * Implement the io lock for pages
655  */
656 void
657 page_iolock_init(page_t *pp)
658 {
659 	pp->p_iolock_state = 0;
660 	cv_init(&pp->p_io_cv, NULL, CV_DEFAULT, NULL);
661 }
662 
663 /*
664  * Acquire the i/o lock on a page.
665  */
666 void
667 page_io_lock(page_t *pp)
668 {
669 	kmutex_t *pio;
670 
671 	pio = PAGE_IO_MUTEX(pp);
672 	mutex_enter(pio);
673 	while (pp->p_iolock_state & PAGE_IO_INUSE) {
674 		cv_wait(&(pp->p_io_cv), pio);
675 	}
676 	pp->p_iolock_state |= PAGE_IO_INUSE;
677 	mutex_exit(pio);
678 }
679 
680 /*
681  * Release the i/o lock on a page.
682  */
683 void
684 page_io_unlock(page_t *pp)
685 {
686 	kmutex_t *pio;
687 
688 	pio = PAGE_IO_MUTEX(pp);
689 	mutex_enter(pio);
690 	cv_signal(&pp->p_io_cv);
691 	pp->p_iolock_state &= ~PAGE_IO_INUSE;
692 	mutex_exit(pio);
693 }
694 
695 /*
696  * Try to acquire the i/o lock on a page without blocking.
697  * Returns 1 on success, 0 on failure.
698  */
699 int
700 page_io_trylock(page_t *pp)
701 {
702 	kmutex_t *pio;
703 
704 	if (pp->p_iolock_state & PAGE_IO_INUSE)
705 		return (0);
706 
707 	pio = PAGE_IO_MUTEX(pp);
708 	mutex_enter(pio);
709 
710 	if (pp->p_iolock_state & PAGE_IO_INUSE) {
711 		mutex_exit(pio);
712 		return (0);
713 	}
714 	pp->p_iolock_state |= PAGE_IO_INUSE;
715 	mutex_exit(pio);
716 
717 	return (1);
718 }
719 
720 /*
721  * Assert that the i/o lock on a page is held.
722  * Returns 1 on success, 0 on failure.
723  */
724 int
725 page_iolock_assert(page_t *pp)
726 {
727 	return (pp->p_iolock_state & PAGE_IO_INUSE);
728 }
729 
730 /*
731  * Wrapper exported to kernel routines that are built
732  * platform-independent (the macro is platform-dependent;
733  * the size of vph_mutex[] is based on NCPU).
734  *
735  * Note that you can do stress testing on this by setting the
736  * variable page_vnode_mutex_stress to something other than
737  * zero in a DEBUG kernel in a debugger after loading the kernel.
738  * Setting it after the kernel is running may not work correctly.
739  */
740 #ifdef DEBUG
741 static int page_vnode_mutex_stress = 0;
742 #endif
743 
744 kmutex_t *
745 page_vnode_mutex(vnode_t *vp)
746 {
747 	if (vp == &kvp)
748 		return (&vph_mutex[VPH_TABLE_SIZE + 0]);
749 #ifdef DEBUG
750 	if (page_vnode_mutex_stress != 0)
751 		return (&vph_mutex[0]);
752 #endif
753 
754 	return (&vph_mutex[VP_HASH_FUNC(vp)]);
755 }
756 
757 kmutex_t *
758 page_se_mutex(page_t *pp)
759 {
760 	return (PAGE_SE_MUTEX(pp));
761 }
762 
763 #ifdef VM_STATS
764 uint_t pszclck_stat[4];
765 #endif
766 /*
767  * Find, take and return a mutex held by hat_page_demote().
768  * Called by page_demote_vp_pages() before hat_page_demote() call and by
769  * routines that want to block hat_page_demote() but can't do it
770  * via locking all constituent pages.
771  *
772  * Return NULL if p_szc is 0.
773  *
774  * It should only be used for pages that can be demoted by hat_page_demote()
775  * i.e. non swapfs file system pages.  The logic here is lifted from
776  * sfmmu_mlspl_enter() except there's no need to worry about p_szc increase
777  * since the page is locked and not free.
778  *
779  * Hash of the root page is used to find the lock.
780  * To find the root in the presense of hat_page_demote() chageing the location
781  * of the root this routine relies on the fact that hat_page_demote() changes
782  * root last.
783  *
784  * If NULL is returned pp's p_szc is guaranteed to be 0. If non NULL is
785  * returned pp's p_szc may be any value.
786  */
787 kmutex_t *
788 page_szc_lock(page_t *pp)
789 {
790 	kmutex_t	*mtx;
791 	page_t		*rootpp;
792 	uint_t		szc;
793 	uint_t		rszc;
794 	uint_t		pszc = pp->p_szc;
795 
796 	ASSERT(pp != NULL);
797 	ASSERT(PAGE_LOCKED(pp));
798 	ASSERT(!PP_ISFREE(pp));
799 	ASSERT(pp->p_vnode != NULL);
800 	ASSERT(!IS_SWAPFSVP(pp->p_vnode));
801 	ASSERT(pp->p_vnode != &kvp);
802 
803 again:
804 	if (pszc == 0) {
805 		VM_STAT_ADD(pszclck_stat[0]);
806 		return (NULL);
807 	}
808 
809 	/* The lock lives in the root page */
810 
811 	rootpp = PP_GROUPLEADER(pp, pszc);
812 	mtx = PAGE_SZC_MUTEX(rootpp);
813 	mutex_enter(mtx);
814 
815 	/*
816 	 * since p_szc can only decrease if pp == rootpp
817 	 * rootpp will be always the same i.e we have the right root
818 	 * regardless of rootpp->p_szc.
819 	 * If location of pp's root didn't change after we took
820 	 * the lock we have the right root. return mutex hashed off it.
821 	 */
822 	if (pp == rootpp || (rszc = rootpp->p_szc) == pszc) {
823 		VM_STAT_ADD(pszclck_stat[1]);
824 		return (mtx);
825 	}
826 
827 	/*
828 	 * root location changed because page got demoted.
829 	 * locate the new root.
830 	 */
831 	if (rszc < pszc) {
832 		szc = pp->p_szc;
833 		ASSERT(szc < pszc);
834 		mutex_exit(mtx);
835 		pszc = szc;
836 		VM_STAT_ADD(pszclck_stat[2]);
837 		goto again;
838 	}
839 
840 	VM_STAT_ADD(pszclck_stat[3]);
841 	/*
842 	 * current hat_page_demote not done yet.
843 	 * wait for it to finish.
844 	 */
845 	mutex_exit(mtx);
846 	rootpp = PP_GROUPLEADER(rootpp, rszc);
847 	mtx = PAGE_SZC_MUTEX(rootpp);
848 	mutex_enter(mtx);
849 	mutex_exit(mtx);
850 	ASSERT(rootpp->p_szc < rszc);
851 	goto again;
852 }
853 
854 int
855 page_szc_lock_assert(page_t *pp)
856 {
857 	page_t *rootpp = PP_PAGEROOT(pp);
858 	kmutex_t *mtx = PAGE_SZC_MUTEX(rootpp);
859 
860 	return (MUTEX_HELD(mtx));
861 }
862