xref: /illumos-gate/usr/src/uts/common/vm/page_lock.c (revision ad23a2db4cfc94c0ed1d58554479ce8d2e7e5768)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * VM - page locking primitives
30  */
31 #include <sys/param.h>
32 #include <sys/t_lock.h>
33 #include <sys/vtrace.h>
34 #include <sys/debug.h>
35 #include <sys/cmn_err.h>
36 #include <sys/vnode.h>
37 #include <sys/bitmap.h>
38 #include <sys/lockstat.h>
39 #include <sys/condvar_impl.h>
40 #include <vm/page.h>
41 #include <vm/seg_enum.h>
42 #include <vm/vm_dep.h>
43 
44 /*
45  * This global mutex is for logical page locking.
46  * The following fields in the page structure are protected
47  * by this lock:
48  *
49  *	p_lckcnt
50  *	p_cowcnt
51  */
kmutex_t page_llock;	/* guards p_lckcnt and p_cowcnt in every page_t */

/*
 * This is a global lock for the logical page free list.  The
 * logical free list, in this implementation, is maintained as two
 * separate physical lists - the cache list and the free list.
 * Neither list is manipulated in this file; only the lock itself is
 * defined here.
 */
kmutex_t  page_freelock;
60 
61 /*
62  * The hash table, page_hash[], the p_selock fields, and the
63  * list of pages associated with vnodes are protected by arrays of mutexes.
64  *
65  * Unless the hashes are changed radically, the table sizes must be
66  * a power of two.  Also, we typically need more mutexes for the
67  * vnodes since these locks are occasionally held for long periods.
68  * And since there seem to be two special vnodes (kvp and swapvp),
69  * we make room for private mutexes for them.
70  *
71  * The pse_mutex[] array holds the mutexes to protect the p_selock
72  * fields of all page_t structures.
73  *
74  * PAGE_SE_MUTEX(pp) returns the address of the appropriate mutex
75  * when given a pointer to a page_t.
76  *
77  * PSE_TABLE_SIZE must be a power of two.  One could argue that we
78  * should go to the trouble of setting it up at run time and base it
79  * on memory size rather than the number of compile time CPUs.
80  *
81  * XX64	We should be using physmem size to calculate PSE_TABLE_SIZE,
82  *	PSE_SHIFT, PIO_SHIFT.
83  *
84  *	These might break in 64 bit world.
85  */
86 #define	PSE_SHIFT	7		/* log2(PSE_TABLE_SIZE) */
87 
88 #define	PSE_TABLE_SIZE	128		/* number of mutexes to have */
89 
90 #define	PIO_SHIFT	PSE_SHIFT	/* next power of 2 bigger than page_t */
91 #define	PIO_TABLE_SIZE	PSE_TABLE_SIZE	/* number of io mutexes to have */
92 
93 pad_mutex_t	ph_mutex[PH_TABLE_SIZE];
94 pad_mutex_t	pse_mutex[PSE_TABLE_SIZE];
95 kmutex_t	pio_mutex[PIO_TABLE_SIZE];
96 
/*
 * Hash a page_t pointer to the address of the pse_mutex[] entry that
 * protects its p_selock.  Two right-shifted copies of the pointer are
 * XORed together and masked to the table size.  The whole expansion is
 * parenthesized so that the result can be used directly in a larger
 * expression (e.g. followed by "->").
 */
#define	PAGE_SE_MUTEX(pp) \
	    (&pse_mutex[((((uintptr_t)(pp) >> PSE_SHIFT) ^ \
		((uintptr_t)(pp) >> (PSE_SHIFT << 1))) & \
		(PSE_TABLE_SIZE - 1))].pad_mutex)
101 
/*
 * Hash a page_t pointer to the address of the pio_mutex[] entry that
 * protects its i/o lock state.  The argument is parenthesized before the
 * cast so that an expression argument such as "pp + 1" is converted as a
 * whole, and the expansion is parenthesized so it composes safely.
 */
#define	PAGE_IO_MUTEX(pp) \
	    (&pio_mutex[((uintptr_t)(pp) >> PIO_SHIFT) & (PIO_TABLE_SIZE - 1)])
104 
105 #define	PSZC_MTX_TABLE_SIZE	128
106 #define	PSZC_MTX_TABLE_SHIFT	7
107 
108 static pad_mutex_t	pszc_mutex[PSZC_MTX_TABLE_SIZE];
109 
110 #define	PAGE_SZC_MUTEX(_pp) \
111 	    &pszc_mutex[((((uintptr_t)(_pp) >> PSZC_MTX_TABLE_SHIFT) ^ \
112 		((uintptr_t)(_pp) >> (PSZC_MTX_TABLE_SHIFT << 1)) ^ \
113 		((uintptr_t)(_pp) >> (3 * PSZC_MTX_TABLE_SHIFT))) & \
114 		(PSZC_MTX_TABLE_SIZE - 1))].pad_mutex
115 
116 /*
117  * The vph_mutex[] array  holds the mutexes to protect the vnode chains,
118  * (i.e., the list of pages anchored by v_pages and connected via p_vpprev
119  * and p_vpnext).
120  *
121  * The page_vnode_mutex(vp) function returns the address of the appropriate
122  * mutex from this array given a pointer to a vnode.  It is complicated
123  * by the fact that the kernel's vnode and the swapfs vnode are referenced
124  * frequently enough to warrant their own mutexes.
125  *
126  * The VP_HASH_FUNC returns the index into the vph_mutex array given
127  * an address of a vnode.
128  */
129 
130 /*
131  * XX64	VPH_TABLE_SIZE and VP_HASH_FUNC might break in 64 bit world.
132  *	Need to review again.
133  */
/*
 * Number of hashed entries in vph_mutex[].  VP_SHIFT is not defined in
 * this file.  The value is a power of two, which VP_HASH_FUNC relies on
 * when it masks with (VPH_TABLE_SIZE - 1).
 */
#define	VPH_TABLE_SIZE	(2 << VP_SHIFT)

/*
 * Sum four right-shifted copies of the vnode address (shifts of 6, 8, 10
 * and 12 bits) and mask the result to an index in [0, VPH_TABLE_SIZE).
 */
#define	VP_HASH_FUNC(vp) \
	((((uintptr_t)(vp) >> 6) + \
	    ((uintptr_t)(vp) >> 8) + \
	    ((uintptr_t)(vp) >> 10) + \
	    ((uintptr_t)(vp) >> 12)) \
	    & (VPH_TABLE_SIZE - 1))

extern	struct vnode	kvp;

/*
 * Two slots after VPH_TABLE_SIZE are reserved in vph_mutex for kernel vnodes.
 * The lock for kvp is VPH_TABLE_SIZE + 0, and the lock for zvp is
 * VPH_TABLE_SIZE + 1.
 */

kmutex_t	vph_mutex[VPH_TABLE_SIZE + 2];
152 
/*
 * Initialize the locks used by the Virtual Memory Management system.
 *
 * The body is currently empty: nothing is set up here at run time.
 * The function takes no arguments; "(void)" makes the definition a
 * proper prototype rather than an old-style unprototyped declaration.
 */
void
page_lock_init(void)
{
}
160 
161 /*
162  * At present we only use page ownership to aid debugging, so it's
163  * OK if the owner field isn't exact.  In the 32-bit world two thread ids
164  * can map to the same owner because we just 'or' in 0x80000000 and
165  * then clear the second highest bit, so that (for example) 0x2faced00
166  * and 0xafaced00 both map to 0xafaced00.
167  * In the 64-bit world, p_selock may not be large enough to hold a full
168  * thread pointer.  If we ever need precise ownership (e.g. if we implement
169  * priority inheritance for page locks) then p_selock should become a
170  * uintptr_t and SE_WRITER should be -((uintptr_t)curthread >> 2).
171  */
/*
 * SE_WRITER: value stored in p_selock by an exclusive holder.  It is the
 * current thread pointer with the INT_MIN bit forced on and the SE_EWANTED
 * bit forced off.
 * SE_READER: amount added to p_selock for each shared holder.
 */
#define	SE_WRITER	(((selock_t)(ulong_t)curthread | INT_MIN) & ~SE_EWANTED)
#define	SE_READER	1

/*
 * A page that is deleted must be marked as such using the
 * page_lock_delete() function. The page must be exclusively locked.
 * The SE_DELETED marker is put in p_selock when this function is called.
 * SE_DELETED must be distinct from any SE_WRITER value.
 */
#define	SE_DELETED	(1 | INT_MIN)
182 
#ifdef VM_STATS
/*
 * Statistics counters, compiled in only when VM_STATS is defined.
 */

/* vnode page mutex counters (not updated anywhere in this file) */
uint_t	vph_kvp_count;
uint_t	vph_swapfsvp_count;
uint_t	vph_other;

/* page_lock_es() outcomes */
uint_t	page_lock_count;
uint_t	page_lock_miss;
uint_t	page_lock_miss_lock;
uint_t	page_lock_reclaim;
uint_t	page_lock_bad_reclaim;
uint_t	page_lock_same_page;
uint_t	page_lock_upgrade;
uint_t	page_lock_retired;
uint_t	page_lock_upgrade_failed;
uint_t	page_lock_deleted;

/* page_trylock() / page_try_reclaim_lock() outcomes */
uint_t	page_trylock_locked;
uint_t	page_trylock_failed;
uint_t	page_trylock_missed;

uint_t	page_try_reclaim_upgrade;
#endif /* VM_STATS */
207 
208 /*
209  * Acquire the "shared/exclusive" lock on a page.
210  *
211  * Returns 1 on success and locks the page appropriately.
212  *	   0 on failure and does not lock the page.
213  *
214  * If `lock' is non-NULL, it will be dropped and reacquired in the
215  * failure case.  This routine can block, and if it does
216  * it will always return a failure since the page identity [vp, off]
217  * or state may have changed.
218  */
219 
220 int
221 page_lock(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim)
222 {
223 	return (page_lock_es(pp, se, lock, reclaim, 0));
224 }
225 
226 /*
227  * With the addition of reader-writer lock semantics to page_lock_es,
228  * callers wanting an exclusive (writer) lock may avoid starvation by
229  * shared-lock (reader) holders by setting es to SE_EXCL_WANTED.
230  * In this case, when an exclusive lock cannot be acquired, p_selock's
231  * SE_EWANTED bit is set. Shared-lock (reader) requests are also denied
232  * if the page is slated for retirement.
233  *
234  * The se and es parameters determine if the lock should be granted
235  * based on the following decision table:
236  *
237  * Lock wanted   es flags     p_selock/SE_EWANTED  Action
238  * ----------- -------------- -------------------  ---------
239  * SE_EXCL        any [1][2]   unlocked/any        grant lock, clear SE_EWANTED
240  * SE_EXCL        SE_EWANTED   any lock/any        deny, set SE_EWANTED
241  * SE_EXCL        none         any lock/any        deny
242  * SE_SHARED      n/a [2]        shared/0          grant
243  * SE_SHARED      n/a [2]      unlocked/0          grant
244  * SE_SHARED      n/a            shared/1          deny
245  * SE_SHARED      n/a          unlocked/1          deny
246  * SE_SHARED      n/a              excl/any        deny
247  *
248  * Notes:
249  * [1] The code grants an exclusive lock to the caller and clears the bit
250  *   SE_EWANTED whenever p_selock is unlocked, regardless of the SE_EWANTED
251  *   bit's value.  This was deemed acceptable as we are not concerned about
252  *   exclusive-lock starvation. If this ever becomes an issue, a priority or
253  *   fifo mechanism should also be implemented. Meantime, the thread that
254  *   set SE_EWANTED should be prepared to catch this condition and reset it
255  *
256  * [2] Retired pages may not be locked at any time, regardless of the
257  *   disposition of se, unless the es parameter has SE_RETIRED flag set.
258  *
259  * Notes on values of "es":
260  *
261  *   es & 1: page_lookup_create will attempt page relocation
262  *   es & SE_EXCL_WANTED: caller wants SE_EWANTED set (eg. delete
263  *       memory thread); this prevents reader-starvation of waiting
264  *       writer thread(s) by giving priority to writers over readers.
265  *   es & SE_RETIRED: caller wants to lock pages even if they are
266  *       retired.  Default is to deny the lock if the page is retired.
267  *
268  * And yes, we know, the semantics of this function are too complicated.
269  * It's on the list to be cleaned up.
270  */
271 int
272 page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
273 {
274 	int		retval;
275 	kmutex_t	*pse = PAGE_SE_MUTEX(pp);
276 	int		upgraded;
277 	int		reclaim_it;
278 
279 	ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);
280 
281 	VM_STAT_ADD(page_lock_count);
282 
283 	upgraded = 0;
284 	reclaim_it = 0;
285 
286 	mutex_enter(pse);
287 
288 	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
289 	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));
290 
291 	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
292 		mutex_exit(pse);
293 		VM_STAT_ADD(page_lock_retired);
294 		return (0);
295 	}
296 
297 	if (se == SE_SHARED && es == 1 && pp->p_selock == 0) {
298 		se = SE_EXCL;
299 	}
300 
301 	if ((reclaim == P_RECLAIM) && (PP_ISFREE(pp))) {
302 
303 		reclaim_it = 1;
304 		if (se == SE_SHARED) {
305 			/*
306 			 * This is an interesting situation.
307 			 *
308 			 * Remember that p_free can only change if
309 			 * p_selock < 0.
310 			 * p_free does not depend on our holding `pse'.
311 			 * And, since we hold `pse', p_selock can not change.
312 			 * So, if p_free changes on us, the page is already
313 			 * exclusively held, and we would fail to get p_selock
314 			 * regardless.
315 			 *
316 			 * We want to avoid getting the share
317 			 * lock on a free page that needs to be reclaimed.
318 			 * It is possible that some other thread has the share
319 			 * lock and has left the free page on the cache list.
320 			 * pvn_vplist_dirty() does this for brief periods.
321 			 * If the se_share is currently SE_EXCL, we will fail
322 			 * to acquire p_selock anyway.  Blocking is the
323 			 * right thing to do.
324 			 * If we need to reclaim this page, we must get
325 			 * exclusive access to it, force the upgrade now.
326 			 * Again, we will fail to acquire p_selock if the
327 			 * page is not free and block.
328 			 */
329 			upgraded = 1;
330 			se = SE_EXCL;
331 			VM_STAT_ADD(page_lock_upgrade);
332 		}
333 	}
334 
335 	if (se == SE_EXCL) {
336 		if (!(es & SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) {
337 			/*
338 			 * if the caller wants a writer lock (but did not
339 			 * specify exclusive access), and there is a pending
340 			 * writer that wants exclusive access, return failure
341 			 */
342 			retval = 0;
343 		} else if ((pp->p_selock & ~SE_EWANTED) == 0) {
344 			/* no reader/writer lock held */
345 			THREAD_KPRI_REQUEST();
346 			/* this clears our setting of the SE_EWANTED bit */
347 			pp->p_selock = SE_WRITER;
348 			retval = 1;
349 		} else {
350 			/* page is locked */
351 			if (es & SE_EXCL_WANTED) {
352 				/* set the SE_EWANTED bit */
353 				pp->p_selock |= SE_EWANTED;
354 			}
355 			retval = 0;
356 		}
357 	} else {
358 		retval = 0;
359 		if (pp->p_selock >= 0) {
360 			if ((pp->p_selock & SE_EWANTED) == 0) {
361 				pp->p_selock += SE_READER;
362 				retval = 1;
363 			}
364 		}
365 	}
366 
367 	if (retval == 0) {
368 		if ((pp->p_selock & ~SE_EWANTED) == SE_DELETED) {
369 			VM_STAT_ADD(page_lock_deleted);
370 			mutex_exit(pse);
371 			return (retval);
372 		}
373 
374 #ifdef VM_STATS
375 		VM_STAT_ADD(page_lock_miss);
376 		if (upgraded) {
377 			VM_STAT_ADD(page_lock_upgrade_failed);
378 		}
379 #endif
380 		if (lock) {
381 			VM_STAT_ADD(page_lock_miss_lock);
382 			mutex_exit(lock);
383 		}
384 
385 		/*
386 		 * Now, wait for the page to be unlocked and
387 		 * release the lock protecting p_cv and p_selock.
388 		 */
389 		cv_wait(&pp->p_cv, pse);
390 		mutex_exit(pse);
391 
392 		/*
393 		 * The page identity may have changed while we were
394 		 * blocked.  If we are willing to depend on "pp"
395 		 * still pointing to a valid page structure (i.e.,
396 		 * assuming page structures are not dynamically allocated
397 		 * or freed), we could try to lock the page if its
398 		 * identity hasn't changed.
399 		 *
400 		 * This needs to be measured, since we come back from
401 		 * cv_wait holding pse (the expensive part of this
402 		 * operation) we might as well try the cheap part.
403 		 * Though we would also have to confirm that dropping
404 		 * `lock' did not cause any grief to the callers.
405 		 */
406 		if (lock) {
407 			mutex_enter(lock);
408 		}
409 	} else {
410 		/*
411 		 * We have the page lock.
412 		 * If we needed to reclaim the page, and the page
413 		 * needed reclaiming (ie, it was free), then we
414 		 * have the page exclusively locked.  We may need
415 		 * to downgrade the page.
416 		 */
417 		ASSERT((upgraded) ?
418 		    ((PP_ISFREE(pp)) && PAGE_EXCL(pp)) : 1);
419 		mutex_exit(pse);
420 
421 		/*
422 		 * We now hold this page's lock, either shared or
423 		 * exclusive.  This will prevent its identity from changing.
424 		 * The page, however, may or may not be free.  If the caller
425 		 * requested, and it is free, go reclaim it from the
426 		 * free list.  If the page can't be reclaimed, return failure
427 		 * so that the caller can start all over again.
428 		 *
429 		 * NOTE:page_reclaim() releases the page lock (p_selock)
430 		 *	if it can't be reclaimed.
431 		 */
432 		if (reclaim_it) {
433 			if (!page_reclaim(pp, lock)) {
434 				VM_STAT_ADD(page_lock_bad_reclaim);
435 				retval = 0;
436 			} else {
437 				VM_STAT_ADD(page_lock_reclaim);
438 				if (upgraded) {
439 					page_downgrade(pp);
440 				}
441 			}
442 		}
443 	}
444 	return (retval);
445 }
446 
447 /*
448  * Clear the SE_EWANTED bit from p_selock.  This function allows
449  * callers of page_lock_es and page_try_reclaim_lock to clear
450  * their setting of this bit if they decide they no longer wish
451  * to gain exclusive access to the page.  Currently only
452  * delete_memory_thread uses this when the delete memory
453  * operation is cancelled.
454  */
455 void
456 page_lock_clr_exclwanted(page_t *pp)
457 {
458 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
459 
460 	mutex_enter(pse);
461 	pp->p_selock &= ~SE_EWANTED;
462 	if (CV_HAS_WAITERS(&pp->p_cv))
463 		cv_broadcast(&pp->p_cv);
464 	mutex_exit(pse);
465 }
466 
467 /*
468  * Read the comments inside of page_lock_es() carefully.
469  *
470  * SE_EXCL callers specifying es == SE_EXCL_WANTED will cause the
471  * SE_EWANTED bit of p_selock to be set when the lock cannot be obtained.
472  * This is used by threads subject to reader-starvation (eg. memory delete).
473  *
474  * When a thread using SE_EXCL_WANTED does not obtain the SE_EXCL lock,
475  * it is expected that it will retry at a later time.  Threads that will
476  * not retry the lock *must* call page_lock_clr_exclwanted to clear the
477  * SE_EWANTED bit.  (When a thread using SE_EXCL_WANTED obtains the lock,
478  * the bit is cleared.)
479  */
480 int
481 page_try_reclaim_lock(page_t *pp, se_t se, int es)
482 {
483 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
484 	selock_t old;
485 
486 	mutex_enter(pse);
487 
488 	old = pp->p_selock;
489 
490 	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
491 	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));
492 
493 	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
494 		mutex_exit(pse);
495 		VM_STAT_ADD(page_trylock_failed);
496 		return (0);
497 	}
498 
499 	if (se == SE_SHARED && es == 1 && old == 0) {
500 		se = SE_EXCL;
501 	}
502 
503 	if (se == SE_SHARED) {
504 		if (!PP_ISFREE(pp)) {
505 			if (old >= 0) {
506 				/*
507 				 * Readers are not allowed when excl wanted
508 				 */
509 				if ((old & SE_EWANTED) == 0) {
510 					pp->p_selock = old + SE_READER;
511 					mutex_exit(pse);
512 					return (1);
513 				}
514 			}
515 			mutex_exit(pse);
516 			return (0);
517 		}
518 		/*
519 		 * The page is free, so we really want SE_EXCL (below)
520 		 */
521 		VM_STAT_ADD(page_try_reclaim_upgrade);
522 	}
523 
524 	/*
525 	 * The caller wants a writer lock.  We try for it only if
526 	 * SE_EWANTED is not set, or if the caller specified
527 	 * SE_EXCL_WANTED.
528 	 */
529 	if (!(old & SE_EWANTED) || (es & SE_EXCL_WANTED)) {
530 		if ((old & ~SE_EWANTED) == 0) {
531 			/* no reader/writer lock held */
532 			THREAD_KPRI_REQUEST();
533 			/* this clears out our setting of the SE_EWANTED bit */
534 			pp->p_selock = SE_WRITER;
535 			mutex_exit(pse);
536 			return (1);
537 		}
538 	}
539 	if (es & SE_EXCL_WANTED) {
540 		/* page is locked, set the SE_EWANTED bit */
541 		pp->p_selock |= SE_EWANTED;
542 	}
543 	mutex_exit(pse);
544 	return (0);
545 }
546 
547 /*
548  * Acquire a page's "shared/exclusive" lock, but never block.
549  * Returns 1 on success, 0 on failure.
550  */
551 int
552 page_trylock(page_t *pp, se_t se)
553 {
554 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
555 
556 	mutex_enter(pse);
557 	if (pp->p_selock & SE_EWANTED || PP_RETIRED(pp) ||
558 	    (se == SE_SHARED && PP_PR_NOSHARE(pp))) {
559 		/*
560 		 * Fail if a thread wants exclusive access and page is
561 		 * retired, if the page is slated for retirement, or a
562 		 * share lock is requested.
563 		 */
564 		mutex_exit(pse);
565 		VM_STAT_ADD(page_trylock_failed);
566 		return (0);
567 	}
568 
569 	if (se == SE_EXCL) {
570 		if (pp->p_selock == 0) {
571 			THREAD_KPRI_REQUEST();
572 			pp->p_selock = SE_WRITER;
573 			mutex_exit(pse);
574 			return (1);
575 		}
576 	} else {
577 		if (pp->p_selock >= 0) {
578 			pp->p_selock += SE_READER;
579 			mutex_exit(pse);
580 			return (1);
581 		}
582 	}
583 	mutex_exit(pse);
584 	return (0);
585 }
586 
587 /*
588  * Variant of page_unlock() specifically for the page freelist
589  * code. The mere existence of this code is a vile hack that
590  * has resulted due to the backwards locking order of the page
591  * freelist manager; please don't call it.
592  */
593 void
594 page_unlock_nocapture(page_t *pp)
595 {
596 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
597 	selock_t old;
598 
599 	mutex_enter(pse);
600 
601 	old = pp->p_selock;
602 	if ((old & ~SE_EWANTED) == SE_READER) {
603 		pp->p_selock = old & ~SE_READER;
604 		if (CV_HAS_WAITERS(&pp->p_cv))
605 			cv_broadcast(&pp->p_cv);
606 	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
607 		panic("page_unlock_nocapture: page %p is deleted", pp);
608 	} else if (old < 0) {
609 		THREAD_KPRI_RELEASE();
610 		pp->p_selock &= SE_EWANTED;
611 		if (CV_HAS_WAITERS(&pp->p_cv))
612 			cv_broadcast(&pp->p_cv);
613 	} else if ((old & ~SE_EWANTED) > SE_READER) {
614 		pp->p_selock = old - SE_READER;
615 	} else {
616 		panic("page_unlock_nocapture: page %p is not locked", pp);
617 	}
618 
619 	mutex_exit(pse);
620 }
621 
622 /*
623  * Release the page's "shared/exclusive" lock and wake up anyone
624  * who might be waiting for it.
625  */
626 void
627 page_unlock(page_t *pp)
628 {
629 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
630 	selock_t old;
631 
632 	mutex_enter(pse);
633 
634 	old = pp->p_selock;
635 	if ((old & ~SE_EWANTED) == SE_READER) {
636 		pp->p_selock = old & ~SE_READER;
637 		if (CV_HAS_WAITERS(&pp->p_cv))
638 			cv_broadcast(&pp->p_cv);
639 	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
640 		panic("page_unlock: page %p is deleted", pp);
641 	} else if (old < 0) {
642 		THREAD_KPRI_RELEASE();
643 		pp->p_selock &= SE_EWANTED;
644 		if (CV_HAS_WAITERS(&pp->p_cv))
645 			cv_broadcast(&pp->p_cv);
646 	} else if ((old & ~SE_EWANTED) > SE_READER) {
647 		pp->p_selock = old - SE_READER;
648 	} else {
649 		panic("page_unlock: page %p is not locked", pp);
650 	}
651 
652 	if (pp->p_selock == 0) {
653 		/*
654 		 * If the T_CAPTURING bit is set, that means that we should
655 		 * not try and capture the page again as we could recurse
656 		 * which could lead to a stack overflow panic or spending a
657 		 * relatively long time in the kernel making no progress.
658 		 */
659 		if ((pp->p_toxic & PR_CAPTURE) &&
660 		    !(curthread->t_flag & T_CAPTURING) &&
661 		    !PP_RETIRED(pp)) {
662 			THREAD_KPRI_REQUEST();
663 			pp->p_selock = SE_WRITER;
664 			mutex_exit(pse);
665 			page_unlock_capture(pp);
666 		} else {
667 			mutex_exit(pse);
668 		}
669 	} else {
670 		mutex_exit(pse);
671 	}
672 }
673 
674 /*
675  * Try to upgrade the lock on the page from a "shared" to an
676  * "exclusive" lock.  Since this upgrade operation is done while
677  * holding the mutex protecting this page, no one else can acquire this page's
678  * lock and change the page. Thus, it is safe to drop the "shared"
679  * lock and attempt to acquire the "exclusive" lock.
680  *
681  * Returns 1 on success, 0 on failure.
682  */
683 int
684 page_tryupgrade(page_t *pp)
685 {
686 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
687 
688 	mutex_enter(pse);
689 	if (!(pp->p_selock & SE_EWANTED)) {
690 		/* no threads want exclusive access, try upgrade */
691 		if (pp->p_selock == SE_READER) {
692 			THREAD_KPRI_REQUEST();
693 			/* convert to exclusive lock */
694 			pp->p_selock = SE_WRITER;
695 			mutex_exit(pse);
696 			return (1);
697 		}
698 	}
699 	mutex_exit(pse);
700 	return (0);
701 }
702 
703 /*
704  * Downgrade the "exclusive" lock on the page to a "shared" lock
705  * while holding the mutex protecting this page's p_selock field.
706  */
707 void
708 page_downgrade(page_t *pp)
709 {
710 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
711 	int excl_waiting;
712 
713 	ASSERT((pp->p_selock & ~SE_EWANTED) != SE_DELETED);
714 	ASSERT(PAGE_EXCL(pp));
715 
716 	mutex_enter(pse);
717 	excl_waiting =  pp->p_selock & SE_EWANTED;
718 	THREAD_KPRI_RELEASE();
719 	pp->p_selock = SE_READER | excl_waiting;
720 	if (CV_HAS_WAITERS(&pp->p_cv))
721 		cv_broadcast(&pp->p_cv);
722 	mutex_exit(pse);
723 }
724 
725 void
726 page_lock_delete(page_t *pp)
727 {
728 	kmutex_t *pse = PAGE_SE_MUTEX(pp);
729 
730 	ASSERT(PAGE_EXCL(pp));
731 	ASSERT(pp->p_vnode == NULL);
732 	ASSERT(pp->p_offset == (u_offset_t)-1);
733 	ASSERT(!PP_ISFREE(pp));
734 
735 	mutex_enter(pse);
736 	THREAD_KPRI_RELEASE();
737 	pp->p_selock = SE_DELETED;
738 	if (CV_HAS_WAITERS(&pp->p_cv))
739 		cv_broadcast(&pp->p_cv);
740 	mutex_exit(pse);
741 }
742 
743 int
744 page_deleted(page_t *pp)
745 {
746 	return (pp->p_selock == SE_DELETED);
747 }
748 
749 /*
750  * Implement the io lock for pages
751  */
752 void
753 page_iolock_init(page_t *pp)
754 {
755 	pp->p_iolock_state = 0;
756 	cv_init(&pp->p_io_cv, NULL, CV_DEFAULT, NULL);
757 }
758 
759 /*
760  * Acquire the i/o lock on a page.
761  */
762 void
763 page_io_lock(page_t *pp)
764 {
765 	kmutex_t *pio;
766 
767 	pio = PAGE_IO_MUTEX(pp);
768 	mutex_enter(pio);
769 	while (pp->p_iolock_state & PAGE_IO_INUSE) {
770 		cv_wait(&(pp->p_io_cv), pio);
771 	}
772 	pp->p_iolock_state |= PAGE_IO_INUSE;
773 	mutex_exit(pio);
774 }
775 
776 /*
777  * Release the i/o lock on a page.
778  */
779 void
780 page_io_unlock(page_t *pp)
781 {
782 	kmutex_t *pio;
783 
784 	pio = PAGE_IO_MUTEX(pp);
785 	mutex_enter(pio);
786 	cv_broadcast(&pp->p_io_cv);
787 	pp->p_iolock_state &= ~PAGE_IO_INUSE;
788 	mutex_exit(pio);
789 }
790 
791 /*
792  * Try to acquire the i/o lock on a page without blocking.
793  * Returns 1 on success, 0 on failure.
794  */
795 int
796 page_io_trylock(page_t *pp)
797 {
798 	kmutex_t *pio;
799 
800 	if (pp->p_iolock_state & PAGE_IO_INUSE)
801 		return (0);
802 
803 	pio = PAGE_IO_MUTEX(pp);
804 	mutex_enter(pio);
805 
806 	if (pp->p_iolock_state & PAGE_IO_INUSE) {
807 		mutex_exit(pio);
808 		return (0);
809 	}
810 	pp->p_iolock_state |= PAGE_IO_INUSE;
811 	mutex_exit(pio);
812 
813 	return (1);
814 }
815 
816 /*
817  * Wait until the i/o lock is not held.
818  */
819 void
820 page_io_wait(page_t *pp)
821 {
822 	kmutex_t *pio;
823 
824 	pio = PAGE_IO_MUTEX(pp);
825 	mutex_enter(pio);
826 	while (pp->p_iolock_state & PAGE_IO_INUSE) {
827 		cv_wait(&(pp->p_io_cv), pio);
828 	}
829 	mutex_exit(pio);
830 }
831 
832 /*
833  * Returns 1 on success, 0 on failure.
834  */
835 int
836 page_io_locked(page_t *pp)
837 {
838 	return (pp->p_iolock_state & PAGE_IO_INUSE);
839 }
840 
841 /*
842  * Assert that the i/o lock on a page is held.
843  * Returns 1 on success, 0 on failure.
844  */
845 int
846 page_iolock_assert(page_t *pp)
847 {
848 	return (page_io_locked(pp));
849 }
850 
851 /*
852  * Wrapper exported to kernel routines that are built
853  * platform-independent (the macro is platform-dependent;
854  * the size of vph_mutex[] is based on NCPU).
855  *
856  * Note that you can do stress testing on this by setting the
857  * variable page_vnode_mutex_stress to something other than
858  * zero in a DEBUG kernel in a debugger after loading the kernel.
859  * Setting it after the kernel is running may not work correctly.
860  */
861 #ifdef DEBUG
862 static int page_vnode_mutex_stress = 0;
863 #endif
864 
865 kmutex_t *
866 page_vnode_mutex(vnode_t *vp)
867 {
868 	if (vp == &kvp)
869 		return (&vph_mutex[VPH_TABLE_SIZE + 0]);
870 
871 	if (vp == &zvp)
872 		return (&vph_mutex[VPH_TABLE_SIZE + 1]);
873 #ifdef DEBUG
874 	if (page_vnode_mutex_stress != 0)
875 		return (&vph_mutex[0]);
876 #endif
877 
878 	return (&vph_mutex[VP_HASH_FUNC(vp)]);
879 }
880 
881 kmutex_t *
882 page_se_mutex(page_t *pp)
883 {
884 	return (PAGE_SE_MUTEX(pp));
885 }
886 
887 #ifdef VM_STATS
888 uint_t pszclck_stat[4];
889 #endif
890 /*
891  * Find, take and return a mutex held by hat_page_demote().
892  * Called by page_demote_vp_pages() before hat_page_demote() call and by
893  * routines that want to block hat_page_demote() but can't do it
894  * via locking all constituent pages.
895  *
896  * Return NULL if p_szc is 0.
897  *
898  * It should only be used for pages that can be demoted by hat_page_demote()
899  * i.e. non swapfs file system pages.  The logic here is lifted from
900  * sfmmu_mlspl_enter() except there's no need to worry about p_szc increase
901  * since the page is locked and not free.
902  *
903  * Hash of the root page is used to find the lock.
904  * To find the root in the presense of hat_page_demote() chageing the location
905  * of the root this routine relies on the fact that hat_page_demote() changes
906  * root last.
907  *
908  * If NULL is returned pp's p_szc is guaranteed to be 0. If non NULL is
909  * returned pp's p_szc may be any value.
910  */
911 kmutex_t *
912 page_szc_lock(page_t *pp)
913 {
914 	kmutex_t	*mtx;
915 	page_t		*rootpp;
916 	uint_t		szc;
917 	uint_t		rszc;
918 	uint_t		pszc = pp->p_szc;
919 
920 	ASSERT(pp != NULL);
921 	ASSERT(PAGE_LOCKED(pp));
922 	ASSERT(!PP_ISFREE(pp));
923 	ASSERT(pp->p_vnode != NULL);
924 	ASSERT(!IS_SWAPFSVP(pp->p_vnode));
925 	ASSERT(!PP_ISKAS(pp));
926 
927 again:
928 	if (pszc == 0) {
929 		VM_STAT_ADD(pszclck_stat[0]);
930 		return (NULL);
931 	}
932 
933 	/* The lock lives in the root page */
934 
935 	rootpp = PP_GROUPLEADER(pp, pszc);
936 	mtx = PAGE_SZC_MUTEX(rootpp);
937 	mutex_enter(mtx);
938 
939 	/*
940 	 * since p_szc can only decrease if pp == rootpp
941 	 * rootpp will be always the same i.e we have the right root
942 	 * regardless of rootpp->p_szc.
943 	 * If location of pp's root didn't change after we took
944 	 * the lock we have the right root. return mutex hashed off it.
945 	 */
946 	if (pp == rootpp || (rszc = rootpp->p_szc) == pszc) {
947 		VM_STAT_ADD(pszclck_stat[1]);
948 		return (mtx);
949 	}
950 
951 	/*
952 	 * root location changed because page got demoted.
953 	 * locate the new root.
954 	 */
955 	if (rszc < pszc) {
956 		szc = pp->p_szc;
957 		ASSERT(szc < pszc);
958 		mutex_exit(mtx);
959 		pszc = szc;
960 		VM_STAT_ADD(pszclck_stat[2]);
961 		goto again;
962 	}
963 
964 	VM_STAT_ADD(pszclck_stat[3]);
965 	/*
966 	 * current hat_page_demote not done yet.
967 	 * wait for it to finish.
968 	 */
969 	mutex_exit(mtx);
970 	rootpp = PP_GROUPLEADER(rootpp, rszc);
971 	mtx = PAGE_SZC_MUTEX(rootpp);
972 	mutex_enter(mtx);
973 	mutex_exit(mtx);
974 	ASSERT(rootpp->p_szc < rszc);
975 	goto again;
976 }
977 
978 int
979 page_szc_lock_assert(page_t *pp)
980 {
981 	page_t *rootpp = PP_PAGEROOT(pp);
982 	kmutex_t *mtx = PAGE_SZC_MUTEX(rootpp);
983 
984 	return (MUTEX_HELD(mtx));
985 }
986