/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2019 Joyent, Inc.
 */


/*
 * VM - page locking primitives
 */
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/vtrace.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/bitmap.h>
#include <sys/lockstat.h>
#include <sys/sysmacros.h>
#include <sys/condvar_impl.h>
#include <vm/page.h>
#include <vm/seg_enum.h>
#include <vm/vm_dep.h>
#include <vm/seg_kmem.h>

/*
 * This global mutex array is for logical page locking.
 * The following fields in the page structure are protected
 * by this lock:
 *
 *	p_lckcnt
 *	p_cowcnt
 */
pad_mutex_t page_llocks[8 * NCPU_P2];

/*
 * This is a global lock for the logical page free list.  The
 * logical free list, in this implementation, is maintained as two
 * separate physical lists - the cache list and the free list.
 */
kmutex_t	page_freelock;

/*
 * The hash table, page_hash[], the p_selock fields, and the
 * list of pages associated with vnodes are protected by arrays of mutexes.
 *
 * Unless the hashes are changed radically, the table sizes must be
 * a power of two.  Also, we typically need more mutexes for the
 * vnodes since these locks are occasionally held for long periods.
 * And since there seem to be two special vnodes (kvp and swapvp),
 * we make room for private mutexes for them.
 *
 * The pse_mutex[] array holds the mutexes to protect the p_selock
 * fields of all page_t structures.
 *
 * PAGE_SE_MUTEX(pp) returns the address of the appropriate mutex
 * when given a pointer to a page_t.
 *
 * PIO_TABLE_SIZE must be a power of two.  One could argue that we
 * should go to the trouble of setting it up at run time and base it
 * on memory size rather than the number of compile time CPUs.
 *
 * XX64	We should be using physmem size to calculate PIO_SHIFT.
 *
 *	These might break in a 64-bit world.
 */
#define	PIO_SHIFT	7	/* log2(sizeof(page_t)) */
#define	PIO_TABLE_SIZE	128	/* number of io mutexes to have */

pad_mutex_t	ph_mutex[PH_TABLE_SIZE];
kmutex_t	pio_mutex[PIO_TABLE_SIZE];

#define	PAGE_IO_MUTEX(pp) \
	    &pio_mutex[(((uintptr_t)pp) >> PIO_SHIFT) & (PIO_TABLE_SIZE - 1)]

/*
 * The pse_mutex[] array is allocated in the platform startup code
 * based on the size of the machine at startup.
 */
extern pad_mutex_t	*pse_mutex;	/* Locks protecting pp->p_selock */
extern size_t		pse_table_size;	/* Number of mutexes in pse_mutex[] */
extern int		pse_shift;	/* log2(pse_table_size) */
#define	PAGE_SE_MUTEX(pp)	&pse_mutex[				\
	((((uintptr_t)(pp) >> pse_shift) ^ ((uintptr_t)(pp))) >> 7) &	\
	(pse_table_size - 1)].pad_mutex

#define	PSZC_MTX_TABLE_SIZE	128
#define	PSZC_MTX_TABLE_SHIFT	7

static pad_mutex_t	pszc_mutex[PSZC_MTX_TABLE_SIZE];

#define	PAGE_SZC_MUTEX(_pp) \
	    &pszc_mutex[((((uintptr_t)(_pp) >> PSZC_MTX_TABLE_SHIFT) ^ \
		((uintptr_t)(_pp) >> (PSZC_MTX_TABLE_SHIFT << 1)) ^ \
		((uintptr_t)(_pp) >> (3 * PSZC_MTX_TABLE_SHIFT))) & \
		(PSZC_MTX_TABLE_SIZE - 1))].pad_mutex

/*
 * The vph_mutex[] array holds the mutexes to protect the vnode chains,
 * (i.e., the list of pages anchored by v_pages and connected via p_vpprev
 * and p_vpnext).
 *
 * The page_vnode_mutex(vp) function returns the address of the appropriate
 * mutex from this array given a pointer to a vnode.  It is complicated
 * by the fact that the kernel's vnode and the swapfs vnode are referenced
 * frequently enough to warrant their own mutexes.
 *
 * The VP_HASH_FUNC returns the index into the vph_mutex array given
 * an address of a vnode.
 */

#if defined(_LP64)
#define	VPH_TABLE_SIZE	(8 * NCPU_P2)
#else	/* 32 bits */
#define	VPH_TABLE_SIZE	(2 * NCPU_P2)
#endif

#define	VP_HASH_FUNC(vp) \
	((((uintptr_t)(vp) >> 6) + \
	    ((uintptr_t)(vp) >> 8) + \
	    ((uintptr_t)(vp) >> 10) + \
	    ((uintptr_t)(vp) >> 12)) \
	    & (VPH_TABLE_SIZE - 1))
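
/*
 * Illustrative note: the hash sums four right-shifted copies of the
 * vnode address, so vnodes allocated near each other in memory still
 * spread across the table; the final mask keeps the index within
 * [0, VPH_TABLE_SIZE).
 */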

/*
 * Two slots after VPH_TABLE_SIZE are reserved in vph_mutex for kernel vnodes.
 * The lock for kvp is VPH_TABLE_SIZE + 0, and the lock for zvp is
 * VPH_TABLE_SIZE + 1.
 */

kmutex_t	vph_mutex[VPH_TABLE_SIZE + 2];

/*
 * Initialize the locks used by the Virtual Memory Management system.
 */
void
page_lock_init()
{
}

/*
 * Return a value for pse_shift based on npg (the number of physical pages)
 * and ncpu (the maximum number of CPUs).  This is called by platform startup
 * code.
 *
 * Lockstat data from TPC-H runs showed that contention on the pse_mutex[]
 * locks grew approximately as the square of the number of threads executing.
 * So the primary scaling factor used is NCPU^2.  The size of the machine in
 * megabytes is used as an upper bound, particularly for sun4v machines which
 * all claim to have 256 CPUs maximum, and the old value of PSE_TABLE_SIZE
 * (128) is used as a minimum.  Since the size of the table has to be a power
 * of two, the calculated size is rounded up to the next power of two.
 */
/*ARGSUSED*/
int
size_pse_array(pgcnt_t npg, int ncpu)
{
	size_t size;
	pgcnt_t pp_per_mb = (1024 * 1024) / PAGESIZE;

	size = MAX(128, MIN(npg / pp_per_mb, 2 * ncpu * ncpu));
	size += (1 << (highbit(size) - 1)) - 1;
	return (highbit(size) - 1);
}
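
/*
 * Worked example (illustrative only): on a machine with 16GB of 4K
 * pages, npg / pp_per_mb = 16384, and with ncpu = 64 the CPU term is
 * 2 * 64 * 64 = 8192, so size = MAX(128, MIN(16384, 8192)) = 8192.
 * Since 8192 is already a power of two (2^13), the round-up step
 * leaves it unchanged and the function returns 13, i.e. an 8192-entry
 * pse_mutex[] table.  A non-power-of-two value such as 5000 rounds up
 * the same way: highbit(5000) = 13, so 5000 + 4095 = 9095, and
 * highbit(9095) - 1 = 13, again yielding 8192 entries.
 */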

/*
 * At present we only use page ownership to aid debugging, so it's
 * OK if the owner field isn't exact.  In the 32-bit world two thread ids
 * can map to the same owner because we just 'or' in 0x80000000 and
 * then clear the second highest bit, so that (for example) 0x2faced00
 * and 0xafaced00 both map to 0xafaced00.
 * In the 64-bit world, p_selock may not be large enough to hold a full
 * thread pointer.  If we ever need precise ownership (e.g. if we implement
 * priority inheritance for page locks) then p_selock should become a
 * uintptr_t and SE_WRITER should be -((uintptr_t)curthread >> 2).
 */
#define	SE_WRITER	(((selock_t)(ulong_t)curthread | INT_MIN) & ~SE_EWANTED)
#define	SE_READER	1
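
/*
 * Resulting p_selock states (illustrative summary; SE_EWANTED is a
 * separate flag bit, defined in <vm/page.h>, that is masked out before
 * these comparisons):
 *
 *	0			unlocked
 *	n * SE_READER, n > 0	held shared by n readers
 *	SE_WRITER		held exclusively (negative; the low bits
 *				loosely identify the owner, debug only)
 *	SE_DELETED		page has been deleted
 *
 * Because SE_WRITER and SE_DELETED are negative while reader counts
 * are positive, "p_selock >= 0" is the cheap test for "no writer".
 */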

/*
 * A page that is deleted must be marked as such using the
 * page_lock_delete() function.  The page must be exclusively locked.
 * The SE_DELETED marker is put in p_selock when this function is called.
 * SE_DELETED must be distinct from any SE_WRITER value.
 */
#define	SE_DELETED	(1 | INT_MIN)

#ifdef VM_STATS
uint_t	vph_kvp_count;
uint_t	vph_swapfsvp_count;
uint_t	vph_other;
#endif /* VM_STATS */

#ifdef VM_STATS
uint_t	page_lock_count;
uint_t	page_lock_miss;
uint_t	page_lock_miss_lock;
uint_t	page_lock_reclaim;
uint_t	page_lock_bad_reclaim;
uint_t	page_lock_same_page;
uint_t	page_lock_upgrade;
uint_t	page_lock_retired;
uint_t	page_lock_upgrade_failed;
uint_t	page_lock_deleted;

uint_t	page_trylock_locked;
uint_t	page_trylock_failed;
uint_t	page_trylock_missed;

uint_t	page_try_reclaim_upgrade;
#endif /* VM_STATS */

/*
 * Acquire the "shared/exclusive" lock on a page.
 *
 * Returns 1 on success and locks the page appropriately;
 * returns 0 on failure and does not lock the page.
 *
 * If `lock' is non-NULL, it will be dropped and reacquired in the
 * failure case.  This routine can block, and if it does
 * it will always return a failure since the page identity [vp, off]
 * or state may have changed.
 */

int
page_lock(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim)
{
	return (page_lock_es(pp, se, lock, reclaim, 0));
}
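
/*
 * Hypothetical usage sketch (not a caller in this file): a caller that
 * has found a page under some lookup lock `phm' (e.g. a page hash
 * mutex) and wants it pulled off the free list if it is there might do:
 *
 *	mutex_enter(phm);
 *	pp = ... look up the page ...;
 *	if (!page_lock(pp, SE_EXCL, phm, P_RECLAIM)) {
 *		mutex_exit(phm);
 *		goto retry;	(we may have blocked; the identity
 *				 [vp, off] must be looked up again)
 *	}
 *	mutex_exit(phm);
 *
 * The key point is that page_lock() drops and reacquires `phm' on the
 * failure path, so the lookup is stale and must be redone.
 */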

/*
 * With the addition of reader-writer lock semantics to page_lock_es,
 * callers wanting an exclusive (writer) lock may prevent shared-lock
 * (reader) starvation by setting the es parameter to SE_EXCL_WANTED.
 * In this case, when an exclusive lock cannot be acquired, p_selock's
 * SE_EWANTED bit is set.  Shared-lock (reader) requests are also denied
 * if the page is slated for retirement.
 *
 * The se and es parameters determine if the lock should be granted
 * based on the following decision table:
 *
 * Lock wanted	es flags	p_selock/SE_EWANTED	Action
 * -----------	--------------	-------------------	---------
 * SE_EXCL	any [1][2]	unlocked/any		grant lock, clear SE_EWANTED
 * SE_EXCL	SE_EWANTED	any lock/any		deny, set SE_EWANTED
 * SE_EXCL	none		any lock/any		deny
 * SE_SHARED	n/a [2]		shared/0		grant
 * SE_SHARED	n/a [2]		unlocked/0		grant
 * SE_SHARED	n/a		shared/1		deny
 * SE_SHARED	n/a		unlocked/1		deny
 * SE_SHARED	n/a		excl/any		deny
 *
 * Notes:
 * [1] The code grants an exclusive lock to the caller and clears the bit
 *	SE_EWANTED whenever p_selock is unlocked, regardless of the SE_EWANTED
 *	bit's value.  This was deemed acceptable as we are not concerned about
 *	exclusive-lock starvation.  If this ever becomes an issue, a priority or
 *	fifo mechanism should also be implemented.  Meantime, the thread that
 *	set SE_EWANTED should be prepared to catch this condition and reset it.
 *
 * [2] Retired pages may not be locked at any time, regardless of the
 *	disposition of se, unless the es parameter has SE_RETIRED flag set.
 *
 * Notes on values of "es":
 *
 * es & 1: page_lookup_create will attempt page relocation
 * es & SE_EXCL_WANTED: caller wants SE_EWANTED set (e.g. delete
 * memory thread); this prevents reader-starvation of waiting
 * writer thread(s) by giving priority to writers over readers.
 * es & SE_RETIRED: caller wants to lock pages even if they are
 * retired.  Default is to deny the lock if the page is retired.
 *
 * And yes, we know, the semantics of this function are too complicated.
 * It's on the list to be cleaned up.
 */
int
page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
{
	int		retval;
	kmutex_t	*pse = PAGE_SE_MUTEX(pp);
	int		upgraded;
	int		reclaim_it;

	ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);

	VM_STAT_ADD(page_lock_count);

	upgraded = 0;
	reclaim_it = 0;

	mutex_enter(pse);

	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));

	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
		mutex_exit(pse);
		VM_STAT_ADD(page_lock_retired);
		return (0);
	}

	if (se == SE_SHARED && es == 1 && pp->p_selock == 0) {
		se = SE_EXCL;
	}

	if ((reclaim == P_RECLAIM) && (PP_ISFREE(pp))) {

		reclaim_it = 1;
		if (se == SE_SHARED) {
			/*
			 * This is an interesting situation.
			 *
			 * Remember that p_free can only change if
			 * p_selock < 0.
			 * p_free does not depend on our holding `pse'.
			 * And, since we hold `pse', p_selock can not change.
			 * So, if p_free changes on us, the page is already
			 * exclusively held, and we would fail to get p_selock
			 * regardless.
			 *
			 * We want to avoid getting the share
			 * lock on a free page that needs to be reclaimed.
			 * It is possible that some other thread has the share
			 * lock and has left the free page on the cache list.
			 * pvn_vplist_dirty() does this for brief periods.
			 * If the se_share is currently SE_EXCL, we will fail
			 * to acquire p_selock anyway.  Blocking is the
			 * right thing to do.
			 * If we need to reclaim this page, we must get
			 * exclusive access to it; force the upgrade now.
			 * Again, we will fail to acquire p_selock if the
			 * page is not free and block.
			 */
			upgraded = 1;
			se = SE_EXCL;
			VM_STAT_ADD(page_lock_upgrade);
		}
	}

	if (se == SE_EXCL) {
		if (!(es & SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) {
			/*
			 * if the caller wants a writer lock (but did not
			 * specify exclusive access), and there is a pending
			 * writer that wants exclusive access, return failure
			 */
			retval = 0;
		} else if ((pp->p_selock & ~SE_EWANTED) == 0) {
			/* no reader/writer lock held */
			/* this clears our setting of the SE_EWANTED bit */
			pp->p_selock = SE_WRITER;
			retval = 1;
		} else {
			/* page is locked */
			if (es & SE_EXCL_WANTED) {
				/* set the SE_EWANTED bit */
				pp->p_selock |= SE_EWANTED;
			}
			retval = 0;
		}
	} else {
		retval = 0;
		if (pp->p_selock >= 0) {
			if ((pp->p_selock & SE_EWANTED) == 0) {
				pp->p_selock += SE_READER;
				retval = 1;
			}
		}
	}

	if (retval == 0) {
		if ((pp->p_selock & ~SE_EWANTED) == SE_DELETED) {
			VM_STAT_ADD(page_lock_deleted);
			mutex_exit(pse);
			return (retval);
		}

#ifdef VM_STATS
		VM_STAT_ADD(page_lock_miss);
		if (upgraded) {
			VM_STAT_ADD(page_lock_upgrade_failed);
		}
#endif
		if (lock) {
			VM_STAT_ADD(page_lock_miss_lock);
			mutex_exit(lock);
		}

		/*
		 * Now, wait for the page to be unlocked and
		 * release the lock protecting p_cv and p_selock.
		 */
		cv_wait(&pp->p_cv, pse);
		mutex_exit(pse);

		/*
		 * The page identity may have changed while we were
		 * blocked.  If we are willing to depend on "pp"
		 * still pointing to a valid page structure (i.e.,
		 * assuming page structures are not dynamically allocated
		 * or freed), we could try to lock the page if its
		 * identity hasn't changed.
		 *
		 * This needs to be measured; since we come back from
		 * cv_wait holding pse (the expensive part of this
		 * operation) we might as well try the cheap part.
		 * Though we would also have to confirm that dropping
		 * `lock' did not cause any grief to the callers.
		 */
		if (lock) {
			mutex_enter(lock);
		}
	} else {
		/*
		 * We have the page lock.
		 * If we needed to reclaim the page, and the page
		 * needed reclaiming (i.e., it was free), then we
		 * have the page exclusively locked.  We may need
		 * to downgrade the page.
		 */
		ASSERT((upgraded) ?
		    ((PP_ISFREE(pp)) && PAGE_EXCL(pp)) : 1);
		mutex_exit(pse);

		/*
		 * We now hold this page's lock, either shared or
		 * exclusive.  This will prevent its identity from changing.
		 * The page, however, may or may not be free.  If the caller
		 * requested, and it is free, go reclaim it from the
		 * free list.  If the page can't be reclaimed, return failure
		 * so that the caller can start all over again.
		 *
		 * NOTE: page_reclaim() releases the page lock (p_selock)
		 *	if it can't be reclaimed.
		 */
		if (reclaim_it) {
			if (!page_reclaim(pp, lock)) {
				VM_STAT_ADD(page_lock_bad_reclaim);
				retval = 0;
			} else {
				VM_STAT_ADD(page_lock_reclaim);
				if (upgraded) {
					page_downgrade(pp);
				}
			}
		}
	}
	return (retval);
}

/*
 * Clear the SE_EWANTED bit from p_selock.  This function allows
 * callers of page_lock_es and page_try_reclaim_lock to clear
 * their setting of this bit if they decide they no longer wish
 * to gain exclusive access to the page.  Currently only
 * delete_memory_thread uses this when the delete memory
 * operation is cancelled.
 */
void
page_lock_clr_exclwanted(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	mutex_enter(pse);
	pp->p_selock &= ~SE_EWANTED;
	if (CV_HAS_WAITERS(&pp->p_cv))
		cv_broadcast(&pp->p_cv);
	mutex_exit(pse);
}

/*
 * Read the comments inside of page_lock_es() carefully.
 *
 * SE_EXCL callers specifying es == SE_EXCL_WANTED will cause the
 * SE_EWANTED bit of p_selock to be set when the lock cannot be obtained.
 * This is used by threads subject to reader-starvation (e.g. memory delete).
 *
 * When a thread using SE_EXCL_WANTED does not obtain the SE_EXCL lock,
 * it is expected that it will retry at a later time.  Threads that will
 * not retry the lock *must* call page_lock_clr_exclwanted to clear the
 * SE_EWANTED bit.  (When a thread using SE_EXCL_WANTED obtains the lock,
 * the bit is cleared.)
 */
int
page_try_reclaim_lock(page_t *pp, se_t se, int es)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);

	old = pp->p_selock;

	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));

	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
		mutex_exit(pse);
		VM_STAT_ADD(page_trylock_failed);
		return (0);
	}

	if (se == SE_SHARED && es == 1 && old == 0) {
		se = SE_EXCL;
	}

	if (se == SE_SHARED) {
		if (!PP_ISFREE(pp)) {
			if (old >= 0) {
				/*
				 * Readers are not allowed when excl wanted
				 */
				if ((old & SE_EWANTED) == 0) {
					pp->p_selock = old + SE_READER;
					mutex_exit(pse);
					return (1);
				}
			}
			mutex_exit(pse);
			return (0);
		}
		/*
		 * The page is free, so we really want SE_EXCL (below)
		 */
		VM_STAT_ADD(page_try_reclaim_upgrade);
	}

	/*
	 * The caller wants a writer lock.  We try for it only if
	 * SE_EWANTED is not set, or if the caller specified
	 * SE_EXCL_WANTED.
	 */
	if (!(old & SE_EWANTED) || (es & SE_EXCL_WANTED)) {
		if ((old & ~SE_EWANTED) == 0) {
			/* no reader/writer lock held */
			/* this clears out our setting of the SE_EWANTED bit */
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			return (1);
		}
	}
	if (es & SE_EXCL_WANTED) {
		/* page is locked, set the SE_EWANTED bit */
		pp->p_selock |= SE_EWANTED;
	}
	mutex_exit(pse);
	return (0);
}
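
/*
 * Illustrative sketch of the SE_EXCL_WANTED protocol described above
 * (a hypothetical caller modeled on the memory-delete use, not code
 * from this file):
 *
 *	while (!page_try_reclaim_lock(pp, SE_EXCL, SE_EXCL_WANTED)) {
 *		if (cancelled) {
 *			page_lock_clr_exclwanted(pp);	(mandatory when
 *			return;				 giving up)
 *		}
 *		delay(hz);	(back off, then retry)
 *	}
 *	(on success SE_EWANTED has been cleared for us)
 */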

/*
 * Acquire a page's "shared/exclusive" lock, but never block.
 * Returns 1 on success, 0 on failure.
 */
int
page_trylock(page_t *pp, se_t se)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	mutex_enter(pse);
	if (pp->p_selock & SE_EWANTED || PP_RETIRED(pp) ||
	    (se == SE_SHARED && PP_PR_NOSHARE(pp))) {
		/*
		 * Fail if another thread wants exclusive access
		 * (SE_EWANTED), if the page is retired, or if a shared
		 * lock is requested on a page slated for retirement.
		 */
		mutex_exit(pse);
		VM_STAT_ADD(page_trylock_failed);
		return (0);
	}

	if (se == SE_EXCL) {
		if (pp->p_selock == 0) {
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			return (1);
		}
	} else {
		if (pp->p_selock >= 0) {
			pp->p_selock += SE_READER;
			mutex_exit(pse);
			return (1);
		}
	}
	mutex_exit(pse);
	return (0);
}
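
/*
 * A hypothetical sketch of the usual page_trylock() pattern: callers
 * that already hold a lock ordered after p_selock cannot afford to
 * block, so on failure they unwind and retry in the proper order:
 *
 *	if (!page_trylock(pp, SE_EXCL)) {
 *		drop the conflicting lock(s);
 *		reacquire them and p_selock in the correct order,
 *		or give up and let the caller retry;
 *	}
 */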

/*
 * Variant of page_unlock() specifically for the page freelist
 * code.  The mere existence of this code is a vile hack that
 * exists because of the backwards locking order of the page
 * freelist manager; please don't call it.
 */
void
page_unlock_nocapture(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);

	old = pp->p_selock;
	if ((old & ~SE_EWANTED) == SE_READER) {
		pp->p_selock = old & ~SE_READER;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
		panic("page_unlock_nocapture: page %p is deleted", (void *)pp);
	} else if (old < 0) {
		pp->p_selock &= SE_EWANTED;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) > SE_READER) {
		pp->p_selock = old - SE_READER;
	} else {
		panic("page_unlock_nocapture: page %p is not locked",
		    (void *)pp);
	}

	mutex_exit(pse);
}

/*
 * Release the page's "shared/exclusive" lock and wake up anyone
 * who might be waiting for it.
 */
void
page_unlock(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);

	old = pp->p_selock;
	if ((old & ~SE_EWANTED) == SE_READER) {
		pp->p_selock = old & ~SE_READER;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
		panic("page_unlock: page %p is deleted", (void *)pp);
	} else if (old < 0) {
		pp->p_selock &= SE_EWANTED;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) > SE_READER) {
		pp->p_selock = old - SE_READER;
	} else {
		panic("page_unlock: page %p is not locked", (void *)pp);
	}

	if (pp->p_selock == 0) {
		/*
		 * If the T_CAPTURING bit is set, that means that we should
		 * not try and capture the page again as we could recurse
		 * which could lead to a stack overflow panic or spending a
		 * relatively long time in the kernel making no progress.
		 */
		if ((pp->p_toxic & PR_CAPTURE) &&
		    !(curthread->t_flag & T_CAPTURING) &&
		    !PP_RETIRED(pp)) {
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			page_unlock_capture(pp);
		} else {
			mutex_exit(pse);
		}
	} else {
		mutex_exit(pse);
	}
}

/*
 * Try to upgrade the lock on the page from a "shared" to an
 * "exclusive" lock.  Since this upgrade operation is done while
 * holding the mutex protecting this page, no one else can acquire this page's
 * lock and change the page.  Thus, it is safe to drop the "shared"
 * lock and attempt to acquire the "exclusive" lock.
 *
 * Returns 1 on success, 0 on failure.
 */
int
page_tryupgrade(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	mutex_enter(pse);
	if (!(pp->p_selock & SE_EWANTED)) {
		/* no threads want exclusive access, try upgrade */
		if (pp->p_selock == SE_READER) {
			/* convert to exclusive lock */
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			return (1);
		}
	}
	mutex_exit(pse);
	return (0);
}
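
/*
 * Sketch of the conventional recovery when page_tryupgrade() fails
 * (hypothetical; not a protocol required by this file): capture the
 * identity, drop the shared lock, take an exclusive lock, and then
 * revalidate, since the page may have changed identity while unlocked:
 *
 *	vp = pp->p_vnode;
 *	off = pp->p_offset;
 *	if (!page_tryupgrade(pp)) {
 *		page_unlock(pp);
 *		if (!page_lock(pp, SE_EXCL, NULL, P_NO_RECLAIM) ||
 *		    pp->p_vnode != vp || pp->p_offset != off) {
 *			the page is gone or recycled; start over
 *		}
 *	}
 */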

/*
 * Downgrade the "exclusive" lock on the page to a "shared" lock
 * while holding the mutex protecting this page's p_selock field.
 */
void
page_downgrade(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	int excl_waiting;

	ASSERT((pp->p_selock & ~SE_EWANTED) != SE_DELETED);
	ASSERT(PAGE_EXCL(pp));

	mutex_enter(pse);
	excl_waiting = pp->p_selock & SE_EWANTED;
	pp->p_selock = SE_READER | excl_waiting;
	if (CV_HAS_WAITERS(&pp->p_cv))
		cv_broadcast(&pp->p_cv);
	mutex_exit(pse);
}

void
page_lock_delete(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	ASSERT(PAGE_EXCL(pp));
	ASSERT(pp->p_vnode == NULL);
	ASSERT(pp->p_offset == (u_offset_t)-1);
	ASSERT(!PP_ISFREE(pp));

	mutex_enter(pse);
	pp->p_selock = SE_DELETED;
	if (CV_HAS_WAITERS(&pp->p_cv))
		cv_broadcast(&pp->p_cv);
	mutex_exit(pse);
}

int
page_deleted(page_t *pp)
{
	return (pp->p_selock == SE_DELETED);
}

/*
 * Implement the io lock for pages
 */
void
page_iolock_init(page_t *pp)
{
	pp->p_iolock_state = 0;
	cv_init(&pp->p_io_cv, NULL, CV_DEFAULT, NULL);
}

/*
 * Acquire the i/o lock on a page.
 */
void
page_io_lock(page_t *pp)
{
	kmutex_t *pio;

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);
	while (pp->p_iolock_state & PAGE_IO_INUSE) {
		cv_wait(&(pp->p_io_cv), pio);
	}
	pp->p_iolock_state |= PAGE_IO_INUSE;
	mutex_exit(pio);
}

/*
 * Release the i/o lock on a page.
 */
void
page_io_unlock(page_t *pp)
{
	kmutex_t *pio;

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);
	cv_broadcast(&pp->p_io_cv);
	pp->p_iolock_state &= ~PAGE_IO_INUSE;
	mutex_exit(pio);
}

/*
 * Try to acquire the i/o lock on a page without blocking.
 * Returns 1 on success, 0 on failure.
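 *
 * The initial PAGE_IO_INUSE test below is made without `pio' held; it
 * is only a racy fast-path bail-out, and the state is re-checked under
 * the mutex before it is modified, so the race is benign.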
 */
int
page_io_trylock(page_t *pp)
{
	kmutex_t *pio;

	if (pp->p_iolock_state & PAGE_IO_INUSE)
		return (0);

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);

	if (pp->p_iolock_state & PAGE_IO_INUSE) {
		mutex_exit(pio);
		return (0);
	}
	pp->p_iolock_state |= PAGE_IO_INUSE;
	mutex_exit(pio);

	return (1);
}

/*
 * Wait until the i/o lock is not held.
 */
void
page_io_wait(page_t *pp)
{
	kmutex_t *pio;

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);
	while (pp->p_iolock_state & PAGE_IO_INUSE) {
		cv_wait(&(pp->p_io_cv), pio);
	}
	mutex_exit(pio);
}

/*
 * Returns 1 if the i/o lock is currently held, 0 otherwise.
 */
int
page_io_locked(page_t *pp)
{
	return (pp->p_iolock_state & PAGE_IO_INUSE);
}

/*
 * Assert that the i/o lock on a page is held.
 * Returns 1 on success, 0 on failure.
 */
int
page_iolock_assert(page_t *pp)
{
	return (page_io_locked(pp));
}

/*
 * Wrapper exported to kernel routines that are built
 * platform-independent (the macro is platform-dependent;
 * the size of vph_mutex[] is based on NCPU).
 *
 * Note that you can do stress testing on this by setting the
 * variable page_vnode_mutex_stress to something other than
 * zero in a DEBUG kernel in a debugger after loading the kernel.
 * Setting it after the kernel is running may not work correctly.
 */
#ifdef	DEBUG
static int page_vnode_mutex_stress = 0;
#endif

kmutex_t *
page_vnode_mutex(vnode_t *vp)
{
	if (vp == &kvp)
		return (&vph_mutex[VPH_TABLE_SIZE + 0]);

	if (vp == &zvp)
		return (&vph_mutex[VPH_TABLE_SIZE + 1]);
#ifdef	DEBUG
	if (page_vnode_mutex_stress != 0)
		return (&vph_mutex[0]);
#endif

	return (&vph_mutex[VP_HASH_FUNC(vp)]);
}
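
/*
 * Hypothetical sketch of the intended use (not code from this file):
 * the vnode page chain is circular, so a walk under the mutex looks
 * like:
 *
 *	kmutex_t *vphm = page_vnode_mutex(vp);
 *	page_t *pp;
 *
 *	mutex_enter(vphm);
 *	if ((pp = vp->v_pages) != NULL) {
 *		do {
 *			... examine pp; the chain cannot change ...
 *		} while ((pp = pp->p_vpnext) != vp->v_pages);
 *	}
 *	mutex_exit(vphm);
 */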

kmutex_t *
page_se_mutex(page_t *pp)
{
	return (PAGE_SE_MUTEX(pp));
}

#ifdef VM_STATS
uint_t pszclck_stat[4];
#endif
/*
 * Find, take and return a mutex held by hat_page_demote().
 * Called by page_demote_vp_pages() before hat_page_demote() call and by
 * routines that want to block hat_page_demote() but can't do it
 * via locking all constituent pages.
 *
 * Return NULL if p_szc is 0.
 *
 * It should only be used for pages that can be demoted by hat_page_demote(),
 * i.e. non-swapfs file system pages.  The logic here is lifted from
 * sfmmu_mlspl_enter() except there's no need to worry about p_szc increase
 * since the page is locked and not free.
 *
 * Hash of the root page is used to find the lock.
 * To find the root in the presence of hat_page_demote() changing the location
 * of the root, this routine relies on the fact that hat_page_demote() changes
 * the root last.
 *
 * If NULL is returned pp's p_szc is guaranteed to be 0.  If non-NULL is
 * returned pp's p_szc may be any value.
 */
kmutex_t *
page_szc_lock(page_t *pp)
{
	kmutex_t	*mtx;
	page_t		*rootpp;
	uint_t		szc;
	uint_t		rszc;
	uint_t		pszc = pp->p_szc;

	ASSERT(pp != NULL);
	ASSERT(PAGE_LOCKED(pp));
	ASSERT(!PP_ISFREE(pp));
	ASSERT(pp->p_vnode != NULL);
	ASSERT(!IS_SWAPFSVP(pp->p_vnode));
	ASSERT(!PP_ISKAS(pp));

again:
	if (pszc == 0) {
		VM_STAT_ADD(pszclck_stat[0]);
		return (NULL);
	}

	/* The lock lives in the root page */

	rootpp = PP_GROUPLEADER(pp, pszc);
	mtx = PAGE_SZC_MUTEX(rootpp);
	mutex_enter(mtx);

	/*
	 * Since p_szc can only decrease if pp == rootpp, rootpp will
	 * always be the same, i.e., we have the right root regardless
	 * of rootpp->p_szc.
	 * If the location of pp's root didn't change after we took
	 * the lock, we have the right root; return the mutex hashed
	 * off of it.
	 */
	if (pp == rootpp || (rszc = rootpp->p_szc) == pszc) {
		VM_STAT_ADD(pszclck_stat[1]);
		return (mtx);
	}

	/*
	 * The root location changed because the page was demoted;
	 * locate the new root.
	 */
	if (rszc < pszc) {
		szc = pp->p_szc;
		ASSERT(szc < pszc);
		mutex_exit(mtx);
		pszc = szc;
		VM_STAT_ADD(pszclck_stat[2]);
		goto again;
	}

	VM_STAT_ADD(pszclck_stat[3]);
	/*
	 * The current hat_page_demote() is not done yet;
	 * wait for it to finish.
	 */
	mutex_exit(mtx);
	rootpp = PP_GROUPLEADER(rootpp, rszc);
	mtx = PAGE_SZC_MUTEX(rootpp);
	mutex_enter(mtx);
	mutex_exit(mtx);
	ASSERT(rootpp->p_szc < rszc);
	goto again;
}

int
page_szc_lock_assert(page_t *pp)
{
	page_t *rootpp = PP_PAGEROOT(pp);
	kmutex_t *mtx = PAGE_SZC_MUTEX(rootpp);

	return (MUTEX_HELD(mtx));
}
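
/*
 * Hypothetical usage sketch for page_szc_lock() (not code from this
 * file): block hat_page_demote() across an operation on a locked,
 * non-free page:
 *
 *	kmutex_t *mtx = page_szc_lock(pp);
 *
 *	... pp->p_szc cannot change while mtx is held ...
 *
 *	if (mtx != NULL)
 *		mutex_exit(mtx);
 *
 * A NULL return means p_szc was 0 and, since the page is locked and
 * not free, stays 0; there is nothing to demote and no lock is needed.
 */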

/*
 * memseg locking
 */
static krwlock_t memsegslock;

/*
 * memlist (phys_install, phys_avail) locking.
 */
static krwlock_t memlists_lock;

int
memsegs_trylock(int writer)
{
	return (rw_tryenter(&memsegslock, writer ? RW_WRITER : RW_READER));
}

void
memsegs_lock(int writer)
{
	rw_enter(&memsegslock, writer ? RW_WRITER : RW_READER);
}

/*ARGSUSED*/
void
memsegs_unlock(int writer)
{
	rw_exit(&memsegslock);
}

int
memsegs_lock_held(void)
{
	return (RW_LOCK_HELD(&memsegslock));
}

void
memlist_read_lock(void)
{
	rw_enter(&memlists_lock, RW_READER);
}

void
memlist_read_unlock(void)
{
	rw_exit(&memlists_lock);
}

void
memlist_write_lock(void)
{
	rw_enter(&memlists_lock, RW_WRITER);
}

void
memlist_write_unlock(void)
{
	rw_exit(&memlists_lock);
}
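
/*
 * Hypothetical sketch (not code from this file): readers of the
 * phys_install/phys_avail memlists bracket their traversal with the
 * read lock, e.g.
 *
 *	struct memlist *ml;
 *
 *	memlist_read_lock();
 *	for (ml = phys_install; ml != NULL; ml = ml->ml_next)
 *		... inspect [ml->ml_address,
 *		    ml->ml_address + ml->ml_size) ...
 *	memlist_read_unlock();
 */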