/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2019 Joyent, Inc.
 */


/*
 * VM - page locking primitives
 */
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/vtrace.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/bitmap.h>
#include <sys/lockstat.h>
#include <sys/sysmacros.h>
#include <sys/condvar_impl.h>
#include <vm/page.h>
#include <vm/seg_enum.h>
#include <vm/vm_dep.h>
#include <vm/seg_kmem.h>

/*
 * This global mutex array is for logical page locking.
 * The following fields in the page structure are protected
 * by this lock:
 *
 *	p_lckcnt
 *	p_cowcnt
 */
pad_mutex_t page_llocks[8 * NCPU_P2];

/*
 * This is a global lock for the logical page free list.  The
 * logical free list, in this implementation, is maintained as two
 * separate physical lists - the cache list and the free list.
 */
kmutex_t	page_freelock;

/*
 * The hash table, page_hash[], the p_selock fields, and the
 * list of pages associated with vnodes are protected by arrays of mutexes.
 *
 * Unless the hashes are changed radically, the table sizes must be
 * a power of two.  Also, we typically need more mutexes for the
 * vnodes since these locks are occasionally held for long periods.
 * And since there seem to be two special vnodes (kvp and swapvp),
 * we make room for private mutexes for them.
 *
 * The pse_mutex[] array holds the mutexes to protect the p_selock
 * fields of all page_t structures.
 *
 * PAGE_SE_MUTEX(pp) returns the address of the appropriate mutex
 * when given a pointer to a page_t.
 *
 * PIO_TABLE_SIZE must be a power of two.  One could argue that we
 * should go to the trouble of setting it up at run time and base it
 * on memory size rather than the number of compile time CPUs.
 *
 * XX64	We should be using physmem size to calculate PIO_SHIFT.
 *
 *	These might break in a 64-bit world.
 */
#define	PIO_SHIFT	7	/* log2(sizeof(page_t)) */
#define	PIO_TABLE_SIZE	128	/* number of io mutexes to have */

pad_mutex_t	ph_mutex[PH_TABLE_SIZE];
kmutex_t	pio_mutex[PIO_TABLE_SIZE];

#define	PAGE_IO_MUTEX(pp) \
	    &pio_mutex[(((uintptr_t)pp) >> PIO_SHIFT) & (PIO_TABLE_SIZE - 1)]

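/*
 * Illustrative example (not part of the original source): PAGE_IO_MUTEX()
 * hashes the page_t address into pio_mutex[] using bits 7..13 of the
 * address, so only the low-order address bits matter.  A hypothetical
 * page_t whose address ends in 0x4c80 maps to index
 * (0x4c80 >> PIO_SHIFT) & (PIO_TABLE_SIZE - 1) == (0x4c80 >> 7) & 127 == 25,
 * and since PIO_SHIFT is log2(sizeof (page_t)), adjacent page structures
 * land on different i/o mutexes.
 */
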
/*
 * The pse_mutex[] array is allocated in the platform startup code
 * based on the size of the machine at startup.
 */
extern pad_mutex_t	*pse_mutex;	/* Locks protecting pp->p_selock */
extern size_t		pse_table_size;	/* Number of mutexes in pse_mutex[] */
extern int		pse_shift;	/* log2(pse_table_size) */
#define	PAGE_SE_MUTEX(pp)	&pse_mutex[				\
	((((uintptr_t)(pp) >> pse_shift) ^ ((uintptr_t)(pp))) >> 7) &	\
	(pse_table_size - 1)].pad_mutex

#define	PSZC_MTX_TABLE_SIZE	128
#define	PSZC_MTX_TABLE_SHIFT	7

static pad_mutex_t	pszc_mutex[PSZC_MTX_TABLE_SIZE];

#define	PAGE_SZC_MUTEX(_pp) \
	    &pszc_mutex[((((uintptr_t)(_pp) >> PSZC_MTX_TABLE_SHIFT) ^ \
		((uintptr_t)(_pp) >> (PSZC_MTX_TABLE_SHIFT << 1)) ^ \
		((uintptr_t)(_pp) >> (3 * PSZC_MTX_TABLE_SHIFT))) & \
		(PSZC_MTX_TABLE_SIZE - 1))].pad_mutex

/*
 * The vph_mutex[] array holds the mutexes to protect the vnode chains,
 * (i.e., the list of pages anchored by v_pages and connected via p_vpprev
 * and p_vpnext).
 *
 * The page_vnode_mutex(vp) function returns the address of the appropriate
 * mutex from this array given a pointer to a vnode.  It is complicated
 * by the fact that the kernel's vnode and the swapfs vnode are referenced
 * frequently enough to warrant their own mutexes.
 *
 * The VP_HASH_FUNC returns the index into the vph_mutex array given
 * an address of a vnode.
 */

#if defined(_LP64)
#define	VPH_TABLE_SIZE	(8 * NCPU_P2)
#else	/* 32 bits */
#define	VPH_TABLE_SIZE	(2 * NCPU_P2)
#endif

#define	VP_HASH_FUNC(vp) \
	((((uintptr_t)(vp) >> 6) + \
	    ((uintptr_t)(vp) >> 8) + \
	    ((uintptr_t)(vp) >> 10) + \
	    ((uintptr_t)(vp) >> 12)) \
	    & (VPH_TABLE_SIZE - 1))

/*
 * Two slots after VPH_TABLE_SIZE are reserved in vph_mutex for kernel vnodes,
 * one for kvps[KV_ZVP], and one for other kvps[] users.
 */

kmutex_t	vph_mutex[VPH_TABLE_SIZE + 2];

/*
 * Initialize the locks used by the Virtual Memory Management system.
 */
void
page_lock_init()
{
}

/*
 * Return a value for pse_shift based on npg (the number of physical pages)
 * and ncpu (the maximum number of CPUs).  This is called by platform startup
 * code.
 *
 * Lockstat data from TPC-H runs showed that contention on the pse_mutex[]
 * locks grew approximately as the square of the number of threads executing.
 * So the primary scaling factor used is NCPU^2.  The size of the machine in
 * megabytes is used as an upper bound, particularly for sun4v machines which
 * all claim to have 256 CPUs maximum, and the old value of PSE_TABLE_SIZE
 * (128) is used as a minimum.  Since the size of the table has to be a power
 * of two, the calculated size is rounded up to the next power of two.
 */
/*ARGSUSED*/
int
size_pse_array(pgcnt_t npg, int ncpu)
{
	size_t size;
	pgcnt_t pp_per_mb = (1024 * 1024) / PAGESIZE;

	size = MAX(128, MIN(npg / pp_per_mb, 2 * ncpu * ncpu));
	size += (1 << (highbit(size) - 1)) - 1;
	return (highbit(size) - 1);
}

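/*
 * Illustrative example (not part of the original source): with a
 * hypothetical 8K PAGESIZE, pp_per_mb is 128, so a 64-CPU machine with
 * 64 GB of memory has npg / pp_per_mb == 65536 and 2 * ncpu * ncpu == 8192;
 * the smaller value, 8192, is already a power of two, so size_pse_array()
 * returns pse_shift == 13 and the platform allocates 8192 pse_mutex[]
 * entries.  A non-power-of-two intermediate value is rounded up, e.g.
 * 3000 becomes 4096 (pse_shift == 12).
 */
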
/*
 * At present we only use page ownership to aid debugging, so it's
 * OK if the owner field isn't exact.  In the 32-bit world two thread ids
 * can map to the same owner because we just 'or' in 0x80000000 and
 * then clear the second highest bit, so that (for example) 0x2faced00
 * and 0xafaced00 both map to 0xafaced00.
 * In the 64-bit world, p_selock may not be large enough to hold a full
 * thread pointer.  If we ever need precise ownership (e.g. if we implement
 * priority inheritance for page locks) then p_selock should become a
 * uintptr_t and SE_WRITER should be -((uintptr_t)curthread >> 2).
 */
#define	SE_WRITER	(((selock_t)(ulong_t)curthread | INT_MIN) & ~SE_EWANTED)
#define	SE_READER	1

/*
 * A page that is deleted must be marked as such using the
 * page_lock_delete() function. The page must be exclusively locked.
 * The SE_DELETED marker is put in p_selock when this function is called.
 * SE_DELETED must be distinct from any SE_WRITER value.
 */
#define	SE_DELETED	(1 | INT_MIN)

#ifdef VM_STATS
uint_t	vph_kvp_count;
uint_t	vph_swapfsvp_count;
uint_t	vph_other;
#endif /* VM_STATS */

#ifdef VM_STATS
uint_t	page_lock_count;
uint_t	page_lock_miss;
uint_t	page_lock_miss_lock;
uint_t	page_lock_reclaim;
uint_t	page_lock_bad_reclaim;
uint_t	page_lock_same_page;
uint_t	page_lock_upgrade;
uint_t	page_lock_retired;
uint_t	page_lock_upgrade_failed;
uint_t	page_lock_deleted;

uint_t	page_trylock_locked;
uint_t	page_trylock_failed;
uint_t	page_trylock_missed;

uint_t	page_try_reclaim_upgrade;
#endif /* VM_STATS */

/*
 * Acquire the "shared/exclusive" lock on a page.
 *
 * Returns 1 on success and locks the page appropriately.
 * 0 on failure and does not lock the page.
 *
 * If `lock' is non-NULL, it will be dropped and reacquired in the
 * failure case.  This routine can block, and if it does
 * it will always return a failure since the page identity [vp, off]
 * or state may have changed.
 */

int
page_lock(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim)
{
	return (page_lock_es(pp, se, lock, reclaim, 0));
}

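/*
 * Illustrative usage sketch (not part of the original source).  A caller
 * that found `pp' on a vnode's page list while holding that vnode's page
 * mutex might lock it like this; `vp' and the retry label are hypothetical:
 *
 *	kmutex_t *vphm = page_vnode_mutex(vp);
 *
 * retry:
 *	mutex_enter(vphm);
 *	pp = ...;			// locate the page under vphm
 *	if (!page_lock(pp, SE_SHARED, vphm, P_RECLAIM)) {
 *		// page_lock() blocked: vphm was dropped and reacquired,
 *		// so the page identity [vp, off] may have changed and
 *		// the lookup must be restarted.
 *		mutex_exit(vphm);
 *		goto retry;
 *	}
 *	mutex_exit(vphm);
 *	// ... use pp with the shared lock held ...
 *	page_unlock(pp);
 */
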
/*
 * With the addition of reader-writer lock semantics to page_lock_es,
 * callers wanting an exclusive (writer) lock may prevent shared-lock
 * (reader) starvation by setting the es parameter to SE_EXCL_WANTED.
 * In this case, when an exclusive lock cannot be acquired, p_selock's
 * SE_EWANTED bit is set. Shared-lock (reader) requests are also denied
 * if the page is slated for retirement.
 *
 * The se and es parameters determine if the lock should be granted
 * based on the following decision table:
 *
 * Lock wanted	es flags	p_selock/SE_EWANTED	Action
 * -----------	--------------	-------------------	---------
 * SE_EXCL	any [1][2]	unlocked/any		grant lock, clear SE_EWANTED
 * SE_EXCL	SE_EWANTED	any lock/any		deny, set SE_EWANTED
 * SE_EXCL	none		any lock/any		deny
 * SE_SHARED	n/a [2]		shared/0		grant
 * SE_SHARED	n/a [2]		unlocked/0		grant
 * SE_SHARED	n/a		shared/1		deny
 * SE_SHARED	n/a		unlocked/1		deny
 * SE_SHARED	n/a		excl/any		deny
 *
 * Notes:
 * [1] The code grants an exclusive lock to the caller and clears the bit
 *     SE_EWANTED whenever p_selock is unlocked, regardless of the SE_EWANTED
 *     bit's value.  This was deemed acceptable as we are not concerned about
 *     exclusive-lock starvation.  If this ever becomes an issue, a priority
 *     or FIFO mechanism should also be implemented.  Meantime, the thread
 *     that set SE_EWANTED should be prepared to catch this condition and
 *     reset it.
 *
 * [2] Retired pages may not be locked at any time, regardless of the
 *     disposition of se, unless the es parameter has the SE_RETIRED flag set.
 *
 * Notes on values of "es":
 *
 * es & 1: page_lookup_create will attempt page relocation
 * es & SE_EXCL_WANTED: caller wants SE_EWANTED set (e.g. the delete
 * memory thread); this prevents reader-starvation of waiting
 * writer thread(s) by giving priority to writers over readers.
 * es & SE_RETIRED: caller wants to lock pages even if they are
 * retired.  Default is to deny the lock if the page is retired.
 *
 * And yes, we know, the semantics of this function are too complicated.
 * It's on the list to be cleaned up.
 */
int
page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
{
	int		retval;
	kmutex_t	*pse = PAGE_SE_MUTEX(pp);
	int		upgraded;
	int		reclaim_it;

	ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);

	VM_STAT_ADD(page_lock_count);

	upgraded = 0;
	reclaim_it = 0;

	mutex_enter(pse);

	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));

	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
		mutex_exit(pse);
		VM_STAT_ADD(page_lock_retired);
		return (0);
	}

	if (se == SE_SHARED && es == 1 && pp->p_selock == 0) {
		se = SE_EXCL;
	}

	if ((reclaim == P_RECLAIM) && (PP_ISFREE(pp))) {

		reclaim_it = 1;
		if (se == SE_SHARED) {
			/*
			 * This is an interesting situation.
			 *
			 * Remember that p_free can only change if
			 * p_selock < 0.
			 * p_free does not depend on our holding `pse'.
			 * And, since we hold `pse', p_selock can not change.
			 * So, if p_free changes on us, the page is already
			 * exclusively held, and we would fail to get p_selock
			 * regardless.
			 *
			 * We want to avoid getting the share
			 * lock on a free page that needs to be reclaimed.
			 * It is possible that some other thread has the share
			 * lock and has left the free page on the cache list.
			 * pvn_vplist_dirty() does this for brief periods.
			 * If the se_share is currently SE_EXCL, we will fail
			 * to acquire p_selock anyway.  Blocking is the
			 * right thing to do.
			 * If we need to reclaim this page, we must get
			 * exclusive access to it, force the upgrade now.
			 * Again, we will fail to acquire p_selock if the
			 * page is not free and block.
			 */
			upgraded = 1;
			se = SE_EXCL;
			VM_STAT_ADD(page_lock_upgrade);
		}
	}

	if (se == SE_EXCL) {
		if (!(es & SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) {
			/*
			 * if the caller wants a writer lock (but did not
			 * specify exclusive access), and there is a pending
			 * writer that wants exclusive access, return failure
			 */
			retval = 0;
		} else if ((pp->p_selock & ~SE_EWANTED) == 0) {
			/* no reader/writer lock held */
			/* this clears our setting of the SE_EWANTED bit */
			pp->p_selock = SE_WRITER;
			retval = 1;
		} else {
			/* page is locked */
			if (es & SE_EXCL_WANTED) {
				/* set the SE_EWANTED bit */
				pp->p_selock |= SE_EWANTED;
			}
			retval = 0;
		}
	} else {
		retval = 0;
		if (pp->p_selock >= 0) {
			if ((pp->p_selock & SE_EWANTED) == 0) {
				pp->p_selock += SE_READER;
				retval = 1;
			}
		}
	}

	if (retval == 0) {
		if ((pp->p_selock & ~SE_EWANTED) == SE_DELETED) {
			VM_STAT_ADD(page_lock_deleted);
			mutex_exit(pse);
			return (retval);
		}

#ifdef VM_STATS
		VM_STAT_ADD(page_lock_miss);
		if (upgraded) {
			VM_STAT_ADD(page_lock_upgrade_failed);
		}
#endif
		if (lock) {
			VM_STAT_ADD(page_lock_miss_lock);
			mutex_exit(lock);
		}

		/*
		 * Now, wait for the page to be unlocked and
		 * release the lock protecting p_cv and p_selock.
		 */
		cv_wait(&pp->p_cv, pse);
		mutex_exit(pse);

		/*
		 * The page identity may have changed while we were
		 * blocked.  If we are willing to depend on "pp"
		 * still pointing to a valid page structure (i.e.,
		 * assuming page structures are not dynamically allocated
		 * or freed), we could try to lock the page if its
		 * identity hasn't changed.
		 *
		 * This needs to be measured; since we come back from
		 * cv_wait holding pse (the expensive part of this
		 * operation), we might as well try the cheap part.
		 * Though we would also have to confirm that dropping
		 * `lock' did not cause any grief to the callers.
		 */
		if (lock) {
			mutex_enter(lock);
		}
	} else {
		/*
		 * We have the page lock.
		 * If we needed to reclaim the page, and the page
		 * needed reclaiming (i.e., it was free), then we
		 * have the page exclusively locked.  We may need
		 * to downgrade the page.
		 */
		ASSERT((upgraded) ?
		    ((PP_ISFREE(pp)) && PAGE_EXCL(pp)) : 1);
		mutex_exit(pse);

		/*
		 * We now hold this page's lock, either shared or
		 * exclusive.  This will prevent its identity from changing.
		 * The page, however, may or may not be free.  If the caller
		 * requested, and it is free, go reclaim it from the
		 * free list.  If the page can't be reclaimed, return failure
		 * so that the caller can start all over again.
		 *
		 * NOTE: page_reclaim() releases the page lock (p_selock)
		 *	if it can't be reclaimed.
		 */
		if (reclaim_it) {
			if (!page_reclaim(pp, lock)) {
				VM_STAT_ADD(page_lock_bad_reclaim);
				retval = 0;
			} else {
				VM_STAT_ADD(page_lock_reclaim);
				if (upgraded) {
					page_downgrade(pp);
				}
			}
		}
	}
	return (retval);
}

/*
 * Clear the SE_EWANTED bit from p_selock.  This function allows
 * callers of page_lock_es and page_try_reclaim_lock to clear
 * their setting of this bit if they decide they no longer wish
 * to gain exclusive access to the page.  Currently only
 * delete_memory_thread uses this when the delete memory
 * operation is cancelled.
 */
void
page_lock_clr_exclwanted(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	mutex_enter(pse);
	pp->p_selock &= ~SE_EWANTED;
	if (CV_HAS_WAITERS(&pp->p_cv))
		cv_broadcast(&pp->p_cv);
	mutex_exit(pse);
}

/*
 * Read the comments inside of page_lock_es() carefully.
 *
 * SE_EXCL callers specifying es == SE_EXCL_WANTED will cause the
 * SE_EWANTED bit of p_selock to be set when the lock cannot be obtained.
 * This is used by threads subject to reader-starvation (e.g. memory delete).
 *
 * When a thread using SE_EXCL_WANTED does not obtain the SE_EXCL lock,
 * it is expected that it will retry at a later time.  Threads that will
 * not retry the lock *must* call page_lock_clr_exclwanted to clear the
 * SE_EWANTED bit.  (When a thread using SE_EXCL_WANTED obtains the lock,
 * the bit is cleared.)
 */
int
page_try_reclaim_lock(page_t *pp, se_t se, int es)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);

	old = pp->p_selock;

	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));

	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
		mutex_exit(pse);
		VM_STAT_ADD(page_trylock_failed);
		return (0);
	}

	if (se == SE_SHARED && es == 1 && old == 0) {
		se = SE_EXCL;
	}

	if (se == SE_SHARED) {
		if (!PP_ISFREE(pp)) {
			if (old >= 0) {
				/*
				 * Readers are not allowed when excl wanted
				 */
				if ((old & SE_EWANTED) == 0) {
					pp->p_selock = old + SE_READER;
					mutex_exit(pse);
					return (1);
				}
			}
			mutex_exit(pse);
			return (0);
		}
		/*
		 * The page is free, so we really want SE_EXCL (below)
		 */
		VM_STAT_ADD(page_try_reclaim_upgrade);
	}

	/*
	 * The caller wants a writer lock.  We try for it only if
	 * SE_EWANTED is not set, or if the caller specified
	 * SE_EXCL_WANTED.
	 */
	if (!(old & SE_EWANTED) || (es & SE_EXCL_WANTED)) {
		if ((old & ~SE_EWANTED) == 0) {
			/* no reader/writer lock held */
			/* this clears out our setting of the SE_EWANTED bit */
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			return (1);
		}
	}
	if (es & SE_EXCL_WANTED) {
		/* page is locked, set the SE_EWANTED bit */
		pp->p_selock |= SE_EWANTED;
	}
	mutex_exit(pse);
	return (0);
}

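/*
 * Illustrative sketch (not part of the original source) of the retry
 * pattern described above, as a starvation-prone caller such as the
 * delete memory thread might use it; the loop structure and `giving_up'
 * condition are hypothetical:
 *
 *	while (!page_try_reclaim_lock(pp, SE_EXCL, SE_EXCL_WANTED)) {
 *		if (giving_up) {
 *			// Not going to retry: the SE_EWANTED bit we may
 *			// have set must be cleared by hand.
 *			page_lock_clr_exclwanted(pp);
 *			return;
 *		}
 *		delay(hz);		// back off, then retry
 *	}
 *	// SE_EXCL lock held; SE_EWANTED was cleared on success.
 */
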
/*
 * Acquire a page's "shared/exclusive" lock, but never block.
 * Returns 1 on success, 0 on failure.
 */
int
page_trylock(page_t *pp, se_t se)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	mutex_enter(pse);
	if (pp->p_selock & SE_EWANTED || PP_RETIRED(pp) ||
	    (se == SE_SHARED && PP_PR_NOSHARE(pp))) {
		/*
		 * Fail if another thread wants exclusive access
		 * (SE_EWANTED), if the page has been retired, or if a
		 * share lock is requested on a page that may not be
		 * shared because it is slated for retirement.
		 */
		mutex_exit(pse);
		VM_STAT_ADD(page_trylock_failed);
		return (0);
	}

	if (se == SE_EXCL) {
		if (pp->p_selock == 0) {
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			return (1);
		}
	} else {
		if (pp->p_selock >= 0) {
			pp->p_selock += SE_READER;
			mutex_exit(pse);
			return (1);
		}
	}
	mutex_exit(pse);
	return (0);
}

/*
 * Variant of page_unlock() specifically for the page freelist
 * code.  The mere existence of this code is a vile hack that
 * results from the backwards locking order of the page
 * freelist manager; please don't call it.
 */
void
page_unlock_nocapture(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);

	old = pp->p_selock;
	if ((old & ~SE_EWANTED) == SE_READER) {
		pp->p_selock = old & ~SE_READER;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
		panic("page_unlock_nocapture: page %p is deleted", (void *)pp);
	} else if (old < 0) {
		pp->p_selock &= SE_EWANTED;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) > SE_READER) {
		pp->p_selock = old - SE_READER;
	} else {
		panic("page_unlock_nocapture: page %p is not locked",
		    (void *)pp);
	}

	mutex_exit(pse);
}

/*
 * Release the page's "shared/exclusive" lock and wake up anyone
 * who might be waiting for it.
 */
void
page_unlock(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);

	old = pp->p_selock;
	if ((old & ~SE_EWANTED) == SE_READER) {
		pp->p_selock = old & ~SE_READER;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
		panic("page_unlock: page %p is deleted", (void *)pp);
	} else if (old < 0) {
		pp->p_selock &= SE_EWANTED;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) > SE_READER) {
		pp->p_selock = old - SE_READER;
	} else {
		panic("page_unlock: page %p is not locked", (void *)pp);
	}

	if (pp->p_selock == 0) {
		/*
		 * If the T_CAPTURING bit is set, that means that we should
		 * not try to capture the page again as we could recurse,
		 * which could lead to a stack overflow panic or spending a
		 * relatively long time in the kernel making no progress.
		 */
		if ((pp->p_toxic & PR_CAPTURE) &&
		    !(curthread->t_flag & T_CAPTURING) &&
		    !PP_RETIRED(pp)) {
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			page_unlock_capture(pp);
		} else {
			mutex_exit(pse);
		}
	} else {
		mutex_exit(pse);
	}
}

/*
 * Try to upgrade the lock on the page from a "shared" to an
 * "exclusive" lock.  Since this upgrade operation is done while
 * holding the mutex protecting this page, no one else can acquire this page's
 * lock and change the page.  Thus, it is safe to drop the "shared"
 * lock and attempt to acquire the "exclusive" lock.
 *
 * Returns 1 on success, 0 on failure.
 */
int
page_tryupgrade(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	mutex_enter(pse);
	if (!(pp->p_selock & SE_EWANTED)) {
		/* no threads want exclusive access, try upgrade */
		if (pp->p_selock == SE_READER) {
			/* convert to exclusive lock */
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			return (1);
		}
	}
	mutex_exit(pse);
	return (0);
}

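/*
 * Illustrative sketch (not part of the original source): a caller holding
 * a shared lock that discovers it needs to modify the page can attempt an
 * in-place upgrade and fall back to dropping and re-acquiring the lock
 * exclusively; `vp' and `off' are hypothetical:
 *
 *	if (!page_tryupgrade(pp)) {
 *		page_unlock(pp);
 *		pp = page_lookup(vp, off, SE_EXCL);	// identity may change
 *		if (pp == NULL)
 *			return;
 *	}
 *	// ... modify the page with SE_EXCL held ...
 *	page_downgrade(pp);	// continue with only a shared lock
 *	...
 *	page_unlock(pp);
 */
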
/*
 * Downgrade the "exclusive" lock on the page to a "shared" lock
 * while holding the mutex protecting this page's p_selock field.
 */
void
page_downgrade(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	int excl_waiting;

	ASSERT((pp->p_selock & ~SE_EWANTED) != SE_DELETED);
	ASSERT(PAGE_EXCL(pp));

	mutex_enter(pse);
	excl_waiting = pp->p_selock & SE_EWANTED;
	pp->p_selock = SE_READER | excl_waiting;
	if (CV_HAS_WAITERS(&pp->p_cv))
		cv_broadcast(&pp->p_cv);
	mutex_exit(pse);
}

void
page_lock_delete(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	ASSERT(PAGE_EXCL(pp));
	ASSERT(pp->p_vnode == NULL);
	ASSERT(pp->p_offset == (u_offset_t)-1);
	ASSERT(!PP_ISFREE(pp));

	mutex_enter(pse);
	pp->p_selock = SE_DELETED;
	if (CV_HAS_WAITERS(&pp->p_cv))
		cv_broadcast(&pp->p_cv);
	mutex_exit(pse);
}

int
page_deleted(page_t *pp)
{
	return (pp->p_selock == SE_DELETED);
}

/*
 * Implement the io lock for pages
 */
void
page_iolock_init(page_t *pp)
{
	pp->p_iolock_state = 0;
	cv_init(&pp->p_io_cv, NULL, CV_DEFAULT, NULL);
}

/*
 * Acquire the i/o lock on a page.
 */
void
page_io_lock(page_t *pp)
{
	kmutex_t *pio;

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);
	while (pp->p_iolock_state & PAGE_IO_INUSE) {
		cv_wait(&(pp->p_io_cv), pio);
	}
	pp->p_iolock_state |= PAGE_IO_INUSE;
	mutex_exit(pio);
}

/*
 * Release the i/o lock on a page.
 */
void
page_io_unlock(page_t *pp)
{
	kmutex_t *pio;

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);
	cv_broadcast(&pp->p_io_cv);
	pp->p_iolock_state &= ~PAGE_IO_INUSE;
	mutex_exit(pio);
}

/*
 * Try to acquire the i/o lock on a page without blocking.
 * Returns 1 on success, 0 on failure.
 */
int
page_io_trylock(page_t *pp)
{
	kmutex_t *pio;

	if (pp->p_iolock_state & PAGE_IO_INUSE)
		return (0);

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);

	if (pp->p_iolock_state & PAGE_IO_INUSE) {
		mutex_exit(pio);
		return (0);
	}
	pp->p_iolock_state |= PAGE_IO_INUSE;
	mutex_exit(pio);

	return (1);
}

/*
 * Wait until the i/o lock is not held.
 */
void
page_io_wait(page_t *pp)
{
	kmutex_t *pio;

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);
	while (pp->p_iolock_state & PAGE_IO_INUSE) {
		cv_wait(&(pp->p_io_cv), pio);
	}
	mutex_exit(pio);
}

/*
 * Return non-zero if the page's i/o lock is currently held.
 */
int
page_io_locked(page_t *pp)
{
	return (pp->p_iolock_state & PAGE_IO_INUSE);
}

/*
 * Assert that the i/o lock on a page is held.
 * Returns 1 on success, 0 on failure.
 */
int
page_iolock_assert(page_t *pp)
{
	return (page_io_locked(pp));
}

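/*
 * Illustrative sketch (not part of the original source): the i/o lock is
 * typically held across the lifetime of an i/o on the page, independently
 * of the shared/exclusive identity lock; `start_page_io' is a hypothetical
 * stand-in for whatever actually issues the i/o:
 *
 *	page_io_lock(pp);	// blocks while another i/o is in progress
 *	start_page_io(pp);	// hypothetical; do the actual i/o
 *	page_io_unlock(pp);	// wakes waiters in page_io_lock()/page_io_wait()
 *
 * page_io_trylock() is the non-blocking form, and page_io_wait() just
 * waits for an in-progress i/o to finish without taking the lock.
 */
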
/*
 * Wrapper exported to kernel routines that are built
 * platform-independent (the macro is platform-dependent;
 * the size of vph_mutex[] is based on NCPU).
 *
 * Note that you can do stress testing on this by setting the
 * variable page_vnode_mutex_stress to something other than
 * zero in a DEBUG kernel in a debugger after loading the kernel.
 * Setting it after the kernel is running may not work correctly.
 */
#ifdef	DEBUG
static int page_vnode_mutex_stress = 0;
#endif

kmutex_t *
page_vnode_mutex(vnode_t *vp)
{
	if (vp == &kvp || vp == &kvps[KV_VVP])
		return (&vph_mutex[VPH_TABLE_SIZE + 0]);

	if (vp == &kvps[KV_ZVP])
		return (&vph_mutex[VPH_TABLE_SIZE + 1]);
#ifdef	DEBUG
	if (page_vnode_mutex_stress != 0)
		return (&vph_mutex[0]);
#endif

	return (&vph_mutex[VP_HASH_FUNC(vp)]);
}

kmutex_t *
page_se_mutex(page_t *pp)
{
	return (PAGE_SE_MUTEX(pp));
}

#ifdef VM_STATS
uint_t pszclck_stat[4];
#endif
/*
 * Find, take and return a mutex held by hat_page_demote().
 * Called by page_demote_vp_pages() before hat_page_demote() call and by
 * routines that want to block hat_page_demote() but can't do it
 * via locking all constituent pages.
 *
 * Return NULL if p_szc is 0.
 *
 * It should only be used for pages that can be demoted by hat_page_demote(),
 * i.e. non-swapfs file system pages.  The logic here is lifted from
 * sfmmu_mlspl_enter() except there's no need to worry about p_szc increase
 * since the page is locked and not free.
 *
 * Hash of the root page is used to find the lock.
 * To find the root in the presence of hat_page_demote() changing the location
 * of the root, this routine relies on the fact that hat_page_demote() changes
 * the root last.
 *
 * If NULL is returned, pp's p_szc is guaranteed to be 0.  If non-NULL is
 * returned, pp's p_szc may be any value.
 */
kmutex_t *
page_szc_lock(page_t *pp)
{
	kmutex_t	*mtx;
	page_t		*rootpp;
	uint_t		szc;
	uint_t		rszc;
	uint_t		pszc = pp->p_szc;

	ASSERT(pp != NULL);
	ASSERT(PAGE_LOCKED(pp));
	ASSERT(!PP_ISFREE(pp));
	ASSERT(pp->p_vnode != NULL);
	ASSERT(!IS_SWAPFSVP(pp->p_vnode));
	ASSERT(!PP_ISKAS(pp));

again:
	if (pszc == 0) {
		VM_STAT_ADD(pszclck_stat[0]);
		return (NULL);
	}

	/* The lock lives in the root page */

	rootpp = PP_GROUPLEADER(pp, pszc);
	mtx = PAGE_SZC_MUTEX(rootpp);
	mutex_enter(mtx);

	/*
	 * Since p_szc can only decrease if pp == rootpp, rootpp will always
	 * be the same, i.e. we have the right root regardless of
	 * rootpp->p_szc.
	 * If the location of pp's root didn't change after we took the lock,
	 * we have the right root.  Return the mutex hashed off of it.
	 */
	if (pp == rootpp || (rszc = rootpp->p_szc) == pszc) {
		VM_STAT_ADD(pszclck_stat[1]);
		return (mtx);
	}

	/*
	 * Root location changed because the page got demoted.
	 * Locate the new root.
	 */
	if (rszc < pszc) {
		szc = pp->p_szc;
		ASSERT(szc < pszc);
		mutex_exit(mtx);
		pszc = szc;
		VM_STAT_ADD(pszclck_stat[2]);
		goto again;
	}

	VM_STAT_ADD(pszclck_stat[3]);
	/*
	 * The current hat_page_demote() is not done yet.
	 * Wait for it to finish.
	 */
	mutex_exit(mtx);
	rootpp = PP_GROUPLEADER(rootpp, rszc);
	mtx = PAGE_SZC_MUTEX(rootpp);
	mutex_enter(mtx);
	mutex_exit(mtx);
	ASSERT(rootpp->p_szc < rszc);
	goto again;
}

int
page_szc_lock_assert(page_t *pp)
{
	page_t *rootpp = PP_PAGEROOT(pp);
	kmutex_t *mtx = PAGE_SZC_MUTEX(rootpp);

	return (MUTEX_HELD(mtx));
}

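/*
 * Illustrative sketch (not part of the original source): a routine that
 * must keep hat_page_demote() from changing p_szc while it examines a
 * locked, non-swapfs file system page might use page_szc_lock() like this:
 *
 *	kmutex_t *szcmtx;
 *
 *	ASSERT(PAGE_LOCKED(pp));
 *	szcmtx = page_szc_lock(pp);
 *	if (szcmtx == NULL) {
 *		// p_szc is 0 and cannot grow while the page is locked.
 *	} else {
 *		// hat_page_demote() is blocked until the mutex is dropped.
 *		...
 *		mutex_exit(szcmtx);
 *	}
 */
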
/*
 * memseg locking
 */
static krwlock_t memsegslock;

/*
 * memlist (phys_install, phys_avail) locking.
 */
static krwlock_t memlists_lock;

int
memsegs_trylock(int writer)
{
	return (rw_tryenter(&memsegslock, writer ? RW_WRITER : RW_READER));
}

void
memsegs_lock(int writer)
{
	rw_enter(&memsegslock, writer ? RW_WRITER : RW_READER);
}

/*ARGSUSED*/
void
memsegs_unlock(int writer)
{
	rw_exit(&memsegslock);
}

int
memsegs_lock_held(void)
{
	return (RW_LOCK_HELD(&memsegslock));
}

void
memlist_read_lock(void)
{
	rw_enter(&memlists_lock, RW_READER);
}

void
memlist_read_unlock(void)
{
	rw_exit(&memlists_lock);
}

void
memlist_write_lock(void)
{
	rw_enter(&memlists_lock, RW_WRITER);
}

void
memlist_write_unlock(void)
{
	rw_exit(&memlists_lock);
}
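
/*
 * Illustrative sketch (not part of the original source): code reading the
 * memseg list or the phys_install/phys_avail memlists takes these locks as
 * a reader around the traversal and releases them when done:
 *
 *	memsegs_lock(0);		// 0 == reader
 *	// ... walk the memseg list ...
 *	memsegs_unlock(0);
 *
 *	memlist_read_lock();
 *	// ... walk phys_install or phys_avail ...
 *	memlist_read_unlock();
 *
 * Writers (e.g. DR code adding or deleting memory) use the corresponding
 * writer entry points instead.
 */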