/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2019 Joyent, Inc.
 */

/*
 * VM - page locking primitives
 */
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/vtrace.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/bitmap.h>
#include <sys/lockstat.h>
#include <sys/sysmacros.h>
#include <sys/condvar_impl.h>
#include <vm/page.h>
#include <vm/seg_enum.h>
#include <vm/vm_dep.h>
#include <vm/seg_kmem.h>

/*
 * This global mutex array is for logical page locking.
 * The following fields in the page structure are protected
 * by this lock:
 *
 *        p_lckcnt
 *        p_cowcnt
 */
pad_mutex_t page_llocks[8 * NCPU_P2];

/*
 * This is a global lock for the logical page free list.  The
 * logical free list, in this implementation, is maintained as two
 * separate physical lists - the cache list and the free list.
 */
kmutex_t page_freelock;

/*
 * The hash table, page_hash[], the p_selock fields, and the
 * list of pages associated with vnodes are protected by arrays of mutexes.
 *
 * Unless the hashes are changed radically, the table sizes must be
 * a power of two.  Also, we typically need more mutexes for the
 * vnodes since these locks are occasionally held for long periods.
 * And since there seem to be two special vnodes (kvp and swapvp),
 * we make room for private mutexes for them.
 *
 * The pse_mutex[] array holds the mutexes to protect the p_selock
 * fields of all page_t structures.
 *
 * PAGE_SE_MUTEX(pp) returns the address of the appropriate mutex
 * when given a pointer to a page_t.
 *
 * PIO_TABLE_SIZE must be a power of two.  One could argue that we
 * should go to the trouble of setting it up at run time and base it
 * on memory size rather than the number of compile-time CPUs.
 *
 * XX64  We should be using physmem size to calculate PIO_SHIFT.
 *
 *       These might break in a 64-bit world.
 */
#define PIO_SHIFT       7       /* log2(sizeof(page_t)) */
#define PIO_TABLE_SIZE  128     /* number of io mutexes to have */

pad_mutex_t ph_mutex[PH_TABLE_SIZE];
kmutex_t pio_mutex[PIO_TABLE_SIZE];

#define PAGE_IO_MUTEX(pp) \
        &pio_mutex[(((uintptr_t)pp) >> PIO_SHIFT) & (PIO_TABLE_SIZE - 1)]

/*
 * The pse_mutex[] array is allocated in the platform startup code
 * based on the size of the machine at startup.
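 * (The table size is computed by size_pse_array(), later in this file,
 * which the platform startup code calls with the physical page count and
 * the maximum CPU count.)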
 */
extern pad_mutex_t *pse_mutex;          /* Locks protecting pp->p_selock */
extern size_t pse_table_size;           /* Number of mutexes in pse_mutex[] */
extern int pse_shift;                   /* log2(pse_table_size) */
#define PAGE_SE_MUTEX(pp) &pse_mutex[ \
        ((((uintptr_t)(pp) >> pse_shift) ^ ((uintptr_t)(pp))) >> 7) & \
        (pse_table_size - 1)].pad_mutex

#define PSZC_MTX_TABLE_SIZE     128
#define PSZC_MTX_TABLE_SHIFT    7

static pad_mutex_t pszc_mutex[PSZC_MTX_TABLE_SIZE];

#define PAGE_SZC_MUTEX(_pp) \
        &pszc_mutex[((((uintptr_t)(_pp) >> PSZC_MTX_TABLE_SHIFT) ^ \
            ((uintptr_t)(_pp) >> (PSZC_MTX_TABLE_SHIFT << 1)) ^ \
            ((uintptr_t)(_pp) >> (3 * PSZC_MTX_TABLE_SHIFT))) & \
            (PSZC_MTX_TABLE_SIZE - 1))].pad_mutex

/*
 * The vph_mutex[] array holds the mutexes to protect the vnode chains,
 * (i.e., the list of pages anchored by v_pages and connected via p_vpprev
 * and p_vpnext).
 *
 * The page_vnode_mutex(vp) function returns the address of the appropriate
 * mutex from this array given a pointer to a vnode.  It is complicated
 * by the fact that the kernel's vnode and the swapfs vnode are referenced
 * frequently enough to warrant their own mutexes.
 *
 * The VP_HASH_FUNC returns the index into the vph_mutex array given
 * an address of a vnode.
 */

#if defined(_LP64)
#define VPH_TABLE_SIZE  (8 * NCPU_P2)
#else   /* 32 bits */
#define VPH_TABLE_SIZE  (2 * NCPU_P2)
#endif

#define VP_HASH_FUNC(vp) \
        ((((uintptr_t)(vp) >> 6) + \
            ((uintptr_t)(vp) >> 8) + \
            ((uintptr_t)(vp) >> 10) + \
            ((uintptr_t)(vp) >> 12)) \
            & (VPH_TABLE_SIZE - 1))

/*
 * Two slots after VPH_TABLE_SIZE are reserved in vph_mutex for kernel vnodes.
 * The lock for kvp is VPH_TABLE_SIZE + 0, and the lock for zvp is
 * VPH_TABLE_SIZE + 1.
 */

kmutex_t vph_mutex[VPH_TABLE_SIZE + 2];

/*
 * Initialize the locks used by the Virtual Memory Management system.
 */
void
page_lock_init()
{
}

/*
 * Return a value for pse_shift based on npg (the number of physical pages)
 * and ncpu (the maximum number of CPUs).  This is called by platform startup
 * code.
 *
 * Lockstat data from TPC-H runs showed that contention on the pse_mutex[]
 * locks grew approximately as the square of the number of threads executing.
 * So the primary scaling factor used is NCPU^2.  The size of the machine in
 * megabytes is used as an upper bound, particularly for sun4v machines which
 * all claim to have 256 CPUs maximum, and the old value of PSE_TABLE_SIZE
 * (128) is used as a minimum.  Since the size of the table has to be a power
 * of two, the calculated size is rounded up to the next power of two.
 */
/*ARGSUSED*/
int
size_pse_array(pgcnt_t npg, int ncpu)
{
        size_t size;
        pgcnt_t pp_per_mb = (1024 * 1024) / PAGESIZE;

        size = MAX(128, MIN(npg / pp_per_mb, 2 * ncpu * ncpu));
        size += (1 << (highbit(size) - 1)) - 1;
        return (highbit(size) - 1);
}

/*
 * At present we only use page ownership to aid debugging, so it's
 * OK if the owner field isn't exact.  In the 32-bit world two thread ids
 * can map to the same owner because we just 'or' in 0x80000000 and
 * then clear the second highest bit, so that (for example) 0x2faced00
 * and 0xafaced00 both map to 0xafaced00.
 * In the 64-bit world, p_selock may not be large enough to hold a full
 * thread pointer.
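 * (For example, with illustrative values only: a 64-bit curthread of
 * 0xfffffe8012345600 would be recorded by SE_WRITER below as the owner
 * value 0x92345600.)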
 * If we ever need precise ownership (e.g. if we implement
 * priority inheritance for page locks) then p_selock should become a
 * uintptr_t and SE_WRITER should be -((uintptr_t)curthread >> 2).
 */
#define SE_WRITER       (((selock_t)(ulong_t)curthread | INT_MIN) & ~SE_EWANTED)
#define SE_READER       1

/*
 * A page that is deleted must be marked as such using the
 * page_lock_delete() function.  The page must be exclusively locked.
 * The SE_DELETED marker is put in p_selock when this function is called.
 * SE_DELETED must be distinct from any SE_WRITER value.
 */
#define SE_DELETED      (1 | INT_MIN)

#ifdef VM_STATS
uint_t vph_kvp_count;
uint_t vph_swapfsvp_count;
uint_t vph_other;
#endif /* VM_STATS */

#ifdef VM_STATS
uint_t page_lock_count;
uint_t page_lock_miss;
uint_t page_lock_miss_lock;
uint_t page_lock_reclaim;
uint_t page_lock_bad_reclaim;
uint_t page_lock_same_page;
uint_t page_lock_upgrade;
uint_t page_lock_retired;
uint_t page_lock_upgrade_failed;
uint_t page_lock_deleted;

uint_t page_trylock_locked;
uint_t page_trylock_failed;
uint_t page_trylock_missed;

uint_t page_try_reclaim_upgrade;
#endif /* VM_STATS */

/*
 * Acquire the "shared/exclusive" lock on a page.
 *
 * Returns 1 on success and locks the page appropriately.
 *         0 on failure and does not lock the page.
 *
 * If `lock' is non-NULL, it will be dropped and reacquired in the
 * failure case.  This routine can block, and if it does
 * it will always return a failure since the page identity [vp, off]
 * or state may have changed.
 */

int
page_lock(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim)
{
        return (page_lock_es(pp, se, lock, reclaim, 0));
}

/*
 * With the addition of reader-writer lock semantics to page_lock_es,
 * callers wanting an exclusive (writer) lock may prevent shared-lock
 * (reader) starvation by setting the es parameter to SE_EXCL_WANTED.
 * In this case, when an exclusive lock cannot be acquired, p_selock's
 * SE_EWANTED bit is set.  Shared-lock (reader) requests are also denied
 * if the page is slated for retirement.
 *
 * The se and es parameters determine if the lock should be granted
 * based on the following decision table:
 *
 * Lock wanted   es flags        p_selock/SE_EWANTED   Action
 * -----------   --------------  -------------------   ---------
 * SE_EXCL       any [1][2]      unlocked/any          grant lock, clear SE_EWANTED
 * SE_EXCL       SE_EWANTED      any lock/any          deny, set SE_EWANTED
 * SE_EXCL       none            any lock/any          deny
 * SE_SHARED     n/a [2]         shared/0              grant
 * SE_SHARED     n/a [2]         unlocked/0            grant
 * SE_SHARED     n/a             shared/1              deny
 * SE_SHARED     n/a             unlocked/1            deny
 * SE_SHARED     n/a             excl/any              deny
 *
 * Notes:
 * [1] The code grants an exclusive lock to the caller and clears the bit
 *     SE_EWANTED whenever p_selock is unlocked, regardless of the SE_EWANTED
 *     bit's value.  This was deemed acceptable as we are not concerned about
 *     exclusive-lock starvation.  If this ever becomes an issue, a priority
 *     or FIFO mechanism should also be implemented.  Meantime, the thread
 *     that set SE_EWANTED should be prepared to catch this condition and
 *     reset it.
 *
 * [2] Retired pages may not be locked at any time, regardless of the
 *     disposition of se, unless the es parameter has the SE_RETIRED flag set.
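 *
 * For example (an illustrative pattern, not any specific caller): a thread
 * that must eventually gain exclusive access, even to a retired page, might
 * call
 *
 *        page_lock_es(pp, SE_EXCL, NULL, reclaim, SE_EXCL_WANTED | SE_RETIRED)
 *
 * and simply retry on failure, knowing that SE_EWANTED has been set on its
 * behalf.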
 *
 * Notes on values of "es":
 *
 *     es & 1: page_lookup_create will attempt page relocation
 *     es & SE_EXCL_WANTED: caller wants SE_EWANTED set (e.g. delete
 *         memory thread); this prevents reader-starvation of waiting
 *         writer thread(s) by giving priority to writers over readers.
 *     es & SE_RETIRED: caller wants to lock pages even if they are
 *         retired.  Default is to deny the lock if the page is retired.
 *
 * And yes, we know, the semantics of this function are too complicated.
 * It's on the list to be cleaned up.
 */
int
page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
{
        int retval;
        kmutex_t *pse = PAGE_SE_MUTEX(pp);
        int upgraded;
        int reclaim_it;

        ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);

        VM_STAT_ADD(page_lock_count);

        upgraded = 0;
        reclaim_it = 0;

        mutex_enter(pse);

        ASSERT(((es & SE_EXCL_WANTED) == 0) ||
            ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));

        if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
                mutex_exit(pse);
                VM_STAT_ADD(page_lock_retired);
                return (0);
        }

        if (se == SE_SHARED && es == 1 && pp->p_selock == 0) {
                se = SE_EXCL;
        }

        if ((reclaim == P_RECLAIM) && (PP_ISFREE(pp))) {

                reclaim_it = 1;
                if (se == SE_SHARED) {
                        /*
                         * This is an interesting situation.
                         *
                         * Remember that p_free can only change if
                         * p_selock < 0.
                         * p_free does not depend on our holding `pse'.
                         * And, since we hold `pse', p_selock can not change.
                         * So, if p_free changes on us, the page is already
                         * exclusively held, and we would fail to get p_selock
                         * regardless.
                         *
                         * We want to avoid getting the share
                         * lock on a free page that needs to be reclaimed.
                         * It is possible that some other thread has the share
                         * lock and has left the free page on the cache list.
                         * pvn_vplist_dirty() does this for brief periods.
                         * If the se_share is currently SE_EXCL, we will fail
                         * to acquire p_selock anyway.  Blocking is the
                         * right thing to do.
                         * If we need to reclaim this page, we must get
                         * exclusive access to it, force the upgrade now.
                         * Again, we will fail to acquire p_selock if the
                         * page is not free and block.
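                         * (If the reclaim below succeeds, the lock is
                         * dropped back to SE_SHARED via page_downgrade()
                         * at the bottom of this function.)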
                         */
                        upgraded = 1;
                        se = SE_EXCL;
                        VM_STAT_ADD(page_lock_upgrade);
                }
        }

        if (se == SE_EXCL) {
                if (!(es & SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) {
                        /*
                         * if the caller wants a writer lock (but did not
                         * specify exclusive access), and there is a pending
                         * writer that wants exclusive access, return failure
                         */
                        retval = 0;
                } else if ((pp->p_selock & ~SE_EWANTED) == 0) {
                        /* no reader/writer lock held */
                        /* this clears our setting of the SE_EWANTED bit */
                        pp->p_selock = SE_WRITER;
                        retval = 1;
                } else {
                        /* page is locked */
                        if (es & SE_EXCL_WANTED) {
                                /* set the SE_EWANTED bit */
                                pp->p_selock |= SE_EWANTED;
                        }
                        retval = 0;
                }
        } else {
                retval = 0;
                if (pp->p_selock >= 0) {
                        if ((pp->p_selock & SE_EWANTED) == 0) {
                                pp->p_selock += SE_READER;
                                retval = 1;
                        }
                }
        }

        if (retval == 0) {
                if ((pp->p_selock & ~SE_EWANTED) == SE_DELETED) {
                        VM_STAT_ADD(page_lock_deleted);
                        mutex_exit(pse);
                        return (retval);
                }

#ifdef VM_STATS
                VM_STAT_ADD(page_lock_miss);
                if (upgraded) {
                        VM_STAT_ADD(page_lock_upgrade_failed);
                }
#endif
                if (lock) {
                        VM_STAT_ADD(page_lock_miss_lock);
                        mutex_exit(lock);
                }

                /*
                 * Now, wait for the page to be unlocked and
                 * release the lock protecting p_cv and p_selock.
                 */
                cv_wait(&pp->p_cv, pse);
                mutex_exit(pse);

                /*
                 * The page identity may have changed while we were
                 * blocked.  If we are willing to depend on "pp"
                 * still pointing to a valid page structure (i.e.,
                 * assuming page structures are not dynamically allocated
                 * or freed), we could try to lock the page if its
                 * identity hasn't changed.
                 *
                 * This needs to be measured; since we come back from
                 * cv_wait holding pse (the expensive part of this
                 * operation), we might as well try the cheap part.
                 * Though we would also have to confirm that dropping
                 * `lock' did not cause any grief to the callers.
                 */
                if (lock) {
                        mutex_enter(lock);
                }
        } else {
                /*
                 * We have the page lock.
                 * If we needed to reclaim the page, and the page
                 * needed reclaiming (i.e., it was free), then we
                 * have the page exclusively locked.  We may need
                 * to downgrade the page.
                 */
                ASSERT((upgraded) ?
                    ((PP_ISFREE(pp)) && PAGE_EXCL(pp)) : 1);
                mutex_exit(pse);

                /*
                 * We now hold this page's lock, either shared or
                 * exclusive.  This will prevent its identity from changing.
                 * The page, however, may or may not be free.  If the caller
                 * requested, and it is free, go reclaim it from the
                 * free list.  If the page can't be reclaimed, return failure
                 * so that the caller can start all over again.
                 *
                 * NOTE: page_reclaim() releases the page lock (p_selock)
                 *       if it can't be reclaimed.
                 */
                if (reclaim_it) {
                        if (!page_reclaim(pp, lock)) {
                                VM_STAT_ADD(page_lock_bad_reclaim);
                                retval = 0;
                        } else {
                                VM_STAT_ADD(page_lock_reclaim);
                                if (upgraded) {
                                        page_downgrade(pp);
                                }
                        }
                }
        }
        return (retval);
}

/*
 * Clear the SE_EWANTED bit from p_selock.  This function allows
 * callers of page_lock_es and page_try_reclaim_lock to clear
 * their setting of this bit if they decide they no longer wish
 * to gain exclusive access to the page.  Currently only
 * delete_memory_thread uses this when the delete memory
 * operation is cancelled.
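 *
 * The expected pattern (an illustrative sketch) is therefore:
 *
 *        if (!page_lock_es(pp, SE_EXCL, NULL, reclaim, SE_EXCL_WANTED)) {
 *                if (<caller decides not to retry>)
 *                        page_lock_clr_exclwanted(pp);
 *        }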
 */
void
page_lock_clr_exclwanted(page_t *pp)
{
        kmutex_t *pse = PAGE_SE_MUTEX(pp);

        mutex_enter(pse);
        pp->p_selock &= ~SE_EWANTED;
        if (CV_HAS_WAITERS(&pp->p_cv))
                cv_broadcast(&pp->p_cv);
        mutex_exit(pse);
}

/*
 * Read the comments inside of page_lock_es() carefully.
 *
 * SE_EXCL callers specifying es == SE_EXCL_WANTED will cause the
 * SE_EWANTED bit of p_selock to be set when the lock cannot be obtained.
 * This is used by threads subject to reader-starvation (e.g. memory delete).
 *
 * When a thread using SE_EXCL_WANTED does not obtain the SE_EXCL lock,
 * it is expected that it will retry at a later time.  Threads that will
 * not retry the lock *must* call page_lock_clr_exclwanted to clear the
 * SE_EWANTED bit.  (When a thread using SE_EXCL_WANTED obtains the lock,
 * the bit is cleared.)
 */
int
page_try_reclaim_lock(page_t *pp, se_t se, int es)
{
        kmutex_t *pse = PAGE_SE_MUTEX(pp);
        selock_t old;

        mutex_enter(pse);

        old = pp->p_selock;

        ASSERT(((es & SE_EXCL_WANTED) == 0) ||
            ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));

        if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
                mutex_exit(pse);
                VM_STAT_ADD(page_trylock_failed);
                return (0);
        }

        if (se == SE_SHARED && es == 1 && old == 0) {
                se = SE_EXCL;
        }

        if (se == SE_SHARED) {
                if (!PP_ISFREE(pp)) {
                        if (old >= 0) {
                                /*
                                 * Readers are not allowed when excl wanted
                                 */
                                if ((old & SE_EWANTED) == 0) {
                                        pp->p_selock = old + SE_READER;
                                        mutex_exit(pse);
                                        return (1);
                                }
                        }
                        mutex_exit(pse);
                        return (0);
                }
                /*
                 * The page is free, so we really want SE_EXCL (below)
                 */
                VM_STAT_ADD(page_try_reclaim_upgrade);
        }

        /*
         * The caller wants a writer lock.  We try for it only if
         * SE_EWANTED is not set, or if the caller specified
         * SE_EXCL_WANTED.
         */
        if (!(old & SE_EWANTED) || (es & SE_EXCL_WANTED)) {
                if ((old & ~SE_EWANTED) == 0) {
                        /* no reader/writer lock held */
                        /* this clears out our setting of the SE_EWANTED bit */
                        pp->p_selock = SE_WRITER;
                        mutex_exit(pse);
                        return (1);
                }
        }
        if (es & SE_EXCL_WANTED) {
                /* page is locked, set the SE_EWANTED bit */
                pp->p_selock |= SE_EWANTED;
        }
        mutex_exit(pse);
        return (0);
}

/*
 * Acquire a page's "shared/exclusive" lock, but never block.
 * Returns 1 on success, 0 on failure.
 */
int
page_trylock(page_t *pp, se_t se)
{
        kmutex_t *pse = PAGE_SE_MUTEX(pp);

        mutex_enter(pse);
        if (pp->p_selock & SE_EWANTED || PP_RETIRED(pp) ||
            (se == SE_SHARED && PP_PR_NOSHARE(pp))) {
                /*
                 * Fail if a pending writer wants exclusive access, if the
                 * page has been retired, or if a shared lock is requested
                 * on a page that is slated for retirement.
                 */
                mutex_exit(pse);
                VM_STAT_ADD(page_trylock_failed);
                return (0);
        }

        if (se == SE_EXCL) {
                if (pp->p_selock == 0) {
                        pp->p_selock = SE_WRITER;
                        mutex_exit(pse);
                        return (1);
                }
        } else {
                if (pp->p_selock >= 0) {
                        pp->p_selock += SE_READER;
                        mutex_exit(pse);
                        return (1);
                }
        }
        mutex_exit(pse);
        return (0);
}

/*
 * Variant of page_unlock() specifically for the page freelist
 * code.  The mere existence of this code is a vile hack resulting
 * from the backwards locking order of the page freelist manager;
 * please don't call it.
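 * (It is identical to page_unlock() except that it never hands the
 * page off to page_unlock_capture().)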
 */
void
page_unlock_nocapture(page_t *pp)
{
        kmutex_t *pse = PAGE_SE_MUTEX(pp);
        selock_t old;

        mutex_enter(pse);

        old = pp->p_selock;
        if ((old & ~SE_EWANTED) == SE_READER) {
                pp->p_selock = old & ~SE_READER;
                if (CV_HAS_WAITERS(&pp->p_cv))
                        cv_broadcast(&pp->p_cv);
        } else if ((old & ~SE_EWANTED) == SE_DELETED) {
                panic("page_unlock_nocapture: page %p is deleted", (void *)pp);
        } else if (old < 0) {
                pp->p_selock &= SE_EWANTED;
                if (CV_HAS_WAITERS(&pp->p_cv))
                        cv_broadcast(&pp->p_cv);
        } else if ((old & ~SE_EWANTED) > SE_READER) {
                pp->p_selock = old - SE_READER;
        } else {
                panic("page_unlock_nocapture: page %p is not locked",
                    (void *)pp);
        }

        mutex_exit(pse);
}

/*
 * Release the page's "shared/exclusive" lock and wake up anyone
 * who might be waiting for it.
 */
void
page_unlock(page_t *pp)
{
        kmutex_t *pse = PAGE_SE_MUTEX(pp);
        selock_t old;

        mutex_enter(pse);

        old = pp->p_selock;
        if ((old & ~SE_EWANTED) == SE_READER) {
                pp->p_selock = old & ~SE_READER;
                if (CV_HAS_WAITERS(&pp->p_cv))
                        cv_broadcast(&pp->p_cv);
        } else if ((old & ~SE_EWANTED) == SE_DELETED) {
                panic("page_unlock: page %p is deleted", (void *)pp);
        } else if (old < 0) {
                pp->p_selock &= SE_EWANTED;
                if (CV_HAS_WAITERS(&pp->p_cv))
                        cv_broadcast(&pp->p_cv);
        } else if ((old & ~SE_EWANTED) > SE_READER) {
                pp->p_selock = old - SE_READER;
        } else {
                panic("page_unlock: page %p is not locked", (void *)pp);
        }

        if (pp->p_selock == 0) {
                /*
                 * If the T_CAPTURING bit is set, that means that we should
                 * not try and capture the page again as we could recurse
                 * which could lead to a stack overflow panic or spending a
                 * relatively long time in the kernel making no progress.
                 */
                if ((pp->p_toxic & PR_CAPTURE) &&
                    !(curthread->t_flag & T_CAPTURING) &&
                    !PP_RETIRED(pp)) {
                        pp->p_selock = SE_WRITER;
                        mutex_exit(pse);
                        page_unlock_capture(pp);
                } else {
                        mutex_exit(pse);
                }
        } else {
                mutex_exit(pse);
        }
}

/*
 * Try to upgrade the lock on the page from a "shared" to an
 * "exclusive" lock.  Since this upgrade operation is done while
 * holding the mutex protecting this page, no one else can acquire this page's
 * lock and change the page.  Thus, it is safe to drop the "shared"
 * lock and attempt to acquire the "exclusive" lock.
 *
 * Returns 1 on success, 0 on failure.
 */
int
page_tryupgrade(page_t *pp)
{
        kmutex_t *pse = PAGE_SE_MUTEX(pp);

        mutex_enter(pse);
        if (!(pp->p_selock & SE_EWANTED)) {
                /* no threads want exclusive access, try upgrade */
                if (pp->p_selock == SE_READER) {
                        /* convert to exclusive lock */
                        pp->p_selock = SE_WRITER;
                        mutex_exit(pse);
                        return (1);
                }
        }
        mutex_exit(pse);
        return (0);
}

/*
 * Downgrade the "exclusive" lock on the page to a "shared" lock
 * while holding the mutex protecting this page's p_selock field.
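 * Any SE_EWANTED indication left by a waiting exclusive locker is
 * preserved across the downgrade.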
 */
void
page_downgrade(page_t *pp)
{
        kmutex_t *pse = PAGE_SE_MUTEX(pp);
        int excl_waiting;

        ASSERT((pp->p_selock & ~SE_EWANTED) != SE_DELETED);
        ASSERT(PAGE_EXCL(pp));

        mutex_enter(pse);
        excl_waiting = pp->p_selock & SE_EWANTED;
        pp->p_selock = SE_READER | excl_waiting;
        if (CV_HAS_WAITERS(&pp->p_cv))
                cv_broadcast(&pp->p_cv);
        mutex_exit(pse);
}

void
page_lock_delete(page_t *pp)
{
        kmutex_t *pse = PAGE_SE_MUTEX(pp);

        ASSERT(PAGE_EXCL(pp));
        ASSERT(pp->p_vnode == NULL);
        ASSERT(pp->p_offset == (u_offset_t)-1);
        ASSERT(!PP_ISFREE(pp));

        mutex_enter(pse);
        pp->p_selock = SE_DELETED;
        if (CV_HAS_WAITERS(&pp->p_cv))
                cv_broadcast(&pp->p_cv);
        mutex_exit(pse);
}

int
page_deleted(page_t *pp)
{
        return (pp->p_selock == SE_DELETED);
}

/*
 * Implement the io lock for pages
 */
void
page_iolock_init(page_t *pp)
{
        pp->p_iolock_state = 0;
        cv_init(&pp->p_io_cv, NULL, CV_DEFAULT, NULL);
}

/*
 * Acquire the i/o lock on a page.
 */
void
page_io_lock(page_t *pp)
{
        kmutex_t *pio;

        pio = PAGE_IO_MUTEX(pp);
        mutex_enter(pio);
        while (pp->p_iolock_state & PAGE_IO_INUSE) {
                cv_wait(&(pp->p_io_cv), pio);
        }
        pp->p_iolock_state |= PAGE_IO_INUSE;
        mutex_exit(pio);
}

/*
 * Release the i/o lock on a page.
 */
void
page_io_unlock(page_t *pp)
{
        kmutex_t *pio;

        pio = PAGE_IO_MUTEX(pp);
        mutex_enter(pio);
        cv_broadcast(&pp->p_io_cv);
        pp->p_iolock_state &= ~PAGE_IO_INUSE;
        mutex_exit(pio);
}

/*
 * Try to acquire the i/o lock on a page without blocking.
 * Returns 1 on success, 0 on failure.
 */
int
page_io_trylock(page_t *pp)
{
        kmutex_t *pio;

        if (pp->p_iolock_state & PAGE_IO_INUSE)
                return (0);

        pio = PAGE_IO_MUTEX(pp);
        mutex_enter(pio);

        if (pp->p_iolock_state & PAGE_IO_INUSE) {
                mutex_exit(pio);
                return (0);
        }
        pp->p_iolock_state |= PAGE_IO_INUSE;
        mutex_exit(pio);

        return (1);
}

/*
 * Wait until the i/o lock is not held.
 */
void
page_io_wait(page_t *pp)
{
        kmutex_t *pio;

        pio = PAGE_IO_MUTEX(pp);
        mutex_enter(pio);
        while (pp->p_iolock_state & PAGE_IO_INUSE) {
                cv_wait(&(pp->p_io_cv), pio);
        }
        mutex_exit(pio);
}

/*
 * Return non-zero if the i/o lock on the page is held, 0 otherwise.
 */
int
page_io_locked(page_t *pp)
{
        return (pp->p_iolock_state & PAGE_IO_INUSE);
}

/*
 * Assert that the i/o lock on a page is held.
 * Returns non-zero if it is held, 0 otherwise.
 */
int
page_iolock_assert(page_t *pp)
{
        return (page_io_locked(pp));
}

/*
 * Wrapper exported to kernel routines that are built
 * platform-independent (the macro is platform-dependent;
 * the size of vph_mutex[] is based on NCPU).
 *
 * Note that you can do stress testing on this by setting the
 * variable page_vnode_mutex_stress to something other than
 * zero in a DEBUG kernel in a debugger after loading the kernel.
 * Setting it after the kernel is running may not work correctly.
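 * (One way to do that, assuming kmdb was loaded at boot, is to write the
 * variable from the debugger prompt, e.g. "page_vnode_mutex_stress/W 1".)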
 */
#ifdef DEBUG
static int page_vnode_mutex_stress = 0;
#endif

kmutex_t *
page_vnode_mutex(vnode_t *vp)
{
        if (vp == &kvp)
                return (&vph_mutex[VPH_TABLE_SIZE + 0]);

        if (vp == &zvp)
                return (&vph_mutex[VPH_TABLE_SIZE + 1]);
#ifdef DEBUG
        if (page_vnode_mutex_stress != 0)
                return (&vph_mutex[0]);
#endif

        return (&vph_mutex[VP_HASH_FUNC(vp)]);
}

kmutex_t *
page_se_mutex(page_t *pp)
{
        return (PAGE_SE_MUTEX(pp));
}

#ifdef VM_STATS
uint_t pszclck_stat[4];
#endif
/*
 * Find, take and return a mutex held by hat_page_demote().
 * Called by page_demote_vp_pages() before the hat_page_demote() call and by
 * routines that want to block hat_page_demote() but can't do it
 * via locking all constituent pages.
 *
 * Return NULL if p_szc is 0.
 *
 * It should only be used for pages that can be demoted by hat_page_demote(),
 * i.e. non-swapfs file system pages.  The logic here is lifted from
 * sfmmu_mlspl_enter() except there's no need to worry about p_szc increase
 * since the page is locked and not free.
 *
 * Hash of the root page is used to find the lock.
 * To find the root in the presence of hat_page_demote() changing the location
 * of the root, this routine relies on the fact that hat_page_demote() changes
 * the root last.
 *
 * If NULL is returned, pp's p_szc is guaranteed to be 0.  If non-NULL is
 * returned, pp's p_szc may be any value.
 */
kmutex_t *
page_szc_lock(page_t *pp)
{
        kmutex_t *mtx;
        page_t *rootpp;
        uint_t szc;
        uint_t rszc;
        uint_t pszc = pp->p_szc;

        ASSERT(pp != NULL);
        ASSERT(PAGE_LOCKED(pp));
        ASSERT(!PP_ISFREE(pp));
        ASSERT(pp->p_vnode != NULL);
        ASSERT(!IS_SWAPFSVP(pp->p_vnode));
        ASSERT(!PP_ISKAS(pp));

again:
        if (pszc == 0) {
                VM_STAT_ADD(pszclck_stat[0]);
                return (NULL);
        }

        /* The lock lives in the root page */

        rootpp = PP_GROUPLEADER(pp, pszc);
        mtx = PAGE_SZC_MUTEX(rootpp);
        mutex_enter(mtx);

        /*
         * Since p_szc can only decrease, if pp == rootpp then rootpp will
         * always be the same, i.e. we have the right root regardless of
         * rootpp->p_szc.
         * If the location of pp's root didn't change after we took the
         * lock, we have the right root.  Return the mutex hashed off it.
         */
        if (pp == rootpp || (rszc = rootpp->p_szc) == pszc) {
                VM_STAT_ADD(pszclck_stat[1]);
                return (mtx);
        }

        /*
         * The root location changed because the page got demoted.
         * Locate the new root.
         */
        if (rszc < pszc) {
                szc = pp->p_szc;
                ASSERT(szc < pszc);
                mutex_exit(mtx);
                pszc = szc;
                VM_STAT_ADD(pszclck_stat[2]);
                goto again;
        }

        VM_STAT_ADD(pszclck_stat[3]);
        /*
         * The current hat_page_demote() is not done yet.
         * Wait for it to finish.
         */
        mutex_exit(mtx);
        rootpp = PP_GROUPLEADER(rootpp, rszc);
        mtx = PAGE_SZC_MUTEX(rootpp);
        mutex_enter(mtx);
        mutex_exit(mtx);
        ASSERT(rootpp->p_szc < rszc);
        goto again;
}

int
page_szc_lock_assert(page_t *pp)
{
        page_t *rootpp = PP_PAGEROOT(pp);
        kmutex_t *mtx = PAGE_SZC_MUTEX(rootpp);

        return (MUTEX_HELD(mtx));
}

/*
 * memseg locking
 */
static krwlock_t memsegslock;

/*
 * memlist (phys_install, phys_avail) locking.
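 * Readers of these lists take memlist_read_lock(); updaters take
 * memlist_write_lock().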
 */
static krwlock_t memlists_lock;

int
memsegs_trylock(int writer)
{
        return (rw_tryenter(&memsegslock, writer ? RW_WRITER : RW_READER));
}

void
memsegs_lock(int writer)
{
        rw_enter(&memsegslock, writer ? RW_WRITER : RW_READER);
}

/*ARGSUSED*/
void
memsegs_unlock(int writer)
{
        rw_exit(&memsegslock);
}

int
memsegs_lock_held(void)
{
        return (RW_LOCK_HELD(&memsegslock));
}

void
memlist_read_lock(void)
{
        rw_enter(&memlists_lock, RW_READER);
}

void
memlist_read_unlock(void)
{
        rw_exit(&memlists_lock);
}

void
memlist_write_lock(void)
{
        rw_enter(&memlists_lock, RW_WRITER);
}

void
memlist_write_unlock(void)
{
        rw_exit(&memlists_lock);
}
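/*
 * Illustrative usage (a sketch, not taken from any specific caller): code
 * that walks phys_install or phys_avail is expected to bracket the walk
 * with the memlist read lock, e.g.
 *
 *        struct memlist *ml;
 *        pgcnt_t npages = 0;
 *
 *        memlist_read_lock();
 *        for (ml = phys_install; ml != NULL; ml = ml->ml_next)
 *                npages += btop(ml->ml_size);
 *        memlist_read_unlock();
 */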