/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * VM - page locking primitives
 */
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/vtrace.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/vnode.h>
#include <sys/bitmap.h>
#include <sys/lockstat.h>
#include <sys/condvar_impl.h>
#include <vm/page.h>
#include <vm/seg_enum.h>
#include <vm/vm_dep.h>

/*
 * This global mutex is for logical page locking.
 * The following fields in the page structure are protected
 * by this lock:
 *
 *	p_lckcnt
 *	p_cowcnt
 */
kmutex_t page_llock;

/*
 * This is a global lock for the logical page free list.  The
 * logical free list, in this implementation, is maintained as two
 * separate physical lists - the cache list and the free list.
 */
kmutex_t page_freelock;

/*
 * The hash table, page_hash[], the p_selock fields, and the
 * list of pages associated with vnodes are protected by arrays of mutexes.
 *
 * Unless the hashes are changed radically, the table sizes must be
 * a power of two.  Also, we typically need more mutexes for the
 * vnodes since these locks are occasionally held for long periods.
 * And since there seem to be two special vnodes (kvp and swapvp),
 * we make room for private mutexes for them.
 *
 * The pse_mutex[] array holds the mutexes to protect the p_selock
 * fields of all page_t structures.
 *
 * PAGE_SE_MUTEX(pp) returns the address of the appropriate mutex
 * when given a pointer to a page_t.
 *
 * PSE_TABLE_SIZE must be a power of two.  One could argue that we
 * should go to the trouble of setting it up at run time and base it
 * on memory size rather than the number of compile time CPUs.
 *
 * XX64	We should be using physmem size to calculate PSE_TABLE_SIZE,
 *	PSE_SHIFT, PIO_SHIFT.
 *
 *	These might break in 64 bit world.
 */
#define	PSE_SHIFT	7		/* log2(PSE_TABLE_SIZE) */

#define	PSE_TABLE_SIZE	128		/* number of mutexes to have */

#define	PIO_SHIFT	PSE_SHIFT	/* next power of 2 bigger than page_t */
#define	PIO_TABLE_SIZE	PSE_TABLE_SIZE	/* number of io mutexes to have */

pad_mutex_t	ph_mutex[PH_TABLE_SIZE];
pad_mutex_t	pse_mutex[PSE_TABLE_SIZE];
kmutex_t	pio_mutex[PIO_TABLE_SIZE];

#define	PAGE_SE_MUTEX(pp) \
	&pse_mutex[((((uintptr_t)(pp) >> PSE_SHIFT) ^ \
	    ((uintptr_t)(pp) >> (PSE_SHIFT << 1))) & \
	    (PSE_TABLE_SIZE - 1))].pad_mutex

#define	PAGE_IO_MUTEX(pp) \
	&pio_mutex[(((uintptr_t)pp) >> PIO_SHIFT) & (PIO_TABLE_SIZE - 1)]
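/*
 * Illustrative sketch, not part of the original file: how the routines
 * below use PAGE_SE_MUTEX().  The macro XORs two shifted copies of the
 * page_t address so that page_t's that are adjacent in memory spread
 * across different pad_mutex_t slots:
 *
 *	kmutex_t *pse = PAGE_SE_MUTEX(pp);
 *
 *	mutex_enter(pse);
 *	... pp->p_selock may now be examined and updated ...
 *	mutex_exit(pse);
 */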
#define	PSZC_MTX_TABLE_SIZE	128
#define	PSZC_MTX_TABLE_SHIFT	7

static pad_mutex_t	pszc_mutex[PSZC_MTX_TABLE_SIZE];

#define	PAGE_SZC_MUTEX(_pp) \
	&pszc_mutex[((((uintptr_t)(_pp) >> PSZC_MTX_TABLE_SHIFT) ^ \
	    ((uintptr_t)(_pp) >> (PSZC_MTX_TABLE_SHIFT << 1)) ^ \
	    ((uintptr_t)(_pp) >> (3 * PSZC_MTX_TABLE_SHIFT))) & \
	    (PSZC_MTX_TABLE_SIZE - 1))].pad_mutex

/*
 * The vph_mutex[] array holds the mutexes to protect the vnode chains,
 * (i.e., the list of pages anchored by v_pages and connected via p_vpprev
 * and p_vpnext).
 *
 * The page_vnode_mutex(vp) function returns the address of the appropriate
 * mutex from this array given a pointer to a vnode.  It is complicated
 * by the fact that the kernel's vnode and the swapfs vnode are referenced
 * frequently enough to warrant their own mutexes.
 *
 * The VP_HASH_FUNC returns the index into the vph_mutex array given
 * an address of a vnode.
 */

/*
 * XX64	VPH_TABLE_SIZE and VP_HASH_FUNC might break in 64 bit world.
 *	Need to review again.
 */
#if defined(_LP64)
#define	VPH_TABLE_SIZE	(1 << (VP_SHIFT + 3))
#else	/* 32 bits */
#define	VPH_TABLE_SIZE	(2 << VP_SHIFT)
#endif

#define	VP_HASH_FUNC(vp) \
	((((uintptr_t)(vp) >> 6) + \
	    ((uintptr_t)(vp) >> 8) + \
	    ((uintptr_t)(vp) >> 10) + \
	    ((uintptr_t)(vp) >> 12)) \
	    & (VPH_TABLE_SIZE - 1))

extern	struct vnode kvp;

/*
 * Two slots after VPH_TABLE_SIZE are reserved in vph_mutex for kernel vnodes.
 * The lock for kvp is VPH_TABLE_SIZE + 0, and the lock for zvp is
 * VPH_TABLE_SIZE + 1.
 */

kmutex_t	vph_mutex[VPH_TABLE_SIZE + 2];

/*
 * Initialize the locks used by the Virtual Memory Management system.
 */
void
page_lock_init()
{
}

/*
 * At present we only use page ownership to aid debugging, so it's
 * OK if the owner field isn't exact.  In the 32-bit world two thread ids
 * can map to the same owner because we just 'or' in 0x80000000 and
 * then clear the second highest bit, so that (for example) 0x2faced00
 * and 0xafaced00 both map to 0xafaced00.
 * In the 64-bit world, p_selock may not be large enough to hold a full
 * thread pointer.  If we ever need precise ownership (e.g. if we implement
 * priority inheritance for page locks) then p_selock should become a
 * uintptr_t and SE_WRITER should be -((uintptr_t)curthread >> 2).
 */
#define	SE_WRITER	(((selock_t)(ulong_t)curthread | INT_MIN) & ~SE_EWANTED)
#define	SE_READER	1

/*
 * A page that is deleted must be marked as such using the
 * page_lock_delete() function.  The page must be exclusively locked.
 * The SE_DELETED marker is put in p_selock when this function is called.
 * SE_DELETED must be distinct from any SE_WRITER value.
 */
#define	SE_DELETED	(1 | INT_MIN)
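/*
 * Illustrative summary, not part of the original file, of the p_selock
 * states the definitions above produce:
 *
 *	p_selock == 0			unlocked
 *	p_selock == n * SE_READER	held shared by n readers (n > 0)
 *	p_selock == SE_WRITER		held exclusively; the negative value
 *					also encodes the owning thread for
 *					debugging
 *	p_selock == SE_DELETED		page has been deleted
 *	p_selock & SE_EWANTED		a writer is waiting, so new shared
 *					requests are denied
 */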
#ifdef VM_STATS
uint_t	vph_kvp_count;
uint_t	vph_swapfsvp_count;
uint_t	vph_other;
#endif /* VM_STATS */

#ifdef VM_STATS
uint_t	page_lock_count;
uint_t	page_lock_miss;
uint_t	page_lock_miss_lock;
uint_t	page_lock_reclaim;
uint_t	page_lock_bad_reclaim;
uint_t	page_lock_same_page;
uint_t	page_lock_upgrade;
uint_t	page_lock_retired;
uint_t	page_lock_upgrade_failed;
uint_t	page_lock_deleted;

uint_t	page_trylock_locked;
uint_t	page_trylock_failed;
uint_t	page_trylock_missed;

uint_t	page_try_reclaim_upgrade;
#endif /* VM_STATS */

/*
 * Acquire the "shared/exclusive" lock on a page.
 *
 * Returns 1 on success and locks the page appropriately.
 *	   0 on failure and does not lock the page.
 *
 * If `lock' is non-NULL, it will be dropped and reacquired in the
 * failure case.  This routine can block, and if it does
 * it will always return a failure since the page identity [vp, off]
 * or state may have changed.
 */

int
page_lock(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim)
{
	return (page_lock_es(pp, se, lock, reclaim, 0));
}
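/*
 * Illustrative sketch, not part of the original file: the common
 * shared-lock pattern using page_lock()/page_unlock() when no other
 * mutex needs to be dropped on failure:
 *
 *	if (page_lock(pp, SE_SHARED, (kmutex_t *)NULL, P_NO_RECLAIM)) {
 *		... read the page's identity and state ...
 *		page_unlock(pp);
 *	}
 */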
/*
 * With the addition of reader-writer lock semantics to page_lock_es,
 * callers wanting an exclusive (writer) lock may prevent shared-lock
 * (reader) starvation by setting the es parameter to SE_EXCL_WANTED.
 * In this case, when an exclusive lock cannot be acquired, p_selock's
 * SE_EWANTED bit is set.  Shared-lock (reader) requests are also denied
 * if the page is slated for retirement.
 *
 * The se and es parameters determine if the lock should be granted
 * based on the following decision table:
 *
 * Lock wanted	es flags	p_selock/SE_EWANTED	Action
 * -----------	--------------	-------------------	---------
 * SE_EXCL	any [1][2]	unlocked/any		grant lock, clear SE_EWANTED
 * SE_EXCL	SE_EWANTED	any lock/any		deny, set SE_EWANTED
 * SE_EXCL	none		any lock/any		deny
 * SE_SHARED	n/a [2]		shared/0		grant
 * SE_SHARED	n/a [2]		unlocked/0		grant
 * SE_SHARED	n/a		shared/1		deny
 * SE_SHARED	n/a		unlocked/1		deny
 * SE_SHARED	n/a		excl/any		deny
 *
 * Notes:
 * [1] The code grants an exclusive lock to the caller and clears the bit
 *     SE_EWANTED whenever p_selock is unlocked, regardless of the SE_EWANTED
 *     bit's value.  This was deemed acceptable as we are not concerned about
 *     exclusive-lock starvation.  If this ever becomes an issue, a priority or
 *     fifo mechanism should also be implemented.  Meantime, the thread that
 *     set SE_EWANTED should be prepared to catch this condition and reset it.
 *
 * [2] Retired pages may not be locked at any time, regardless of the
 *     disposition of se, unless the es parameter has SE_RETIRED flag set.
 *
 * Notes on values of "es":
 *
 *   es & 1: page_lookup_create will attempt page relocation
 *   es & SE_EXCL_WANTED: caller wants SE_EWANTED set (e.g. delete
 *       memory thread); this prevents reader-starvation of waiting
 *       writer thread(s) by giving priority to writers over readers.
 *   es & SE_RETIRED: caller wants to lock pages even if they are
 *       retired.  Default is to deny the lock if the page is retired.
 *
 * And yes, we know, the semantics of this function are too complicated.
 * It's on the list to be cleaned up.
 */
int
page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
{
	int		retval;
	kmutex_t	*pse = PAGE_SE_MUTEX(pp);
	int		upgraded;
	int		reclaim_it;

	ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);

	VM_STAT_ADD(page_lock_count);

	upgraded = 0;
	reclaim_it = 0;

	mutex_enter(pse);

	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));

	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
		mutex_exit(pse);
		VM_STAT_ADD(page_lock_retired);
		return (0);
	}

	if (se == SE_SHARED && es == 1 && pp->p_selock == 0) {
		se = SE_EXCL;
	}

	if ((reclaim == P_RECLAIM) && (PP_ISFREE(pp))) {

		reclaim_it = 1;
		if (se == SE_SHARED) {
			/*
			 * This is an interesting situation.
			 *
			 * Remember that p_free can only change if
			 * p_selock < 0.
			 * p_free does not depend on our holding `pse'.
			 * And, since we hold `pse', p_selock can not change.
			 * So, if p_free changes on us, the page is already
			 * exclusively held, and we would fail to get p_selock
			 * regardless.
			 *
			 * We want to avoid getting the share
			 * lock on a free page that needs to be reclaimed.
			 * It is possible that some other thread has the share
			 * lock and has left the free page on the cache list.
			 * pvn_vplist_dirty() does this for brief periods.
			 * If the se_share is currently SE_EXCL, we will fail
			 * to acquire p_selock anyway.  Blocking is the
			 * right thing to do.
			 * If we need to reclaim this page, we must get
			 * exclusive access to it, force the upgrade now.
			 * Again, we will fail to acquire p_selock if the
			 * page is not free and block.
			 */
			upgraded = 1;
			se = SE_EXCL;
			VM_STAT_ADD(page_lock_upgrade);
		}
	}

	if (se == SE_EXCL) {
		if (!(es & SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) {
			/*
			 * if the caller wants a writer lock (but did not
			 * specify exclusive access), and there is a pending
			 * writer that wants exclusive access, return failure
			 */
			retval = 0;
		} else if ((pp->p_selock & ~SE_EWANTED) == 0) {
			/* no reader/writer lock held */
			THREAD_KPRI_REQUEST();
			/* this clears our setting of the SE_EWANTED bit */
			pp->p_selock = SE_WRITER;
			retval = 1;
		} else {
			/* page is locked */
			if (es & SE_EXCL_WANTED) {
				/* set the SE_EWANTED bit */
				pp->p_selock |= SE_EWANTED;
			}
			retval = 0;
		}
	} else {
		retval = 0;
		if (pp->p_selock >= 0) {
			if ((pp->p_selock & SE_EWANTED) == 0) {
				pp->p_selock += SE_READER;
				retval = 1;
			}
		}
	}

	if (retval == 0) {
		if ((pp->p_selock & ~SE_EWANTED) == SE_DELETED) {
			VM_STAT_ADD(page_lock_deleted);
			mutex_exit(pse);
			return (retval);
		}

#ifdef VM_STATS
		VM_STAT_ADD(page_lock_miss);
		if (upgraded) {
			VM_STAT_ADD(page_lock_upgrade_failed);
		}
#endif
		if (lock) {
			VM_STAT_ADD(page_lock_miss_lock);
			mutex_exit(lock);
		}

		/*
		 * Now, wait for the page to be unlocked and
		 * release the lock protecting p_cv and p_selock.
		 */
		cv_wait(&pp->p_cv, pse);
		mutex_exit(pse);

		/*
		 * The page identity may have changed while we were
		 * blocked.  If we are willing to depend on "pp"
		 * still pointing to a valid page structure (i.e.,
		 * assuming page structures are not dynamically allocated
		 * or freed), we could try to lock the page if its
		 * identity hasn't changed.
		 *
		 * This needs to be measured, since we come back from
		 * cv_wait holding pse (the expensive part of this
		 * operation) we might as well try the cheap part.
		 * Though we would also have to confirm that dropping
		 * `lock' did not cause any grief to the callers.
		 */
		if (lock) {
			mutex_enter(lock);
		}
	} else {
		/*
		 * We have the page lock.
		 * If we needed to reclaim the page, and the page
		 * needed reclaiming (i.e., it was free), then we
		 * have the page exclusively locked.  We may need
		 * to downgrade the page.
		 */
		ASSERT((upgraded) ?
		    ((PP_ISFREE(pp)) && PAGE_EXCL(pp)) : 1);
		mutex_exit(pse);

		/*
		 * We now hold this page's lock, either shared or
		 * exclusive.  This will prevent its identity from changing.
		 * The page, however, may or may not be free.  If the caller
		 * requested, and it is free, go reclaim it from the
		 * free list.  If the page can't be reclaimed, return failure
		 * so that the caller can start all over again.
		 *
		 * NOTE: page_reclaim() releases the page lock (p_selock)
		 * if it can't be reclaimed.
		 */
		if (reclaim_it) {
			if (!page_reclaim(pp, lock)) {
				VM_STAT_ADD(page_lock_bad_reclaim);
				retval = 0;
			} else {
				VM_STAT_ADD(page_lock_reclaim);
				if (upgraded) {
					page_downgrade(pp);
				}
			}
		}
	}
	return (retval);
}
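/*
 * Illustrative sketch, not part of the original file: the lookup/lock
 * pattern the `lock' argument supports.  The caller holds a hash-chain
 * mutex (`phm' here is hypothetical) while identifying the page; on
 * failure page_lock() drops and reacquires it, so the identity must be
 * revalidated:
 *
 *	top:
 *		mutex_enter(phm);
 *		pp = ...find page [vp, off] on the hash chain...;
 *		if (pp != NULL && !page_lock(pp, SE_SHARED, phm, P_RECLAIM)) {
 *			... phm was dropped and reacquired, so pp may
 *			    now be stale; drop phm and search again ...
 *			mutex_exit(phm);
 *			goto top;
 *		}
 *		mutex_exit(phm);
 */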
/*
 * Clear the SE_EWANTED bit from p_selock.  This function allows
 * callers of page_lock_es and page_try_reclaim_lock to clear
 * their setting of this bit if they decide they no longer wish
 * to gain exclusive access to the page.  Currently only
 * delete_memory_thread uses this when the delete memory
 * operation is cancelled.
 */
void
page_lock_clr_exclwanted(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	mutex_enter(pse);
	pp->p_selock &= ~SE_EWANTED;
	if (CV_HAS_WAITERS(&pp->p_cv))
		cv_broadcast(&pp->p_cv);
	mutex_exit(pse);
}

/*
 * Read the comments inside of page_lock_es() carefully.
 *
 * SE_EXCL callers specifying es == SE_EXCL_WANTED will cause the
 * SE_EWANTED bit of p_selock to be set when the lock cannot be obtained.
 * This is used by threads subject to reader-starvation (e.g. memory delete).
 *
 * When a thread using SE_EXCL_WANTED does not obtain the SE_EXCL lock,
 * it is expected that it will retry at a later time.  Threads that will
 * not retry the lock *must* call page_lock_clr_exclwanted to clear the
 * SE_EWANTED bit.  (When a thread using SE_EXCL_WANTED obtains the lock,
 * the bit is cleared.)
 */
int
page_try_reclaim_lock(page_t *pp, se_t se, int es)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);

	old = pp->p_selock;

	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));

	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
		mutex_exit(pse);
		VM_STAT_ADD(page_trylock_failed);
		return (0);
	}

	if (se == SE_SHARED && es == 1 && old == 0) {
		se = SE_EXCL;
	}

	if (se == SE_SHARED) {
		if (!PP_ISFREE(pp)) {
			if (old >= 0) {
				/*
				 * Readers are not allowed when excl wanted
				 */
				if ((old & SE_EWANTED) == 0) {
					pp->p_selock = old + SE_READER;
					mutex_exit(pse);
					return (1);
				}
			}
			mutex_exit(pse);
			return (0);
		}
		/*
		 * The page is free, so we really want SE_EXCL (below)
		 */
		VM_STAT_ADD(page_try_reclaim_upgrade);
	}

	/*
	 * The caller wants a writer lock.  We try for it only if
	 * SE_EWANTED is not set, or if the caller specified
	 * SE_EXCL_WANTED.
	 */
	if (!(old & SE_EWANTED) || (es & SE_EXCL_WANTED)) {
		if ((old & ~SE_EWANTED) == 0) {
			/* no reader/writer lock held */
			THREAD_KPRI_REQUEST();
			/* this clears out our setting of the SE_EWANTED bit */
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			return (1);
		}
	}
	if (es & SE_EXCL_WANTED) {
		/* page is locked, set the SE_EWANTED bit */
		pp->p_selock |= SE_EWANTED;
	}
	mutex_exit(pse);
	return (0);
}
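/*
 * Illustrative sketch, not part of the original file: a writer that is
 * subject to reader starvation retries with SE_EXCL_WANTED, and clears
 * SE_EWANTED if it gives up, as the comment above requires.  The
 * `operation_cancelled' flag is hypothetical:
 *
 *	while (!page_try_reclaim_lock(pp, SE_EXCL, SE_EXCL_WANTED)) {
 *		if (operation_cancelled) {
 *			page_lock_clr_exclwanted(pp);
 *			return;
 *		}
 *		delay(1);
 *	}
 *	... page is now held SE_EXCL; SE_EWANTED was cleared on success ...
 */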
/*
 * Acquire a page's "shared/exclusive" lock, but never block.
 * Returns 1 on success, 0 on failure.
 */
int
page_trylock(page_t *pp, se_t se)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	mutex_enter(pse);
	if (pp->p_selock & SE_EWANTED || PP_RETIRED(pp) ||
	    (se == SE_SHARED && PP_PR_NOSHARE(pp))) {
		/*
		 * Fail if a pending writer wants exclusive access
		 * (SE_EWANTED), if the page is retired, or if a shared
		 * lock is requested on a page slated for retirement.
		 */
		mutex_exit(pse);
		VM_STAT_ADD(page_trylock_failed);
		return (0);
	}

	if (se == SE_EXCL) {
		if (pp->p_selock == 0) {
			THREAD_KPRI_REQUEST();
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			return (1);
		}
	} else {
		if (pp->p_selock >= 0) {
			pp->p_selock += SE_READER;
			mutex_exit(pse);
			return (1);
		}
	}
	mutex_exit(pse);
	return (0);
}
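/*
 * Illustrative sketch, not part of the original file: page_trylock()
 * suits contexts that must not block, falling back when the page is
 * busy:
 *
 *	if (!page_trylock(pp, SE_EXCL)) {
 *		... skip this page and move on; never cv_wait here ...
 *	} else {
 *		... pp is held SE_EXCL ...
 *		page_unlock(pp);
 *	}
 */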
/*
 * Variant of page_unlock() specifically for the page freelist
 * code.  The mere existence of this code is a vile hack that
 * exists because of the backwards locking order of the page
 * freelist manager; please don't call it.
 */
void
page_unlock_nocapture(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);

	old = pp->p_selock;
	if ((old & ~SE_EWANTED) == SE_READER) {
		pp->p_selock = old & ~SE_READER;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
		panic("page_unlock_nocapture: page %p is deleted", pp);
	} else if (old < 0) {
		THREAD_KPRI_RELEASE();
		pp->p_selock &= SE_EWANTED;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) > SE_READER) {
		pp->p_selock = old - SE_READER;
	} else {
		panic("page_unlock_nocapture: page %p is not locked", pp);
	}

	mutex_exit(pse);
}

/*
 * Release the page's "shared/exclusive" lock and wake up anyone
 * who might be waiting for it.
 */
void
page_unlock(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);

	old = pp->p_selock;
	if ((old & ~SE_EWANTED) == SE_READER) {
		pp->p_selock = old & ~SE_READER;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
		panic("page_unlock: page %p is deleted", pp);
	} else if (old < 0) {
		THREAD_KPRI_RELEASE();
		pp->p_selock &= SE_EWANTED;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) > SE_READER) {
		pp->p_selock = old - SE_READER;
	} else {
		panic("page_unlock: page %p is not locked", pp);
	}

	if (pp->p_selock == 0) {
		/*
		 * If the T_CAPTURING bit is set, that means that we should
		 * not try to capture the page again as we could recurse
		 * which could lead to a stack overflow panic or spending a
		 * relatively long time in the kernel making no progress.
		 */
		if ((pp->p_toxic & PR_CAPTURE) &&
		    !(curthread->t_flag & T_CAPTURING) &&
		    !PP_RETIRED(pp)) {
			THREAD_KPRI_REQUEST();
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			page_unlock_capture(pp);
		} else {
			mutex_exit(pse);
		}
	} else {
		mutex_exit(pse);
	}
}

/*
 * Try to upgrade the lock on the page from a "shared" to an
 * "exclusive" lock.  Since this upgrade operation is done while
 * holding the mutex protecting this page, no one else can acquire this page's
 * lock and change the page.  Thus, it is safe to drop the "shared"
 * lock and attempt to acquire the "exclusive" lock.
 *
 * Returns 1 on success, 0 on failure.
 */
int
page_tryupgrade(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	mutex_enter(pse);
	if (!(pp->p_selock & SE_EWANTED)) {
		/* no threads want exclusive access, try upgrade */
		if (pp->p_selock == SE_READER) {
			THREAD_KPRI_REQUEST();
			/* convert to exclusive lock */
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			return (1);
		}
	}
	mutex_exit(pse);
	return (0);
}
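/*
 * Illustrative sketch, not part of the original file: upgrading an
 * existing shared hold.  The upgrade succeeds only when the caller is
 * the sole reader and no writer is waiting, so the fallback is to drop
 * the lock and take SE_EXCL from scratch:
 *
 *	if (!page_tryupgrade(pp)) {
 *		page_unlock(pp);
 *		if (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_NO_RECLAIM))
 *			return (0);
 *		... revalidate pp's identity; it may have changed ...
 *	}
 */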
/*
 * Downgrade the "exclusive" lock on the page to a "shared" lock
 * while holding the mutex protecting this page's p_selock field.
 */
void
page_downgrade(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	int excl_waiting;

	ASSERT((pp->p_selock & ~SE_EWANTED) != SE_DELETED);
	ASSERT(PAGE_EXCL(pp));

	mutex_enter(pse);
	excl_waiting = pp->p_selock & SE_EWANTED;
	THREAD_KPRI_RELEASE();
	pp->p_selock = SE_READER | excl_waiting;
	if (CV_HAS_WAITERS(&pp->p_cv))
		cv_broadcast(&pp->p_cv);
	mutex_exit(pse);
}

void
page_lock_delete(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	ASSERT(PAGE_EXCL(pp));
	ASSERT(pp->p_vnode == NULL);
	ASSERT(pp->p_offset == (u_offset_t)-1);
	ASSERT(!PP_ISFREE(pp));

	mutex_enter(pse);
	THREAD_KPRI_RELEASE();
	pp->p_selock = SE_DELETED;
	if (CV_HAS_WAITERS(&pp->p_cv))
		cv_broadcast(&pp->p_cv);
	mutex_exit(pse);
}

int
page_deleted(page_t *pp)
{
	return (pp->p_selock == SE_DELETED);
}

/*
 * Implement the io lock for pages
 */
void
page_iolock_init(page_t *pp)
{
	pp->p_iolock_state = 0;
	cv_init(&pp->p_io_cv, NULL, CV_DEFAULT, NULL);
}

/*
 * Acquire the i/o lock on a page.
 */
void
page_io_lock(page_t *pp)
{
	kmutex_t *pio;

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);
	while (pp->p_iolock_state & PAGE_IO_INUSE) {
		cv_wait(&(pp->p_io_cv), pio);
	}
	pp->p_iolock_state |= PAGE_IO_INUSE;
	mutex_exit(pio);
}

/*
 * Release the i/o lock on a page.
 */
void
page_io_unlock(page_t *pp)
{
	kmutex_t *pio;

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);
	cv_broadcast(&pp->p_io_cv);
	pp->p_iolock_state &= ~PAGE_IO_INUSE;
	mutex_exit(pio);
}

/*
 * Try to acquire the i/o lock on a page without blocking.
 * Returns 1 on success, 0 on failure.
 */
int
page_io_trylock(page_t *pp)
{
	kmutex_t *pio;

	if (pp->p_iolock_state & PAGE_IO_INUSE)
		return (0);

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);

	if (pp->p_iolock_state & PAGE_IO_INUSE) {
		mutex_exit(pio);
		return (0);
	}
	pp->p_iolock_state |= PAGE_IO_INUSE;
	mutex_exit(pio);

	return (1);
}

/*
 * Wait until the i/o lock is not held.
 */
void
page_io_wait(page_t *pp)
{
	kmutex_t *pio;

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);
	while (pp->p_iolock_state & PAGE_IO_INUSE) {
		cv_wait(&(pp->p_io_cv), pio);
	}
	mutex_exit(pio);
}

/*
 * Returns non-zero if the i/o lock is held, 0 otherwise.
 */
int
page_io_locked(page_t *pp)
{
	return (pp->p_iolock_state & PAGE_IO_INUSE);
}

/*
 * Used to assert that the i/o lock on a page is held.
 * Returns non-zero if it is, 0 otherwise.
 */
int
page_iolock_assert(page_t *pp)
{
	return (page_io_locked(pp));
}
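/*
 * Illustrative sketch, not part of the original file: the i/o lock
 * serializes physical i/o to a page independently of p_selock, so a
 * pageout-style caller brackets the transfer with it:
 *
 *	page_io_lock(pp);
 *	... issue and wait for the i/o on pp ...
 *	page_io_unlock(pp);
 *
 * Callers that cannot block use page_io_trylock(pp) and skip the page
 * when it returns 0.
 */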
/*
 * Wrapper exported to kernel routines that are built
 * platform-independent (the macro is platform-dependent;
 * the size of vph_mutex[] is based on NCPU).
 *
 * Note that you can do stress testing on this by setting the
 * variable page_vnode_mutex_stress to something other than
 * zero in a DEBUG kernel in a debugger after loading the kernel.
 * Setting it after the kernel is running may not work correctly.
 */
#ifdef DEBUG
static int page_vnode_mutex_stress = 0;
#endif

kmutex_t *
page_vnode_mutex(vnode_t *vp)
{
	if (vp == &kvp)
		return (&vph_mutex[VPH_TABLE_SIZE + 0]);

	if (vp == &zvp)
		return (&vph_mutex[VPH_TABLE_SIZE + 1]);
#ifdef DEBUG
	if (page_vnode_mutex_stress != 0)
		return (&vph_mutex[0]);
#endif

	return (&vph_mutex[VP_HASH_FUNC(vp)]);
}

kmutex_t *
page_se_mutex(page_t *pp)
{
	return (PAGE_SE_MUTEX(pp));
}

#ifdef VM_STATS
uint_t pszclck_stat[4];
#endif
/*
 * Find, take and return a mutex held by hat_page_demote().
 * Called by page_demote_vp_pages() before hat_page_demote() call and by
 * routines that want to block hat_page_demote() but can't do it
 * via locking all constituent pages.
 *
 * Return NULL if p_szc is 0.
 *
 * It should only be used for pages that can be demoted by hat_page_demote()
 * i.e. non swapfs file system pages.  The logic here is lifted from
 * sfmmu_mlspl_enter() except there's no need to worry about p_szc increase
 * since the page is locked and not free.
 *
 * Hash of the root page is used to find the lock.
 * To find the root in the presence of hat_page_demote() changing the location
 * of the root, this routine relies on the fact that hat_page_demote() changes
 * the root last.
 *
 * If NULL is returned pp's p_szc is guaranteed to be 0.  If non-NULL is
 * returned pp's p_szc may be any value.
 */
kmutex_t *
page_szc_lock(page_t *pp)
{
	kmutex_t	*mtx;
	page_t		*rootpp;
	uint_t		szc;
	uint_t		rszc;
	uint_t		pszc = pp->p_szc;

	ASSERT(pp != NULL);
	ASSERT(PAGE_LOCKED(pp));
	ASSERT(!PP_ISFREE(pp));
	ASSERT(pp->p_vnode != NULL);
	ASSERT(!IS_SWAPFSVP(pp->p_vnode));
	ASSERT(!PP_ISKAS(pp));

again:
	if (pszc == 0) {
		VM_STAT_ADD(pszclck_stat[0]);
		return (NULL);
	}

	/* The lock lives in the root page */

	rootpp = PP_GROUPLEADER(pp, pszc);
	mtx = PAGE_SZC_MUTEX(rootpp);
	mutex_enter(mtx);

	/*
	 * Since p_szc can only decrease if pp == rootpp,
	 * rootpp will always be the same, i.e. we have the right root
	 * regardless of rootpp->p_szc.
	 * If the location of pp's root didn't change after we took
	 * the lock, we have the right root.  Return the mutex hashed off it.
	 */
	if (pp == rootpp || (rszc = rootpp->p_szc) == pszc) {
		VM_STAT_ADD(pszclck_stat[1]);
		return (mtx);
	}

	/*
	 * The root location changed because the page got demoted.
	 * Locate the new root.
	 */
	if (rszc < pszc) {
		szc = pp->p_szc;
		ASSERT(szc < pszc);
		mutex_exit(mtx);
		pszc = szc;
		VM_STAT_ADD(pszclck_stat[2]);
		goto again;
	}

	VM_STAT_ADD(pszclck_stat[3]);
	/*
	 * The current hat_page_demote is not done yet.
	 * Wait for it to finish.
	 */
	mutex_exit(mtx);
	rootpp = PP_GROUPLEADER(rootpp, rszc);
	mtx = PAGE_SZC_MUTEX(rootpp);
	mutex_enter(mtx);
	mutex_exit(mtx);
	ASSERT(rootpp->p_szc < rszc);
	goto again;
}

int
page_szc_lock_assert(page_t *pp)
{
	page_t *rootpp = PP_PAGEROOT(pp);
	kmutex_t *mtx = PAGE_SZC_MUTEX(rootpp);

	return (MUTEX_HELD(mtx));
}
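/*
 * Illustrative sketch, not part of the original file: a caller that must
 * keep hat_page_demote() from changing p_szc while it works on a locked,
 * non-free page:
 *
 *	kmutex_t *szcmtx = page_szc_lock(pp);
 *
 *	... pp->p_szc is now stable (szcmtx == NULL means p_szc == 0) ...
 *	if (szcmtx != NULL)
 *		mutex_exit(szcmtx);
 */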
/*
 * memseg locking
 */
static krwlock_t memsegslock;

/*
 * memlist (phys_install, phys_avail) locking.
 */
static krwlock_t memlists_lock;

void
memsegs_lock(int writer)
{
	rw_enter(&memsegslock, writer ? RW_WRITER : RW_READER);
}

/*ARGSUSED*/
void
memsegs_unlock(int writer)
{
	rw_exit(&memsegslock);
}

int
memsegs_lock_held(void)
{
	return (RW_LOCK_HELD(&memsegslock));
}

void
memlist_read_lock(void)
{
	rw_enter(&memlists_lock, RW_READER);
}

void
memlist_read_unlock(void)
{
	rw_exit(&memlists_lock);
}

void
memlist_write_lock(void)
{
	rw_enter(&memlists_lock, RW_WRITER);
}

void
memlist_write_unlock(void)
{
	rw_exit(&memlists_lock);
}
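/*
 * Illustrative sketch, not part of the original file (and assuming the
 * memlist layout of this era, with address/size/next fields): walking
 * the phys_install memlist under the reader lock exported above:
 *
 *	struct memlist *ml;
 *
 *	memlist_read_lock();
 *	for (ml = phys_install; ml != NULL; ml = ml->next)
 *		... inspect ml->address and ml->size ...
 *	memlist_read_unlock();
 */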