/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * VM - page locking primitives
 */
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/vtrace.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/vnode.h>
#include <sys/bitmap.h>
#include <sys/lockstat.h>
#include <sys/condvar_impl.h>
#include <vm/page.h>
#include <vm/seg_enum.h>
#include <vm/vm_dep.h>

/*
 * This global mutex is for logical page locking.
 * The following fields in the page structure are protected
 * by this lock:
 *
 *	p_lckcnt
 *	p_cowcnt
 */
kmutex_t page_llock;

/*
 * This is a global lock for the logical page free list.  The
 * logical free list, in this implementation, is maintained as two
 * separate physical lists - the cache list and the free list.
 */
kmutex_t page_freelock;

/*
 * The hash table, page_hash[], the p_selock fields, and the
 * list of pages associated with vnodes are protected by arrays of mutexes.
 *
 * Unless the hashes are changed radically, the table sizes must be
 * a power of two.  Also, we typically need more mutexes for the
 * vnodes since these locks are occasionally held for long periods.
 * And since there seem to be two special vnodes (kvp and swapvp),
 * we make room for private mutexes for them.
 *
 * The pse_mutex[] array holds the mutexes to protect the p_selock
 * fields of all page_t structures.
 *
 * PAGE_SE_MUTEX(pp) returns the address of the appropriate mutex
 * when given a pointer to a page_t.
 *
 * PSE_TABLE_SIZE must be a power of two.  One could argue that we
 * should go to the trouble of setting it up at run time and base it
 * on memory size rather than the number of compile time CPUs.
 *
 * XX64	We should be using physmem size to calculate PSE_TABLE_SIZE,
 *	PSE_SHIFT, PIO_SHIFT.
 *
 *	These might break in 64 bit world.
 */
#define	PSE_SHIFT	7		/* log2(PSE_TABLE_SIZE) */

#define	PSE_TABLE_SIZE	128		/* number of mutexes to have */

#define	PIO_SHIFT	PSE_SHIFT	/* next power of 2 bigger than page_t */
#define	PIO_TABLE_SIZE	PSE_TABLE_SIZE	/* number of io mutexes to have */

pad_mutex_t	ph_mutex[PH_TABLE_SIZE];
pad_mutex_t	pse_mutex[PSE_TABLE_SIZE];
kmutex_t	pio_mutex[PIO_TABLE_SIZE];

#define	PAGE_SE_MUTEX(pp) \
	    &pse_mutex[((((uintptr_t)(pp) >> PSE_SHIFT) ^ \
		((uintptr_t)(pp) >> (PSE_SHIFT << 1))) & \
		(PSE_TABLE_SIZE - 1))].pad_mutex

#define	PAGE_IO_MUTEX(pp) \
	    &pio_mutex[(((uintptr_t)pp) >> PIO_SHIFT) & (PIO_TABLE_SIZE - 1)]

#define	PSZC_MTX_TABLE_SIZE	128
#define	PSZC_MTX_TABLE_SHIFT	7

static pad_mutex_t	pszc_mutex[PSZC_MTX_TABLE_SIZE];

#define	PAGE_SZC_MUTEX(_pp) \
	    &pszc_mutex[((((uintptr_t)(_pp) >> PSZC_MTX_TABLE_SHIFT) ^ \
		((uintptr_t)(_pp) >> (PSZC_MTX_TABLE_SHIFT << 1)) ^ \
		((uintptr_t)(_pp) >> (3 * PSZC_MTX_TABLE_SHIFT))) & \
		(PSZC_MTX_TABLE_SIZE - 1))].pad_mutex

/*
 * The vph_mutex[] array holds the mutexes to protect the vnode chains,
 * (i.e., the list of pages anchored by v_pages and connected via p_vpprev
 * and p_vpnext).
 *
 * The page_vnode_mutex(vp) function returns the address of the appropriate
 * mutex from this array given a pointer to a vnode.  It is complicated
 * by the fact that the kernel's vnode and the swapfs vnode are referenced
 * frequently enough to warrant their own mutexes.
 *
 * The VP_HASH_FUNC returns the index into the vph_mutex array given
 * an address of a vnode.
 */

/*
 * XX64	VPH_TABLE_SIZE and VP_HASH_FUNC might break in 64 bit world.
 *	Need to review again.
 */
#define	VPH_TABLE_SIZE	(2 << VP_SHIFT)

#define	VP_HASH_FUNC(vp) \
	((((uintptr_t)(vp) >> 6) + \
	    ((uintptr_t)(vp) >> 8) + \
	    ((uintptr_t)(vp) >> 10) + \
	    ((uintptr_t)(vp) >> 12)) \
	    & (VPH_TABLE_SIZE - 1))

extern	struct vnode kvp;

kmutex_t	vph_mutex[VPH_TABLE_SIZE + 2];

/*
 * Initialize the locks used by the Virtual Memory Management system.
 */
void
page_lock_init()
{
}

/*
 * At present we only use page ownership to aid debugging, so it's
 * OK if the owner field isn't exact.  In the 32-bit world two thread ids
 * can map to the same owner because we just 'or' in 0x80000000 and
 * then clear the second highest bit, so that (for example) 0x2faced00
 * and 0xafaced00 both map to 0xafaced00.
 * In the 64-bit world, p_selock may not be large enough to hold a full
 * thread pointer.  If we ever need precise ownership (e.g. if we implement
 * priority inheritance for page locks) then p_selock should become a
 * uintptr_t and SE_WRITER should be -((uintptr_t)curthread >> 2).
 */
#define	SE_WRITER	(((selock_t)(ulong_t)curthread | INT_MIN) & ~SE_EWANTED)
#define	SE_READER	1

/*
 * A page that is deleted must be marked as such using the
 * page_lock_delete() function.  The page must be exclusively locked.
 * The SE_DELETED marker is put in p_selock when this function is called.
 * SE_DELETED must be distinct from any SE_WRITER value.
 */
#define	SE_DELETED	(1 | INT_MIN)

#ifdef VM_STATS
uint_t	vph_kvp_count;
uint_t	vph_swapfsvp_count;
uint_t	vph_other;
#endif /* VM_STATS */

#ifdef VM_STATS
uint_t	page_lock_count;
uint_t	page_lock_miss;
uint_t	page_lock_miss_lock;
uint_t	page_lock_reclaim;
uint_t	page_lock_bad_reclaim;
uint_t	page_lock_same_page;
uint_t	page_lock_upgrade;
uint_t	page_lock_retired;
uint_t	page_lock_upgrade_failed;
uint_t	page_lock_deleted;

uint_t	page_trylock_locked;
uint_t	page_trylock_failed;
uint_t	page_trylock_missed;

uint_t	page_try_reclaim_upgrade;
#endif /* VM_STATS */

/*
 * Acquire the "shared/exclusive" lock on a page.
 *
 * Returns 1 on success and locks the page appropriately.
 *	   0 on failure and does not lock the page.
 *
 * If `lock' is non-NULL, it will be dropped and reacquired in the
 * failure case.  This routine can block, and if it does
 * it will always return a failure since the page identity [vp, off]
 * or state may have changed.
 */

int
page_lock(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim)
{
	return (page_lock_es(pp, se, lock, reclaim, 0));
}

/*
 * With the addition of reader-writer lock semantics to page_lock_es,
 * callers wanting an exclusive (writer) lock may prevent shared-lock
 * (reader) starvation by setting the es parameter to SE_EXCL_WANTED.
 * In this case, when an exclusive lock cannot be acquired, p_selock's
 * SE_EWANTED bit is set.  Shared-lock (reader) requests are also denied
 * if the page is slated for retirement.
 *
 * The se and es parameters determine if the lock should be granted
 * based on the following decision table:
 *
 * Lock wanted   es flags       p_selock/SE_EWANTED  Action
 * -----------   -------------- -------------------  ---------
 * SE_EXCL       any [1][2]     unlocked/any         grant lock, clear SE_EWANTED
 * SE_EXCL       SE_EWANTED     any lock/any         deny, set SE_EWANTED
 * SE_EXCL       none           any lock/any         deny
 * SE_SHARED     n/a [2]        shared/0             grant
 * SE_SHARED     n/a [2]        unlocked/0           grant
 * SE_SHARED     n/a            shared/1             deny
 * SE_SHARED     n/a            unlocked/1           deny
 * SE_SHARED     n/a            excl/any             deny
 *
 * Notes:
 * [1] The code grants an exclusive lock to the caller and clears the bit
 *   SE_EWANTED whenever p_selock is unlocked, regardless of the SE_EWANTED
 *   bit's value.  This was deemed acceptable as we are not concerned about
 *   exclusive-lock starvation.  If this ever becomes an issue, a priority or
 *   fifo mechanism should also be implemented.  Meantime, the thread that
 *   set SE_EWANTED should be prepared to catch this condition and reset it.
 *
 * [2] Retired pages may not be locked at any time, regardless of the
 *   disposition of se, unless the es parameter has SE_RETIRED flag set.
 *
 * Notes on values of "es":
 *
 * es & 1: page_lookup_create will attempt page relocation
 * es & SE_EXCL_WANTED: caller wants SE_EWANTED set (eg. delete
 * memory thread); this prevents reader-starvation of waiting
 * writer thread(s) by giving priority to writers over readers.
 * es & SE_RETIRED: caller wants to lock pages even if they are
 * retired.  Default is to deny the lock if the page is retired.
 *
 * And yes, we know, the semantics of this function are too complicated.
 * It's on the list to be cleaned up.
 */
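
/*
 * Illustrative sketch of the calling convention described above; this is
 * not part of the implementation, and the lookup step is hypothetical
 * pseudo-code.  A typical caller looks the page up under a hash mutex and
 * passes that mutex as `lock' so page_lock() can drop it while blocking;
 * a failure return means the lookup must be redone, because the page
 * identity [vp, off] may have changed while we slept:
 *
 *	page_t *pp;
 *
 *	for (;;) {
 *		mutex_enter(phm);
 *		pp = <find page [vp, off], e.g. in page_hash>;	(assumed)
 *		if (pp == NULL) {
 *			mutex_exit(phm);
 *			break;			no such page
 *		}
 *		if (page_lock(pp, SE_SHARED, phm, P_RECLAIM)) {
 *			mutex_exit(phm);
 *			break;			pp now held SE_SHARED
 *		}
 *		mutex_exit(phm);
 *						we blocked; the identity may
 *						have changed, so look it up
 *						again
 *	}
 */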
int
page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
{
	int		retval;
	kmutex_t	*pse = PAGE_SE_MUTEX(pp);
	int		upgraded;
	int		reclaim_it;

	ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);

	VM_STAT_ADD(page_lock_count);

	upgraded = 0;
	reclaim_it = 0;

	mutex_enter(pse);

	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));

	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
		mutex_exit(pse);
		VM_STAT_ADD(page_lock_retired);
		return (0);
	}

	if (se == SE_SHARED && es == 1 && pp->p_selock == 0) {
		se = SE_EXCL;
	}

	if ((reclaim == P_RECLAIM) && (PP_ISFREE(pp))) {

		reclaim_it = 1;
		if (se == SE_SHARED) {
			/*
			 * This is an interesting situation.
			 *
			 * Remember that p_free can only change if
			 * p_selock < 0.
			 * p_free does not depend on our holding `pse'.
			 * And, since we hold `pse', p_selock can not change.
			 * So, if p_free changes on us, the page is already
			 * exclusively held, and we would fail to get p_selock
			 * regardless.
			 *
			 * We want to avoid getting the share
			 * lock on a free page that needs to be reclaimed.
			 * It is possible that some other thread has the share
			 * lock and has left the free page on the cache list.
			 * pvn_vplist_dirty() does this for brief periods.
			 * If the se_share is currently SE_EXCL, we will fail
			 * to acquire p_selock anyway.  Blocking is the
			 * right thing to do.
			 * If we need to reclaim this page, we must get
			 * exclusive access to it, force the upgrade now.
			 * Again, we will fail to acquire p_selock if the
			 * page is not free and block.
			 */
			upgraded = 1;
			se = SE_EXCL;
			VM_STAT_ADD(page_lock_upgrade);
		}
	}

	if (se == SE_EXCL) {
		if (!(es & SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) {
			/*
			 * if the caller wants a writer lock (but did not
			 * specify exclusive access), and there is a pending
			 * writer that wants exclusive access, return failure
			 */
			retval = 0;
		} else if ((pp->p_selock & ~SE_EWANTED) == 0) {
			/* no reader/writer lock held */
			THREAD_KPRI_REQUEST();
			/* this clears our setting of the SE_EWANTED bit */
			pp->p_selock = SE_WRITER;
			retval = 1;
		} else {
			/* page is locked */
			if (es & SE_EXCL_WANTED) {
				/* set the SE_EWANTED bit */
				pp->p_selock |= SE_EWANTED;
			}
			retval = 0;
		}
	} else {
		retval = 0;
		if (pp->p_selock >= 0) {
			if ((pp->p_selock & SE_EWANTED) == 0) {
				pp->p_selock += SE_READER;
				retval = 1;
			}
		}
	}

	if (retval == 0) {
		if ((pp->p_selock & ~SE_EWANTED) == SE_DELETED) {
			VM_STAT_ADD(page_lock_deleted);
			mutex_exit(pse);
			return (retval);
		}

#ifdef VM_STATS
		VM_STAT_ADD(page_lock_miss);
		if (upgraded) {
			VM_STAT_ADD(page_lock_upgrade_failed);
		}
#endif
		if (lock) {
			VM_STAT_ADD(page_lock_miss_lock);
			mutex_exit(lock);
		}

		/*
		 * Now, wait for the page to be unlocked and
		 * release the lock protecting p_cv and p_selock.
		 */
		cv_wait(&pp->p_cv, pse);
		mutex_exit(pse);

		/*
		 * The page identity may have changed while we were
		 * blocked.  If we are willing to depend on "pp"
		 * still pointing to a valid page structure (i.e.,
		 * assuming page structures are not dynamically allocated
		 * or freed), we could try to lock the page if its
		 * identity hasn't changed.
		 *
		 * This needs to be measured, since we come back from
		 * cv_wait holding pse (the expensive part of this
		 * operation) we might as well try the cheap part.
		 * Though we would also have to confirm that dropping
		 * `lock' did not cause any grief to the callers.
		 */
		if (lock) {
			mutex_enter(lock);
		}
	} else {
		/*
		 * We have the page lock.
		 * If we needed to reclaim the page, and the page
		 * needed reclaiming (i.e., it was free), then we
		 * have the page exclusively locked.  We may need
		 * to downgrade the page.
		 */
		ASSERT((upgraded) ?
		    ((PP_ISFREE(pp)) && PAGE_EXCL(pp)) : 1);
		mutex_exit(pse);

		/*
		 * We now hold this page's lock, either shared or
		 * exclusive.  This will prevent its identity from changing.
		 * The page, however, may or may not be free.  If the caller
		 * requested, and it is free, go reclaim it from the
		 * free list.  If the page can't be reclaimed, return failure
		 * so that the caller can start all over again.
		 *
		 * NOTE: page_reclaim() releases the page lock (p_selock)
		 *	if it can't be reclaimed.
		 */
		if (reclaim_it) {
			if (!page_reclaim(pp, lock)) {
				VM_STAT_ADD(page_lock_bad_reclaim);
				retval = 0;
			} else {
				VM_STAT_ADD(page_lock_reclaim);
				if (upgraded) {
					page_downgrade(pp);
				}
			}
		}
	}
	return (retval);
}

/*
 * Clear the SE_EWANTED bit from p_selock.  This function allows
 * callers of page_lock_es and page_try_reclaim_lock to clear
 * their setting of this bit if they decide they no longer wish
 * to gain exclusive access to the page.  Currently only
 * delete_memory_thread uses this when the delete memory
 * operation is cancelled.
 */
void
page_lock_clr_exclwanted(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	mutex_enter(pse);
	pp->p_selock &= ~SE_EWANTED;
	if (CV_HAS_WAITERS(&pp->p_cv))
		cv_broadcast(&pp->p_cv);
	mutex_exit(pse);
}

/*
 * Read the comments inside of page_lock_es() carefully.
 *
 * SE_EXCL callers specifying es == SE_EXCL_WANTED will cause the
 * SE_EWANTED bit of p_selock to be set when the lock cannot be obtained.
 * This is used by threads subject to reader-starvation (eg. memory delete).
 *
 * When a thread using SE_EXCL_WANTED does not obtain the SE_EXCL lock,
 * it is expected that it will retry at a later time.  Threads that will
 * not retry the lock *must* call page_lock_clr_exclwanted to clear the
 * SE_EWANTED bit.  (When a thread using SE_EXCL_WANTED obtains the lock,
 * the bit is cleared.)
 */
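
/*
 * Illustrative sketch of that protocol; this is not part of the
 * implementation, and the retry/cancel conditions below are hypothetical:
 *
 *	if (page_try_reclaim_lock(pp, SE_EXCL, SE_EXCL_WANTED)) {
 *		... pp is held SE_EXCL; do the work ...
 *		page_unlock(pp);
 *	} else if (<operation cancelled>) {
 *		page_lock_clr_exclwanted(pp);	drop the SE_EWANTED we set
 *	} else {
 *		... retry later; SE_EWANTED stays set, so new readers are
 *		turned away until we get the exclusive lock ...
 *	}
 */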
int
page_try_reclaim_lock(page_t *pp, se_t se, int es)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);

	old = pp->p_selock;

	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));

	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
		mutex_exit(pse);
		VM_STAT_ADD(page_trylock_failed);
		return (0);
	}

	if (se == SE_SHARED && es == 1 && old == 0) {
		se = SE_EXCL;
	}

	if (se == SE_SHARED) {
		if (!PP_ISFREE(pp)) {
			if (old >= 0) {
				/*
				 * Readers are not allowed when excl wanted
				 */
				if ((old & SE_EWANTED) == 0) {
					pp->p_selock = old + SE_READER;
					mutex_exit(pse);
					return (1);
				}
			}
			mutex_exit(pse);
			return (0);
		}
		/*
		 * The page is free, so we really want SE_EXCL (below)
		 */
		VM_STAT_ADD(page_try_reclaim_upgrade);
	}

	/*
	 * The caller wants a writer lock.  We try for it only if
	 * SE_EWANTED is not set, or if the caller specified
	 * SE_EXCL_WANTED.
	 */
	if (!(old & SE_EWANTED) || (es & SE_EXCL_WANTED)) {
		if ((old & ~SE_EWANTED) == 0) {
			/* no reader/writer lock held */
			THREAD_KPRI_REQUEST();
			/* this clears out our setting of the SE_EWANTED bit */
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			return (1);
		}
	}
	if (es & SE_EXCL_WANTED) {
		/* page is locked, set the SE_EWANTED bit */
		pp->p_selock |= SE_EWANTED;
	}
	mutex_exit(pse);
	return (0);
}

/*
 * Acquire a page's "shared/exclusive" lock, but never block.
 * Returns 1 on success, 0 on failure.
 */
int
page_trylock(page_t *pp, se_t se)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	mutex_enter(pse);
	if (pp->p_selock & SE_EWANTED || PP_RETIRED(pp) ||
	    (se == SE_SHARED && PP_PR_NOSHARE(pp))) {
		/*
		 * Fail if another thread wants exclusive access, if the
		 * page is retired, or if a share lock is requested on a
		 * page that is slated for retirement.
		 */
		mutex_exit(pse);
		VM_STAT_ADD(page_trylock_failed);
		return (0);
	}

	if (se == SE_EXCL) {
		if (pp->p_selock == 0) {
			THREAD_KPRI_REQUEST();
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			return (1);
		}
	} else {
		if (pp->p_selock >= 0) {
			pp->p_selock += SE_READER;
			mutex_exit(pse);
			return (1);
		}
	}
	mutex_exit(pse);
	return (0);
}

/*
 * Variant of page_unlock() specifically for the page freelist
 * code.  The mere existence of this code is a vile hack that
 * has resulted from the backwards locking order of the page
 * freelist manager; please don't call it.
 */
void
page_unlock_nocapture(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);

	old = pp->p_selock;
	if ((old & ~SE_EWANTED) == SE_READER) {
		pp->p_selock = old & ~SE_READER;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
		panic("page_unlock_nocapture: page %p is deleted", pp);
	} else if (old < 0) {
		THREAD_KPRI_RELEASE();
		pp->p_selock &= SE_EWANTED;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) > SE_READER) {
		pp->p_selock = old - SE_READER;
	} else {
		panic("page_unlock_nocapture: page %p is not locked", pp);
	}

	mutex_exit(pse);
}

/*
 * Release the page's "shared/exclusive" lock and wake up anyone
 * who might be waiting for it.
 */
void
page_unlock(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);

	old = pp->p_selock;
	if ((old & ~SE_EWANTED) == SE_READER) {
		pp->p_selock = old & ~SE_READER;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
		panic("page_unlock: page %p is deleted", pp);
	} else if (old < 0) {
		THREAD_KPRI_RELEASE();
		pp->p_selock &= SE_EWANTED;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) > SE_READER) {
		pp->p_selock = old - SE_READER;
	} else {
		panic("page_unlock: page %p is not locked", pp);
	}

	if (pp->p_selock == 0) {
		/*
		 * If the T_CAPTURING bit is set, that means that we should
		 * not try and capture the page again as we could recurse
		 * which could lead to a stack overflow panic or spending a
		 * relatively long time in the kernel making no progress.
		 */
		if ((pp->p_toxic & PR_CAPTURE) &&
		    !(curthread->t_flag & T_CAPTURING) &&
		    !PP_RETIRED(pp)) {
			THREAD_KPRI_REQUEST();
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			page_unlock_capture(pp);
		} else {
			mutex_exit(pse);
		}
	} else {
		mutex_exit(pse);
	}
}

/*
 * Try to upgrade the lock on the page from a "shared" to an
 * "exclusive" lock.  Since this upgrade operation is done while
 * holding the mutex protecting this page, no one else can acquire this page's
 * lock and change the page.  Thus, it is safe to drop the "shared"
 * lock and attempt to acquire the "exclusive" lock.
 *
 * Returns 1 on success, 0 on failure.
 */
int
page_tryupgrade(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	mutex_enter(pse);
	if (!(pp->p_selock & SE_EWANTED)) {
		/* no threads want exclusive access, try upgrade */
		if (pp->p_selock == SE_READER) {
			THREAD_KPRI_REQUEST();
			/* convert to exclusive lock */
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			return (1);
		}
	}
	mutex_exit(pse);
	return (0);
}
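
/*
 * Illustrative sketch of the usual upgrade-with-fallback pattern built on
 * page_tryupgrade(); this is not part of the implementation, and the
 * revalidation step is pseudo-code.  If the non-blocking upgrade fails,
 * the caller drops its shared lock and blocks for the exclusive lock, and
 * must then recheck the page since its identity may have changed while it
 * was unlocked:
 *
 *	ASSERT(PAGE_SHARED(pp));
 *	if (!page_tryupgrade(pp)) {
 *		page_unlock(pp);
 *		if (!page_lock(pp, SE_EXCL, (kmutex_t *)NULL, P_RECLAIM)) {
 *			... start over ...
 *		}
 *		... revalidate pp's identity here; it may have changed ...
 *	}
 *	ASSERT(PAGE_EXCL(pp));
 */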

/*
 * Downgrade the "exclusive" lock on the page to a "shared" lock
 * while holding the mutex protecting this page's p_selock field.
 */
void
page_downgrade(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	int excl_waiting;

	ASSERT((pp->p_selock & ~SE_EWANTED) != SE_DELETED);
	ASSERT(PAGE_EXCL(pp));

	mutex_enter(pse);
	excl_waiting = pp->p_selock & SE_EWANTED;
	THREAD_KPRI_RELEASE();
	pp->p_selock = SE_READER | excl_waiting;
	if (CV_HAS_WAITERS(&pp->p_cv))
		cv_broadcast(&pp->p_cv);
	mutex_exit(pse);
}

void
page_lock_delete(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	ASSERT(PAGE_EXCL(pp));
	ASSERT(pp->p_vnode == NULL);
	ASSERT(pp->p_offset == (u_offset_t)-1);
	ASSERT(!PP_ISFREE(pp));

	mutex_enter(pse);
	THREAD_KPRI_RELEASE();
	pp->p_selock = SE_DELETED;
	if (CV_HAS_WAITERS(&pp->p_cv))
		cv_broadcast(&pp->p_cv);
	mutex_exit(pse);
}

int
page_deleted(page_t *pp)
{
	return (pp->p_selock == SE_DELETED);
}

/*
 * Implement the io lock for pages
 */
void
page_iolock_init(page_t *pp)
{
	pp->p_iolock_state = 0;
	cv_init(&pp->p_io_cv, NULL, CV_DEFAULT, NULL);
}

/*
 * Acquire the i/o lock on a page.
 */
void
page_io_lock(page_t *pp)
{
	kmutex_t *pio;

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);
	while (pp->p_iolock_state & PAGE_IO_INUSE) {
		cv_wait(&(pp->p_io_cv), pio);
	}
	pp->p_iolock_state |= PAGE_IO_INUSE;
	mutex_exit(pio);
}

/*
 * Release the i/o lock on a page.
 */
void
page_io_unlock(page_t *pp)
{
	kmutex_t *pio;

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);
	cv_broadcast(&pp->p_io_cv);
	pp->p_iolock_state &= ~PAGE_IO_INUSE;
	mutex_exit(pio);
}

/*
 * Try to acquire the i/o lock on a page without blocking.
 * Returns 1 on success, 0 on failure.
 */
int
page_io_trylock(page_t *pp)
{
	kmutex_t *pio;

	if (pp->p_iolock_state & PAGE_IO_INUSE)
		return (0);

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);

	if (pp->p_iolock_state & PAGE_IO_INUSE) {
		mutex_exit(pio);
		return (0);
	}
	pp->p_iolock_state |= PAGE_IO_INUSE;
	mutex_exit(pio);

	return (1);
}

/*
 * Wait until the i/o lock is not held.
 */
void
page_io_wait(page_t *pp)
{
	kmutex_t *pio;

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);
	while (pp->p_iolock_state & PAGE_IO_INUSE) {
		cv_wait(&(pp->p_io_cv), pio);
	}
	mutex_exit(pio);
}

/*
 * Returns non-zero if the page's i/o lock is held, 0 otherwise.
 */
int
page_io_locked(page_t *pp)
{
	return (pp->p_iolock_state & PAGE_IO_INUSE);
}

/*
 * For use in ASSERT()s: returns non-zero if the i/o lock on the page is held.
 */
int
page_iolock_assert(page_t *pp)
{
	return (page_io_locked(pp));
}
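
/*
 * Illustrative sketch of how the i/o lock is typically used; this is not
 * part of the implementation, and the i/o routine named below is
 * hypothetical.  The "shared/exclusive" lock keeps the page identity
 * stable while the i/o lock serializes i/o to the page:
 *
 *	ASSERT(PAGE_LOCKED(pp));
 *	page_io_lock(pp);		blocks while another i/o is in flight
 *	(void) start_page_io(pp);	hypothetical i/o call
 *	page_io_unlock(pp);		wakes page_io_lock()/page_io_wait()
 */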

/*
 * Wrapper exported to kernel routines that are built
 * platform-independent (the macro is platform-dependent;
 * the size of vph_mutex[] is based on NCPU).
 *
 * Note that you can do stress testing on this by setting the
 * variable page_vnode_mutex_stress to something other than
 * zero in a DEBUG kernel in a debugger after loading the kernel.
 * Setting it after the kernel is running may not work correctly.
 */
#ifdef	DEBUG
static int page_vnode_mutex_stress = 0;
#endif

kmutex_t *
page_vnode_mutex(vnode_t *vp)
{
	if (vp == &kvp)
		return (&vph_mutex[VPH_TABLE_SIZE + 0]);
#ifdef	DEBUG
	if (page_vnode_mutex_stress != 0)
		return (&vph_mutex[0]);
#endif

	return (&vph_mutex[VP_HASH_FUNC(vp)]);
}

kmutex_t *
page_se_mutex(page_t *pp)
{
	return (PAGE_SE_MUTEX(pp));
}

#ifdef VM_STATS
uint_t pszclck_stat[4];
#endif
/*
 * Find, take and return a mutex held by hat_page_demote().
 * Called by page_demote_vp_pages() before hat_page_demote() call and by
 * routines that want to block hat_page_demote() but can't do it
 * via locking all constituent pages.
 *
 * Return NULL if p_szc is 0.
 *
 * It should only be used for pages that can be demoted by hat_page_demote()
 * i.e. non swapfs file system pages.  The logic here is lifted from
 * sfmmu_mlspl_enter() except there's no need to worry about p_szc increase
 * since the page is locked and not free.
 *
 * Hash of the root page is used to find the lock.
 * To find the root in the presence of hat_page_demote() changing the location
 * of the root this routine relies on the fact that hat_page_demote() changes
 * root last.
 *
 * If NULL is returned pp's p_szc is guaranteed to be 0.  If non-NULL is
 * returned pp's p_szc may be any value.
 */
kmutex_t *
page_szc_lock(page_t *pp)
{
	kmutex_t	*mtx;
	page_t		*rootpp;
	uint_t		szc;
	uint_t		rszc;
	uint_t		pszc = pp->p_szc;

	ASSERT(pp != NULL);
	ASSERT(PAGE_LOCKED(pp));
	ASSERT(!PP_ISFREE(pp));
	ASSERT(pp->p_vnode != NULL);
	ASSERT(!IS_SWAPFSVP(pp->p_vnode));
	ASSERT(pp->p_vnode != &kvp);

again:
	if (pszc == 0) {
		VM_STAT_ADD(pszclck_stat[0]);
		return (NULL);
	}

	/* The lock lives in the root page */

	rootpp = PP_GROUPLEADER(pp, pszc);
	mtx = PAGE_SZC_MUTEX(rootpp);
	mutex_enter(mtx);

	/*
	 * Since p_szc can only decrease if pp == rootpp, rootpp will always
	 * be the same, i.e. we have the right root regardless of
	 * rootpp->p_szc.
	 * If the location of pp's root didn't change after we took
	 * the lock we have the right root.  Return the mutex hashed off it.
	 */
	if (pp == rootpp || (rszc = rootpp->p_szc) == pszc) {
		VM_STAT_ADD(pszclck_stat[1]);
		return (mtx);
	}

	/*
	 * root location changed because page got demoted.
	 * locate the new root.
	 */
	if (rszc < pszc) {
		szc = pp->p_szc;
		ASSERT(szc < pszc);
		mutex_exit(mtx);
		pszc = szc;
		VM_STAT_ADD(pszclck_stat[2]);
		goto again;
	}

	VM_STAT_ADD(pszclck_stat[3]);
	/*
	 * current hat_page_demote not done yet.
	 * wait for it to finish.
	 */
	mutex_exit(mtx);
	rootpp = PP_GROUPLEADER(rootpp, rszc);
	mtx = PAGE_SZC_MUTEX(rootpp);
	mutex_enter(mtx);
	mutex_exit(mtx);
	ASSERT(rootpp->p_szc < rszc);
	goto again;
}

int
page_szc_lock_assert(page_t *pp)
{
	page_t *rootpp = PP_PAGEROOT(pp);
	kmutex_t *mtx = PAGE_SZC_MUTEX(rootpp);

	return (MUTEX_HELD(mtx));
}