/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

/*
 * VM - page locking primitives
 */
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/vtrace.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/vnode.h>
#include <sys/bitmap.h>
#include <sys/lockstat.h>
#include <sys/condvar_impl.h>
#include <vm/page.h>
#include <vm/seg_enum.h>
#include <vm/vm_dep.h>

/*
 * This global mutex is for logical page locking.
 * The following fields in the page structure are protected
 * by this lock:
 *
 *	p_lckcnt
 *	p_cowcnt
 */
kmutex_t page_llock;

/*
 * This is a global lock for the logical page free list.  The
 * logical free list, in this implementation, is maintained as two
 * separate physical lists - the cache list and the free list.
 */
kmutex_t page_freelock;

/*
 * The hash table, page_hash[], the p_selock fields, and the
 * list of pages associated with vnodes are protected by arrays of mutexes.
 *
 * Unless the hashes are changed radically, the table sizes must be
 * a power of two.  Also, we typically need more mutexes for the
 * vnodes since these locks are occasionally held for long periods.
 * And since there seem to be two special vnodes (kvp and swapvp),
 * we make room for private mutexes for them.
 *
 * The pse_mutex[] array holds the mutexes to protect the p_selock
 * fields of all page_t structures.
 *
 * PAGE_SE_MUTEX(pp) returns the address of the appropriate mutex
 * when given a pointer to a page_t.
 *
 * PSE_TABLE_SIZE must be a power of two.  One could argue that we
 * should go to the trouble of setting it up at run time and base it
 * on memory size rather than the number of compile time CPUs.
 *
 * XX64	We should be using physmem size to calculate PSE_TABLE_SIZE,
 *	PSE_SHIFT, PIO_SHIFT.
 *
 *	These might break in 64 bit world.
 */
#define	PSE_SHIFT	7		/* log2(PSE_TABLE_SIZE) */

#define	PSE_TABLE_SIZE	128		/* number of mutexes to have */

#define	PIO_SHIFT	PSE_SHIFT	/* next power of 2 bigger than page_t */
#define	PIO_TABLE_SIZE	PSE_TABLE_SIZE	/* number of io mutexes to have */

pad_mutex_t	ph_mutex[PH_TABLE_SIZE];
pad_mutex_t	pse_mutex[PSE_TABLE_SIZE];
kmutex_t	pio_mutex[PIO_TABLE_SIZE];

#define	PAGE_SE_MUTEX(pp) \
	&pse_mutex[((((uintptr_t)(pp) >> PSE_SHIFT) ^ \
	((uintptr_t)(pp) >> (PSE_SHIFT << 1))) & \
	(PSE_TABLE_SIZE - 1))].pad_mutex

#define	PAGE_IO_MUTEX(pp) \
	&pio_mutex[(((uintptr_t)pp) >> PIO_SHIFT) & (PIO_TABLE_SIZE - 1)]

#define	PSZC_MTX_TABLE_SIZE	128
#define	PSZC_MTX_TABLE_SHIFT	7

static pad_mutex_t	pszc_mutex[PSZC_MTX_TABLE_SIZE];

#define	PAGE_SZC_MUTEX(_pp) \
	&pszc_mutex[((((uintptr_t)(_pp) >> PSZC_MTX_TABLE_SHIFT) ^ \
	((uintptr_t)(_pp) >> (PSZC_MTX_TABLE_SHIFT << 1)) ^ \
	((uintptr_t)(_pp) >> (3 * PSZC_MTX_TABLE_SHIFT))) & \
	(PSZC_MTX_TABLE_SIZE - 1))].pad_mutex
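
/*
 * For example, with PSE_SHIFT == 7 and PSE_TABLE_SIZE == 128, a page_t at
 * the (purely hypothetical) address 0x30001234 hashes to
 *
 *	((0x30001234 >> 7) ^ (0x30001234 >> 14)) & 127  ==  36
 *
 * so PAGE_SE_MUTEX() would yield &pse_mutex[36].pad_mutex, spreading
 * p_selock traffic for different pages across the pse_mutex[] table.
 */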

/*
 * The vph_mutex[] array holds the mutexes to protect the vnode chains,
 * (i.e., the list of pages anchored by v_pages and connected via p_vpprev
 * and p_vpnext).
 *
 * The page_vnode_mutex(vp) function returns the address of the appropriate
 * mutex from this array given a pointer to a vnode.  It is complicated
 * by the fact that the kernel's vnode and the swapfs vnode are referenced
 * frequently enough to warrant their own mutexes.
 *
 * The VP_HASH_FUNC returns the index into the vph_mutex array given
 * an address of a vnode.
 */

/*
 * XX64	VPH_TABLE_SIZE and VP_HASH_FUNC might break in 64 bit world.
 *	Need to review again.
 */
#define	VPH_TABLE_SIZE	(2 << VP_SHIFT)

#define	VP_HASH_FUNC(vp) \
	((((uintptr_t)(vp) >> 6) + \
	((uintptr_t)(vp) >> 8) + \
	((uintptr_t)(vp) >> 10) + \
	((uintptr_t)(vp) >> 12)) \
	& (VPH_TABLE_SIZE - 1))

extern	struct vnode kvp;

kmutex_t	vph_mutex[VPH_TABLE_SIZE + 2];

/*
 * Initialize the locks used by the Virtual Memory Management system.
 */
void
page_lock_init()
{
}

/*
 * At present we only use page ownership to aid debugging, so it's
 * OK if the owner field isn't exact.  In the 32-bit world two thread ids
 * can map to the same owner because we just 'or' in 0x80000000 and
 * then clear the second highest bit, so that (for example) 0x2faced00
 * and 0xafaced00 both map to 0xafaced00.
 * In the 64-bit world, p_selock may not be large enough to hold a full
 * thread pointer.  If we ever need precise ownership (e.g. if we implement
 * priority inheritance for page locks) then p_selock should become a
 * uintptr_t and SE_WRITER should be -((uintptr_t)curthread >> 2).
 */
#define	SE_WRITER	(((selock_t)(ulong_t)curthread | INT_MIN) & ~SE_EWANTED)
#define	SE_READER	1

/*
 * A page that is deleted must be marked as such using the
 * page_lock_delete() function.  The page must be exclusively locked.
 * The SE_DELETED marker is put in p_selock when this function is called.
 * SE_DELETED must be distinct from any SE_WRITER value.
 */
#define	SE_DELETED	(1 | INT_MIN)

#ifdef VM_STATS
uint_t	vph_kvp_count;
uint_t	vph_swapfsvp_count;
uint_t	vph_other;
#endif /* VM_STATS */

#ifdef VM_STATS
uint_t	page_lock_count;
uint_t	page_lock_miss;
uint_t	page_lock_miss_lock;
uint_t	page_lock_reclaim;
uint_t	page_lock_bad_reclaim;
uint_t	page_lock_same_page;
uint_t	page_lock_upgrade;
uint_t	page_lock_retired;
uint_t	page_lock_upgrade_failed;
uint_t	page_lock_deleted;

uint_t	page_trylock_locked;
uint_t	page_trylock_failed;
uint_t	page_trylock_missed;

uint_t	page_try_reclaim_upgrade;
#endif /* VM_STATS */

/*
 * Acquire the "shared/exclusive" lock on a page.
 *
 * Returns 1 on success and locks the page appropriately.
 * 0 on failure and does not lock the page.
 *
 * If `lock' is non-NULL, it will be dropped and reacquired in the
 * failure case.  This routine can block, and if it does
 * it will always return a failure since the page identity [vp, off]
 * or state may have changed.
 */

int
page_lock(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim)
{
	return (page_lock_es(pp, se, lock, reclaim, 0));
}
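
/*
 * A typical call site looks roughly like this (the lookup details and the
 * `phm' hash-mutex name are illustrative): look the page up under a hash
 * mutex, then try to lock it, passing that mutex so it can be dropped if
 * we must block; on failure, start over, because the page identity may
 * have changed while we slept:
 *
 *	top:
 *		phm = ...;			hash mutex for [vp, off]
 *		mutex_enter(phm);
 *		pp = find the page for [vp, off];
 *		if (pp != NULL && !page_lock(pp, SE_SHARED, phm, P_RECLAIM)) {
 *			mutex_exit(phm);	lock was reacquired for us
 *			goto top;
 *		}
 *		mutex_exit(phm);
 */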

/*
 * With the addition of reader-writer lock semantics to page_lock_es,
 * callers wanting an exclusive (writer) lock may prevent shared-lock
 * (reader) starvation by setting the es parameter to SE_EXCL_WANTED.
 * In this case, when an exclusive lock cannot be acquired, p_selock's
 * SE_EWANTED bit is set.  Shared-lock (reader) requests are also denied
 * if the page is slated for retirement.
 *
 * The se and es parameters determine if the lock should be granted
 * based on the following decision table:
 *
 * Lock wanted	es flags	p_selock/SE_EWANTED	Action
 * -----------	--------------	-------------------	---------
 * SE_EXCL	any [1][2]	unlocked/any		grant lock, clear SE_EWANTED
 * SE_EXCL	SE_EWANTED	any lock/any		deny, set SE_EWANTED
 * SE_EXCL	none		any lock/any		deny
 * SE_SHARED	n/a [2][3]	shared/0		grant
 * SE_SHARED	n/a [2][3]	unlocked/0		grant
 * SE_SHARED	n/a		shared/1		deny
 * SE_SHARED	n/a		unlocked/1		deny
 * SE_SHARED	n/a		excl/any		deny
 *
 * Notes:
 * [1] The code grants an exclusive lock to the caller and clears the bit
 *     SE_EWANTED whenever p_selock is unlocked, regardless of the SE_EWANTED
 *     bit's value.  This was deemed acceptable as we are not concerned about
 *     exclusive-lock starvation.  If this ever becomes an issue, a priority
 *     or fifo mechanism should also be implemented.  Meantime, the thread
 *     that set SE_EWANTED should be prepared to catch this condition and
 *     reset it.
 *
 * [2] Retired pages may not be locked at any time, regardless of the
 *     disposition of se, unless the es parameter has SE_RETIRED flag set.
 *
 * [3] If the page is slated for retirement the lock is denied.
 *
 * Notes on values of "es":
 *
 *	es & 1: page_lookup_create will attempt page relocation
 *	es & SE_EXCL_WANTED: caller wants SE_EWANTED set (eg. delete
 *	    memory thread); this prevents reader-starvation of waiting
 *	    writer thread(s) by giving priority to writers over readers.
 *	es & SE_RETIRED: caller wants to lock pages even if they are
 *	    retired.  Default is to deny the lock if the page is retired.
 *
 * And yes, we know, the semantics of this function are too complicated.
 * It's on the list to be cleaned up.
 */
int
page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
{
	int		retval;
	kmutex_t	*pse = PAGE_SE_MUTEX(pp);
	int		upgraded;
	int		reclaim_it;

	ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);

	VM_STAT_ADD(page_lock_count);

	upgraded = 0;
	reclaim_it = 0;

	mutex_enter(pse);

	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));

	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
		mutex_exit(pse);
		VM_STAT_ADD(page_lock_retired);
		return (0);
	}

	if (se == SE_SHARED && es == 1 && pp->p_selock == 0) {
		se = SE_EXCL;
	}

	if ((reclaim == P_RECLAIM) && (PP_ISFREE(pp))) {

		reclaim_it = 1;
		if (se == SE_SHARED) {
			/*
			 * This is an interesting situation.
			 *
			 * Remember that p_free can only change if
			 * p_selock < 0.
			 * p_free does not depend on our holding `pse'.
			 * And, since we hold `pse', p_selock can not change.
			 * So, if p_free changes on us, the page is already
			 * exclusively held, and we would fail to get p_selock
			 * regardless.
			 *
			 * We want to avoid getting the share
			 * lock on a free page that needs to be reclaimed.
			 * It is possible that some other thread has the share
			 * lock and has left the free page on the cache list.
			 * pvn_vplist_dirty() does this for brief periods.
			 * If the se_share is currently SE_EXCL, we will fail
			 * to acquire p_selock anyway.  Blocking is the
			 * right thing to do.
			 * If we need to reclaim this page, we must get
			 * exclusive access to it, force the upgrade now.
			 * Again, we will fail to acquire p_selock if the
			 * page is not free and block.
			 */
			upgraded = 1;
			se = SE_EXCL;
			VM_STAT_ADD(page_lock_upgrade);
		}
	}

	if (se == SE_EXCL) {
		if (!(es & SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) {
			/*
			 * if the caller wants a writer lock (but did not
			 * specify exclusive access), and there is a pending
			 * writer that wants exclusive access, return failure
			 */
			retval = 0;
		} else if ((pp->p_selock & ~SE_EWANTED) == 0) {
			/* no reader/writer lock held */
			THREAD_KPRI_REQUEST();
			/* this clears our setting of the SE_EWANTED bit */
			pp->p_selock = SE_WRITER;
			retval = 1;
		} else {
			/* page is locked */
			if (es & SE_EXCL_WANTED) {
				/* set the SE_EWANTED bit */
				pp->p_selock |= SE_EWANTED;
			}
			retval = 0;
		}
	} else {
		retval = 0;
		if (pp->p_selock >= 0) {
			/*
			 * Readers are not allowed when excl wanted or
			 * a retire is pending.  Since kvp pages can take
			 * a long time to be retired, we make an exception
			 * for them to avoid hanging threads unnecessarily.
			 */
			if ((pp->p_selock & SE_EWANTED) == 0) {
				if (!PP_PR_REQ(pp) || pp->p_vnode == &kvp) {
					pp->p_selock += SE_READER;
					retval = 1;
				}
			}
		}
	}

	if (retval == 0) {
		if ((pp->p_selock & ~SE_EWANTED) == SE_DELETED) {
			VM_STAT_ADD(page_lock_deleted);
			mutex_exit(pse);
			return (retval);
		}

#ifdef VM_STATS
		VM_STAT_ADD(page_lock_miss);
		if (upgraded) {
			VM_STAT_ADD(page_lock_upgrade_failed);
		}
#endif
		if (lock) {
			VM_STAT_ADD(page_lock_miss_lock);
			mutex_exit(lock);
		}

		/*
		 * Now, wait for the page to be unlocked and
		 * release the lock protecting p_cv and p_selock.
		 */
		cv_wait(&pp->p_cv, pse);
		mutex_exit(pse);

		/*
		 * The page identity may have changed while we were
		 * blocked.  If we are willing to depend on "pp"
		 * still pointing to a valid page structure (i.e.,
		 * assuming page structures are not dynamically allocated
		 * or freed), we could try to lock the page if its
		 * identity hasn't changed.
		 *
		 * This needs to be measured, since we come back from
		 * cv_wait holding pse (the expensive part of this
		 * operation) we might as well try the cheap part.
		 * Though we would also have to confirm that dropping
		 * `lock' did not cause any grief to the callers.
		 */
		if (lock) {
			mutex_enter(lock);
		}
	} else {
		/*
		 * We have the page lock.
		 * If we needed to reclaim the page, and the page
		 * needed reclaiming (ie, it was free), then we
		 * have the page exclusively locked.  We may need
		 * to downgrade the page.
		 */
		ASSERT((upgraded) ?
		    ((PP_ISFREE(pp)) && PAGE_EXCL(pp)) : 1);
		mutex_exit(pse);

		/*
		 * We now hold this page's lock, either shared or
		 * exclusive.  This will prevent its identity from changing.
		 * The page, however, may or may not be free.  If the caller
		 * requested, and it is free, go reclaim it from the
		 * free list.  If the page can't be reclaimed, return failure
		 * so that the caller can start all over again.
		 *
		 * NOTE: page_reclaim() releases the page lock (p_selock)
		 *	if it can't be reclaimed.
		 */
		if (reclaim_it) {
			if (!page_reclaim(pp, lock)) {
				VM_STAT_ADD(page_lock_bad_reclaim);
				retval = 0;
			} else {
				VM_STAT_ADD(page_lock_reclaim);
				if (upgraded) {
					page_downgrade(pp);
				}
			}
		}
	}
	return (retval);
}
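
/*
 * For example, a thread that must not be starved by readers (such as the
 * delete memory thread mentioned above) might, roughly, call
 *
 *	(void) page_lock_es(pp, SE_EXCL, NULL, P_NO_RECLAIM, SE_EXCL_WANTED);
 *
 * and retry later if it fails; SE_EWANTED remains set in p_selock so new
 * readers are turned away in the meantime.  A thread that gives up must
 * clear the bit with page_lock_clr_exclwanted(pp) (see below).
 */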

/*
 * Clear the SE_EWANTED bit from p_selock.  This function allows
 * callers of page_lock_es and page_try_reclaim_lock to clear
 * their setting of this bit if they decide they no longer wish
 * to gain exclusive access to the page.  Currently only
 * delete_memory_thread uses this when the delete memory
 * operation is cancelled.
 */
void
page_lock_clr_exclwanted(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	mutex_enter(pse);
	pp->p_selock &= ~SE_EWANTED;
	if (CV_HAS_WAITERS(&pp->p_cv))
		cv_broadcast(&pp->p_cv);
	mutex_exit(pse);
}

/*
 * Read the comments inside of page_lock_es() carefully.
 *
 * SE_EXCL callers specifying es == SE_EXCL_WANTED will cause the
 * SE_EWANTED bit of p_selock to be set when the lock cannot be obtained.
 * This is used by threads subject to reader-starvation (eg. memory delete).
 *
 * When a thread using SE_EXCL_WANTED does not obtain the SE_EXCL lock,
 * it is expected that it will retry at a later time.  Threads that will
 * not retry the lock *must* call page_lock_clr_exclwanted to clear the
 * SE_EWANTED bit.  (When a thread using SE_EXCL_WANTED obtains the lock,
 * the bit is cleared.)
 */
int
page_try_reclaim_lock(page_t *pp, se_t se, int es)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);

	old = pp->p_selock;

	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));

	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
		mutex_exit(pse);
		VM_STAT_ADD(page_trylock_failed);
		return (0);
	}

	if (se == SE_SHARED && es == 1 && old == 0) {
		se = SE_EXCL;
	}

	if (se == SE_SHARED) {
		if (!PP_ISFREE(pp)) {
			if (old >= 0) {
				/*
				 * Readers are not allowed when excl wanted
				 * or a retire is pending.  Since kvp pages can
				 * take a long time to be retired, we make an
				 * exception for them to avoid hanging threads
				 * unnecessarily.
				 */
				if ((old & SE_EWANTED) == 0) {
					if (!PP_PR_REQ(pp) ||
					    pp->p_vnode == &kvp) {
						pp->p_selock = old + SE_READER;
						mutex_exit(pse);
						return (1);
					}
				}
			}
			mutex_exit(pse);
			return (0);
		}
		/*
		 * The page is free, so we really want SE_EXCL (below)
		 */
		VM_STAT_ADD(page_try_reclaim_upgrade);
	}

	/*
	 * The caller wants a writer lock.  We try for it only if
	 * SE_EWANTED is not set, or if the caller specified
	 * SE_EXCL_WANTED.
	 */
	if (!(old & SE_EWANTED) || (es & SE_EXCL_WANTED)) {
		if ((old & ~SE_EWANTED) == 0) {
			/* no reader/writer lock held */
			THREAD_KPRI_REQUEST();
			/* this clears out our setting of the SE_EWANTED bit */
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			return (1);
		}
	}
	if (es & SE_EXCL_WANTED) {
		/* page is locked, set the SE_EWANTED bit */
		pp->p_selock |= SE_EWANTED;
	}
	mutex_exit(pse);
	return (0);
}
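
/*
 * Illustrative use of the SE_EXCL_WANTED protocol, roughly (the
 * cancellation test and the back-off are placeholders; the real callers
 * live in the memory-delete code):
 *
 *	while (!page_try_reclaim_lock(pp, SE_EXCL, SE_EXCL_WANTED)) {
 *		if (operation was cancelled) {
 *			page_lock_clr_exclwanted(pp);
 *			return;
 *		}
 *		delay(...);		back off and retry
 *	}
 *	... page is now held SE_EXCL and SE_EWANTED has been cleared ...
 */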

/*
 * Acquire a page's "shared/exclusive" lock, but never block.
 * Returns 1 on success, 0 on failure.
 */
int
page_trylock(page_t *pp, se_t se)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	mutex_enter(pse);
	if (pp->p_selock & SE_EWANTED || PP_RETIRED(pp) ||
	    (se == SE_SHARED && PP_PR_REQ(pp) && pp->p_vnode != &kvp)) {
		/*
		 * Fail if a thread wants exclusive access and page is
		 * retired, if the page is slated for retirement, or a
		 * share lock is requested.
		 */
		mutex_exit(pse);
		VM_STAT_ADD(page_trylock_failed);
		return (0);
	}

	if (se == SE_EXCL) {
		if (pp->p_selock == 0) {
			THREAD_KPRI_REQUEST();
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			return (1);
		}
	} else {
		if (pp->p_selock >= 0) {
			pp->p_selock += SE_READER;
			mutex_exit(pse);
			return (1);
		}
	}
	mutex_exit(pse);
	return (0);
}

/*
 * Variant of page_unlock() specifically for the page freelist
 * code.  The mere existence of this code is a vile hack that
 * has resulted due to the backwards locking order of the page
 * freelist manager; please don't call it.
 */
void
page_unlock_noretire(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);

	old = pp->p_selock;
	if ((old & ~SE_EWANTED) == SE_READER) {
		pp->p_selock = old & ~SE_READER;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
		panic("page_unlock_noretire: page %p is deleted", pp);
	} else if (old < 0) {
		THREAD_KPRI_RELEASE();
		pp->p_selock &= SE_EWANTED;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) > SE_READER) {
		pp->p_selock = old - SE_READER;
	} else {
		panic("page_unlock_noretire: page %p is not locked", pp);
	}

	mutex_exit(pse);
}
484 */ 485 int 486 page_try_reclaim_lock(page_t *pp, se_t se, int es) 487 { 488 kmutex_t *pse = PAGE_SE_MUTEX(pp); 489 selock_t old; 490 491 mutex_enter(pse); 492 493 old = pp->p_selock; 494 495 ASSERT(((es & SE_EXCL_WANTED) == 0) || 496 ((es & SE_EXCL_WANTED) && (se == SE_EXCL))); 497 498 if (PP_RETIRED(pp) && !(es & SE_RETIRED)) { 499 mutex_exit(pse); 500 VM_STAT_ADD(page_trylock_failed); 501 return (0); 502 } 503 504 if (se == SE_SHARED && es == 1 && old == 0) { 505 se = SE_EXCL; 506 } 507 508 if (se == SE_SHARED) { 509 if (!PP_ISFREE(pp)) { 510 if (old >= 0) { 511 /* 512 * Readers are not allowed when excl wanted 513 * or a retire is pending. Since kvp pages can 514 * take a long time to be retired, we make an 515 * exception for them to avoid hanging threads 516 * unnecessarily. 517 */ 518 if ((old & SE_EWANTED) == 0) { 519 if (!PP_PR_REQ(pp) || 520 pp->p_vnode == &kvp) { 521 pp->p_selock = old + SE_READER; 522 mutex_exit(pse); 523 return (1); 524 } 525 } 526 } 527 mutex_exit(pse); 528 return (0); 529 } 530 /* 531 * The page is free, so we really want SE_EXCL (below) 532 */ 533 VM_STAT_ADD(page_try_reclaim_upgrade); 534 } 535 536 /* 537 * The caller wants a writer lock. We try for it only if 538 * SE_EWANTED is not set, or if the caller specified 539 * SE_EXCL_WANTED. 540 */ 541 if (!(old & SE_EWANTED) || (es & SE_EXCL_WANTED)) { 542 if ((old & ~SE_EWANTED) == 0) { 543 /* no reader/writer lock held */ 544 THREAD_KPRI_REQUEST(); 545 /* this clears out our setting of the SE_EWANTED bit */ 546 pp->p_selock = SE_WRITER; 547 mutex_exit(pse); 548 return (1); 549 } 550 } 551 if (es & SE_EXCL_WANTED) { 552 /* page is locked, set the SE_EWANTED bit */ 553 pp->p_selock |= SE_EWANTED; 554 } 555 mutex_exit(pse); 556 return (0); 557 } 558 559 /* 560 * Acquire a page's "shared/exclusive" lock, but never block. 561 * Returns 1 on success, 0 on failure. 562 */ 563 int 564 page_trylock(page_t *pp, se_t se) 565 { 566 kmutex_t *pse = PAGE_SE_MUTEX(pp); 567 568 mutex_enter(pse); 569 if (pp->p_selock & SE_EWANTED || PP_RETIRED(pp) || 570 (se == SE_SHARED && PP_PR_REQ(pp) && pp->p_vnode != &kvp)) { 571 /* 572 * Fail if a thread wants exclusive access and page is 573 * retired, if the page is slated for retirement, or a 574 * share lock is requested. 575 */ 576 mutex_exit(pse); 577 VM_STAT_ADD(page_trylock_failed); 578 return (0); 579 } 580 581 if (se == SE_EXCL) { 582 if (pp->p_selock == 0) { 583 THREAD_KPRI_REQUEST(); 584 pp->p_selock = SE_WRITER; 585 mutex_exit(pse); 586 return (1); 587 } 588 } else { 589 if (pp->p_selock >= 0) { 590 pp->p_selock += SE_READER; 591 mutex_exit(pse); 592 return (1); 593 } 594 } 595 mutex_exit(pse); 596 return (0); 597 } 598 599 /* 600 * Variant of page_unlock() specifically for the page freelist 601 * code. The mere existence of this code is a vile hack that 602 * has resulted due to the backwards locking order of the page 603 * freelist manager; please don't call it. 
604 */ 605 void 606 page_unlock_noretire(page_t *pp) 607 { 608 kmutex_t *pse = PAGE_SE_MUTEX(pp); 609 selock_t old; 610 611 mutex_enter(pse); 612 613 old = pp->p_selock; 614 if ((old & ~SE_EWANTED) == SE_READER) { 615 pp->p_selock = old & ~SE_READER; 616 if (CV_HAS_WAITERS(&pp->p_cv)) 617 cv_broadcast(&pp->p_cv); 618 } else if ((old & ~SE_EWANTED) == SE_DELETED) { 619 panic("page_unlock_noretire: page %p is deleted", pp); 620 } else if (old < 0) { 621 THREAD_KPRI_RELEASE(); 622 pp->p_selock &= SE_EWANTED; 623 if (CV_HAS_WAITERS(&pp->p_cv)) 624 cv_broadcast(&pp->p_cv); 625 } else if ((old & ~SE_EWANTED) > SE_READER) { 626 pp->p_selock = old - SE_READER; 627 } else { 628 panic("page_unlock_noretire: page %p is not locked", pp); 629 } 630 631 mutex_exit(pse); 632 } 633 634 /* 635 * Release the page's "shared/exclusive" lock and wake up anyone 636 * who might be waiting for it. 637 */ 638 void 639 page_unlock(page_t *pp) 640 { 641 kmutex_t *pse = PAGE_SE_MUTEX(pp); 642 selock_t old; 643 644 mutex_enter(pse); 645 646 old = pp->p_selock; 647 if ((old & ~SE_EWANTED) == SE_READER) { 648 pp->p_selock = old & ~SE_READER; 649 if (CV_HAS_WAITERS(&pp->p_cv)) 650 cv_broadcast(&pp->p_cv); 651 } else if ((old & ~SE_EWANTED) == SE_DELETED) { 652 panic("page_unlock: page %p is deleted", pp); 653 } else if (old < 0) { 654 THREAD_KPRI_RELEASE(); 655 pp->p_selock &= SE_EWANTED; 656 if (CV_HAS_WAITERS(&pp->p_cv)) 657 cv_broadcast(&pp->p_cv); 658 } else if ((old & ~SE_EWANTED) > SE_READER) { 659 pp->p_selock = old - SE_READER; 660 } else { 661 panic("page_unlock: page %p is not locked", pp); 662 } 663 664 if (pp->p_selock == 0 && PP_PR_REQ(pp)) { 665 /* 666 * Try to retire the page. If it retires, great. 667 * If not, oh well, we'll get it in the next unlock 668 * request, and repeat the cycle. Regardless, 669 * page_tryretire() will drop the page lock. 670 */ 671 if ((pp->p_toxic & PR_BUSY) == 0) { 672 THREAD_KPRI_REQUEST(); 673 pp->p_selock = SE_WRITER; 674 page_settoxic(pp, PR_BUSY); 675 mutex_exit(pse); 676 page_tryretire(pp); 677 } else { 678 pp->p_selock = SE_WRITER; 679 page_clrtoxic(pp, PR_BUSY); 680 pp->p_selock = 0; 681 mutex_exit(pse); 682 } 683 } else { 684 mutex_exit(pse); 685 } 686 } 687 688 /* 689 * Try to upgrade the lock on the page from a "shared" to an 690 * "exclusive" lock. Since this upgrade operation is done while 691 * holding the mutex protecting this page, no one else can acquire this page's 692 * lock and change the page. Thus, it is safe to drop the "shared" 693 * lock and attempt to acquire the "exclusive" lock. 694 * 695 * Returns 1 on success, 0 on failure. 696 */ 697 int 698 page_tryupgrade(page_t *pp) 699 { 700 kmutex_t *pse = PAGE_SE_MUTEX(pp); 701 702 mutex_enter(pse); 703 if (!(pp->p_selock & SE_EWANTED)) { 704 /* no threads want exclusive access, try upgrade */ 705 if (pp->p_selock == SE_READER) { 706 THREAD_KPRI_REQUEST(); 707 /* convert to exclusive lock */ 708 pp->p_selock = SE_WRITER; 709 mutex_exit(pse); 710 return (1); 711 } 712 } 713 mutex_exit(pse); 714 return (0); 715 } 716 717 /* 718 * Downgrade the "exclusive" lock on the page to a "shared" lock 719 * while holding the mutex protecting this page's p_selock field. 
720 */ 721 void 722 page_downgrade(page_t *pp) 723 { 724 kmutex_t *pse = PAGE_SE_MUTEX(pp); 725 int excl_waiting; 726 727 ASSERT((pp->p_selock & ~SE_EWANTED) != SE_DELETED); 728 ASSERT(PAGE_EXCL(pp)); 729 730 mutex_enter(pse); 731 excl_waiting = pp->p_selock & SE_EWANTED; 732 THREAD_KPRI_RELEASE(); 733 pp->p_selock = SE_READER | excl_waiting; 734 if (CV_HAS_WAITERS(&pp->p_cv)) 735 cv_broadcast(&pp->p_cv); 736 mutex_exit(pse); 737 } 738 739 void 740 page_lock_delete(page_t *pp) 741 { 742 kmutex_t *pse = PAGE_SE_MUTEX(pp); 743 744 ASSERT(PAGE_EXCL(pp)); 745 ASSERT(pp->p_vnode == NULL); 746 ASSERT(pp->p_offset == (u_offset_t)-1); 747 ASSERT(!PP_ISFREE(pp)); 748 749 mutex_enter(pse); 750 THREAD_KPRI_RELEASE(); 751 pp->p_selock = SE_DELETED; 752 if (CV_HAS_WAITERS(&pp->p_cv)) 753 cv_broadcast(&pp->p_cv); 754 mutex_exit(pse); 755 } 756 757 /* 758 * Implement the io lock for pages 759 */ 760 void 761 page_iolock_init(page_t *pp) 762 { 763 pp->p_iolock_state = 0; 764 cv_init(&pp->p_io_cv, NULL, CV_DEFAULT, NULL); 765 } 766 767 /* 768 * Acquire the i/o lock on a page. 769 */ 770 void 771 page_io_lock(page_t *pp) 772 { 773 kmutex_t *pio; 774 775 pio = PAGE_IO_MUTEX(pp); 776 mutex_enter(pio); 777 while (pp->p_iolock_state & PAGE_IO_INUSE) { 778 cv_wait(&(pp->p_io_cv), pio); 779 } 780 pp->p_iolock_state |= PAGE_IO_INUSE; 781 mutex_exit(pio); 782 } 783 784 /* 785 * Release the i/o lock on a page. 786 */ 787 void 788 page_io_unlock(page_t *pp) 789 { 790 kmutex_t *pio; 791 792 pio = PAGE_IO_MUTEX(pp); 793 mutex_enter(pio); 794 cv_signal(&pp->p_io_cv); 795 pp->p_iolock_state &= ~PAGE_IO_INUSE; 796 mutex_exit(pio); 797 } 798 799 /* 800 * Try to acquire the i/o lock on a page without blocking. 801 * Returns 1 on success, 0 on failure. 802 */ 803 int 804 page_io_trylock(page_t *pp) 805 { 806 kmutex_t *pio; 807 808 if (pp->p_iolock_state & PAGE_IO_INUSE) 809 return (0); 810 811 pio = PAGE_IO_MUTEX(pp); 812 mutex_enter(pio); 813 814 if (pp->p_iolock_state & PAGE_IO_INUSE) { 815 mutex_exit(pio); 816 return (0); 817 } 818 pp->p_iolock_state |= PAGE_IO_INUSE; 819 mutex_exit(pio); 820 821 return (1); 822 } 823 824 /* 825 * Assert that the i/o lock on a page is held. 826 * Returns 1 on success, 0 on failure. 827 */ 828 int 829 page_iolock_assert(page_t *pp) 830 { 831 return (pp->p_iolock_state & PAGE_IO_INUSE); 832 } 833 834 /* 835 * Wrapper exported to kernel routines that are built 836 * platform-independent (the macro is platform-dependent; 837 * the size of vph_mutex[] is based on NCPU). 838 * 839 * Note that you can do stress testing on this by setting the 840 * variable page_vnode_mutex_stress to something other than 841 * zero in a DEBUG kernel in a debugger after loading the kernel. 842 * Setting it after the kernel is running may not work correctly. 843 */ 844 #ifdef DEBUG 845 static int page_vnode_mutex_stress = 0; 846 #endif 847 848 kmutex_t * 849 page_vnode_mutex(vnode_t *vp) 850 { 851 if (vp == &kvp) 852 return (&vph_mutex[VPH_TABLE_SIZE + 0]); 853 #ifdef DEBUG 854 if (page_vnode_mutex_stress != 0) 855 return (&vph_mutex[0]); 856 #endif 857 858 return (&vph_mutex[VP_HASH_FUNC(vp)]); 859 } 860 861 kmutex_t * 862 page_se_mutex(page_t *pp) 863 { 864 return (PAGE_SE_MUTEX(pp)); 865 } 866 867 #ifdef VM_STATS 868 uint_t pszclck_stat[4]; 869 #endif 870 /* 871 * Find, take and return a mutex held by hat_page_demote(). 872 * Called by page_demote_vp_pages() before hat_page_demote() call and by 873 * routines that want to block hat_page_demote() but can't do it 874 * via locking all constituent pages. 

/*
 * Wrapper exported to kernel routines that are built
 * platform-independent (the macro is platform-dependent;
 * the size of vph_mutex[] is based on NCPU).
 *
 * Note that you can do stress testing on this by setting the
 * variable page_vnode_mutex_stress to something other than
 * zero in a DEBUG kernel in a debugger after loading the kernel.
 * Setting it after the kernel is running may not work correctly.
 */
#ifdef DEBUG
static int page_vnode_mutex_stress = 0;
#endif

kmutex_t *
page_vnode_mutex(vnode_t *vp)
{
	if (vp == &kvp)
		return (&vph_mutex[VPH_TABLE_SIZE + 0]);
#ifdef DEBUG
	if (page_vnode_mutex_stress != 0)
		return (&vph_mutex[0]);
#endif

	return (&vph_mutex[VP_HASH_FUNC(vp)]);
}

kmutex_t *
page_se_mutex(page_t *pp)
{
	return (PAGE_SE_MUTEX(pp));
}

#ifdef VM_STATS
uint_t pszclck_stat[4];
#endif
/*
 * Find, take and return a mutex held by hat_page_demote().
 * Called by page_demote_vp_pages() before hat_page_demote() call and by
 * routines that want to block hat_page_demote() but can't do it
 * via locking all constituent pages.
 *
 * Return NULL if p_szc is 0.
 *
 * It should only be used for pages that can be demoted by hat_page_demote(),
 * i.e. non swapfs file system pages.  The logic here is lifted from
 * sfmmu_mlspl_enter() except there's no need to worry about p_szc increase
 * since the page is locked and not free.
 *
 * Hash of the root page is used to find the lock.
 * To find the root in the presence of hat_page_demote() changing the location
 * of the root, this routine relies on the fact that hat_page_demote() changes
 * the root last.
 *
 * If NULL is returned pp's p_szc is guaranteed to be 0.  If non-NULL is
 * returned pp's p_szc may be any value.
 */
kmutex_t *
page_szc_lock(page_t *pp)
{
	kmutex_t	*mtx;
	page_t		*rootpp;
	uint_t		szc;
	uint_t		rszc;
	uint_t		pszc = pp->p_szc;

	ASSERT(pp != NULL);
	ASSERT(PAGE_LOCKED(pp));
	ASSERT(!PP_ISFREE(pp));
	ASSERT(pp->p_vnode != NULL);
	ASSERT(!IS_SWAPFSVP(pp->p_vnode));
	ASSERT(pp->p_vnode != &kvp);

again:
	if (pszc == 0) {
		VM_STAT_ADD(pszclck_stat[0]);
		return (NULL);
	}

	/* The lock lives in the root page */

	rootpp = PP_GROUPLEADER(pp, pszc);
	mtx = PAGE_SZC_MUTEX(rootpp);
	mutex_enter(mtx);

	/*
	 * Since p_szc can only decrease if pp == rootpp, rootpp will always
	 * be the same, i.e. we have the right root regardless of
	 * rootpp->p_szc.
	 * If the location of pp's root didn't change after we took
	 * the lock, we have the right root.  Return the mutex hashed off it.
	 */
	if (pp == rootpp || (rszc = rootpp->p_szc) == pszc) {
		VM_STAT_ADD(pszclck_stat[1]);
		return (mtx);
	}

	/*
	 * The root location changed because the page got demoted.
	 * Locate the new root.
	 */
	if (rszc < pszc) {
		szc = pp->p_szc;
		ASSERT(szc < pszc);
		mutex_exit(mtx);
		pszc = szc;
		VM_STAT_ADD(pszclck_stat[2]);
		goto again;
	}

	VM_STAT_ADD(pszclck_stat[3]);
	/*
	 * The current hat_page_demote() is not done yet.
	 * Wait for it to finish.
	 */
	mutex_exit(mtx);
	rootpp = PP_GROUPLEADER(rootpp, rszc);
	mtx = PAGE_SZC_MUTEX(rootpp);
	mutex_enter(mtx);
	mutex_exit(mtx);
	ASSERT(rootpp->p_szc < rszc);
	goto again;
}

int
page_szc_lock_assert(page_t *pp)
{
	page_t *rootpp = PP_PAGEROOT(pp);
	kmutex_t *mtx = PAGE_SZC_MUTEX(rootpp);

	return (MUTEX_HELD(mtx));
}
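
/*
 * Illustrative caller pattern for page_szc_lock(), roughly: take the lock
 * (if any), do the p_szc-dependent work, then drop it:
 *
 *	kmutex_t *mtx = page_szc_lock(pp);
 *	... hat_page_demote() cannot change pp's p_szc here ...
 *	if (mtx != NULL)
 *		mutex_exit(mtx);
 *
 * If NULL was returned, pp->p_szc was 0 and, since the page is locked and
 * not free, there is nothing to demote and no lock to drop.
 */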