/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */


/*
 * VM - page locking primitives
 */
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/vtrace.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/bitmap.h>
#include <sys/lockstat.h>
#include <sys/sysmacros.h>
#include <sys/condvar_impl.h>
#include <vm/page.h>
#include <vm/seg_enum.h>
#include <vm/vm_dep.h>
#include <vm/seg_kmem.h>

/*
 * This global mutex is for logical page locking.
 * The following fields in the page structure are protected
 * by this lock:
 *
 *	p_lckcnt
 *	p_cowcnt
 */
kmutex_t page_llock;

/*
 * This is a global lock for the logical page free list.  The
 * logical free list, in this implementation, is maintained as two
 * separate physical lists - the cache list and the free list.
 */
kmutex_t page_freelock;

/*
 * The hash table, page_hash[], the p_selock fields, and the
 * list of pages associated with vnodes are protected by arrays of mutexes.
 *
 * Unless the hashes are changed radically, the table sizes must be
 * a power of two.  Also, we typically need more mutexes for the
 * vnodes since these locks are occasionally held for long periods.
 * And since there seem to be two special vnodes (kvp and swapvp),
 * we make room for private mutexes for them.
 *
 * The pse_mutex[] array holds the mutexes to protect the p_selock
 * fields of all page_t structures.
 *
 * PAGE_SE_MUTEX(pp) returns the address of the appropriate mutex
 * when given a pointer to a page_t.
 *
 * PIO_TABLE_SIZE must be a power of two.  One could argue that we
 * should go to the trouble of setting it up at run time and base it
 * on memory size rather than the number of compile time CPUs.
 *
 * XX64	We should be using physmem size to calculate PIO_SHIFT.
 *
 *	These might break in 64 bit world.
 */
#define	PIO_SHIFT	7	/* log2(sizeof(page_t)) */
#define	PIO_TABLE_SIZE	128	/* number of io mutexes to have */

pad_mutex_t	ph_mutex[PH_TABLE_SIZE];
kmutex_t	pio_mutex[PIO_TABLE_SIZE];

#define	PAGE_IO_MUTEX(pp) \
	    &pio_mutex[(((uintptr_t)pp) >> PIO_SHIFT) & (PIO_TABLE_SIZE - 1)]
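
/*
 * Illustration (not part of the interface): PAGE_IO_MUTEX() just hashes the
 * page_t address into pio_mutex[], so two pages whose addresses differ by a
 * multiple of (PIO_TABLE_SIZE << PIO_SHIFT) bytes share an io mutex:
 *
 *	kmutex_t *m1 = PAGE_IO_MUTEX(pp);
 *	kmutex_t *m2 = PAGE_IO_MUTEX((page_t *)((uintptr_t)pp +
 *	    (PIO_TABLE_SIZE << PIO_SHIFT)));
 *	ASSERT(m1 == m2);
 *
 * Sharing a slot is harmless; the io mutex is only held briefly while
 * p_iolock_state is examined or updated (see page_io_lock() and friends
 * below).
 */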

/*
 * The pse_mutex[] array is allocated in the platform startup code
 * based on the size of the machine at startup.
 */
extern pad_mutex_t *pse_mutex;		/* Locks protecting pp->p_selock */
extern size_t pse_table_size;		/* Number of mutexes in pse_mutex[] */
extern int pse_shift;			/* log2(pse_table_size) */
#define	PAGE_SE_MUTEX(pp)	&pse_mutex[				\
	((((uintptr_t)(pp) >> pse_shift) ^ ((uintptr_t)(pp))) >> 7) &	\
	(pse_table_size - 1)].pad_mutex

#define	PSZC_MTX_TABLE_SIZE	128
#define	PSZC_MTX_TABLE_SHIFT	7

static pad_mutex_t	pszc_mutex[PSZC_MTX_TABLE_SIZE];

#define	PAGE_SZC_MUTEX(_pp) \
	    &pszc_mutex[((((uintptr_t)(_pp) >> PSZC_MTX_TABLE_SHIFT) ^ \
		((uintptr_t)(_pp) >> (PSZC_MTX_TABLE_SHIFT << 1)) ^ \
		((uintptr_t)(_pp) >> (3 * PSZC_MTX_TABLE_SHIFT))) & \
		(PSZC_MTX_TABLE_SIZE - 1))].pad_mutex

/*
 * The vph_mutex[] array holds the mutexes to protect the vnode chains,
 * (i.e., the list of pages anchored by v_pages and connected via p_vpprev
 * and p_vpnext).
 *
 * The page_vnode_mutex(vp) function returns the address of the appropriate
 * mutex from this array given a pointer to a vnode.  It is complicated
 * by the fact that the kernel's vnode and the swapfs vnode are referenced
 * frequently enough to warrant their own mutexes.
 *
 * The VP_HASH_FUNC returns the index into the vph_mutex array given
 * an address of a vnode.
 */

/*
 * XX64	VPH_TABLE_SIZE and VP_HASH_FUNC might break in 64 bit world.
 *	Need to review again.
 */
#if defined(_LP64)
#define	VPH_TABLE_SIZE	(1 << (VP_SHIFT + 3))
#else	/* 32 bits */
#define	VPH_TABLE_SIZE	(2 << VP_SHIFT)
#endif

#define	VP_HASH_FUNC(vp) \
	((((uintptr_t)(vp) >> 6) + \
	    ((uintptr_t)(vp) >> 8) + \
	    ((uintptr_t)(vp) >> 10) + \
	    ((uintptr_t)(vp) >> 12)) \
	    & (VPH_TABLE_SIZE - 1))

/*
 * Two slots after VPH_TABLE_SIZE are reserved in vph_mutex for kernel vnodes.
 * The lock for kvp is VPH_TABLE_SIZE + 0, and the lock for zvp is
 * VPH_TABLE_SIZE + 1.
 */

kmutex_t	vph_mutex[VPH_TABLE_SIZE + 2];

/*
 * Initialize the locks used by the Virtual Memory Management system.
 */
void
page_lock_init()
{
}

/*
 * Return a value for pse_shift based on npg (the number of physical pages)
 * and ncpu (the maximum number of CPUs).  This is called by platform startup
 * code.
 *
 * Lockstat data from TPC-H runs showed that contention on the pse_mutex[]
 * locks grew approximately as the square of the number of threads executing.
 * So the primary scaling factor used is NCPU^2.  The size of the machine in
 * megabytes is used as an upper bound, particularly for sun4v machines which
 * all claim to have 256 CPUs maximum, and the old value of PSE_TABLE_SIZE
 * (128) is used as a minimum.  Since the size of the table has to be a power
 * of two, the calculated size is rounded up to the next power of two.
 */
/*ARGSUSED*/
int
size_pse_array(pgcnt_t npg, int ncpu)
{
	size_t size;
	pgcnt_t pp_per_mb = (1024 * 1024) / PAGESIZE;

	size = MAX(128, MIN(npg / pp_per_mb, 2 * ncpu * ncpu));
	size += (1 << (highbit(size) - 1)) - 1;
	return (highbit(size) - 1);
}
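
/*
 * Worked example (illustrative numbers, not derived from any particular
 * machine): with ncpu = 64 and 32 GB of memory, npg / pp_per_mb is 32768
 * (the size of the machine in megabytes) and 2 * ncpu * ncpu is 8192, so
 * size starts out as MAX(128, MIN(32768, 8192)) = 8192.  The rounding step
 * relies on highbit() returning the 1-based index of the highest set bit:
 * size becomes 8192 + 8191 = 16383, and highbit(16383) - 1 = 13 is
 * returned, i.e. pse_table_size will be 2^13 = 8192 entries.
 */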

/*
 * At present we only use page ownership to aid debugging, so it's
 * OK if the owner field isn't exact.  In the 32-bit world two thread ids
 * can map to the same owner because we just 'or' in 0x80000000 and
 * then clear the second highest bit, so that (for example) 0x2faced00
 * and 0xafaced00 both map to 0xafaced00.
 * In the 64-bit world, p_selock may not be large enough to hold a full
 * thread pointer.  If we ever need precise ownership (e.g. if we implement
 * priority inheritance for page locks) then p_selock should become a
 * uintptr_t and SE_WRITER should be -((uintptr_t)curthread >> 2).
 */
#define	SE_WRITER	(((selock_t)(ulong_t)curthread | INT_MIN) & ~SE_EWANTED)
#define	SE_READER	1

/*
 * A page that is deleted must be marked as such using the
 * page_lock_delete() function.  The page must be exclusively locked.
 * The SE_DELETED marker is put in p_selock when this function is called.
 * SE_DELETED must be distinct from any SE_WRITER value.
 */
#define	SE_DELETED	(1 | INT_MIN)

#ifdef VM_STATS
uint_t	vph_kvp_count;
uint_t	vph_swapfsvp_count;
uint_t	vph_other;
#endif /* VM_STATS */

#ifdef VM_STATS
uint_t	page_lock_count;
uint_t	page_lock_miss;
uint_t	page_lock_miss_lock;
uint_t	page_lock_reclaim;
uint_t	page_lock_bad_reclaim;
uint_t	page_lock_same_page;
uint_t	page_lock_upgrade;
uint_t	page_lock_retired;
uint_t	page_lock_upgrade_failed;
uint_t	page_lock_deleted;

uint_t	page_trylock_locked;
uint_t	page_trylock_failed;
uint_t	page_trylock_missed;

uint_t	page_try_reclaim_upgrade;
#endif /* VM_STATS */

/*
 * Acquire the "shared/exclusive" lock on a page.
 *
 * Returns 1 on success and locks the page appropriately.
 *	   0 on failure and does not lock the page.
 *
 * If `lock' is non-NULL, it will be dropped and reacquired in the
 * failure case.  This routine can block, and if it does
 * it will always return a failure since the page identity [vp, off]
 * or state may have changed.
 */

int
page_lock(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim)
{
	return (page_lock_es(pp, se, lock, reclaim, 0));
}
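
/*
 * Sketch of the usual calling pattern (hypothetical caller; the actual
 * lookup routines live elsewhere).  A caller typically finds the page while
 * holding some mutex `phm', hands that mutex to page_lock(), and restarts
 * the lookup whenever page_lock() fails, since failure may mean we blocked
 * and the page's [vp, off] identity changed underneath us:
 *
 *	top:
 *		mutex_enter(phm);
 *		pp = ...;			(look the page up)
 *		if (!page_lock(pp, SE_SHARED, phm, P_RECLAIM)) {
 *			mutex_exit(phm);
 *			goto top;
 *		}
 *		mutex_exit(phm);
 */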

/*
 * With the addition of reader-writer lock semantics to page_lock_es,
 * callers wanting an exclusive (writer) lock may prevent shared-lock
 * (reader) starvation by setting the es parameter to SE_EXCL_WANTED.
 * In this case, when an exclusive lock cannot be acquired, p_selock's
 * SE_EWANTED bit is set.  Shared-lock (reader) requests are also denied
 * if the page is slated for retirement.
 *
 * The se and es parameters determine if the lock should be granted
 * based on the following decision table:
 *
 * Lock wanted	es flags	p_selock/SE_EWANTED	Action
 * -----------	--------------	-------------------	---------
 * SE_EXCL	any [1][2]	unlocked/any		grant lock, clear SE_EWANTED
 * SE_EXCL	SE_EWANTED	any lock/any		deny, set SE_EWANTED
 * SE_EXCL	none		any lock/any		deny
 * SE_SHARED	n/a [2]		shared/0		grant
 * SE_SHARED	n/a [2]		unlocked/0		grant
 * SE_SHARED	n/a		shared/1		deny
 * SE_SHARED	n/a		unlocked/1		deny
 * SE_SHARED	n/a		excl/any		deny
 *
 * Notes:
 * [1] The code grants an exclusive lock to the caller and clears the bit
 *     SE_EWANTED whenever p_selock is unlocked, regardless of the SE_EWANTED
 *     bit's value.  This was deemed acceptable as we are not concerned about
 *     exclusive-lock starvation.  If this ever becomes an issue, a priority
 *     or FIFO mechanism should also be implemented.  Meantime, the thread
 *     that set SE_EWANTED should be prepared to catch this condition and
 *     reset it.
 *
 * [2] Retired pages may not be locked at any time, regardless of the
 *     disposition of se, unless the es parameter has the SE_RETIRED flag set.
 *
 * Notes on values of "es":
 *
 *   es & 1: page_lookup_create will attempt page relocation
 *   es & SE_EXCL_WANTED: caller wants SE_EWANTED set (e.g. delete
 *       memory thread); this prevents reader-starvation of waiting
 *       writer thread(s) by giving priority to writers over readers.
 *   es & SE_RETIRED: caller wants to lock pages even if they are
 *       retired.  Default is to deny the lock if the page is retired.
 *
 * And yes, we know, the semantics of this function are too complicated.
 * It's on the list to be cleaned up.
 */
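
/*
 * Example walk-through of the table above (illustrative): suppose a page is
 * share-locked by a reader when a memory-delete style thread calls
 * page_lock_es(pp, SE_EXCL, NULL, P_NO_RECLAIM, SE_EXCL_WANTED).  The
 * exclusive request is denied but SE_EWANTED is set in p_selock, so further
 * SE_SHARED requests are denied as well; once the existing readers drain,
 * a retry of the exclusive request succeeds (and clears SE_EWANTED).  A
 * caller that stops retrying must clear the bit itself with
 * page_lock_clr_exclwanted().
 */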

int
page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
{
	int		retval;
	kmutex_t	*pse = PAGE_SE_MUTEX(pp);
	int		upgraded;
	int		reclaim_it;

	ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);

	VM_STAT_ADD(page_lock_count);

	upgraded = 0;
	reclaim_it = 0;

	mutex_enter(pse);

	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));

	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
		mutex_exit(pse);
		VM_STAT_ADD(page_lock_retired);
		return (0);
	}

	if (se == SE_SHARED && es == 1 && pp->p_selock == 0) {
		se = SE_EXCL;
	}

	if ((reclaim == P_RECLAIM) && (PP_ISFREE(pp))) {

		reclaim_it = 1;
		if (se == SE_SHARED) {
			/*
			 * This is an interesting situation.
			 *
			 * Remember that p_free can only change if
			 * p_selock < 0.
			 * p_free does not depend on our holding `pse'.
			 * And, since we hold `pse', p_selock can not change.
			 * So, if p_free changes on us, the page is already
			 * exclusively held, and we would fail to get p_selock
			 * regardless.
			 *
			 * We want to avoid getting the share
			 * lock on a free page that needs to be reclaimed.
			 * It is possible that some other thread has the share
			 * lock and has left the free page on the cache list.
			 * pvn_vplist_dirty() does this for brief periods.
			 * If the se_share is currently SE_EXCL, we will fail
			 * to acquire p_selock anyway.  Blocking is the
			 * right thing to do.
			 * If we need to reclaim this page, we must get
			 * exclusive access to it, so force the upgrade now.
			 * Again, we will fail to acquire p_selock if the
			 * page is not free, and we will block.
			 */
			upgraded = 1;
			se = SE_EXCL;
			VM_STAT_ADD(page_lock_upgrade);
		}
	}

	if (se == SE_EXCL) {
		if (!(es & SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) {
			/*
			 * if the caller wants a writer lock (but did not
			 * specify exclusive access), and there is a pending
			 * writer that wants exclusive access, return failure
			 */
			retval = 0;
		} else if ((pp->p_selock & ~SE_EWANTED) == 0) {
			/* no reader/writer lock held */
			THREAD_KPRI_REQUEST();
			/* this clears our setting of the SE_EWANTED bit */
			pp->p_selock = SE_WRITER;
			retval = 1;
		} else {
			/* page is locked */
			if (es & SE_EXCL_WANTED) {
				/* set the SE_EWANTED bit */
				pp->p_selock |= SE_EWANTED;
			}
			retval = 0;
		}
	} else {
		retval = 0;
		if (pp->p_selock >= 0) {
			if ((pp->p_selock & SE_EWANTED) == 0) {
				pp->p_selock += SE_READER;
				retval = 1;
			}
		}
	}

	if (retval == 0) {
		if ((pp->p_selock & ~SE_EWANTED) == SE_DELETED) {
			VM_STAT_ADD(page_lock_deleted);
			mutex_exit(pse);
			return (retval);
		}

#ifdef VM_STATS
		VM_STAT_ADD(page_lock_miss);
		if (upgraded) {
			VM_STAT_ADD(page_lock_upgrade_failed);
		}
#endif
		if (lock) {
			VM_STAT_ADD(page_lock_miss_lock);
			mutex_exit(lock);
		}

		/*
		 * Now, wait for the page to be unlocked and
		 * release the lock protecting p_cv and p_selock.
		 */
		cv_wait(&pp->p_cv, pse);
		mutex_exit(pse);

		/*
		 * The page identity may have changed while we were
		 * blocked.  If we are willing to depend on "pp"
		 * still pointing to a valid page structure (i.e.,
		 * assuming page structures are not dynamically allocated
		 * or freed), we could try to lock the page if its
		 * identity hasn't changed.
		 *
		 * This needs to be measured; since we come back from
		 * cv_wait holding pse (the expensive part of this
		 * operation) we might as well try the cheap part.
		 * Though we would also have to confirm that dropping
		 * `lock' did not cause any grief to the callers.
		 */
		if (lock) {
			mutex_enter(lock);
		}
	} else {
		/*
		 * We have the page lock.
		 * If we needed to reclaim the page, and the page
		 * needed reclaiming (i.e., it was free), then we
		 * have the page exclusively locked.  We may need
		 * to downgrade the page.
		 */
		ASSERT((upgraded) ?
		    ((PP_ISFREE(pp)) && PAGE_EXCL(pp)) : 1);
		mutex_exit(pse);

		/*
		 * We now hold this page's lock, either shared or
		 * exclusive.  This will prevent its identity from changing.
		 * The page, however, may or may not be free.  If the caller
		 * requested, and it is free, go reclaim it from the
		 * free list.  If the page can't be reclaimed, return failure
		 * so that the caller can start all over again.
		 *
		 * NOTE: page_reclaim() releases the page lock (p_selock)
		 *	if it can't be reclaimed.
		 */
		if (reclaim_it) {
			if (!page_reclaim(pp, lock)) {
				VM_STAT_ADD(page_lock_bad_reclaim);
				retval = 0;
			} else {
				VM_STAT_ADD(page_lock_reclaim);
				if (upgraded) {
					page_downgrade(pp);
				}
			}
		}
	}
	return (retval);
}

/*
 * Clear the SE_EWANTED bit from p_selock.  This function allows
 * callers of page_lock_es and page_try_reclaim_lock to clear
 * their setting of this bit if they decide they no longer wish
 * to gain exclusive access to the page.  Currently only
 * delete_memory_thread uses this when the delete memory
 * operation is cancelled.
 */
void
page_lock_clr_exclwanted(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	mutex_enter(pse);
	pp->p_selock &= ~SE_EWANTED;
	if (CV_HAS_WAITERS(&pp->p_cv))
		cv_broadcast(&pp->p_cv);
	mutex_exit(pse);
}

/*
 * Read the comments inside of page_lock_es() carefully.
 *
 * SE_EXCL callers specifying es == SE_EXCL_WANTED will cause the
 * SE_EWANTED bit of p_selock to be set when the lock cannot be obtained.
 * This is used by threads subject to reader-starvation (e.g. memory delete).
 *
 * When a thread using SE_EXCL_WANTED does not obtain the SE_EXCL lock,
 * it is expected that it will retry at a later time.  Threads that will
 * not retry the lock *must* call page_lock_clr_exclwanted to clear the
 * SE_EWANTED bit.  (When a thread using SE_EXCL_WANTED obtains the lock,
 * the bit is cleared.)
 */
int
page_try_reclaim_lock(page_t *pp, se_t se, int es)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);

	old = pp->p_selock;

	ASSERT(((es & SE_EXCL_WANTED) == 0) ||
	    ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));

	if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
		mutex_exit(pse);
		VM_STAT_ADD(page_trylock_failed);
		return (0);
	}

	if (se == SE_SHARED && es == 1 && old == 0) {
		se = SE_EXCL;
	}

	if (se == SE_SHARED) {
		if (!PP_ISFREE(pp)) {
			if (old >= 0) {
				/*
				 * Readers are not allowed when excl wanted
				 */
				if ((old & SE_EWANTED) == 0) {
					pp->p_selock = old + SE_READER;
					mutex_exit(pse);
					return (1);
				}
			}
			mutex_exit(pse);
			return (0);
		}
		/*
		 * The page is free, so we really want SE_EXCL (below)
		 */
		VM_STAT_ADD(page_try_reclaim_upgrade);
	}

	/*
	 * The caller wants a writer lock.  We try for it only if
	 * SE_EWANTED is not set, or if the caller specified
	 * SE_EXCL_WANTED.
	 */
	if (!(old & SE_EWANTED) || (es & SE_EXCL_WANTED)) {
		if ((old & ~SE_EWANTED) == 0) {
			/* no reader/writer lock held */
			THREAD_KPRI_REQUEST();
			/* this clears out our setting of the SE_EWANTED bit */
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			return (1);
		}
	}
	if (es & SE_EXCL_WANTED) {
		/* page is locked, set the SE_EWANTED bit */
		pp->p_selock |= SE_EWANTED;
	}
	mutex_exit(pse);
	return (0);
}
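
/*
 * Sketch of the retry protocol described above (hypothetical caller, along
 * the lines of the memory-delete thread; `cancelled' is a made-up flag).
 * The caller keeps trying for the exclusive lock with SE_EXCL_WANTED and
 * must clear SE_EWANTED itself if it gives up without ever getting the
 * lock:
 *
 *	while (!page_try_reclaim_lock(pp, SE_EXCL, SE_EXCL_WANTED)) {
 *		if (cancelled) {
 *			page_lock_clr_exclwanted(pp);
 *			return;
 *		}
 *		delay(hz);
 *	}
 *	...				(page is now held SE_EXCL)
 *	page_unlock(pp);
 */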

/*
 * Acquire a page's "shared/exclusive" lock, but never block.
 * Returns 1 on success, 0 on failure.
 */
int
page_trylock(page_t *pp, se_t se)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	mutex_enter(pse);
	if (pp->p_selock & SE_EWANTED || PP_RETIRED(pp) ||
	    (se == SE_SHARED && PP_PR_NOSHARE(pp))) {
		/*
		 * Fail if another thread wants exclusive access, if the
		 * page is retired, or if a shared lock is requested on a
		 * page that is slated for retirement.
		 */
		mutex_exit(pse);
		VM_STAT_ADD(page_trylock_failed);
		return (0);
	}

	if (se == SE_EXCL) {
		if (pp->p_selock == 0) {
			THREAD_KPRI_REQUEST();
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			return (1);
		}
	} else {
		if (pp->p_selock >= 0) {
			pp->p_selock += SE_READER;
			mutex_exit(pse);
			return (1);
		}
	}
	mutex_exit(pse);
	return (0);
}

/*
 * Variant of page_unlock() specifically for the page freelist
 * code.  The mere existence of this code is a vile hack that
 * exists because of the backwards locking order of the page
 * freelist manager; please don't call it.
 */
void
page_unlock_nocapture(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);

	old = pp->p_selock;
	if ((old & ~SE_EWANTED) == SE_READER) {
		pp->p_selock = old & ~SE_READER;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
		panic("page_unlock_nocapture: page %p is deleted", (void *)pp);
	} else if (old < 0) {
		THREAD_KPRI_RELEASE();
		pp->p_selock &= SE_EWANTED;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) > SE_READER) {
		pp->p_selock = old - SE_READER;
	} else {
		panic("page_unlock_nocapture: page %p is not locked",
		    (void *)pp);
	}

	mutex_exit(pse);
}

/*
 * Release the page's "shared/exclusive" lock and wake up anyone
 * who might be waiting for it.
 */
void
page_unlock(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	selock_t old;

	mutex_enter(pse);

	old = pp->p_selock;
	if ((old & ~SE_EWANTED) == SE_READER) {
		pp->p_selock = old & ~SE_READER;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) == SE_DELETED) {
		panic("page_unlock: page %p is deleted", (void *)pp);
	} else if (old < 0) {
		THREAD_KPRI_RELEASE();
		pp->p_selock &= SE_EWANTED;
		if (CV_HAS_WAITERS(&pp->p_cv))
			cv_broadcast(&pp->p_cv);
	} else if ((old & ~SE_EWANTED) > SE_READER) {
		pp->p_selock = old - SE_READER;
	} else {
		panic("page_unlock: page %p is not locked", (void *)pp);
	}

	if (pp->p_selock == 0) {
		/*
		 * If the T_CAPTURING bit is set, that means that we should
		 * not try and capture the page again as we could recurse
		 * which could lead to a stack overflow panic or spending a
		 * relatively long time in the kernel making no progress.
		 */
		if ((pp->p_toxic & PR_CAPTURE) &&
		    !(curthread->t_flag & T_CAPTURING) &&
		    !PP_RETIRED(pp)) {
			THREAD_KPRI_REQUEST();
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			page_unlock_capture(pp);
		} else {
			mutex_exit(pse);
		}
	} else {
		mutex_exit(pse);
	}
}

/*
 * Try to upgrade the lock on the page from a "shared" to an
 * "exclusive" lock.  Since this upgrade operation is done while
 * holding the mutex protecting this page, no one else can acquire this page's
 * lock and change the page.  Thus, it is safe to drop the "shared"
 * lock and attempt to acquire the "exclusive" lock.
 *
 * Returns 1 on success, 0 on failure.
 */
int
page_tryupgrade(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	mutex_enter(pse);
	if (!(pp->p_selock & SE_EWANTED)) {
		/* no threads want exclusive access, try upgrade */
		if (pp->p_selock == SE_READER) {
			THREAD_KPRI_REQUEST();
			/* convert to exclusive lock */
			pp->p_selock = SE_WRITER;
			mutex_exit(pse);
			return (1);
		}
	}
	mutex_exit(pse);
	return (0);
}

/*
 * Downgrade the "exclusive" lock on the page to a "shared" lock
 * while holding the mutex protecting this page's p_selock field.
 */
void
page_downgrade(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);
	int excl_waiting;

	ASSERT((pp->p_selock & ~SE_EWANTED) != SE_DELETED);
	ASSERT(PAGE_EXCL(pp));

	mutex_enter(pse);
	excl_waiting = pp->p_selock & SE_EWANTED;
	THREAD_KPRI_RELEASE();
	pp->p_selock = SE_READER | excl_waiting;
	if (CV_HAS_WAITERS(&pp->p_cv))
		cv_broadcast(&pp->p_cv);
	mutex_exit(pse);
}

void
page_lock_delete(page_t *pp)
{
	kmutex_t *pse = PAGE_SE_MUTEX(pp);

	ASSERT(PAGE_EXCL(pp));
	ASSERT(pp->p_vnode == NULL);
	ASSERT(pp->p_offset == (u_offset_t)-1);
	ASSERT(!PP_ISFREE(pp));

	mutex_enter(pse);
	THREAD_KPRI_RELEASE();
	pp->p_selock = SE_DELETED;
	if (CV_HAS_WAITERS(&pp->p_cv))
		cv_broadcast(&pp->p_cv);
	mutex_exit(pse);
}

int
page_deleted(page_t *pp)
{
	return (pp->p_selock == SE_DELETED);
}

/*
 * Implement the io lock for pages
 */
void
page_iolock_init(page_t *pp)
{
	pp->p_iolock_state = 0;
	cv_init(&pp->p_io_cv, NULL, CV_DEFAULT, NULL);
}

/*
 * Acquire the i/o lock on a page.
 */
void
page_io_lock(page_t *pp)
{
	kmutex_t *pio;

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);
	while (pp->p_iolock_state & PAGE_IO_INUSE) {
		cv_wait(&(pp->p_io_cv), pio);
	}
	pp->p_iolock_state |= PAGE_IO_INUSE;
	mutex_exit(pio);
}

/*
 * Release the i/o lock on a page.
 */
void
page_io_unlock(page_t *pp)
{
	kmutex_t *pio;

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);
	cv_broadcast(&pp->p_io_cv);
	pp->p_iolock_state &= ~PAGE_IO_INUSE;
	mutex_exit(pio);
}

/*
 * Try to acquire the i/o lock on a page without blocking.
 * Returns 1 on success, 0 on failure.
 */
int
page_io_trylock(page_t *pp)
{
	kmutex_t *pio;

	if (pp->p_iolock_state & PAGE_IO_INUSE)
		return (0);

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);

	if (pp->p_iolock_state & PAGE_IO_INUSE) {
		mutex_exit(pio);
		return (0);
	}
	pp->p_iolock_state |= PAGE_IO_INUSE;
	mutex_exit(pio);

	return (1);
}

/*
 * Wait until the i/o lock is not held.
 */
void
page_io_wait(page_t *pp)
{
	kmutex_t *pio;

	pio = PAGE_IO_MUTEX(pp);
	mutex_enter(pio);
	while (pp->p_iolock_state & PAGE_IO_INUSE) {
		cv_wait(&(pp->p_io_cv), pio);
	}
	mutex_exit(pio);
}

/*
 * Returns 1 if the page's i/o lock is held, 0 otherwise.
 */
int
page_io_locked(page_t *pp)
{
	return (pp->p_iolock_state & PAGE_IO_INUSE);
}

/*
 * Assert that the i/o lock on a page is held.
 * Returns 1 if it is held, 0 if it is not.
 */
int
page_iolock_assert(page_t *pp)
{
	return (page_io_locked(pp));
}
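
/*
 * Illustrative pairing (not taken from this file): the io lock serializes
 * the physical transfer while the "shared/exclusive" lock protects the
 * page's identity, so a pageout-style caller does roughly
 *
 *	page_io_lock(pp);
 *	...				(start and await the i/o)
 *	page_io_unlock(pp);
 *
 * page_io_lock() can sleep; callers that must not block use
 * page_io_trylock() instead and bail out when it returns 0.
 */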

/*
 * Wrapper exported to kernel routines that are built
 * platform-independent (the macro is platform-dependent;
 * the size of vph_mutex[] is based on NCPU).
 *
 * Note that you can do stress testing on this by setting the
 * variable page_vnode_mutex_stress to something other than
 * zero in a DEBUG kernel in a debugger after loading the kernel.
 * Setting it after the kernel is running may not work correctly.
 */
#ifdef DEBUG
static int page_vnode_mutex_stress = 0;
#endif

kmutex_t *
page_vnode_mutex(vnode_t *vp)
{
	if (vp == &kvp)
		return (&vph_mutex[VPH_TABLE_SIZE + 0]);

	if (vp == &zvp)
		return (&vph_mutex[VPH_TABLE_SIZE + 1]);
#ifdef DEBUG
	if (page_vnode_mutex_stress != 0)
		return (&vph_mutex[0]);
#endif

	return (&vph_mutex[VP_HASH_FUNC(vp)]);
}

kmutex_t *
page_se_mutex(page_t *pp)
{
	return (PAGE_SE_MUTEX(pp));
}

#ifdef VM_STATS
uint_t pszclck_stat[4];
#endif
/*
 * Find, take and return a mutex held by hat_page_demote().
 * Called by page_demote_vp_pages() before hat_page_demote() call and by
 * routines that want to block hat_page_demote() but can't do it
 * via locking all constituent pages.
 *
 * Return NULL if p_szc is 0.
 *
 * It should only be used for pages that can be demoted by hat_page_demote(),
 * i.e. non swapfs file system pages.  The logic here is lifted from
 * sfmmu_mlspl_enter() except there's no need to worry about p_szc increase
 * since the page is locked and not free.
 *
 * Hash of the root page is used to find the lock.
 * To find the root in the presence of hat_page_demote() changing the location
 * of the root, this routine relies on the fact that hat_page_demote() changes
 * the root last.
 *
 * If NULL is returned pp's p_szc is guaranteed to be 0.  If non-NULL is
 * returned pp's p_szc may be any value.
 */
kmutex_t *
page_szc_lock(page_t *pp)
{
	kmutex_t	*mtx;
	page_t		*rootpp;
	uint_t		szc;
	uint_t		rszc;
	uint_t		pszc = pp->p_szc;

	ASSERT(pp != NULL);
	ASSERT(PAGE_LOCKED(pp));
	ASSERT(!PP_ISFREE(pp));
	ASSERT(pp->p_vnode != NULL);
	ASSERT(!IS_SWAPFSVP(pp->p_vnode));
	ASSERT(!PP_ISKAS(pp));

again:
	if (pszc == 0) {
		VM_STAT_ADD(pszclck_stat[0]);
		return (NULL);
	}

	/* The lock lives in the root page */

	rootpp = PP_GROUPLEADER(pp, pszc);
	mtx = PAGE_SZC_MUTEX(rootpp);
	mutex_enter(mtx);

	/*
	 * Since p_szc can only decrease if pp == rootpp, rootpp will always
	 * be the same, i.e. we have the right root regardless of
	 * rootpp->p_szc.
	 * If the location of pp's root didn't change after we took
	 * the lock we have the right root.  Return the mutex hashed off it.
	 */
	if (pp == rootpp || (rszc = rootpp->p_szc) == pszc) {
		VM_STAT_ADD(pszclck_stat[1]);
		return (mtx);
	}

	/*
	 * Root location changed because the page got demoted.
	 * Locate the new root.
	 */
	if (rszc < pszc) {
		szc = pp->p_szc;
		ASSERT(szc < pszc);
		mutex_exit(mtx);
		pszc = szc;
		VM_STAT_ADD(pszclck_stat[2]);
		goto again;
	}

	VM_STAT_ADD(pszclck_stat[3]);
	/*
	 * The current hat_page_demote() is not done yet.
	 * Wait for it to finish.
	 */
	mutex_exit(mtx);
	rootpp = PP_GROUPLEADER(rootpp, rszc);
	mtx = PAGE_SZC_MUTEX(rootpp);
	mutex_enter(mtx);
	mutex_exit(mtx);
	ASSERT(rootpp->p_szc < rszc);
	goto again;
}

int
page_szc_lock_assert(page_t *pp)
{
	page_t *rootpp = PP_PAGEROOT(pp);
	kmutex_t *mtx = PAGE_SZC_MUTEX(rootpp);

	return (MUTEX_HELD(mtx));
}
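
/*
 * Typical use (sketch; callers such as page_demote_vp_pages() follow this
 * shape): the page is already locked and not free, so holding the returned
 * mutex blocks hat_page_demote() and keeps p_szc stable, while a NULL
 * return means p_szc is 0 and there is nothing to serialize against:
 *
 *	mtx = page_szc_lock(pp);
 *	...				(operate on the large page)
 *	if (mtx != NULL)
 *		mutex_exit(mtx);
 */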

/*
 * memseg locking
 */
static krwlock_t memsegslock;

/*
 * memlist (phys_install, phys_avail) locking.
 */
static krwlock_t memlists_lock;

int
memsegs_trylock(int writer)
{
	return (rw_tryenter(&memsegslock, writer ? RW_WRITER : RW_READER));
}

void
memsegs_lock(int writer)
{
	rw_enter(&memsegslock, writer ? RW_WRITER : RW_READER);
}

/*ARGSUSED*/
void
memsegs_unlock(int writer)
{
	rw_exit(&memsegslock);
}

int
memsegs_lock_held(void)
{
	return (RW_LOCK_HELD(&memsegslock));
}

void
memlist_read_lock(void)
{
	rw_enter(&memlists_lock, RW_READER);
}

void
memlist_read_unlock(void)
{
	rw_exit(&memlists_lock);
}

void
memlist_write_lock(void)
{
	rw_enter(&memlists_lock, RW_WRITER);
}

void
memlist_write_unlock(void)
{
	rw_exit(&memlists_lock);
}