/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

/*
 * VM - page locking primitives
 */
#include <sys/param.h>
#include <sys/t_lock.h>
#include <sys/vtrace.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/vnode.h>
#include <sys/bitmap.h>
#include <sys/lockstat.h>
#include <sys/sysmacros.h>
#include <sys/condvar_impl.h>
#include <vm/page.h>
#include <vm/seg_enum.h>
#include <vm/vm_dep.h>

/*
 * This global mutex is for logical page locking.
 * The following fields in the page structure are protected
 * by this lock:
 *
 *      p_lckcnt
 *      p_cowcnt
 */
kmutex_t page_llock;

/*
 * This is a global lock for the logical page free list.  The
 * logical free list, in this implementation, is maintained as two
 * separate physical lists - the cache list and the free list.
 */
kmutex_t page_freelock;

/*
 * The hash table, page_hash[], the p_selock fields, and the
 * list of pages associated with vnodes are protected by arrays of mutexes.
 *
 * Unless the hashes are changed radically, the table sizes must be
 * a power of two.  Also, we typically need more mutexes for the
 * vnodes since these locks are occasionally held for long periods.
 * And since there seem to be two special vnodes (kvp and swapvp),
 * we make room for private mutexes for them.
 *
 * The pse_mutex[] array holds the mutexes to protect the p_selock
 * fields of all page_t structures.
 *
 * PAGE_SE_MUTEX(pp) returns the address of the appropriate mutex
 * when given a pointer to a page_t.
 *
 * PIO_TABLE_SIZE must be a power of two.  One could argue that we
 * should go to the trouble of setting it up at run time and base it
 * on memory size rather than the number of compile time CPUs.
 *
 * XX64 We should be using physmem size to calculate PIO_SHIFT.
 *
 *      These might break in 64 bit world.
 */
#define PIO_SHIFT       7       /* log2(sizeof(page_t)) */
#define PIO_TABLE_SIZE  128     /* number of io mutexes to have */

pad_mutex_t     ph_mutex[PH_TABLE_SIZE];
kmutex_t        pio_mutex[PIO_TABLE_SIZE];

#define PAGE_IO_MUTEX(pp) \
            &pio_mutex[(((uintptr_t)pp) >> PIO_SHIFT) & (PIO_TABLE_SIZE - 1)]
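
/*
 * Illustrative note (not in the original source): with PIO_SHIFT == 7 the
 * i/o mutex hash simply discards the low 7 bits of the page_t address and
 * takes the result modulo PIO_TABLE_SIZE.  Assuming sizeof (page_t) is
 * roughly 128 bytes, as the PIO_SHIFT comment suggests, adjacent page_t
 * structures in a memseg land on adjacent i/o mutexes:
 *
 *      PAGE_IO_MUTEX(pp)       == &pio_mutex[((uintptr_t)pp >> 7) & 127]
 *      pp and pp + 128 bytes   -> neighboring slots
 *      pp and pp + 16384 bytes -> the same slot (the table wraps every
 *                                 128 page_t structures)
 */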
/*
 * The pse_mutex[] array is allocated in the platform startup code
 * based on the size of the machine at startup.
 */
extern pad_mutex_t *pse_mutex;          /* Locks protecting pp->p_selock */
extern size_t pse_table_size;           /* Number of mutexes in pse_mutex[] */
extern int pse_shift;                   /* log2(pse_table_size) */

#define PAGE_SE_MUTEX(pp)       &pse_mutex[                             \
        ((((uintptr_t)(pp) >> pse_shift) ^ ((uintptr_t)(pp))) >> 7) &   \
        (pse_table_size - 1)].pad_mutex

#define PSZC_MTX_TABLE_SIZE     128
#define PSZC_MTX_TABLE_SHIFT    7

static pad_mutex_t pszc_mutex[PSZC_MTX_TABLE_SIZE];

#define PAGE_SZC_MUTEX(_pp) \
        &pszc_mutex[((((uintptr_t)(_pp) >> PSZC_MTX_TABLE_SHIFT) ^     \
            ((uintptr_t)(_pp) >> (PSZC_MTX_TABLE_SHIFT << 1)) ^        \
            ((uintptr_t)(_pp) >> (3 * PSZC_MTX_TABLE_SHIFT))) &        \
            (PSZC_MTX_TABLE_SIZE - 1))].pad_mutex

/*
 * The vph_mutex[] array holds the mutexes to protect the vnode chains,
 * (i.e., the list of pages anchored by v_pages and connected via p_vpprev
 * and p_vpnext).
 *
 * The page_vnode_mutex(vp) function returns the address of the appropriate
 * mutex from this array given a pointer to a vnode.  It is complicated
 * by the fact that the kernel's vnode and the swapfs vnode are referenced
 * frequently enough to warrant their own mutexes.
 *
 * The VP_HASH_FUNC returns the index into the vph_mutex array given
 * an address of a vnode.
 */

/*
 * XX64 VPH_TABLE_SIZE and VP_HASH_FUNC might break in 64 bit world.
 *      Need to review again.
 */
#if defined(_LP64)
#define VPH_TABLE_SIZE  (1 << (VP_SHIFT + 3))
#else   /* 32 bits */
#define VPH_TABLE_SIZE  (2 << VP_SHIFT)
#endif

#define VP_HASH_FUNC(vp) \
        ((((uintptr_t)(vp) >> 6) + \
            ((uintptr_t)(vp) >> 8) + \
            ((uintptr_t)(vp) >> 10) + \
            ((uintptr_t)(vp) >> 12)) \
            & (VPH_TABLE_SIZE - 1))

extern struct vnode kvp;

/*
 * Two slots after VPH_TABLE_SIZE are reserved in vph_mutex for kernel vnodes.
 * The lock for kvp is VPH_TABLE_SIZE + 0, and the lock for zvp is
 * VPH_TABLE_SIZE + 1.
 */

kmutex_t vph_mutex[VPH_TABLE_SIZE + 2];

/*
 * Initialize the locks used by the Virtual Memory Management system.
 */
void
page_lock_init()
{
}

/*
 * Return a value for pse_shift based on npg (the number of physical pages)
 * and ncpu (the maximum number of CPUs).  This is called by platform startup
 * code.
 *
 * Lockstat data from TPC-H runs showed that contention on the pse_mutex[]
 * locks grew approximately as the square of the number of threads executing.
 * So the primary scaling factor used is NCPU^2.  The size of the machine in
 * megabytes is used as an upper bound, particularly for sun4v machines which
 * all claim to have 256 CPUs maximum, and the old value of PSE_TABLE_SIZE
 * (128) is used as a minimum.  Since the size of the table has to be a power
 * of two, the calculated size is rounded up to the next power of two.
 */
/*ARGSUSED*/
int
size_pse_array(pgcnt_t npg, int ncpu)
{
        size_t size;
        pgcnt_t pp_per_mb = (1024 * 1024) / PAGESIZE;

        size = MAX(128, MIN(npg / pp_per_mb, 2 * ncpu * ncpu));
        size += (1 << (highbit(size) - 1)) - 1;
        return (highbit(size) - 1);
}
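
/*
 * Illustrative worked example (not in the original source), assuming a
 * 4K PAGESIZE so that npg / pp_per_mb is the memory size in megabytes:
 * on a machine with 64 CPUs and at least 8 GB of memory,
 * 2 * ncpu * ncpu == 8192 and the memory bound is >= 8192, so
 * size = MAX(128, MIN(...)) == 8192 == 2^13; the highbit() rounding leaves
 * it at 2^13, the function returns pse_shift == 13, and the platform
 * allocates 8192 pse_mutex[] entries.  With 48 CPUs the raw value would be
 * 4608, which the rounding step bumps up to the next power of two, 8192
 * (pse_shift is still 13).
 */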
/*
 * At present we only use page ownership to aid debugging, so it's
 * OK if the owner field isn't exact.  In the 32-bit world two thread ids
 * can map to the same owner because we just 'or' in 0x80000000 and
 * then clear the second highest bit, so that (for example) 0x2faced00
 * and 0xafaced00 both map to 0xafaced00.
 * In the 64-bit world, p_selock may not be large enough to hold a full
 * thread pointer.  If we ever need precise ownership (e.g. if we implement
 * priority inheritance for page locks) then p_selock should become a
 * uintptr_t and SE_WRITER should be -((uintptr_t)curthread >> 2).
 */
#define SE_WRITER       (((selock_t)(ulong_t)curthread | INT_MIN) & ~SE_EWANTED)
#define SE_READER       1

/*
 * A page that is deleted must be marked as such using the
 * page_lock_delete() function.  The page must be exclusively locked.
 * The SE_DELETED marker is put in p_selock when this function is called.
 * SE_DELETED must be distinct from any SE_WRITER value.
 */
#define SE_DELETED      (1 | INT_MIN)

#ifdef VM_STATS
uint_t vph_kvp_count;
uint_t vph_swapfsvp_count;
uint_t vph_other;
#endif /* VM_STATS */

#ifdef VM_STATS
uint_t page_lock_count;
uint_t page_lock_miss;
uint_t page_lock_miss_lock;
uint_t page_lock_reclaim;
uint_t page_lock_bad_reclaim;
uint_t page_lock_same_page;
uint_t page_lock_upgrade;
uint_t page_lock_retired;
uint_t page_lock_upgrade_failed;
uint_t page_lock_deleted;

uint_t page_trylock_locked;
uint_t page_trylock_failed;
uint_t page_trylock_missed;

uint_t page_try_reclaim_upgrade;
#endif /* VM_STATS */

/*
 * Acquire the "shared/exclusive" lock on a page.
 *
 * Returns 1 on success and locks the page appropriately.
 *         0 on failure and does not lock the page.
 *
 * If `lock' is non-NULL, it will be dropped and reacquired in the
 * failure case.  This routine can block, and if it does
 * it will always return a failure since the page identity [vp, off]
 * or state may have changed.
 */

int
page_lock(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim)
{
        return (page_lock_es(pp, se, lock, reclaim, 0));
}
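
/*
 * Illustrative sketch (not in the original source): callers that pass a
 * non-NULL `lock' typically hold a mutex protecting the page's identity
 * and retry from the top when page_lock() fails, because the mutex was
 * dropped while we slept and [vp, off] may have changed.  The names phm
 * and find_page below are hypothetical:
 *
 *      top:
 *              mutex_enter(phm);
 *              pp = find_page(vp, off);
 *              if (pp != NULL && !page_lock(pp, SE_SHARED, phm, P_RECLAIM)) {
 *                      goto top;       identity may have changed
 *              }
 *              mutex_exit(phm);
 */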
/*
 * With the addition of reader-writer lock semantics to page_lock_es,
 * callers wanting an exclusive (writer) lock may prevent shared-lock
 * (reader) starvation by setting the es parameter to SE_EXCL_WANTED.
 * In this case, when an exclusive lock cannot be acquired, p_selock's
 * SE_EWANTED bit is set.  Shared-lock (reader) requests are also denied
 * if the page is slated for retirement.
 *
 * The se and es parameters determine if the lock should be granted
 * based on the following decision table:
 *
 * Lock wanted   es flags        p_selock/SE_EWANTED   Action
 * -----------   --------------  -------------------   ---------
 * SE_EXCL       any [1][2]      unlocked/any          grant lock, clear SE_EWANTED
 * SE_EXCL       SE_EWANTED      any lock/any          deny, set SE_EWANTED
 * SE_EXCL       none            any lock/any          deny
 * SE_SHARED     n/a [2]         shared/0              grant
 * SE_SHARED     n/a [2]         unlocked/0            grant
 * SE_SHARED     n/a             shared/1              deny
 * SE_SHARED     n/a             unlocked/1            deny
 * SE_SHARED     n/a             excl/any              deny
 *
 * Notes:
 * [1] The code grants an exclusive lock to the caller and clears the bit
 *     SE_EWANTED whenever p_selock is unlocked, regardless of the SE_EWANTED
 *     bit's value.  This was deemed acceptable as we are not concerned about
 *     exclusive-lock starvation.  If this ever becomes an issue, a priority
 *     or fifo mechanism should also be implemented.  Meantime, the thread
 *     that set SE_EWANTED should be prepared to catch this condition and
 *     reset it.
 *
 * [2] Retired pages may not be locked at any time, regardless of the
 *     disposition of se, unless the es parameter has SE_RETIRED flag set.
 *
 * Notes on values of "es":
 *
 * es & 1: page_lookup_create will attempt page relocation
 * es & SE_EXCL_WANTED: caller wants SE_EWANTED set (e.g. delete
 *     memory thread); this prevents reader-starvation of waiting
 *     writer thread(s) by giving priority to writers over readers.
 * es & SE_RETIRED: caller wants to lock pages even if they are
 *     retired.  Default is to deny the lock if the page is retired.
 *
 * And yes, we know, the semantics of this function are too complicated.
 * It's on the list to be cleaned up.
 */
int
page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es)
{
        int             retval;
        kmutex_t        *pse = PAGE_SE_MUTEX(pp);
        int             upgraded;
        int             reclaim_it;

        ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1);

        VM_STAT_ADD(page_lock_count);

        upgraded = 0;
        reclaim_it = 0;

        mutex_enter(pse);

        ASSERT(((es & SE_EXCL_WANTED) == 0) ||
            ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));

        if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
                mutex_exit(pse);
                VM_STAT_ADD(page_lock_retired);
                return (0);
        }

        if (se == SE_SHARED && es == 1 && pp->p_selock == 0) {
                se = SE_EXCL;
        }

        if ((reclaim == P_RECLAIM) && (PP_ISFREE(pp))) {

                reclaim_it = 1;
                if (se == SE_SHARED) {
                        /*
                         * This is an interesting situation.
                         *
                         * Remember that p_free can only change if
                         * p_selock < 0.
                         * p_free does not depend on our holding `pse'.
                         * And, since we hold `pse', p_selock can not change.
                         * So, if p_free changes on us, the page is already
                         * exclusively held, and we would fail to get p_selock
                         * regardless.
                         *
                         * We want to avoid getting the share
                         * lock on a free page that needs to be reclaimed.
                         * It is possible that some other thread has the share
                         * lock and has left the free page on the cache list.
                         * pvn_vplist_dirty() does this for brief periods.
                         * If the se_share is currently SE_EXCL, we will fail
                         * to acquire p_selock anyway.  Blocking is the
                         * right thing to do.
                         * If we need to reclaim this page, we must get
                         * exclusive access to it, force the upgrade now.
                         * Again, we will fail to acquire p_selock if the
                         * page is not free and block.
                         */
                        upgraded = 1;
                        se = SE_EXCL;
                        VM_STAT_ADD(page_lock_upgrade);
                }
        }

        if (se == SE_EXCL) {
                if (!(es & SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) {
                        /*
                         * if the caller wants a writer lock (but did not
                         * specify exclusive access), and there is a pending
                         * writer that wants exclusive access, return failure
                         */
                        retval = 0;
                } else if ((pp->p_selock & ~SE_EWANTED) == 0) {
                        /* no reader/writer lock held */
                        THREAD_KPRI_REQUEST();
                        /* this clears our setting of the SE_EWANTED bit */
                        pp->p_selock = SE_WRITER;
                        retval = 1;
                } else {
                        /* page is locked */
                        if (es & SE_EXCL_WANTED) {
                                /* set the SE_EWANTED bit */
                                pp->p_selock |= SE_EWANTED;
                        }
                        retval = 0;
                }
        } else {
                retval = 0;
                if (pp->p_selock >= 0) {
                        if ((pp->p_selock & SE_EWANTED) == 0) {
                                pp->p_selock += SE_READER;
                                retval = 1;
                        }
                }
        }

        if (retval == 0) {
                if ((pp->p_selock & ~SE_EWANTED) == SE_DELETED) {
                        VM_STAT_ADD(page_lock_deleted);
                        mutex_exit(pse);
                        return (retval);
                }

#ifdef VM_STATS
                VM_STAT_ADD(page_lock_miss);
                if (upgraded) {
                        VM_STAT_ADD(page_lock_upgrade_failed);
                }
#endif
                if (lock) {
                        VM_STAT_ADD(page_lock_miss_lock);
                        mutex_exit(lock);
                }

                /*
                 * Now, wait for the page to be unlocked and
                 * release the lock protecting p_cv and p_selock.
                 */
                cv_wait(&pp->p_cv, pse);
                mutex_exit(pse);

                /*
                 * The page identity may have changed while we were
                 * blocked.  If we are willing to depend on "pp"
                 * still pointing to a valid page structure (i.e.,
                 * assuming page structures are not dynamically allocated
                 * or freed), we could try to lock the page if its
                 * identity hasn't changed.
                 *
                 * This needs to be measured; since we come back from
                 * cv_wait holding pse (the expensive part of this
                 * operation) we might as well try the cheap part.
                 * Though we would also have to confirm that dropping
                 * `lock' did not cause any grief to the callers.
                 */
                if (lock) {
                        mutex_enter(lock);
                }
        } else {
                /*
                 * We have the page lock.
                 * If we needed to reclaim the page, and the page
                 * needed reclaiming (i.e., it was free), then we
                 * have the page exclusively locked.  We may need
                 * to downgrade the page.
                 */
                ASSERT((upgraded) ?
                    ((PP_ISFREE(pp)) && PAGE_EXCL(pp)) : 1);
                mutex_exit(pse);

                /*
                 * We now hold this page's lock, either shared or
                 * exclusive.  This will prevent its identity from changing.
                 * The page, however, may or may not be free.  If the caller
                 * requested, and it is free, go reclaim it from the
                 * free list.  If the page can't be reclaimed, return failure
                 * so that the caller can start all over again.
                 *
                 * NOTE: page_reclaim() releases the page lock (p_selock)
                 *      if it can't be reclaimed.
                 */
                if (reclaim_it) {
                        if (!page_reclaim(pp, lock)) {
                                VM_STAT_ADD(page_lock_bad_reclaim);
                                retval = 0;
                        } else {
                                VM_STAT_ADD(page_lock_reclaim);
                                if (upgraded) {
                                        page_downgrade(pp);
                                }
                        }
                }
        }
        return (retval);
}
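
/*
 * Illustrative sketch (not in the original source): a caller that must not
 * be starved by readers (e.g. the delete memory thread) asks for SE_EWANTED
 * and either retries later or explicitly clears the bit when it gives up.
 * The giving_up condition below is hypothetical:
 *
 *      if (!page_lock_es(pp, SE_EXCL, NULL, P_NO_RECLAIM, SE_EXCL_WANTED)) {
 *              if (giving_up)
 *                      page_lock_clr_exclwanted(pp);
 *              else
 *                      retry later; SE_EWANTED now holds off new readers
 *      }
 */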
/*
 * Clear the SE_EWANTED bit from p_selock.  This function allows
 * callers of page_lock_es and page_try_reclaim_lock to clear
 * their setting of this bit if they decide they no longer wish
 * to gain exclusive access to the page.  Currently only
 * delete_memory_thread uses this when the delete memory
 * operation is cancelled.
 */
void
page_lock_clr_exclwanted(page_t *pp)
{
        kmutex_t *pse = PAGE_SE_MUTEX(pp);

        mutex_enter(pse);
        pp->p_selock &= ~SE_EWANTED;
        if (CV_HAS_WAITERS(&pp->p_cv))
                cv_broadcast(&pp->p_cv);
        mutex_exit(pse);
}

/*
 * Read the comments inside of page_lock_es() carefully.
 *
 * SE_EXCL callers specifying es == SE_EXCL_WANTED will cause the
 * SE_EWANTED bit of p_selock to be set when the lock cannot be obtained.
 * This is used by threads subject to reader-starvation (e.g. memory delete).
 *
 * When a thread using SE_EXCL_WANTED does not obtain the SE_EXCL lock,
 * it is expected that it will retry at a later time.  Threads that will
 * not retry the lock *must* call page_lock_clr_exclwanted to clear the
 * SE_EWANTED bit.  (When a thread using SE_EXCL_WANTED obtains the lock,
 * the bit is cleared.)
 */
int
page_try_reclaim_lock(page_t *pp, se_t se, int es)
{
        kmutex_t *pse = PAGE_SE_MUTEX(pp);
        selock_t old;

        mutex_enter(pse);

        old = pp->p_selock;

        ASSERT(((es & SE_EXCL_WANTED) == 0) ||
            ((es & SE_EXCL_WANTED) && (se == SE_EXCL)));

        if (PP_RETIRED(pp) && !(es & SE_RETIRED)) {
                mutex_exit(pse);
                VM_STAT_ADD(page_trylock_failed);
                return (0);
        }

        if (se == SE_SHARED && es == 1 && old == 0) {
                se = SE_EXCL;
        }

        if (se == SE_SHARED) {
                if (!PP_ISFREE(pp)) {
                        if (old >= 0) {
                                /*
                                 * Readers are not allowed when excl wanted
                                 */
                                if ((old & SE_EWANTED) == 0) {
                                        pp->p_selock = old + SE_READER;
                                        mutex_exit(pse);
                                        return (1);
                                }
                        }
                        mutex_exit(pse);
                        return (0);
                }
                /*
                 * The page is free, so we really want SE_EXCL (below)
                 */
                VM_STAT_ADD(page_try_reclaim_upgrade);
        }

        /*
         * The caller wants a writer lock.  We try for it only if
         * SE_EWANTED is not set, or if the caller specified
         * SE_EXCL_WANTED.
         */
        if (!(old & SE_EWANTED) || (es & SE_EXCL_WANTED)) {
                if ((old & ~SE_EWANTED) == 0) {
                        /* no reader/writer lock held */
                        THREAD_KPRI_REQUEST();
                        /* this clears out our setting of the SE_EWANTED bit */
                        pp->p_selock = SE_WRITER;
                        mutex_exit(pse);
                        return (1);
                }
        }
        if (es & SE_EXCL_WANTED) {
                /* page is locked, set the SE_EWANTED bit */
                pp->p_selock |= SE_EWANTED;
        }
        mutex_exit(pse);
        return (0);
}

/*
 * Acquire a page's "shared/exclusive" lock, but never block.
 * Returns 1 on success, 0 on failure.
 */
int
page_trylock(page_t *pp, se_t se)
{
        kmutex_t *pse = PAGE_SE_MUTEX(pp);

        mutex_enter(pse);
        if (pp->p_selock & SE_EWANTED || PP_RETIRED(pp) ||
            (se == SE_SHARED && PP_PR_NOSHARE(pp))) {
                /*
                 * Fail if another thread wants exclusive access (SE_EWANTED
                 * is set), if the page is retired, or if a share lock is
                 * requested on a page slated for retirement.
                 */
                mutex_exit(pse);
                VM_STAT_ADD(page_trylock_failed);
                return (0);
        }

        if (se == SE_EXCL) {
                if (pp->p_selock == 0) {
                        THREAD_KPRI_REQUEST();
                        pp->p_selock = SE_WRITER;
                        mutex_exit(pse);
                        return (1);
                }
        } else {
                if (pp->p_selock >= 0) {
                        pp->p_selock += SE_READER;
                        mutex_exit(pse);
                        return (1);
                }
        }
        mutex_exit(pse);
        return (0);
}
/*
 * Variant of page_unlock() specifically for the page freelist
 * code.  The mere existence of this code is a vile hack resulting
 * from the backwards locking order of the page freelist manager;
 * please don't call it.
 */
void
page_unlock_nocapture(page_t *pp)
{
        kmutex_t *pse = PAGE_SE_MUTEX(pp);
        selock_t old;

        mutex_enter(pse);

        old = pp->p_selock;
        if ((old & ~SE_EWANTED) == SE_READER) {
                pp->p_selock = old & ~SE_READER;
                if (CV_HAS_WAITERS(&pp->p_cv))
                        cv_broadcast(&pp->p_cv);
        } else if ((old & ~SE_EWANTED) == SE_DELETED) {
                panic("page_unlock_nocapture: page %p is deleted", (void *)pp);
        } else if (old < 0) {
                THREAD_KPRI_RELEASE();
                pp->p_selock &= SE_EWANTED;
                if (CV_HAS_WAITERS(&pp->p_cv))
                        cv_broadcast(&pp->p_cv);
        } else if ((old & ~SE_EWANTED) > SE_READER) {
                pp->p_selock = old - SE_READER;
        } else {
                panic("page_unlock_nocapture: page %p is not locked",
                    (void *)pp);
        }

        mutex_exit(pse);
}

/*
 * Release the page's "shared/exclusive" lock and wake up anyone
 * who might be waiting for it.
 */
void
page_unlock(page_t *pp)
{
        kmutex_t *pse = PAGE_SE_MUTEX(pp);
        selock_t old;

        mutex_enter(pse);

        old = pp->p_selock;
        if ((old & ~SE_EWANTED) == SE_READER) {
                pp->p_selock = old & ~SE_READER;
                if (CV_HAS_WAITERS(&pp->p_cv))
                        cv_broadcast(&pp->p_cv);
        } else if ((old & ~SE_EWANTED) == SE_DELETED) {
                panic("page_unlock: page %p is deleted", (void *)pp);
        } else if (old < 0) {
                THREAD_KPRI_RELEASE();
                pp->p_selock &= SE_EWANTED;
                if (CV_HAS_WAITERS(&pp->p_cv))
                        cv_broadcast(&pp->p_cv);
        } else if ((old & ~SE_EWANTED) > SE_READER) {
                pp->p_selock = old - SE_READER;
        } else {
                panic("page_unlock: page %p is not locked", (void *)pp);
        }

        if (pp->p_selock == 0) {
                /*
                 * If the T_CAPTURING bit is set, that means that we should
                 * not try and capture the page again as we could recurse
                 * which could lead to a stack overflow panic or spending a
                 * relatively long time in the kernel making no progress.
                 */
                if ((pp->p_toxic & PR_CAPTURE) &&
                    !(curthread->t_flag & T_CAPTURING) &&
                    !PP_RETIRED(pp)) {
                        THREAD_KPRI_REQUEST();
                        pp->p_selock = SE_WRITER;
                        mutex_exit(pse);
                        page_unlock_capture(pp);
                } else {
                        mutex_exit(pse);
                }
        } else {
                mutex_exit(pse);
        }
}

/*
 * Try to upgrade the lock on the page from a "shared" to an
 * "exclusive" lock.  Since this upgrade operation is done while
 * holding the mutex protecting this page, no one else can acquire this page's
 * lock and change the page.  Thus, it is safe to drop the "shared"
 * lock and attempt to acquire the "exclusive" lock.
 *
 * Returns 1 on success, 0 on failure.
 */
int
page_tryupgrade(page_t *pp)
{
        kmutex_t *pse = PAGE_SE_MUTEX(pp);

        mutex_enter(pse);
        if (!(pp->p_selock & SE_EWANTED)) {
                /* no threads want exclusive access, try upgrade */
                if (pp->p_selock == SE_READER) {
                        THREAD_KPRI_REQUEST();
                        /* convert to exclusive lock */
                        pp->p_selock = SE_WRITER;
                        mutex_exit(pse);
                        return (1);
                }
        }
        mutex_exit(pse);
        return (0);
}
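
/*
 * Illustrative sketch (not in the original source): page_tryupgrade() only
 * succeeds when the caller holds the sole reader lock and no writer is
 * waiting.  A caller that needs SE_EXCL anyway typically falls back to
 * dropping the shared lock, reacquiring exclusively, and revalidating the
 * page identity, since another thread may have slipped in between.  The
 * locals vp and off below are the identity the caller expects
 * (hypothetical):
 *
 *      if (!page_tryupgrade(pp)) {
 *              page_unlock(pp);
 *              if (!page_lock(pp, SE_EXCL, NULL, P_NO_RECLAIM))
 *                      return (failure);
 *              if (pp->p_vnode != vp || pp->p_offset != off) {
 *                      page_unlock(pp);        drop and redo the lookup
 *              }
 *      }
 */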
/*
 * Downgrade the "exclusive" lock on the page to a "shared" lock
 * while holding the mutex protecting this page's p_selock field.
 */
void
page_downgrade(page_t *pp)
{
        kmutex_t *pse = PAGE_SE_MUTEX(pp);
        int excl_waiting;

        ASSERT((pp->p_selock & ~SE_EWANTED) != SE_DELETED);
        ASSERT(PAGE_EXCL(pp));

        mutex_enter(pse);
        excl_waiting = pp->p_selock & SE_EWANTED;
        THREAD_KPRI_RELEASE();
        pp->p_selock = SE_READER | excl_waiting;
        if (CV_HAS_WAITERS(&pp->p_cv))
                cv_broadcast(&pp->p_cv);
        mutex_exit(pse);
}

void
page_lock_delete(page_t *pp)
{
        kmutex_t *pse = PAGE_SE_MUTEX(pp);

        ASSERT(PAGE_EXCL(pp));
        ASSERT(pp->p_vnode == NULL);
        ASSERT(pp->p_offset == (u_offset_t)-1);
        ASSERT(!PP_ISFREE(pp));

        mutex_enter(pse);
        THREAD_KPRI_RELEASE();
        pp->p_selock = SE_DELETED;
        if (CV_HAS_WAITERS(&pp->p_cv))
                cv_broadcast(&pp->p_cv);
        mutex_exit(pse);
}

int
page_deleted(page_t *pp)
{
        return (pp->p_selock == SE_DELETED);
}

/*
 * Implement the io lock for pages
 */
void
page_iolock_init(page_t *pp)
{
        pp->p_iolock_state = 0;
        cv_init(&pp->p_io_cv, NULL, CV_DEFAULT, NULL);
}

/*
 * Acquire the i/o lock on a page.
 */
void
page_io_lock(page_t *pp)
{
        kmutex_t *pio;

        pio = PAGE_IO_MUTEX(pp);
        mutex_enter(pio);
        while (pp->p_iolock_state & PAGE_IO_INUSE) {
                cv_wait(&(pp->p_io_cv), pio);
        }
        pp->p_iolock_state |= PAGE_IO_INUSE;
        mutex_exit(pio);
}

/*
 * Release the i/o lock on a page.
 */
void
page_io_unlock(page_t *pp)
{
        kmutex_t *pio;

        pio = PAGE_IO_MUTEX(pp);
        mutex_enter(pio);
        cv_broadcast(&pp->p_io_cv);
        pp->p_iolock_state &= ~PAGE_IO_INUSE;
        mutex_exit(pio);
}

/*
 * Try to acquire the i/o lock on a page without blocking.
 * Returns 1 on success, 0 on failure.
 */
int
page_io_trylock(page_t *pp)
{
        kmutex_t *pio;

        if (pp->p_iolock_state & PAGE_IO_INUSE)
                return (0);

        pio = PAGE_IO_MUTEX(pp);
        mutex_enter(pio);

        if (pp->p_iolock_state & PAGE_IO_INUSE) {
                mutex_exit(pio);
                return (0);
        }
        pp->p_iolock_state |= PAGE_IO_INUSE;
        mutex_exit(pio);

        return (1);
}

/*
 * Wait until the i/o lock is not held.
 */
void
page_io_wait(page_t *pp)
{
        kmutex_t *pio;

        pio = PAGE_IO_MUTEX(pp);
        mutex_enter(pio);
        while (pp->p_iolock_state & PAGE_IO_INUSE) {
                cv_wait(&(pp->p_io_cv), pio);
        }
        mutex_exit(pio);
}

/*
 * Returns 1 if the page's i/o lock is currently held, 0 if it is not.
 */
int
page_io_locked(page_t *pp)
{
        return (pp->p_iolock_state & PAGE_IO_INUSE);
}

/*
 * Assert that the i/o lock on a page is held.
 * Returns 1 on success, 0 on failure.
 */
int
page_iolock_assert(page_t *pp)
{
        return (page_io_locked(pp));
}
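
/*
 * Illustrative sketch (not in the original source): the i/o lock serializes
 * physical i/o on a page independently of p_selock.  A typical producer
 * holds it across the i/o, and a thread that only needs to synchronize
 * with in-flight i/o simply waits:
 *
 *      page_io_lock(pp);       about to start pagein/pageout on pp
 *      ... issue and complete the i/o ...
 *      page_io_unlock(pp);     wakes any page_io_wait()ers
 *
 *      page_io_wait(pp);       block until no i/o is in progress on pp
 */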
/*
 * Wrapper exported to kernel routines that are built
 * platform-independent (the macro is platform-dependent;
 * the size of vph_mutex[] is based on NCPU).
 *
 * Note that you can do stress testing on this by setting the
 * variable page_vnode_mutex_stress to something other than
 * zero in a DEBUG kernel in a debugger after loading the kernel.
 * Setting it after the kernel is running may not work correctly.
 */
#ifdef DEBUG
static int page_vnode_mutex_stress = 0;
#endif

kmutex_t *
page_vnode_mutex(vnode_t *vp)
{
        if (vp == &kvp)
                return (&vph_mutex[VPH_TABLE_SIZE + 0]);

        if (vp == &zvp)
                return (&vph_mutex[VPH_TABLE_SIZE + 1]);
#ifdef DEBUG
        if (page_vnode_mutex_stress != 0)
                return (&vph_mutex[0]);
#endif

        return (&vph_mutex[VP_HASH_FUNC(vp)]);
}

kmutex_t *
page_se_mutex(page_t *pp)
{
        return (PAGE_SE_MUTEX(pp));
}

#ifdef VM_STATS
uint_t pszclck_stat[4];
#endif
/*
 * Find, take and return a mutex held by hat_page_demote().
 * Called by page_demote_vp_pages() before hat_page_demote() call and by
 * routines that want to block hat_page_demote() but can't do it
 * via locking all constituent pages.
 *
 * Return NULL if p_szc is 0.
 *
 * It should only be used for pages that can be demoted by hat_page_demote()
 * i.e. non swapfs file system pages.  The logic here is lifted from
 * sfmmu_mlspl_enter() except there's no need to worry about p_szc increase
 * since the page is locked and not free.
 *
 * Hash of the root page is used to find the lock.
 * To find the root in the presence of hat_page_demote() changing the location
 * of the root this routine relies on the fact that hat_page_demote() changes
 * root last.
 *
 * If NULL is returned pp's p_szc is guaranteed to be 0.  If non-NULL is
 * returned pp's p_szc may be any value.
 */
kmutex_t *
page_szc_lock(page_t *pp)
{
        kmutex_t        *mtx;
        page_t          *rootpp;
        uint_t          szc;
        uint_t          rszc;
        uint_t          pszc = pp->p_szc;

        ASSERT(pp != NULL);
        ASSERT(PAGE_LOCKED(pp));
        ASSERT(!PP_ISFREE(pp));
        ASSERT(pp->p_vnode != NULL);
        ASSERT(!IS_SWAPFSVP(pp->p_vnode));
        ASSERT(!PP_ISKAS(pp));

again:
        if (pszc == 0) {
                VM_STAT_ADD(pszclck_stat[0]);
                return (NULL);
        }

        /* The lock lives in the root page */

        rootpp = PP_GROUPLEADER(pp, pszc);
        mtx = PAGE_SZC_MUTEX(rootpp);
        mutex_enter(mtx);

        /*
         * Since p_szc can only decrease if pp == rootpp, rootpp will always
         * be the same, i.e. we have the right root regardless of
         * rootpp->p_szc.
         * If the location of pp's root didn't change after we took
         * the lock, we have the right root.  Return the mutex hashed off it.
         */
        if (pp == rootpp || (rszc = rootpp->p_szc) == pszc) {
                VM_STAT_ADD(pszclck_stat[1]);
                return (mtx);
        }

        /*
         * Root location changed because the page got demoted.
         * Locate the new root.
         */
        if (rszc < pszc) {
                szc = pp->p_szc;
                ASSERT(szc < pszc);
                mutex_exit(mtx);
                pszc = szc;
                VM_STAT_ADD(pszclck_stat[2]);
                goto again;
        }

        VM_STAT_ADD(pszclck_stat[3]);
        /*
         * The current hat_page_demote() is not done yet.
         * Wait for it to finish.
         */
        mutex_exit(mtx);
        rootpp = PP_GROUPLEADER(rootpp, rszc);
        mtx = PAGE_SZC_MUTEX(rootpp);
        mutex_enter(mtx);
        mutex_exit(mtx);
        ASSERT(rootpp->p_szc < rszc);
        goto again;
}

int
page_szc_lock_assert(page_t *pp)
{
        page_t *rootpp = PP_PAGEROOT(pp);
        kmutex_t *mtx = PAGE_SZC_MUTEX(rootpp);

        return (MUTEX_HELD(mtx));
}
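
/*
 * Illustrative sketch (not in the original source): a caller that must keep
 * hat_page_demote() from changing p_szc while it examines a large page
 * takes the size-code lock, does its work, and drops it; a NULL return
 * means p_szc is 0 and there is nothing to protect against:
 *
 *      kmutex_t *mtx = page_szc_lock(pp);
 *      ... examine pp->p_szc and the constituent pages ...
 *      if (mtx != NULL)
 *              mutex_exit(mtx);
 */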
/*
 * memseg locking
 */
static krwlock_t memsegslock;

/*
 * memlist (phys_install, phys_avail) locking.
 */
static krwlock_t memlists_lock;

void
memsegs_lock(int writer)
{
        rw_enter(&memsegslock, writer ? RW_WRITER : RW_READER);
}

/*ARGSUSED*/
void
memsegs_unlock(int writer)
{
        rw_exit(&memsegslock);
}

int
memsegs_lock_held(void)
{
        return (RW_LOCK_HELD(&memsegslock));
}

void
memlist_read_lock(void)
{
        rw_enter(&memlists_lock, RW_READER);
}

void
memlist_read_unlock(void)
{
        rw_exit(&memlists_lock);
}

void
memlist_write_lock(void)
{
        rw_enter(&memlists_lock, RW_WRITER);
}

void
memlist_write_unlock(void)
{
        rw_exit(&memlists_lock);
}
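
/*
 * Illustrative sketch (not in the original source): readers of the
 * phys_install/phys_avail memlists take the read lock around the whole walk
 * so that a concurrent memory DR operation (which takes the write lock)
 * cannot change the lists underneath them.  The list walk below is
 * hypothetical and memlist field names may differ:
 *
 *      struct memlist *ml;
 *
 *      memlist_read_lock();
 *      for (ml = phys_install; ml != NULL; ml = ml->next)
 *              ... inspect the address/size of this span ...
 *      memlist_read_unlock();
 */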