1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * VM - page locking primitives 30 */ 31 #include <sys/param.h> 32 #include <sys/t_lock.h> 33 #include <sys/vtrace.h> 34 #include <sys/debug.h> 35 #include <sys/cmn_err.h> 36 #include <sys/vnode.h> 37 #include <sys/bitmap.h> 38 #include <sys/lockstat.h> 39 #include <sys/sysmacros.h> 40 #include <sys/condvar_impl.h> 41 #include <vm/page.h> 42 #include <vm/seg_enum.h> 43 #include <vm/vm_dep.h> 44 45 /* 46 * This global mutex is for logical page locking. 47 * The following fields in the page structure are protected 48 * by this lock: 49 * 50 * p_lckcnt 51 * p_cowcnt 52 */ 53 kmutex_t page_llock; 54 55 /* 56 * This is a global lock for the logical page free list. The 57 * logical free list, in this implementation, is maintained as two 58 * separate physical lists - the cache list and the free list. 59 */ 60 kmutex_t page_freelock; 61 62 /* 63 * The hash table, page_hash[], the p_selock fields, and the 64 * list of pages associated with vnodes are protected by arrays of mutexes. 65 * 66 * Unless the hashes are changed radically, the table sizes must be 67 * a power of two. Also, we typically need more mutexes for the 68 * vnodes since these locks are occasionally held for long periods. 69 * And since there seem to be two special vnodes (kvp and swapvp), 70 * we make room for private mutexes for them. 71 * 72 * The pse_mutex[] array holds the mutexes to protect the p_selock 73 * fields of all page_t structures. 74 * 75 * PAGE_SE_MUTEX(pp) returns the address of the appropriate mutex 76 * when given a pointer to a page_t. 77 * 78 * PIO_TABLE_SIZE must be a power of two. One could argue that we 79 * should go to the trouble of setting it up at run time and base it 80 * on memory size rather than the number of compile time CPUs. 81 * 82 * XX64 We should be using physmem size to calculate PIO_SHIFT. 83 * 84 * These might break in 64 bit world. 85 */ 86 #define PIO_SHIFT 7 /* log2(sizeof(page_t)) */ 87 #define PIO_TABLE_SIZE 128 /* number of io mutexes to have */ 88 89 pad_mutex_t ph_mutex[PH_TABLE_SIZE]; 90 kmutex_t pio_mutex[PIO_TABLE_SIZE]; 91 92 #define PAGE_IO_MUTEX(pp) \ 93 &pio_mutex[(((uintptr_t)pp) >> PIO_SHIFT) & (PIO_TABLE_SIZE - 1)] 94 95 /* 96 * The pse_mutex[] array is allocated in the platform startup code 97 * based on the size of the machine at startup. 98 */ 99 extern pad_mutex_t *pse_mutex; /* Locks protecting pp->p_selock */ 100 extern size_t pse_table_size; /* Number of mutexes in pse_mutex[] */ 101 extern int pse_shift; /* log2(pse_table_size) */ 102 #define PAGE_SE_MUTEX(pp) &pse_mutex[ \ 103 ((((uintptr_t)(pp) >> pse_shift) ^ ((uintptr_t)(pp))) >> 7) & \ 104 (pse_table_size - 1)].pad_mutex 105 106 #define PSZC_MTX_TABLE_SIZE 128 107 #define PSZC_MTX_TABLE_SHIFT 7 108 109 static pad_mutex_t pszc_mutex[PSZC_MTX_TABLE_SIZE]; 110 111 #define PAGE_SZC_MUTEX(_pp) \ 112 &pszc_mutex[((((uintptr_t)(_pp) >> PSZC_MTX_TABLE_SHIFT) ^ \ 113 ((uintptr_t)(_pp) >> (PSZC_MTX_TABLE_SHIFT << 1)) ^ \ 114 ((uintptr_t)(_pp) >> (3 * PSZC_MTX_TABLE_SHIFT))) & \ 115 (PSZC_MTX_TABLE_SIZE - 1))].pad_mutex 116 117 /* 118 * The vph_mutex[] array holds the mutexes to protect the vnode chains, 119 * (i.e., the list of pages anchored by v_pages and connected via p_vpprev 120 * and p_vpnext). 121 * 122 * The page_vnode_mutex(vp) function returns the address of the appropriate 123 * mutex from this array given a pointer to a vnode. It is complicated 124 * by the fact that the kernel's vnode and the swapfs vnode are referenced 125 * frequently enough to warrent their own mutexes. 126 * 127 * The VP_HASH_FUNC returns the index into the vph_mutex array given 128 * an address of a vnode. 129 */ 130 131 /* 132 * XX64 VPH_TABLE_SIZE and VP_HASH_FUNC might break in 64 bit world. 133 * Need to review again. 134 */ 135 #if defined(_LP64) 136 #define VPH_TABLE_SIZE (1 << (VP_SHIFT + 3)) 137 #else /* 32 bits */ 138 #define VPH_TABLE_SIZE (2 << VP_SHIFT) 139 #endif 140 141 #define VP_HASH_FUNC(vp) \ 142 ((((uintptr_t)(vp) >> 6) + \ 143 ((uintptr_t)(vp) >> 8) + \ 144 ((uintptr_t)(vp) >> 10) + \ 145 ((uintptr_t)(vp) >> 12)) \ 146 & (VPH_TABLE_SIZE - 1)) 147 148 extern struct vnode kvp; 149 150 /* 151 * Two slots after VPH_TABLE_SIZE are reserved in vph_mutex for kernel vnodes. 152 * The lock for kvp is VPH_TABLE_SIZE + 0, and the lock for zvp is 153 * VPH_TABLE_SIZE + 1. 154 */ 155 156 kmutex_t vph_mutex[VPH_TABLE_SIZE + 2]; 157 158 /* 159 * Initialize the locks used by the Virtual Memory Management system. 160 */ 161 void 162 page_lock_init() 163 { 164 } 165 166 /* 167 * Return a value for pse_shift based on npg (the number of physical pages) 168 * and ncpu (the maximum number of CPUs). This is called by platform startup 169 * code. 170 * 171 * Lockstat data from TPC-H runs showed that contention on the pse_mutex[] 172 * locks grew approximately as the square of the number of threads executing. 173 * So the primary scaling factor used is NCPU^2. The size of the machine in 174 * megabytes is used as an upper bound, particularly for sun4v machines which 175 * all claim to have 256 CPUs maximum, and the old value of PSE_TABLE_SIZE 176 * (128) is used as a minimum. Since the size of the table has to be a power 177 * of two, the calculated size is rounded up to the next power of two. 178 */ 179 /*ARGSUSED*/ 180 int 181 size_pse_array(pgcnt_t npg, int ncpu) 182 { 183 size_t size; 184 pgcnt_t pp_per_mb = (1024 * 1024) / PAGESIZE; 185 186 size = MAX(128, MIN(npg / pp_per_mb, 2 * ncpu * ncpu)); 187 size += (1 << (highbit(size) - 1)) - 1; 188 return (highbit(size) - 1); 189 } 190 191 /* 192 * At present we only use page ownership to aid debugging, so it's 193 * OK if the owner field isn't exact. In the 32-bit world two thread ids 194 * can map to the same owner because we just 'or' in 0x80000000 and 195 * then clear the second highest bit, so that (for example) 0x2faced00 196 * and 0xafaced00 both map to 0xafaced00. 197 * In the 64-bit world, p_selock may not be large enough to hold a full 198 * thread pointer. If we ever need precise ownership (e.g. if we implement 199 * priority inheritance for page locks) then p_selock should become a 200 * uintptr_t and SE_WRITER should be -((uintptr_t)curthread >> 2). 201 */ 202 #define SE_WRITER (((selock_t)(ulong_t)curthread | INT_MIN) & ~SE_EWANTED) 203 #define SE_READER 1 204 205 /* 206 * A page that is deleted must be marked as such using the 207 * page_lock_delete() function. The page must be exclusively locked. 208 * The SE_DELETED marker is put in p_selock when this function is called. 209 * SE_DELETED must be distinct from any SE_WRITER value. 210 */ 211 #define SE_DELETED (1 | INT_MIN) 212 213 #ifdef VM_STATS 214 uint_t vph_kvp_count; 215 uint_t vph_swapfsvp_count; 216 uint_t vph_other; 217 #endif /* VM_STATS */ 218 219 #ifdef VM_STATS 220 uint_t page_lock_count; 221 uint_t page_lock_miss; 222 uint_t page_lock_miss_lock; 223 uint_t page_lock_reclaim; 224 uint_t page_lock_bad_reclaim; 225 uint_t page_lock_same_page; 226 uint_t page_lock_upgrade; 227 uint_t page_lock_retired; 228 uint_t page_lock_upgrade_failed; 229 uint_t page_lock_deleted; 230 231 uint_t page_trylock_locked; 232 uint_t page_trylock_failed; 233 uint_t page_trylock_missed; 234 235 uint_t page_try_reclaim_upgrade; 236 #endif /* VM_STATS */ 237 238 /* 239 * Acquire the "shared/exclusive" lock on a page. 240 * 241 * Returns 1 on success and locks the page appropriately. 242 * 0 on failure and does not lock the page. 243 * 244 * If `lock' is non-NULL, it will be dropped and reacquired in the 245 * failure case. This routine can block, and if it does 246 * it will always return a failure since the page identity [vp, off] 247 * or state may have changed. 248 */ 249 250 int 251 page_lock(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim) 252 { 253 return (page_lock_es(pp, se, lock, reclaim, 0)); 254 } 255 256 /* 257 * With the addition of reader-writer lock semantics to page_lock_es, 258 * callers wanting an exclusive (writer) lock may prevent shared-lock 259 * (reader) starvation by setting the es parameter to SE_EXCL_WANTED. 260 * In this case, when an exclusive lock cannot be acquired, p_selock's 261 * SE_EWANTED bit is set. Shared-lock (reader) requests are also denied 262 * if the page is slated for retirement. 263 * 264 * The se and es parameters determine if the lock should be granted 265 * based on the following decision table: 266 * 267 * Lock wanted es flags p_selock/SE_EWANTED Action 268 * ----------- -------------- ------------------- --------- 269 * SE_EXCL any [1][2] unlocked/any grant lock, clear SE_EWANTED 270 * SE_EXCL SE_EWANTED any lock/any deny, set SE_EWANTED 271 * SE_EXCL none any lock/any deny 272 * SE_SHARED n/a [2] shared/0 grant 273 * SE_SHARED n/a [2] unlocked/0 grant 274 * SE_SHARED n/a shared/1 deny 275 * SE_SHARED n/a unlocked/1 deny 276 * SE_SHARED n/a excl/any deny 277 * 278 * Notes: 279 * [1] The code grants an exclusive lock to the caller and clears the bit 280 * SE_EWANTED whenever p_selock is unlocked, regardless of the SE_EWANTED 281 * bit's value. This was deemed acceptable as we are not concerned about 282 * exclusive-lock starvation. If this ever becomes an issue, a priority or 283 * fifo mechanism should also be implemented. Meantime, the thread that 284 * set SE_EWANTED should be prepared to catch this condition and reset it 285 * 286 * [2] Retired pages may not be locked at any time, regardless of the 287 * dispostion of se, unless the es parameter has SE_RETIRED flag set. 288 * 289 * Notes on values of "es": 290 * 291 * es & 1: page_lookup_create will attempt page relocation 292 * es & SE_EXCL_WANTED: caller wants SE_EWANTED set (eg. delete 293 * memory thread); this prevents reader-starvation of waiting 294 * writer thread(s) by giving priority to writers over readers. 295 * es & SE_RETIRED: caller wants to lock pages even if they are 296 * retired. Default is to deny the lock if the page is retired. 297 * 298 * And yes, we know, the semantics of this function are too complicated. 299 * It's on the list to be cleaned up. 300 */ 301 int 302 page_lock_es(page_t *pp, se_t se, kmutex_t *lock, reclaim_t reclaim, int es) 303 { 304 int retval; 305 kmutex_t *pse = PAGE_SE_MUTEX(pp); 306 int upgraded; 307 int reclaim_it; 308 309 ASSERT(lock != NULL ? MUTEX_HELD(lock) : 1); 310 311 VM_STAT_ADD(page_lock_count); 312 313 upgraded = 0; 314 reclaim_it = 0; 315 316 mutex_enter(pse); 317 318 ASSERT(((es & SE_EXCL_WANTED) == 0) || 319 ((es & SE_EXCL_WANTED) && (se == SE_EXCL))); 320 321 if (PP_RETIRED(pp) && !(es & SE_RETIRED)) { 322 mutex_exit(pse); 323 VM_STAT_ADD(page_lock_retired); 324 return (0); 325 } 326 327 if (se == SE_SHARED && es == 1 && pp->p_selock == 0) { 328 se = SE_EXCL; 329 } 330 331 if ((reclaim == P_RECLAIM) && (PP_ISFREE(pp))) { 332 333 reclaim_it = 1; 334 if (se == SE_SHARED) { 335 /* 336 * This is an interesting situation. 337 * 338 * Remember that p_free can only change if 339 * p_selock < 0. 340 * p_free does not depend on our holding `pse'. 341 * And, since we hold `pse', p_selock can not change. 342 * So, if p_free changes on us, the page is already 343 * exclusively held, and we would fail to get p_selock 344 * regardless. 345 * 346 * We want to avoid getting the share 347 * lock on a free page that needs to be reclaimed. 348 * It is possible that some other thread has the share 349 * lock and has left the free page on the cache list. 350 * pvn_vplist_dirty() does this for brief periods. 351 * If the se_share is currently SE_EXCL, we will fail 352 * to acquire p_selock anyway. Blocking is the 353 * right thing to do. 354 * If we need to reclaim this page, we must get 355 * exclusive access to it, force the upgrade now. 356 * Again, we will fail to acquire p_selock if the 357 * page is not free and block. 358 */ 359 upgraded = 1; 360 se = SE_EXCL; 361 VM_STAT_ADD(page_lock_upgrade); 362 } 363 } 364 365 if (se == SE_EXCL) { 366 if (!(es & SE_EXCL_WANTED) && (pp->p_selock & SE_EWANTED)) { 367 /* 368 * if the caller wants a writer lock (but did not 369 * specify exclusive access), and there is a pending 370 * writer that wants exclusive access, return failure 371 */ 372 retval = 0; 373 } else if ((pp->p_selock & ~SE_EWANTED) == 0) { 374 /* no reader/writer lock held */ 375 THREAD_KPRI_REQUEST(); 376 /* this clears our setting of the SE_EWANTED bit */ 377 pp->p_selock = SE_WRITER; 378 retval = 1; 379 } else { 380 /* page is locked */ 381 if (es & SE_EXCL_WANTED) { 382 /* set the SE_EWANTED bit */ 383 pp->p_selock |= SE_EWANTED; 384 } 385 retval = 0; 386 } 387 } else { 388 retval = 0; 389 if (pp->p_selock >= 0) { 390 if ((pp->p_selock & SE_EWANTED) == 0) { 391 pp->p_selock += SE_READER; 392 retval = 1; 393 } 394 } 395 } 396 397 if (retval == 0) { 398 if ((pp->p_selock & ~SE_EWANTED) == SE_DELETED) { 399 VM_STAT_ADD(page_lock_deleted); 400 mutex_exit(pse); 401 return (retval); 402 } 403 404 #ifdef VM_STATS 405 VM_STAT_ADD(page_lock_miss); 406 if (upgraded) { 407 VM_STAT_ADD(page_lock_upgrade_failed); 408 } 409 #endif 410 if (lock) { 411 VM_STAT_ADD(page_lock_miss_lock); 412 mutex_exit(lock); 413 } 414 415 /* 416 * Now, wait for the page to be unlocked and 417 * release the lock protecting p_cv and p_selock. 418 */ 419 cv_wait(&pp->p_cv, pse); 420 mutex_exit(pse); 421 422 /* 423 * The page identity may have changed while we were 424 * blocked. If we are willing to depend on "pp" 425 * still pointing to a valid page structure (i.e., 426 * assuming page structures are not dynamically allocated 427 * or freed), we could try to lock the page if its 428 * identity hasn't changed. 429 * 430 * This needs to be measured, since we come back from 431 * cv_wait holding pse (the expensive part of this 432 * operation) we might as well try the cheap part. 433 * Though we would also have to confirm that dropping 434 * `lock' did not cause any grief to the callers. 435 */ 436 if (lock) { 437 mutex_enter(lock); 438 } 439 } else { 440 /* 441 * We have the page lock. 442 * If we needed to reclaim the page, and the page 443 * needed reclaiming (ie, it was free), then we 444 * have the page exclusively locked. We may need 445 * to downgrade the page. 446 */ 447 ASSERT((upgraded) ? 448 ((PP_ISFREE(pp)) && PAGE_EXCL(pp)) : 1); 449 mutex_exit(pse); 450 451 /* 452 * We now hold this page's lock, either shared or 453 * exclusive. This will prevent its identity from changing. 454 * The page, however, may or may not be free. If the caller 455 * requested, and it is free, go reclaim it from the 456 * free list. If the page can't be reclaimed, return failure 457 * so that the caller can start all over again. 458 * 459 * NOTE:page_reclaim() releases the page lock (p_selock) 460 * if it can't be reclaimed. 461 */ 462 if (reclaim_it) { 463 if (!page_reclaim(pp, lock)) { 464 VM_STAT_ADD(page_lock_bad_reclaim); 465 retval = 0; 466 } else { 467 VM_STAT_ADD(page_lock_reclaim); 468 if (upgraded) { 469 page_downgrade(pp); 470 } 471 } 472 } 473 } 474 return (retval); 475 } 476 477 /* 478 * Clear the SE_EWANTED bit from p_selock. This function allows 479 * callers of page_lock_es and page_try_reclaim_lock to clear 480 * their setting of this bit if they decide they no longer wish 481 * to gain exclusive access to the page. Currently only 482 * delete_memory_thread uses this when the delete memory 483 * operation is cancelled. 484 */ 485 void 486 page_lock_clr_exclwanted(page_t *pp) 487 { 488 kmutex_t *pse = PAGE_SE_MUTEX(pp); 489 490 mutex_enter(pse); 491 pp->p_selock &= ~SE_EWANTED; 492 if (CV_HAS_WAITERS(&pp->p_cv)) 493 cv_broadcast(&pp->p_cv); 494 mutex_exit(pse); 495 } 496 497 /* 498 * Read the comments inside of page_lock_es() carefully. 499 * 500 * SE_EXCL callers specifying es == SE_EXCL_WANTED will cause the 501 * SE_EWANTED bit of p_selock to be set when the lock cannot be obtained. 502 * This is used by threads subject to reader-starvation (eg. memory delete). 503 * 504 * When a thread using SE_EXCL_WANTED does not obtain the SE_EXCL lock, 505 * it is expected that it will retry at a later time. Threads that will 506 * not retry the lock *must* call page_lock_clr_exclwanted to clear the 507 * SE_EWANTED bit. (When a thread using SE_EXCL_WANTED obtains the lock, 508 * the bit is cleared.) 509 */ 510 int 511 page_try_reclaim_lock(page_t *pp, se_t se, int es) 512 { 513 kmutex_t *pse = PAGE_SE_MUTEX(pp); 514 selock_t old; 515 516 mutex_enter(pse); 517 518 old = pp->p_selock; 519 520 ASSERT(((es & SE_EXCL_WANTED) == 0) || 521 ((es & SE_EXCL_WANTED) && (se == SE_EXCL))); 522 523 if (PP_RETIRED(pp) && !(es & SE_RETIRED)) { 524 mutex_exit(pse); 525 VM_STAT_ADD(page_trylock_failed); 526 return (0); 527 } 528 529 if (se == SE_SHARED && es == 1 && old == 0) { 530 se = SE_EXCL; 531 } 532 533 if (se == SE_SHARED) { 534 if (!PP_ISFREE(pp)) { 535 if (old >= 0) { 536 /* 537 * Readers are not allowed when excl wanted 538 */ 539 if ((old & SE_EWANTED) == 0) { 540 pp->p_selock = old + SE_READER; 541 mutex_exit(pse); 542 return (1); 543 } 544 } 545 mutex_exit(pse); 546 return (0); 547 } 548 /* 549 * The page is free, so we really want SE_EXCL (below) 550 */ 551 VM_STAT_ADD(page_try_reclaim_upgrade); 552 } 553 554 /* 555 * The caller wants a writer lock. We try for it only if 556 * SE_EWANTED is not set, or if the caller specified 557 * SE_EXCL_WANTED. 558 */ 559 if (!(old & SE_EWANTED) || (es & SE_EXCL_WANTED)) { 560 if ((old & ~SE_EWANTED) == 0) { 561 /* no reader/writer lock held */ 562 THREAD_KPRI_REQUEST(); 563 /* this clears out our setting of the SE_EWANTED bit */ 564 pp->p_selock = SE_WRITER; 565 mutex_exit(pse); 566 return (1); 567 } 568 } 569 if (es & SE_EXCL_WANTED) { 570 /* page is locked, set the SE_EWANTED bit */ 571 pp->p_selock |= SE_EWANTED; 572 } 573 mutex_exit(pse); 574 return (0); 575 } 576 577 /* 578 * Acquire a page's "shared/exclusive" lock, but never block. 579 * Returns 1 on success, 0 on failure. 580 */ 581 int 582 page_trylock(page_t *pp, se_t se) 583 { 584 kmutex_t *pse = PAGE_SE_MUTEX(pp); 585 586 mutex_enter(pse); 587 if (pp->p_selock & SE_EWANTED || PP_RETIRED(pp) || 588 (se == SE_SHARED && PP_PR_NOSHARE(pp))) { 589 /* 590 * Fail if a thread wants exclusive access and page is 591 * retired, if the page is slated for retirement, or a 592 * share lock is requested. 593 */ 594 mutex_exit(pse); 595 VM_STAT_ADD(page_trylock_failed); 596 return (0); 597 } 598 599 if (se == SE_EXCL) { 600 if (pp->p_selock == 0) { 601 THREAD_KPRI_REQUEST(); 602 pp->p_selock = SE_WRITER; 603 mutex_exit(pse); 604 return (1); 605 } 606 } else { 607 if (pp->p_selock >= 0) { 608 pp->p_selock += SE_READER; 609 mutex_exit(pse); 610 return (1); 611 } 612 } 613 mutex_exit(pse); 614 return (0); 615 } 616 617 /* 618 * Variant of page_unlock() specifically for the page freelist 619 * code. The mere existence of this code is a vile hack that 620 * has resulted due to the backwards locking order of the page 621 * freelist manager; please don't call it. 622 */ 623 void 624 page_unlock_nocapture(page_t *pp) 625 { 626 kmutex_t *pse = PAGE_SE_MUTEX(pp); 627 selock_t old; 628 629 mutex_enter(pse); 630 631 old = pp->p_selock; 632 if ((old & ~SE_EWANTED) == SE_READER) { 633 pp->p_selock = old & ~SE_READER; 634 if (CV_HAS_WAITERS(&pp->p_cv)) 635 cv_broadcast(&pp->p_cv); 636 } else if ((old & ~SE_EWANTED) == SE_DELETED) { 637 panic("page_unlock_nocapture: page %p is deleted", pp); 638 } else if (old < 0) { 639 THREAD_KPRI_RELEASE(); 640 pp->p_selock &= SE_EWANTED; 641 if (CV_HAS_WAITERS(&pp->p_cv)) 642 cv_broadcast(&pp->p_cv); 643 } else if ((old & ~SE_EWANTED) > SE_READER) { 644 pp->p_selock = old - SE_READER; 645 } else { 646 panic("page_unlock_nocapture: page %p is not locked", pp); 647 } 648 649 mutex_exit(pse); 650 } 651 652 /* 653 * Release the page's "shared/exclusive" lock and wake up anyone 654 * who might be waiting for it. 655 */ 656 void 657 page_unlock(page_t *pp) 658 { 659 kmutex_t *pse = PAGE_SE_MUTEX(pp); 660 selock_t old; 661 662 mutex_enter(pse); 663 664 old = pp->p_selock; 665 if ((old & ~SE_EWANTED) == SE_READER) { 666 pp->p_selock = old & ~SE_READER; 667 if (CV_HAS_WAITERS(&pp->p_cv)) 668 cv_broadcast(&pp->p_cv); 669 } else if ((old & ~SE_EWANTED) == SE_DELETED) { 670 panic("page_unlock: page %p is deleted", pp); 671 } else if (old < 0) { 672 THREAD_KPRI_RELEASE(); 673 pp->p_selock &= SE_EWANTED; 674 if (CV_HAS_WAITERS(&pp->p_cv)) 675 cv_broadcast(&pp->p_cv); 676 } else if ((old & ~SE_EWANTED) > SE_READER) { 677 pp->p_selock = old - SE_READER; 678 } else { 679 panic("page_unlock: page %p is not locked", pp); 680 } 681 682 if (pp->p_selock == 0) { 683 /* 684 * If the T_CAPTURING bit is set, that means that we should 685 * not try and capture the page again as we could recurse 686 * which could lead to a stack overflow panic or spending a 687 * relatively long time in the kernel making no progress. 688 */ 689 if ((pp->p_toxic & PR_CAPTURE) && 690 !(curthread->t_flag & T_CAPTURING) && 691 !PP_RETIRED(pp)) { 692 THREAD_KPRI_REQUEST(); 693 pp->p_selock = SE_WRITER; 694 mutex_exit(pse); 695 page_unlock_capture(pp); 696 } else { 697 mutex_exit(pse); 698 } 699 } else { 700 mutex_exit(pse); 701 } 702 } 703 704 /* 705 * Try to upgrade the lock on the page from a "shared" to an 706 * "exclusive" lock. Since this upgrade operation is done while 707 * holding the mutex protecting this page, no one else can acquire this page's 708 * lock and change the page. Thus, it is safe to drop the "shared" 709 * lock and attempt to acquire the "exclusive" lock. 710 * 711 * Returns 1 on success, 0 on failure. 712 */ 713 int 714 page_tryupgrade(page_t *pp) 715 { 716 kmutex_t *pse = PAGE_SE_MUTEX(pp); 717 718 mutex_enter(pse); 719 if (!(pp->p_selock & SE_EWANTED)) { 720 /* no threads want exclusive access, try upgrade */ 721 if (pp->p_selock == SE_READER) { 722 THREAD_KPRI_REQUEST(); 723 /* convert to exclusive lock */ 724 pp->p_selock = SE_WRITER; 725 mutex_exit(pse); 726 return (1); 727 } 728 } 729 mutex_exit(pse); 730 return (0); 731 } 732 733 /* 734 * Downgrade the "exclusive" lock on the page to a "shared" lock 735 * while holding the mutex protecting this page's p_selock field. 736 */ 737 void 738 page_downgrade(page_t *pp) 739 { 740 kmutex_t *pse = PAGE_SE_MUTEX(pp); 741 int excl_waiting; 742 743 ASSERT((pp->p_selock & ~SE_EWANTED) != SE_DELETED); 744 ASSERT(PAGE_EXCL(pp)); 745 746 mutex_enter(pse); 747 excl_waiting = pp->p_selock & SE_EWANTED; 748 THREAD_KPRI_RELEASE(); 749 pp->p_selock = SE_READER | excl_waiting; 750 if (CV_HAS_WAITERS(&pp->p_cv)) 751 cv_broadcast(&pp->p_cv); 752 mutex_exit(pse); 753 } 754 755 void 756 page_lock_delete(page_t *pp) 757 { 758 kmutex_t *pse = PAGE_SE_MUTEX(pp); 759 760 ASSERT(PAGE_EXCL(pp)); 761 ASSERT(pp->p_vnode == NULL); 762 ASSERT(pp->p_offset == (u_offset_t)-1); 763 ASSERT(!PP_ISFREE(pp)); 764 765 mutex_enter(pse); 766 THREAD_KPRI_RELEASE(); 767 pp->p_selock = SE_DELETED; 768 if (CV_HAS_WAITERS(&pp->p_cv)) 769 cv_broadcast(&pp->p_cv); 770 mutex_exit(pse); 771 } 772 773 int 774 page_deleted(page_t *pp) 775 { 776 return (pp->p_selock == SE_DELETED); 777 } 778 779 /* 780 * Implement the io lock for pages 781 */ 782 void 783 page_iolock_init(page_t *pp) 784 { 785 pp->p_iolock_state = 0; 786 cv_init(&pp->p_io_cv, NULL, CV_DEFAULT, NULL); 787 } 788 789 /* 790 * Acquire the i/o lock on a page. 791 */ 792 void 793 page_io_lock(page_t *pp) 794 { 795 kmutex_t *pio; 796 797 pio = PAGE_IO_MUTEX(pp); 798 mutex_enter(pio); 799 while (pp->p_iolock_state & PAGE_IO_INUSE) { 800 cv_wait(&(pp->p_io_cv), pio); 801 } 802 pp->p_iolock_state |= PAGE_IO_INUSE; 803 mutex_exit(pio); 804 } 805 806 /* 807 * Release the i/o lock on a page. 808 */ 809 void 810 page_io_unlock(page_t *pp) 811 { 812 kmutex_t *pio; 813 814 pio = PAGE_IO_MUTEX(pp); 815 mutex_enter(pio); 816 cv_broadcast(&pp->p_io_cv); 817 pp->p_iolock_state &= ~PAGE_IO_INUSE; 818 mutex_exit(pio); 819 } 820 821 /* 822 * Try to acquire the i/o lock on a page without blocking. 823 * Returns 1 on success, 0 on failure. 824 */ 825 int 826 page_io_trylock(page_t *pp) 827 { 828 kmutex_t *pio; 829 830 if (pp->p_iolock_state & PAGE_IO_INUSE) 831 return (0); 832 833 pio = PAGE_IO_MUTEX(pp); 834 mutex_enter(pio); 835 836 if (pp->p_iolock_state & PAGE_IO_INUSE) { 837 mutex_exit(pio); 838 return (0); 839 } 840 pp->p_iolock_state |= PAGE_IO_INUSE; 841 mutex_exit(pio); 842 843 return (1); 844 } 845 846 /* 847 * Wait until the i/o lock is not held. 848 */ 849 void 850 page_io_wait(page_t *pp) 851 { 852 kmutex_t *pio; 853 854 pio = PAGE_IO_MUTEX(pp); 855 mutex_enter(pio); 856 while (pp->p_iolock_state & PAGE_IO_INUSE) { 857 cv_wait(&(pp->p_io_cv), pio); 858 } 859 mutex_exit(pio); 860 } 861 862 /* 863 * Returns 1 on success, 0 on failure. 864 */ 865 int 866 page_io_locked(page_t *pp) 867 { 868 return (pp->p_iolock_state & PAGE_IO_INUSE); 869 } 870 871 /* 872 * Assert that the i/o lock on a page is held. 873 * Returns 1 on success, 0 on failure. 874 */ 875 int 876 page_iolock_assert(page_t *pp) 877 { 878 return (page_io_locked(pp)); 879 } 880 881 /* 882 * Wrapper exported to kernel routines that are built 883 * platform-independent (the macro is platform-dependent; 884 * the size of vph_mutex[] is based on NCPU). 885 * 886 * Note that you can do stress testing on this by setting the 887 * variable page_vnode_mutex_stress to something other than 888 * zero in a DEBUG kernel in a debugger after loading the kernel. 889 * Setting it after the kernel is running may not work correctly. 890 */ 891 #ifdef DEBUG 892 static int page_vnode_mutex_stress = 0; 893 #endif 894 895 kmutex_t * 896 page_vnode_mutex(vnode_t *vp) 897 { 898 if (vp == &kvp) 899 return (&vph_mutex[VPH_TABLE_SIZE + 0]); 900 901 if (vp == &zvp) 902 return (&vph_mutex[VPH_TABLE_SIZE + 1]); 903 #ifdef DEBUG 904 if (page_vnode_mutex_stress != 0) 905 return (&vph_mutex[0]); 906 #endif 907 908 return (&vph_mutex[VP_HASH_FUNC(vp)]); 909 } 910 911 kmutex_t * 912 page_se_mutex(page_t *pp) 913 { 914 return (PAGE_SE_MUTEX(pp)); 915 } 916 917 #ifdef VM_STATS 918 uint_t pszclck_stat[4]; 919 #endif 920 /* 921 * Find, take and return a mutex held by hat_page_demote(). 922 * Called by page_demote_vp_pages() before hat_page_demote() call and by 923 * routines that want to block hat_page_demote() but can't do it 924 * via locking all constituent pages. 925 * 926 * Return NULL if p_szc is 0. 927 * 928 * It should only be used for pages that can be demoted by hat_page_demote() 929 * i.e. non swapfs file system pages. The logic here is lifted from 930 * sfmmu_mlspl_enter() except there's no need to worry about p_szc increase 931 * since the page is locked and not free. 932 * 933 * Hash of the root page is used to find the lock. 934 * To find the root in the presense of hat_page_demote() chageing the location 935 * of the root this routine relies on the fact that hat_page_demote() changes 936 * root last. 937 * 938 * If NULL is returned pp's p_szc is guaranteed to be 0. If non NULL is 939 * returned pp's p_szc may be any value. 940 */ 941 kmutex_t * 942 page_szc_lock(page_t *pp) 943 { 944 kmutex_t *mtx; 945 page_t *rootpp; 946 uint_t szc; 947 uint_t rszc; 948 uint_t pszc = pp->p_szc; 949 950 ASSERT(pp != NULL); 951 ASSERT(PAGE_LOCKED(pp)); 952 ASSERT(!PP_ISFREE(pp)); 953 ASSERT(pp->p_vnode != NULL); 954 ASSERT(!IS_SWAPFSVP(pp->p_vnode)); 955 ASSERT(!PP_ISKAS(pp)); 956 957 again: 958 if (pszc == 0) { 959 VM_STAT_ADD(pszclck_stat[0]); 960 return (NULL); 961 } 962 963 /* The lock lives in the root page */ 964 965 rootpp = PP_GROUPLEADER(pp, pszc); 966 mtx = PAGE_SZC_MUTEX(rootpp); 967 mutex_enter(mtx); 968 969 /* 970 * since p_szc can only decrease if pp == rootpp 971 * rootpp will be always the same i.e we have the right root 972 * regardless of rootpp->p_szc. 973 * If location of pp's root didn't change after we took 974 * the lock we have the right root. return mutex hashed off it. 975 */ 976 if (pp == rootpp || (rszc = rootpp->p_szc) == pszc) { 977 VM_STAT_ADD(pszclck_stat[1]); 978 return (mtx); 979 } 980 981 /* 982 * root location changed because page got demoted. 983 * locate the new root. 984 */ 985 if (rszc < pszc) { 986 szc = pp->p_szc; 987 ASSERT(szc < pszc); 988 mutex_exit(mtx); 989 pszc = szc; 990 VM_STAT_ADD(pszclck_stat[2]); 991 goto again; 992 } 993 994 VM_STAT_ADD(pszclck_stat[3]); 995 /* 996 * current hat_page_demote not done yet. 997 * wait for it to finish. 998 */ 999 mutex_exit(mtx); 1000 rootpp = PP_GROUPLEADER(rootpp, rszc); 1001 mtx = PAGE_SZC_MUTEX(rootpp); 1002 mutex_enter(mtx); 1003 mutex_exit(mtx); 1004 ASSERT(rootpp->p_szc < rszc); 1005 goto again; 1006 } 1007 1008 int 1009 page_szc_lock_assert(page_t *pp) 1010 { 1011 page_t *rootpp = PP_PAGEROOT(pp); 1012 kmutex_t *mtx = PAGE_SZC_MUTEX(rootpp); 1013 1014 return (MUTEX_HELD(mtx)); 1015 } 1016 1017 /* 1018 * memseg locking 1019 */ 1020 static krwlock_t memsegslock; 1021 1022 /* 1023 * memlist (phys_install, phys_avail) locking. 1024 */ 1025 static krwlock_t memlists_lock; 1026 1027 void 1028 memsegs_lock(int writer) 1029 { 1030 rw_enter(&memsegslock, writer ? RW_WRITER : RW_READER); 1031 } 1032 1033 /*ARGSUSED*/ 1034 void 1035 memsegs_unlock(int writer) 1036 { 1037 rw_exit(&memsegslock); 1038 } 1039 1040 int 1041 memsegs_lock_held(void) 1042 { 1043 return (RW_LOCK_HELD(&memsegslock)); 1044 } 1045 1046 void 1047 memlist_read_lock(void) 1048 { 1049 rw_enter(&memlists_lock, RW_READER); 1050 } 1051 1052 void 1053 memlist_read_unlock(void) 1054 { 1055 rw_exit(&memlists_lock); 1056 } 1057 1058 void 1059 memlist_write_lock(void) 1060 { 1061 rw_enter(&memlists_lock, RW_WRITER); 1062 } 1063 1064 void 1065 memlist_write_unlock(void) 1066 { 1067 rw_exit(&memlists_lock); 1068 } 1069