1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 26 /* All Rights Reserved */ 27 28 /* 29 * University Copyright- Copyright (c) 1982, 1986, 1988 30 * The Regents of the University of California 31 * All Rights Reserved 32 * 33 * University Acknowledgment- Portions of this document are derived from 34 * software developed by the University of California, Berkeley, and its 35 * contributors. 36 */ 37 38 /* 39 * VM - anonymous pages. 40 * 41 * This layer sits immediately above the vm_swap layer. It manages 42 * physical pages that have no permanent identity in the file system 43 * name space, using the services of the vm_swap layer to allocate 44 * backing storage for these pages. Since these pages have no external 45 * identity, they are discarded when the last reference is removed. 
 *
 * An important function of this layer is to manage low-level sharing
 * of pages that are logically distinct but that happen to be
 * physically identical (e.g., the corresponding pages of the processes
 * resulting from a fork before one process or the other changes their
 * contents).  This pseudo-sharing is present only as an optimization
 * and is not to be confused with true sharing in which multiple
 * address spaces deliberately contain references to the same object;
 * such sharing is managed at a higher level.
 *
 * The key data structure here is the anon struct, which contains a
 * reference count for its associated physical page and a hint about
 * the identity of that page.  Anon structs typically live in arrays,
 * with an instance's position in its array determining where the
 * corresponding backing storage is allocated; however, the swap_xlate()
 * routine abstracts away this representation information so that the
 * rest of the anon layer need not know it.  (See the swap layer for
 * more details on anon struct layout.)
 *
 * In future versions of the system, the association between an
 * anon struct and its position on backing store will change so that
 * we don't require backing store for all anonymous pages in the system.
 * This is an important consideration for large memory systems.
 * We can also use this technique to delay binding physical locations
 * to anonymous pages until pageout/swapout time where we can make
 * smarter allocation decisions to improve anonymous klustering.
 *
 * Many of the routines defined here take a (struct anon **) argument,
 * which allows the code at this level to manage anon pages directly,
 * so that callers can regard anon structs as opaque objects and not be
 * concerned with assigning or inspecting their contents.
 *
 * Clients of this layer refer to anon pages indirectly.
That is, they 79 * maintain arrays of pointers to anon structs rather than maintaining 80 * anon structs themselves. The (struct anon **) arguments mentioned 81 * above are pointers to entries in these arrays. It is these arrays 82 * that capture the mapping between offsets within a given segment and 83 * the corresponding anonymous backing storage address. 84 */ 85 86 #ifdef DEBUG 87 #define ANON_DEBUG 88 #endif 89 90 #include <sys/types.h> 91 #include <sys/t_lock.h> 92 #include <sys/param.h> 93 #include <sys/systm.h> 94 #include <sys/mman.h> 95 #include <sys/cred.h> 96 #include <sys/thread.h> 97 #include <sys/vnode.h> 98 #include <sys/cpuvar.h> 99 #include <sys/swap.h> 100 #include <sys/cmn_err.h> 101 #include <sys/vtrace.h> 102 #include <sys/kmem.h> 103 #include <sys/sysmacros.h> 104 #include <sys/bitmap.h> 105 #include <sys/vmsystm.h> 106 #include <sys/tuneable.h> 107 #include <sys/debug.h> 108 #include <sys/fs/swapnode.h> 109 #include <sys/tnf_probe.h> 110 #include <sys/lgrp.h> 111 #include <sys/policy.h> 112 #include <sys/condvar_impl.h> 113 #include <sys/mutex_impl.h> 114 #include <sys/rctl.h> 115 116 #include <vm/as.h> 117 #include <vm/hat.h> 118 #include <vm/anon.h> 119 #include <vm/page.h> 120 #include <vm/vpage.h> 121 #include <vm/seg.h> 122 #include <vm/rm.h> 123 124 #include <fs/fs_subr.h> 125 126 struct vnode *anon_vp; 127 128 int anon_debug; 129 130 kmutex_t anoninfo_lock; 131 struct k_anoninfo k_anoninfo; 132 ani_free_t *ani_free_pool; 133 pad_mutex_t anon_array_lock[ANON_LOCKSIZE]; 134 kcondvar_t anon_array_cv[ANON_LOCKSIZE]; 135 136 /* 137 * Global hash table for (vp, off) -> anon slot 138 */ 139 extern int swap_maxcontig; 140 size_t anon_hash_size; 141 unsigned int anon_hash_shift; 142 struct anon **anon_hash; 143 144 static struct kmem_cache *anon_cache; 145 static struct kmem_cache *anonmap_cache; 146 147 pad_mutex_t *anonhash_lock; 148 149 /* 150 * Used to make the increment of all refcnts of all anon slots of a large 151 * page appear to be 
atomic. The lock is grabbed for the first anon slot of 152 * a large page. 153 */ 154 pad_mutex_t *anonpages_hash_lock; 155 156 #define APH_MUTEX(vp, off) \ 157 (&anonpages_hash_lock[(ANON_HASH((vp), (off)) & \ 158 (AH_LOCK_SIZE - 1))].pad_mutex) 159 160 #ifdef VM_STATS 161 static struct anonvmstats_str { 162 ulong_t getpages[30]; 163 ulong_t privatepages[10]; 164 ulong_t demotepages[9]; 165 ulong_t decrefpages[9]; 166 ulong_t dupfillholes[4]; 167 ulong_t freepages[1]; 168 } anonvmstats; 169 #endif /* VM_STATS */ 170 171 /*ARGSUSED*/ 172 static int 173 anonmap_cache_constructor(void *buf, void *cdrarg, int kmflags) 174 { 175 struct anon_map *amp = buf; 176 177 rw_init(&->a_rwlock, NULL, RW_DEFAULT, NULL); 178 cv_init(&->a_purgecv, NULL, CV_DEFAULT, NULL); 179 mutex_init(&->a_pmtx, NULL, MUTEX_DEFAULT, NULL); 180 mutex_init(&->a_purgemtx, NULL, MUTEX_DEFAULT, NULL); 181 return (0); 182 } 183 184 /*ARGSUSED1*/ 185 static void 186 anonmap_cache_destructor(void *buf, void *cdrarg) 187 { 188 struct anon_map *amp = buf; 189 190 rw_destroy(&->a_rwlock); 191 cv_destroy(&->a_purgecv); 192 mutex_destroy(&->a_pmtx); 193 mutex_destroy(&->a_purgemtx); 194 } 195 196 void 197 anon_init(void) 198 { 199 int i; 200 pad_mutex_t *tmp; 201 202 /* These both need to be powers of 2 so round up to the next power */ 203 anon_hash_shift = highbit((physmem / ANON_HASHAVELEN) - 1); 204 anon_hash_size = 1L << anon_hash_shift; 205 206 /* 207 * We need to align the anonhash_lock and anonpages_hash_lock arrays 208 * to a 64B boundary to avoid false sharing. We add 63B to our 209 * allocation so that we can get a 64B aligned address to use. 210 * We allocate both of these together to avoid wasting an additional 211 * 63B. 
212 */ 213 tmp = kmem_zalloc((2 * AH_LOCK_SIZE * sizeof (pad_mutex_t)) + 63, 214 KM_SLEEP); 215 anonhash_lock = (pad_mutex_t *)P2ROUNDUP((uintptr_t)tmp, 64); 216 anonpages_hash_lock = anonhash_lock + AH_LOCK_SIZE; 217 218 for (i = 0; i < AH_LOCK_SIZE; i++) { 219 mutex_init(&anonhash_lock[i].pad_mutex, NULL, MUTEX_DEFAULT, 220 NULL); 221 mutex_init(&anonpages_hash_lock[i].pad_mutex, NULL, 222 MUTEX_DEFAULT, NULL); 223 } 224 225 for (i = 0; i < ANON_LOCKSIZE; i++) { 226 mutex_init(&anon_array_lock[i].pad_mutex, NULL, 227 MUTEX_DEFAULT, NULL); 228 cv_init(&anon_array_cv[i], NULL, CV_DEFAULT, NULL); 229 } 230 231 anon_hash = (struct anon **) 232 kmem_zalloc(sizeof (struct anon *) * anon_hash_size, KM_SLEEP); 233 anon_cache = kmem_cache_create("anon_cache", sizeof (struct anon), 234 AN_CACHE_ALIGN, NULL, NULL, NULL, NULL, NULL, KMC_PREFILL); 235 anonmap_cache = kmem_cache_create("anonmap_cache", 236 sizeof (struct anon_map), 0, 237 anonmap_cache_constructor, anonmap_cache_destructor, NULL, 238 NULL, NULL, 0); 239 swap_maxcontig = (1024 * 1024) >> PAGESHIFT; /* 1MB of pages */ 240 241 tmp = kmem_zalloc((ANI_MAX_POOL * sizeof (ani_free_t)) + 63, KM_SLEEP); 242 /* Round ani_free_pool to cacheline boundary to avoid false sharing. */ 243 ani_free_pool = (ani_free_t *)P2ROUNDUP((uintptr_t)tmp, 64); 244 245 anon_vp = vn_alloc(KM_SLEEP); 246 vn_setops(anon_vp, swap_vnodeops); 247 anon_vp->v_type = VREG; 248 anon_vp->v_flag |= (VISSWAP|VISSWAPFS); 249 } 250 251 /* 252 * Global anon slot hash table manipulation. 
253 */ 254 255 static void 256 anon_addhash(struct anon *ap) 257 { 258 int index; 259 260 ASSERT(MUTEX_HELD(AH_MUTEX(ap->an_vp, ap->an_off))); 261 index = ANON_HASH(ap->an_vp, ap->an_off); 262 ap->an_hash = anon_hash[index]; 263 anon_hash[index] = ap; 264 } 265 266 static void 267 anon_rmhash(struct anon *ap) 268 { 269 struct anon **app; 270 271 ASSERT(MUTEX_HELD(AH_MUTEX(ap->an_vp, ap->an_off))); 272 273 for (app = &anon_hash[ANON_HASH(ap->an_vp, ap->an_off)]; 274 *app; app = &((*app)->an_hash)) { 275 if (*app == ap) { 276 *app = ap->an_hash; 277 break; 278 } 279 } 280 } 281 282 /* 283 * The anon array interfaces. Functions allocating, 284 * freeing array of pointers, and returning/setting 285 * entries in the array of pointers for a given offset. 286 * 287 * Create the list of pointers 288 */ 289 struct anon_hdr * 290 anon_create(pgcnt_t npages, int flags) 291 { 292 struct anon_hdr *ahp; 293 ulong_t nchunks; 294 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 295 296 if ((ahp = kmem_zalloc(sizeof (struct anon_hdr), kmemflags)) == NULL) { 297 return (NULL); 298 } 299 300 mutex_init(&ahp->serial_lock, NULL, MUTEX_DEFAULT, NULL); 301 /* 302 * Single level case. 303 */ 304 ahp->size = npages; 305 if (npages <= ANON_CHUNK_SIZE || (flags & ANON_ALLOC_FORCE)) { 306 307 if (flags & ANON_ALLOC_FORCE) 308 ahp->flags |= ANON_ALLOC_FORCE; 309 310 ahp->array_chunk = kmem_zalloc( 311 ahp->size * sizeof (struct anon *), kmemflags); 312 313 if (ahp->array_chunk == NULL) { 314 kmem_free(ahp, sizeof (struct anon_hdr)); 315 return (NULL); 316 } 317 } else { 318 /* 319 * 2 Level case. 320 * anon hdr size needs to be rounded off to be a multiple 321 * of ANON_CHUNK_SIZE. This is important as various anon 322 * related functions depend on this. 323 * NOTE - 324 * anon_grow() makes anon hdr size a multiple of 325 * ANON_CHUNK_SIZE. 326 * amp size is <= anon hdr size. 327 * anon_index + seg_pgs <= anon hdr size. 
328 */ 329 ahp->size = P2ROUNDUP(npages, ANON_CHUNK_SIZE); 330 nchunks = ahp->size >> ANON_CHUNK_SHIFT; 331 332 ahp->array_chunk = kmem_zalloc(nchunks * sizeof (ulong_t *), 333 kmemflags); 334 335 if (ahp->array_chunk == NULL) { 336 kmem_free(ahp, sizeof (struct anon_hdr)); 337 return (NULL); 338 } 339 } 340 return (ahp); 341 } 342 343 /* 344 * Free the array of pointers 345 */ 346 void 347 anon_release(struct anon_hdr *ahp, pgcnt_t npages) 348 { 349 ulong_t i; 350 void **ppp; 351 ulong_t nchunks; 352 353 ASSERT(npages <= ahp->size); 354 355 /* 356 * Single level case. 357 */ 358 if (npages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 359 kmem_free(ahp->array_chunk, ahp->size * sizeof (struct anon *)); 360 } else { 361 /* 362 * 2 level case. 363 */ 364 nchunks = ahp->size >> ANON_CHUNK_SHIFT; 365 for (i = 0; i < nchunks; i++) { 366 ppp = &ahp->array_chunk[i]; 367 if (*ppp != NULL) 368 kmem_free(*ppp, PAGESIZE); 369 } 370 kmem_free(ahp->array_chunk, nchunks * sizeof (ulong_t *)); 371 } 372 mutex_destroy(&ahp->serial_lock); 373 kmem_free(ahp, sizeof (struct anon_hdr)); 374 } 375 376 /* 377 * Return the pointer from the list for a 378 * specified anon index. 379 */ 380 struct anon * 381 anon_get_ptr(struct anon_hdr *ahp, ulong_t an_idx) 382 { 383 struct anon **app; 384 385 ASSERT(an_idx < ahp->size); 386 387 /* 388 * Single level case. 389 */ 390 if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 391 return ((struct anon *) 392 ((uintptr_t)ahp->array_chunk[an_idx] & ANON_PTRMASK)); 393 } else { 394 395 /* 396 * 2 level case. 397 */ 398 app = ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 399 if (app) { 400 return ((struct anon *) 401 ((uintptr_t)app[an_idx & ANON_CHUNK_OFF] & 402 ANON_PTRMASK)); 403 } else { 404 return (NULL); 405 } 406 } 407 } 408 409 /* 410 * Return the anon pointer for the first valid entry in the anon list, 411 * starting from the given index. 
412 */ 413 struct anon * 414 anon_get_next_ptr(struct anon_hdr *ahp, ulong_t *index) 415 { 416 struct anon *ap; 417 struct anon **app; 418 ulong_t chunkoff; 419 ulong_t i; 420 ulong_t j; 421 pgcnt_t size; 422 423 i = *index; 424 size = ahp->size; 425 426 ASSERT(i < size); 427 428 if ((size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 429 /* 430 * 1 level case 431 */ 432 while (i < size) { 433 ap = (struct anon *) 434 ((uintptr_t)ahp->array_chunk[i] & ANON_PTRMASK); 435 if (ap) { 436 *index = i; 437 return (ap); 438 } 439 i++; 440 } 441 } else { 442 /* 443 * 2 level case 444 */ 445 chunkoff = i & ANON_CHUNK_OFF; 446 while (i < size) { 447 app = ahp->array_chunk[i >> ANON_CHUNK_SHIFT]; 448 if (app) 449 for (j = chunkoff; j < ANON_CHUNK_SIZE; j++) { 450 ap = (struct anon *) 451 ((uintptr_t)app[j] & ANON_PTRMASK); 452 if (ap) { 453 *index = i + (j - chunkoff); 454 return (ap); 455 } 456 } 457 chunkoff = 0; 458 i = (i + ANON_CHUNK_SIZE) & ~ANON_CHUNK_OFF; 459 } 460 } 461 *index = size; 462 return (NULL); 463 } 464 465 /* 466 * Set list entry with a given pointer for a specified offset 467 */ 468 int 469 anon_set_ptr(struct anon_hdr *ahp, ulong_t an_idx, struct anon *ap, int flags) 470 { 471 void **ppp; 472 struct anon **app; 473 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 474 uintptr_t *ap_addr; 475 476 ASSERT(an_idx < ahp->size); 477 478 /* 479 * Single level case. 480 */ 481 if (ahp->size <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 482 ap_addr = (uintptr_t *)&ahp->array_chunk[an_idx]; 483 } else { 484 485 /* 486 * 2 level case. 
487 */ 488 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 489 490 ASSERT(ppp != NULL); 491 if (*ppp == NULL) { 492 mutex_enter(&ahp->serial_lock); 493 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 494 if (*ppp == NULL) { 495 *ppp = kmem_zalloc(PAGESIZE, kmemflags); 496 if (*ppp == NULL) { 497 mutex_exit(&ahp->serial_lock); 498 return (ENOMEM); 499 } 500 } 501 mutex_exit(&ahp->serial_lock); 502 } 503 app = *ppp; 504 ap_addr = (uintptr_t *)&app[an_idx & ANON_CHUNK_OFF]; 505 } 506 *ap_addr = (*ap_addr & ~ANON_PTRMASK) | (uintptr_t)ap; 507 return (0); 508 } 509 510 /* 511 * Copy anon array into a given new anon array 512 */ 513 int 514 anon_copy_ptr(struct anon_hdr *sahp, ulong_t s_idx, 515 struct anon_hdr *dahp, ulong_t d_idx, 516 pgcnt_t npages, int flags) 517 { 518 void **sapp, **dapp; 519 void *ap; 520 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 521 522 ASSERT((s_idx < sahp->size) && (d_idx < dahp->size)); 523 ASSERT((npages <= sahp->size) && (npages <= dahp->size)); 524 525 /* 526 * Both arrays are 1 level. 527 */ 528 if (((sahp->size <= ANON_CHUNK_SIZE) && 529 (dahp->size <= ANON_CHUNK_SIZE)) || 530 ((sahp->flags & ANON_ALLOC_FORCE) && 531 (dahp->flags & ANON_ALLOC_FORCE))) { 532 533 bcopy(&sahp->array_chunk[s_idx], &dahp->array_chunk[d_idx], 534 npages * sizeof (struct anon *)); 535 return (0); 536 } 537 538 /* 539 * Both arrays are 2 levels. 
540 */ 541 if (sahp->size > ANON_CHUNK_SIZE && 542 dahp->size > ANON_CHUNK_SIZE && 543 ((sahp->flags & ANON_ALLOC_FORCE) == 0) && 544 ((dahp->flags & ANON_ALLOC_FORCE) == 0)) { 545 546 ulong_t sapidx, dapidx; 547 ulong_t *sap, *dap; 548 ulong_t chknp; 549 550 while (npages != 0) { 551 552 sapidx = s_idx & ANON_CHUNK_OFF; 553 dapidx = d_idx & ANON_CHUNK_OFF; 554 chknp = ANON_CHUNK_SIZE - MAX(sapidx, dapidx); 555 if (chknp > npages) 556 chknp = npages; 557 558 sapp = &sahp->array_chunk[s_idx >> ANON_CHUNK_SHIFT]; 559 if ((sap = *sapp) != NULL) { 560 dapp = &dahp->array_chunk[d_idx 561 >> ANON_CHUNK_SHIFT]; 562 if ((dap = *dapp) == NULL) { 563 *dapp = kmem_zalloc(PAGESIZE, 564 kmemflags); 565 if ((dap = *dapp) == NULL) 566 return (ENOMEM); 567 } 568 bcopy((sap + sapidx), (dap + dapidx), 569 chknp << ANON_PTRSHIFT); 570 } 571 s_idx += chknp; 572 d_idx += chknp; 573 npages -= chknp; 574 } 575 return (0); 576 } 577 578 /* 579 * At least one of the arrays is 2 level. 580 */ 581 while (npages--) { 582 if ((ap = anon_get_ptr(sahp, s_idx)) != NULL) { 583 ASSERT(!ANON_ISBUSY(anon_get_slot(sahp, s_idx))); 584 if (anon_set_ptr(dahp, d_idx, ap, flags) == ENOMEM) 585 return (ENOMEM); 586 } 587 s_idx++; 588 d_idx++; 589 } 590 return (0); 591 } 592 593 594 /* 595 * ANON_INITBUF is a convenience macro for anon_grow() below. It 596 * takes a buffer dst, which is at least as large as buffer src. It 597 * does a bcopy from src into dst, and then bzeros the extra bytes 598 * of dst. If tail is set, the data in src is tail aligned within 599 * dst instead of head aligned. 
600 */ 601 602 #define ANON_INITBUF(src, srclen, dst, dstsize, tail) \ 603 if (tail) { \ 604 bzero((dst), (dstsize) - (srclen)); \ 605 bcopy((src), (char *)(dst) + (dstsize) - (srclen), (srclen)); \ 606 } else { \ 607 bcopy((src), (dst), (srclen)); \ 608 bzero((char *)(dst) + (srclen), (dstsize) - (srclen)); \ 609 } 610 611 #define ANON_1_LEVEL_INC (ANON_CHUNK_SIZE / 8) 612 #define ANON_2_LEVEL_INC (ANON_1_LEVEL_INC * ANON_CHUNK_SIZE) 613 614 /* 615 * anon_grow() is used to efficiently extend an existing anon array. 616 * startidx_p points to the index into the anon array of the first page 617 * that is in use. oldseg_pgs is the number of pages in use, starting at 618 * *startidx_p. newpages is the number of additional pages desired. 619 * 620 * If startidx_p == NULL, startidx is taken to be 0 and cannot be changed. 621 * 622 * The growth is done by creating a new top level of the anon array, 623 * and (if the array is 2-level) reusing the existing second level arrays. 624 * 625 * flags can be used to specify ANON_NOSLEEP and ANON_GROWDOWN. 626 * 627 * Returns the new number of pages in the anon array. 628 */ 629 pgcnt_t 630 anon_grow(struct anon_hdr *ahp, ulong_t *startidx_p, pgcnt_t oldseg_pgs, 631 pgcnt_t newseg_pgs, int flags) 632 { 633 ulong_t startidx = startidx_p ? *startidx_p : 0; 634 pgcnt_t oldamp_pgs = ahp->size, newamp_pgs; 635 pgcnt_t oelems, nelems, totpages; 636 void **level1; 637 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 638 int growdown = (flags & ANON_GROWDOWN); 639 size_t newarrsz, oldarrsz; 640 void *level2; 641 642 ASSERT(!(startidx_p == NULL && growdown)); 643 ASSERT(startidx + oldseg_pgs <= ahp->size); 644 645 /* 646 * Determine the total number of pages needed in the new 647 * anon array. If growing down, totpages is all pages from 648 * startidx through the end of the array, plus <newseg_pgs> 649 * pages. If growing up, keep all pages from page 0 through 650 * the last page currently in use, plus <newseg_pgs> pages. 
651 */ 652 if (growdown) 653 totpages = oldamp_pgs - startidx + newseg_pgs; 654 else 655 totpages = startidx + oldseg_pgs + newseg_pgs; 656 657 /* If the array is already large enough, just return. */ 658 659 if (oldamp_pgs >= totpages) { 660 if (growdown) 661 *startidx_p = oldamp_pgs - totpages; 662 return (oldamp_pgs); 663 } 664 665 /* 666 * oldamp_pgs/newamp_pgs are the total numbers of pages represented 667 * by the corresponding arrays. 668 * oelems/nelems are the number of pointers in the top level arrays 669 * which may be either level 1 or level 2. 670 * Will the new anon array be one level or two levels? 671 */ 672 if (totpages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 673 newamp_pgs = P2ROUNDUP(totpages, ANON_1_LEVEL_INC); 674 oelems = oldamp_pgs; 675 nelems = newamp_pgs; 676 } else { 677 newamp_pgs = P2ROUNDUP(totpages, ANON_2_LEVEL_INC); 678 oelems = (oldamp_pgs + ANON_CHUNK_OFF) >> ANON_CHUNK_SHIFT; 679 nelems = newamp_pgs >> ANON_CHUNK_SHIFT; 680 } 681 682 newarrsz = nelems * sizeof (void *); 683 level1 = kmem_alloc(newarrsz, kmemflags); 684 if (level1 == NULL) 685 return (0); 686 687 /* Are we converting from a one level to a two level anon array? */ 688 689 if (newamp_pgs > ANON_CHUNK_SIZE && oldamp_pgs <= ANON_CHUNK_SIZE && 690 !(ahp->flags & ANON_ALLOC_FORCE)) { 691 692 /* 693 * Yes, we're converting to a two level. Reuse old level 1 694 * as new level 2 if it is exactly PAGESIZE. Otherwise 695 * alloc a new level 2 and copy the old level 1 data into it. 
696 */ 697 if (oldamp_pgs == ANON_CHUNK_SIZE) { 698 level2 = (void *)ahp->array_chunk; 699 } else { 700 level2 = kmem_alloc(PAGESIZE, kmemflags); 701 if (level2 == NULL) { 702 kmem_free(level1, newarrsz); 703 return (0); 704 } 705 oldarrsz = oldamp_pgs * sizeof (void *); 706 707 ANON_INITBUF(ahp->array_chunk, oldarrsz, 708 level2, PAGESIZE, growdown); 709 kmem_free(ahp->array_chunk, oldarrsz); 710 } 711 bzero(level1, newarrsz); 712 if (growdown) 713 level1[nelems - 1] = level2; 714 else 715 level1[0] = level2; 716 } else { 717 oldarrsz = oelems * sizeof (void *); 718 719 ANON_INITBUF(ahp->array_chunk, oldarrsz, 720 level1, newarrsz, growdown); 721 kmem_free(ahp->array_chunk, oldarrsz); 722 } 723 724 ahp->array_chunk = level1; 725 ahp->size = newamp_pgs; 726 if (growdown) 727 *startidx_p = newamp_pgs - totpages; 728 729 return (newamp_pgs); 730 } 731 732 733 /* 734 * Called to sync ani_free value. 735 */ 736 737 void 738 set_anoninfo(void) 739 { 740 processorid_t ix, max_seqid; 741 pgcnt_t total = 0; 742 static clock_t last_time; 743 clock_t new_time; 744 745 if (ani_free_pool == NULL) 746 return; 747 748 /* 749 * Recompute ani_free at most once per tick. Use max_cpu_seqid_ever to 750 * identify the maximum number of CPUs were ever online. 751 */ 752 new_time = ddi_get_lbolt(); 753 if (new_time > last_time) { 754 755 max_seqid = max_cpu_seqid_ever; 756 ASSERT(ANI_MAX_POOL > max_seqid); 757 for (ix = 0; ix <= max_seqid; ix++) 758 total += ani_free_pool[ix].ani_count; 759 760 last_time = new_time; 761 k_anoninfo.ani_free = total; 762 } 763 } 764 765 /* 766 * Reserve anon space. 767 * 768 * It's no longer simply a matter of incrementing ani_resv to 769 * reserve swap space, we need to check memory-based as well 770 * as disk-backed (physical) swap. The following algorithm 771 * is used: 772 * Check the space on physical swap 773 * i.e. 
amount needed < ani_max - ani_phys_resv 774 * If we are swapping on swapfs check 775 * amount needed < (availrmem - swapfs_minfree) 776 * Since the algorithm to check for the quantity of swap space is 777 * almost the same as that for reserving it, we'll just use anon_resvmem 778 * with a flag to decrement availrmem. 779 * 780 * Return non-zero on success. 781 */ 782 int 783 anon_resvmem(size_t size, boolean_t takemem, zone_t *zone, int tryhard) 784 { 785 pgcnt_t npages = btopr(size); 786 pgcnt_t mswap_pages = 0; 787 pgcnt_t pswap_pages = 0; 788 proc_t *p = curproc; 789 790 if (zone != NULL && takemem) { 791 /* test zone.max-swap resource control */ 792 mutex_enter(&p->p_lock); 793 if (rctl_incr_swap(p, zone, ptob(npages)) != 0) { 794 mutex_exit(&p->p_lock); 795 return (0); 796 } 797 mutex_exit(&p->p_lock); 798 } 799 mutex_enter(&anoninfo_lock); 800 801 /* 802 * pswap_pages is the number of pages we can take from 803 * physical (i.e. disk-backed) swap. 804 */ 805 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 806 pswap_pages = k_anoninfo.ani_max - k_anoninfo.ani_phys_resv; 807 808 ANON_PRINT(A_RESV, 809 ("anon_resvmem: npages %lu takemem %u pswap %lu caller %p\n", 810 npages, takemem, pswap_pages, (void *)caller())); 811 812 if (npages <= pswap_pages) { 813 /* 814 * we have enough space on a physical swap 815 */ 816 if (takemem) 817 k_anoninfo.ani_phys_resv += npages; 818 mutex_exit(&anoninfo_lock); 819 return (1); 820 } else if (pswap_pages != 0) { 821 /* 822 * we have some space on a physical swap 823 */ 824 if (takemem) { 825 /* 826 * use up remainder of phys swap 827 */ 828 k_anoninfo.ani_phys_resv += pswap_pages; 829 ASSERT(k_anoninfo.ani_phys_resv == k_anoninfo.ani_max); 830 } 831 } 832 /* 833 * since (npages > pswap_pages) we need mem swap 834 * mswap_pages is the number of pages needed from availrmem 835 */ 836 ASSERT(npages > pswap_pages); 837 mswap_pages = npages - pswap_pages; 838 839 ANON_PRINT(A_RESV, ("anon_resvmem: need %ld pages from 
memory\n", 840 mswap_pages)); 841 842 /* 843 * priv processes can reserve memory as swap as long as availrmem 844 * remains greater than swapfs_minfree; in the case of non-priv 845 * processes, memory can be reserved as swap only if availrmem 846 * doesn't fall below (swapfs_minfree + swapfs_reserve). Thus, 847 * swapfs_reserve amount of memswap is not available to non-priv 848 * processes. This protects daemons such as automounter dying 849 * as a result of application processes eating away almost entire 850 * membased swap. This safeguard becomes useless if apps are run 851 * with root access. 852 * 853 * swapfs_reserve is minimum of 4Mb or 1/16 of physmem. 854 * 855 */ 856 if (tryhard) { 857 pgcnt_t floor_pages; 858 859 if (secpolicy_resource_anon_mem(CRED())) { 860 floor_pages = swapfs_minfree; 861 } else { 862 floor_pages = swapfs_minfree + swapfs_reserve; 863 } 864 865 mutex_exit(&anoninfo_lock); 866 (void) page_reclaim_mem(mswap_pages, floor_pages, 0, 60); 867 mutex_enter(&anoninfo_lock); 868 } 869 870 mutex_enter(&freemem_lock); 871 if (availrmem > (swapfs_minfree + swapfs_reserve + mswap_pages) || 872 (availrmem > (swapfs_minfree + mswap_pages) && 873 secpolicy_resource(CRED()) == 0)) { 874 875 if (takemem) { 876 /* 877 * Take the memory from the rest of the system. 
878 */ 879 availrmem -= mswap_pages; 880 mutex_exit(&freemem_lock); 881 k_anoninfo.ani_mem_resv += mswap_pages; 882 ANI_ADD(mswap_pages); 883 ANON_PRINT((A_RESV | A_MRESV), 884 ("anon_resvmem: took %ld pages of availrmem\n", 885 mswap_pages)); 886 } else { 887 mutex_exit(&freemem_lock); 888 } 889 890 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 891 mutex_exit(&anoninfo_lock); 892 return (1); 893 } else { 894 /* 895 * Fail if not enough memory 896 */ 897 if (takemem) { 898 k_anoninfo.ani_phys_resv -= pswap_pages; 899 } 900 901 mutex_exit(&freemem_lock); 902 mutex_exit(&anoninfo_lock); 903 ANON_PRINT(A_RESV, 904 ("anon_resvmem: not enough space from swapfs\n")); 905 if (zone != NULL && takemem) 906 rctl_decr_swap(zone, ptob(npages)); 907 return (0); 908 } 909 } 910 911 /* 912 * Give back an anon reservation. 913 */ 914 void 915 anon_unresvmem(size_t size, zone_t *zone) 916 { 917 pgcnt_t npages = btopr(size); 918 spgcnt_t mem_free_pages = 0; 919 pgcnt_t phys_free_slots; 920 #ifdef ANON_DEBUG 921 pgcnt_t mem_resv; 922 #endif 923 if (zone != NULL) 924 rctl_decr_swap(zone, ptob(npages)); 925 926 mutex_enter(&anoninfo_lock); 927 928 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 929 930 /* 931 * If some of this reservation belonged to swapfs 932 * give it back to availrmem. 933 * ani_mem_resv is the amount of availrmem swapfs has reserved. 
934 * but some of that memory could be locked by segspt so we can only 935 * return non locked ani_mem_resv back to availrmem 936 */ 937 if (k_anoninfo.ani_mem_resv > k_anoninfo.ani_locked_swap) { 938 ANON_PRINT((A_RESV | A_MRESV), 939 ("anon_unresv: growing availrmem by %ld pages\n", 940 MIN(k_anoninfo.ani_mem_resv, npages))); 941 942 mem_free_pages = MIN((spgcnt_t)(k_anoninfo.ani_mem_resv - 943 k_anoninfo.ani_locked_swap), npages); 944 mutex_enter(&freemem_lock); 945 availrmem += mem_free_pages; 946 mutex_exit(&freemem_lock); 947 k_anoninfo.ani_mem_resv -= mem_free_pages; 948 949 ANI_ADD(-mem_free_pages); 950 } 951 /* 952 * The remainder of the pages is returned to phys swap 953 */ 954 ASSERT(npages >= mem_free_pages); 955 phys_free_slots = npages - mem_free_pages; 956 957 if (phys_free_slots) { 958 k_anoninfo.ani_phys_resv -= phys_free_slots; 959 } 960 961 #ifdef ANON_DEBUG 962 mem_resv = k_anoninfo.ani_mem_resv; 963 #endif 964 965 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 966 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 967 968 mutex_exit(&anoninfo_lock); 969 970 ANON_PRINT(A_RESV, ("anon_unresv: %lu, tot %lu, caller %p\n", 971 npages, mem_resv, (void *)caller())); 972 } 973 974 /* 975 * Allocate an anon slot and return it with the lock held. 976 */ 977 struct anon * 978 anon_alloc(struct vnode *vp, anoff_t off) 979 { 980 struct anon *ap; 981 kmutex_t *ahm; 982 983 ap = kmem_cache_alloc(anon_cache, KM_SLEEP); 984 if (vp == NULL) { 985 swap_alloc(ap); 986 } else { 987 ap->an_vp = vp; 988 ap->an_off = off; 989 } 990 ap->an_refcnt = 1; 991 ap->an_pvp = NULL; 992 ap->an_poff = 0; 993 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 994 mutex_enter(ahm); 995 anon_addhash(ap); 996 mutex_exit(ahm); 997 ANI_ADD(-1); 998 ANON_PRINT(A_ANON, ("anon_alloc: returning ap %p, vp %p\n", 999 (void *)ap, (ap ? 
(void *)ap->an_vp : NULL))); 1000 return (ap); 1001 } 1002 1003 /* 1004 * Called for pages locked in memory via softlock/pagelock/mlock to make sure 1005 * such pages don't consume any physical swap resources needed for swapping 1006 * unlocked pages. 1007 */ 1008 void 1009 anon_swap_free(struct anon *ap, page_t *pp) 1010 { 1011 kmutex_t *ahm; 1012 1013 ASSERT(ap != NULL); 1014 ASSERT(pp != NULL); 1015 ASSERT(PAGE_LOCKED(pp)); 1016 ASSERT(pp->p_vnode != NULL); 1017 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 1018 ASSERT(ap->an_refcnt != 0); 1019 ASSERT(pp->p_vnode == ap->an_vp); 1020 ASSERT(pp->p_offset == ap->an_off); 1021 1022 if (ap->an_pvp == NULL) 1023 return; 1024 1025 page_io_lock(pp); 1026 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1027 mutex_enter(ahm); 1028 1029 ASSERT(ap->an_refcnt != 0); 1030 ASSERT(pp->p_vnode == ap->an_vp); 1031 ASSERT(pp->p_offset == ap->an_off); 1032 1033 if (ap->an_pvp != NULL) { 1034 swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE); 1035 ap->an_pvp = NULL; 1036 ap->an_poff = 0; 1037 mutex_exit(ahm); 1038 hat_setmod(pp); 1039 } else { 1040 mutex_exit(ahm); 1041 } 1042 page_io_unlock(pp); 1043 } 1044 1045 /* 1046 * Decrement the reference count of an anon page. 1047 * If reference count goes to zero, free it and 1048 * its associated page (if any). 1049 */ 1050 void 1051 anon_decref(struct anon *ap) 1052 { 1053 page_t *pp; 1054 struct vnode *vp; 1055 anoff_t off; 1056 kmutex_t *ahm; 1057 1058 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1059 mutex_enter(ahm); 1060 ASSERT(ap->an_refcnt != 0); 1061 if (ap->an_refcnt == 0) 1062 panic("anon_decref: slot count 0"); 1063 if (--ap->an_refcnt == 0) { 1064 swap_xlate(ap, &vp, &off); 1065 anon_rmhash(ap); 1066 if (ap->an_pvp != NULL) 1067 swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE); 1068 mutex_exit(ahm); 1069 1070 /* 1071 * If there is a page for this anon slot we will need to 1072 * call VN_DISPOSE to get rid of the vp association and 1073 * put the page back on the free list as really free. 
1074 * Acquire the "exclusive" lock to ensure that any 1075 * pending i/o always completes before the swap slot 1076 * is freed. 1077 */ 1078 pp = page_lookup(vp, (u_offset_t)off, SE_EXCL); 1079 if (pp != NULL) { 1080 /*LINTED: constant in conditional context */ 1081 VN_DISPOSE(pp, B_INVAL, 0, kcred); 1082 } 1083 ANON_PRINT(A_ANON, ("anon_decref: free ap %p, vp %p\n", 1084 (void *)ap, (void *)ap->an_vp)); 1085 1086 kmem_cache_free(anon_cache, ap); 1087 1088 ANI_ADD(1); 1089 } else { 1090 mutex_exit(ahm); 1091 } 1092 } 1093 1094 1095 /* 1096 * check an_refcnt of the root anon slot (anon_index argument is aligned at 1097 * seg->s_szc level) to determine whether COW processing is required. 1098 * anonpages_hash_lock[] held on the root ap ensures that if root's 1099 * refcnt is 1 all other refcnt's are 1 as well (and they can't increase 1100 * later since this process can't fork while its AS lock is held). 1101 * 1102 * returns 1 if the root anon slot has a refcnt > 1 otherwise returns 0. 1103 */ 1104 int 1105 anon_szcshare(struct anon_hdr *ahp, ulong_t anon_index) 1106 { 1107 struct anon *ap; 1108 kmutex_t *ahmpages = NULL; 1109 1110 ap = anon_get_ptr(ahp, anon_index); 1111 if (ap == NULL) 1112 return (0); 1113 1114 ahmpages = APH_MUTEX(ap->an_vp, ap->an_off); 1115 mutex_enter(ahmpages); 1116 ASSERT(ap->an_refcnt >= 1); 1117 if (ap->an_refcnt == 1) { 1118 mutex_exit(ahmpages); 1119 return (0); 1120 } 1121 mutex_exit(ahmpages); 1122 return (1); 1123 } 1124 /* 1125 * Check 'nslots' anon slots for refcnt > 1. 1126 * 1127 * returns 1 if any of the 'nslots' anon slots has a refcnt > 1 otherwise 1128 * returns 0. 
*/
static int
anon_share(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots)
{
	struct anon *ap;

	/* Empty slots are skipped; no locks are taken while scanning. */
	while (nslots-- > 0) {
		if ((ap = anon_get_ptr(ahp, anon_index)) != NULL &&
		    ap->an_refcnt > 1)
			return (1);
		anon_index++;
	}

	return (0);
}

/*
 * Drop one reference on every anon slot of the large page region that
 * starts at an_idx and clear those slots in ahp.  The region covers
 * page_get_pagecnt(szc) slots, clipped to the end of the anon array.
 *
 * If the root slot's refcnt is 1 the region is treated as unshared:
 * each slot is unhashed, its physical swap is released, its page(s)
 * are disposed of or destroyed and the anon struct is freed.
 *
 * Otherwise only the refcnts are decremented, and the root slot's
 * APH_MUTEX is held for the whole loop.
 */
static void
anon_decref_pages(
	struct anon_hdr *ahp,
	ulong_t an_idx,
	uint_t szc)
{
	struct anon *ap = anon_get_ptr(ahp, an_idx);
	kmutex_t *ahmpages = NULL;
	page_t *pp;
	pgcnt_t pgcnt = page_get_pagecnt(szc);
	pgcnt_t i;
	struct vnode *vp;
	anoff_t off;
	kmutex_t *ahm;
#ifdef DEBUG
	int refcnt = 1;
#endif

	ASSERT(szc != 0);
	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
	ASSERT(IS_P2ALIGNED(an_idx, pgcnt));
	ASSERT(an_idx < ahp->size);

	if (ahp->size - an_idx < pgcnt) {
		/*
		 * In case of shared mappings total anon map size may not be
		 * the largest page size aligned.
		 */
		pgcnt = ahp->size - an_idx;
	}

	VM_STAT_ADD(anonvmstats.decrefpages[0]);

	/*
	 * Sample the root slot's refcnt under its anonpages hash lock.
	 * The lock is kept (ahmpages != NULL) only when the region is
	 * shared, i.e. the root refcnt is greater than 1.
	 */
	if (ap != NULL) {
		ahmpages = APH_MUTEX(ap->an_vp, ap->an_off);
		mutex_enter(ahmpages);
		ASSERT((refcnt = ap->an_refcnt) != 0);
		VM_STAT_ADD(anonvmstats.decrefpages[1]);
		if (ap->an_refcnt == 1) {
			VM_STAT_ADD(anonvmstats.decrefpages[2]);
			ASSERT(!anon_share(ahp, an_idx, pgcnt));
			mutex_exit(ahmpages);
			ahmpages = NULL;
		}
	}

	i = 0;
	while (i < pgcnt) {
		if ((ap = anon_get_ptr(ahp, an_idx + i)) == NULL) {
			ASSERT(refcnt == 1 && ahmpages == NULL);
			i++;
			continue;
		}
		ASSERT(ap->an_refcnt == refcnt);
		ASSERT(ahmpages != NULL || ap->an_refcnt == 1);
		ASSERT(ahmpages == NULL || ap->an_refcnt > 1);

		if (ahmpages == NULL) {
			/* Unshared region: this is the last reference. */
			swap_xlate(ap, &vp, &off);
			pp = page_lookup(vp, (u_offset_t)off, SE_EXCL);
			if (pp == NULL || pp->p_szc == 0) {
				/* No page, or a PAGESIZE page: free one slot. */
				VM_STAT_ADD(anonvmstats.decrefpages[3]);
				ahm = AH_MUTEX(ap->an_vp, ap->an_off);
				(void) anon_set_ptr(ahp, an_idx + i, NULL,
				    ANON_SLEEP);
				mutex_enter(ahm);
				ap->an_refcnt--;
				ASSERT(ap->an_refcnt == 0);
				anon_rmhash(ap);
				if (ap->an_pvp)
					swap_phys_free(ap->an_pvp, ap->an_poff,
					    PAGESIZE);
				mutex_exit(ahm);
				/*
				 * The page is looked up a second time when
				 * the first lookup found none.
				 */
				if (pp == NULL) {
					pp = page_lookup(vp, (u_offset_t)off,
					    SE_EXCL);
					ASSERT(pp == NULL || pp->p_szc == 0);
				}
				if (pp != NULL) {
					VM_STAT_ADD(anonvmstats.decrefpages[4]);
					/*LINTED*/
					VN_DISPOSE(pp, B_INVAL, 0, kcred);
				}
				kmem_cache_free(anon_cache, ap);
				ANI_ADD(1);
				i++;
			} else {
				/*
				 * Large page: gather all of its constituent
				 * pages (exclusively locked) in ppa[], free
				 * the matching anon slots, then get rid of
				 * the pages as a group.
				 */
				pgcnt_t j;
				pgcnt_t curpgcnt =
				    page_get_pagecnt(pp->p_szc);
				size_t ppasize = curpgcnt * sizeof (page_t *);
				page_t **ppa = kmem_alloc(ppasize, KM_SLEEP);
				int dispose = 0;

				VM_STAT_ADD(anonvmstats.decrefpages[5]);

				ASSERT(pp->p_szc <= szc);
				ASSERT(IS_P2ALIGNED(curpgcnt, curpgcnt));
				ASSERT(IS_P2ALIGNED(i, curpgcnt));
				ASSERT(i + curpgcnt <= pgcnt);
				ASSERT(!(page_pptonum(pp) & (curpgcnt - 1)));
				ppa[0] = pp;
				for (j = i + 1; j < i + curpgcnt; j++) {
					ap = anon_get_ptr(ahp, an_idx + j);
					ASSERT(ap != NULL &&
					    ap->an_refcnt == 1);
					swap_xlate(ap, &vp, &off);
					pp = page_lookup(vp, (u_offset_t)off,
					    SE_EXCL);
					if (pp == NULL)
						panic("anon_decref_pages: "
						    "no page");

					(void) hat_pageunload(pp,
					    HAT_FORCE_PGUNLOAD);
					ASSERT(pp->p_szc == ppa[0]->p_szc);
					ASSERT(page_pptonum(pp) - 1 ==
					    page_pptonum(ppa[j - i - 1]));
					ppa[j - i] = pp;
					/*
					 * Remember whether any backing vnode
					 * has a VOPNAME_DISPOSE operation
					 * other than fs_dispose; if so the
					 * pages are passed to VN_DISPOSE one
					 * by one below.
					 */
					if (ap->an_pvp != NULL &&
					    !vn_matchopval(ap->an_pvp,
					    VOPNAME_DISPOSE,
					    (fs_generic_func_p)fs_dispose))
						dispose = 1;
				}
				for (j = i; j < i + curpgcnt; j++) {
					ap = anon_get_ptr(ahp, an_idx + j);
					ASSERT(ap != NULL &&
					    ap->an_refcnt == 1);
					ahm = AH_MUTEX(ap->an_vp, ap->an_off);
					(void) anon_set_ptr(ahp, an_idx + j,
					    NULL, ANON_SLEEP);
					mutex_enter(ahm);
					ap->an_refcnt--;
					ASSERT(ap->an_refcnt == 0);
					anon_rmhash(ap);
					if (ap->an_pvp)
						swap_phys_free(ap->an_pvp,
						    ap->an_poff, PAGESIZE);
					mutex_exit(ahm);
					kmem_cache_free(anon_cache, ap);
					ANI_ADD(1);
				}
				if (!dispose) {
					VM_STAT_ADD(anonvmstats.decrefpages[6]);
					page_destroy_pages(ppa[0]);
				} else {
					VM_STAT_ADD(anonvmstats.decrefpages[7]);
					/*
					 * Demote every constituent page to
					 * p_szc 0 before disposing of them
					 * individually.
					 */
					for (j = 0; j < curpgcnt; j++) {
						ASSERT(PAGE_EXCL(ppa[j]));
						ppa[j]->p_szc = 0;
					}
					for (j = 0; j < curpgcnt; j++) {
						ASSERT(!hat_page_is_mapped(
						    ppa[j]));
						/*LINTED*/
						VN_DISPOSE(ppa[j], B_INVAL, 0,
						    kcred);
					}
				}
				kmem_free(ppa, ppasize);
				i += curpgcnt;
			}
		} else {
			/*
			 * Shared region: just drop our reference; the slot
			 * and its page stay with the other holders.
			 */
			VM_STAT_ADD(anonvmstats.decrefpages[8]);
			(void) anon_set_ptr(ahp, an_idx + i, NULL, ANON_SLEEP);
			ahm = AH_MUTEX(ap->an_vp, ap->an_off);
			mutex_enter(ahm);
			ap->an_refcnt--;
			mutex_exit(ahm);
			i++;
		}
	}

	if (ahmpages != NULL) {
		mutex_exit(ahmpages);
	}
}

/*
 * Duplicate references to size bytes worth of anon pages.
 * Used when duplicating a segment that contains private anon pages.
 * This code assumes that procedure calling this one has already used
 * hat_chgprot() to disable write access to the range of addresses
 * that *old actually refers to.
1330 */ 1331 void 1332 anon_dup(struct anon_hdr *old, ulong_t old_idx, struct anon_hdr *new, 1333 ulong_t new_idx, size_t size) 1334 { 1335 spgcnt_t npages; 1336 kmutex_t *ahm; 1337 struct anon *ap; 1338 ulong_t off; 1339 ulong_t index; 1340 1341 npages = btopr(size); 1342 while (npages > 0) { 1343 index = old_idx; 1344 if ((ap = anon_get_next_ptr(old, &index)) == NULL) 1345 break; 1346 1347 ASSERT(!ANON_ISBUSY(anon_get_slot(old, index))); 1348 off = index - old_idx; 1349 npages -= off; 1350 if (npages <= 0) 1351 break; 1352 1353 (void) anon_set_ptr(new, new_idx + off, ap, ANON_SLEEP); 1354 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1355 1356 mutex_enter(ahm); 1357 ap->an_refcnt++; 1358 mutex_exit(ahm); 1359 1360 off++; 1361 new_idx += off; 1362 old_idx += off; 1363 npages--; 1364 } 1365 } 1366 1367 /* 1368 * Just like anon_dup but also guarantees there are no holes (unallocated anon 1369 * slots) within any large page region. That means if a large page region is 1370 * empty in the old array it will skip it. If there are 1 or more valid slots 1371 * in the large page region of the old array it will make sure to fill in any 1372 * unallocated ones and also copy them to the new array. If noalloc is 1 large 1373 * page region should either have no valid anon slots or all slots should be 1374 * valid. 
1375 */ 1376 void 1377 anon_dup_fill_holes( 1378 struct anon_hdr *old, 1379 ulong_t old_idx, 1380 struct anon_hdr *new, 1381 ulong_t new_idx, 1382 size_t size, 1383 uint_t szc, 1384 int noalloc) 1385 { 1386 struct anon *ap; 1387 spgcnt_t npages; 1388 kmutex_t *ahm, *ahmpages = NULL; 1389 pgcnt_t pgcnt, i; 1390 ulong_t index, off; 1391 #ifdef DEBUG 1392 int refcnt; 1393 #endif 1394 1395 ASSERT(szc != 0); 1396 pgcnt = page_get_pagecnt(szc); 1397 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1398 npages = btopr(size); 1399 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1400 ASSERT(IS_P2ALIGNED(old_idx, pgcnt)); 1401 1402 VM_STAT_ADD(anonvmstats.dupfillholes[0]); 1403 1404 while (npages > 0) { 1405 index = old_idx; 1406 1407 /* 1408 * Find the next valid slot. 1409 */ 1410 if (anon_get_next_ptr(old, &index) == NULL) 1411 break; 1412 1413 ASSERT(!ANON_ISBUSY(anon_get_slot(old, index))); 1414 /* 1415 * Now backup index to the beginning of the 1416 * current large page region of the old array. 1417 */ 1418 index = P2ALIGN(index, pgcnt); 1419 off = index - old_idx; 1420 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1421 npages -= off; 1422 if (npages <= 0) 1423 break; 1424 1425 /* 1426 * Fill and copy a large page regions worth 1427 * of anon slots. 1428 */ 1429 for (i = 0; i < pgcnt; i++) { 1430 if ((ap = anon_get_ptr(old, index + i)) == NULL) { 1431 if (noalloc) { 1432 panic("anon_dup_fill_holes: " 1433 "empty anon slot\n"); 1434 } 1435 VM_STAT_ADD(anonvmstats.dupfillholes[1]); 1436 ap = anon_alloc(NULL, 0); 1437 (void) anon_set_ptr(old, index + i, ap, 1438 ANON_SLEEP); 1439 } else if (i == 0) { 1440 /* 1441 * make the increment of all refcnts of all 1442 * anon slots of a large page appear atomic by 1443 * getting an anonpages_hash_lock for the 1444 * first anon slot of a large page. 
1445 */ 1446 VM_STAT_ADD(anonvmstats.dupfillholes[2]); 1447 1448 ahmpages = APH_MUTEX(ap->an_vp, ap->an_off); 1449 mutex_enter(ahmpages); 1450 /*LINTED*/ 1451 ASSERT(refcnt = ap->an_refcnt); 1452 1453 VM_STAT_COND_ADD(ap->an_refcnt > 1, 1454 anonvmstats.dupfillholes[3]); 1455 } 1456 (void) anon_set_ptr(new, new_idx + off + i, ap, 1457 ANON_SLEEP); 1458 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1459 mutex_enter(ahm); 1460 ASSERT(ahmpages != NULL || ap->an_refcnt == 1); 1461 ASSERT(i == 0 || ahmpages == NULL || 1462 refcnt == ap->an_refcnt); 1463 ap->an_refcnt++; 1464 mutex_exit(ahm); 1465 } 1466 if (ahmpages != NULL) { 1467 mutex_exit(ahmpages); 1468 ahmpages = NULL; 1469 } 1470 off += pgcnt; 1471 new_idx += off; 1472 old_idx += off; 1473 npages -= pgcnt; 1474 } 1475 } 1476 1477 /* 1478 * Used when a segment with a vnode changes szc. similarly to 1479 * anon_dup_fill_holes() makes sure each large page region either has no anon 1480 * slots or all of them. but new slots are created by COWing the file 1481 * pages. on entrance no anon slots should be shared. 1482 */ 1483 int 1484 anon_fill_cow_holes( 1485 struct seg *seg, 1486 caddr_t addr, 1487 struct anon_hdr *ahp, 1488 ulong_t an_idx, 1489 struct vnode *vp, 1490 u_offset_t vp_off, 1491 size_t size, 1492 uint_t szc, 1493 uint_t prot, 1494 struct vpage vpage[], 1495 struct cred *cred) 1496 { 1497 struct anon *ap; 1498 spgcnt_t npages; 1499 pgcnt_t pgcnt, i; 1500 ulong_t index, off; 1501 int err = 0; 1502 int pageflags = 0; 1503 1504 ASSERT(szc != 0); 1505 pgcnt = page_get_pagecnt(szc); 1506 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1507 npages = btopr(size); 1508 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1509 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1510 1511 while (npages > 0) { 1512 index = an_idx; 1513 1514 /* 1515 * Find the next valid slot. 
1516 */ 1517 if (anon_get_next_ptr(ahp, &index) == NULL) { 1518 break; 1519 } 1520 1521 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1522 /* 1523 * Now backup index to the beginning of the 1524 * current large page region of the anon array. 1525 */ 1526 index = P2ALIGN(index, pgcnt); 1527 off = index - an_idx; 1528 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1529 npages -= off; 1530 if (npages <= 0) 1531 break; 1532 an_idx += off; 1533 vp_off += ptob(off); 1534 addr += ptob(off); 1535 if (vpage != NULL) { 1536 vpage += off; 1537 } 1538 1539 for (i = 0; i < pgcnt; i++, an_idx++, vp_off += PAGESIZE) { 1540 if ((ap = anon_get_ptr(ahp, an_idx)) == NULL) { 1541 page_t *pl[1 + 1]; 1542 page_t *pp; 1543 1544 err = VOP_GETPAGE(vp, vp_off, PAGESIZE, NULL, 1545 pl, PAGESIZE, seg, addr, S_READ, cred, 1546 NULL); 1547 if (err) { 1548 break; 1549 } 1550 if (vpage != NULL) { 1551 prot = VPP_PROT(vpage); 1552 pageflags = VPP_ISPPLOCK(vpage) ? 1553 LOCK_PAGE : 0; 1554 } 1555 pp = anon_private(&ap, seg, addr, prot, pl[0], 1556 pageflags, cred); 1557 if (pp == NULL) { 1558 err = ENOMEM; 1559 break; 1560 } 1561 (void) anon_set_ptr(ahp, an_idx, ap, 1562 ANON_SLEEP); 1563 page_unlock(pp); 1564 } 1565 ASSERT(ap->an_refcnt == 1); 1566 addr += PAGESIZE; 1567 if (vpage != NULL) { 1568 vpage++; 1569 } 1570 } 1571 npages -= pgcnt; 1572 } 1573 1574 return (err); 1575 } 1576 1577 /* 1578 * Free a group of "size" anon pages, size in bytes, 1579 * and clear out the pointers to the anon entries. 
1580 */ 1581 void 1582 anon_free(struct anon_hdr *ahp, ulong_t index, size_t size) 1583 { 1584 spgcnt_t npages; 1585 struct anon *ap; 1586 ulong_t old; 1587 1588 npages = btopr(size); 1589 1590 while (npages > 0) { 1591 old = index; 1592 if ((ap = anon_get_next_ptr(ahp, &index)) == NULL) 1593 break; 1594 1595 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1596 npages -= index - old; 1597 if (npages <= 0) 1598 break; 1599 1600 (void) anon_set_ptr(ahp, index, NULL, ANON_SLEEP); 1601 anon_decref(ap); 1602 /* 1603 * Bump index and decrement page count 1604 */ 1605 index++; 1606 npages--; 1607 } 1608 } 1609 1610 void 1611 anon_free_pages( 1612 struct anon_hdr *ahp, 1613 ulong_t an_idx, 1614 size_t size, 1615 uint_t szc) 1616 { 1617 spgcnt_t npages; 1618 pgcnt_t pgcnt; 1619 ulong_t index, off; 1620 1621 ASSERT(szc != 0); 1622 pgcnt = page_get_pagecnt(szc); 1623 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1624 npages = btopr(size); 1625 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1626 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1627 ASSERT(an_idx < ahp->size); 1628 1629 VM_STAT_ADD(anonvmstats.freepages[0]); 1630 1631 while (npages > 0) { 1632 index = an_idx; 1633 1634 /* 1635 * Find the next valid slot. 1636 */ 1637 if (anon_get_next_ptr(ahp, &index) == NULL) 1638 break; 1639 1640 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1641 /* 1642 * Now backup index to the beginning of the 1643 * current large page region of the old array. 
1644 */ 1645 index = P2ALIGN(index, pgcnt); 1646 off = index - an_idx; 1647 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1648 npages -= off; 1649 if (npages <= 0) 1650 break; 1651 1652 anon_decref_pages(ahp, index, szc); 1653 1654 off += pgcnt; 1655 an_idx += off; 1656 npages -= pgcnt; 1657 } 1658 } 1659 1660 /* 1661 * Make anonymous pages discardable 1662 */ 1663 void 1664 anon_disclaim(struct anon_map *amp, ulong_t index, size_t size) 1665 { 1666 spgcnt_t npages = btopr(size); 1667 struct anon *ap; 1668 struct vnode *vp; 1669 anoff_t off; 1670 page_t *pp, *root_pp; 1671 kmutex_t *ahm; 1672 pgcnt_t pgcnt; 1673 ulong_t old_idx, idx, i; 1674 struct anon_hdr *ahp = amp->ahp; 1675 anon_sync_obj_t cookie; 1676 1677 ASSERT(RW_READ_HELD(&->a_rwlock)); 1678 pgcnt = 1; 1679 for (; npages > 0; index = (pgcnt == 1) ? index + 1 : 1680 P2ROUNDUP(index + 1, pgcnt), npages -= pgcnt) { 1681 1682 /* 1683 * get anon pointer and index for the first valid entry 1684 * in the anon list, starting from "index" 1685 */ 1686 old_idx = index; 1687 if ((ap = anon_get_next_ptr(ahp, &index)) == NULL) 1688 break; 1689 1690 /* 1691 * decrement npages by number of NULL anon slots we skipped 1692 */ 1693 npages -= index - old_idx; 1694 if (npages <= 0) 1695 break; 1696 1697 anon_array_enter(amp, index, &cookie); 1698 ap = anon_get_ptr(ahp, index); 1699 ASSERT(ap != NULL); 1700 1701 /* 1702 * Get anonymous page and try to lock it SE_EXCL; 1703 * if we couldn't grab the lock we skip to next page. 1704 */ 1705 swap_xlate(ap, &vp, &off); 1706 pp = page_lookup_nowait(vp, (u_offset_t)off, SE_EXCL); 1707 if (pp == NULL) { 1708 segadvstat.MADV_FREE_miss.value.ul++; 1709 pgcnt = 1; 1710 anon_array_exit(&cookie); 1711 continue; 1712 } 1713 pgcnt = page_get_pagecnt(pp->p_szc); 1714 1715 /* 1716 * we cannot free a page which is permanently locked. 1717 * The page_struct_lock need not be acquired to examine 1718 * these fields since the page has an "exclusive" lock. 
1719 */ 1720 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1721 page_unlock(pp); 1722 segadvstat.MADV_FREE_miss.value.ul++; 1723 anon_array_exit(&cookie); 1724 continue; 1725 } 1726 1727 ahm = AH_MUTEX(vp, off); 1728 mutex_enter(ahm); 1729 ASSERT(ap->an_refcnt != 0); 1730 /* 1731 * skip this one if copy-on-write is not yet broken. 1732 */ 1733 if (ap->an_refcnt > 1) { 1734 mutex_exit(ahm); 1735 page_unlock(pp); 1736 segadvstat.MADV_FREE_miss.value.ul++; 1737 anon_array_exit(&cookie); 1738 continue; 1739 } 1740 1741 if (pp->p_szc == 0) { 1742 pgcnt = 1; 1743 1744 /* 1745 * free swap slot; 1746 */ 1747 if (ap->an_pvp) { 1748 swap_phys_free(ap->an_pvp, ap->an_poff, 1749 PAGESIZE); 1750 ap->an_pvp = NULL; 1751 ap->an_poff = 0; 1752 } 1753 mutex_exit(ahm); 1754 segadvstat.MADV_FREE_hit.value.ul++; 1755 1756 /* 1757 * while we are at it, unload all the translations 1758 * and attempt to free the page. 1759 */ 1760 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1761 /*LINTED: constant in conditional context */ 1762 VN_DISPOSE(pp, B_FREE, 0, kcred); 1763 anon_array_exit(&cookie); 1764 continue; 1765 } 1766 1767 pgcnt = page_get_pagecnt(pp->p_szc); 1768 if (!IS_P2ALIGNED(index, pgcnt) || npages < pgcnt) { 1769 if (!page_try_demote_pages(pp)) { 1770 mutex_exit(ahm); 1771 page_unlock(pp); 1772 segadvstat.MADV_FREE_miss.value.ul++; 1773 anon_array_exit(&cookie); 1774 continue; 1775 } else { 1776 pgcnt = 1; 1777 if (ap->an_pvp) { 1778 swap_phys_free(ap->an_pvp, 1779 ap->an_poff, PAGESIZE); 1780 ap->an_pvp = NULL; 1781 ap->an_poff = 0; 1782 } 1783 mutex_exit(ahm); 1784 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1785 /*LINTED*/ 1786 VN_DISPOSE(pp, B_FREE, 0, kcred); 1787 segadvstat.MADV_FREE_hit.value.ul++; 1788 anon_array_exit(&cookie); 1789 continue; 1790 } 1791 } 1792 mutex_exit(ahm); 1793 root_pp = pp; 1794 1795 /* 1796 * try to lock remaining pages 1797 */ 1798 for (idx = 1; idx < pgcnt; idx++) { 1799 pp++; 1800 if (!page_trylock(pp, SE_EXCL)) 1801 break; 1802 if 
(pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1803 page_unlock(pp); 1804 break; 1805 } 1806 } 1807 1808 if (idx == pgcnt) { 1809 for (i = 0; i < pgcnt; i++) { 1810 ap = anon_get_ptr(ahp, index + i); 1811 if (ap == NULL) 1812 break; 1813 swap_xlate(ap, &vp, &off); 1814 ahm = AH_MUTEX(vp, off); 1815 mutex_enter(ahm); 1816 ASSERT(ap->an_refcnt != 0); 1817 1818 /* 1819 * skip this one if copy-on-write 1820 * is not yet broken. 1821 */ 1822 if (ap->an_refcnt > 1) { 1823 mutex_exit(ahm); 1824 goto skiplp; 1825 } 1826 if (ap->an_pvp) { 1827 swap_phys_free(ap->an_pvp, 1828 ap->an_poff, PAGESIZE); 1829 ap->an_pvp = NULL; 1830 ap->an_poff = 0; 1831 } 1832 mutex_exit(ahm); 1833 } 1834 page_destroy_pages(root_pp); 1835 segadvstat.MADV_FREE_hit.value.ul += pgcnt; 1836 anon_array_exit(&cookie); 1837 continue; 1838 } 1839 skiplp: 1840 segadvstat.MADV_FREE_miss.value.ul += pgcnt; 1841 for (i = 0, pp = root_pp; i < idx; pp++, i++) 1842 page_unlock(pp); 1843 anon_array_exit(&cookie); 1844 } 1845 } 1846 1847 /* 1848 * Return the kept page(s) and protections back to the segment driver. 1849 */ 1850 int 1851 anon_getpage( 1852 struct anon **app, 1853 uint_t *protp, 1854 page_t *pl[], 1855 size_t plsz, 1856 struct seg *seg, 1857 caddr_t addr, 1858 enum seg_rw rw, 1859 struct cred *cred) 1860 { 1861 page_t *pp; 1862 struct anon *ap = *app; 1863 struct vnode *vp; 1864 anoff_t off; 1865 int err; 1866 kmutex_t *ahm; 1867 1868 swap_xlate(ap, &vp, &off); 1869 1870 /* 1871 * Lookup the page. If page is being paged in, 1872 * wait for it to finish as we must return a list of 1873 * pages since this routine acts like the VOP_GETPAGE 1874 * routine does. 
1875 */ 1876 if (pl != NULL && (pp = page_lookup(vp, (u_offset_t)off, SE_SHARED))) { 1877 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1878 mutex_enter(ahm); 1879 if (ap->an_refcnt == 1) 1880 *protp = PROT_ALL; 1881 else 1882 *protp = PROT_ALL & ~PROT_WRITE; 1883 mutex_exit(ahm); 1884 pl[0] = pp; 1885 pl[1] = NULL; 1886 return (0); 1887 } 1888 1889 /* 1890 * Simply treat it as a vnode fault on the anon vp. 1891 */ 1892 1893 TRACE_3(TR_FAC_VM, TR_ANON_GETPAGE, 1894 "anon_getpage:seg %x addr %x vp %x", 1895 seg, addr, vp); 1896 1897 err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, protp, pl, plsz, 1898 seg, addr, rw, cred, NULL); 1899 1900 if (err == 0 && pl != NULL) { 1901 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1902 mutex_enter(ahm); 1903 if (ap->an_refcnt != 1) 1904 *protp &= ~PROT_WRITE; /* make read-only */ 1905 mutex_exit(ahm); 1906 } 1907 return (err); 1908 } 1909 1910 /* 1911 * Creates or returns kept pages to the segment driver. returns -1 if a large 1912 * page cannot be allocated. returns -2 if some other process has allocated a 1913 * larger page. 1914 * 1915 * For cowfault it will allocate any size pages to fill the requested area to 1916 * avoid partially overwriting anon slots (i.e. sharing only some of the anon 1917 * slots within a large page with other processes). This policy greatly 1918 * simplifies large page freeing (which is only freed when all anon slot 1919 * refcnts are 0). 
*/
int
anon_map_getpages(
	struct anon_map *amp,
	ulong_t start_idx,
	uint_t szc,
	struct seg *seg,
	caddr_t addr,
	uint_t prot,
	uint_t *protp,
	page_t *ppa[],
	uint_t *ppa_szc,
	struct vpage vpage[],
	enum seg_rw rw,
	int brkcow,
	int anypgsz,
	int pgflags,
	struct cred *cred)
{
	pgcnt_t pgcnt;
	struct anon *ap;
	struct vnode *vp;
	anoff_t off;
	page_t *pp, *pl[2], *conpp = NULL;
	caddr_t vaddr;
	ulong_t pg_idx, an_idx, i;
	spgcnt_t nreloc = 0;
	int prealloc = 1;
	int err, slotcreate;
	uint_t vpprot;
	int upsize = (szc < seg->s_szc);

	/*
	 * Contract, as implemented below:
	 *   - on success (0) ppa[] holds the page_get_pagecnt(szc) pages
	 *     of the region starting at start_idx, and *protp is PROT_ALL
	 *     with PROT_WRITE removed if any slot has a refcnt > 1;
	 *   - a positive return is an errno (from anon_getpage(),
	 *     swap_getconpage(), or ENOMEM when anon_zero() fails);
	 *   - -1 means a large page could not be allocated or relocated
	 *     and no cow of the region is required;
	 *   - -2 means a page larger than szc already exists and
	 *     szc < seg->s_szc; *ppa_szc is then set to the size to retry
	 *     with.
	 * When brkcow is set and the pages are not writable, the whole
	 * region is privatized via anon_map_privatepages().
	 */

#if !defined(__i386) && !defined(__amd64)
	ASSERT(seg->s_szc != 0);
#endif
	ASSERT(szc <= seg->s_szc);
	ASSERT(ppa_szc != NULL);
	ASSERT(rw != S_CREATE);

	*protp = PROT_ALL;

	VM_STAT_ADD(anonvmstats.getpages[0]);

	/* PAGESIZE request: a single slot, no large page handling. */
	if (szc == 0) {
		VM_STAT_ADD(anonvmstats.getpages[1]);
		if ((ap = anon_get_ptr(amp->ahp, start_idx)) != NULL) {
			err = anon_getpage(&ap, protp, pl, PAGESIZE, seg,
			    addr, rw, cred);
			if (err)
				return (err);
			ppa[0] = pl[0];
			if (brkcow == 0 || (*protp & PROT_WRITE)) {
				VM_STAT_ADD(anonvmstats.getpages[2]);
				if (ppa[0]->p_szc != 0 && upsize) {
					VM_STAT_ADD(anonvmstats.getpages[3]);
					*ppa_szc = MIN(ppa[0]->p_szc,
					    seg->s_szc);
					page_unlock(ppa[0]);
					return (-2);
				}
				return (0);
			}
			panic("anon_map_getpages: cowfault for szc 0");
		} else {
			VM_STAT_ADD(anonvmstats.getpages[4]);
			ppa[0] = anon_zero(seg, addr, &ap, cred);
			if (ppa[0] == NULL)
				return (ENOMEM);
			(void) anon_set_ptr(amp->ahp, start_idx, ap,
			    ANON_SLEEP);
			return (0);
		}
	}

	pgcnt = page_get_pagecnt(szc);
	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
	ASSERT(IS_P2ALIGNED(start_idx, pgcnt));

	/*
	 * First we check for the case that the requested large
	 * page or larger page already exists in the system.
	 * Actually we only check if the first constituent page
	 * exists and only preallocate if it's not found.
	 */
	ap = anon_get_ptr(amp->ahp, start_idx);
	if (ap) {
		uint_t pszc;
		swap_xlate(ap, &vp, &off);
		if (page_exists_forreal(vp, (u_offset_t)off, &pszc)) {
			if (pszc > szc && upsize) {
				*ppa_szc = MIN(pszc, seg->s_szc);
				return (-2);
			}
			if (pszc >= szc) {
				prealloc = 0;
			}
		}
	}

	VM_STAT_COND_ADD(prealloc == 0, anonvmstats.getpages[5]);
	VM_STAT_COND_ADD(prealloc != 0, anonvmstats.getpages[6]);

top:
	/*
	 * If a smaller page or no page at all was found,
	 * grab a large page off the freelist.
	 */
	if (prealloc) {
		ASSERT(conpp == NULL);
		if (page_alloc_pages(anon_vp, seg, addr, NULL, ppa,
		    szc, 0, pgflags) != 0) {
			VM_STAT_ADD(anonvmstats.getpages[7]);
			if (brkcow == 0 || szc < seg->s_szc ||
			    !anon_szcshare(amp->ahp, start_idx)) {
				/*
				 * If the refcnt's of all anon slots are <= 1
				 * they can't increase since we are holding
				 * the address space's lock. So segvn can
				 * safely decrease szc without risking to
				 * generate a cow fault for the region smaller
				 * than the segment's largest page size.
				 */
				VM_STAT_ADD(anonvmstats.getpages[8]);
				return (-1);
			}
		docow:
			/*
			 * This is a cow fault. Copy away the entire 1 large
			 * page region of this segment.
			 */
			if (szc != seg->s_szc)
				panic("anon_map_getpages: cowfault for szc %d",
				    szc);
			vaddr = addr;
			for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt;
			    pg_idx++, an_idx++, vaddr += PAGESIZE) {
				if ((ap = anon_get_ptr(amp->ahp, an_idx)) !=
				    NULL) {
					err = anon_getpage(&ap, &vpprot, pl,
					    PAGESIZE, seg, vaddr, rw, cred);
					if (err) {
						/*
						 * Unlock the pages collected
						 * so far before failing.
						 */
						for (i = 0; i < pg_idx; i++) {
							if ((pp = ppa[i]) !=
							    NULL)
								page_unlock(pp);
						}
						return (err);
					}
					ppa[pg_idx] = pl[0];
				} else {
					/*
					 * Since this is a cowfault we know
					 * that this address space has a
					 * parent or children which means
					 * anon_dup_fill_holes() has initialized
					 * all anon slots within a large page
					 * region that had at least one anon
					 * slot at the time of fork().
					 */
					panic("anon_map_getpages: "
					    "cowfault but anon slot is empty");
				}
			}
			VM_STAT_ADD(anonvmstats.getpages[9]);
			*protp = PROT_ALL;
			return (anon_map_privatepages(amp, start_idx, szc, seg,
			    addr, prot, ppa, vpage, anypgsz, pgflags, cred));
		}
	}

	VM_STAT_ADD(anonvmstats.getpages[10]);

	/*
	 * Walk the region one constituent page at a time, creating anon
	 * slots where none exist and collecting the pages in ppa[].
	 */
	an_idx = start_idx;
	pg_idx = 0;
	vaddr = addr;
	while (pg_idx < pgcnt) {
		slotcreate = 0;
		if ((ap = anon_get_ptr(amp->ahp, an_idx)) == NULL) {
			VM_STAT_ADD(anonvmstats.getpages[11]);
			/*
			 * For us to have decided not to preallocate
			 * would have meant that a large page
			 * was found. Which also means that all of the
			 * anon slots for that page would have been
			 * already created for us.
			 */
			if (prealloc == 0)
				panic("anon_map_getpages: prealloc = 0");

			slotcreate = 1;
			ap = anon_alloc(NULL, 0);
		}
		swap_xlate(ap, &vp, &off);

		/*
		 * Now setup our preallocated page to pass down
		 * to swap_getpage().
		 */
		if (prealloc) {
			ASSERT(ppa[pg_idx]->p_szc == szc);
			conpp = ppa[pg_idx];
		}
		ASSERT(prealloc || conpp == NULL);

		/*
		 * If we just created this anon slot then call
		 * with S_CREATE to prevent doing IO on the page.
		 * Similar to the anon_zero case.
		 */
		err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE,
		    NULL, pl, PAGESIZE, conpp, ppa_szc, &nreloc, seg, vaddr,
		    slotcreate == 1 ? S_CREATE : rw, cred);

		if (err) {
			ASSERT(err != -2 || upsize);
			VM_STAT_ADD(anonvmstats.getpages[12]);
			ASSERT(slotcreate == 0);
			goto io_err;
		}

		pp = pl[0];

		/*
		 * The page found has a different size than the one we
		 * expected.  A larger page is reported with -2; for a
		 * smaller one, start over with a preallocated large page.
		 */
		if (pp->p_szc < szc || (pp->p_szc > szc && upsize)) {
			VM_STAT_ADD(anonvmstats.getpages[13]);
			ASSERT(slotcreate == 0);
			ASSERT(prealloc == 0);
			ASSERT(pg_idx == 0);
			if (pp->p_szc > szc) {
				ASSERT(upsize);
				*ppa_szc = MIN(pp->p_szc, seg->s_szc);
				page_unlock(pp);
				VM_STAT_ADD(anonvmstats.getpages[14]);
				return (-2);
			}
			page_unlock(pp);
			prealloc = 1;
			goto top;
		}

		/*
		 * If we decided to preallocate but VOP_GETPAGE
		 * found a page in the system that satisfies our
		 * request then free up our preallocated large page
		 * and continue looping across the existing large
		 * page via VOP_GETPAGE.
		 */
		if (prealloc && pp != ppa[pg_idx]) {
			VM_STAT_ADD(anonvmstats.getpages[15]);
			ASSERT(slotcreate == 0);
			ASSERT(pg_idx == 0);
			conpp = NULL;
			prealloc = 0;
			page_free_pages(ppa[0]);
		}

		if (prealloc && nreloc > 1) {
			/*
			 * we have relocated out of a smaller large page.
			 * skip npgs - 1 iterations and continue which will
			 * increment by one the loop indices.
			 */
			spgcnt_t npgs = nreloc;

			VM_STAT_ADD(anonvmstats.getpages[16]);

			ASSERT(pp == ppa[pg_idx]);
			ASSERT(slotcreate == 0);
			ASSERT(pg_idx + npgs <= pgcnt);
			if ((*protp & PROT_WRITE) &&
			    anon_share(amp->ahp, an_idx, npgs)) {
				*protp &= ~PROT_WRITE;
			}
			pg_idx += npgs;
			an_idx += npgs;
			vaddr += PAGESIZE * npgs;
			continue;
		}

		VM_STAT_ADD(anonvmstats.getpages[17]);

		/*
		 * Anon_zero case.
		 */
		if (slotcreate) {
			ASSERT(prealloc);
			pagezero(pp, 0, PAGESIZE);
			CPU_STATS_ADD_K(vm, zfod, 1);
			hat_setrefmod(pp);
		}

		ASSERT(prealloc == 0 || ppa[pg_idx] == pp);
		ASSERT(prealloc != 0 || PAGE_SHARED(pp));
		ASSERT(prealloc == 0 || PAGE_EXCL(pp));

		/*
		 * The constituent pages must be physically contiguous, of
		 * equal size, and the first one aligned to the large page.
		 */
		if (pg_idx > 0 &&
		    ((page_pptonum(pp) != page_pptonum(ppa[pg_idx - 1]) + 1) ||
		    (pp->p_szc != ppa[pg_idx - 1]->p_szc))) {
			panic("anon_map_getpages: unexpected page");
		} else if (pg_idx == 0 && (page_pptonum(pp) & (pgcnt - 1))) {
			panic("anon_map_getpages: unaligned page");
		}

		if (prealloc == 0) {
			ppa[pg_idx] = pp;
		}

		/* A shared slot makes the whole region read-only. */
		if (ap->an_refcnt > 1) {
			VM_STAT_ADD(anonvmstats.getpages[18]);
			*protp &= ~PROT_WRITE;
		}

		/*
		 * If this is a new anon slot then initialize
		 * the anon array entry.
		 */
		if (slotcreate) {
			(void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP);
		}
		pg_idx++;
		an_idx++;
		vaddr += PAGESIZE;
	}

	/*
	 * Since preallocated pages come off the freelist
	 * they are locked SE_EXCL. Simply downgrade and return.
	 */
	if (prealloc) {
		VM_STAT_ADD(anonvmstats.getpages[19]);
		conpp = NULL;
		for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
			page_downgrade(ppa[pg_idx]);
		}
	}
	ASSERT(conpp == NULL);

	if (brkcow == 0 || (*protp & PROT_WRITE)) {
		VM_STAT_ADD(anonvmstats.getpages[20]);
		return (0);
	}

	if (szc < seg->s_szc)
		panic("anon_map_getpages: cowfault for szc %d", szc);

	VM_STAT_ADD(anonvmstats.getpages[21]);

	*protp = PROT_ALL;
	return (anon_map_privatepages(amp, start_idx, szc, seg, addr, prot,
	    ppa, vpage, anypgsz, pgflags, cred));
io_err:
	/*
	 * We got an IO error somewhere in our large page.
	 * If we were using a preallocated page then just demote
	 * all the constituent pages that we've succeeded with so far
	 * to PAGESIZE pages and leave them in the system
	 * unlocked.
	 */

	ASSERT(err != -2 || ((pg_idx == 0) && upsize));

	VM_STAT_COND_ADD(err > 0, anonvmstats.getpages[22]);
	VM_STAT_COND_ADD(err == -1, anonvmstats.getpages[23]);
	VM_STAT_COND_ADD(err == -2, anonvmstats.getpages[24]);

	if (prealloc) {
		conpp = NULL;
		if (pg_idx > 0) {
			VM_STAT_ADD(anonvmstats.getpages[25]);
			for (i = 0; i < pgcnt; i++) {
				pp = ppa[i];
				ASSERT(PAGE_EXCL(pp));
				ASSERT(pp->p_szc == szc);
				pp->p_szc = 0;
			}
			for (i = 0; i < pg_idx; i++) {
				ASSERT(!hat_page_is_mapped(ppa[i]));
				page_unlock(ppa[i]);
			}
			/*
			 * Now free up the remaining unused constituent
			 * pages.
			 */
			while (pg_idx < pgcnt) {
				ASSERT(!hat_page_is_mapped(ppa[pg_idx]));
				page_free(ppa[pg_idx], 0);
				pg_idx++;
			}
		} else {
			/* Nothing was used yet: free the large page whole. */
			VM_STAT_ADD(anonvmstats.getpages[26]);
			page_free_pages(ppa[0]);
		}
	} else {
		VM_STAT_ADD(anonvmstats.getpages[27]);
		ASSERT(err > 0);
		for (i = 0; i < pg_idx; i++)
			page_unlock(ppa[i]);
	}
	ASSERT(conpp == NULL);
	if (err != -1)
		return (err);
	/*
	 * we are here because we failed to relocate.
	 */
	ASSERT(prealloc);
	if (brkcow == 0 || szc < seg->s_szc ||
	    !anon_szcshare(amp->ahp, start_idx)) {
		VM_STAT_ADD(anonvmstats.getpages[28]);
		return (-1);
	}
	VM_STAT_ADD(anonvmstats.getpages[29]);
	goto docow;
}


/*
 * Turn a reference to an object or shared anon page
 * into a private page with a copy of the data from the
 * original page which is always locked by the caller.
 * This routine unloads the translation and unlocks the
 * original page, if it isn't being stolen, before returning
 * to the caller.
 *
 * NOTE:  The original anon slot is not freed by this routine
 *	  It must be freed by the caller while holding the
 *	  "anon_map" lock to prevent races which can occur if
 *	  a process has multiple lwps in its address space.
/*
 * Turn a reference to an object or shared anon page
 * into a private page with a copy of the data from the
 * original page which is always locked by the caller.
 * This routine unloads the translation and unlocks the
 * original page, if it isn't being stolen, before returning
 * to the caller.
 *
 * On entry *app holds the old anon slot.  A new slot is allocated and
 * stored in *app.  On success the new page is returned after
 * page_downgrade() has been applied to it.  On failure *app is restored
 * to the old slot, the new slot is dropped with anon_decref(), opp is
 * unlocked and NULL is returned.
 *
 * oppflags:
 *	STEAL_PAGE - opp (asserted to be held exclusively) is renamed to the
 *		     new slot's <vp, off> instead of being copied.
 *	LOCK_PAGE  - the cowcnt/lckcnt claim of opp is transferred to the
 *		     new page via page_pp_useclaim().
 *
 * NOTE: The original anon slot is not freed by this routine
 *	 It must be freed by the caller while holding the
 *	 "anon_map" lock to prevent races which can occur if
 *	 a process has multiple lwps in its address space.
 */
page_t *
anon_private(
	struct anon **app,
	struct seg *seg,
	caddr_t addr,
	uint_t	prot,
	page_t *opp,
	int oppflags,
	struct cred *cred)
{
	struct anon *old = *app;
	struct anon *new;
	page_t *pp = NULL;
	struct vnode *vp;
	anoff_t off;
	page_t *anon_pl[1 + 1];
	int err;

	if (oppflags & STEAL_PAGE)
		ASSERT(PAGE_EXCL(opp));
	else
		ASSERT(PAGE_LOCKED(opp));

	CPU_STATS_ADD_K(vm, cow_fault, 1);

	/* Kernel probe */
	TNF_PROBE_1(anon_private, "vm pagefault", /* CSTYLED */,
		tnf_opaque,	address,	addr);

	/* Install the new slot right away; the "out" path puts the old one back. */
	*app = new = anon_alloc(NULL, 0);
	swap_xlate(new, &vp, &off);

	if (oppflags & STEAL_PAGE) {
		/* No copy needed: give the original page the new identity. */
		page_rename(opp, vp, (u_offset_t)off);
		pp = opp;
		TRACE_5(TR_FAC_VM, TR_ANON_PRIVATE,
		    "anon_private:seg %p addr %x pp %p vp %p off %lx",
		    seg, addr, pp, vp, off);
		hat_setmod(pp);

		/* bug 4026339 */
		page_downgrade(pp);
		return (pp);
	}

	/*
	 * Call the VOP_GETPAGE routine to create the page, thereby
	 * enabling the vnode driver to allocate any filesystem
	 * space (e.g., disk block allocation for UFS).  This also
	 * prevents more than one page from being added to the
	 * vnode at the same time.
	 */
	err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, NULL,
	    anon_pl, PAGESIZE, seg, addr, S_CREATE, cred, NULL);
	if (err)
		goto out;

	pp = anon_pl[0];

	/*
	 * If the original page was locked, we need to move the lock
	 * to the new page by transferring 'cowcnt/lckcnt' of the original
	 * page to 'cowcnt/lckcnt' of the new page.
	 *
	 * See Statement at the beginning of segvn_lockop() and
	 * comments in page_pp_useclaim() regarding the way
	 * cowcnts/lckcnts are handled.
	 *
	 * Also availrmem must be decremented up front for read only mapping
	 * before calling page_pp_useclaim. page_pp_useclaim will bump it back
	 * if availrmem did not need to be decremented after all.
	 */
	if (oppflags & LOCK_PAGE) {
		if ((prot & PROT_WRITE) == 0) {
			mutex_enter(&freemem_lock);
			if (availrmem > pages_pp_maximum) {
				availrmem--;
				pages_useclaim++;
			} else {
				/* Cannot take the claim; fail the request. */
				mutex_exit(&freemem_lock);
				goto out;
			}
			mutex_exit(&freemem_lock);
		}
		page_pp_useclaim(opp, pp, prot & PROT_WRITE);
	}

	/*
	 * Now copy the contents from the original page,
	 * which is locked and loaded in the MMU by
	 * the caller to prevent yet another page fault.
	 */
	/* XXX - should set mod bit in here */
	if (ppcopy(opp, pp) == 0) {
		/*
		 * Before ppcopy could handle UE or other faults, we
		 * would have panicked here, and still have no option
		 * but to do so now.
		 */
		panic("anon_private, ppcopy failed, opp = 0x%p, pp = 0x%p",
		    (void *)opp, (void *)pp);
	}

	hat_setrefmod(pp);		/* mark as modified */

	/*
	 * Unload the old translation.
	 */
	hat_unload(seg->s_as->a_hat, addr, PAGESIZE, HAT_UNLOAD);

	/*
	 * Free unmapped, unmodified original page.
	 * or release the lock on the original page,
	 * otherwise the process will sleep forever in
	 * anon_decref() waiting for the "exclusive" lock
	 * on the page.
	 */
	(void) page_release(opp, 1);

	/*
	 * we are done with page creation so downgrade the new
	 * page's selock to shared, this helps when multiple
	 * as_fault(...SOFTLOCK...) are done to the same
	 * page(aio)
	 */
	page_downgrade(pp);

	/*
	 * NOTE: The original anon slot must be freed by the
	 * caller while holding the "anon_map" lock, if we
	 * copied away from an anonymous page.
	 */
	return (pp);

out:
	/* Failure: undo the slot swap and drop every lock we hold. */
	*app = old;
	if (pp)
		page_unlock(pp);
	anon_decref(new);
	page_unlock(opp);
	return ((page_t *)NULL);
}

/*
 * Large page counterpart of anon_private(): give the anon map private
 * copies of all constituent pages of the szc-sized large page whose first
 * slot is start_idx.
 *
 * ppa[] holds the original constituent pages on entry.  For every slot a
 * new anon structure is allocated, the original page is copied into a new
 * page, the original page is unlocked, the old slot (if any) is dropped
 * with anon_decref() and ppa[] is updated to the new page.
 *
 * anypgsz == -1 disables the attempt to preallocate a large page with
 * page_alloc_pages(); PAGESIZE pages are then obtained one by one.
 *
 * vpage, when not NULL, is consulted with VPP_ISPPLOCK() to move page
 * lock claims to the new pages.
 *
 * Returns:
 *	0	- pages were copied (new pages have been downgraded), or the
 *		  first slot's an_refcnt was 1 and ppa[0] already has size
 *		  code szc, in which case nothing is copied.
 *	-1	- the first slot's an_refcnt was 1 but ppa[0] has a smaller
 *		  size code; all pages in ppa[] have been unlocked.
 *	ENOMEM	- availrmem could not be decremented for locked pages; all
 *		  non-NULL pages in ppa[] have been unlocked.
 */
int
anon_map_privatepages(
	struct anon_map *amp,
	ulong_t	start_idx,
	uint_t	szc,
	struct seg *seg,
	caddr_t addr,
	uint_t	prot,
	page_t	*ppa[],
	struct vpage vpage[],
	int anypgsz,
	int pgflags,
	struct cred *cred)
{
	pgcnt_t		pgcnt;
	struct vnode	*vp;
	anoff_t		off;
	page_t		*pl[2], *conpp = NULL;
	int		err;
	int		prealloc = 1;
	struct anon	*ap, *oldap;
	caddr_t		vaddr;
	page_t		*pplist, *pp;
	ulong_t		pg_idx, an_idx;
	spgcnt_t	nreloc = 0;
	int		pagelock = 0;
	kmutex_t	*ahmpages = NULL;
#ifdef DEBUG
	int		refcnt;
#endif

	ASSERT(szc != 0);
	ASSERT(szc == seg->s_szc);

	VM_STAT_ADD(anonvmstats.privatepages[0]);

	pgcnt = page_get_pagecnt(szc);
	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
	ASSERT(IS_P2ALIGNED(start_idx, pgcnt));

	ASSERT(amp != NULL);
	ap = anon_get_ptr(amp->ahp, start_idx);
	ASSERT(ap == NULL || ap->an_refcnt >= 1);

	VM_STAT_COND_ADD(ap == NULL, anonvmstats.privatepages[1]);

	/*
	 * Now try and allocate the large page. If we fail then just
	 * let VOP_GETPAGE give us PAGESIZE pages. Normally we let
	 * the caller make this decision but to avoid added complexity
	 * it's simpler to handle that case here.
	 */
	if (anypgsz == -1) {
		VM_STAT_ADD(anonvmstats.privatepages[2]);
		prealloc = 0;
	} else if (page_alloc_pages(anon_vp, seg, addr, &pplist, NULL, szc,
	    anypgsz, pgflags) != 0) {
		VM_STAT_ADD(anonvmstats.privatepages[3]);
		prealloc = 0;
	}

	/*
	 * make the decrement of all refcnts of all
	 * anon slots of a large page appear atomic by
	 * getting an anonpages_hash_lock for the
	 * first anon slot of a large page.
	 */
	if (ap != NULL) {
		ahmpages = APH_MUTEX(ap->an_vp, ap->an_off);
		mutex_enter(ahmpages);
		if (ap->an_refcnt == 1) {
			/* Not shared any more: no copy is required. */
			VM_STAT_ADD(anonvmstats.privatepages[4]);
			ASSERT(!anon_share(amp->ahp, start_idx, pgcnt));
			mutex_exit(ahmpages);

			if (prealloc) {
				page_free_replacement_page(pplist);
				page_create_putback(pgcnt);
			}
			ASSERT(ppa[0]->p_szc <= szc);
			if (ppa[0]->p_szc == szc) {
				VM_STAT_ADD(anonvmstats.privatepages[5]);
				return (0);
			}
			for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
				ASSERT(ppa[pg_idx] != NULL);
				page_unlock(ppa[pg_idx]);
			}
			return (-1);
		}
	}

	/*
	 * If we are passed in the vpage array and this is
	 * not PROT_WRITE then we need to decrement availrmem
	 * up front before we try anything. If we need to and
	 * can't decrement availrmem then its better to fail now
	 * than in the middle of processing the new large page.
	 * page_pp_useclaim() on behalf of each constituent page
	 * below will adjust availrmem back for the cases not needed.
	 */
	if (vpage != NULL && (prot & PROT_WRITE) == 0) {
		for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
			if (VPP_ISPPLOCK(&vpage[pg_idx])) {
				pagelock = 1;
				break;
			}
		}
		if (pagelock) {
			VM_STAT_ADD(anonvmstats.privatepages[6]);
			mutex_enter(&freemem_lock);
			if (availrmem >= pages_pp_maximum + pgcnt) {
				availrmem -= pgcnt;
				pages_useclaim += pgcnt;
			} else {
				/* Release everything acquired so far and fail. */
				VM_STAT_ADD(anonvmstats.privatepages[7]);
				mutex_exit(&freemem_lock);
				if (ahmpages != NULL) {
					mutex_exit(ahmpages);
				}
				if (prealloc) {
					page_free_replacement_page(pplist);
					page_create_putback(pgcnt);
				}
				for (pg_idx = 0; pg_idx < pgcnt; pg_idx++)
					if (ppa[pg_idx] != NULL)
						page_unlock(ppa[pg_idx]);
				return (ENOMEM);
			}
			mutex_exit(&freemem_lock);
		}
	}

	CPU_STATS_ADD_K(vm, cow_fault, pgcnt);

	VM_STAT_ADD(anonvmstats.privatepages[8]);

	an_idx = start_idx;
	pg_idx = 0;
	vaddr = addr;
	for (; pg_idx < pgcnt; pg_idx++, an_idx++, vaddr += PAGESIZE) {
		ASSERT(ppa[pg_idx] != NULL);
		oldap = anon_get_ptr(amp->ahp, an_idx);
		ASSERT(ahmpages != NULL || oldap == NULL);
		ASSERT(ahmpages == NULL || oldap != NULL);
		ASSERT(ahmpages == NULL || oldap->an_refcnt > 1);
		/* DEBUG only: every slot must carry the first slot's refcnt. */
		ASSERT(ahmpages == NULL || pg_idx != 0 ||
		    (refcnt = oldap->an_refcnt));
		ASSERT(ahmpages == NULL || pg_idx == 0 ||
		    refcnt == oldap->an_refcnt);

		ap = anon_alloc(NULL, 0);

		swap_xlate(ap, &vp, &off);

		/*
		 * Now setup our preallocated page to pass down to
		 * swap_getpage().
		 */
		if (prealloc) {
			pp = pplist;
			page_sub(&pplist, pp);
			conpp = pp;
		}

		err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE, NULL, pl,
		    PAGESIZE, conpp, NULL, &nreloc, seg, vaddr,
		    S_CREATE, cred);

		/*
		 * Impossible to fail this is S_CREATE.
		 */
		if (err)
			panic("anon_map_privatepages: VOP_GETPAGE failed");

		ASSERT(prealloc ? pp == pl[0] : pl[0]->p_szc == 0);
		ASSERT(prealloc == 0 || nreloc == 1);

		pp = pl[0];

		/*
		 * If the original page was locked, we need to move
		 * the lock to the new page by transferring
		 * 'cowcnt/lckcnt' of the original page to 'cowcnt/lckcnt'
		 * of the new page. pg_idx can be used to index
		 * into the vpage array since the caller will guarantee
		 * that vpage struct passed in corresponds to addr
		 * and forward.
		 */
		if (vpage != NULL && VPP_ISPPLOCK(&vpage[pg_idx])) {
			page_pp_useclaim(ppa[pg_idx], pp, prot & PROT_WRITE);
		} else if (pagelock) {
			/* Give back the claim taken up front for this page. */
			mutex_enter(&freemem_lock);
			availrmem++;
			pages_useclaim--;
			mutex_exit(&freemem_lock);
		}

		/*
		 * Now copy the contents from the original page.
		 */
		if (ppcopy(ppa[pg_idx], pp) == 0) {
			/*
			 * Before ppcopy could handle UE or other faults, we
			 * would have panicked here, and still have no option
			 * but to do so now.
			 */
			panic("anon_map_privatepages, ppcopy failed");
		}

		hat_setrefmod(pp);		/* mark as modified */

		/*
		 * Release the lock on the original page,
		 * decrement the old slot, and down grade the lock
		 * on the new copy.
		 */
		page_unlock(ppa[pg_idx]);

		/* Preallocated pages are downgraded together after the loop. */
		if (!prealloc)
			page_downgrade(pp);

		ppa[pg_idx] = pp;

		/*
		 * Now reflect the copy in the new anon array.
		 */
		ASSERT(ahmpages == NULL || oldap->an_refcnt > 1);
		if (oldap != NULL)
			anon_decref(oldap);
		(void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP);
	}

	/*
	 * Unload the old large page translation.
	 */
	hat_unload(seg->s_as->a_hat, addr, pgcnt << PAGESHIFT, HAT_UNLOAD);

	if (ahmpages != NULL) {
		mutex_exit(ahmpages);
	}
	ASSERT(prealloc == 0 || pplist == NULL);
	if (prealloc) {
		VM_STAT_ADD(anonvmstats.privatepages[9]);
		for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
			page_downgrade(ppa[pg_idx]);
		}
	}

	return (0);
}

/*
 * Allocate a private zero-filled anon page.
 *
 * A new anon slot is allocated and stored in *app.  On success the
 * zero-filled page is returned after page_downgrade().  If VOP_GETPAGE
 * fails, *app is set to NULL, the new slot is dropped with anon_decref()
 * and NULL is returned.
 */
page_t *
anon_zero(struct seg *seg, caddr_t addr, struct anon **app, struct cred *cred)
{
	struct anon *ap;
	page_t *pp;
	struct vnode *vp;
	anoff_t off;
	page_t *anon_pl[1 + 1];
	int err;

	/* Kernel probe */
	TNF_PROBE_1(anon_zero, "vm pagefault", /* CSTYLED */,
		tnf_opaque,	address,	addr);

	*app = ap = anon_alloc(NULL, 0);
	swap_xlate(ap, &vp, &off);

	/*
	 * Call the VOP_GETPAGE routine to create the page, thereby
	 * enabling the vnode driver to allocate any filesystem
	 * dependent structures (e.g., disk block allocation for UFS).
	 * This also prevents more than one page from being added to
	 * the vnode at the same time since it is locked.
	 */
	err = VOP_GETPAGE(vp, off, PAGESIZE, NULL,
	    anon_pl, PAGESIZE, seg, addr, S_CREATE, cred, NULL);
	if (err) {
		*app = NULL;
		anon_decref(ap);
		return (NULL);
	}
	pp = anon_pl[0];

	pagezero(pp, 0, PAGESIZE);	/* XXX - should set mod bit */
	page_downgrade(pp);
	CPU_STATS_ADD_K(vm, zfod, 1);
	hat_setrefmod(pp);	/* mark as modified so pageout writes back */
	return (pp);
}
2792 */ 2793 int 2794 anon_map_createpages( 2795 struct anon_map *amp, 2796 ulong_t start_index, 2797 size_t len, 2798 page_t *ppa[], 2799 struct seg *seg, 2800 caddr_t addr, 2801 enum seg_rw rw, 2802 struct cred *cred) 2803 { 2804 2805 struct anon *ap; 2806 struct vnode *ap_vp; 2807 page_t *pp, *pplist, *anon_pl[1 + 1], *conpp = NULL; 2808 int err = 0; 2809 ulong_t p_index, index; 2810 pgcnt_t npgs, pg_cnt; 2811 spgcnt_t nreloc = 0; 2812 uint_t l_szc, szc, prot; 2813 anoff_t ap_off; 2814 size_t pgsz; 2815 lgrp_t *lgrp; 2816 kmutex_t *ahm; 2817 2818 /* 2819 * XXX For now only handle S_CREATE. 2820 */ 2821 ASSERT(rw == S_CREATE); 2822 2823 index = start_index; 2824 p_index = 0; 2825 npgs = btopr(len); 2826 2827 /* 2828 * If this platform supports multiple page sizes 2829 * then try and allocate directly from the free 2830 * list for pages larger than PAGESIZE. 2831 * 2832 * NOTE:When we have page_create_ru we can stop 2833 * directly allocating from the freelist. 2834 */ 2835 l_szc = seg->s_szc; 2836 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 2837 while (npgs) { 2838 2839 /* 2840 * if anon slot already exists 2841 * (means page has been created) 2842 * so 1) look up the page 2843 * 2) if the page is still in memory, get it. 2844 * 3) if not, create a page and 2845 * page in from physical swap device. 2846 * These are done in anon_getpage(). 2847 */ 2848 ap = anon_get_ptr(amp->ahp, index); 2849 if (ap) { 2850 err = anon_getpage(&ap, &prot, anon_pl, PAGESIZE, 2851 seg, addr, S_READ, cred); 2852 if (err) { 2853 ANON_LOCK_EXIT(&->a_rwlock); 2854 panic("anon_map_createpages: anon_getpage"); 2855 } 2856 pp = anon_pl[0]; 2857 ppa[p_index++] = pp; 2858 2859 /* 2860 * an_pvp can become non-NULL after SysV's page was 2861 * paged out before ISM was attached to this SysV 2862 * shared memory segment. So free swap slot if needed. 
2863 */ 2864 if (ap->an_pvp != NULL) { 2865 page_io_lock(pp); 2866 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 2867 mutex_enter(ahm); 2868 if (ap->an_pvp != NULL) { 2869 swap_phys_free(ap->an_pvp, 2870 ap->an_poff, PAGESIZE); 2871 ap->an_pvp = NULL; 2872 ap->an_poff = 0; 2873 mutex_exit(ahm); 2874 hat_setmod(pp); 2875 } else { 2876 mutex_exit(ahm); 2877 } 2878 page_io_unlock(pp); 2879 } 2880 2881 addr += PAGESIZE; 2882 index++; 2883 npgs--; 2884 continue; 2885 } 2886 /* 2887 * Now try and allocate the largest page possible 2888 * for the current address and range. 2889 * Keep dropping down in page size until: 2890 * 2891 * 1) Properly aligned 2892 * 2) Does not overlap existing anon pages 2893 * 3) Fits in remaining range. 2894 * 4) able to allocate one. 2895 * 2896 * NOTE: XXX When page_create_ru is completed this code 2897 * will change. 2898 */ 2899 szc = l_szc; 2900 pplist = NULL; 2901 pg_cnt = 0; 2902 while (szc) { 2903 pgsz = page_get_pagesize(szc); 2904 pg_cnt = pgsz >> PAGESHIFT; 2905 if (IS_P2ALIGNED(addr, pgsz) && pg_cnt <= npgs && 2906 anon_pages(amp->ahp, index, pg_cnt) == 0) { 2907 /* 2908 * XXX 2909 * Since we are faking page_create() 2910 * we also need to do the freemem and 2911 * pcf accounting. 2912 */ 2913 (void) page_create_wait(pg_cnt, PG_WAIT); 2914 2915 /* 2916 * Get lgroup to allocate next page of shared 2917 * memory from and use it to specify where to 2918 * allocate the physical memory 2919 */ 2920 lgrp = lgrp_mem_choose(seg, addr, pgsz); 2921 2922 pplist = page_get_freelist( 2923 anon_vp, (u_offset_t)0, seg, 2924 addr, pgsz, 0, lgrp); 2925 2926 if (pplist == NULL) { 2927 page_create_putback(pg_cnt); 2928 } 2929 2930 /* 2931 * If a request for a page of size 2932 * larger than PAGESIZE failed 2933 * then don't try that size anymore. 
2934 */ 2935 if (pplist == NULL) { 2936 l_szc = szc - 1; 2937 } else { 2938 break; 2939 } 2940 } 2941 szc--; 2942 } 2943 2944 /* 2945 * If just using PAGESIZE pages then don't 2946 * directly allocate from the free list. 2947 */ 2948 if (pplist == NULL) { 2949 ASSERT(szc == 0); 2950 pp = anon_zero(seg, addr, &ap, cred); 2951 if (pp == NULL) { 2952 ANON_LOCK_EXIT(&->a_rwlock); 2953 panic("anon_map_createpages: anon_zero"); 2954 } 2955 ppa[p_index++] = pp; 2956 2957 ASSERT(anon_get_ptr(amp->ahp, index) == NULL); 2958 (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP); 2959 2960 addr += PAGESIZE; 2961 index++; 2962 npgs--; 2963 continue; 2964 } 2965 2966 /* 2967 * pplist is a list of pg_cnt PAGESIZE pages. 2968 * These pages are locked SE_EXCL since they 2969 * came directly off the free list. 2970 */ 2971 ASSERT(IS_P2ALIGNED(pg_cnt, pg_cnt)); 2972 ASSERT(IS_P2ALIGNED(index, pg_cnt)); 2973 ASSERT(conpp == NULL); 2974 while (pg_cnt--) { 2975 2976 ap = anon_alloc(NULL, 0); 2977 swap_xlate(ap, &ap_vp, &ap_off); 2978 2979 ASSERT(pplist != NULL); 2980 pp = pplist; 2981 page_sub(&pplist, pp); 2982 PP_CLRFREE(pp); 2983 PP_CLRAGED(pp); 2984 conpp = pp; 2985 2986 err = swap_getconpage(ap_vp, ap_off, PAGESIZE, 2987 (uint_t *)NULL, anon_pl, PAGESIZE, conpp, NULL, 2988 &nreloc, seg, addr, S_CREATE, cred); 2989 2990 if (err) { 2991 ANON_LOCK_EXIT(&->a_rwlock); 2992 panic("anon_map_createpages: S_CREATE"); 2993 } 2994 2995 ASSERT(anon_pl[0] == pp); 2996 ASSERT(nreloc == 1); 2997 pagezero(pp, 0, PAGESIZE); 2998 CPU_STATS_ADD_K(vm, zfod, 1); 2999 hat_setrefmod(pp); 3000 3001 ASSERT(anon_get_ptr(amp->ahp, index) == NULL); 3002 (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP); 3003 3004 ppa[p_index++] = pp; 3005 3006 addr += PAGESIZE; 3007 index++; 3008 npgs--; 3009 } 3010 conpp = NULL; 3011 pg_cnt = pgsz >> PAGESHIFT; 3012 p_index = p_index - pg_cnt; 3013 while (pg_cnt--) { 3014 page_downgrade(ppa[p_index++]); 3015 } 3016 } 3017 ANON_LOCK_EXIT(&->a_rwlock); 3018 return (0); 3019 
} 3020 3021 static int 3022 anon_try_demote_pages( 3023 struct anon_hdr *ahp, 3024 ulong_t sidx, 3025 uint_t szc, 3026 page_t **ppa, 3027 int private) 3028 { 3029 struct anon *ap; 3030 pgcnt_t pgcnt = page_get_pagecnt(szc); 3031 page_t *pp; 3032 pgcnt_t i; 3033 kmutex_t *ahmpages = NULL; 3034 int root = 0; 3035 pgcnt_t npgs; 3036 pgcnt_t curnpgs = 0; 3037 size_t ppasize = 0; 3038 3039 ASSERT(szc != 0); 3040 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 3041 ASSERT(IS_P2ALIGNED(sidx, pgcnt)); 3042 ASSERT(sidx < ahp->size); 3043 3044 if (ppa == NULL) { 3045 ppasize = pgcnt * sizeof (page_t *); 3046 ppa = kmem_alloc(ppasize, KM_SLEEP); 3047 } 3048 3049 ap = anon_get_ptr(ahp, sidx); 3050 if (ap != NULL && private) { 3051 VM_STAT_ADD(anonvmstats.demotepages[1]); 3052 ahmpages = APH_MUTEX(ap->an_vp, ap->an_off); 3053 mutex_enter(ahmpages); 3054 } 3055 3056 if (ap != NULL && ap->an_refcnt > 1) { 3057 if (ahmpages != NULL) { 3058 VM_STAT_ADD(anonvmstats.demotepages[2]); 3059 mutex_exit(ahmpages); 3060 } 3061 if (ppasize != 0) { 3062 kmem_free(ppa, ppasize); 3063 } 3064 return (0); 3065 } 3066 if (ahmpages != NULL) { 3067 mutex_exit(ahmpages); 3068 } 3069 if (ahp->size - sidx < pgcnt) { 3070 ASSERT(private == 0); 3071 pgcnt = ahp->size - sidx; 3072 } 3073 for (i = 0; i < pgcnt; i++, sidx++) { 3074 ap = anon_get_ptr(ahp, sidx); 3075 if (ap != NULL) { 3076 if (ap->an_refcnt != 1) { 3077 panic("anon_try_demote_pages: an_refcnt != 1"); 3078 } 3079 pp = ppa[i] = page_lookup(ap->an_vp, ap->an_off, 3080 SE_EXCL); 3081 if (pp != NULL) { 3082 (void) hat_pageunload(pp, 3083 HAT_FORCE_PGUNLOAD); 3084 } 3085 } else { 3086 ppa[i] = NULL; 3087 } 3088 } 3089 for (i = 0; i < pgcnt; i++) { 3090 if ((pp = ppa[i]) != NULL && pp->p_szc != 0) { 3091 ASSERT(pp->p_szc <= szc); 3092 if (!root) { 3093 VM_STAT_ADD(anonvmstats.demotepages[3]); 3094 if (curnpgs != 0) 3095 panic("anon_try_demote_pages: " 3096 "bad large page"); 3097 3098 root = 1; 3099 curnpgs = npgs = 3100 page_get_pagecnt(pp->p_szc); 3101 3102 
ASSERT(npgs <= pgcnt); 3103 ASSERT(IS_P2ALIGNED(npgs, npgs)); 3104 ASSERT(!(page_pptonum(pp) & (npgs - 1))); 3105 } else { 3106 ASSERT(i > 0); 3107 ASSERT(page_pptonum(pp) - 1 == 3108 page_pptonum(ppa[i - 1])); 3109 if ((page_pptonum(pp) & (npgs - 1)) == 3110 npgs - 1) 3111 root = 0; 3112 } 3113 ASSERT(PAGE_EXCL(pp)); 3114 pp->p_szc = 0; 3115 ASSERT(curnpgs > 0); 3116 curnpgs--; 3117 } 3118 } 3119 if (root != 0 || curnpgs != 0) 3120 panic("anon_try_demote_pages: bad large page"); 3121 3122 for (i = 0; i < pgcnt; i++) { 3123 if ((pp = ppa[i]) != NULL) { 3124 ASSERT(!hat_page_is_mapped(pp)); 3125 ASSERT(pp->p_szc == 0); 3126 page_unlock(pp); 3127 } 3128 } 3129 if (ppasize != 0) { 3130 kmem_free(ppa, ppasize); 3131 } 3132 return (1); 3133 } 3134 3135 /* 3136 * anon_map_demotepages() can only be called by MAP_PRIVATE segments. 3137 */ 3138 int 3139 anon_map_demotepages( 3140 struct anon_map *amp, 3141 ulong_t start_idx, 3142 struct seg *seg, 3143 caddr_t addr, 3144 uint_t prot, 3145 struct vpage vpage[], 3146 struct cred *cred) 3147 { 3148 struct anon *ap; 3149 uint_t szc = seg->s_szc; 3150 pgcnt_t pgcnt = page_get_pagecnt(szc); 3151 size_t ppasize = pgcnt * sizeof (page_t *); 3152 page_t **ppa = kmem_alloc(ppasize, KM_SLEEP); 3153 page_t *pp; 3154 page_t *pl[2]; 3155 pgcnt_t i, pg_idx; 3156 ulong_t an_idx; 3157 caddr_t vaddr; 3158 int err; 3159 int retry = 0; 3160 uint_t vpprot; 3161 3162 ASSERT(RW_WRITE_HELD(&->a_rwlock)); 3163 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 3164 ASSERT(IS_P2ALIGNED(start_idx, pgcnt)); 3165 ASSERT(ppa != NULL); 3166 ASSERT(szc != 0); 3167 ASSERT(szc == amp->a_szc); 3168 3169 VM_STAT_ADD(anonvmstats.demotepages[0]); 3170 3171 top: 3172 if (anon_try_demote_pages(amp->ahp, start_idx, szc, ppa, 1)) { 3173 kmem_free(ppa, ppasize); 3174 return (0); 3175 } 3176 3177 VM_STAT_ADD(anonvmstats.demotepages[4]); 3178 3179 ASSERT(retry == 0); /* we can be here only once */ 3180 3181 vaddr = addr; 3182 for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt; 3183 
pg_idx++, an_idx++, vaddr += PAGESIZE) { 3184 ap = anon_get_ptr(amp->ahp, an_idx); 3185 if (ap == NULL) 3186 panic("anon_map_demotepages: no anon slot"); 3187 err = anon_getpage(&ap, &vpprot, pl, PAGESIZE, seg, vaddr, 3188 S_READ, cred); 3189 if (err) { 3190 for (i = 0; i < pg_idx; i++) { 3191 if ((pp = ppa[i]) != NULL) 3192 page_unlock(pp); 3193 } 3194 kmem_free(ppa, ppasize); 3195 return (err); 3196 } 3197 ppa[pg_idx] = pl[0]; 3198 } 3199 3200 err = anon_map_privatepages(amp, start_idx, szc, seg, addr, prot, ppa, 3201 vpage, -1, 0, cred); 3202 if (err > 0) { 3203 VM_STAT_ADD(anonvmstats.demotepages[5]); 3204 kmem_free(ppa, ppasize); 3205 return (err); 3206 } 3207 ASSERT(err == 0 || err == -1); 3208 if (err == -1) { 3209 VM_STAT_ADD(anonvmstats.demotepages[6]); 3210 retry = 1; 3211 goto top; 3212 } 3213 for (i = 0; i < pgcnt; i++) { 3214 ASSERT(ppa[i] != NULL); 3215 if (ppa[i]->p_szc != 0) 3216 retry = 1; 3217 page_unlock(ppa[i]); 3218 } 3219 if (retry) { 3220 VM_STAT_ADD(anonvmstats.demotepages[7]); 3221 goto top; 3222 } 3223 3224 VM_STAT_ADD(anonvmstats.demotepages[8]); 3225 3226 kmem_free(ppa, ppasize); 3227 3228 return (0); 3229 } 3230 3231 /* 3232 * Free pages of shared anon map. It's assumed that anon maps don't share anon 3233 * structures with private anon maps. Therefore all anon structures should 3234 * have at most one reference at this point. This means underlying pages can 3235 * be exclusively locked and demoted or freed. If not freeing the entire 3236 * large pages demote the ends of the region we free to be able to free 3237 * subpages. Page roots correspond to aligned index positions in anon map. 
3238 */ 3239 void 3240 anon_shmap_free_pages(struct anon_map *amp, ulong_t sidx, size_t len) 3241 { 3242 ulong_t eidx = sidx + btopr(len); 3243 pgcnt_t pages = page_get_pagecnt(amp->a_szc); 3244 struct anon_hdr *ahp = amp->ahp; 3245 ulong_t tidx; 3246 size_t size; 3247 ulong_t sidx_aligned; 3248 ulong_t eidx_aligned; 3249 3250 ASSERT(ANON_WRITE_HELD(&->a_rwlock)); 3251 ASSERT(amp->refcnt <= 1); 3252 ASSERT(amp->a_szc > 0); 3253 ASSERT(eidx <= ahp->size); 3254 ASSERT(!anon_share(ahp, sidx, btopr(len))); 3255 3256 if (len == 0) { /* XXX */ 3257 return; 3258 } 3259 3260 sidx_aligned = P2ALIGN(sidx, pages); 3261 if (sidx_aligned != sidx || 3262 (eidx < sidx_aligned + pages && eidx < ahp->size)) { 3263 if (!anon_try_demote_pages(ahp, sidx_aligned, 3264 amp->a_szc, NULL, 0)) { 3265 panic("anon_shmap_free_pages: demote failed"); 3266 } 3267 size = (eidx <= sidx_aligned + pages) ? (eidx - sidx) : 3268 P2NPHASE(sidx, pages); 3269 size <<= PAGESHIFT; 3270 anon_free(ahp, sidx, size); 3271 sidx = sidx_aligned + pages; 3272 if (eidx <= sidx) { 3273 return; 3274 } 3275 } 3276 eidx_aligned = P2ALIGN(eidx, pages); 3277 if (sidx < eidx_aligned) { 3278 anon_free_pages(ahp, sidx, 3279 (eidx_aligned - sidx) << PAGESHIFT, 3280 amp->a_szc); 3281 sidx = eidx_aligned; 3282 } 3283 ASSERT(sidx == eidx_aligned); 3284 if (eidx == eidx_aligned) { 3285 return; 3286 } 3287 tidx = eidx; 3288 if (eidx != ahp->size && anon_get_next_ptr(ahp, &tidx) != NULL && 3289 tidx - sidx < pages) { 3290 if (!anon_try_demote_pages(ahp, sidx, amp->a_szc, NULL, 0)) { 3291 panic("anon_shmap_free_pages: demote failed"); 3292 } 3293 size = (eidx - sidx) << PAGESHIFT; 3294 anon_free(ahp, sidx, size); 3295 } else { 3296 anon_free_pages(ahp, sidx, pages << PAGESHIFT, amp->a_szc); 3297 } 3298 } 3299 3300 /* 3301 * This routine should be called with amp's writer lock when there're no other 3302 * users of amp. All pcache entries of this amp must have been already 3303 * inactivated. 
We must not drop a_rwlock here to prevent new users from 3304 * attaching to this amp. 3305 */ 3306 void 3307 anonmap_purge(struct anon_map *amp) 3308 { 3309 ASSERT(ANON_WRITE_HELD(&->a_rwlock)); 3310 ASSERT(amp->refcnt <= 1); 3311 3312 if (amp->a_softlockcnt != 0) { 3313 seg_ppurge(NULL, amp, 0); 3314 } 3315 3316 /* 3317 * Since all pcache entries were already inactive before this routine 3318 * was called seg_ppurge() couldn't return while there're still 3319 * entries that can be found via the list anchored at a_phead. So we 3320 * can assert this list is empty now. a_softlockcnt may be still non 0 3321 * if asynchronous thread that manages pcache already removed pcache 3322 * entries but hasn't unlocked the pages yet. If a_softlockcnt is non 3323 * 0 we just wait on a_purgecv for shamp_reclaim() to finish. Even if 3324 * a_softlockcnt is 0 we grab a_purgemtx to avoid freeing anon map 3325 * before shamp_reclaim() is done with it. a_purgemtx also taken by 3326 * shamp_reclaim() while a_softlockcnt was still not 0 acts as a 3327 * barrier that prevents anonmap_purge() to complete while 3328 * shamp_reclaim() may still be referencing this amp. 3329 */ 3330 ASSERT(amp->a_phead.p_lnext == &->a_phead); 3331 ASSERT(amp->a_phead.p_lprev == &->a_phead); 3332 3333 mutex_enter(&->a_purgemtx); 3334 while (amp->a_softlockcnt != 0) { 3335 ASSERT(amp->a_phead.p_lnext == &->a_phead); 3336 ASSERT(amp->a_phead.p_lprev == &->a_phead); 3337 amp->a_purgewait = 1; 3338 cv_wait(&->a_purgecv, &->a_purgemtx); 3339 } 3340 mutex_exit(&->a_purgemtx); 3341 3342 ASSERT(amp->a_phead.p_lnext == &->a_phead); 3343 ASSERT(amp->a_phead.p_lprev == &->a_phead); 3344 ASSERT(amp->a_softlockcnt == 0); 3345 } 3346 3347 /* 3348 * Allocate and initialize an anon_map structure for seg 3349 * associating the given swap reservation with the new anon_map. 
3350 */ 3351 struct anon_map * 3352 anonmap_alloc(size_t size, size_t swresv, int flags) 3353 { 3354 struct anon_map *amp; 3355 int kmflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 3356 3357 amp = kmem_cache_alloc(anonmap_cache, kmflags); 3358 if (amp == NULL) { 3359 ASSERT(kmflags == KM_NOSLEEP); 3360 return (NULL); 3361 } 3362 3363 amp->ahp = anon_create(btopr(size), flags); 3364 if (amp->ahp == NULL) { 3365 ASSERT(flags == ANON_NOSLEEP); 3366 kmem_cache_free(anonmap_cache, amp); 3367 return (NULL); 3368 } 3369 amp->refcnt = 1; 3370 amp->size = size; 3371 amp->swresv = swresv; 3372 amp->locality = 0; 3373 amp->a_szc = 0; 3374 amp->a_sp = NULL; 3375 amp->a_softlockcnt = 0; 3376 amp->a_purgewait = 0; 3377 amp->a_phead.p_lnext = &->a_phead; 3378 amp->a_phead.p_lprev = &->a_phead; 3379 3380 return (amp); 3381 } 3382 3383 void 3384 anonmap_free(struct anon_map *amp) 3385 { 3386 ASSERT(amp->ahp != NULL); 3387 ASSERT(amp->refcnt == 0); 3388 ASSERT(amp->a_softlockcnt == 0); 3389 ASSERT(amp->a_phead.p_lnext == &->a_phead); 3390 ASSERT(amp->a_phead.p_lprev == &->a_phead); 3391 3392 lgrp_shm_policy_fini(amp, NULL); 3393 anon_release(amp->ahp, btopr(amp->size)); 3394 kmem_cache_free(anonmap_cache, amp); 3395 } 3396 3397 /* 3398 * Returns true if the app array has some empty slots. 3399 * The offp and lenp parameters are in/out parameters. On entry 3400 * these values represent the starting offset and length of the 3401 * mapping. When true is returned, these values may be modified 3402 * to be the largest range which includes empty slots. 
3403 */ 3404 int 3405 non_anon(struct anon_hdr *ahp, ulong_t anon_idx, u_offset_t *offp, 3406 size_t *lenp) 3407 { 3408 ulong_t i, el; 3409 ssize_t low, high; 3410 struct anon *ap; 3411 3412 low = -1; 3413 for (i = 0, el = *lenp; i < el; i += PAGESIZE, anon_idx++) { 3414 ap = anon_get_ptr(ahp, anon_idx); 3415 if (ap == NULL) { 3416 if (low == -1) 3417 low = i; 3418 high = i; 3419 } 3420 } 3421 if (low != -1) { 3422 /* 3423 * Found at least one non-anon page. 3424 * Set up the off and len return values. 3425 */ 3426 if (low != 0) 3427 *offp += low; 3428 *lenp = high - low + PAGESIZE; 3429 return (1); 3430 } 3431 return (0); 3432 } 3433 3434 /* 3435 * Return a count of the number of existing anon pages in the anon array 3436 * app in the range (off, off+len). The array and slots must be guaranteed 3437 * stable by the caller. 3438 */ 3439 pgcnt_t 3440 anon_pages(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots) 3441 { 3442 pgcnt_t cnt = 0; 3443 3444 while (nslots-- > 0) { 3445 if ((anon_get_ptr(ahp, anon_index)) != NULL) 3446 cnt++; 3447 anon_index++; 3448 } 3449 return (cnt); 3450 } 3451 3452 /* 3453 * When memory is locked the corresponding swap is unreserved. 3454 * Swap for locked memory is "locked" which means that it can 3455 * not be unreserved until its memory is unlocked. 3456 * If there is enough of mem swap reserved, nothing is done 3457 * because availrmem was already decremented (when mem swap 3458 * was reserved). 3459 * If there is not enough of mem swap reserved, move reserved disk 3460 * swap into memory swap (unreserve phys swap and reserve mem swap 3461 * by the same amount) and decrement availrmem. The availrmem needs 3462 * to be decremented because pages are locked!. 3463 * Used by segspt and mlock when memory is locked. 
3464 */ 3465 int 3466 anon_swap_adjust(pgcnt_t npages, pgcnt_t limit, int cnt) 3467 { 3468 pgcnt_t unlocked_mem_swap; 3469 3470 mutex_enter(&anoninfo_lock); 3471 3472 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 3473 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 3474 3475 unlocked_mem_swap = k_anoninfo.ani_mem_resv 3476 - k_anoninfo.ani_locked_swap; 3477 if (npages > unlocked_mem_swap) { 3478 spgcnt_t adjusted_swap = npages - unlocked_mem_swap; 3479 3480 /* 3481 * if there is not enough unlocked mem swap we take missing 3482 * amount from phys swap and give it to mem swap 3483 */ 3484 if (!page_reclaim_mem(adjusted_swap, limit, 1, cnt)) { 3485 mutex_exit(&anoninfo_lock); 3486 return (ENOMEM); 3487 } 3488 3489 k_anoninfo.ani_mem_resv += adjusted_swap; 3490 ASSERT(k_anoninfo.ani_phys_resv >= adjusted_swap); 3491 k_anoninfo.ani_phys_resv -= adjusted_swap; 3492 3493 ANI_ADD(adjusted_swap); 3494 } 3495 k_anoninfo.ani_locked_swap += npages; 3496 3497 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 3498 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 3499 3500 mutex_exit(&anoninfo_lock); 3501 3502 return (0); 3503 } 3504 3505 /* 3506 * When memory is unlocked make its "locked" swap freeable. 3507 * The unlocked mem swap is unreserved (and availrmem is decremented) 3508 * in anon_unresvmem(). 3509 */ 3510 void 3511 anon_swap_restore(pgcnt_t npages) 3512 { 3513 mutex_enter(&anoninfo_lock); 3514 3515 ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv); 3516 3517 ASSERT(k_anoninfo.ani_locked_swap >= npages); 3518 k_anoninfo.ani_locked_swap -= npages; 3519 3520 ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv); 3521 3522 mutex_exit(&anoninfo_lock); 3523 } 3524 3525 /* 3526 * Return the pointer from the list for a 3527 * specified anon index. 
3528 */ 3529 ulong_t * 3530 anon_get_slot(struct anon_hdr *ahp, ulong_t an_idx) 3531 { 3532 struct anon **app; 3533 void **ppp; 3534 3535 ASSERT(an_idx < ahp->size); 3536 3537 /* 3538 * Single level case. 3539 */ 3540 if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 3541 return ((ulong_t *)&ahp->array_chunk[an_idx]); 3542 } else { 3543 3544 /* 3545 * 2 level case. 3546 */ 3547 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 3548 if (*ppp == NULL) { 3549 mutex_enter(&ahp->serial_lock); 3550 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 3551 if (*ppp == NULL) 3552 *ppp = kmem_zalloc(PAGESIZE, KM_SLEEP); 3553 mutex_exit(&ahp->serial_lock); 3554 } 3555 app = *ppp; 3556 return ((ulong_t *)&app[an_idx & ANON_CHUNK_OFF]); 3557 } 3558 } 3559 3560 void 3561 anon_array_enter(struct anon_map *amp, ulong_t an_idx, anon_sync_obj_t *sobj) 3562 { 3563 ulong_t *ap_slot; 3564 kmutex_t *mtx; 3565 kcondvar_t *cv; 3566 int hash; 3567 3568 /* 3569 * Use szc to determine anon slot(s) to appear atomic. 3570 * If szc = 0, then lock the anon slot and mark it busy. 3571 * If szc > 0, then lock the range of slots by getting the 3572 * anon_array_lock for the first anon slot, and mark only the 3573 * first anon slot busy to represent whole range being busy. 3574 */ 3575 3576 ASSERT(RW_READ_HELD(&->a_rwlock)); 3577 an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc)); 3578 hash = ANON_ARRAY_HASH(amp, an_idx); 3579 sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex; 3580 sobj->sync_cv = cv = &anon_array_cv[hash]; 3581 mutex_enter(mtx); 3582 ap_slot = anon_get_slot(amp->ahp, an_idx); 3583 while (ANON_ISBUSY(ap_slot)) 3584 cv_wait(cv, mtx); 3585 ANON_SETBUSY(ap_slot); 3586 sobj->sync_data = ap_slot; 3587 mutex_exit(mtx); 3588 } 3589 3590 int 3591 anon_array_try_enter(struct anon_map *amp, ulong_t an_idx, 3592 anon_sync_obj_t *sobj) 3593 { 3594 ulong_t *ap_slot; 3595 kmutex_t *mtx; 3596 int hash; 3597 3598 /* 3599 * Try to lock a range of anon slots. 
3600 * Use szc to determine anon slot(s) to appear atomic. 3601 * If szc = 0, then lock the anon slot and mark it busy. 3602 * If szc > 0, then lock the range of slots by getting the 3603 * anon_array_lock for the first anon slot, and mark only the 3604 * first anon slot busy to represent whole range being busy. 3605 * Fail if the mutex or the anon_array are busy. 3606 */ 3607 3608 ASSERT(RW_READ_HELD(&->a_rwlock)); 3609 an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc)); 3610 hash = ANON_ARRAY_HASH(amp, an_idx); 3611 sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex; 3612 sobj->sync_cv = &anon_array_cv[hash]; 3613 if (!mutex_tryenter(mtx)) { 3614 return (EWOULDBLOCK); 3615 } 3616 ap_slot = anon_get_slot(amp->ahp, an_idx); 3617 if (ANON_ISBUSY(ap_slot)) { 3618 mutex_exit(mtx); 3619 return (EWOULDBLOCK); 3620 } 3621 ANON_SETBUSY(ap_slot); 3622 sobj->sync_data = ap_slot; 3623 mutex_exit(mtx); 3624 return (0); 3625 } 3626 3627 void 3628 anon_array_exit(anon_sync_obj_t *sobj) 3629 { 3630 mutex_enter(sobj->sync_mutex); 3631 ASSERT(ANON_ISBUSY(sobj->sync_data)); 3632 ANON_CLRBUSY(sobj->sync_data); 3633 if (CV_HAS_WAITERS(sobj->sync_cv)) 3634 cv_broadcast(sobj->sync_cv); 3635 mutex_exit(sobj->sync_mutex); 3636 } 3637