1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 26 /* All Rights Reserved */ 27 28 /* 29 * University Copyright- Copyright (c) 1982, 1986, 1988 30 * The Regents of the University of California 31 * All Rights Reserved 32 * 33 * University Acknowledgment- Portions of this document are derived from 34 * software developed by the University of California, Berkeley, and its 35 * contributors. 36 */ 37 38 /* 39 * VM - anonymous pages. 40 * 41 * This layer sits immediately above the vm_swap layer. It manages 42 * physical pages that have no permanent identity in the file system 43 * name space, using the services of the vm_swap layer to allocate 44 * backing storage for these pages. Since these pages have no external 45 * identity, they are discarded when the last reference is removed. 
*
 * An important function of this layer is to manage low-level sharing
 * of pages that are logically distinct but that happen to be
 * physically identical (e.g., the corresponding pages of the processes
 * resulting from a fork before one process or the other changes their
 * contents).  This pseudo-sharing is present only as an optimization
 * and is not to be confused with true sharing in which multiple
 * address spaces deliberately contain references to the same object;
 * such sharing is managed at a higher level.
 *
 * The key data structure here is the anon struct, which contains a
 * reference count for its associated physical page and a hint about
 * the identity of that page.  Anon structs typically live in arrays,
 * with an instance's position in its array determining where the
 * corresponding backing storage is allocated; however, the swap_xlate()
 * routine abstracts away this representation information so that the
 * rest of the anon layer need not know it.  (See the swap layer for
 * more details on anon struct layout.)
 *
 * In future versions of the system, the association between an
 * anon struct and its position on backing store will change so that
 * we don't require backing store for all anonymous pages in the system.
 * This is an important consideration for large memory systems.
 * We can also use this technique to delay binding physical locations
 * to anonymous pages until pageout/swapout time where we can make
 * smarter allocation decisions to improve anonymous klustering.
 *
 * Many of the routines defined here take a (struct anon **) argument,
 * which allows the code at this level to manage anon pages directly,
 * so that callers can regard anon structs as opaque objects and not be
 * concerned with assigning or inspecting their contents.
 *
 * Clients of this layer refer to anon pages indirectly.
That is, they 79 * maintain arrays of pointers to anon structs rather than maintaining 80 * anon structs themselves. The (struct anon **) arguments mentioned 81 * above are pointers to entries in these arrays. It is these arrays 82 * that capture the mapping between offsets within a given segment and 83 * the corresponding anonymous backing storage address. 84 */ 85 86 #ifdef DEBUG 87 #define ANON_DEBUG 88 #endif 89 90 #include <sys/types.h> 91 #include <sys/t_lock.h> 92 #include <sys/param.h> 93 #include <sys/systm.h> 94 #include <sys/mman.h> 95 #include <sys/cred.h> 96 #include <sys/thread.h> 97 #include <sys/vnode.h> 98 #include <sys/cpuvar.h> 99 #include <sys/swap.h> 100 #include <sys/cmn_err.h> 101 #include <sys/vtrace.h> 102 #include <sys/kmem.h> 103 #include <sys/sysmacros.h> 104 #include <sys/bitmap.h> 105 #include <sys/vmsystm.h> 106 #include <sys/tuneable.h> 107 #include <sys/debug.h> 108 #include <sys/fs/swapnode.h> 109 #include <sys/tnf_probe.h> 110 #include <sys/lgrp.h> 111 #include <sys/policy.h> 112 #include <sys/condvar_impl.h> 113 #include <sys/mutex_impl.h> 114 #include <sys/rctl.h> 115 116 #include <vm/as.h> 117 #include <vm/hat.h> 118 #include <vm/anon.h> 119 #include <vm/page.h> 120 #include <vm/vpage.h> 121 #include <vm/seg.h> 122 #include <vm/rm.h> 123 124 #include <fs/fs_subr.h> 125 126 struct vnode *anon_vp; 127 128 int anon_debug; 129 130 kmutex_t anoninfo_lock; 131 struct k_anoninfo k_anoninfo; 132 ani_free_t ani_free_pool[ANI_MAX_POOL]; 133 pad_mutex_t anon_array_lock[ANON_LOCKSIZE]; 134 kcondvar_t anon_array_cv[ANON_LOCKSIZE]; 135 136 /* 137 * Global hash table for (vp, off) -> anon slot 138 */ 139 extern int swap_maxcontig; 140 size_t anon_hash_size; 141 unsigned int anon_hash_shift; 142 struct anon **anon_hash; 143 144 static struct kmem_cache *anon_cache; 145 static struct kmem_cache *anonmap_cache; 146 147 pad_mutex_t *anonhash_lock; 148 149 /* 150 * Used to make the increment of all refcnts of all anon slots of a large 151 * page 
appear to be atomic. The lock is grabbed for the first anon slot of 152 * a large page. 153 */ 154 pad_mutex_t *anonpages_hash_lock; 155 156 #define APH_MUTEX(vp, off) \ 157 (&anonpages_hash_lock[(ANON_HASH((vp), (off)) & \ 158 (AH_LOCK_SIZE - 1))].pad_mutex) 159 160 #ifdef VM_STATS 161 static struct anonvmstats_str { 162 ulong_t getpages[30]; 163 ulong_t privatepages[10]; 164 ulong_t demotepages[9]; 165 ulong_t decrefpages[9]; 166 ulong_t dupfillholes[4]; 167 ulong_t freepages[1]; 168 } anonvmstats; 169 #endif /* VM_STATS */ 170 171 /*ARGSUSED*/ 172 static int 173 anonmap_cache_constructor(void *buf, void *cdrarg, int kmflags) 174 { 175 struct anon_map *amp = buf; 176 177 rw_init(&->a_rwlock, NULL, RW_DEFAULT, NULL); 178 cv_init(&->a_purgecv, NULL, CV_DEFAULT, NULL); 179 mutex_init(&->a_pmtx, NULL, MUTEX_DEFAULT, NULL); 180 mutex_init(&->a_purgemtx, NULL, MUTEX_DEFAULT, NULL); 181 return (0); 182 } 183 184 /*ARGSUSED1*/ 185 static void 186 anonmap_cache_destructor(void *buf, void *cdrarg) 187 { 188 struct anon_map *amp = buf; 189 190 rw_destroy(&->a_rwlock); 191 cv_destroy(&->a_purgecv); 192 mutex_destroy(&->a_pmtx); 193 mutex_destroy(&->a_purgemtx); 194 } 195 196 void 197 anon_init(void) 198 { 199 int i; 200 pad_mutex_t *tmp; 201 202 /* These both need to be powers of 2 so round up to the next power */ 203 anon_hash_shift = highbit((physmem / ANON_HASHAVELEN) - 1); 204 anon_hash_size = 1L << anon_hash_shift; 205 206 /* 207 * We need to align the anonhash_lock and anonpages_hash_lock arrays 208 * to a 64B boundary to avoid false sharing. We add 63B to our 209 * allocation so that we can get a 64B aligned address to use. 210 * We allocate both of these together to avoid wasting an additional 211 * 63B. 
212 */ 213 tmp = kmem_zalloc((2 * AH_LOCK_SIZE * sizeof (pad_mutex_t)) + 63, 214 KM_SLEEP); 215 anonhash_lock = (pad_mutex_t *)P2ROUNDUP((uintptr_t)tmp, 64); 216 anonpages_hash_lock = anonhash_lock + AH_LOCK_SIZE; 217 218 for (i = 0; i < AH_LOCK_SIZE; i++) { 219 mutex_init(&anonhash_lock[i].pad_mutex, NULL, MUTEX_DEFAULT, 220 NULL); 221 mutex_init(&anonpages_hash_lock[i].pad_mutex, NULL, 222 MUTEX_DEFAULT, NULL); 223 } 224 225 for (i = 0; i < ANON_LOCKSIZE; i++) { 226 mutex_init(&anon_array_lock[i].pad_mutex, NULL, 227 MUTEX_DEFAULT, NULL); 228 cv_init(&anon_array_cv[i], NULL, CV_DEFAULT, NULL); 229 } 230 231 anon_hash = (struct anon **) 232 kmem_zalloc(sizeof (struct anon *) * anon_hash_size, KM_SLEEP); 233 anon_cache = kmem_cache_create("anon_cache", sizeof (struct anon), 234 AN_CACHE_ALIGN, NULL, NULL, NULL, NULL, NULL, KMC_PREFILL); 235 anonmap_cache = kmem_cache_create("anonmap_cache", 236 sizeof (struct anon_map), 0, 237 anonmap_cache_constructor, anonmap_cache_destructor, NULL, 238 NULL, NULL, 0); 239 swap_maxcontig = (1024 * 1024) >> PAGESHIFT; /* 1MB of pages */ 240 241 anon_vp = vn_alloc(KM_SLEEP); 242 vn_setops(anon_vp, swap_vnodeops); 243 anon_vp->v_type = VREG; 244 anon_vp->v_flag |= (VISSWAP|VISSWAPFS); 245 } 246 247 /* 248 * Global anon slot hash table manipulation. 249 */ 250 251 static void 252 anon_addhash(struct anon *ap) 253 { 254 int index; 255 256 ASSERT(MUTEX_HELD(AH_MUTEX(ap->an_vp, ap->an_off))); 257 index = ANON_HASH(ap->an_vp, ap->an_off); 258 ap->an_hash = anon_hash[index]; 259 anon_hash[index] = ap; 260 } 261 262 static void 263 anon_rmhash(struct anon *ap) 264 { 265 struct anon **app; 266 267 ASSERT(MUTEX_HELD(AH_MUTEX(ap->an_vp, ap->an_off))); 268 269 for (app = &anon_hash[ANON_HASH(ap->an_vp, ap->an_off)]; 270 *app; app = &((*app)->an_hash)) { 271 if (*app == ap) { 272 *app = ap->an_hash; 273 break; 274 } 275 } 276 } 277 278 /* 279 * The anon array interfaces. 
Functions allocating, 280 * freeing array of pointers, and returning/setting 281 * entries in the array of pointers for a given offset. 282 * 283 * Create the list of pointers 284 */ 285 struct anon_hdr * 286 anon_create(pgcnt_t npages, int flags) 287 { 288 struct anon_hdr *ahp; 289 ulong_t nchunks; 290 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 291 292 if ((ahp = kmem_zalloc(sizeof (struct anon_hdr), kmemflags)) == NULL) { 293 return (NULL); 294 } 295 296 mutex_init(&ahp->serial_lock, NULL, MUTEX_DEFAULT, NULL); 297 /* 298 * Single level case. 299 */ 300 ahp->size = npages; 301 if (npages <= ANON_CHUNK_SIZE || (flags & ANON_ALLOC_FORCE)) { 302 303 if (flags & ANON_ALLOC_FORCE) 304 ahp->flags |= ANON_ALLOC_FORCE; 305 306 ahp->array_chunk = kmem_zalloc( 307 ahp->size * sizeof (struct anon *), kmemflags); 308 309 if (ahp->array_chunk == NULL) { 310 kmem_free(ahp, sizeof (struct anon_hdr)); 311 return (NULL); 312 } 313 } else { 314 /* 315 * 2 Level case. 316 * anon hdr size needs to be rounded off to be a multiple 317 * of ANON_CHUNK_SIZE. This is important as various anon 318 * related functions depend on this. 319 * NOTE - 320 * anon_grow() makes anon hdr size a multiple of 321 * ANON_CHUNK_SIZE. 322 * amp size is <= anon hdr size. 323 * anon_index + seg_pgs <= anon hdr size. 324 */ 325 ahp->size = P2ROUNDUP(npages, ANON_CHUNK_SIZE); 326 nchunks = ahp->size >> ANON_CHUNK_SHIFT; 327 328 ahp->array_chunk = kmem_zalloc(nchunks * sizeof (ulong_t *), 329 kmemflags); 330 331 if (ahp->array_chunk == NULL) { 332 kmem_free(ahp, sizeof (struct anon_hdr)); 333 return (NULL); 334 } 335 } 336 return (ahp); 337 } 338 339 /* 340 * Free the array of pointers 341 */ 342 void 343 anon_release(struct anon_hdr *ahp, pgcnt_t npages) 344 { 345 ulong_t i; 346 void **ppp; 347 ulong_t nchunks; 348 349 ASSERT(npages <= ahp->size); 350 351 /* 352 * Single level case. 
353 */ 354 if (npages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 355 kmem_free(ahp->array_chunk, ahp->size * sizeof (struct anon *)); 356 } else { 357 /* 358 * 2 level case. 359 */ 360 nchunks = ahp->size >> ANON_CHUNK_SHIFT; 361 for (i = 0; i < nchunks; i++) { 362 ppp = &ahp->array_chunk[i]; 363 if (*ppp != NULL) 364 kmem_free(*ppp, PAGESIZE); 365 } 366 kmem_free(ahp->array_chunk, nchunks * sizeof (ulong_t *)); 367 } 368 mutex_destroy(&ahp->serial_lock); 369 kmem_free(ahp, sizeof (struct anon_hdr)); 370 } 371 372 /* 373 * Return the pointer from the list for a 374 * specified anon index. 375 */ 376 struct anon * 377 anon_get_ptr(struct anon_hdr *ahp, ulong_t an_idx) 378 { 379 struct anon **app; 380 381 ASSERT(an_idx < ahp->size); 382 383 /* 384 * Single level case. 385 */ 386 if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 387 return ((struct anon *) 388 ((uintptr_t)ahp->array_chunk[an_idx] & ANON_PTRMASK)); 389 } else { 390 391 /* 392 * 2 level case. 393 */ 394 app = ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 395 if (app) { 396 return ((struct anon *) 397 ((uintptr_t)app[an_idx & ANON_CHUNK_OFF] & 398 ANON_PTRMASK)); 399 } else { 400 return (NULL); 401 } 402 } 403 } 404 405 /* 406 * Return the anon pointer for the first valid entry in the anon list, 407 * starting from the given index. 
408 */ 409 struct anon * 410 anon_get_next_ptr(struct anon_hdr *ahp, ulong_t *index) 411 { 412 struct anon *ap; 413 struct anon **app; 414 ulong_t chunkoff; 415 ulong_t i; 416 ulong_t j; 417 pgcnt_t size; 418 419 i = *index; 420 size = ahp->size; 421 422 ASSERT(i < size); 423 424 if ((size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 425 /* 426 * 1 level case 427 */ 428 while (i < size) { 429 ap = (struct anon *) 430 ((uintptr_t)ahp->array_chunk[i] & ANON_PTRMASK); 431 if (ap) { 432 *index = i; 433 return (ap); 434 } 435 i++; 436 } 437 } else { 438 /* 439 * 2 level case 440 */ 441 chunkoff = i & ANON_CHUNK_OFF; 442 while (i < size) { 443 app = ahp->array_chunk[i >> ANON_CHUNK_SHIFT]; 444 if (app) 445 for (j = chunkoff; j < ANON_CHUNK_SIZE; j++) { 446 ap = (struct anon *) 447 ((uintptr_t)app[j] & ANON_PTRMASK); 448 if (ap) { 449 *index = i + (j - chunkoff); 450 return (ap); 451 } 452 } 453 chunkoff = 0; 454 i = (i + ANON_CHUNK_SIZE) & ~ANON_CHUNK_OFF; 455 } 456 } 457 *index = size; 458 return (NULL); 459 } 460 461 /* 462 * Set list entry with a given pointer for a specified offset 463 */ 464 int 465 anon_set_ptr(struct anon_hdr *ahp, ulong_t an_idx, struct anon *ap, int flags) 466 { 467 void **ppp; 468 struct anon **app; 469 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 470 uintptr_t *ap_addr; 471 472 ASSERT(an_idx < ahp->size); 473 474 /* 475 * Single level case. 476 */ 477 if (ahp->size <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 478 ap_addr = (uintptr_t *)&ahp->array_chunk[an_idx]; 479 } else { 480 481 /* 482 * 2 level case. 
483 */ 484 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 485 486 ASSERT(ppp != NULL); 487 if (*ppp == NULL) { 488 mutex_enter(&ahp->serial_lock); 489 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 490 if (*ppp == NULL) { 491 *ppp = kmem_zalloc(PAGESIZE, kmemflags); 492 if (*ppp == NULL) { 493 mutex_exit(&ahp->serial_lock); 494 return (ENOMEM); 495 } 496 } 497 mutex_exit(&ahp->serial_lock); 498 } 499 app = *ppp; 500 ap_addr = (uintptr_t *)&app[an_idx & ANON_CHUNK_OFF]; 501 } 502 *ap_addr = (*ap_addr & ~ANON_PTRMASK) | (uintptr_t)ap; 503 return (0); 504 } 505 506 /* 507 * Copy anon array into a given new anon array 508 */ 509 int 510 anon_copy_ptr(struct anon_hdr *sahp, ulong_t s_idx, 511 struct anon_hdr *dahp, ulong_t d_idx, 512 pgcnt_t npages, int flags) 513 { 514 void **sapp, **dapp; 515 void *ap; 516 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 517 518 ASSERT((s_idx < sahp->size) && (d_idx < dahp->size)); 519 ASSERT((npages <= sahp->size) && (npages <= dahp->size)); 520 521 /* 522 * Both arrays are 1 level. 523 */ 524 if (((sahp->size <= ANON_CHUNK_SIZE) && 525 (dahp->size <= ANON_CHUNK_SIZE)) || 526 ((sahp->flags & ANON_ALLOC_FORCE) && 527 (dahp->flags & ANON_ALLOC_FORCE))) { 528 529 bcopy(&sahp->array_chunk[s_idx], &dahp->array_chunk[d_idx], 530 npages * sizeof (struct anon *)); 531 return (0); 532 } 533 534 /* 535 * Both arrays are 2 levels. 
536 */ 537 if (sahp->size > ANON_CHUNK_SIZE && 538 dahp->size > ANON_CHUNK_SIZE && 539 ((sahp->flags & ANON_ALLOC_FORCE) == 0) && 540 ((dahp->flags & ANON_ALLOC_FORCE) == 0)) { 541 542 ulong_t sapidx, dapidx; 543 ulong_t *sap, *dap; 544 ulong_t chknp; 545 546 while (npages != 0) { 547 548 sapidx = s_idx & ANON_CHUNK_OFF; 549 dapidx = d_idx & ANON_CHUNK_OFF; 550 chknp = ANON_CHUNK_SIZE - MAX(sapidx, dapidx); 551 if (chknp > npages) 552 chknp = npages; 553 554 sapp = &sahp->array_chunk[s_idx >> ANON_CHUNK_SHIFT]; 555 if ((sap = *sapp) != NULL) { 556 dapp = &dahp->array_chunk[d_idx 557 >> ANON_CHUNK_SHIFT]; 558 if ((dap = *dapp) == NULL) { 559 *dapp = kmem_zalloc(PAGESIZE, 560 kmemflags); 561 if ((dap = *dapp) == NULL) 562 return (ENOMEM); 563 } 564 bcopy((sap + sapidx), (dap + dapidx), 565 chknp << ANON_PTRSHIFT); 566 } 567 s_idx += chknp; 568 d_idx += chknp; 569 npages -= chknp; 570 } 571 return (0); 572 } 573 574 /* 575 * At least one of the arrays is 2 level. 576 */ 577 while (npages--) { 578 if ((ap = anon_get_ptr(sahp, s_idx)) != NULL) { 579 ASSERT(!ANON_ISBUSY(anon_get_slot(sahp, s_idx))); 580 if (anon_set_ptr(dahp, d_idx, ap, flags) == ENOMEM) 581 return (ENOMEM); 582 } 583 s_idx++; 584 d_idx++; 585 } 586 return (0); 587 } 588 589 590 /* 591 * ANON_INITBUF is a convenience macro for anon_grow() below. It 592 * takes a buffer dst, which is at least as large as buffer src. It 593 * does a bcopy from src into dst, and then bzeros the extra bytes 594 * of dst. If tail is set, the data in src is tail aligned within 595 * dst instead of head aligned. 
596 */ 597 598 #define ANON_INITBUF(src, srclen, dst, dstsize, tail) \ 599 if (tail) { \ 600 bzero((dst), (dstsize) - (srclen)); \ 601 bcopy((src), (char *)(dst) + (dstsize) - (srclen), (srclen)); \ 602 } else { \ 603 bcopy((src), (dst), (srclen)); \ 604 bzero((char *)(dst) + (srclen), (dstsize) - (srclen)); \ 605 } 606 607 #define ANON_1_LEVEL_INC (ANON_CHUNK_SIZE / 8) 608 #define ANON_2_LEVEL_INC (ANON_1_LEVEL_INC * ANON_CHUNK_SIZE) 609 610 /* 611 * anon_grow() is used to efficiently extend an existing anon array. 612 * startidx_p points to the index into the anon array of the first page 613 * that is in use. oldseg_pgs is the number of pages in use, starting at 614 * *startidx_p. newpages is the number of additional pages desired. 615 * 616 * If startidx_p == NULL, startidx is taken to be 0 and cannot be changed. 617 * 618 * The growth is done by creating a new top level of the anon array, 619 * and (if the array is 2-level) reusing the existing second level arrays. 620 * 621 * flags can be used to specify ANON_NOSLEEP and ANON_GROWDOWN. 622 * 623 * Returns the new number of pages in the anon array. 624 */ 625 pgcnt_t 626 anon_grow(struct anon_hdr *ahp, ulong_t *startidx_p, pgcnt_t oldseg_pgs, 627 pgcnt_t newseg_pgs, int flags) 628 { 629 ulong_t startidx = startidx_p ? *startidx_p : 0; 630 pgcnt_t oldamp_pgs = ahp->size, newamp_pgs; 631 pgcnt_t oelems, nelems, totpages; 632 void **level1; 633 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 634 int growdown = (flags & ANON_GROWDOWN); 635 size_t newarrsz, oldarrsz; 636 void *level2; 637 638 ASSERT(!(startidx_p == NULL && growdown)); 639 ASSERT(startidx + oldseg_pgs <= ahp->size); 640 641 /* 642 * Determine the total number of pages needed in the new 643 * anon array. If growing down, totpages is all pages from 644 * startidx through the end of the array, plus <newseg_pgs> 645 * pages. If growing up, keep all pages from page 0 through 646 * the last page currently in use, plus <newseg_pgs> pages. 
647 */ 648 if (growdown) 649 totpages = oldamp_pgs - startidx + newseg_pgs; 650 else 651 totpages = startidx + oldseg_pgs + newseg_pgs; 652 653 /* If the array is already large enough, just return. */ 654 655 if (oldamp_pgs >= totpages) { 656 if (growdown) 657 *startidx_p = oldamp_pgs - totpages; 658 return (oldamp_pgs); 659 } 660 661 /* 662 * oldamp_pgs/newamp_pgs are the total numbers of pages represented 663 * by the corresponding arrays. 664 * oelems/nelems are the number of pointers in the top level arrays 665 * which may be either level 1 or level 2. 666 * Will the new anon array be one level or two levels? 667 */ 668 if (totpages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 669 newamp_pgs = P2ROUNDUP(totpages, ANON_1_LEVEL_INC); 670 oelems = oldamp_pgs; 671 nelems = newamp_pgs; 672 } else { 673 newamp_pgs = P2ROUNDUP(totpages, ANON_2_LEVEL_INC); 674 oelems = (oldamp_pgs + ANON_CHUNK_OFF) >> ANON_CHUNK_SHIFT; 675 nelems = newamp_pgs >> ANON_CHUNK_SHIFT; 676 } 677 678 newarrsz = nelems * sizeof (void *); 679 level1 = kmem_alloc(newarrsz, kmemflags); 680 if (level1 == NULL) 681 return (0); 682 683 /* Are we converting from a one level to a two level anon array? */ 684 685 if (newamp_pgs > ANON_CHUNK_SIZE && oldamp_pgs <= ANON_CHUNK_SIZE && 686 !(ahp->flags & ANON_ALLOC_FORCE)) { 687 688 /* 689 * Yes, we're converting to a two level. Reuse old level 1 690 * as new level 2 if it is exactly PAGESIZE. Otherwise 691 * alloc a new level 2 and copy the old level 1 data into it. 
692 */ 693 if (oldamp_pgs == ANON_CHUNK_SIZE) { 694 level2 = (void *)ahp->array_chunk; 695 } else { 696 level2 = kmem_alloc(PAGESIZE, kmemflags); 697 if (level2 == NULL) { 698 kmem_free(level1, newarrsz); 699 return (0); 700 } 701 oldarrsz = oldamp_pgs * sizeof (void *); 702 703 ANON_INITBUF(ahp->array_chunk, oldarrsz, 704 level2, PAGESIZE, growdown); 705 kmem_free(ahp->array_chunk, oldarrsz); 706 } 707 bzero(level1, newarrsz); 708 if (growdown) 709 level1[nelems - 1] = level2; 710 else 711 level1[0] = level2; 712 } else { 713 oldarrsz = oelems * sizeof (void *); 714 715 ANON_INITBUF(ahp->array_chunk, oldarrsz, 716 level1, newarrsz, growdown); 717 kmem_free(ahp->array_chunk, oldarrsz); 718 } 719 720 ahp->array_chunk = level1; 721 ahp->size = newamp_pgs; 722 if (growdown) 723 *startidx_p = newamp_pgs - totpages; 724 725 return (newamp_pgs); 726 } 727 728 729 /* 730 * Called from clock handler to sync ani_free value. 731 */ 732 733 void 734 set_anoninfo(void) 735 { 736 int ix; 737 pgcnt_t total = 0; 738 739 for (ix = 0; ix < ANI_MAX_POOL; ix++) { 740 total += ani_free_pool[ix].ani_count; 741 } 742 k_anoninfo.ani_free = total; 743 } 744 745 /* 746 * Reserve anon space. 747 * 748 * It's no longer simply a matter of incrementing ani_resv to 749 * reserve swap space, we need to check memory-based as well 750 * as disk-backed (physical) swap. The following algorithm 751 * is used: 752 * Check the space on physical swap 753 * i.e. amount needed < ani_max - ani_phys_resv 754 * If we are swapping on swapfs check 755 * amount needed < (availrmem - swapfs_minfree) 756 * Since the algorithm to check for the quantity of swap space is 757 * almost the same as that for reserving it, we'll just use anon_resvmem 758 * with a flag to decrement availrmem. 759 * 760 * Return non-zero on success. 
761 */ 762 int 763 anon_resvmem(size_t size, boolean_t takemem, zone_t *zone, int tryhard) 764 { 765 pgcnt_t npages = btopr(size); 766 pgcnt_t mswap_pages = 0; 767 pgcnt_t pswap_pages = 0; 768 proc_t *p = curproc; 769 770 if (zone != NULL && takemem) { 771 /* test zone.max-swap resource control */ 772 mutex_enter(&p->p_lock); 773 if (rctl_incr_swap(p, zone, ptob(npages)) != 0) { 774 mutex_exit(&p->p_lock); 775 return (0); 776 } 777 mutex_exit(&p->p_lock); 778 } 779 mutex_enter(&anoninfo_lock); 780 781 /* 782 * pswap_pages is the number of pages we can take from 783 * physical (i.e. disk-backed) swap. 784 */ 785 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 786 pswap_pages = k_anoninfo.ani_max - k_anoninfo.ani_phys_resv; 787 788 ANON_PRINT(A_RESV, 789 ("anon_resvmem: npages %lu takemem %u pswap %lu caller %p\n", 790 npages, takemem, pswap_pages, (void *)caller())); 791 792 if (npages <= pswap_pages) { 793 /* 794 * we have enough space on a physical swap 795 */ 796 if (takemem) 797 k_anoninfo.ani_phys_resv += npages; 798 mutex_exit(&anoninfo_lock); 799 return (1); 800 } else if (pswap_pages != 0) { 801 /* 802 * we have some space on a physical swap 803 */ 804 if (takemem) { 805 /* 806 * use up remainder of phys swap 807 */ 808 k_anoninfo.ani_phys_resv += pswap_pages; 809 ASSERT(k_anoninfo.ani_phys_resv == k_anoninfo.ani_max); 810 } 811 } 812 /* 813 * since (npages > pswap_pages) we need mem swap 814 * mswap_pages is the number of pages needed from availrmem 815 */ 816 ASSERT(npages > pswap_pages); 817 mswap_pages = npages - pswap_pages; 818 819 ANON_PRINT(A_RESV, ("anon_resvmem: need %ld pages from memory\n", 820 mswap_pages)); 821 822 /* 823 * priv processes can reserve memory as swap as long as availrmem 824 * remains greater than swapfs_minfree; in the case of non-priv 825 * processes, memory can be reserved as swap only if availrmem 826 * doesn't fall below (swapfs_minfree + swapfs_reserve). 
Thus, 827 * swapfs_reserve amount of memswap is not available to non-priv 828 * processes. This protects daemons such as automounter dying 829 * as a result of application processes eating away almost entire 830 * membased swap. This safeguard becomes useless if apps are run 831 * with root access. 832 * 833 * swapfs_reserve is minimum of 4Mb or 1/16 of physmem. 834 * 835 */ 836 if (tryhard) { 837 pgcnt_t floor_pages; 838 839 if (secpolicy_resource_anon_mem(CRED())) { 840 floor_pages = swapfs_minfree; 841 } else { 842 floor_pages = swapfs_minfree + swapfs_reserve; 843 } 844 845 mutex_exit(&anoninfo_lock); 846 (void) page_reclaim_mem(mswap_pages, floor_pages, 0); 847 mutex_enter(&anoninfo_lock); 848 } 849 850 mutex_enter(&freemem_lock); 851 if (availrmem > (swapfs_minfree + swapfs_reserve + mswap_pages) || 852 (availrmem > (swapfs_minfree + mswap_pages) && 853 secpolicy_resource(CRED()) == 0)) { 854 855 if (takemem) { 856 /* 857 * Take the memory from the rest of the system. 858 */ 859 availrmem -= mswap_pages; 860 mutex_exit(&freemem_lock); 861 k_anoninfo.ani_mem_resv += mswap_pages; 862 ANI_ADD(mswap_pages); 863 ANON_PRINT((A_RESV | A_MRESV), 864 ("anon_resvmem: took %ld pages of availrmem\n", 865 mswap_pages)); 866 } else { 867 mutex_exit(&freemem_lock); 868 } 869 870 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 871 mutex_exit(&anoninfo_lock); 872 return (1); 873 } else { 874 /* 875 * Fail if not enough memory 876 */ 877 if (takemem) { 878 k_anoninfo.ani_phys_resv -= pswap_pages; 879 } 880 881 mutex_exit(&freemem_lock); 882 mutex_exit(&anoninfo_lock); 883 ANON_PRINT(A_RESV, 884 ("anon_resvmem: not enough space from swapfs\n")); 885 if (zone != NULL && takemem) 886 rctl_decr_swap(zone, ptob(npages)); 887 return (0); 888 } 889 } 890 891 /* 892 * Give back an anon reservation. 
893 */ 894 void 895 anon_unresvmem(size_t size, zone_t *zone) 896 { 897 pgcnt_t npages = btopr(size); 898 spgcnt_t mem_free_pages = 0; 899 pgcnt_t phys_free_slots; 900 #ifdef ANON_DEBUG 901 pgcnt_t mem_resv; 902 #endif 903 if (zone != NULL) 904 rctl_decr_swap(zone, ptob(npages)); 905 906 mutex_enter(&anoninfo_lock); 907 908 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 909 910 /* 911 * If some of this reservation belonged to swapfs 912 * give it back to availrmem. 913 * ani_mem_resv is the amount of availrmem swapfs has reserved. 914 * but some of that memory could be locked by segspt so we can only 915 * return non locked ani_mem_resv back to availrmem 916 */ 917 if (k_anoninfo.ani_mem_resv > k_anoninfo.ani_locked_swap) { 918 ANON_PRINT((A_RESV | A_MRESV), 919 ("anon_unresv: growing availrmem by %ld pages\n", 920 MIN(k_anoninfo.ani_mem_resv, npages))); 921 922 mem_free_pages = MIN((spgcnt_t)(k_anoninfo.ani_mem_resv - 923 k_anoninfo.ani_locked_swap), npages); 924 mutex_enter(&freemem_lock); 925 availrmem += mem_free_pages; 926 mutex_exit(&freemem_lock); 927 k_anoninfo.ani_mem_resv -= mem_free_pages; 928 929 ANI_ADD(-mem_free_pages); 930 } 931 /* 932 * The remainder of the pages is returned to phys swap 933 */ 934 ASSERT(npages >= mem_free_pages); 935 phys_free_slots = npages - mem_free_pages; 936 937 if (phys_free_slots) { 938 k_anoninfo.ani_phys_resv -= phys_free_slots; 939 } 940 941 #ifdef ANON_DEBUG 942 mem_resv = k_anoninfo.ani_mem_resv; 943 #endif 944 945 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 946 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 947 948 mutex_exit(&anoninfo_lock); 949 950 ANON_PRINT(A_RESV, ("anon_unresv: %lu, tot %lu, caller %p\n", 951 npages, mem_resv, (void *)caller())); 952 } 953 954 /* 955 * Allocate an anon slot and return it with the lock held. 
956 */ 957 struct anon * 958 anon_alloc(struct vnode *vp, anoff_t off) 959 { 960 struct anon *ap; 961 kmutex_t *ahm; 962 963 ap = kmem_cache_alloc(anon_cache, KM_SLEEP); 964 if (vp == NULL) { 965 swap_alloc(ap); 966 } else { 967 ap->an_vp = vp; 968 ap->an_off = off; 969 } 970 ap->an_refcnt = 1; 971 ap->an_pvp = NULL; 972 ap->an_poff = 0; 973 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 974 mutex_enter(ahm); 975 anon_addhash(ap); 976 mutex_exit(ahm); 977 ANI_ADD(-1); 978 ANON_PRINT(A_ANON, ("anon_alloc: returning ap %p, vp %p\n", 979 (void *)ap, (ap ? (void *)ap->an_vp : NULL))); 980 return (ap); 981 } 982 983 /* 984 * Called for pages locked in memory via softlock/pagelock/mlock to make sure 985 * such pages don't consume any physical swap resources needed for swapping 986 * unlocked pages. 987 */ 988 void 989 anon_swap_free(struct anon *ap, page_t *pp) 990 { 991 kmutex_t *ahm; 992 993 ASSERT(ap != NULL); 994 ASSERT(pp != NULL); 995 ASSERT(PAGE_LOCKED(pp)); 996 ASSERT(pp->p_vnode != NULL); 997 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 998 ASSERT(ap->an_refcnt != 0); 999 ASSERT(pp->p_vnode == ap->an_vp); 1000 ASSERT(pp->p_offset == ap->an_off); 1001 1002 if (ap->an_pvp == NULL) 1003 return; 1004 1005 page_io_lock(pp); 1006 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1007 mutex_enter(ahm); 1008 1009 ASSERT(ap->an_refcnt != 0); 1010 ASSERT(pp->p_vnode == ap->an_vp); 1011 ASSERT(pp->p_offset == ap->an_off); 1012 1013 if (ap->an_pvp != NULL) { 1014 swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE); 1015 ap->an_pvp = NULL; 1016 ap->an_poff = 0; 1017 mutex_exit(ahm); 1018 hat_setmod(pp); 1019 } else { 1020 mutex_exit(ahm); 1021 } 1022 page_io_unlock(pp); 1023 } 1024 1025 /* 1026 * Decrement the reference count of an anon page. 1027 * If reference count goes to zero, free it and 1028 * its associated page (if any). 
1029 */ 1030 void 1031 anon_decref(struct anon *ap) 1032 { 1033 page_t *pp; 1034 struct vnode *vp; 1035 anoff_t off; 1036 kmutex_t *ahm; 1037 1038 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1039 mutex_enter(ahm); 1040 ASSERT(ap->an_refcnt != 0); 1041 if (ap->an_refcnt == 0) 1042 panic("anon_decref: slot count 0"); 1043 if (--ap->an_refcnt == 0) { 1044 swap_xlate(ap, &vp, &off); 1045 anon_rmhash(ap); 1046 if (ap->an_pvp != NULL) 1047 swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE); 1048 mutex_exit(ahm); 1049 1050 /* 1051 * If there is a page for this anon slot we will need to 1052 * call VN_DISPOSE to get rid of the vp association and 1053 * put the page back on the free list as really free. 1054 * Acquire the "exclusive" lock to ensure that any 1055 * pending i/o always completes before the swap slot 1056 * is freed. 1057 */ 1058 pp = page_lookup(vp, (u_offset_t)off, SE_EXCL); 1059 if (pp != NULL) { 1060 /*LINTED: constant in conditional context */ 1061 VN_DISPOSE(pp, B_INVAL, 0, kcred); 1062 } 1063 ANON_PRINT(A_ANON, ("anon_decref: free ap %p, vp %p\n", 1064 (void *)ap, (void *)ap->an_vp)); 1065 1066 kmem_cache_free(anon_cache, ap); 1067 1068 ANI_ADD(1); 1069 } else { 1070 mutex_exit(ahm); 1071 } 1072 } 1073 1074 1075 /* 1076 * check an_refcnt of the root anon slot (anon_index argument is aligned at 1077 * seg->s_szc level) to determine whether COW processing is required. 1078 * anonpages_hash_lock[] held on the root ap ensures that if root's 1079 * refcnt is 1 all other refcnt's are 1 as well (and they can't increase 1080 * later since this process can't fork while its AS lock is held). 1081 * 1082 * returns 1 if the root anon slot has a refcnt > 1 otherwise returns 0. 
1083 */ 1084 int 1085 anon_szcshare(struct anon_hdr *ahp, ulong_t anon_index) 1086 { 1087 struct anon *ap; 1088 kmutex_t *ahmpages = NULL; 1089 1090 ap = anon_get_ptr(ahp, anon_index); 1091 if (ap == NULL) 1092 return (0); 1093 1094 ahmpages = APH_MUTEX(ap->an_vp, ap->an_off); 1095 mutex_enter(ahmpages); 1096 ASSERT(ap->an_refcnt >= 1); 1097 if (ap->an_refcnt == 1) { 1098 mutex_exit(ahmpages); 1099 return (0); 1100 } 1101 mutex_exit(ahmpages); 1102 return (1); 1103 } 1104 /* 1105 * Check 'nslots' anon slots for refcnt > 1. 1106 * 1107 * returns 1 if any of the 'nslots' anon slots has a refcnt > 1 otherwise 1108 * returns 0. 1109 */ 1110 static int 1111 anon_share(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots) 1112 { 1113 struct anon *ap; 1114 1115 while (nslots-- > 0) { 1116 if ((ap = anon_get_ptr(ahp, anon_index)) != NULL && 1117 ap->an_refcnt > 1) 1118 return (1); 1119 anon_index++; 1120 } 1121 1122 return (0); 1123 } 1124 1125 static void 1126 anon_decref_pages( 1127 struct anon_hdr *ahp, 1128 ulong_t an_idx, 1129 uint_t szc) 1130 { 1131 struct anon *ap = anon_get_ptr(ahp, an_idx); 1132 kmutex_t *ahmpages = NULL; 1133 page_t *pp; 1134 pgcnt_t pgcnt = page_get_pagecnt(szc); 1135 pgcnt_t i; 1136 struct vnode *vp; 1137 anoff_t off; 1138 kmutex_t *ahm; 1139 #ifdef DEBUG 1140 int refcnt = 1; 1141 #endif 1142 1143 ASSERT(szc != 0); 1144 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1145 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1146 ASSERT(an_idx < ahp->size); 1147 1148 if (ahp->size - an_idx < pgcnt) { 1149 /* 1150 * In case of shared mappings total anon map size may not be 1151 * the largest page size aligned. 
1152 */ 1153 pgcnt = ahp->size - an_idx; 1154 } 1155 1156 VM_STAT_ADD(anonvmstats.decrefpages[0]); 1157 1158 if (ap != NULL) { 1159 ahmpages = APH_MUTEX(ap->an_vp, ap->an_off); 1160 mutex_enter(ahmpages); 1161 ASSERT((refcnt = ap->an_refcnt) != 0); 1162 VM_STAT_ADD(anonvmstats.decrefpages[1]); 1163 if (ap->an_refcnt == 1) { 1164 VM_STAT_ADD(anonvmstats.decrefpages[2]); 1165 ASSERT(!anon_share(ahp, an_idx, pgcnt)); 1166 mutex_exit(ahmpages); 1167 ahmpages = NULL; 1168 } 1169 } 1170 1171 i = 0; 1172 while (i < pgcnt) { 1173 if ((ap = anon_get_ptr(ahp, an_idx + i)) == NULL) { 1174 ASSERT(refcnt == 1 && ahmpages == NULL); 1175 i++; 1176 continue; 1177 } 1178 ASSERT(ap->an_refcnt == refcnt); 1179 ASSERT(ahmpages != NULL || ap->an_refcnt == 1); 1180 ASSERT(ahmpages == NULL || ap->an_refcnt > 1); 1181 1182 if (ahmpages == NULL) { 1183 swap_xlate(ap, &vp, &off); 1184 pp = page_lookup(vp, (u_offset_t)off, SE_EXCL); 1185 if (pp == NULL || pp->p_szc == 0) { 1186 VM_STAT_ADD(anonvmstats.decrefpages[3]); 1187 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1188 (void) anon_set_ptr(ahp, an_idx + i, NULL, 1189 ANON_SLEEP); 1190 mutex_enter(ahm); 1191 ap->an_refcnt--; 1192 ASSERT(ap->an_refcnt == 0); 1193 anon_rmhash(ap); 1194 if (ap->an_pvp) 1195 swap_phys_free(ap->an_pvp, ap->an_poff, 1196 PAGESIZE); 1197 mutex_exit(ahm); 1198 if (pp == NULL) { 1199 pp = page_lookup(vp, (u_offset_t)off, 1200 SE_EXCL); 1201 ASSERT(pp == NULL || pp->p_szc == 0); 1202 } 1203 if (pp != NULL) { 1204 VM_STAT_ADD(anonvmstats.decrefpages[4]); 1205 /*LINTED*/ 1206 VN_DISPOSE(pp, B_INVAL, 0, kcred); 1207 } 1208 kmem_cache_free(anon_cache, ap); 1209 ANI_ADD(1); 1210 i++; 1211 } else { 1212 pgcnt_t j; 1213 pgcnt_t curpgcnt = 1214 page_get_pagecnt(pp->p_szc); 1215 size_t ppasize = curpgcnt * sizeof (page_t *); 1216 page_t **ppa = kmem_alloc(ppasize, KM_SLEEP); 1217 int dispose = 0; 1218 1219 VM_STAT_ADD(anonvmstats.decrefpages[5]); 1220 1221 ASSERT(pp->p_szc <= szc); 1222 ASSERT(IS_P2ALIGNED(curpgcnt, curpgcnt)); 
1223 ASSERT(IS_P2ALIGNED(i, curpgcnt)); 1224 ASSERT(i + curpgcnt <= pgcnt); 1225 ASSERT(!(page_pptonum(pp) & (curpgcnt - 1))); 1226 ppa[0] = pp; 1227 for (j = i + 1; j < i + curpgcnt; j++) { 1228 ap = anon_get_ptr(ahp, an_idx + j); 1229 ASSERT(ap != NULL && 1230 ap->an_refcnt == 1); 1231 swap_xlate(ap, &vp, &off); 1232 pp = page_lookup(vp, (u_offset_t)off, 1233 SE_EXCL); 1234 if (pp == NULL) 1235 panic("anon_decref_pages: " 1236 "no page"); 1237 1238 (void) hat_pageunload(pp, 1239 HAT_FORCE_PGUNLOAD); 1240 ASSERT(pp->p_szc == ppa[0]->p_szc); 1241 ASSERT(page_pptonum(pp) - 1 == 1242 page_pptonum(ppa[j - i - 1])); 1243 ppa[j - i] = pp; 1244 if (ap->an_pvp != NULL && 1245 !vn_matchopval(ap->an_pvp, 1246 VOPNAME_DISPOSE, 1247 (fs_generic_func_p)fs_dispose)) 1248 dispose = 1; 1249 } 1250 for (j = i; j < i + curpgcnt; j++) { 1251 ap = anon_get_ptr(ahp, an_idx + j); 1252 ASSERT(ap != NULL && 1253 ap->an_refcnt == 1); 1254 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1255 (void) anon_set_ptr(ahp, an_idx + j, 1256 NULL, ANON_SLEEP); 1257 mutex_enter(ahm); 1258 ap->an_refcnt--; 1259 ASSERT(ap->an_refcnt == 0); 1260 anon_rmhash(ap); 1261 if (ap->an_pvp) 1262 swap_phys_free(ap->an_pvp, 1263 ap->an_poff, PAGESIZE); 1264 mutex_exit(ahm); 1265 kmem_cache_free(anon_cache, ap); 1266 ANI_ADD(1); 1267 } 1268 if (!dispose) { 1269 VM_STAT_ADD(anonvmstats.decrefpages[6]); 1270 page_destroy_pages(ppa[0]); 1271 } else { 1272 VM_STAT_ADD(anonvmstats.decrefpages[7]); 1273 for (j = 0; j < curpgcnt; j++) { 1274 ASSERT(PAGE_EXCL(ppa[j])); 1275 ppa[j]->p_szc = 0; 1276 } 1277 for (j = 0; j < curpgcnt; j++) { 1278 ASSERT(!hat_page_is_mapped( 1279 ppa[j])); 1280 /*LINTED*/ 1281 VN_DISPOSE(ppa[j], B_INVAL, 0, 1282 kcred); 1283 } 1284 } 1285 kmem_free(ppa, ppasize); 1286 i += curpgcnt; 1287 } 1288 } else { 1289 VM_STAT_ADD(anonvmstats.decrefpages[8]); 1290 (void) anon_set_ptr(ahp, an_idx + i, NULL, ANON_SLEEP); 1291 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1292 mutex_enter(ahm); 1293 ap->an_refcnt--; 1294 
mutex_exit(ahm); 1295 i++; 1296 } 1297 } 1298 1299 if (ahmpages != NULL) { 1300 mutex_exit(ahmpages); 1301 } 1302 } 1303 1304 /* 1305 * Duplicate references to size bytes worth of anon pages. 1306 * Used when duplicating a segment that contains private anon pages. 1307 * This code assumes that procedure calling this one has already used 1308 * hat_chgprot() to disable write access to the range of addresses that 1309 * that *old actually refers to. 1310 */ 1311 void 1312 anon_dup(struct anon_hdr *old, ulong_t old_idx, struct anon_hdr *new, 1313 ulong_t new_idx, size_t size) 1314 { 1315 spgcnt_t npages; 1316 kmutex_t *ahm; 1317 struct anon *ap; 1318 ulong_t off; 1319 ulong_t index; 1320 1321 npages = btopr(size); 1322 while (npages > 0) { 1323 index = old_idx; 1324 if ((ap = anon_get_next_ptr(old, &index)) == NULL) 1325 break; 1326 1327 ASSERT(!ANON_ISBUSY(anon_get_slot(old, index))); 1328 off = index - old_idx; 1329 npages -= off; 1330 if (npages <= 0) 1331 break; 1332 1333 (void) anon_set_ptr(new, new_idx + off, ap, ANON_SLEEP); 1334 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1335 1336 mutex_enter(ahm); 1337 ap->an_refcnt++; 1338 mutex_exit(ahm); 1339 1340 off++; 1341 new_idx += off; 1342 old_idx += off; 1343 npages--; 1344 } 1345 } 1346 1347 /* 1348 * Just like anon_dup but also guarantees there are no holes (unallocated anon 1349 * slots) within any large page region. That means if a large page region is 1350 * empty in the old array it will skip it. If there are 1 or more valid slots 1351 * in the large page region of the old array it will make sure to fill in any 1352 * unallocated ones and also copy them to the new array. If noalloc is 1 large 1353 * page region should either have no valid anon slots or all slots should be 1354 * valid. 
1355 */ 1356 void 1357 anon_dup_fill_holes( 1358 struct anon_hdr *old, 1359 ulong_t old_idx, 1360 struct anon_hdr *new, 1361 ulong_t new_idx, 1362 size_t size, 1363 uint_t szc, 1364 int noalloc) 1365 { 1366 struct anon *ap; 1367 spgcnt_t npages; 1368 kmutex_t *ahm, *ahmpages = NULL; 1369 pgcnt_t pgcnt, i; 1370 ulong_t index, off; 1371 #ifdef DEBUG 1372 int refcnt; 1373 #endif 1374 1375 ASSERT(szc != 0); 1376 pgcnt = page_get_pagecnt(szc); 1377 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1378 npages = btopr(size); 1379 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1380 ASSERT(IS_P2ALIGNED(old_idx, pgcnt)); 1381 1382 VM_STAT_ADD(anonvmstats.dupfillholes[0]); 1383 1384 while (npages > 0) { 1385 index = old_idx; 1386 1387 /* 1388 * Find the next valid slot. 1389 */ 1390 if (anon_get_next_ptr(old, &index) == NULL) 1391 break; 1392 1393 ASSERT(!ANON_ISBUSY(anon_get_slot(old, index))); 1394 /* 1395 * Now backup index to the beginning of the 1396 * current large page region of the old array. 1397 */ 1398 index = P2ALIGN(index, pgcnt); 1399 off = index - old_idx; 1400 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1401 npages -= off; 1402 if (npages <= 0) 1403 break; 1404 1405 /* 1406 * Fill and copy a large page regions worth 1407 * of anon slots. 1408 */ 1409 for (i = 0; i < pgcnt; i++) { 1410 if ((ap = anon_get_ptr(old, index + i)) == NULL) { 1411 if (noalloc) { 1412 panic("anon_dup_fill_holes: " 1413 "empty anon slot\n"); 1414 } 1415 VM_STAT_ADD(anonvmstats.dupfillholes[1]); 1416 ap = anon_alloc(NULL, 0); 1417 (void) anon_set_ptr(old, index + i, ap, 1418 ANON_SLEEP); 1419 } else if (i == 0) { 1420 /* 1421 * make the increment of all refcnts of all 1422 * anon slots of a large page appear atomic by 1423 * getting an anonpages_hash_lock for the 1424 * first anon slot of a large page. 
1425 */ 1426 VM_STAT_ADD(anonvmstats.dupfillholes[2]); 1427 1428 ahmpages = APH_MUTEX(ap->an_vp, ap->an_off); 1429 mutex_enter(ahmpages); 1430 /*LINTED*/ 1431 ASSERT(refcnt = ap->an_refcnt); 1432 1433 VM_STAT_COND_ADD(ap->an_refcnt > 1, 1434 anonvmstats.dupfillholes[3]); 1435 } 1436 (void) anon_set_ptr(new, new_idx + off + i, ap, 1437 ANON_SLEEP); 1438 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1439 mutex_enter(ahm); 1440 ASSERT(ahmpages != NULL || ap->an_refcnt == 1); 1441 ASSERT(i == 0 || ahmpages == NULL || 1442 refcnt == ap->an_refcnt); 1443 ap->an_refcnt++; 1444 mutex_exit(ahm); 1445 } 1446 if (ahmpages != NULL) { 1447 mutex_exit(ahmpages); 1448 ahmpages = NULL; 1449 } 1450 off += pgcnt; 1451 new_idx += off; 1452 old_idx += off; 1453 npages -= pgcnt; 1454 } 1455 } 1456 1457 /* 1458 * Used when a segment with a vnode changes szc. similarly to 1459 * anon_dup_fill_holes() makes sure each large page region either has no anon 1460 * slots or all of them. but new slots are created by COWing the file 1461 * pages. on entrance no anon slots should be shared. 1462 */ 1463 int 1464 anon_fill_cow_holes( 1465 struct seg *seg, 1466 caddr_t addr, 1467 struct anon_hdr *ahp, 1468 ulong_t an_idx, 1469 struct vnode *vp, 1470 u_offset_t vp_off, 1471 size_t size, 1472 uint_t szc, 1473 uint_t prot, 1474 struct vpage vpage[], 1475 struct cred *cred) 1476 { 1477 struct anon *ap; 1478 spgcnt_t npages; 1479 pgcnt_t pgcnt, i; 1480 ulong_t index, off; 1481 int err = 0; 1482 int pageflags = 0; 1483 1484 ASSERT(szc != 0); 1485 pgcnt = page_get_pagecnt(szc); 1486 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1487 npages = btopr(size); 1488 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1489 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1490 1491 while (npages > 0) { 1492 index = an_idx; 1493 1494 /* 1495 * Find the next valid slot. 
1496 */ 1497 if (anon_get_next_ptr(ahp, &index) == NULL) { 1498 break; 1499 } 1500 1501 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1502 /* 1503 * Now backup index to the beginning of the 1504 * current large page region of the anon array. 1505 */ 1506 index = P2ALIGN(index, pgcnt); 1507 off = index - an_idx; 1508 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1509 npages -= off; 1510 if (npages <= 0) 1511 break; 1512 an_idx += off; 1513 vp_off += ptob(off); 1514 addr += ptob(off); 1515 if (vpage != NULL) { 1516 vpage += off; 1517 } 1518 1519 for (i = 0; i < pgcnt; i++, an_idx++, vp_off += PAGESIZE) { 1520 if ((ap = anon_get_ptr(ahp, an_idx)) == NULL) { 1521 page_t *pl[1 + 1]; 1522 page_t *pp; 1523 1524 err = VOP_GETPAGE(vp, vp_off, PAGESIZE, NULL, 1525 pl, PAGESIZE, seg, addr, S_READ, cred, 1526 NULL); 1527 if (err) { 1528 break; 1529 } 1530 if (vpage != NULL) { 1531 prot = VPP_PROT(vpage); 1532 pageflags = VPP_ISPPLOCK(vpage) ? 1533 LOCK_PAGE : 0; 1534 } 1535 pp = anon_private(&ap, seg, addr, prot, pl[0], 1536 pageflags, cred); 1537 if (pp == NULL) { 1538 err = ENOMEM; 1539 break; 1540 } 1541 (void) anon_set_ptr(ahp, an_idx, ap, 1542 ANON_SLEEP); 1543 page_unlock(pp); 1544 } 1545 ASSERT(ap->an_refcnt == 1); 1546 addr += PAGESIZE; 1547 if (vpage != NULL) { 1548 vpage++; 1549 } 1550 } 1551 npages -= pgcnt; 1552 } 1553 1554 return (err); 1555 } 1556 1557 /* 1558 * Free a group of "size" anon pages, size in bytes, 1559 * and clear out the pointers to the anon entries. 
1560 */ 1561 void 1562 anon_free(struct anon_hdr *ahp, ulong_t index, size_t size) 1563 { 1564 spgcnt_t npages; 1565 struct anon *ap; 1566 ulong_t old; 1567 1568 npages = btopr(size); 1569 1570 while (npages > 0) { 1571 old = index; 1572 if ((ap = anon_get_next_ptr(ahp, &index)) == NULL) 1573 break; 1574 1575 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1576 npages -= index - old; 1577 if (npages <= 0) 1578 break; 1579 1580 (void) anon_set_ptr(ahp, index, NULL, ANON_SLEEP); 1581 anon_decref(ap); 1582 /* 1583 * Bump index and decrement page count 1584 */ 1585 index++; 1586 npages--; 1587 } 1588 } 1589 1590 void 1591 anon_free_pages( 1592 struct anon_hdr *ahp, 1593 ulong_t an_idx, 1594 size_t size, 1595 uint_t szc) 1596 { 1597 spgcnt_t npages; 1598 pgcnt_t pgcnt; 1599 ulong_t index, off; 1600 1601 ASSERT(szc != 0); 1602 pgcnt = page_get_pagecnt(szc); 1603 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1604 npages = btopr(size); 1605 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1606 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1607 ASSERT(an_idx < ahp->size); 1608 1609 VM_STAT_ADD(anonvmstats.freepages[0]); 1610 1611 while (npages > 0) { 1612 index = an_idx; 1613 1614 /* 1615 * Find the next valid slot. 1616 */ 1617 if (anon_get_next_ptr(ahp, &index) == NULL) 1618 break; 1619 1620 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1621 /* 1622 * Now backup index to the beginning of the 1623 * current large page region of the old array. 
1624 */ 1625 index = P2ALIGN(index, pgcnt); 1626 off = index - an_idx; 1627 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1628 npages -= off; 1629 if (npages <= 0) 1630 break; 1631 1632 anon_decref_pages(ahp, index, szc); 1633 1634 off += pgcnt; 1635 an_idx += off; 1636 npages -= pgcnt; 1637 } 1638 } 1639 1640 /* 1641 * Make anonymous pages discardable 1642 */ 1643 void 1644 anon_disclaim(struct anon_map *amp, ulong_t index, size_t size) 1645 { 1646 spgcnt_t npages = btopr(size); 1647 struct anon *ap; 1648 struct vnode *vp; 1649 anoff_t off; 1650 page_t *pp, *root_pp; 1651 kmutex_t *ahm; 1652 pgcnt_t pgcnt; 1653 ulong_t old_idx, idx, i; 1654 struct anon_hdr *ahp = amp->ahp; 1655 anon_sync_obj_t cookie; 1656 1657 ASSERT(RW_READ_HELD(&->a_rwlock)); 1658 pgcnt = 1; 1659 for (; npages > 0; index = (pgcnt == 1) ? index + 1 : 1660 P2ROUNDUP(index + 1, pgcnt), npages -= pgcnt) { 1661 1662 /* 1663 * get anon pointer and index for the first valid entry 1664 * in the anon list, starting from "index" 1665 */ 1666 old_idx = index; 1667 if ((ap = anon_get_next_ptr(ahp, &index)) == NULL) 1668 break; 1669 1670 /* 1671 * decrement npages by number of NULL anon slots we skipped 1672 */ 1673 npages -= index - old_idx; 1674 if (npages <= 0) 1675 break; 1676 1677 anon_array_enter(amp, index, &cookie); 1678 ap = anon_get_ptr(ahp, index); 1679 ASSERT(ap != NULL); 1680 1681 /* 1682 * Get anonymous page and try to lock it SE_EXCL; 1683 * if we couldn't grab the lock we skip to next page. 1684 */ 1685 swap_xlate(ap, &vp, &off); 1686 pp = page_lookup_nowait(vp, (u_offset_t)off, SE_EXCL); 1687 if (pp == NULL) { 1688 segadvstat.MADV_FREE_miss.value.ul++; 1689 pgcnt = 1; 1690 anon_array_exit(&cookie); 1691 continue; 1692 } 1693 pgcnt = page_get_pagecnt(pp->p_szc); 1694 1695 /* 1696 * we cannot free a page which is permanently locked. 1697 * The page_struct_lock need not be acquired to examine 1698 * these fields since the page has an "exclusive" lock. 
1699 */ 1700 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1701 page_unlock(pp); 1702 segadvstat.MADV_FREE_miss.value.ul++; 1703 anon_array_exit(&cookie); 1704 continue; 1705 } 1706 1707 ahm = AH_MUTEX(vp, off); 1708 mutex_enter(ahm); 1709 ASSERT(ap->an_refcnt != 0); 1710 /* 1711 * skip this one if copy-on-write is not yet broken. 1712 */ 1713 if (ap->an_refcnt > 1) { 1714 mutex_exit(ahm); 1715 page_unlock(pp); 1716 segadvstat.MADV_FREE_miss.value.ul++; 1717 anon_array_exit(&cookie); 1718 continue; 1719 } 1720 1721 if (pp->p_szc == 0) { 1722 pgcnt = 1; 1723 1724 /* 1725 * free swap slot; 1726 */ 1727 if (ap->an_pvp) { 1728 swap_phys_free(ap->an_pvp, ap->an_poff, 1729 PAGESIZE); 1730 ap->an_pvp = NULL; 1731 ap->an_poff = 0; 1732 } 1733 mutex_exit(ahm); 1734 segadvstat.MADV_FREE_hit.value.ul++; 1735 1736 /* 1737 * while we are at it, unload all the translations 1738 * and attempt to free the page. 1739 */ 1740 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1741 /*LINTED: constant in conditional context */ 1742 VN_DISPOSE(pp, B_FREE, 0, kcred); 1743 anon_array_exit(&cookie); 1744 continue; 1745 } 1746 1747 pgcnt = page_get_pagecnt(pp->p_szc); 1748 if (!IS_P2ALIGNED(index, pgcnt) || npages < pgcnt) { 1749 if (!page_try_demote_pages(pp)) { 1750 mutex_exit(ahm); 1751 page_unlock(pp); 1752 segadvstat.MADV_FREE_miss.value.ul++; 1753 anon_array_exit(&cookie); 1754 continue; 1755 } else { 1756 pgcnt = 1; 1757 if (ap->an_pvp) { 1758 swap_phys_free(ap->an_pvp, 1759 ap->an_poff, PAGESIZE); 1760 ap->an_pvp = NULL; 1761 ap->an_poff = 0; 1762 } 1763 mutex_exit(ahm); 1764 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1765 /*LINTED*/ 1766 VN_DISPOSE(pp, B_FREE, 0, kcred); 1767 segadvstat.MADV_FREE_hit.value.ul++; 1768 anon_array_exit(&cookie); 1769 continue; 1770 } 1771 } 1772 mutex_exit(ahm); 1773 root_pp = pp; 1774 1775 /* 1776 * try to lock remaining pages 1777 */ 1778 for (idx = 1; idx < pgcnt; idx++) { 1779 pp++; 1780 if (!page_trylock(pp, SE_EXCL)) 1781 break; 1782 if 
(pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1783 page_unlock(pp); 1784 break; 1785 } 1786 } 1787 1788 if (idx == pgcnt) { 1789 for (i = 0; i < pgcnt; i++) { 1790 ap = anon_get_ptr(ahp, index + i); 1791 if (ap == NULL) 1792 break; 1793 swap_xlate(ap, &vp, &off); 1794 ahm = AH_MUTEX(vp, off); 1795 mutex_enter(ahm); 1796 ASSERT(ap->an_refcnt != 0); 1797 1798 /* 1799 * skip this one if copy-on-write 1800 * is not yet broken. 1801 */ 1802 if (ap->an_refcnt > 1) { 1803 mutex_exit(ahm); 1804 goto skiplp; 1805 } 1806 if (ap->an_pvp) { 1807 swap_phys_free(ap->an_pvp, 1808 ap->an_poff, PAGESIZE); 1809 ap->an_pvp = NULL; 1810 ap->an_poff = 0; 1811 } 1812 mutex_exit(ahm); 1813 } 1814 page_destroy_pages(root_pp); 1815 segadvstat.MADV_FREE_hit.value.ul += pgcnt; 1816 anon_array_exit(&cookie); 1817 continue; 1818 } 1819 skiplp: 1820 segadvstat.MADV_FREE_miss.value.ul += pgcnt; 1821 for (i = 0, pp = root_pp; i < idx; pp++, i++) 1822 page_unlock(pp); 1823 anon_array_exit(&cookie); 1824 } 1825 } 1826 1827 /* 1828 * Return the kept page(s) and protections back to the segment driver. 1829 */ 1830 int 1831 anon_getpage( 1832 struct anon **app, 1833 uint_t *protp, 1834 page_t *pl[], 1835 size_t plsz, 1836 struct seg *seg, 1837 caddr_t addr, 1838 enum seg_rw rw, 1839 struct cred *cred) 1840 { 1841 page_t *pp; 1842 struct anon *ap = *app; 1843 struct vnode *vp; 1844 anoff_t off; 1845 int err; 1846 kmutex_t *ahm; 1847 1848 swap_xlate(ap, &vp, &off); 1849 1850 /* 1851 * Lookup the page. If page is being paged in, 1852 * wait for it to finish as we must return a list of 1853 * pages since this routine acts like the VOP_GETPAGE 1854 * routine does. 
1855 */ 1856 if (pl != NULL && (pp = page_lookup(vp, (u_offset_t)off, SE_SHARED))) { 1857 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1858 mutex_enter(ahm); 1859 if (ap->an_refcnt == 1) 1860 *protp = PROT_ALL; 1861 else 1862 *protp = PROT_ALL & ~PROT_WRITE; 1863 mutex_exit(ahm); 1864 pl[0] = pp; 1865 pl[1] = NULL; 1866 return (0); 1867 } 1868 1869 /* 1870 * Simply treat it as a vnode fault on the anon vp. 1871 */ 1872 1873 TRACE_3(TR_FAC_VM, TR_ANON_GETPAGE, 1874 "anon_getpage:seg %x addr %x vp %x", 1875 seg, addr, vp); 1876 1877 err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, protp, pl, plsz, 1878 seg, addr, rw, cred, NULL); 1879 1880 if (err == 0 && pl != NULL) { 1881 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1882 mutex_enter(ahm); 1883 if (ap->an_refcnt != 1) 1884 *protp &= ~PROT_WRITE; /* make read-only */ 1885 mutex_exit(ahm); 1886 } 1887 return (err); 1888 } 1889 1890 /* 1891 * Creates or returns kept pages to the segment driver. returns -1 if a large 1892 * page cannot be allocated. returns -2 if some other process has allocated a 1893 * larger page. 1894 * 1895 * For cowfault it will allocate any size pages to fill the requested area to 1896 * avoid partially overwriting anon slots (i.e. sharing only some of the anon 1897 * slots within a large page with other processes). This policy greatly 1898 * simplifies large page freeing (which is only freed when all anon slot 1899 * refcnts are 0). 
1900 */ 1901 int 1902 anon_map_getpages( 1903 struct anon_map *amp, 1904 ulong_t start_idx, 1905 uint_t szc, 1906 struct seg *seg, 1907 caddr_t addr, 1908 uint_t prot, 1909 uint_t *protp, 1910 page_t *ppa[], 1911 uint_t *ppa_szc, 1912 struct vpage vpage[], 1913 enum seg_rw rw, 1914 int brkcow, 1915 int anypgsz, 1916 int pgflags, 1917 struct cred *cred) 1918 { 1919 pgcnt_t pgcnt; 1920 struct anon *ap; 1921 struct vnode *vp; 1922 anoff_t off; 1923 page_t *pp, *pl[2], *conpp = NULL; 1924 caddr_t vaddr; 1925 ulong_t pg_idx, an_idx, i; 1926 spgcnt_t nreloc = 0; 1927 int prealloc = 1; 1928 int err, slotcreate; 1929 uint_t vpprot; 1930 int upsize = (szc < seg->s_szc); 1931 1932 #if !defined(__i386) && !defined(__amd64) 1933 ASSERT(seg->s_szc != 0); 1934 #endif 1935 ASSERT(szc <= seg->s_szc); 1936 ASSERT(ppa_szc != NULL); 1937 ASSERT(rw != S_CREATE); 1938 1939 *protp = PROT_ALL; 1940 1941 VM_STAT_ADD(anonvmstats.getpages[0]); 1942 1943 if (szc == 0) { 1944 VM_STAT_ADD(anonvmstats.getpages[1]); 1945 if ((ap = anon_get_ptr(amp->ahp, start_idx)) != NULL) { 1946 err = anon_getpage(&ap, protp, pl, PAGESIZE, seg, 1947 addr, rw, cred); 1948 if (err) 1949 return (err); 1950 ppa[0] = pl[0]; 1951 if (brkcow == 0 || (*protp & PROT_WRITE)) { 1952 VM_STAT_ADD(anonvmstats.getpages[2]); 1953 if (ppa[0]->p_szc != 0 && upsize) { 1954 VM_STAT_ADD(anonvmstats.getpages[3]); 1955 *ppa_szc = MIN(ppa[0]->p_szc, 1956 seg->s_szc); 1957 page_unlock(ppa[0]); 1958 return (-2); 1959 } 1960 return (0); 1961 } 1962 panic("anon_map_getpages: cowfault for szc 0"); 1963 } else { 1964 VM_STAT_ADD(anonvmstats.getpages[4]); 1965 ppa[0] = anon_zero(seg, addr, &ap, cred); 1966 if (ppa[0] == NULL) 1967 return (ENOMEM); 1968 (void) anon_set_ptr(amp->ahp, start_idx, ap, 1969 ANON_SLEEP); 1970 return (0); 1971 } 1972 } 1973 1974 pgcnt = page_get_pagecnt(szc); 1975 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1976 ASSERT(IS_P2ALIGNED(start_idx, pgcnt)); 1977 1978 /* 1979 * First we check for the case that the requtested 
large 1980 * page or larger page already exists in the system. 1981 * Actually we only check if the first constituent page 1982 * exists and only preallocate if it's not found. 1983 */ 1984 ap = anon_get_ptr(amp->ahp, start_idx); 1985 if (ap) { 1986 uint_t pszc; 1987 swap_xlate(ap, &vp, &off); 1988 if (page_exists_forreal(vp, (u_offset_t)off, &pszc)) { 1989 if (pszc > szc && upsize) { 1990 *ppa_szc = MIN(pszc, seg->s_szc); 1991 return (-2); 1992 } 1993 if (pszc >= szc) { 1994 prealloc = 0; 1995 } 1996 } 1997 } 1998 1999 VM_STAT_COND_ADD(prealloc == 0, anonvmstats.getpages[5]); 2000 VM_STAT_COND_ADD(prealloc != 0, anonvmstats.getpages[6]); 2001 2002 top: 2003 /* 2004 * If a smaller page or no page at all was found, 2005 * grab a large page off the freelist. 2006 */ 2007 if (prealloc) { 2008 ASSERT(conpp == NULL); 2009 if (page_alloc_pages(anon_vp, seg, addr, NULL, ppa, 2010 szc, 0, pgflags) != 0) { 2011 VM_STAT_ADD(anonvmstats.getpages[7]); 2012 if (brkcow == 0 || szc < seg->s_szc || 2013 !anon_szcshare(amp->ahp, start_idx)) { 2014 /* 2015 * If the refcnt's of all anon slots are <= 1 2016 * they can't increase since we are holding 2017 * the address space's lock. So segvn can 2018 * safely decrease szc without risking to 2019 * generate a cow fault for the region smaller 2020 * than the segment's largest page size. 2021 */ 2022 VM_STAT_ADD(anonvmstats.getpages[8]); 2023 return (-1); 2024 } 2025 docow: 2026 /* 2027 * This is a cow fault. Copy away the entire 1 large 2028 * page region of this segment. 
2029 */ 2030 if (szc != seg->s_szc) 2031 panic("anon_map_getpages: cowfault for szc %d", 2032 szc); 2033 vaddr = addr; 2034 for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt; 2035 pg_idx++, an_idx++, vaddr += PAGESIZE) { 2036 if ((ap = anon_get_ptr(amp->ahp, an_idx)) != 2037 NULL) { 2038 err = anon_getpage(&ap, &vpprot, pl, 2039 PAGESIZE, seg, vaddr, rw, cred); 2040 if (err) { 2041 for (i = 0; i < pg_idx; i++) { 2042 if ((pp = ppa[i]) != 2043 NULL) 2044 page_unlock(pp); 2045 } 2046 return (err); 2047 } 2048 ppa[pg_idx] = pl[0]; 2049 } else { 2050 /* 2051 * Since this is a cowfault we know 2052 * that this address space has a 2053 * parent or children which means 2054 * anon_dup_fill_holes() has initialized 2055 * all anon slots within a large page 2056 * region that had at least one anon 2057 * slot at the time of fork(). 2058 */ 2059 panic("anon_map_getpages: " 2060 "cowfault but anon slot is empty"); 2061 } 2062 } 2063 VM_STAT_ADD(anonvmstats.getpages[9]); 2064 *protp = PROT_ALL; 2065 return (anon_map_privatepages(amp, start_idx, szc, seg, 2066 addr, prot, ppa, vpage, anypgsz, pgflags, cred)); 2067 } 2068 } 2069 2070 VM_STAT_ADD(anonvmstats.getpages[10]); 2071 2072 an_idx = start_idx; 2073 pg_idx = 0; 2074 vaddr = addr; 2075 while (pg_idx < pgcnt) { 2076 slotcreate = 0; 2077 if ((ap = anon_get_ptr(amp->ahp, an_idx)) == NULL) { 2078 VM_STAT_ADD(anonvmstats.getpages[11]); 2079 /* 2080 * For us to have decided not to preallocate 2081 * would have meant that a large page 2082 * was found. Which also means that all of the 2083 * anon slots for that page would have been 2084 * already created for us. 2085 */ 2086 if (prealloc == 0) 2087 panic("anon_map_getpages: prealloc = 0"); 2088 2089 slotcreate = 1; 2090 ap = anon_alloc(NULL, 0); 2091 } 2092 swap_xlate(ap, &vp, &off); 2093 2094 /* 2095 * Now setup our preallocated page to pass down 2096 * to swap_getpage(). 
2097 */ 2098 if (prealloc) { 2099 ASSERT(ppa[pg_idx]->p_szc == szc); 2100 conpp = ppa[pg_idx]; 2101 } 2102 ASSERT(prealloc || conpp == NULL); 2103 2104 /* 2105 * If we just created this anon slot then call 2106 * with S_CREATE to prevent doing IO on the page. 2107 * Similar to the anon_zero case. 2108 */ 2109 err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE, 2110 NULL, pl, PAGESIZE, conpp, ppa_szc, &nreloc, seg, vaddr, 2111 slotcreate == 1 ? S_CREATE : rw, cred); 2112 2113 if (err) { 2114 ASSERT(err != -2 || upsize); 2115 VM_STAT_ADD(anonvmstats.getpages[12]); 2116 ASSERT(slotcreate == 0); 2117 goto io_err; 2118 } 2119 2120 pp = pl[0]; 2121 2122 if (pp->p_szc < szc || (pp->p_szc > szc && upsize)) { 2123 VM_STAT_ADD(anonvmstats.getpages[13]); 2124 ASSERT(slotcreate == 0); 2125 ASSERT(prealloc == 0); 2126 ASSERT(pg_idx == 0); 2127 if (pp->p_szc > szc) { 2128 ASSERT(upsize); 2129 *ppa_szc = MIN(pp->p_szc, seg->s_szc); 2130 page_unlock(pp); 2131 VM_STAT_ADD(anonvmstats.getpages[14]); 2132 return (-2); 2133 } 2134 page_unlock(pp); 2135 prealloc = 1; 2136 goto top; 2137 } 2138 2139 /* 2140 * If we decided to preallocate but VOP_GETPAGE 2141 * found a page in the system that satisfies our 2142 * request then free up our preallocated large page 2143 * and continue looping accross the existing large 2144 * page via VOP_GETPAGE. 2145 */ 2146 if (prealloc && pp != ppa[pg_idx]) { 2147 VM_STAT_ADD(anonvmstats.getpages[15]); 2148 ASSERT(slotcreate == 0); 2149 ASSERT(pg_idx == 0); 2150 conpp = NULL; 2151 prealloc = 0; 2152 page_free_pages(ppa[0]); 2153 } 2154 2155 if (prealloc && nreloc > 1) { 2156 /* 2157 * we have relocated out of a smaller large page. 2158 * skip npgs - 1 iterations and continue which will 2159 * increment by one the loop indices. 
2160 */ 2161 spgcnt_t npgs = nreloc; 2162 2163 VM_STAT_ADD(anonvmstats.getpages[16]); 2164 2165 ASSERT(pp == ppa[pg_idx]); 2166 ASSERT(slotcreate == 0); 2167 ASSERT(pg_idx + npgs <= pgcnt); 2168 if ((*protp & PROT_WRITE) && 2169 anon_share(amp->ahp, an_idx, npgs)) { 2170 *protp &= ~PROT_WRITE; 2171 } 2172 pg_idx += npgs; 2173 an_idx += npgs; 2174 vaddr += PAGESIZE * npgs; 2175 continue; 2176 } 2177 2178 VM_STAT_ADD(anonvmstats.getpages[17]); 2179 2180 /* 2181 * Anon_zero case. 2182 */ 2183 if (slotcreate) { 2184 ASSERT(prealloc); 2185 pagezero(pp, 0, PAGESIZE); 2186 CPU_STATS_ADD_K(vm, zfod, 1); 2187 hat_setrefmod(pp); 2188 } 2189 2190 ASSERT(prealloc == 0 || ppa[pg_idx] == pp); 2191 ASSERT(prealloc != 0 || PAGE_SHARED(pp)); 2192 ASSERT(prealloc == 0 || PAGE_EXCL(pp)); 2193 2194 if (pg_idx > 0 && 2195 ((page_pptonum(pp) != page_pptonum(ppa[pg_idx - 1]) + 1) || 2196 (pp->p_szc != ppa[pg_idx - 1]->p_szc))) { 2197 panic("anon_map_getpages: unexpected page"); 2198 } else if (pg_idx == 0 && (page_pptonum(pp) & (pgcnt - 1))) { 2199 panic("anon_map_getpages: unaligned page"); 2200 } 2201 2202 if (prealloc == 0) { 2203 ppa[pg_idx] = pp; 2204 } 2205 2206 if (ap->an_refcnt > 1) { 2207 VM_STAT_ADD(anonvmstats.getpages[18]); 2208 *protp &= ~PROT_WRITE; 2209 } 2210 2211 /* 2212 * If this is a new anon slot then initialize 2213 * the anon array entry. 2214 */ 2215 if (slotcreate) { 2216 (void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP); 2217 } 2218 pg_idx++; 2219 an_idx++; 2220 vaddr += PAGESIZE; 2221 } 2222 2223 /* 2224 * Since preallocated pages come off the freelist 2225 * they are locked SE_EXCL. Simply downgrade and return. 
2226 */ 2227 if (prealloc) { 2228 VM_STAT_ADD(anonvmstats.getpages[19]); 2229 conpp = NULL; 2230 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { 2231 page_downgrade(ppa[pg_idx]); 2232 } 2233 } 2234 ASSERT(conpp == NULL); 2235 2236 if (brkcow == 0 || (*protp & PROT_WRITE)) { 2237 VM_STAT_ADD(anonvmstats.getpages[20]); 2238 return (0); 2239 } 2240 2241 if (szc < seg->s_szc) 2242 panic("anon_map_getpages: cowfault for szc %d", szc); 2243 2244 VM_STAT_ADD(anonvmstats.getpages[21]); 2245 2246 *protp = PROT_ALL; 2247 return (anon_map_privatepages(amp, start_idx, szc, seg, addr, prot, 2248 ppa, vpage, anypgsz, pgflags, cred)); 2249 io_err: 2250 /* 2251 * We got an IO error somewhere in our large page. 2252 * If we were using a preallocated page then just demote 2253 * all the constituent pages that we've succeeded with sofar 2254 * to PAGESIZE pages and leave them in the system 2255 * unlocked. 2256 */ 2257 2258 ASSERT(err != -2 || ((pg_idx == 0) && upsize)); 2259 2260 VM_STAT_COND_ADD(err > 0, anonvmstats.getpages[22]); 2261 VM_STAT_COND_ADD(err == -1, anonvmstats.getpages[23]); 2262 VM_STAT_COND_ADD(err == -2, anonvmstats.getpages[24]); 2263 2264 if (prealloc) { 2265 conpp = NULL; 2266 if (pg_idx > 0) { 2267 VM_STAT_ADD(anonvmstats.getpages[25]); 2268 for (i = 0; i < pgcnt; i++) { 2269 pp = ppa[i]; 2270 ASSERT(PAGE_EXCL(pp)); 2271 ASSERT(pp->p_szc == szc); 2272 pp->p_szc = 0; 2273 } 2274 for (i = 0; i < pg_idx; i++) { 2275 ASSERT(!hat_page_is_mapped(ppa[i])); 2276 page_unlock(ppa[i]); 2277 } 2278 /* 2279 * Now free up the remaining unused constituent 2280 * pages. 
2281 */ 2282 while (pg_idx < pgcnt) { 2283 ASSERT(!hat_page_is_mapped(ppa[pg_idx])); 2284 page_free(ppa[pg_idx], 0); 2285 pg_idx++; 2286 } 2287 } else { 2288 VM_STAT_ADD(anonvmstats.getpages[26]); 2289 page_free_pages(ppa[0]); 2290 } 2291 } else { 2292 VM_STAT_ADD(anonvmstats.getpages[27]); 2293 ASSERT(err > 0); 2294 for (i = 0; i < pg_idx; i++) 2295 page_unlock(ppa[i]); 2296 } 2297 ASSERT(conpp == NULL); 2298 if (err != -1) 2299 return (err); 2300 /* 2301 * we are here because we failed to relocate. 2302 */ 2303 ASSERT(prealloc); 2304 if (brkcow == 0 || szc < seg->s_szc || 2305 !anon_szcshare(amp->ahp, start_idx)) { 2306 VM_STAT_ADD(anonvmstats.getpages[28]); 2307 return (-1); 2308 } 2309 VM_STAT_ADD(anonvmstats.getpages[29]); 2310 goto docow; 2311 } 2312 2313 2314 /* 2315 * Turn a reference to an object or shared anon page 2316 * into a private page with a copy of the data from the 2317 * original page which is always locked by the caller. 2318 * This routine unloads the translation and unlocks the 2319 * original page, if it isn't being stolen, before returning 2320 * to the caller. 2321 * 2322 * NOTE: The original anon slot is not freed by this routine 2323 * It must be freed by the caller while holding the 2324 * "anon_map" lock to prevent races which can occur if 2325 * a process has multiple lwps in its address space. 
 *
 * oppflags: with STEAL_PAGE, opp itself (which must be held exclusively)
 * is renamed to the new anon identity instead of being copied; with
 * LOCK_PAGE, the cowcnt/lckcnt claim on opp is moved to the new page.
 *
 * On success *app points at the newly allocated anon slot and the new
 * page is returned after page_downgrade().  On failure *app is restored
 * to its original value, the new anon slot is dropped with anon_decref(),
 * opp is unlocked and NULL is returned.
 */
page_t *
anon_private(
	struct anon **app,
	struct seg *seg,
	caddr_t addr,
	uint_t prot,
	page_t *opp,
	int oppflags,
	struct cred *cred)
{
	struct anon *old = *app;
	struct anon *new;
	page_t *pp = NULL;
	struct vnode *vp;
	anoff_t off;
	page_t *anon_pl[1 + 1];
	int err;

	if (oppflags & STEAL_PAGE)
		ASSERT(PAGE_EXCL(opp));
	else
		ASSERT(PAGE_LOCKED(opp));

	CPU_STATS_ADD_K(vm, cow_fault, 1);

	/* Kernel probe */
	TNF_PROBE_1(anon_private, "vm pagefault", /* CSTYLED */,
	    tnf_opaque, address, addr);

	/*
	 * Install the new slot in *app right away; the "out" path below
	 * puts the old value back if anything fails.
	 */
	*app = new = anon_alloc(NULL, 0);
	swap_xlate(new, &vp, &off);

	if (oppflags & STEAL_PAGE) {
		/* No copy needed: give the original page the new identity. */
		page_rename(opp, vp, (u_offset_t)off);
		pp = opp;
		TRACE_5(TR_FAC_VM, TR_ANON_PRIVATE,
		    "anon_private:seg %p addr %x pp %p vp %p off %lx",
		    seg, addr, pp, vp, off);
		hat_setmod(pp);

		/* bug 4026339 */
		page_downgrade(pp);
		return (pp);
	}

	/*
	 * Call the VOP_GETPAGE routine to create the page, thereby
	 * enabling the vnode driver to allocate any filesystem
	 * space (e.g., disk block allocation for UFS).  This also
	 * prevents more than one page from being added to the
	 * vnode at the same time.
	 */
	err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, NULL,
	    anon_pl, PAGESIZE, seg, addr, S_CREATE, cred, NULL);
	if (err)
		goto out;

	pp = anon_pl[0];

	/*
	 * If the original page was locked, we need to move the lock
	 * to the new page by transferring 'cowcnt/lckcnt' of the original
	 * page to 'cowcnt/lckcnt' of the new page.
	 *
	 * See Statement at the beginning of segvn_lockop() and
	 * comments in page_pp_useclaim() regarding the way
	 * cowcnts/lckcnts are handled.
	 *
	 * Also availrmem must be decremented up front for read only mapping
	 * before calling page_pp_useclaim. page_pp_useclaim will bump it back
	 * if availrmem did not need to be decremented after all.
	 */
	if (oppflags & LOCK_PAGE) {
		if ((prot & PROT_WRITE) == 0) {
			mutex_enter(&freemem_lock);
			if (availrmem > pages_pp_maximum) {
				availrmem--;
				pages_useclaim++;
			} else {
				/* Not enough availrmem: fail the request. */
				mutex_exit(&freemem_lock);
				goto out;
			}
			mutex_exit(&freemem_lock);
		}
		page_pp_useclaim(opp, pp, prot & PROT_WRITE);
	}

	/*
	 * Now copy the contents from the original page,
	 * which is locked and loaded in the MMU by
	 * the caller to prevent yet another page fault.
	 */
	/* XXX - should set mod bit in here */
	if (ppcopy(opp, pp) == 0) {
		/*
		 * Before ppcopy could handle UE or other faults, we
		 * would have panicked here, and still have no option
		 * but to do so now.
		 */
		panic("anon_private, ppcopy failed, opp = 0x%p, pp = 0x%p",
		    (void *)opp, (void *)pp);
	}

	hat_setrefmod(pp);		/* mark as modified */

	/*
	 * Unload the old translation.
	 */
	hat_unload(seg->s_as->a_hat, addr, PAGESIZE, HAT_UNLOAD);

	/*
	 * Free unmapped, unmodified original page.
	 * or release the lock on the original page,
	 * otherwise the process will sleep forever in
	 * anon_decref() waiting for the "exclusive" lock
	 * on the page.
	 */
	(void) page_release(opp, 1);

	/*
	 * we are done with page creation so downgrade the new
	 * page's selock to shared, this helps when multiple
	 * as_fault(...SOFTLOCK...) are done to the same
	 * page(aio)
	 */
	page_downgrade(pp);

	/*
	 * NOTE: The original anon slot must be freed by the
	 * caller while holding the "anon_map" lock, if we
	 * copied away from an anonymous page.
	 */
	return (pp);

out:
	/* Failure: undo the slot swap and drop every lock we hold. */
	*app = old;
	if (pp)
		page_unlock(pp);
	anon_decref(new);
	page_unlock(opp);
	return ((page_t *)NULL);
}

/*
 * Copy-on-write for a whole large page: replace the pgcnt anon slots
 * starting at start_idx in amp with newly allocated slots whose pages are
 * copies of the pages passed in ppa[].  pgcnt is the page count for size
 * code szc, which must be non-zero and equal to seg->s_szc.
 *
 * ppa[] supplies the original pages on entry and, on a copying success,
 * holds the new pages on return.  vpage, if not NULL, is indexed from 0
 * for addr and is consulted to move page lock claims to the new pages.
 * anypgsz == -1 disables preallocation of a large page; otherwise anypgsz
 * and pgflags are handed to page_alloc_pages().
 *
 * Returns:
 *	0	the pages were copied, or the first slot has a single
 *		reference and ppa[0] already has size code szc (in which
 *		case nothing is copied and ppa[] is left alone);
 *	-1	the first slot has a single reference but ppa[0] has a
 *		smaller size code; the ppa[] pages have been unlocked;
 *	ENOMEM	availrmem could not be reserved for the locked pages; the
 *		ppa[] pages have been unlocked.
 */
int
anon_map_privatepages(
	struct anon_map *amp,
	ulong_t start_idx,
	uint_t szc,
	struct seg *seg,
	caddr_t addr,
	uint_t prot,
	page_t *ppa[],
	struct vpage vpage[],
	int anypgsz,
	int pgflags,
	struct cred *cred)
{
	pgcnt_t pgcnt;
	struct vnode *vp;
	anoff_t off;
	page_t *pl[2], *conpp = NULL;
	int err;
	int prealloc = 1;
	struct anon *ap, *oldap;
	caddr_t vaddr;
	page_t *pplist, *pp;
	ulong_t pg_idx, an_idx;
	spgcnt_t nreloc = 0;
	int pagelock = 0;
	kmutex_t *ahmpages = NULL;
#ifdef DEBUG
	int refcnt;
#endif

	ASSERT(szc != 0);
	ASSERT(szc == seg->s_szc);

	VM_STAT_ADD(anonvmstats.privatepages[0]);

	pgcnt = page_get_pagecnt(szc);
	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
	ASSERT(IS_P2ALIGNED(start_idx, pgcnt));

	ASSERT(amp != NULL);
	ap = anon_get_ptr(amp->ahp, start_idx);
	ASSERT(ap == NULL || ap->an_refcnt >= 1);

	VM_STAT_COND_ADD(ap == NULL, anonvmstats.privatepages[1]);

	/*
	 * Now try and allocate the large page. If we fail then just
	 * let VOP_GETPAGE give us PAGESIZE pages. Normally we let
	 * the caller make this decision but to avoid added complexity
	 * it's simpler to handle that case here.
	 */
	if (anypgsz == -1) {
		VM_STAT_ADD(anonvmstats.privatepages[2]);
		prealloc = 0;
	} else if (page_alloc_pages(anon_vp, seg, addr, &pplist, NULL, szc,
	    anypgsz, pgflags) != 0) {
		VM_STAT_ADD(anonvmstats.privatepages[3]);
		prealloc = 0;
	}

	/*
	 * make the decrement of all refcnts of all
	 * anon slots of a large page appear atomic by
	 * getting an anonpages_hash_lock for the
	 * first anon slot of a large page.
	 */
	if (ap != NULL) {
		ahmpages = APH_MUTEX(ap->an_vp, ap->an_off);
		mutex_enter(ahmpages);
		if (ap->an_refcnt == 1) {
			/*
			 * The slot is no longer shared, so there is nothing
			 * to copy; give back the preallocated page.
			 */
			VM_STAT_ADD(anonvmstats.privatepages[4]);
			ASSERT(!anon_share(amp->ahp, start_idx, pgcnt));
			mutex_exit(ahmpages);

			if (prealloc) {
				page_free_replacement_page(pplist);
				page_create_putback(pgcnt);
			}
			ASSERT(ppa[0]->p_szc <= szc);
			if (ppa[0]->p_szc == szc) {
				VM_STAT_ADD(anonvmstats.privatepages[5]);
				return (0);
			}
			for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
				ASSERT(ppa[pg_idx] != NULL);
				page_unlock(ppa[pg_idx]);
			}
			return (-1);
		}
	}

	/*
	 * If we are passed in the vpage array and this is
	 * not PROT_WRITE then we need to decrement availrmem
	 * up front before we try anything. If we need to and
	 * can't decrement availrmem then its better to fail now
	 * than in the middle of processing the new large page.
	 * page_pp_useclaim() on behalf of each constituent page
	 * below will adjust availrmem back for the cases not needed.
	 */
	if (vpage != NULL && (prot & PROT_WRITE) == 0) {
		for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
			if (VPP_ISPPLOCK(&vpage[pg_idx])) {
				pagelock = 1;
				break;
			}
		}
		if (pagelock) {
			VM_STAT_ADD(anonvmstats.privatepages[6]);
			mutex_enter(&freemem_lock);
			if (availrmem >= pages_pp_maximum + pgcnt) {
				availrmem -= pgcnt;
				pages_useclaim += pgcnt;
			} else {
				/* Back out everything acquired so far. */
				VM_STAT_ADD(anonvmstats.privatepages[7]);
				mutex_exit(&freemem_lock);
				if (ahmpages != NULL) {
					mutex_exit(ahmpages);
				}
				if (prealloc) {
					page_free_replacement_page(pplist);
					page_create_putback(pgcnt);
				}
				for (pg_idx = 0; pg_idx < pgcnt; pg_idx++)
					if (ppa[pg_idx] != NULL)
						page_unlock(ppa[pg_idx]);
				return (ENOMEM);
			}
			mutex_exit(&freemem_lock);
		}
	}

	CPU_STATS_ADD_K(vm, cow_fault, pgcnt);

	VM_STAT_ADD(anonvmstats.privatepages[8]);

	/*
	 * Copy one constituent page per iteration.  ahmpages, if taken
	 * above, stays held across the whole loop.
	 */
	an_idx = start_idx;
	pg_idx = 0;
	vaddr = addr;
	for (; pg_idx < pgcnt; pg_idx++, an_idx++, vaddr += PAGESIZE) {
		ASSERT(ppa[pg_idx] != NULL);
		oldap = anon_get_ptr(amp->ahp, an_idx);
		ASSERT(ahmpages != NULL || oldap == NULL);
		ASSERT(ahmpages == NULL || oldap != NULL);
		ASSERT(ahmpages == NULL || oldap->an_refcnt > 1);
		/* DEBUG only: every slot must carry the same refcnt. */
		ASSERT(ahmpages == NULL || pg_idx != 0 ||
		    (refcnt = oldap->an_refcnt));
		ASSERT(ahmpages == NULL || pg_idx == 0 ||
		    refcnt == oldap->an_refcnt);

		ap = anon_alloc(NULL, 0);

		swap_xlate(ap, &vp, &off);

		/*
		 * Now setup our preallocated page to pass down to
		 * swap_getpage().
		 */
		if (prealloc) {
			pp = pplist;
			page_sub(&pplist, pp);
			conpp = pp;
		}

		err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE, NULL, pl,
		    PAGESIZE, conpp, NULL, &nreloc, seg, vaddr,
		    S_CREATE, cred);

		/*
		 * Impossible to fail this is S_CREATE.
		 */
		if (err)
			panic("anon_map_privatepages: VOP_GETPAGE failed");

		ASSERT(prealloc ? pp == pl[0] : pl[0]->p_szc == 0);
		ASSERT(prealloc == 0 || nreloc == 1);

		pp = pl[0];

		/*
		 * If the original page was locked, we need to move
		 * the lock to the new page by transferring
		 * 'cowcnt/lckcnt' of the original page to 'cowcnt/lckcnt'
		 * of the new page. pg_idx can be used to index
		 * into the vpage array since the caller will guarantee
		 * that vpage struct passed in corresponds to addr
		 * and forward.
		 */
		if (vpage != NULL && VPP_ISPPLOCK(&vpage[pg_idx])) {
			page_pp_useclaim(ppa[pg_idx], pp, prot & PROT_WRITE);
		} else if (pagelock) {
			/*
			 * This page is not locked: return the availrmem
			 * that was reserved for it up front.
			 */
			mutex_enter(&freemem_lock);
			availrmem++;
			pages_useclaim--;
			mutex_exit(&freemem_lock);
		}

		/*
		 * Now copy the contents from the original page.
		 */
		if (ppcopy(ppa[pg_idx], pp) == 0) {
			/*
			 * Before ppcopy could handle UE or other faults, we
			 * would have panicked here, and still have no option
			 * but to do so now.
			 */
			panic("anon_map_privatepages, ppcopy failed");
		}

		hat_setrefmod(pp);		/* mark as modified */

		/*
		 * Release the lock on the original page,
		 * decrement the old slot, and down grade the lock
		 * on the new copy.
		 */
		page_unlock(ppa[pg_idx]);

		/*
		 * Preallocated pages are downgraded together after the
		 * loop instead.
		 */
		if (!prealloc)
			page_downgrade(pp);

		ppa[pg_idx] = pp;

		/*
		 * Now reflect the copy in the new anon array.
		 */
		ASSERT(ahmpages == NULL || oldap->an_refcnt > 1);
		if (oldap != NULL)
			anon_decref(oldap);
		(void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP);
	}

	/*
	 * Unload the old large page translation.
	 */
	hat_unload(seg->s_as->a_hat, addr, pgcnt << PAGESHIFT, HAT_UNLOAD);

	if (ahmpages != NULL) {
		mutex_exit(ahmpages);
	}
	ASSERT(prealloc == 0 || pplist == NULL);
	if (prealloc) {
		VM_STAT_ADD(anonvmstats.privatepages[9]);
		for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
			page_downgrade(ppa[pg_idx]);
		}
	}

	return (0);
}

/*
 * Allocate a private zero-filled anon page.
 *
 * On success *app is set to the newly allocated anon slot and the zeroed
 * page is returned after page_downgrade().  If the page cannot be
 * created, *app is set to NULL, the slot is released and NULL is
 * returned.
 */
page_t *
anon_zero(struct seg *seg, caddr_t addr, struct anon **app, struct cred *cred)
{
	struct anon *ap;
	page_t *pp;
	struct vnode *vp;
	anoff_t off;
	page_t *anon_pl[1 + 1];
	int err;

	/* Kernel probe */
	TNF_PROBE_1(anon_zero, "vm pagefault", /* CSTYLED */,
	    tnf_opaque, address, addr);

	*app = ap = anon_alloc(NULL, 0);
	swap_xlate(ap, &vp, &off);

	/*
	 * Call the VOP_GETPAGE routine to create the page, thereby
	 * enabling the vnode driver to allocate any filesystem
	 * dependent structures (e.g., disk block allocation for UFS).
	 * This also prevents more than one page from being added to
	 * the vnode at the same time since it is locked.
	 */
	err = VOP_GETPAGE(vp, off, PAGESIZE, NULL,
	    anon_pl, PAGESIZE, seg, addr, S_CREATE, cred, NULL);
	if (err) {
		*app = NULL;
		anon_decref(ap);
		return (NULL);
	}
	pp = anon_pl[0];

	pagezero(pp, 0, PAGESIZE);	/* XXX - should set mod bit */
	page_downgrade(pp);
	CPU_STATS_ADD_K(vm, zfod, 1);
	hat_setrefmod(pp);	/* mark as modified so pageout writes back */
	return (pp);
}


/*
 * Allocate array of private zero-filled anon pages for empty slots
 * and kept pages for non empty slots within given range.
 *
 * NOTE: This routine will try and use large pages
 *	if available and supported by underlying platform.
2772 */ 2773 int 2774 anon_map_createpages( 2775 struct anon_map *amp, 2776 ulong_t start_index, 2777 size_t len, 2778 page_t *ppa[], 2779 struct seg *seg, 2780 caddr_t addr, 2781 enum seg_rw rw, 2782 struct cred *cred) 2783 { 2784 2785 struct anon *ap; 2786 struct vnode *ap_vp; 2787 page_t *pp, *pplist, *anon_pl[1 + 1], *conpp = NULL; 2788 int err = 0; 2789 ulong_t p_index, index; 2790 pgcnt_t npgs, pg_cnt; 2791 spgcnt_t nreloc = 0; 2792 uint_t l_szc, szc, prot; 2793 anoff_t ap_off; 2794 size_t pgsz; 2795 lgrp_t *lgrp; 2796 kmutex_t *ahm; 2797 2798 /* 2799 * XXX For now only handle S_CREATE. 2800 */ 2801 ASSERT(rw == S_CREATE); 2802 2803 index = start_index; 2804 p_index = 0; 2805 npgs = btopr(len); 2806 2807 /* 2808 * If this platform supports multiple page sizes 2809 * then try and allocate directly from the free 2810 * list for pages larger than PAGESIZE. 2811 * 2812 * NOTE:When we have page_create_ru we can stop 2813 * directly allocating from the freelist. 2814 */ 2815 l_szc = seg->s_szc; 2816 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 2817 while (npgs) { 2818 2819 /* 2820 * if anon slot already exists 2821 * (means page has been created) 2822 * so 1) look up the page 2823 * 2) if the page is still in memory, get it. 2824 * 3) if not, create a page and 2825 * page in from physical swap device. 2826 * These are done in anon_getpage(). 2827 */ 2828 ap = anon_get_ptr(amp->ahp, index); 2829 if (ap) { 2830 err = anon_getpage(&ap, &prot, anon_pl, PAGESIZE, 2831 seg, addr, S_READ, cred); 2832 if (err) { 2833 ANON_LOCK_EXIT(&->a_rwlock); 2834 panic("anon_map_createpages: anon_getpage"); 2835 } 2836 pp = anon_pl[0]; 2837 ppa[p_index++] = pp; 2838 2839 /* 2840 * an_pvp can become non-NULL after SysV's page was 2841 * paged out before ISM was attached to this SysV 2842 * shared memory segment. So free swap slot if needed. 
2843 */ 2844 if (ap->an_pvp != NULL) { 2845 page_io_lock(pp); 2846 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 2847 mutex_enter(ahm); 2848 if (ap->an_pvp != NULL) { 2849 swap_phys_free(ap->an_pvp, 2850 ap->an_poff, PAGESIZE); 2851 ap->an_pvp = NULL; 2852 ap->an_poff = 0; 2853 mutex_exit(ahm); 2854 hat_setmod(pp); 2855 } else { 2856 mutex_exit(ahm); 2857 } 2858 page_io_unlock(pp); 2859 } 2860 2861 addr += PAGESIZE; 2862 index++; 2863 npgs--; 2864 continue; 2865 } 2866 /* 2867 * Now try and allocate the largest page possible 2868 * for the current address and range. 2869 * Keep dropping down in page size until: 2870 * 2871 * 1) Properly aligned 2872 * 2) Does not overlap existing anon pages 2873 * 3) Fits in remaining range. 2874 * 4) able to allocate one. 2875 * 2876 * NOTE: XXX When page_create_ru is completed this code 2877 * will change. 2878 */ 2879 szc = l_szc; 2880 pplist = NULL; 2881 pg_cnt = 0; 2882 while (szc) { 2883 pgsz = page_get_pagesize(szc); 2884 pg_cnt = pgsz >> PAGESHIFT; 2885 if (IS_P2ALIGNED(addr, pgsz) && pg_cnt <= npgs && 2886 anon_pages(amp->ahp, index, pg_cnt) == 0) { 2887 /* 2888 * XXX 2889 * Since we are faking page_create() 2890 * we also need to do the freemem and 2891 * pcf accounting. 2892 */ 2893 (void) page_create_wait(pg_cnt, PG_WAIT); 2894 2895 /* 2896 * Get lgroup to allocate next page of shared 2897 * memory from and use it to specify where to 2898 * allocate the physical memory 2899 */ 2900 lgrp = lgrp_mem_choose(seg, addr, pgsz); 2901 2902 pplist = page_get_freelist( 2903 anon_vp, (u_offset_t)0, seg, 2904 addr, pgsz, 0, lgrp); 2905 2906 if (pplist == NULL) { 2907 page_create_putback(pg_cnt); 2908 } 2909 2910 /* 2911 * If a request for a page of size 2912 * larger than PAGESIZE failed 2913 * then don't try that size anymore. 
2914 */ 2915 if (pplist == NULL) { 2916 l_szc = szc - 1; 2917 } else { 2918 break; 2919 } 2920 } 2921 szc--; 2922 } 2923 2924 /* 2925 * If just using PAGESIZE pages then don't 2926 * directly allocate from the free list. 2927 */ 2928 if (pplist == NULL) { 2929 ASSERT(szc == 0); 2930 pp = anon_zero(seg, addr, &ap, cred); 2931 if (pp == NULL) { 2932 ANON_LOCK_EXIT(&->a_rwlock); 2933 panic("anon_map_createpages: anon_zero"); 2934 } 2935 ppa[p_index++] = pp; 2936 2937 ASSERT(anon_get_ptr(amp->ahp, index) == NULL); 2938 (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP); 2939 2940 addr += PAGESIZE; 2941 index++; 2942 npgs--; 2943 continue; 2944 } 2945 2946 /* 2947 * pplist is a list of pg_cnt PAGESIZE pages. 2948 * These pages are locked SE_EXCL since they 2949 * came directly off the free list. 2950 */ 2951 ASSERT(IS_P2ALIGNED(pg_cnt, pg_cnt)); 2952 ASSERT(IS_P2ALIGNED(index, pg_cnt)); 2953 ASSERT(conpp == NULL); 2954 while (pg_cnt--) { 2955 2956 ap = anon_alloc(NULL, 0); 2957 swap_xlate(ap, &ap_vp, &ap_off); 2958 2959 ASSERT(pplist != NULL); 2960 pp = pplist; 2961 page_sub(&pplist, pp); 2962 PP_CLRFREE(pp); 2963 PP_CLRAGED(pp); 2964 conpp = pp; 2965 2966 err = swap_getconpage(ap_vp, ap_off, PAGESIZE, 2967 (uint_t *)NULL, anon_pl, PAGESIZE, conpp, NULL, 2968 &nreloc, seg, addr, S_CREATE, cred); 2969 2970 if (err) { 2971 ANON_LOCK_EXIT(&->a_rwlock); 2972 panic("anon_map_createpages: S_CREATE"); 2973 } 2974 2975 ASSERT(anon_pl[0] == pp); 2976 ASSERT(nreloc == 1); 2977 pagezero(pp, 0, PAGESIZE); 2978 CPU_STATS_ADD_K(vm, zfod, 1); 2979 hat_setrefmod(pp); 2980 2981 ASSERT(anon_get_ptr(amp->ahp, index) == NULL); 2982 (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP); 2983 2984 ppa[p_index++] = pp; 2985 2986 addr += PAGESIZE; 2987 index++; 2988 npgs--; 2989 } 2990 conpp = NULL; 2991 pg_cnt = pgsz >> PAGESHIFT; 2992 p_index = p_index - pg_cnt; 2993 while (pg_cnt--) { 2994 page_downgrade(ppa[p_index++]); 2995 } 2996 } 2997 ANON_LOCK_EXIT(&->a_rwlock); 2998 return (0); 2999 
} 3000 3001 static int 3002 anon_try_demote_pages( 3003 struct anon_hdr *ahp, 3004 ulong_t sidx, 3005 uint_t szc, 3006 page_t **ppa, 3007 int private) 3008 { 3009 struct anon *ap; 3010 pgcnt_t pgcnt = page_get_pagecnt(szc); 3011 page_t *pp; 3012 pgcnt_t i; 3013 kmutex_t *ahmpages = NULL; 3014 int root = 0; 3015 pgcnt_t npgs; 3016 pgcnt_t curnpgs = 0; 3017 size_t ppasize = 0; 3018 3019 ASSERT(szc != 0); 3020 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 3021 ASSERT(IS_P2ALIGNED(sidx, pgcnt)); 3022 ASSERT(sidx < ahp->size); 3023 3024 if (ppa == NULL) { 3025 ppasize = pgcnt * sizeof (page_t *); 3026 ppa = kmem_alloc(ppasize, KM_SLEEP); 3027 } 3028 3029 ap = anon_get_ptr(ahp, sidx); 3030 if (ap != NULL && private) { 3031 VM_STAT_ADD(anonvmstats.demotepages[1]); 3032 ahmpages = APH_MUTEX(ap->an_vp, ap->an_off); 3033 mutex_enter(ahmpages); 3034 } 3035 3036 if (ap != NULL && ap->an_refcnt > 1) { 3037 if (ahmpages != NULL) { 3038 VM_STAT_ADD(anonvmstats.demotepages[2]); 3039 mutex_exit(ahmpages); 3040 } 3041 if (ppasize != 0) { 3042 kmem_free(ppa, ppasize); 3043 } 3044 return (0); 3045 } 3046 if (ahmpages != NULL) { 3047 mutex_exit(ahmpages); 3048 } 3049 if (ahp->size - sidx < pgcnt) { 3050 ASSERT(private == 0); 3051 pgcnt = ahp->size - sidx; 3052 } 3053 for (i = 0; i < pgcnt; i++, sidx++) { 3054 ap = anon_get_ptr(ahp, sidx); 3055 if (ap != NULL) { 3056 if (ap->an_refcnt != 1) { 3057 panic("anon_try_demote_pages: an_refcnt != 1"); 3058 } 3059 pp = ppa[i] = page_lookup(ap->an_vp, ap->an_off, 3060 SE_EXCL); 3061 if (pp != NULL) { 3062 (void) hat_pageunload(pp, 3063 HAT_FORCE_PGUNLOAD); 3064 } 3065 } else { 3066 ppa[i] = NULL; 3067 } 3068 } 3069 for (i = 0; i < pgcnt; i++) { 3070 if ((pp = ppa[i]) != NULL && pp->p_szc != 0) { 3071 ASSERT(pp->p_szc <= szc); 3072 if (!root) { 3073 VM_STAT_ADD(anonvmstats.demotepages[3]); 3074 if (curnpgs != 0) 3075 panic("anon_try_demote_pages: " 3076 "bad large page"); 3077 3078 root = 1; 3079 curnpgs = npgs = 3080 page_get_pagecnt(pp->p_szc); 3081 3082 
ASSERT(npgs <= pgcnt); 3083 ASSERT(IS_P2ALIGNED(npgs, npgs)); 3084 ASSERT(!(page_pptonum(pp) & (npgs - 1))); 3085 } else { 3086 ASSERT(i > 0); 3087 ASSERT(page_pptonum(pp) - 1 == 3088 page_pptonum(ppa[i - 1])); 3089 if ((page_pptonum(pp) & (npgs - 1)) == 3090 npgs - 1) 3091 root = 0; 3092 } 3093 ASSERT(PAGE_EXCL(pp)); 3094 pp->p_szc = 0; 3095 ASSERT(curnpgs > 0); 3096 curnpgs--; 3097 } 3098 } 3099 if (root != 0 || curnpgs != 0) 3100 panic("anon_try_demote_pages: bad large page"); 3101 3102 for (i = 0; i < pgcnt; i++) { 3103 if ((pp = ppa[i]) != NULL) { 3104 ASSERT(!hat_page_is_mapped(pp)); 3105 ASSERT(pp->p_szc == 0); 3106 page_unlock(pp); 3107 } 3108 } 3109 if (ppasize != 0) { 3110 kmem_free(ppa, ppasize); 3111 } 3112 return (1); 3113 } 3114 3115 /* 3116 * anon_map_demotepages() can only be called by MAP_PRIVATE segments. 3117 */ 3118 int 3119 anon_map_demotepages( 3120 struct anon_map *amp, 3121 ulong_t start_idx, 3122 struct seg *seg, 3123 caddr_t addr, 3124 uint_t prot, 3125 struct vpage vpage[], 3126 struct cred *cred) 3127 { 3128 struct anon *ap; 3129 uint_t szc = seg->s_szc; 3130 pgcnt_t pgcnt = page_get_pagecnt(szc); 3131 size_t ppasize = pgcnt * sizeof (page_t *); 3132 page_t **ppa = kmem_alloc(ppasize, KM_SLEEP); 3133 page_t *pp; 3134 page_t *pl[2]; 3135 pgcnt_t i, pg_idx; 3136 ulong_t an_idx; 3137 caddr_t vaddr; 3138 int err; 3139 int retry = 0; 3140 uint_t vpprot; 3141 3142 ASSERT(RW_WRITE_HELD(&->a_rwlock)); 3143 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 3144 ASSERT(IS_P2ALIGNED(start_idx, pgcnt)); 3145 ASSERT(ppa != NULL); 3146 ASSERT(szc != 0); 3147 ASSERT(szc == amp->a_szc); 3148 3149 VM_STAT_ADD(anonvmstats.demotepages[0]); 3150 3151 top: 3152 if (anon_try_demote_pages(amp->ahp, start_idx, szc, ppa, 1)) { 3153 kmem_free(ppa, ppasize); 3154 return (0); 3155 } 3156 3157 VM_STAT_ADD(anonvmstats.demotepages[4]); 3158 3159 ASSERT(retry == 0); /* we can be here only once */ 3160 3161 vaddr = addr; 3162 for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt; 3163 
pg_idx++, an_idx++, vaddr += PAGESIZE) { 3164 ap = anon_get_ptr(amp->ahp, an_idx); 3165 if (ap == NULL) 3166 panic("anon_map_demotepages: no anon slot"); 3167 err = anon_getpage(&ap, &vpprot, pl, PAGESIZE, seg, vaddr, 3168 S_READ, cred); 3169 if (err) { 3170 for (i = 0; i < pg_idx; i++) { 3171 if ((pp = ppa[i]) != NULL) 3172 page_unlock(pp); 3173 } 3174 kmem_free(ppa, ppasize); 3175 return (err); 3176 } 3177 ppa[pg_idx] = pl[0]; 3178 } 3179 3180 err = anon_map_privatepages(amp, start_idx, szc, seg, addr, prot, ppa, 3181 vpage, -1, 0, cred); 3182 if (err > 0) { 3183 VM_STAT_ADD(anonvmstats.demotepages[5]); 3184 kmem_free(ppa, ppasize); 3185 return (err); 3186 } 3187 ASSERT(err == 0 || err == -1); 3188 if (err == -1) { 3189 VM_STAT_ADD(anonvmstats.demotepages[6]); 3190 retry = 1; 3191 goto top; 3192 } 3193 for (i = 0; i < pgcnt; i++) { 3194 ASSERT(ppa[i] != NULL); 3195 if (ppa[i]->p_szc != 0) 3196 retry = 1; 3197 page_unlock(ppa[i]); 3198 } 3199 if (retry) { 3200 VM_STAT_ADD(anonvmstats.demotepages[7]); 3201 goto top; 3202 } 3203 3204 VM_STAT_ADD(anonvmstats.demotepages[8]); 3205 3206 kmem_free(ppa, ppasize); 3207 3208 return (0); 3209 } 3210 3211 /* 3212 * Free pages of shared anon map. It's assumed that anon maps don't share anon 3213 * structures with private anon maps. Therefore all anon structures should 3214 * have at most one reference at this point. This means underlying pages can 3215 * be exclusively locked and demoted or freed. If not freeing the entire 3216 * large pages demote the ends of the region we free to be able to free 3217 * subpages. Page roots correspond to aligned index positions in anon map. 
3218 */ 3219 void 3220 anon_shmap_free_pages(struct anon_map *amp, ulong_t sidx, size_t len) 3221 { 3222 ulong_t eidx = sidx + btopr(len); 3223 pgcnt_t pages = page_get_pagecnt(amp->a_szc); 3224 struct anon_hdr *ahp = amp->ahp; 3225 ulong_t tidx; 3226 size_t size; 3227 ulong_t sidx_aligned; 3228 ulong_t eidx_aligned; 3229 3230 ASSERT(ANON_WRITE_HELD(&->a_rwlock)); 3231 ASSERT(amp->refcnt <= 1); 3232 ASSERT(amp->a_szc > 0); 3233 ASSERT(eidx <= ahp->size); 3234 ASSERT(!anon_share(ahp, sidx, btopr(len))); 3235 3236 if (len == 0) { /* XXX */ 3237 return; 3238 } 3239 3240 sidx_aligned = P2ALIGN(sidx, pages); 3241 if (sidx_aligned != sidx || 3242 (eidx < sidx_aligned + pages && eidx < ahp->size)) { 3243 if (!anon_try_demote_pages(ahp, sidx_aligned, 3244 amp->a_szc, NULL, 0)) { 3245 panic("anon_shmap_free_pages: demote failed"); 3246 } 3247 size = (eidx <= sidx_aligned + pages) ? (eidx - sidx) : 3248 P2NPHASE(sidx, pages); 3249 size <<= PAGESHIFT; 3250 anon_free(ahp, sidx, size); 3251 sidx = sidx_aligned + pages; 3252 if (eidx <= sidx) { 3253 return; 3254 } 3255 } 3256 eidx_aligned = P2ALIGN(eidx, pages); 3257 if (sidx < eidx_aligned) { 3258 anon_free_pages(ahp, sidx, 3259 (eidx_aligned - sidx) << PAGESHIFT, 3260 amp->a_szc); 3261 sidx = eidx_aligned; 3262 } 3263 ASSERT(sidx == eidx_aligned); 3264 if (eidx == eidx_aligned) { 3265 return; 3266 } 3267 tidx = eidx; 3268 if (eidx != ahp->size && anon_get_next_ptr(ahp, &tidx) != NULL && 3269 tidx - sidx < pages) { 3270 if (!anon_try_demote_pages(ahp, sidx, amp->a_szc, NULL, 0)) { 3271 panic("anon_shmap_free_pages: demote failed"); 3272 } 3273 size = (eidx - sidx) << PAGESHIFT; 3274 anon_free(ahp, sidx, size); 3275 } else { 3276 anon_free_pages(ahp, sidx, pages << PAGESHIFT, amp->a_szc); 3277 } 3278 } 3279 3280 /* 3281 * This routine should be called with amp's writer lock when there're no other 3282 * users of amp. All pcache entries of this amp must have been already 3283 * inactivated. 
We must not drop a_rwlock here to prevent new users from 3284 * attaching to this amp. 3285 */ 3286 void 3287 anonmap_purge(struct anon_map *amp) 3288 { 3289 ASSERT(ANON_WRITE_HELD(&->a_rwlock)); 3290 ASSERT(amp->refcnt <= 1); 3291 3292 if (amp->a_softlockcnt != 0) { 3293 seg_ppurge(NULL, amp, 0); 3294 } 3295 3296 /* 3297 * Since all pcache entries were already inactive before this routine 3298 * was called seg_ppurge() couldn't return while there're still 3299 * entries that can be found via the list anchored at a_phead. So we 3300 * can assert this list is empty now. a_softlockcnt may be still non 0 3301 * if asynchronous thread that manages pcache already removed pcache 3302 * entries but hasn't unlocked the pages yet. If a_softlockcnt is non 3303 * 0 we just wait on a_purgecv for shamp_reclaim() to finish. Even if 3304 * a_softlockcnt is 0 we grab a_purgemtx to avoid freeing anon map 3305 * before shamp_reclaim() is done with it. a_purgemtx also taken by 3306 * shamp_reclaim() while a_softlockcnt was still not 0 acts as a 3307 * barrier that prevents anonmap_purge() to complete while 3308 * shamp_reclaim() may still be referencing this amp. 3309 */ 3310 ASSERT(amp->a_phead.p_lnext == &->a_phead); 3311 ASSERT(amp->a_phead.p_lprev == &->a_phead); 3312 3313 mutex_enter(&->a_purgemtx); 3314 while (amp->a_softlockcnt != 0) { 3315 ASSERT(amp->a_phead.p_lnext == &->a_phead); 3316 ASSERT(amp->a_phead.p_lprev == &->a_phead); 3317 amp->a_purgewait = 1; 3318 cv_wait(&->a_purgecv, &->a_purgemtx); 3319 } 3320 mutex_exit(&->a_purgemtx); 3321 3322 ASSERT(amp->a_phead.p_lnext == &->a_phead); 3323 ASSERT(amp->a_phead.p_lprev == &->a_phead); 3324 ASSERT(amp->a_softlockcnt == 0); 3325 } 3326 3327 /* 3328 * Allocate and initialize an anon_map structure for seg 3329 * associating the given swap reservation with the new anon_map. 
3330 */ 3331 struct anon_map * 3332 anonmap_alloc(size_t size, size_t swresv, int flags) 3333 { 3334 struct anon_map *amp; 3335 int kmflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 3336 3337 amp = kmem_cache_alloc(anonmap_cache, kmflags); 3338 if (amp == NULL) { 3339 ASSERT(kmflags == KM_NOSLEEP); 3340 return (NULL); 3341 } 3342 3343 amp->ahp = anon_create(btopr(size), flags); 3344 if (amp->ahp == NULL) { 3345 ASSERT(flags == ANON_NOSLEEP); 3346 kmem_cache_free(anonmap_cache, amp); 3347 return (NULL); 3348 } 3349 amp->refcnt = 1; 3350 amp->size = size; 3351 amp->swresv = swresv; 3352 amp->locality = 0; 3353 amp->a_szc = 0; 3354 amp->a_sp = NULL; 3355 amp->a_softlockcnt = 0; 3356 amp->a_purgewait = 0; 3357 amp->a_phead.p_lnext = &->a_phead; 3358 amp->a_phead.p_lprev = &->a_phead; 3359 3360 return (amp); 3361 } 3362 3363 void 3364 anonmap_free(struct anon_map *amp) 3365 { 3366 ASSERT(amp->ahp != NULL); 3367 ASSERT(amp->refcnt == 0); 3368 ASSERT(amp->a_softlockcnt == 0); 3369 ASSERT(amp->a_phead.p_lnext == &->a_phead); 3370 ASSERT(amp->a_phead.p_lprev == &->a_phead); 3371 3372 lgrp_shm_policy_fini(amp, NULL); 3373 anon_release(amp->ahp, btopr(amp->size)); 3374 kmem_cache_free(anonmap_cache, amp); 3375 } 3376 3377 /* 3378 * Returns true if the app array has some empty slots. 3379 * The offp and lenp parameters are in/out parameters. On entry 3380 * these values represent the starting offset and length of the 3381 * mapping. When true is returned, these values may be modified 3382 * to be the largest range which includes empty slots. 
3383 */ 3384 int 3385 non_anon(struct anon_hdr *ahp, ulong_t anon_idx, u_offset_t *offp, 3386 size_t *lenp) 3387 { 3388 ulong_t i, el; 3389 ssize_t low, high; 3390 struct anon *ap; 3391 3392 low = -1; 3393 for (i = 0, el = *lenp; i < el; i += PAGESIZE, anon_idx++) { 3394 ap = anon_get_ptr(ahp, anon_idx); 3395 if (ap == NULL) { 3396 if (low == -1) 3397 low = i; 3398 high = i; 3399 } 3400 } 3401 if (low != -1) { 3402 /* 3403 * Found at least one non-anon page. 3404 * Set up the off and len return values. 3405 */ 3406 if (low != 0) 3407 *offp += low; 3408 *lenp = high - low + PAGESIZE; 3409 return (1); 3410 } 3411 return (0); 3412 } 3413 3414 /* 3415 * Return a count of the number of existing anon pages in the anon array 3416 * app in the range (off, off+len). The array and slots must be guaranteed 3417 * stable by the caller. 3418 */ 3419 pgcnt_t 3420 anon_pages(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots) 3421 { 3422 pgcnt_t cnt = 0; 3423 3424 while (nslots-- > 0) { 3425 if ((anon_get_ptr(ahp, anon_index)) != NULL) 3426 cnt++; 3427 anon_index++; 3428 } 3429 return (cnt); 3430 } 3431 3432 /* 3433 * Move reserved phys swap into memory swap (unreserve phys swap 3434 * and reserve mem swap by the same amount). 
3435 * Used by segspt when it needs to lock reserved swap npages in memory 3436 */ 3437 int 3438 anon_swap_adjust(pgcnt_t npages) 3439 { 3440 pgcnt_t unlocked_mem_swap; 3441 3442 mutex_enter(&anoninfo_lock); 3443 3444 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 3445 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 3446 3447 unlocked_mem_swap = k_anoninfo.ani_mem_resv 3448 - k_anoninfo.ani_locked_swap; 3449 if (npages > unlocked_mem_swap) { 3450 spgcnt_t adjusted_swap = npages - unlocked_mem_swap; 3451 3452 /* 3453 * if there is not enough unlocked mem swap we take missing 3454 * amount from phys swap and give it to mem swap 3455 */ 3456 if (!page_reclaim_mem(adjusted_swap, segspt_minfree, 1)) { 3457 mutex_exit(&anoninfo_lock); 3458 return (ENOMEM); 3459 } 3460 3461 k_anoninfo.ani_mem_resv += adjusted_swap; 3462 ASSERT(k_anoninfo.ani_phys_resv >= adjusted_swap); 3463 k_anoninfo.ani_phys_resv -= adjusted_swap; 3464 3465 ANI_ADD(adjusted_swap); 3466 } 3467 k_anoninfo.ani_locked_swap += npages; 3468 3469 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 3470 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 3471 3472 mutex_exit(&anoninfo_lock); 3473 3474 return (0); 3475 } 3476 3477 /* 3478 * 'unlocked' reserved mem swap so when it is unreserved it 3479 * can be moved back phys (disk) swap 3480 */ 3481 void 3482 anon_swap_restore(pgcnt_t npages) 3483 { 3484 mutex_enter(&anoninfo_lock); 3485 3486 ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv); 3487 3488 ASSERT(k_anoninfo.ani_locked_swap >= npages); 3489 k_anoninfo.ani_locked_swap -= npages; 3490 3491 ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv); 3492 3493 mutex_exit(&anoninfo_lock); 3494 } 3495 3496 /* 3497 * Return the pointer from the list for a 3498 * specified anon index. 
3499 */ 3500 ulong_t * 3501 anon_get_slot(struct anon_hdr *ahp, ulong_t an_idx) 3502 { 3503 struct anon **app; 3504 void **ppp; 3505 3506 ASSERT(an_idx < ahp->size); 3507 3508 /* 3509 * Single level case. 3510 */ 3511 if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 3512 return ((ulong_t *)&ahp->array_chunk[an_idx]); 3513 } else { 3514 3515 /* 3516 * 2 level case. 3517 */ 3518 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 3519 if (*ppp == NULL) { 3520 mutex_enter(&ahp->serial_lock); 3521 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 3522 if (*ppp == NULL) 3523 *ppp = kmem_zalloc(PAGESIZE, KM_SLEEP); 3524 mutex_exit(&ahp->serial_lock); 3525 } 3526 app = *ppp; 3527 return ((ulong_t *)&app[an_idx & ANON_CHUNK_OFF]); 3528 } 3529 } 3530 3531 void 3532 anon_array_enter(struct anon_map *amp, ulong_t an_idx, anon_sync_obj_t *sobj) 3533 { 3534 ulong_t *ap_slot; 3535 kmutex_t *mtx; 3536 kcondvar_t *cv; 3537 int hash; 3538 3539 /* 3540 * Use szc to determine anon slot(s) to appear atomic. 3541 * If szc = 0, then lock the anon slot and mark it busy. 3542 * If szc > 0, then lock the range of slots by getting the 3543 * anon_array_lock for the first anon slot, and mark only the 3544 * first anon slot busy to represent whole range being busy. 3545 */ 3546 3547 ASSERT(RW_READ_HELD(&->a_rwlock)); 3548 an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc)); 3549 hash = ANON_ARRAY_HASH(amp, an_idx); 3550 sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex; 3551 sobj->sync_cv = cv = &anon_array_cv[hash]; 3552 mutex_enter(mtx); 3553 ap_slot = anon_get_slot(amp->ahp, an_idx); 3554 while (ANON_ISBUSY(ap_slot)) 3555 cv_wait(cv, mtx); 3556 ANON_SETBUSY(ap_slot); 3557 sobj->sync_data = ap_slot; 3558 mutex_exit(mtx); 3559 } 3560 3561 int 3562 anon_array_try_enter(struct anon_map *amp, ulong_t an_idx, 3563 anon_sync_obj_t *sobj) 3564 { 3565 ulong_t *ap_slot; 3566 kmutex_t *mtx; 3567 int hash; 3568 3569 /* 3570 * Try to lock a range of anon slots. 
3571 * Use szc to determine anon slot(s) to appear atomic. 3572 * If szc = 0, then lock the anon slot and mark it busy. 3573 * If szc > 0, then lock the range of slots by getting the 3574 * anon_array_lock for the first anon slot, and mark only the 3575 * first anon slot busy to represent whole range being busy. 3576 * Fail if the mutex or the anon_array are busy. 3577 */ 3578 3579 ASSERT(RW_READ_HELD(&->a_rwlock)); 3580 an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc)); 3581 hash = ANON_ARRAY_HASH(amp, an_idx); 3582 sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex; 3583 sobj->sync_cv = &anon_array_cv[hash]; 3584 if (!mutex_tryenter(mtx)) { 3585 return (EWOULDBLOCK); 3586 } 3587 ap_slot = anon_get_slot(amp->ahp, an_idx); 3588 if (ANON_ISBUSY(ap_slot)) { 3589 mutex_exit(mtx); 3590 return (EWOULDBLOCK); 3591 } 3592 ANON_SETBUSY(ap_slot); 3593 sobj->sync_data = ap_slot; 3594 mutex_exit(mtx); 3595 return (0); 3596 } 3597 3598 void 3599 anon_array_exit(anon_sync_obj_t *sobj) 3600 { 3601 mutex_enter(sobj->sync_mutex); 3602 ASSERT(ANON_ISBUSY(sobj->sync_data)); 3603 ANON_CLRBUSY(sobj->sync_data); 3604 if (CV_HAS_WAITERS(sobj->sync_cv)) 3605 cv_broadcast(sobj->sync_cv); 3606 mutex_exit(sobj->sync_mutex); 3607 } 3608