/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

/*
 * VM - anonymous pages.
 *
 * This layer sits immediately above the vm_swap layer.  It manages
 * physical pages that have no permanent identity in the file system
 * name space, using the services of the vm_swap layer to allocate
 * backing storage for these pages.  Since these pages have no external
 * identity, they are discarded when the last reference is removed.
 *
 * An important function of this layer is to manage low-level sharing
 * of pages that are logically distinct but that happen to be
 * physically identical (e.g., the corresponding pages of the processes
 * resulting from a fork before one process or the other changes their
 * contents).  This pseudo-sharing is present only as an optimization
 * and is not to be confused with true sharing in which multiple
 * address spaces deliberately contain references to the same object;
 * such sharing is managed at a higher level.
 *
 * The key data structure here is the anon struct, which contains a
 * reference count for its associated physical page and a hint about
 * the identity of that page.  Anon structs typically live in arrays,
 * with an instance's position in its array determining where the
 * corresponding backing storage is allocated; however, the swap_xlate()
 * routine abstracts away this representation information so that the
 * rest of the anon layer need not know it.  (See the swap layer for
 * more details on anon struct layout.)
 *
 * In future versions of the system, the association between an
 * anon struct and its position on backing store will change so that
 * we don't require backing store for all anonymous pages in the system.
 * This is an important consideration for large memory systems.
 * We can also use this technique to delay binding physical locations
 * to anonymous pages until pageout/swapout time, where we can make
 * smarter allocation decisions to improve anonymous klustering.
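 *
 * As a concrete illustration of the indirection described above (a
 * hedged sketch, not code taken from this file): given an anon slot
 * pointer 'ap', the (vnode, offset) name of its backing store is
 * recovered and the physical page located with
 *
 *	struct vnode *vp;
 *	anoff_t off;
 *	page_t *pp;
 *
 *	swap_xlate(ap, &vp, &off);
 *	pp = page_lookup(vp, (u_offset_t)off, SE_EXCL);
 *
 * which is how the routines below find the page for a slot without
 * depending on how that name is actually encoded.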
72 * 73 * Many of the routines defined here take a (struct anon **) argument, 74 * which allows the code at this level to manage anon pages directly, 75 * so that callers can regard anon structs as opaque objects and not be 76 * concerned with assigning or inspecting their contents. 77 * 78 * Clients of this layer refer to anon pages indirectly. That is, they 79 * maintain arrays of pointers to anon structs rather than maintaining 80 * anon structs themselves. The (struct anon **) arguments mentioned 81 * above are pointers to entries in these arrays. It is these arrays 82 * that capture the mapping between offsets within a given segment and 83 * the corresponding anonymous backing storage address. 84 */ 85 86 #ifdef DEBUG 87 #define ANON_DEBUG 88 #endif 89 90 #include <sys/types.h> 91 #include <sys/t_lock.h> 92 #include <sys/param.h> 93 #include <sys/systm.h> 94 #include <sys/mman.h> 95 #include <sys/cred.h> 96 #include <sys/thread.h> 97 #include <sys/vnode.h> 98 #include <sys/cpuvar.h> 99 #include <sys/swap.h> 100 #include <sys/cmn_err.h> 101 #include <sys/vtrace.h> 102 #include <sys/kmem.h> 103 #include <sys/sysmacros.h> 104 #include <sys/bitmap.h> 105 #include <sys/vmsystm.h> 106 #include <sys/tuneable.h> 107 #include <sys/debug.h> 108 #include <sys/fs/swapnode.h> 109 #include <sys/tnf_probe.h> 110 #include <sys/lgrp.h> 111 #include <sys/policy.h> 112 #include <sys/condvar_impl.h> 113 #include <sys/mutex_impl.h> 114 #include <sys/rctl.h> 115 116 #include <vm/as.h> 117 #include <vm/hat.h> 118 #include <vm/anon.h> 119 #include <vm/page.h> 120 #include <vm/vpage.h> 121 #include <vm/seg.h> 122 #include <vm/rm.h> 123 124 #include <fs/fs_subr.h> 125 126 struct vnode *anon_vp; 127 128 int anon_debug; 129 130 kmutex_t anoninfo_lock; 131 struct k_anoninfo k_anoninfo; 132 ani_free_t ani_free_pool[ANI_MAX_POOL]; 133 pad_mutex_t anon_array_lock[ANON_LOCKSIZE]; 134 kcondvar_t anon_array_cv[ANON_LOCKSIZE]; 135 136 /* 137 * Global hash table for (vp, off) -> anon slot 138 */ 139 extern int swap_maxcontig; 140 size_t anon_hash_size; 141 struct anon **anon_hash; 142 143 static struct kmem_cache *anon_cache; 144 static struct kmem_cache *anonmap_cache; 145 146 pad_mutex_t *anonhash_lock; 147 148 /* 149 * Used to make the increment of all refcnts of all anon slots of a large 150 * page appear to be atomic. The lock is grabbed for the first anon slot of 151 * a large page. 
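 *
 * A minimal illustrative sketch of the convention (not code lifted
 * verbatim from this file), assuming 'ap' is the root (first) anon
 * slot of a large page region obtained via anon_get_ptr():
 *
 *	kmutex_t *ahmpages = APH_MUTEX(ap->an_vp, ap->an_off);
 *
 *	mutex_enter(ahmpages);
 *	... update an_refcnt on every constituent slot ...
 *	mutex_exit(ahmpages);
 *
 * Holding the root slot's hash mutex across the whole update is what
 * lets other threads treat the set of refcnt changes as atomic.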
 */
pad_mutex_t *anonpages_hash_lock;

#define	APH_MUTEX(vp, off) \
	(&anonpages_hash_lock[(ANON_HASH((vp), (off)) & \
	    (AH_LOCK_SIZE - 1))].pad_mutex)

#ifdef VM_STATS
static struct anonvmstats_str {
	ulong_t getpages[30];
	ulong_t privatepages[10];
	ulong_t demotepages[9];
	ulong_t decrefpages[9];
	ulong_t dupfillholes[4];
	ulong_t freepages[1];
} anonvmstats;
#endif /* VM_STATS */

/*ARGSUSED*/
static int
anonmap_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct anon_map *amp = buf;

	rw_init(&amp->a_rwlock, NULL, RW_DEFAULT, NULL);
	cv_init(&amp->a_purgecv, NULL, CV_DEFAULT, NULL);
	mutex_init(&amp->a_pmtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&amp->a_purgemtx, NULL, MUTEX_DEFAULT, NULL);
	return (0);
}

/*ARGSUSED1*/
static void
anonmap_cache_destructor(void *buf, void *cdrarg)
{
	struct anon_map *amp = buf;

	rw_destroy(&amp->a_rwlock);
	cv_destroy(&amp->a_purgecv);
	mutex_destroy(&amp->a_pmtx);
	mutex_destroy(&amp->a_purgemtx);
}

void
anon_init(void)
{
	int i;
	pad_mutex_t *tmp;

	/* These both need to be powers of 2 so round up to the next power */
	anon_hash_size = 1L << highbit((physmem / ANON_HASHAVELEN) - 1);

	/*
	 * We need to align the anonhash_lock and anonpages_hash_lock arrays
	 * to a 64B boundary to avoid false sharing.  We add 63B to our
	 * allocation so that we can get a 64B aligned address to use.
	 * We allocate both of these together to avoid wasting an additional
	 * 63B.
	 */
	tmp = kmem_zalloc((2 * AH_LOCK_SIZE * sizeof (pad_mutex_t)) + 63,
	    KM_SLEEP);
	anonhash_lock = (pad_mutex_t *)P2ROUNDUP((uintptr_t)tmp, 64);
	anonpages_hash_lock = anonhash_lock + AH_LOCK_SIZE;

	for (i = 0; i < AH_LOCK_SIZE; i++) {
		mutex_init(&anonhash_lock[i].pad_mutex, NULL, MUTEX_DEFAULT,
		    NULL);
		mutex_init(&anonpages_hash_lock[i].pad_mutex, NULL,
		    MUTEX_DEFAULT, NULL);
	}

	for (i = 0; i < ANON_LOCKSIZE; i++) {
		mutex_init(&anon_array_lock[i].pad_mutex, NULL,
		    MUTEX_DEFAULT, NULL);
		cv_init(&anon_array_cv[i], NULL, CV_DEFAULT, NULL);
	}

	anon_hash = (struct anon **)
	    kmem_zalloc(sizeof (struct anon *) * anon_hash_size, KM_SLEEP);
	anon_cache = kmem_cache_create("anon_cache", sizeof (struct anon),
	    AN_CACHE_ALIGN, NULL, NULL, NULL, NULL, NULL, KMC_PREFILL);
	anonmap_cache = kmem_cache_create("anonmap_cache",
	    sizeof (struct anon_map), 0,
	    anonmap_cache_constructor, anonmap_cache_destructor, NULL,
	    NULL, NULL, 0);
	swap_maxcontig = (1024 * 1024) >> PAGESHIFT;	/* 1MB of pages */

	anon_vp = vn_alloc(KM_SLEEP);
	vn_setops(anon_vp, swap_vnodeops);
	anon_vp->v_type = VREG;
	anon_vp->v_flag |= (VISSWAP|VISSWAPFS);
}

/*
 * Global anon slot hash table manipulation.
 */

static void
anon_addhash(struct anon *ap)
{
	int index;

	ASSERT(MUTEX_HELD(AH_MUTEX(ap->an_vp, ap->an_off)));
	index = ANON_HASH(ap->an_vp, ap->an_off);
	ap->an_hash = anon_hash[index];
	anon_hash[index] = ap;
}

static void
anon_rmhash(struct anon *ap)
{
	struct anon **app;

	ASSERT(MUTEX_HELD(AH_MUTEX(ap->an_vp, ap->an_off)));

	for (app = &anon_hash[ANON_HASH(ap->an_vp, ap->an_off)];
	    *app; app = &((*app)->an_hash)) {
		if (*app == ap) {
			*app = ap->an_hash;
			break;
		}
	}
}

/*
 * The anon array interfaces.
Functions allocating, 278 * freeing array of pointers, and returning/setting 279 * entries in the array of pointers for a given offset. 280 * 281 * Create the list of pointers 282 */ 283 struct anon_hdr * 284 anon_create(pgcnt_t npages, int flags) 285 { 286 struct anon_hdr *ahp; 287 ulong_t nchunks; 288 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 289 290 if ((ahp = kmem_zalloc(sizeof (struct anon_hdr), kmemflags)) == NULL) { 291 return (NULL); 292 } 293 294 mutex_init(&ahp->serial_lock, NULL, MUTEX_DEFAULT, NULL); 295 /* 296 * Single level case. 297 */ 298 ahp->size = npages; 299 if (npages <= ANON_CHUNK_SIZE || (flags & ANON_ALLOC_FORCE)) { 300 301 if (flags & ANON_ALLOC_FORCE) 302 ahp->flags |= ANON_ALLOC_FORCE; 303 304 ahp->array_chunk = kmem_zalloc( 305 ahp->size * sizeof (struct anon *), kmemflags); 306 307 if (ahp->array_chunk == NULL) { 308 kmem_free(ahp, sizeof (struct anon_hdr)); 309 return (NULL); 310 } 311 } else { 312 /* 313 * 2 Level case. 314 * anon hdr size needs to be rounded off to be a multiple 315 * of ANON_CHUNK_SIZE. This is important as various anon 316 * related functions depend on this. 317 * NOTE - 318 * anon_grow() makes anon hdr size a multiple of 319 * ANON_CHUNK_SIZE. 320 * amp size is <= anon hdr size. 321 * anon_index + seg_pgs <= anon hdr size. 322 */ 323 ahp->size = P2ROUNDUP(npages, ANON_CHUNK_SIZE); 324 nchunks = ahp->size >> ANON_CHUNK_SHIFT; 325 326 ahp->array_chunk = kmem_zalloc(nchunks * sizeof (ulong_t *), 327 kmemflags); 328 329 if (ahp->array_chunk == NULL) { 330 kmem_free(ahp, sizeof (struct anon_hdr)); 331 return (NULL); 332 } 333 } 334 return (ahp); 335 } 336 337 /* 338 * Free the array of pointers 339 */ 340 void 341 anon_release(struct anon_hdr *ahp, pgcnt_t npages) 342 { 343 ulong_t i; 344 void **ppp; 345 ulong_t nchunks; 346 347 ASSERT(npages <= ahp->size); 348 349 /* 350 * Single level case. 351 */ 352 if (npages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 353 kmem_free(ahp->array_chunk, ahp->size * sizeof (struct anon *)); 354 } else { 355 /* 356 * 2 level case. 357 */ 358 nchunks = ahp->size >> ANON_CHUNK_SHIFT; 359 for (i = 0; i < nchunks; i++) { 360 ppp = &ahp->array_chunk[i]; 361 if (*ppp != NULL) 362 kmem_free(*ppp, PAGESIZE); 363 } 364 kmem_free(ahp->array_chunk, nchunks * sizeof (ulong_t *)); 365 } 366 mutex_destroy(&ahp->serial_lock); 367 kmem_free(ahp, sizeof (struct anon_hdr)); 368 } 369 370 /* 371 * Return the pointer from the list for a 372 * specified anon index. 373 */ 374 struct anon * 375 anon_get_ptr(struct anon_hdr *ahp, ulong_t an_idx) 376 { 377 struct anon **app; 378 379 ASSERT(an_idx < ahp->size); 380 381 /* 382 * Single level case. 383 */ 384 if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 385 return ((struct anon *) 386 ((uintptr_t)ahp->array_chunk[an_idx] & ANON_PTRMASK)); 387 } else { 388 389 /* 390 * 2 level case. 391 */ 392 app = ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 393 if (app) { 394 return ((struct anon *) 395 ((uintptr_t)app[an_idx & ANON_CHUNK_OFF] & 396 ANON_PTRMASK)); 397 } else { 398 return (NULL); 399 } 400 } 401 } 402 403 /* 404 * Return the anon pointer for the first valid entry in the anon list, 405 * starting from the given index. 
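 *
 * A hedged usage sketch, mirroring how anon_dup() and anon_free()
 * below drive this routine over an array built with anon_create():
 *
 *	ulong_t index = 0;
 *	struct anon *ap;
 *
 *	while (index < ahp->size &&
 *	    (ap = anon_get_next_ptr(ahp, &index)) != NULL) {
 *		... 'ap' is the slot at the updated 'index' ...
 *		index++;
 *	}
 *
 * On return *index is set to the slot that was found, or to ahp->size
 * when no further valid slot exists.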
406 */ 407 struct anon * 408 anon_get_next_ptr(struct anon_hdr *ahp, ulong_t *index) 409 { 410 struct anon *ap; 411 struct anon **app; 412 ulong_t chunkoff; 413 ulong_t i; 414 ulong_t j; 415 pgcnt_t size; 416 417 i = *index; 418 size = ahp->size; 419 420 ASSERT(i < size); 421 422 if ((size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 423 /* 424 * 1 level case 425 */ 426 while (i < size) { 427 ap = (struct anon *) 428 ((uintptr_t)ahp->array_chunk[i] & ANON_PTRMASK); 429 if (ap) { 430 *index = i; 431 return (ap); 432 } 433 i++; 434 } 435 } else { 436 /* 437 * 2 level case 438 */ 439 chunkoff = i & ANON_CHUNK_OFF; 440 while (i < size) { 441 app = ahp->array_chunk[i >> ANON_CHUNK_SHIFT]; 442 if (app) 443 for (j = chunkoff; j < ANON_CHUNK_SIZE; j++) { 444 ap = (struct anon *) 445 ((uintptr_t)app[j] & ANON_PTRMASK); 446 if (ap) { 447 *index = i + (j - chunkoff); 448 return (ap); 449 } 450 } 451 chunkoff = 0; 452 i = (i + ANON_CHUNK_SIZE) & ~ANON_CHUNK_OFF; 453 } 454 } 455 *index = size; 456 return (NULL); 457 } 458 459 /* 460 * Set list entry with a given pointer for a specified offset 461 */ 462 int 463 anon_set_ptr(struct anon_hdr *ahp, ulong_t an_idx, struct anon *ap, int flags) 464 { 465 void **ppp; 466 struct anon **app; 467 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 468 uintptr_t *ap_addr; 469 470 ASSERT(an_idx < ahp->size); 471 472 /* 473 * Single level case. 474 */ 475 if (ahp->size <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 476 ap_addr = (uintptr_t *)&ahp->array_chunk[an_idx]; 477 } else { 478 479 /* 480 * 2 level case. 481 */ 482 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 483 484 ASSERT(ppp != NULL); 485 if (*ppp == NULL) { 486 mutex_enter(&ahp->serial_lock); 487 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 488 if (*ppp == NULL) { 489 *ppp = kmem_zalloc(PAGESIZE, kmemflags); 490 if (*ppp == NULL) { 491 mutex_exit(&ahp->serial_lock); 492 return (ENOMEM); 493 } 494 } 495 mutex_exit(&ahp->serial_lock); 496 } 497 app = *ppp; 498 ap_addr = (uintptr_t *)&app[an_idx & ANON_CHUNK_OFF]; 499 } 500 *ap_addr = (*ap_addr & ~ANON_PTRMASK) | (uintptr_t)ap; 501 return (0); 502 } 503 504 /* 505 * Copy anon array into a given new anon array 506 */ 507 int 508 anon_copy_ptr(struct anon_hdr *sahp, ulong_t s_idx, 509 struct anon_hdr *dahp, ulong_t d_idx, 510 pgcnt_t npages, int flags) 511 { 512 void **sapp, **dapp; 513 void *ap; 514 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 515 516 ASSERT((s_idx < sahp->size) && (d_idx < dahp->size)); 517 ASSERT((npages <= sahp->size) && (npages <= dahp->size)); 518 519 /* 520 * Both arrays are 1 level. 521 */ 522 if (((sahp->size <= ANON_CHUNK_SIZE) && 523 (dahp->size <= ANON_CHUNK_SIZE)) || 524 ((sahp->flags & ANON_ALLOC_FORCE) && 525 (dahp->flags & ANON_ALLOC_FORCE))) { 526 527 bcopy(&sahp->array_chunk[s_idx], &dahp->array_chunk[d_idx], 528 npages * sizeof (struct anon *)); 529 return (0); 530 } 531 532 /* 533 * Both arrays are 2 levels. 
534 */ 535 if (sahp->size > ANON_CHUNK_SIZE && 536 dahp->size > ANON_CHUNK_SIZE && 537 ((sahp->flags & ANON_ALLOC_FORCE) == 0) && 538 ((dahp->flags & ANON_ALLOC_FORCE) == 0)) { 539 540 ulong_t sapidx, dapidx; 541 ulong_t *sap, *dap; 542 ulong_t chknp; 543 544 while (npages != 0) { 545 546 sapidx = s_idx & ANON_CHUNK_OFF; 547 dapidx = d_idx & ANON_CHUNK_OFF; 548 chknp = ANON_CHUNK_SIZE - MAX(sapidx, dapidx); 549 if (chknp > npages) 550 chknp = npages; 551 552 sapp = &sahp->array_chunk[s_idx >> ANON_CHUNK_SHIFT]; 553 if ((sap = *sapp) != NULL) { 554 dapp = &dahp->array_chunk[d_idx 555 >> ANON_CHUNK_SHIFT]; 556 if ((dap = *dapp) == NULL) { 557 *dapp = kmem_zalloc(PAGESIZE, 558 kmemflags); 559 if ((dap = *dapp) == NULL) 560 return (ENOMEM); 561 } 562 bcopy((sap + sapidx), (dap + dapidx), 563 chknp << ANON_PTRSHIFT); 564 } 565 s_idx += chknp; 566 d_idx += chknp; 567 npages -= chknp; 568 } 569 return (0); 570 } 571 572 /* 573 * At least one of the arrays is 2 level. 574 */ 575 while (npages--) { 576 if ((ap = anon_get_ptr(sahp, s_idx)) != NULL) { 577 ASSERT(!ANON_ISBUSY(anon_get_slot(sahp, s_idx))); 578 if (anon_set_ptr(dahp, d_idx, ap, flags) == ENOMEM) 579 return (ENOMEM); 580 } 581 s_idx++; 582 d_idx++; 583 } 584 return (0); 585 } 586 587 588 /* 589 * ANON_INITBUF is a convenience macro for anon_grow() below. It 590 * takes a buffer dst, which is at least as large as buffer src. It 591 * does a bcopy from src into dst, and then bzeros the extra bytes 592 * of dst. If tail is set, the data in src is tail aligned within 593 * dst instead of head aligned. 594 */ 595 596 #define ANON_INITBUF(src, srclen, dst, dstsize, tail) \ 597 if (tail) { \ 598 bzero((dst), (dstsize) - (srclen)); \ 599 bcopy((src), (char *)(dst) + (dstsize) - (srclen), (srclen)); \ 600 } else { \ 601 bcopy((src), (dst), (srclen)); \ 602 bzero((char *)(dst) + (srclen), (dstsize) - (srclen)); \ 603 } 604 605 #define ANON_1_LEVEL_INC (ANON_CHUNK_SIZE / 8) 606 #define ANON_2_LEVEL_INC (ANON_1_LEVEL_INC * ANON_CHUNK_SIZE) 607 608 /* 609 * anon_grow() is used to efficiently extend an existing anon array. 610 * startidx_p points to the index into the anon array of the first page 611 * that is in use. oldseg_pgs is the number of pages in use, starting at 612 * *startidx_p. newpages is the number of additional pages desired. 613 * 614 * If startidx_p == NULL, startidx is taken to be 0 and cannot be changed. 615 * 616 * The growth is done by creating a new top level of the anon array, 617 * and (if the array is 2-level) reusing the existing second level arrays. 618 * 619 * flags can be used to specify ANON_NOSLEEP and ANON_GROWDOWN. 620 * 621 * Returns the new number of pages in the anon array. 622 */ 623 pgcnt_t 624 anon_grow(struct anon_hdr *ahp, ulong_t *startidx_p, pgcnt_t oldseg_pgs, 625 pgcnt_t newseg_pgs, int flags) 626 { 627 ulong_t startidx = startidx_p ? *startidx_p : 0; 628 pgcnt_t oldamp_pgs = ahp->size, newamp_pgs; 629 pgcnt_t oelems, nelems, totpages; 630 void **level1; 631 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 632 int growdown = (flags & ANON_GROWDOWN); 633 size_t newarrsz, oldarrsz; 634 void *level2; 635 636 ASSERT(!(startidx_p == NULL && growdown)); 637 ASSERT(startidx + oldseg_pgs <= ahp->size); 638 639 /* 640 * Determine the total number of pages needed in the new 641 * anon array. If growing down, totpages is all pages from 642 * startidx through the end of the array, plus <newseg_pgs> 643 * pages. 
If growing up, keep all pages from page 0 through 644 * the last page currently in use, plus <newseg_pgs> pages. 645 */ 646 if (growdown) 647 totpages = oldamp_pgs - startidx + newseg_pgs; 648 else 649 totpages = startidx + oldseg_pgs + newseg_pgs; 650 651 /* If the array is already large enough, just return. */ 652 653 if (oldamp_pgs >= totpages) { 654 if (growdown) 655 *startidx_p = oldamp_pgs - totpages; 656 return (oldamp_pgs); 657 } 658 659 /* 660 * oldamp_pgs/newamp_pgs are the total numbers of pages represented 661 * by the corresponding arrays. 662 * oelems/nelems are the number of pointers in the top level arrays 663 * which may be either level 1 or level 2. 664 * Will the new anon array be one level or two levels? 665 */ 666 if (totpages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 667 newamp_pgs = P2ROUNDUP(totpages, ANON_1_LEVEL_INC); 668 oelems = oldamp_pgs; 669 nelems = newamp_pgs; 670 } else { 671 newamp_pgs = P2ROUNDUP(totpages, ANON_2_LEVEL_INC); 672 oelems = (oldamp_pgs + ANON_CHUNK_OFF) >> ANON_CHUNK_SHIFT; 673 nelems = newamp_pgs >> ANON_CHUNK_SHIFT; 674 } 675 676 newarrsz = nelems * sizeof (void *); 677 level1 = kmem_alloc(newarrsz, kmemflags); 678 if (level1 == NULL) 679 return (0); 680 681 /* Are we converting from a one level to a two level anon array? */ 682 683 if (newamp_pgs > ANON_CHUNK_SIZE && oldamp_pgs <= ANON_CHUNK_SIZE && 684 !(ahp->flags & ANON_ALLOC_FORCE)) { 685 686 /* 687 * Yes, we're converting to a two level. Reuse old level 1 688 * as new level 2 if it is exactly PAGESIZE. Otherwise 689 * alloc a new level 2 and copy the old level 1 data into it. 690 */ 691 if (oldamp_pgs == ANON_CHUNK_SIZE) { 692 level2 = (void *)ahp->array_chunk; 693 } else { 694 level2 = kmem_alloc(PAGESIZE, kmemflags); 695 if (level2 == NULL) { 696 kmem_free(level1, newarrsz); 697 return (0); 698 } 699 oldarrsz = oldamp_pgs * sizeof (void *); 700 701 ANON_INITBUF(ahp->array_chunk, oldarrsz, 702 level2, PAGESIZE, growdown); 703 kmem_free(ahp->array_chunk, oldarrsz); 704 } 705 bzero(level1, newarrsz); 706 if (growdown) 707 level1[nelems - 1] = level2; 708 else 709 level1[0] = level2; 710 } else { 711 oldarrsz = oelems * sizeof (void *); 712 713 ANON_INITBUF(ahp->array_chunk, oldarrsz, 714 level1, newarrsz, growdown); 715 kmem_free(ahp->array_chunk, oldarrsz); 716 } 717 718 ahp->array_chunk = level1; 719 ahp->size = newamp_pgs; 720 if (growdown) 721 *startidx_p = newamp_pgs - totpages; 722 723 return (newamp_pgs); 724 } 725 726 727 /* 728 * Called from clock handler to sync ani_free value. 729 */ 730 731 void 732 set_anoninfo(void) 733 { 734 int ix; 735 pgcnt_t total = 0; 736 737 for (ix = 0; ix < ANI_MAX_POOL; ix++) { 738 total += ani_free_pool[ix].ani_count; 739 } 740 k_anoninfo.ani_free = total; 741 } 742 743 /* 744 * Reserve anon space. 745 * 746 * It's no longer simply a matter of incrementing ani_resv to 747 * reserve swap space, we need to check memory-based as well 748 * as disk-backed (physical) swap. The following algorithm 749 * is used: 750 * Check the space on physical swap 751 * i.e. amount needed < ani_max - ani_phys_resv 752 * If we are swapping on swapfs check 753 * amount needed < (availrmem - swapfs_minfree) 754 * Since the algorithm to check for the quantity of swap space is 755 * almost the same as that for reserving it, we'll just use anon_resvmem 756 * with a flag to decrement availrmem. 757 * 758 * Return non-zero on success. 
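 *
 * A hedged caller sketch (illustrative only), reserving and later
 * releasing swap for a hypothetical mapping of 'len' bytes charged to
 * 'zone':
 *
 *	if (anon_resvmem(len, B_TRUE, zone, 0) == 0)
 *		... fail the mapping, e.g. with EAGAIN ...
 *	...
 *	anon_unresvmem(len, zone);
 *
 * As a worked example of the accounting: a request for 100 pages when
 * only 60 physical swap pages remain (ani_max - ani_phys_resv == 60)
 * reserves those 60 first, and the remaining 40 must come out of
 * availrmem without dropping it below the swapfs_minfree (privileged)
 * or swapfs_minfree + swapfs_reserve (unprivileged) floor.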
759 */ 760 int 761 anon_resvmem(size_t size, boolean_t takemem, zone_t *zone, int tryhard) 762 { 763 pgcnt_t npages = btopr(size); 764 pgcnt_t mswap_pages = 0; 765 pgcnt_t pswap_pages = 0; 766 proc_t *p = curproc; 767 768 if (zone != NULL && takemem) { 769 /* test zone.max-swap resource control */ 770 mutex_enter(&p->p_lock); 771 if (rctl_incr_swap(p, zone, ptob(npages)) != 0) { 772 mutex_exit(&p->p_lock); 773 return (0); 774 } 775 mutex_exit(&p->p_lock); 776 } 777 mutex_enter(&anoninfo_lock); 778 779 /* 780 * pswap_pages is the number of pages we can take from 781 * physical (i.e. disk-backed) swap. 782 */ 783 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 784 pswap_pages = k_anoninfo.ani_max - k_anoninfo.ani_phys_resv; 785 786 ANON_PRINT(A_RESV, 787 ("anon_resvmem: npages %lu takemem %u pswap %lu caller %p\n", 788 npages, takemem, pswap_pages, (void *)caller())); 789 790 if (npages <= pswap_pages) { 791 /* 792 * we have enough space on a physical swap 793 */ 794 if (takemem) 795 k_anoninfo.ani_phys_resv += npages; 796 mutex_exit(&anoninfo_lock); 797 return (1); 798 } else if (pswap_pages != 0) { 799 /* 800 * we have some space on a physical swap 801 */ 802 if (takemem) { 803 /* 804 * use up remainder of phys swap 805 */ 806 k_anoninfo.ani_phys_resv += pswap_pages; 807 ASSERT(k_anoninfo.ani_phys_resv == k_anoninfo.ani_max); 808 } 809 } 810 /* 811 * since (npages > pswap_pages) we need mem swap 812 * mswap_pages is the number of pages needed from availrmem 813 */ 814 ASSERT(npages > pswap_pages); 815 mswap_pages = npages - pswap_pages; 816 817 ANON_PRINT(A_RESV, ("anon_resvmem: need %ld pages from memory\n", 818 mswap_pages)); 819 820 /* 821 * priv processes can reserve memory as swap as long as availrmem 822 * remains greater than swapfs_minfree; in the case of non-priv 823 * processes, memory can be reserved as swap only if availrmem 824 * doesn't fall below (swapfs_minfree + swapfs_reserve). Thus, 825 * swapfs_reserve amount of memswap is not available to non-priv 826 * processes. This protects daemons such as automounter dying 827 * as a result of application processes eating away almost entire 828 * membased swap. This safeguard becomes useless if apps are run 829 * with root access. 830 * 831 * swapfs_reserve is minimum of 4Mb or 1/16 of physmem. 832 * 833 */ 834 if (tryhard) { 835 pgcnt_t floor_pages; 836 837 if (secpolicy_resource_anon_mem(CRED())) { 838 floor_pages = swapfs_minfree; 839 } else { 840 floor_pages = swapfs_minfree + swapfs_reserve; 841 } 842 843 mutex_exit(&anoninfo_lock); 844 (void) page_reclaim_mem(mswap_pages, floor_pages, 0); 845 mutex_enter(&anoninfo_lock); 846 } 847 848 mutex_enter(&freemem_lock); 849 if (availrmem > (swapfs_minfree + swapfs_reserve + mswap_pages) || 850 (availrmem > (swapfs_minfree + mswap_pages) && 851 secpolicy_resource(CRED()) == 0)) { 852 853 if (takemem) { 854 /* 855 * Take the memory from the rest of the system. 
856 */ 857 availrmem -= mswap_pages; 858 mutex_exit(&freemem_lock); 859 k_anoninfo.ani_mem_resv += mswap_pages; 860 ANI_ADD(mswap_pages); 861 ANON_PRINT((A_RESV | A_MRESV), 862 ("anon_resvmem: took %ld pages of availrmem\n", 863 mswap_pages)); 864 } else { 865 mutex_exit(&freemem_lock); 866 } 867 868 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 869 mutex_exit(&anoninfo_lock); 870 return (1); 871 } else { 872 /* 873 * Fail if not enough memory 874 */ 875 if (takemem) { 876 k_anoninfo.ani_phys_resv -= pswap_pages; 877 } 878 879 mutex_exit(&freemem_lock); 880 mutex_exit(&anoninfo_lock); 881 ANON_PRINT(A_RESV, 882 ("anon_resvmem: not enough space from swapfs\n")); 883 if (zone != NULL && takemem) 884 rctl_decr_swap(zone, ptob(npages)); 885 return (0); 886 } 887 } 888 889 /* 890 * Give back an anon reservation. 891 */ 892 void 893 anon_unresvmem(size_t size, zone_t *zone) 894 { 895 pgcnt_t npages = btopr(size); 896 spgcnt_t mem_free_pages = 0; 897 pgcnt_t phys_free_slots; 898 #ifdef ANON_DEBUG 899 pgcnt_t mem_resv; 900 #endif 901 if (zone != NULL) 902 rctl_decr_swap(zone, ptob(npages)); 903 904 mutex_enter(&anoninfo_lock); 905 906 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 907 908 /* 909 * If some of this reservation belonged to swapfs 910 * give it back to availrmem. 911 * ani_mem_resv is the amount of availrmem swapfs has reserved. 912 * but some of that memory could be locked by segspt so we can only 913 * return non locked ani_mem_resv back to availrmem 914 */ 915 if (k_anoninfo.ani_mem_resv > k_anoninfo.ani_locked_swap) { 916 ANON_PRINT((A_RESV | A_MRESV), 917 ("anon_unresv: growing availrmem by %ld pages\n", 918 MIN(k_anoninfo.ani_mem_resv, npages))); 919 920 mem_free_pages = MIN((spgcnt_t)(k_anoninfo.ani_mem_resv - 921 k_anoninfo.ani_locked_swap), npages); 922 mutex_enter(&freemem_lock); 923 availrmem += mem_free_pages; 924 mutex_exit(&freemem_lock); 925 k_anoninfo.ani_mem_resv -= mem_free_pages; 926 927 ANI_ADD(-mem_free_pages); 928 } 929 /* 930 * The remainder of the pages is returned to phys swap 931 */ 932 ASSERT(npages >= mem_free_pages); 933 phys_free_slots = npages - mem_free_pages; 934 935 if (phys_free_slots) { 936 k_anoninfo.ani_phys_resv -= phys_free_slots; 937 } 938 939 #ifdef ANON_DEBUG 940 mem_resv = k_anoninfo.ani_mem_resv; 941 #endif 942 943 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 944 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 945 946 mutex_exit(&anoninfo_lock); 947 948 ANON_PRINT(A_RESV, ("anon_unresv: %lu, tot %lu, caller %p\n", 949 npages, mem_resv, (void *)caller())); 950 } 951 952 /* 953 * Allocate an anon slot and return it with the lock held. 954 */ 955 struct anon * 956 anon_alloc(struct vnode *vp, anoff_t off) 957 { 958 struct anon *ap; 959 kmutex_t *ahm; 960 961 ap = kmem_cache_alloc(anon_cache, KM_SLEEP); 962 if (vp == NULL) { 963 swap_alloc(ap); 964 } else { 965 ap->an_vp = vp; 966 ap->an_off = off; 967 } 968 ap->an_refcnt = 1; 969 ap->an_pvp = NULL; 970 ap->an_poff = 0; 971 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 972 mutex_enter(ahm); 973 anon_addhash(ap); 974 mutex_exit(ahm); 975 ANI_ADD(-1); 976 ANON_PRINT(A_ANON, ("anon_alloc: returning ap %p, vp %p\n", 977 (void *)ap, (ap ? (void *)ap->an_vp : NULL))); 978 return (ap); 979 } 980 981 /* 982 * Called for pages locked in memory via softlock/pagelock/mlock to make sure 983 * such pages don't consume any physical swap resources needed for swapping 984 * unlocked pages. 
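 *
 * In effect (see the body below): if the slot still has physical swap
 * backing (an_pvp != NULL), that swap space is released with
 * swap_phys_free() and the page is marked modified so it will be
 * written out again if it is ever unlocked and paged out.  A hedged
 * sketch of the expected calling context, with 'pp' locked by the
 * caller for slot 'ap':
 *
 *	ASSERT(PAGE_LOCKED(pp));
 *	anon_swap_free(ap, pp);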
985 */ 986 void 987 anon_swap_free(struct anon *ap, page_t *pp) 988 { 989 kmutex_t *ahm; 990 991 ASSERT(ap != NULL); 992 ASSERT(pp != NULL); 993 ASSERT(PAGE_LOCKED(pp)); 994 ASSERT(pp->p_vnode != NULL); 995 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 996 ASSERT(ap->an_refcnt != 0); 997 ASSERT(pp->p_vnode == ap->an_vp); 998 ASSERT(pp->p_offset == ap->an_off); 999 1000 if (ap->an_pvp == NULL) 1001 return; 1002 1003 page_io_lock(pp); 1004 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1005 mutex_enter(ahm); 1006 1007 ASSERT(ap->an_refcnt != 0); 1008 ASSERT(pp->p_vnode == ap->an_vp); 1009 ASSERT(pp->p_offset == ap->an_off); 1010 1011 if (ap->an_pvp != NULL) { 1012 swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE); 1013 ap->an_pvp = NULL; 1014 ap->an_poff = 0; 1015 mutex_exit(ahm); 1016 hat_setmod(pp); 1017 } else { 1018 mutex_exit(ahm); 1019 } 1020 page_io_unlock(pp); 1021 } 1022 1023 /* 1024 * Decrement the reference count of an anon page. 1025 * If reference count goes to zero, free it and 1026 * its associated page (if any). 1027 */ 1028 void 1029 anon_decref(struct anon *ap) 1030 { 1031 page_t *pp; 1032 struct vnode *vp; 1033 anoff_t off; 1034 kmutex_t *ahm; 1035 1036 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1037 mutex_enter(ahm); 1038 ASSERT(ap->an_refcnt != 0); 1039 if (ap->an_refcnt == 0) 1040 panic("anon_decref: slot count 0"); 1041 if (--ap->an_refcnt == 0) { 1042 swap_xlate(ap, &vp, &off); 1043 anon_rmhash(ap); 1044 if (ap->an_pvp != NULL) 1045 swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE); 1046 mutex_exit(ahm); 1047 1048 /* 1049 * If there is a page for this anon slot we will need to 1050 * call VN_DISPOSE to get rid of the vp association and 1051 * put the page back on the free list as really free. 1052 * Acquire the "exclusive" lock to ensure that any 1053 * pending i/o always completes before the swap slot 1054 * is freed. 1055 */ 1056 pp = page_lookup(vp, (u_offset_t)off, SE_EXCL); 1057 if (pp != NULL) { 1058 /*LINTED: constant in conditional context */ 1059 VN_DISPOSE(pp, B_INVAL, 0, kcred); 1060 } 1061 ANON_PRINT(A_ANON, ("anon_decref: free ap %p, vp %p\n", 1062 (void *)ap, (void *)ap->an_vp)); 1063 1064 kmem_cache_free(anon_cache, ap); 1065 1066 ANI_ADD(1); 1067 } else { 1068 mutex_exit(ahm); 1069 } 1070 } 1071 1072 1073 /* 1074 * check an_refcnt of the root anon slot (anon_index argument is aligned at 1075 * seg->s_szc level) to determine whether COW processing is required. 1076 * anonpages_hash_lock[] held on the root ap ensures that if root's 1077 * refcnt is 1 all other refcnt's are 1 as well (and they can't increase 1078 * later since this process can't fork while its AS lock is held). 1079 * 1080 * returns 1 if the root anon slot has a refcnt > 1 otherwise returns 0. 1081 */ 1082 int 1083 anon_szcshare(struct anon_hdr *ahp, ulong_t anon_index) 1084 { 1085 struct anon *ap; 1086 kmutex_t *ahmpages = NULL; 1087 1088 ap = anon_get_ptr(ahp, anon_index); 1089 if (ap == NULL) 1090 return (0); 1091 1092 ahmpages = APH_MUTEX(ap->an_vp, ap->an_off); 1093 mutex_enter(ahmpages); 1094 ASSERT(ap->an_refcnt >= 1); 1095 if (ap->an_refcnt == 1) { 1096 mutex_exit(ahmpages); 1097 return (0); 1098 } 1099 mutex_exit(ahmpages); 1100 return (1); 1101 } 1102 /* 1103 * Check 'nslots' anon slots for refcnt > 1. 1104 * 1105 * returns 1 if any of the 'nslots' anon slots has a refcnt > 1 otherwise 1106 * returns 0. 
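 *
 * This helper is used further down in anon_map_getpages(): when the
 * constituent pages just looked up turn out to be shared, write
 * permission is withdrawn so a later write takes the copy-on-write
 * path:
 *
 *	if ((*protp & PROT_WRITE) &&
 *	    anon_share(amp->ahp, an_idx, npgs))
 *		*protp &= ~PROT_WRITE;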
1107 */ 1108 static int 1109 anon_share(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots) 1110 { 1111 struct anon *ap; 1112 1113 while (nslots-- > 0) { 1114 if ((ap = anon_get_ptr(ahp, anon_index)) != NULL && 1115 ap->an_refcnt > 1) 1116 return (1); 1117 anon_index++; 1118 } 1119 1120 return (0); 1121 } 1122 1123 static void 1124 anon_decref_pages( 1125 struct anon_hdr *ahp, 1126 ulong_t an_idx, 1127 uint_t szc) 1128 { 1129 struct anon *ap = anon_get_ptr(ahp, an_idx); 1130 kmutex_t *ahmpages = NULL; 1131 page_t *pp; 1132 pgcnt_t pgcnt = page_get_pagecnt(szc); 1133 pgcnt_t i; 1134 struct vnode *vp; 1135 anoff_t off; 1136 kmutex_t *ahm; 1137 #ifdef DEBUG 1138 int refcnt = 1; 1139 #endif 1140 1141 ASSERT(szc != 0); 1142 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1143 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1144 ASSERT(an_idx < ahp->size); 1145 1146 if (ahp->size - an_idx < pgcnt) { 1147 /* 1148 * In case of shared mappings total anon map size may not be 1149 * the largest page size aligned. 1150 */ 1151 pgcnt = ahp->size - an_idx; 1152 } 1153 1154 VM_STAT_ADD(anonvmstats.decrefpages[0]); 1155 1156 if (ap != NULL) { 1157 ahmpages = APH_MUTEX(ap->an_vp, ap->an_off); 1158 mutex_enter(ahmpages); 1159 ASSERT((refcnt = ap->an_refcnt) != 0); 1160 VM_STAT_ADD(anonvmstats.decrefpages[1]); 1161 if (ap->an_refcnt == 1) { 1162 VM_STAT_ADD(anonvmstats.decrefpages[2]); 1163 ASSERT(!anon_share(ahp, an_idx, pgcnt)); 1164 mutex_exit(ahmpages); 1165 ahmpages = NULL; 1166 } 1167 } 1168 1169 i = 0; 1170 while (i < pgcnt) { 1171 if ((ap = anon_get_ptr(ahp, an_idx + i)) == NULL) { 1172 ASSERT(refcnt == 1 && ahmpages == NULL); 1173 i++; 1174 continue; 1175 } 1176 ASSERT(ap->an_refcnt == refcnt); 1177 ASSERT(ahmpages != NULL || ap->an_refcnt == 1); 1178 ASSERT(ahmpages == NULL || ap->an_refcnt > 1); 1179 1180 if (ahmpages == NULL) { 1181 swap_xlate(ap, &vp, &off); 1182 pp = page_lookup(vp, (u_offset_t)off, SE_EXCL); 1183 if (pp == NULL || pp->p_szc == 0) { 1184 VM_STAT_ADD(anonvmstats.decrefpages[3]); 1185 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1186 (void) anon_set_ptr(ahp, an_idx + i, NULL, 1187 ANON_SLEEP); 1188 mutex_enter(ahm); 1189 ap->an_refcnt--; 1190 ASSERT(ap->an_refcnt == 0); 1191 anon_rmhash(ap); 1192 if (ap->an_pvp) 1193 swap_phys_free(ap->an_pvp, ap->an_poff, 1194 PAGESIZE); 1195 mutex_exit(ahm); 1196 if (pp == NULL) { 1197 pp = page_lookup(vp, (u_offset_t)off, 1198 SE_EXCL); 1199 ASSERT(pp == NULL || pp->p_szc == 0); 1200 } 1201 if (pp != NULL) { 1202 VM_STAT_ADD(anonvmstats.decrefpages[4]); 1203 /*LINTED*/ 1204 VN_DISPOSE(pp, B_INVAL, 0, kcred); 1205 } 1206 kmem_cache_free(anon_cache, ap); 1207 ANI_ADD(1); 1208 i++; 1209 } else { 1210 pgcnt_t j; 1211 pgcnt_t curpgcnt = 1212 page_get_pagecnt(pp->p_szc); 1213 size_t ppasize = curpgcnt * sizeof (page_t *); 1214 page_t **ppa = kmem_alloc(ppasize, KM_SLEEP); 1215 int dispose = 0; 1216 1217 VM_STAT_ADD(anonvmstats.decrefpages[5]); 1218 1219 ASSERT(pp->p_szc <= szc); 1220 ASSERT(IS_P2ALIGNED(curpgcnt, curpgcnt)); 1221 ASSERT(IS_P2ALIGNED(i, curpgcnt)); 1222 ASSERT(i + curpgcnt <= pgcnt); 1223 ASSERT(!(page_pptonum(pp) & (curpgcnt - 1))); 1224 ppa[0] = pp; 1225 for (j = i + 1; j < i + curpgcnt; j++) { 1226 ap = anon_get_ptr(ahp, an_idx + j); 1227 ASSERT(ap != NULL && 1228 ap->an_refcnt == 1); 1229 swap_xlate(ap, &vp, &off); 1230 pp = page_lookup(vp, (u_offset_t)off, 1231 SE_EXCL); 1232 if (pp == NULL) 1233 panic("anon_decref_pages: " 1234 "no page"); 1235 1236 (void) hat_pageunload(pp, 1237 HAT_FORCE_PGUNLOAD); 1238 ASSERT(pp->p_szc == ppa[0]->p_szc); 1239 
ASSERT(page_pptonum(pp) - 1 == 1240 page_pptonum(ppa[j - i - 1])); 1241 ppa[j - i] = pp; 1242 if (ap->an_pvp != NULL && 1243 !vn_matchopval(ap->an_pvp, 1244 VOPNAME_DISPOSE, 1245 (fs_generic_func_p)fs_dispose)) 1246 dispose = 1; 1247 } 1248 for (j = i; j < i + curpgcnt; j++) { 1249 ap = anon_get_ptr(ahp, an_idx + j); 1250 ASSERT(ap != NULL && 1251 ap->an_refcnt == 1); 1252 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1253 (void) anon_set_ptr(ahp, an_idx + j, 1254 NULL, ANON_SLEEP); 1255 mutex_enter(ahm); 1256 ap->an_refcnt--; 1257 ASSERT(ap->an_refcnt == 0); 1258 anon_rmhash(ap); 1259 if (ap->an_pvp) 1260 swap_phys_free(ap->an_pvp, 1261 ap->an_poff, PAGESIZE); 1262 mutex_exit(ahm); 1263 kmem_cache_free(anon_cache, ap); 1264 ANI_ADD(1); 1265 } 1266 if (!dispose) { 1267 VM_STAT_ADD(anonvmstats.decrefpages[6]); 1268 page_destroy_pages(ppa[0]); 1269 } else { 1270 VM_STAT_ADD(anonvmstats.decrefpages[7]); 1271 for (j = 0; j < curpgcnt; j++) { 1272 ASSERT(PAGE_EXCL(ppa[j])); 1273 ppa[j]->p_szc = 0; 1274 } 1275 for (j = 0; j < curpgcnt; j++) { 1276 ASSERT(!hat_page_is_mapped( 1277 ppa[j])); 1278 /*LINTED*/ 1279 VN_DISPOSE(ppa[j], B_INVAL, 0, 1280 kcred); 1281 } 1282 } 1283 kmem_free(ppa, ppasize); 1284 i += curpgcnt; 1285 } 1286 } else { 1287 VM_STAT_ADD(anonvmstats.decrefpages[8]); 1288 (void) anon_set_ptr(ahp, an_idx + i, NULL, ANON_SLEEP); 1289 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1290 mutex_enter(ahm); 1291 ap->an_refcnt--; 1292 mutex_exit(ahm); 1293 i++; 1294 } 1295 } 1296 1297 if (ahmpages != NULL) { 1298 mutex_exit(ahmpages); 1299 } 1300 } 1301 1302 /* 1303 * Duplicate references to size bytes worth of anon pages. 1304 * Used when duplicating a segment that contains private anon pages. 1305 * This code assumes that procedure calling this one has already used 1306 * hat_chgprot() to disable write access to the range of addresses that 1307 * that *old actually refers to. 1308 */ 1309 void 1310 anon_dup(struct anon_hdr *old, ulong_t old_idx, struct anon_hdr *new, 1311 ulong_t new_idx, size_t size) 1312 { 1313 spgcnt_t npages; 1314 kmutex_t *ahm; 1315 struct anon *ap; 1316 ulong_t off; 1317 ulong_t index; 1318 1319 npages = btopr(size); 1320 while (npages > 0) { 1321 index = old_idx; 1322 if ((ap = anon_get_next_ptr(old, &index)) == NULL) 1323 break; 1324 1325 ASSERT(!ANON_ISBUSY(anon_get_slot(old, index))); 1326 off = index - old_idx; 1327 npages -= off; 1328 if (npages <= 0) 1329 break; 1330 1331 (void) anon_set_ptr(new, new_idx + off, ap, ANON_SLEEP); 1332 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1333 1334 mutex_enter(ahm); 1335 ap->an_refcnt++; 1336 mutex_exit(ahm); 1337 1338 off++; 1339 new_idx += off; 1340 old_idx += off; 1341 npages--; 1342 } 1343 } 1344 1345 /* 1346 * Just like anon_dup but also guarantees there are no holes (unallocated anon 1347 * slots) within any large page region. That means if a large page region is 1348 * empty in the old array it will skip it. If there are 1 or more valid slots 1349 * in the large page region of the old array it will make sure to fill in any 1350 * unallocated ones and also copy them to the new array. If noalloc is 1 large 1351 * page region should either have no valid anon slots or all slots should be 1352 * valid. 
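 *
 * For example (illustrative only), a 4-slot large page region that
 * looks like [A, NULL, B, NULL] in the old array has its two holes
 * filled with fresh slots from anon_alloc(NULL, 0), all four refcnts
 * are bumped under the region's APH_MUTEX so the update appears
 * atomic, and all four pointers are copied into the new array.  A
 * region that is entirely NULL is skipped, and with noalloc set a
 * partially filled region panics instead of being filled in.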
1353 */ 1354 void 1355 anon_dup_fill_holes( 1356 struct anon_hdr *old, 1357 ulong_t old_idx, 1358 struct anon_hdr *new, 1359 ulong_t new_idx, 1360 size_t size, 1361 uint_t szc, 1362 int noalloc) 1363 { 1364 struct anon *ap; 1365 spgcnt_t npages; 1366 kmutex_t *ahm, *ahmpages = NULL; 1367 pgcnt_t pgcnt, i; 1368 ulong_t index, off; 1369 #ifdef DEBUG 1370 int refcnt; 1371 #endif 1372 1373 ASSERT(szc != 0); 1374 pgcnt = page_get_pagecnt(szc); 1375 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1376 npages = btopr(size); 1377 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1378 ASSERT(IS_P2ALIGNED(old_idx, pgcnt)); 1379 1380 VM_STAT_ADD(anonvmstats.dupfillholes[0]); 1381 1382 while (npages > 0) { 1383 index = old_idx; 1384 1385 /* 1386 * Find the next valid slot. 1387 */ 1388 if (anon_get_next_ptr(old, &index) == NULL) 1389 break; 1390 1391 ASSERT(!ANON_ISBUSY(anon_get_slot(old, index))); 1392 /* 1393 * Now backup index to the beginning of the 1394 * current large page region of the old array. 1395 */ 1396 index = P2ALIGN(index, pgcnt); 1397 off = index - old_idx; 1398 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1399 npages -= off; 1400 if (npages <= 0) 1401 break; 1402 1403 /* 1404 * Fill and copy a large page regions worth 1405 * of anon slots. 1406 */ 1407 for (i = 0; i < pgcnt; i++) { 1408 if ((ap = anon_get_ptr(old, index + i)) == NULL) { 1409 if (noalloc) { 1410 panic("anon_dup_fill_holes: " 1411 "empty anon slot\n"); 1412 } 1413 VM_STAT_ADD(anonvmstats.dupfillholes[1]); 1414 ap = anon_alloc(NULL, 0); 1415 (void) anon_set_ptr(old, index + i, ap, 1416 ANON_SLEEP); 1417 } else if (i == 0) { 1418 /* 1419 * make the increment of all refcnts of all 1420 * anon slots of a large page appear atomic by 1421 * getting an anonpages_hash_lock for the 1422 * first anon slot of a large page. 1423 */ 1424 VM_STAT_ADD(anonvmstats.dupfillholes[2]); 1425 1426 ahmpages = APH_MUTEX(ap->an_vp, ap->an_off); 1427 mutex_enter(ahmpages); 1428 /*LINTED*/ 1429 ASSERT(refcnt = ap->an_refcnt); 1430 1431 VM_STAT_COND_ADD(ap->an_refcnt > 1, 1432 anonvmstats.dupfillholes[3]); 1433 } 1434 (void) anon_set_ptr(new, new_idx + off + i, ap, 1435 ANON_SLEEP); 1436 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1437 mutex_enter(ahm); 1438 ASSERT(ahmpages != NULL || ap->an_refcnt == 1); 1439 ASSERT(i == 0 || ahmpages == NULL || 1440 refcnt == ap->an_refcnt); 1441 ap->an_refcnt++; 1442 mutex_exit(ahm); 1443 } 1444 if (ahmpages != NULL) { 1445 mutex_exit(ahmpages); 1446 ahmpages = NULL; 1447 } 1448 off += pgcnt; 1449 new_idx += off; 1450 old_idx += off; 1451 npages -= pgcnt; 1452 } 1453 } 1454 1455 /* 1456 * Used when a segment with a vnode changes szc. similarly to 1457 * anon_dup_fill_holes() makes sure each large page region either has no anon 1458 * slots or all of them. but new slots are created by COWing the file 1459 * pages. on entrance no anon slots should be shared. 
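 *
 * A hedged sketch of what filling one hole amounts to (taken from the
 * body below): the file page is read through the segment's vnode and
 * then privatized into a new anon slot,
 *
 *	err = VOP_GETPAGE(vp, vp_off, PAGESIZE, NULL, pl, PAGESIZE,
 *	    seg, addr, S_READ, cred, NULL);
 *	pp = anon_private(&ap, seg, addr, prot, pl[0], pageflags, cred);
 *	(void) anon_set_ptr(ahp, an_idx, ap, ANON_SLEEP);
 *
 * so on success every large page region touched here ends up either
 * fully populated with private slots or left entirely empty.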
1460 */ 1461 int 1462 anon_fill_cow_holes( 1463 struct seg *seg, 1464 caddr_t addr, 1465 struct anon_hdr *ahp, 1466 ulong_t an_idx, 1467 struct vnode *vp, 1468 u_offset_t vp_off, 1469 size_t size, 1470 uint_t szc, 1471 uint_t prot, 1472 struct vpage vpage[], 1473 struct cred *cred) 1474 { 1475 struct anon *ap; 1476 spgcnt_t npages; 1477 pgcnt_t pgcnt, i; 1478 ulong_t index, off; 1479 int err = 0; 1480 int pageflags = 0; 1481 1482 ASSERT(szc != 0); 1483 pgcnt = page_get_pagecnt(szc); 1484 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1485 npages = btopr(size); 1486 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1487 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1488 1489 while (npages > 0) { 1490 index = an_idx; 1491 1492 /* 1493 * Find the next valid slot. 1494 */ 1495 if (anon_get_next_ptr(ahp, &index) == NULL) { 1496 break; 1497 } 1498 1499 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1500 /* 1501 * Now backup index to the beginning of the 1502 * current large page region of the anon array. 1503 */ 1504 index = P2ALIGN(index, pgcnt); 1505 off = index - an_idx; 1506 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1507 npages -= off; 1508 if (npages <= 0) 1509 break; 1510 an_idx += off; 1511 vp_off += ptob(off); 1512 addr += ptob(off); 1513 if (vpage != NULL) { 1514 vpage += off; 1515 } 1516 1517 for (i = 0; i < pgcnt; i++, an_idx++, vp_off += PAGESIZE) { 1518 if ((ap = anon_get_ptr(ahp, an_idx)) == NULL) { 1519 page_t *pl[1 + 1]; 1520 page_t *pp; 1521 1522 err = VOP_GETPAGE(vp, vp_off, PAGESIZE, NULL, 1523 pl, PAGESIZE, seg, addr, S_READ, cred, 1524 NULL); 1525 if (err) { 1526 break; 1527 } 1528 if (vpage != NULL) { 1529 prot = VPP_PROT(vpage); 1530 pageflags = VPP_ISPPLOCK(vpage) ? 1531 LOCK_PAGE : 0; 1532 } 1533 pp = anon_private(&ap, seg, addr, prot, pl[0], 1534 pageflags, cred); 1535 if (pp == NULL) { 1536 err = ENOMEM; 1537 break; 1538 } 1539 (void) anon_set_ptr(ahp, an_idx, ap, 1540 ANON_SLEEP); 1541 page_unlock(pp); 1542 } 1543 ASSERT(ap->an_refcnt == 1); 1544 addr += PAGESIZE; 1545 if (vpage != NULL) { 1546 vpage++; 1547 } 1548 } 1549 npages -= pgcnt; 1550 } 1551 1552 return (err); 1553 } 1554 1555 /* 1556 * Free a group of "size" anon pages, size in bytes, 1557 * and clear out the pointers to the anon entries. 1558 */ 1559 void 1560 anon_free(struct anon_hdr *ahp, ulong_t index, size_t size) 1561 { 1562 spgcnt_t npages; 1563 struct anon *ap; 1564 ulong_t old; 1565 1566 npages = btopr(size); 1567 1568 while (npages > 0) { 1569 old = index; 1570 if ((ap = anon_get_next_ptr(ahp, &index)) == NULL) 1571 break; 1572 1573 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1574 npages -= index - old; 1575 if (npages <= 0) 1576 break; 1577 1578 (void) anon_set_ptr(ahp, index, NULL, ANON_SLEEP); 1579 anon_decref(ap); 1580 /* 1581 * Bump index and decrement page count 1582 */ 1583 index++; 1584 npages--; 1585 } 1586 } 1587 1588 void 1589 anon_free_pages( 1590 struct anon_hdr *ahp, 1591 ulong_t an_idx, 1592 size_t size, 1593 uint_t szc) 1594 { 1595 spgcnt_t npages; 1596 pgcnt_t pgcnt; 1597 ulong_t index, off; 1598 1599 ASSERT(szc != 0); 1600 pgcnt = page_get_pagecnt(szc); 1601 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1602 npages = btopr(size); 1603 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1604 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1605 ASSERT(an_idx < ahp->size); 1606 1607 VM_STAT_ADD(anonvmstats.freepages[0]); 1608 1609 while (npages > 0) { 1610 index = an_idx; 1611 1612 /* 1613 * Find the next valid slot. 
		 */
		if (anon_get_next_ptr(ahp, &index) == NULL)
			break;

		ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index)));
		/*
		 * Now backup index to the beginning of the
		 * current large page region of the old array.
		 */
		index = P2ALIGN(index, pgcnt);
		off = index - an_idx;
		ASSERT(IS_P2ALIGNED(off, pgcnt));
		npages -= off;
		if (npages <= 0)
			break;

		anon_decref_pages(ahp, index, szc);

		off += pgcnt;
		an_idx += off;
		npages -= pgcnt;
	}
}

/*
 * Make anonymous pages discardable
 */
void
anon_disclaim(struct anon_map *amp, ulong_t index, size_t size)
{
	spgcnt_t npages = btopr(size);
	struct anon *ap;
	struct vnode *vp;
	anoff_t off;
	page_t *pp, *root_pp;
	kmutex_t *ahm;
	pgcnt_t pgcnt;
	ulong_t old_idx, idx, i;
	struct anon_hdr *ahp = amp->ahp;
	anon_sync_obj_t cookie;

	ASSERT(RW_READ_HELD(&amp->a_rwlock));
	pgcnt = 1;
	for (; npages > 0; index = (pgcnt == 1) ? index + 1 :
	    P2ROUNDUP(index + 1, pgcnt), npages -= pgcnt) {

		/*
		 * get anon pointer and index for the first valid entry
		 * in the anon list, starting from "index"
		 */
		old_idx = index;
		if ((ap = anon_get_next_ptr(ahp, &index)) == NULL)
			break;

		/*
		 * decrement npages by number of NULL anon slots we skipped
		 */
		npages -= index - old_idx;
		if (npages <= 0)
			break;

		anon_array_enter(amp, index, &cookie);
		ap = anon_get_ptr(ahp, index);
		ASSERT(ap != NULL);

		/*
		 * Get anonymous page and try to lock it SE_EXCL;
		 * if we couldn't grab the lock we skip to next page.
		 */
		swap_xlate(ap, &vp, &off);
		pp = page_lookup_nowait(vp, (u_offset_t)off, SE_EXCL);
		if (pp == NULL) {
			segadvstat.MADV_FREE_miss.value.ul++;
			pgcnt = 1;
			anon_array_exit(&cookie);
			continue;
		}
		pgcnt = page_get_pagecnt(pp->p_szc);

		/*
		 * we cannot free a page which is permanently locked.
		 * The page_struct_lock need not be acquired to examine
		 * these fields since the page has an "exclusive" lock.
		 */
		if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
			page_unlock(pp);
			segadvstat.MADV_FREE_miss.value.ul++;
			anon_array_exit(&cookie);
			continue;
		}

		ahm = AH_MUTEX(vp, off);
		mutex_enter(ahm);
		ASSERT(ap->an_refcnt != 0);
		/*
		 * skip this one if copy-on-write is not yet broken.
		 */
		if (ap->an_refcnt > 1) {
			mutex_exit(ahm);
			page_unlock(pp);
			segadvstat.MADV_FREE_miss.value.ul++;
			anon_array_exit(&cookie);
			continue;
		}

		if (pp->p_szc == 0) {
			pgcnt = 1;

			/*
			 * free swap slot;
			 */
			if (ap->an_pvp) {
				swap_phys_free(ap->an_pvp, ap->an_poff,
				    PAGESIZE);
				ap->an_pvp = NULL;
				ap->an_poff = 0;
			}
			mutex_exit(ahm);
			segadvstat.MADV_FREE_hit.value.ul++;

			/*
			 * while we are at it, unload all the translations
			 * and attempt to free the page.
1737 */ 1738 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1739 /*LINTED: constant in conditional context */ 1740 VN_DISPOSE(pp, B_FREE, 0, kcred); 1741 anon_array_exit(&cookie); 1742 continue; 1743 } 1744 1745 pgcnt = page_get_pagecnt(pp->p_szc); 1746 if (!IS_P2ALIGNED(index, pgcnt) || npages < pgcnt) { 1747 if (!page_try_demote_pages(pp)) { 1748 mutex_exit(ahm); 1749 page_unlock(pp); 1750 segadvstat.MADV_FREE_miss.value.ul++; 1751 anon_array_exit(&cookie); 1752 continue; 1753 } else { 1754 pgcnt = 1; 1755 if (ap->an_pvp) { 1756 swap_phys_free(ap->an_pvp, 1757 ap->an_poff, PAGESIZE); 1758 ap->an_pvp = NULL; 1759 ap->an_poff = 0; 1760 } 1761 mutex_exit(ahm); 1762 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1763 /*LINTED*/ 1764 VN_DISPOSE(pp, B_FREE, 0, kcred); 1765 segadvstat.MADV_FREE_hit.value.ul++; 1766 anon_array_exit(&cookie); 1767 continue; 1768 } 1769 } 1770 mutex_exit(ahm); 1771 root_pp = pp; 1772 1773 /* 1774 * try to lock remaining pages 1775 */ 1776 for (idx = 1; idx < pgcnt; idx++) { 1777 pp++; 1778 if (!page_trylock(pp, SE_EXCL)) 1779 break; 1780 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1781 page_unlock(pp); 1782 break; 1783 } 1784 } 1785 1786 if (idx == pgcnt) { 1787 for (i = 0; i < pgcnt; i++) { 1788 ap = anon_get_ptr(ahp, index + i); 1789 if (ap == NULL) 1790 break; 1791 swap_xlate(ap, &vp, &off); 1792 ahm = AH_MUTEX(vp, off); 1793 mutex_enter(ahm); 1794 ASSERT(ap->an_refcnt != 0); 1795 1796 /* 1797 * skip this one if copy-on-write 1798 * is not yet broken. 1799 */ 1800 if (ap->an_refcnt > 1) { 1801 mutex_exit(ahm); 1802 goto skiplp; 1803 } 1804 if (ap->an_pvp) { 1805 swap_phys_free(ap->an_pvp, 1806 ap->an_poff, PAGESIZE); 1807 ap->an_pvp = NULL; 1808 ap->an_poff = 0; 1809 } 1810 mutex_exit(ahm); 1811 } 1812 page_destroy_pages(root_pp); 1813 segadvstat.MADV_FREE_hit.value.ul += pgcnt; 1814 anon_array_exit(&cookie); 1815 continue; 1816 } 1817 skiplp: 1818 segadvstat.MADV_FREE_miss.value.ul += pgcnt; 1819 for (i = 0, pp = root_pp; i < idx; pp++, i++) 1820 page_unlock(pp); 1821 anon_array_exit(&cookie); 1822 } 1823 } 1824 1825 /* 1826 * Return the kept page(s) and protections back to the segment driver. 1827 */ 1828 int 1829 anon_getpage( 1830 struct anon **app, 1831 uint_t *protp, 1832 page_t *pl[], 1833 size_t plsz, 1834 struct seg *seg, 1835 caddr_t addr, 1836 enum seg_rw rw, 1837 struct cred *cred) 1838 { 1839 page_t *pp; 1840 struct anon *ap = *app; 1841 struct vnode *vp; 1842 anoff_t off; 1843 int err; 1844 kmutex_t *ahm; 1845 1846 swap_xlate(ap, &vp, &off); 1847 1848 /* 1849 * Lookup the page. If page is being paged in, 1850 * wait for it to finish as we must return a list of 1851 * pages since this routine acts like the VOP_GETPAGE 1852 * routine does. 1853 */ 1854 if (pl != NULL && (pp = page_lookup(vp, (u_offset_t)off, SE_SHARED))) { 1855 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1856 mutex_enter(ahm); 1857 if (ap->an_refcnt == 1) 1858 *protp = PROT_ALL; 1859 else 1860 *protp = PROT_ALL & ~PROT_WRITE; 1861 mutex_exit(ahm); 1862 pl[0] = pp; 1863 pl[1] = NULL; 1864 return (0); 1865 } 1866 1867 /* 1868 * Simply treat it as a vnode fault on the anon vp. 
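 * (The vnode named by the slot is a swapfs vnode, so the VOP_GETPAGE()
 * below is serviced by swapfs, which can read the page back in from
 * its an_pvp/an_poff swap location when it is not already in memory.)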
1869 */ 1870 1871 TRACE_3(TR_FAC_VM, TR_ANON_GETPAGE, 1872 "anon_getpage:seg %x addr %x vp %x", 1873 seg, addr, vp); 1874 1875 err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, protp, pl, plsz, 1876 seg, addr, rw, cred, NULL); 1877 1878 if (err == 0 && pl != NULL) { 1879 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1880 mutex_enter(ahm); 1881 if (ap->an_refcnt != 1) 1882 *protp &= ~PROT_WRITE; /* make read-only */ 1883 mutex_exit(ahm); 1884 } 1885 return (err); 1886 } 1887 1888 /* 1889 * Creates or returns kept pages to the segment driver. returns -1 if a large 1890 * page cannot be allocated. returns -2 if some other process has allocated a 1891 * larger page. 1892 * 1893 * For cowfault it will allocate any size pages to fill the requested area to 1894 * avoid partially overwriting anon slots (i.e. sharing only some of the anon 1895 * slots within a large page with other processes). This policy greatly 1896 * simplifies large page freeing (which is only freed when all anon slot 1897 * refcnts are 0). 1898 */ 1899 int 1900 anon_map_getpages( 1901 struct anon_map *amp, 1902 ulong_t start_idx, 1903 uint_t szc, 1904 struct seg *seg, 1905 caddr_t addr, 1906 uint_t prot, 1907 uint_t *protp, 1908 page_t *ppa[], 1909 uint_t *ppa_szc, 1910 struct vpage vpage[], 1911 enum seg_rw rw, 1912 int brkcow, 1913 int anypgsz, 1914 int pgflags, 1915 struct cred *cred) 1916 { 1917 pgcnt_t pgcnt; 1918 struct anon *ap; 1919 struct vnode *vp; 1920 anoff_t off; 1921 page_t *pp, *pl[2], *conpp = NULL; 1922 caddr_t vaddr; 1923 ulong_t pg_idx, an_idx, i; 1924 spgcnt_t nreloc = 0; 1925 int prealloc = 1; 1926 int err, slotcreate; 1927 uint_t vpprot; 1928 int upsize = (szc < seg->s_szc); 1929 1930 #if !defined(__i386) && !defined(__amd64) 1931 ASSERT(seg->s_szc != 0); 1932 #endif 1933 ASSERT(szc <= seg->s_szc); 1934 ASSERT(ppa_szc != NULL); 1935 ASSERT(rw != S_CREATE); 1936 1937 *protp = PROT_ALL; 1938 1939 VM_STAT_ADD(anonvmstats.getpages[0]); 1940 1941 if (szc == 0) { 1942 VM_STAT_ADD(anonvmstats.getpages[1]); 1943 if ((ap = anon_get_ptr(amp->ahp, start_idx)) != NULL) { 1944 err = anon_getpage(&ap, protp, pl, PAGESIZE, seg, 1945 addr, rw, cred); 1946 if (err) 1947 return (err); 1948 ppa[0] = pl[0]; 1949 if (brkcow == 0 || (*protp & PROT_WRITE)) { 1950 VM_STAT_ADD(anonvmstats.getpages[2]); 1951 if (ppa[0]->p_szc != 0 && upsize) { 1952 VM_STAT_ADD(anonvmstats.getpages[3]); 1953 *ppa_szc = MIN(ppa[0]->p_szc, 1954 seg->s_szc); 1955 page_unlock(ppa[0]); 1956 return (-2); 1957 } 1958 return (0); 1959 } 1960 panic("anon_map_getpages: cowfault for szc 0"); 1961 } else { 1962 VM_STAT_ADD(anonvmstats.getpages[4]); 1963 ppa[0] = anon_zero(seg, addr, &ap, cred); 1964 if (ppa[0] == NULL) 1965 return (ENOMEM); 1966 (void) anon_set_ptr(amp->ahp, start_idx, ap, 1967 ANON_SLEEP); 1968 return (0); 1969 } 1970 } 1971 1972 pgcnt = page_get_pagecnt(szc); 1973 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1974 ASSERT(IS_P2ALIGNED(start_idx, pgcnt)); 1975 1976 /* 1977 * First we check for the case that the requtested large 1978 * page or larger page already exists in the system. 1979 * Actually we only check if the first constituent page 1980 * exists and only preallocate if it's not found. 
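 * Concretely (see the code just below): page_exists_forreal() reports
 * the size code of any page already present at the first slot's
 * (vp, off).  If that size is at least szc, preallocation is skipped;
 * if it is larger than szc and the segment allows the bigger size
 * (upsize), -2 is returned with *ppa_szc set so the caller can retry
 * at the larger page size.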
1981 */ 1982 ap = anon_get_ptr(amp->ahp, start_idx); 1983 if (ap) { 1984 uint_t pszc; 1985 swap_xlate(ap, &vp, &off); 1986 if (page_exists_forreal(vp, (u_offset_t)off, &pszc)) { 1987 if (pszc > szc && upsize) { 1988 *ppa_szc = MIN(pszc, seg->s_szc); 1989 return (-2); 1990 } 1991 if (pszc >= szc) { 1992 prealloc = 0; 1993 } 1994 } 1995 } 1996 1997 VM_STAT_COND_ADD(prealloc == 0, anonvmstats.getpages[5]); 1998 VM_STAT_COND_ADD(prealloc != 0, anonvmstats.getpages[6]); 1999 2000 top: 2001 /* 2002 * If a smaller page or no page at all was found, 2003 * grab a large page off the freelist. 2004 */ 2005 if (prealloc) { 2006 ASSERT(conpp == NULL); 2007 if (page_alloc_pages(anon_vp, seg, addr, NULL, ppa, 2008 szc, 0, pgflags) != 0) { 2009 VM_STAT_ADD(anonvmstats.getpages[7]); 2010 if (brkcow == 0 || szc < seg->s_szc || 2011 !anon_szcshare(amp->ahp, start_idx)) { 2012 /* 2013 * If the refcnt's of all anon slots are <= 1 2014 * they can't increase since we are holding 2015 * the address space's lock. So segvn can 2016 * safely decrease szc without risking to 2017 * generate a cow fault for the region smaller 2018 * than the segment's largest page size. 2019 */ 2020 VM_STAT_ADD(anonvmstats.getpages[8]); 2021 return (-1); 2022 } 2023 docow: 2024 /* 2025 * This is a cow fault. Copy away the entire 1 large 2026 * page region of this segment. 2027 */ 2028 if (szc != seg->s_szc) 2029 panic("anon_map_getpages: cowfault for szc %d", 2030 szc); 2031 vaddr = addr; 2032 for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt; 2033 pg_idx++, an_idx++, vaddr += PAGESIZE) { 2034 if ((ap = anon_get_ptr(amp->ahp, an_idx)) != 2035 NULL) { 2036 err = anon_getpage(&ap, &vpprot, pl, 2037 PAGESIZE, seg, vaddr, rw, cred); 2038 if (err) { 2039 for (i = 0; i < pg_idx; i++) { 2040 if ((pp = ppa[i]) != 2041 NULL) 2042 page_unlock(pp); 2043 } 2044 return (err); 2045 } 2046 ppa[pg_idx] = pl[0]; 2047 } else { 2048 /* 2049 * Since this is a cowfault we know 2050 * that this address space has a 2051 * parent or children which means 2052 * anon_dup_fill_holes() has initialized 2053 * all anon slots within a large page 2054 * region that had at least one anon 2055 * slot at the time of fork(). 2056 */ 2057 panic("anon_map_getpages: " 2058 "cowfault but anon slot is empty"); 2059 } 2060 } 2061 VM_STAT_ADD(anonvmstats.getpages[9]); 2062 *protp = PROT_ALL; 2063 return (anon_map_privatepages(amp, start_idx, szc, seg, 2064 addr, prot, ppa, vpage, anypgsz, pgflags, cred)); 2065 } 2066 } 2067 2068 VM_STAT_ADD(anonvmstats.getpages[10]); 2069 2070 an_idx = start_idx; 2071 pg_idx = 0; 2072 vaddr = addr; 2073 while (pg_idx < pgcnt) { 2074 slotcreate = 0; 2075 if ((ap = anon_get_ptr(amp->ahp, an_idx)) == NULL) { 2076 VM_STAT_ADD(anonvmstats.getpages[11]); 2077 /* 2078 * For us to have decided not to preallocate 2079 * would have meant that a large page 2080 * was found. Which also means that all of the 2081 * anon slots for that page would have been 2082 * already created for us. 2083 */ 2084 if (prealloc == 0) 2085 panic("anon_map_getpages: prealloc = 0"); 2086 2087 slotcreate = 1; 2088 ap = anon_alloc(NULL, 0); 2089 } 2090 swap_xlate(ap, &vp, &off); 2091 2092 /* 2093 * Now setup our preallocated page to pass down 2094 * to swap_getpage(). 2095 */ 2096 if (prealloc) { 2097 ASSERT(ppa[pg_idx]->p_szc == szc); 2098 conpp = ppa[pg_idx]; 2099 } 2100 ASSERT(prealloc || conpp == NULL); 2101 2102 /* 2103 * If we just created this anon slot then call 2104 * with S_CREATE to prevent doing IO on the page. 2105 * Similar to the anon_zero case. 
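 * (With S_CREATE the swap layer simply adopts the preallocated
 * constituent page rather than reading it in from swap; the zero-fill
 * of such pages happens further down, under the slotcreate check.)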
2106 */ 2107 err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE, 2108 NULL, pl, PAGESIZE, conpp, ppa_szc, &nreloc, seg, vaddr, 2109 slotcreate == 1 ? S_CREATE : rw, cred); 2110 2111 if (err) { 2112 ASSERT(err != -2 || upsize); 2113 VM_STAT_ADD(anonvmstats.getpages[12]); 2114 ASSERT(slotcreate == 0); 2115 goto io_err; 2116 } 2117 2118 pp = pl[0]; 2119 2120 if (pp->p_szc < szc || (pp->p_szc > szc && upsize)) { 2121 VM_STAT_ADD(anonvmstats.getpages[13]); 2122 ASSERT(slotcreate == 0); 2123 ASSERT(prealloc == 0); 2124 ASSERT(pg_idx == 0); 2125 if (pp->p_szc > szc) { 2126 ASSERT(upsize); 2127 *ppa_szc = MIN(pp->p_szc, seg->s_szc); 2128 page_unlock(pp); 2129 VM_STAT_ADD(anonvmstats.getpages[14]); 2130 return (-2); 2131 } 2132 page_unlock(pp); 2133 prealloc = 1; 2134 goto top; 2135 } 2136 2137 /* 2138 * If we decided to preallocate but VOP_GETPAGE 2139 * found a page in the system that satisfies our 2140 * request then free up our preallocated large page 2141 * and continue looping accross the existing large 2142 * page via VOP_GETPAGE. 2143 */ 2144 if (prealloc && pp != ppa[pg_idx]) { 2145 VM_STAT_ADD(anonvmstats.getpages[15]); 2146 ASSERT(slotcreate == 0); 2147 ASSERT(pg_idx == 0); 2148 conpp = NULL; 2149 prealloc = 0; 2150 page_free_pages(ppa[0]); 2151 } 2152 2153 if (prealloc && nreloc > 1) { 2154 /* 2155 * we have relocated out of a smaller large page. 2156 * skip npgs - 1 iterations and continue which will 2157 * increment by one the loop indices. 2158 */ 2159 spgcnt_t npgs = nreloc; 2160 2161 VM_STAT_ADD(anonvmstats.getpages[16]); 2162 2163 ASSERT(pp == ppa[pg_idx]); 2164 ASSERT(slotcreate == 0); 2165 ASSERT(pg_idx + npgs <= pgcnt); 2166 if ((*protp & PROT_WRITE) && 2167 anon_share(amp->ahp, an_idx, npgs)) { 2168 *protp &= ~PROT_WRITE; 2169 } 2170 pg_idx += npgs; 2171 an_idx += npgs; 2172 vaddr += PAGESIZE * npgs; 2173 continue; 2174 } 2175 2176 VM_STAT_ADD(anonvmstats.getpages[17]); 2177 2178 /* 2179 * Anon_zero case. 2180 */ 2181 if (slotcreate) { 2182 ASSERT(prealloc); 2183 pagezero(pp, 0, PAGESIZE); 2184 CPU_STATS_ADD_K(vm, zfod, 1); 2185 hat_setrefmod(pp); 2186 } 2187 2188 ASSERT(prealloc == 0 || ppa[pg_idx] == pp); 2189 ASSERT(prealloc != 0 || PAGE_SHARED(pp)); 2190 ASSERT(prealloc == 0 || PAGE_EXCL(pp)); 2191 2192 if (pg_idx > 0 && 2193 ((page_pptonum(pp) != page_pptonum(ppa[pg_idx - 1]) + 1) || 2194 (pp->p_szc != ppa[pg_idx - 1]->p_szc))) { 2195 panic("anon_map_getpages: unexpected page"); 2196 } else if (pg_idx == 0 && (page_pptonum(pp) & (pgcnt - 1))) { 2197 panic("anon_map_getpages: unaligned page"); 2198 } 2199 2200 if (prealloc == 0) { 2201 ppa[pg_idx] = pp; 2202 } 2203 2204 if (ap->an_refcnt > 1) { 2205 VM_STAT_ADD(anonvmstats.getpages[18]); 2206 *protp &= ~PROT_WRITE; 2207 } 2208 2209 /* 2210 * If this is a new anon slot then initialize 2211 * the anon array entry. 2212 */ 2213 if (slotcreate) { 2214 (void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP); 2215 } 2216 pg_idx++; 2217 an_idx++; 2218 vaddr += PAGESIZE; 2219 } 2220 2221 /* 2222 * Since preallocated pages come off the freelist 2223 * they are locked SE_EXCL. Simply downgrade and return. 
2224 */ 2225 if (prealloc) { 2226 VM_STAT_ADD(anonvmstats.getpages[19]); 2227 conpp = NULL; 2228 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { 2229 page_downgrade(ppa[pg_idx]); 2230 } 2231 } 2232 ASSERT(conpp == NULL); 2233 2234 if (brkcow == 0 || (*protp & PROT_WRITE)) { 2235 VM_STAT_ADD(anonvmstats.getpages[20]); 2236 return (0); 2237 } 2238 2239 if (szc < seg->s_szc) 2240 panic("anon_map_getpages: cowfault for szc %d", szc); 2241 2242 VM_STAT_ADD(anonvmstats.getpages[21]); 2243 2244 *protp = PROT_ALL; 2245 return (anon_map_privatepages(amp, start_idx, szc, seg, addr, prot, 2246 ppa, vpage, anypgsz, pgflags, cred)); 2247 io_err: 2248 /* 2249 * We got an IO error somewhere in our large page. 2250 * If we were using a preallocated page then just demote 2251 * all the constituent pages that we've succeeded with sofar 2252 * to PAGESIZE pages and leave them in the system 2253 * unlocked. 2254 */ 2255 2256 ASSERT(err != -2 || ((pg_idx == 0) && upsize)); 2257 2258 VM_STAT_COND_ADD(err > 0, anonvmstats.getpages[22]); 2259 VM_STAT_COND_ADD(err == -1, anonvmstats.getpages[23]); 2260 VM_STAT_COND_ADD(err == -2, anonvmstats.getpages[24]); 2261 2262 if (prealloc) { 2263 conpp = NULL; 2264 if (pg_idx > 0) { 2265 VM_STAT_ADD(anonvmstats.getpages[25]); 2266 for (i = 0; i < pgcnt; i++) { 2267 pp = ppa[i]; 2268 ASSERT(PAGE_EXCL(pp)); 2269 ASSERT(pp->p_szc == szc); 2270 pp->p_szc = 0; 2271 } 2272 for (i = 0; i < pg_idx; i++) { 2273 ASSERT(!hat_page_is_mapped(ppa[i])); 2274 page_unlock(ppa[i]); 2275 } 2276 /* 2277 * Now free up the remaining unused constituent 2278 * pages. 2279 */ 2280 while (pg_idx < pgcnt) { 2281 ASSERT(!hat_page_is_mapped(ppa[pg_idx])); 2282 page_free(ppa[pg_idx], 0); 2283 pg_idx++; 2284 } 2285 } else { 2286 VM_STAT_ADD(anonvmstats.getpages[26]); 2287 page_free_pages(ppa[0]); 2288 } 2289 } else { 2290 VM_STAT_ADD(anonvmstats.getpages[27]); 2291 ASSERT(err > 0); 2292 for (i = 0; i < pg_idx; i++) 2293 page_unlock(ppa[i]); 2294 } 2295 ASSERT(conpp == NULL); 2296 if (err != -1) 2297 return (err); 2298 /* 2299 * we are here because we failed to relocate. 2300 */ 2301 ASSERT(prealloc); 2302 if (brkcow == 0 || szc < seg->s_szc || 2303 !anon_szcshare(amp->ahp, start_idx)) { 2304 VM_STAT_ADD(anonvmstats.getpages[28]); 2305 return (-1); 2306 } 2307 VM_STAT_ADD(anonvmstats.getpages[29]); 2308 goto docow; 2309 } 2310 2311 2312 /* 2313 * Turn a reference to an object or shared anon page 2314 * into a private page with a copy of the data from the 2315 * original page which is always locked by the caller. 2316 * This routine unloads the translation and unlocks the 2317 * original page, if it isn't being stolen, before returning 2318 * to the caller. 2319 * 2320 * NOTE: The original anon slot is not freed by this routine 2321 * It must be freed by the caller while holding the 2322 * "anon_map" lock to prevent races which can occur if 2323 * a process has multiple lwps in its address space. 
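 *
 * A minimal usage sketch (hypothetical caller; segvn's copy-on-write
 * fault path is the real consumer, error handling is omitted, opp is
 * the locked original page, and the "anon_map" lock is assumed to be
 * held as the NOTE above requires):
 *
 *	struct anon *oldap, *ap;
 *	page_t *pp;
 *
 *	oldap = ap = anon_get_ptr(amp->ahp, idx);
 *	pp = anon_private(&ap, seg, addr, prot, opp, 0, cred);
 *	if (pp == NULL)
 *		return (ENOMEM);
 *	if (oldap != NULL)
 *		anon_decref(oldap);
 *	(void) anon_set_ptr(amp->ahp, idx, ap, ANON_SLEEP);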
2324 */ 2325 page_t * 2326 anon_private( 2327 struct anon **app, 2328 struct seg *seg, 2329 caddr_t addr, 2330 uint_t prot, 2331 page_t *opp, 2332 int oppflags, 2333 struct cred *cred) 2334 { 2335 struct anon *old = *app; 2336 struct anon *new; 2337 page_t *pp = NULL; 2338 struct vnode *vp; 2339 anoff_t off; 2340 page_t *anon_pl[1 + 1]; 2341 int err; 2342 2343 if (oppflags & STEAL_PAGE) 2344 ASSERT(PAGE_EXCL(opp)); 2345 else 2346 ASSERT(PAGE_LOCKED(opp)); 2347 2348 CPU_STATS_ADD_K(vm, cow_fault, 1); 2349 2350 /* Kernel probe */ 2351 TNF_PROBE_1(anon_private, "vm pagefault", /* CSTYLED */, 2352 tnf_opaque, address, addr); 2353 2354 *app = new = anon_alloc(NULL, 0); 2355 swap_xlate(new, &vp, &off); 2356 2357 if (oppflags & STEAL_PAGE) { 2358 page_rename(opp, vp, (u_offset_t)off); 2359 pp = opp; 2360 TRACE_5(TR_FAC_VM, TR_ANON_PRIVATE, 2361 "anon_private:seg %p addr %x pp %p vp %p off %lx", 2362 seg, addr, pp, vp, off); 2363 hat_setmod(pp); 2364 2365 /* bug 4026339 */ 2366 page_downgrade(pp); 2367 return (pp); 2368 } 2369 2370 /* 2371 * Call the VOP_GETPAGE routine to create the page, thereby 2372 * enabling the vnode driver to allocate any filesystem 2373 * space (e.g., disk block allocation for UFS). This also 2374 * prevents more than one page from being added to the 2375 * vnode at the same time. 2376 */ 2377 err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, NULL, 2378 anon_pl, PAGESIZE, seg, addr, S_CREATE, cred, NULL); 2379 if (err) 2380 goto out; 2381 2382 pp = anon_pl[0]; 2383 2384 /* 2385 * If the original page was locked, we need to move the lock 2386 * to the new page by transfering 'cowcnt/lckcnt' of the original 2387 * page to 'cowcnt/lckcnt' of the new page. 2388 * 2389 * See Statement at the beginning of segvn_lockop() and 2390 * comments in page_pp_useclaim() regarding the way 2391 * cowcnts/lckcnts are handled. 2392 * 2393 * Also availrmem must be decremented up front for read only mapping 2394 * before calling page_pp_useclaim. page_pp_useclaim will bump it back 2395 * if availrmem did not need to be decremented after all. 2396 */ 2397 if (oppflags & LOCK_PAGE) { 2398 if ((prot & PROT_WRITE) == 0) { 2399 mutex_enter(&freemem_lock); 2400 if (availrmem > pages_pp_maximum) { 2401 availrmem--; 2402 pages_useclaim++; 2403 } else { 2404 mutex_exit(&freemem_lock); 2405 goto out; 2406 } 2407 mutex_exit(&freemem_lock); 2408 } 2409 page_pp_useclaim(opp, pp, prot & PROT_WRITE); 2410 } 2411 2412 /* 2413 * Now copy the contents from the original page, 2414 * which is locked and loaded in the MMU by 2415 * the caller to prevent yet another page fault. 2416 */ 2417 /* XXX - should set mod bit in here */ 2418 if (ppcopy(opp, pp) == 0) { 2419 /* 2420 * Before ppcopy could hanlde UE or other faults, we 2421 * would have panicked here, and still have no option 2422 * but to do so now. 2423 */ 2424 panic("anon_private, ppcopy failed, opp = 0x%p, pp = 0x%p", 2425 (void *)opp, (void *)pp); 2426 } 2427 2428 hat_setrefmod(pp); /* mark as modified */ 2429 2430 /* 2431 * Unload the old translation. 2432 */ 2433 hat_unload(seg->s_as->a_hat, addr, PAGESIZE, HAT_UNLOAD); 2434 2435 /* 2436 * Free unmapped, unmodified original page. 2437 * or release the lock on the original page, 2438 * otherwise the process will sleep forever in 2439 * anon_decref() waiting for the "exclusive" lock 2440 * on the page. 2441 */ 2442 (void) page_release(opp, 1); 2443 2444 /* 2445 * we are done with page creation so downgrade the new 2446 * page's selock to shared, this helps when multiple 2447 * as_fault(...SOFTLOCK...) 
are done to the same 2448 * page(aio) 2449 */ 2450 page_downgrade(pp); 2451 2452 /* 2453 * NOTE: The original anon slot must be freed by the 2454 * caller while holding the "anon_map" lock, if we 2455 * copied away from an anonymous page. 2456 */ 2457 return (pp); 2458 2459 out: 2460 *app = old; 2461 if (pp) 2462 page_unlock(pp); 2463 anon_decref(new); 2464 page_unlock(opp); 2465 return ((page_t *)NULL); 2466 } 2467 2468 int 2469 anon_map_privatepages( 2470 struct anon_map *amp, 2471 ulong_t start_idx, 2472 uint_t szc, 2473 struct seg *seg, 2474 caddr_t addr, 2475 uint_t prot, 2476 page_t *ppa[], 2477 struct vpage vpage[], 2478 int anypgsz, 2479 int pgflags, 2480 struct cred *cred) 2481 { 2482 pgcnt_t pgcnt; 2483 struct vnode *vp; 2484 anoff_t off; 2485 page_t *pl[2], *conpp = NULL; 2486 int err; 2487 int prealloc = 1; 2488 struct anon *ap, *oldap; 2489 caddr_t vaddr; 2490 page_t *pplist, *pp; 2491 ulong_t pg_idx, an_idx; 2492 spgcnt_t nreloc = 0; 2493 int pagelock = 0; 2494 kmutex_t *ahmpages = NULL; 2495 #ifdef DEBUG 2496 int refcnt; 2497 #endif 2498 2499 ASSERT(szc != 0); 2500 ASSERT(szc == seg->s_szc); 2501 2502 VM_STAT_ADD(anonvmstats.privatepages[0]); 2503 2504 pgcnt = page_get_pagecnt(szc); 2505 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 2506 ASSERT(IS_P2ALIGNED(start_idx, pgcnt)); 2507 2508 ASSERT(amp != NULL); 2509 ap = anon_get_ptr(amp->ahp, start_idx); 2510 ASSERT(ap == NULL || ap->an_refcnt >= 1); 2511 2512 VM_STAT_COND_ADD(ap == NULL, anonvmstats.privatepages[1]); 2513 2514 /* 2515 * Now try and allocate the large page. If we fail then just 2516 * let VOP_GETPAGE give us PAGESIZE pages. Normally we let 2517 * the caller make this decision but to avoid added complexity 2518 * it's simplier to handle that case here. 2519 */ 2520 if (anypgsz == -1) { 2521 VM_STAT_ADD(anonvmstats.privatepages[2]); 2522 prealloc = 0; 2523 } else if (page_alloc_pages(anon_vp, seg, addr, &pplist, NULL, szc, 2524 anypgsz, pgflags) != 0) { 2525 VM_STAT_ADD(anonvmstats.privatepages[3]); 2526 prealloc = 0; 2527 } 2528 2529 /* 2530 * make the decrement of all refcnts of all 2531 * anon slots of a large page appear atomic by 2532 * getting an anonpages_hash_lock for the 2533 * first anon slot of a large page. 2534 */ 2535 if (ap != NULL) { 2536 ahmpages = APH_MUTEX(ap->an_vp, ap->an_off); 2537 mutex_enter(ahmpages); 2538 if (ap->an_refcnt == 1) { 2539 VM_STAT_ADD(anonvmstats.privatepages[4]); 2540 ASSERT(!anon_share(amp->ahp, start_idx, pgcnt)); 2541 mutex_exit(ahmpages); 2542 2543 if (prealloc) { 2544 page_free_replacement_page(pplist); 2545 page_create_putback(pgcnt); 2546 } 2547 ASSERT(ppa[0]->p_szc <= szc); 2548 if (ppa[0]->p_szc == szc) { 2549 VM_STAT_ADD(anonvmstats.privatepages[5]); 2550 return (0); 2551 } 2552 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { 2553 ASSERT(ppa[pg_idx] != NULL); 2554 page_unlock(ppa[pg_idx]); 2555 } 2556 return (-1); 2557 } 2558 } 2559 2560 /* 2561 * If we are passed in the vpage array and this is 2562 * not PROT_WRITE then we need to decrement availrmem 2563 * up front before we try anything. If we need to and 2564 * can't decrement availrmem then its better to fail now 2565 * than in the middle of processing the new large page. 2566 * page_pp_usclaim() on behalf of each constituent page 2567 * below will adjust availrmem back for the cases not needed. 
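 *
 * For example (hypothetical numbers): with an 8-page large page
 * (pgcnt == 8), no PROT_WRITE, and at least one PPLOCK'd vpage,
 * availrmem must be at least pages_pp_maximum + 8 before the copy
 * starts; otherwise ENOMEM is returned before any constituent page
 * has been touched.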
2568 */ 2569 if (vpage != NULL && (prot & PROT_WRITE) == 0) { 2570 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { 2571 if (VPP_ISPPLOCK(&vpage[pg_idx])) { 2572 pagelock = 1; 2573 break; 2574 } 2575 } 2576 if (pagelock) { 2577 VM_STAT_ADD(anonvmstats.privatepages[6]); 2578 mutex_enter(&freemem_lock); 2579 if (availrmem >= pages_pp_maximum + pgcnt) { 2580 availrmem -= pgcnt; 2581 pages_useclaim += pgcnt; 2582 } else { 2583 VM_STAT_ADD(anonvmstats.privatepages[7]); 2584 mutex_exit(&freemem_lock); 2585 if (ahmpages != NULL) { 2586 mutex_exit(ahmpages); 2587 } 2588 if (prealloc) { 2589 page_free_replacement_page(pplist); 2590 page_create_putback(pgcnt); 2591 } 2592 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) 2593 if (ppa[pg_idx] != NULL) 2594 page_unlock(ppa[pg_idx]); 2595 return (ENOMEM); 2596 } 2597 mutex_exit(&freemem_lock); 2598 } 2599 } 2600 2601 CPU_STATS_ADD_K(vm, cow_fault, pgcnt); 2602 2603 VM_STAT_ADD(anonvmstats.privatepages[8]); 2604 2605 an_idx = start_idx; 2606 pg_idx = 0; 2607 vaddr = addr; 2608 for (; pg_idx < pgcnt; pg_idx++, an_idx++, vaddr += PAGESIZE) { 2609 ASSERT(ppa[pg_idx] != NULL); 2610 oldap = anon_get_ptr(amp->ahp, an_idx); 2611 ASSERT(ahmpages != NULL || oldap == NULL); 2612 ASSERT(ahmpages == NULL || oldap != NULL); 2613 ASSERT(ahmpages == NULL || oldap->an_refcnt > 1); 2614 ASSERT(ahmpages == NULL || pg_idx != 0 || 2615 (refcnt = oldap->an_refcnt)); 2616 ASSERT(ahmpages == NULL || pg_idx == 0 || 2617 refcnt == oldap->an_refcnt); 2618 2619 ap = anon_alloc(NULL, 0); 2620 2621 swap_xlate(ap, &vp, &off); 2622 2623 /* 2624 * Now setup our preallocated page to pass down to 2625 * swap_getpage(). 2626 */ 2627 if (prealloc) { 2628 pp = pplist; 2629 page_sub(&pplist, pp); 2630 conpp = pp; 2631 } 2632 2633 err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE, NULL, pl, 2634 PAGESIZE, conpp, NULL, &nreloc, seg, vaddr, 2635 S_CREATE, cred); 2636 2637 /* 2638 * Impossible to fail this is S_CREATE. 2639 */ 2640 if (err) 2641 panic("anon_map_privatepages: VOP_GETPAGE failed"); 2642 2643 ASSERT(prealloc ? pp == pl[0] : pl[0]->p_szc == 0); 2644 ASSERT(prealloc == 0 || nreloc == 1); 2645 2646 pp = pl[0]; 2647 2648 /* 2649 * If the original page was locked, we need to move 2650 * the lock to the new page by transfering 2651 * 'cowcnt/lckcnt' of the original page to 'cowcnt/lckcnt' 2652 * of the new page. pg_idx can be used to index 2653 * into the vpage array since the caller will guarentee 2654 * that vpage struct passed in corresponds to addr 2655 * and forward. 2656 */ 2657 if (vpage != NULL && VPP_ISPPLOCK(&vpage[pg_idx])) { 2658 page_pp_useclaim(ppa[pg_idx], pp, prot & PROT_WRITE); 2659 } else if (pagelock) { 2660 mutex_enter(&freemem_lock); 2661 availrmem++; 2662 pages_useclaim--; 2663 mutex_exit(&freemem_lock); 2664 } 2665 2666 /* 2667 * Now copy the contents from the original page. 2668 */ 2669 if (ppcopy(ppa[pg_idx], pp) == 0) { 2670 /* 2671 * Before ppcopy could hanlde UE or other faults, we 2672 * would have panicked here, and still have no option 2673 * but to do so now. 2674 */ 2675 panic("anon_map_privatepages, ppcopy failed"); 2676 } 2677 2678 hat_setrefmod(pp); /* mark as modified */ 2679 2680 /* 2681 * Release the lock on the original page, 2682 * derement the old slot, and down grade the lock 2683 * on the new copy. 2684 */ 2685 page_unlock(ppa[pg_idx]); 2686 2687 if (!prealloc) 2688 page_downgrade(pp); 2689 2690 ppa[pg_idx] = pp; 2691 2692 /* 2693 * Now reflect the copy in the new anon array. 
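 *
 * The old anon slot (if any) gives up its reference only now that the
 * copy has succeeded, and the freshly allocated anon is installed in
 * its place in the array.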
2694 */
2695 ASSERT(ahmpages == NULL || oldap->an_refcnt > 1);
2696 if (oldap != NULL)
2697 anon_decref(oldap);
2698 (void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP);
2699 }
2700
2701 /*
2702 * Unload the old large page translation.
2703 */
2704 hat_unload(seg->s_as->a_hat, addr, pgcnt << PAGESHIFT, HAT_UNLOAD);
2705
2706 if (ahmpages != NULL) {
2707 mutex_exit(ahmpages);
2708 }
2709 ASSERT(prealloc == 0 || pplist == NULL);
2710 if (prealloc) {
2711 VM_STAT_ADD(anonvmstats.privatepages[9]);
2712 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
2713 page_downgrade(ppa[pg_idx]);
2714 }
2715 }
2716
2717 return (0);
2718 }
2719
2720 /*
2721 * Allocate a private zero-filled anon page.
2722 */
2723 page_t *
2724 anon_zero(struct seg *seg, caddr_t addr, struct anon **app, struct cred *cred)
2725 {
2726 struct anon *ap;
2727 page_t *pp;
2728 struct vnode *vp;
2729 anoff_t off;
2730 page_t *anon_pl[1 + 1];
2731 int err;
2732
2733 /* Kernel probe */
2734 TNF_PROBE_1(anon_zero, "vm pagefault", /* CSTYLED */,
2735 tnf_opaque, address, addr);
2736
2737 *app = ap = anon_alloc(NULL, 0);
2738 swap_xlate(ap, &vp, &off);
2739
2740 /*
2741 * Call the VOP_GETPAGE routine to create the page, thereby
2742 * enabling the vnode driver to allocate any filesystem
2743 * dependent structures (e.g., disk block allocation for UFS).
2744 * This also prevents more than one page from being added to
2745 * the vnode at the same time since it is locked.
2746 */
2747 err = VOP_GETPAGE(vp, off, PAGESIZE, NULL,
2748 anon_pl, PAGESIZE, seg, addr, S_CREATE, cred, NULL);
2749 if (err) {
2750 *app = NULL;
2751 anon_decref(ap);
2752 return (NULL);
2753 }
2754 pp = anon_pl[0];
2755
2756 pagezero(pp, 0, PAGESIZE); /* XXX - should set mod bit */
2757 page_downgrade(pp);
2758 CPU_STATS_ADD_K(vm, zfod, 1);
2759 hat_setrefmod(pp); /* mark as modified so pageout writes back */
2760 return (pp);
2761 }
2762
2763
2764 /*
2765 * Allocate an array of private zero-filled anon pages for empty slots
2766 * and kept pages for non-empty slots within the given range.
2767 *
2768 * NOTE: This routine will try to use large pages
2769 * if available and supported by the underlying platform.
2770 */
2771 int
2772 anon_map_createpages(
2773 struct anon_map *amp,
2774 ulong_t start_index,
2775 size_t len,
2776 page_t *ppa[],
2777 struct seg *seg,
2778 caddr_t addr,
2779 enum seg_rw rw,
2780 struct cred *cred)
2781 {
2782
2783 struct anon *ap;
2784 struct vnode *ap_vp;
2785 page_t *pp, *pplist, *anon_pl[1 + 1], *conpp = NULL;
2786 int err = 0;
2787 ulong_t p_index, index;
2788 pgcnt_t npgs, pg_cnt;
2789 spgcnt_t nreloc = 0;
2790 uint_t l_szc, szc, prot;
2791 anoff_t ap_off;
2792 size_t pgsz;
2793 lgrp_t *lgrp;
2794 kmutex_t *ahm;
2795
2796 /*
2797 * XXX For now only handle S_CREATE.
2798 */
2799 ASSERT(rw == S_CREATE);
2800
2801 index = start_index;
2802 p_index = 0;
2803 npgs = btopr(len);
2804
2805 /*
2806 * If this platform supports multiple page sizes
2807 * then try to allocate directly from the free
2808 * list for pages larger than PAGESIZE.
2809 *
2810 * NOTE: When we have page_create_ru we can stop
2811 * directly allocating from the freelist.
2812 */
2813 l_szc = seg->s_szc;
2814 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
2815 while (npgs) {
2816
2817 /*
2818 * If the anon slot already exists
2819 * (meaning the page has been created), then:
2820 * 1) look up the page;
2821 * 2) if the page is still in memory, get it;
2822 * 3) if not, create a page and
2823 * page it in from the physical swap device.
2824 * These are done in anon_getpage().
2825 */
2826 ap = anon_get_ptr(amp->ahp, index);
2827 if (ap) {
2828 err = anon_getpage(&ap, &prot, anon_pl, PAGESIZE,
2829 seg, addr, S_READ, cred);
2830 if (err) {
2831 ANON_LOCK_EXIT(&amp->a_rwlock);
2832 panic("anon_map_createpages: anon_getpage");
2833 }
2834 pp = anon_pl[0];
2835 ppa[p_index++] = pp;
2836
2837 /*
2838 * an_pvp can become non-NULL after SysV's page was
2839 * paged out before ISM was attached to this SysV
2840 * shared memory segment. So free the swap slot if needed.
2841 */
2842 if (ap->an_pvp != NULL) {
2843 page_io_lock(pp);
2844 ahm = AH_MUTEX(ap->an_vp, ap->an_off);
2845 mutex_enter(ahm);
2846 if (ap->an_pvp != NULL) {
2847 swap_phys_free(ap->an_pvp,
2848 ap->an_poff, PAGESIZE);
2849 ap->an_pvp = NULL;
2850 ap->an_poff = 0;
2851 mutex_exit(ahm);
2852 hat_setmod(pp);
2853 } else {
2854 mutex_exit(ahm);
2855 }
2856 page_io_unlock(pp);
2857 }
2858
2859 addr += PAGESIZE;
2860 index++;
2861 npgs--;
2862 continue;
2863 }
2864 /*
2865 * Now try to allocate the largest page possible
2866 * for the current address and range.
2867 * Keep dropping down in page size until:
2868 *
2869 * 1) it is properly aligned;
2870 * 2) it does not overlap existing anon pages;
2871 * 3) it fits in the remaining range; and
2872 * 4) we are able to allocate one.
2873 *
2874 * NOTE: XXX When page_create_ru is completed this code
2875 * will change.
2876 */
2877 szc = l_szc;
2878 pplist = NULL;
2879 pg_cnt = 0;
2880 while (szc) {
2881 pgsz = page_get_pagesize(szc);
2882 pg_cnt = pgsz >> PAGESHIFT;
2883 if (IS_P2ALIGNED(addr, pgsz) && pg_cnt <= npgs &&
2884 anon_pages(amp->ahp, index, pg_cnt) == 0) {
2885 /*
2886 * XXX
2887 * Since we are faking page_create()
2888 * we also need to do the freemem and
2889 * pcf accounting.
2890 */
2891 (void) page_create_wait(pg_cnt, PG_WAIT);
2892
2893 /*
2894 * Get the lgroup to allocate the next page of
2895 * shared memory from and use it to specify where
2896 * to allocate the physical memory.
2897 */
2898 lgrp = lgrp_mem_choose(seg, addr, pgsz);
2899
2900 pplist = page_get_freelist(
2901 anon_vp, (u_offset_t)0, seg,
2902 addr, pgsz, 0, lgrp);
2903
2904 if (pplist == NULL) {
2905 page_create_putback(pg_cnt);
2906 }
2907
2908 /*
2909 * If a request for a page of size
2910 * larger than PAGESIZE failed
2911 * then don't try that size anymore.
2912 */
2913 if (pplist == NULL) {
2914 l_szc = szc - 1;
2915 } else {
2916 break;
2917 }
2918 }
2919 szc--;
2920 }
2921
2922 /*
2923 * If just using PAGESIZE pages then don't
2924 * directly allocate from the free list.
2925 */
2926 if (pplist == NULL) {
2927 ASSERT(szc == 0);
2928 pp = anon_zero(seg, addr, &ap, cred);
2929 if (pp == NULL) {
2930 ANON_LOCK_EXIT(&amp->a_rwlock);
2931 panic("anon_map_createpages: anon_zero");
2932 }
2933 ppa[p_index++] = pp;
2934
2935 ASSERT(anon_get_ptr(amp->ahp, index) == NULL);
2936 (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP);
2937
2938 addr += PAGESIZE;
2939 index++;
2940 npgs--;
2941 continue;
2942 }
2943
2944 /*
2945 * pplist is a list of pg_cnt PAGESIZE pages.
2946 * These pages are locked SE_EXCL since they
2947 * came directly off the free list.
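 *
 * Each pass of the loop below pulls one page off pplist with
 * page_sub(), clears its free/aged state, hands it to
 * swap_getconpage() as the constituent page (conpp) for a freshly
 * allocated anon slot, zeroes it, and installs the slot in the anon
 * array.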
2948 */
2949 ASSERT(IS_P2ALIGNED(pg_cnt, pg_cnt));
2950 ASSERT(IS_P2ALIGNED(index, pg_cnt));
2951 ASSERT(conpp == NULL);
2952 while (pg_cnt--) {
2953
2954 ap = anon_alloc(NULL, 0);
2955 swap_xlate(ap, &ap_vp, &ap_off);
2956
2957 ASSERT(pplist != NULL);
2958 pp = pplist;
2959 page_sub(&pplist, pp);
2960 PP_CLRFREE(pp);
2961 PP_CLRAGED(pp);
2962 conpp = pp;
2963
2964 err = swap_getconpage(ap_vp, ap_off, PAGESIZE,
2965 (uint_t *)NULL, anon_pl, PAGESIZE, conpp, NULL,
2966 &nreloc, seg, addr, S_CREATE, cred);
2967
2968 if (err) {
2969 ANON_LOCK_EXIT(&amp->a_rwlock);
2970 panic("anon_map_createpages: S_CREATE");
2971 }
2972
2973 ASSERT(anon_pl[0] == pp);
2974 ASSERT(nreloc == 1);
2975 pagezero(pp, 0, PAGESIZE);
2976 CPU_STATS_ADD_K(vm, zfod, 1);
2977 hat_setrefmod(pp);
2978
2979 ASSERT(anon_get_ptr(amp->ahp, index) == NULL);
2980 (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP);
2981
2982 ppa[p_index++] = pp;
2983
2984 addr += PAGESIZE;
2985 index++;
2986 npgs--;
2987 }
2988 conpp = NULL;
2989 pg_cnt = pgsz >> PAGESHIFT;
2990 p_index = p_index - pg_cnt;
2991 while (pg_cnt--) {
2992 page_downgrade(ppa[p_index++]);
2993 }
2994 }
2995 ANON_LOCK_EXIT(&amp->a_rwlock);
2996 return (0);
2997 }
2998
2999 static int
3000 anon_try_demote_pages(
3001 struct anon_hdr *ahp,
3002 ulong_t sidx,
3003 uint_t szc,
3004 page_t **ppa,
3005 int private)
3006 {
3007 struct anon *ap;
3008 pgcnt_t pgcnt = page_get_pagecnt(szc);
3009 page_t *pp;
3010 pgcnt_t i;
3011 kmutex_t *ahmpages = NULL;
3012 int root = 0;
3013 pgcnt_t npgs;
3014 pgcnt_t curnpgs = 0;
3015 size_t ppasize = 0;
3016
3017 ASSERT(szc != 0);
3018 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
3019 ASSERT(IS_P2ALIGNED(sidx, pgcnt));
3020 ASSERT(sidx < ahp->size);
3021
3022 if (ppa == NULL) {
3023 ppasize = pgcnt * sizeof (page_t *);
3024 ppa = kmem_alloc(ppasize, KM_SLEEP);
3025 }
3026
3027 ap = anon_get_ptr(ahp, sidx);
3028 if (ap != NULL && private) {
3029 VM_STAT_ADD(anonvmstats.demotepages[1]);
3030 ahmpages = APH_MUTEX(ap->an_vp, ap->an_off);
3031 mutex_enter(ahmpages);
3032 }
3033
3034 if (ap != NULL && ap->an_refcnt > 1) {
3035 if (ahmpages != NULL) {
3036 VM_STAT_ADD(anonvmstats.demotepages[2]);
3037 mutex_exit(ahmpages);
3038 }
3039 if (ppasize != 0) {
3040 kmem_free(ppa, ppasize);
3041 }
3042 return (0);
3043 }
3044 if (ahmpages != NULL) {
3045 mutex_exit(ahmpages);
3046 }
3047 if (ahp->size - sidx < pgcnt) {
3048 ASSERT(private == 0);
3049 pgcnt = ahp->size - sidx;
3050 }
3051 for (i = 0; i < pgcnt; i++, sidx++) {
3052 ap = anon_get_ptr(ahp, sidx);
3053 if (ap != NULL) {
3054 if (ap->an_refcnt != 1) {
3055 panic("anon_try_demote_pages: an_refcnt != 1");
3056 }
3057 pp = ppa[i] = page_lookup(ap->an_vp, ap->an_off,
3058 SE_EXCL);
3059 if (pp != NULL) {
3060 (void) hat_pageunload(pp,
3061 HAT_FORCE_PGUNLOAD);
3062 }
3063 } else {
3064 ppa[i] = NULL;
3065 }
3066 }
3067 for (i = 0; i < pgcnt; i++) {
3068 if ((pp = ppa[i]) != NULL && pp->p_szc != 0) {
3069 ASSERT(pp->p_szc <= szc);
3070 if (!root) {
3071 VM_STAT_ADD(anonvmstats.demotepages[3]);
3072 if (curnpgs != 0)
3073 panic("anon_try_demote_pages: "
3074 "bad large page");
3075
3076 root = 1;
3077 curnpgs = npgs =
3078 page_get_pagecnt(pp->p_szc);
3079
3080 ASSERT(npgs <= pgcnt);
3081 ASSERT(IS_P2ALIGNED(npgs, npgs));
3082 ASSERT(!(page_pptonum(pp) & (npgs - 1)));
3083 } else {
3084 ASSERT(i > 0);
3085 ASSERT(page_pptonum(pp) - 1 ==
3086 page_pptonum(ppa[i - 1]));
3087 if ((page_pptonum(pp) & (npgs - 1)) ==
3088 npgs - 1)
3089 root = 0;
3090 }
3091 ASSERT(PAGE_EXCL(pp));
3092 pp->p_szc = 0;
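/*
 * One more constituent page of the current large page group has been
 * demoted to PAGESIZE; curnpgs counts how many constituents of that
 * group are still left to visit.
 */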
3093 ASSERT(curnpgs > 0);
3094 curnpgs--;
3095 }
3096 }
3097 if (root != 0 || curnpgs != 0)
3098 panic("anon_try_demote_pages: bad large page");
3099
3100 for (i = 0; i < pgcnt; i++) {
3101 if ((pp = ppa[i]) != NULL) {
3102 ASSERT(!hat_page_is_mapped(pp));
3103 ASSERT(pp->p_szc == 0);
3104 page_unlock(pp);
3105 }
3106 }
3107 if (ppasize != 0) {
3108 kmem_free(ppa, ppasize);
3109 }
3110 return (1);
3111 }
3112
3113 /*
3114 * anon_map_demotepages() can only be called for MAP_PRIVATE segments.
3115 */
3116 int
3117 anon_map_demotepages(
3118 struct anon_map *amp,
3119 ulong_t start_idx,
3120 struct seg *seg,
3121 caddr_t addr,
3122 uint_t prot,
3123 struct vpage vpage[],
3124 struct cred *cred)
3125 {
3126 struct anon *ap;
3127 uint_t szc = seg->s_szc;
3128 pgcnt_t pgcnt = page_get_pagecnt(szc);
3129 size_t ppasize = pgcnt * sizeof (page_t *);
3130 page_t **ppa = kmem_alloc(ppasize, KM_SLEEP);
3131 page_t *pp;
3132 page_t *pl[2];
3133 pgcnt_t i, pg_idx;
3134 ulong_t an_idx;
3135 caddr_t vaddr;
3136 int err;
3137 int retry = 0;
3138 uint_t vpprot;
3139
3140 ASSERT(RW_WRITE_HELD(&amp->a_rwlock));
3141 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
3142 ASSERT(IS_P2ALIGNED(start_idx, pgcnt));
3143 ASSERT(ppa != NULL);
3144 ASSERT(szc != 0);
3145 ASSERT(szc == amp->a_szc);
3146
3147 VM_STAT_ADD(anonvmstats.demotepages[0]);
3148
3149 top:
3150 if (anon_try_demote_pages(amp->ahp, start_idx, szc, ppa, 1)) {
3151 kmem_free(ppa, ppasize);
3152 return (0);
3153 }
3154
3155 VM_STAT_ADD(anonvmstats.demotepages[4]);
3156
3157 ASSERT(retry == 0); /* we can be here only once */
3158
3159 vaddr = addr;
3160 for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt;
3161 pg_idx++, an_idx++, vaddr += PAGESIZE) {
3162 ap = anon_get_ptr(amp->ahp, an_idx);
3163 if (ap == NULL)
3164 panic("anon_map_demotepages: no anon slot");
3165 err = anon_getpage(&ap, &vpprot, pl, PAGESIZE, seg, vaddr,
3166 S_READ, cred);
3167 if (err) {
3168 for (i = 0; i < pg_idx; i++) {
3169 if ((pp = ppa[i]) != NULL)
3170 page_unlock(pp);
3171 }
3172 kmem_free(ppa, ppasize);
3173 return (err);
3174 }
3175 ppa[pg_idx] = pl[0];
3176 }
3177
3178 err = anon_map_privatepages(amp, start_idx, szc, seg, addr, prot, ppa,
3179 vpage, -1, 0, cred);
3180 if (err > 0) {
3181 VM_STAT_ADD(anonvmstats.demotepages[5]);
3182 kmem_free(ppa, ppasize);
3183 return (err);
3184 }
3185 ASSERT(err == 0 || err == -1);
3186 if (err == -1) {
3187 VM_STAT_ADD(anonvmstats.demotepages[6]);
3188 retry = 1;
3189 goto top;
3190 }
3191 for (i = 0; i < pgcnt; i++) {
3192 ASSERT(ppa[i] != NULL);
3193 if (ppa[i]->p_szc != 0)
3194 retry = 1;
3195 page_unlock(ppa[i]);
3196 }
3197 if (retry) {
3198 VM_STAT_ADD(anonvmstats.demotepages[7]);
3199 goto top;
3200 }
3201
3202 VM_STAT_ADD(anonvmstats.demotepages[8]);
3203
3204 kmem_free(ppa, ppasize);
3205
3206 return (0);
3207 }
3208
3209 /*
3210 * Free pages of a shared anon map. It's assumed that anon maps don't share
3211 * anon structures with private anon maps. Therefore all anon structures
3212 * should have at most one reference at this point. This means underlying
3213 * pages can be exclusively locked and demoted or freed. If we are not
3214 * freeing entire large pages, demote the ends of the region we free so that
3215 * their subpages can be freed. Page roots correspond to aligned index positions in the anon map.
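 *
 * For example (hypothetical numbers), with 8-page large pages
 * (amp->a_szc gives pages == 8) and a call covering indices [3, 21):
 * the group [0, 8) is only partly covered, so it is demoted and slots
 * [3, 8) are freed as small pages; the group [8, 16) is fully covered
 * and is freed as a large page; the tail group [16, 24) is freed
 * whole if no anon slot at or past index 21 lives in it, otherwise it
 * is demoted and only slots [16, 21) are freed.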
3216 */
3217 void
3218 anon_shmap_free_pages(struct anon_map *amp, ulong_t sidx, size_t len)
3219 {
3220 ulong_t eidx = sidx + btopr(len);
3221 pgcnt_t pages = page_get_pagecnt(amp->a_szc);
3222 struct anon_hdr *ahp = amp->ahp;
3223 ulong_t tidx;
3224 size_t size;
3225 ulong_t sidx_aligned;
3226 ulong_t eidx_aligned;
3227
3228 ASSERT(ANON_WRITE_HELD(&amp->a_rwlock));
3229 ASSERT(amp->refcnt <= 1);
3230 ASSERT(amp->a_szc > 0);
3231 ASSERT(eidx <= ahp->size);
3232 ASSERT(!anon_share(ahp, sidx, btopr(len)));
3233
3234 if (len == 0) { /* XXX */
3235 return;
3236 }
3237
3238 sidx_aligned = P2ALIGN(sidx, pages);
3239 if (sidx_aligned != sidx ||
3240 (eidx < sidx_aligned + pages && eidx < ahp->size)) {
3241 if (!anon_try_demote_pages(ahp, sidx_aligned,
3242 amp->a_szc, NULL, 0)) {
3243 panic("anon_shmap_free_pages: demote failed");
3244 }
3245 size = (eidx <= sidx_aligned + pages) ? (eidx - sidx) :
3246 P2NPHASE(sidx, pages);
3247 size <<= PAGESHIFT;
3248 anon_free(ahp, sidx, size);
3249 sidx = sidx_aligned + pages;
3250 if (eidx <= sidx) {
3251 return;
3252 }
3253 }
3254 eidx_aligned = P2ALIGN(eidx, pages);
3255 if (sidx < eidx_aligned) {
3256 anon_free_pages(ahp, sidx,
3257 (eidx_aligned - sidx) << PAGESHIFT,
3258 amp->a_szc);
3259 sidx = eidx_aligned;
3260 }
3261 ASSERT(sidx == eidx_aligned);
3262 if (eidx == eidx_aligned) {
3263 return;
3264 }
3265 tidx = eidx;
3266 if (eidx != ahp->size && anon_get_next_ptr(ahp, &tidx) != NULL &&
3267 tidx - sidx < pages) {
3268 if (!anon_try_demote_pages(ahp, sidx, amp->a_szc, NULL, 0)) {
3269 panic("anon_shmap_free_pages: demote failed");
3270 }
3271 size = (eidx - sidx) << PAGESHIFT;
3272 anon_free(ahp, sidx, size);
3273 } else {
3274 anon_free_pages(ahp, sidx, pages << PAGESHIFT, amp->a_szc);
3275 }
3276 }
3277
3278 /*
3279 * This routine should be called with amp's writer lock held when there are
3280 * no other users of amp. All pcache entries of this amp must already have
3281 * been inactivated. We must not drop a_rwlock here, to prevent new users
3282 * from attaching to this amp.
3283 */
3284 void
3285 anonmap_purge(struct anon_map *amp)
3286 {
3287 ASSERT(ANON_WRITE_HELD(&amp->a_rwlock));
3288 ASSERT(amp->refcnt <= 1);
3289
3290 if (amp->a_softlockcnt != 0) {
3291 seg_ppurge(NULL, amp, 0);
3292 }
3293
3294 /*
3295 * Since all pcache entries were already inactive before this routine
3296 * was called, seg_ppurge() cannot return while there are still
3297 * entries that can be found via the list anchored at a_phead, so we
3298 * can assert that this list is empty now. a_softlockcnt may still be
3299 * non-zero if the asynchronous thread that manages pcache has already
3300 * removed the pcache entries but has not yet unlocked the pages. If
3301 * a_softlockcnt is non-zero we just wait on a_purgecv for
3302 * shamp_reclaim() to finish. Even if a_softlockcnt is 0 we grab
3303 * a_purgemtx to avoid freeing the anon map before shamp_reclaim() is
3304 * done with it. a_purgemtx, which is also taken by shamp_reclaim()
3305 * while a_softlockcnt is still non-zero, acts as a barrier that
3306 * prevents anonmap_purge() from completing while shamp_reclaim() may still be referencing this amp.
3307 */
3308 ASSERT(amp->a_phead.p_lnext == &amp->a_phead);
3309 ASSERT(amp->a_phead.p_lprev == &amp->a_phead);
3310
3311 mutex_enter(&amp->a_purgemtx);
3312 while (amp->a_softlockcnt != 0) {
3313 ASSERT(amp->a_phead.p_lnext == &amp->a_phead);
3314 ASSERT(amp->a_phead.p_lprev == &amp->a_phead);
3315 amp->a_purgewait = 1;
3316 cv_wait(&amp->a_purgecv, &amp->a_purgemtx);
3317 }
3318 mutex_exit(&amp->a_purgemtx);
3319
3320 ASSERT(amp->a_phead.p_lnext == &amp->a_phead);
3321 ASSERT(amp->a_phead.p_lprev == &amp->a_phead);
3322 ASSERT(amp->a_softlockcnt == 0);
3323 }
3324
3325 /*
3326 * Allocate and initialize an anon_map structure for seg,
3327 * associating the given swap reservation with the new anon_map.
3328 */
3329 struct anon_map *
3330 anonmap_alloc(size_t size, size_t swresv, int flags)
3331 {
3332 struct anon_map *amp;
3333 int kmflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
3334
3335 amp = kmem_cache_alloc(anonmap_cache, kmflags);
3336 if (amp == NULL) {
3337 ASSERT(kmflags == KM_NOSLEEP);
3338 return (NULL);
3339 }
3340
3341 amp->ahp = anon_create(btopr(size), flags);
3342 if (amp->ahp == NULL) {
3343 ASSERT(flags == ANON_NOSLEEP);
3344 kmem_cache_free(anonmap_cache, amp);
3345 return (NULL);
3346 }
3347 amp->refcnt = 1;
3348 amp->size = size;
3349 amp->swresv = swresv;
3350 amp->locality = 0;
3351 amp->a_szc = 0;
3352 amp->a_sp = NULL;
3353 amp->a_softlockcnt = 0;
3354 amp->a_purgewait = 0;
3355 amp->a_phead.p_lnext = &amp->a_phead;
3356 amp->a_phead.p_lprev = &amp->a_phead;
3357
3358 return (amp);
3359 }
3360
3361 void
3362 anonmap_free(struct anon_map *amp)
3363 {
3364 ASSERT(amp->ahp != NULL);
3365 ASSERT(amp->refcnt == 0);
3366 ASSERT(amp->a_softlockcnt == 0);
3367 ASSERT(amp->a_phead.p_lnext == &amp->a_phead);
3368 ASSERT(amp->a_phead.p_lprev == &amp->a_phead);
3369
3370 lgrp_shm_policy_fini(amp, NULL);
3371 anon_release(amp->ahp, btopr(amp->size));
3372 kmem_cache_free(anonmap_cache, amp);
3373 }
3374
3375 /*
3376 * Returns true if the anon array (ahp) has some empty slots.
3377 * The offp and lenp parameters are in/out parameters. On entry
3378 * these values represent the starting offset and length of the
3379 * mapping. When true is returned, these values may be modified
3380 * to be the largest range which includes empty slots.
3381 */
3382 int
3383 non_anon(struct anon_hdr *ahp, ulong_t anon_idx, u_offset_t *offp,
3384 size_t *lenp)
3385 {
3386 ulong_t i, el;
3387 ssize_t low, high;
3388 struct anon *ap;
3389
3390 low = -1;
3391 for (i = 0, el = *lenp; i < el; i += PAGESIZE, anon_idx++) {
3392 ap = anon_get_ptr(ahp, anon_idx);
3393 if (ap == NULL) {
3394 if (low == -1)
3395 low = i;
3396 high = i;
3397 }
3398 }
3399 if (low != -1) {
3400 /*
3401 * Found at least one non-anon page.
3402 * Set up the off and len return values.
3403 */
3404 if (low != 0)
3405 *offp += low;
3406 *lenp = high - low + PAGESIZE;
3407 return (1);
3408 }
3409 return (0);
3410 }
3411
3412 /*
3413 * Return a count of the number of existing anon pages in the anon array
3414 * ahp over the nslots slots starting at anon_index. The array and slots
3415 * must be guaranteed stable by the caller.
3416 */
3417 pgcnt_t
3418 anon_pages(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots)
3419 {
3420 pgcnt_t cnt = 0;
3421
3422 while (nslots-- > 0) {
3423 if ((anon_get_ptr(ahp, anon_index)) != NULL)
3424 cnt++;
3425 anon_index++;
3426 }
3427 return (cnt);
3428 }
3429
3430 /*
3431 * Move reserved phys swap into memory swap (unreserve phys swap
3432 * and reserve mem swap by the same amount).
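 *
 * For example (hypothetical numbers): with ani_mem_resv == 100,
 * ani_locked_swap == 90 and npages == 30, only 10 pages of mem swap
 * are unlocked, so 20 pages are shifted from ani_phys_resv to
 * ani_mem_resv (provided page_reclaim_mem() succeeds) before
 * ani_locked_swap is raised to 120.
 *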
3433 * Used by segspt when it needs to lock reserved swap npages in memory 3434 */ 3435 int 3436 anon_swap_adjust(pgcnt_t npages) 3437 { 3438 pgcnt_t unlocked_mem_swap; 3439 3440 mutex_enter(&anoninfo_lock); 3441 3442 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 3443 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 3444 3445 unlocked_mem_swap = k_anoninfo.ani_mem_resv 3446 - k_anoninfo.ani_locked_swap; 3447 if (npages > unlocked_mem_swap) { 3448 spgcnt_t adjusted_swap = npages - unlocked_mem_swap; 3449 3450 /* 3451 * if there is not enough unlocked mem swap we take missing 3452 * amount from phys swap and give it to mem swap 3453 */ 3454 if (!page_reclaim_mem(adjusted_swap, segspt_minfree, 1)) { 3455 mutex_exit(&anoninfo_lock); 3456 return (ENOMEM); 3457 } 3458 3459 k_anoninfo.ani_mem_resv += adjusted_swap; 3460 ASSERT(k_anoninfo.ani_phys_resv >= adjusted_swap); 3461 k_anoninfo.ani_phys_resv -= adjusted_swap; 3462 3463 ANI_ADD(adjusted_swap); 3464 } 3465 k_anoninfo.ani_locked_swap += npages; 3466 3467 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 3468 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 3469 3470 mutex_exit(&anoninfo_lock); 3471 3472 return (0); 3473 } 3474 3475 /* 3476 * 'unlocked' reserved mem swap so when it is unreserved it 3477 * can be moved back phys (disk) swap 3478 */ 3479 void 3480 anon_swap_restore(pgcnt_t npages) 3481 { 3482 mutex_enter(&anoninfo_lock); 3483 3484 ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv); 3485 3486 ASSERT(k_anoninfo.ani_locked_swap >= npages); 3487 k_anoninfo.ani_locked_swap -= npages; 3488 3489 ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv); 3490 3491 mutex_exit(&anoninfo_lock); 3492 } 3493 3494 /* 3495 * Return the pointer from the list for a 3496 * specified anon index. 3497 */ 3498 ulong_t * 3499 anon_get_slot(struct anon_hdr *ahp, ulong_t an_idx) 3500 { 3501 struct anon **app; 3502 void **ppp; 3503 3504 ASSERT(an_idx < ahp->size); 3505 3506 /* 3507 * Single level case. 3508 */ 3509 if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 3510 return ((ulong_t *)&ahp->array_chunk[an_idx]); 3511 } else { 3512 3513 /* 3514 * 2 level case. 3515 */ 3516 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 3517 if (*ppp == NULL) { 3518 mutex_enter(&ahp->serial_lock); 3519 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 3520 if (*ppp == NULL) 3521 *ppp = kmem_zalloc(PAGESIZE, KM_SLEEP); 3522 mutex_exit(&ahp->serial_lock); 3523 } 3524 app = *ppp; 3525 return ((ulong_t *)&app[an_idx & ANON_CHUNK_OFF]); 3526 } 3527 } 3528 3529 void 3530 anon_array_enter(struct anon_map *amp, ulong_t an_idx, anon_sync_obj_t *sobj) 3531 { 3532 ulong_t *ap_slot; 3533 kmutex_t *mtx; 3534 kcondvar_t *cv; 3535 int hash; 3536 3537 /* 3538 * Use szc to determine anon slot(s) to appear atomic. 3539 * If szc = 0, then lock the anon slot and mark it busy. 3540 * If szc > 0, then lock the range of slots by getting the 3541 * anon_array_lock for the first anon slot, and mark only the 3542 * first anon slot busy to represent whole range being busy. 
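 *
 * A minimal usage sketch (hypothetical caller; amp->a_rwlock must
 * already be held at least as reader):
 *
 *	anon_sync_obj_t cookie;
 *
 *	anon_array_enter(amp, an_idx, &cookie);
 *	ap = anon_get_ptr(amp->ahp, an_idx);
 *	... examine or update the slot ...
 *	anon_array_exit(&cookie);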
3543 */
3544
3545 ASSERT(RW_READ_HELD(&amp->a_rwlock));
3546 an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc));
3547 hash = ANON_ARRAY_HASH(amp, an_idx);
3548 sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex;
3549 sobj->sync_cv = cv = &anon_array_cv[hash];
3550 mutex_enter(mtx);
3551 ap_slot = anon_get_slot(amp->ahp, an_idx);
3552 while (ANON_ISBUSY(ap_slot))
3553 cv_wait(cv, mtx);
3554 ANON_SETBUSY(ap_slot);
3555 sobj->sync_data = ap_slot;
3556 mutex_exit(mtx);
3557 }
3558
3559 int
3560 anon_array_try_enter(struct anon_map *amp, ulong_t an_idx,
3561 anon_sync_obj_t *sobj)
3562 {
3563 ulong_t *ap_slot;
3564 kmutex_t *mtx;
3565 int hash;
3566
3567 /*
3568 * Try to lock a range of anon slots.
3569 * Use szc to determine anon slot(s) to appear atomic.
3570 * If szc = 0, then lock the anon slot and mark it busy.
3571 * If szc > 0, then lock the range of slots by getting the
3572 * anon_array_lock for the first anon slot, and mark only the
3573 * first anon slot busy to represent the whole range being busy.
3574 * Fail if the mutex or the anon slot is busy.
3575 */
3576
3577 ASSERT(RW_READ_HELD(&amp->a_rwlock));
3578 an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc));
3579 hash = ANON_ARRAY_HASH(amp, an_idx);
3580 sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex;
3581 sobj->sync_cv = &anon_array_cv[hash];
3582 if (!mutex_tryenter(mtx)) {
3583 return (EWOULDBLOCK);
3584 }
3585 ap_slot = anon_get_slot(amp->ahp, an_idx);
3586 if (ANON_ISBUSY(ap_slot)) {
3587 mutex_exit(mtx);
3588 return (EWOULDBLOCK);
3589 }
3590 ANON_SETBUSY(ap_slot);
3591 sobj->sync_data = ap_slot;
3592 mutex_exit(mtx);
3593 return (0);
3594 }
3595
3596 void
3597 anon_array_exit(anon_sync_obj_t *sobj)
3598 {
3599 mutex_enter(sobj->sync_mutex);
3600 ASSERT(ANON_ISBUSY(sobj->sync_data));
3601 ANON_CLRBUSY(sobj->sync_data);
3602 if (CV_HAS_WAITERS(sobj->sync_cv))
3603 cv_broadcast(sobj->sync_cv);
3604 mutex_exit(sobj->sync_mutex);
3605 }
3606