/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2015, Joyent, Inc. All rights reserved.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

/*
 * VM - anonymous pages.
 *
 * This layer sits immediately above the vm_swap layer.  It manages
 * physical pages that have no permanent identity in the file system
 * name space, using the services of the vm_swap layer to allocate
 * backing storage for these pages.  Since these pages have no external
 * identity, they are discarded when the last reference is removed.
 *
 * An important function of this layer is to manage low-level sharing
 * of pages that are logically distinct but that happen to be
 * physically identical (e.g., the corresponding pages of the processes
 * resulting from a fork before one process or the other changes their
 * contents).  This pseudo-sharing is present only as an optimization
 * and is not to be confused with true sharing in which multiple
 * address spaces deliberately contain references to the same object;
 * such sharing is managed at a higher level.
 *
 * The key data structure here is the anon struct, which contains a
 * reference count for its associated physical page and a hint about
 * the identity of that page.  Anon structs typically live in arrays,
 * with an instance's position in its array determining where the
 * corresponding backing storage is allocated; however, the swap_xlate()
 * routine abstracts away this representation information so that the
 * rest of the anon layer need not know it.  (See the swap layer for
 * more details on anon struct layout.)
 *
 * In future versions of the system, the association between an
 * anon struct and its position on backing store will change so that
 * we don't require backing store for all anonymous pages in the system.
 * This is an important consideration for large memory systems.
 * We can also use this technique to delay binding physical locations
 * to anonymous pages until pageout/swapout time, where we can make
 * smarter allocation decisions to improve anonymous klustering.
 *
 * Many of the routines defined here take a (struct anon **) argument,
 * which allows the code at this level to manage anon pages directly,
 * so that callers can regard anon structs as opaque objects and not be
 * concerned with assigning or inspecting their contents.
 *
 * Clients of this layer refer to anon pages indirectly.  That is, they
 * maintain arrays of pointers to anon structs rather than maintaining
 * anon structs themselves.  The (struct anon **) arguments mentioned
 * above are pointers to entries in these arrays.  It is these arrays
 * that capture the mapping between offsets within a given segment and
 * the corresponding anonymous backing storage address.
 */

#ifdef DEBUG
#define	ANON_DEBUG
#endif

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/cred.h>
#include <sys/thread.h>
#include <sys/vnode.h>
#include <sys/cpuvar.h>
#include <sys/swap.h>
#include <sys/cmn_err.h>
#include <sys/vtrace.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/bitmap.h>
#include <sys/vmsystm.h>
#include <sys/tuneable.h>
#include <sys/debug.h>
#include <sys/fs/swapnode.h>
#include <sys/lgrp.h>
#include <sys/policy.h>
#include <sys/condvar_impl.h>
#include <sys/mutex_impl.h>
#include <sys/rctl.h>

#include <vm/as.h>
#include <vm/hat.h>
#include <vm/anon.h>
#include <vm/page.h>
#include <vm/vpage.h>
#include <vm/seg.h>
#include <vm/rm.h>

#include <fs/fs_subr.h>

struct vnode *anon_vp;

int anon_debug;

kmutex_t	anoninfo_lock;
struct		k_anoninfo k_anoninfo;
ani_free_t	*ani_free_pool;
pad_mutex_t	anon_array_lock[ANON_LOCKSIZE];
kcondvar_t	anon_array_cv[ANON_LOCKSIZE];

/*
 * Global hash table for (vp, off) -> anon slot
 */
extern	int swap_maxcontig;
size_t	anon_hash_size;
unsigned int anon_hash_shift;
struct anon	**anon_hash;

static struct kmem_cache *anon_cache;
static struct kmem_cache *anonmap_cache;
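/*
 * Illustrative sketch (not compiled): how a segment-driver client of this
 * layer typically uses the anon array interfaces defined later in this
 * file, treating the anon structs themselves as opaque, as the block
 * comment above describes.  The function name "example_anon_client" and
 * the "seg_pages"/"idx" parameters are hypothetical; locking and error
 * handling are omitted.
 */
#if 0
static void
example_anon_client(pgcnt_t seg_pages, ulong_t idx)
{
        struct anon_hdr *ahp;
        struct anon *ap;

        /* Create an array able to describe seg_pages anonymous pages. */
        ahp = anon_create(seg_pages, ANON_SLEEP);

        /* Allocate an anon slot and record it at index idx. */
        ap = anon_alloc(NULL, 0);
        (void) anon_set_ptr(ahp, idx, ap, ANON_SLEEP);

        /* Later: look the slot up again by its index. */
        ap = anon_get_ptr(ahp, idx);

        /* Tear down: clear the entry, drop the reference, free the array. */
        (void) anon_set_ptr(ahp, idx, NULL, ANON_SLEEP);
        anon_decref(ap);
        anon_release(ahp, seg_pages);
}
#endif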
pad_mutex_t	*anonhash_lock;

/*
 * Used to make the increment of all refcnts of all anon slots of a large
 * page appear to be atomic.  The lock is grabbed for the first anon slot of
 * a large page.
 */
pad_mutex_t	*anonpages_hash_lock;

#define	APH_MUTEX(vp, off)				\
        (&anonpages_hash_lock[(ANON_HASH((vp), (off)) &	\
            (AH_LOCK_SIZE - 1))].pad_mutex)

#ifdef VM_STATS
static struct anonvmstats_str {
        ulong_t getpages[30];
        ulong_t privatepages[10];
        ulong_t demotepages[9];
        ulong_t decrefpages[9];
        ulong_t dupfillholes[4];
        ulong_t freepages[1];
} anonvmstats;
#endif /* VM_STATS */

/*ARGSUSED*/
static int
anonmap_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
        struct anon_map *amp = buf;

        rw_init(&amp->a_rwlock, NULL, RW_DEFAULT, NULL);
        cv_init(&amp->a_purgecv, NULL, CV_DEFAULT, NULL);
        mutex_init(&amp->a_pmtx, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&amp->a_purgemtx, NULL, MUTEX_DEFAULT, NULL);
        return (0);
}

/*ARGSUSED1*/
static void
anonmap_cache_destructor(void *buf, void *cdrarg)
{
        struct anon_map *amp = buf;

        rw_destroy(&amp->a_rwlock);
        cv_destroy(&amp->a_purgecv);
        mutex_destroy(&amp->a_pmtx);
        mutex_destroy(&amp->a_purgemtx);
}

void
anon_init(void)
{
        int i;
        pad_mutex_t *tmp;

        /* These both need to be powers of 2 so round up to the next power */
        anon_hash_shift = highbit((physmem / ANON_HASHAVELEN) - 1);
        anon_hash_size = 1L << anon_hash_shift;

        /*
         * We need to align the anonhash_lock and anonpages_hash_lock arrays
         * to a 64B boundary to avoid false sharing.  We add 63B to our
         * allocation so that we can get a 64B aligned address to use.
         * We allocate both of these together to avoid wasting an additional
         * 63B.
         */
        tmp = kmem_zalloc((2 * AH_LOCK_SIZE * sizeof (pad_mutex_t)) + 63,
            KM_SLEEP);
        anonhash_lock = (pad_mutex_t *)P2ROUNDUP((uintptr_t)tmp, 64);
        anonpages_hash_lock = anonhash_lock + AH_LOCK_SIZE;

        for (i = 0; i < AH_LOCK_SIZE; i++) {
                mutex_init(&anonhash_lock[i].pad_mutex, NULL, MUTEX_DEFAULT,
                    NULL);
                mutex_init(&anonpages_hash_lock[i].pad_mutex, NULL,
                    MUTEX_DEFAULT, NULL);
        }

        for (i = 0; i < ANON_LOCKSIZE; i++) {
                mutex_init(&anon_array_lock[i].pad_mutex, NULL,
                    MUTEX_DEFAULT, NULL);
                cv_init(&anon_array_cv[i], NULL, CV_DEFAULT, NULL);
        }

        anon_hash = (struct anon **)
            kmem_zalloc(sizeof (struct anon *) * anon_hash_size, KM_SLEEP);
        anon_cache = kmem_cache_create("anon_cache", sizeof (struct anon),
            AN_CACHE_ALIGN, NULL, NULL, NULL, NULL, NULL, KMC_PREFILL);
        anonmap_cache = kmem_cache_create("anonmap_cache",
            sizeof (struct anon_map), 0,
            anonmap_cache_constructor, anonmap_cache_destructor, NULL,
            NULL, NULL, 0);
        swap_maxcontig = (1024 * 1024) >> PAGESHIFT;	/* 1MB of pages */

        tmp = kmem_zalloc((ANI_MAX_POOL * sizeof (ani_free_t)) + 63, KM_SLEEP);
        /* Round ani_free_pool to cacheline boundary to avoid false sharing. */
        ani_free_pool = (ani_free_t *)P2ROUNDUP((uintptr_t)tmp, 64);

        anon_vp = vn_alloc(KM_SLEEP);
        vn_setops(anon_vp, swap_vnodeops);
        anon_vp->v_type = VREG;
        anon_vp->v_flag |= (VISSWAP|VISSWAPFS);
}
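/*
 * Illustrative sketch (not compiled): how a lookup over the global
 * (vp, off) -> anon hash maintained by anon_addhash()/anon_rmhash() below
 * would walk a bucket.  The bucket's AH_MUTEX must be held, as the ASSERTs
 * in those routines require.  "find_anon_slot" is a hypothetical helper,
 * not part of this file.
 */
#if 0
static struct anon *
find_anon_slot(struct vnode *vp, anoff_t off)
{
        struct anon *ap;

        ASSERT(MUTEX_HELD(AH_MUTEX(vp, off)));
        for (ap = anon_hash[ANON_HASH(vp, off)]; ap != NULL;
            ap = ap->an_hash) {
                if (ap->an_vp == vp && ap->an_off == off)
                        return (ap);
        }
        return (NULL);
}
#endif

/*
 * Global anon slot hash table manipulation.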
253 */ 254 255 static void 256 anon_addhash(struct anon *ap) 257 { 258 int index; 259 260 ASSERT(MUTEX_HELD(AH_MUTEX(ap->an_vp, ap->an_off))); 261 index = ANON_HASH(ap->an_vp, ap->an_off); 262 ap->an_hash = anon_hash[index]; 263 anon_hash[index] = ap; 264 } 265 266 static void 267 anon_rmhash(struct anon *ap) 268 { 269 struct anon **app; 270 271 ASSERT(MUTEX_HELD(AH_MUTEX(ap->an_vp, ap->an_off))); 272 273 for (app = &anon_hash[ANON_HASH(ap->an_vp, ap->an_off)]; 274 *app; app = &((*app)->an_hash)) { 275 if (*app == ap) { 276 *app = ap->an_hash; 277 break; 278 } 279 } 280 } 281 282 /* 283 * The anon array interfaces. Functions allocating, 284 * freeing array of pointers, and returning/setting 285 * entries in the array of pointers for a given offset. 286 * 287 * Create the list of pointers 288 */ 289 struct anon_hdr * 290 anon_create(pgcnt_t npages, int flags) 291 { 292 struct anon_hdr *ahp; 293 ulong_t nchunks; 294 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 295 296 if ((ahp = kmem_zalloc(sizeof (struct anon_hdr), kmemflags)) == NULL) { 297 return (NULL); 298 } 299 300 mutex_init(&ahp->serial_lock, NULL, MUTEX_DEFAULT, NULL); 301 /* 302 * Single level case. 303 */ 304 ahp->size = npages; 305 if (npages <= ANON_CHUNK_SIZE || (flags & ANON_ALLOC_FORCE)) { 306 307 if (flags & ANON_ALLOC_FORCE) 308 ahp->flags |= ANON_ALLOC_FORCE; 309 310 ahp->array_chunk = kmem_zalloc( 311 ahp->size * sizeof (struct anon *), kmemflags); 312 313 if (ahp->array_chunk == NULL) { 314 kmem_free(ahp, sizeof (struct anon_hdr)); 315 return (NULL); 316 } 317 } else { 318 /* 319 * 2 Level case. 320 * anon hdr size needs to be rounded off to be a multiple 321 * of ANON_CHUNK_SIZE. This is important as various anon 322 * related functions depend on this. 323 * NOTE - 324 * anon_grow() makes anon hdr size a multiple of 325 * ANON_CHUNK_SIZE. 326 * amp size is <= anon hdr size. 327 * anon_index + seg_pgs <= anon hdr size. 328 */ 329 ahp->size = P2ROUNDUP(npages, ANON_CHUNK_SIZE); 330 nchunks = ahp->size >> ANON_CHUNK_SHIFT; 331 332 ahp->array_chunk = kmem_zalloc(nchunks * sizeof (ulong_t *), 333 kmemflags); 334 335 if (ahp->array_chunk == NULL) { 336 kmem_free(ahp, sizeof (struct anon_hdr)); 337 return (NULL); 338 } 339 } 340 return (ahp); 341 } 342 343 /* 344 * Free the array of pointers 345 */ 346 void 347 anon_release(struct anon_hdr *ahp, pgcnt_t npages) 348 { 349 ulong_t i; 350 void **ppp; 351 ulong_t nchunks; 352 353 ASSERT(npages <= ahp->size); 354 355 /* 356 * Single level case. 357 */ 358 if (npages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 359 kmem_free(ahp->array_chunk, ahp->size * sizeof (struct anon *)); 360 } else { 361 /* 362 * 2 level case. 363 */ 364 nchunks = ahp->size >> ANON_CHUNK_SHIFT; 365 for (i = 0; i < nchunks; i++) { 366 ppp = &ahp->array_chunk[i]; 367 if (*ppp != NULL) 368 kmem_free(*ppp, PAGESIZE); 369 } 370 kmem_free(ahp->array_chunk, nchunks * sizeof (ulong_t *)); 371 } 372 mutex_destroy(&ahp->serial_lock); 373 kmem_free(ahp, sizeof (struct anon_hdr)); 374 } 375 376 /* 377 * Return the pointer from the list for a 378 * specified anon index. 379 */ 380 struct anon * 381 anon_get_ptr(struct anon_hdr *ahp, ulong_t an_idx) 382 { 383 struct anon **app; 384 385 ASSERT(an_idx < ahp->size); 386 387 /* 388 * Single level case. 389 */ 390 if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 391 return ((struct anon *) 392 ((uintptr_t)ahp->array_chunk[an_idx] & ANON_PTRMASK)); 393 } else { 394 395 /* 396 * 2 level case. 
397 */ 398 app = ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 399 if (app) { 400 return ((struct anon *) 401 ((uintptr_t)app[an_idx & ANON_CHUNK_OFF] & 402 ANON_PTRMASK)); 403 } else { 404 return (NULL); 405 } 406 } 407 } 408 409 /* 410 * Return the anon pointer for the first valid entry in the anon list, 411 * starting from the given index. 412 */ 413 struct anon * 414 anon_get_next_ptr(struct anon_hdr *ahp, ulong_t *index) 415 { 416 struct anon *ap; 417 struct anon **app; 418 ulong_t chunkoff; 419 ulong_t i; 420 ulong_t j; 421 pgcnt_t size; 422 423 i = *index; 424 size = ahp->size; 425 426 ASSERT(i < size); 427 428 if ((size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 429 /* 430 * 1 level case 431 */ 432 while (i < size) { 433 ap = (struct anon *) 434 ((uintptr_t)ahp->array_chunk[i] & ANON_PTRMASK); 435 if (ap) { 436 *index = i; 437 return (ap); 438 } 439 i++; 440 } 441 } else { 442 /* 443 * 2 level case 444 */ 445 chunkoff = i & ANON_CHUNK_OFF; 446 while (i < size) { 447 app = ahp->array_chunk[i >> ANON_CHUNK_SHIFT]; 448 if (app) 449 for (j = chunkoff; j < ANON_CHUNK_SIZE; j++) { 450 ap = (struct anon *) 451 ((uintptr_t)app[j] & ANON_PTRMASK); 452 if (ap) { 453 *index = i + (j - chunkoff); 454 return (ap); 455 } 456 } 457 chunkoff = 0; 458 i = (i + ANON_CHUNK_SIZE) & ~ANON_CHUNK_OFF; 459 } 460 } 461 *index = size; 462 return (NULL); 463 } 464 465 /* 466 * Set list entry with a given pointer for a specified offset 467 */ 468 int 469 anon_set_ptr(struct anon_hdr *ahp, ulong_t an_idx, struct anon *ap, int flags) 470 { 471 void **ppp; 472 struct anon **app; 473 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 474 uintptr_t *ap_addr; 475 476 ASSERT(an_idx < ahp->size); 477 478 /* 479 * Single level case. 480 */ 481 if (ahp->size <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 482 ap_addr = (uintptr_t *)&ahp->array_chunk[an_idx]; 483 } else { 484 485 /* 486 * 2 level case. 487 */ 488 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 489 490 ASSERT(ppp != NULL); 491 if (*ppp == NULL) { 492 mutex_enter(&ahp->serial_lock); 493 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 494 if (*ppp == NULL) { 495 *ppp = kmem_zalloc(PAGESIZE, kmemflags); 496 if (*ppp == NULL) { 497 mutex_exit(&ahp->serial_lock); 498 return (ENOMEM); 499 } 500 } 501 mutex_exit(&ahp->serial_lock); 502 } 503 app = *ppp; 504 ap_addr = (uintptr_t *)&app[an_idx & ANON_CHUNK_OFF]; 505 } 506 *ap_addr = (*ap_addr & ~ANON_PTRMASK) | (uintptr_t)ap; 507 return (0); 508 } 509 510 /* 511 * Copy anon array into a given new anon array 512 */ 513 int 514 anon_copy_ptr(struct anon_hdr *sahp, ulong_t s_idx, struct anon_hdr *dahp, 515 ulong_t d_idx, pgcnt_t npages, int flags) 516 { 517 void **sapp, **dapp; 518 void *ap; 519 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 520 521 ASSERT((s_idx < sahp->size) && (d_idx < dahp->size)); 522 ASSERT((npages <= sahp->size) && (npages <= dahp->size)); 523 524 /* 525 * Both arrays are 1 level. 526 */ 527 if (((sahp->size <= ANON_CHUNK_SIZE) && 528 (dahp->size <= ANON_CHUNK_SIZE)) || 529 ((sahp->flags & ANON_ALLOC_FORCE) && 530 (dahp->flags & ANON_ALLOC_FORCE))) { 531 532 bcopy(&sahp->array_chunk[s_idx], &dahp->array_chunk[d_idx], 533 npages * sizeof (struct anon *)); 534 return (0); 535 } 536 537 /* 538 * Both arrays are 2 levels. 
539 */ 540 if (sahp->size > ANON_CHUNK_SIZE && 541 dahp->size > ANON_CHUNK_SIZE && 542 ((sahp->flags & ANON_ALLOC_FORCE) == 0) && 543 ((dahp->flags & ANON_ALLOC_FORCE) == 0)) { 544 545 ulong_t sapidx, dapidx; 546 ulong_t *sap, *dap; 547 ulong_t chknp; 548 549 while (npages != 0) { 550 551 sapidx = s_idx & ANON_CHUNK_OFF; 552 dapidx = d_idx & ANON_CHUNK_OFF; 553 chknp = ANON_CHUNK_SIZE - MAX(sapidx, dapidx); 554 if (chknp > npages) 555 chknp = npages; 556 557 sapp = &sahp->array_chunk[s_idx >> ANON_CHUNK_SHIFT]; 558 if ((sap = *sapp) != NULL) { 559 dapp = &dahp->array_chunk[d_idx 560 >> ANON_CHUNK_SHIFT]; 561 if ((dap = *dapp) == NULL) { 562 *dapp = kmem_zalloc(PAGESIZE, 563 kmemflags); 564 if ((dap = *dapp) == NULL) 565 return (ENOMEM); 566 } 567 bcopy((sap + sapidx), (dap + dapidx), 568 chknp << ANON_PTRSHIFT); 569 } 570 s_idx += chknp; 571 d_idx += chknp; 572 npages -= chknp; 573 } 574 return (0); 575 } 576 577 /* 578 * At least one of the arrays is 2 level. 579 */ 580 while (npages--) { 581 if ((ap = anon_get_ptr(sahp, s_idx)) != NULL) { 582 ASSERT(!ANON_ISBUSY(anon_get_slot(sahp, s_idx))); 583 if (anon_set_ptr(dahp, d_idx, ap, flags) == ENOMEM) 584 return (ENOMEM); 585 } 586 s_idx++; 587 d_idx++; 588 } 589 return (0); 590 } 591 592 593 /* 594 * ANON_INITBUF is a convenience macro for anon_grow() below. It 595 * takes a buffer dst, which is at least as large as buffer src. It 596 * does a bcopy from src into dst, and then bzeros the extra bytes 597 * of dst. If tail is set, the data in src is tail aligned within 598 * dst instead of head aligned. 599 */ 600 601 #define ANON_INITBUF(src, srclen, dst, dstsize, tail) \ 602 if (tail) { \ 603 bzero((dst), (dstsize) - (srclen)); \ 604 bcopy((src), (char *)(dst) + (dstsize) - (srclen), (srclen)); \ 605 } else { \ 606 bcopy((src), (dst), (srclen)); \ 607 bzero((char *)(dst) + (srclen), (dstsize) - (srclen)); \ 608 } 609 610 #define ANON_1_LEVEL_INC (ANON_CHUNK_SIZE / 8) 611 #define ANON_2_LEVEL_INC (ANON_1_LEVEL_INC * ANON_CHUNK_SIZE) 612 613 /* 614 * anon_grow() is used to efficiently extend an existing anon array. 615 * startidx_p points to the index into the anon array of the first page 616 * that is in use. oldseg_pgs is the number of pages in use, starting at 617 * *startidx_p. newpages is the number of additional pages desired. 618 * 619 * If startidx_p == NULL, startidx is taken to be 0 and cannot be changed. 620 * 621 * The growth is done by creating a new top level of the anon array, 622 * and (if the array is 2-level) reusing the existing second level arrays. 623 * 624 * flags can be used to specify ANON_NOSLEEP and ANON_GROWDOWN. 625 * 626 * Returns the new number of pages in the anon array. 627 */ 628 pgcnt_t 629 anon_grow(struct anon_hdr *ahp, ulong_t *startidx_p, pgcnt_t oldseg_pgs, 630 pgcnt_t newseg_pgs, int flags) 631 { 632 ulong_t startidx = startidx_p ? *startidx_p : 0; 633 pgcnt_t oldamp_pgs = ahp->size, newamp_pgs; 634 pgcnt_t oelems, nelems, totpages; 635 void **level1; 636 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 637 int growdown = (flags & ANON_GROWDOWN); 638 size_t newarrsz, oldarrsz; 639 void *level2; 640 641 ASSERT(!(startidx_p == NULL && growdown)); 642 ASSERT(startidx + oldseg_pgs <= ahp->size); 643 644 /* 645 * Determine the total number of pages needed in the new 646 * anon array. If growing down, totpages is all pages from 647 * startidx through the end of the array, plus <newseg_pgs> 648 * pages. 
If growing up, keep all pages from page 0 through 649 * the last page currently in use, plus <newseg_pgs> pages. 650 */ 651 if (growdown) 652 totpages = oldamp_pgs - startidx + newseg_pgs; 653 else 654 totpages = startidx + oldseg_pgs + newseg_pgs; 655 656 /* If the array is already large enough, just return. */ 657 658 if (oldamp_pgs >= totpages) { 659 if (growdown) 660 *startidx_p = oldamp_pgs - totpages; 661 return (oldamp_pgs); 662 } 663 664 /* 665 * oldamp_pgs/newamp_pgs are the total numbers of pages represented 666 * by the corresponding arrays. 667 * oelems/nelems are the number of pointers in the top level arrays 668 * which may be either level 1 or level 2. 669 * Will the new anon array be one level or two levels? 670 */ 671 if (totpages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 672 newamp_pgs = P2ROUNDUP(totpages, ANON_1_LEVEL_INC); 673 oelems = oldamp_pgs; 674 nelems = newamp_pgs; 675 } else { 676 newamp_pgs = P2ROUNDUP(totpages, ANON_2_LEVEL_INC); 677 oelems = (oldamp_pgs + ANON_CHUNK_OFF) >> ANON_CHUNK_SHIFT; 678 nelems = newamp_pgs >> ANON_CHUNK_SHIFT; 679 } 680 681 newarrsz = nelems * sizeof (void *); 682 level1 = kmem_alloc(newarrsz, kmemflags); 683 if (level1 == NULL) 684 return (0); 685 686 /* Are we converting from a one level to a two level anon array? */ 687 688 if (newamp_pgs > ANON_CHUNK_SIZE && oldamp_pgs <= ANON_CHUNK_SIZE && 689 !(ahp->flags & ANON_ALLOC_FORCE)) { 690 691 /* 692 * Yes, we're converting to a two level. Reuse old level 1 693 * as new level 2 if it is exactly PAGESIZE. Otherwise 694 * alloc a new level 2 and copy the old level 1 data into it. 695 */ 696 if (oldamp_pgs == ANON_CHUNK_SIZE) { 697 level2 = (void *)ahp->array_chunk; 698 } else { 699 level2 = kmem_alloc(PAGESIZE, kmemflags); 700 if (level2 == NULL) { 701 kmem_free(level1, newarrsz); 702 return (0); 703 } 704 oldarrsz = oldamp_pgs * sizeof (void *); 705 706 ANON_INITBUF(ahp->array_chunk, oldarrsz, 707 level2, PAGESIZE, growdown); 708 kmem_free(ahp->array_chunk, oldarrsz); 709 } 710 bzero(level1, newarrsz); 711 if (growdown) 712 level1[nelems - 1] = level2; 713 else 714 level1[0] = level2; 715 } else { 716 oldarrsz = oelems * sizeof (void *); 717 718 ANON_INITBUF(ahp->array_chunk, oldarrsz, 719 level1, newarrsz, growdown); 720 kmem_free(ahp->array_chunk, oldarrsz); 721 } 722 723 ahp->array_chunk = level1; 724 ahp->size = newamp_pgs; 725 if (growdown) 726 *startidx_p = newamp_pgs - totpages; 727 728 return (newamp_pgs); 729 } 730 731 732 /* 733 * Called to sync ani_free value. 734 */ 735 736 void 737 set_anoninfo(void) 738 { 739 processorid_t ix, max_seqid; 740 pgcnt_t total = 0; 741 static clock_t last_time; 742 clock_t new_time; 743 744 if (ani_free_pool == NULL) 745 return; 746 747 /* 748 * Recompute ani_free at most once per tick. Use max_cpu_seqid_ever to 749 * identify the maximum number of CPUs were ever online. 750 */ 751 new_time = ddi_get_lbolt(); 752 if (new_time > last_time) { 753 754 max_seqid = max_cpu_seqid_ever; 755 ASSERT(ANI_MAX_POOL > max_seqid); 756 for (ix = 0; ix <= max_seqid; ix++) 757 total += ani_free_pool[ix].ani_count; 758 759 last_time = new_time; 760 k_anoninfo.ani_free = total; 761 } 762 } 763 764 /* 765 * Reserve anon space. 766 * 767 * It's no longer simply a matter of incrementing ani_resv to 768 * reserve swap space, we need to check memory-based as well 769 * as disk-backed (physical) swap. The following algorithm 770 * is used: 771 * Check the space on physical swap 772 * i.e. 
amount needed < ani_max - ani_phys_resv 773 * If we are swapping on swapfs check 774 * amount needed < (availrmem - swapfs_minfree) 775 * Since the algorithm to check for the quantity of swap space is 776 * almost the same as that for reserving it, we'll just use anon_resvmem 777 * with a flag to decrement availrmem. 778 * 779 * Return non-zero on success. 780 */ 781 int 782 anon_resvmem(size_t size, boolean_t takemem, zone_t *zone, int tryhard) 783 { 784 pgcnt_t npages = btopr(size); 785 pgcnt_t mswap_pages = 0; 786 pgcnt_t pswap_pages = 0; 787 proc_t *p = curproc; 788 789 if (zone != NULL) { 790 /* test zone.max-swap resource control */ 791 mutex_enter(&p->p_lock); 792 if (rctl_incr_swap(p, zone, ptob(npages)) != 0) { 793 mutex_exit(&p->p_lock); 794 795 if (takemem) 796 atomic_add_64(&zone->zone_anon_alloc_fail, 1); 797 798 return (0); 799 } 800 801 if (!takemem) 802 rctl_decr_swap(zone, ptob(npages)); 803 804 mutex_exit(&p->p_lock); 805 } 806 mutex_enter(&anoninfo_lock); 807 808 /* 809 * pswap_pages is the number of pages we can take from 810 * physical (i.e. disk-backed) swap. 811 */ 812 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 813 pswap_pages = k_anoninfo.ani_max - k_anoninfo.ani_phys_resv; 814 815 ANON_PRINT(A_RESV, 816 ("anon_resvmem: npages %lu takemem %u pswap %lu caller %p\n", 817 npages, takemem, pswap_pages, (void *)caller())); 818 819 if (npages <= pswap_pages) { 820 /* 821 * we have enough space on a physical swap 822 */ 823 if (takemem) 824 k_anoninfo.ani_phys_resv += npages; 825 mutex_exit(&anoninfo_lock); 826 return (1); 827 } else if (pswap_pages != 0) { 828 /* 829 * we have some space on a physical swap 830 */ 831 if (takemem) { 832 /* 833 * use up remainder of phys swap 834 */ 835 k_anoninfo.ani_phys_resv += pswap_pages; 836 ASSERT(k_anoninfo.ani_phys_resv == k_anoninfo.ani_max); 837 } 838 } 839 /* 840 * since (npages > pswap_pages) we need mem swap 841 * mswap_pages is the number of pages needed from availrmem 842 */ 843 ASSERT(npages > pswap_pages); 844 mswap_pages = npages - pswap_pages; 845 846 ANON_PRINT(A_RESV, ("anon_resvmem: need %ld pages from memory\n", 847 mswap_pages)); 848 849 /* 850 * priv processes can reserve memory as swap as long as availrmem 851 * remains greater than swapfs_minfree; in the case of non-priv 852 * processes, memory can be reserved as swap only if availrmem 853 * doesn't fall below (swapfs_minfree + swapfs_reserve). Thus, 854 * swapfs_reserve amount of memswap is not available to non-priv 855 * processes. This protects daemons such as automounter dying 856 * as a result of application processes eating away almost entire 857 * membased swap. This safeguard becomes useless if apps are run 858 * with root access. 859 * 860 * swapfs_reserve is minimum of 4Mb or 1/16 of physmem. 861 * 862 */ 863 if (tryhard) { 864 pgcnt_t floor_pages; 865 866 if (secpolicy_resource_anon_mem(CRED())) { 867 floor_pages = swapfs_minfree; 868 } else { 869 floor_pages = swapfs_minfree + swapfs_reserve; 870 } 871 872 mutex_exit(&anoninfo_lock); 873 (void) page_reclaim_mem(mswap_pages, floor_pages, 0); 874 mutex_enter(&anoninfo_lock); 875 } 876 877 mutex_enter(&freemem_lock); 878 if (availrmem > (swapfs_minfree + swapfs_reserve + mswap_pages) || 879 (availrmem > (swapfs_minfree + mswap_pages) && 880 secpolicy_resource(CRED()) == 0)) { 881 882 if (takemem) { 883 /* 884 * Take the memory from the rest of the system. 
885 */ 886 availrmem -= mswap_pages; 887 mutex_exit(&freemem_lock); 888 k_anoninfo.ani_mem_resv += mswap_pages; 889 ANI_ADD(mswap_pages); 890 ANON_PRINT((A_RESV | A_MRESV), 891 ("anon_resvmem: took %ld pages of availrmem\n", 892 mswap_pages)); 893 } else { 894 mutex_exit(&freemem_lock); 895 } 896 897 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 898 mutex_exit(&anoninfo_lock); 899 return (1); 900 } else { 901 /* 902 * Fail if not enough memory 903 */ 904 if (takemem) { 905 k_anoninfo.ani_phys_resv -= pswap_pages; 906 } 907 908 mutex_exit(&freemem_lock); 909 mutex_exit(&anoninfo_lock); 910 ANON_PRINT(A_RESV, 911 ("anon_resvmem: not enough space from swapfs\n")); 912 if (zone != NULL && takemem) 913 rctl_decr_swap(zone, ptob(npages)); 914 return (0); 915 } 916 } 917 918 /* 919 * Give back an anon reservation. 920 */ 921 void 922 anon_unresvmem(size_t size, zone_t *zone) 923 { 924 pgcnt_t npages = btopr(size); 925 spgcnt_t mem_free_pages = 0; 926 pgcnt_t phys_free_slots; 927 #ifdef ANON_DEBUG 928 pgcnt_t mem_resv; 929 #endif 930 if (zone != NULL) 931 rctl_decr_swap(zone, ptob(npages)); 932 933 mutex_enter(&anoninfo_lock); 934 935 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 936 937 /* 938 * If some of this reservation belonged to swapfs 939 * give it back to availrmem. 940 * ani_mem_resv is the amount of availrmem swapfs has reserved. 941 * but some of that memory could be locked by segspt so we can only 942 * return non locked ani_mem_resv back to availrmem 943 */ 944 if (k_anoninfo.ani_mem_resv > k_anoninfo.ani_locked_swap) { 945 ANON_PRINT((A_RESV | A_MRESV), 946 ("anon_unresv: growing availrmem by %ld pages\n", 947 MIN(k_anoninfo.ani_mem_resv, npages))); 948 949 mem_free_pages = MIN((spgcnt_t)(k_anoninfo.ani_mem_resv - 950 k_anoninfo.ani_locked_swap), npages); 951 mutex_enter(&freemem_lock); 952 availrmem += mem_free_pages; 953 mutex_exit(&freemem_lock); 954 k_anoninfo.ani_mem_resv -= mem_free_pages; 955 956 ANI_ADD(-mem_free_pages); 957 } 958 /* 959 * The remainder of the pages is returned to phys swap 960 */ 961 ASSERT(npages >= mem_free_pages); 962 phys_free_slots = npages - mem_free_pages; 963 964 if (phys_free_slots) { 965 k_anoninfo.ani_phys_resv -= phys_free_slots; 966 } 967 968 #ifdef ANON_DEBUG 969 mem_resv = k_anoninfo.ani_mem_resv; 970 #endif 971 972 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 973 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 974 975 mutex_exit(&anoninfo_lock); 976 977 ANON_PRINT(A_RESV, ("anon_unresv: %lu, tot %lu, caller %p\n", 978 npages, mem_resv, (void *)caller())); 979 } 980 981 /* 982 * Allocate an anon slot and return it with the lock held. 983 */ 984 struct anon * 985 anon_alloc(struct vnode *vp, anoff_t off) 986 { 987 struct anon *ap; 988 kmutex_t *ahm; 989 990 ap = kmem_cache_alloc(anon_cache, KM_SLEEP); 991 if (vp == NULL) { 992 swap_alloc(ap); 993 } else { 994 ap->an_vp = vp; 995 ap->an_off = off; 996 } 997 ap->an_refcnt = 1; 998 ap->an_pvp = NULL; 999 ap->an_poff = 0; 1000 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1001 mutex_enter(ahm); 1002 anon_addhash(ap); 1003 mutex_exit(ahm); 1004 ANI_ADD(-1); 1005 ANON_PRINT(A_ANON, ("anon_alloc: returning ap %p, vp %p\n", 1006 (void *)ap, (ap ? (void *)ap->an_vp : NULL))); 1007 return (ap); 1008 } 1009 1010 /* 1011 * Called for pages locked in memory via softlock/pagelock/mlock to make sure 1012 * such pages don't consume any physical swap resources needed for swapping 1013 * unlocked pages. 
1014 */ 1015 void 1016 anon_swap_free(struct anon *ap, page_t *pp) 1017 { 1018 kmutex_t *ahm; 1019 1020 ASSERT(ap != NULL); 1021 ASSERT(pp != NULL); 1022 ASSERT(PAGE_LOCKED(pp)); 1023 ASSERT(pp->p_vnode != NULL); 1024 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 1025 ASSERT(ap->an_refcnt != 0); 1026 ASSERT(pp->p_vnode == ap->an_vp); 1027 ASSERT(pp->p_offset == ap->an_off); 1028 1029 if (ap->an_pvp == NULL) 1030 return; 1031 1032 page_io_lock(pp); 1033 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1034 mutex_enter(ahm); 1035 1036 ASSERT(ap->an_refcnt != 0); 1037 ASSERT(pp->p_vnode == ap->an_vp); 1038 ASSERT(pp->p_offset == ap->an_off); 1039 1040 if (ap->an_pvp != NULL) { 1041 swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE); 1042 ap->an_pvp = NULL; 1043 ap->an_poff = 0; 1044 mutex_exit(ahm); 1045 hat_setmod(pp); 1046 } else { 1047 mutex_exit(ahm); 1048 } 1049 page_io_unlock(pp); 1050 } 1051 1052 /* 1053 * Decrement the reference count of an anon page. 1054 * If reference count goes to zero, free it and 1055 * its associated page (if any). 1056 */ 1057 void 1058 anon_decref(struct anon *ap) 1059 { 1060 page_t *pp; 1061 struct vnode *vp; 1062 anoff_t off; 1063 kmutex_t *ahm; 1064 1065 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1066 mutex_enter(ahm); 1067 ASSERT(ap->an_refcnt != 0); 1068 if (ap->an_refcnt == 0) 1069 panic("anon_decref: slot count 0"); 1070 if (--ap->an_refcnt == 0) { 1071 swap_xlate(ap, &vp, &off); 1072 anon_rmhash(ap); 1073 if (ap->an_pvp != NULL) 1074 swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE); 1075 mutex_exit(ahm); 1076 1077 /* 1078 * If there is a page for this anon slot we will need to 1079 * call VN_DISPOSE to get rid of the vp association and 1080 * put the page back on the free list as really free. 1081 * Acquire the "exclusive" lock to ensure that any 1082 * pending i/o always completes before the swap slot 1083 * is freed. 1084 */ 1085 pp = page_lookup(vp, (u_offset_t)off, SE_EXCL); 1086 if (pp != NULL) { 1087 /*LINTED: constant in conditional context */ 1088 VN_DISPOSE(pp, B_INVAL, 0, kcred); 1089 } 1090 ANON_PRINT(A_ANON, ("anon_decref: free ap %p, vp %p\n", 1091 (void *)ap, (void *)ap->an_vp)); 1092 1093 kmem_cache_free(anon_cache, ap); 1094 1095 ANI_ADD(1); 1096 } else { 1097 mutex_exit(ahm); 1098 } 1099 } 1100 1101 1102 /* 1103 * check an_refcnt of the root anon slot (anon_index argument is aligned at 1104 * seg->s_szc level) to determine whether COW processing is required. 1105 * anonpages_hash_lock[] held on the root ap ensures that if root's 1106 * refcnt is 1 all other refcnt's are 1 as well (and they can't increase 1107 * later since this process can't fork while its AS lock is held). 1108 * 1109 * returns 1 if the root anon slot has a refcnt > 1 otherwise returns 0. 1110 */ 1111 int 1112 anon_szcshare(struct anon_hdr *ahp, ulong_t anon_index) 1113 { 1114 struct anon *ap; 1115 kmutex_t *ahmpages = NULL; 1116 1117 ap = anon_get_ptr(ahp, anon_index); 1118 if (ap == NULL) 1119 return (0); 1120 1121 ahmpages = APH_MUTEX(ap->an_vp, ap->an_off); 1122 mutex_enter(ahmpages); 1123 ASSERT(ap->an_refcnt >= 1); 1124 if (ap->an_refcnt == 1) { 1125 mutex_exit(ahmpages); 1126 return (0); 1127 } 1128 mutex_exit(ahmpages); 1129 return (1); 1130 } 1131 /* 1132 * Check 'nslots' anon slots for refcnt > 1. 1133 * 1134 * returns 1 if any of the 'nslots' anon slots has a refcnt > 1 otherwise 1135 * returns 0. 
1136 */ 1137 static int 1138 anon_share(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots) 1139 { 1140 struct anon *ap; 1141 1142 while (nslots-- > 0) { 1143 if ((ap = anon_get_ptr(ahp, anon_index)) != NULL && 1144 ap->an_refcnt > 1) 1145 return (1); 1146 anon_index++; 1147 } 1148 1149 return (0); 1150 } 1151 1152 static void 1153 anon_decref_pages( 1154 struct anon_hdr *ahp, 1155 ulong_t an_idx, 1156 uint_t szc) 1157 { 1158 struct anon *ap = anon_get_ptr(ahp, an_idx); 1159 kmutex_t *ahmpages = NULL; 1160 page_t *pp; 1161 pgcnt_t pgcnt = page_get_pagecnt(szc); 1162 pgcnt_t i; 1163 struct vnode *vp; 1164 anoff_t off; 1165 kmutex_t *ahm; 1166 #ifdef DEBUG 1167 int refcnt = 1; 1168 #endif 1169 1170 ASSERT(szc != 0); 1171 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1172 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1173 ASSERT(an_idx < ahp->size); 1174 1175 if (ahp->size - an_idx < pgcnt) { 1176 /* 1177 * In case of shared mappings total anon map size may not be 1178 * the largest page size aligned. 1179 */ 1180 pgcnt = ahp->size - an_idx; 1181 } 1182 1183 VM_STAT_ADD(anonvmstats.decrefpages[0]); 1184 1185 if (ap != NULL) { 1186 ahmpages = APH_MUTEX(ap->an_vp, ap->an_off); 1187 mutex_enter(ahmpages); 1188 ASSERT((refcnt = ap->an_refcnt) != 0); 1189 VM_STAT_ADD(anonvmstats.decrefpages[1]); 1190 if (ap->an_refcnt == 1) { 1191 VM_STAT_ADD(anonvmstats.decrefpages[2]); 1192 ASSERT(!anon_share(ahp, an_idx, pgcnt)); 1193 mutex_exit(ahmpages); 1194 ahmpages = NULL; 1195 } 1196 } 1197 1198 i = 0; 1199 while (i < pgcnt) { 1200 if ((ap = anon_get_ptr(ahp, an_idx + i)) == NULL) { 1201 ASSERT(refcnt == 1 && ahmpages == NULL); 1202 i++; 1203 continue; 1204 } 1205 ASSERT(ap->an_refcnt == refcnt); 1206 ASSERT(ahmpages != NULL || ap->an_refcnt == 1); 1207 ASSERT(ahmpages == NULL || ap->an_refcnt > 1); 1208 1209 if (ahmpages == NULL) { 1210 swap_xlate(ap, &vp, &off); 1211 pp = page_lookup(vp, (u_offset_t)off, SE_EXCL); 1212 if (pp == NULL || pp->p_szc == 0) { 1213 VM_STAT_ADD(anonvmstats.decrefpages[3]); 1214 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1215 (void) anon_set_ptr(ahp, an_idx + i, NULL, 1216 ANON_SLEEP); 1217 mutex_enter(ahm); 1218 ap->an_refcnt--; 1219 ASSERT(ap->an_refcnt == 0); 1220 anon_rmhash(ap); 1221 if (ap->an_pvp) 1222 swap_phys_free(ap->an_pvp, ap->an_poff, 1223 PAGESIZE); 1224 mutex_exit(ahm); 1225 if (pp == NULL) { 1226 pp = page_lookup(vp, (u_offset_t)off, 1227 SE_EXCL); 1228 ASSERT(pp == NULL || pp->p_szc == 0); 1229 } 1230 if (pp != NULL) { 1231 VM_STAT_ADD(anonvmstats.decrefpages[4]); 1232 /*LINTED*/ 1233 VN_DISPOSE(pp, B_INVAL, 0, kcred); 1234 } 1235 kmem_cache_free(anon_cache, ap); 1236 ANI_ADD(1); 1237 i++; 1238 } else { 1239 pgcnt_t j; 1240 pgcnt_t curpgcnt = 1241 page_get_pagecnt(pp->p_szc); 1242 size_t ppasize = curpgcnt * sizeof (page_t *); 1243 page_t **ppa = kmem_alloc(ppasize, KM_SLEEP); 1244 int dispose = 0; 1245 1246 VM_STAT_ADD(anonvmstats.decrefpages[5]); 1247 1248 ASSERT(pp->p_szc <= szc); 1249 ASSERT(IS_P2ALIGNED(curpgcnt, curpgcnt)); 1250 ASSERT(IS_P2ALIGNED(i, curpgcnt)); 1251 ASSERT(i + curpgcnt <= pgcnt); 1252 ASSERT(!(page_pptonum(pp) & (curpgcnt - 1))); 1253 ppa[0] = pp; 1254 for (j = i + 1; j < i + curpgcnt; j++) { 1255 ap = anon_get_ptr(ahp, an_idx + j); 1256 ASSERT(ap != NULL && 1257 ap->an_refcnt == 1); 1258 swap_xlate(ap, &vp, &off); 1259 pp = page_lookup(vp, (u_offset_t)off, 1260 SE_EXCL); 1261 if (pp == NULL) 1262 panic("anon_decref_pages: " 1263 "no page"); 1264 1265 (void) hat_pageunload(pp, 1266 HAT_FORCE_PGUNLOAD); 1267 ASSERT(pp->p_szc == ppa[0]->p_szc); 1268 
ASSERT(page_pptonum(pp) - 1 == 1269 page_pptonum(ppa[j - i - 1])); 1270 ppa[j - i] = pp; 1271 if (ap->an_pvp != NULL && 1272 !vn_matchopval(ap->an_pvp, 1273 VOPNAME_DISPOSE, 1274 (fs_generic_func_p)(uintptr_t) 1275 fs_dispose)) 1276 dispose = 1; 1277 } 1278 for (j = i; j < i + curpgcnt; j++) { 1279 ap = anon_get_ptr(ahp, an_idx + j); 1280 ASSERT(ap != NULL && 1281 ap->an_refcnt == 1); 1282 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1283 (void) anon_set_ptr(ahp, an_idx + j, 1284 NULL, ANON_SLEEP); 1285 mutex_enter(ahm); 1286 ap->an_refcnt--; 1287 ASSERT(ap->an_refcnt == 0); 1288 anon_rmhash(ap); 1289 if (ap->an_pvp) 1290 swap_phys_free(ap->an_pvp, 1291 ap->an_poff, PAGESIZE); 1292 mutex_exit(ahm); 1293 kmem_cache_free(anon_cache, ap); 1294 ANI_ADD(1); 1295 } 1296 if (!dispose) { 1297 VM_STAT_ADD(anonvmstats.decrefpages[6]); 1298 page_destroy_pages(ppa[0]); 1299 } else { 1300 VM_STAT_ADD(anonvmstats.decrefpages[7]); 1301 for (j = 0; j < curpgcnt; j++) { 1302 ASSERT(PAGE_EXCL(ppa[j])); 1303 ppa[j]->p_szc = 0; 1304 } 1305 for (j = 0; j < curpgcnt; j++) { 1306 ASSERT(!hat_page_is_mapped( 1307 ppa[j])); 1308 /*LINTED*/ 1309 VN_DISPOSE(ppa[j], B_INVAL, 0, 1310 kcred); 1311 } 1312 } 1313 kmem_free(ppa, ppasize); 1314 i += curpgcnt; 1315 } 1316 } else { 1317 VM_STAT_ADD(anonvmstats.decrefpages[8]); 1318 (void) anon_set_ptr(ahp, an_idx + i, NULL, ANON_SLEEP); 1319 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1320 mutex_enter(ahm); 1321 ap->an_refcnt--; 1322 mutex_exit(ahm); 1323 i++; 1324 } 1325 } 1326 1327 if (ahmpages != NULL) { 1328 mutex_exit(ahmpages); 1329 } 1330 } 1331 1332 /* 1333 * Duplicate references to size bytes worth of anon pages. 1334 * Used when duplicating a segment that contains private anon pages. 1335 * This code assumes that procedure calling this one has already used 1336 * hat_chgprot() to disable write access to the range of addresses that 1337 * that *old actually refers to. 1338 */ 1339 void 1340 anon_dup(struct anon_hdr *old, ulong_t old_idx, struct anon_hdr *new, 1341 ulong_t new_idx, size_t size) 1342 { 1343 spgcnt_t npages; 1344 kmutex_t *ahm; 1345 struct anon *ap; 1346 ulong_t off; 1347 ulong_t index; 1348 1349 npages = btopr(size); 1350 while (npages > 0) { 1351 index = old_idx; 1352 if ((ap = anon_get_next_ptr(old, &index)) == NULL) 1353 break; 1354 1355 ASSERT(!ANON_ISBUSY(anon_get_slot(old, index))); 1356 off = index - old_idx; 1357 npages -= off; 1358 if (npages <= 0) 1359 break; 1360 1361 (void) anon_set_ptr(new, new_idx + off, ap, ANON_SLEEP); 1362 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1363 1364 mutex_enter(ahm); 1365 ap->an_refcnt++; 1366 mutex_exit(ahm); 1367 1368 off++; 1369 new_idx += off; 1370 old_idx += off; 1371 npages--; 1372 } 1373 } 1374 1375 /* 1376 * Just like anon_dup but also guarantees there are no holes (unallocated anon 1377 * slots) within any large page region. That means if a large page region is 1378 * empty in the old array it will skip it. If there are 1 or more valid slots 1379 * in the large page region of the old array it will make sure to fill in any 1380 * unallocated ones and also copy them to the new array. If noalloc is 1 large 1381 * page region should either have no valid anon slots or all slots should be 1382 * valid. 
1383 */ 1384 void 1385 anon_dup_fill_holes( 1386 struct anon_hdr *old, 1387 ulong_t old_idx, 1388 struct anon_hdr *new, 1389 ulong_t new_idx, 1390 size_t size, 1391 uint_t szc, 1392 int noalloc) 1393 { 1394 struct anon *ap; 1395 spgcnt_t npages; 1396 kmutex_t *ahm, *ahmpages = NULL; 1397 pgcnt_t pgcnt, i; 1398 ulong_t index, off; 1399 #ifdef DEBUG 1400 int refcnt; 1401 #endif 1402 1403 ASSERT(szc != 0); 1404 pgcnt = page_get_pagecnt(szc); 1405 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1406 npages = btopr(size); 1407 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1408 ASSERT(IS_P2ALIGNED(old_idx, pgcnt)); 1409 1410 VM_STAT_ADD(anonvmstats.dupfillholes[0]); 1411 1412 while (npages > 0) { 1413 index = old_idx; 1414 1415 /* 1416 * Find the next valid slot. 1417 */ 1418 if (anon_get_next_ptr(old, &index) == NULL) 1419 break; 1420 1421 ASSERT(!ANON_ISBUSY(anon_get_slot(old, index))); 1422 /* 1423 * Now backup index to the beginning of the 1424 * current large page region of the old array. 1425 */ 1426 index = P2ALIGN(index, pgcnt); 1427 off = index - old_idx; 1428 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1429 npages -= off; 1430 if (npages <= 0) 1431 break; 1432 1433 /* 1434 * Fill and copy a large page regions worth 1435 * of anon slots. 1436 */ 1437 for (i = 0; i < pgcnt; i++) { 1438 if ((ap = anon_get_ptr(old, index + i)) == NULL) { 1439 if (noalloc) { 1440 panic("anon_dup_fill_holes: " 1441 "empty anon slot\n"); 1442 } 1443 VM_STAT_ADD(anonvmstats.dupfillholes[1]); 1444 ap = anon_alloc(NULL, 0); 1445 (void) anon_set_ptr(old, index + i, ap, 1446 ANON_SLEEP); 1447 } else if (i == 0) { 1448 /* 1449 * make the increment of all refcnts of all 1450 * anon slots of a large page appear atomic by 1451 * getting an anonpages_hash_lock for the 1452 * first anon slot of a large page. 1453 */ 1454 VM_STAT_ADD(anonvmstats.dupfillholes[2]); 1455 1456 ahmpages = APH_MUTEX(ap->an_vp, ap->an_off); 1457 mutex_enter(ahmpages); 1458 /*LINTED*/ 1459 ASSERT(refcnt = ap->an_refcnt); 1460 1461 VM_STAT_COND_ADD(ap->an_refcnt > 1, 1462 anonvmstats.dupfillholes[3]); 1463 } 1464 (void) anon_set_ptr(new, new_idx + off + i, ap, 1465 ANON_SLEEP); 1466 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1467 mutex_enter(ahm); 1468 ASSERT(ahmpages != NULL || ap->an_refcnt == 1); 1469 ASSERT(i == 0 || ahmpages == NULL || 1470 refcnt == ap->an_refcnt); 1471 ap->an_refcnt++; 1472 mutex_exit(ahm); 1473 } 1474 if (ahmpages != NULL) { 1475 mutex_exit(ahmpages); 1476 ahmpages = NULL; 1477 } 1478 off += pgcnt; 1479 new_idx += off; 1480 old_idx += off; 1481 npages -= pgcnt; 1482 } 1483 } 1484 1485 /* 1486 * Used when a segment with a vnode changes szc. similarly to 1487 * anon_dup_fill_holes() makes sure each large page region either has no anon 1488 * slots or all of them. but new slots are created by COWing the file 1489 * pages. on entrance no anon slots should be shared. 
1490 */ 1491 int 1492 anon_fill_cow_holes( 1493 struct seg *seg, 1494 caddr_t addr, 1495 struct anon_hdr *ahp, 1496 ulong_t an_idx, 1497 struct vnode *vp, 1498 u_offset_t vp_off, 1499 size_t size, 1500 uint_t szc, 1501 uint_t prot, 1502 struct vpage vpage[], 1503 struct cred *cred) 1504 { 1505 struct anon *ap; 1506 spgcnt_t npages; 1507 pgcnt_t pgcnt, i; 1508 ulong_t index, off; 1509 int err = 0; 1510 int pageflags = 0; 1511 1512 ASSERT(szc != 0); 1513 pgcnt = page_get_pagecnt(szc); 1514 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1515 npages = btopr(size); 1516 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1517 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1518 1519 while (npages > 0) { 1520 index = an_idx; 1521 1522 /* 1523 * Find the next valid slot. 1524 */ 1525 if (anon_get_next_ptr(ahp, &index) == NULL) { 1526 break; 1527 } 1528 1529 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1530 /* 1531 * Now backup index to the beginning of the 1532 * current large page region of the anon array. 1533 */ 1534 index = P2ALIGN(index, pgcnt); 1535 off = index - an_idx; 1536 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1537 npages -= off; 1538 if (npages <= 0) 1539 break; 1540 an_idx += off; 1541 vp_off += ptob(off); 1542 addr += ptob(off); 1543 if (vpage != NULL) { 1544 vpage += off; 1545 } 1546 1547 for (i = 0; i < pgcnt; i++, an_idx++, vp_off += PAGESIZE) { 1548 if ((ap = anon_get_ptr(ahp, an_idx)) == NULL) { 1549 page_t *pl[1 + 1]; 1550 page_t *pp; 1551 1552 err = VOP_GETPAGE(vp, vp_off, PAGESIZE, NULL, 1553 pl, PAGESIZE, seg, addr, S_READ, cred, 1554 NULL); 1555 if (err) { 1556 break; 1557 } 1558 if (vpage != NULL) { 1559 prot = VPP_PROT(vpage); 1560 pageflags = VPP_ISPPLOCK(vpage) ? 1561 LOCK_PAGE : 0; 1562 } 1563 pp = anon_private(&ap, seg, addr, prot, pl[0], 1564 pageflags, cred); 1565 if (pp == NULL) { 1566 err = ENOMEM; 1567 break; 1568 } 1569 (void) anon_set_ptr(ahp, an_idx, ap, 1570 ANON_SLEEP); 1571 page_unlock(pp); 1572 } 1573 ASSERT(ap->an_refcnt == 1); 1574 addr += PAGESIZE; 1575 if (vpage != NULL) { 1576 vpage++; 1577 } 1578 } 1579 npages -= pgcnt; 1580 } 1581 1582 return (err); 1583 } 1584 1585 /* 1586 * Free a group of "size" anon pages, size in bytes, 1587 * and clear out the pointers to the anon entries. 1588 */ 1589 void 1590 anon_free(struct anon_hdr *ahp, ulong_t index, size_t size) 1591 { 1592 spgcnt_t npages; 1593 struct anon *ap; 1594 ulong_t old; 1595 1596 npages = btopr(size); 1597 1598 while (npages > 0) { 1599 old = index; 1600 if ((ap = anon_get_next_ptr(ahp, &index)) == NULL) 1601 break; 1602 1603 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1604 npages -= index - old; 1605 if (npages <= 0) 1606 break; 1607 1608 (void) anon_set_ptr(ahp, index, NULL, ANON_SLEEP); 1609 anon_decref(ap); 1610 /* 1611 * Bump index and decrement page count 1612 */ 1613 index++; 1614 npages--; 1615 } 1616 } 1617 1618 void 1619 anon_free_pages( 1620 struct anon_hdr *ahp, 1621 ulong_t an_idx, 1622 size_t size, 1623 uint_t szc) 1624 { 1625 spgcnt_t npages; 1626 pgcnt_t pgcnt; 1627 ulong_t index, off; 1628 1629 ASSERT(szc != 0); 1630 pgcnt = page_get_pagecnt(szc); 1631 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1632 npages = btopr(size); 1633 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1634 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1635 ASSERT(an_idx < ahp->size); 1636 1637 VM_STAT_ADD(anonvmstats.freepages[0]); 1638 1639 while (npages > 0) { 1640 index = an_idx; 1641 1642 /* 1643 * Find the next valid slot. 
                 */
                if (anon_get_next_ptr(ahp, &index) == NULL)
                        break;

                ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index)));
                /*
                 * Now backup index to the beginning of the
                 * current large page region of the old array.
                 */
                index = P2ALIGN(index, pgcnt);
                off = index - an_idx;
                ASSERT(IS_P2ALIGNED(off, pgcnt));
                npages -= off;
                if (npages <= 0)
                        break;

                anon_decref_pages(ahp, index, szc);

                off += pgcnt;
                an_idx += off;
                npages -= pgcnt;
        }
}

/*
 * Make anonymous pages discardable
 */
int
anon_disclaim(struct anon_map *amp, ulong_t index, size_t size,
    uint_t behav, pgcnt_t *purged)
{
        spgcnt_t npages = btopr(size);
        struct anon *ap;
        struct vnode *vp;
        anoff_t off;
        page_t *pp, *root_pp;
        kmutex_t *ahm;
        pgcnt_t pgcnt, npurged = 0;
        ulong_t old_idx, idx, i;
        struct anon_hdr *ahp = amp->ahp;
        anon_sync_obj_t cookie;
        int err = 0;

        VERIFY(behav == MADV_FREE || behav == MADV_PURGE);
        ASSERT(RW_READ_HELD(&amp->a_rwlock));
        pgcnt = 1;
        for (; npages > 0; index = (pgcnt == 1) ? index + 1 :
            P2ROUNDUP(index + 1, pgcnt), npages -= pgcnt) {

                /*
                 * get anon pointer and index for the first valid entry
                 * in the anon list, starting from "index"
                 */
                old_idx = index;
                if ((ap = anon_get_next_ptr(ahp, &index)) == NULL)
                        break;

                /*
                 * decrement npages by number of NULL anon slots we skipped
                 */
                npages -= index - old_idx;
                if (npages <= 0)
                        break;

                anon_array_enter(amp, index, &cookie);
                ap = anon_get_ptr(ahp, index);
                ASSERT(ap != NULL);

                /*
                 * Get anonymous page and try to lock it SE_EXCL;
                 * if we couldn't grab the lock we skip to next page.
                 */
                swap_xlate(ap, &vp, &off);
                pp = page_lookup_nowait(vp, (u_offset_t)off, SE_EXCL);
                if (pp == NULL) {
                        segadvstat.MADV_FREE_miss.value.ul++;
                        pgcnt = 1;
                        anon_array_exit(&cookie);
                        continue;
                }
                pgcnt = page_get_pagecnt(pp->p_szc);

                /*
                 * we cannot free a page which is permanently locked.
                 * The page_struct_lock need not be acquired to examine
                 * these fields since the page has an "exclusive" lock.
                 */
                if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
                        page_unlock(pp);
                        segadvstat.MADV_FREE_miss.value.ul++;
                        anon_array_exit(&cookie);
                        err = EBUSY;
                        continue;
                }

                ahm = AH_MUTEX(vp, off);
                mutex_enter(ahm);
                ASSERT(ap->an_refcnt != 0);
                /*
                 * skip this one if copy-on-write is not yet broken.
                 */
                if (ap->an_refcnt > 1) {
                        mutex_exit(ahm);
                        page_unlock(pp);
                        segadvstat.MADV_FREE_miss.value.ul++;
                        anon_array_exit(&cookie);
                        continue;
                }

                if (behav == MADV_PURGE && pp->p_szc != 0) {
                        /*
                         * If we're purging and we have a large page, simplify
                         * things a bit by demoting ourselves into the base
                         * page case.
                         */
                        (void) page_try_demote_pages(pp);
                }

                if (pp->p_szc == 0) {
                        pgcnt = 1;

                        /*
                         * free swap slot;
                         */
                        if (ap->an_pvp) {
                                swap_phys_free(ap->an_pvp, ap->an_poff,
                                    PAGESIZE);
                                ap->an_pvp = NULL;
                                ap->an_poff = 0;
                        }

                        if (behav == MADV_PURGE) {
                                /*
                                 * If we're purging (instead of merely freeing),
                                 * rip out this anon structure entirely to
                                 * assure that any subsequent fault pulls from
                                 * the backing vnode (if any).
1781 */ 1782 if (--ap->an_refcnt == 0) 1783 anon_rmhash(ap); 1784 1785 mutex_exit(ahm); 1786 (void) anon_set_ptr(ahp, index, 1787 NULL, ANON_SLEEP); 1788 npurged++; 1789 ANI_ADD(1); 1790 kmem_cache_free(anon_cache, ap); 1791 } else { 1792 mutex_exit(ahm); 1793 } 1794 1795 segadvstat.MADV_FREE_hit.value.ul++; 1796 1797 /* 1798 * while we are at it, unload all the translations 1799 * and attempt to free the page. 1800 */ 1801 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1802 /*LINTED: constant in conditional context */ 1803 VN_DISPOSE(pp, 1804 behav == MADV_FREE ? B_FREE : B_INVAL, 0, kcred); 1805 1806 anon_array_exit(&cookie); 1807 continue; 1808 } 1809 1810 pgcnt = page_get_pagecnt(pp->p_szc); 1811 if (!IS_P2ALIGNED(index, pgcnt) || npages < pgcnt) { 1812 if (!page_try_demote_pages(pp)) { 1813 mutex_exit(ahm); 1814 page_unlock(pp); 1815 segadvstat.MADV_FREE_miss.value.ul++; 1816 anon_array_exit(&cookie); 1817 err = EBUSY; 1818 continue; 1819 } else { 1820 pgcnt = 1; 1821 if (ap->an_pvp) { 1822 swap_phys_free(ap->an_pvp, 1823 ap->an_poff, PAGESIZE); 1824 ap->an_pvp = NULL; 1825 ap->an_poff = 0; 1826 } 1827 mutex_exit(ahm); 1828 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1829 /*LINTED*/ 1830 VN_DISPOSE(pp, B_FREE, 0, kcred); 1831 segadvstat.MADV_FREE_hit.value.ul++; 1832 anon_array_exit(&cookie); 1833 continue; 1834 } 1835 } 1836 mutex_exit(ahm); 1837 root_pp = pp; 1838 1839 /* 1840 * try to lock remaining pages 1841 */ 1842 for (idx = 1; idx < pgcnt; idx++) { 1843 pp++; 1844 if (!page_trylock(pp, SE_EXCL)) 1845 break; 1846 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1847 page_unlock(pp); 1848 break; 1849 } 1850 } 1851 1852 if (idx == pgcnt) { 1853 for (i = 0; i < pgcnt; i++) { 1854 ap = anon_get_ptr(ahp, index + i); 1855 if (ap == NULL) 1856 break; 1857 swap_xlate(ap, &vp, &off); 1858 ahm = AH_MUTEX(vp, off); 1859 mutex_enter(ahm); 1860 ASSERT(ap->an_refcnt != 0); 1861 1862 /* 1863 * skip this one if copy-on-write 1864 * is not yet broken. 1865 */ 1866 if (ap->an_refcnt > 1) { 1867 mutex_exit(ahm); 1868 goto skiplp; 1869 } 1870 if (ap->an_pvp) { 1871 swap_phys_free(ap->an_pvp, 1872 ap->an_poff, PAGESIZE); 1873 ap->an_pvp = NULL; 1874 ap->an_poff = 0; 1875 } 1876 mutex_exit(ahm); 1877 } 1878 page_destroy_pages(root_pp); 1879 segadvstat.MADV_FREE_hit.value.ul += pgcnt; 1880 anon_array_exit(&cookie); 1881 continue; 1882 } 1883 skiplp: 1884 segadvstat.MADV_FREE_miss.value.ul += pgcnt; 1885 for (i = 0, pp = root_pp; i < idx; pp++, i++) 1886 page_unlock(pp); 1887 anon_array_exit(&cookie); 1888 } 1889 1890 if (purged != NULL) 1891 *purged = npurged; 1892 1893 return (err); 1894 } 1895 1896 /* 1897 * Return the kept page(s) and protections back to the segment driver. 1898 */ 1899 int 1900 anon_getpage( 1901 struct anon **app, 1902 uint_t *protp, 1903 page_t *pl[], 1904 size_t plsz, 1905 struct seg *seg, 1906 caddr_t addr, 1907 enum seg_rw rw, 1908 struct cred *cred) 1909 { 1910 page_t *pp; 1911 struct anon *ap = *app; 1912 struct vnode *vp; 1913 anoff_t off; 1914 int err; 1915 kmutex_t *ahm; 1916 1917 swap_xlate(ap, &vp, &off); 1918 1919 /* 1920 * Lookup the page. If page is being paged in, 1921 * wait for it to finish as we must return a list of 1922 * pages since this routine acts like the VOP_GETPAGE 1923 * routine does. 
1924 */ 1925 if (pl != NULL && (pp = page_lookup(vp, (u_offset_t)off, SE_SHARED))) { 1926 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1927 mutex_enter(ahm); 1928 if (ap->an_refcnt == 1) 1929 *protp = PROT_ALL; 1930 else 1931 *protp = PROT_ALL & ~PROT_WRITE; 1932 mutex_exit(ahm); 1933 pl[0] = pp; 1934 pl[1] = NULL; 1935 return (0); 1936 } 1937 1938 /* 1939 * Simply treat it as a vnode fault on the anon vp. 1940 */ 1941 1942 TRACE_3(TR_FAC_VM, TR_ANON_GETPAGE, 1943 "anon_getpage:seg %x addr %x vp %x", 1944 seg, addr, vp); 1945 1946 err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, protp, pl, plsz, 1947 seg, addr, rw, cred, NULL); 1948 1949 if (err == 0 && pl != NULL) { 1950 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1951 mutex_enter(ahm); 1952 if (ap->an_refcnt != 1) 1953 *protp &= ~PROT_WRITE; /* make read-only */ 1954 mutex_exit(ahm); 1955 } 1956 return (err); 1957 } 1958 1959 /* 1960 * Creates or returns kept pages to the segment driver. returns -1 if a large 1961 * page cannot be allocated. returns -2 if some other process has allocated a 1962 * larger page. 1963 * 1964 * For cowfault it will allocate any size pages to fill the requested area to 1965 * avoid partially overwriting anon slots (i.e. sharing only some of the anon 1966 * slots within a large page with other processes). This policy greatly 1967 * simplifies large page freeing (which is only freed when all anon slot 1968 * refcnts are 0). 1969 */ 1970 int 1971 anon_map_getpages( 1972 struct anon_map *amp, 1973 ulong_t start_idx, 1974 uint_t szc, 1975 struct seg *seg, 1976 caddr_t addr, 1977 uint_t prot, 1978 uint_t *protp, 1979 page_t *ppa[], 1980 uint_t *ppa_szc, 1981 struct vpage vpage[], 1982 enum seg_rw rw, 1983 int brkcow, 1984 int anypgsz, 1985 int pgflags, 1986 struct cred *cred) 1987 { 1988 pgcnt_t pgcnt; 1989 struct anon *ap; 1990 struct vnode *vp; 1991 anoff_t off; 1992 page_t *pp, *pl[2], *conpp = NULL; 1993 caddr_t vaddr; 1994 ulong_t pg_idx, an_idx, i; 1995 spgcnt_t nreloc = 0; 1996 int prealloc = 1; 1997 int err, slotcreate; 1998 uint_t vpprot; 1999 int upsize = (szc < seg->s_szc); 2000 2001 #if !defined(__x86) 2002 ASSERT(seg->s_szc != 0); 2003 #endif 2004 ASSERT(szc <= seg->s_szc); 2005 ASSERT(ppa_szc != NULL); 2006 ASSERT(rw != S_CREATE); 2007 2008 *protp = PROT_ALL; 2009 2010 VM_STAT_ADD(anonvmstats.getpages[0]); 2011 2012 if (szc == 0) { 2013 VM_STAT_ADD(anonvmstats.getpages[1]); 2014 if ((ap = anon_get_ptr(amp->ahp, start_idx)) != NULL) { 2015 err = anon_getpage(&ap, protp, pl, PAGESIZE, seg, 2016 addr, rw, cred); 2017 if (err) 2018 return (err); 2019 ppa[0] = pl[0]; 2020 if (brkcow == 0 || (*protp & PROT_WRITE)) { 2021 VM_STAT_ADD(anonvmstats.getpages[2]); 2022 if (ppa[0]->p_szc != 0 && upsize) { 2023 VM_STAT_ADD(anonvmstats.getpages[3]); 2024 *ppa_szc = MIN(ppa[0]->p_szc, 2025 seg->s_szc); 2026 page_unlock(ppa[0]); 2027 return (-2); 2028 } 2029 return (0); 2030 } 2031 panic("anon_map_getpages: cowfault for szc 0"); 2032 } else { 2033 VM_STAT_ADD(anonvmstats.getpages[4]); 2034 ppa[0] = anon_zero(seg, addr, &ap, cred); 2035 if (ppa[0] == NULL) 2036 return (ENOMEM); 2037 (void) anon_set_ptr(amp->ahp, start_idx, ap, 2038 ANON_SLEEP); 2039 return (0); 2040 } 2041 } 2042 2043 pgcnt = page_get_pagecnt(szc); 2044 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 2045 ASSERT(IS_P2ALIGNED(start_idx, pgcnt)); 2046 2047 /* 2048 * First we check for the case that the requtested large 2049 * page or larger page already exists in the system. 
2050 * Actually we only check if the first constituent page 2051 * exists and only preallocate if it's not found. 2052 */ 2053 ap = anon_get_ptr(amp->ahp, start_idx); 2054 if (ap) { 2055 uint_t pszc; 2056 swap_xlate(ap, &vp, &off); 2057 if (page_exists_forreal(vp, (u_offset_t)off, &pszc)) { 2058 if (pszc > szc && upsize) { 2059 *ppa_szc = MIN(pszc, seg->s_szc); 2060 return (-2); 2061 } 2062 if (pszc >= szc) { 2063 prealloc = 0; 2064 } 2065 } 2066 } 2067 2068 VM_STAT_COND_ADD(prealloc == 0, anonvmstats.getpages[5]); 2069 VM_STAT_COND_ADD(prealloc != 0, anonvmstats.getpages[6]); 2070 2071 top: 2072 /* 2073 * If a smaller page or no page at all was found, 2074 * grab a large page off the freelist. 2075 */ 2076 if (prealloc) { 2077 ASSERT(conpp == NULL); 2078 if (page_alloc_pages(anon_vp, seg, addr, NULL, ppa, 2079 szc, 0, pgflags) != 0) { 2080 VM_STAT_ADD(anonvmstats.getpages[7]); 2081 if (brkcow == 0 || szc < seg->s_szc || 2082 !anon_szcshare(amp->ahp, start_idx)) { 2083 /* 2084 * If the refcnt's of all anon slots are <= 1 2085 * they can't increase since we are holding 2086 * the address space's lock. So segvn can 2087 * safely decrease szc without risking to 2088 * generate a cow fault for the region smaller 2089 * than the segment's largest page size. 2090 */ 2091 VM_STAT_ADD(anonvmstats.getpages[8]); 2092 return (-1); 2093 } 2094 docow: 2095 /* 2096 * This is a cow fault. Copy away the entire 1 large 2097 * page region of this segment. 2098 */ 2099 if (szc != seg->s_szc) 2100 panic("anon_map_getpages: cowfault for szc %d", 2101 szc); 2102 vaddr = addr; 2103 for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt; 2104 pg_idx++, an_idx++, vaddr += PAGESIZE) { 2105 if ((ap = anon_get_ptr(amp->ahp, an_idx)) != 2106 NULL) { 2107 err = anon_getpage(&ap, &vpprot, pl, 2108 PAGESIZE, seg, vaddr, rw, cred); 2109 if (err) { 2110 for (i = 0; i < pg_idx; i++) { 2111 if ((pp = ppa[i]) != 2112 NULL) 2113 page_unlock(pp); 2114 } 2115 return (err); 2116 } 2117 ppa[pg_idx] = pl[0]; 2118 } else { 2119 /* 2120 * Since this is a cowfault we know 2121 * that this address space has a 2122 * parent or children which means 2123 * anon_dup_fill_holes() has initialized 2124 * all anon slots within a large page 2125 * region that had at least one anon 2126 * slot at the time of fork(). 2127 */ 2128 panic("anon_map_getpages: " 2129 "cowfault but anon slot is empty"); 2130 } 2131 } 2132 VM_STAT_ADD(anonvmstats.getpages[9]); 2133 *protp = PROT_ALL; 2134 return (anon_map_privatepages(amp, start_idx, szc, seg, 2135 addr, prot, ppa, vpage, anypgsz, pgflags, cred)); 2136 } 2137 } 2138 2139 VM_STAT_ADD(anonvmstats.getpages[10]); 2140 2141 an_idx = start_idx; 2142 pg_idx = 0; 2143 vaddr = addr; 2144 while (pg_idx < pgcnt) { 2145 slotcreate = 0; 2146 if ((ap = anon_get_ptr(amp->ahp, an_idx)) == NULL) { 2147 VM_STAT_ADD(anonvmstats.getpages[11]); 2148 /* 2149 * For us to have decided not to preallocate 2150 * would have meant that a large page 2151 * was found. Which also means that all of the 2152 * anon slots for that page would have been 2153 * already created for us. 2154 */ 2155 if (prealloc == 0) 2156 panic("anon_map_getpages: prealloc = 0"); 2157 2158 slotcreate = 1; 2159 ap = anon_alloc(NULL, 0); 2160 } 2161 swap_xlate(ap, &vp, &off); 2162 2163 /* 2164 * Now setup our preallocated page to pass down 2165 * to swap_getpage(). 
2166 */ 2167 if (prealloc) { 2168 ASSERT(ppa[pg_idx]->p_szc == szc); 2169 conpp = ppa[pg_idx]; 2170 } 2171 ASSERT(prealloc || conpp == NULL); 2172 2173 /* 2174 * If we just created this anon slot then call 2175 * with S_CREATE to prevent doing IO on the page. 2176 * Similar to the anon_zero case. 2177 */ 2178 err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE, 2179 NULL, pl, PAGESIZE, conpp, ppa_szc, &nreloc, seg, vaddr, 2180 slotcreate == 1 ? S_CREATE : rw, cred); 2181 2182 if (err) { 2183 ASSERT(err != -2 || upsize); 2184 VM_STAT_ADD(anonvmstats.getpages[12]); 2185 ASSERT(slotcreate == 0); 2186 goto io_err; 2187 } 2188 2189 pp = pl[0]; 2190 2191 if (pp->p_szc < szc || (pp->p_szc > szc && upsize)) { 2192 VM_STAT_ADD(anonvmstats.getpages[13]); 2193 ASSERT(slotcreate == 0); 2194 ASSERT(prealloc == 0); 2195 ASSERT(pg_idx == 0); 2196 if (pp->p_szc > szc) { 2197 ASSERT(upsize); 2198 *ppa_szc = MIN(pp->p_szc, seg->s_szc); 2199 page_unlock(pp); 2200 VM_STAT_ADD(anonvmstats.getpages[14]); 2201 return (-2); 2202 } 2203 page_unlock(pp); 2204 prealloc = 1; 2205 goto top; 2206 } 2207 2208 /* 2209 * If we decided to preallocate but VOP_GETPAGE 2210 * found a page in the system that satisfies our 2211 * request then free up our preallocated large page 2212 * and continue looping accross the existing large 2213 * page via VOP_GETPAGE. 2214 */ 2215 if (prealloc && pp != ppa[pg_idx]) { 2216 VM_STAT_ADD(anonvmstats.getpages[15]); 2217 ASSERT(slotcreate == 0); 2218 ASSERT(pg_idx == 0); 2219 conpp = NULL; 2220 prealloc = 0; 2221 page_free_pages(ppa[0]); 2222 } 2223 2224 if (prealloc && nreloc > 1) { 2225 /* 2226 * we have relocated out of a smaller large page. 2227 * skip npgs - 1 iterations and continue which will 2228 * increment by one the loop indices. 2229 */ 2230 spgcnt_t npgs = nreloc; 2231 2232 VM_STAT_ADD(anonvmstats.getpages[16]); 2233 2234 ASSERT(pp == ppa[pg_idx]); 2235 ASSERT(slotcreate == 0); 2236 ASSERT(pg_idx + npgs <= pgcnt); 2237 if ((*protp & PROT_WRITE) && 2238 anon_share(amp->ahp, an_idx, npgs)) { 2239 *protp &= ~PROT_WRITE; 2240 } 2241 pg_idx += npgs; 2242 an_idx += npgs; 2243 vaddr += PAGESIZE * npgs; 2244 continue; 2245 } 2246 2247 VM_STAT_ADD(anonvmstats.getpages[17]); 2248 2249 /* 2250 * Anon_zero case. 2251 */ 2252 if (slotcreate) { 2253 ASSERT(prealloc); 2254 pagezero(pp, 0, PAGESIZE); 2255 CPU_STATS_ADD_K(vm, zfod, 1); 2256 hat_setrefmod(pp); 2257 } 2258 2259 ASSERT(prealloc == 0 || ppa[pg_idx] == pp); 2260 ASSERT(prealloc != 0 || PAGE_SHARED(pp)); 2261 ASSERT(prealloc == 0 || PAGE_EXCL(pp)); 2262 2263 if (pg_idx > 0 && 2264 ((page_pptonum(pp) != page_pptonum(ppa[pg_idx - 1]) + 1) || 2265 (pp->p_szc != ppa[pg_idx - 1]->p_szc))) { 2266 panic("anon_map_getpages: unexpected page"); 2267 } else if (pg_idx == 0 && (page_pptonum(pp) & (pgcnt - 1))) { 2268 panic("anon_map_getpages: unaligned page"); 2269 } 2270 2271 if (prealloc == 0) { 2272 ppa[pg_idx] = pp; 2273 } 2274 2275 if (ap->an_refcnt > 1) { 2276 VM_STAT_ADD(anonvmstats.getpages[18]); 2277 *protp &= ~PROT_WRITE; 2278 } 2279 2280 /* 2281 * If this is a new anon slot then initialize 2282 * the anon array entry. 2283 */ 2284 if (slotcreate) { 2285 (void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP); 2286 } 2287 pg_idx++; 2288 an_idx++; 2289 vaddr += PAGESIZE; 2290 } 2291 2292 /* 2293 * Since preallocated pages come off the freelist 2294 * they are locked SE_EXCL. Simply downgrade and return. 
2295 */ 2296 if (prealloc) { 2297 VM_STAT_ADD(anonvmstats.getpages[19]); 2298 conpp = NULL; 2299 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { 2300 page_downgrade(ppa[pg_idx]); 2301 } 2302 } 2303 ASSERT(conpp == NULL); 2304 2305 if (brkcow == 0 || (*protp & PROT_WRITE)) { 2306 VM_STAT_ADD(anonvmstats.getpages[20]); 2307 return (0); 2308 } 2309 2310 if (szc < seg->s_szc) 2311 panic("anon_map_getpages: cowfault for szc %d", szc); 2312 2313 VM_STAT_ADD(anonvmstats.getpages[21]); 2314 2315 *protp = PROT_ALL; 2316 return (anon_map_privatepages(amp, start_idx, szc, seg, addr, prot, 2317 ppa, vpage, anypgsz, pgflags, cred)); 2318 io_err: 2319 /* 2320 * We got an IO error somewhere in our large page. 2321 * If we were using a preallocated page then just demote 2322 * all the constituent pages that we've succeeded with sofar 2323 * to PAGESIZE pages and leave them in the system 2324 * unlocked. 2325 */ 2326 2327 ASSERT(err != -2 || ((pg_idx == 0) && upsize)); 2328 2329 VM_STAT_COND_ADD(err > 0, anonvmstats.getpages[22]); 2330 VM_STAT_COND_ADD(err == -1, anonvmstats.getpages[23]); 2331 VM_STAT_COND_ADD(err == -2, anonvmstats.getpages[24]); 2332 2333 if (prealloc) { 2334 conpp = NULL; 2335 if (pg_idx > 0) { 2336 VM_STAT_ADD(anonvmstats.getpages[25]); 2337 for (i = 0; i < pgcnt; i++) { 2338 pp = ppa[i]; 2339 ASSERT(PAGE_EXCL(pp)); 2340 ASSERT(pp->p_szc == szc); 2341 pp->p_szc = 0; 2342 } 2343 for (i = 0; i < pg_idx; i++) { 2344 ASSERT(!hat_page_is_mapped(ppa[i])); 2345 page_unlock(ppa[i]); 2346 } 2347 /* 2348 * Now free up the remaining unused constituent 2349 * pages. 2350 */ 2351 while (pg_idx < pgcnt) { 2352 ASSERT(!hat_page_is_mapped(ppa[pg_idx])); 2353 page_free(ppa[pg_idx], 0); 2354 pg_idx++; 2355 } 2356 } else { 2357 VM_STAT_ADD(anonvmstats.getpages[26]); 2358 page_free_pages(ppa[0]); 2359 } 2360 } else { 2361 VM_STAT_ADD(anonvmstats.getpages[27]); 2362 ASSERT(err > 0); 2363 for (i = 0; i < pg_idx; i++) 2364 page_unlock(ppa[i]); 2365 } 2366 ASSERT(conpp == NULL); 2367 if (err != -1) 2368 return (err); 2369 /* 2370 * we are here because we failed to relocate. 2371 */ 2372 ASSERT(prealloc); 2373 if (brkcow == 0 || szc < seg->s_szc || 2374 !anon_szcshare(amp->ahp, start_idx)) { 2375 VM_STAT_ADD(anonvmstats.getpages[28]); 2376 return (-1); 2377 } 2378 VM_STAT_ADD(anonvmstats.getpages[29]); 2379 goto docow; 2380 } 2381 2382 2383 /* 2384 * Turn a reference to an object or shared anon page 2385 * into a private page with a copy of the data from the 2386 * original page which is always locked by the caller. 2387 * This routine unloads the translation and unlocks the 2388 * original page, if it isn't being stolen, before returning 2389 * to the caller. 2390 * 2391 * NOTE: The original anon slot is not freed by this routine 2392 * It must be freed by the caller while holding the 2393 * "anon_map" lock to prevent races which can occur if 2394 * a process has multiple lwps in its address space. 
2395 */ 2396 page_t * 2397 anon_private( 2398 struct anon **app, 2399 struct seg *seg, 2400 caddr_t addr, 2401 uint_t prot, 2402 page_t *opp, 2403 int oppflags, 2404 struct cred *cred) 2405 { 2406 struct anon *old = *app; 2407 struct anon *new; 2408 page_t *pp = NULL; 2409 struct vnode *vp; 2410 anoff_t off; 2411 page_t *anon_pl[1 + 1]; 2412 int err; 2413 2414 if (oppflags & STEAL_PAGE) 2415 ASSERT(PAGE_EXCL(opp)); 2416 else 2417 ASSERT(PAGE_LOCKED(opp)); 2418 2419 CPU_STATS_ADD_K(vm, cow_fault, 1); 2420 2421 *app = new = anon_alloc(NULL, 0); 2422 swap_xlate(new, &vp, &off); 2423 2424 if (oppflags & STEAL_PAGE) { 2425 page_rename(opp, vp, (u_offset_t)off); 2426 pp = opp; 2427 TRACE_5(TR_FAC_VM, TR_ANON_PRIVATE, 2428 "anon_private:seg %p addr %x pp %p vp %p off %lx", 2429 seg, addr, pp, vp, off); 2430 hat_setmod(pp); 2431 2432 /* bug 4026339 */ 2433 page_downgrade(pp); 2434 return (pp); 2435 } 2436 2437 /* 2438 * Call the VOP_GETPAGE routine to create the page, thereby 2439 * enabling the vnode driver to allocate any filesystem 2440 * space (e.g., disk block allocation for UFS). This also 2441 * prevents more than one page from being added to the 2442 * vnode at the same time. 2443 */ 2444 err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, NULL, 2445 anon_pl, PAGESIZE, seg, addr, S_CREATE, cred, NULL); 2446 if (err) 2447 goto out; 2448 2449 pp = anon_pl[0]; 2450 2451 /* 2452 * If the original page was locked, we need to move the lock 2453 * to the new page by transfering 'cowcnt/lckcnt' of the original 2454 * page to 'cowcnt/lckcnt' of the new page. 2455 * 2456 * See Statement at the beginning of segvn_lockop() and 2457 * comments in page_pp_useclaim() regarding the way 2458 * cowcnts/lckcnts are handled. 2459 * 2460 * Also availrmem must be decremented up front for read only mapping 2461 * before calling page_pp_useclaim. page_pp_useclaim will bump it back 2462 * if availrmem did not need to be decremented after all. 2463 */ 2464 if (oppflags & LOCK_PAGE) { 2465 if ((prot & PROT_WRITE) == 0) { 2466 mutex_enter(&freemem_lock); 2467 if (availrmem > pages_pp_maximum) { 2468 availrmem--; 2469 pages_useclaim++; 2470 } else { 2471 mutex_exit(&freemem_lock); 2472 goto out; 2473 } 2474 mutex_exit(&freemem_lock); 2475 } 2476 page_pp_useclaim(opp, pp, prot & PROT_WRITE); 2477 } 2478 2479 /* 2480 * Now copy the contents from the original page, 2481 * which is locked and loaded in the MMU by 2482 * the caller to prevent yet another page fault. 2483 */ 2484 /* XXX - should set mod bit in here */ 2485 if (ppcopy(opp, pp) == 0) { 2486 /* 2487 * Before ppcopy could hanlde UE or other faults, we 2488 * would have panicked here, and still have no option 2489 * but to do so now. 2490 */ 2491 panic("anon_private, ppcopy failed, opp = 0x%p, pp = 0x%p", 2492 (void *)opp, (void *)pp); 2493 } 2494 2495 hat_setrefmod(pp); /* mark as modified */ 2496 2497 /* 2498 * Unload the old translation. 2499 */ 2500 hat_unload(seg->s_as->a_hat, addr, PAGESIZE, HAT_UNLOAD); 2501 2502 /* 2503 * Free unmapped, unmodified original page. 2504 * or release the lock on the original page, 2505 * otherwise the process will sleep forever in 2506 * anon_decref() waiting for the "exclusive" lock 2507 * on the page. 2508 */ 2509 (void) page_release(opp, 1); 2510 2511 /* 2512 * we are done with page creation so downgrade the new 2513 * page's selock to shared, this helps when multiple 2514 * as_fault(...SOFTLOCK...) 
are done to the same 2515 * page(aio) 2516 */ 2517 page_downgrade(pp); 2518 2519 /* 2520 * NOTE: The original anon slot must be freed by the 2521 * caller while holding the "anon_map" lock, if we 2522 * copied away from an anonymous page. 2523 */ 2524 return (pp); 2525 2526 out: 2527 *app = old; 2528 if (pp) 2529 page_unlock(pp); 2530 anon_decref(new); 2531 page_unlock(opp); 2532 return ((page_t *)NULL); 2533 } 2534 2535 int 2536 anon_map_privatepages( 2537 struct anon_map *amp, 2538 ulong_t start_idx, 2539 uint_t szc, 2540 struct seg *seg, 2541 caddr_t addr, 2542 uint_t prot, 2543 page_t *ppa[], 2544 struct vpage vpage[], 2545 int anypgsz, 2546 int pgflags, 2547 struct cred *cred) 2548 { 2549 pgcnt_t pgcnt; 2550 struct vnode *vp; 2551 anoff_t off; 2552 page_t *pl[2], *conpp = NULL; 2553 int err; 2554 int prealloc = 1; 2555 struct anon *ap, *oldap; 2556 caddr_t vaddr; 2557 page_t *pplist, *pp; 2558 ulong_t pg_idx, an_idx; 2559 spgcnt_t nreloc = 0; 2560 int pagelock = 0; 2561 kmutex_t *ahmpages = NULL; 2562 #ifdef DEBUG 2563 int refcnt; 2564 #endif 2565 2566 ASSERT(szc != 0); 2567 ASSERT(szc == seg->s_szc); 2568 2569 VM_STAT_ADD(anonvmstats.privatepages[0]); 2570 2571 pgcnt = page_get_pagecnt(szc); 2572 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 2573 ASSERT(IS_P2ALIGNED(start_idx, pgcnt)); 2574 2575 ASSERT(amp != NULL); 2576 ap = anon_get_ptr(amp->ahp, start_idx); 2577 ASSERT(ap == NULL || ap->an_refcnt >= 1); 2578 2579 VM_STAT_COND_ADD(ap == NULL, anonvmstats.privatepages[1]); 2580 2581 /* 2582 * Now try and allocate the large page. If we fail then just 2583 * let VOP_GETPAGE give us PAGESIZE pages. Normally we let 2584 * the caller make this decision but to avoid added complexity 2585 * it's simplier to handle that case here. 2586 */ 2587 if (anypgsz == -1) { 2588 VM_STAT_ADD(anonvmstats.privatepages[2]); 2589 prealloc = 0; 2590 } else if (page_alloc_pages(anon_vp, seg, addr, &pplist, NULL, szc, 2591 anypgsz, pgflags) != 0) { 2592 VM_STAT_ADD(anonvmstats.privatepages[3]); 2593 prealloc = 0; 2594 } 2595 2596 /* 2597 * make the decrement of all refcnts of all 2598 * anon slots of a large page appear atomic by 2599 * getting an anonpages_hash_lock for the 2600 * first anon slot of a large page. 2601 */ 2602 if (ap != NULL) { 2603 ahmpages = APH_MUTEX(ap->an_vp, ap->an_off); 2604 mutex_enter(ahmpages); 2605 if (ap->an_refcnt == 1) { 2606 VM_STAT_ADD(anonvmstats.privatepages[4]); 2607 ASSERT(!anon_share(amp->ahp, start_idx, pgcnt)); 2608 mutex_exit(ahmpages); 2609 2610 if (prealloc) { 2611 page_free_replacement_page(pplist); 2612 page_create_putback(pgcnt); 2613 } 2614 ASSERT(ppa[0]->p_szc <= szc); 2615 if (ppa[0]->p_szc == szc) { 2616 VM_STAT_ADD(anonvmstats.privatepages[5]); 2617 return (0); 2618 } 2619 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { 2620 ASSERT(ppa[pg_idx] != NULL); 2621 page_unlock(ppa[pg_idx]); 2622 } 2623 return (-1); 2624 } 2625 } 2626 2627 /* 2628 * If we are passed in the vpage array and this is 2629 * not PROT_WRITE then we need to decrement availrmem 2630 * up front before we try anything. If we need to and 2631 * can't decrement availrmem then its better to fail now 2632 * than in the middle of processing the new large page. 2633 * page_pp_usclaim() on behalf of each constituent page 2634 * below will adjust availrmem back for the cases not needed. 
2635 */ 2636 if (vpage != NULL && (prot & PROT_WRITE) == 0) { 2637 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { 2638 if (VPP_ISPPLOCK(&vpage[pg_idx])) { 2639 pagelock = 1; 2640 break; 2641 } 2642 } 2643 if (pagelock) { 2644 VM_STAT_ADD(anonvmstats.privatepages[6]); 2645 mutex_enter(&freemem_lock); 2646 if (availrmem >= pages_pp_maximum + pgcnt) { 2647 availrmem -= pgcnt; 2648 pages_useclaim += pgcnt; 2649 } else { 2650 VM_STAT_ADD(anonvmstats.privatepages[7]); 2651 mutex_exit(&freemem_lock); 2652 if (ahmpages != NULL) { 2653 mutex_exit(ahmpages); 2654 } 2655 if (prealloc) { 2656 page_free_replacement_page(pplist); 2657 page_create_putback(pgcnt); 2658 } 2659 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) 2660 if (ppa[pg_idx] != NULL) 2661 page_unlock(ppa[pg_idx]); 2662 return (ENOMEM); 2663 } 2664 mutex_exit(&freemem_lock); 2665 } 2666 } 2667 2668 CPU_STATS_ADD_K(vm, cow_fault, pgcnt); 2669 2670 VM_STAT_ADD(anonvmstats.privatepages[8]); 2671 2672 an_idx = start_idx; 2673 pg_idx = 0; 2674 vaddr = addr; 2675 for (; pg_idx < pgcnt; pg_idx++, an_idx++, vaddr += PAGESIZE) { 2676 ASSERT(ppa[pg_idx] != NULL); 2677 oldap = anon_get_ptr(amp->ahp, an_idx); 2678 ASSERT(ahmpages != NULL || oldap == NULL); 2679 ASSERT(ahmpages == NULL || oldap != NULL); 2680 ASSERT(ahmpages == NULL || oldap->an_refcnt > 1); 2681 ASSERT(ahmpages == NULL || pg_idx != 0 || 2682 (refcnt = oldap->an_refcnt)); 2683 ASSERT(ahmpages == NULL || pg_idx == 0 || 2684 refcnt == oldap->an_refcnt); 2685 2686 ap = anon_alloc(NULL, 0); 2687 2688 swap_xlate(ap, &vp, &off); 2689 2690 /* 2691 * Now setup our preallocated page to pass down to 2692 * swap_getpage(). 2693 */ 2694 if (prealloc) { 2695 pp = pplist; 2696 page_sub(&pplist, pp); 2697 conpp = pp; 2698 } 2699 2700 err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE, NULL, pl, 2701 PAGESIZE, conpp, NULL, &nreloc, seg, vaddr, 2702 S_CREATE, cred); 2703 2704 /* 2705 * Impossible to fail this is S_CREATE. 2706 */ 2707 if (err) 2708 panic("anon_map_privatepages: VOP_GETPAGE failed"); 2709 2710 ASSERT(prealloc ? pp == pl[0] : pl[0]->p_szc == 0); 2711 ASSERT(prealloc == 0 || nreloc == 1); 2712 2713 pp = pl[0]; 2714 2715 /* 2716 * If the original page was locked, we need to move 2717 * the lock to the new page by transfering 2718 * 'cowcnt/lckcnt' of the original page to 'cowcnt/lckcnt' 2719 * of the new page. pg_idx can be used to index 2720 * into the vpage array since the caller will guarentee 2721 * that vpage struct passed in corresponds to addr 2722 * and forward. 2723 */ 2724 if (vpage != NULL && VPP_ISPPLOCK(&vpage[pg_idx])) { 2725 page_pp_useclaim(ppa[pg_idx], pp, prot & PROT_WRITE); 2726 } else if (pagelock) { 2727 mutex_enter(&freemem_lock); 2728 availrmem++; 2729 pages_useclaim--; 2730 mutex_exit(&freemem_lock); 2731 } 2732 2733 /* 2734 * Now copy the contents from the original page. 2735 */ 2736 if (ppcopy(ppa[pg_idx], pp) == 0) { 2737 /* 2738 * Before ppcopy could hanlde UE or other faults, we 2739 * would have panicked here, and still have no option 2740 * but to do so now. 2741 */ 2742 panic("anon_map_privatepages, ppcopy failed"); 2743 } 2744 2745 hat_setrefmod(pp); /* mark as modified */ 2746 2747 /* 2748 * Release the lock on the original page, 2749 * derement the old slot, and down grade the lock 2750 * on the new copy. 2751 */ 2752 page_unlock(ppa[pg_idx]); 2753 2754 if (!prealloc) 2755 page_downgrade(pp); 2756 2757 ppa[pg_idx] = pp; 2758 2759 /* 2760 * Now reflect the copy in the new anon array. 
2761 */ 2762 ASSERT(ahmpages == NULL || oldap->an_refcnt > 1); 2763 if (oldap != NULL) 2764 anon_decref(oldap); 2765 (void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP); 2766 } 2767 2768 /* 2769 * Unload the old large page translation. 2770 */ 2771 hat_unload(seg->s_as->a_hat, addr, pgcnt << PAGESHIFT, HAT_UNLOAD); 2772 2773 if (ahmpages != NULL) { 2774 mutex_exit(ahmpages); 2775 } 2776 ASSERT(prealloc == 0 || pplist == NULL); 2777 if (prealloc) { 2778 VM_STAT_ADD(anonvmstats.privatepages[9]); 2779 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { 2780 page_downgrade(ppa[pg_idx]); 2781 } 2782 } 2783 2784 return (0); 2785 } 2786 2787 /* 2788 * Allocate a private zero-filled anon page. 2789 */ 2790 page_t * 2791 anon_zero(struct seg *seg, caddr_t addr, struct anon **app, struct cred *cred) 2792 { 2793 struct anon *ap; 2794 page_t *pp; 2795 struct vnode *vp; 2796 anoff_t off; 2797 page_t *anon_pl[1 + 1]; 2798 int err; 2799 2800 *app = ap = anon_alloc(NULL, 0); 2801 swap_xlate(ap, &vp, &off); 2802 2803 /* 2804 * Call the VOP_GETPAGE routine to create the page, thereby 2805 * enabling the vnode driver to allocate any filesystem 2806 * dependent structures (e.g., disk block allocation for UFS). 2807 * This also prevents more than on page from being added to 2808 * the vnode at the same time since it is locked. 2809 */ 2810 err = VOP_GETPAGE(vp, off, PAGESIZE, NULL, 2811 anon_pl, PAGESIZE, seg, addr, S_CREATE, cred, NULL); 2812 if (err) { 2813 *app = NULL; 2814 anon_decref(ap); 2815 return (NULL); 2816 } 2817 pp = anon_pl[0]; 2818 2819 pagezero(pp, 0, PAGESIZE); /* XXX - should set mod bit */ 2820 page_downgrade(pp); 2821 CPU_STATS_ADD_K(vm, zfod, 1); 2822 hat_setrefmod(pp); /* mark as modified so pageout writes back */ 2823 return (pp); 2824 } 2825 2826 2827 /* 2828 * Allocate array of private zero-filled anon pages for empty slots 2829 * and kept pages for non empty slots within given range. 2830 * 2831 * NOTE: This rontine will try and use large pages 2832 * if available and supported by underlying platform. 2833 */ 2834 int 2835 anon_map_createpages( 2836 struct anon_map *amp, 2837 ulong_t start_index, 2838 size_t len, 2839 page_t *ppa[], 2840 struct seg *seg, 2841 caddr_t addr, 2842 enum seg_rw rw, 2843 struct cred *cred) 2844 { 2845 2846 struct anon *ap; 2847 struct vnode *ap_vp; 2848 page_t *pp, *pplist, *anon_pl[1 + 1], *conpp = NULL; 2849 int err = 0; 2850 ulong_t p_index, index; 2851 pgcnt_t npgs, pg_cnt; 2852 spgcnt_t nreloc = 0; 2853 uint_t l_szc, szc, prot; 2854 anoff_t ap_off; 2855 size_t pgsz; 2856 lgrp_t *lgrp; 2857 kmutex_t *ahm; 2858 2859 /* 2860 * XXX For now only handle S_CREATE. 2861 */ 2862 ASSERT(rw == S_CREATE); 2863 2864 index = start_index; 2865 p_index = 0; 2866 npgs = btopr(len); 2867 2868 /* 2869 * If this platform supports multiple page sizes 2870 * then try and allocate directly from the free 2871 * list for pages larger than PAGESIZE. 2872 * 2873 * NOTE:When we have page_create_ru we can stop 2874 * directly allocating from the freelist. 2875 */ 2876 l_szc = seg->s_szc; 2877 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 2878 while (npgs) { 2879 2880 /* 2881 * if anon slot already exists 2882 * (means page has been created) 2883 * so 1) look up the page 2884 * 2) if the page is still in memory, get it. 2885 * 3) if not, create a page and 2886 * page in from physical swap device. 2887 * These are done in anon_getpage(). 
2888 */ 2889 ap = anon_get_ptr(amp->ahp, index); 2890 if (ap) { 2891 err = anon_getpage(&ap, &prot, anon_pl, PAGESIZE, 2892 seg, addr, S_READ, cred); 2893 if (err) { 2894 ANON_LOCK_EXIT(&->a_rwlock); 2895 panic("anon_map_createpages: anon_getpage"); 2896 } 2897 pp = anon_pl[0]; 2898 ppa[p_index++] = pp; 2899 2900 /* 2901 * an_pvp can become non-NULL after SysV's page was 2902 * paged out before ISM was attached to this SysV 2903 * shared memory segment. So free swap slot if needed. 2904 */ 2905 if (ap->an_pvp != NULL) { 2906 page_io_lock(pp); 2907 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 2908 mutex_enter(ahm); 2909 if (ap->an_pvp != NULL) { 2910 swap_phys_free(ap->an_pvp, 2911 ap->an_poff, PAGESIZE); 2912 ap->an_pvp = NULL; 2913 ap->an_poff = 0; 2914 mutex_exit(ahm); 2915 hat_setmod(pp); 2916 } else { 2917 mutex_exit(ahm); 2918 } 2919 page_io_unlock(pp); 2920 } 2921 2922 addr += PAGESIZE; 2923 index++; 2924 npgs--; 2925 continue; 2926 } 2927 /* 2928 * Now try and allocate the largest page possible 2929 * for the current address and range. 2930 * Keep dropping down in page size until: 2931 * 2932 * 1) Properly aligned 2933 * 2) Does not overlap existing anon pages 2934 * 3) Fits in remaining range. 2935 * 4) able to allocate one. 2936 * 2937 * NOTE: XXX When page_create_ru is completed this code 2938 * will change. 2939 */ 2940 szc = l_szc; 2941 pplist = NULL; 2942 pg_cnt = 0; 2943 while (szc) { 2944 pgsz = page_get_pagesize(szc); 2945 pg_cnt = pgsz >> PAGESHIFT; 2946 if (IS_P2ALIGNED(addr, pgsz) && pg_cnt <= npgs && 2947 anon_pages(amp->ahp, index, pg_cnt) == 0) { 2948 /* 2949 * XXX 2950 * Since we are faking page_create() 2951 * we also need to do the freemem and 2952 * pcf accounting. 2953 */ 2954 (void) page_create_wait(pg_cnt, PG_WAIT); 2955 2956 /* 2957 * Get lgroup to allocate next page of shared 2958 * memory from and use it to specify where to 2959 * allocate the physical memory 2960 */ 2961 lgrp = lgrp_mem_choose(seg, addr, pgsz); 2962 2963 pplist = page_get_freelist( 2964 anon_vp, (u_offset_t)0, seg, 2965 addr, pgsz, 0, lgrp); 2966 2967 if (pplist == NULL) { 2968 page_create_putback(pg_cnt); 2969 } 2970 2971 /* 2972 * If a request for a page of size 2973 * larger than PAGESIZE failed 2974 * then don't try that size anymore. 2975 */ 2976 if (pplist == NULL) { 2977 l_szc = szc - 1; 2978 } else { 2979 break; 2980 } 2981 } 2982 szc--; 2983 } 2984 2985 /* 2986 * If just using PAGESIZE pages then don't 2987 * directly allocate from the free list. 2988 */ 2989 if (pplist == NULL) { 2990 ASSERT(szc == 0); 2991 pp = anon_zero(seg, addr, &ap, cred); 2992 if (pp == NULL) { 2993 ANON_LOCK_EXIT(&->a_rwlock); 2994 panic("anon_map_createpages: anon_zero"); 2995 } 2996 ppa[p_index++] = pp; 2997 2998 ASSERT(anon_get_ptr(amp->ahp, index) == NULL); 2999 (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP); 3000 3001 addr += PAGESIZE; 3002 index++; 3003 npgs--; 3004 continue; 3005 } 3006 3007 /* 3008 * pplist is a list of pg_cnt PAGESIZE pages. 3009 * These pages are locked SE_EXCL since they 3010 * came directly off the free list. 
3011 */ 3012 ASSERT(IS_P2ALIGNED(pg_cnt, pg_cnt)); 3013 ASSERT(IS_P2ALIGNED(index, pg_cnt)); 3014 ASSERT(conpp == NULL); 3015 while (pg_cnt--) { 3016 3017 ap = anon_alloc(NULL, 0); 3018 swap_xlate(ap, &ap_vp, &ap_off); 3019 3020 ASSERT(pplist != NULL); 3021 pp = pplist; 3022 page_sub(&pplist, pp); 3023 PP_CLRFREE(pp); 3024 PP_CLRAGED(pp); 3025 conpp = pp; 3026 3027 err = swap_getconpage(ap_vp, ap_off, PAGESIZE, 3028 (uint_t *)NULL, anon_pl, PAGESIZE, conpp, NULL, 3029 &nreloc, seg, addr, S_CREATE, cred); 3030 3031 if (err) { 3032 ANON_LOCK_EXIT(&->a_rwlock); 3033 panic("anon_map_createpages: S_CREATE"); 3034 } 3035 3036 ASSERT(anon_pl[0] == pp); 3037 ASSERT(nreloc == 1); 3038 pagezero(pp, 0, PAGESIZE); 3039 CPU_STATS_ADD_K(vm, zfod, 1); 3040 hat_setrefmod(pp); 3041 3042 ASSERT(anon_get_ptr(amp->ahp, index) == NULL); 3043 (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP); 3044 3045 ppa[p_index++] = pp; 3046 3047 addr += PAGESIZE; 3048 index++; 3049 npgs--; 3050 } 3051 conpp = NULL; 3052 pg_cnt = pgsz >> PAGESHIFT; 3053 p_index = p_index - pg_cnt; 3054 while (pg_cnt--) { 3055 page_downgrade(ppa[p_index++]); 3056 } 3057 } 3058 ANON_LOCK_EXIT(&->a_rwlock); 3059 return (0); 3060 } 3061 3062 static int 3063 anon_try_demote_pages( 3064 struct anon_hdr *ahp, 3065 ulong_t sidx, 3066 uint_t szc, 3067 page_t **ppa, 3068 int private) 3069 { 3070 struct anon *ap; 3071 pgcnt_t pgcnt = page_get_pagecnt(szc); 3072 page_t *pp; 3073 pgcnt_t i; 3074 kmutex_t *ahmpages = NULL; 3075 int root = 0; 3076 pgcnt_t npgs; 3077 pgcnt_t curnpgs = 0; 3078 size_t ppasize = 0; 3079 3080 ASSERT(szc != 0); 3081 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 3082 ASSERT(IS_P2ALIGNED(sidx, pgcnt)); 3083 ASSERT(sidx < ahp->size); 3084 3085 if (ppa == NULL) { 3086 ppasize = pgcnt * sizeof (page_t *); 3087 ppa = kmem_alloc(ppasize, KM_SLEEP); 3088 } 3089 3090 ap = anon_get_ptr(ahp, sidx); 3091 if (ap != NULL && private) { 3092 VM_STAT_ADD(anonvmstats.demotepages[1]); 3093 ahmpages = APH_MUTEX(ap->an_vp, ap->an_off); 3094 mutex_enter(ahmpages); 3095 } 3096 3097 if (ap != NULL && ap->an_refcnt > 1) { 3098 if (ahmpages != NULL) { 3099 VM_STAT_ADD(anonvmstats.demotepages[2]); 3100 mutex_exit(ahmpages); 3101 } 3102 if (ppasize != 0) { 3103 kmem_free(ppa, ppasize); 3104 } 3105 return (0); 3106 } 3107 if (ahmpages != NULL) { 3108 mutex_exit(ahmpages); 3109 } 3110 if (ahp->size - sidx < pgcnt) { 3111 ASSERT(private == 0); 3112 pgcnt = ahp->size - sidx; 3113 } 3114 for (i = 0; i < pgcnt; i++, sidx++) { 3115 ap = anon_get_ptr(ahp, sidx); 3116 if (ap != NULL) { 3117 if (ap->an_refcnt != 1) { 3118 panic("anon_try_demote_pages: an_refcnt != 1"); 3119 } 3120 pp = ppa[i] = page_lookup(ap->an_vp, ap->an_off, 3121 SE_EXCL); 3122 if (pp != NULL) { 3123 (void) hat_pageunload(pp, 3124 HAT_FORCE_PGUNLOAD); 3125 } 3126 } else { 3127 ppa[i] = NULL; 3128 } 3129 } 3130 for (i = 0; i < pgcnt; i++) { 3131 if ((pp = ppa[i]) != NULL && pp->p_szc != 0) { 3132 ASSERT(pp->p_szc <= szc); 3133 if (!root) { 3134 VM_STAT_ADD(anonvmstats.demotepages[3]); 3135 if (curnpgs != 0) 3136 panic("anon_try_demote_pages: " 3137 "bad large page"); 3138 3139 root = 1; 3140 curnpgs = npgs = 3141 page_get_pagecnt(pp->p_szc); 3142 3143 ASSERT(npgs <= pgcnt); 3144 ASSERT(IS_P2ALIGNED(npgs, npgs)); 3145 ASSERT(!(page_pptonum(pp) & (npgs - 1))); 3146 } else { 3147 ASSERT(i > 0); 3148 ASSERT(page_pptonum(pp) - 1 == 3149 page_pptonum(ppa[i - 1])); 3150 if ((page_pptonum(pp) & (npgs - 1)) == 3151 npgs - 1) 3152 root = 0; 3153 } 3154 ASSERT(PAGE_EXCL(pp)); 3155 pp->p_szc = 0; 3156 
ASSERT(curnpgs > 0); 3157 curnpgs--; 3158 } 3159 } 3160 if (root != 0 || curnpgs != 0) 3161 panic("anon_try_demote_pages: bad large page"); 3162 3163 for (i = 0; i < pgcnt; i++) { 3164 if ((pp = ppa[i]) != NULL) { 3165 ASSERT(!hat_page_is_mapped(pp)); 3166 ASSERT(pp->p_szc == 0); 3167 page_unlock(pp); 3168 } 3169 } 3170 if (ppasize != 0) { 3171 kmem_free(ppa, ppasize); 3172 } 3173 return (1); 3174 } 3175 3176 /* 3177 * anon_map_demotepages() can only be called by MAP_PRIVATE segments. 3178 */ 3179 int 3180 anon_map_demotepages( 3181 struct anon_map *amp, 3182 ulong_t start_idx, 3183 struct seg *seg, 3184 caddr_t addr, 3185 uint_t prot, 3186 struct vpage vpage[], 3187 struct cred *cred) 3188 { 3189 struct anon *ap; 3190 uint_t szc = seg->s_szc; 3191 pgcnt_t pgcnt = page_get_pagecnt(szc); 3192 size_t ppasize = pgcnt * sizeof (page_t *); 3193 page_t **ppa = kmem_alloc(ppasize, KM_SLEEP); 3194 page_t *pp; 3195 page_t *pl[2]; 3196 pgcnt_t i, pg_idx; 3197 ulong_t an_idx; 3198 caddr_t vaddr; 3199 int err; 3200 int retry = 0; 3201 uint_t vpprot; 3202 3203 ASSERT(RW_WRITE_HELD(&->a_rwlock)); 3204 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 3205 ASSERT(IS_P2ALIGNED(start_idx, pgcnt)); 3206 ASSERT(ppa != NULL); 3207 ASSERT(szc != 0); 3208 ASSERT(szc == amp->a_szc); 3209 3210 VM_STAT_ADD(anonvmstats.demotepages[0]); 3211 3212 top: 3213 if (anon_try_demote_pages(amp->ahp, start_idx, szc, ppa, 1)) { 3214 kmem_free(ppa, ppasize); 3215 return (0); 3216 } 3217 3218 VM_STAT_ADD(anonvmstats.demotepages[4]); 3219 3220 ASSERT(retry == 0); /* we can be here only once */ 3221 3222 vaddr = addr; 3223 for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt; 3224 pg_idx++, an_idx++, vaddr += PAGESIZE) { 3225 ap = anon_get_ptr(amp->ahp, an_idx); 3226 if (ap == NULL) 3227 panic("anon_map_demotepages: no anon slot"); 3228 err = anon_getpage(&ap, &vpprot, pl, PAGESIZE, seg, vaddr, 3229 S_READ, cred); 3230 if (err) { 3231 for (i = 0; i < pg_idx; i++) { 3232 if ((pp = ppa[i]) != NULL) 3233 page_unlock(pp); 3234 } 3235 kmem_free(ppa, ppasize); 3236 return (err); 3237 } 3238 ppa[pg_idx] = pl[0]; 3239 } 3240 3241 err = anon_map_privatepages(amp, start_idx, szc, seg, addr, prot, ppa, 3242 vpage, -1, 0, cred); 3243 if (err > 0) { 3244 VM_STAT_ADD(anonvmstats.demotepages[5]); 3245 kmem_free(ppa, ppasize); 3246 return (err); 3247 } 3248 ASSERT(err == 0 || err == -1); 3249 if (err == -1) { 3250 VM_STAT_ADD(anonvmstats.demotepages[6]); 3251 retry = 1; 3252 goto top; 3253 } 3254 for (i = 0; i < pgcnt; i++) { 3255 ASSERT(ppa[i] != NULL); 3256 if (ppa[i]->p_szc != 0) 3257 retry = 1; 3258 page_unlock(ppa[i]); 3259 } 3260 if (retry) { 3261 VM_STAT_ADD(anonvmstats.demotepages[7]); 3262 goto top; 3263 } 3264 3265 VM_STAT_ADD(anonvmstats.demotepages[8]); 3266 3267 kmem_free(ppa, ppasize); 3268 3269 return (0); 3270 } 3271 3272 /* 3273 * Free pages of shared anon map. It's assumed that anon maps don't share anon 3274 * structures with private anon maps. Therefore all anon structures should 3275 * have at most one reference at this point. This means underlying pages can 3276 * be exclusively locked and demoted or freed. If not freeing the entire 3277 * large pages demote the ends of the region we free to be able to free 3278 * subpages. Page roots correspond to aligned index positions in anon map. 
3279 */ 3280 void 3281 anon_shmap_free_pages(struct anon_map *amp, ulong_t sidx, size_t len) 3282 { 3283 ulong_t eidx = sidx + btopr(len); 3284 pgcnt_t pages = page_get_pagecnt(amp->a_szc); 3285 struct anon_hdr *ahp = amp->ahp; 3286 ulong_t tidx; 3287 size_t size; 3288 ulong_t sidx_aligned; 3289 ulong_t eidx_aligned; 3290 3291 ASSERT(ANON_WRITE_HELD(&->a_rwlock)); 3292 ASSERT(amp->refcnt <= 1); 3293 ASSERT(amp->a_szc > 0); 3294 ASSERT(eidx <= ahp->size); 3295 ASSERT(!anon_share(ahp, sidx, btopr(len))); 3296 3297 if (len == 0) { /* XXX */ 3298 return; 3299 } 3300 3301 sidx_aligned = P2ALIGN(sidx, pages); 3302 if (sidx_aligned != sidx || 3303 (eidx < sidx_aligned + pages && eidx < ahp->size)) { 3304 if (!anon_try_demote_pages(ahp, sidx_aligned, 3305 amp->a_szc, NULL, 0)) { 3306 panic("anon_shmap_free_pages: demote failed"); 3307 } 3308 size = (eidx <= sidx_aligned + pages) ? (eidx - sidx) : 3309 P2NPHASE(sidx, pages); 3310 size <<= PAGESHIFT; 3311 anon_free(ahp, sidx, size); 3312 sidx = sidx_aligned + pages; 3313 if (eidx <= sidx) { 3314 return; 3315 } 3316 } 3317 eidx_aligned = P2ALIGN(eidx, pages); 3318 if (sidx < eidx_aligned) { 3319 anon_free_pages(ahp, sidx, 3320 (eidx_aligned - sidx) << PAGESHIFT, 3321 amp->a_szc); 3322 sidx = eidx_aligned; 3323 } 3324 ASSERT(sidx == eidx_aligned); 3325 if (eidx == eidx_aligned) { 3326 return; 3327 } 3328 tidx = eidx; 3329 if (eidx != ahp->size && anon_get_next_ptr(ahp, &tidx) != NULL && 3330 tidx - sidx < pages) { 3331 if (!anon_try_demote_pages(ahp, sidx, amp->a_szc, NULL, 0)) { 3332 panic("anon_shmap_free_pages: demote failed"); 3333 } 3334 size = (eidx - sidx) << PAGESHIFT; 3335 anon_free(ahp, sidx, size); 3336 } else { 3337 anon_free_pages(ahp, sidx, pages << PAGESHIFT, amp->a_szc); 3338 } 3339 } 3340 3341 /* 3342 * This routine should be called with amp's writer lock when there're no other 3343 * users of amp. All pcache entries of this amp must have been already 3344 * inactivated. We must not drop a_rwlock here to prevent new users from 3345 * attaching to this amp. 3346 */ 3347 void 3348 anonmap_purge(struct anon_map *amp) 3349 { 3350 ASSERT(ANON_WRITE_HELD(&->a_rwlock)); 3351 ASSERT(amp->refcnt <= 1); 3352 3353 if (amp->a_softlockcnt != 0) { 3354 seg_ppurge(NULL, amp, 0); 3355 } 3356 3357 /* 3358 * Since all pcache entries were already inactive before this routine 3359 * was called seg_ppurge() couldn't return while there're still 3360 * entries that can be found via the list anchored at a_phead. So we 3361 * can assert this list is empty now. a_softlockcnt may be still non 0 3362 * if asynchronous thread that manages pcache already removed pcache 3363 * entries but hasn't unlocked the pages yet. If a_softlockcnt is non 3364 * 0 we just wait on a_purgecv for shamp_reclaim() to finish. Even if 3365 * a_softlockcnt is 0 we grab a_purgemtx to avoid freeing anon map 3366 * before shamp_reclaim() is done with it. a_purgemtx also taken by 3367 * shamp_reclaim() while a_softlockcnt was still not 0 acts as a 3368 * barrier that prevents anonmap_purge() to complete while 3369 * shamp_reclaim() may still be referencing this amp. 
3370 */ 3371 ASSERT(amp->a_phead.p_lnext == &->a_phead); 3372 ASSERT(amp->a_phead.p_lprev == &->a_phead); 3373 3374 mutex_enter(&->a_purgemtx); 3375 while (amp->a_softlockcnt != 0) { 3376 ASSERT(amp->a_phead.p_lnext == &->a_phead); 3377 ASSERT(amp->a_phead.p_lprev == &->a_phead); 3378 amp->a_purgewait = 1; 3379 cv_wait(&->a_purgecv, &->a_purgemtx); 3380 } 3381 mutex_exit(&->a_purgemtx); 3382 3383 ASSERT(amp->a_phead.p_lnext == &->a_phead); 3384 ASSERT(amp->a_phead.p_lprev == &->a_phead); 3385 ASSERT(amp->a_softlockcnt == 0); 3386 } 3387 3388 /* 3389 * Allocate and initialize an anon_map structure for seg 3390 * associating the given swap reservation with the new anon_map. 3391 */ 3392 struct anon_map * 3393 anonmap_alloc(size_t size, size_t swresv, int flags) 3394 { 3395 struct anon_map *amp; 3396 int kmflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 3397 3398 amp = kmem_cache_alloc(anonmap_cache, kmflags); 3399 if (amp == NULL) { 3400 ASSERT(kmflags == KM_NOSLEEP); 3401 return (NULL); 3402 } 3403 3404 amp->ahp = anon_create(btopr(size), flags); 3405 if (amp->ahp == NULL) { 3406 ASSERT(flags == ANON_NOSLEEP); 3407 kmem_cache_free(anonmap_cache, amp); 3408 return (NULL); 3409 } 3410 amp->refcnt = 1; 3411 amp->size = size; 3412 amp->swresv = swresv; 3413 amp->locality = 0; 3414 amp->a_szc = 0; 3415 amp->a_sp = NULL; 3416 amp->a_softlockcnt = 0; 3417 amp->a_purgewait = 0; 3418 amp->a_phead.p_lnext = &->a_phead; 3419 amp->a_phead.p_lprev = &->a_phead; 3420 3421 return (amp); 3422 } 3423 3424 void 3425 anonmap_free(struct anon_map *amp) 3426 { 3427 ASSERT(amp->ahp != NULL); 3428 ASSERT(amp->refcnt == 0); 3429 ASSERT(amp->a_softlockcnt == 0); 3430 ASSERT(amp->a_phead.p_lnext == &->a_phead); 3431 ASSERT(amp->a_phead.p_lprev == &->a_phead); 3432 3433 lgrp_shm_policy_fini(amp, NULL); 3434 anon_release(amp->ahp, btopr(amp->size)); 3435 kmem_cache_free(anonmap_cache, amp); 3436 } 3437 3438 /* 3439 * Returns true if the app array has some empty slots. 3440 * The offp and lenp parameters are in/out parameters. On entry 3441 * these values represent the starting offset and length of the 3442 * mapping. When true is returned, these values may be modified 3443 * to be the largest range which includes empty slots. 3444 */ 3445 int 3446 non_anon(struct anon_hdr *ahp, ulong_t anon_idx, u_offset_t *offp, 3447 size_t *lenp) 3448 { 3449 ulong_t i, el; 3450 ssize_t low, high; 3451 struct anon *ap; 3452 3453 low = -1; 3454 for (i = 0, el = *lenp; i < el; i += PAGESIZE, anon_idx++) { 3455 ap = anon_get_ptr(ahp, anon_idx); 3456 if (ap == NULL) { 3457 if (low == -1) 3458 low = i; 3459 high = i; 3460 } 3461 } 3462 if (low != -1) { 3463 /* 3464 * Found at least one non-anon page. 3465 * Set up the off and len return values. 3466 */ 3467 if (low != 0) 3468 *offp += low; 3469 *lenp = high - low + PAGESIZE; 3470 return (1); 3471 } 3472 return (0); 3473 } 3474 3475 /* 3476 * Return a count of the number of existing anon pages in the anon array 3477 * app in the range (off, off+len). The array and slots must be guaranteed 3478 * stable by the caller. 3479 */ 3480 pgcnt_t 3481 anon_pages(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots) 3482 { 3483 pgcnt_t cnt = 0; 3484 3485 while (nslots-- > 0) { 3486 if ((anon_get_ptr(ahp, anon_index)) != NULL) 3487 cnt++; 3488 anon_index++; 3489 } 3490 return (cnt); 3491 } 3492 3493 /* 3494 * Move reserved phys swap into memory swap (unreserve phys swap 3495 * and reserve mem swap by the same amount). 
3496 * Used by segspt when it needs to lock reserved swap npages in memory 3497 */ 3498 int 3499 anon_swap_adjust(pgcnt_t npages) 3500 { 3501 pgcnt_t unlocked_mem_swap; 3502 3503 mutex_enter(&anoninfo_lock); 3504 3505 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 3506 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 3507 3508 unlocked_mem_swap = k_anoninfo.ani_mem_resv 3509 - k_anoninfo.ani_locked_swap; 3510 if (npages > unlocked_mem_swap) { 3511 spgcnt_t adjusted_swap = npages - unlocked_mem_swap; 3512 3513 /* 3514 * if there is not enough unlocked mem swap we take missing 3515 * amount from phys swap and give it to mem swap 3516 */ 3517 if (!page_reclaim_mem(adjusted_swap, segspt_minfree, 1)) { 3518 mutex_exit(&anoninfo_lock); 3519 return (ENOMEM); 3520 } 3521 3522 k_anoninfo.ani_mem_resv += adjusted_swap; 3523 ASSERT(k_anoninfo.ani_phys_resv >= adjusted_swap); 3524 k_anoninfo.ani_phys_resv -= adjusted_swap; 3525 3526 ANI_ADD(adjusted_swap); 3527 } 3528 k_anoninfo.ani_locked_swap += npages; 3529 3530 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 3531 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 3532 3533 mutex_exit(&anoninfo_lock); 3534 3535 return (0); 3536 } 3537 3538 /* 3539 * 'unlocked' reserved mem swap so when it is unreserved it 3540 * can be moved back phys (disk) swap 3541 */ 3542 void 3543 anon_swap_restore(pgcnt_t npages) 3544 { 3545 mutex_enter(&anoninfo_lock); 3546 3547 ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv); 3548 3549 ASSERT(k_anoninfo.ani_locked_swap >= npages); 3550 k_anoninfo.ani_locked_swap -= npages; 3551 3552 ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv); 3553 3554 mutex_exit(&anoninfo_lock); 3555 } 3556 3557 /* 3558 * Return the pointer from the list for a 3559 * specified anon index. 3560 */ 3561 ulong_t * 3562 anon_get_slot(struct anon_hdr *ahp, ulong_t an_idx) 3563 { 3564 struct anon **app; 3565 void **ppp; 3566 3567 ASSERT(an_idx < ahp->size); 3568 3569 /* 3570 * Single level case. 3571 */ 3572 if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 3573 return ((ulong_t *)&ahp->array_chunk[an_idx]); 3574 } else { 3575 3576 /* 3577 * 2 level case. 3578 */ 3579 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 3580 if (*ppp == NULL) { 3581 mutex_enter(&ahp->serial_lock); 3582 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 3583 if (*ppp == NULL) 3584 *ppp = kmem_zalloc(PAGESIZE, KM_SLEEP); 3585 mutex_exit(&ahp->serial_lock); 3586 } 3587 app = *ppp; 3588 return ((ulong_t *)&app[an_idx & ANON_CHUNK_OFF]); 3589 } 3590 } 3591 3592 void 3593 anon_array_enter(struct anon_map *amp, ulong_t an_idx, anon_sync_obj_t *sobj) 3594 { 3595 ulong_t *ap_slot; 3596 kmutex_t *mtx; 3597 kcondvar_t *cv; 3598 int hash; 3599 3600 /* 3601 * Use szc to determine anon slot(s) to appear atomic. 3602 * If szc = 0, then lock the anon slot and mark it busy. 3603 * If szc > 0, then lock the range of slots by getting the 3604 * anon_array_lock for the first anon slot, and mark only the 3605 * first anon slot busy to represent whole range being busy. 
3606 */ 3607 3608 ASSERT(RW_READ_HELD(&->a_rwlock)); 3609 an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc)); 3610 hash = ANON_ARRAY_HASH(amp, an_idx); 3611 sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex; 3612 sobj->sync_cv = cv = &anon_array_cv[hash]; 3613 mutex_enter(mtx); 3614 ap_slot = anon_get_slot(amp->ahp, an_idx); 3615 while (ANON_ISBUSY(ap_slot)) 3616 cv_wait(cv, mtx); 3617 ANON_SETBUSY(ap_slot); 3618 sobj->sync_data = ap_slot; 3619 mutex_exit(mtx); 3620 } 3621 3622 int 3623 anon_array_try_enter(struct anon_map *amp, ulong_t an_idx, 3624 anon_sync_obj_t *sobj) 3625 { 3626 ulong_t *ap_slot; 3627 kmutex_t *mtx; 3628 int hash; 3629 3630 /* 3631 * Try to lock a range of anon slots. 3632 * Use szc to determine anon slot(s) to appear atomic. 3633 * If szc = 0, then lock the anon slot and mark it busy. 3634 * If szc > 0, then lock the range of slots by getting the 3635 * anon_array_lock for the first anon slot, and mark only the 3636 * first anon slot busy to represent whole range being busy. 3637 * Fail if the mutex or the anon_array are busy. 3638 */ 3639 3640 ASSERT(RW_READ_HELD(&->a_rwlock)); 3641 an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc)); 3642 hash = ANON_ARRAY_HASH(amp, an_idx); 3643 sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex; 3644 sobj->sync_cv = &anon_array_cv[hash]; 3645 if (!mutex_tryenter(mtx)) { 3646 return (EWOULDBLOCK); 3647 } 3648 ap_slot = anon_get_slot(amp->ahp, an_idx); 3649 if (ANON_ISBUSY(ap_slot)) { 3650 mutex_exit(mtx); 3651 return (EWOULDBLOCK); 3652 } 3653 ANON_SETBUSY(ap_slot); 3654 sobj->sync_data = ap_slot; 3655 mutex_exit(mtx); 3656 return (0); 3657 } 3658 3659 void 3660 anon_array_exit(anon_sync_obj_t *sobj) 3661 { 3662 mutex_enter(sobj->sync_mutex); 3663 ASSERT(ANON_ISBUSY(sobj->sync_data)); 3664 ANON_CLRBUSY(sobj->sync_data); 3665 if (CV_HAS_WAITERS(sobj->sync_cv)) 3666 cv_broadcast(sobj->sync_cv); 3667 mutex_exit(sobj->sync_mutex); 3668 } 3669