1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2015, Joyent, Inc. All rights reserved. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 37 */ 38 39 /* 40 * VM - anonymous pages. 41 * 42 * This layer sits immediately above the vm_swap layer. It manages 43 * physical pages that have no permanent identity in the file system 44 * name space, using the services of the vm_swap layer to allocate 45 * backing storage for these pages. Since these pages have no external 46 * identity, they are discarded when the last reference is removed. 
 *
 * An important function of this layer is to manage low-level sharing
 * of pages that are logically distinct but that happen to be
 * physically identical (e.g., the corresponding pages of the processes
 * resulting from a fork before one process or the other changes their
 * contents).  This pseudo-sharing is present only as an optimization
 * and is not to be confused with true sharing in which multiple
 * address spaces deliberately contain references to the same object;
 * such sharing is managed at a higher level.
 *
 * The key data structure here is the anon struct, which contains a
 * reference count for its associated physical page and a hint about
 * the identity of that page.  Anon structs typically live in arrays,
 * with an instance's position in its array determining where the
 * corresponding backing storage is allocated; however, the swap_xlate()
 * routine abstracts away this representation information so that the
 * rest of the anon layer need not know it.  (See the swap layer for
 * more details on anon struct layout.)
 *
 * In future versions of the system, the association between an
 * anon struct and its position on backing store will change so that
 * we don't require backing store for all anonymous pages in the system.
 * This is an important consideration for large memory systems.
 * We can also use this technique to delay binding physical locations
 * to anonymous pages until pageout/swapout time where we can make
 * smarter allocation decisions to improve anonymous klustering.
 *
 * Many of the routines defined here take a (struct anon **) argument,
 * which allows the code at this level to manage anon pages directly,
 * so that callers can regard anon structs as opaque objects and not be
 * concerned with assigning or inspecting their contents.
 *
 * Clients of this layer refer to anon pages indirectly.
That is, they 80 * maintain arrays of pointers to anon structs rather than maintaining 81 * anon structs themselves. The (struct anon **) arguments mentioned 82 * above are pointers to entries in these arrays. It is these arrays 83 * that capture the mapping between offsets within a given segment and 84 * the corresponding anonymous backing storage address. 85 */ 86 87 #ifdef DEBUG 88 #define ANON_DEBUG 89 #endif 90 91 #include <sys/types.h> 92 #include <sys/t_lock.h> 93 #include <sys/param.h> 94 #include <sys/systm.h> 95 #include <sys/mman.h> 96 #include <sys/cred.h> 97 #include <sys/thread.h> 98 #include <sys/vnode.h> 99 #include <sys/cpuvar.h> 100 #include <sys/swap.h> 101 #include <sys/cmn_err.h> 102 #include <sys/vtrace.h> 103 #include <sys/kmem.h> 104 #include <sys/sysmacros.h> 105 #include <sys/bitmap.h> 106 #include <sys/vmsystm.h> 107 #include <sys/tuneable.h> 108 #include <sys/debug.h> 109 #include <sys/fs/swapnode.h> 110 #include <sys/tnf_probe.h> 111 #include <sys/lgrp.h> 112 #include <sys/policy.h> 113 #include <sys/condvar_impl.h> 114 #include <sys/mutex_impl.h> 115 #include <sys/rctl.h> 116 117 #include <vm/as.h> 118 #include <vm/hat.h> 119 #include <vm/anon.h> 120 #include <vm/page.h> 121 #include <vm/vpage.h> 122 #include <vm/seg.h> 123 #include <vm/rm.h> 124 125 #include <fs/fs_subr.h> 126 127 struct vnode *anon_vp; 128 129 int anon_debug; 130 131 kmutex_t anoninfo_lock; 132 struct k_anoninfo k_anoninfo; 133 ani_free_t *ani_free_pool; 134 pad_mutex_t anon_array_lock[ANON_LOCKSIZE]; 135 kcondvar_t anon_array_cv[ANON_LOCKSIZE]; 136 137 /* 138 * Global hash table for (vp, off) -> anon slot 139 */ 140 extern int swap_maxcontig; 141 size_t anon_hash_size; 142 unsigned int anon_hash_shift; 143 struct anon **anon_hash; 144 145 static struct kmem_cache *anon_cache; 146 static struct kmem_cache *anonmap_cache; 147 148 pad_mutex_t *anonhash_lock; 149 150 /* 151 * Used to make the increment of all refcnts of all anon slots of a large 152 * page appear to be 
atomic. The lock is grabbed for the first anon slot of 153 * a large page. 154 */ 155 pad_mutex_t *anonpages_hash_lock; 156 157 #define APH_MUTEX(vp, off) \ 158 (&anonpages_hash_lock[(ANON_HASH((vp), (off)) & \ 159 (AH_LOCK_SIZE - 1))].pad_mutex) 160 161 #ifdef VM_STATS 162 static struct anonvmstats_str { 163 ulong_t getpages[30]; 164 ulong_t privatepages[10]; 165 ulong_t demotepages[9]; 166 ulong_t decrefpages[9]; 167 ulong_t dupfillholes[4]; 168 ulong_t freepages[1]; 169 } anonvmstats; 170 #endif /* VM_STATS */ 171 172 /*ARGSUSED*/ 173 static int 174 anonmap_cache_constructor(void *buf, void *cdrarg, int kmflags) 175 { 176 struct anon_map *amp = buf; 177 178 rw_init(&->a_rwlock, NULL, RW_DEFAULT, NULL); 179 cv_init(&->a_purgecv, NULL, CV_DEFAULT, NULL); 180 mutex_init(&->a_pmtx, NULL, MUTEX_DEFAULT, NULL); 181 mutex_init(&->a_purgemtx, NULL, MUTEX_DEFAULT, NULL); 182 return (0); 183 } 184 185 /*ARGSUSED1*/ 186 static void 187 anonmap_cache_destructor(void *buf, void *cdrarg) 188 { 189 struct anon_map *amp = buf; 190 191 rw_destroy(&->a_rwlock); 192 cv_destroy(&->a_purgecv); 193 mutex_destroy(&->a_pmtx); 194 mutex_destroy(&->a_purgemtx); 195 } 196 197 void 198 anon_init(void) 199 { 200 int i; 201 pad_mutex_t *tmp; 202 203 /* These both need to be powers of 2 so round up to the next power */ 204 anon_hash_shift = highbit((physmem / ANON_HASHAVELEN) - 1); 205 anon_hash_size = 1L << anon_hash_shift; 206 207 /* 208 * We need to align the anonhash_lock and anonpages_hash_lock arrays 209 * to a 64B boundary to avoid false sharing. We add 63B to our 210 * allocation so that we can get a 64B aligned address to use. 211 * We allocate both of these together to avoid wasting an additional 212 * 63B. 
213 */ 214 tmp = kmem_zalloc((2 * AH_LOCK_SIZE * sizeof (pad_mutex_t)) + 63, 215 KM_SLEEP); 216 anonhash_lock = (pad_mutex_t *)P2ROUNDUP((uintptr_t)tmp, 64); 217 anonpages_hash_lock = anonhash_lock + AH_LOCK_SIZE; 218 219 for (i = 0; i < AH_LOCK_SIZE; i++) { 220 mutex_init(&anonhash_lock[i].pad_mutex, NULL, MUTEX_DEFAULT, 221 NULL); 222 mutex_init(&anonpages_hash_lock[i].pad_mutex, NULL, 223 MUTEX_DEFAULT, NULL); 224 } 225 226 for (i = 0; i < ANON_LOCKSIZE; i++) { 227 mutex_init(&anon_array_lock[i].pad_mutex, NULL, 228 MUTEX_DEFAULT, NULL); 229 cv_init(&anon_array_cv[i], NULL, CV_DEFAULT, NULL); 230 } 231 232 anon_hash = (struct anon **) 233 kmem_zalloc(sizeof (struct anon *) * anon_hash_size, KM_SLEEP); 234 anon_cache = kmem_cache_create("anon_cache", sizeof (struct anon), 235 AN_CACHE_ALIGN, NULL, NULL, NULL, NULL, NULL, KMC_PREFILL); 236 anonmap_cache = kmem_cache_create("anonmap_cache", 237 sizeof (struct anon_map), 0, 238 anonmap_cache_constructor, anonmap_cache_destructor, NULL, 239 NULL, NULL, 0); 240 swap_maxcontig = (1024 * 1024) >> PAGESHIFT; /* 1MB of pages */ 241 242 tmp = kmem_zalloc((ANI_MAX_POOL * sizeof (ani_free_t)) + 63, KM_SLEEP); 243 /* Round ani_free_pool to cacheline boundary to avoid false sharing. */ 244 ani_free_pool = (ani_free_t *)P2ROUNDUP((uintptr_t)tmp, 64); 245 246 anon_vp = vn_alloc(KM_SLEEP); 247 vn_setops(anon_vp, swap_vnodeops); 248 anon_vp->v_type = VREG; 249 anon_vp->v_flag |= (VISSWAP|VISSWAPFS); 250 } 251 252 /* 253 * Global anon slot hash table manipulation. 
254 */ 255 256 static void 257 anon_addhash(struct anon *ap) 258 { 259 int index; 260 261 ASSERT(MUTEX_HELD(AH_MUTEX(ap->an_vp, ap->an_off))); 262 index = ANON_HASH(ap->an_vp, ap->an_off); 263 ap->an_hash = anon_hash[index]; 264 anon_hash[index] = ap; 265 } 266 267 static void 268 anon_rmhash(struct anon *ap) 269 { 270 struct anon **app; 271 272 ASSERT(MUTEX_HELD(AH_MUTEX(ap->an_vp, ap->an_off))); 273 274 for (app = &anon_hash[ANON_HASH(ap->an_vp, ap->an_off)]; 275 *app; app = &((*app)->an_hash)) { 276 if (*app == ap) { 277 *app = ap->an_hash; 278 break; 279 } 280 } 281 } 282 283 /* 284 * The anon array interfaces. Functions allocating, 285 * freeing array of pointers, and returning/setting 286 * entries in the array of pointers for a given offset. 287 * 288 * Create the list of pointers 289 */ 290 struct anon_hdr * 291 anon_create(pgcnt_t npages, int flags) 292 { 293 struct anon_hdr *ahp; 294 ulong_t nchunks; 295 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 296 297 if ((ahp = kmem_zalloc(sizeof (struct anon_hdr), kmemflags)) == NULL) { 298 return (NULL); 299 } 300 301 mutex_init(&ahp->serial_lock, NULL, MUTEX_DEFAULT, NULL); 302 /* 303 * Single level case. 304 */ 305 ahp->size = npages; 306 if (npages <= ANON_CHUNK_SIZE || (flags & ANON_ALLOC_FORCE)) { 307 308 if (flags & ANON_ALLOC_FORCE) 309 ahp->flags |= ANON_ALLOC_FORCE; 310 311 ahp->array_chunk = kmem_zalloc( 312 ahp->size * sizeof (struct anon *), kmemflags); 313 314 if (ahp->array_chunk == NULL) { 315 kmem_free(ahp, sizeof (struct anon_hdr)); 316 return (NULL); 317 } 318 } else { 319 /* 320 * 2 Level case. 321 * anon hdr size needs to be rounded off to be a multiple 322 * of ANON_CHUNK_SIZE. This is important as various anon 323 * related functions depend on this. 324 * NOTE - 325 * anon_grow() makes anon hdr size a multiple of 326 * ANON_CHUNK_SIZE. 327 * amp size is <= anon hdr size. 328 * anon_index + seg_pgs <= anon hdr size. 
329 */ 330 ahp->size = P2ROUNDUP(npages, ANON_CHUNK_SIZE); 331 nchunks = ahp->size >> ANON_CHUNK_SHIFT; 332 333 ahp->array_chunk = kmem_zalloc(nchunks * sizeof (ulong_t *), 334 kmemflags); 335 336 if (ahp->array_chunk == NULL) { 337 kmem_free(ahp, sizeof (struct anon_hdr)); 338 return (NULL); 339 } 340 } 341 return (ahp); 342 } 343 344 /* 345 * Free the array of pointers 346 */ 347 void 348 anon_release(struct anon_hdr *ahp, pgcnt_t npages) 349 { 350 ulong_t i; 351 void **ppp; 352 ulong_t nchunks; 353 354 ASSERT(npages <= ahp->size); 355 356 /* 357 * Single level case. 358 */ 359 if (npages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 360 kmem_free(ahp->array_chunk, ahp->size * sizeof (struct anon *)); 361 } else { 362 /* 363 * 2 level case. 364 */ 365 nchunks = ahp->size >> ANON_CHUNK_SHIFT; 366 for (i = 0; i < nchunks; i++) { 367 ppp = &ahp->array_chunk[i]; 368 if (*ppp != NULL) 369 kmem_free(*ppp, PAGESIZE); 370 } 371 kmem_free(ahp->array_chunk, nchunks * sizeof (ulong_t *)); 372 } 373 mutex_destroy(&ahp->serial_lock); 374 kmem_free(ahp, sizeof (struct anon_hdr)); 375 } 376 377 /* 378 * Return the pointer from the list for a 379 * specified anon index. 380 */ 381 struct anon * 382 anon_get_ptr(struct anon_hdr *ahp, ulong_t an_idx) 383 { 384 struct anon **app; 385 386 ASSERT(an_idx < ahp->size); 387 388 /* 389 * Single level case. 390 */ 391 if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 392 return ((struct anon *) 393 ((uintptr_t)ahp->array_chunk[an_idx] & ANON_PTRMASK)); 394 } else { 395 396 /* 397 * 2 level case. 398 */ 399 app = ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 400 if (app) { 401 return ((struct anon *) 402 ((uintptr_t)app[an_idx & ANON_CHUNK_OFF] & 403 ANON_PTRMASK)); 404 } else { 405 return (NULL); 406 } 407 } 408 } 409 410 /* 411 * Return the anon pointer for the first valid entry in the anon list, 412 * starting from the given index. 
413 */ 414 struct anon * 415 anon_get_next_ptr(struct anon_hdr *ahp, ulong_t *index) 416 { 417 struct anon *ap; 418 struct anon **app; 419 ulong_t chunkoff; 420 ulong_t i; 421 ulong_t j; 422 pgcnt_t size; 423 424 i = *index; 425 size = ahp->size; 426 427 ASSERT(i < size); 428 429 if ((size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 430 /* 431 * 1 level case 432 */ 433 while (i < size) { 434 ap = (struct anon *) 435 ((uintptr_t)ahp->array_chunk[i] & ANON_PTRMASK); 436 if (ap) { 437 *index = i; 438 return (ap); 439 } 440 i++; 441 } 442 } else { 443 /* 444 * 2 level case 445 */ 446 chunkoff = i & ANON_CHUNK_OFF; 447 while (i < size) { 448 app = ahp->array_chunk[i >> ANON_CHUNK_SHIFT]; 449 if (app) 450 for (j = chunkoff; j < ANON_CHUNK_SIZE; j++) { 451 ap = (struct anon *) 452 ((uintptr_t)app[j] & ANON_PTRMASK); 453 if (ap) { 454 *index = i + (j - chunkoff); 455 return (ap); 456 } 457 } 458 chunkoff = 0; 459 i = (i + ANON_CHUNK_SIZE) & ~ANON_CHUNK_OFF; 460 } 461 } 462 *index = size; 463 return (NULL); 464 } 465 466 /* 467 * Set list entry with a given pointer for a specified offset 468 */ 469 int 470 anon_set_ptr(struct anon_hdr *ahp, ulong_t an_idx, struct anon *ap, int flags) 471 { 472 void **ppp; 473 struct anon **app; 474 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 475 uintptr_t *ap_addr; 476 477 ASSERT(an_idx < ahp->size); 478 479 /* 480 * Single level case. 481 */ 482 if (ahp->size <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 483 ap_addr = (uintptr_t *)&ahp->array_chunk[an_idx]; 484 } else { 485 486 /* 487 * 2 level case. 
488 */ 489 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 490 491 ASSERT(ppp != NULL); 492 if (*ppp == NULL) { 493 mutex_enter(&ahp->serial_lock); 494 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 495 if (*ppp == NULL) { 496 *ppp = kmem_zalloc(PAGESIZE, kmemflags); 497 if (*ppp == NULL) { 498 mutex_exit(&ahp->serial_lock); 499 return (ENOMEM); 500 } 501 } 502 mutex_exit(&ahp->serial_lock); 503 } 504 app = *ppp; 505 ap_addr = (uintptr_t *)&app[an_idx & ANON_CHUNK_OFF]; 506 } 507 *ap_addr = (*ap_addr & ~ANON_PTRMASK) | (uintptr_t)ap; 508 return (0); 509 } 510 511 /* 512 * Copy anon array into a given new anon array 513 */ 514 int 515 anon_copy_ptr(struct anon_hdr *sahp, ulong_t s_idx, 516 struct anon_hdr *dahp, ulong_t d_idx, 517 pgcnt_t npages, int flags) 518 { 519 void **sapp, **dapp; 520 void *ap; 521 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 522 523 ASSERT((s_idx < sahp->size) && (d_idx < dahp->size)); 524 ASSERT((npages <= sahp->size) && (npages <= dahp->size)); 525 526 /* 527 * Both arrays are 1 level. 528 */ 529 if (((sahp->size <= ANON_CHUNK_SIZE) && 530 (dahp->size <= ANON_CHUNK_SIZE)) || 531 ((sahp->flags & ANON_ALLOC_FORCE) && 532 (dahp->flags & ANON_ALLOC_FORCE))) { 533 534 bcopy(&sahp->array_chunk[s_idx], &dahp->array_chunk[d_idx], 535 npages * sizeof (struct anon *)); 536 return (0); 537 } 538 539 /* 540 * Both arrays are 2 levels. 
541 */ 542 if (sahp->size > ANON_CHUNK_SIZE && 543 dahp->size > ANON_CHUNK_SIZE && 544 ((sahp->flags & ANON_ALLOC_FORCE) == 0) && 545 ((dahp->flags & ANON_ALLOC_FORCE) == 0)) { 546 547 ulong_t sapidx, dapidx; 548 ulong_t *sap, *dap; 549 ulong_t chknp; 550 551 while (npages != 0) { 552 553 sapidx = s_idx & ANON_CHUNK_OFF; 554 dapidx = d_idx & ANON_CHUNK_OFF; 555 chknp = ANON_CHUNK_SIZE - MAX(sapidx, dapidx); 556 if (chknp > npages) 557 chknp = npages; 558 559 sapp = &sahp->array_chunk[s_idx >> ANON_CHUNK_SHIFT]; 560 if ((sap = *sapp) != NULL) { 561 dapp = &dahp->array_chunk[d_idx 562 >> ANON_CHUNK_SHIFT]; 563 if ((dap = *dapp) == NULL) { 564 *dapp = kmem_zalloc(PAGESIZE, 565 kmemflags); 566 if ((dap = *dapp) == NULL) 567 return (ENOMEM); 568 } 569 bcopy((sap + sapidx), (dap + dapidx), 570 chknp << ANON_PTRSHIFT); 571 } 572 s_idx += chknp; 573 d_idx += chknp; 574 npages -= chknp; 575 } 576 return (0); 577 } 578 579 /* 580 * At least one of the arrays is 2 level. 581 */ 582 while (npages--) { 583 if ((ap = anon_get_ptr(sahp, s_idx)) != NULL) { 584 ASSERT(!ANON_ISBUSY(anon_get_slot(sahp, s_idx))); 585 if (anon_set_ptr(dahp, d_idx, ap, flags) == ENOMEM) 586 return (ENOMEM); 587 } 588 s_idx++; 589 d_idx++; 590 } 591 return (0); 592 } 593 594 595 /* 596 * ANON_INITBUF is a convenience macro for anon_grow() below. It 597 * takes a buffer dst, which is at least as large as buffer src. It 598 * does a bcopy from src into dst, and then bzeros the extra bytes 599 * of dst. If tail is set, the data in src is tail aligned within 600 * dst instead of head aligned. 
601 */ 602 603 #define ANON_INITBUF(src, srclen, dst, dstsize, tail) \ 604 if (tail) { \ 605 bzero((dst), (dstsize) - (srclen)); \ 606 bcopy((src), (char *)(dst) + (dstsize) - (srclen), (srclen)); \ 607 } else { \ 608 bcopy((src), (dst), (srclen)); \ 609 bzero((char *)(dst) + (srclen), (dstsize) - (srclen)); \ 610 } 611 612 #define ANON_1_LEVEL_INC (ANON_CHUNK_SIZE / 8) 613 #define ANON_2_LEVEL_INC (ANON_1_LEVEL_INC * ANON_CHUNK_SIZE) 614 615 /* 616 * anon_grow() is used to efficiently extend an existing anon array. 617 * startidx_p points to the index into the anon array of the first page 618 * that is in use. oldseg_pgs is the number of pages in use, starting at 619 * *startidx_p. newpages is the number of additional pages desired. 620 * 621 * If startidx_p == NULL, startidx is taken to be 0 and cannot be changed. 622 * 623 * The growth is done by creating a new top level of the anon array, 624 * and (if the array is 2-level) reusing the existing second level arrays. 625 * 626 * flags can be used to specify ANON_NOSLEEP and ANON_GROWDOWN. 627 * 628 * Returns the new number of pages in the anon array. 629 */ 630 pgcnt_t 631 anon_grow(struct anon_hdr *ahp, ulong_t *startidx_p, pgcnt_t oldseg_pgs, 632 pgcnt_t newseg_pgs, int flags) 633 { 634 ulong_t startidx = startidx_p ? *startidx_p : 0; 635 pgcnt_t oldamp_pgs = ahp->size, newamp_pgs; 636 pgcnt_t oelems, nelems, totpages; 637 void **level1; 638 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 639 int growdown = (flags & ANON_GROWDOWN); 640 size_t newarrsz, oldarrsz; 641 void *level2; 642 643 ASSERT(!(startidx_p == NULL && growdown)); 644 ASSERT(startidx + oldseg_pgs <= ahp->size); 645 646 /* 647 * Determine the total number of pages needed in the new 648 * anon array. If growing down, totpages is all pages from 649 * startidx through the end of the array, plus <newseg_pgs> 650 * pages. If growing up, keep all pages from page 0 through 651 * the last page currently in use, plus <newseg_pgs> pages. 
652 */ 653 if (growdown) 654 totpages = oldamp_pgs - startidx + newseg_pgs; 655 else 656 totpages = startidx + oldseg_pgs + newseg_pgs; 657 658 /* If the array is already large enough, just return. */ 659 660 if (oldamp_pgs >= totpages) { 661 if (growdown) 662 *startidx_p = oldamp_pgs - totpages; 663 return (oldamp_pgs); 664 } 665 666 /* 667 * oldamp_pgs/newamp_pgs are the total numbers of pages represented 668 * by the corresponding arrays. 669 * oelems/nelems are the number of pointers in the top level arrays 670 * which may be either level 1 or level 2. 671 * Will the new anon array be one level or two levels? 672 */ 673 if (totpages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 674 newamp_pgs = P2ROUNDUP(totpages, ANON_1_LEVEL_INC); 675 oelems = oldamp_pgs; 676 nelems = newamp_pgs; 677 } else { 678 newamp_pgs = P2ROUNDUP(totpages, ANON_2_LEVEL_INC); 679 oelems = (oldamp_pgs + ANON_CHUNK_OFF) >> ANON_CHUNK_SHIFT; 680 nelems = newamp_pgs >> ANON_CHUNK_SHIFT; 681 } 682 683 newarrsz = nelems * sizeof (void *); 684 level1 = kmem_alloc(newarrsz, kmemflags); 685 if (level1 == NULL) 686 return (0); 687 688 /* Are we converting from a one level to a two level anon array? */ 689 690 if (newamp_pgs > ANON_CHUNK_SIZE && oldamp_pgs <= ANON_CHUNK_SIZE && 691 !(ahp->flags & ANON_ALLOC_FORCE)) { 692 693 /* 694 * Yes, we're converting to a two level. Reuse old level 1 695 * as new level 2 if it is exactly PAGESIZE. Otherwise 696 * alloc a new level 2 and copy the old level 1 data into it. 
697 */ 698 if (oldamp_pgs == ANON_CHUNK_SIZE) { 699 level2 = (void *)ahp->array_chunk; 700 } else { 701 level2 = kmem_alloc(PAGESIZE, kmemflags); 702 if (level2 == NULL) { 703 kmem_free(level1, newarrsz); 704 return (0); 705 } 706 oldarrsz = oldamp_pgs * sizeof (void *); 707 708 ANON_INITBUF(ahp->array_chunk, oldarrsz, 709 level2, PAGESIZE, growdown); 710 kmem_free(ahp->array_chunk, oldarrsz); 711 } 712 bzero(level1, newarrsz); 713 if (growdown) 714 level1[nelems - 1] = level2; 715 else 716 level1[0] = level2; 717 } else { 718 oldarrsz = oelems * sizeof (void *); 719 720 ANON_INITBUF(ahp->array_chunk, oldarrsz, 721 level1, newarrsz, growdown); 722 kmem_free(ahp->array_chunk, oldarrsz); 723 } 724 725 ahp->array_chunk = level1; 726 ahp->size = newamp_pgs; 727 if (growdown) 728 *startidx_p = newamp_pgs - totpages; 729 730 return (newamp_pgs); 731 } 732 733 734 /* 735 * Called to sync ani_free value. 736 */ 737 738 void 739 set_anoninfo(void) 740 { 741 processorid_t ix, max_seqid; 742 pgcnt_t total = 0; 743 static clock_t last_time; 744 clock_t new_time; 745 746 if (ani_free_pool == NULL) 747 return; 748 749 /* 750 * Recompute ani_free at most once per tick. Use max_cpu_seqid_ever to 751 * identify the maximum number of CPUs were ever online. 752 */ 753 new_time = ddi_get_lbolt(); 754 if (new_time > last_time) { 755 756 max_seqid = max_cpu_seqid_ever; 757 ASSERT(ANI_MAX_POOL > max_seqid); 758 for (ix = 0; ix <= max_seqid; ix++) 759 total += ani_free_pool[ix].ani_count; 760 761 last_time = new_time; 762 k_anoninfo.ani_free = total; 763 } 764 } 765 766 /* 767 * Reserve anon space. 768 * 769 * It's no longer simply a matter of incrementing ani_resv to 770 * reserve swap space, we need to check memory-based as well 771 * as disk-backed (physical) swap. The following algorithm 772 * is used: 773 * Check the space on physical swap 774 * i.e. 
amount needed < ani_max - ani_phys_resv 775 * If we are swapping on swapfs check 776 * amount needed < (availrmem - swapfs_minfree) 777 * Since the algorithm to check for the quantity of swap space is 778 * almost the same as that for reserving it, we'll just use anon_resvmem 779 * with a flag to decrement availrmem. 780 * 781 * Return non-zero on success. 782 */ 783 int 784 anon_resvmem(size_t size, boolean_t takemem, zone_t *zone, int tryhard) 785 { 786 pgcnt_t npages = btopr(size); 787 pgcnt_t mswap_pages = 0; 788 pgcnt_t pswap_pages = 0; 789 proc_t *p = curproc; 790 791 if (zone != NULL && takemem) { 792 /* test zone.max-swap resource control */ 793 mutex_enter(&p->p_lock); 794 if (rctl_incr_swap(p, zone, ptob(npages)) != 0) { 795 mutex_exit(&p->p_lock); 796 atomic_add_64(&zone->zone_anon_alloc_fail, 1); 797 return (0); 798 } 799 mutex_exit(&p->p_lock); 800 } 801 mutex_enter(&anoninfo_lock); 802 803 /* 804 * pswap_pages is the number of pages we can take from 805 * physical (i.e. disk-backed) swap. 
806 */ 807 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 808 pswap_pages = k_anoninfo.ani_max - k_anoninfo.ani_phys_resv; 809 810 ANON_PRINT(A_RESV, 811 ("anon_resvmem: npages %lu takemem %u pswap %lu caller %p\n", 812 npages, takemem, pswap_pages, (void *)caller())); 813 814 if (npages <= pswap_pages) { 815 /* 816 * we have enough space on a physical swap 817 */ 818 if (takemem) 819 k_anoninfo.ani_phys_resv += npages; 820 mutex_exit(&anoninfo_lock); 821 return (1); 822 } else if (pswap_pages != 0) { 823 /* 824 * we have some space on a physical swap 825 */ 826 if (takemem) { 827 /* 828 * use up remainder of phys swap 829 */ 830 k_anoninfo.ani_phys_resv += pswap_pages; 831 ASSERT(k_anoninfo.ani_phys_resv == k_anoninfo.ani_max); 832 } 833 } 834 /* 835 * since (npages > pswap_pages) we need mem swap 836 * mswap_pages is the number of pages needed from availrmem 837 */ 838 ASSERT(npages > pswap_pages); 839 mswap_pages = npages - pswap_pages; 840 841 ANON_PRINT(A_RESV, ("anon_resvmem: need %ld pages from memory\n", 842 mswap_pages)); 843 844 /* 845 * priv processes can reserve memory as swap as long as availrmem 846 * remains greater than swapfs_minfree; in the case of non-priv 847 * processes, memory can be reserved as swap only if availrmem 848 * doesn't fall below (swapfs_minfree + swapfs_reserve). Thus, 849 * swapfs_reserve amount of memswap is not available to non-priv 850 * processes. This protects daemons such as automounter dying 851 * as a result of application processes eating away almost entire 852 * membased swap. This safeguard becomes useless if apps are run 853 * with root access. 854 * 855 * swapfs_reserve is minimum of 4Mb or 1/16 of physmem. 
856 * 857 */ 858 if (tryhard) { 859 pgcnt_t floor_pages; 860 861 if (secpolicy_resource_anon_mem(CRED())) { 862 floor_pages = swapfs_minfree; 863 } else { 864 floor_pages = swapfs_minfree + swapfs_reserve; 865 } 866 867 mutex_exit(&anoninfo_lock); 868 (void) page_reclaim_mem(mswap_pages, floor_pages, 0); 869 mutex_enter(&anoninfo_lock); 870 } 871 872 mutex_enter(&freemem_lock); 873 if (availrmem > (swapfs_minfree + swapfs_reserve + mswap_pages) || 874 (availrmem > (swapfs_minfree + mswap_pages) && 875 secpolicy_resource(CRED()) == 0)) { 876 877 if (takemem) { 878 /* 879 * Take the memory from the rest of the system. 880 */ 881 availrmem -= mswap_pages; 882 mutex_exit(&freemem_lock); 883 k_anoninfo.ani_mem_resv += mswap_pages; 884 ANI_ADD(mswap_pages); 885 ANON_PRINT((A_RESV | A_MRESV), 886 ("anon_resvmem: took %ld pages of availrmem\n", 887 mswap_pages)); 888 } else { 889 mutex_exit(&freemem_lock); 890 } 891 892 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 893 mutex_exit(&anoninfo_lock); 894 return (1); 895 } else { 896 /* 897 * Fail if not enough memory 898 */ 899 if (takemem) { 900 k_anoninfo.ani_phys_resv -= pswap_pages; 901 } 902 903 mutex_exit(&freemem_lock); 904 mutex_exit(&anoninfo_lock); 905 ANON_PRINT(A_RESV, 906 ("anon_resvmem: not enough space from swapfs\n")); 907 if (zone != NULL && takemem) 908 rctl_decr_swap(zone, ptob(npages)); 909 return (0); 910 } 911 } 912 913 /* 914 * Give back an anon reservation. 915 */ 916 void 917 anon_unresvmem(size_t size, zone_t *zone) 918 { 919 pgcnt_t npages = btopr(size); 920 spgcnt_t mem_free_pages = 0; 921 pgcnt_t phys_free_slots; 922 #ifdef ANON_DEBUG 923 pgcnt_t mem_resv; 924 #endif 925 if (zone != NULL) 926 rctl_decr_swap(zone, ptob(npages)); 927 928 mutex_enter(&anoninfo_lock); 929 930 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 931 932 /* 933 * If some of this reservation belonged to swapfs 934 * give it back to availrmem. 
935 * ani_mem_resv is the amount of availrmem swapfs has reserved. 936 * but some of that memory could be locked by segspt so we can only 937 * return non locked ani_mem_resv back to availrmem 938 */ 939 if (k_anoninfo.ani_mem_resv > k_anoninfo.ani_locked_swap) { 940 ANON_PRINT((A_RESV | A_MRESV), 941 ("anon_unresv: growing availrmem by %ld pages\n", 942 MIN(k_anoninfo.ani_mem_resv, npages))); 943 944 mem_free_pages = MIN((spgcnt_t)(k_anoninfo.ani_mem_resv - 945 k_anoninfo.ani_locked_swap), npages); 946 mutex_enter(&freemem_lock); 947 availrmem += mem_free_pages; 948 mutex_exit(&freemem_lock); 949 k_anoninfo.ani_mem_resv -= mem_free_pages; 950 951 ANI_ADD(-mem_free_pages); 952 } 953 /* 954 * The remainder of the pages is returned to phys swap 955 */ 956 ASSERT(npages >= mem_free_pages); 957 phys_free_slots = npages - mem_free_pages; 958 959 if (phys_free_slots) { 960 k_anoninfo.ani_phys_resv -= phys_free_slots; 961 } 962 963 #ifdef ANON_DEBUG 964 mem_resv = k_anoninfo.ani_mem_resv; 965 #endif 966 967 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 968 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 969 970 mutex_exit(&anoninfo_lock); 971 972 ANON_PRINT(A_RESV, ("anon_unresv: %lu, tot %lu, caller %p\n", 973 npages, mem_resv, (void *)caller())); 974 } 975 976 /* 977 * Allocate an anon slot and return it with the lock held. 978 */ 979 struct anon * 980 anon_alloc(struct vnode *vp, anoff_t off) 981 { 982 struct anon *ap; 983 kmutex_t *ahm; 984 985 ap = kmem_cache_alloc(anon_cache, KM_SLEEP); 986 if (vp == NULL) { 987 swap_alloc(ap); 988 } else { 989 ap->an_vp = vp; 990 ap->an_off = off; 991 } 992 ap->an_refcnt = 1; 993 ap->an_pvp = NULL; 994 ap->an_poff = 0; 995 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 996 mutex_enter(ahm); 997 anon_addhash(ap); 998 mutex_exit(ahm); 999 ANI_ADD(-1); 1000 ANON_PRINT(A_ANON, ("anon_alloc: returning ap %p, vp %p\n", 1001 (void *)ap, (ap ? 
(void *)ap->an_vp : NULL))); 1002 return (ap); 1003 } 1004 1005 /* 1006 * Called for pages locked in memory via softlock/pagelock/mlock to make sure 1007 * such pages don't consume any physical swap resources needed for swapping 1008 * unlocked pages. 1009 */ 1010 void 1011 anon_swap_free(struct anon *ap, page_t *pp) 1012 { 1013 kmutex_t *ahm; 1014 1015 ASSERT(ap != NULL); 1016 ASSERT(pp != NULL); 1017 ASSERT(PAGE_LOCKED(pp)); 1018 ASSERT(pp->p_vnode != NULL); 1019 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 1020 ASSERT(ap->an_refcnt != 0); 1021 ASSERT(pp->p_vnode == ap->an_vp); 1022 ASSERT(pp->p_offset == ap->an_off); 1023 1024 if (ap->an_pvp == NULL) 1025 return; 1026 1027 page_io_lock(pp); 1028 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1029 mutex_enter(ahm); 1030 1031 ASSERT(ap->an_refcnt != 0); 1032 ASSERT(pp->p_vnode == ap->an_vp); 1033 ASSERT(pp->p_offset == ap->an_off); 1034 1035 if (ap->an_pvp != NULL) { 1036 swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE); 1037 ap->an_pvp = NULL; 1038 ap->an_poff = 0; 1039 mutex_exit(ahm); 1040 hat_setmod(pp); 1041 } else { 1042 mutex_exit(ahm); 1043 } 1044 page_io_unlock(pp); 1045 } 1046 1047 /* 1048 * Decrement the reference count of an anon page. 1049 * If reference count goes to zero, free it and 1050 * its associated page (if any). 1051 */ 1052 void 1053 anon_decref(struct anon *ap) 1054 { 1055 page_t *pp; 1056 struct vnode *vp; 1057 anoff_t off; 1058 kmutex_t *ahm; 1059 1060 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1061 mutex_enter(ahm); 1062 ASSERT(ap->an_refcnt != 0); 1063 if (ap->an_refcnt == 0) 1064 panic("anon_decref: slot count 0"); 1065 if (--ap->an_refcnt == 0) { 1066 swap_xlate(ap, &vp, &off); 1067 anon_rmhash(ap); 1068 if (ap->an_pvp != NULL) 1069 swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE); 1070 mutex_exit(ahm); 1071 1072 /* 1073 * If there is a page for this anon slot we will need to 1074 * call VN_DISPOSE to get rid of the vp association and 1075 * put the page back on the free list as really free. 
1076 * Acquire the "exclusive" lock to ensure that any 1077 * pending i/o always completes before the swap slot 1078 * is freed. 1079 */ 1080 pp = page_lookup(vp, (u_offset_t)off, SE_EXCL); 1081 if (pp != NULL) { 1082 /*LINTED: constant in conditional context */ 1083 VN_DISPOSE(pp, B_INVAL, 0, kcred); 1084 } 1085 ANON_PRINT(A_ANON, ("anon_decref: free ap %p, vp %p\n", 1086 (void *)ap, (void *)ap->an_vp)); 1087 1088 kmem_cache_free(anon_cache, ap); 1089 1090 ANI_ADD(1); 1091 } else { 1092 mutex_exit(ahm); 1093 } 1094 } 1095 1096 1097 /* 1098 * check an_refcnt of the root anon slot (anon_index argument is aligned at 1099 * seg->s_szc level) to determine whether COW processing is required. 1100 * anonpages_hash_lock[] held on the root ap ensures that if root's 1101 * refcnt is 1 all other refcnt's are 1 as well (and they can't increase 1102 * later since this process can't fork while its AS lock is held). 1103 * 1104 * returns 1 if the root anon slot has a refcnt > 1 otherwise returns 0. 1105 */ 1106 int 1107 anon_szcshare(struct anon_hdr *ahp, ulong_t anon_index) 1108 { 1109 struct anon *ap; 1110 kmutex_t *ahmpages = NULL; 1111 1112 ap = anon_get_ptr(ahp, anon_index); 1113 if (ap == NULL) 1114 return (0); 1115 1116 ahmpages = APH_MUTEX(ap->an_vp, ap->an_off); 1117 mutex_enter(ahmpages); 1118 ASSERT(ap->an_refcnt >= 1); 1119 if (ap->an_refcnt == 1) { 1120 mutex_exit(ahmpages); 1121 return (0); 1122 } 1123 mutex_exit(ahmpages); 1124 return (1); 1125 } 1126 /* 1127 * Check 'nslots' anon slots for refcnt > 1. 1128 * 1129 * returns 1 if any of the 'nslots' anon slots has a refcnt > 1 otherwise 1130 * returns 0. 
/*
 * Check 'nslots' anon slots for refcnt > 1.
 *
 * Walks the slots [anon_index, anon_index + nslots) of ahp; empty slots are
 * skipped.  an_refcnt is read here without taking any anon hash mutex.
 *
 * returns 1 if any of the 'nslots' anon slots has a refcnt > 1 otherwise
 * returns 0.
 */
static int
anon_share(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots)
{
	struct anon *ap;

	while (nslots-- > 0) {
		if ((ap = anon_get_ptr(ahp, anon_index)) != NULL &&
		    ap->an_refcnt > 1)
			return (1);
		anon_index++;
	}

	return (0);
}

/*
 * Drop one reference on every anon slot of the large page region that
 * starts at an_idx (which must be aligned to the page count of szc) and
 * clear the corresponding entries in ahp.
 *
 * Two cases, selected by the refcnt of the first (root) slot:
 *
 *  - root refcnt > 1 (or the region is entered with the lock kept):
 *    the root slot's anonpages hash lock (APH_MUTEX) is held across the
 *    whole walk and each slot merely has its refcnt decremented.
 *
 *  - root refcnt == 1 (or root slot empty): every slot is expected to have
 *    refcnt 1; each slot is unhashed, its swap space released and the anon
 *    struct freed.  The backing pages are either destroyed as a large page
 *    (page_destroy_pages) or demoted to p_szc 0 and disposed of one by one.
 *
 * 'refcnt' exists only under DEBUG and is referenced only inside ASSERT()s.
 */
static void
anon_decref_pages(
	struct anon_hdr *ahp,
	ulong_t an_idx,
	uint_t szc)
{
	struct anon *ap = anon_get_ptr(ahp, an_idx);
	kmutex_t *ahmpages = NULL;
	page_t *pp;
	pgcnt_t pgcnt = page_get_pagecnt(szc);
	pgcnt_t i;
	struct vnode *vp;
	anoff_t off;
	kmutex_t *ahm;
#ifdef DEBUG
	int refcnt = 1;
#endif

	ASSERT(szc != 0);
	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
	ASSERT(IS_P2ALIGNED(an_idx, pgcnt));
	ASSERT(an_idx < ahp->size);

	if (ahp->size - an_idx < pgcnt) {
		/*
		 * In case of shared mappings total anon map size may not be
		 * the largest page size aligned.
		 */
		pgcnt = ahp->size - an_idx;
	}

	VM_STAT_ADD(anonvmstats.decrefpages[0]);

	/*
	 * Decide which case applies.  ahmpages stays non-NULL (lock held)
	 * only when the root slot is shared.
	 */
	if (ap != NULL) {
		ahmpages = APH_MUTEX(ap->an_vp, ap->an_off);
		mutex_enter(ahmpages);
		ASSERT((refcnt = ap->an_refcnt) != 0);
		VM_STAT_ADD(anonvmstats.decrefpages[1]);
		if (ap->an_refcnt == 1) {
			VM_STAT_ADD(anonvmstats.decrefpages[2]);
			ASSERT(!anon_share(ahp, an_idx, pgcnt));
			mutex_exit(ahmpages);
			ahmpages = NULL;
		}
	}

	i = 0;
	while (i < pgcnt) {
		if ((ap = anon_get_ptr(ahp, an_idx + i)) == NULL) {
			/* Holes are only expected in the unshared case. */
			ASSERT(refcnt == 1 && ahmpages == NULL);
			i++;
			continue;
		}
		ASSERT(ap->an_refcnt == refcnt);
		ASSERT(ahmpages != NULL || ap->an_refcnt == 1);
		ASSERT(ahmpages == NULL || ap->an_refcnt > 1);

		if (ahmpages == NULL) {
			/* Unshared: this is the last reference on the slot. */
			swap_xlate(ap, &vp, &off);
			pp = page_lookup(vp, (u_offset_t)off, SE_EXCL);
			if (pp == NULL || pp->p_szc == 0) {
				/*
				 * No page, or a base page: free this single
				 * slot and dispose of its page, if any.
				 */
				VM_STAT_ADD(anonvmstats.decrefpages[3]);
				ahm = AH_MUTEX(ap->an_vp, ap->an_off);
				(void) anon_set_ptr(ahp, an_idx + i, NULL,
				    ANON_SLEEP);
				mutex_enter(ahm);
				ap->an_refcnt--;
				ASSERT(ap->an_refcnt == 0);
				anon_rmhash(ap);
				if (ap->an_pvp)
					swap_phys_free(ap->an_pvp, ap->an_poff,
					    PAGESIZE);
				mutex_exit(ahm);
				if (pp == NULL) {
					/*
					 * Look again now that the slot has
					 * been unhashed.
					 */
					pp = page_lookup(vp, (u_offset_t)off,
					    SE_EXCL);
					ASSERT(pp == NULL || pp->p_szc == 0);
				}
				if (pp != NULL) {
					VM_STAT_ADD(anonvmstats.decrefpages[4]);
					/*LINTED*/
					VN_DISPOSE(pp, B_INVAL, 0, kcred);
				}
				kmem_cache_free(anon_cache, ap);
				ANI_ADD(1);
				i++;
			} else {
				/*
				 * The page is part of a large page of
				 * curpgcnt constituents: collect and lock all
				 * of them in ppa[] before freeing anything.
				 */
				pgcnt_t j;
				pgcnt_t curpgcnt =
				    page_get_pagecnt(pp->p_szc);
				size_t ppasize = curpgcnt * sizeof (page_t *);
				page_t **ppa = kmem_alloc(ppasize, KM_SLEEP);
				int dispose = 0;

				VM_STAT_ADD(anonvmstats.decrefpages[5]);

				ASSERT(pp->p_szc <= szc);
				ASSERT(IS_P2ALIGNED(curpgcnt, curpgcnt));
				ASSERT(IS_P2ALIGNED(i, curpgcnt));
				ASSERT(i + curpgcnt <= pgcnt);
				ASSERT(!(page_pptonum(pp) & (curpgcnt - 1)));
				ppa[0] = pp;
				/*
				 * Lock the remaining constituent pages
				 * (slots i + 1 onward) and unload their
				 * translations.  'dispose' is set when a
				 * slot's an_pvp vnode does not use
				 * fs_dispose as its dispose operation.
				 */
				for (j = i + 1; j < i + curpgcnt; j++) {
					ap = anon_get_ptr(ahp, an_idx + j);
					ASSERT(ap != NULL &&
					    ap->an_refcnt == 1);
					swap_xlate(ap, &vp, &off);
					pp = page_lookup(vp, (u_offset_t)off,
					    SE_EXCL);
					if (pp == NULL)
						panic("anon_decref_pages: "
						    "no page");

					(void) hat_pageunload(pp,
					    HAT_FORCE_PGUNLOAD);
					ASSERT(pp->p_szc == ppa[0]->p_szc);
					ASSERT(page_pptonum(pp) - 1 ==
					    page_pptonum(ppa[j - i - 1]));
					ppa[j - i] = pp;
					if (ap->an_pvp != NULL &&
					    !vn_matchopval(ap->an_pvp,
					    VOPNAME_DISPOSE,
					    (fs_generic_func_p)fs_dispose))
						dispose = 1;
				}
				/* Free every anon slot of this large page. */
				for (j = i; j < i + curpgcnt; j++) {
					ap = anon_get_ptr(ahp, an_idx + j);
					ASSERT(ap != NULL &&
					    ap->an_refcnt == 1);
					ahm = AH_MUTEX(ap->an_vp, ap->an_off);
					(void) anon_set_ptr(ahp, an_idx + j,
					    NULL, ANON_SLEEP);
					mutex_enter(ahm);
					ap->an_refcnt--;
					ASSERT(ap->an_refcnt == 0);
					anon_rmhash(ap);
					if (ap->an_pvp)
						swap_phys_free(ap->an_pvp,
						    ap->an_poff, PAGESIZE);
					mutex_exit(ahm);
					kmem_cache_free(anon_cache, ap);
					ANI_ADD(1);
				}
				if (!dispose) {
					/* Destroy the large page as a unit. */
					VM_STAT_ADD(anonvmstats.decrefpages[6]);
					page_destroy_pages(ppa[0]);
				} else {
					/*
					 * Demote every constituent to a base
					 * page first, then dispose of them
					 * individually.
					 */
					VM_STAT_ADD(anonvmstats.decrefpages[7]);
					for (j = 0; j < curpgcnt; j++) {
						ASSERT(PAGE_EXCL(ppa[j]));
						ppa[j]->p_szc = 0;
					}
					for (j = 0; j < curpgcnt; j++) {
						ASSERT(!hat_page_is_mapped(
						    ppa[j]));
						/*LINTED*/
						VN_DISPOSE(ppa[j], B_INVAL, 0,
						    kcred);
					}
				}
				kmem_free(ppa, ppasize);
				i += curpgcnt;
			}
		} else {
			/*
			 * Shared: other references remain, so only clear our
			 * array entry and drop our reference.
			 */
			VM_STAT_ADD(anonvmstats.decrefpages[8]);
			(void) anon_set_ptr(ahp, an_idx + i, NULL, ANON_SLEEP);
			ahm = AH_MUTEX(ap->an_vp, ap->an_off);
			mutex_enter(ahm);
			ap->an_refcnt--;
			mutex_exit(ahm);
			i++;
		}
	}

	if (ahmpages != NULL) {
		mutex_exit(ahmpages);
	}
}
1332 */ 1333 void 1334 anon_dup(struct anon_hdr *old, ulong_t old_idx, struct anon_hdr *new, 1335 ulong_t new_idx, size_t size) 1336 { 1337 spgcnt_t npages; 1338 kmutex_t *ahm; 1339 struct anon *ap; 1340 ulong_t off; 1341 ulong_t index; 1342 1343 npages = btopr(size); 1344 while (npages > 0) { 1345 index = old_idx; 1346 if ((ap = anon_get_next_ptr(old, &index)) == NULL) 1347 break; 1348 1349 ASSERT(!ANON_ISBUSY(anon_get_slot(old, index))); 1350 off = index - old_idx; 1351 npages -= off; 1352 if (npages <= 0) 1353 break; 1354 1355 (void) anon_set_ptr(new, new_idx + off, ap, ANON_SLEEP); 1356 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1357 1358 mutex_enter(ahm); 1359 ap->an_refcnt++; 1360 mutex_exit(ahm); 1361 1362 off++; 1363 new_idx += off; 1364 old_idx += off; 1365 npages--; 1366 } 1367 } 1368 1369 /* 1370 * Just like anon_dup but also guarantees there are no holes (unallocated anon 1371 * slots) within any large page region. That means if a large page region is 1372 * empty in the old array it will skip it. If there are 1 or more valid slots 1373 * in the large page region of the old array it will make sure to fill in any 1374 * unallocated ones and also copy them to the new array. If noalloc is 1 large 1375 * page region should either have no valid anon slots or all slots should be 1376 * valid. 
1377 */ 1378 void 1379 anon_dup_fill_holes( 1380 struct anon_hdr *old, 1381 ulong_t old_idx, 1382 struct anon_hdr *new, 1383 ulong_t new_idx, 1384 size_t size, 1385 uint_t szc, 1386 int noalloc) 1387 { 1388 struct anon *ap; 1389 spgcnt_t npages; 1390 kmutex_t *ahm, *ahmpages = NULL; 1391 pgcnt_t pgcnt, i; 1392 ulong_t index, off; 1393 #ifdef DEBUG 1394 int refcnt; 1395 #endif 1396 1397 ASSERT(szc != 0); 1398 pgcnt = page_get_pagecnt(szc); 1399 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1400 npages = btopr(size); 1401 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1402 ASSERT(IS_P2ALIGNED(old_idx, pgcnt)); 1403 1404 VM_STAT_ADD(anonvmstats.dupfillholes[0]); 1405 1406 while (npages > 0) { 1407 index = old_idx; 1408 1409 /* 1410 * Find the next valid slot. 1411 */ 1412 if (anon_get_next_ptr(old, &index) == NULL) 1413 break; 1414 1415 ASSERT(!ANON_ISBUSY(anon_get_slot(old, index))); 1416 /* 1417 * Now backup index to the beginning of the 1418 * current large page region of the old array. 1419 */ 1420 index = P2ALIGN(index, pgcnt); 1421 off = index - old_idx; 1422 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1423 npages -= off; 1424 if (npages <= 0) 1425 break; 1426 1427 /* 1428 * Fill and copy a large page regions worth 1429 * of anon slots. 1430 */ 1431 for (i = 0; i < pgcnt; i++) { 1432 if ((ap = anon_get_ptr(old, index + i)) == NULL) { 1433 if (noalloc) { 1434 panic("anon_dup_fill_holes: " 1435 "empty anon slot\n"); 1436 } 1437 VM_STAT_ADD(anonvmstats.dupfillholes[1]); 1438 ap = anon_alloc(NULL, 0); 1439 (void) anon_set_ptr(old, index + i, ap, 1440 ANON_SLEEP); 1441 } else if (i == 0) { 1442 /* 1443 * make the increment of all refcnts of all 1444 * anon slots of a large page appear atomic by 1445 * getting an anonpages_hash_lock for the 1446 * first anon slot of a large page. 
1447 */ 1448 VM_STAT_ADD(anonvmstats.dupfillholes[2]); 1449 1450 ahmpages = APH_MUTEX(ap->an_vp, ap->an_off); 1451 mutex_enter(ahmpages); 1452 /*LINTED*/ 1453 ASSERT(refcnt = ap->an_refcnt); 1454 1455 VM_STAT_COND_ADD(ap->an_refcnt > 1, 1456 anonvmstats.dupfillholes[3]); 1457 } 1458 (void) anon_set_ptr(new, new_idx + off + i, ap, 1459 ANON_SLEEP); 1460 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1461 mutex_enter(ahm); 1462 ASSERT(ahmpages != NULL || ap->an_refcnt == 1); 1463 ASSERT(i == 0 || ahmpages == NULL || 1464 refcnt == ap->an_refcnt); 1465 ap->an_refcnt++; 1466 mutex_exit(ahm); 1467 } 1468 if (ahmpages != NULL) { 1469 mutex_exit(ahmpages); 1470 ahmpages = NULL; 1471 } 1472 off += pgcnt; 1473 new_idx += off; 1474 old_idx += off; 1475 npages -= pgcnt; 1476 } 1477 } 1478 1479 /* 1480 * Used when a segment with a vnode changes szc. similarly to 1481 * anon_dup_fill_holes() makes sure each large page region either has no anon 1482 * slots or all of them. but new slots are created by COWing the file 1483 * pages. on entrance no anon slots should be shared. 1484 */ 1485 int 1486 anon_fill_cow_holes( 1487 struct seg *seg, 1488 caddr_t addr, 1489 struct anon_hdr *ahp, 1490 ulong_t an_idx, 1491 struct vnode *vp, 1492 u_offset_t vp_off, 1493 size_t size, 1494 uint_t szc, 1495 uint_t prot, 1496 struct vpage vpage[], 1497 struct cred *cred) 1498 { 1499 struct anon *ap; 1500 spgcnt_t npages; 1501 pgcnt_t pgcnt, i; 1502 ulong_t index, off; 1503 int err = 0; 1504 int pageflags = 0; 1505 1506 ASSERT(szc != 0); 1507 pgcnt = page_get_pagecnt(szc); 1508 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1509 npages = btopr(size); 1510 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1511 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1512 1513 while (npages > 0) { 1514 index = an_idx; 1515 1516 /* 1517 * Find the next valid slot. 
1518 */ 1519 if (anon_get_next_ptr(ahp, &index) == NULL) { 1520 break; 1521 } 1522 1523 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1524 /* 1525 * Now backup index to the beginning of the 1526 * current large page region of the anon array. 1527 */ 1528 index = P2ALIGN(index, pgcnt); 1529 off = index - an_idx; 1530 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1531 npages -= off; 1532 if (npages <= 0) 1533 break; 1534 an_idx += off; 1535 vp_off += ptob(off); 1536 addr += ptob(off); 1537 if (vpage != NULL) { 1538 vpage += off; 1539 } 1540 1541 for (i = 0; i < pgcnt; i++, an_idx++, vp_off += PAGESIZE) { 1542 if ((ap = anon_get_ptr(ahp, an_idx)) == NULL) { 1543 page_t *pl[1 + 1]; 1544 page_t *pp; 1545 1546 err = VOP_GETPAGE(vp, vp_off, PAGESIZE, NULL, 1547 pl, PAGESIZE, seg, addr, S_READ, cred, 1548 NULL); 1549 if (err) { 1550 break; 1551 } 1552 if (vpage != NULL) { 1553 prot = VPP_PROT(vpage); 1554 pageflags = VPP_ISPPLOCK(vpage) ? 1555 LOCK_PAGE : 0; 1556 } 1557 pp = anon_private(&ap, seg, addr, prot, pl[0], 1558 pageflags, cred); 1559 if (pp == NULL) { 1560 err = ENOMEM; 1561 break; 1562 } 1563 (void) anon_set_ptr(ahp, an_idx, ap, 1564 ANON_SLEEP); 1565 page_unlock(pp); 1566 } 1567 ASSERT(ap->an_refcnt == 1); 1568 addr += PAGESIZE; 1569 if (vpage != NULL) { 1570 vpage++; 1571 } 1572 } 1573 npages -= pgcnt; 1574 } 1575 1576 return (err); 1577 } 1578 1579 /* 1580 * Free a group of "size" anon pages, size in bytes, 1581 * and clear out the pointers to the anon entries. 
1582 */ 1583 void 1584 anon_free(struct anon_hdr *ahp, ulong_t index, size_t size) 1585 { 1586 spgcnt_t npages; 1587 struct anon *ap; 1588 ulong_t old; 1589 1590 npages = btopr(size); 1591 1592 while (npages > 0) { 1593 old = index; 1594 if ((ap = anon_get_next_ptr(ahp, &index)) == NULL) 1595 break; 1596 1597 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1598 npages -= index - old; 1599 if (npages <= 0) 1600 break; 1601 1602 (void) anon_set_ptr(ahp, index, NULL, ANON_SLEEP); 1603 anon_decref(ap); 1604 /* 1605 * Bump index and decrement page count 1606 */ 1607 index++; 1608 npages--; 1609 } 1610 } 1611 1612 void 1613 anon_free_pages( 1614 struct anon_hdr *ahp, 1615 ulong_t an_idx, 1616 size_t size, 1617 uint_t szc) 1618 { 1619 spgcnt_t npages; 1620 pgcnt_t pgcnt; 1621 ulong_t index, off; 1622 1623 ASSERT(szc != 0); 1624 pgcnt = page_get_pagecnt(szc); 1625 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1626 npages = btopr(size); 1627 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1628 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1629 ASSERT(an_idx < ahp->size); 1630 1631 VM_STAT_ADD(anonvmstats.freepages[0]); 1632 1633 while (npages > 0) { 1634 index = an_idx; 1635 1636 /* 1637 * Find the next valid slot. 1638 */ 1639 if (anon_get_next_ptr(ahp, &index) == NULL) 1640 break; 1641 1642 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1643 /* 1644 * Now backup index to the beginning of the 1645 * current large page region of the old array. 
1646 */ 1647 index = P2ALIGN(index, pgcnt); 1648 off = index - an_idx; 1649 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1650 npages -= off; 1651 if (npages <= 0) 1652 break; 1653 1654 anon_decref_pages(ahp, index, szc); 1655 1656 off += pgcnt; 1657 an_idx += off; 1658 npages -= pgcnt; 1659 } 1660 } 1661 1662 /* 1663 * Make anonymous pages discardable 1664 */ 1665 int 1666 anon_disclaim(struct anon_map *amp, ulong_t index, size_t size, 1667 uint_t behav, pgcnt_t *purged) 1668 { 1669 spgcnt_t npages = btopr(size); 1670 struct anon *ap; 1671 struct vnode *vp; 1672 anoff_t off; 1673 page_t *pp, *root_pp; 1674 kmutex_t *ahm; 1675 pgcnt_t pgcnt, npurged = 0; 1676 ulong_t old_idx, idx, i; 1677 struct anon_hdr *ahp = amp->ahp; 1678 anon_sync_obj_t cookie; 1679 int err = 0; 1680 1681 VERIFY(behav == MADV_FREE || behav == MADV_PURGE); 1682 ASSERT(RW_READ_HELD(&->a_rwlock)); 1683 pgcnt = 1; 1684 for (; npages > 0; index = (pgcnt == 1) ? index + 1 : 1685 P2ROUNDUP(index + 1, pgcnt), npages -= pgcnt) { 1686 1687 /* 1688 * get anon pointer and index for the first valid entry 1689 * in the anon list, starting from "index" 1690 */ 1691 old_idx = index; 1692 if ((ap = anon_get_next_ptr(ahp, &index)) == NULL) 1693 break; 1694 1695 /* 1696 * decrement npages by number of NULL anon slots we skipped 1697 */ 1698 npages -= index - old_idx; 1699 if (npages <= 0) 1700 break; 1701 1702 anon_array_enter(amp, index, &cookie); 1703 ap = anon_get_ptr(ahp, index); 1704 ASSERT(ap != NULL); 1705 1706 /* 1707 * Get anonymous page and try to lock it SE_EXCL; 1708 * if we couldn't grab the lock we skip to next page. 1709 */ 1710 swap_xlate(ap, &vp, &off); 1711 pp = page_lookup_nowait(vp, (u_offset_t)off, SE_EXCL); 1712 if (pp == NULL) { 1713 segadvstat.MADV_FREE_miss.value.ul++; 1714 pgcnt = 1; 1715 anon_array_exit(&cookie); 1716 continue; 1717 } 1718 pgcnt = page_get_pagecnt(pp->p_szc); 1719 1720 /* 1721 * we cannot free a page which is permanently locked. 
1722 * The page_struct_lock need not be acquired to examine 1723 * these fields since the page has an "exclusive" lock. 1724 */ 1725 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1726 page_unlock(pp); 1727 segadvstat.MADV_FREE_miss.value.ul++; 1728 anon_array_exit(&cookie); 1729 err = EBUSY; 1730 continue; 1731 } 1732 1733 ahm = AH_MUTEX(vp, off); 1734 mutex_enter(ahm); 1735 ASSERT(ap->an_refcnt != 0); 1736 /* 1737 * skip this one if copy-on-write is not yet broken. 1738 */ 1739 if (ap->an_refcnt > 1) { 1740 mutex_exit(ahm); 1741 page_unlock(pp); 1742 segadvstat.MADV_FREE_miss.value.ul++; 1743 anon_array_exit(&cookie); 1744 continue; 1745 } 1746 1747 if (behav == MADV_PURGE && pp->p_szc != 0) { 1748 /* 1749 * If we're purging and we have a large page, simplify 1750 * things a bit by demoting ourselves into the base 1751 * page case. 1752 */ 1753 (void) page_try_demote_pages(pp); 1754 } 1755 1756 if (pp->p_szc == 0) { 1757 pgcnt = 1; 1758 1759 /* 1760 * free swap slot; 1761 */ 1762 if (ap->an_pvp) { 1763 swap_phys_free(ap->an_pvp, ap->an_poff, 1764 PAGESIZE); 1765 ap->an_pvp = NULL; 1766 ap->an_poff = 0; 1767 } 1768 1769 if (behav == MADV_PURGE) { 1770 /* 1771 * If we're purging (instead of merely freeing), 1772 * rip out this anon structure entirely to 1773 * assure that any subsequent fault pulls from 1774 * the backing vnode (if any). 1775 */ 1776 if (--ap->an_refcnt == 0) 1777 anon_rmhash(ap); 1778 1779 mutex_exit(ahm); 1780 (void) anon_set_ptr(ahp, index, 1781 NULL, ANON_SLEEP); 1782 npurged++; 1783 ANI_ADD(1); 1784 kmem_cache_free(anon_cache, ap); 1785 } else { 1786 mutex_exit(ahm); 1787 } 1788 1789 segadvstat.MADV_FREE_hit.value.ul++; 1790 1791 /* 1792 * while we are at it, unload all the translations 1793 * and attempt to free the page. 1794 */ 1795 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1796 /*LINTED: constant in conditional context */ 1797 VN_DISPOSE(pp, 1798 behav == MADV_FREE ? 
B_FREE : B_INVAL, 0, kcred); 1799 1800 anon_array_exit(&cookie); 1801 continue; 1802 } 1803 1804 pgcnt = page_get_pagecnt(pp->p_szc); 1805 if (!IS_P2ALIGNED(index, pgcnt) || npages < pgcnt) { 1806 if (!page_try_demote_pages(pp)) { 1807 mutex_exit(ahm); 1808 page_unlock(pp); 1809 segadvstat.MADV_FREE_miss.value.ul++; 1810 anon_array_exit(&cookie); 1811 err = EBUSY; 1812 continue; 1813 } else { 1814 pgcnt = 1; 1815 if (ap->an_pvp) { 1816 swap_phys_free(ap->an_pvp, 1817 ap->an_poff, PAGESIZE); 1818 ap->an_pvp = NULL; 1819 ap->an_poff = 0; 1820 } 1821 mutex_exit(ahm); 1822 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1823 /*LINTED*/ 1824 VN_DISPOSE(pp, B_FREE, 0, kcred); 1825 segadvstat.MADV_FREE_hit.value.ul++; 1826 anon_array_exit(&cookie); 1827 continue; 1828 } 1829 } 1830 mutex_exit(ahm); 1831 root_pp = pp; 1832 1833 /* 1834 * try to lock remaining pages 1835 */ 1836 for (idx = 1; idx < pgcnt; idx++) { 1837 pp++; 1838 if (!page_trylock(pp, SE_EXCL)) 1839 break; 1840 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1841 page_unlock(pp); 1842 break; 1843 } 1844 } 1845 1846 if (idx == pgcnt) { 1847 for (i = 0; i < pgcnt; i++) { 1848 ap = anon_get_ptr(ahp, index + i); 1849 if (ap == NULL) 1850 break; 1851 swap_xlate(ap, &vp, &off); 1852 ahm = AH_MUTEX(vp, off); 1853 mutex_enter(ahm); 1854 ASSERT(ap->an_refcnt != 0); 1855 1856 /* 1857 * skip this one if copy-on-write 1858 * is not yet broken. 
1859 */ 1860 if (ap->an_refcnt > 1) { 1861 mutex_exit(ahm); 1862 goto skiplp; 1863 } 1864 if (ap->an_pvp) { 1865 swap_phys_free(ap->an_pvp, 1866 ap->an_poff, PAGESIZE); 1867 ap->an_pvp = NULL; 1868 ap->an_poff = 0; 1869 } 1870 mutex_exit(ahm); 1871 } 1872 page_destroy_pages(root_pp); 1873 segadvstat.MADV_FREE_hit.value.ul += pgcnt; 1874 anon_array_exit(&cookie); 1875 continue; 1876 } 1877 skiplp: 1878 segadvstat.MADV_FREE_miss.value.ul += pgcnt; 1879 for (i = 0, pp = root_pp; i < idx; pp++, i++) 1880 page_unlock(pp); 1881 anon_array_exit(&cookie); 1882 } 1883 1884 if (purged != NULL) 1885 *purged = npurged; 1886 1887 return (err); 1888 } 1889 1890 /* 1891 * Return the kept page(s) and protections back to the segment driver. 1892 */ 1893 int 1894 anon_getpage( 1895 struct anon **app, 1896 uint_t *protp, 1897 page_t *pl[], 1898 size_t plsz, 1899 struct seg *seg, 1900 caddr_t addr, 1901 enum seg_rw rw, 1902 struct cred *cred) 1903 { 1904 page_t *pp; 1905 struct anon *ap = *app; 1906 struct vnode *vp; 1907 anoff_t off; 1908 int err; 1909 kmutex_t *ahm; 1910 1911 swap_xlate(ap, &vp, &off); 1912 1913 /* 1914 * Lookup the page. If page is being paged in, 1915 * wait for it to finish as we must return a list of 1916 * pages since this routine acts like the VOP_GETPAGE 1917 * routine does. 1918 */ 1919 if (pl != NULL && (pp = page_lookup(vp, (u_offset_t)off, SE_SHARED))) { 1920 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1921 mutex_enter(ahm); 1922 if (ap->an_refcnt == 1) 1923 *protp = PROT_ALL; 1924 else 1925 *protp = PROT_ALL & ~PROT_WRITE; 1926 mutex_exit(ahm); 1927 pl[0] = pp; 1928 pl[1] = NULL; 1929 return (0); 1930 } 1931 1932 /* 1933 * Simply treat it as a vnode fault on the anon vp. 
1934 */ 1935 1936 TRACE_3(TR_FAC_VM, TR_ANON_GETPAGE, 1937 "anon_getpage:seg %x addr %x vp %x", 1938 seg, addr, vp); 1939 1940 err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, protp, pl, plsz, 1941 seg, addr, rw, cred, NULL); 1942 1943 if (err == 0 && pl != NULL) { 1944 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 1945 mutex_enter(ahm); 1946 if (ap->an_refcnt != 1) 1947 *protp &= ~PROT_WRITE; /* make read-only */ 1948 mutex_exit(ahm); 1949 } 1950 return (err); 1951 } 1952 1953 /* 1954 * Creates or returns kept pages to the segment driver. returns -1 if a large 1955 * page cannot be allocated. returns -2 if some other process has allocated a 1956 * larger page. 1957 * 1958 * For cowfault it will allocate any size pages to fill the requested area to 1959 * avoid partially overwriting anon slots (i.e. sharing only some of the anon 1960 * slots within a large page with other processes). This policy greatly 1961 * simplifies large page freeing (which is only freed when all anon slot 1962 * refcnts are 0). 
/*
 * Creates or returns kept pages to the segment driver. returns -1 if a large
 * page cannot be allocated. returns -2 if some other process has allocated a
 * larger page.
 *
 * For cowfault it will allocate any size pages to fill the requested area to
 * avoid partially overwriting anon slots (i.e. sharing only some of the anon
 * slots within a large page with other processes). This policy greatly
 * simplifies large page freeing (which is only freed when all anon slot
 * refcnts are 0).
 *
 * Summary of what the code below does:
 *
 *  - szc is the page size being requested for the region that starts at
 *    anon index start_idx / address addr; szc == 0 is handled up front as
 *    the single base page case (anon_getpage() or anon_zero()).
 *  - On success the constituent pages are returned in ppa[] and *protp
 *    holds the protections; PROT_WRITE is removed when any anon slot in
 *    the region has a refcnt > 1.
 *  - When brkcow is set and the resulting protections lack PROT_WRITE, the
 *    region is copied away via anon_map_privatepages() and *protp is reset
 *    to PROT_ALL.
 *  - On a -2 return *ppa_szc is set to the larger page size that was
 *    found, capped at seg->s_szc.
 *  - -1 is also returned from the io_err path when swap_getconpage()
 *    reports -1 (failure to relocate) and no cow fault has to be taken.
 *  - Any other non-zero return is an error code from the lower layers
 *    (or ENOMEM from the szc == 0 path).
 */
int
anon_map_getpages(
	struct anon_map *amp,
	ulong_t start_idx,
	uint_t szc,
	struct seg *seg,
	caddr_t addr,
	uint_t prot,
	uint_t *protp,
	page_t *ppa[],
	uint_t *ppa_szc,
	struct vpage vpage[],
	enum seg_rw rw,
	int brkcow,
	int anypgsz,
	int pgflags,
	struct cred *cred)
{
	pgcnt_t pgcnt;
	struct anon *ap;
	struct vnode *vp;
	anoff_t off;
	page_t *pp, *pl[2], *conpp = NULL;
	caddr_t vaddr;
	ulong_t pg_idx, an_idx, i;
	spgcnt_t nreloc = 0;
	int prealloc = 1;
	int err, slotcreate;
	uint_t vpprot;
	/* Non-zero when the request is for less than the segment's szc. */
	int upsize = (szc < seg->s_szc);

#if !defined(__i386) && !defined(__amd64)
	ASSERT(seg->s_szc != 0);
#endif
	ASSERT(szc <= seg->s_szc);
	ASSERT(ppa_szc != NULL);
	ASSERT(rw != S_CREATE);

	*protp = PROT_ALL;

	VM_STAT_ADD(anonvmstats.getpages[0]);

	/*
	 * Base page request: either fault in the existing anon page or
	 * create a zero-filled one.
	 */
	if (szc == 0) {
		VM_STAT_ADD(anonvmstats.getpages[1]);
		if ((ap = anon_get_ptr(amp->ahp, start_idx)) != NULL) {
			err = anon_getpage(&ap, protp, pl, PAGESIZE, seg,
			    addr, rw, cred);
			if (err)
				return (err);
			ppa[0] = pl[0];
			if (brkcow == 0 || (*protp & PROT_WRITE)) {
				VM_STAT_ADD(anonvmstats.getpages[2]);
				if (ppa[0]->p_szc != 0 && upsize) {
					/*
					 * The page found is part of a large
					 * page; report its size back so the
					 * caller can retry with it.
					 */
					VM_STAT_ADD(anonvmstats.getpages[3]);
					*ppa_szc = MIN(ppa[0]->p_szc,
					    seg->s_szc);
					page_unlock(ppa[0]);
					return (-2);
				}
				return (0);
			}
			panic("anon_map_getpages: cowfault for szc 0");
		} else {
			VM_STAT_ADD(anonvmstats.getpages[4]);
			ppa[0] = anon_zero(seg, addr, &ap, cred);
			if (ppa[0] == NULL)
				return (ENOMEM);
			(void) anon_set_ptr(amp->ahp, start_idx, ap,
			    ANON_SLEEP);
			return (0);
		}
	}

	pgcnt = page_get_pagecnt(szc);
	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
	ASSERT(IS_P2ALIGNED(start_idx, pgcnt));

	/*
	 * First we check for the case that the requested large
	 * page or larger page already exists in the system.
	 * Actually we only check if the first constituent page
	 * exists and only preallocate if it's not found.
	 */
	ap = anon_get_ptr(amp->ahp, start_idx);
	if (ap) {
		uint_t pszc;
		swap_xlate(ap, &vp, &off);
		if (page_exists_forreal(vp, (u_offset_t)off, &pszc)) {
			if (pszc > szc && upsize) {
				*ppa_szc = MIN(pszc, seg->s_szc);
				return (-2);
			}
			if (pszc >= szc) {
				prealloc = 0;
			}
		}
	}

	VM_STAT_COND_ADD(prealloc == 0, anonvmstats.getpages[5]);
	VM_STAT_COND_ADD(prealloc != 0, anonvmstats.getpages[6]);

top:
	/*
	 * If a smaller page or no page at all was found,
	 * grab a large page off the freelist.
	 */
	if (prealloc) {
		ASSERT(conpp == NULL);
		if (page_alloc_pages(anon_vp, seg, addr, NULL, ppa,
		    szc, 0, pgflags) != 0) {
			VM_STAT_ADD(anonvmstats.getpages[7]);
			if (brkcow == 0 || szc < seg->s_szc ||
			    !anon_szcshare(amp->ahp, start_idx)) {
				/*
				 * If the refcnt's of all anon slots are <= 1
				 * they can't increase since we are holding
				 * the address space's lock. So segvn can
				 * safely decrease szc without risking to
				 * generate a cow fault for the region smaller
				 * than the segment's largest page size.
				 */
				VM_STAT_ADD(anonvmstats.getpages[8]);
				return (-1);
			}
docow:
			/*
			 * This is a cow fault. Copy away the entire 1 large
			 * page region of this segment.
			 *
			 * (Also entered by "goto docow" from the io_err path
			 * at the end of this function.)
			 */
			if (szc != seg->s_szc)
				panic("anon_map_getpages: cowfault for szc %d",
				    szc);
			vaddr = addr;
			for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt;
			    pg_idx++, an_idx++, vaddr += PAGESIZE) {
				if ((ap = anon_get_ptr(amp->ahp, an_idx)) !=
				    NULL) {
					err = anon_getpage(&ap, &vpprot, pl,
					    PAGESIZE, seg, vaddr, rw, cred);
					if (err) {
						/*
						 * Unlock the pages gathered
						 * so far before bailing out.
						 */
						for (i = 0; i < pg_idx; i++) {
							if ((pp = ppa[i]) !=
							    NULL)
								page_unlock(pp);
						}
						return (err);
					}
					ppa[pg_idx] = pl[0];
				} else {
					/*
					 * Since this is a cowfault we know
					 * that this address space has a
					 * parent or children which means
					 * anon_dup_fill_holes() has initialized
					 * all anon slots within a large page
					 * region that had at least one anon
					 * slot at the time of fork().
					 */
					panic("anon_map_getpages: "
					    "cowfault but anon slot is empty");
				}
			}
			VM_STAT_ADD(anonvmstats.getpages[9]);
			*protp = PROT_ALL;
			return (anon_map_privatepages(amp, start_idx, szc, seg,
			    addr, prot, ppa, vpage, anypgsz, pgflags, cred));
		}
	}

	VM_STAT_ADD(anonvmstats.getpages[10]);

	/*
	 * Walk the region one base page at a time, obtaining each
	 * constituent page through swap_getconpage().
	 */
	an_idx = start_idx;
	pg_idx = 0;
	vaddr = addr;
	while (pg_idx < pgcnt) {
		slotcreate = 0;
		if ((ap = anon_get_ptr(amp->ahp, an_idx)) == NULL) {
			VM_STAT_ADD(anonvmstats.getpages[11]);
			/*
			 * For us to have decided not to preallocate
			 * would have meant that a large page
			 * was found. Which also means that all of the
			 * anon slots for that page would have been
			 * already created for us.
			 */
			if (prealloc == 0)
				panic("anon_map_getpages: prealloc = 0");

			slotcreate = 1;
			ap = anon_alloc(NULL, 0);
		}
		swap_xlate(ap, &vp, &off);

		/*
		 * Now setup our preallocated page to pass down
		 * to swap_getpage().
		 */
		if (prealloc) {
			ASSERT(ppa[pg_idx]->p_szc == szc);
			conpp = ppa[pg_idx];
		}
		ASSERT(prealloc || conpp == NULL);

		/*
		 * If we just created this anon slot then call
		 * with S_CREATE to prevent doing IO on the page.
		 * Similar to the anon_zero case.
		 */
		err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE,
		    NULL, pl, PAGESIZE, conpp, ppa_szc, &nreloc, seg, vaddr,
		    slotcreate == 1 ? S_CREATE : rw, cred);

		if (err) {
			ASSERT(err != -2 || upsize);
			VM_STAT_ADD(anonvmstats.getpages[12]);
			ASSERT(slotcreate == 0);
			goto io_err;
		}

		pp = pl[0];

		/*
		 * The page that came back has a different size than
		 * requested: either report the larger size (-2) or start
		 * over with preallocation for a smaller one.
		 */
		if (pp->p_szc < szc || (pp->p_szc > szc && upsize)) {
			VM_STAT_ADD(anonvmstats.getpages[13]);
			ASSERT(slotcreate == 0);
			ASSERT(prealloc == 0);
			ASSERT(pg_idx == 0);
			if (pp->p_szc > szc) {
				ASSERT(upsize);
				*ppa_szc = MIN(pp->p_szc, seg->s_szc);
				page_unlock(pp);
				VM_STAT_ADD(anonvmstats.getpages[14]);
				return (-2);
			}
			page_unlock(pp);
			prealloc = 1;
			goto top;
		}

		/*
		 * If we decided to preallocate but VOP_GETPAGE
		 * found a page in the system that satisfies our
		 * request then free up our preallocated large page
		 * and continue looping across the existing large
		 * page via VOP_GETPAGE.
		 */
		if (prealloc && pp != ppa[pg_idx]) {
			VM_STAT_ADD(anonvmstats.getpages[15]);
			ASSERT(slotcreate == 0);
			ASSERT(pg_idx == 0);
			conpp = NULL;
			prealloc = 0;
			page_free_pages(ppa[0]);
		}

		if (prealloc && nreloc > 1) {
			/*
			 * we have relocated out of a smaller large page.
			 * skip npgs - 1 iterations and continue which will
			 * increment by one the loop indices.
			 */
			spgcnt_t npgs = nreloc;

			VM_STAT_ADD(anonvmstats.getpages[16]);

			ASSERT(pp == ppa[pg_idx]);
			ASSERT(slotcreate == 0);
			ASSERT(pg_idx + npgs <= pgcnt);
			if ((*protp & PROT_WRITE) &&
			    anon_share(amp->ahp, an_idx, npgs)) {
				*protp &= ~PROT_WRITE;
			}
			pg_idx += npgs;
			an_idx += npgs;
			vaddr += PAGESIZE * npgs;
			continue;
		}

		VM_STAT_ADD(anonvmstats.getpages[17]);

		/*
		 * Anon_zero case.
		 */
		if (slotcreate) {
			ASSERT(prealloc);
			pagezero(pp, 0, PAGESIZE);
			CPU_STATS_ADD_K(vm, zfod, 1);
			hat_setrefmod(pp);
		}

		ASSERT(prealloc == 0 || ppa[pg_idx] == pp);
		ASSERT(prealloc != 0 || PAGE_SHARED(pp));
		ASSERT(prealloc == 0 || PAGE_EXCL(pp));

		/*
		 * The constituent pages must be physically contiguous, of
		 * equal size, and the first one aligned on the large page
		 * boundary.
		 */
		if (pg_idx > 0 &&
		    ((page_pptonum(pp) != page_pptonum(ppa[pg_idx - 1]) + 1) ||
		    (pp->p_szc != ppa[pg_idx - 1]->p_szc))) {
			panic("anon_map_getpages: unexpected page");
		} else if (pg_idx == 0 && (page_pptonum(pp) & (pgcnt - 1))) {
			panic("anon_map_getpages: unaligned page");
		}

		if (prealloc == 0) {
			ppa[pg_idx] = pp;
		}

		if (ap->an_refcnt > 1) {
			VM_STAT_ADD(anonvmstats.getpages[18]);
			*protp &= ~PROT_WRITE;
		}

		/*
		 * If this is a new anon slot then initialize
		 * the anon array entry.
		 */
		if (slotcreate) {
			(void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP);
		}
		pg_idx++;
		an_idx++;
		vaddr += PAGESIZE;
	}

	/*
	 * Since preallocated pages come off the freelist
	 * they are locked SE_EXCL. Simply downgrade and return.
	 */
	if (prealloc) {
		VM_STAT_ADD(anonvmstats.getpages[19]);
		conpp = NULL;
		for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
			page_downgrade(ppa[pg_idx]);
		}
	}
	ASSERT(conpp == NULL);

	if (brkcow == 0 || (*protp & PROT_WRITE)) {
		VM_STAT_ADD(anonvmstats.getpages[20]);
		return (0);
	}

	if (szc < seg->s_szc)
		panic("anon_map_getpages: cowfault for szc %d", szc);

	VM_STAT_ADD(anonvmstats.getpages[21]);

	*protp = PROT_ALL;
	return (anon_map_privatepages(amp, start_idx, szc, seg, addr, prot,
	    ppa, vpage, anypgsz, pgflags, cred));
io_err:
	/*
	 * We got an IO error somewhere in our large page.
	 * If we were using a preallocated page then just demote
	 * all the constituent pages that we've succeeded with so far
	 * to PAGESIZE pages and leave them in the system
	 * unlocked.
	 */

	ASSERT(err != -2 || ((pg_idx == 0) && upsize));

	VM_STAT_COND_ADD(err > 0, anonvmstats.getpages[22]);
	VM_STAT_COND_ADD(err == -1, anonvmstats.getpages[23]);
	VM_STAT_COND_ADD(err == -2, anonvmstats.getpages[24]);

	if (prealloc) {
		conpp = NULL;
		if (pg_idx > 0) {
			VM_STAT_ADD(anonvmstats.getpages[25]);
			for (i = 0; i < pgcnt; i++) {
				pp = ppa[i];
				ASSERT(PAGE_EXCL(pp));
				ASSERT(pp->p_szc == szc);
				pp->p_szc = 0;
			}
			for (i = 0; i < pg_idx; i++) {
				ASSERT(!hat_page_is_mapped(ppa[i]));
				page_unlock(ppa[i]);
			}
			/*
			 * Now free up the remaining unused constituent
			 * pages.
			 */
			while (pg_idx < pgcnt) {
				ASSERT(!hat_page_is_mapped(ppa[pg_idx]));
				page_free(ppa[pg_idx], 0);
				pg_idx++;
			}
		} else {
			/* Nothing was used yet: free the whole large page. */
			VM_STAT_ADD(anonvmstats.getpages[26]);
			page_free_pages(ppa[0]);
		}
	} else {
		VM_STAT_ADD(anonvmstats.getpages[27]);
		ASSERT(err > 0);
		for (i = 0; i < pg_idx; i++)
			page_unlock(ppa[i]);
	}
	ASSERT(conpp == NULL);
	if (err != -1)
		return (err);
	/*
	 * we are here because we failed to relocate.
	 */
	ASSERT(prealloc);
	if (brkcow == 0 || szc < seg->s_szc ||
	    !anon_szcshare(amp->ahp, start_idx)) {
		VM_STAT_ADD(anonvmstats.getpages[28]);
		return (-1);
	}
	VM_STAT_ADD(anonvmstats.getpages[29]);
	goto docow;
}
2389 */ 2390 page_t * 2391 anon_private( 2392 struct anon **app, 2393 struct seg *seg, 2394 caddr_t addr, 2395 uint_t prot, 2396 page_t *opp, 2397 int oppflags, 2398 struct cred *cred) 2399 { 2400 struct anon *old = *app; 2401 struct anon *new; 2402 page_t *pp = NULL; 2403 struct vnode *vp; 2404 anoff_t off; 2405 page_t *anon_pl[1 + 1]; 2406 int err; 2407 2408 if (oppflags & STEAL_PAGE) 2409 ASSERT(PAGE_EXCL(opp)); 2410 else 2411 ASSERT(PAGE_LOCKED(opp)); 2412 2413 CPU_STATS_ADD_K(vm, cow_fault, 1); 2414 2415 /* Kernel probe */ 2416 TNF_PROBE_1(anon_private, "vm pagefault", /* CSTYLED */, 2417 tnf_opaque, address, addr); 2418 2419 *app = new = anon_alloc(NULL, 0); 2420 swap_xlate(new, &vp, &off); 2421 2422 if (oppflags & STEAL_PAGE) { 2423 page_rename(opp, vp, (u_offset_t)off); 2424 pp = opp; 2425 TRACE_5(TR_FAC_VM, TR_ANON_PRIVATE, 2426 "anon_private:seg %p addr %x pp %p vp %p off %lx", 2427 seg, addr, pp, vp, off); 2428 hat_setmod(pp); 2429 2430 /* bug 4026339 */ 2431 page_downgrade(pp); 2432 return (pp); 2433 } 2434 2435 /* 2436 * Call the VOP_GETPAGE routine to create the page, thereby 2437 * enabling the vnode driver to allocate any filesystem 2438 * space (e.g., disk block allocation for UFS). This also 2439 * prevents more than one page from being added to the 2440 * vnode at the same time. 2441 */ 2442 err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, NULL, 2443 anon_pl, PAGESIZE, seg, addr, S_CREATE, cred, NULL); 2444 if (err) 2445 goto out; 2446 2447 pp = anon_pl[0]; 2448 2449 /* 2450 * If the original page was locked, we need to move the lock 2451 * to the new page by transfering 'cowcnt/lckcnt' of the original 2452 * page to 'cowcnt/lckcnt' of the new page. 2453 * 2454 * See Statement at the beginning of segvn_lockop() and 2455 * comments in page_pp_useclaim() regarding the way 2456 * cowcnts/lckcnts are handled. 2457 * 2458 * Also availrmem must be decremented up front for read only mapping 2459 * before calling page_pp_useclaim. 
page_pp_useclaim will bump it back 2460 * if availrmem did not need to be decremented after all. 2461 */ 2462 if (oppflags & LOCK_PAGE) { 2463 if ((prot & PROT_WRITE) == 0) { 2464 mutex_enter(&freemem_lock); 2465 if (availrmem > pages_pp_maximum) { 2466 availrmem--; 2467 pages_useclaim++; 2468 } else { 2469 mutex_exit(&freemem_lock); 2470 goto out; 2471 } 2472 mutex_exit(&freemem_lock); 2473 } 2474 page_pp_useclaim(opp, pp, prot & PROT_WRITE); 2475 } 2476 2477 /* 2478 * Now copy the contents from the original page, 2479 * which is locked and loaded in the MMU by 2480 * the caller to prevent yet another page fault. 2481 */ 2482 /* XXX - should set mod bit in here */ 2483 if (ppcopy(opp, pp) == 0) { 2484 /* 2485 * Before ppcopy could hanlde UE or other faults, we 2486 * would have panicked here, and still have no option 2487 * but to do so now. 2488 */ 2489 panic("anon_private, ppcopy failed, opp = 0x%p, pp = 0x%p", 2490 (void *)opp, (void *)pp); 2491 } 2492 2493 hat_setrefmod(pp); /* mark as modified */ 2494 2495 /* 2496 * Unload the old translation. 2497 */ 2498 hat_unload(seg->s_as->a_hat, addr, PAGESIZE, HAT_UNLOAD); 2499 2500 /* 2501 * Free unmapped, unmodified original page. 2502 * or release the lock on the original page, 2503 * otherwise the process will sleep forever in 2504 * anon_decref() waiting for the "exclusive" lock 2505 * on the page. 2506 */ 2507 (void) page_release(opp, 1); 2508 2509 /* 2510 * we are done with page creation so downgrade the new 2511 * page's selock to shared, this helps when multiple 2512 * as_fault(...SOFTLOCK...) are done to the same 2513 * page(aio) 2514 */ 2515 page_downgrade(pp); 2516 2517 /* 2518 * NOTE: The original anon slot must be freed by the 2519 * caller while holding the "anon_map" lock, if we 2520 * copied away from an anonymous page. 
2521 */ 2522 return (pp); 2523 2524 out: 2525 *app = old; 2526 if (pp) 2527 page_unlock(pp); 2528 anon_decref(new); 2529 page_unlock(opp); 2530 return ((page_t *)NULL); 2531 } 2532 2533 int 2534 anon_map_privatepages( 2535 struct anon_map *amp, 2536 ulong_t start_idx, 2537 uint_t szc, 2538 struct seg *seg, 2539 caddr_t addr, 2540 uint_t prot, 2541 page_t *ppa[], 2542 struct vpage vpage[], 2543 int anypgsz, 2544 int pgflags, 2545 struct cred *cred) 2546 { 2547 pgcnt_t pgcnt; 2548 struct vnode *vp; 2549 anoff_t off; 2550 page_t *pl[2], *conpp = NULL; 2551 int err; 2552 int prealloc = 1; 2553 struct anon *ap, *oldap; 2554 caddr_t vaddr; 2555 page_t *pplist, *pp; 2556 ulong_t pg_idx, an_idx; 2557 spgcnt_t nreloc = 0; 2558 int pagelock = 0; 2559 kmutex_t *ahmpages = NULL; 2560 #ifdef DEBUG 2561 int refcnt; 2562 #endif 2563 2564 ASSERT(szc != 0); 2565 ASSERT(szc == seg->s_szc); 2566 2567 VM_STAT_ADD(anonvmstats.privatepages[0]); 2568 2569 pgcnt = page_get_pagecnt(szc); 2570 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 2571 ASSERT(IS_P2ALIGNED(start_idx, pgcnt)); 2572 2573 ASSERT(amp != NULL); 2574 ap = anon_get_ptr(amp->ahp, start_idx); 2575 ASSERT(ap == NULL || ap->an_refcnt >= 1); 2576 2577 VM_STAT_COND_ADD(ap == NULL, anonvmstats.privatepages[1]); 2578 2579 /* 2580 * Now try and allocate the large page. If we fail then just 2581 * let VOP_GETPAGE give us PAGESIZE pages. Normally we let 2582 * the caller make this decision but to avoid added complexity 2583 * it's simplier to handle that case here. 2584 */ 2585 if (anypgsz == -1) { 2586 VM_STAT_ADD(anonvmstats.privatepages[2]); 2587 prealloc = 0; 2588 } else if (page_alloc_pages(anon_vp, seg, addr, &pplist, NULL, szc, 2589 anypgsz, pgflags) != 0) { 2590 VM_STAT_ADD(anonvmstats.privatepages[3]); 2591 prealloc = 0; 2592 } 2593 2594 /* 2595 * make the decrement of all refcnts of all 2596 * anon slots of a large page appear atomic by 2597 * getting an anonpages_hash_lock for the 2598 * first anon slot of a large page. 
2599 */ 2600 if (ap != NULL) { 2601 ahmpages = APH_MUTEX(ap->an_vp, ap->an_off); 2602 mutex_enter(ahmpages); 2603 if (ap->an_refcnt == 1) { 2604 VM_STAT_ADD(anonvmstats.privatepages[4]); 2605 ASSERT(!anon_share(amp->ahp, start_idx, pgcnt)); 2606 mutex_exit(ahmpages); 2607 2608 if (prealloc) { 2609 page_free_replacement_page(pplist); 2610 page_create_putback(pgcnt); 2611 } 2612 ASSERT(ppa[0]->p_szc <= szc); 2613 if (ppa[0]->p_szc == szc) { 2614 VM_STAT_ADD(anonvmstats.privatepages[5]); 2615 return (0); 2616 } 2617 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { 2618 ASSERT(ppa[pg_idx] != NULL); 2619 page_unlock(ppa[pg_idx]); 2620 } 2621 return (-1); 2622 } 2623 } 2624 2625 /* 2626 * If we are passed in the vpage array and this is 2627 * not PROT_WRITE then we need to decrement availrmem 2628 * up front before we try anything. If we need to and 2629 * can't decrement availrmem then its better to fail now 2630 * than in the middle of processing the new large page. 2631 * page_pp_usclaim() on behalf of each constituent page 2632 * below will adjust availrmem back for the cases not needed. 
2633 */ 2634 if (vpage != NULL && (prot & PROT_WRITE) == 0) { 2635 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { 2636 if (VPP_ISPPLOCK(&vpage[pg_idx])) { 2637 pagelock = 1; 2638 break; 2639 } 2640 } 2641 if (pagelock) { 2642 VM_STAT_ADD(anonvmstats.privatepages[6]); 2643 mutex_enter(&freemem_lock); 2644 if (availrmem >= pages_pp_maximum + pgcnt) { 2645 availrmem -= pgcnt; 2646 pages_useclaim += pgcnt; 2647 } else { 2648 VM_STAT_ADD(anonvmstats.privatepages[7]); 2649 mutex_exit(&freemem_lock); 2650 if (ahmpages != NULL) { 2651 mutex_exit(ahmpages); 2652 } 2653 if (prealloc) { 2654 page_free_replacement_page(pplist); 2655 page_create_putback(pgcnt); 2656 } 2657 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) 2658 if (ppa[pg_idx] != NULL) 2659 page_unlock(ppa[pg_idx]); 2660 return (ENOMEM); 2661 } 2662 mutex_exit(&freemem_lock); 2663 } 2664 } 2665 2666 CPU_STATS_ADD_K(vm, cow_fault, pgcnt); 2667 2668 VM_STAT_ADD(anonvmstats.privatepages[8]); 2669 2670 an_idx = start_idx; 2671 pg_idx = 0; 2672 vaddr = addr; 2673 for (; pg_idx < pgcnt; pg_idx++, an_idx++, vaddr += PAGESIZE) { 2674 ASSERT(ppa[pg_idx] != NULL); 2675 oldap = anon_get_ptr(amp->ahp, an_idx); 2676 ASSERT(ahmpages != NULL || oldap == NULL); 2677 ASSERT(ahmpages == NULL || oldap != NULL); 2678 ASSERT(ahmpages == NULL || oldap->an_refcnt > 1); 2679 ASSERT(ahmpages == NULL || pg_idx != 0 || 2680 (refcnt = oldap->an_refcnt)); 2681 ASSERT(ahmpages == NULL || pg_idx == 0 || 2682 refcnt == oldap->an_refcnt); 2683 2684 ap = anon_alloc(NULL, 0); 2685 2686 swap_xlate(ap, &vp, &off); 2687 2688 /* 2689 * Now setup our preallocated page to pass down to 2690 * swap_getpage(). 2691 */ 2692 if (prealloc) { 2693 pp = pplist; 2694 page_sub(&pplist, pp); 2695 conpp = pp; 2696 } 2697 2698 err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE, NULL, pl, 2699 PAGESIZE, conpp, NULL, &nreloc, seg, vaddr, 2700 S_CREATE, cred); 2701 2702 /* 2703 * Impossible to fail this is S_CREATE. 
2704 */ 2705 if (err) 2706 panic("anon_map_privatepages: VOP_GETPAGE failed"); 2707 2708 ASSERT(prealloc ? pp == pl[0] : pl[0]->p_szc == 0); 2709 ASSERT(prealloc == 0 || nreloc == 1); 2710 2711 pp = pl[0]; 2712 2713 /* 2714 * If the original page was locked, we need to move 2715 * the lock to the new page by transfering 2716 * 'cowcnt/lckcnt' of the original page to 'cowcnt/lckcnt' 2717 * of the new page. pg_idx can be used to index 2718 * into the vpage array since the caller will guarentee 2719 * that vpage struct passed in corresponds to addr 2720 * and forward. 2721 */ 2722 if (vpage != NULL && VPP_ISPPLOCK(&vpage[pg_idx])) { 2723 page_pp_useclaim(ppa[pg_idx], pp, prot & PROT_WRITE); 2724 } else if (pagelock) { 2725 mutex_enter(&freemem_lock); 2726 availrmem++; 2727 pages_useclaim--; 2728 mutex_exit(&freemem_lock); 2729 } 2730 2731 /* 2732 * Now copy the contents from the original page. 2733 */ 2734 if (ppcopy(ppa[pg_idx], pp) == 0) { 2735 /* 2736 * Before ppcopy could hanlde UE or other faults, we 2737 * would have panicked here, and still have no option 2738 * but to do so now. 2739 */ 2740 panic("anon_map_privatepages, ppcopy failed"); 2741 } 2742 2743 hat_setrefmod(pp); /* mark as modified */ 2744 2745 /* 2746 * Release the lock on the original page, 2747 * derement the old slot, and down grade the lock 2748 * on the new copy. 2749 */ 2750 page_unlock(ppa[pg_idx]); 2751 2752 if (!prealloc) 2753 page_downgrade(pp); 2754 2755 ppa[pg_idx] = pp; 2756 2757 /* 2758 * Now reflect the copy in the new anon array. 2759 */ 2760 ASSERT(ahmpages == NULL || oldap->an_refcnt > 1); 2761 if (oldap != NULL) 2762 anon_decref(oldap); 2763 (void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP); 2764 } 2765 2766 /* 2767 * Unload the old large page translation. 
2768 */ 2769 hat_unload(seg->s_as->a_hat, addr, pgcnt << PAGESHIFT, HAT_UNLOAD); 2770 2771 if (ahmpages != NULL) { 2772 mutex_exit(ahmpages); 2773 } 2774 ASSERT(prealloc == 0 || pplist == NULL); 2775 if (prealloc) { 2776 VM_STAT_ADD(anonvmstats.privatepages[9]); 2777 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { 2778 page_downgrade(ppa[pg_idx]); 2779 } 2780 } 2781 2782 return (0); 2783 } 2784 2785 /* 2786 * Allocate a private zero-filled anon page. 2787 */ 2788 page_t * 2789 anon_zero(struct seg *seg, caddr_t addr, struct anon **app, struct cred *cred) 2790 { 2791 struct anon *ap; 2792 page_t *pp; 2793 struct vnode *vp; 2794 anoff_t off; 2795 page_t *anon_pl[1 + 1]; 2796 int err; 2797 2798 /* Kernel probe */ 2799 TNF_PROBE_1(anon_zero, "vm pagefault", /* CSTYLED */, 2800 tnf_opaque, address, addr); 2801 2802 *app = ap = anon_alloc(NULL, 0); 2803 swap_xlate(ap, &vp, &off); 2804 2805 /* 2806 * Call the VOP_GETPAGE routine to create the page, thereby 2807 * enabling the vnode driver to allocate any filesystem 2808 * dependent structures (e.g., disk block allocation for UFS). 2809 * This also prevents more than on page from being added to 2810 * the vnode at the same time since it is locked. 2811 */ 2812 err = VOP_GETPAGE(vp, off, PAGESIZE, NULL, 2813 anon_pl, PAGESIZE, seg, addr, S_CREATE, cred, NULL); 2814 if (err) { 2815 *app = NULL; 2816 anon_decref(ap); 2817 return (NULL); 2818 } 2819 pp = anon_pl[0]; 2820 2821 pagezero(pp, 0, PAGESIZE); /* XXX - should set mod bit */ 2822 page_downgrade(pp); 2823 CPU_STATS_ADD_K(vm, zfod, 1); 2824 hat_setrefmod(pp); /* mark as modified so pageout writes back */ 2825 return (pp); 2826 } 2827 2828 2829 /* 2830 * Allocate array of private zero-filled anon pages for empty slots 2831 * and kept pages for non empty slots within given range. 2832 * 2833 * NOTE: This rontine will try and use large pages 2834 * if available and supported by underlying platform. 
2835 */ 2836 int 2837 anon_map_createpages( 2838 struct anon_map *amp, 2839 ulong_t start_index, 2840 size_t len, 2841 page_t *ppa[], 2842 struct seg *seg, 2843 caddr_t addr, 2844 enum seg_rw rw, 2845 struct cred *cred) 2846 { 2847 2848 struct anon *ap; 2849 struct vnode *ap_vp; 2850 page_t *pp, *pplist, *anon_pl[1 + 1], *conpp = NULL; 2851 int err = 0; 2852 ulong_t p_index, index; 2853 pgcnt_t npgs, pg_cnt; 2854 spgcnt_t nreloc = 0; 2855 uint_t l_szc, szc, prot; 2856 anoff_t ap_off; 2857 size_t pgsz; 2858 lgrp_t *lgrp; 2859 kmutex_t *ahm; 2860 2861 /* 2862 * XXX For now only handle S_CREATE. 2863 */ 2864 ASSERT(rw == S_CREATE); 2865 2866 index = start_index; 2867 p_index = 0; 2868 npgs = btopr(len); 2869 2870 /* 2871 * If this platform supports multiple page sizes 2872 * then try and allocate directly from the free 2873 * list for pages larger than PAGESIZE. 2874 * 2875 * NOTE:When we have page_create_ru we can stop 2876 * directly allocating from the freelist. 2877 */ 2878 l_szc = seg->s_szc; 2879 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 2880 while (npgs) { 2881 2882 /* 2883 * if anon slot already exists 2884 * (means page has been created) 2885 * so 1) look up the page 2886 * 2) if the page is still in memory, get it. 2887 * 3) if not, create a page and 2888 * page in from physical swap device. 2889 * These are done in anon_getpage(). 2890 */ 2891 ap = anon_get_ptr(amp->ahp, index); 2892 if (ap) { 2893 err = anon_getpage(&ap, &prot, anon_pl, PAGESIZE, 2894 seg, addr, S_READ, cred); 2895 if (err) { 2896 ANON_LOCK_EXIT(&->a_rwlock); 2897 panic("anon_map_createpages: anon_getpage"); 2898 } 2899 pp = anon_pl[0]; 2900 ppa[p_index++] = pp; 2901 2902 /* 2903 * an_pvp can become non-NULL after SysV's page was 2904 * paged out before ISM was attached to this SysV 2905 * shared memory segment. So free swap slot if needed. 
2906 */ 2907 if (ap->an_pvp != NULL) { 2908 page_io_lock(pp); 2909 ahm = AH_MUTEX(ap->an_vp, ap->an_off); 2910 mutex_enter(ahm); 2911 if (ap->an_pvp != NULL) { 2912 swap_phys_free(ap->an_pvp, 2913 ap->an_poff, PAGESIZE); 2914 ap->an_pvp = NULL; 2915 ap->an_poff = 0; 2916 mutex_exit(ahm); 2917 hat_setmod(pp); 2918 } else { 2919 mutex_exit(ahm); 2920 } 2921 page_io_unlock(pp); 2922 } 2923 2924 addr += PAGESIZE; 2925 index++; 2926 npgs--; 2927 continue; 2928 } 2929 /* 2930 * Now try and allocate the largest page possible 2931 * for the current address and range. 2932 * Keep dropping down in page size until: 2933 * 2934 * 1) Properly aligned 2935 * 2) Does not overlap existing anon pages 2936 * 3) Fits in remaining range. 2937 * 4) able to allocate one. 2938 * 2939 * NOTE: XXX When page_create_ru is completed this code 2940 * will change. 2941 */ 2942 szc = l_szc; 2943 pplist = NULL; 2944 pg_cnt = 0; 2945 while (szc) { 2946 pgsz = page_get_pagesize(szc); 2947 pg_cnt = pgsz >> PAGESHIFT; 2948 if (IS_P2ALIGNED(addr, pgsz) && pg_cnt <= npgs && 2949 anon_pages(amp->ahp, index, pg_cnt) == 0) { 2950 /* 2951 * XXX 2952 * Since we are faking page_create() 2953 * we also need to do the freemem and 2954 * pcf accounting. 2955 */ 2956 (void) page_create_wait(pg_cnt, PG_WAIT); 2957 2958 /* 2959 * Get lgroup to allocate next page of shared 2960 * memory from and use it to specify where to 2961 * allocate the physical memory 2962 */ 2963 lgrp = lgrp_mem_choose(seg, addr, pgsz); 2964 2965 pplist = page_get_freelist( 2966 anon_vp, (u_offset_t)0, seg, 2967 addr, pgsz, 0, lgrp); 2968 2969 if (pplist == NULL) { 2970 page_create_putback(pg_cnt); 2971 } 2972 2973 /* 2974 * If a request for a page of size 2975 * larger than PAGESIZE failed 2976 * then don't try that size anymore. 
2977 */ 2978 if (pplist == NULL) { 2979 l_szc = szc - 1; 2980 } else { 2981 break; 2982 } 2983 } 2984 szc--; 2985 } 2986 2987 /* 2988 * If just using PAGESIZE pages then don't 2989 * directly allocate from the free list. 2990 */ 2991 if (pplist == NULL) { 2992 ASSERT(szc == 0); 2993 pp = anon_zero(seg, addr, &ap, cred); 2994 if (pp == NULL) { 2995 ANON_LOCK_EXIT(&->a_rwlock); 2996 panic("anon_map_createpages: anon_zero"); 2997 } 2998 ppa[p_index++] = pp; 2999 3000 ASSERT(anon_get_ptr(amp->ahp, index) == NULL); 3001 (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP); 3002 3003 addr += PAGESIZE; 3004 index++; 3005 npgs--; 3006 continue; 3007 } 3008 3009 /* 3010 * pplist is a list of pg_cnt PAGESIZE pages. 3011 * These pages are locked SE_EXCL since they 3012 * came directly off the free list. 3013 */ 3014 ASSERT(IS_P2ALIGNED(pg_cnt, pg_cnt)); 3015 ASSERT(IS_P2ALIGNED(index, pg_cnt)); 3016 ASSERT(conpp == NULL); 3017 while (pg_cnt--) { 3018 3019 ap = anon_alloc(NULL, 0); 3020 swap_xlate(ap, &ap_vp, &ap_off); 3021 3022 ASSERT(pplist != NULL); 3023 pp = pplist; 3024 page_sub(&pplist, pp); 3025 PP_CLRFREE(pp); 3026 PP_CLRAGED(pp); 3027 conpp = pp; 3028 3029 err = swap_getconpage(ap_vp, ap_off, PAGESIZE, 3030 (uint_t *)NULL, anon_pl, PAGESIZE, conpp, NULL, 3031 &nreloc, seg, addr, S_CREATE, cred); 3032 3033 if (err) { 3034 ANON_LOCK_EXIT(&->a_rwlock); 3035 panic("anon_map_createpages: S_CREATE"); 3036 } 3037 3038 ASSERT(anon_pl[0] == pp); 3039 ASSERT(nreloc == 1); 3040 pagezero(pp, 0, PAGESIZE); 3041 CPU_STATS_ADD_K(vm, zfod, 1); 3042 hat_setrefmod(pp); 3043 3044 ASSERT(anon_get_ptr(amp->ahp, index) == NULL); 3045 (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP); 3046 3047 ppa[p_index++] = pp; 3048 3049 addr += PAGESIZE; 3050 index++; 3051 npgs--; 3052 } 3053 conpp = NULL; 3054 pg_cnt = pgsz >> PAGESHIFT; 3055 p_index = p_index - pg_cnt; 3056 while (pg_cnt--) { 3057 page_downgrade(ppa[p_index++]); 3058 } 3059 } 3060 ANON_LOCK_EXIT(&->a_rwlock); 3061 return (0); 3062 
} 3063 3064 static int 3065 anon_try_demote_pages( 3066 struct anon_hdr *ahp, 3067 ulong_t sidx, 3068 uint_t szc, 3069 page_t **ppa, 3070 int private) 3071 { 3072 struct anon *ap; 3073 pgcnt_t pgcnt = page_get_pagecnt(szc); 3074 page_t *pp; 3075 pgcnt_t i; 3076 kmutex_t *ahmpages = NULL; 3077 int root = 0; 3078 pgcnt_t npgs; 3079 pgcnt_t curnpgs = 0; 3080 size_t ppasize = 0; 3081 3082 ASSERT(szc != 0); 3083 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 3084 ASSERT(IS_P2ALIGNED(sidx, pgcnt)); 3085 ASSERT(sidx < ahp->size); 3086 3087 if (ppa == NULL) { 3088 ppasize = pgcnt * sizeof (page_t *); 3089 ppa = kmem_alloc(ppasize, KM_SLEEP); 3090 } 3091 3092 ap = anon_get_ptr(ahp, sidx); 3093 if (ap != NULL && private) { 3094 VM_STAT_ADD(anonvmstats.demotepages[1]); 3095 ahmpages = APH_MUTEX(ap->an_vp, ap->an_off); 3096 mutex_enter(ahmpages); 3097 } 3098 3099 if (ap != NULL && ap->an_refcnt > 1) { 3100 if (ahmpages != NULL) { 3101 VM_STAT_ADD(anonvmstats.demotepages[2]); 3102 mutex_exit(ahmpages); 3103 } 3104 if (ppasize != 0) { 3105 kmem_free(ppa, ppasize); 3106 } 3107 return (0); 3108 } 3109 if (ahmpages != NULL) { 3110 mutex_exit(ahmpages); 3111 } 3112 if (ahp->size - sidx < pgcnt) { 3113 ASSERT(private == 0); 3114 pgcnt = ahp->size - sidx; 3115 } 3116 for (i = 0; i < pgcnt; i++, sidx++) { 3117 ap = anon_get_ptr(ahp, sidx); 3118 if (ap != NULL) { 3119 if (ap->an_refcnt != 1) { 3120 panic("anon_try_demote_pages: an_refcnt != 1"); 3121 } 3122 pp = ppa[i] = page_lookup(ap->an_vp, ap->an_off, 3123 SE_EXCL); 3124 if (pp != NULL) { 3125 (void) hat_pageunload(pp, 3126 HAT_FORCE_PGUNLOAD); 3127 } 3128 } else { 3129 ppa[i] = NULL; 3130 } 3131 } 3132 for (i = 0; i < pgcnt; i++) { 3133 if ((pp = ppa[i]) != NULL && pp->p_szc != 0) { 3134 ASSERT(pp->p_szc <= szc); 3135 if (!root) { 3136 VM_STAT_ADD(anonvmstats.demotepages[3]); 3137 if (curnpgs != 0) 3138 panic("anon_try_demote_pages: " 3139 "bad large page"); 3140 3141 root = 1; 3142 curnpgs = npgs = 3143 page_get_pagecnt(pp->p_szc); 3144 3145 
ASSERT(npgs <= pgcnt); 3146 ASSERT(IS_P2ALIGNED(npgs, npgs)); 3147 ASSERT(!(page_pptonum(pp) & (npgs - 1))); 3148 } else { 3149 ASSERT(i > 0); 3150 ASSERT(page_pptonum(pp) - 1 == 3151 page_pptonum(ppa[i - 1])); 3152 if ((page_pptonum(pp) & (npgs - 1)) == 3153 npgs - 1) 3154 root = 0; 3155 } 3156 ASSERT(PAGE_EXCL(pp)); 3157 pp->p_szc = 0; 3158 ASSERT(curnpgs > 0); 3159 curnpgs--; 3160 } 3161 } 3162 if (root != 0 || curnpgs != 0) 3163 panic("anon_try_demote_pages: bad large page"); 3164 3165 for (i = 0; i < pgcnt; i++) { 3166 if ((pp = ppa[i]) != NULL) { 3167 ASSERT(!hat_page_is_mapped(pp)); 3168 ASSERT(pp->p_szc == 0); 3169 page_unlock(pp); 3170 } 3171 } 3172 if (ppasize != 0) { 3173 kmem_free(ppa, ppasize); 3174 } 3175 return (1); 3176 } 3177 3178 /* 3179 * anon_map_demotepages() can only be called by MAP_PRIVATE segments. 3180 */ 3181 int 3182 anon_map_demotepages( 3183 struct anon_map *amp, 3184 ulong_t start_idx, 3185 struct seg *seg, 3186 caddr_t addr, 3187 uint_t prot, 3188 struct vpage vpage[], 3189 struct cred *cred) 3190 { 3191 struct anon *ap; 3192 uint_t szc = seg->s_szc; 3193 pgcnt_t pgcnt = page_get_pagecnt(szc); 3194 size_t ppasize = pgcnt * sizeof (page_t *); 3195 page_t **ppa = kmem_alloc(ppasize, KM_SLEEP); 3196 page_t *pp; 3197 page_t *pl[2]; 3198 pgcnt_t i, pg_idx; 3199 ulong_t an_idx; 3200 caddr_t vaddr; 3201 int err; 3202 int retry = 0; 3203 uint_t vpprot; 3204 3205 ASSERT(RW_WRITE_HELD(&->a_rwlock)); 3206 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 3207 ASSERT(IS_P2ALIGNED(start_idx, pgcnt)); 3208 ASSERT(ppa != NULL); 3209 ASSERT(szc != 0); 3210 ASSERT(szc == amp->a_szc); 3211 3212 VM_STAT_ADD(anonvmstats.demotepages[0]); 3213 3214 top: 3215 if (anon_try_demote_pages(amp->ahp, start_idx, szc, ppa, 1)) { 3216 kmem_free(ppa, ppasize); 3217 return (0); 3218 } 3219 3220 VM_STAT_ADD(anonvmstats.demotepages[4]); 3221 3222 ASSERT(retry == 0); /* we can be here only once */ 3223 3224 vaddr = addr; 3225 for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt; 3226 
pg_idx++, an_idx++, vaddr += PAGESIZE) { 3227 ap = anon_get_ptr(amp->ahp, an_idx); 3228 if (ap == NULL) 3229 panic("anon_map_demotepages: no anon slot"); 3230 err = anon_getpage(&ap, &vpprot, pl, PAGESIZE, seg, vaddr, 3231 S_READ, cred); 3232 if (err) { 3233 for (i = 0; i < pg_idx; i++) { 3234 if ((pp = ppa[i]) != NULL) 3235 page_unlock(pp); 3236 } 3237 kmem_free(ppa, ppasize); 3238 return (err); 3239 } 3240 ppa[pg_idx] = pl[0]; 3241 } 3242 3243 err = anon_map_privatepages(amp, start_idx, szc, seg, addr, prot, ppa, 3244 vpage, -1, 0, cred); 3245 if (err > 0) { 3246 VM_STAT_ADD(anonvmstats.demotepages[5]); 3247 kmem_free(ppa, ppasize); 3248 return (err); 3249 } 3250 ASSERT(err == 0 || err == -1); 3251 if (err == -1) { 3252 VM_STAT_ADD(anonvmstats.demotepages[6]); 3253 retry = 1; 3254 goto top; 3255 } 3256 for (i = 0; i < pgcnt; i++) { 3257 ASSERT(ppa[i] != NULL); 3258 if (ppa[i]->p_szc != 0) 3259 retry = 1; 3260 page_unlock(ppa[i]); 3261 } 3262 if (retry) { 3263 VM_STAT_ADD(anonvmstats.demotepages[7]); 3264 goto top; 3265 } 3266 3267 VM_STAT_ADD(anonvmstats.demotepages[8]); 3268 3269 kmem_free(ppa, ppasize); 3270 3271 return (0); 3272 } 3273 3274 /* 3275 * Free pages of shared anon map. It's assumed that anon maps don't share anon 3276 * structures with private anon maps. Therefore all anon structures should 3277 * have at most one reference at this point. This means underlying pages can 3278 * be exclusively locked and demoted or freed. If not freeing the entire 3279 * large pages demote the ends of the region we free to be able to free 3280 * subpages. Page roots correspond to aligned index positions in anon map. 
3281 */ 3282 void 3283 anon_shmap_free_pages(struct anon_map *amp, ulong_t sidx, size_t len) 3284 { 3285 ulong_t eidx = sidx + btopr(len); 3286 pgcnt_t pages = page_get_pagecnt(amp->a_szc); 3287 struct anon_hdr *ahp = amp->ahp; 3288 ulong_t tidx; 3289 size_t size; 3290 ulong_t sidx_aligned; 3291 ulong_t eidx_aligned; 3292 3293 ASSERT(ANON_WRITE_HELD(&->a_rwlock)); 3294 ASSERT(amp->refcnt <= 1); 3295 ASSERT(amp->a_szc > 0); 3296 ASSERT(eidx <= ahp->size); 3297 ASSERT(!anon_share(ahp, sidx, btopr(len))); 3298 3299 if (len == 0) { /* XXX */ 3300 return; 3301 } 3302 3303 sidx_aligned = P2ALIGN(sidx, pages); 3304 if (sidx_aligned != sidx || 3305 (eidx < sidx_aligned + pages && eidx < ahp->size)) { 3306 if (!anon_try_demote_pages(ahp, sidx_aligned, 3307 amp->a_szc, NULL, 0)) { 3308 panic("anon_shmap_free_pages: demote failed"); 3309 } 3310 size = (eidx <= sidx_aligned + pages) ? (eidx - sidx) : 3311 P2NPHASE(sidx, pages); 3312 size <<= PAGESHIFT; 3313 anon_free(ahp, sidx, size); 3314 sidx = sidx_aligned + pages; 3315 if (eidx <= sidx) { 3316 return; 3317 } 3318 } 3319 eidx_aligned = P2ALIGN(eidx, pages); 3320 if (sidx < eidx_aligned) { 3321 anon_free_pages(ahp, sidx, 3322 (eidx_aligned - sidx) << PAGESHIFT, 3323 amp->a_szc); 3324 sidx = eidx_aligned; 3325 } 3326 ASSERT(sidx == eidx_aligned); 3327 if (eidx == eidx_aligned) { 3328 return; 3329 } 3330 tidx = eidx; 3331 if (eidx != ahp->size && anon_get_next_ptr(ahp, &tidx) != NULL && 3332 tidx - sidx < pages) { 3333 if (!anon_try_demote_pages(ahp, sidx, amp->a_szc, NULL, 0)) { 3334 panic("anon_shmap_free_pages: demote failed"); 3335 } 3336 size = (eidx - sidx) << PAGESHIFT; 3337 anon_free(ahp, sidx, size); 3338 } else { 3339 anon_free_pages(ahp, sidx, pages << PAGESHIFT, amp->a_szc); 3340 } 3341 } 3342 3343 /* 3344 * This routine should be called with amp's writer lock when there're no other 3345 * users of amp. All pcache entries of this amp must have been already 3346 * inactivated. 
We must not drop a_rwlock here to prevent new users from 3347 * attaching to this amp. 3348 */ 3349 void 3350 anonmap_purge(struct anon_map *amp) 3351 { 3352 ASSERT(ANON_WRITE_HELD(&->a_rwlock)); 3353 ASSERT(amp->refcnt <= 1); 3354 3355 if (amp->a_softlockcnt != 0) { 3356 seg_ppurge(NULL, amp, 0); 3357 } 3358 3359 /* 3360 * Since all pcache entries were already inactive before this routine 3361 * was called seg_ppurge() couldn't return while there're still 3362 * entries that can be found via the list anchored at a_phead. So we 3363 * can assert this list is empty now. a_softlockcnt may be still non 0 3364 * if asynchronous thread that manages pcache already removed pcache 3365 * entries but hasn't unlocked the pages yet. If a_softlockcnt is non 3366 * 0 we just wait on a_purgecv for shamp_reclaim() to finish. Even if 3367 * a_softlockcnt is 0 we grab a_purgemtx to avoid freeing anon map 3368 * before shamp_reclaim() is done with it. a_purgemtx also taken by 3369 * shamp_reclaim() while a_softlockcnt was still not 0 acts as a 3370 * barrier that prevents anonmap_purge() to complete while 3371 * shamp_reclaim() may still be referencing this amp. 3372 */ 3373 ASSERT(amp->a_phead.p_lnext == &->a_phead); 3374 ASSERT(amp->a_phead.p_lprev == &->a_phead); 3375 3376 mutex_enter(&->a_purgemtx); 3377 while (amp->a_softlockcnt != 0) { 3378 ASSERT(amp->a_phead.p_lnext == &->a_phead); 3379 ASSERT(amp->a_phead.p_lprev == &->a_phead); 3380 amp->a_purgewait = 1; 3381 cv_wait(&->a_purgecv, &->a_purgemtx); 3382 } 3383 mutex_exit(&->a_purgemtx); 3384 3385 ASSERT(amp->a_phead.p_lnext == &->a_phead); 3386 ASSERT(amp->a_phead.p_lprev == &->a_phead); 3387 ASSERT(amp->a_softlockcnt == 0); 3388 } 3389 3390 /* 3391 * Allocate and initialize an anon_map structure for seg 3392 * associating the given swap reservation with the new anon_map. 
3393 */ 3394 struct anon_map * 3395 anonmap_alloc(size_t size, size_t swresv, int flags) 3396 { 3397 struct anon_map *amp; 3398 int kmflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 3399 3400 amp = kmem_cache_alloc(anonmap_cache, kmflags); 3401 if (amp == NULL) { 3402 ASSERT(kmflags == KM_NOSLEEP); 3403 return (NULL); 3404 } 3405 3406 amp->ahp = anon_create(btopr(size), flags); 3407 if (amp->ahp == NULL) { 3408 ASSERT(flags == ANON_NOSLEEP); 3409 kmem_cache_free(anonmap_cache, amp); 3410 return (NULL); 3411 } 3412 amp->refcnt = 1; 3413 amp->size = size; 3414 amp->swresv = swresv; 3415 amp->locality = 0; 3416 amp->a_szc = 0; 3417 amp->a_sp = NULL; 3418 amp->a_softlockcnt = 0; 3419 amp->a_purgewait = 0; 3420 amp->a_phead.p_lnext = &->a_phead; 3421 amp->a_phead.p_lprev = &->a_phead; 3422 3423 return (amp); 3424 } 3425 3426 void 3427 anonmap_free(struct anon_map *amp) 3428 { 3429 ASSERT(amp->ahp != NULL); 3430 ASSERT(amp->refcnt == 0); 3431 ASSERT(amp->a_softlockcnt == 0); 3432 ASSERT(amp->a_phead.p_lnext == &->a_phead); 3433 ASSERT(amp->a_phead.p_lprev == &->a_phead); 3434 3435 lgrp_shm_policy_fini(amp, NULL); 3436 anon_release(amp->ahp, btopr(amp->size)); 3437 kmem_cache_free(anonmap_cache, amp); 3438 } 3439 3440 /* 3441 * Returns true if the app array has some empty slots. 3442 * The offp and lenp parameters are in/out parameters. On entry 3443 * these values represent the starting offset and length of the 3444 * mapping. When true is returned, these values may be modified 3445 * to be the largest range which includes empty slots. 
3446 */ 3447 int 3448 non_anon(struct anon_hdr *ahp, ulong_t anon_idx, u_offset_t *offp, 3449 size_t *lenp) 3450 { 3451 ulong_t i, el; 3452 ssize_t low, high; 3453 struct anon *ap; 3454 3455 low = -1; 3456 for (i = 0, el = *lenp; i < el; i += PAGESIZE, anon_idx++) { 3457 ap = anon_get_ptr(ahp, anon_idx); 3458 if (ap == NULL) { 3459 if (low == -1) 3460 low = i; 3461 high = i; 3462 } 3463 } 3464 if (low != -1) { 3465 /* 3466 * Found at least one non-anon page. 3467 * Set up the off and len return values. 3468 */ 3469 if (low != 0) 3470 *offp += low; 3471 *lenp = high - low + PAGESIZE; 3472 return (1); 3473 } 3474 return (0); 3475 } 3476 3477 /* 3478 * Return a count of the number of existing anon pages in the anon array 3479 * app in the range (off, off+len). The array and slots must be guaranteed 3480 * stable by the caller. 3481 */ 3482 pgcnt_t 3483 anon_pages(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots) 3484 { 3485 pgcnt_t cnt = 0; 3486 3487 while (nslots-- > 0) { 3488 if ((anon_get_ptr(ahp, anon_index)) != NULL) 3489 cnt++; 3490 anon_index++; 3491 } 3492 return (cnt); 3493 } 3494 3495 /* 3496 * Move reserved phys swap into memory swap (unreserve phys swap 3497 * and reserve mem swap by the same amount). 
3498 * Used by segspt when it needs to lock reserved swap npages in memory 3499 */ 3500 int 3501 anon_swap_adjust(pgcnt_t npages) 3502 { 3503 pgcnt_t unlocked_mem_swap; 3504 3505 mutex_enter(&anoninfo_lock); 3506 3507 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 3508 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 3509 3510 unlocked_mem_swap = k_anoninfo.ani_mem_resv 3511 - k_anoninfo.ani_locked_swap; 3512 if (npages > unlocked_mem_swap) { 3513 spgcnt_t adjusted_swap = npages - unlocked_mem_swap; 3514 3515 /* 3516 * if there is not enough unlocked mem swap we take missing 3517 * amount from phys swap and give it to mem swap 3518 */ 3519 if (!page_reclaim_mem(adjusted_swap, segspt_minfree, 1)) { 3520 mutex_exit(&anoninfo_lock); 3521 return (ENOMEM); 3522 } 3523 3524 k_anoninfo.ani_mem_resv += adjusted_swap; 3525 ASSERT(k_anoninfo.ani_phys_resv >= adjusted_swap); 3526 k_anoninfo.ani_phys_resv -= adjusted_swap; 3527 3528 ANI_ADD(adjusted_swap); 3529 } 3530 k_anoninfo.ani_locked_swap += npages; 3531 3532 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 3533 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 3534 3535 mutex_exit(&anoninfo_lock); 3536 3537 return (0); 3538 } 3539 3540 /* 3541 * 'unlocked' reserved mem swap so when it is unreserved it 3542 * can be moved back phys (disk) swap 3543 */ 3544 void 3545 anon_swap_restore(pgcnt_t npages) 3546 { 3547 mutex_enter(&anoninfo_lock); 3548 3549 ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv); 3550 3551 ASSERT(k_anoninfo.ani_locked_swap >= npages); 3552 k_anoninfo.ani_locked_swap -= npages; 3553 3554 ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv); 3555 3556 mutex_exit(&anoninfo_lock); 3557 } 3558 3559 /* 3560 * Return the pointer from the list for a 3561 * specified anon index. 
3562 */ 3563 ulong_t * 3564 anon_get_slot(struct anon_hdr *ahp, ulong_t an_idx) 3565 { 3566 struct anon **app; 3567 void **ppp; 3568 3569 ASSERT(an_idx < ahp->size); 3570 3571 /* 3572 * Single level case. 3573 */ 3574 if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 3575 return ((ulong_t *)&ahp->array_chunk[an_idx]); 3576 } else { 3577 3578 /* 3579 * 2 level case. 3580 */ 3581 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 3582 if (*ppp == NULL) { 3583 mutex_enter(&ahp->serial_lock); 3584 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 3585 if (*ppp == NULL) 3586 *ppp = kmem_zalloc(PAGESIZE, KM_SLEEP); 3587 mutex_exit(&ahp->serial_lock); 3588 } 3589 app = *ppp; 3590 return ((ulong_t *)&app[an_idx & ANON_CHUNK_OFF]); 3591 } 3592 } 3593 3594 void 3595 anon_array_enter(struct anon_map *amp, ulong_t an_idx, anon_sync_obj_t *sobj) 3596 { 3597 ulong_t *ap_slot; 3598 kmutex_t *mtx; 3599 kcondvar_t *cv; 3600 int hash; 3601 3602 /* 3603 * Use szc to determine anon slot(s) to appear atomic. 3604 * If szc = 0, then lock the anon slot and mark it busy. 3605 * If szc > 0, then lock the range of slots by getting the 3606 * anon_array_lock for the first anon slot, and mark only the 3607 * first anon slot busy to represent whole range being busy. 3608 */ 3609 3610 ASSERT(RW_READ_HELD(&->a_rwlock)); 3611 an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc)); 3612 hash = ANON_ARRAY_HASH(amp, an_idx); 3613 sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex; 3614 sobj->sync_cv = cv = &anon_array_cv[hash]; 3615 mutex_enter(mtx); 3616 ap_slot = anon_get_slot(amp->ahp, an_idx); 3617 while (ANON_ISBUSY(ap_slot)) 3618 cv_wait(cv, mtx); 3619 ANON_SETBUSY(ap_slot); 3620 sobj->sync_data = ap_slot; 3621 mutex_exit(mtx); 3622 } 3623 3624 int 3625 anon_array_try_enter(struct anon_map *amp, ulong_t an_idx, 3626 anon_sync_obj_t *sobj) 3627 { 3628 ulong_t *ap_slot; 3629 kmutex_t *mtx; 3630 int hash; 3631 3632 /* 3633 * Try to lock a range of anon slots. 
3634 * Use szc to determine anon slot(s) to appear atomic. 3635 * If szc = 0, then lock the anon slot and mark it busy. 3636 * If szc > 0, then lock the range of slots by getting the 3637 * anon_array_lock for the first anon slot, and mark only the 3638 * first anon slot busy to represent whole range being busy. 3639 * Fail if the mutex or the anon_array are busy. 3640 */ 3641 3642 ASSERT(RW_READ_HELD(&->a_rwlock)); 3643 an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc)); 3644 hash = ANON_ARRAY_HASH(amp, an_idx); 3645 sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex; 3646 sobj->sync_cv = &anon_array_cv[hash]; 3647 if (!mutex_tryenter(mtx)) { 3648 return (EWOULDBLOCK); 3649 } 3650 ap_slot = anon_get_slot(amp->ahp, an_idx); 3651 if (ANON_ISBUSY(ap_slot)) { 3652 mutex_exit(mtx); 3653 return (EWOULDBLOCK); 3654 } 3655 ANON_SETBUSY(ap_slot); 3656 sobj->sync_data = ap_slot; 3657 mutex_exit(mtx); 3658 return (0); 3659 } 3660 3661 void 3662 anon_array_exit(anon_sync_obj_t *sobj) 3663 { 3664 mutex_enter(sobj->sync_mutex); 3665 ASSERT(ANON_ISBUSY(sobj->sync_data)); 3666 ANON_CLRBUSY(sobj->sync_data); 3667 if (CV_HAS_WAITERS(sobj->sync_cv)) 3668 cv_broadcast(sobj->sync_cv); 3669 mutex_exit(sobj->sync_mutex); 3670 } 3671