1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 37 */ 38 39 #pragma ident "%Z%%M% %I% %E% SMI" 40 41 /* 42 * VM - anonymous pages. 43 * 44 * This layer sits immediately above the vm_swap layer. It manages 45 * physical pages that have no permanent identity in the file system 46 * name space, using the services of the vm_swap layer to allocate 47 * backing storage for these pages. Since these pages have no external 48 * identity, they are discarded when the last reference is removed. 
49 * 50 * An important function of this layer is to manage low-level sharing 51 * of pages that are logically distinct but that happen to be 52 * physically identical (e.g., the corresponding pages of the processes 53 * resulting from a fork before one process or the other changes their 54 * contents). This pseudo-sharing is present only as an optimization 55 * and is not to be confused with true sharing in which multiple 56 * address spaces deliberately contain references to the same object; 57 * such sharing is managed at a higher level. 58 * 59 * The key data structure here is the anon struct, which contains a 60 * reference count for its associated physical page and a hint about 61 * the identity of that page. Anon structs typically live in arrays, 62 * with an instance's position in its array determining where the 63 * corresponding backing storage is allocated; however, the swap_xlate() 64 * routine abstracts away this representation information so that the 65 * rest of the anon layer need not know it. (See the swap layer for 66 * more details on anon struct layout.) 67 * 68 * In future versions of the system, the association between an 69 * anon struct and its position on backing store will change so that 70 * we don't require backing store for all anonymous pages in the system. 71 * This is an important consideration for large memory systems. 72 * We can also use this technique to delay binding physical locations 73 * to anonymous pages until pageout/swapout time where we can make 74 * smarter allocation decisions to improve anonymous klustering. 75 * 76 * Many of the routines defined here take a (struct anon **) argument, 77 * which allows the code at this level to manage anon pages directly, 78 * so that callers can regard anon structs as opaque objects and not be 79 * concerned with assigning or inspecting their contents. 80 * 81 * Clients of this layer refer to anon pages indirectly.
That is, they 82 * maintain arrays of pointers to anon structs rather than maintaining 83 * anon structs themselves. The (struct anon **) arguments mentioned 84 * above are pointers to entries in these arrays. It is these arrays 85 * that capture the mapping between offsets within a given segment and 86 * the corresponding anonymous backing storage address. 87 */ 88 89 #ifdef DEBUG 90 #define ANON_DEBUG 91 #endif 92 93 #include <sys/types.h> 94 #include <sys/t_lock.h> 95 #include <sys/param.h> 96 #include <sys/systm.h> 97 #include <sys/mman.h> 98 #include <sys/cred.h> 99 #include <sys/thread.h> 100 #include <sys/vnode.h> 101 #include <sys/cpuvar.h> 102 #include <sys/swap.h> 103 #include <sys/cmn_err.h> 104 #include <sys/vtrace.h> 105 #include <sys/kmem.h> 106 #include <sys/sysmacros.h> 107 #include <sys/bitmap.h> 108 #include <sys/vmsystm.h> 109 #include <sys/tuneable.h> 110 #include <sys/debug.h> 111 #include <sys/fs/swapnode.h> 112 #include <sys/tnf_probe.h> 113 #include <sys/lgrp.h> 114 #include <sys/policy.h> 115 #include <sys/condvar_impl.h> 116 #include <sys/mutex_impl.h> 117 #include <sys/rctl.h> 118 119 #include <vm/as.h> 120 #include <vm/hat.h> 121 #include <vm/anon.h> 122 #include <vm/page.h> 123 #include <vm/vpage.h> 124 #include <vm/seg.h> 125 #include <vm/rm.h> 126 127 #include <fs/fs_subr.h> 128 129 struct vnode *anon_vp; 130 131 int anon_debug; 132 133 kmutex_t anoninfo_lock; 134 struct k_anoninfo k_anoninfo; 135 ani_free_t ani_free_pool[ANI_MAX_POOL]; 136 pad_mutex_t anon_array_lock[ANON_LOCKSIZE]; 137 kcondvar_t anon_array_cv[ANON_LOCKSIZE]; 138 139 /* 140 * Global hash table for (vp, off) -> anon slot 141 */ 142 extern int swap_maxcontig; 143 size_t anon_hash_size; 144 struct anon **anon_hash; 145 146 static struct kmem_cache *anon_cache; 147 static struct kmem_cache *anonmap_cache; 148 149 #ifdef VM_STATS 150 static struct anonvmstats_str { 151 ulong_t getpages[30]; 152 ulong_t privatepages[10]; 153 ulong_t demotepages[9]; 154 ulong_t 
decrefpages[9]; 155 ulong_t dupfillholes[4]; 156 ulong_t freepages[1]; 157 } anonvmstats; 158 #endif /* VM_STATS */ 159 160 /*ARGSUSED*/ 161 static int 162 anonmap_cache_constructor(void *buf, void *cdrarg, int kmflags) 163 { 164 struct anon_map *amp = buf; 165 166 rw_init(&->a_rwlock, NULL, RW_DEFAULT, NULL); 167 cv_init(&->a_purgecv, NULL, CV_DEFAULT, NULL); 168 mutex_init(&->a_pmtx, NULL, MUTEX_DEFAULT, NULL); 169 mutex_init(&->a_purgemtx, NULL, MUTEX_DEFAULT, NULL); 170 return (0); 171 } 172 173 /*ARGSUSED1*/ 174 static void 175 anonmap_cache_destructor(void *buf, void *cdrarg) 176 { 177 struct anon_map *amp = buf; 178 179 rw_destroy(&->a_rwlock); 180 cv_destroy(&->a_purgecv); 181 mutex_destroy(&->a_pmtx); 182 mutex_destroy(&->a_purgemtx); 183 } 184 185 kmutex_t anonhash_lock[AH_LOCK_SIZE]; 186 kmutex_t anonpages_hash_lock[AH_LOCK_SIZE]; 187 188 void 189 anon_init(void) 190 { 191 int i; 192 193 anon_hash_size = 1L << highbit(physmem / ANON_HASHAVELEN); 194 195 for (i = 0; i < AH_LOCK_SIZE; i++) { 196 mutex_init(&anonhash_lock[i], NULL, MUTEX_DEFAULT, NULL); 197 mutex_init(&anonpages_hash_lock[i], NULL, MUTEX_DEFAULT, NULL); 198 } 199 200 for (i = 0; i < ANON_LOCKSIZE; i++) { 201 mutex_init(&anon_array_lock[i].pad_mutex, NULL, 202 MUTEX_DEFAULT, NULL); 203 cv_init(&anon_array_cv[i], NULL, CV_DEFAULT, NULL); 204 } 205 206 anon_hash = (struct anon **) 207 kmem_zalloc(sizeof (struct anon *) * anon_hash_size, KM_SLEEP); 208 anon_cache = kmem_cache_create("anon_cache", sizeof (struct anon), 209 AN_CACHE_ALIGN, NULL, NULL, NULL, NULL, NULL, 0); 210 anonmap_cache = kmem_cache_create("anonmap_cache", 211 sizeof (struct anon_map), 0, 212 anonmap_cache_constructor, anonmap_cache_destructor, NULL, 213 NULL, NULL, 0); 214 swap_maxcontig = (1024 * 1024) >> PAGESHIFT; /* 1MB of pages */ 215 216 anon_vp = vn_alloc(KM_SLEEP); 217 vn_setops(anon_vp, swap_vnodeops); 218 anon_vp->v_type = VREG; 219 anon_vp->v_flag |= (VISSWAP|VISSWAPFS); 220 } 221 222 /* 223 * Global anon slot 
hash table manipulation. 224 */ 225 226 static void 227 anon_addhash(struct anon *ap) 228 { 229 int index; 230 231 ASSERT(MUTEX_HELD(&anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)])); 232 index = ANON_HASH(ap->an_vp, ap->an_off); 233 ap->an_hash = anon_hash[index]; 234 anon_hash[index] = ap; 235 } 236 237 static void 238 anon_rmhash(struct anon *ap) 239 { 240 struct anon **app; 241 242 ASSERT(MUTEX_HELD(&anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)])); 243 244 for (app = &anon_hash[ANON_HASH(ap->an_vp, ap->an_off)]; 245 *app; app = &((*app)->an_hash)) { 246 if (*app == ap) { 247 *app = ap->an_hash; 248 break; 249 } 250 } 251 } 252 253 /* 254 * The anon array interfaces. Functions allocating, 255 * freeing array of pointers, and returning/setting 256 * entries in the array of pointers for a given offset. 257 * 258 * Create the list of pointers 259 */ 260 struct anon_hdr * 261 anon_create(pgcnt_t npages, int flags) 262 { 263 struct anon_hdr *ahp; 264 ulong_t nchunks; 265 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 266 267 if ((ahp = kmem_zalloc(sizeof (struct anon_hdr), kmemflags)) == NULL) { 268 return (NULL); 269 } 270 271 mutex_init(&ahp->serial_lock, NULL, MUTEX_DEFAULT, NULL); 272 /* 273 * Single level case. 274 */ 275 ahp->size = npages; 276 if (npages <= ANON_CHUNK_SIZE || (flags & ANON_ALLOC_FORCE)) { 277 278 if (flags & ANON_ALLOC_FORCE) 279 ahp->flags |= ANON_ALLOC_FORCE; 280 281 ahp->array_chunk = kmem_zalloc( 282 ahp->size * sizeof (struct anon *), kmemflags); 283 284 if (ahp->array_chunk == NULL) { 285 kmem_free(ahp, sizeof (struct anon_hdr)); 286 return (NULL); 287 } 288 } else { 289 /* 290 * 2 Level case. 291 * anon hdr size needs to be rounded off to be a multiple 292 * of ANON_CHUNK_SIZE. This is important as various anon 293 * related functions depend on this. 294 * NOTE - 295 * anon_grow() makes anon hdr size a multiple of 296 * ANON_CHUNK_SIZE. 297 * amp size is <= anon hdr size. 298 * anon_index + seg_pgs <= anon hdr size. 
299 */ 300 ahp->size = P2ROUNDUP(npages, ANON_CHUNK_SIZE); 301 nchunks = ahp->size >> ANON_CHUNK_SHIFT; 302 303 ahp->array_chunk = kmem_zalloc(nchunks * sizeof (ulong_t *), 304 kmemflags); 305 306 if (ahp->array_chunk == NULL) { 307 kmem_free(ahp, sizeof (struct anon_hdr)); 308 return (NULL); 309 } 310 } 311 return (ahp); 312 } 313 314 /* 315 * Free the array of pointers 316 */ 317 void 318 anon_release(struct anon_hdr *ahp, pgcnt_t npages) 319 { 320 ulong_t i; 321 void **ppp; 322 ulong_t nchunks; 323 324 ASSERT(npages <= ahp->size); 325 326 /* 327 * Single level case. 328 */ 329 if (npages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 330 kmem_free(ahp->array_chunk, ahp->size * sizeof (struct anon *)); 331 } else { 332 /* 333 * 2 level case. 334 */ 335 nchunks = ahp->size >> ANON_CHUNK_SHIFT; 336 for (i = 0; i < nchunks; i++) { 337 ppp = &ahp->array_chunk[i]; 338 if (*ppp != NULL) 339 kmem_free(*ppp, PAGESIZE); 340 } 341 kmem_free(ahp->array_chunk, nchunks * sizeof (ulong_t *)); 342 } 343 mutex_destroy(&ahp->serial_lock); 344 kmem_free(ahp, sizeof (struct anon_hdr)); 345 } 346 347 /* 348 * Return the pointer from the list for a 349 * specified anon index. 350 */ 351 struct anon * 352 anon_get_ptr(struct anon_hdr *ahp, ulong_t an_idx) 353 { 354 struct anon **app; 355 356 ASSERT(an_idx < ahp->size); 357 358 /* 359 * Single level case. 360 */ 361 if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 362 return ((struct anon *) 363 ((uintptr_t)ahp->array_chunk[an_idx] & ANON_PTRMASK)); 364 } else { 365 366 /* 367 * 2 level case. 368 */ 369 app = ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 370 if (app) { 371 return ((struct anon *) 372 ((uintptr_t)app[an_idx & ANON_CHUNK_OFF] & 373 ANON_PTRMASK)); 374 } else { 375 return (NULL); 376 } 377 } 378 } 379 380 /* 381 * Return the anon pointer for the first valid entry in the anon list, 382 * starting from the given index. 
383 */ 384 struct anon * 385 anon_get_next_ptr(struct anon_hdr *ahp, ulong_t *index) 386 { 387 struct anon *ap; 388 struct anon **app; 389 ulong_t chunkoff; 390 ulong_t i; 391 ulong_t j; 392 pgcnt_t size; 393 394 i = *index; 395 size = ahp->size; 396 397 ASSERT(i < size); 398 399 if ((size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 400 /* 401 * 1 level case 402 */ 403 while (i < size) { 404 ap = (struct anon *) 405 ((uintptr_t)ahp->array_chunk[i] & ANON_PTRMASK); 406 if (ap) { 407 *index = i; 408 return (ap); 409 } 410 i++; 411 } 412 } else { 413 /* 414 * 2 level case 415 */ 416 chunkoff = i & ANON_CHUNK_OFF; 417 while (i < size) { 418 app = ahp->array_chunk[i >> ANON_CHUNK_SHIFT]; 419 if (app) 420 for (j = chunkoff; j < ANON_CHUNK_SIZE; j++) { 421 ap = (struct anon *) 422 ((uintptr_t)app[j] & ANON_PTRMASK); 423 if (ap) { 424 *index = i + (j - chunkoff); 425 return (ap); 426 } 427 } 428 chunkoff = 0; 429 i = (i + ANON_CHUNK_SIZE) & ~ANON_CHUNK_OFF; 430 } 431 } 432 *index = size; 433 return (NULL); 434 } 435 436 /* 437 * Set list entry with a given pointer for a specified offset 438 */ 439 int 440 anon_set_ptr(struct anon_hdr *ahp, ulong_t an_idx, struct anon *ap, int flags) 441 { 442 void **ppp; 443 struct anon **app; 444 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 445 uintptr_t *ap_addr; 446 447 ASSERT(an_idx < ahp->size); 448 449 /* 450 * Single level case. 451 */ 452 if (ahp->size <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 453 ap_addr = (uintptr_t *)&ahp->array_chunk[an_idx]; 454 } else { 455 456 /* 457 * 2 level case. 
458 */ 459 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 460 461 ASSERT(ppp != NULL); 462 if (*ppp == NULL) { 463 mutex_enter(&ahp->serial_lock); 464 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 465 if (*ppp == NULL) { 466 *ppp = kmem_zalloc(PAGESIZE, kmemflags); 467 if (*ppp == NULL) { 468 mutex_exit(&ahp->serial_lock); 469 return (ENOMEM); 470 } 471 } 472 mutex_exit(&ahp->serial_lock); 473 } 474 app = *ppp; 475 ap_addr = (uintptr_t *)&app[an_idx & ANON_CHUNK_OFF]; 476 } 477 *ap_addr = (*ap_addr & ~ANON_PTRMASK) | (uintptr_t)ap; 478 return (0); 479 } 480 481 /* 482 * Copy anon array into a given new anon array 483 */ 484 int 485 anon_copy_ptr(struct anon_hdr *sahp, ulong_t s_idx, 486 struct anon_hdr *dahp, ulong_t d_idx, 487 pgcnt_t npages, int flags) 488 { 489 void **sapp, **dapp; 490 void *ap; 491 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 492 493 ASSERT((s_idx < sahp->size) && (d_idx < dahp->size)); 494 ASSERT((npages <= sahp->size) && (npages <= dahp->size)); 495 496 /* 497 * Both arrays are 1 level. 498 */ 499 if (((sahp->size <= ANON_CHUNK_SIZE) && 500 (dahp->size <= ANON_CHUNK_SIZE)) || 501 ((sahp->flags & ANON_ALLOC_FORCE) && 502 (dahp->flags & ANON_ALLOC_FORCE))) { 503 504 bcopy(&sahp->array_chunk[s_idx], &dahp->array_chunk[d_idx], 505 npages * sizeof (struct anon *)); 506 return (0); 507 } 508 509 /* 510 * Both arrays are 2 levels. 
511 */ 512 if (sahp->size > ANON_CHUNK_SIZE && 513 dahp->size > ANON_CHUNK_SIZE && 514 ((sahp->flags & ANON_ALLOC_FORCE) == 0) && 515 ((dahp->flags & ANON_ALLOC_FORCE) == 0)) { 516 517 ulong_t sapidx, dapidx; 518 ulong_t *sap, *dap; 519 ulong_t chknp; 520 521 while (npages != 0) { 522 523 sapidx = s_idx & ANON_CHUNK_OFF; 524 dapidx = d_idx & ANON_CHUNK_OFF; 525 chknp = ANON_CHUNK_SIZE - MAX(sapidx, dapidx); 526 if (chknp > npages) 527 chknp = npages; 528 529 sapp = &sahp->array_chunk[s_idx >> ANON_CHUNK_SHIFT]; 530 if ((sap = *sapp) != NULL) { 531 dapp = &dahp->array_chunk[d_idx 532 >> ANON_CHUNK_SHIFT]; 533 if ((dap = *dapp) == NULL) { 534 *dapp = kmem_zalloc(PAGESIZE, 535 kmemflags); 536 if ((dap = *dapp) == NULL) 537 return (ENOMEM); 538 } 539 bcopy((sap + sapidx), (dap + dapidx), 540 chknp << ANON_PTRSHIFT); 541 } 542 s_idx += chknp; 543 d_idx += chknp; 544 npages -= chknp; 545 } 546 return (0); 547 } 548 549 /* 550 * At least one of the arrays is 2 level. 551 */ 552 while (npages--) { 553 if ((ap = anon_get_ptr(sahp, s_idx)) != NULL) { 554 ASSERT(!ANON_ISBUSY(anon_get_slot(sahp, s_idx))); 555 if (anon_set_ptr(dahp, d_idx, ap, flags) == ENOMEM) 556 return (ENOMEM); 557 } 558 s_idx++; 559 d_idx++; 560 } 561 return (0); 562 } 563 564 565 /* 566 * ANON_INITBUF is a convenience macro for anon_grow() below. It 567 * takes a buffer dst, which is at least as large as buffer src. It 568 * does a bcopy from src into dst, and then bzeros the extra bytes 569 * of dst. If tail is set, the data in src is tail aligned within 570 * dst instead of head aligned. 
571 */ 572 573 #define ANON_INITBUF(src, srclen, dst, dstsize, tail) \ 574 if (tail) { \ 575 bzero((dst), (dstsize) - (srclen)); \ 576 bcopy((src), (char *)(dst) + (dstsize) - (srclen), (srclen)); \ 577 } else { \ 578 bcopy((src), (dst), (srclen)); \ 579 bzero((char *)(dst) + (srclen), (dstsize) - (srclen)); \ 580 } 581 582 #define ANON_1_LEVEL_INC (ANON_CHUNK_SIZE / 8) 583 #define ANON_2_LEVEL_INC (ANON_1_LEVEL_INC * ANON_CHUNK_SIZE) 584 585 /* 586 * anon_grow() is used to efficiently extend an existing anon array. 587 * startidx_p points to the index into the anon array of the first page 588 * that is in use. oldseg_pgs is the number of pages in use, starting at 589 * *startidx_p. newpages is the number of additional pages desired. 590 * 591 * If startidx_p == NULL, startidx is taken to be 0 and cannot be changed. 592 * 593 * The growth is done by creating a new top level of the anon array, 594 * and (if the array is 2-level) reusing the existing second level arrays. 595 * 596 * flags can be used to specify ANON_NOSLEEP and ANON_GROWDOWN. 597 * 598 * Returns the new number of pages in the anon array. 599 */ 600 pgcnt_t 601 anon_grow(struct anon_hdr *ahp, ulong_t *startidx_p, pgcnt_t oldseg_pgs, 602 pgcnt_t newseg_pgs, int flags) 603 { 604 ulong_t startidx = startidx_p ? *startidx_p : 0; 605 pgcnt_t oldamp_pgs = ahp->size, newamp_pgs; 606 pgcnt_t oelems, nelems, totpages; 607 void **level1; 608 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 609 int growdown = (flags & ANON_GROWDOWN); 610 size_t newarrsz, oldarrsz; 611 void *level2; 612 613 ASSERT(!(startidx_p == NULL && growdown)); 614 ASSERT(startidx + oldseg_pgs <= ahp->size); 615 616 /* 617 * Determine the total number of pages needed in the new 618 * anon array. If growing down, totpages is all pages from 619 * startidx through the end of the array, plus <newseg_pgs> 620 * pages. If growing up, keep all pages from page 0 through 621 * the last page currently in use, plus <newseg_pgs> pages. 
622 */ 623 if (growdown) 624 totpages = oldamp_pgs - startidx + newseg_pgs; 625 else 626 totpages = startidx + oldseg_pgs + newseg_pgs; 627 628 /* If the array is already large enough, just return. */ 629 630 if (oldamp_pgs >= totpages) { 631 if (growdown) 632 *startidx_p = oldamp_pgs - totpages; 633 return (oldamp_pgs); 634 } 635 636 /* 637 * oldamp_pgs/newamp_pgs are the total numbers of pages represented 638 * by the corresponding arrays. 639 * oelems/nelems are the number of pointers in the top level arrays 640 * which may be either level 1 or level 2. 641 * Will the new anon array be one level or two levels? 642 */ 643 if (totpages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 644 newamp_pgs = P2ROUNDUP(totpages, ANON_1_LEVEL_INC); 645 oelems = oldamp_pgs; 646 nelems = newamp_pgs; 647 } else { 648 newamp_pgs = P2ROUNDUP(totpages, ANON_2_LEVEL_INC); 649 oelems = (oldamp_pgs + ANON_CHUNK_OFF) >> ANON_CHUNK_SHIFT; 650 nelems = newamp_pgs >> ANON_CHUNK_SHIFT; 651 } 652 653 newarrsz = nelems * sizeof (void *); 654 level1 = kmem_alloc(newarrsz, kmemflags); 655 if (level1 == NULL) 656 return (0); 657 658 /* Are we converting from a one level to a two level anon array? */ 659 660 if (newamp_pgs > ANON_CHUNK_SIZE && oldamp_pgs <= ANON_CHUNK_SIZE && 661 !(ahp->flags & ANON_ALLOC_FORCE)) { 662 663 /* 664 * Yes, we're converting to a two level. Reuse old level 1 665 * as new level 2 if it is exactly PAGESIZE. Otherwise 666 * alloc a new level 2 and copy the old level 1 data into it. 
667 */ 668 if (oldamp_pgs == ANON_CHUNK_SIZE) { 669 level2 = (void *)ahp->array_chunk; 670 } else { 671 level2 = kmem_alloc(PAGESIZE, kmemflags); 672 if (level2 == NULL) { 673 kmem_free(level1, newarrsz); 674 return (0); 675 } 676 oldarrsz = oldamp_pgs * sizeof (void *); 677 678 ANON_INITBUF(ahp->array_chunk, oldarrsz, 679 level2, PAGESIZE, growdown); 680 kmem_free(ahp->array_chunk, oldarrsz); 681 } 682 bzero(level1, newarrsz); 683 if (growdown) 684 level1[nelems - 1] = level2; 685 else 686 level1[0] = level2; 687 } else { 688 oldarrsz = oelems * sizeof (void *); 689 690 ANON_INITBUF(ahp->array_chunk, oldarrsz, 691 level1, newarrsz, growdown); 692 kmem_free(ahp->array_chunk, oldarrsz); 693 } 694 695 ahp->array_chunk = level1; 696 ahp->size = newamp_pgs; 697 if (growdown) 698 *startidx_p = newamp_pgs - totpages; 699 700 return (newamp_pgs); 701 } 702 703 704 /* 705 * Called from clock handler to sync ani_free value. 706 */ 707 708 void 709 set_anoninfo(void) 710 { 711 int ix; 712 pgcnt_t total = 0; 713 714 for (ix = 0; ix < ANI_MAX_POOL; ix++) { 715 total += ani_free_pool[ix].ani_count; 716 } 717 k_anoninfo.ani_free = total; 718 } 719 720 /* 721 * Reserve anon space. 722 * 723 * It's no longer simply a matter of incrementing ani_resv to 724 * reserve swap space, we need to check memory-based as well 725 * as disk-backed (physical) swap. The following algorithm 726 * is used: 727 * Check the space on physical swap 728 * i.e. amount needed < ani_max - ani_phys_resv 729 * If we are swapping on swapfs check 730 * amount needed < (availrmem - swapfs_minfree) 731 * Since the algorithm to check for the quantity of swap space is 732 * almost the same as that for reserving it, we'll just use anon_resvmem 733 * with a flag to decrement availrmem. 734 * 735 * Return non-zero on success. 
736 */ 737 int 738 anon_resvmem(size_t size, boolean_t takemem, zone_t *zone, int tryhard) 739 { 740 pgcnt_t npages = btopr(size); 741 pgcnt_t mswap_pages = 0; 742 pgcnt_t pswap_pages = 0; 743 proc_t *p = curproc; 744 745 if (zone != NULL && takemem) { 746 /* test zone.max-swap resource control */ 747 mutex_enter(&p->p_lock); 748 if (rctl_incr_swap(p, zone, ptob(npages)) != 0) { 749 mutex_exit(&p->p_lock); 750 return (0); 751 } 752 mutex_exit(&p->p_lock); 753 } 754 mutex_enter(&anoninfo_lock); 755 756 /* 757 * pswap_pages is the number of pages we can take from 758 * physical (i.e. disk-backed) swap. 759 */ 760 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 761 pswap_pages = k_anoninfo.ani_max - k_anoninfo.ani_phys_resv; 762 763 ANON_PRINT(A_RESV, 764 ("anon_resvmem: npages %lu takemem %u pswap %lu caller %p\n", 765 npages, takemem, pswap_pages, (void *)caller())); 766 767 if (npages <= pswap_pages) { 768 /* 769 * we have enough space on a physical swap 770 */ 771 if (takemem) 772 k_anoninfo.ani_phys_resv += npages; 773 mutex_exit(&anoninfo_lock); 774 return (1); 775 } else if (pswap_pages != 0) { 776 /* 777 * we have some space on a physical swap 778 */ 779 if (takemem) { 780 /* 781 * use up remainder of phys swap 782 */ 783 k_anoninfo.ani_phys_resv += pswap_pages; 784 ASSERT(k_anoninfo.ani_phys_resv == k_anoninfo.ani_max); 785 } 786 } 787 /* 788 * since (npages > pswap_pages) we need mem swap 789 * mswap_pages is the number of pages needed from availrmem 790 */ 791 ASSERT(npages > pswap_pages); 792 mswap_pages = npages - pswap_pages; 793 794 ANON_PRINT(A_RESV, ("anon_resvmem: need %ld pages from memory\n", 795 mswap_pages)); 796 797 /* 798 * priv processes can reserve memory as swap as long as availrmem 799 * remains greater than swapfs_minfree; in the case of non-priv 800 * processes, memory can be reserved as swap only if availrmem 801 * doesn't fall below (swapfs_minfree + swapfs_reserve). 
Thus, 802 * swapfs_reserve amount of memswap is not available to non-priv 803 * processes. This protects daemons such as automounter dying 804 * as a result of application processes eating away almost entire 805 * membased swap. This safeguard becomes useless if apps are run 806 * with root access. 807 * 808 * swapfs_reserve is minimum of 4Mb or 1/16 of physmem. 809 * 810 */ 811 if (tryhard) { 812 mutex_exit(&anoninfo_lock); 813 (void) page_reclaim_mem(mswap_pages, 814 swapfs_minfree + swapfs_reserve, 0); 815 mutex_enter(&anoninfo_lock); 816 } 817 818 mutex_enter(&freemem_lock); 819 if (availrmem > (swapfs_minfree + swapfs_reserve + mswap_pages) || 820 (availrmem > (swapfs_minfree + mswap_pages) && 821 secpolicy_resource(CRED()) == 0)) { 822 823 if (takemem) { 824 /* 825 * Take the memory from the rest of the system. 826 */ 827 availrmem -= mswap_pages; 828 mutex_exit(&freemem_lock); 829 k_anoninfo.ani_mem_resv += mswap_pages; 830 ANI_ADD(mswap_pages); 831 ANON_PRINT((A_RESV | A_MRESV), 832 ("anon_resvmem: took %ld pages of availrmem\n", 833 mswap_pages)); 834 } else { 835 mutex_exit(&freemem_lock); 836 } 837 838 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 839 mutex_exit(&anoninfo_lock); 840 return (1); 841 842 } else { 843 /* 844 * Fail if not enough memory 845 */ 846 847 if (takemem) { 848 k_anoninfo.ani_phys_resv -= pswap_pages; 849 } 850 851 mutex_exit(&freemem_lock); 852 mutex_exit(&anoninfo_lock); 853 ANON_PRINT(A_RESV, 854 ("anon_resvmem: not enough space from swapfs\n")); 855 if (zone != NULL && takemem) 856 rctl_decr_swap(zone, ptob(npages)); 857 return (0); 858 } 859 } 860 861 /* 862 * Give back an anon reservation. 
863 */ 864 void 865 anon_unresvmem(size_t size, zone_t *zone) 866 { 867 pgcnt_t npages = btopr(size); 868 spgcnt_t mem_free_pages = 0; 869 pgcnt_t phys_free_slots; 870 #ifdef ANON_DEBUG 871 pgcnt_t mem_resv; 872 #endif 873 if (zone != NULL) 874 rctl_decr_swap(zone, ptob(npages)); 875 876 mutex_enter(&anoninfo_lock); 877 878 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 879 880 /* 881 * If some of this reservation belonged to swapfs 882 * give it back to availrmem. 883 * ani_mem_resv is the amount of availrmem swapfs has reserved. 884 * but some of that memory could be locked by segspt so we can only 885 * return non locked ani_mem_resv back to availrmem 886 */ 887 if (k_anoninfo.ani_mem_resv > k_anoninfo.ani_locked_swap) { 888 ANON_PRINT((A_RESV | A_MRESV), 889 ("anon_unresv: growing availrmem by %ld pages\n", 890 MIN(k_anoninfo.ani_mem_resv, npages))); 891 892 mem_free_pages = MIN((spgcnt_t)(k_anoninfo.ani_mem_resv - 893 k_anoninfo.ani_locked_swap), npages); 894 mutex_enter(&freemem_lock); 895 availrmem += mem_free_pages; 896 mutex_exit(&freemem_lock); 897 k_anoninfo.ani_mem_resv -= mem_free_pages; 898 899 ANI_ADD(-mem_free_pages); 900 } 901 /* 902 * The remainder of the pages is returned to phys swap 903 */ 904 ASSERT(npages >= mem_free_pages); 905 phys_free_slots = npages - mem_free_pages; 906 907 if (phys_free_slots) { 908 k_anoninfo.ani_phys_resv -= phys_free_slots; 909 } 910 911 #ifdef ANON_DEBUG 912 mem_resv = k_anoninfo.ani_mem_resv; 913 #endif 914 915 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 916 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 917 918 mutex_exit(&anoninfo_lock); 919 920 ANON_PRINT(A_RESV, ("anon_unresv: %lu, tot %lu, caller %p\n", 921 npages, mem_resv, (void *)caller())); 922 } 923 924 /* 925 * Allocate an anon slot and return it with the lock held. 
926 */ 927 struct anon * 928 anon_alloc(struct vnode *vp, anoff_t off) 929 { 930 struct anon *ap; 931 kmutex_t *ahm; 932 933 ap = kmem_cache_alloc(anon_cache, KM_SLEEP); 934 if (vp == NULL) { 935 swap_alloc(ap); 936 } else { 937 ap->an_vp = vp; 938 ap->an_off = off; 939 } 940 ap->an_refcnt = 1; 941 ap->an_pvp = NULL; 942 ap->an_poff = 0; 943 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 944 mutex_enter(ahm); 945 anon_addhash(ap); 946 mutex_exit(ahm); 947 ANI_ADD(-1); 948 ANON_PRINT(A_ANON, ("anon_alloc: returning ap %p, vp %p\n", 949 (void *)ap, (ap ? (void *)ap->an_vp : NULL))); 950 return (ap); 951 } 952 953 /* 954 * Called for pages locked in memory via softlock/pagelock/mlock to make sure 955 * such pages don't consume any physical swap resources needed for swapping 956 * unlocked pages. 957 */ 958 void 959 anon_swap_free(struct anon *ap, page_t *pp) 960 { 961 kmutex_t *ahm; 962 963 ASSERT(ap != NULL); 964 ASSERT(pp != NULL); 965 ASSERT(PAGE_LOCKED(pp)); 966 ASSERT(pp->p_vnode != NULL); 967 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 968 ASSERT(ap->an_refcnt != 0); 969 ASSERT(pp->p_vnode == ap->an_vp); 970 ASSERT(pp->p_offset == ap->an_off); 971 972 if (ap->an_pvp == NULL) 973 return; 974 975 page_io_lock(pp); 976 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 977 mutex_enter(ahm); 978 979 ASSERT(ap->an_refcnt != 0); 980 ASSERT(pp->p_vnode == ap->an_vp); 981 ASSERT(pp->p_offset == ap->an_off); 982 983 if (ap->an_pvp != NULL) { 984 swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE); 985 ap->an_pvp = NULL; 986 ap->an_poff = 0; 987 mutex_exit(ahm); 988 hat_setmod(pp); 989 } else { 990 mutex_exit(ahm); 991 } 992 page_io_unlock(pp); 993 } 994 995 /* 996 * Decrement the reference count of an anon page. 997 * If reference count goes to zero, free it and 998 * its associated page (if any). 
999 */ 1000 void 1001 anon_decref(struct anon *ap) 1002 { 1003 page_t *pp; 1004 struct vnode *vp; 1005 anoff_t off; 1006 kmutex_t *ahm; 1007 1008 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1009 mutex_enter(ahm); 1010 ASSERT(ap->an_refcnt != 0); 1011 if (ap->an_refcnt == 0) 1012 panic("anon_decref: slot count 0"); 1013 if (--ap->an_refcnt == 0) { 1014 swap_xlate(ap, &vp, &off); 1015 anon_rmhash(ap); 1016 if (ap->an_pvp != NULL) 1017 swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE); 1018 mutex_exit(ahm); 1019 1020 /* 1021 * If there is a page for this anon slot we will need to 1022 * call VN_DISPOSE to get rid of the vp association and 1023 * put the page back on the free list as really free. 1024 * Acquire the "exclusive" lock to ensure that any 1025 * pending i/o always completes before the swap slot 1026 * is freed. 1027 */ 1028 pp = page_lookup(vp, (u_offset_t)off, SE_EXCL); 1029 if (pp != NULL) { 1030 /*LINTED: constant in conditional context */ 1031 VN_DISPOSE(pp, B_INVAL, 0, kcred); 1032 } 1033 ANON_PRINT(A_ANON, ("anon_decref: free ap %p, vp %p\n", 1034 (void *)ap, (void *)ap->an_vp)); 1035 1036 kmem_cache_free(anon_cache, ap); 1037 1038 ANI_ADD(1); 1039 } else { 1040 mutex_exit(ahm); 1041 } 1042 } 1043 1044 1045 /* 1046 * check an_refcnt of the root anon slot (anon_index argument is aligned at 1047 * seg->s_szc level) to determine whether COW processing is required. 1048 * anonpages_hash_lock[] held on the root ap ensures that if root's 1049 * refcnt is 1 all other refcnt's are 1 as well (and they can't increase 1050 * later since this process can't fork while its AS lock is held). 1051 * 1052 * returns 1 if the root anon slot has a refcnt > 1 otherwise returns 0. 
 */
int
anon_szcshare(struct anon_hdr *ahp, ulong_t anon_index)
{
	struct anon *ap;
	kmutex_t *ahmpages = NULL;

	/* An empty root slot is reported as not shared. */
	ap = anon_get_ptr(ahp, anon_index);
	if (ap == NULL)
		return (0);

	/* Sample the root slot's refcnt under its anonpages_hash_lock. */
	ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, ap->an_off)];
	mutex_enter(ahmpages);
	ASSERT(ap->an_refcnt >= 1);
	if (ap->an_refcnt == 1) {
		mutex_exit(ahmpages);
		return (0);
	}
	mutex_exit(ahmpages);
	return (1);
}
/*
 * Check 'nslots' anon slots for refcnt > 1.
 *
 * Empty (NULL) slots are skipped.  No lock is taken here; an_refcnt is
 * read directly.
 *
 * returns 1 if any of the 'nslots' anon slots has a refcnt > 1 otherwise
 * returns 0.
 */
static int
anon_share(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots)
{
	struct anon *ap;

	while (nslots-- > 0) {
		if ((ap = anon_get_ptr(ahp, anon_index)) != NULL &&
		    ap->an_refcnt > 1)
			return (1);
		anon_index++;
	}

	return (0);
}

/*
 * Drop one reference from every anon slot of the large page region that
 * starts at an_idx.  an_idx must be aligned to the page count of szc; the
 * number of slots processed is clipped to the end of the anon array.
 *
 * The root slot's refcnt, sampled under its anonpages_hash_lock, selects
 * one of two modes:
 *
 *  - refcnt == 1 (or the root slot is empty): the lock is dropped and
 *    each slot is cleared from the array, removed from the hash, has its
 *    physical swap freed and is returned to anon_cache.  Pages are
 *    disposed of one at a time when they are PAGESIZE pages, or as a
 *    group when the page found is a constituent of a large page.
 *
 *  - refcnt > 1: the slots are only cleared from the array and their
 *    refcnts decremented, with anonpages_hash_lock held across the whole
 *    loop.
 */
static void
anon_decref_pages(
	struct anon_hdr *ahp,
	ulong_t an_idx,
	uint_t szc)
{
	struct anon *ap = anon_get_ptr(ahp, an_idx);
	kmutex_t *ahmpages = NULL;
	page_t *pp;
	pgcnt_t pgcnt = page_get_pagecnt(szc);
	pgcnt_t i;
	struct vnode *vp;
	anoff_t off;
	kmutex_t *ahm;
#ifdef DEBUG
	/* Only referenced from ASSERTs; defaults to 1 for an empty root. */
	int refcnt = 1;
#endif

	ASSERT(szc != 0);
	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
	ASSERT(IS_P2ALIGNED(an_idx, pgcnt));
	ASSERT(an_idx < ahp->size);

	if (ahp->size - an_idx < pgcnt) {
		/*
		 * In case of shared mappings total anon map size may not be
		 * the largest page size aligned.
		 */
		pgcnt = ahp->size - an_idx;
	}

	VM_STAT_ADD(anonvmstats.decrefpages[0]);

	/*
	 * Decide the mode from the root slot.  ahmpages stays non-NULL
	 * (lock held) only when the root slot's refcnt is greater than 1.
	 */
	if (ap != NULL) {
		ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, ap->an_off)];
		mutex_enter(ahmpages);
		ASSERT((refcnt = ap->an_refcnt) != 0);
		VM_STAT_ADD(anonvmstats.decrefpages[1]);
		if (ap->an_refcnt == 1) {
			VM_STAT_ADD(anonvmstats.decrefpages[2]);
			ASSERT(!anon_share(ahp, an_idx, pgcnt));
			mutex_exit(ahmpages);
			ahmpages = NULL;
		}
	}

	i = 0;
	while (i < pgcnt) {
		if ((ap = anon_get_ptr(ahp, an_idx + i)) == NULL) {
			/* Holes are only expected in the unshared mode. */
			ASSERT(refcnt == 1 && ahmpages == NULL);
			i++;
			continue;
		}
		ASSERT(ap->an_refcnt == refcnt);
		ASSERT(ahmpages != NULL || ap->an_refcnt == 1);
		ASSERT(ahmpages == NULL || ap->an_refcnt > 1);

		if (ahmpages == NULL) {
			/* Unshared: this slot is going away for good. */
			swap_xlate(ap, &vp, &off);
			pp = page_lookup(vp, (u_offset_t)off, SE_EXCL);
			if (pp == NULL || pp->p_szc == 0) {
				/* No page, or a PAGESIZE page: free one slot. */
				VM_STAT_ADD(anonvmstats.decrefpages[3]);
				ahm = &anonhash_lock[AH_LOCK(ap->an_vp,
				    ap->an_off)];
				(void) anon_set_ptr(ahp, an_idx + i, NULL,
				    ANON_SLEEP);
				mutex_enter(ahm);
				ap->an_refcnt--;
				ASSERT(ap->an_refcnt == 0);
				anon_rmhash(ap);
				if (ap->an_pvp)
					swap_phys_free(ap->an_pvp, ap->an_poff,
					    PAGESIZE);
				mutex_exit(ahm);
				/*
				 * The first lookup found nothing; look again
				 * now that the slot is out of the hash.
				 */
				if (pp == NULL) {
					pp = page_lookup(vp, (u_offset_t)off,
					    SE_EXCL);
					ASSERT(pp == NULL || pp->p_szc == 0);
				}
				if (pp != NULL) {
					VM_STAT_ADD(anonvmstats.decrefpages[4]);
					/*LINTED*/
					VN_DISPOSE(pp, B_INVAL, 0, kcred);
				}
				kmem_cache_free(anon_cache, ap);
				ANI_ADD(1);
				i++;
			} else {
				/*
				 * pp is a constituent of a large page of
				 * curpgcnt pages.  Collect all constituents
				 * (locked SE_EXCL) in ppa[], free the slots,
				 * then dispose of the pages as a group.
				 */
				pgcnt_t j;
				pgcnt_t curpgcnt =
				    page_get_pagecnt(pp->p_szc);
				size_t ppasize = curpgcnt * sizeof (page_t *);
				page_t **ppa = kmem_alloc(ppasize, KM_SLEEP);
				int dispose = 0;

				VM_STAT_ADD(anonvmstats.decrefpages[5]);

				ASSERT(pp->p_szc <= szc);
				ASSERT(IS_P2ALIGNED(curpgcnt, curpgcnt));
				ASSERT(IS_P2ALIGNED(i, curpgcnt));
				ASSERT(i + curpgcnt <= pgcnt);
				ASSERT(!(page_pptonum(pp) & (curpgcnt - 1)));
				ppa[0] = pp;
				for (j = i + 1; j < i + curpgcnt; j++) {
					ap = anon_get_ptr(ahp, an_idx + j);
					ASSERT(ap != NULL &&
					    ap->an_refcnt == 1);
					swap_xlate(ap, &vp, &off);
					pp = page_lookup(vp, (u_offset_t)off,
					    SE_EXCL);
					if (pp == NULL)
						panic("anon_decref_pages: "
						    "no page");

					(void) hat_pageunload(pp,
					    HAT_FORCE_PGUNLOAD);
					ASSERT(pp->p_szc == ppa[0]->p_szc);
					ASSERT(page_pptonum(pp) - 1 ==
					    page_pptonum(ppa[j - i - 1]));
					ppa[j - i] = pp;
					/*
					 * A backing vnode whose dispose op is
					 * not fs_dispose forces the per-page
					 * VN_DISPOSE path below.
					 */
					if (ap->an_pvp != NULL &&
					    !vn_matchopval(ap->an_pvp,
					    VOPNAME_DISPOSE,
					    (fs_generic_func_p)fs_dispose))
						dispose = 1;
				}
				/* Release every anon slot of this large page. */
				for (j = i; j < i + curpgcnt; j++) {
					ap = anon_get_ptr(ahp, an_idx + j);
					ASSERT(ap != NULL &&
					    ap->an_refcnt == 1);
					ahm = &anonhash_lock[AH_LOCK(ap->an_vp,
					    ap->an_off)];
					(void) anon_set_ptr(ahp, an_idx + j,
					    NULL, ANON_SLEEP);
					mutex_enter(ahm);
					ap->an_refcnt--;
					ASSERT(ap->an_refcnt == 0);
					anon_rmhash(ap);
					if (ap->an_pvp)
						swap_phys_free(ap->an_pvp,
						    ap->an_poff, PAGESIZE);
					mutex_exit(ahm);
					kmem_cache_free(anon_cache, ap);
					ANI_ADD(1);
				}
				if (!dispose) {
					VM_STAT_ADD(anonvmstats.decrefpages[6]);
					page_destroy_pages(ppa[0]);
				} else {
					/*
					 * Demote every constituent to szc 0
					 * first, then dispose of each page
					 * individually.
					 */
					VM_STAT_ADD(anonvmstats.decrefpages[7]);
					for (j = 0; j < curpgcnt; j++) {
						ASSERT(PAGE_EXCL(ppa[j]));
						ppa[j]->p_szc = 0;
					}
					for (j = 0; j < curpgcnt; j++) {
						ASSERT(!hat_page_is_mapped(
						    ppa[j]));
						/*LINTED*/
						VN_DISPOSE(ppa[j], B_INVAL, 0,
						    kcred);
					}
				}
				kmem_free(ppa, ppasize);
				i += curpgcnt;
			}
		} else {
			/*
			 * Shared: other references remain, so only clear the
			 * array entry and drop this reference.
			 */
			VM_STAT_ADD(anonvmstats.decrefpages[8]);
			(void) anon_set_ptr(ahp, an_idx + i, NULL, ANON_SLEEP);
			ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)];
			mutex_enter(ahm);
			ap->an_refcnt--;
			mutex_exit(ahm);
			i++;
		}
	}

	if (ahmpages != NULL) {
		mutex_exit(ahmpages);
	}
}

/*
 * Duplicate references to size bytes worth of anon pages.
 * Used when duplicating a segment that contains private anon pages.
 * This code assumes that procedure calling this one has already used
 * hat_chgprot() to disable write access to the range of addresses
 * that *old actually refers to.
 */
void
anon_dup(struct anon_hdr *old, ulong_t old_idx, struct anon_hdr *new,
    ulong_t new_idx, size_t size)
{
	spgcnt_t npages;
	kmutex_t *ahm;
	struct anon *ap;
	ulong_t off;
	ulong_t index;

	npages = btopr(size);
	while (npages > 0) {
		/* Jump to the next populated slot at or after old_idx. */
		index = old_idx;
		if ((ap = anon_get_next_ptr(old, &index)) == NULL)
			break;

		ASSERT(!ANON_ISBUSY(anon_get_slot(old, index)));
		/* Account for the empty slots that were skipped. */
		off = index - old_idx;
		npages -= off;
		if (npages <= 0)
			break;

		/* Share the slot with the new array and take a reference. */
		(void) anon_set_ptr(new, new_idx + off, ap, ANON_SLEEP);
		ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)];

		mutex_enter(ahm);
		ap->an_refcnt++;
		mutex_exit(ahm);

		/* Advance both arrays past the slot just duplicated. */
		off++;
		new_idx += off;
		old_idx += off;
		npages--;
	}
}

/*
 * Just like anon_dup but also guarantees there are no holes (unallocated anon
 * slots) within any large page region. That means if a large page region is
 * empty in the old array it will skip it. If there are 1 or more valid slots
 * in the large page region of the old array it will make sure to fill in any
 * unallocated ones and also copy them to the new array. If noalloc is 1 large
 * page region should either have no valid anon slots or all slots should be
 * valid.
1327 */ 1328 void 1329 anon_dup_fill_holes( 1330 struct anon_hdr *old, 1331 ulong_t old_idx, 1332 struct anon_hdr *new, 1333 ulong_t new_idx, 1334 size_t size, 1335 uint_t szc, 1336 int noalloc) 1337 { 1338 struct anon *ap; 1339 spgcnt_t npages; 1340 kmutex_t *ahm, *ahmpages = NULL; 1341 pgcnt_t pgcnt, i; 1342 ulong_t index, off; 1343 #ifdef DEBUG 1344 int refcnt; 1345 #endif 1346 1347 ASSERT(szc != 0); 1348 pgcnt = page_get_pagecnt(szc); 1349 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1350 npages = btopr(size); 1351 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1352 ASSERT(IS_P2ALIGNED(old_idx, pgcnt)); 1353 1354 VM_STAT_ADD(anonvmstats.dupfillholes[0]); 1355 1356 while (npages > 0) { 1357 index = old_idx; 1358 1359 /* 1360 * Find the next valid slot. 1361 */ 1362 if (anon_get_next_ptr(old, &index) == NULL) 1363 break; 1364 1365 ASSERT(!ANON_ISBUSY(anon_get_slot(old, index))); 1366 /* 1367 * Now backup index to the beginning of the 1368 * current large page region of the old array. 1369 */ 1370 index = P2ALIGN(index, pgcnt); 1371 off = index - old_idx; 1372 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1373 npages -= off; 1374 if (npages <= 0) 1375 break; 1376 1377 /* 1378 * Fill and copy a large page regions worth 1379 * of anon slots. 1380 */ 1381 for (i = 0; i < pgcnt; i++) { 1382 if ((ap = anon_get_ptr(old, index + i)) == NULL) { 1383 if (noalloc) { 1384 panic("anon_dup_fill_holes: " 1385 "empty anon slot\n"); 1386 } 1387 VM_STAT_ADD(anonvmstats.dupfillholes[1]); 1388 ap = anon_alloc(NULL, 0); 1389 (void) anon_set_ptr(old, index + i, ap, 1390 ANON_SLEEP); 1391 } else if (i == 0) { 1392 /* 1393 * make the increment of all refcnts of all 1394 * anon slots of a large page appear atomic by 1395 * getting an anonpages_hash_lock for the 1396 * first anon slot of a large page. 
1397 */ 1398 int hash = AH_LOCK(ap->an_vp, ap->an_off); 1399 1400 VM_STAT_ADD(anonvmstats.dupfillholes[2]); 1401 1402 ahmpages = &anonpages_hash_lock[hash]; 1403 mutex_enter(ahmpages); 1404 /*LINTED*/ 1405 ASSERT(refcnt = ap->an_refcnt); 1406 1407 VM_STAT_COND_ADD(ap->an_refcnt > 1, 1408 anonvmstats.dupfillholes[3]); 1409 } 1410 (void) anon_set_ptr(new, new_idx + off + i, ap, 1411 ANON_SLEEP); 1412 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1413 mutex_enter(ahm); 1414 ASSERT(ahmpages != NULL || ap->an_refcnt == 1); 1415 ASSERT(i == 0 || ahmpages == NULL || 1416 refcnt == ap->an_refcnt); 1417 ap->an_refcnt++; 1418 mutex_exit(ahm); 1419 } 1420 if (ahmpages != NULL) { 1421 mutex_exit(ahmpages); 1422 ahmpages = NULL; 1423 } 1424 off += pgcnt; 1425 new_idx += off; 1426 old_idx += off; 1427 npages -= pgcnt; 1428 } 1429 } 1430 1431 /* 1432 * Used when a segment with a vnode changes szc. similarly to 1433 * anon_dup_fill_holes() makes sure each large page region either has no anon 1434 * slots or all of them. but new slots are created by COWing the file 1435 * pages. on entrance no anon slots should be shared. 1436 */ 1437 int 1438 anon_fill_cow_holes( 1439 struct seg *seg, 1440 caddr_t addr, 1441 struct anon_hdr *ahp, 1442 ulong_t an_idx, 1443 struct vnode *vp, 1444 u_offset_t vp_off, 1445 size_t size, 1446 uint_t szc, 1447 uint_t prot, 1448 struct vpage vpage[], 1449 struct cred *cred) 1450 { 1451 struct anon *ap; 1452 spgcnt_t npages; 1453 pgcnt_t pgcnt, i; 1454 ulong_t index, off; 1455 int err = 0; 1456 int pageflags = 0; 1457 1458 ASSERT(szc != 0); 1459 pgcnt = page_get_pagecnt(szc); 1460 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1461 npages = btopr(size); 1462 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1463 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1464 1465 while (npages > 0) { 1466 index = an_idx; 1467 1468 /* 1469 * Find the next valid slot. 
1470 */ 1471 if (anon_get_next_ptr(ahp, &index) == NULL) { 1472 break; 1473 } 1474 1475 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1476 /* 1477 * Now backup index to the beginning of the 1478 * current large page region of the anon array. 1479 */ 1480 index = P2ALIGN(index, pgcnt); 1481 off = index - an_idx; 1482 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1483 npages -= off; 1484 if (npages <= 0) 1485 break; 1486 an_idx += off; 1487 vp_off += ptob(off); 1488 addr += ptob(off); 1489 if (vpage != NULL) { 1490 vpage += off; 1491 } 1492 1493 for (i = 0; i < pgcnt; i++, an_idx++, vp_off += PAGESIZE) { 1494 if ((ap = anon_get_ptr(ahp, an_idx)) == NULL) { 1495 page_t *pl[1 + 1]; 1496 page_t *pp; 1497 1498 err = VOP_GETPAGE(vp, vp_off, PAGESIZE, NULL, 1499 pl, PAGESIZE, seg, addr, S_READ, cred, 1500 NULL); 1501 if (err) { 1502 break; 1503 } 1504 if (vpage != NULL) { 1505 prot = VPP_PROT(vpage); 1506 pageflags = VPP_ISPPLOCK(vpage) ? 1507 LOCK_PAGE : 0; 1508 } 1509 pp = anon_private(&ap, seg, addr, prot, pl[0], 1510 pageflags, cred); 1511 if (pp == NULL) { 1512 err = ENOMEM; 1513 break; 1514 } 1515 (void) anon_set_ptr(ahp, an_idx, ap, 1516 ANON_SLEEP); 1517 page_unlock(pp); 1518 } 1519 ASSERT(ap->an_refcnt == 1); 1520 addr += PAGESIZE; 1521 if (vpage != NULL) { 1522 vpage++; 1523 } 1524 } 1525 npages -= pgcnt; 1526 } 1527 1528 return (err); 1529 } 1530 1531 /* 1532 * Free a group of "size" anon pages, size in bytes, 1533 * and clear out the pointers to the anon entries. 
1534 */ 1535 void 1536 anon_free(struct anon_hdr *ahp, ulong_t index, size_t size) 1537 { 1538 spgcnt_t npages; 1539 struct anon *ap; 1540 ulong_t old; 1541 1542 npages = btopr(size); 1543 1544 while (npages > 0) { 1545 old = index; 1546 if ((ap = anon_get_next_ptr(ahp, &index)) == NULL) 1547 break; 1548 1549 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1550 npages -= index - old; 1551 if (npages <= 0) 1552 break; 1553 1554 (void) anon_set_ptr(ahp, index, NULL, ANON_SLEEP); 1555 anon_decref(ap); 1556 /* 1557 * Bump index and decrement page count 1558 */ 1559 index++; 1560 npages--; 1561 } 1562 } 1563 1564 void 1565 anon_free_pages( 1566 struct anon_hdr *ahp, 1567 ulong_t an_idx, 1568 size_t size, 1569 uint_t szc) 1570 { 1571 spgcnt_t npages; 1572 pgcnt_t pgcnt; 1573 ulong_t index, off; 1574 1575 ASSERT(szc != 0); 1576 pgcnt = page_get_pagecnt(szc); 1577 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1578 npages = btopr(size); 1579 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1580 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1581 ASSERT(an_idx < ahp->size); 1582 1583 VM_STAT_ADD(anonvmstats.freepages[0]); 1584 1585 while (npages > 0) { 1586 index = an_idx; 1587 1588 /* 1589 * Find the next valid slot. 1590 */ 1591 if (anon_get_next_ptr(ahp, &index) == NULL) 1592 break; 1593 1594 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1595 /* 1596 * Now backup index to the beginning of the 1597 * current large page region of the old array. 
1598 */ 1599 index = P2ALIGN(index, pgcnt); 1600 off = index - an_idx; 1601 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1602 npages -= off; 1603 if (npages <= 0) 1604 break; 1605 1606 anon_decref_pages(ahp, index, szc); 1607 1608 off += pgcnt; 1609 an_idx += off; 1610 npages -= pgcnt; 1611 } 1612 } 1613 1614 /* 1615 * Make anonymous pages discardable 1616 */ 1617 void 1618 anon_disclaim(struct anon_map *amp, ulong_t index, size_t size) 1619 { 1620 spgcnt_t npages = btopr(size); 1621 struct anon *ap; 1622 struct vnode *vp; 1623 anoff_t off; 1624 page_t *pp, *root_pp; 1625 kmutex_t *ahm; 1626 pgcnt_t pgcnt; 1627 ulong_t old_idx, idx, i; 1628 struct anon_hdr *ahp = amp->ahp; 1629 anon_sync_obj_t cookie; 1630 1631 ASSERT(RW_READ_HELD(&->a_rwlock)); 1632 pgcnt = 1; 1633 for (; npages > 0; index = (pgcnt == 1) ? index + 1 : 1634 P2ROUNDUP(index + 1, pgcnt), npages -= pgcnt) { 1635 1636 /* 1637 * get anon pointer and index for the first valid entry 1638 * in the anon list, starting from "index" 1639 */ 1640 old_idx = index; 1641 if ((ap = anon_get_next_ptr(ahp, &index)) == NULL) 1642 break; 1643 1644 /* 1645 * decrement npages by number of NULL anon slots we skipped 1646 */ 1647 npages -= index - old_idx; 1648 if (npages <= 0) 1649 break; 1650 1651 anon_array_enter(amp, index, &cookie); 1652 ap = anon_get_ptr(ahp, index); 1653 ASSERT(ap != NULL); 1654 1655 /* 1656 * Get anonymous page and try to lock it SE_EXCL; 1657 * if we couldn't grab the lock we skip to next page. 1658 */ 1659 swap_xlate(ap, &vp, &off); 1660 pp = page_lookup_nowait(vp, (u_offset_t)off, SE_EXCL); 1661 if (pp == NULL) { 1662 segadvstat.MADV_FREE_miss.value.ul++; 1663 pgcnt = 1; 1664 anon_array_exit(&cookie); 1665 continue; 1666 } 1667 pgcnt = page_get_pagecnt(pp->p_szc); 1668 1669 /* 1670 * we cannot free a page which is permanently locked. 1671 * The page_struct_lock need not be acquired to examine 1672 * these fields since the page has an "exclusive" lock. 
1673 */ 1674 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1675 page_unlock(pp); 1676 segadvstat.MADV_FREE_miss.value.ul++; 1677 anon_array_exit(&cookie); 1678 continue; 1679 } 1680 1681 ahm = &anonhash_lock[AH_LOCK(vp, off)]; 1682 mutex_enter(ahm); 1683 ASSERT(ap->an_refcnt != 0); 1684 /* 1685 * skip this one if copy-on-write is not yet broken. 1686 */ 1687 if (ap->an_refcnt > 1) { 1688 mutex_exit(ahm); 1689 page_unlock(pp); 1690 segadvstat.MADV_FREE_miss.value.ul++; 1691 anon_array_exit(&cookie); 1692 continue; 1693 } 1694 1695 if (pp->p_szc == 0) { 1696 pgcnt = 1; 1697 1698 /* 1699 * free swap slot; 1700 */ 1701 if (ap->an_pvp) { 1702 swap_phys_free(ap->an_pvp, ap->an_poff, 1703 PAGESIZE); 1704 ap->an_pvp = NULL; 1705 ap->an_poff = 0; 1706 } 1707 mutex_exit(ahm); 1708 segadvstat.MADV_FREE_hit.value.ul++; 1709 1710 /* 1711 * while we are at it, unload all the translations 1712 * and attempt to free the page. 1713 */ 1714 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1715 /*LINTED: constant in conditional context */ 1716 VN_DISPOSE(pp, B_FREE, 0, kcred); 1717 anon_array_exit(&cookie); 1718 continue; 1719 } 1720 1721 pgcnt = page_get_pagecnt(pp->p_szc); 1722 if (!IS_P2ALIGNED(index, pgcnt) || npages < pgcnt) { 1723 if (!page_try_demote_pages(pp)) { 1724 mutex_exit(ahm); 1725 page_unlock(pp); 1726 segadvstat.MADV_FREE_miss.value.ul++; 1727 anon_array_exit(&cookie); 1728 continue; 1729 } else { 1730 pgcnt = 1; 1731 if (ap->an_pvp) { 1732 swap_phys_free(ap->an_pvp, 1733 ap->an_poff, PAGESIZE); 1734 ap->an_pvp = NULL; 1735 ap->an_poff = 0; 1736 } 1737 mutex_exit(ahm); 1738 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1739 /*LINTED*/ 1740 VN_DISPOSE(pp, B_FREE, 0, kcred); 1741 segadvstat.MADV_FREE_hit.value.ul++; 1742 anon_array_exit(&cookie); 1743 continue; 1744 } 1745 } 1746 mutex_exit(ahm); 1747 root_pp = pp; 1748 1749 /* 1750 * try to lock remaining pages 1751 */ 1752 for (idx = 1; idx < pgcnt; idx++) { 1753 pp++; 1754 if (!page_trylock(pp, SE_EXCL)) 1755 break; 
1756 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1757 page_unlock(pp); 1758 break; 1759 } 1760 } 1761 1762 if (idx == pgcnt) { 1763 for (i = 0; i < pgcnt; i++) { 1764 ap = anon_get_ptr(ahp, index + i); 1765 if (ap == NULL) 1766 break; 1767 swap_xlate(ap, &vp, &off); 1768 ahm = &anonhash_lock[AH_LOCK(vp, off)]; 1769 mutex_enter(ahm); 1770 ASSERT(ap->an_refcnt != 0); 1771 1772 /* 1773 * skip this one if copy-on-write 1774 * is not yet broken. 1775 */ 1776 if (ap->an_refcnt > 1) { 1777 mutex_exit(ahm); 1778 goto skiplp; 1779 } 1780 if (ap->an_pvp) { 1781 swap_phys_free(ap->an_pvp, 1782 ap->an_poff, PAGESIZE); 1783 ap->an_pvp = NULL; 1784 ap->an_poff = 0; 1785 } 1786 mutex_exit(ahm); 1787 } 1788 page_destroy_pages(root_pp); 1789 segadvstat.MADV_FREE_hit.value.ul += pgcnt; 1790 anon_array_exit(&cookie); 1791 continue; 1792 } 1793 skiplp: 1794 segadvstat.MADV_FREE_miss.value.ul += pgcnt; 1795 for (i = 0, pp = root_pp; i < idx; pp++, i++) 1796 page_unlock(pp); 1797 anon_array_exit(&cookie); 1798 } 1799 } 1800 1801 /* 1802 * Return the kept page(s) and protections back to the segment driver. 1803 */ 1804 int 1805 anon_getpage( 1806 struct anon **app, 1807 uint_t *protp, 1808 page_t *pl[], 1809 size_t plsz, 1810 struct seg *seg, 1811 caddr_t addr, 1812 enum seg_rw rw, 1813 struct cred *cred) 1814 { 1815 page_t *pp; 1816 struct anon *ap = *app; 1817 struct vnode *vp; 1818 anoff_t off; 1819 int err; 1820 kmutex_t *ahm; 1821 1822 swap_xlate(ap, &vp, &off); 1823 1824 /* 1825 * Lookup the page. If page is being paged in, 1826 * wait for it to finish as we must return a list of 1827 * pages since this routine acts like the VOP_GETPAGE 1828 * routine does. 
1829 */ 1830 if (pl != NULL && (pp = page_lookup(vp, (u_offset_t)off, SE_SHARED))) { 1831 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1832 mutex_enter(ahm); 1833 if (ap->an_refcnt == 1) 1834 *protp = PROT_ALL; 1835 else 1836 *protp = PROT_ALL & ~PROT_WRITE; 1837 mutex_exit(ahm); 1838 pl[0] = pp; 1839 pl[1] = NULL; 1840 return (0); 1841 } 1842 1843 /* 1844 * Simply treat it as a vnode fault on the anon vp. 1845 */ 1846 1847 TRACE_3(TR_FAC_VM, TR_ANON_GETPAGE, 1848 "anon_getpage:seg %x addr %x vp %x", 1849 seg, addr, vp); 1850 1851 err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, protp, pl, plsz, 1852 seg, addr, rw, cred, NULL); 1853 1854 if (err == 0 && pl != NULL) { 1855 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1856 mutex_enter(ahm); 1857 if (ap->an_refcnt != 1) 1858 *protp &= ~PROT_WRITE; /* make read-only */ 1859 mutex_exit(ahm); 1860 } 1861 return (err); 1862 } 1863 1864 /* 1865 * Creates or returns kept pages to the segment driver. returns -1 if a large 1866 * page cannot be allocated. returns -2 if some other process has allocated a 1867 * larger page. 1868 * 1869 * For cowfault it will allocate any size pages to fill the requested area to 1870 * avoid partially overwriting anon slots (i.e. sharing only some of the anon 1871 * slots within a large page with other processes). This policy greatly 1872 * simplifies large page freeing (which is only freed when all anon slot 1873 * refcnts are 0). 
 */
int
anon_map_getpages(
	struct anon_map *amp,
	ulong_t	start_idx,
	uint_t	szc,
	struct seg *seg,
	caddr_t	addr,
	uint_t prot,
	uint_t *protp,
	page_t	*ppa[],
	uint_t	*ppa_szc,
	struct vpage vpage[],
	enum seg_rw rw,
	int brkcow,
	int anypgsz,
	int pgflags,
	struct cred *cred)
{
	pgcnt_t		pgcnt;
	struct anon	*ap;
	struct vnode	*vp;
	anoff_t		off;
	page_t		*pp, *pl[2], *conpp = NULL;
	caddr_t		vaddr;
	ulong_t		pg_idx, an_idx, i;
	spgcnt_t	nreloc = 0;
	int		prealloc = 1;
	int		err, slotcreate;
	uint_t		vpprot;
	/* The request is for a smaller page size than the segment's. */
	int		upsize = (szc < seg->s_szc);

#if !defined(__i386) && !defined(__amd64)
	ASSERT(seg->s_szc != 0);
#endif
	ASSERT(szc <= seg->s_szc);
	ASSERT(ppa_szc != NULL);
	ASSERT(rw != S_CREATE);

	*protp = PROT_ALL;

	VM_STAT_ADD(anonvmstats.getpages[0]);

	/*
	 * PAGESIZE request: either return the existing anon page or
	 * create a zeroed one for an empty slot.
	 */
	if (szc == 0) {
		VM_STAT_ADD(anonvmstats.getpages[1]);
		if ((ap = anon_get_ptr(amp->ahp, start_idx)) != NULL) {
			err = anon_getpage(&ap, protp, pl, PAGESIZE, seg,
			    addr, rw, cred);
			if (err)
				return (err);
			ppa[0] = pl[0];
			if (brkcow == 0 || (*protp & PROT_WRITE)) {
				VM_STAT_ADD(anonvmstats.getpages[2]);
				/*
				 * The page found is part of a larger page;
				 * report its size (capped at the segment's)
				 * through *ppa_szc and return -2.
				 */
				if (ppa[0]->p_szc != 0 && upsize) {
					VM_STAT_ADD(anonvmstats.getpages[3]);
					*ppa_szc = MIN(ppa[0]->p_szc,
					    seg->s_szc);
					page_unlock(ppa[0]);
					return (-2);
				}
				return (0);
			}
			panic("anon_map_getpages: cowfault for szc 0");
		} else {
			VM_STAT_ADD(anonvmstats.getpages[4]);
			ppa[0] = anon_zero(seg, addr, &ap, cred);
			if (ppa[0] == NULL)
				return (ENOMEM);
			(void) anon_set_ptr(amp->ahp, start_idx, ap,
			    ANON_SLEEP);
			return (0);
		}
	}

	pgcnt = page_get_pagecnt(szc);
	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
	ASSERT(IS_P2ALIGNED(start_idx, pgcnt));

	/*
	 * First we check for the case that the requested large
	 * page or larger page already exists in the system.
	 * Actually we only check if the first constituent page
	 * exists and only preallocate if it's not found.
	 */
	ap = anon_get_ptr(amp->ahp, start_idx);
	if (ap) {
		uint_t pszc;
		swap_xlate(ap, &vp, &off);
		if (page_exists_forreal(vp, (u_offset_t)off, &pszc)) {
			if (pszc > szc && upsize) {
				*ppa_szc = MIN(pszc, seg->s_szc);
				return (-2);
			}
			if (pszc >= szc) {
				prealloc = 0;
			}
		}
	}

	VM_STAT_COND_ADD(prealloc == 0, anonvmstats.getpages[5]);
	VM_STAT_COND_ADD(prealloc != 0, anonvmstats.getpages[6]);

top:
	/*
	 * If a smaller page or no page at all was found,
	 * grab a large page off the freelist.
	 */
	if (prealloc) {
		ASSERT(conpp == NULL);
		if (page_alloc_pages(anon_vp, seg, addr, NULL, ppa,
		    szc, 0, pgflags) != 0) {
			VM_STAT_ADD(anonvmstats.getpages[7]);
			if (brkcow == 0 || szc < seg->s_szc ||
			    !anon_szcshare(amp->ahp, start_idx)) {
				/*
				 * If the refcnt's of all anon slots are <= 1
				 * they can't increase since we are holding
				 * the address space's lock. So segvn can
				 * safely decrease szc without risking to
				 * generate a cow fault for the region smaller
				 * than the segment's largest page size.
				 */
				VM_STAT_ADD(anonvmstats.getpages[8]);
				return (-1);
			}
		docow:
			/*
			 * This is a cow fault. Copy away the entire 1 large
			 * page region of this segment.
			 */
			if (szc != seg->s_szc)
				panic("anon_map_getpages: cowfault for szc %d",
				    szc);
			vaddr = addr;
			for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt;
			    pg_idx++, an_idx++, vaddr += PAGESIZE) {
				if ((ap = anon_get_ptr(amp->ahp, an_idx)) !=
				    NULL) {
					err = anon_getpage(&ap, &vpprot, pl,
					    PAGESIZE, seg, vaddr, rw, cred);
					if (err) {
						/* Unlock pages gathered so far. */
						for (i = 0; i < pg_idx; i++) {
							if ((pp = ppa[i]) !=
							    NULL)
								page_unlock(pp);
						}
						return (err);
					}
					ppa[pg_idx] = pl[0];
				} else {
					/*
					 * Since this is a cowfault we know
					 * that this address space has a
					 * parent or children which means
					 * anon_dup_fill_holes() has initialized
					 * all anon slots within a large page
					 * region that had at least one anon
					 * slot at the time of fork().
					 */
					panic("anon_map_getpages: "
					    "cowfault but anon slot is empty");
				}
			}
			VM_STAT_ADD(anonvmstats.getpages[9]);
			*protp = PROT_ALL;
			return (anon_map_privatepages(amp, start_idx, szc, seg,
			    addr, prot, ppa, vpage, anypgsz, pgflags, cred));
		}
	}

	VM_STAT_ADD(anonvmstats.getpages[10]);

	/*
	 * Walk every constituent slot of the large page, fetching (or
	 * creating) its page through swap_getconpage().
	 */
	an_idx = start_idx;
	pg_idx = 0;
	vaddr = addr;
	while (pg_idx < pgcnt) {
		slotcreate = 0;
		if ((ap = anon_get_ptr(amp->ahp, an_idx)) == NULL) {
			VM_STAT_ADD(anonvmstats.getpages[11]);
			/*
			 * For us to have decided not to preallocate
			 * would have meant that a large page
			 * was found. Which also means that all of the
			 * anon slots for that page would have been
			 * already created for us.
			 */
			if (prealloc == 0)
				panic("anon_map_getpages: prealloc = 0");

			slotcreate = 1;
			ap = anon_alloc(NULL, 0);
		}
		swap_xlate(ap, &vp, &off);

		/*
		 * Now setup our preallocated page to pass down
		 * to swap_getpage().
		 */
		if (prealloc) {
			ASSERT(ppa[pg_idx]->p_szc == szc);
			conpp = ppa[pg_idx];
		}
		ASSERT(prealloc || conpp == NULL);

		/*
		 * If we just created this anon slot then call
		 * with S_CREATE to prevent doing IO on the page.
		 * Similar to the anon_zero case.
		 */
		err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE,
		    NULL, pl, PAGESIZE, conpp, ppa_szc, &nreloc, seg, vaddr,
		    slotcreate == 1 ? S_CREATE : rw, cred);

		if (err) {
			ASSERT(err != -2 || upsize);
			VM_STAT_ADD(anonvmstats.getpages[12]);
			ASSERT(slotcreate == 0);
			goto io_err;
		}

		pp = pl[0];

		/*
		 * The page returned has a different size than requested.
		 * A larger one is reported to the caller with -2; a smaller
		 * one sends us back to "top" to preallocate a large page.
		 */
		if (pp->p_szc < szc || (pp->p_szc > szc && upsize)) {
			VM_STAT_ADD(anonvmstats.getpages[13]);
			ASSERT(slotcreate == 0);
			ASSERT(prealloc == 0);
			ASSERT(pg_idx == 0);
			if (pp->p_szc > szc) {
				ASSERT(upsize);
				*ppa_szc = MIN(pp->p_szc, seg->s_szc);
				page_unlock(pp);
				VM_STAT_ADD(anonvmstats.getpages[14]);
				return (-2);
			}
			page_unlock(pp);
			prealloc = 1;
			goto top;
		}

		/*
		 * If we decided to preallocate but VOP_GETPAGE
		 * found a page in the system that satisfies our
		 * request then free up our preallocated large page
		 * and continue looping across the existing large
		 * page via VOP_GETPAGE.
		 */
		if (prealloc && pp != ppa[pg_idx]) {
			VM_STAT_ADD(anonvmstats.getpages[15]);
			ASSERT(slotcreate == 0);
			ASSERT(pg_idx == 0);
			conpp = NULL;
			prealloc = 0;
			page_free_pages(ppa[0]);
		}

		if (prealloc && nreloc > 1) {
			/*
			 * we have relocated out of a smaller large page.
			 * skip npgs - 1 iterations and continue which will
			 * increment by one the loop indices.
			 */
			spgcnt_t npgs = nreloc;

			VM_STAT_ADD(anonvmstats.getpages[16]);

			ASSERT(pp == ppa[pg_idx]);
			ASSERT(slotcreate == 0);
			ASSERT(pg_idx + npgs <= pgcnt);
			/* Any shared slot in the run makes it read-only. */
			if ((*protp & PROT_WRITE) &&
			    anon_share(amp->ahp, an_idx, npgs)) {
				*protp &= ~PROT_WRITE;
			}
			pg_idx += npgs;
			an_idx += npgs;
			vaddr += PAGESIZE * npgs;
			continue;
		}

		VM_STAT_ADD(anonvmstats.getpages[17]);

		/*
		 * Anon_zero case.
		 */
		if (slotcreate) {
			ASSERT(prealloc);
			pagezero(pp, 0, PAGESIZE);
			CPU_STATS_ADD_K(vm, zfod, 1);
			hat_setrefmod(pp);
		}

		ASSERT(prealloc == 0 || ppa[pg_idx] == pp);
		ASSERT(prealloc != 0 || PAGE_SHARED(pp));
		ASSERT(prealloc == 0 || PAGE_EXCL(pp));

		/*
		 * Constituent pages must be physically contiguous, of equal
		 * size, and the first one aligned to the large page size.
		 */
		if (pg_idx > 0 &&
		    ((page_pptonum(pp) != page_pptonum(ppa[pg_idx - 1]) + 1) ||
		    (pp->p_szc != ppa[pg_idx - 1]->p_szc))) {
			panic("anon_map_getpages: unexpected page");
		} else if (pg_idx == 0 && (page_pptonum(pp) & (pgcnt - 1))) {
			panic("anon_map_getpages: unaligned page");
		}

		if (prealloc == 0) {
			ppa[pg_idx] = pp;
		}

		/* A shared slot makes the whole result read-only. */
		if (ap->an_refcnt > 1) {
			VM_STAT_ADD(anonvmstats.getpages[18]);
			*protp &= ~PROT_WRITE;
		}

		/*
		 * If this is a new anon slot then initialize
		 * the anon array entry.
		 */
		if (slotcreate) {
			(void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP);
		}
		pg_idx++;
		an_idx++;
		vaddr += PAGESIZE;
	}

	/*
	 * Since preallocated pages come off the freelist
	 * they are locked SE_EXCL. Simply downgrade and return.
	 */
	if (prealloc) {
		VM_STAT_ADD(anonvmstats.getpages[19]);
		conpp = NULL;
		for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
			page_downgrade(ppa[pg_idx]);
		}
	}
	ASSERT(conpp == NULL);

	/* Done unless a COW break was requested on a read-only result. */
	if (brkcow == 0 || (*protp & PROT_WRITE)) {
		VM_STAT_ADD(anonvmstats.getpages[20]);
		return (0);
	}

	if (szc < seg->s_szc)
		panic("anon_map_getpages: cowfault for szc %d", szc);

	VM_STAT_ADD(anonvmstats.getpages[21]);

	*protp = PROT_ALL;
	return (anon_map_privatepages(amp, start_idx, szc, seg, addr, prot,
	    ppa, vpage, anypgsz, pgflags, cred));
io_err:
	/*
	 * We got an IO error somewhere in our large page.
	 * If we were using a preallocated page then just demote
	 * all the constituent pages that we've succeeded with so far
	 * to PAGESIZE pages and leave them in the system
	 * unlocked.
	 */

	ASSERT(err != -2 || ((pg_idx == 0) && upsize));

	VM_STAT_COND_ADD(err > 0, anonvmstats.getpages[22]);
	VM_STAT_COND_ADD(err == -1, anonvmstats.getpages[23]);
	VM_STAT_COND_ADD(err == -2, anonvmstats.getpages[24]);

	if (prealloc) {
		conpp = NULL;
		if (pg_idx > 0) {
			VM_STAT_ADD(anonvmstats.getpages[25]);
			for (i = 0; i < pgcnt; i++) {
				pp = ppa[i];
				ASSERT(PAGE_EXCL(pp));
				ASSERT(pp->p_szc == szc);
				pp->p_szc = 0;
			}
			for (i = 0; i < pg_idx; i++) {
				ASSERT(!hat_page_is_mapped(ppa[i]));
				page_unlock(ppa[i]);
			}
			/*
			 * Now free up the remaining unused constituent
			 * pages.
			 */
			while (pg_idx < pgcnt) {
				ASSERT(!hat_page_is_mapped(ppa[pg_idx]));
				page_free(ppa[pg_idx], 0);
				pg_idx++;
			}
		} else {
			/* Nothing was used: free the large page whole. */
			VM_STAT_ADD(anonvmstats.getpages[26]);
			page_free_pages(ppa[0]);
		}
	} else {
		VM_STAT_ADD(anonvmstats.getpages[27]);
		ASSERT(err > 0);
		for (i = 0; i < pg_idx; i++)
			page_unlock(ppa[i]);
	}
	ASSERT(conpp == NULL);
	if (err != -1)
		return (err);
	/*
	 * we are here because we failed to relocate.
	 */
	ASSERT(prealloc);
	if (brkcow == 0 || szc < seg->s_szc ||
	    !anon_szcshare(amp->ahp, start_idx)) {
		VM_STAT_ADD(anonvmstats.getpages[28]);
		return (-1);
	}
	VM_STAT_ADD(anonvmstats.getpages[29]);
	goto docow;
}


/*
 * Turn a reference to an object or shared anon page
 * into a private page with a copy of the data from the
 * original page which is always locked by the caller.
 * This routine unloads the translation and unlocks the
 * original page, if it isn't being stolen, before returning
 * to the caller.
 *
 * NOTE:  The original anon slot is not freed by this routine
 *	  It must be freed by the caller while holding the
 *	  "anon_map" lock to prevent races which can occur if
 *	  a process has multiple lwps in its address space.
 */
page_t *
anon_private(
	struct anon **app,
	struct seg *seg,
	caddr_t addr,
	uint_t	prot,
	page_t *opp,
	int oppflags,
	struct cred *cred)
{
	struct anon *old = *app;
	struct anon *new;
	page_t *pp = NULL;
	struct vnode *vp;
	anoff_t off;
	page_t *anon_pl[1 + 1];
	int err;

	if (oppflags & STEAL_PAGE)
		ASSERT(PAGE_EXCL(opp));
	else
		ASSERT(PAGE_LOCKED(opp));

	CPU_STATS_ADD_K(vm, cow_fault, 1);

	/* Kernel probe */
	TNF_PROBE_1(anon_private, "vm pagefault", /* CSTYLED */,
	    tnf_opaque, address, addr);

	/* Allocate the new slot and hand it to the caller through *app. */
	*app = new = anon_alloc(NULL, 0);
	swap_xlate(new, &vp, &off);

	/*
	 * STEAL_PAGE: no copy is made.  The original page is renamed to
	 * the new slot's <vp, off>, marked modified and returned.
	 */
	if (oppflags & STEAL_PAGE) {
		page_rename(opp, vp, (u_offset_t)off);
		pp = opp;
		TRACE_5(TR_FAC_VM, TR_ANON_PRIVATE,
		    "anon_private:seg %p addr %x pp %p vp %p off %lx",
		    seg, addr, pp, vp, off);
		hat_setmod(pp);

		/* bug 4026339 */
		page_downgrade(pp);
		return (pp);
	}

	/*
	 * Call the VOP_GETPAGE routine to create the page, thereby
	 * enabling the vnode driver to allocate any filesystem
	 * space (e.g., disk block allocation for UFS).  This also
	 * prevents more than one page from being added to the
	 * vnode at the same time.
	 */
	err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, NULL,
	    anon_pl, PAGESIZE, seg, addr, S_CREATE, cred, NULL);
	if (err)
		goto out;

	pp = anon_pl[0];

	/*
	 * If the original page was locked, we need to move the lock
	 * to the new page by transferring 'cowcnt/lckcnt' of the original
	 * page to 'cowcnt/lckcnt' of the new page.
	 *
	 * See Statement at the beginning of segvn_lockop() and
	 * comments in page_pp_useclaim() regarding the way
	 * cowcnts/lckcnts are handled.
	 *
	 * Also availrmem must be decremented up front for read only mapping
	 * before calling page_pp_useclaim. page_pp_useclaim will bump it back
	 * if availrmem did not need to be decremented after all.
	 */
	if (oppflags & LOCK_PAGE) {
		if ((prot & PROT_WRITE) == 0) {
			mutex_enter(&freemem_lock);
			if (availrmem > pages_pp_maximum) {
				availrmem--;
				pages_useclaim++;
			} else {
				/* No memory left to lock: fail the request. */
				mutex_exit(&freemem_lock);
				goto out;
			}
			mutex_exit(&freemem_lock);
		}
		page_pp_useclaim(opp, pp, prot & PROT_WRITE);
	}

	/*
	 * Now copy the contents from the original page,
	 * which is locked and loaded in the MMU by
	 * the caller to prevent yet another page fault.
	 */
	/* XXX - should set mod bit in here */
	if (ppcopy(opp, pp) == 0) {
		/*
		 * Before ppcopy could handle UE or other faults, we
		 * would have panicked here, and still have no option
		 * but to do so now.
		 */
		panic("anon_private, ppcopy failed, opp = 0x%p, pp = 0x%p",
		    opp, pp);
	}

	hat_setrefmod(pp);		/* mark as modified */

	/*
	 * Unload the old translation.
	 */
	hat_unload(seg->s_as->a_hat, addr, PAGESIZE, HAT_UNLOAD);

	/*
	 * Free unmapped, unmodified original page.
	 * or release the lock on the original page,
	 * otherwise the process will sleep forever in
	 * anon_decref() waiting for the "exclusive" lock
	 * on the page.
	 */
	(void) page_release(opp, 1);

	/*
	 * we are done with page creation so downgrade the new
	 * page's selock to shared, this helps when multiple
	 * as_fault(...SOFTLOCK...) are done to the same
	 * page(aio)
	 */
	page_downgrade(pp);

	/*
	 * NOTE:  The original anon slot must be freed by the
	 * caller while holding the "anon_map" lock, if we
	 * copied away from an anonymous page.
	 */
	return (pp);

out:
	/*
	 * Failure: restore the caller's slot pointer, unlock the new page
	 * if one was created, drop the new slot and unlock the original
	 * page.
	 */
	*app = old;
	if (pp)
		page_unlock(pp);
	anon_decref(new);
	page_unlock(opp);
	return ((page_t *)NULL);
}

int
anon_map_privatepages(
	struct anon_map *amp,
	ulong_t	start_idx,
	uint_t	szc,
	struct seg *seg,
	caddr_t addr,
	uint_t	prot,
	page_t	*ppa[],
	struct vpage vpage[],
	int anypgsz,
	int pgflags,
	struct cred *cred)
{
	pgcnt_t		pgcnt;
	struct vnode	*vp;
	anoff_t		off;
	page_t		*pl[2], *conpp = NULL;
	int		err;
	int		prealloc = 1;
	struct anon	*ap, *oldap;
	caddr_t		vaddr;
	page_t		*pplist, *pp;
	ulong_t		pg_idx, an_idx;
	spgcnt_t	nreloc = 0;
	int		pagelock = 0;
	kmutex_t	*ahmpages = NULL;
#ifdef DEBUG
	int		refcnt;
#endif

	ASSERT(szc != 0);
	ASSERT(szc == seg->s_szc);

	VM_STAT_ADD(anonvmstats.privatepages[0]);

	pgcnt = page_get_pagecnt(szc);
	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
	ASSERT(IS_P2ALIGNED(start_idx, pgcnt));

	ASSERT(amp != NULL);
	ap = anon_get_ptr(amp->ahp, start_idx);
	ASSERT(ap == NULL || ap->an_refcnt >= 1);

	VM_STAT_COND_ADD(ap == NULL, anonvmstats.privatepages[1]);

	/*
	 * Now try and allocate the large page. If we fail then just
	 * let VOP_GETPAGE give us PAGESIZE pages. Normally we let
	 * the caller make this decision but to avoid added complexity
	 * it's simplier to handle that case here.
	 */
	if (anypgsz == -1) {
		VM_STAT_ADD(anonvmstats.privatepages[2]);
		prealloc = 0;
	} else if (page_alloc_pages(anon_vp, seg, addr, &pplist, NULL, szc,
	    anypgsz, pgflags) != 0) {
		VM_STAT_ADD(anonvmstats.privatepages[3]);
		prealloc = 0;
	}

	/*
	 * make the decrement of all refcnts of all
	 * anon slots of a large page appear atomic by
	 * getting an anonpages_hash_lock for the
	 * first anon slot of a large page.
2510 */ 2511 if (ap != NULL) { 2512 ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, 2513 ap->an_off)]; 2514 mutex_enter(ahmpages); 2515 if (ap->an_refcnt == 1) { 2516 VM_STAT_ADD(anonvmstats.privatepages[4]); 2517 ASSERT(!anon_share(amp->ahp, start_idx, pgcnt)); 2518 mutex_exit(ahmpages); 2519 2520 if (prealloc) { 2521 page_free_replacement_page(pplist); 2522 page_create_putback(pgcnt); 2523 } 2524 ASSERT(ppa[0]->p_szc <= szc); 2525 if (ppa[0]->p_szc == szc) { 2526 VM_STAT_ADD(anonvmstats.privatepages[5]); 2527 return (0); 2528 } 2529 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { 2530 ASSERT(ppa[pg_idx] != NULL); 2531 page_unlock(ppa[pg_idx]); 2532 } 2533 return (-1); 2534 } 2535 } 2536 2537 /* 2538 * If we are passed in the vpage array and this is 2539 * not PROT_WRITE then we need to decrement availrmem 2540 * up front before we try anything. If we need to and 2541 * can't decrement availrmem then its better to fail now 2542 * than in the middle of processing the new large page. 2543 * page_pp_usclaim() on behalf of each constituent page 2544 * below will adjust availrmem back for the cases not needed. 
2545 */ 2546 if (vpage != NULL && (prot & PROT_WRITE) == 0) { 2547 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { 2548 if (VPP_ISPPLOCK(&vpage[pg_idx])) { 2549 pagelock = 1; 2550 break; 2551 } 2552 } 2553 if (pagelock) { 2554 VM_STAT_ADD(anonvmstats.privatepages[6]); 2555 mutex_enter(&freemem_lock); 2556 if (availrmem >= pages_pp_maximum + pgcnt) { 2557 availrmem -= pgcnt; 2558 pages_useclaim += pgcnt; 2559 } else { 2560 VM_STAT_ADD(anonvmstats.privatepages[7]); 2561 mutex_exit(&freemem_lock); 2562 if (ahmpages != NULL) { 2563 mutex_exit(ahmpages); 2564 } 2565 if (prealloc) { 2566 page_free_replacement_page(pplist); 2567 page_create_putback(pgcnt); 2568 } 2569 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) 2570 if (ppa[pg_idx] != NULL) 2571 page_unlock(ppa[pg_idx]); 2572 return (ENOMEM); 2573 } 2574 mutex_exit(&freemem_lock); 2575 } 2576 } 2577 2578 CPU_STATS_ADD_K(vm, cow_fault, pgcnt); 2579 2580 VM_STAT_ADD(anonvmstats.privatepages[8]); 2581 2582 an_idx = start_idx; 2583 pg_idx = 0; 2584 vaddr = addr; 2585 for (; pg_idx < pgcnt; pg_idx++, an_idx++, vaddr += PAGESIZE) { 2586 ASSERT(ppa[pg_idx] != NULL); 2587 oldap = anon_get_ptr(amp->ahp, an_idx); 2588 ASSERT(ahmpages != NULL || oldap == NULL); 2589 ASSERT(ahmpages == NULL || oldap != NULL); 2590 ASSERT(ahmpages == NULL || oldap->an_refcnt > 1); 2591 ASSERT(ahmpages == NULL || pg_idx != 0 || 2592 (refcnt = oldap->an_refcnt)); 2593 ASSERT(ahmpages == NULL || pg_idx == 0 || 2594 refcnt == oldap->an_refcnt); 2595 2596 ap = anon_alloc(NULL, 0); 2597 2598 swap_xlate(ap, &vp, &off); 2599 2600 /* 2601 * Now setup our preallocated page to pass down to 2602 * swap_getpage(). 2603 */ 2604 if (prealloc) { 2605 pp = pplist; 2606 page_sub(&pplist, pp); 2607 conpp = pp; 2608 } 2609 2610 err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE, NULL, pl, 2611 PAGESIZE, conpp, NULL, &nreloc, seg, vaddr, 2612 S_CREATE, cred); 2613 2614 /* 2615 * Impossible to fail this is S_CREATE. 
2616 */ 2617 if (err) 2618 panic("anon_map_privatepages: VOP_GETPAGE failed"); 2619 2620 ASSERT(prealloc ? pp == pl[0] : pl[0]->p_szc == 0); 2621 ASSERT(prealloc == 0 || nreloc == 1); 2622 2623 pp = pl[0]; 2624 2625 /* 2626 * If the original page was locked, we need to move 2627 * the lock to the new page by transfering 2628 * 'cowcnt/lckcnt' of the original page to 'cowcnt/lckcnt' 2629 * of the new page. pg_idx can be used to index 2630 * into the vpage array since the caller will guarentee 2631 * that vpage struct passed in corresponds to addr 2632 * and forward. 2633 */ 2634 if (vpage != NULL && VPP_ISPPLOCK(&vpage[pg_idx])) { 2635 page_pp_useclaim(ppa[pg_idx], pp, prot & PROT_WRITE); 2636 } else if (pagelock) { 2637 mutex_enter(&freemem_lock); 2638 availrmem++; 2639 pages_useclaim--; 2640 mutex_exit(&freemem_lock); 2641 } 2642 2643 /* 2644 * Now copy the contents from the original page. 2645 */ 2646 if (ppcopy(ppa[pg_idx], pp) == 0) { 2647 /* 2648 * Before ppcopy could hanlde UE or other faults, we 2649 * would have panicked here, and still have no option 2650 * but to do so now. 2651 */ 2652 panic("anon_map_privatepages, ppcopy failed"); 2653 } 2654 2655 hat_setrefmod(pp); /* mark as modified */ 2656 2657 /* 2658 * Release the lock on the original page, 2659 * derement the old slot, and down grade the lock 2660 * on the new copy. 2661 */ 2662 page_unlock(ppa[pg_idx]); 2663 2664 if (!prealloc) 2665 page_downgrade(pp); 2666 2667 ppa[pg_idx] = pp; 2668 2669 /* 2670 * Now reflect the copy in the new anon array. 2671 */ 2672 ASSERT(ahmpages == NULL || oldap->an_refcnt > 1); 2673 if (oldap != NULL) 2674 anon_decref(oldap); 2675 (void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP); 2676 } 2677 2678 /* 2679 * Unload the old large page translation. 
2680 */ 2681 hat_unload(seg->s_as->a_hat, addr, pgcnt << PAGESHIFT, HAT_UNLOAD); 2682 2683 if (ahmpages != NULL) { 2684 mutex_exit(ahmpages); 2685 } 2686 ASSERT(prealloc == 0 || pplist == NULL); 2687 if (prealloc) { 2688 VM_STAT_ADD(anonvmstats.privatepages[9]); 2689 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { 2690 page_downgrade(ppa[pg_idx]); 2691 } 2692 } 2693 2694 return (0); 2695 } 2696 2697 /* 2698 * Allocate a private zero-filled anon page. 2699 */ 2700 page_t * 2701 anon_zero(struct seg *seg, caddr_t addr, struct anon **app, struct cred *cred) 2702 { 2703 struct anon *ap; 2704 page_t *pp; 2705 struct vnode *vp; 2706 anoff_t off; 2707 page_t *anon_pl[1 + 1]; 2708 int err; 2709 2710 /* Kernel probe */ 2711 TNF_PROBE_1(anon_zero, "vm pagefault", /* CSTYLED */, 2712 tnf_opaque, address, addr); 2713 2714 *app = ap = anon_alloc(NULL, 0); 2715 swap_xlate(ap, &vp, &off); 2716 2717 /* 2718 * Call the VOP_GETPAGE routine to create the page, thereby 2719 * enabling the vnode driver to allocate any filesystem 2720 * dependent structures (e.g., disk block allocation for UFS). 2721 * This also prevents more than on page from being added to 2722 * the vnode at the same time since it is locked. 2723 */ 2724 err = VOP_GETPAGE(vp, off, PAGESIZE, NULL, 2725 anon_pl, PAGESIZE, seg, addr, S_CREATE, cred, NULL); 2726 if (err) { 2727 *app = NULL; 2728 anon_decref(ap); 2729 return (NULL); 2730 } 2731 pp = anon_pl[0]; 2732 2733 pagezero(pp, 0, PAGESIZE); /* XXX - should set mod bit */ 2734 page_downgrade(pp); 2735 CPU_STATS_ADD_K(vm, zfod, 1); 2736 hat_setrefmod(pp); /* mark as modified so pageout writes back */ 2737 return (pp); 2738 } 2739 2740 2741 /* 2742 * Allocate array of private zero-filled anon pages for empty slots 2743 * and kept pages for non empty slots within given range. 2744 * 2745 * NOTE: This rontine will try and use large pages 2746 * if available and supported by underlying platform. 
2747 */ 2748 int 2749 anon_map_createpages( 2750 struct anon_map *amp, 2751 ulong_t start_index, 2752 size_t len, 2753 page_t *ppa[], 2754 struct seg *seg, 2755 caddr_t addr, 2756 enum seg_rw rw, 2757 struct cred *cred) 2758 { 2759 2760 struct anon *ap; 2761 struct vnode *ap_vp; 2762 page_t *pp, *pplist, *anon_pl[1 + 1], *conpp = NULL; 2763 int err = 0; 2764 ulong_t p_index, index; 2765 pgcnt_t npgs, pg_cnt; 2766 spgcnt_t nreloc = 0; 2767 uint_t l_szc, szc, prot; 2768 anoff_t ap_off; 2769 size_t pgsz; 2770 lgrp_t *lgrp; 2771 kmutex_t *ahm; 2772 2773 /* 2774 * XXX For now only handle S_CREATE. 2775 */ 2776 ASSERT(rw == S_CREATE); 2777 2778 index = start_index; 2779 p_index = 0; 2780 npgs = btopr(len); 2781 2782 /* 2783 * If this platform supports multiple page sizes 2784 * then try and allocate directly from the free 2785 * list for pages larger than PAGESIZE. 2786 * 2787 * NOTE:When we have page_create_ru we can stop 2788 * directly allocating from the freelist. 2789 */ 2790 l_szc = seg->s_szc; 2791 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 2792 while (npgs) { 2793 2794 /* 2795 * if anon slot already exists 2796 * (means page has been created) 2797 * so 1) look up the page 2798 * 2) if the page is still in memory, get it. 2799 * 3) if not, create a page and 2800 * page in from physical swap device. 2801 * These are done in anon_getpage(). 2802 */ 2803 ap = anon_get_ptr(amp->ahp, index); 2804 if (ap) { 2805 err = anon_getpage(&ap, &prot, anon_pl, PAGESIZE, 2806 seg, addr, S_READ, cred); 2807 if (err) { 2808 ANON_LOCK_EXIT(&->a_rwlock); 2809 panic("anon_map_createpages: anon_getpage"); 2810 } 2811 pp = anon_pl[0]; 2812 ppa[p_index++] = pp; 2813 2814 /* 2815 * an_pvp can become non-NULL after SysV's page was 2816 * paged out before ISM was attached to this SysV 2817 * shared memory segment. So free swap slot if needed. 
2818 */ 2819 if (ap->an_pvp != NULL) { 2820 page_io_lock(pp); 2821 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, 2822 ap->an_off)]; 2823 mutex_enter(ahm); 2824 if (ap->an_pvp != NULL) { 2825 swap_phys_free(ap->an_pvp, 2826 ap->an_poff, PAGESIZE); 2827 ap->an_pvp = NULL; 2828 ap->an_poff = 0; 2829 mutex_exit(ahm); 2830 hat_setmod(pp); 2831 } else { 2832 mutex_exit(ahm); 2833 } 2834 page_io_unlock(pp); 2835 } 2836 2837 addr += PAGESIZE; 2838 index++; 2839 npgs--; 2840 continue; 2841 } 2842 /* 2843 * Now try and allocate the largest page possible 2844 * for the current address and range. 2845 * Keep dropping down in page size until: 2846 * 2847 * 1) Properly aligned 2848 * 2) Does not overlap existing anon pages 2849 * 3) Fits in remaining range. 2850 * 4) able to allocate one. 2851 * 2852 * NOTE: XXX When page_create_ru is completed this code 2853 * will change. 2854 */ 2855 szc = l_szc; 2856 pplist = NULL; 2857 pg_cnt = 0; 2858 while (szc) { 2859 pgsz = page_get_pagesize(szc); 2860 pg_cnt = pgsz >> PAGESHIFT; 2861 if (IS_P2ALIGNED(addr, pgsz) && pg_cnt <= npgs && 2862 anon_pages(amp->ahp, index, pg_cnt) == 0) { 2863 /* 2864 * XXX 2865 * Since we are faking page_create() 2866 * we also need to do the freemem and 2867 * pcf accounting. 2868 */ 2869 (void) page_create_wait(pg_cnt, PG_WAIT); 2870 2871 /* 2872 * Get lgroup to allocate next page of shared 2873 * memory from and use it to specify where to 2874 * allocate the physical memory 2875 */ 2876 lgrp = lgrp_mem_choose(seg, addr, pgsz); 2877 2878 pplist = page_get_freelist( 2879 anon_vp, (u_offset_t)0, seg, 2880 addr, pgsz, 0, lgrp); 2881 2882 if (pplist == NULL) { 2883 page_create_putback(pg_cnt); 2884 } 2885 2886 /* 2887 * If a request for a page of size 2888 * larger than PAGESIZE failed 2889 * then don't try that size anymore. 
2890 */ 2891 if (pplist == NULL) { 2892 l_szc = szc - 1; 2893 } else { 2894 break; 2895 } 2896 } 2897 szc--; 2898 } 2899 2900 /* 2901 * If just using PAGESIZE pages then don't 2902 * directly allocate from the free list. 2903 */ 2904 if (pplist == NULL) { 2905 ASSERT(szc == 0); 2906 pp = anon_zero(seg, addr, &ap, cred); 2907 if (pp == NULL) { 2908 ANON_LOCK_EXIT(&->a_rwlock); 2909 panic("anon_map_createpages: anon_zero"); 2910 } 2911 ppa[p_index++] = pp; 2912 2913 ASSERT(anon_get_ptr(amp->ahp, index) == NULL); 2914 (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP); 2915 2916 addr += PAGESIZE; 2917 index++; 2918 npgs--; 2919 continue; 2920 } 2921 2922 /* 2923 * pplist is a list of pg_cnt PAGESIZE pages. 2924 * These pages are locked SE_EXCL since they 2925 * came directly off the free list. 2926 */ 2927 ASSERT(IS_P2ALIGNED(pg_cnt, pg_cnt)); 2928 ASSERT(IS_P2ALIGNED(index, pg_cnt)); 2929 ASSERT(conpp == NULL); 2930 while (pg_cnt--) { 2931 2932 ap = anon_alloc(NULL, 0); 2933 swap_xlate(ap, &ap_vp, &ap_off); 2934 2935 ASSERT(pplist != NULL); 2936 pp = pplist; 2937 page_sub(&pplist, pp); 2938 PP_CLRFREE(pp); 2939 PP_CLRAGED(pp); 2940 conpp = pp; 2941 2942 err = swap_getconpage(ap_vp, ap_off, PAGESIZE, 2943 (uint_t *)NULL, anon_pl, PAGESIZE, conpp, NULL, 2944 &nreloc, seg, addr, S_CREATE, cred); 2945 2946 if (err) { 2947 ANON_LOCK_EXIT(&->a_rwlock); 2948 panic("anon_map_createpages: S_CREATE"); 2949 } 2950 2951 ASSERT(anon_pl[0] == pp); 2952 ASSERT(nreloc == 1); 2953 pagezero(pp, 0, PAGESIZE); 2954 CPU_STATS_ADD_K(vm, zfod, 1); 2955 hat_setrefmod(pp); 2956 2957 ASSERT(anon_get_ptr(amp->ahp, index) == NULL); 2958 (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP); 2959 2960 ppa[p_index++] = pp; 2961 2962 addr += PAGESIZE; 2963 index++; 2964 npgs--; 2965 } 2966 conpp = NULL; 2967 pg_cnt = pgsz >> PAGESHIFT; 2968 p_index = p_index - pg_cnt; 2969 while (pg_cnt--) { 2970 page_downgrade(ppa[p_index++]); 2971 } 2972 } 2973 ANON_LOCK_EXIT(&->a_rwlock); 2974 return (0); 2975 
} 2976 2977 static int 2978 anon_try_demote_pages( 2979 struct anon_hdr *ahp, 2980 ulong_t sidx, 2981 uint_t szc, 2982 page_t **ppa, 2983 int private) 2984 { 2985 struct anon *ap; 2986 pgcnt_t pgcnt = page_get_pagecnt(szc); 2987 page_t *pp; 2988 pgcnt_t i; 2989 kmutex_t *ahmpages = NULL; 2990 int root = 0; 2991 pgcnt_t npgs; 2992 pgcnt_t curnpgs = 0; 2993 size_t ppasize = 0; 2994 2995 ASSERT(szc != 0); 2996 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 2997 ASSERT(IS_P2ALIGNED(sidx, pgcnt)); 2998 ASSERT(sidx < ahp->size); 2999 3000 if (ppa == NULL) { 3001 ppasize = pgcnt * sizeof (page_t *); 3002 ppa = kmem_alloc(ppasize, KM_SLEEP); 3003 } 3004 3005 ap = anon_get_ptr(ahp, sidx); 3006 if (ap != NULL && private) { 3007 VM_STAT_ADD(anonvmstats.demotepages[1]); 3008 ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 3009 mutex_enter(ahmpages); 3010 } 3011 3012 if (ap != NULL && ap->an_refcnt > 1) { 3013 if (ahmpages != NULL) { 3014 VM_STAT_ADD(anonvmstats.demotepages[2]); 3015 mutex_exit(ahmpages); 3016 } 3017 if (ppasize != 0) { 3018 kmem_free(ppa, ppasize); 3019 } 3020 return (0); 3021 } 3022 if (ahmpages != NULL) { 3023 mutex_exit(ahmpages); 3024 } 3025 if (ahp->size - sidx < pgcnt) { 3026 ASSERT(private == 0); 3027 pgcnt = ahp->size - sidx; 3028 } 3029 for (i = 0; i < pgcnt; i++, sidx++) { 3030 ap = anon_get_ptr(ahp, sidx); 3031 if (ap != NULL) { 3032 if (ap->an_refcnt != 1) { 3033 panic("anon_try_demote_pages: an_refcnt != 1"); 3034 } 3035 pp = ppa[i] = page_lookup(ap->an_vp, ap->an_off, 3036 SE_EXCL); 3037 if (pp != NULL) { 3038 (void) hat_pageunload(pp, 3039 HAT_FORCE_PGUNLOAD); 3040 } 3041 } else { 3042 ppa[i] = NULL; 3043 } 3044 } 3045 for (i = 0; i < pgcnt; i++) { 3046 if ((pp = ppa[i]) != NULL && pp->p_szc != 0) { 3047 ASSERT(pp->p_szc <= szc); 3048 if (!root) { 3049 VM_STAT_ADD(anonvmstats.demotepages[3]); 3050 if (curnpgs != 0) 3051 panic("anon_try_demote_pages: " 3052 "bad large page"); 3053 3054 root = 1; 3055 curnpgs = npgs = 3056 
page_get_pagecnt(pp->p_szc); 3057 3058 ASSERT(npgs <= pgcnt); 3059 ASSERT(IS_P2ALIGNED(npgs, npgs)); 3060 ASSERT(!(page_pptonum(pp) & (npgs - 1))); 3061 } else { 3062 ASSERT(i > 0); 3063 ASSERT(page_pptonum(pp) - 1 == 3064 page_pptonum(ppa[i - 1])); 3065 if ((page_pptonum(pp) & (npgs - 1)) == 3066 npgs - 1) 3067 root = 0; 3068 } 3069 ASSERT(PAGE_EXCL(pp)); 3070 pp->p_szc = 0; 3071 ASSERT(curnpgs > 0); 3072 curnpgs--; 3073 } 3074 } 3075 if (root != 0 || curnpgs != 0) 3076 panic("anon_try_demote_pages: bad large page"); 3077 3078 for (i = 0; i < pgcnt; i++) { 3079 if ((pp = ppa[i]) != NULL) { 3080 ASSERT(!hat_page_is_mapped(pp)); 3081 ASSERT(pp->p_szc == 0); 3082 page_unlock(pp); 3083 } 3084 } 3085 if (ppasize != 0) { 3086 kmem_free(ppa, ppasize); 3087 } 3088 return (1); 3089 } 3090 3091 /* 3092 * anon_map_demotepages() can only be called by MAP_PRIVATE segments. 3093 */ 3094 int 3095 anon_map_demotepages( 3096 struct anon_map *amp, 3097 ulong_t start_idx, 3098 struct seg *seg, 3099 caddr_t addr, 3100 uint_t prot, 3101 struct vpage vpage[], 3102 struct cred *cred) 3103 { 3104 struct anon *ap; 3105 uint_t szc = seg->s_szc; 3106 pgcnt_t pgcnt = page_get_pagecnt(szc); 3107 size_t ppasize = pgcnt * sizeof (page_t *); 3108 page_t **ppa = kmem_alloc(ppasize, KM_SLEEP); 3109 page_t *pp; 3110 page_t *pl[2]; 3111 pgcnt_t i, pg_idx; 3112 ulong_t an_idx; 3113 caddr_t vaddr; 3114 int err; 3115 int retry = 0; 3116 uint_t vpprot; 3117 3118 ASSERT(RW_WRITE_HELD(&->a_rwlock)); 3119 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 3120 ASSERT(IS_P2ALIGNED(start_idx, pgcnt)); 3121 ASSERT(ppa != NULL); 3122 ASSERT(szc != 0); 3123 ASSERT(szc == amp->a_szc); 3124 3125 VM_STAT_ADD(anonvmstats.demotepages[0]); 3126 3127 top: 3128 if (anon_try_demote_pages(amp->ahp, start_idx, szc, ppa, 1)) { 3129 kmem_free(ppa, ppasize); 3130 return (0); 3131 } 3132 3133 VM_STAT_ADD(anonvmstats.demotepages[4]); 3134 3135 ASSERT(retry == 0); /* we can be here only once */ 3136 3137 vaddr = addr; 3138 for (pg_idx = 0, 
an_idx = start_idx; pg_idx < pgcnt; 3139 pg_idx++, an_idx++, vaddr += PAGESIZE) { 3140 ap = anon_get_ptr(amp->ahp, an_idx); 3141 if (ap == NULL) 3142 panic("anon_map_demotepages: no anon slot"); 3143 err = anon_getpage(&ap, &vpprot, pl, PAGESIZE, seg, vaddr, 3144 S_READ, cred); 3145 if (err) { 3146 for (i = 0; i < pg_idx; i++) { 3147 if ((pp = ppa[i]) != NULL) 3148 page_unlock(pp); 3149 } 3150 kmem_free(ppa, ppasize); 3151 return (err); 3152 } 3153 ppa[pg_idx] = pl[0]; 3154 } 3155 3156 err = anon_map_privatepages(amp, start_idx, szc, seg, addr, prot, ppa, 3157 vpage, -1, 0, cred); 3158 if (err > 0) { 3159 VM_STAT_ADD(anonvmstats.demotepages[5]); 3160 kmem_free(ppa, ppasize); 3161 return (err); 3162 } 3163 ASSERT(err == 0 || err == -1); 3164 if (err == -1) { 3165 VM_STAT_ADD(anonvmstats.demotepages[6]); 3166 retry = 1; 3167 goto top; 3168 } 3169 for (i = 0; i < pgcnt; i++) { 3170 ASSERT(ppa[i] != NULL); 3171 if (ppa[i]->p_szc != 0) 3172 retry = 1; 3173 page_unlock(ppa[i]); 3174 } 3175 if (retry) { 3176 VM_STAT_ADD(anonvmstats.demotepages[7]); 3177 goto top; 3178 } 3179 3180 VM_STAT_ADD(anonvmstats.demotepages[8]); 3181 3182 kmem_free(ppa, ppasize); 3183 3184 return (0); 3185 } 3186 3187 /* 3188 * Free pages of shared anon map. It's assumed that anon maps don't share anon 3189 * structures with private anon maps. Therefore all anon structures should 3190 * have at most one reference at this point. This means underlying pages can 3191 * be exclusively locked and demoted or freed. If not freeing the entire 3192 * large pages demote the ends of the region we free to be able to free 3193 * subpages. Page roots correspond to aligned index positions in anon map. 
3194 */ 3195 void 3196 anon_shmap_free_pages(struct anon_map *amp, ulong_t sidx, size_t len) 3197 { 3198 ulong_t eidx = sidx + btopr(len); 3199 pgcnt_t pages = page_get_pagecnt(amp->a_szc); 3200 struct anon_hdr *ahp = amp->ahp; 3201 ulong_t tidx; 3202 size_t size; 3203 ulong_t sidx_aligned; 3204 ulong_t eidx_aligned; 3205 3206 ASSERT(ANON_WRITE_HELD(&->a_rwlock)); 3207 ASSERT(amp->refcnt <= 1); 3208 ASSERT(amp->a_szc > 0); 3209 ASSERT(eidx <= ahp->size); 3210 ASSERT(!anon_share(ahp, sidx, btopr(len))); 3211 3212 if (len == 0) { /* XXX */ 3213 return; 3214 } 3215 3216 sidx_aligned = P2ALIGN(sidx, pages); 3217 if (sidx_aligned != sidx || 3218 (eidx < sidx_aligned + pages && eidx < ahp->size)) { 3219 if (!anon_try_demote_pages(ahp, sidx_aligned, 3220 amp->a_szc, NULL, 0)) { 3221 panic("anon_shmap_free_pages: demote failed"); 3222 } 3223 size = (eidx <= sidx_aligned + pages) ? (eidx - sidx) : 3224 P2NPHASE(sidx, pages); 3225 size <<= PAGESHIFT; 3226 anon_free(ahp, sidx, size); 3227 sidx = sidx_aligned + pages; 3228 if (eidx <= sidx) { 3229 return; 3230 } 3231 } 3232 eidx_aligned = P2ALIGN(eidx, pages); 3233 if (sidx < eidx_aligned) { 3234 anon_free_pages(ahp, sidx, 3235 (eidx_aligned - sidx) << PAGESHIFT, 3236 amp->a_szc); 3237 sidx = eidx_aligned; 3238 } 3239 ASSERT(sidx == eidx_aligned); 3240 if (eidx == eidx_aligned) { 3241 return; 3242 } 3243 tidx = eidx; 3244 if (eidx != ahp->size && anon_get_next_ptr(ahp, &tidx) != NULL && 3245 tidx - sidx < pages) { 3246 if (!anon_try_demote_pages(ahp, sidx, amp->a_szc, NULL, 0)) { 3247 panic("anon_shmap_free_pages: demote failed"); 3248 } 3249 size = (eidx - sidx) << PAGESHIFT; 3250 anon_free(ahp, sidx, size); 3251 } else { 3252 anon_free_pages(ahp, sidx, pages << PAGESHIFT, amp->a_szc); 3253 } 3254 } 3255 3256 /* 3257 * This routine should be called with amp's writer lock when there're no other 3258 * users of amp. All pcache entries of this amp must have been already 3259 * inactivated. 
We must not drop a_rwlock here to prevent new users from 3260 * attaching to this amp. 3261 */ 3262 void 3263 anonmap_purge(struct anon_map *amp) 3264 { 3265 ASSERT(ANON_WRITE_HELD(&->a_rwlock)); 3266 ASSERT(amp->refcnt <= 1); 3267 3268 if (amp->a_softlockcnt != 0) { 3269 seg_ppurge(NULL, amp, 0); 3270 } 3271 3272 /* 3273 * Since all pcache entries were already inactive before this routine 3274 * was called seg_ppurge() couldn't return while there're still 3275 * entries that can be found via the list anchored at a_phead. So we 3276 * can assert this list is empty now. a_softlockcnt may be still non 0 3277 * if asynchronous thread that manages pcache already removed pcache 3278 * entries but hasn't unlocked the pages yet. If a_softlockcnt is non 3279 * 0 we just wait on a_purgecv for shamp_reclaim() to finish. Even if 3280 * a_softlockcnt is 0 we grab a_purgemtx to avoid freeing anon map 3281 * before shamp_reclaim() is done with it. a_purgemtx also taken by 3282 * shamp_reclaim() while a_softlockcnt was still not 0 acts as a 3283 * barrier that prevents anonmap_purge() to complete while 3284 * shamp_reclaim() may still be referencing this amp. 3285 */ 3286 ASSERT(amp->a_phead.p_lnext == &->a_phead); 3287 ASSERT(amp->a_phead.p_lprev == &->a_phead); 3288 3289 mutex_enter(&->a_purgemtx); 3290 while (amp->a_softlockcnt != 0) { 3291 ASSERT(amp->a_phead.p_lnext == &->a_phead); 3292 ASSERT(amp->a_phead.p_lprev == &->a_phead); 3293 amp->a_purgewait = 1; 3294 cv_wait(&->a_purgecv, &->a_purgemtx); 3295 } 3296 mutex_exit(&->a_purgemtx); 3297 3298 ASSERT(amp->a_phead.p_lnext == &->a_phead); 3299 ASSERT(amp->a_phead.p_lprev == &->a_phead); 3300 ASSERT(amp->a_softlockcnt == 0); 3301 } 3302 3303 /* 3304 * Allocate and initialize an anon_map structure for seg 3305 * associating the given swap reservation with the new anon_map. 
3306 */ 3307 struct anon_map * 3308 anonmap_alloc(size_t size, size_t swresv, int flags) 3309 { 3310 struct anon_map *amp; 3311 int kmflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 3312 3313 amp = kmem_cache_alloc(anonmap_cache, kmflags); 3314 if (amp == NULL) { 3315 ASSERT(kmflags == KM_NOSLEEP); 3316 return (NULL); 3317 } 3318 3319 amp->ahp = anon_create(btopr(size), flags); 3320 if (amp->ahp == NULL) { 3321 ASSERT(flags == ANON_NOSLEEP); 3322 kmem_cache_free(anonmap_cache, amp); 3323 return (NULL); 3324 } 3325 amp->refcnt = 1; 3326 amp->size = size; 3327 amp->swresv = swresv; 3328 amp->locality = 0; 3329 amp->a_szc = 0; 3330 amp->a_sp = NULL; 3331 amp->a_softlockcnt = 0; 3332 amp->a_purgewait = 0; 3333 amp->a_phead.p_lnext = &->a_phead; 3334 amp->a_phead.p_lprev = &->a_phead; 3335 3336 return (amp); 3337 } 3338 3339 void 3340 anonmap_free(struct anon_map *amp) 3341 { 3342 ASSERT(amp->ahp != NULL); 3343 ASSERT(amp->refcnt == 0); 3344 ASSERT(amp->a_softlockcnt == 0); 3345 ASSERT(amp->a_phead.p_lnext == &->a_phead); 3346 ASSERT(amp->a_phead.p_lprev == &->a_phead); 3347 3348 lgrp_shm_policy_fini(amp, NULL); 3349 anon_release(amp->ahp, btopr(amp->size)); 3350 kmem_cache_free(anonmap_cache, amp); 3351 } 3352 3353 /* 3354 * Returns true if the app array has some empty slots. 3355 * The offp and lenp parameters are in/out parameters. On entry 3356 * these values represent the starting offset and length of the 3357 * mapping. When true is returned, these values may be modified 3358 * to be the largest range which includes empty slots. 
3359 */ 3360 int 3361 non_anon(struct anon_hdr *ahp, ulong_t anon_idx, u_offset_t *offp, 3362 size_t *lenp) 3363 { 3364 ulong_t i, el; 3365 ssize_t low, high; 3366 struct anon *ap; 3367 3368 low = -1; 3369 for (i = 0, el = *lenp; i < el; i += PAGESIZE, anon_idx++) { 3370 ap = anon_get_ptr(ahp, anon_idx); 3371 if (ap == NULL) { 3372 if (low == -1) 3373 low = i; 3374 high = i; 3375 } 3376 } 3377 if (low != -1) { 3378 /* 3379 * Found at least one non-anon page. 3380 * Set up the off and len return values. 3381 */ 3382 if (low != 0) 3383 *offp += low; 3384 *lenp = high - low + PAGESIZE; 3385 return (1); 3386 } 3387 return (0); 3388 } 3389 3390 /* 3391 * Return a count of the number of existing anon pages in the anon array 3392 * app in the range (off, off+len). The array and slots must be guaranteed 3393 * stable by the caller. 3394 */ 3395 pgcnt_t 3396 anon_pages(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots) 3397 { 3398 pgcnt_t cnt = 0; 3399 3400 while (nslots-- > 0) { 3401 if ((anon_get_ptr(ahp, anon_index)) != NULL) 3402 cnt++; 3403 anon_index++; 3404 } 3405 return (cnt); 3406 } 3407 3408 /* 3409 * Move reserved phys swap into memory swap (unreserve phys swap 3410 * and reserve mem swap by the same amount). 
3411 * Used by segspt when it needs to lock reserved swap npages in memory 3412 */ 3413 int 3414 anon_swap_adjust(pgcnt_t npages) 3415 { 3416 pgcnt_t unlocked_mem_swap; 3417 3418 mutex_enter(&anoninfo_lock); 3419 3420 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 3421 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 3422 3423 unlocked_mem_swap = k_anoninfo.ani_mem_resv 3424 - k_anoninfo.ani_locked_swap; 3425 if (npages > unlocked_mem_swap) { 3426 spgcnt_t adjusted_swap = npages - unlocked_mem_swap; 3427 3428 /* 3429 * if there is not enough unlocked mem swap we take missing 3430 * amount from phys swap and give it to mem swap 3431 */ 3432 if (!page_reclaim_mem(adjusted_swap, segspt_minfree, 1)) { 3433 mutex_exit(&anoninfo_lock); 3434 return (ENOMEM); 3435 } 3436 3437 k_anoninfo.ani_mem_resv += adjusted_swap; 3438 ASSERT(k_anoninfo.ani_phys_resv >= adjusted_swap); 3439 k_anoninfo.ani_phys_resv -= adjusted_swap; 3440 3441 ANI_ADD(adjusted_swap); 3442 } 3443 k_anoninfo.ani_locked_swap += npages; 3444 3445 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 3446 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 3447 3448 mutex_exit(&anoninfo_lock); 3449 3450 return (0); 3451 } 3452 3453 /* 3454 * 'unlocked' reserved mem swap so when it is unreserved it 3455 * can be moved back phys (disk) swap 3456 */ 3457 void 3458 anon_swap_restore(pgcnt_t npages) 3459 { 3460 mutex_enter(&anoninfo_lock); 3461 3462 ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv); 3463 3464 ASSERT(k_anoninfo.ani_locked_swap >= npages); 3465 k_anoninfo.ani_locked_swap -= npages; 3466 3467 ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv); 3468 3469 mutex_exit(&anoninfo_lock); 3470 } 3471 3472 /* 3473 * Return the pointer from the list for a 3474 * specified anon index. 
3475 */ 3476 ulong_t * 3477 anon_get_slot(struct anon_hdr *ahp, ulong_t an_idx) 3478 { 3479 struct anon **app; 3480 void **ppp; 3481 3482 ASSERT(an_idx < ahp->size); 3483 3484 /* 3485 * Single level case. 3486 */ 3487 if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 3488 return ((ulong_t *)&ahp->array_chunk[an_idx]); 3489 } else { 3490 3491 /* 3492 * 2 level case. 3493 */ 3494 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 3495 if (*ppp == NULL) { 3496 mutex_enter(&ahp->serial_lock); 3497 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 3498 if (*ppp == NULL) 3499 *ppp = kmem_zalloc(PAGESIZE, KM_SLEEP); 3500 mutex_exit(&ahp->serial_lock); 3501 } 3502 app = *ppp; 3503 return ((ulong_t *)&app[an_idx & ANON_CHUNK_OFF]); 3504 } 3505 } 3506 3507 void 3508 anon_array_enter(struct anon_map *amp, ulong_t an_idx, anon_sync_obj_t *sobj) 3509 { 3510 ulong_t *ap_slot; 3511 kmutex_t *mtx; 3512 kcondvar_t *cv; 3513 int hash; 3514 3515 /* 3516 * Use szc to determine anon slot(s) to appear atomic. 3517 * If szc = 0, then lock the anon slot and mark it busy. 3518 * If szc > 0, then lock the range of slots by getting the 3519 * anon_array_lock for the first anon slot, and mark only the 3520 * first anon slot busy to represent whole range being busy. 3521 */ 3522 3523 ASSERT(RW_READ_HELD(&->a_rwlock)); 3524 an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc)); 3525 hash = ANON_ARRAY_HASH(amp, an_idx); 3526 sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex; 3527 sobj->sync_cv = cv = &anon_array_cv[hash]; 3528 mutex_enter(mtx); 3529 ap_slot = anon_get_slot(amp->ahp, an_idx); 3530 while (ANON_ISBUSY(ap_slot)) 3531 cv_wait(cv, mtx); 3532 ANON_SETBUSY(ap_slot); 3533 sobj->sync_data = ap_slot; 3534 mutex_exit(mtx); 3535 } 3536 3537 int 3538 anon_array_try_enter(struct anon_map *amp, ulong_t an_idx, 3539 anon_sync_obj_t *sobj) 3540 { 3541 ulong_t *ap_slot; 3542 kmutex_t *mtx; 3543 int hash; 3544 3545 /* 3546 * Try to lock a range of anon slots. 
3547 * Use szc to determine anon slot(s) to appear atomic. 3548 * If szc = 0, then lock the anon slot and mark it busy. 3549 * If szc > 0, then lock the range of slots by getting the 3550 * anon_array_lock for the first anon slot, and mark only the 3551 * first anon slot busy to represent whole range being busy. 3552 * Fail if the mutex or the anon_array are busy. 3553 */ 3554 3555 ASSERT(RW_READ_HELD(&->a_rwlock)); 3556 an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc)); 3557 hash = ANON_ARRAY_HASH(amp, an_idx); 3558 sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex; 3559 sobj->sync_cv = &anon_array_cv[hash]; 3560 if (!mutex_tryenter(mtx)) { 3561 return (EWOULDBLOCK); 3562 } 3563 ap_slot = anon_get_slot(amp->ahp, an_idx); 3564 if (ANON_ISBUSY(ap_slot)) { 3565 mutex_exit(mtx); 3566 return (EWOULDBLOCK); 3567 } 3568 ANON_SETBUSY(ap_slot); 3569 sobj->sync_data = ap_slot; 3570 mutex_exit(mtx); 3571 return (0); 3572 } 3573 3574 void 3575 anon_array_exit(anon_sync_obj_t *sobj) 3576 { 3577 mutex_enter(sobj->sync_mutex); 3578 ASSERT(ANON_ISBUSY(sobj->sync_data)); 3579 ANON_CLRBUSY(sobj->sync_data); 3580 if (CV_HAS_WAITERS(sobj->sync_cv)) 3581 cv_broadcast(sobj->sync_cv); 3582 mutex_exit(sobj->sync_mutex); 3583 } 3584