1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 37 */ 38 39 /* 40 * VM - anonymous pages. 41 * 42 * This layer sits immediately above the vm_swap layer. It manages 43 * physical pages that have no permanent identity in the file system 44 * name space, using the services of the vm_swap layer to allocate 45 * backing storage for these pages. Since these pages have no external 46 * identity, they are discarded when the last reference is removed. 
*
 * An important function of this layer is to manage low-level sharing
 * of pages that are logically distinct but that happen to be
 * physically identical (e.g., the corresponding pages of the processes
 * resulting from a fork before one process or the other changes their
 * contents).  This pseudo-sharing is present only as an optimization
 * and is not to be confused with true sharing in which multiple
 * address spaces deliberately contain references to the same object;
 * such sharing is managed at a higher level.
 *
 * The key data structure here is the anon struct, which contains a
 * reference count for its associated physical page and a hint about
 * the identity of that page.  Anon structs typically live in arrays,
 * with an instance's position in its array determining where the
 * corresponding backing storage is allocated; however, the swap_xlate()
 * routine abstracts away this representation information so that the
 * rest of the anon layer need not know it.  (See the swap layer for
 * more details on anon struct layout.)
 *
 * In future versions of the system, the association between an
 * anon struct and its position on backing store will change so that
 * we don't require backing store for all anonymous pages in the system.
 * This is an important consideration for large memory systems.
 * We can also use this technique to delay binding physical locations
 * to anonymous pages until pageout/swapout time where we can make
 * smarter allocation decisions to improve anonymous klustering.
 *
 * Many of the routines defined here take a (struct anon **) argument,
 * which allows the code at this level to manage anon pages directly,
 * so that callers can regard anon structs as opaque objects and not be
 * concerned with assigning or inspecting their contents.
 *
 * Clients of this layer refer to anon pages indirectly.
That is, they 80 * maintain arrays of pointers to anon structs rather than maintaining 81 * anon structs themselves. The (struct anon **) arguments mentioned 82 * above are pointers to entries in these arrays. It is these arrays 83 * that capture the mapping between offsets within a given segment and 84 * the corresponding anonymous backing storage address. 85 */ 86 87 #ifdef DEBUG 88 #define ANON_DEBUG 89 #endif 90 91 #include <sys/types.h> 92 #include <sys/t_lock.h> 93 #include <sys/param.h> 94 #include <sys/systm.h> 95 #include <sys/mman.h> 96 #include <sys/cred.h> 97 #include <sys/thread.h> 98 #include <sys/vnode.h> 99 #include <sys/cpuvar.h> 100 #include <sys/swap.h> 101 #include <sys/cmn_err.h> 102 #include <sys/vtrace.h> 103 #include <sys/kmem.h> 104 #include <sys/sysmacros.h> 105 #include <sys/bitmap.h> 106 #include <sys/vmsystm.h> 107 #include <sys/tuneable.h> 108 #include <sys/debug.h> 109 #include <sys/fs/swapnode.h> 110 #include <sys/tnf_probe.h> 111 #include <sys/lgrp.h> 112 #include <sys/policy.h> 113 #include <sys/condvar_impl.h> 114 #include <sys/mutex_impl.h> 115 #include <sys/rctl.h> 116 117 #include <vm/as.h> 118 #include <vm/hat.h> 119 #include <vm/anon.h> 120 #include <vm/page.h> 121 #include <vm/vpage.h> 122 #include <vm/seg.h> 123 #include <vm/rm.h> 124 125 #include <fs/fs_subr.h> 126 127 struct vnode *anon_vp; 128 129 int anon_debug; 130 131 kmutex_t anoninfo_lock; 132 struct k_anoninfo k_anoninfo; 133 ani_free_t ani_free_pool[ANI_MAX_POOL]; 134 pad_mutex_t anon_array_lock[ANON_LOCKSIZE]; 135 kcondvar_t anon_array_cv[ANON_LOCKSIZE]; 136 137 /* 138 * Global hash table for (vp, off) -> anon slot 139 */ 140 extern int swap_maxcontig; 141 size_t anon_hash_size; 142 struct anon **anon_hash; 143 144 static struct kmem_cache *anon_cache; 145 static struct kmem_cache *anonmap_cache; 146 147 #ifdef VM_STATS 148 static struct anonvmstats_str { 149 ulong_t getpages[30]; 150 ulong_t privatepages[10]; 151 ulong_t demotepages[9]; 152 ulong_t 
decrefpages[9]; 153 ulong_t dupfillholes[4]; 154 ulong_t freepages[1]; 155 } anonvmstats; 156 #endif /* VM_STATS */ 157 158 /*ARGSUSED*/ 159 static int 160 anonmap_cache_constructor(void *buf, void *cdrarg, int kmflags) 161 { 162 struct anon_map *amp = buf; 163 164 rw_init(&->a_rwlock, NULL, RW_DEFAULT, NULL); 165 cv_init(&->a_purgecv, NULL, CV_DEFAULT, NULL); 166 mutex_init(&->a_pmtx, NULL, MUTEX_DEFAULT, NULL); 167 mutex_init(&->a_purgemtx, NULL, MUTEX_DEFAULT, NULL); 168 return (0); 169 } 170 171 /*ARGSUSED1*/ 172 static void 173 anonmap_cache_destructor(void *buf, void *cdrarg) 174 { 175 struct anon_map *amp = buf; 176 177 rw_destroy(&->a_rwlock); 178 cv_destroy(&->a_purgecv); 179 mutex_destroy(&->a_pmtx); 180 mutex_destroy(&->a_purgemtx); 181 } 182 183 kmutex_t anonhash_lock[AH_LOCK_SIZE]; 184 kmutex_t anonpages_hash_lock[AH_LOCK_SIZE]; 185 186 void 187 anon_init(void) 188 { 189 int i; 190 191 anon_hash_size = 1L << highbit(physmem / ANON_HASHAVELEN); 192 193 for (i = 0; i < AH_LOCK_SIZE; i++) { 194 mutex_init(&anonhash_lock[i], NULL, MUTEX_DEFAULT, NULL); 195 mutex_init(&anonpages_hash_lock[i], NULL, MUTEX_DEFAULT, NULL); 196 } 197 198 for (i = 0; i < ANON_LOCKSIZE; i++) { 199 mutex_init(&anon_array_lock[i].pad_mutex, NULL, 200 MUTEX_DEFAULT, NULL); 201 cv_init(&anon_array_cv[i], NULL, CV_DEFAULT, NULL); 202 } 203 204 anon_hash = (struct anon **) 205 kmem_zalloc(sizeof (struct anon *) * anon_hash_size, KM_SLEEP); 206 anon_cache = kmem_cache_create("anon_cache", sizeof (struct anon), 207 AN_CACHE_ALIGN, NULL, NULL, NULL, NULL, NULL, 0); 208 anonmap_cache = kmem_cache_create("anonmap_cache", 209 sizeof (struct anon_map), 0, 210 anonmap_cache_constructor, anonmap_cache_destructor, NULL, 211 NULL, NULL, 0); 212 swap_maxcontig = (1024 * 1024) >> PAGESHIFT; /* 1MB of pages */ 213 214 anon_vp = vn_alloc(KM_SLEEP); 215 vn_setops(anon_vp, swap_vnodeops); 216 anon_vp->v_type = VREG; 217 anon_vp->v_flag |= (VISSWAP|VISSWAPFS); 218 } 219 220 /* 221 * Global anon slot 
hash table manipulation. 222 */ 223 224 static void 225 anon_addhash(struct anon *ap) 226 { 227 int index; 228 229 ASSERT(MUTEX_HELD(&anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)])); 230 index = ANON_HASH(ap->an_vp, ap->an_off); 231 ap->an_hash = anon_hash[index]; 232 anon_hash[index] = ap; 233 } 234 235 static void 236 anon_rmhash(struct anon *ap) 237 { 238 struct anon **app; 239 240 ASSERT(MUTEX_HELD(&anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)])); 241 242 for (app = &anon_hash[ANON_HASH(ap->an_vp, ap->an_off)]; 243 *app; app = &((*app)->an_hash)) { 244 if (*app == ap) { 245 *app = ap->an_hash; 246 break; 247 } 248 } 249 } 250 251 /* 252 * The anon array interfaces. Functions allocating, 253 * freeing array of pointers, and returning/setting 254 * entries in the array of pointers for a given offset. 255 * 256 * Create the list of pointers 257 */ 258 struct anon_hdr * 259 anon_create(pgcnt_t npages, int flags) 260 { 261 struct anon_hdr *ahp; 262 ulong_t nchunks; 263 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 264 265 if ((ahp = kmem_zalloc(sizeof (struct anon_hdr), kmemflags)) == NULL) { 266 return (NULL); 267 } 268 269 mutex_init(&ahp->serial_lock, NULL, MUTEX_DEFAULT, NULL); 270 /* 271 * Single level case. 272 */ 273 ahp->size = npages; 274 if (npages <= ANON_CHUNK_SIZE || (flags & ANON_ALLOC_FORCE)) { 275 276 if (flags & ANON_ALLOC_FORCE) 277 ahp->flags |= ANON_ALLOC_FORCE; 278 279 ahp->array_chunk = kmem_zalloc( 280 ahp->size * sizeof (struct anon *), kmemflags); 281 282 if (ahp->array_chunk == NULL) { 283 kmem_free(ahp, sizeof (struct anon_hdr)); 284 return (NULL); 285 } 286 } else { 287 /* 288 * 2 Level case. 289 * anon hdr size needs to be rounded off to be a multiple 290 * of ANON_CHUNK_SIZE. This is important as various anon 291 * related functions depend on this. 292 * NOTE - 293 * anon_grow() makes anon hdr size a multiple of 294 * ANON_CHUNK_SIZE. 295 * amp size is <= anon hdr size. 296 * anon_index + seg_pgs <= anon hdr size. 
297 */ 298 ahp->size = P2ROUNDUP(npages, ANON_CHUNK_SIZE); 299 nchunks = ahp->size >> ANON_CHUNK_SHIFT; 300 301 ahp->array_chunk = kmem_zalloc(nchunks * sizeof (ulong_t *), 302 kmemflags); 303 304 if (ahp->array_chunk == NULL) { 305 kmem_free(ahp, sizeof (struct anon_hdr)); 306 return (NULL); 307 } 308 } 309 return (ahp); 310 } 311 312 /* 313 * Free the array of pointers 314 */ 315 void 316 anon_release(struct anon_hdr *ahp, pgcnt_t npages) 317 { 318 ulong_t i; 319 void **ppp; 320 ulong_t nchunks; 321 322 ASSERT(npages <= ahp->size); 323 324 /* 325 * Single level case. 326 */ 327 if (npages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 328 kmem_free(ahp->array_chunk, ahp->size * sizeof (struct anon *)); 329 } else { 330 /* 331 * 2 level case. 332 */ 333 nchunks = ahp->size >> ANON_CHUNK_SHIFT; 334 for (i = 0; i < nchunks; i++) { 335 ppp = &ahp->array_chunk[i]; 336 if (*ppp != NULL) 337 kmem_free(*ppp, PAGESIZE); 338 } 339 kmem_free(ahp->array_chunk, nchunks * sizeof (ulong_t *)); 340 } 341 mutex_destroy(&ahp->serial_lock); 342 kmem_free(ahp, sizeof (struct anon_hdr)); 343 } 344 345 /* 346 * Return the pointer from the list for a 347 * specified anon index. 348 */ 349 struct anon * 350 anon_get_ptr(struct anon_hdr *ahp, ulong_t an_idx) 351 { 352 struct anon **app; 353 354 ASSERT(an_idx < ahp->size); 355 356 /* 357 * Single level case. 358 */ 359 if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 360 return ((struct anon *) 361 ((uintptr_t)ahp->array_chunk[an_idx] & ANON_PTRMASK)); 362 } else { 363 364 /* 365 * 2 level case. 366 */ 367 app = ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 368 if (app) { 369 return ((struct anon *) 370 ((uintptr_t)app[an_idx & ANON_CHUNK_OFF] & 371 ANON_PTRMASK)); 372 } else { 373 return (NULL); 374 } 375 } 376 } 377 378 /* 379 * Return the anon pointer for the first valid entry in the anon list, 380 * starting from the given index. 
381 */ 382 struct anon * 383 anon_get_next_ptr(struct anon_hdr *ahp, ulong_t *index) 384 { 385 struct anon *ap; 386 struct anon **app; 387 ulong_t chunkoff; 388 ulong_t i; 389 ulong_t j; 390 pgcnt_t size; 391 392 i = *index; 393 size = ahp->size; 394 395 ASSERT(i < size); 396 397 if ((size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 398 /* 399 * 1 level case 400 */ 401 while (i < size) { 402 ap = (struct anon *) 403 ((uintptr_t)ahp->array_chunk[i] & ANON_PTRMASK); 404 if (ap) { 405 *index = i; 406 return (ap); 407 } 408 i++; 409 } 410 } else { 411 /* 412 * 2 level case 413 */ 414 chunkoff = i & ANON_CHUNK_OFF; 415 while (i < size) { 416 app = ahp->array_chunk[i >> ANON_CHUNK_SHIFT]; 417 if (app) 418 for (j = chunkoff; j < ANON_CHUNK_SIZE; j++) { 419 ap = (struct anon *) 420 ((uintptr_t)app[j] & ANON_PTRMASK); 421 if (ap) { 422 *index = i + (j - chunkoff); 423 return (ap); 424 } 425 } 426 chunkoff = 0; 427 i = (i + ANON_CHUNK_SIZE) & ~ANON_CHUNK_OFF; 428 } 429 } 430 *index = size; 431 return (NULL); 432 } 433 434 /* 435 * Set list entry with a given pointer for a specified offset 436 */ 437 int 438 anon_set_ptr(struct anon_hdr *ahp, ulong_t an_idx, struct anon *ap, int flags) 439 { 440 void **ppp; 441 struct anon **app; 442 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 443 uintptr_t *ap_addr; 444 445 ASSERT(an_idx < ahp->size); 446 447 /* 448 * Single level case. 449 */ 450 if (ahp->size <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 451 ap_addr = (uintptr_t *)&ahp->array_chunk[an_idx]; 452 } else { 453 454 /* 455 * 2 level case. 
456 */ 457 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 458 459 ASSERT(ppp != NULL); 460 if (*ppp == NULL) { 461 mutex_enter(&ahp->serial_lock); 462 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 463 if (*ppp == NULL) { 464 *ppp = kmem_zalloc(PAGESIZE, kmemflags); 465 if (*ppp == NULL) { 466 mutex_exit(&ahp->serial_lock); 467 return (ENOMEM); 468 } 469 } 470 mutex_exit(&ahp->serial_lock); 471 } 472 app = *ppp; 473 ap_addr = (uintptr_t *)&app[an_idx & ANON_CHUNK_OFF]; 474 } 475 *ap_addr = (*ap_addr & ~ANON_PTRMASK) | (uintptr_t)ap; 476 return (0); 477 } 478 479 /* 480 * Copy anon array into a given new anon array 481 */ 482 int 483 anon_copy_ptr(struct anon_hdr *sahp, ulong_t s_idx, 484 struct anon_hdr *dahp, ulong_t d_idx, 485 pgcnt_t npages, int flags) 486 { 487 void **sapp, **dapp; 488 void *ap; 489 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 490 491 ASSERT((s_idx < sahp->size) && (d_idx < dahp->size)); 492 ASSERT((npages <= sahp->size) && (npages <= dahp->size)); 493 494 /* 495 * Both arrays are 1 level. 496 */ 497 if (((sahp->size <= ANON_CHUNK_SIZE) && 498 (dahp->size <= ANON_CHUNK_SIZE)) || 499 ((sahp->flags & ANON_ALLOC_FORCE) && 500 (dahp->flags & ANON_ALLOC_FORCE))) { 501 502 bcopy(&sahp->array_chunk[s_idx], &dahp->array_chunk[d_idx], 503 npages * sizeof (struct anon *)); 504 return (0); 505 } 506 507 /* 508 * Both arrays are 2 levels. 
509 */ 510 if (sahp->size > ANON_CHUNK_SIZE && 511 dahp->size > ANON_CHUNK_SIZE && 512 ((sahp->flags & ANON_ALLOC_FORCE) == 0) && 513 ((dahp->flags & ANON_ALLOC_FORCE) == 0)) { 514 515 ulong_t sapidx, dapidx; 516 ulong_t *sap, *dap; 517 ulong_t chknp; 518 519 while (npages != 0) { 520 521 sapidx = s_idx & ANON_CHUNK_OFF; 522 dapidx = d_idx & ANON_CHUNK_OFF; 523 chknp = ANON_CHUNK_SIZE - MAX(sapidx, dapidx); 524 if (chknp > npages) 525 chknp = npages; 526 527 sapp = &sahp->array_chunk[s_idx >> ANON_CHUNK_SHIFT]; 528 if ((sap = *sapp) != NULL) { 529 dapp = &dahp->array_chunk[d_idx 530 >> ANON_CHUNK_SHIFT]; 531 if ((dap = *dapp) == NULL) { 532 *dapp = kmem_zalloc(PAGESIZE, 533 kmemflags); 534 if ((dap = *dapp) == NULL) 535 return (ENOMEM); 536 } 537 bcopy((sap + sapidx), (dap + dapidx), 538 chknp << ANON_PTRSHIFT); 539 } 540 s_idx += chknp; 541 d_idx += chknp; 542 npages -= chknp; 543 } 544 return (0); 545 } 546 547 /* 548 * At least one of the arrays is 2 level. 549 */ 550 while (npages--) { 551 if ((ap = anon_get_ptr(sahp, s_idx)) != NULL) { 552 ASSERT(!ANON_ISBUSY(anon_get_slot(sahp, s_idx))); 553 if (anon_set_ptr(dahp, d_idx, ap, flags) == ENOMEM) 554 return (ENOMEM); 555 } 556 s_idx++; 557 d_idx++; 558 } 559 return (0); 560 } 561 562 563 /* 564 * ANON_INITBUF is a convenience macro for anon_grow() below. It 565 * takes a buffer dst, which is at least as large as buffer src. It 566 * does a bcopy from src into dst, and then bzeros the extra bytes 567 * of dst. If tail is set, the data in src is tail aligned within 568 * dst instead of head aligned. 
569 */ 570 571 #define ANON_INITBUF(src, srclen, dst, dstsize, tail) \ 572 if (tail) { \ 573 bzero((dst), (dstsize) - (srclen)); \ 574 bcopy((src), (char *)(dst) + (dstsize) - (srclen), (srclen)); \ 575 } else { \ 576 bcopy((src), (dst), (srclen)); \ 577 bzero((char *)(dst) + (srclen), (dstsize) - (srclen)); \ 578 } 579 580 #define ANON_1_LEVEL_INC (ANON_CHUNK_SIZE / 8) 581 #define ANON_2_LEVEL_INC (ANON_1_LEVEL_INC * ANON_CHUNK_SIZE) 582 583 /* 584 * anon_grow() is used to efficiently extend an existing anon array. 585 * startidx_p points to the index into the anon array of the first page 586 * that is in use. oldseg_pgs is the number of pages in use, starting at 587 * *startidx_p. newpages is the number of additional pages desired. 588 * 589 * If startidx_p == NULL, startidx is taken to be 0 and cannot be changed. 590 * 591 * The growth is done by creating a new top level of the anon array, 592 * and (if the array is 2-level) reusing the existing second level arrays. 593 * 594 * flags can be used to specify ANON_NOSLEEP and ANON_GROWDOWN. 595 * 596 * Returns the new number of pages in the anon array. 597 */ 598 pgcnt_t 599 anon_grow(struct anon_hdr *ahp, ulong_t *startidx_p, pgcnt_t oldseg_pgs, 600 pgcnt_t newseg_pgs, int flags) 601 { 602 ulong_t startidx = startidx_p ? *startidx_p : 0; 603 pgcnt_t oldamp_pgs = ahp->size, newamp_pgs; 604 pgcnt_t oelems, nelems, totpages; 605 void **level1; 606 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 607 int growdown = (flags & ANON_GROWDOWN); 608 size_t newarrsz, oldarrsz; 609 void *level2; 610 611 ASSERT(!(startidx_p == NULL && growdown)); 612 ASSERT(startidx + oldseg_pgs <= ahp->size); 613 614 /* 615 * Determine the total number of pages needed in the new 616 * anon array. If growing down, totpages is all pages from 617 * startidx through the end of the array, plus <newseg_pgs> 618 * pages. If growing up, keep all pages from page 0 through 619 * the last page currently in use, plus <newseg_pgs> pages. 
620 */ 621 if (growdown) 622 totpages = oldamp_pgs - startidx + newseg_pgs; 623 else 624 totpages = startidx + oldseg_pgs + newseg_pgs; 625 626 /* If the array is already large enough, just return. */ 627 628 if (oldamp_pgs >= totpages) { 629 if (growdown) 630 *startidx_p = oldamp_pgs - totpages; 631 return (oldamp_pgs); 632 } 633 634 /* 635 * oldamp_pgs/newamp_pgs are the total numbers of pages represented 636 * by the corresponding arrays. 637 * oelems/nelems are the number of pointers in the top level arrays 638 * which may be either level 1 or level 2. 639 * Will the new anon array be one level or two levels? 640 */ 641 if (totpages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 642 newamp_pgs = P2ROUNDUP(totpages, ANON_1_LEVEL_INC); 643 oelems = oldamp_pgs; 644 nelems = newamp_pgs; 645 } else { 646 newamp_pgs = P2ROUNDUP(totpages, ANON_2_LEVEL_INC); 647 oelems = (oldamp_pgs + ANON_CHUNK_OFF) >> ANON_CHUNK_SHIFT; 648 nelems = newamp_pgs >> ANON_CHUNK_SHIFT; 649 } 650 651 newarrsz = nelems * sizeof (void *); 652 level1 = kmem_alloc(newarrsz, kmemflags); 653 if (level1 == NULL) 654 return (0); 655 656 /* Are we converting from a one level to a two level anon array? */ 657 658 if (newamp_pgs > ANON_CHUNK_SIZE && oldamp_pgs <= ANON_CHUNK_SIZE && 659 !(ahp->flags & ANON_ALLOC_FORCE)) { 660 661 /* 662 * Yes, we're converting to a two level. Reuse old level 1 663 * as new level 2 if it is exactly PAGESIZE. Otherwise 664 * alloc a new level 2 and copy the old level 1 data into it. 
665 */ 666 if (oldamp_pgs == ANON_CHUNK_SIZE) { 667 level2 = (void *)ahp->array_chunk; 668 } else { 669 level2 = kmem_alloc(PAGESIZE, kmemflags); 670 if (level2 == NULL) { 671 kmem_free(level1, newarrsz); 672 return (0); 673 } 674 oldarrsz = oldamp_pgs * sizeof (void *); 675 676 ANON_INITBUF(ahp->array_chunk, oldarrsz, 677 level2, PAGESIZE, growdown); 678 kmem_free(ahp->array_chunk, oldarrsz); 679 } 680 bzero(level1, newarrsz); 681 if (growdown) 682 level1[nelems - 1] = level2; 683 else 684 level1[0] = level2; 685 } else { 686 oldarrsz = oelems * sizeof (void *); 687 688 ANON_INITBUF(ahp->array_chunk, oldarrsz, 689 level1, newarrsz, growdown); 690 kmem_free(ahp->array_chunk, oldarrsz); 691 } 692 693 ahp->array_chunk = level1; 694 ahp->size = newamp_pgs; 695 if (growdown) 696 *startidx_p = newamp_pgs - totpages; 697 698 return (newamp_pgs); 699 } 700 701 702 /* 703 * Called from clock handler to sync ani_free value. 704 */ 705 706 void 707 set_anoninfo(void) 708 { 709 int ix; 710 pgcnt_t total = 0; 711 712 for (ix = 0; ix < ANI_MAX_POOL; ix++) { 713 total += ani_free_pool[ix].ani_count; 714 } 715 k_anoninfo.ani_free = total; 716 } 717 718 /* 719 * Reserve anon space. 720 * 721 * It's no longer simply a matter of incrementing ani_resv to 722 * reserve swap space, we need to check memory-based as well 723 * as disk-backed (physical) swap. The following algorithm 724 * is used: 725 * Check the space on physical swap 726 * i.e. amount needed < ani_max - ani_phys_resv 727 * If we are swapping on swapfs check 728 * amount needed < (availrmem - swapfs_minfree) 729 * Since the algorithm to check for the quantity of swap space is 730 * almost the same as that for reserving it, we'll just use anon_resvmem 731 * with a flag to decrement availrmem. 732 * 733 * Return non-zero on success. 
734 */ 735 int 736 anon_resvmem(size_t size, boolean_t takemem, zone_t *zone, int tryhard) 737 { 738 pgcnt_t npages = btopr(size); 739 pgcnt_t mswap_pages = 0; 740 pgcnt_t pswap_pages = 0; 741 proc_t *p = curproc; 742 743 if (zone != NULL && takemem) { 744 /* test zone.max-swap resource control */ 745 mutex_enter(&p->p_lock); 746 if (rctl_incr_swap(p, zone, ptob(npages)) != 0) { 747 mutex_exit(&p->p_lock); 748 return (0); 749 } 750 mutex_exit(&p->p_lock); 751 } 752 mutex_enter(&anoninfo_lock); 753 754 /* 755 * pswap_pages is the number of pages we can take from 756 * physical (i.e. disk-backed) swap. 757 */ 758 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 759 pswap_pages = k_anoninfo.ani_max - k_anoninfo.ani_phys_resv; 760 761 ANON_PRINT(A_RESV, 762 ("anon_resvmem: npages %lu takemem %u pswap %lu caller %p\n", 763 npages, takemem, pswap_pages, (void *)caller())); 764 765 if (npages <= pswap_pages) { 766 /* 767 * we have enough space on a physical swap 768 */ 769 if (takemem) 770 k_anoninfo.ani_phys_resv += npages; 771 mutex_exit(&anoninfo_lock); 772 return (1); 773 } else if (pswap_pages != 0) { 774 /* 775 * we have some space on a physical swap 776 */ 777 if (takemem) { 778 /* 779 * use up remainder of phys swap 780 */ 781 k_anoninfo.ani_phys_resv += pswap_pages; 782 ASSERT(k_anoninfo.ani_phys_resv == k_anoninfo.ani_max); 783 } 784 } 785 /* 786 * since (npages > pswap_pages) we need mem swap 787 * mswap_pages is the number of pages needed from availrmem 788 */ 789 ASSERT(npages > pswap_pages); 790 mswap_pages = npages - pswap_pages; 791 792 ANON_PRINT(A_RESV, ("anon_resvmem: need %ld pages from memory\n", 793 mswap_pages)); 794 795 /* 796 * priv processes can reserve memory as swap as long as availrmem 797 * remains greater than swapfs_minfree; in the case of non-priv 798 * processes, memory can be reserved as swap only if availrmem 799 * doesn't fall below (swapfs_minfree + swapfs_reserve). 
Thus, 800 * swapfs_reserve amount of memswap is not available to non-priv 801 * processes. This protects daemons such as automounter dying 802 * as a result of application processes eating away almost entire 803 * membased swap. This safeguard becomes useless if apps are run 804 * with root access. 805 * 806 * swapfs_reserve is minimum of 4Mb or 1/16 of physmem. 807 * 808 */ 809 if (tryhard) { 810 mutex_exit(&anoninfo_lock); 811 (void) page_reclaim_mem(mswap_pages, 812 swapfs_minfree + swapfs_reserve, 0); 813 mutex_enter(&anoninfo_lock); 814 } 815 816 mutex_enter(&freemem_lock); 817 if (availrmem > (swapfs_minfree + swapfs_reserve + mswap_pages) || 818 (availrmem > (swapfs_minfree + mswap_pages) && 819 secpolicy_resource(CRED()) == 0)) { 820 821 if (takemem) { 822 /* 823 * Take the memory from the rest of the system. 824 */ 825 availrmem -= mswap_pages; 826 mutex_exit(&freemem_lock); 827 k_anoninfo.ani_mem_resv += mswap_pages; 828 ANI_ADD(mswap_pages); 829 ANON_PRINT((A_RESV | A_MRESV), 830 ("anon_resvmem: took %ld pages of availrmem\n", 831 mswap_pages)); 832 } else { 833 mutex_exit(&freemem_lock); 834 } 835 836 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 837 mutex_exit(&anoninfo_lock); 838 return (1); 839 840 } else { 841 /* 842 * Fail if not enough memory 843 */ 844 845 if (takemem) { 846 k_anoninfo.ani_phys_resv -= pswap_pages; 847 } 848 849 mutex_exit(&freemem_lock); 850 mutex_exit(&anoninfo_lock); 851 ANON_PRINT(A_RESV, 852 ("anon_resvmem: not enough space from swapfs\n")); 853 if (zone != NULL && takemem) 854 rctl_decr_swap(zone, ptob(npages)); 855 return (0); 856 } 857 } 858 859 /* 860 * Give back an anon reservation. 
861 */ 862 void 863 anon_unresvmem(size_t size, zone_t *zone) 864 { 865 pgcnt_t npages = btopr(size); 866 spgcnt_t mem_free_pages = 0; 867 pgcnt_t phys_free_slots; 868 #ifdef ANON_DEBUG 869 pgcnt_t mem_resv; 870 #endif 871 if (zone != NULL) 872 rctl_decr_swap(zone, ptob(npages)); 873 874 mutex_enter(&anoninfo_lock); 875 876 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 877 878 /* 879 * If some of this reservation belonged to swapfs 880 * give it back to availrmem. 881 * ani_mem_resv is the amount of availrmem swapfs has reserved. 882 * but some of that memory could be locked by segspt so we can only 883 * return non locked ani_mem_resv back to availrmem 884 */ 885 if (k_anoninfo.ani_mem_resv > k_anoninfo.ani_locked_swap) { 886 ANON_PRINT((A_RESV | A_MRESV), 887 ("anon_unresv: growing availrmem by %ld pages\n", 888 MIN(k_anoninfo.ani_mem_resv, npages))); 889 890 mem_free_pages = MIN((spgcnt_t)(k_anoninfo.ani_mem_resv - 891 k_anoninfo.ani_locked_swap), npages); 892 mutex_enter(&freemem_lock); 893 availrmem += mem_free_pages; 894 mutex_exit(&freemem_lock); 895 k_anoninfo.ani_mem_resv -= mem_free_pages; 896 897 ANI_ADD(-mem_free_pages); 898 } 899 /* 900 * The remainder of the pages is returned to phys swap 901 */ 902 ASSERT(npages >= mem_free_pages); 903 phys_free_slots = npages - mem_free_pages; 904 905 if (phys_free_slots) { 906 k_anoninfo.ani_phys_resv -= phys_free_slots; 907 } 908 909 #ifdef ANON_DEBUG 910 mem_resv = k_anoninfo.ani_mem_resv; 911 #endif 912 913 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 914 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 915 916 mutex_exit(&anoninfo_lock); 917 918 ANON_PRINT(A_RESV, ("anon_unresv: %lu, tot %lu, caller %p\n", 919 npages, mem_resv, (void *)caller())); 920 } 921 922 /* 923 * Allocate an anon slot and return it with the lock held. 
924 */ 925 struct anon * 926 anon_alloc(struct vnode *vp, anoff_t off) 927 { 928 struct anon *ap; 929 kmutex_t *ahm; 930 931 ap = kmem_cache_alloc(anon_cache, KM_SLEEP); 932 if (vp == NULL) { 933 swap_alloc(ap); 934 } else { 935 ap->an_vp = vp; 936 ap->an_off = off; 937 } 938 ap->an_refcnt = 1; 939 ap->an_pvp = NULL; 940 ap->an_poff = 0; 941 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 942 mutex_enter(ahm); 943 anon_addhash(ap); 944 mutex_exit(ahm); 945 ANI_ADD(-1); 946 ANON_PRINT(A_ANON, ("anon_alloc: returning ap %p, vp %p\n", 947 (void *)ap, (ap ? (void *)ap->an_vp : NULL))); 948 return (ap); 949 } 950 951 /* 952 * Called for pages locked in memory via softlock/pagelock/mlock to make sure 953 * such pages don't consume any physical swap resources needed for swapping 954 * unlocked pages. 955 */ 956 void 957 anon_swap_free(struct anon *ap, page_t *pp) 958 { 959 kmutex_t *ahm; 960 961 ASSERT(ap != NULL); 962 ASSERT(pp != NULL); 963 ASSERT(PAGE_LOCKED(pp)); 964 ASSERT(pp->p_vnode != NULL); 965 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 966 ASSERT(ap->an_refcnt != 0); 967 ASSERT(pp->p_vnode == ap->an_vp); 968 ASSERT(pp->p_offset == ap->an_off); 969 970 if (ap->an_pvp == NULL) 971 return; 972 973 page_io_lock(pp); 974 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 975 mutex_enter(ahm); 976 977 ASSERT(ap->an_refcnt != 0); 978 ASSERT(pp->p_vnode == ap->an_vp); 979 ASSERT(pp->p_offset == ap->an_off); 980 981 if (ap->an_pvp != NULL) { 982 swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE); 983 ap->an_pvp = NULL; 984 ap->an_poff = 0; 985 mutex_exit(ahm); 986 hat_setmod(pp); 987 } else { 988 mutex_exit(ahm); 989 } 990 page_io_unlock(pp); 991 } 992 993 /* 994 * Decrement the reference count of an anon page. 995 * If reference count goes to zero, free it and 996 * its associated page (if any). 
997 */ 998 void 999 anon_decref(struct anon *ap) 1000 { 1001 page_t *pp; 1002 struct vnode *vp; 1003 anoff_t off; 1004 kmutex_t *ahm; 1005 1006 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1007 mutex_enter(ahm); 1008 ASSERT(ap->an_refcnt != 0); 1009 if (ap->an_refcnt == 0) 1010 panic("anon_decref: slot count 0"); 1011 if (--ap->an_refcnt == 0) { 1012 swap_xlate(ap, &vp, &off); 1013 anon_rmhash(ap); 1014 if (ap->an_pvp != NULL) 1015 swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE); 1016 mutex_exit(ahm); 1017 1018 /* 1019 * If there is a page for this anon slot we will need to 1020 * call VN_DISPOSE to get rid of the vp association and 1021 * put the page back on the free list as really free. 1022 * Acquire the "exclusive" lock to ensure that any 1023 * pending i/o always completes before the swap slot 1024 * is freed. 1025 */ 1026 pp = page_lookup(vp, (u_offset_t)off, SE_EXCL); 1027 if (pp != NULL) { 1028 /*LINTED: constant in conditional context */ 1029 VN_DISPOSE(pp, B_INVAL, 0, kcred); 1030 } 1031 ANON_PRINT(A_ANON, ("anon_decref: free ap %p, vp %p\n", 1032 (void *)ap, (void *)ap->an_vp)); 1033 1034 kmem_cache_free(anon_cache, ap); 1035 1036 ANI_ADD(1); 1037 } else { 1038 mutex_exit(ahm); 1039 } 1040 } 1041 1042 1043 /* 1044 * check an_refcnt of the root anon slot (anon_index argument is aligned at 1045 * seg->s_szc level) to determine whether COW processing is required. 1046 * anonpages_hash_lock[] held on the root ap ensures that if root's 1047 * refcnt is 1 all other refcnt's are 1 as well (and they can't increase 1048 * later since this process can't fork while its AS lock is held). 1049 * 1050 * returns 1 if the root anon slot has a refcnt > 1 otherwise returns 0. 
1051 */ 1052 int 1053 anon_szcshare(struct anon_hdr *ahp, ulong_t anon_index) 1054 { 1055 struct anon *ap; 1056 kmutex_t *ahmpages = NULL; 1057 1058 ap = anon_get_ptr(ahp, anon_index); 1059 if (ap == NULL) 1060 return (0); 1061 1062 ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1063 mutex_enter(ahmpages); 1064 ASSERT(ap->an_refcnt >= 1); 1065 if (ap->an_refcnt == 1) { 1066 mutex_exit(ahmpages); 1067 return (0); 1068 } 1069 mutex_exit(ahmpages); 1070 return (1); 1071 } 1072 /* 1073 * Check 'nslots' anon slots for refcnt > 1. 1074 * 1075 * returns 1 if any of the 'nslots' anon slots has a refcnt > 1 otherwise 1076 * returns 0. 1077 */ 1078 static int 1079 anon_share(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots) 1080 { 1081 struct anon *ap; 1082 1083 while (nslots-- > 0) { 1084 if ((ap = anon_get_ptr(ahp, anon_index)) != NULL && 1085 ap->an_refcnt > 1) 1086 return (1); 1087 anon_index++; 1088 } 1089 1090 return (0); 1091 } 1092 1093 static void 1094 anon_decref_pages( 1095 struct anon_hdr *ahp, 1096 ulong_t an_idx, 1097 uint_t szc) 1098 { 1099 struct anon *ap = anon_get_ptr(ahp, an_idx); 1100 kmutex_t *ahmpages = NULL; 1101 page_t *pp; 1102 pgcnt_t pgcnt = page_get_pagecnt(szc); 1103 pgcnt_t i; 1104 struct vnode *vp; 1105 anoff_t off; 1106 kmutex_t *ahm; 1107 #ifdef DEBUG 1108 int refcnt = 1; 1109 #endif 1110 1111 ASSERT(szc != 0); 1112 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1113 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1114 ASSERT(an_idx < ahp->size); 1115 1116 if (ahp->size - an_idx < pgcnt) { 1117 /* 1118 * In case of shared mappings total anon map size may not be 1119 * the largest page size aligned. 
1120 */ 1121 pgcnt = ahp->size - an_idx; 1122 } 1123 1124 VM_STAT_ADD(anonvmstats.decrefpages[0]); 1125 1126 if (ap != NULL) { 1127 ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1128 mutex_enter(ahmpages); 1129 ASSERT((refcnt = ap->an_refcnt) != 0); 1130 VM_STAT_ADD(anonvmstats.decrefpages[1]); 1131 if (ap->an_refcnt == 1) { 1132 VM_STAT_ADD(anonvmstats.decrefpages[2]); 1133 ASSERT(!anon_share(ahp, an_idx, pgcnt)); 1134 mutex_exit(ahmpages); 1135 ahmpages = NULL; 1136 } 1137 } 1138 1139 i = 0; 1140 while (i < pgcnt) { 1141 if ((ap = anon_get_ptr(ahp, an_idx + i)) == NULL) { 1142 ASSERT(refcnt == 1 && ahmpages == NULL); 1143 i++; 1144 continue; 1145 } 1146 ASSERT(ap->an_refcnt == refcnt); 1147 ASSERT(ahmpages != NULL || ap->an_refcnt == 1); 1148 ASSERT(ahmpages == NULL || ap->an_refcnt > 1); 1149 1150 if (ahmpages == NULL) { 1151 swap_xlate(ap, &vp, &off); 1152 pp = page_lookup(vp, (u_offset_t)off, SE_EXCL); 1153 if (pp == NULL || pp->p_szc == 0) { 1154 VM_STAT_ADD(anonvmstats.decrefpages[3]); 1155 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, 1156 ap->an_off)]; 1157 (void) anon_set_ptr(ahp, an_idx + i, NULL, 1158 ANON_SLEEP); 1159 mutex_enter(ahm); 1160 ap->an_refcnt--; 1161 ASSERT(ap->an_refcnt == 0); 1162 anon_rmhash(ap); 1163 if (ap->an_pvp) 1164 swap_phys_free(ap->an_pvp, ap->an_poff, 1165 PAGESIZE); 1166 mutex_exit(ahm); 1167 if (pp == NULL) { 1168 pp = page_lookup(vp, (u_offset_t)off, 1169 SE_EXCL); 1170 ASSERT(pp == NULL || pp->p_szc == 0); 1171 } 1172 if (pp != NULL) { 1173 VM_STAT_ADD(anonvmstats.decrefpages[4]); 1174 /*LINTED*/ 1175 VN_DISPOSE(pp, B_INVAL, 0, kcred); 1176 } 1177 kmem_cache_free(anon_cache, ap); 1178 ANI_ADD(1); 1179 i++; 1180 } else { 1181 pgcnt_t j; 1182 pgcnt_t curpgcnt = 1183 page_get_pagecnt(pp->p_szc); 1184 size_t ppasize = curpgcnt * sizeof (page_t *); 1185 page_t **ppa = kmem_alloc(ppasize, KM_SLEEP); 1186 int dispose = 0; 1187 1188 VM_STAT_ADD(anonvmstats.decrefpages[5]); 1189 1190 ASSERT(pp->p_szc <= szc); 1191 
ASSERT(IS_P2ALIGNED(curpgcnt, curpgcnt)); 1192 ASSERT(IS_P2ALIGNED(i, curpgcnt)); 1193 ASSERT(i + curpgcnt <= pgcnt); 1194 ASSERT(!(page_pptonum(pp) & (curpgcnt - 1))); 1195 ppa[0] = pp; 1196 for (j = i + 1; j < i + curpgcnt; j++) { 1197 ap = anon_get_ptr(ahp, an_idx + j); 1198 ASSERT(ap != NULL && 1199 ap->an_refcnt == 1); 1200 swap_xlate(ap, &vp, &off); 1201 pp = page_lookup(vp, (u_offset_t)off, 1202 SE_EXCL); 1203 if (pp == NULL) 1204 panic("anon_decref_pages: " 1205 "no page"); 1206 1207 (void) hat_pageunload(pp, 1208 HAT_FORCE_PGUNLOAD); 1209 ASSERT(pp->p_szc == ppa[0]->p_szc); 1210 ASSERT(page_pptonum(pp) - 1 == 1211 page_pptonum(ppa[j - i - 1])); 1212 ppa[j - i] = pp; 1213 if (ap->an_pvp != NULL && 1214 !vn_matchopval(ap->an_pvp, 1215 VOPNAME_DISPOSE, 1216 (fs_generic_func_p)fs_dispose)) 1217 dispose = 1; 1218 } 1219 for (j = i; j < i + curpgcnt; j++) { 1220 ap = anon_get_ptr(ahp, an_idx + j); 1221 ASSERT(ap != NULL && 1222 ap->an_refcnt == 1); 1223 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, 1224 ap->an_off)]; 1225 (void) anon_set_ptr(ahp, an_idx + j, 1226 NULL, ANON_SLEEP); 1227 mutex_enter(ahm); 1228 ap->an_refcnt--; 1229 ASSERT(ap->an_refcnt == 0); 1230 anon_rmhash(ap); 1231 if (ap->an_pvp) 1232 swap_phys_free(ap->an_pvp, 1233 ap->an_poff, PAGESIZE); 1234 mutex_exit(ahm); 1235 kmem_cache_free(anon_cache, ap); 1236 ANI_ADD(1); 1237 } 1238 if (!dispose) { 1239 VM_STAT_ADD(anonvmstats.decrefpages[6]); 1240 page_destroy_pages(ppa[0]); 1241 } else { 1242 VM_STAT_ADD(anonvmstats.decrefpages[7]); 1243 for (j = 0; j < curpgcnt; j++) { 1244 ASSERT(PAGE_EXCL(ppa[j])); 1245 ppa[j]->p_szc = 0; 1246 } 1247 for (j = 0; j < curpgcnt; j++) { 1248 ASSERT(!hat_page_is_mapped( 1249 ppa[j])); 1250 /*LINTED*/ 1251 VN_DISPOSE(ppa[j], B_INVAL, 0, 1252 kcred); 1253 } 1254 } 1255 kmem_free(ppa, ppasize); 1256 i += curpgcnt; 1257 } 1258 } else { 1259 VM_STAT_ADD(anonvmstats.decrefpages[8]); 1260 (void) anon_set_ptr(ahp, an_idx + i, NULL, ANON_SLEEP); 1261 ahm = 
&anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1262 mutex_enter(ahm); 1263 ap->an_refcnt--; 1264 mutex_exit(ahm); 1265 i++; 1266 } 1267 } 1268 1269 if (ahmpages != NULL) { 1270 mutex_exit(ahmpages); 1271 } 1272 } 1273 1274 /* 1275 * Duplicate references to size bytes worth of anon pages. 1276 * Used when duplicating a segment that contains private anon pages. 1277 * This code assumes that procedure calling this one has already used 1278 * hat_chgprot() to disable write access to the range of addresses that 1279 * that *old actually refers to. 1280 */ 1281 void 1282 anon_dup(struct anon_hdr *old, ulong_t old_idx, struct anon_hdr *new, 1283 ulong_t new_idx, size_t size) 1284 { 1285 spgcnt_t npages; 1286 kmutex_t *ahm; 1287 struct anon *ap; 1288 ulong_t off; 1289 ulong_t index; 1290 1291 npages = btopr(size); 1292 while (npages > 0) { 1293 index = old_idx; 1294 if ((ap = anon_get_next_ptr(old, &index)) == NULL) 1295 break; 1296 1297 ASSERT(!ANON_ISBUSY(anon_get_slot(old, index))); 1298 off = index - old_idx; 1299 npages -= off; 1300 if (npages <= 0) 1301 break; 1302 1303 (void) anon_set_ptr(new, new_idx + off, ap, ANON_SLEEP); 1304 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1305 1306 mutex_enter(ahm); 1307 ap->an_refcnt++; 1308 mutex_exit(ahm); 1309 1310 off++; 1311 new_idx += off; 1312 old_idx += off; 1313 npages--; 1314 } 1315 } 1316 1317 /* 1318 * Just like anon_dup but also guarantees there are no holes (unallocated anon 1319 * slots) within any large page region. That means if a large page region is 1320 * empty in the old array it will skip it. If there are 1 or more valid slots 1321 * in the large page region of the old array it will make sure to fill in any 1322 * unallocated ones and also copy them to the new array. If noalloc is 1 large 1323 * page region should either have no valid anon slots or all slots should be 1324 * valid. 
1325 */ 1326 void 1327 anon_dup_fill_holes( 1328 struct anon_hdr *old, 1329 ulong_t old_idx, 1330 struct anon_hdr *new, 1331 ulong_t new_idx, 1332 size_t size, 1333 uint_t szc, 1334 int noalloc) 1335 { 1336 struct anon *ap; 1337 spgcnt_t npages; 1338 kmutex_t *ahm, *ahmpages = NULL; 1339 pgcnt_t pgcnt, i; 1340 ulong_t index, off; 1341 #ifdef DEBUG 1342 int refcnt; 1343 #endif 1344 1345 ASSERT(szc != 0); 1346 pgcnt = page_get_pagecnt(szc); 1347 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1348 npages = btopr(size); 1349 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1350 ASSERT(IS_P2ALIGNED(old_idx, pgcnt)); 1351 1352 VM_STAT_ADD(anonvmstats.dupfillholes[0]); 1353 1354 while (npages > 0) { 1355 index = old_idx; 1356 1357 /* 1358 * Find the next valid slot. 1359 */ 1360 if (anon_get_next_ptr(old, &index) == NULL) 1361 break; 1362 1363 ASSERT(!ANON_ISBUSY(anon_get_slot(old, index))); 1364 /* 1365 * Now backup index to the beginning of the 1366 * current large page region of the old array. 1367 */ 1368 index = P2ALIGN(index, pgcnt); 1369 off = index - old_idx; 1370 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1371 npages -= off; 1372 if (npages <= 0) 1373 break; 1374 1375 /* 1376 * Fill and copy a large page regions worth 1377 * of anon slots. 1378 */ 1379 for (i = 0; i < pgcnt; i++) { 1380 if ((ap = anon_get_ptr(old, index + i)) == NULL) { 1381 if (noalloc) { 1382 panic("anon_dup_fill_holes: " 1383 "empty anon slot\n"); 1384 } 1385 VM_STAT_ADD(anonvmstats.dupfillholes[1]); 1386 ap = anon_alloc(NULL, 0); 1387 (void) anon_set_ptr(old, index + i, ap, 1388 ANON_SLEEP); 1389 } else if (i == 0) { 1390 /* 1391 * make the increment of all refcnts of all 1392 * anon slots of a large page appear atomic by 1393 * getting an anonpages_hash_lock for the 1394 * first anon slot of a large page. 
1395 */ 1396 int hash = AH_LOCK(ap->an_vp, ap->an_off); 1397 1398 VM_STAT_ADD(anonvmstats.dupfillholes[2]); 1399 1400 ahmpages = &anonpages_hash_lock[hash]; 1401 mutex_enter(ahmpages); 1402 /*LINTED*/ 1403 ASSERT(refcnt = ap->an_refcnt); 1404 1405 VM_STAT_COND_ADD(ap->an_refcnt > 1, 1406 anonvmstats.dupfillholes[3]); 1407 } 1408 (void) anon_set_ptr(new, new_idx + off + i, ap, 1409 ANON_SLEEP); 1410 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1411 mutex_enter(ahm); 1412 ASSERT(ahmpages != NULL || ap->an_refcnt == 1); 1413 ASSERT(i == 0 || ahmpages == NULL || 1414 refcnt == ap->an_refcnt); 1415 ap->an_refcnt++; 1416 mutex_exit(ahm); 1417 } 1418 if (ahmpages != NULL) { 1419 mutex_exit(ahmpages); 1420 ahmpages = NULL; 1421 } 1422 off += pgcnt; 1423 new_idx += off; 1424 old_idx += off; 1425 npages -= pgcnt; 1426 } 1427 } 1428 1429 /* 1430 * Used when a segment with a vnode changes szc. similarly to 1431 * anon_dup_fill_holes() makes sure each large page region either has no anon 1432 * slots or all of them. but new slots are created by COWing the file 1433 * pages. on entrance no anon slots should be shared. 1434 */ 1435 int 1436 anon_fill_cow_holes( 1437 struct seg *seg, 1438 caddr_t addr, 1439 struct anon_hdr *ahp, 1440 ulong_t an_idx, 1441 struct vnode *vp, 1442 u_offset_t vp_off, 1443 size_t size, 1444 uint_t szc, 1445 uint_t prot, 1446 struct vpage vpage[], 1447 struct cred *cred) 1448 { 1449 struct anon *ap; 1450 spgcnt_t npages; 1451 pgcnt_t pgcnt, i; 1452 ulong_t index, off; 1453 int err = 0; 1454 int pageflags = 0; 1455 1456 ASSERT(szc != 0); 1457 pgcnt = page_get_pagecnt(szc); 1458 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1459 npages = btopr(size); 1460 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1461 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1462 1463 while (npages > 0) { 1464 index = an_idx; 1465 1466 /* 1467 * Find the next valid slot. 
1468 */ 1469 if (anon_get_next_ptr(ahp, &index) == NULL) { 1470 break; 1471 } 1472 1473 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1474 /* 1475 * Now backup index to the beginning of the 1476 * current large page region of the anon array. 1477 */ 1478 index = P2ALIGN(index, pgcnt); 1479 off = index - an_idx; 1480 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1481 npages -= off; 1482 if (npages <= 0) 1483 break; 1484 an_idx += off; 1485 vp_off += ptob(off); 1486 addr += ptob(off); 1487 if (vpage != NULL) { 1488 vpage += off; 1489 } 1490 1491 for (i = 0; i < pgcnt; i++, an_idx++, vp_off += PAGESIZE) { 1492 if ((ap = anon_get_ptr(ahp, an_idx)) == NULL) { 1493 page_t *pl[1 + 1]; 1494 page_t *pp; 1495 1496 err = VOP_GETPAGE(vp, vp_off, PAGESIZE, NULL, 1497 pl, PAGESIZE, seg, addr, S_READ, cred, 1498 NULL); 1499 if (err) { 1500 break; 1501 } 1502 if (vpage != NULL) { 1503 prot = VPP_PROT(vpage); 1504 pageflags = VPP_ISPPLOCK(vpage) ? 1505 LOCK_PAGE : 0; 1506 } 1507 pp = anon_private(&ap, seg, addr, prot, pl[0], 1508 pageflags, cred); 1509 if (pp == NULL) { 1510 err = ENOMEM; 1511 break; 1512 } 1513 (void) anon_set_ptr(ahp, an_idx, ap, 1514 ANON_SLEEP); 1515 page_unlock(pp); 1516 } 1517 ASSERT(ap->an_refcnt == 1); 1518 addr += PAGESIZE; 1519 if (vpage != NULL) { 1520 vpage++; 1521 } 1522 } 1523 npages -= pgcnt; 1524 } 1525 1526 return (err); 1527 } 1528 1529 /* 1530 * Free a group of "size" anon pages, size in bytes, 1531 * and clear out the pointers to the anon entries. 
1532 */ 1533 void 1534 anon_free(struct anon_hdr *ahp, ulong_t index, size_t size) 1535 { 1536 spgcnt_t npages; 1537 struct anon *ap; 1538 ulong_t old; 1539 1540 npages = btopr(size); 1541 1542 while (npages > 0) { 1543 old = index; 1544 if ((ap = anon_get_next_ptr(ahp, &index)) == NULL) 1545 break; 1546 1547 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1548 npages -= index - old; 1549 if (npages <= 0) 1550 break; 1551 1552 (void) anon_set_ptr(ahp, index, NULL, ANON_SLEEP); 1553 anon_decref(ap); 1554 /* 1555 * Bump index and decrement page count 1556 */ 1557 index++; 1558 npages--; 1559 } 1560 } 1561 1562 void 1563 anon_free_pages( 1564 struct anon_hdr *ahp, 1565 ulong_t an_idx, 1566 size_t size, 1567 uint_t szc) 1568 { 1569 spgcnt_t npages; 1570 pgcnt_t pgcnt; 1571 ulong_t index, off; 1572 1573 ASSERT(szc != 0); 1574 pgcnt = page_get_pagecnt(szc); 1575 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1576 npages = btopr(size); 1577 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1578 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1579 ASSERT(an_idx < ahp->size); 1580 1581 VM_STAT_ADD(anonvmstats.freepages[0]); 1582 1583 while (npages > 0) { 1584 index = an_idx; 1585 1586 /* 1587 * Find the next valid slot. 1588 */ 1589 if (anon_get_next_ptr(ahp, &index) == NULL) 1590 break; 1591 1592 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1593 /* 1594 * Now backup index to the beginning of the 1595 * current large page region of the old array. 
1596 */ 1597 index = P2ALIGN(index, pgcnt); 1598 off = index - an_idx; 1599 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1600 npages -= off; 1601 if (npages <= 0) 1602 break; 1603 1604 anon_decref_pages(ahp, index, szc); 1605 1606 off += pgcnt; 1607 an_idx += off; 1608 npages -= pgcnt; 1609 } 1610 } 1611 1612 /* 1613 * Make anonymous pages discardable 1614 */ 1615 void 1616 anon_disclaim(struct anon_map *amp, ulong_t index, size_t size) 1617 { 1618 spgcnt_t npages = btopr(size); 1619 struct anon *ap; 1620 struct vnode *vp; 1621 anoff_t off; 1622 page_t *pp, *root_pp; 1623 kmutex_t *ahm; 1624 pgcnt_t pgcnt; 1625 ulong_t old_idx, idx, i; 1626 struct anon_hdr *ahp = amp->ahp; 1627 anon_sync_obj_t cookie; 1628 1629 ASSERT(RW_READ_HELD(&->a_rwlock)); 1630 pgcnt = 1; 1631 for (; npages > 0; index = (pgcnt == 1) ? index + 1 : 1632 P2ROUNDUP(index + 1, pgcnt), npages -= pgcnt) { 1633 1634 /* 1635 * get anon pointer and index for the first valid entry 1636 * in the anon list, starting from "index" 1637 */ 1638 old_idx = index; 1639 if ((ap = anon_get_next_ptr(ahp, &index)) == NULL) 1640 break; 1641 1642 /* 1643 * decrement npages by number of NULL anon slots we skipped 1644 */ 1645 npages -= index - old_idx; 1646 if (npages <= 0) 1647 break; 1648 1649 anon_array_enter(amp, index, &cookie); 1650 ap = anon_get_ptr(ahp, index); 1651 ASSERT(ap != NULL); 1652 1653 /* 1654 * Get anonymous page and try to lock it SE_EXCL; 1655 * if we couldn't grab the lock we skip to next page. 1656 */ 1657 swap_xlate(ap, &vp, &off); 1658 pp = page_lookup_nowait(vp, (u_offset_t)off, SE_EXCL); 1659 if (pp == NULL) { 1660 segadvstat.MADV_FREE_miss.value.ul++; 1661 pgcnt = 1; 1662 anon_array_exit(&cookie); 1663 continue; 1664 } 1665 pgcnt = page_get_pagecnt(pp->p_szc); 1666 1667 /* 1668 * we cannot free a page which is permanently locked. 1669 * The page_struct_lock need not be acquired to examine 1670 * these fields since the page has an "exclusive" lock. 
1671 */ 1672 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1673 page_unlock(pp); 1674 segadvstat.MADV_FREE_miss.value.ul++; 1675 anon_array_exit(&cookie); 1676 continue; 1677 } 1678 1679 ahm = &anonhash_lock[AH_LOCK(vp, off)]; 1680 mutex_enter(ahm); 1681 ASSERT(ap->an_refcnt != 0); 1682 /* 1683 * skip this one if copy-on-write is not yet broken. 1684 */ 1685 if (ap->an_refcnt > 1) { 1686 mutex_exit(ahm); 1687 page_unlock(pp); 1688 segadvstat.MADV_FREE_miss.value.ul++; 1689 anon_array_exit(&cookie); 1690 continue; 1691 } 1692 1693 if (pp->p_szc == 0) { 1694 pgcnt = 1; 1695 1696 /* 1697 * free swap slot; 1698 */ 1699 if (ap->an_pvp) { 1700 swap_phys_free(ap->an_pvp, ap->an_poff, 1701 PAGESIZE); 1702 ap->an_pvp = NULL; 1703 ap->an_poff = 0; 1704 } 1705 mutex_exit(ahm); 1706 segadvstat.MADV_FREE_hit.value.ul++; 1707 1708 /* 1709 * while we are at it, unload all the translations 1710 * and attempt to free the page. 1711 */ 1712 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1713 /*LINTED: constant in conditional context */ 1714 VN_DISPOSE(pp, B_FREE, 0, kcred); 1715 anon_array_exit(&cookie); 1716 continue; 1717 } 1718 1719 pgcnt = page_get_pagecnt(pp->p_szc); 1720 if (!IS_P2ALIGNED(index, pgcnt) || npages < pgcnt) { 1721 if (!page_try_demote_pages(pp)) { 1722 mutex_exit(ahm); 1723 page_unlock(pp); 1724 segadvstat.MADV_FREE_miss.value.ul++; 1725 anon_array_exit(&cookie); 1726 continue; 1727 } else { 1728 pgcnt = 1; 1729 if (ap->an_pvp) { 1730 swap_phys_free(ap->an_pvp, 1731 ap->an_poff, PAGESIZE); 1732 ap->an_pvp = NULL; 1733 ap->an_poff = 0; 1734 } 1735 mutex_exit(ahm); 1736 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1737 /*LINTED*/ 1738 VN_DISPOSE(pp, B_FREE, 0, kcred); 1739 segadvstat.MADV_FREE_hit.value.ul++; 1740 anon_array_exit(&cookie); 1741 continue; 1742 } 1743 } 1744 mutex_exit(ahm); 1745 root_pp = pp; 1746 1747 /* 1748 * try to lock remaining pages 1749 */ 1750 for (idx = 1; idx < pgcnt; idx++) { 1751 pp++; 1752 if (!page_trylock(pp, SE_EXCL)) 1753 break; 
1754 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1755 page_unlock(pp); 1756 break; 1757 } 1758 } 1759 1760 if (idx == pgcnt) { 1761 for (i = 0; i < pgcnt; i++) { 1762 ap = anon_get_ptr(ahp, index + i); 1763 if (ap == NULL) 1764 break; 1765 swap_xlate(ap, &vp, &off); 1766 ahm = &anonhash_lock[AH_LOCK(vp, off)]; 1767 mutex_enter(ahm); 1768 ASSERT(ap->an_refcnt != 0); 1769 1770 /* 1771 * skip this one if copy-on-write 1772 * is not yet broken. 1773 */ 1774 if (ap->an_refcnt > 1) { 1775 mutex_exit(ahm); 1776 goto skiplp; 1777 } 1778 if (ap->an_pvp) { 1779 swap_phys_free(ap->an_pvp, 1780 ap->an_poff, PAGESIZE); 1781 ap->an_pvp = NULL; 1782 ap->an_poff = 0; 1783 } 1784 mutex_exit(ahm); 1785 } 1786 page_destroy_pages(root_pp); 1787 segadvstat.MADV_FREE_hit.value.ul += pgcnt; 1788 anon_array_exit(&cookie); 1789 continue; 1790 } 1791 skiplp: 1792 segadvstat.MADV_FREE_miss.value.ul += pgcnt; 1793 for (i = 0, pp = root_pp; i < idx; pp++, i++) 1794 page_unlock(pp); 1795 anon_array_exit(&cookie); 1796 } 1797 } 1798 1799 /* 1800 * Return the kept page(s) and protections back to the segment driver. 1801 */ 1802 int 1803 anon_getpage( 1804 struct anon **app, 1805 uint_t *protp, 1806 page_t *pl[], 1807 size_t plsz, 1808 struct seg *seg, 1809 caddr_t addr, 1810 enum seg_rw rw, 1811 struct cred *cred) 1812 { 1813 page_t *pp; 1814 struct anon *ap = *app; 1815 struct vnode *vp; 1816 anoff_t off; 1817 int err; 1818 kmutex_t *ahm; 1819 1820 swap_xlate(ap, &vp, &off); 1821 1822 /* 1823 * Lookup the page. If page is being paged in, 1824 * wait for it to finish as we must return a list of 1825 * pages since this routine acts like the VOP_GETPAGE 1826 * routine does. 
1827 */ 1828 if (pl != NULL && (pp = page_lookup(vp, (u_offset_t)off, SE_SHARED))) { 1829 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1830 mutex_enter(ahm); 1831 if (ap->an_refcnt == 1) 1832 *protp = PROT_ALL; 1833 else 1834 *protp = PROT_ALL & ~PROT_WRITE; 1835 mutex_exit(ahm); 1836 pl[0] = pp; 1837 pl[1] = NULL; 1838 return (0); 1839 } 1840 1841 /* 1842 * Simply treat it as a vnode fault on the anon vp. 1843 */ 1844 1845 TRACE_3(TR_FAC_VM, TR_ANON_GETPAGE, 1846 "anon_getpage:seg %x addr %x vp %x", 1847 seg, addr, vp); 1848 1849 err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, protp, pl, plsz, 1850 seg, addr, rw, cred, NULL); 1851 1852 if (err == 0 && pl != NULL) { 1853 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1854 mutex_enter(ahm); 1855 if (ap->an_refcnt != 1) 1856 *protp &= ~PROT_WRITE; /* make read-only */ 1857 mutex_exit(ahm); 1858 } 1859 return (err); 1860 } 1861 1862 /* 1863 * Creates or returns kept pages to the segment driver. returns -1 if a large 1864 * page cannot be allocated. returns -2 if some other process has allocated a 1865 * larger page. 1866 * 1867 * For cowfault it will allocate any size pages to fill the requested area to 1868 * avoid partially overwriting anon slots (i.e. sharing only some of the anon 1869 * slots within a large page with other processes). This policy greatly 1870 * simplifies large page freeing (which is only freed when all anon slot 1871 * refcnts are 0). 
1872 */ 1873 int 1874 anon_map_getpages( 1875 struct anon_map *amp, 1876 ulong_t start_idx, 1877 uint_t szc, 1878 struct seg *seg, 1879 caddr_t addr, 1880 uint_t prot, 1881 uint_t *protp, 1882 page_t *ppa[], 1883 uint_t *ppa_szc, 1884 struct vpage vpage[], 1885 enum seg_rw rw, 1886 int brkcow, 1887 int anypgsz, 1888 int pgflags, 1889 struct cred *cred) 1890 { 1891 pgcnt_t pgcnt; 1892 struct anon *ap; 1893 struct vnode *vp; 1894 anoff_t off; 1895 page_t *pp, *pl[2], *conpp = NULL; 1896 caddr_t vaddr; 1897 ulong_t pg_idx, an_idx, i; 1898 spgcnt_t nreloc = 0; 1899 int prealloc = 1; 1900 int err, slotcreate; 1901 uint_t vpprot; 1902 int upsize = (szc < seg->s_szc); 1903 1904 #if !defined(__i386) && !defined(__amd64) 1905 ASSERT(seg->s_szc != 0); 1906 #endif 1907 ASSERT(szc <= seg->s_szc); 1908 ASSERT(ppa_szc != NULL); 1909 ASSERT(rw != S_CREATE); 1910 1911 *protp = PROT_ALL; 1912 1913 VM_STAT_ADD(anonvmstats.getpages[0]); 1914 1915 if (szc == 0) { 1916 VM_STAT_ADD(anonvmstats.getpages[1]); 1917 if ((ap = anon_get_ptr(amp->ahp, start_idx)) != NULL) { 1918 err = anon_getpage(&ap, protp, pl, PAGESIZE, seg, 1919 addr, rw, cred); 1920 if (err) 1921 return (err); 1922 ppa[0] = pl[0]; 1923 if (brkcow == 0 || (*protp & PROT_WRITE)) { 1924 VM_STAT_ADD(anonvmstats.getpages[2]); 1925 if (ppa[0]->p_szc != 0 && upsize) { 1926 VM_STAT_ADD(anonvmstats.getpages[3]); 1927 *ppa_szc = MIN(ppa[0]->p_szc, 1928 seg->s_szc); 1929 page_unlock(ppa[0]); 1930 return (-2); 1931 } 1932 return (0); 1933 } 1934 panic("anon_map_getpages: cowfault for szc 0"); 1935 } else { 1936 VM_STAT_ADD(anonvmstats.getpages[4]); 1937 ppa[0] = anon_zero(seg, addr, &ap, cred); 1938 if (ppa[0] == NULL) 1939 return (ENOMEM); 1940 (void) anon_set_ptr(amp->ahp, start_idx, ap, 1941 ANON_SLEEP); 1942 return (0); 1943 } 1944 } 1945 1946 pgcnt = page_get_pagecnt(szc); 1947 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1948 ASSERT(IS_P2ALIGNED(start_idx, pgcnt)); 1949 1950 /* 1951 * First we check for the case that the requtested 
large 1952 * page or larger page already exists in the system. 1953 * Actually we only check if the first constituent page 1954 * exists and only preallocate if it's not found. 1955 */ 1956 ap = anon_get_ptr(amp->ahp, start_idx); 1957 if (ap) { 1958 uint_t pszc; 1959 swap_xlate(ap, &vp, &off); 1960 if (page_exists_forreal(vp, (u_offset_t)off, &pszc)) { 1961 if (pszc > szc && upsize) { 1962 *ppa_szc = MIN(pszc, seg->s_szc); 1963 return (-2); 1964 } 1965 if (pszc >= szc) { 1966 prealloc = 0; 1967 } 1968 } 1969 } 1970 1971 VM_STAT_COND_ADD(prealloc == 0, anonvmstats.getpages[5]); 1972 VM_STAT_COND_ADD(prealloc != 0, anonvmstats.getpages[6]); 1973 1974 top: 1975 /* 1976 * If a smaller page or no page at all was found, 1977 * grab a large page off the freelist. 1978 */ 1979 if (prealloc) { 1980 ASSERT(conpp == NULL); 1981 if (page_alloc_pages(anon_vp, seg, addr, NULL, ppa, 1982 szc, 0, pgflags) != 0) { 1983 VM_STAT_ADD(anonvmstats.getpages[7]); 1984 if (brkcow == 0 || szc < seg->s_szc || 1985 !anon_szcshare(amp->ahp, start_idx)) { 1986 /* 1987 * If the refcnt's of all anon slots are <= 1 1988 * they can't increase since we are holding 1989 * the address space's lock. So segvn can 1990 * safely decrease szc without risking to 1991 * generate a cow fault for the region smaller 1992 * than the segment's largest page size. 1993 */ 1994 VM_STAT_ADD(anonvmstats.getpages[8]); 1995 return (-1); 1996 } 1997 docow: 1998 /* 1999 * This is a cow fault. Copy away the entire 1 large 2000 * page region of this segment. 
2001 */ 2002 if (szc != seg->s_szc) 2003 panic("anon_map_getpages: cowfault for szc %d", 2004 szc); 2005 vaddr = addr; 2006 for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt; 2007 pg_idx++, an_idx++, vaddr += PAGESIZE) { 2008 if ((ap = anon_get_ptr(amp->ahp, an_idx)) != 2009 NULL) { 2010 err = anon_getpage(&ap, &vpprot, pl, 2011 PAGESIZE, seg, vaddr, rw, cred); 2012 if (err) { 2013 for (i = 0; i < pg_idx; i++) { 2014 if ((pp = ppa[i]) != 2015 NULL) 2016 page_unlock(pp); 2017 } 2018 return (err); 2019 } 2020 ppa[pg_idx] = pl[0]; 2021 } else { 2022 /* 2023 * Since this is a cowfault we know 2024 * that this address space has a 2025 * parent or children which means 2026 * anon_dup_fill_holes() has initialized 2027 * all anon slots within a large page 2028 * region that had at least one anon 2029 * slot at the time of fork(). 2030 */ 2031 panic("anon_map_getpages: " 2032 "cowfault but anon slot is empty"); 2033 } 2034 } 2035 VM_STAT_ADD(anonvmstats.getpages[9]); 2036 *protp = PROT_ALL; 2037 return (anon_map_privatepages(amp, start_idx, szc, seg, 2038 addr, prot, ppa, vpage, anypgsz, pgflags, cred)); 2039 } 2040 } 2041 2042 VM_STAT_ADD(anonvmstats.getpages[10]); 2043 2044 an_idx = start_idx; 2045 pg_idx = 0; 2046 vaddr = addr; 2047 while (pg_idx < pgcnt) { 2048 slotcreate = 0; 2049 if ((ap = anon_get_ptr(amp->ahp, an_idx)) == NULL) { 2050 VM_STAT_ADD(anonvmstats.getpages[11]); 2051 /* 2052 * For us to have decided not to preallocate 2053 * would have meant that a large page 2054 * was found. Which also means that all of the 2055 * anon slots for that page would have been 2056 * already created for us. 2057 */ 2058 if (prealloc == 0) 2059 panic("anon_map_getpages: prealloc = 0"); 2060 2061 slotcreate = 1; 2062 ap = anon_alloc(NULL, 0); 2063 } 2064 swap_xlate(ap, &vp, &off); 2065 2066 /* 2067 * Now setup our preallocated page to pass down 2068 * to swap_getpage(). 
2069 */ 2070 if (prealloc) { 2071 ASSERT(ppa[pg_idx]->p_szc == szc); 2072 conpp = ppa[pg_idx]; 2073 } 2074 ASSERT(prealloc || conpp == NULL); 2075 2076 /* 2077 * If we just created this anon slot then call 2078 * with S_CREATE to prevent doing IO on the page. 2079 * Similar to the anon_zero case. 2080 */ 2081 err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE, 2082 NULL, pl, PAGESIZE, conpp, ppa_szc, &nreloc, seg, vaddr, 2083 slotcreate == 1 ? S_CREATE : rw, cred); 2084 2085 if (err) { 2086 ASSERT(err != -2 || upsize); 2087 VM_STAT_ADD(anonvmstats.getpages[12]); 2088 ASSERT(slotcreate == 0); 2089 goto io_err; 2090 } 2091 2092 pp = pl[0]; 2093 2094 if (pp->p_szc < szc || (pp->p_szc > szc && upsize)) { 2095 VM_STAT_ADD(anonvmstats.getpages[13]); 2096 ASSERT(slotcreate == 0); 2097 ASSERT(prealloc == 0); 2098 ASSERT(pg_idx == 0); 2099 if (pp->p_szc > szc) { 2100 ASSERT(upsize); 2101 *ppa_szc = MIN(pp->p_szc, seg->s_szc); 2102 page_unlock(pp); 2103 VM_STAT_ADD(anonvmstats.getpages[14]); 2104 return (-2); 2105 } 2106 page_unlock(pp); 2107 prealloc = 1; 2108 goto top; 2109 } 2110 2111 /* 2112 * If we decided to preallocate but VOP_GETPAGE 2113 * found a page in the system that satisfies our 2114 * request then free up our preallocated large page 2115 * and continue looping accross the existing large 2116 * page via VOP_GETPAGE. 2117 */ 2118 if (prealloc && pp != ppa[pg_idx]) { 2119 VM_STAT_ADD(anonvmstats.getpages[15]); 2120 ASSERT(slotcreate == 0); 2121 ASSERT(pg_idx == 0); 2122 conpp = NULL; 2123 prealloc = 0; 2124 page_free_pages(ppa[0]); 2125 } 2126 2127 if (prealloc && nreloc > 1) { 2128 /* 2129 * we have relocated out of a smaller large page. 2130 * skip npgs - 1 iterations and continue which will 2131 * increment by one the loop indices. 
2132 */ 2133 spgcnt_t npgs = nreloc; 2134 2135 VM_STAT_ADD(anonvmstats.getpages[16]); 2136 2137 ASSERT(pp == ppa[pg_idx]); 2138 ASSERT(slotcreate == 0); 2139 ASSERT(pg_idx + npgs <= pgcnt); 2140 if ((*protp & PROT_WRITE) && 2141 anon_share(amp->ahp, an_idx, npgs)) { 2142 *protp &= ~PROT_WRITE; 2143 } 2144 pg_idx += npgs; 2145 an_idx += npgs; 2146 vaddr += PAGESIZE * npgs; 2147 continue; 2148 } 2149 2150 VM_STAT_ADD(anonvmstats.getpages[17]); 2151 2152 /* 2153 * Anon_zero case. 2154 */ 2155 if (slotcreate) { 2156 ASSERT(prealloc); 2157 pagezero(pp, 0, PAGESIZE); 2158 CPU_STATS_ADD_K(vm, zfod, 1); 2159 hat_setrefmod(pp); 2160 } 2161 2162 ASSERT(prealloc == 0 || ppa[pg_idx] == pp); 2163 ASSERT(prealloc != 0 || PAGE_SHARED(pp)); 2164 ASSERT(prealloc == 0 || PAGE_EXCL(pp)); 2165 2166 if (pg_idx > 0 && 2167 ((page_pptonum(pp) != page_pptonum(ppa[pg_idx - 1]) + 1) || 2168 (pp->p_szc != ppa[pg_idx - 1]->p_szc))) { 2169 panic("anon_map_getpages: unexpected page"); 2170 } else if (pg_idx == 0 && (page_pptonum(pp) & (pgcnt - 1))) { 2171 panic("anon_map_getpages: unaligned page"); 2172 } 2173 2174 if (prealloc == 0) { 2175 ppa[pg_idx] = pp; 2176 } 2177 2178 if (ap->an_refcnt > 1) { 2179 VM_STAT_ADD(anonvmstats.getpages[18]); 2180 *protp &= ~PROT_WRITE; 2181 } 2182 2183 /* 2184 * If this is a new anon slot then initialize 2185 * the anon array entry. 2186 */ 2187 if (slotcreate) { 2188 (void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP); 2189 } 2190 pg_idx++; 2191 an_idx++; 2192 vaddr += PAGESIZE; 2193 } 2194 2195 /* 2196 * Since preallocated pages come off the freelist 2197 * they are locked SE_EXCL. Simply downgrade and return. 
2198 */ 2199 if (prealloc) { 2200 VM_STAT_ADD(anonvmstats.getpages[19]); 2201 conpp = NULL; 2202 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { 2203 page_downgrade(ppa[pg_idx]); 2204 } 2205 } 2206 ASSERT(conpp == NULL); 2207 2208 if (brkcow == 0 || (*protp & PROT_WRITE)) { 2209 VM_STAT_ADD(anonvmstats.getpages[20]); 2210 return (0); 2211 } 2212 2213 if (szc < seg->s_szc) 2214 panic("anon_map_getpages: cowfault for szc %d", szc); 2215 2216 VM_STAT_ADD(anonvmstats.getpages[21]); 2217 2218 *protp = PROT_ALL; 2219 return (anon_map_privatepages(amp, start_idx, szc, seg, addr, prot, 2220 ppa, vpage, anypgsz, pgflags, cred)); 2221 io_err: 2222 /* 2223 * We got an IO error somewhere in our large page. 2224 * If we were using a preallocated page then just demote 2225 * all the constituent pages that we've succeeded with sofar 2226 * to PAGESIZE pages and leave them in the system 2227 * unlocked. 2228 */ 2229 2230 ASSERT(err != -2 || ((pg_idx == 0) && upsize)); 2231 2232 VM_STAT_COND_ADD(err > 0, anonvmstats.getpages[22]); 2233 VM_STAT_COND_ADD(err == -1, anonvmstats.getpages[23]); 2234 VM_STAT_COND_ADD(err == -2, anonvmstats.getpages[24]); 2235 2236 if (prealloc) { 2237 conpp = NULL; 2238 if (pg_idx > 0) { 2239 VM_STAT_ADD(anonvmstats.getpages[25]); 2240 for (i = 0; i < pgcnt; i++) { 2241 pp = ppa[i]; 2242 ASSERT(PAGE_EXCL(pp)); 2243 ASSERT(pp->p_szc == szc); 2244 pp->p_szc = 0; 2245 } 2246 for (i = 0; i < pg_idx; i++) { 2247 ASSERT(!hat_page_is_mapped(ppa[i])); 2248 page_unlock(ppa[i]); 2249 } 2250 /* 2251 * Now free up the remaining unused constituent 2252 * pages. 
2253 */ 2254 while (pg_idx < pgcnt) { 2255 ASSERT(!hat_page_is_mapped(ppa[pg_idx])); 2256 page_free(ppa[pg_idx], 0); 2257 pg_idx++; 2258 } 2259 } else { 2260 VM_STAT_ADD(anonvmstats.getpages[26]); 2261 page_free_pages(ppa[0]); 2262 } 2263 } else { 2264 VM_STAT_ADD(anonvmstats.getpages[27]); 2265 ASSERT(err > 0); 2266 for (i = 0; i < pg_idx; i++) 2267 page_unlock(ppa[i]); 2268 } 2269 ASSERT(conpp == NULL); 2270 if (err != -1) 2271 return (err); 2272 /* 2273 * we are here because we failed to relocate. 2274 */ 2275 ASSERT(prealloc); 2276 if (brkcow == 0 || szc < seg->s_szc || 2277 !anon_szcshare(amp->ahp, start_idx)) { 2278 VM_STAT_ADD(anonvmstats.getpages[28]); 2279 return (-1); 2280 } 2281 VM_STAT_ADD(anonvmstats.getpages[29]); 2282 goto docow; 2283 } 2284 2285 2286 /* 2287 * Turn a reference to an object or shared anon page 2288 * into a private page with a copy of the data from the 2289 * original page which is always locked by the caller. 2290 * This routine unloads the translation and unlocks the 2291 * original page, if it isn't being stolen, before returning 2292 * to the caller. 2293 * 2294 * NOTE: The original anon slot is not freed by this routine 2295 * It must be freed by the caller while holding the 2296 * "anon_map" lock to prevent races which can occur if 2297 * a process has multiple lwps in its address space. 
2298 */ 2299 page_t * 2300 anon_private( 2301 struct anon **app, 2302 struct seg *seg, 2303 caddr_t addr, 2304 uint_t prot, 2305 page_t *opp, 2306 int oppflags, 2307 struct cred *cred) 2308 { 2309 struct anon *old = *app; 2310 struct anon *new; 2311 page_t *pp = NULL; 2312 struct vnode *vp; 2313 anoff_t off; 2314 page_t *anon_pl[1 + 1]; 2315 int err; 2316 2317 if (oppflags & STEAL_PAGE) 2318 ASSERT(PAGE_EXCL(opp)); 2319 else 2320 ASSERT(PAGE_LOCKED(opp)); 2321 2322 CPU_STATS_ADD_K(vm, cow_fault, 1); 2323 2324 /* Kernel probe */ 2325 TNF_PROBE_1(anon_private, "vm pagefault", /* CSTYLED */, 2326 tnf_opaque, address, addr); 2327 2328 *app = new = anon_alloc(NULL, 0); 2329 swap_xlate(new, &vp, &off); 2330 2331 if (oppflags & STEAL_PAGE) { 2332 page_rename(opp, vp, (u_offset_t)off); 2333 pp = opp; 2334 TRACE_5(TR_FAC_VM, TR_ANON_PRIVATE, 2335 "anon_private:seg %p addr %x pp %p vp %p off %lx", 2336 seg, addr, pp, vp, off); 2337 hat_setmod(pp); 2338 2339 /* bug 4026339 */ 2340 page_downgrade(pp); 2341 return (pp); 2342 } 2343 2344 /* 2345 * Call the VOP_GETPAGE routine to create the page, thereby 2346 * enabling the vnode driver to allocate any filesystem 2347 * space (e.g., disk block allocation for UFS). This also 2348 * prevents more than one page from being added to the 2349 * vnode at the same time. 2350 */ 2351 err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, NULL, 2352 anon_pl, PAGESIZE, seg, addr, S_CREATE, cred, NULL); 2353 if (err) 2354 goto out; 2355 2356 pp = anon_pl[0]; 2357 2358 /* 2359 * If the original page was locked, we need to move the lock 2360 * to the new page by transfering 'cowcnt/lckcnt' of the original 2361 * page to 'cowcnt/lckcnt' of the new page. 2362 * 2363 * See Statement at the beginning of segvn_lockop() and 2364 * comments in page_pp_useclaim() regarding the way 2365 * cowcnts/lckcnts are handled. 2366 * 2367 * Also availrmem must be decremented up front for read only mapping 2368 * before calling page_pp_useclaim. 
page_pp_useclaim will bump it back 2369 * if availrmem did not need to be decremented after all. 2370 */ 2371 if (oppflags & LOCK_PAGE) { 2372 if ((prot & PROT_WRITE) == 0) { 2373 mutex_enter(&freemem_lock); 2374 if (availrmem > pages_pp_maximum) { 2375 availrmem--; 2376 pages_useclaim++; 2377 } else { 2378 mutex_exit(&freemem_lock); 2379 goto out; 2380 } 2381 mutex_exit(&freemem_lock); 2382 } 2383 page_pp_useclaim(opp, pp, prot & PROT_WRITE); 2384 } 2385 2386 /* 2387 * Now copy the contents from the original page, 2388 * which is locked and loaded in the MMU by 2389 * the caller to prevent yet another page fault. 2390 */ 2391 /* XXX - should set mod bit in here */ 2392 if (ppcopy(opp, pp) == 0) { 2393 /* 2394 * Before ppcopy could hanlde UE or other faults, we 2395 * would have panicked here, and still have no option 2396 * but to do so now. 2397 */ 2398 panic("anon_private, ppcopy failed, opp = 0x%p, pp = 0x%p", 2399 (void *)opp, (void *)pp); 2400 } 2401 2402 hat_setrefmod(pp); /* mark as modified */ 2403 2404 /* 2405 * Unload the old translation. 2406 */ 2407 hat_unload(seg->s_as->a_hat, addr, PAGESIZE, HAT_UNLOAD); 2408 2409 /* 2410 * Free unmapped, unmodified original page. 2411 * or release the lock on the original page, 2412 * otherwise the process will sleep forever in 2413 * anon_decref() waiting for the "exclusive" lock 2414 * on the page. 2415 */ 2416 (void) page_release(opp, 1); 2417 2418 /* 2419 * we are done with page creation so downgrade the new 2420 * page's selock to shared, this helps when multiple 2421 * as_fault(...SOFTLOCK...) are done to the same 2422 * page(aio) 2423 */ 2424 page_downgrade(pp); 2425 2426 /* 2427 * NOTE: The original anon slot must be freed by the 2428 * caller while holding the "anon_map" lock, if we 2429 * copied away from an anonymous page. 
2430 */ 2431 return (pp); 2432 2433 out: 2434 *app = old; 2435 if (pp) 2436 page_unlock(pp); 2437 anon_decref(new); 2438 page_unlock(opp); 2439 return ((page_t *)NULL); 2440 } 2441 2442 int 2443 anon_map_privatepages( 2444 struct anon_map *amp, 2445 ulong_t start_idx, 2446 uint_t szc, 2447 struct seg *seg, 2448 caddr_t addr, 2449 uint_t prot, 2450 page_t *ppa[], 2451 struct vpage vpage[], 2452 int anypgsz, 2453 int pgflags, 2454 struct cred *cred) 2455 { 2456 pgcnt_t pgcnt; 2457 struct vnode *vp; 2458 anoff_t off; 2459 page_t *pl[2], *conpp = NULL; 2460 int err; 2461 int prealloc = 1; 2462 struct anon *ap, *oldap; 2463 caddr_t vaddr; 2464 page_t *pplist, *pp; 2465 ulong_t pg_idx, an_idx; 2466 spgcnt_t nreloc = 0; 2467 int pagelock = 0; 2468 kmutex_t *ahmpages = NULL; 2469 #ifdef DEBUG 2470 int refcnt; 2471 #endif 2472 2473 ASSERT(szc != 0); 2474 ASSERT(szc == seg->s_szc); 2475 2476 VM_STAT_ADD(anonvmstats.privatepages[0]); 2477 2478 pgcnt = page_get_pagecnt(szc); 2479 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 2480 ASSERT(IS_P2ALIGNED(start_idx, pgcnt)); 2481 2482 ASSERT(amp != NULL); 2483 ap = anon_get_ptr(amp->ahp, start_idx); 2484 ASSERT(ap == NULL || ap->an_refcnt >= 1); 2485 2486 VM_STAT_COND_ADD(ap == NULL, anonvmstats.privatepages[1]); 2487 2488 /* 2489 * Now try and allocate the large page. If we fail then just 2490 * let VOP_GETPAGE give us PAGESIZE pages. Normally we let 2491 * the caller make this decision but to avoid added complexity 2492 * it's simplier to handle that case here. 2493 */ 2494 if (anypgsz == -1) { 2495 VM_STAT_ADD(anonvmstats.privatepages[2]); 2496 prealloc = 0; 2497 } else if (page_alloc_pages(anon_vp, seg, addr, &pplist, NULL, szc, 2498 anypgsz, pgflags) != 0) { 2499 VM_STAT_ADD(anonvmstats.privatepages[3]); 2500 prealloc = 0; 2501 } 2502 2503 /* 2504 * make the decrement of all refcnts of all 2505 * anon slots of a large page appear atomic by 2506 * getting an anonpages_hash_lock for the 2507 * first anon slot of a large page. 
2508 */ 2509 if (ap != NULL) { 2510 ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, 2511 ap->an_off)]; 2512 mutex_enter(ahmpages); 2513 if (ap->an_refcnt == 1) { 2514 VM_STAT_ADD(anonvmstats.privatepages[4]); 2515 ASSERT(!anon_share(amp->ahp, start_idx, pgcnt)); 2516 mutex_exit(ahmpages); 2517 2518 if (prealloc) { 2519 page_free_replacement_page(pplist); 2520 page_create_putback(pgcnt); 2521 } 2522 ASSERT(ppa[0]->p_szc <= szc); 2523 if (ppa[0]->p_szc == szc) { 2524 VM_STAT_ADD(anonvmstats.privatepages[5]); 2525 return (0); 2526 } 2527 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { 2528 ASSERT(ppa[pg_idx] != NULL); 2529 page_unlock(ppa[pg_idx]); 2530 } 2531 return (-1); 2532 } 2533 } 2534 2535 /* 2536 * If we are passed in the vpage array and this is 2537 * not PROT_WRITE then we need to decrement availrmem 2538 * up front before we try anything. If we need to and 2539 * can't decrement availrmem then its better to fail now 2540 * than in the middle of processing the new large page. 2541 * page_pp_usclaim() on behalf of each constituent page 2542 * below will adjust availrmem back for the cases not needed. 
2543 */ 2544 if (vpage != NULL && (prot & PROT_WRITE) == 0) { 2545 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { 2546 if (VPP_ISPPLOCK(&vpage[pg_idx])) { 2547 pagelock = 1; 2548 break; 2549 } 2550 } 2551 if (pagelock) { 2552 VM_STAT_ADD(anonvmstats.privatepages[6]); 2553 mutex_enter(&freemem_lock); 2554 if (availrmem >= pages_pp_maximum + pgcnt) { 2555 availrmem -= pgcnt; 2556 pages_useclaim += pgcnt; 2557 } else { 2558 VM_STAT_ADD(anonvmstats.privatepages[7]); 2559 mutex_exit(&freemem_lock); 2560 if (ahmpages != NULL) { 2561 mutex_exit(ahmpages); 2562 } 2563 if (prealloc) { 2564 page_free_replacement_page(pplist); 2565 page_create_putback(pgcnt); 2566 } 2567 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) 2568 if (ppa[pg_idx] != NULL) 2569 page_unlock(ppa[pg_idx]); 2570 return (ENOMEM); 2571 } 2572 mutex_exit(&freemem_lock); 2573 } 2574 } 2575 2576 CPU_STATS_ADD_K(vm, cow_fault, pgcnt); 2577 2578 VM_STAT_ADD(anonvmstats.privatepages[8]); 2579 2580 an_idx = start_idx; 2581 pg_idx = 0; 2582 vaddr = addr; 2583 for (; pg_idx < pgcnt; pg_idx++, an_idx++, vaddr += PAGESIZE) { 2584 ASSERT(ppa[pg_idx] != NULL); 2585 oldap = anon_get_ptr(amp->ahp, an_idx); 2586 ASSERT(ahmpages != NULL || oldap == NULL); 2587 ASSERT(ahmpages == NULL || oldap != NULL); 2588 ASSERT(ahmpages == NULL || oldap->an_refcnt > 1); 2589 ASSERT(ahmpages == NULL || pg_idx != 0 || 2590 (refcnt = oldap->an_refcnt)); 2591 ASSERT(ahmpages == NULL || pg_idx == 0 || 2592 refcnt == oldap->an_refcnt); 2593 2594 ap = anon_alloc(NULL, 0); 2595 2596 swap_xlate(ap, &vp, &off); 2597 2598 /* 2599 * Now setup our preallocated page to pass down to 2600 * swap_getpage(). 2601 */ 2602 if (prealloc) { 2603 pp = pplist; 2604 page_sub(&pplist, pp); 2605 conpp = pp; 2606 } 2607 2608 err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE, NULL, pl, 2609 PAGESIZE, conpp, NULL, &nreloc, seg, vaddr, 2610 S_CREATE, cred); 2611 2612 /* 2613 * Impossible to fail this is S_CREATE. 
2614 */ 2615 if (err) 2616 panic("anon_map_privatepages: VOP_GETPAGE failed"); 2617 2618 ASSERT(prealloc ? pp == pl[0] : pl[0]->p_szc == 0); 2619 ASSERT(prealloc == 0 || nreloc == 1); 2620 2621 pp = pl[0]; 2622 2623 /* 2624 * If the original page was locked, we need to move 2625 * the lock to the new page by transfering 2626 * 'cowcnt/lckcnt' of the original page to 'cowcnt/lckcnt' 2627 * of the new page. pg_idx can be used to index 2628 * into the vpage array since the caller will guarentee 2629 * that vpage struct passed in corresponds to addr 2630 * and forward. 2631 */ 2632 if (vpage != NULL && VPP_ISPPLOCK(&vpage[pg_idx])) { 2633 page_pp_useclaim(ppa[pg_idx], pp, prot & PROT_WRITE); 2634 } else if (pagelock) { 2635 mutex_enter(&freemem_lock); 2636 availrmem++; 2637 pages_useclaim--; 2638 mutex_exit(&freemem_lock); 2639 } 2640 2641 /* 2642 * Now copy the contents from the original page. 2643 */ 2644 if (ppcopy(ppa[pg_idx], pp) == 0) { 2645 /* 2646 * Before ppcopy could hanlde UE or other faults, we 2647 * would have panicked here, and still have no option 2648 * but to do so now. 2649 */ 2650 panic("anon_map_privatepages, ppcopy failed"); 2651 } 2652 2653 hat_setrefmod(pp); /* mark as modified */ 2654 2655 /* 2656 * Release the lock on the original page, 2657 * derement the old slot, and down grade the lock 2658 * on the new copy. 2659 */ 2660 page_unlock(ppa[pg_idx]); 2661 2662 if (!prealloc) 2663 page_downgrade(pp); 2664 2665 ppa[pg_idx] = pp; 2666 2667 /* 2668 * Now reflect the copy in the new anon array. 2669 */ 2670 ASSERT(ahmpages == NULL || oldap->an_refcnt > 1); 2671 if (oldap != NULL) 2672 anon_decref(oldap); 2673 (void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP); 2674 } 2675 2676 /* 2677 * Unload the old large page translation. 
2678 */ 2679 hat_unload(seg->s_as->a_hat, addr, pgcnt << PAGESHIFT, HAT_UNLOAD); 2680 2681 if (ahmpages != NULL) { 2682 mutex_exit(ahmpages); 2683 } 2684 ASSERT(prealloc == 0 || pplist == NULL); 2685 if (prealloc) { 2686 VM_STAT_ADD(anonvmstats.privatepages[9]); 2687 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { 2688 page_downgrade(ppa[pg_idx]); 2689 } 2690 } 2691 2692 return (0); 2693 } 2694 2695 /* 2696 * Allocate a private zero-filled anon page. 2697 */ 2698 page_t * 2699 anon_zero(struct seg *seg, caddr_t addr, struct anon **app, struct cred *cred) 2700 { 2701 struct anon *ap; 2702 page_t *pp; 2703 struct vnode *vp; 2704 anoff_t off; 2705 page_t *anon_pl[1 + 1]; 2706 int err; 2707 2708 /* Kernel probe */ 2709 TNF_PROBE_1(anon_zero, "vm pagefault", /* CSTYLED */, 2710 tnf_opaque, address, addr); 2711 2712 *app = ap = anon_alloc(NULL, 0); 2713 swap_xlate(ap, &vp, &off); 2714 2715 /* 2716 * Call the VOP_GETPAGE routine to create the page, thereby 2717 * enabling the vnode driver to allocate any filesystem 2718 * dependent structures (e.g., disk block allocation for UFS). 2719 * This also prevents more than on page from being added to 2720 * the vnode at the same time since it is locked. 2721 */ 2722 err = VOP_GETPAGE(vp, off, PAGESIZE, NULL, 2723 anon_pl, PAGESIZE, seg, addr, S_CREATE, cred, NULL); 2724 if (err) { 2725 *app = NULL; 2726 anon_decref(ap); 2727 return (NULL); 2728 } 2729 pp = anon_pl[0]; 2730 2731 pagezero(pp, 0, PAGESIZE); /* XXX - should set mod bit */ 2732 page_downgrade(pp); 2733 CPU_STATS_ADD_K(vm, zfod, 1); 2734 hat_setrefmod(pp); /* mark as modified so pageout writes back */ 2735 return (pp); 2736 } 2737 2738 2739 /* 2740 * Allocate array of private zero-filled anon pages for empty slots 2741 * and kept pages for non empty slots within given range. 2742 * 2743 * NOTE: This rontine will try and use large pages 2744 * if available and supported by underlying platform. 
2745 */ 2746 int 2747 anon_map_createpages( 2748 struct anon_map *amp, 2749 ulong_t start_index, 2750 size_t len, 2751 page_t *ppa[], 2752 struct seg *seg, 2753 caddr_t addr, 2754 enum seg_rw rw, 2755 struct cred *cred) 2756 { 2757 2758 struct anon *ap; 2759 struct vnode *ap_vp; 2760 page_t *pp, *pplist, *anon_pl[1 + 1], *conpp = NULL; 2761 int err = 0; 2762 ulong_t p_index, index; 2763 pgcnt_t npgs, pg_cnt; 2764 spgcnt_t nreloc = 0; 2765 uint_t l_szc, szc, prot; 2766 anoff_t ap_off; 2767 size_t pgsz; 2768 lgrp_t *lgrp; 2769 kmutex_t *ahm; 2770 2771 /* 2772 * XXX For now only handle S_CREATE. 2773 */ 2774 ASSERT(rw == S_CREATE); 2775 2776 index = start_index; 2777 p_index = 0; 2778 npgs = btopr(len); 2779 2780 /* 2781 * If this platform supports multiple page sizes 2782 * then try and allocate directly from the free 2783 * list for pages larger than PAGESIZE. 2784 * 2785 * NOTE:When we have page_create_ru we can stop 2786 * directly allocating from the freelist. 2787 */ 2788 l_szc = seg->s_szc; 2789 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 2790 while (npgs) { 2791 2792 /* 2793 * if anon slot already exists 2794 * (means page has been created) 2795 * so 1) look up the page 2796 * 2) if the page is still in memory, get it. 2797 * 3) if not, create a page and 2798 * page in from physical swap device. 2799 * These are done in anon_getpage(). 2800 */ 2801 ap = anon_get_ptr(amp->ahp, index); 2802 if (ap) { 2803 err = anon_getpage(&ap, &prot, anon_pl, PAGESIZE, 2804 seg, addr, S_READ, cred); 2805 if (err) { 2806 ANON_LOCK_EXIT(&->a_rwlock); 2807 panic("anon_map_createpages: anon_getpage"); 2808 } 2809 pp = anon_pl[0]; 2810 ppa[p_index++] = pp; 2811 2812 /* 2813 * an_pvp can become non-NULL after SysV's page was 2814 * paged out before ISM was attached to this SysV 2815 * shared memory segment. So free swap slot if needed. 
2816 */ 2817 if (ap->an_pvp != NULL) { 2818 page_io_lock(pp); 2819 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, 2820 ap->an_off)]; 2821 mutex_enter(ahm); 2822 if (ap->an_pvp != NULL) { 2823 swap_phys_free(ap->an_pvp, 2824 ap->an_poff, PAGESIZE); 2825 ap->an_pvp = NULL; 2826 ap->an_poff = 0; 2827 mutex_exit(ahm); 2828 hat_setmod(pp); 2829 } else { 2830 mutex_exit(ahm); 2831 } 2832 page_io_unlock(pp); 2833 } 2834 2835 addr += PAGESIZE; 2836 index++; 2837 npgs--; 2838 continue; 2839 } 2840 /* 2841 * Now try and allocate the largest page possible 2842 * for the current address and range. 2843 * Keep dropping down in page size until: 2844 * 2845 * 1) Properly aligned 2846 * 2) Does not overlap existing anon pages 2847 * 3) Fits in remaining range. 2848 * 4) able to allocate one. 2849 * 2850 * NOTE: XXX When page_create_ru is completed this code 2851 * will change. 2852 */ 2853 szc = l_szc; 2854 pplist = NULL; 2855 pg_cnt = 0; 2856 while (szc) { 2857 pgsz = page_get_pagesize(szc); 2858 pg_cnt = pgsz >> PAGESHIFT; 2859 if (IS_P2ALIGNED(addr, pgsz) && pg_cnt <= npgs && 2860 anon_pages(amp->ahp, index, pg_cnt) == 0) { 2861 /* 2862 * XXX 2863 * Since we are faking page_create() 2864 * we also need to do the freemem and 2865 * pcf accounting. 2866 */ 2867 (void) page_create_wait(pg_cnt, PG_WAIT); 2868 2869 /* 2870 * Get lgroup to allocate next page of shared 2871 * memory from and use it to specify where to 2872 * allocate the physical memory 2873 */ 2874 lgrp = lgrp_mem_choose(seg, addr, pgsz); 2875 2876 pplist = page_get_freelist( 2877 anon_vp, (u_offset_t)0, seg, 2878 addr, pgsz, 0, lgrp); 2879 2880 if (pplist == NULL) { 2881 page_create_putback(pg_cnt); 2882 } 2883 2884 /* 2885 * If a request for a page of size 2886 * larger than PAGESIZE failed 2887 * then don't try that size anymore. 
2888 */ 2889 if (pplist == NULL) { 2890 l_szc = szc - 1; 2891 } else { 2892 break; 2893 } 2894 } 2895 szc--; 2896 } 2897 2898 /* 2899 * If just using PAGESIZE pages then don't 2900 * directly allocate from the free list. 2901 */ 2902 if (pplist == NULL) { 2903 ASSERT(szc == 0); 2904 pp = anon_zero(seg, addr, &ap, cred); 2905 if (pp == NULL) { 2906 ANON_LOCK_EXIT(&->a_rwlock); 2907 panic("anon_map_createpages: anon_zero"); 2908 } 2909 ppa[p_index++] = pp; 2910 2911 ASSERT(anon_get_ptr(amp->ahp, index) == NULL); 2912 (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP); 2913 2914 addr += PAGESIZE; 2915 index++; 2916 npgs--; 2917 continue; 2918 } 2919 2920 /* 2921 * pplist is a list of pg_cnt PAGESIZE pages. 2922 * These pages are locked SE_EXCL since they 2923 * came directly off the free list. 2924 */ 2925 ASSERT(IS_P2ALIGNED(pg_cnt, pg_cnt)); 2926 ASSERT(IS_P2ALIGNED(index, pg_cnt)); 2927 ASSERT(conpp == NULL); 2928 while (pg_cnt--) { 2929 2930 ap = anon_alloc(NULL, 0); 2931 swap_xlate(ap, &ap_vp, &ap_off); 2932 2933 ASSERT(pplist != NULL); 2934 pp = pplist; 2935 page_sub(&pplist, pp); 2936 PP_CLRFREE(pp); 2937 PP_CLRAGED(pp); 2938 conpp = pp; 2939 2940 err = swap_getconpage(ap_vp, ap_off, PAGESIZE, 2941 (uint_t *)NULL, anon_pl, PAGESIZE, conpp, NULL, 2942 &nreloc, seg, addr, S_CREATE, cred); 2943 2944 if (err) { 2945 ANON_LOCK_EXIT(&->a_rwlock); 2946 panic("anon_map_createpages: S_CREATE"); 2947 } 2948 2949 ASSERT(anon_pl[0] == pp); 2950 ASSERT(nreloc == 1); 2951 pagezero(pp, 0, PAGESIZE); 2952 CPU_STATS_ADD_K(vm, zfod, 1); 2953 hat_setrefmod(pp); 2954 2955 ASSERT(anon_get_ptr(amp->ahp, index) == NULL); 2956 (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP); 2957 2958 ppa[p_index++] = pp; 2959 2960 addr += PAGESIZE; 2961 index++; 2962 npgs--; 2963 } 2964 conpp = NULL; 2965 pg_cnt = pgsz >> PAGESHIFT; 2966 p_index = p_index - pg_cnt; 2967 while (pg_cnt--) { 2968 page_downgrade(ppa[p_index++]); 2969 } 2970 } 2971 ANON_LOCK_EXIT(&->a_rwlock); 2972 return (0); 2973 
} 2974 2975 static int 2976 anon_try_demote_pages( 2977 struct anon_hdr *ahp, 2978 ulong_t sidx, 2979 uint_t szc, 2980 page_t **ppa, 2981 int private) 2982 { 2983 struct anon *ap; 2984 pgcnt_t pgcnt = page_get_pagecnt(szc); 2985 page_t *pp; 2986 pgcnt_t i; 2987 kmutex_t *ahmpages = NULL; 2988 int root = 0; 2989 pgcnt_t npgs; 2990 pgcnt_t curnpgs = 0; 2991 size_t ppasize = 0; 2992 2993 ASSERT(szc != 0); 2994 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 2995 ASSERT(IS_P2ALIGNED(sidx, pgcnt)); 2996 ASSERT(sidx < ahp->size); 2997 2998 if (ppa == NULL) { 2999 ppasize = pgcnt * sizeof (page_t *); 3000 ppa = kmem_alloc(ppasize, KM_SLEEP); 3001 } 3002 3003 ap = anon_get_ptr(ahp, sidx); 3004 if (ap != NULL && private) { 3005 VM_STAT_ADD(anonvmstats.demotepages[1]); 3006 ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 3007 mutex_enter(ahmpages); 3008 } 3009 3010 if (ap != NULL && ap->an_refcnt > 1) { 3011 if (ahmpages != NULL) { 3012 VM_STAT_ADD(anonvmstats.demotepages[2]); 3013 mutex_exit(ahmpages); 3014 } 3015 if (ppasize != 0) { 3016 kmem_free(ppa, ppasize); 3017 } 3018 return (0); 3019 } 3020 if (ahmpages != NULL) { 3021 mutex_exit(ahmpages); 3022 } 3023 if (ahp->size - sidx < pgcnt) { 3024 ASSERT(private == 0); 3025 pgcnt = ahp->size - sidx; 3026 } 3027 for (i = 0; i < pgcnt; i++, sidx++) { 3028 ap = anon_get_ptr(ahp, sidx); 3029 if (ap != NULL) { 3030 if (ap->an_refcnt != 1) { 3031 panic("anon_try_demote_pages: an_refcnt != 1"); 3032 } 3033 pp = ppa[i] = page_lookup(ap->an_vp, ap->an_off, 3034 SE_EXCL); 3035 if (pp != NULL) { 3036 (void) hat_pageunload(pp, 3037 HAT_FORCE_PGUNLOAD); 3038 } 3039 } else { 3040 ppa[i] = NULL; 3041 } 3042 } 3043 for (i = 0; i < pgcnt; i++) { 3044 if ((pp = ppa[i]) != NULL && pp->p_szc != 0) { 3045 ASSERT(pp->p_szc <= szc); 3046 if (!root) { 3047 VM_STAT_ADD(anonvmstats.demotepages[3]); 3048 if (curnpgs != 0) 3049 panic("anon_try_demote_pages: " 3050 "bad large page"); 3051 3052 root = 1; 3053 curnpgs = npgs = 3054 
page_get_pagecnt(pp->p_szc); 3055 3056 ASSERT(npgs <= pgcnt); 3057 ASSERT(IS_P2ALIGNED(npgs, npgs)); 3058 ASSERT(!(page_pptonum(pp) & (npgs - 1))); 3059 } else { 3060 ASSERT(i > 0); 3061 ASSERT(page_pptonum(pp) - 1 == 3062 page_pptonum(ppa[i - 1])); 3063 if ((page_pptonum(pp) & (npgs - 1)) == 3064 npgs - 1) 3065 root = 0; 3066 } 3067 ASSERT(PAGE_EXCL(pp)); 3068 pp->p_szc = 0; 3069 ASSERT(curnpgs > 0); 3070 curnpgs--; 3071 } 3072 } 3073 if (root != 0 || curnpgs != 0) 3074 panic("anon_try_demote_pages: bad large page"); 3075 3076 for (i = 0; i < pgcnt; i++) { 3077 if ((pp = ppa[i]) != NULL) { 3078 ASSERT(!hat_page_is_mapped(pp)); 3079 ASSERT(pp->p_szc == 0); 3080 page_unlock(pp); 3081 } 3082 } 3083 if (ppasize != 0) { 3084 kmem_free(ppa, ppasize); 3085 } 3086 return (1); 3087 } 3088 3089 /* 3090 * anon_map_demotepages() can only be called by MAP_PRIVATE segments. 3091 */ 3092 int 3093 anon_map_demotepages( 3094 struct anon_map *amp, 3095 ulong_t start_idx, 3096 struct seg *seg, 3097 caddr_t addr, 3098 uint_t prot, 3099 struct vpage vpage[], 3100 struct cred *cred) 3101 { 3102 struct anon *ap; 3103 uint_t szc = seg->s_szc; 3104 pgcnt_t pgcnt = page_get_pagecnt(szc); 3105 size_t ppasize = pgcnt * sizeof (page_t *); 3106 page_t **ppa = kmem_alloc(ppasize, KM_SLEEP); 3107 page_t *pp; 3108 page_t *pl[2]; 3109 pgcnt_t i, pg_idx; 3110 ulong_t an_idx; 3111 caddr_t vaddr; 3112 int err; 3113 int retry = 0; 3114 uint_t vpprot; 3115 3116 ASSERT(RW_WRITE_HELD(&->a_rwlock)); 3117 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 3118 ASSERT(IS_P2ALIGNED(start_idx, pgcnt)); 3119 ASSERT(ppa != NULL); 3120 ASSERT(szc != 0); 3121 ASSERT(szc == amp->a_szc); 3122 3123 VM_STAT_ADD(anonvmstats.demotepages[0]); 3124 3125 top: 3126 if (anon_try_demote_pages(amp->ahp, start_idx, szc, ppa, 1)) { 3127 kmem_free(ppa, ppasize); 3128 return (0); 3129 } 3130 3131 VM_STAT_ADD(anonvmstats.demotepages[4]); 3132 3133 ASSERT(retry == 0); /* we can be here only once */ 3134 3135 vaddr = addr; 3136 for (pg_idx = 0, 
an_idx = start_idx; pg_idx < pgcnt; 3137 pg_idx++, an_idx++, vaddr += PAGESIZE) { 3138 ap = anon_get_ptr(amp->ahp, an_idx); 3139 if (ap == NULL) 3140 panic("anon_map_demotepages: no anon slot"); 3141 err = anon_getpage(&ap, &vpprot, pl, PAGESIZE, seg, vaddr, 3142 S_READ, cred); 3143 if (err) { 3144 for (i = 0; i < pg_idx; i++) { 3145 if ((pp = ppa[i]) != NULL) 3146 page_unlock(pp); 3147 } 3148 kmem_free(ppa, ppasize); 3149 return (err); 3150 } 3151 ppa[pg_idx] = pl[0]; 3152 } 3153 3154 err = anon_map_privatepages(amp, start_idx, szc, seg, addr, prot, ppa, 3155 vpage, -1, 0, cred); 3156 if (err > 0) { 3157 VM_STAT_ADD(anonvmstats.demotepages[5]); 3158 kmem_free(ppa, ppasize); 3159 return (err); 3160 } 3161 ASSERT(err == 0 || err == -1); 3162 if (err == -1) { 3163 VM_STAT_ADD(anonvmstats.demotepages[6]); 3164 retry = 1; 3165 goto top; 3166 } 3167 for (i = 0; i < pgcnt; i++) { 3168 ASSERT(ppa[i] != NULL); 3169 if (ppa[i]->p_szc != 0) 3170 retry = 1; 3171 page_unlock(ppa[i]); 3172 } 3173 if (retry) { 3174 VM_STAT_ADD(anonvmstats.demotepages[7]); 3175 goto top; 3176 } 3177 3178 VM_STAT_ADD(anonvmstats.demotepages[8]); 3179 3180 kmem_free(ppa, ppasize); 3181 3182 return (0); 3183 } 3184 3185 /* 3186 * Free pages of shared anon map. It's assumed that anon maps don't share anon 3187 * structures with private anon maps. Therefore all anon structures should 3188 * have at most one reference at this point. This means underlying pages can 3189 * be exclusively locked and demoted or freed. If not freeing the entire 3190 * large pages demote the ends of the region we free to be able to free 3191 * subpages. Page roots correspond to aligned index positions in anon map. 
3192 */ 3193 void 3194 anon_shmap_free_pages(struct anon_map *amp, ulong_t sidx, size_t len) 3195 { 3196 ulong_t eidx = sidx + btopr(len); 3197 pgcnt_t pages = page_get_pagecnt(amp->a_szc); 3198 struct anon_hdr *ahp = amp->ahp; 3199 ulong_t tidx; 3200 size_t size; 3201 ulong_t sidx_aligned; 3202 ulong_t eidx_aligned; 3203 3204 ASSERT(ANON_WRITE_HELD(&->a_rwlock)); 3205 ASSERT(amp->refcnt <= 1); 3206 ASSERT(amp->a_szc > 0); 3207 ASSERT(eidx <= ahp->size); 3208 ASSERT(!anon_share(ahp, sidx, btopr(len))); 3209 3210 if (len == 0) { /* XXX */ 3211 return; 3212 } 3213 3214 sidx_aligned = P2ALIGN(sidx, pages); 3215 if (sidx_aligned != sidx || 3216 (eidx < sidx_aligned + pages && eidx < ahp->size)) { 3217 if (!anon_try_demote_pages(ahp, sidx_aligned, 3218 amp->a_szc, NULL, 0)) { 3219 panic("anon_shmap_free_pages: demote failed"); 3220 } 3221 size = (eidx <= sidx_aligned + pages) ? (eidx - sidx) : 3222 P2NPHASE(sidx, pages); 3223 size <<= PAGESHIFT; 3224 anon_free(ahp, sidx, size); 3225 sidx = sidx_aligned + pages; 3226 if (eidx <= sidx) { 3227 return; 3228 } 3229 } 3230 eidx_aligned = P2ALIGN(eidx, pages); 3231 if (sidx < eidx_aligned) { 3232 anon_free_pages(ahp, sidx, 3233 (eidx_aligned - sidx) << PAGESHIFT, 3234 amp->a_szc); 3235 sidx = eidx_aligned; 3236 } 3237 ASSERT(sidx == eidx_aligned); 3238 if (eidx == eidx_aligned) { 3239 return; 3240 } 3241 tidx = eidx; 3242 if (eidx != ahp->size && anon_get_next_ptr(ahp, &tidx) != NULL && 3243 tidx - sidx < pages) { 3244 if (!anon_try_demote_pages(ahp, sidx, amp->a_szc, NULL, 0)) { 3245 panic("anon_shmap_free_pages: demote failed"); 3246 } 3247 size = (eidx - sidx) << PAGESHIFT; 3248 anon_free(ahp, sidx, size); 3249 } else { 3250 anon_free_pages(ahp, sidx, pages << PAGESHIFT, amp->a_szc); 3251 } 3252 } 3253 3254 /* 3255 * This routine should be called with amp's writer lock when there're no other 3256 * users of amp. All pcache entries of this amp must have been already 3257 * inactivated. 
We must not drop a_rwlock here to prevent new users from 3258 * attaching to this amp. 3259 */ 3260 void 3261 anonmap_purge(struct anon_map *amp) 3262 { 3263 ASSERT(ANON_WRITE_HELD(&->a_rwlock)); 3264 ASSERT(amp->refcnt <= 1); 3265 3266 if (amp->a_softlockcnt != 0) { 3267 seg_ppurge(NULL, amp, 0); 3268 } 3269 3270 /* 3271 * Since all pcache entries were already inactive before this routine 3272 * was called seg_ppurge() couldn't return while there're still 3273 * entries that can be found via the list anchored at a_phead. So we 3274 * can assert this list is empty now. a_softlockcnt may be still non 0 3275 * if asynchronous thread that manages pcache already removed pcache 3276 * entries but hasn't unlocked the pages yet. If a_softlockcnt is non 3277 * 0 we just wait on a_purgecv for shamp_reclaim() to finish. Even if 3278 * a_softlockcnt is 0 we grab a_purgemtx to avoid freeing anon map 3279 * before shamp_reclaim() is done with it. a_purgemtx also taken by 3280 * shamp_reclaim() while a_softlockcnt was still not 0 acts as a 3281 * barrier that prevents anonmap_purge() to complete while 3282 * shamp_reclaim() may still be referencing this amp. 3283 */ 3284 ASSERT(amp->a_phead.p_lnext == &->a_phead); 3285 ASSERT(amp->a_phead.p_lprev == &->a_phead); 3286 3287 mutex_enter(&->a_purgemtx); 3288 while (amp->a_softlockcnt != 0) { 3289 ASSERT(amp->a_phead.p_lnext == &->a_phead); 3290 ASSERT(amp->a_phead.p_lprev == &->a_phead); 3291 amp->a_purgewait = 1; 3292 cv_wait(&->a_purgecv, &->a_purgemtx); 3293 } 3294 mutex_exit(&->a_purgemtx); 3295 3296 ASSERT(amp->a_phead.p_lnext == &->a_phead); 3297 ASSERT(amp->a_phead.p_lprev == &->a_phead); 3298 ASSERT(amp->a_softlockcnt == 0); 3299 } 3300 3301 /* 3302 * Allocate and initialize an anon_map structure for seg 3303 * associating the given swap reservation with the new anon_map. 
3304 */ 3305 struct anon_map * 3306 anonmap_alloc(size_t size, size_t swresv, int flags) 3307 { 3308 struct anon_map *amp; 3309 int kmflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 3310 3311 amp = kmem_cache_alloc(anonmap_cache, kmflags); 3312 if (amp == NULL) { 3313 ASSERT(kmflags == KM_NOSLEEP); 3314 return (NULL); 3315 } 3316 3317 amp->ahp = anon_create(btopr(size), flags); 3318 if (amp->ahp == NULL) { 3319 ASSERT(flags == ANON_NOSLEEP); 3320 kmem_cache_free(anonmap_cache, amp); 3321 return (NULL); 3322 } 3323 amp->refcnt = 1; 3324 amp->size = size; 3325 amp->swresv = swresv; 3326 amp->locality = 0; 3327 amp->a_szc = 0; 3328 amp->a_sp = NULL; 3329 amp->a_softlockcnt = 0; 3330 amp->a_purgewait = 0; 3331 amp->a_phead.p_lnext = &->a_phead; 3332 amp->a_phead.p_lprev = &->a_phead; 3333 3334 return (amp); 3335 } 3336 3337 void 3338 anonmap_free(struct anon_map *amp) 3339 { 3340 ASSERT(amp->ahp != NULL); 3341 ASSERT(amp->refcnt == 0); 3342 ASSERT(amp->a_softlockcnt == 0); 3343 ASSERT(amp->a_phead.p_lnext == &->a_phead); 3344 ASSERT(amp->a_phead.p_lprev == &->a_phead); 3345 3346 lgrp_shm_policy_fini(amp, NULL); 3347 anon_release(amp->ahp, btopr(amp->size)); 3348 kmem_cache_free(anonmap_cache, amp); 3349 } 3350 3351 /* 3352 * Returns true if the app array has some empty slots. 3353 * The offp and lenp parameters are in/out parameters. On entry 3354 * these values represent the starting offset and length of the 3355 * mapping. When true is returned, these values may be modified 3356 * to be the largest range which includes empty slots. 
3357 */ 3358 int 3359 non_anon(struct anon_hdr *ahp, ulong_t anon_idx, u_offset_t *offp, 3360 size_t *lenp) 3361 { 3362 ulong_t i, el; 3363 ssize_t low, high; 3364 struct anon *ap; 3365 3366 low = -1; 3367 for (i = 0, el = *lenp; i < el; i += PAGESIZE, anon_idx++) { 3368 ap = anon_get_ptr(ahp, anon_idx); 3369 if (ap == NULL) { 3370 if (low == -1) 3371 low = i; 3372 high = i; 3373 } 3374 } 3375 if (low != -1) { 3376 /* 3377 * Found at least one non-anon page. 3378 * Set up the off and len return values. 3379 */ 3380 if (low != 0) 3381 *offp += low; 3382 *lenp = high - low + PAGESIZE; 3383 return (1); 3384 } 3385 return (0); 3386 } 3387 3388 /* 3389 * Return a count of the number of existing anon pages in the anon array 3390 * app in the range (off, off+len). The array and slots must be guaranteed 3391 * stable by the caller. 3392 */ 3393 pgcnt_t 3394 anon_pages(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots) 3395 { 3396 pgcnt_t cnt = 0; 3397 3398 while (nslots-- > 0) { 3399 if ((anon_get_ptr(ahp, anon_index)) != NULL) 3400 cnt++; 3401 anon_index++; 3402 } 3403 return (cnt); 3404 } 3405 3406 /* 3407 * Move reserved phys swap into memory swap (unreserve phys swap 3408 * and reserve mem swap by the same amount). 
3409 * Used by segspt when it needs to lock reserved swap npages in memory 3410 */ 3411 int 3412 anon_swap_adjust(pgcnt_t npages) 3413 { 3414 pgcnt_t unlocked_mem_swap; 3415 3416 mutex_enter(&anoninfo_lock); 3417 3418 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 3419 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 3420 3421 unlocked_mem_swap = k_anoninfo.ani_mem_resv 3422 - k_anoninfo.ani_locked_swap; 3423 if (npages > unlocked_mem_swap) { 3424 spgcnt_t adjusted_swap = npages - unlocked_mem_swap; 3425 3426 /* 3427 * if there is not enough unlocked mem swap we take missing 3428 * amount from phys swap and give it to mem swap 3429 */ 3430 if (!page_reclaim_mem(adjusted_swap, segspt_minfree, 1)) { 3431 mutex_exit(&anoninfo_lock); 3432 return (ENOMEM); 3433 } 3434 3435 k_anoninfo.ani_mem_resv += adjusted_swap; 3436 ASSERT(k_anoninfo.ani_phys_resv >= adjusted_swap); 3437 k_anoninfo.ani_phys_resv -= adjusted_swap; 3438 3439 ANI_ADD(adjusted_swap); 3440 } 3441 k_anoninfo.ani_locked_swap += npages; 3442 3443 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 3444 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 3445 3446 mutex_exit(&anoninfo_lock); 3447 3448 return (0); 3449 } 3450 3451 /* 3452 * 'unlocked' reserved mem swap so when it is unreserved it 3453 * can be moved back phys (disk) swap 3454 */ 3455 void 3456 anon_swap_restore(pgcnt_t npages) 3457 { 3458 mutex_enter(&anoninfo_lock); 3459 3460 ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv); 3461 3462 ASSERT(k_anoninfo.ani_locked_swap >= npages); 3463 k_anoninfo.ani_locked_swap -= npages; 3464 3465 ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv); 3466 3467 mutex_exit(&anoninfo_lock); 3468 } 3469 3470 /* 3471 * Return the pointer from the list for a 3472 * specified anon index. 
3473 */ 3474 ulong_t * 3475 anon_get_slot(struct anon_hdr *ahp, ulong_t an_idx) 3476 { 3477 struct anon **app; 3478 void **ppp; 3479 3480 ASSERT(an_idx < ahp->size); 3481 3482 /* 3483 * Single level case. 3484 */ 3485 if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 3486 return ((ulong_t *)&ahp->array_chunk[an_idx]); 3487 } else { 3488 3489 /* 3490 * 2 level case. 3491 */ 3492 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 3493 if (*ppp == NULL) { 3494 mutex_enter(&ahp->serial_lock); 3495 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 3496 if (*ppp == NULL) 3497 *ppp = kmem_zalloc(PAGESIZE, KM_SLEEP); 3498 mutex_exit(&ahp->serial_lock); 3499 } 3500 app = *ppp; 3501 return ((ulong_t *)&app[an_idx & ANON_CHUNK_OFF]); 3502 } 3503 } 3504 3505 void 3506 anon_array_enter(struct anon_map *amp, ulong_t an_idx, anon_sync_obj_t *sobj) 3507 { 3508 ulong_t *ap_slot; 3509 kmutex_t *mtx; 3510 kcondvar_t *cv; 3511 int hash; 3512 3513 /* 3514 * Use szc to determine anon slot(s) to appear atomic. 3515 * If szc = 0, then lock the anon slot and mark it busy. 3516 * If szc > 0, then lock the range of slots by getting the 3517 * anon_array_lock for the first anon slot, and mark only the 3518 * first anon slot busy to represent whole range being busy. 3519 */ 3520 3521 ASSERT(RW_READ_HELD(&->a_rwlock)); 3522 an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc)); 3523 hash = ANON_ARRAY_HASH(amp, an_idx); 3524 sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex; 3525 sobj->sync_cv = cv = &anon_array_cv[hash]; 3526 mutex_enter(mtx); 3527 ap_slot = anon_get_slot(amp->ahp, an_idx); 3528 while (ANON_ISBUSY(ap_slot)) 3529 cv_wait(cv, mtx); 3530 ANON_SETBUSY(ap_slot); 3531 sobj->sync_data = ap_slot; 3532 mutex_exit(mtx); 3533 } 3534 3535 int 3536 anon_array_try_enter(struct anon_map *amp, ulong_t an_idx, 3537 anon_sync_obj_t *sobj) 3538 { 3539 ulong_t *ap_slot; 3540 kmutex_t *mtx; 3541 int hash; 3542 3543 /* 3544 * Try to lock a range of anon slots. 
3545 * Use szc to determine anon slot(s) to appear atomic. 3546 * If szc = 0, then lock the anon slot and mark it busy. 3547 * If szc > 0, then lock the range of slots by getting the 3548 * anon_array_lock for the first anon slot, and mark only the 3549 * first anon slot busy to represent whole range being busy. 3550 * Fail if the mutex or the anon_array are busy. 3551 */ 3552 3553 ASSERT(RW_READ_HELD(&->a_rwlock)); 3554 an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc)); 3555 hash = ANON_ARRAY_HASH(amp, an_idx); 3556 sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex; 3557 sobj->sync_cv = &anon_array_cv[hash]; 3558 if (!mutex_tryenter(mtx)) { 3559 return (EWOULDBLOCK); 3560 } 3561 ap_slot = anon_get_slot(amp->ahp, an_idx); 3562 if (ANON_ISBUSY(ap_slot)) { 3563 mutex_exit(mtx); 3564 return (EWOULDBLOCK); 3565 } 3566 ANON_SETBUSY(ap_slot); 3567 sobj->sync_data = ap_slot; 3568 mutex_exit(mtx); 3569 return (0); 3570 } 3571 3572 void 3573 anon_array_exit(anon_sync_obj_t *sobj) 3574 { 3575 mutex_enter(sobj->sync_mutex); 3576 ASSERT(ANON_ISBUSY(sobj->sync_data)); 3577 ANON_CLRBUSY(sobj->sync_data); 3578 if (CV_HAS_WAITERS(sobj->sync_cv)) 3579 cv_broadcast(sobj->sync_cv); 3580 mutex_exit(sobj->sync_mutex); 3581 } 3582