1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 37 */ 38 39 #pragma ident "%Z%%M% %I% %E% SMI" 40 41 /* 42 * VM - anonymous pages. 43 * 44 * This layer sits immediately above the vm_swap layer. It manages 45 * physical pages that have no permanent identity in the file system 46 * name space, using the services of the vm_swap layer to allocate 47 * backing storage for these pages. Since these pages have no external 48 * identity, they are discarded when the last reference is removed. 
 *
 * An important function of this layer is to manage low-level sharing
 * of pages that are logically distinct but that happen to be
 * physically identical (e.g., the corresponding pages of the processes
 * resulting from a fork before one process or the other changes their
 * contents).  This pseudo-sharing is present only as an optimization
 * and is not to be confused with true sharing in which multiple
 * address spaces deliberately contain references to the same object;
 * such sharing is managed at a higher level.
 *
 * The key data structure here is the anon struct, which contains a
 * reference count for its associated physical page and a hint about
 * the identity of that page.  Anon structs typically live in arrays,
 * with an instance's position in its array determining where the
 * corresponding backing storage is allocated; however, the swap_xlate()
 * routine abstracts away this representation information so that the
 * rest of the anon layer need not know it.  (See the swap layer for
 * more details on anon struct layout.)
 *
 * In future versions of the system, the association between an
 * anon struct and its position on backing store will change so that
 * we don't require backing store for all anonymous pages in the system.
 * This is an important consideration for large memory systems.
 * We can also use this technique to delay binding physical locations
 * to anonymous pages until pageout/swapout time where we can make
 * smarter allocation decisions to improve anonymous klustering.
 *
 * Many of the routines defined here take a (struct anon **) argument,
 * which allows the code at this level to manage anon pages directly,
 * so that callers can regard anon structs as opaque objects and not be
 * concerned with assigning or inspecting their contents.
 *
 * Clients of this layer refer to anon pages indirectly.
That is, they 82 * maintain arrays of pointers to anon structs rather than maintaining 83 * anon structs themselves. The (struct anon **) arguments mentioned 84 * above are pointers to entries in these arrays. It is these arrays 85 * that capture the mapping between offsets within a given segment and 86 * the corresponding anonymous backing storage address. 87 */ 88 89 #ifdef DEBUG 90 #define ANON_DEBUG 91 #endif 92 93 #include <sys/types.h> 94 #include <sys/t_lock.h> 95 #include <sys/param.h> 96 #include <sys/systm.h> 97 #include <sys/mman.h> 98 #include <sys/cred.h> 99 #include <sys/thread.h> 100 #include <sys/vnode.h> 101 #include <sys/cpuvar.h> 102 #include <sys/swap.h> 103 #include <sys/cmn_err.h> 104 #include <sys/vtrace.h> 105 #include <sys/kmem.h> 106 #include <sys/sysmacros.h> 107 #include <sys/bitmap.h> 108 #include <sys/vmsystm.h> 109 #include <sys/debug.h> 110 #include <sys/fs/swapnode.h> 111 #include <sys/tnf_probe.h> 112 #include <sys/lgrp.h> 113 #include <sys/policy.h> 114 #include <sys/condvar_impl.h> 115 #include <sys/mutex_impl.h> 116 #include <sys/rctl.h> 117 118 #include <vm/as.h> 119 #include <vm/hat.h> 120 #include <vm/anon.h> 121 #include <vm/page.h> 122 #include <vm/vpage.h> 123 #include <vm/seg.h> 124 #include <vm/rm.h> 125 126 #include <fs/fs_subr.h> 127 128 struct vnode *anon_vp; 129 130 int anon_debug; 131 132 kmutex_t anoninfo_lock; 133 struct k_anoninfo k_anoninfo; 134 ani_free_t ani_free_pool[ANI_MAX_POOL]; 135 pad_mutex_t anon_array_lock[ANON_LOCKSIZE]; 136 kcondvar_t anon_array_cv[ANON_LOCKSIZE]; 137 138 /* 139 * Global hash table for (vp, off) -> anon slot 140 */ 141 extern int swap_maxcontig; 142 size_t anon_hash_size; 143 struct anon **anon_hash; 144 145 static struct kmem_cache *anon_cache; 146 static struct kmem_cache *anonmap_cache; 147 148 #ifdef VM_STATS 149 static struct anonvmstats_str { 150 ulong_t getpages[30]; 151 ulong_t privatepages[10]; 152 ulong_t demotepages[9]; 153 ulong_t decrefpages[9]; 154 ulong_t 
dupfillholes[4]; 155 ulong_t freepages[1]; 156 } anonvmstats; 157 #endif /* VM_STATS */ 158 159 160 /*ARGSUSED*/ 161 static int 162 anonmap_cache_constructor(void *buf, void *cdrarg, int kmflags) 163 { 164 struct anon_map *amp = buf; 165 166 rw_init(&->a_rwlock, NULL, RW_DEFAULT, NULL); 167 return (0); 168 } 169 170 /*ARGSUSED1*/ 171 static void 172 anonmap_cache_destructor(void *buf, void *cdrarg) 173 { 174 struct anon_map *amp = buf; 175 176 rw_destroy(&->a_rwlock); 177 } 178 179 kmutex_t anonhash_lock[AH_LOCK_SIZE]; 180 kmutex_t anonpages_hash_lock[AH_LOCK_SIZE]; 181 182 void 183 anon_init(void) 184 { 185 int i; 186 187 anon_hash_size = 1L << highbit(physmem / ANON_HASHAVELEN); 188 189 for (i = 0; i < AH_LOCK_SIZE; i++) { 190 mutex_init(&anonhash_lock[i], NULL, MUTEX_DEFAULT, NULL); 191 mutex_init(&anonpages_hash_lock[i], NULL, MUTEX_DEFAULT, NULL); 192 } 193 194 for (i = 0; i < ANON_LOCKSIZE; i++) { 195 mutex_init(&anon_array_lock[i].pad_mutex, NULL, 196 MUTEX_DEFAULT, NULL); 197 cv_init(&anon_array_cv[i], NULL, CV_DEFAULT, NULL); 198 } 199 200 anon_hash = (struct anon **) 201 kmem_zalloc(sizeof (struct anon *) * anon_hash_size, KM_SLEEP); 202 anon_cache = kmem_cache_create("anon_cache", sizeof (struct anon), 203 AN_CACHE_ALIGN, NULL, NULL, NULL, NULL, NULL, 0); 204 anonmap_cache = kmem_cache_create("anonmap_cache", 205 sizeof (struct anon_map), 0, 206 anonmap_cache_constructor, anonmap_cache_destructor, NULL, 207 NULL, NULL, 0); 208 swap_maxcontig = (1024 * 1024) >> PAGESHIFT; /* 1MB of pages */ 209 210 anon_vp = vn_alloc(KM_SLEEP); 211 vn_setops(anon_vp, swap_vnodeops); 212 anon_vp->v_type = VREG; 213 anon_vp->v_flag |= (VISSWAP|VISSWAPFS); 214 } 215 216 /* 217 * Global anon slot hash table manipulation. 
218 */ 219 220 static void 221 anon_addhash(struct anon *ap) 222 { 223 int index; 224 225 ASSERT(MUTEX_HELD(&anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)])); 226 index = ANON_HASH(ap->an_vp, ap->an_off); 227 ap->an_hash = anon_hash[index]; 228 anon_hash[index] = ap; 229 } 230 231 static void 232 anon_rmhash(struct anon *ap) 233 { 234 struct anon **app; 235 236 ASSERT(MUTEX_HELD(&anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)])); 237 238 for (app = &anon_hash[ANON_HASH(ap->an_vp, ap->an_off)]; 239 *app; app = &((*app)->an_hash)) { 240 if (*app == ap) { 241 *app = ap->an_hash; 242 break; 243 } 244 } 245 } 246 247 /* 248 * The anon array interfaces. Functions allocating, 249 * freeing array of pointers, and returning/setting 250 * entries in the array of pointers for a given offset. 251 * 252 * Create the list of pointers 253 */ 254 struct anon_hdr * 255 anon_create(pgcnt_t npages, int flags) 256 { 257 struct anon_hdr *ahp; 258 ulong_t nchunks; 259 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 260 261 if ((ahp = kmem_zalloc(sizeof (struct anon_hdr), kmemflags)) == NULL) { 262 return (NULL); 263 } 264 265 mutex_init(&ahp->serial_lock, NULL, MUTEX_DEFAULT, NULL); 266 /* 267 * Single level case. 268 */ 269 ahp->size = npages; 270 if (npages <= ANON_CHUNK_SIZE || (flags & ANON_ALLOC_FORCE)) { 271 272 if (flags & ANON_ALLOC_FORCE) 273 ahp->flags |= ANON_ALLOC_FORCE; 274 275 ahp->array_chunk = kmem_zalloc( 276 ahp->size * sizeof (struct anon *), kmemflags); 277 278 if (ahp->array_chunk == NULL) { 279 kmem_free(ahp, sizeof (struct anon_hdr)); 280 return (NULL); 281 } 282 } else { 283 /* 284 * 2 Level case. 285 * anon hdr size needs to be rounded off to be a multiple 286 * of ANON_CHUNK_SIZE. This is important as various anon 287 * related functions depend on this. 288 * NOTE - 289 * anon_grow() makes anon hdr size a multiple of 290 * ANON_CHUNK_SIZE. 291 * amp size is <= anon hdr size. 292 * anon_index + seg_pgs <= anon hdr size. 
293 */ 294 ahp->size = P2ROUNDUP(npages, ANON_CHUNK_SIZE); 295 nchunks = ahp->size >> ANON_CHUNK_SHIFT; 296 297 ahp->array_chunk = kmem_zalloc(nchunks * sizeof (ulong_t *), 298 kmemflags); 299 300 if (ahp->array_chunk == NULL) { 301 kmem_free(ahp, sizeof (struct anon_hdr)); 302 return (NULL); 303 } 304 } 305 return (ahp); 306 } 307 308 /* 309 * Free the array of pointers 310 */ 311 void 312 anon_release(struct anon_hdr *ahp, pgcnt_t npages) 313 { 314 ulong_t i; 315 void **ppp; 316 ulong_t nchunks; 317 318 ASSERT(npages <= ahp->size); 319 320 /* 321 * Single level case. 322 */ 323 if (npages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 324 kmem_free(ahp->array_chunk, ahp->size * sizeof (struct anon *)); 325 } else { 326 /* 327 * 2 level case. 328 */ 329 nchunks = ahp->size >> ANON_CHUNK_SHIFT; 330 for (i = 0; i < nchunks; i++) { 331 ppp = &ahp->array_chunk[i]; 332 if (*ppp != NULL) 333 kmem_free(*ppp, PAGESIZE); 334 } 335 kmem_free(ahp->array_chunk, nchunks * sizeof (ulong_t *)); 336 } 337 mutex_destroy(&ahp->serial_lock); 338 kmem_free(ahp, sizeof (struct anon_hdr)); 339 } 340 341 /* 342 * Return the pointer from the list for a 343 * specified anon index. 344 */ 345 struct anon * 346 anon_get_ptr(struct anon_hdr *ahp, ulong_t an_idx) 347 { 348 struct anon **app; 349 350 ASSERT(an_idx < ahp->size); 351 352 /* 353 * Single level case. 354 */ 355 if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 356 return ((struct anon *) 357 ((uintptr_t)ahp->array_chunk[an_idx] & ANON_PTRMASK)); 358 } else { 359 360 /* 361 * 2 level case. 362 */ 363 app = ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 364 if (app) { 365 return ((struct anon *) 366 ((uintptr_t)app[an_idx & ANON_CHUNK_OFF] & 367 ANON_PTRMASK)); 368 } else { 369 return (NULL); 370 } 371 } 372 } 373 374 /* 375 * Return the anon pointer for the first valid entry in the anon list, 376 * starting from the given index. 
377 */ 378 struct anon * 379 anon_get_next_ptr(struct anon_hdr *ahp, ulong_t *index) 380 { 381 struct anon *ap; 382 struct anon **app; 383 ulong_t chunkoff; 384 ulong_t i; 385 ulong_t j; 386 pgcnt_t size; 387 388 i = *index; 389 size = ahp->size; 390 391 ASSERT(i < size); 392 393 if ((size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 394 /* 395 * 1 level case 396 */ 397 while (i < size) { 398 ap = (struct anon *) 399 ((uintptr_t)ahp->array_chunk[i] & ANON_PTRMASK); 400 if (ap) { 401 *index = i; 402 return (ap); 403 } 404 i++; 405 } 406 } else { 407 /* 408 * 2 level case 409 */ 410 chunkoff = i & ANON_CHUNK_OFF; 411 while (i < size) { 412 app = ahp->array_chunk[i >> ANON_CHUNK_SHIFT]; 413 if (app) 414 for (j = chunkoff; j < ANON_CHUNK_SIZE; j++) { 415 ap = (struct anon *) 416 ((uintptr_t)app[j] & ANON_PTRMASK); 417 if (ap) { 418 *index = i + (j - chunkoff); 419 return (ap); 420 } 421 } 422 chunkoff = 0; 423 i = (i + ANON_CHUNK_SIZE) & ~ANON_CHUNK_OFF; 424 } 425 } 426 *index = size; 427 return (NULL); 428 } 429 430 /* 431 * Set list entry with a given pointer for a specified offset 432 */ 433 int 434 anon_set_ptr(struct anon_hdr *ahp, ulong_t an_idx, struct anon *ap, int flags) 435 { 436 void **ppp; 437 struct anon **app; 438 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 439 uintptr_t *ap_addr; 440 441 ASSERT(an_idx < ahp->size); 442 443 /* 444 * Single level case. 445 */ 446 if (ahp->size <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 447 ap_addr = (uintptr_t *)&ahp->array_chunk[an_idx]; 448 } else { 449 450 /* 451 * 2 level case. 
452 */ 453 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 454 455 ASSERT(ppp != NULL); 456 if (*ppp == NULL) { 457 mutex_enter(&ahp->serial_lock); 458 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 459 if (*ppp == NULL) { 460 *ppp = kmem_zalloc(PAGESIZE, kmemflags); 461 if (*ppp == NULL) { 462 mutex_exit(&ahp->serial_lock); 463 return (ENOMEM); 464 } 465 } 466 mutex_exit(&ahp->serial_lock); 467 } 468 app = *ppp; 469 ap_addr = (uintptr_t *)&app[an_idx & ANON_CHUNK_OFF]; 470 } 471 *ap_addr = (*ap_addr & ~ANON_PTRMASK) | (uintptr_t)ap; 472 return (0); 473 } 474 475 /* 476 * Copy anon array into a given new anon array 477 */ 478 int 479 anon_copy_ptr(struct anon_hdr *sahp, ulong_t s_idx, 480 struct anon_hdr *dahp, ulong_t d_idx, 481 pgcnt_t npages, int flags) 482 { 483 void **sapp, **dapp; 484 void *ap; 485 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 486 487 ASSERT((s_idx < sahp->size) && (d_idx < dahp->size)); 488 ASSERT((npages <= sahp->size) && (npages <= dahp->size)); 489 490 /* 491 * Both arrays are 1 level. 492 */ 493 if (((sahp->size <= ANON_CHUNK_SIZE) && 494 (dahp->size <= ANON_CHUNK_SIZE)) || 495 ((sahp->flags & ANON_ALLOC_FORCE) && 496 (dahp->flags & ANON_ALLOC_FORCE))) { 497 498 bcopy(&sahp->array_chunk[s_idx], &dahp->array_chunk[d_idx], 499 npages * sizeof (struct anon *)); 500 return (0); 501 } 502 503 /* 504 * Both arrays are 2 levels. 
505 */ 506 if (sahp->size > ANON_CHUNK_SIZE && 507 dahp->size > ANON_CHUNK_SIZE && 508 ((sahp->flags & ANON_ALLOC_FORCE) == 0) && 509 ((dahp->flags & ANON_ALLOC_FORCE) == 0)) { 510 511 ulong_t sapidx, dapidx; 512 ulong_t *sap, *dap; 513 ulong_t chknp; 514 515 while (npages != 0) { 516 517 sapidx = s_idx & ANON_CHUNK_OFF; 518 dapidx = d_idx & ANON_CHUNK_OFF; 519 chknp = ANON_CHUNK_SIZE - MAX(sapidx, dapidx); 520 if (chknp > npages) 521 chknp = npages; 522 523 sapp = &sahp->array_chunk[s_idx >> ANON_CHUNK_SHIFT]; 524 if ((sap = *sapp) != NULL) { 525 dapp = &dahp->array_chunk[d_idx 526 >> ANON_CHUNK_SHIFT]; 527 if ((dap = *dapp) == NULL) { 528 *dapp = kmem_zalloc(PAGESIZE, 529 kmemflags); 530 if ((dap = *dapp) == NULL) 531 return (ENOMEM); 532 } 533 bcopy((sap + sapidx), (dap + dapidx), 534 chknp << ANON_PTRSHIFT); 535 } 536 s_idx += chknp; 537 d_idx += chknp; 538 npages -= chknp; 539 } 540 return (0); 541 } 542 543 /* 544 * At least one of the arrays is 2 level. 545 */ 546 while (npages--) { 547 if ((ap = anon_get_ptr(sahp, s_idx)) != NULL) { 548 ASSERT(!ANON_ISBUSY(anon_get_slot(sahp, s_idx))); 549 if (anon_set_ptr(dahp, d_idx, ap, flags) == ENOMEM) 550 return (ENOMEM); 551 } 552 s_idx++; 553 d_idx++; 554 } 555 return (0); 556 } 557 558 559 /* 560 * ANON_INITBUF is a convenience macro for anon_grow() below. It 561 * takes a buffer dst, which is at least as large as buffer src. It 562 * does a bcopy from src into dst, and then bzeros the extra bytes 563 * of dst. If tail is set, the data in src is tail aligned within 564 * dst instead of head aligned. 
565 */ 566 567 #define ANON_INITBUF(src, srclen, dst, dstsize, tail) \ 568 if (tail) { \ 569 bzero((dst), (dstsize) - (srclen)); \ 570 bcopy((src), (char *)(dst) + (dstsize) - (srclen), (srclen)); \ 571 } else { \ 572 bcopy((src), (dst), (srclen)); \ 573 bzero((char *)(dst) + (srclen), (dstsize) - (srclen)); \ 574 } 575 576 #define ANON_1_LEVEL_INC (ANON_CHUNK_SIZE / 8) 577 #define ANON_2_LEVEL_INC (ANON_1_LEVEL_INC * ANON_CHUNK_SIZE) 578 579 /* 580 * anon_grow() is used to efficiently extend an existing anon array. 581 * startidx_p points to the index into the anon array of the first page 582 * that is in use. oldseg_pgs is the number of pages in use, starting at 583 * *startidx_p. newpages is the number of additional pages desired. 584 * 585 * If startidx_p == NULL, startidx is taken to be 0 and cannot be changed. 586 * 587 * The growth is done by creating a new top level of the anon array, 588 * and (if the array is 2-level) reusing the existing second level arrays. 589 * 590 * flags can be used to specify ANON_NOSLEEP and ANON_GROWDOWN. 591 * 592 * Returns the new number of pages in the anon array. 593 */ 594 pgcnt_t 595 anon_grow(struct anon_hdr *ahp, ulong_t *startidx_p, pgcnt_t oldseg_pgs, 596 pgcnt_t newseg_pgs, int flags) 597 { 598 ulong_t startidx = startidx_p ? *startidx_p : 0; 599 pgcnt_t oldamp_pgs = ahp->size, newamp_pgs; 600 pgcnt_t oelems, nelems, totpages; 601 void **level1; 602 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 603 int growdown = (flags & ANON_GROWDOWN); 604 size_t newarrsz, oldarrsz; 605 void *level2; 606 607 ASSERT(!(startidx_p == NULL && growdown)); 608 ASSERT(startidx + oldseg_pgs <= ahp->size); 609 610 /* 611 * Determine the total number of pages needed in the new 612 * anon array. If growing down, totpages is all pages from 613 * startidx through the end of the array, plus <newseg_pgs> 614 * pages. If growing up, keep all pages from page 0 through 615 * the last page currently in use, plus <newseg_pgs> pages. 
616 */ 617 if (growdown) 618 totpages = oldamp_pgs - startidx + newseg_pgs; 619 else 620 totpages = startidx + oldseg_pgs + newseg_pgs; 621 622 /* If the array is already large enough, just return. */ 623 624 if (oldamp_pgs >= totpages) { 625 if (growdown) 626 *startidx_p = oldamp_pgs - totpages; 627 return (oldamp_pgs); 628 } 629 630 /* 631 * oldamp_pgs/newamp_pgs are the total numbers of pages represented 632 * by the corresponding arrays. 633 * oelems/nelems are the number of pointers in the top level arrays 634 * which may be either level 1 or level 2. 635 * Will the new anon array be one level or two levels? 636 */ 637 if (totpages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 638 newamp_pgs = P2ROUNDUP(totpages, ANON_1_LEVEL_INC); 639 oelems = oldamp_pgs; 640 nelems = newamp_pgs; 641 } else { 642 newamp_pgs = P2ROUNDUP(totpages, ANON_2_LEVEL_INC); 643 oelems = (oldamp_pgs + ANON_CHUNK_OFF) >> ANON_CHUNK_SHIFT; 644 nelems = newamp_pgs >> ANON_CHUNK_SHIFT; 645 } 646 647 newarrsz = nelems * sizeof (void *); 648 level1 = kmem_alloc(newarrsz, kmemflags); 649 if (level1 == NULL) 650 return (0); 651 652 /* Are we converting from a one level to a two level anon array? */ 653 654 if (newamp_pgs > ANON_CHUNK_SIZE && oldamp_pgs <= ANON_CHUNK_SIZE && 655 !(ahp->flags & ANON_ALLOC_FORCE)) { 656 657 /* 658 * Yes, we're converting to a two level. Reuse old level 1 659 * as new level 2 if it is exactly PAGESIZE. Otherwise 660 * alloc a new level 2 and copy the old level 1 data into it. 
661 */ 662 if (oldamp_pgs == ANON_CHUNK_SIZE) { 663 level2 = (void *)ahp->array_chunk; 664 } else { 665 level2 = kmem_alloc(PAGESIZE, kmemflags); 666 if (level2 == NULL) { 667 kmem_free(level1, newarrsz); 668 return (0); 669 } 670 oldarrsz = oldamp_pgs * sizeof (void *); 671 672 ANON_INITBUF(ahp->array_chunk, oldarrsz, 673 level2, PAGESIZE, growdown); 674 kmem_free(ahp->array_chunk, oldarrsz); 675 } 676 bzero(level1, newarrsz); 677 if (growdown) 678 level1[nelems - 1] = level2; 679 else 680 level1[0] = level2; 681 } else { 682 oldarrsz = oelems * sizeof (void *); 683 684 ANON_INITBUF(ahp->array_chunk, oldarrsz, 685 level1, newarrsz, growdown); 686 kmem_free(ahp->array_chunk, oldarrsz); 687 } 688 689 ahp->array_chunk = level1; 690 ahp->size = newamp_pgs; 691 if (growdown) 692 *startidx_p = newamp_pgs - totpages; 693 694 return (newamp_pgs); 695 } 696 697 698 /* 699 * Called from clock handler to sync ani_free value. 700 */ 701 702 void 703 set_anoninfo(void) 704 { 705 int ix; 706 pgcnt_t total = 0; 707 708 for (ix = 0; ix < ANI_MAX_POOL; ix++) { 709 total += ani_free_pool[ix].ani_count; 710 } 711 k_anoninfo.ani_free = total; 712 } 713 714 /* 715 * Reserve anon space. 716 * 717 * It's no longer simply a matter of incrementing ani_resv to 718 * reserve swap space, we need to check memory-based as well 719 * as disk-backed (physical) swap. The following algorithm 720 * is used: 721 * Check the space on physical swap 722 * i.e. amount needed < ani_max - ani_phys_resv 723 * If we are swapping on swapfs check 724 * amount needed < (availrmem - swapfs_minfree) 725 * Since the algorithm to check for the quantity of swap space is 726 * almost the same as that for reserving it, we'll just use anon_resvmem 727 * with a flag to decrement availrmem. 728 * 729 * Return non-zero on success. 
730 */ 731 int 732 anon_resvmem(size_t size, boolean_t takemem, zone_t *zone, int tryhard) 733 { 734 pgcnt_t npages = btopr(size); 735 pgcnt_t mswap_pages = 0; 736 pgcnt_t pswap_pages = 0; 737 proc_t *p = curproc; 738 739 if (zone != NULL && takemem) { 740 /* test zone.max-swap resource control */ 741 mutex_enter(&p->p_lock); 742 if (rctl_incr_swap(p, zone, ptob(npages)) != 0) { 743 mutex_exit(&p->p_lock); 744 return (0); 745 } 746 mutex_exit(&p->p_lock); 747 } 748 mutex_enter(&anoninfo_lock); 749 750 /* 751 * pswap_pages is the number of pages we can take from 752 * physical (i.e. disk-backed) swap. 753 */ 754 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 755 pswap_pages = k_anoninfo.ani_max - k_anoninfo.ani_phys_resv; 756 757 ANON_PRINT(A_RESV, 758 ("anon_resvmem: npages %lu takemem %u pswap %lu caller %p\n", 759 npages, takemem, pswap_pages, (void *)caller())); 760 761 if (npages <= pswap_pages) { 762 /* 763 * we have enough space on a physical swap 764 */ 765 if (takemem) 766 k_anoninfo.ani_phys_resv += npages; 767 mutex_exit(&anoninfo_lock); 768 return (1); 769 } else if (pswap_pages != 0) { 770 /* 771 * we have some space on a physical swap 772 */ 773 if (takemem) { 774 /* 775 * use up remainder of phys swap 776 */ 777 k_anoninfo.ani_phys_resv += pswap_pages; 778 ASSERT(k_anoninfo.ani_phys_resv == k_anoninfo.ani_max); 779 } 780 } 781 /* 782 * since (npages > pswap_pages) we need mem swap 783 * mswap_pages is the number of pages needed from availrmem 784 */ 785 ASSERT(npages > pswap_pages); 786 mswap_pages = npages - pswap_pages; 787 788 ANON_PRINT(A_RESV, ("anon_resvmem: need %ld pages from memory\n", 789 mswap_pages)); 790 791 /* 792 * priv processes can reserve memory as swap as long as availrmem 793 * remains greater than swapfs_minfree; in the case of non-priv 794 * processes, memory can be reserved as swap only if availrmem 795 * doesn't fall below (swapfs_minfree + swapfs_reserve). 
Thus, 796 * swapfs_reserve amount of memswap is not available to non-priv 797 * processes. This protects daemons such as automounter dying 798 * as a result of application processes eating away almost entire 799 * membased swap. This safeguard becomes useless if apps are run 800 * with root access. 801 * 802 * swapfs_reserve is minimum of 4Mb or 1/16 of physmem. 803 * 804 */ 805 if (tryhard) { 806 mutex_exit(&anoninfo_lock); 807 (void) page_reclaim_mem(mswap_pages, 808 swapfs_minfree + swapfs_reserve, 0); 809 mutex_enter(&anoninfo_lock); 810 } 811 812 mutex_enter(&freemem_lock); 813 if (availrmem > (swapfs_minfree + swapfs_reserve + mswap_pages) || 814 (availrmem > (swapfs_minfree + mswap_pages) && 815 secpolicy_resource(CRED()) == 0)) { 816 817 if (takemem) { 818 /* 819 * Take the memory from the rest of the system. 820 */ 821 availrmem -= mswap_pages; 822 mutex_exit(&freemem_lock); 823 k_anoninfo.ani_mem_resv += mswap_pages; 824 ANI_ADD(mswap_pages); 825 ANON_PRINT((A_RESV | A_MRESV), 826 ("anon_resvmem: took %ld pages of availrmem\n", 827 mswap_pages)); 828 } else { 829 mutex_exit(&freemem_lock); 830 } 831 832 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 833 mutex_exit(&anoninfo_lock); 834 return (1); 835 836 } else { 837 /* 838 * Fail if not enough memory 839 */ 840 841 if (takemem) { 842 k_anoninfo.ani_phys_resv -= pswap_pages; 843 } 844 845 mutex_exit(&freemem_lock); 846 mutex_exit(&anoninfo_lock); 847 ANON_PRINT(A_RESV, 848 ("anon_resvmem: not enough space from swapfs\n")); 849 if (zone != NULL && takemem) 850 rctl_decr_swap(zone, ptob(npages)); 851 return (0); 852 } 853 } 854 855 /* 856 * Give back an anon reservation. 
857 */ 858 void 859 anon_unresvmem(size_t size, zone_t *zone) 860 { 861 pgcnt_t npages = btopr(size); 862 spgcnt_t mem_free_pages = 0; 863 pgcnt_t phys_free_slots; 864 #ifdef ANON_DEBUG 865 pgcnt_t mem_resv; 866 #endif 867 if (zone != NULL) 868 rctl_decr_swap(zone, ptob(npages)); 869 870 mutex_enter(&anoninfo_lock); 871 872 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 873 /* 874 * If some of this reservation belonged to swapfs 875 * give it back to availrmem. 876 * ani_mem_resv is the amount of availrmem swapfs has reserved. 877 * but some of that memory could be locked by segspt so we can only 878 * return non locked ani_mem_resv back to availrmem 879 */ 880 if (k_anoninfo.ani_mem_resv > k_anoninfo.ani_locked_swap) { 881 ANON_PRINT((A_RESV | A_MRESV), 882 ("anon_unresv: growing availrmem by %ld pages\n", 883 MIN(k_anoninfo.ani_mem_resv, npages))); 884 885 mem_free_pages = MIN((spgcnt_t)(k_anoninfo.ani_mem_resv - 886 k_anoninfo.ani_locked_swap), npages); 887 mutex_enter(&freemem_lock); 888 availrmem += mem_free_pages; 889 mutex_exit(&freemem_lock); 890 k_anoninfo.ani_mem_resv -= mem_free_pages; 891 892 ANI_ADD(-mem_free_pages); 893 } 894 /* 895 * The remainder of the pages is returned to phys swap 896 */ 897 ASSERT(npages >= mem_free_pages); 898 phys_free_slots = npages - mem_free_pages; 899 900 if (phys_free_slots) { 901 k_anoninfo.ani_phys_resv -= phys_free_slots; 902 } 903 904 #ifdef ANON_DEBUG 905 mem_resv = k_anoninfo.ani_mem_resv; 906 #endif 907 908 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 909 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 910 911 mutex_exit(&anoninfo_lock); 912 913 ANON_PRINT(A_RESV, ("anon_unresv: %lu, tot %lu, caller %p\n", 914 npages, mem_resv, (void *)caller())); 915 } 916 917 /* 918 * Allocate an anon slot and return it with the lock held. 
919 */ 920 struct anon * 921 anon_alloc(struct vnode *vp, anoff_t off) 922 { 923 struct anon *ap; 924 kmutex_t *ahm; 925 926 ap = kmem_cache_alloc(anon_cache, KM_SLEEP); 927 if (vp == NULL) { 928 swap_alloc(ap); 929 } else { 930 ap->an_vp = vp; 931 ap->an_off = off; 932 } 933 ap->an_refcnt = 1; 934 ap->an_pvp = NULL; 935 ap->an_poff = 0; 936 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 937 mutex_enter(ahm); 938 anon_addhash(ap); 939 mutex_exit(ahm); 940 ANI_ADD(-1); 941 ANON_PRINT(A_ANON, ("anon_alloc: returning ap %p, vp %p\n", 942 (void *)ap, (ap ? (void *)ap->an_vp : NULL))); 943 return (ap); 944 } 945 946 /* 947 * Decrement the reference count of an anon page. 948 * If reference count goes to zero, free it and 949 * its associated page (if any). 950 */ 951 void 952 anon_decref(struct anon *ap) 953 { 954 page_t *pp; 955 struct vnode *vp; 956 anoff_t off; 957 kmutex_t *ahm; 958 959 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 960 mutex_enter(ahm); 961 ASSERT(ap->an_refcnt != 0); 962 if (ap->an_refcnt == 0) 963 panic("anon_decref: slot count 0"); 964 if (--ap->an_refcnt == 0) { 965 swap_xlate(ap, &vp, &off); 966 mutex_exit(ahm); 967 968 /* 969 * If there is a page for this anon slot we will need to 970 * call VN_DISPOSE to get rid of the vp association and 971 * put the page back on the free list as really free. 972 * Acquire the "exclusive" lock to ensure that any 973 * pending i/o always completes before the swap slot 974 * is freed. 975 */ 976 pp = page_lookup(vp, (u_offset_t)off, SE_EXCL); 977 978 /* 979 * If there was a page, we've synchronized on it (getting 980 * the exclusive lock is as good as gettting the iolock) 981 * so now we can free the physical backing store. Also, this 982 * is where we would free the name of the anonymous page 983 * (swap_free(ap)), a no-op in the current implementation. 
984 */ 985 mutex_enter(ahm); 986 ASSERT(ap->an_refcnt == 0); 987 anon_rmhash(ap); 988 if (ap->an_pvp) 989 swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE); 990 mutex_exit(ahm); 991 992 if (pp != NULL) { 993 /*LINTED: constant in conditional context */ 994 VN_DISPOSE(pp, B_INVAL, 0, kcred); 995 } 996 ANON_PRINT(A_ANON, ("anon_decref: free ap %p, vp %p\n", 997 (void *)ap, (void *)ap->an_vp)); 998 kmem_cache_free(anon_cache, ap); 999 1000 ANI_ADD(1); 1001 } else { 1002 mutex_exit(ahm); 1003 } 1004 } 1005 1006 1007 /* 1008 * check an_refcnt of the root anon slot (anon_index argument is aligned at 1009 * seg->s_szc level) to determine whether COW processing is required. 1010 * anonpages_hash_lock[] held on the root ap ensures that if root's 1011 * refcnt is 1 all other refcnt's are 1 as well (and they can't increase 1012 * later since this process can't fork while its AS lock is held). 1013 * 1014 * returns 1 if the root anon slot has a refcnt > 1 otherwise returns 0. 1015 */ 1016 int 1017 anon_szcshare(struct anon_hdr *ahp, ulong_t anon_index) 1018 { 1019 struct anon *ap; 1020 kmutex_t *ahmpages = NULL; 1021 1022 ap = anon_get_ptr(ahp, anon_index); 1023 if (ap == NULL) 1024 return (0); 1025 1026 ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1027 mutex_enter(ahmpages); 1028 ASSERT(ap->an_refcnt >= 1); 1029 if (ap->an_refcnt == 1) { 1030 mutex_exit(ahmpages); 1031 return (0); 1032 } 1033 mutex_exit(ahmpages); 1034 return (1); 1035 } 1036 /* 1037 * Check 'nslots' anon slots for refcnt > 1. 1038 * 1039 * returns 1 if any of the 'nslots' anon slots has a refcnt > 1 otherwise 1040 * returns 0. 
*/
static int
anon_share(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots)
{
	struct anon *ap;

	/*
	 * Empty slots are skipped; an_refcnt is read here without taking
	 * any hash lock.
	 */
	while (nslots-- > 0) {
		if ((ap = anon_get_ptr(ahp, anon_index)) != NULL &&
		    ap->an_refcnt > 1)
			return (1);
		anon_index++;
	}

	return (0);
}

/*
 * Drop one reference from every anon slot of the large page region that
 * starts at an_idx (size given by szc, clipped to the end of the anon
 * array) and clear the corresponding array entries.
 *
 * Two cases, selected by the refcnt of the first slot:
 *  - refcnt == 1: the slots are going away.  Each slot is removed from the
 *    anon hash, its physical swap slot (if any) is released, its page(s)
 *    are disposed/destroyed and the anon struct is freed.
 *  - refcnt > 1: the region is still shared.  The first slot's
 *    anonpages_hash_lock is held across the whole loop and each slot's
 *    refcnt is simply decremented.
 */
static void
anon_decref_pages(
	struct anon_hdr *ahp,
	ulong_t an_idx,
	uint_t szc)
{
	struct anon *ap = anon_get_ptr(ahp, an_idx);
	kmutex_t *ahmpages = NULL;
	page_t *pp;
	pgcnt_t pgcnt = page_get_pagecnt(szc);
	pgcnt_t i;
	struct vnode *vp;
	anoff_t off;
	kmutex_t *ahm;
#ifdef DEBUG
	/* only referenced from ASSERTs */
	int refcnt = 1;
#endif

	ASSERT(szc != 0);
	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
	ASSERT(IS_P2ALIGNED(an_idx, pgcnt));
	ASSERT(an_idx < ahp->size);

	if (ahp->size - an_idx < pgcnt) {
		/*
		 * In case of shared mappings total anon map size may not be
		 * the largest page size aligned.
		 */
		pgcnt = ahp->size - an_idx;
	}

	VM_STAT_ADD(anonvmstats.decrefpages[0]);

	if (ap != NULL) {
		ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, ap->an_off)];
		mutex_enter(ahmpages);
		ASSERT((refcnt = ap->an_refcnt) != 0);
		VM_STAT_ADD(anonvmstats.decrefpages[1]);
		if (ap->an_refcnt == 1) {
			/*
			 * Not shared: the region lock is not needed for the
			 * teardown below; ahmpages == NULL marks this case.
			 */
			VM_STAT_ADD(anonvmstats.decrefpages[2]);
			ASSERT(!anon_share(ahp, an_idx, pgcnt));
			mutex_exit(ahmpages);
			ahmpages = NULL;
		}
	}

	i = 0;
	while (i < pgcnt) {
		if ((ap = anon_get_ptr(ahp, an_idx + i)) == NULL) {
			ASSERT(refcnt == 1 && ahmpages == NULL);
			i++;
			continue;
		}
		ASSERT(ap->an_refcnt == refcnt);
		ASSERT(ahmpages != NULL || ap->an_refcnt == 1);
		ASSERT(ahmpages == NULL || ap->an_refcnt > 1);

		if (ahmpages == NULL) {
			/* last reference: free the slot and its page(s) */
			swap_xlate(ap, &vp, &off);
			pp = page_lookup(vp, (u_offset_t)off, SE_EXCL);
			if (pp == NULL || pp->p_szc == 0) {
				/* no page, or a PAGESIZE page: one slot */
				VM_STAT_ADD(anonvmstats.decrefpages[3]);
				ahm = &anonhash_lock[AH_LOCK(ap->an_vp,
				    ap->an_off)];
				(void) anon_set_ptr(ahp, an_idx + i, NULL,
				    ANON_SLEEP);
				mutex_enter(ahm);
				ap->an_refcnt--;
				ASSERT(ap->an_refcnt == 0);
				anon_rmhash(ap);
				if (ap->an_pvp)
					swap_phys_free(ap->an_pvp, ap->an_poff,
					    PAGESIZE);
				mutex_exit(ahm);
				if (pp != NULL) {
					VM_STAT_ADD(anonvmstats.decrefpages[4]);
					/*LINTED*/
					VN_DISPOSE(pp, B_INVAL, 0, kcred);
				}
				kmem_cache_free(anon_cache, ap);
				ANI_ADD(1);
				i++;
			} else {
				/*
				 * Large page: gather and exclusively lock all
				 * curpgcnt constituents into ppa[], then get
				 * rid of them as a unit.
				 */
				pgcnt_t j;
				pgcnt_t curpgcnt =
				    page_get_pagecnt(pp->p_szc);
				size_t ppasize = curpgcnt * sizeof (page_t *);
				page_t **ppa = kmem_alloc(ppasize, KM_SLEEP);
				int dispose = 0;

				VM_STAT_ADD(anonvmstats.decrefpages[5]);

				ASSERT(pp->p_szc <= szc);
				ASSERT(IS_P2ALIGNED(curpgcnt, curpgcnt));
				ASSERT(IS_P2ALIGNED(i, curpgcnt));
				ASSERT(i + curpgcnt <= pgcnt);
				ASSERT(!(page_pptonum(pp) & (curpgcnt - 1)));
				ppa[0] = pp;
				/*
				 * NOTE(review): this loop starts at i + 1, so
				 * the first constituent's an_pvp is never
				 * tested with vn_matchopval() and the first
				 * page is not passed to hat_pageunload() here
				 * - confirm this is intended.
				 */
				for (j = i + 1; j < i + curpgcnt; j++) {
					ap = anon_get_ptr(ahp, an_idx + j);
					ASSERT(ap != NULL &&
					    ap->an_refcnt == 1);
					swap_xlate(ap, &vp, &off);
					pp = page_lookup(vp, (u_offset_t)off,
					    SE_EXCL);
					if (pp == NULL)
						panic("anon_decref_pages: "
						    "no page");

					(void) hat_pageunload(pp,
					    HAT_FORCE_PGUNLOAD);
					ASSERT(pp->p_szc == ppa[0]->p_szc);
					ASSERT(page_pptonum(pp) - 1 ==
					    page_pptonum(ppa[j - i - 1]));
					ppa[j - i] = pp;
					/*
					 * A backing vnode whose dispose op is
					 * not fs_dispose forces the per-page
					 * VN_DISPOSE path below.
					 */
					if (ap->an_pvp != NULL &&
					    !vn_matchopval(ap->an_pvp,
					    VOPNAME_DISPOSE,
					    (fs_generic_func_p)fs_dispose))
						dispose = 1;
				}
				if (!dispose) {
					VM_STAT_ADD(anonvmstats.decrefpages[6]);
					page_destroy_pages(ppa[0]);
				} else {
					VM_STAT_ADD(anonvmstats.decrefpages[7]);
					/*
					 * Mark every constituent as a
					 * PAGESIZE page first, then dispose
					 * of them one at a time.
					 */
					for (j = 0; j < curpgcnt; j++) {
						ASSERT(PAGE_EXCL(ppa[j]));
						ppa[j]->p_szc = 0;
					}
					for (j = 0; j < curpgcnt; j++) {
						ASSERT(!hat_page_is_mapped(
						    ppa[j]));
						/*LINTED*/
						VN_DISPOSE(ppa[j], B_INVAL, 0,
						    kcred);
					}
				}
				kmem_free(ppa, ppasize);
				/* now release the anon slots themselves */
				for (j = i; j < i + curpgcnt; j++) {
					ap = anon_get_ptr(ahp, an_idx + j);
					ASSERT(ap != NULL &&
					    ap->an_refcnt == 1);
					ahm = &anonhash_lock[AH_LOCK(ap->an_vp,
					    ap->an_off)];
					(void) anon_set_ptr(ahp, an_idx + j,
					    NULL, ANON_SLEEP);
					mutex_enter(ahm);
					ap->an_refcnt--;
					ASSERT(ap->an_refcnt == 0);
					anon_rmhash(ap);
					if (ap->an_pvp)
						swap_phys_free(ap->an_pvp,
						    ap->an_poff, PAGESIZE);
					mutex_exit(ahm);
					kmem_cache_free(anon_cache, ap);
					ANI_ADD(1);
				}
				i += curpgcnt;
			}
		} else {
			/*
			 * Still shared: just drop this array's reference.
			 * ahmpages stays held until the loop is done.
			 */
			VM_STAT_ADD(anonvmstats.decrefpages[8]);
			(void) anon_set_ptr(ahp, an_idx + i, NULL, ANON_SLEEP);
			ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)];
			mutex_enter(ahm);
			ap->an_refcnt--;
			mutex_exit(ahm);
			i++;
		}
	}

	if (ahmpages != NULL) {
		mutex_exit(ahmpages);
	}
}

/*
 * Duplicate references to size bytes worth of anon pages.
 * Used when duplicating a segment that contains private anon pages.
 * This code assumes that procedure calling this one has already used
 * hat_chgprot() to disable write access to the range of addresses
 * that *old actually refers to.
1239 */ 1240 void 1241 anon_dup(struct anon_hdr *old, ulong_t old_idx, struct anon_hdr *new, 1242 ulong_t new_idx, size_t size) 1243 { 1244 spgcnt_t npages; 1245 kmutex_t *ahm; 1246 struct anon *ap; 1247 ulong_t off; 1248 ulong_t index; 1249 1250 npages = btopr(size); 1251 while (npages > 0) { 1252 index = old_idx; 1253 if ((ap = anon_get_next_ptr(old, &index)) == NULL) 1254 break; 1255 1256 ASSERT(!ANON_ISBUSY(anon_get_slot(old, index))); 1257 off = index - old_idx; 1258 npages -= off; 1259 if (npages <= 0) 1260 break; 1261 1262 (void) anon_set_ptr(new, new_idx + off, ap, ANON_SLEEP); 1263 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1264 1265 mutex_enter(ahm); 1266 ap->an_refcnt++; 1267 mutex_exit(ahm); 1268 1269 off++; 1270 new_idx += off; 1271 old_idx += off; 1272 npages--; 1273 } 1274 } 1275 1276 /* 1277 * Just like anon_dup but also guarantees there are no holes (unallocated anon 1278 * slots) within any large page region. That means if a large page region is 1279 * empty in the old array it will skip it. If there are 1 or more valid slots 1280 * in the large page region of the old array it will make sure to fill in any 1281 * unallocated ones and also copy them to the new array. If noalloc is 1 large 1282 * page region should either have no valid anon slots or all slots should be 1283 * valid. 
1284 */ 1285 void 1286 anon_dup_fill_holes( 1287 struct anon_hdr *old, 1288 ulong_t old_idx, 1289 struct anon_hdr *new, 1290 ulong_t new_idx, 1291 size_t size, 1292 uint_t szc, 1293 int noalloc) 1294 { 1295 struct anon *ap; 1296 spgcnt_t npages; 1297 kmutex_t *ahm, *ahmpages = NULL; 1298 pgcnt_t pgcnt, i; 1299 ulong_t index, off; 1300 #ifdef DEBUG 1301 int refcnt; 1302 #endif 1303 1304 ASSERT(szc != 0); 1305 pgcnt = page_get_pagecnt(szc); 1306 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1307 npages = btopr(size); 1308 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1309 ASSERT(IS_P2ALIGNED(old_idx, pgcnt)); 1310 1311 VM_STAT_ADD(anonvmstats.dupfillholes[0]); 1312 1313 while (npages > 0) { 1314 index = old_idx; 1315 1316 /* 1317 * Find the next valid slot. 1318 */ 1319 if (anon_get_next_ptr(old, &index) == NULL) 1320 break; 1321 1322 ASSERT(!ANON_ISBUSY(anon_get_slot(old, index))); 1323 /* 1324 * Now backup index to the beginning of the 1325 * current large page region of the old array. 1326 */ 1327 index = P2ALIGN(index, pgcnt); 1328 off = index - old_idx; 1329 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1330 npages -= off; 1331 if (npages <= 0) 1332 break; 1333 1334 /* 1335 * Fill and copy a large page regions worth 1336 * of anon slots. 1337 */ 1338 for (i = 0; i < pgcnt; i++) { 1339 if ((ap = anon_get_ptr(old, index + i)) == NULL) { 1340 if (noalloc) { 1341 panic("anon_dup_fill_holes: " 1342 "empty anon slot\n"); 1343 } 1344 VM_STAT_ADD(anonvmstats.dupfillholes[1]); 1345 ap = anon_alloc(NULL, 0); 1346 (void) anon_set_ptr(old, index + i, ap, 1347 ANON_SLEEP); 1348 } else if (i == 0) { 1349 /* 1350 * make the increment of all refcnts of all 1351 * anon slots of a large page appear atomic by 1352 * getting an anonpages_hash_lock for the 1353 * first anon slot of a large page. 
1354 */ 1355 int hash = AH_LOCK(ap->an_vp, ap->an_off); 1356 1357 VM_STAT_ADD(anonvmstats.dupfillholes[2]); 1358 1359 ahmpages = &anonpages_hash_lock[hash]; 1360 mutex_enter(ahmpages); 1361 /*LINTED*/ 1362 ASSERT(refcnt = ap->an_refcnt); 1363 1364 VM_STAT_COND_ADD(ap->an_refcnt > 1, 1365 anonvmstats.dupfillholes[3]); 1366 } 1367 (void) anon_set_ptr(new, new_idx + off + i, ap, 1368 ANON_SLEEP); 1369 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1370 mutex_enter(ahm); 1371 ASSERT(ahmpages != NULL || ap->an_refcnt == 1); 1372 ASSERT(i == 0 || ahmpages == NULL || 1373 refcnt == ap->an_refcnt); 1374 ap->an_refcnt++; 1375 mutex_exit(ahm); 1376 } 1377 if (ahmpages != NULL) { 1378 mutex_exit(ahmpages); 1379 ahmpages = NULL; 1380 } 1381 off += pgcnt; 1382 new_idx += off; 1383 old_idx += off; 1384 npages -= pgcnt; 1385 } 1386 } 1387 1388 /* 1389 * Used when a segment with a vnode changes szc. similarly to 1390 * anon_dup_fill_holes() makes sure each large page region either has no anon 1391 * slots or all of them. but new slots are created by COWing the file 1392 * pages. on entrance no anon slots should be shared. 1393 */ 1394 int 1395 anon_fill_cow_holes( 1396 struct seg *seg, 1397 caddr_t addr, 1398 struct anon_hdr *ahp, 1399 ulong_t an_idx, 1400 struct vnode *vp, 1401 u_offset_t vp_off, 1402 size_t size, 1403 uint_t szc, 1404 uint_t prot, 1405 struct vpage vpage[], 1406 struct cred *cred) 1407 { 1408 struct anon *ap; 1409 spgcnt_t npages; 1410 pgcnt_t pgcnt, i; 1411 ulong_t index, off; 1412 int err = 0; 1413 int pageflags = 0; 1414 1415 ASSERT(szc != 0); 1416 pgcnt = page_get_pagecnt(szc); 1417 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1418 npages = btopr(size); 1419 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1420 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1421 1422 while (npages > 0) { 1423 index = an_idx; 1424 1425 /* 1426 * Find the next valid slot. 
1427 */ 1428 if (anon_get_next_ptr(ahp, &index) == NULL) { 1429 break; 1430 } 1431 1432 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1433 /* 1434 * Now backup index to the beginning of the 1435 * current large page region of the anon array. 1436 */ 1437 index = P2ALIGN(index, pgcnt); 1438 off = index - an_idx; 1439 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1440 npages -= off; 1441 if (npages <= 0) 1442 break; 1443 an_idx += off; 1444 vp_off += ptob(off); 1445 addr += ptob(off); 1446 if (vpage != NULL) { 1447 vpage += off; 1448 } 1449 1450 for (i = 0; i < pgcnt; i++, an_idx++, vp_off += PAGESIZE) { 1451 if ((ap = anon_get_ptr(ahp, an_idx)) == NULL) { 1452 page_t *pl[1 + 1]; 1453 page_t *pp; 1454 1455 err = VOP_GETPAGE(vp, vp_off, PAGESIZE, NULL, 1456 pl, PAGESIZE, seg, addr, S_READ, cred, 1457 NULL); 1458 if (err) { 1459 break; 1460 } 1461 if (vpage != NULL) { 1462 prot = VPP_PROT(vpage); 1463 pageflags = VPP_ISPPLOCK(vpage) ? 1464 LOCK_PAGE : 0; 1465 } 1466 pp = anon_private(&ap, seg, addr, prot, pl[0], 1467 pageflags, cred); 1468 if (pp == NULL) { 1469 err = ENOMEM; 1470 break; 1471 } 1472 (void) anon_set_ptr(ahp, an_idx, ap, 1473 ANON_SLEEP); 1474 page_unlock(pp); 1475 } 1476 ASSERT(ap->an_refcnt == 1); 1477 addr += PAGESIZE; 1478 if (vpage != NULL) { 1479 vpage++; 1480 } 1481 } 1482 npages -= pgcnt; 1483 } 1484 1485 return (err); 1486 } 1487 1488 /* 1489 * Free a group of "size" anon pages, size in bytes, 1490 * and clear out the pointers to the anon entries. 
1491 */ 1492 void 1493 anon_free(struct anon_hdr *ahp, ulong_t index, size_t size) 1494 { 1495 spgcnt_t npages; 1496 struct anon *ap; 1497 ulong_t old; 1498 1499 npages = btopr(size); 1500 1501 while (npages > 0) { 1502 old = index; 1503 if ((ap = anon_get_next_ptr(ahp, &index)) == NULL) 1504 break; 1505 1506 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1507 npages -= index - old; 1508 if (npages <= 0) 1509 break; 1510 1511 (void) anon_set_ptr(ahp, index, NULL, ANON_SLEEP); 1512 anon_decref(ap); 1513 /* 1514 * Bump index and decrement page count 1515 */ 1516 index++; 1517 npages--; 1518 } 1519 } 1520 1521 void 1522 anon_free_pages( 1523 struct anon_hdr *ahp, 1524 ulong_t an_idx, 1525 size_t size, 1526 uint_t szc) 1527 { 1528 spgcnt_t npages; 1529 pgcnt_t pgcnt; 1530 ulong_t index, off; 1531 1532 ASSERT(szc != 0); 1533 pgcnt = page_get_pagecnt(szc); 1534 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1535 npages = btopr(size); 1536 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1537 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1538 ASSERT(an_idx < ahp->size); 1539 1540 VM_STAT_ADD(anonvmstats.freepages[0]); 1541 1542 while (npages > 0) { 1543 index = an_idx; 1544 1545 /* 1546 * Find the next valid slot. 1547 */ 1548 if (anon_get_next_ptr(ahp, &index) == NULL) 1549 break; 1550 1551 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1552 /* 1553 * Now backup index to the beginning of the 1554 * current large page region of the old array. 
1555 */ 1556 index = P2ALIGN(index, pgcnt); 1557 off = index - an_idx; 1558 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1559 npages -= off; 1560 if (npages <= 0) 1561 break; 1562 1563 anon_decref_pages(ahp, index, szc); 1564 1565 off += pgcnt; 1566 an_idx += off; 1567 npages -= pgcnt; 1568 } 1569 } 1570 1571 /* 1572 * Make anonymous pages discardable 1573 */ 1574 void 1575 anon_disclaim(struct anon_map *amp, ulong_t index, size_t size) 1576 { 1577 spgcnt_t npages = btopr(size); 1578 struct anon *ap; 1579 struct vnode *vp; 1580 anoff_t off; 1581 page_t *pp, *root_pp; 1582 kmutex_t *ahm; 1583 pgcnt_t pgcnt; 1584 ulong_t old_idx, idx, i; 1585 struct anon_hdr *ahp = amp->ahp; 1586 anon_sync_obj_t cookie; 1587 1588 ASSERT(RW_READ_HELD(&->a_rwlock)); 1589 pgcnt = 1; 1590 for (; npages > 0; index = (pgcnt == 1) ? index + 1 : 1591 P2ROUNDUP(index + 1, pgcnt), npages -= pgcnt) { 1592 1593 /* 1594 * get anon pointer and index for the first valid entry 1595 * in the anon list, starting from "index" 1596 */ 1597 old_idx = index; 1598 if ((ap = anon_get_next_ptr(ahp, &index)) == NULL) 1599 break; 1600 1601 /* 1602 * decrement npages by number of NULL anon slots we skipped 1603 */ 1604 npages -= index - old_idx; 1605 if (npages <= 0) 1606 break; 1607 1608 anon_array_enter(amp, index, &cookie); 1609 ap = anon_get_ptr(ahp, index); 1610 ASSERT(ap != NULL); 1611 1612 /* 1613 * Get anonymous page and try to lock it SE_EXCL; 1614 * if we couldn't grab the lock we skip to next page. 1615 */ 1616 swap_xlate(ap, &vp, &off); 1617 pp = page_lookup_nowait(vp, (u_offset_t)off, SE_EXCL); 1618 if (pp == NULL) { 1619 segadvstat.MADV_FREE_miss.value.ul++; 1620 pgcnt = 1; 1621 anon_array_exit(&cookie); 1622 continue; 1623 } 1624 pgcnt = page_get_pagecnt(pp->p_szc); 1625 1626 /* 1627 * we cannot free a page which is permanently locked. 1628 * The page_struct_lock need not be acquired to examine 1629 * these fields since the page has an "exclusive" lock. 
1630 */ 1631 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1632 page_unlock(pp); 1633 segadvstat.MADV_FREE_miss.value.ul++; 1634 anon_array_exit(&cookie); 1635 continue; 1636 } 1637 1638 ahm = &anonhash_lock[AH_LOCK(vp, off)]; 1639 mutex_enter(ahm); 1640 ASSERT(ap->an_refcnt != 0); 1641 /* 1642 * skip this one if copy-on-write is not yet broken. 1643 */ 1644 if (ap->an_refcnt > 1) { 1645 mutex_exit(ahm); 1646 page_unlock(pp); 1647 segadvstat.MADV_FREE_miss.value.ul++; 1648 anon_array_exit(&cookie); 1649 continue; 1650 } 1651 1652 if (pp->p_szc == 0) { 1653 pgcnt = 1; 1654 1655 /* 1656 * free swap slot; 1657 */ 1658 if (ap->an_pvp) { 1659 swap_phys_free(ap->an_pvp, ap->an_poff, 1660 PAGESIZE); 1661 ap->an_pvp = NULL; 1662 ap->an_poff = 0; 1663 } 1664 mutex_exit(ahm); 1665 segadvstat.MADV_FREE_hit.value.ul++; 1666 1667 /* 1668 * while we are at it, unload all the translations 1669 * and attempt to free the page. 1670 */ 1671 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1672 /*LINTED: constant in conditional context */ 1673 VN_DISPOSE(pp, B_FREE, 0, kcred); 1674 anon_array_exit(&cookie); 1675 continue; 1676 } 1677 1678 pgcnt = page_get_pagecnt(pp->p_szc); 1679 if (!IS_P2ALIGNED(index, pgcnt) || npages < pgcnt) { 1680 if (!page_try_demote_pages(pp)) { 1681 mutex_exit(ahm); 1682 page_unlock(pp); 1683 segadvstat.MADV_FREE_miss.value.ul++; 1684 anon_array_exit(&cookie); 1685 continue; 1686 } else { 1687 pgcnt = 1; 1688 if (ap->an_pvp) { 1689 swap_phys_free(ap->an_pvp, 1690 ap->an_poff, PAGESIZE); 1691 ap->an_pvp = NULL; 1692 ap->an_poff = 0; 1693 } 1694 mutex_exit(ahm); 1695 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1696 /*LINTED*/ 1697 VN_DISPOSE(pp, B_FREE, 0, kcred); 1698 segadvstat.MADV_FREE_hit.value.ul++; 1699 anon_array_exit(&cookie); 1700 continue; 1701 } 1702 } 1703 mutex_exit(ahm); 1704 root_pp = pp; 1705 1706 /* 1707 * try to lock remaining pages 1708 */ 1709 for (idx = 1; idx < pgcnt; idx++) { 1710 pp++; 1711 if (!page_trylock(pp, SE_EXCL)) 1712 break; 
1713 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1714 page_unlock(pp); 1715 break; 1716 } 1717 } 1718 1719 if (idx == pgcnt) { 1720 for (i = 0; i < pgcnt; i++) { 1721 ap = anon_get_ptr(ahp, index + i); 1722 if (ap == NULL) 1723 break; 1724 swap_xlate(ap, &vp, &off); 1725 ahm = &anonhash_lock[AH_LOCK(vp, off)]; 1726 mutex_enter(ahm); 1727 ASSERT(ap->an_refcnt != 0); 1728 1729 /* 1730 * skip this one if copy-on-write 1731 * is not yet broken. 1732 */ 1733 if (ap->an_refcnt > 1) { 1734 mutex_exit(ahm); 1735 goto skiplp; 1736 } 1737 if (ap->an_pvp) { 1738 swap_phys_free(ap->an_pvp, 1739 ap->an_poff, PAGESIZE); 1740 ap->an_pvp = NULL; 1741 ap->an_poff = 0; 1742 } 1743 mutex_exit(ahm); 1744 } 1745 page_destroy_pages(root_pp); 1746 segadvstat.MADV_FREE_hit.value.ul += pgcnt; 1747 anon_array_exit(&cookie); 1748 continue; 1749 } 1750 skiplp: 1751 segadvstat.MADV_FREE_miss.value.ul += pgcnt; 1752 for (i = 0, pp = root_pp; i < idx; pp++, i++) 1753 page_unlock(pp); 1754 anon_array_exit(&cookie); 1755 } 1756 } 1757 1758 /* 1759 * Return the kept page(s) and protections back to the segment driver. 1760 */ 1761 int 1762 anon_getpage( 1763 struct anon **app, 1764 uint_t *protp, 1765 page_t *pl[], 1766 size_t plsz, 1767 struct seg *seg, 1768 caddr_t addr, 1769 enum seg_rw rw, 1770 struct cred *cred) 1771 { 1772 page_t *pp; 1773 struct anon *ap = *app; 1774 struct vnode *vp; 1775 anoff_t off; 1776 int err; 1777 kmutex_t *ahm; 1778 1779 swap_xlate(ap, &vp, &off); 1780 1781 /* 1782 * Lookup the page. If page is being paged in, 1783 * wait for it to finish as we must return a list of 1784 * pages since this routine acts like the VOP_GETPAGE 1785 * routine does. 
1786 */ 1787 if (pl != NULL && (pp = page_lookup(vp, (u_offset_t)off, SE_SHARED))) { 1788 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1789 mutex_enter(ahm); 1790 if (ap->an_refcnt == 1) 1791 *protp = PROT_ALL; 1792 else 1793 *protp = PROT_ALL & ~PROT_WRITE; 1794 mutex_exit(ahm); 1795 pl[0] = pp; 1796 pl[1] = NULL; 1797 return (0); 1798 } 1799 1800 /* 1801 * Simply treat it as a vnode fault on the anon vp. 1802 */ 1803 1804 TRACE_3(TR_FAC_VM, TR_ANON_GETPAGE, 1805 "anon_getpage:seg %x addr %x vp %x", 1806 seg, addr, vp); 1807 1808 err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, protp, pl, plsz, 1809 seg, addr, rw, cred, NULL); 1810 1811 if (err == 0 && pl != NULL) { 1812 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1813 mutex_enter(ahm); 1814 if (ap->an_refcnt != 1) 1815 *protp &= ~PROT_WRITE; /* make read-only */ 1816 mutex_exit(ahm); 1817 } 1818 return (err); 1819 } 1820 1821 /* 1822 * Creates or returns kept pages to the segment driver. returns -1 if a large 1823 * page cannot be allocated. returns -2 if some other process has allocated a 1824 * larger page. 1825 * 1826 * For cowfault it will allocate any size pages to fill the requested area to 1827 * avoid partially overwriting anon slots (i.e. sharing only some of the anon 1828 * slots within a large page with other processes). This policy greatly 1829 * simplifies large page freeing (which is only freed when all anon slot 1830 * refcnts are 0). 
*/
/*
 * Outputs, as set by the body below:
 *	ppa[]	 the page(s) found or created for the region.
 *	*protp	 starts as PROT_ALL; PROT_WRITE is cleared when a slot with
 *		 an_refcnt > 1 is seen (reset to PROT_ALL on the COW path).
 *	*ppa_szc set to MIN(larger page's szc, seg->s_szc) before -2 is
 *		 returned from this function.
 *
 * Return value: 0 on success, -1 / -2 as described above, ENOMEM when
 * anon_zero() fails in the szc == 0 case, any other non-zero error from
 * anon_getpage() or swap_getconpage(), or whatever
 * anon_map_privatepages() returns when a COW fault has to be resolved.
 */
int
anon_map_getpages(
	struct anon_map *amp,
	ulong_t	start_idx,
	uint_t	szc,
	struct seg *seg,
	caddr_t	addr,
	uint_t prot,
	uint_t *protp,
	page_t	*ppa[],
	uint_t	*ppa_szc,
	struct vpage vpage[],
	enum seg_rw rw,
	int brkcow,
	int anypgsz,
	int pgflags,
	struct cred *cred)
{
	pgcnt_t		pgcnt;
	struct anon	*ap;
	struct vnode	*vp;
	anoff_t		off;
	page_t		*pp, *pl[2], *conpp = NULL;
	caddr_t		vaddr;
	ulong_t		pg_idx, an_idx, i;
	spgcnt_t	nreloc = 0;
	int		prealloc = 1;
	int		err, slotcreate;
	uint_t		vpprot;
	/* caller asked for less than the segment's page size */
	int		upsize = (szc < seg->s_szc);

#if !defined(__i386) && !defined(__amd64)
	ASSERT(seg->s_szc != 0);
#endif
	ASSERT(szc <= seg->s_szc);
	ASSERT(ppa_szc != NULL);
	ASSERT(rw != S_CREATE);

	*protp = PROT_ALL;

	VM_STAT_ADD(anonvmstats.getpages[0]);

	/*
	 * PAGESIZE request: either fetch the existing anon page or create
	 * a zero-filled one; no large page handling is needed.
	 */
	if (szc == 0) {
		VM_STAT_ADD(anonvmstats.getpages[1]);
		if ((ap = anon_get_ptr(amp->ahp, start_idx)) != NULL) {
			err = anon_getpage(&ap, protp, pl, PAGESIZE, seg,
			    addr, rw, cred);
			if (err)
				return (err);
			ppa[0] = pl[0];
			if (brkcow == 0 || (*protp & PROT_WRITE)) {
				VM_STAT_ADD(anonvmstats.getpages[2]);
				if (ppa[0]->p_szc != 0 && upsize) {
					VM_STAT_ADD(anonvmstats.getpages[3]);
					*ppa_szc = MIN(ppa[0]->p_szc,
					    seg->s_szc);
					page_unlock(ppa[0]);
					return (-2);
				}
				return (0);
			}
			panic("anon_map_getpages: cowfault for szc 0");
		} else {
			VM_STAT_ADD(anonvmstats.getpages[4]);
			ppa[0] = anon_zero(seg, addr, &ap, cred);
			if (ppa[0] == NULL)
				return (ENOMEM);
			(void) anon_set_ptr(amp->ahp, start_idx, ap,
			    ANON_SLEEP);
			return (0);
		}
	}

	pgcnt = page_get_pagecnt(szc);
	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
	ASSERT(IS_P2ALIGNED(start_idx, pgcnt));

	/*
	 * First we check for the case that the requested large
	 * page or larger page already exists in the system.
	 * Actually we only check if the first constituent page
	 * exists and only preallocate if it's not found.
	 */
	ap = anon_get_ptr(amp->ahp, start_idx);
	if (ap) {
		uint_t pszc;
		swap_xlate(ap, &vp, &off);
		if (page_exists_forreal(vp, (u_offset_t)off, &pszc)) {
			if (pszc > szc && upsize) {
				*ppa_szc = MIN(pszc, seg->s_szc);
				return (-2);
			}
			if (pszc >= szc) {
				prealloc = 0;
			}
		}
	}

	VM_STAT_COND_ADD(prealloc == 0, anonvmstats.getpages[5]);
	VM_STAT_COND_ADD(prealloc != 0, anonvmstats.getpages[6]);

top:
	/*
	 * If a smaller page or no page at all was found,
	 * grab a large page off the freelist.
	 */
	if (prealloc) {
		ASSERT(conpp == NULL);
		if (page_alloc_pages(anon_vp, seg, addr, NULL, ppa,
		    szc, 0, pgflags) != 0) {
			VM_STAT_ADD(anonvmstats.getpages[7]);
			if (brkcow == 0 || szc < seg->s_szc ||
			    !anon_szcshare(amp->ahp, start_idx)) {
				/*
				 * If the refcnt's of all anon slots are <= 1
				 * they can't increase since we are holding
				 * the address space's lock. So segvn can
				 * safely decrease szc without risking to
				 * generate a cow fault for the region smaller
				 * than the segment's largest page size.
				 */
				VM_STAT_ADD(anonvmstats.getpages[8]);
				return (-1);
			}
docow:
			/*
			 * This is a cow fault. Copy away the entire 1 large
			 * page region of this segment.
			 */
			if (szc != seg->s_szc)
				panic("anon_map_getpages: cowfault for szc %d",
				    szc);
			vaddr = addr;
			for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt;
			    pg_idx++, an_idx++, vaddr += PAGESIZE) {
				if ((ap = anon_get_ptr(amp->ahp, an_idx)) !=
				    NULL) {
					err = anon_getpage(&ap, &vpprot, pl,
					    PAGESIZE, seg, vaddr, rw, cred);
					if (err) {
						/* unlock what we collected */
						for (i = 0; i < pg_idx; i++) {
							if ((pp = ppa[i]) !=
							    NULL)
								page_unlock(pp);
						}
						return (err);
					}
					ppa[pg_idx] = pl[0];
				} else {
					/*
					 * Since this is a cowfault we know
					 * that this address space has a
					 * parent or children which means
					 * anon_dup_fill_holes() has initialized
					 * all anon slots within a large page
					 * region that had at least one anon
					 * slot at the time of fork().
					 */
					panic("anon_map_getpages: "
					    "cowfault but anon slot is empty");
				}
			}
			VM_STAT_ADD(anonvmstats.getpages[9]);
			*protp = PROT_ALL;
			return (anon_map_privatepages(amp, start_idx, szc, seg,
			    addr, prot, ppa, vpage, anypgsz, pgflags, cred));
		}
	}

	VM_STAT_ADD(anonvmstats.getpages[10]);

	/*
	 * Walk the region one PAGESIZE slot at a time, fetching (or, for
	 * new slots, creating) each constituent page.
	 */
	an_idx = start_idx;
	pg_idx = 0;
	vaddr = addr;
	while (pg_idx < pgcnt) {
		slotcreate = 0;
		if ((ap = anon_get_ptr(amp->ahp, an_idx)) == NULL) {
			VM_STAT_ADD(anonvmstats.getpages[11]);
			/*
			 * For us to have decided not to preallocate
			 * would have meant that a large page
			 * was found. Which also means that all of the
			 * anon slots for that page would have been
			 * already created for us.
			 */
			if (prealloc == 0)
				panic("anon_map_getpages: prealloc = 0");

			slotcreate = 1;
			ap = anon_alloc(NULL, 0);
		}
		swap_xlate(ap, &vp, &off);

		/*
		 * Now setup our preallocated page to pass down
		 * to swap_getpage().
		 */
		if (prealloc) {
			ASSERT(ppa[pg_idx]->p_szc == szc);
			conpp = ppa[pg_idx];
		}
		ASSERT(prealloc || conpp == NULL);

		/*
		 * If we just created this anon slot then call
		 * with S_CREATE to prevent doing IO on the page.
		 * Similar to the anon_zero case.
		 */
		err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE,
		    NULL, pl, PAGESIZE, conpp, ppa_szc, &nreloc, seg, vaddr,
		    slotcreate == 1 ? S_CREATE : rw, cred);

		if (err) {
			ASSERT(err != -2 || upsize);
			VM_STAT_ADD(anonvmstats.getpages[12]);
			ASSERT(slotcreate == 0);
			goto io_err;
		}

		pp = pl[0];

		/*
		 * The page found is not of the requested size: either
		 * report the larger size (-2) or retry with preallocation.
		 */
		if (pp->p_szc < szc || (pp->p_szc > szc && upsize)) {
			VM_STAT_ADD(anonvmstats.getpages[13]);
			ASSERT(slotcreate == 0);
			ASSERT(prealloc == 0);
			ASSERT(pg_idx == 0);
			if (pp->p_szc > szc) {
				ASSERT(upsize);
				*ppa_szc = MIN(pp->p_szc, seg->s_szc);
				page_unlock(pp);
				VM_STAT_ADD(anonvmstats.getpages[14]);
				return (-2);
			}
			page_unlock(pp);
			prealloc = 1;
			goto top;
		}

		/*
		 * If we decided to preallocate but VOP_GETPAGE
		 * found a page in the system that satisfies our
		 * request then free up our preallocated large page
		 * and continue looping across the existing large
		 * page via VOP_GETPAGE.
		 */
		if (prealloc && pp != ppa[pg_idx]) {
			VM_STAT_ADD(anonvmstats.getpages[15]);
			ASSERT(slotcreate == 0);
			ASSERT(pg_idx == 0);
			conpp = NULL;
			prealloc = 0;
			page_free_pages(ppa[0]);
		}

		if (prealloc && nreloc > 1) {
			/*
			 * we have relocated out of a smaller large page.
			 * skip npgs - 1 iterations and continue which will
			 * increment by one the loop indices.
			 */
			/*
			 * NOTE(review): this is a while loop with no
			 * increment of its own; the code below advances
			 * the indices by the full npgs before continuing.
			 */
			spgcnt_t npgs = nreloc;

			VM_STAT_ADD(anonvmstats.getpages[16]);

			ASSERT(pp == ppa[pg_idx]);
			ASSERT(slotcreate == 0);
			ASSERT(pg_idx + npgs <= pgcnt);
			if ((*protp & PROT_WRITE) &&
			    anon_share(amp->ahp, an_idx, npgs)) {
				*protp &= ~PROT_WRITE;
			}
			pg_idx += npgs;
			an_idx += npgs;
			vaddr += PAGESIZE * npgs;
			continue;
		}

		VM_STAT_ADD(anonvmstats.getpages[17]);

		/*
		 * Anon_zero case.
		 */
		if (slotcreate) {
			ASSERT(prealloc);
			pagezero(pp, 0, PAGESIZE);
			CPU_STATS_ADD_K(vm, zfod, 1);
			hat_setrefmod(pp);
		}

		ASSERT(prealloc == 0 || ppa[pg_idx] == pp);
		ASSERT(prealloc != 0 || PAGE_SHARED(pp));
		ASSERT(prealloc == 0 || PAGE_EXCL(pp));

		/*
		 * Constituents must be physically consecutive, of equal
		 * szc, and the first one aligned to the large page size.
		 */
		if (pg_idx > 0 &&
		    ((page_pptonum(pp) != page_pptonum(ppa[pg_idx - 1]) + 1) ||
		    (pp->p_szc != ppa[pg_idx - 1]->p_szc))) {
			panic("anon_map_getpages: unexpected page");
		} else if (pg_idx == 0 && (page_pptonum(pp) & (pgcnt - 1))) {
			panic("anon_map_getpages: unaligned page");
		}

		if (prealloc == 0) {
			ppa[pg_idx] = pp;
		}

		/* a shared slot makes the whole region read-only */
		if (ap->an_refcnt > 1) {
			VM_STAT_ADD(anonvmstats.getpages[18]);
			*protp &= ~PROT_WRITE;
		}

		/*
		 * If this is a new anon slot then initialize
		 * the anon array entry.
		 */
		if (slotcreate) {
			(void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP);
		}
		pg_idx++;
		an_idx++;
		vaddr += PAGESIZE;
	}

	/*
	 * Since preallocated pages come off the freelist
	 * they are locked SE_EXCL. Simply downgrade and return.
	 */
	if (prealloc) {
		VM_STAT_ADD(anonvmstats.getpages[19]);
		conpp = NULL;
		for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
			page_downgrade(ppa[pg_idx]);
		}
	}
	ASSERT(conpp == NULL);

	if (brkcow == 0 || (*protp & PROT_WRITE)) {
		VM_STAT_ADD(anonvmstats.getpages[20]);
		return (0);
	}

	if (szc < seg->s_szc)
		panic("anon_map_getpages: cowfault for szc %d", szc);

	VM_STAT_ADD(anonvmstats.getpages[21]);

	*protp = PROT_ALL;
	return (anon_map_privatepages(amp, start_idx, szc, seg, addr, prot,
	    ppa, vpage, anypgsz, pgflags, cred));
io_err:
	/*
	 * We got an IO error somewhere in our large page.
	 * If we were using a preallocated page then just demote
	 * all the constituent pages that we've succeeded with so far
	 * to PAGESIZE pages and leave them in the system
	 * unlocked.
	 */

	ASSERT(err != -2 || ((pg_idx == 0) && upsize));

	VM_STAT_COND_ADD(err > 0, anonvmstats.getpages[22]);
	VM_STAT_COND_ADD(err == -1, anonvmstats.getpages[23]);
	VM_STAT_COND_ADD(err == -2, anonvmstats.getpages[24]);

	if (prealloc) {
		conpp = NULL;
		if (pg_idx > 0) {
			VM_STAT_ADD(anonvmstats.getpages[25]);
			for (i = 0; i < pgcnt; i++) {
				pp = ppa[i];
				ASSERT(PAGE_EXCL(pp));
				ASSERT(pp->p_szc == szc);
				pp->p_szc = 0;
			}
			for (i = 0; i < pg_idx; i++) {
				ASSERT(!hat_page_is_mapped(ppa[i]));
				page_unlock(ppa[i]);
			}
			/*
			 * Now free up the remaining unused constituent
			 * pages.
			 */
			while (pg_idx < pgcnt) {
				ASSERT(!hat_page_is_mapped(ppa[pg_idx]));
				page_free(ppa[pg_idx], 0);
				pg_idx++;
			}
		} else {
			/* nothing was used yet: free the page whole */
			VM_STAT_ADD(anonvmstats.getpages[26]);
			page_free_pages(ppa[0]);
		}
	} else {
		VM_STAT_ADD(anonvmstats.getpages[27]);
		ASSERT(err > 0);
		for (i = 0; i < pg_idx; i++)
			page_unlock(ppa[i]);
	}
	ASSERT(conpp == NULL);
	if (err != -1)
		return (err);
	/*
	 * we are here because we failed to relocate.
	 */
	ASSERT(prealloc);
	if (brkcow == 0 || szc < seg->s_szc ||
	    !anon_szcshare(amp->ahp, start_idx)) {
		VM_STAT_ADD(anonvmstats.getpages[28]);
		return (-1);
	}
	VM_STAT_ADD(anonvmstats.getpages[29]);
	goto docow;
}


/*
 * Turn a reference to an object or shared anon page
 * into a private page with a copy of the data from the
 * original page which is always locked by the caller.
 * This routine unloads the translation and unlocks the
 * original page, if it isn't being stolen, before returning
 * to the caller.
 *
 * NOTE:  The original anon slot is not freed by this routine
 *	  It must be freed by the caller while holding the
 *	  "anon_map" lock to prevent races which can occur if
 *	  a process has multiple lwps in its address space.
*/
/*
 * On success *app points at a newly allocated anon slot and the new page
 * is returned locked shared (it is downgraded before every successful
 * return).  On failure NULL is returned, *app is restored to its original
 * value, the new anon slot is released and opp is unlocked.
 *
 * oppflags, as tested below:
 *	STEAL_PAGE	opp (held SE_EXCL) is renamed to the new slot
 *			instead of being copied.
 *	LOCK_PAGE	the cowcnt/lckcnt claim on opp is transferred to
 *			the new page.
 */
page_t *
anon_private(
	struct anon **app,
	struct seg *seg,
	caddr_t addr,
	uint_t	prot,
	page_t *opp,
	int oppflags,
	struct cred *cred)
{
	struct anon *old = *app;
	struct anon *new;
	page_t *pp = NULL;
	struct vnode *vp;
	anoff_t off;
	page_t *anon_pl[1 + 1];
	int err;

	if (oppflags & STEAL_PAGE)
		ASSERT(PAGE_EXCL(opp));
	else
		ASSERT(PAGE_LOCKED(opp));

	CPU_STATS_ADD_K(vm, cow_fault, 1);

	/* Kernel probe */
	TNF_PROBE_1(anon_private, "vm pagefault", /* CSTYLED */,
		tnf_opaque,	address,	addr);

	*app = new = anon_alloc(NULL, 0);
	swap_xlate(new, &vp, &off);

	if (oppflags & STEAL_PAGE) {
		/* no copy needed: give the original page a new identity */
		page_rename(opp, vp, (u_offset_t)off);
		pp = opp;
		TRACE_5(TR_FAC_VM, TR_ANON_PRIVATE,
			"anon_private:seg %p addr %x pp %p vp %p off %lx",
			seg, addr, pp, vp, off);
		hat_setmod(pp);

		/* bug 4026339 */
		page_downgrade(pp);
		return (pp);
	}

	/*
	 * Call the VOP_GETPAGE routine to create the page, thereby
	 * enabling the vnode driver to allocate any filesystem
	 * space (e.g., disk block allocation for UFS).  This also
	 * prevents more than one page from being added to the
	 * vnode at the same time.
	 */
	err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, NULL,
	    anon_pl, PAGESIZE, seg, addr, S_CREATE, cred, NULL);
	if (err)
		goto out;

	pp = anon_pl[0];

	/*
	 * If the original page was locked, we need to move the lock
	 * to the new page by transferring 'cowcnt/lckcnt' of the original
	 * page to 'cowcnt/lckcnt' of the new page.
	 *
	 * See Statement at the beginning of segvn_lockop() and
	 * comments in page_pp_useclaim() regarding the way
	 * cowcnts/lckcnts are handled.
	 *
	 * Also availrmem must be decremented up front for read only mapping
	 * before calling page_pp_useclaim. page_pp_useclaim will bump it back
	 * if availrmem did not need to be decremented after all.
	 */
	if (oppflags & LOCK_PAGE) {
		if ((prot & PROT_WRITE) == 0) {
			mutex_enter(&freemem_lock);
			if (availrmem > pages_pp_maximum) {
				availrmem--;
				pages_useclaim++;
			} else {
				/* not enough memory to take the claim */
				mutex_exit(&freemem_lock);
				goto out;
			}
			mutex_exit(&freemem_lock);
		}
		page_pp_useclaim(opp, pp, prot & PROT_WRITE);
	}

	/*
	 * Now copy the contents from the original page,
	 * which is locked and loaded in the MMU by
	 * the caller to prevent yet another page fault.
	 */
	/* XXX - should set mod bit in here */
	if (ppcopy(opp, pp) == 0) {
		/*
		 * Before ppcopy could handle UE or other faults, we
		 * would have panicked here, and still have no option
		 * but to do so now.
		 */
		panic("anon_private, ppcopy failed, opp = 0x%p, pp = 0x%p",
		    opp, pp);
	}

	hat_setrefmod(pp);		/* mark as modified */

	/*
	 * Unload the old translation.
	 */
	hat_unload(seg->s_as->a_hat, addr, PAGESIZE, HAT_UNLOAD);

	/*
	 * Free unmapped, unmodified original page.
	 * or release the lock on the original page,
	 * otherwise the process will sleep forever in
	 * anon_decref() waiting for the "exclusive" lock
	 * on the page.
	 */
	(void) page_release(opp, 1);

	/*
	 * we are done with page creation so downgrade the new
	 * page's selock to shared, this helps when multiple
	 * as_fault(...SOFTLOCK...) are done to the same
	 * page(aio)
	 */
	page_downgrade(pp);

	/*
	 * NOTE:  The original anon slot must be freed by the
	 * caller while holding the "anon_map" lock, if we
	 * copied away from an anonymous page.
	 */
	return (pp);

out:
	/* failure: undo the slot allocation and drop both page locks */
	*app = old;
	if (pp)
		page_unlock(pp);
	anon_decref(new);
	page_unlock(opp);
	return ((page_t *)NULL);
}

int
anon_map_privatepages(
	struct anon_map *amp,
	ulong_t	start_idx,
	uint_t	szc,
	struct seg *seg,
	caddr_t addr,
	uint_t	prot,
	page_t	*ppa[],
	struct vpage vpage[],
	int anypgsz,
	int pgflags,
	struct cred *cred)
{
	pgcnt_t		pgcnt;
	struct vnode	*vp;
	anoff_t		off;
	page_t		*pl[2], *conpp = NULL;
	int		err;
	int		prealloc = 1;
	struct anon	*ap, *oldap;
	caddr_t		vaddr;
	page_t		*pplist, *pp;
	ulong_t		pg_idx, an_idx;
	spgcnt_t	nreloc = 0;
	int		pagelock = 0;
	kmutex_t	*ahmpages = NULL;
#ifdef DEBUG
	int		refcnt;
#endif

	ASSERT(szc != 0);
	ASSERT(szc == seg->s_szc);

	VM_STAT_ADD(anonvmstats.privatepages[0]);

	pgcnt = page_get_pagecnt(szc);
	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
	ASSERT(IS_P2ALIGNED(start_idx, pgcnt));

	ASSERT(amp != NULL);
	ap = anon_get_ptr(amp->ahp, start_idx);
	ASSERT(ap == NULL || ap->an_refcnt >= 1);

	VM_STAT_COND_ADD(ap == NULL, anonvmstats.privatepages[1]);

	/*
	 * Now try and allocate the large page. If we fail then just
	 * let VOP_GETPAGE give us PAGESIZE pages. Normally we let
	 * the caller make this decision but to avoid added complexity
	 * it's simplier to handle that case here.
	 */
	if (anypgsz == -1) {
		VM_STAT_ADD(anonvmstats.privatepages[2]);
		prealloc = 0;
	} else if (page_alloc_pages(anon_vp, seg, addr, &pplist, NULL, szc,
	    anypgsz, pgflags) != 0) {
		VM_STAT_ADD(anonvmstats.privatepages[3]);
		prealloc = 0;
	}

	/*
	 * make the decrement of all refcnts of all
	 * anon slots of a large page appear atomic by
	 * getting an anonpages_hash_lock for the
	 * first anon slot of a large page.
2467 */ 2468 if (ap != NULL) { 2469 ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, 2470 ap->an_off)]; 2471 mutex_enter(ahmpages); 2472 if (ap->an_refcnt == 1) { 2473 VM_STAT_ADD(anonvmstats.privatepages[4]); 2474 ASSERT(!anon_share(amp->ahp, start_idx, pgcnt)); 2475 mutex_exit(ahmpages); 2476 2477 if (prealloc) { 2478 page_free_replacement_page(pplist); 2479 page_create_putback(pgcnt); 2480 } 2481 ASSERT(ppa[0]->p_szc <= szc); 2482 if (ppa[0]->p_szc == szc) { 2483 VM_STAT_ADD(anonvmstats.privatepages[5]); 2484 return (0); 2485 } 2486 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { 2487 ASSERT(ppa[pg_idx] != NULL); 2488 page_unlock(ppa[pg_idx]); 2489 } 2490 return (-1); 2491 } 2492 } 2493 2494 /* 2495 * If we are passed in the vpage array and this is 2496 * not PROT_WRITE then we need to decrement availrmem 2497 * up front before we try anything. If we need to and 2498 * can't decrement availrmem then its better to fail now 2499 * than in the middle of processing the new large page. 2500 * page_pp_usclaim() on behalf of each constituent page 2501 * below will adjust availrmem back for the cases not needed. 
2502 */ 2503 if (vpage != NULL && (prot & PROT_WRITE) == 0) { 2504 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { 2505 if (VPP_ISPPLOCK(&vpage[pg_idx])) { 2506 pagelock = 1; 2507 break; 2508 } 2509 } 2510 if (pagelock) { 2511 VM_STAT_ADD(anonvmstats.privatepages[6]); 2512 mutex_enter(&freemem_lock); 2513 if (availrmem >= pages_pp_maximum + pgcnt) { 2514 availrmem -= pgcnt; 2515 pages_useclaim += pgcnt; 2516 } else { 2517 VM_STAT_ADD(anonvmstats.privatepages[7]); 2518 mutex_exit(&freemem_lock); 2519 if (ahmpages != NULL) { 2520 mutex_exit(ahmpages); 2521 } 2522 if (prealloc) { 2523 page_free_replacement_page(pplist); 2524 page_create_putback(pgcnt); 2525 } 2526 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) 2527 if (ppa[pg_idx] != NULL) 2528 page_unlock(ppa[pg_idx]); 2529 return (ENOMEM); 2530 } 2531 mutex_exit(&freemem_lock); 2532 } 2533 } 2534 2535 CPU_STATS_ADD_K(vm, cow_fault, pgcnt); 2536 2537 VM_STAT_ADD(anonvmstats.privatepages[8]); 2538 2539 an_idx = start_idx; 2540 pg_idx = 0; 2541 vaddr = addr; 2542 for (; pg_idx < pgcnt; pg_idx++, an_idx++, vaddr += PAGESIZE) { 2543 ASSERT(ppa[pg_idx] != NULL); 2544 oldap = anon_get_ptr(amp->ahp, an_idx); 2545 ASSERT(ahmpages != NULL || oldap == NULL); 2546 ASSERT(ahmpages == NULL || oldap != NULL); 2547 ASSERT(ahmpages == NULL || oldap->an_refcnt > 1); 2548 ASSERT(ahmpages == NULL || pg_idx != 0 || 2549 (refcnt = oldap->an_refcnt)); 2550 ASSERT(ahmpages == NULL || pg_idx == 0 || 2551 refcnt == oldap->an_refcnt); 2552 2553 ap = anon_alloc(NULL, 0); 2554 2555 swap_xlate(ap, &vp, &off); 2556 2557 /* 2558 * Now setup our preallocated page to pass down to 2559 * swap_getpage(). 2560 */ 2561 if (prealloc) { 2562 pp = pplist; 2563 page_sub(&pplist, pp); 2564 conpp = pp; 2565 } 2566 2567 err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE, NULL, pl, 2568 PAGESIZE, conpp, NULL, &nreloc, seg, vaddr, 2569 S_CREATE, cred); 2570 2571 /* 2572 * Impossible to fail this is S_CREATE. 
2573 */ 2574 if (err) 2575 panic("anon_map_privatepages: VOP_GETPAGE failed"); 2576 2577 ASSERT(prealloc ? pp == pl[0] : pl[0]->p_szc == 0); 2578 ASSERT(prealloc == 0 || nreloc == 1); 2579 2580 pp = pl[0]; 2581 2582 /* 2583 * If the original page was locked, we need to move 2584 * the lock to the new page by transfering 2585 * 'cowcnt/lckcnt' of the original page to 'cowcnt/lckcnt' 2586 * of the new page. pg_idx can be used to index 2587 * into the vpage array since the caller will guarentee 2588 * that vpage struct passed in corresponds to addr 2589 * and forward. 2590 */ 2591 if (vpage != NULL && VPP_ISPPLOCK(&vpage[pg_idx])) { 2592 page_pp_useclaim(ppa[pg_idx], pp, prot & PROT_WRITE); 2593 } else if (pagelock) { 2594 mutex_enter(&freemem_lock); 2595 availrmem++; 2596 pages_useclaim--; 2597 mutex_exit(&freemem_lock); 2598 } 2599 2600 /* 2601 * Now copy the contents from the original page. 2602 */ 2603 if (ppcopy(ppa[pg_idx], pp) == 0) { 2604 /* 2605 * Before ppcopy could hanlde UE or other faults, we 2606 * would have panicked here, and still have no option 2607 * but to do so now. 2608 */ 2609 panic("anon_map_privatepages, ppcopy failed"); 2610 } 2611 2612 hat_setrefmod(pp); /* mark as modified */ 2613 2614 /* 2615 * Release the lock on the original page, 2616 * derement the old slot, and down grade the lock 2617 * on the new copy. 2618 */ 2619 page_unlock(ppa[pg_idx]); 2620 2621 if (!prealloc) 2622 page_downgrade(pp); 2623 2624 ppa[pg_idx] = pp; 2625 2626 /* 2627 * Now reflect the copy in the new anon array. 
2628 */ 2629 ASSERT(ahmpages == NULL || oldap->an_refcnt > 1); 2630 if (oldap != NULL) 2631 anon_decref(oldap); 2632 (void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP); 2633 } 2634 if (ahmpages != NULL) { 2635 mutex_exit(ahmpages); 2636 } 2637 ASSERT(prealloc == 0 || pplist == NULL); 2638 if (prealloc) { 2639 VM_STAT_ADD(anonvmstats.privatepages[9]); 2640 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { 2641 page_downgrade(ppa[pg_idx]); 2642 } 2643 } 2644 2645 /* 2646 * Unload the old large page translation. 2647 */ 2648 hat_unload(seg->s_as->a_hat, addr, pgcnt << PAGESHIFT, HAT_UNLOAD); 2649 return (0); 2650 } 2651 2652 /* 2653 * Allocate a private zero-filled anon page. 2654 */ 2655 page_t * 2656 anon_zero(struct seg *seg, caddr_t addr, struct anon **app, struct cred *cred) 2657 { 2658 struct anon *ap; 2659 page_t *pp; 2660 struct vnode *vp; 2661 anoff_t off; 2662 page_t *anon_pl[1 + 1]; 2663 int err; 2664 2665 /* Kernel probe */ 2666 TNF_PROBE_1(anon_zero, "vm pagefault", /* CSTYLED */, 2667 tnf_opaque, address, addr); 2668 2669 *app = ap = anon_alloc(NULL, 0); 2670 swap_xlate(ap, &vp, &off); 2671 2672 /* 2673 * Call the VOP_GETPAGE routine to create the page, thereby 2674 * enabling the vnode driver to allocate any filesystem 2675 * dependent structures (e.g., disk block allocation for UFS). 2676 * This also prevents more than on page from being added to 2677 * the vnode at the same time since it is locked. 
2678 */ 2679 err = VOP_GETPAGE(vp, off, PAGESIZE, NULL, 2680 anon_pl, PAGESIZE, seg, addr, S_CREATE, cred, NULL); 2681 if (err) { 2682 *app = NULL; 2683 anon_decref(ap); 2684 return (NULL); 2685 } 2686 pp = anon_pl[0]; 2687 2688 pagezero(pp, 0, PAGESIZE); /* XXX - should set mod bit */ 2689 page_downgrade(pp); 2690 CPU_STATS_ADD_K(vm, zfod, 1); 2691 hat_setrefmod(pp); /* mark as modified so pageout writes back */ 2692 return (pp); 2693 } 2694 2695 2696 /* 2697 * Allocate array of private zero-filled anon pages for empty slots 2698 * and kept pages for non empty slots within given range. 2699 * 2700 * NOTE: This rontine will try and use large pages 2701 * if available and supported by underlying platform. 2702 */ 2703 int 2704 anon_map_createpages( 2705 struct anon_map *amp, 2706 ulong_t start_index, 2707 size_t len, 2708 page_t *ppa[], 2709 struct seg *seg, 2710 caddr_t addr, 2711 enum seg_rw rw, 2712 struct cred *cred) 2713 { 2714 2715 struct anon *ap; 2716 struct vnode *ap_vp; 2717 page_t *pp, *pplist, *anon_pl[1 + 1], *conpp = NULL; 2718 int err = 0; 2719 ulong_t p_index, index; 2720 pgcnt_t npgs, pg_cnt; 2721 spgcnt_t nreloc = 0; 2722 uint_t l_szc, szc, prot; 2723 anoff_t ap_off; 2724 size_t pgsz; 2725 lgrp_t *lgrp; 2726 kmutex_t *ahm; 2727 2728 /* 2729 * XXX For now only handle S_CREATE. 2730 */ 2731 ASSERT(rw == S_CREATE); 2732 2733 index = start_index; 2734 p_index = 0; 2735 npgs = btopr(len); 2736 2737 /* 2738 * If this platform supports multiple page sizes 2739 * then try and allocate directly from the free 2740 * list for pages larger than PAGESIZE. 2741 * 2742 * NOTE:When we have page_create_ru we can stop 2743 * directly allocating from the freelist. 2744 */ 2745 l_szc = seg->s_szc; 2746 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 2747 while (npgs) { 2748 2749 /* 2750 * if anon slot already exists 2751 * (means page has been created) 2752 * so 1) look up the page 2753 * 2) if the page is still in memory, get it. 
2754 * 3) if not, create a page and 2755 * page in from physical swap device. 2756 * These are done in anon_getpage(). 2757 */ 2758 ap = anon_get_ptr(amp->ahp, index); 2759 if (ap) { 2760 err = anon_getpage(&ap, &prot, anon_pl, PAGESIZE, 2761 seg, addr, S_READ, cred); 2762 if (err) { 2763 ANON_LOCK_EXIT(&->a_rwlock); 2764 panic("anon_map_createpages: anon_getpage"); 2765 } 2766 pp = anon_pl[0]; 2767 ppa[p_index++] = pp; 2768 2769 /* 2770 * an_pvp can become non-NULL after SysV's page was 2771 * paged out before ISM was attached to this SysV 2772 * shared memory segment. So free swap slot if needed. 2773 */ 2774 if (ap->an_pvp != NULL) { 2775 page_io_lock(pp); 2776 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, 2777 ap->an_off)]; 2778 mutex_enter(ahm); 2779 if (ap->an_pvp != NULL) { 2780 swap_phys_free(ap->an_pvp, 2781 ap->an_poff, PAGESIZE); 2782 ap->an_pvp = NULL; 2783 ap->an_poff = 0; 2784 mutex_exit(ahm); 2785 hat_setmod(pp); 2786 } else { 2787 mutex_exit(ahm); 2788 } 2789 page_io_unlock(pp); 2790 } 2791 2792 addr += PAGESIZE; 2793 index++; 2794 npgs--; 2795 continue; 2796 } 2797 /* 2798 * Now try and allocate the largest page possible 2799 * for the current address and range. 2800 * Keep dropping down in page size until: 2801 * 2802 * 1) Properly aligned 2803 * 2) Does not overlap existing anon pages 2804 * 3) Fits in remaining range. 2805 * 4) able to allocate one. 2806 * 2807 * NOTE: XXX When page_create_ru is completed this code 2808 * will change. 2809 */ 2810 szc = l_szc; 2811 pplist = NULL; 2812 pg_cnt = 0; 2813 while (szc) { 2814 pgsz = page_get_pagesize(szc); 2815 pg_cnt = pgsz >> PAGESHIFT; 2816 if (IS_P2ALIGNED(addr, pgsz) && pg_cnt <= npgs && 2817 anon_pages(amp->ahp, index, pg_cnt) == 0) { 2818 /* 2819 * XXX 2820 * Since we are faking page_create() 2821 * we also need to do the freemem and 2822 * pcf accounting. 
2823 */ 2824 (void) page_create_wait(pg_cnt, PG_WAIT); 2825 2826 /* 2827 * Get lgroup to allocate next page of shared 2828 * memory from and use it to specify where to 2829 * allocate the physical memory 2830 */ 2831 lgrp = lgrp_mem_choose(seg, addr, pgsz); 2832 2833 pplist = page_get_freelist( 2834 anon_vp, (u_offset_t)0, seg, 2835 addr, pgsz, 0, lgrp); 2836 2837 if (pplist == NULL) { 2838 page_create_putback(pg_cnt); 2839 } 2840 2841 /* 2842 * If a request for a page of size 2843 * larger than PAGESIZE failed 2844 * then don't try that size anymore. 2845 */ 2846 if (pplist == NULL) { 2847 l_szc = szc - 1; 2848 } else { 2849 break; 2850 } 2851 } 2852 szc--; 2853 } 2854 2855 /* 2856 * If just using PAGESIZE pages then don't 2857 * directly allocate from the free list. 2858 */ 2859 if (pplist == NULL) { 2860 ASSERT(szc == 0); 2861 pp = anon_zero(seg, addr, &ap, cred); 2862 if (pp == NULL) { 2863 ANON_LOCK_EXIT(&->a_rwlock); 2864 panic("anon_map_createpages: anon_zero"); 2865 } 2866 ppa[p_index++] = pp; 2867 2868 ASSERT(anon_get_ptr(amp->ahp, index) == NULL); 2869 (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP); 2870 2871 addr += PAGESIZE; 2872 index++; 2873 npgs--; 2874 continue; 2875 } 2876 2877 /* 2878 * pplist is a list of pg_cnt PAGESIZE pages. 2879 * These pages are locked SE_EXCL since they 2880 * came directly off the free list. 
2881 */ 2882 ASSERT(IS_P2ALIGNED(pg_cnt, pg_cnt)); 2883 ASSERT(IS_P2ALIGNED(index, pg_cnt)); 2884 ASSERT(conpp == NULL); 2885 while (pg_cnt--) { 2886 2887 ap = anon_alloc(NULL, 0); 2888 swap_xlate(ap, &ap_vp, &ap_off); 2889 2890 ASSERT(pplist != NULL); 2891 pp = pplist; 2892 page_sub(&pplist, pp); 2893 PP_CLRFREE(pp); 2894 PP_CLRAGED(pp); 2895 conpp = pp; 2896 2897 err = swap_getconpage(ap_vp, ap_off, PAGESIZE, 2898 (uint_t *)NULL, anon_pl, PAGESIZE, conpp, NULL, 2899 &nreloc, seg, addr, S_CREATE, cred); 2900 2901 if (err) { 2902 ANON_LOCK_EXIT(&->a_rwlock); 2903 panic("anon_map_createpages: S_CREATE"); 2904 } 2905 2906 ASSERT(anon_pl[0] == pp); 2907 ASSERT(nreloc == 1); 2908 pagezero(pp, 0, PAGESIZE); 2909 CPU_STATS_ADD_K(vm, zfod, 1); 2910 hat_setrefmod(pp); 2911 2912 ASSERT(anon_get_ptr(amp->ahp, index) == NULL); 2913 (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP); 2914 2915 ppa[p_index++] = pp; 2916 2917 addr += PAGESIZE; 2918 index++; 2919 npgs--; 2920 } 2921 conpp = NULL; 2922 pg_cnt = pgsz >> PAGESHIFT; 2923 p_index = p_index - pg_cnt; 2924 while (pg_cnt--) { 2925 page_downgrade(ppa[p_index++]); 2926 } 2927 } 2928 ANON_LOCK_EXIT(&->a_rwlock); 2929 return (0); 2930 } 2931 2932 static int 2933 anon_try_demote_pages( 2934 struct anon_hdr *ahp, 2935 ulong_t sidx, 2936 uint_t szc, 2937 page_t **ppa, 2938 int private) 2939 { 2940 struct anon *ap; 2941 pgcnt_t pgcnt = page_get_pagecnt(szc); 2942 page_t *pp; 2943 pgcnt_t i; 2944 kmutex_t *ahmpages = NULL; 2945 int root = 0; 2946 pgcnt_t npgs; 2947 pgcnt_t curnpgs = 0; 2948 size_t ppasize = 0; 2949 2950 ASSERT(szc != 0); 2951 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 2952 ASSERT(IS_P2ALIGNED(sidx, pgcnt)); 2953 ASSERT(sidx < ahp->size); 2954 2955 if (ppa == NULL) { 2956 ppasize = pgcnt * sizeof (page_t *); 2957 ppa = kmem_alloc(ppasize, KM_SLEEP); 2958 } 2959 2960 ap = anon_get_ptr(ahp, sidx); 2961 if (ap != NULL && private) { 2962 VM_STAT_ADD(anonvmstats.demotepages[1]); 2963 ahmpages = 
&anonpages_hash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 2964 mutex_enter(ahmpages); 2965 } 2966 2967 if (ap != NULL && ap->an_refcnt > 1) { 2968 if (ahmpages != NULL) { 2969 VM_STAT_ADD(anonvmstats.demotepages[2]); 2970 mutex_exit(ahmpages); 2971 } 2972 if (ppasize != 0) { 2973 kmem_free(ppa, ppasize); 2974 } 2975 return (0); 2976 } 2977 if (ahmpages != NULL) { 2978 mutex_exit(ahmpages); 2979 } 2980 if (ahp->size - sidx < pgcnt) { 2981 ASSERT(private == 0); 2982 pgcnt = ahp->size - sidx; 2983 } 2984 for (i = 0; i < pgcnt; i++, sidx++) { 2985 ap = anon_get_ptr(ahp, sidx); 2986 if (ap != NULL) { 2987 if (ap->an_refcnt != 1) { 2988 panic("anon_try_demote_pages: an_refcnt != 1"); 2989 } 2990 pp = ppa[i] = page_lookup(ap->an_vp, ap->an_off, 2991 SE_EXCL); 2992 if (pp != NULL) { 2993 (void) hat_pageunload(pp, 2994 HAT_FORCE_PGUNLOAD); 2995 } 2996 } else { 2997 ppa[i] = NULL; 2998 } 2999 } 3000 for (i = 0; i < pgcnt; i++) { 3001 if ((pp = ppa[i]) != NULL && pp->p_szc != 0) { 3002 ASSERT(pp->p_szc <= szc); 3003 if (!root) { 3004 VM_STAT_ADD(anonvmstats.demotepages[3]); 3005 if (curnpgs != 0) 3006 panic("anon_try_demote_pages: " 3007 "bad large page"); 3008 3009 root = 1; 3010 curnpgs = npgs = 3011 page_get_pagecnt(pp->p_szc); 3012 3013 ASSERT(npgs <= pgcnt); 3014 ASSERT(IS_P2ALIGNED(npgs, npgs)); 3015 ASSERT(!(page_pptonum(pp) & (npgs - 1))); 3016 } else { 3017 ASSERT(i > 0); 3018 ASSERT(page_pptonum(pp) - 1 == 3019 page_pptonum(ppa[i - 1])); 3020 if ((page_pptonum(pp) & (npgs - 1)) == 3021 npgs - 1) 3022 root = 0; 3023 } 3024 ASSERT(PAGE_EXCL(pp)); 3025 pp->p_szc = 0; 3026 ASSERT(curnpgs > 0); 3027 curnpgs--; 3028 } 3029 } 3030 if (root != 0 || curnpgs != 0) 3031 panic("anon_try_demote_pages: bad large page"); 3032 3033 for (i = 0; i < pgcnt; i++) { 3034 if ((pp = ppa[i]) != NULL) { 3035 ASSERT(!hat_page_is_mapped(pp)); 3036 ASSERT(pp->p_szc == 0); 3037 page_unlock(pp); 3038 } 3039 } 3040 if (ppasize != 0) { 3041 kmem_free(ppa, ppasize); 3042 } 3043 return (1); 3044 } 3045 
3046 /* 3047 * anon_map_demotepages() can only be called by MAP_PRIVATE segments. 3048 */ 3049 int 3050 anon_map_demotepages( 3051 struct anon_map *amp, 3052 ulong_t start_idx, 3053 struct seg *seg, 3054 caddr_t addr, 3055 uint_t prot, 3056 struct vpage vpage[], 3057 struct cred *cred) 3058 { 3059 struct anon *ap; 3060 uint_t szc = seg->s_szc; 3061 pgcnt_t pgcnt = page_get_pagecnt(szc); 3062 size_t ppasize = pgcnt * sizeof (page_t *); 3063 page_t **ppa = kmem_alloc(ppasize, KM_SLEEP); 3064 page_t *pp; 3065 page_t *pl[2]; 3066 pgcnt_t i, pg_idx; 3067 ulong_t an_idx; 3068 caddr_t vaddr; 3069 int err; 3070 int retry = 0; 3071 uint_t vpprot; 3072 3073 ASSERT(RW_WRITE_HELD(&->a_rwlock)); 3074 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 3075 ASSERT(IS_P2ALIGNED(start_idx, pgcnt)); 3076 ASSERT(ppa != NULL); 3077 ASSERT(szc != 0); 3078 ASSERT(szc == amp->a_szc); 3079 3080 VM_STAT_ADD(anonvmstats.demotepages[0]); 3081 3082 top: 3083 if (anon_try_demote_pages(amp->ahp, start_idx, szc, ppa, 1)) { 3084 kmem_free(ppa, ppasize); 3085 return (0); 3086 } 3087 3088 VM_STAT_ADD(anonvmstats.demotepages[4]); 3089 3090 ASSERT(retry == 0); /* we can be here only once */ 3091 3092 vaddr = addr; 3093 for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt; 3094 pg_idx++, an_idx++, vaddr += PAGESIZE) { 3095 ap = anon_get_ptr(amp->ahp, an_idx); 3096 if (ap == NULL) 3097 panic("anon_map_demotepages: no anon slot"); 3098 err = anon_getpage(&ap, &vpprot, pl, PAGESIZE, seg, vaddr, 3099 S_READ, cred); 3100 if (err) { 3101 for (i = 0; i < pg_idx; i++) { 3102 if ((pp = ppa[i]) != NULL) 3103 page_unlock(pp); 3104 } 3105 kmem_free(ppa, ppasize); 3106 return (err); 3107 } 3108 ppa[pg_idx] = pl[0]; 3109 } 3110 3111 err = anon_map_privatepages(amp, start_idx, szc, seg, addr, prot, ppa, 3112 vpage, -1, 0, cred); 3113 if (err > 0) { 3114 VM_STAT_ADD(anonvmstats.demotepages[5]); 3115 kmem_free(ppa, ppasize); 3116 return (err); 3117 } 3118 ASSERT(err == 0 || err == -1); 3119 if (err == -1) { 3120 
VM_STAT_ADD(anonvmstats.demotepages[6]); 3121 retry = 1; 3122 goto top; 3123 } 3124 for (i = 0; i < pgcnt; i++) { 3125 ASSERT(ppa[i] != NULL); 3126 if (ppa[i]->p_szc != 0) 3127 retry = 1; 3128 page_unlock(ppa[i]); 3129 } 3130 if (retry) { 3131 VM_STAT_ADD(anonvmstats.demotepages[7]); 3132 goto top; 3133 } 3134 3135 VM_STAT_ADD(anonvmstats.demotepages[8]); 3136 3137 kmem_free(ppa, ppasize); 3138 3139 return (0); 3140 } 3141 3142 /* 3143 * Free pages of shared anon map. It's assumed that anon maps don't share anon 3144 * structures with private anon maps. Therefore all anon structures should 3145 * have at most one reference at this point. This means underlying pages can 3146 * be exclusively locked and demoted or freed. If not freeing the entire 3147 * large pages demote the ends of the region we free to be able to free 3148 * subpages. Page roots correspond to aligned index positions in anon map. 3149 */ 3150 void 3151 anon_shmap_free_pages(struct anon_map *amp, ulong_t sidx, size_t len) 3152 { 3153 ulong_t eidx = sidx + btopr(len); 3154 pgcnt_t pages = page_get_pagecnt(amp->a_szc); 3155 struct anon_hdr *ahp = amp->ahp; 3156 ulong_t tidx; 3157 size_t size; 3158 ulong_t sidx_aligned; 3159 ulong_t eidx_aligned; 3160 3161 ASSERT(RW_WRITE_HELD(&->a_rwlock)); 3162 ASSERT(amp->refcnt <= 1); 3163 ASSERT(amp->a_szc > 0); 3164 ASSERT(eidx <= ahp->size); 3165 ASSERT(!anon_share(ahp, sidx, btopr(len))); 3166 3167 if (len == 0) { /* XXX */ 3168 return; 3169 } 3170 3171 sidx_aligned = P2ALIGN(sidx, pages); 3172 if (sidx_aligned != sidx || 3173 (eidx < sidx_aligned + pages && eidx < ahp->size)) { 3174 if (!anon_try_demote_pages(ahp, sidx_aligned, 3175 amp->a_szc, NULL, 0)) { 3176 panic("anon_shmap_free_pages: demote failed"); 3177 } 3178 size = (eidx <= sidx_aligned + pages) ? 
(eidx - sidx) : 3179 P2NPHASE(sidx, pages); 3180 size <<= PAGESHIFT; 3181 anon_free(ahp, sidx, size); 3182 sidx = sidx_aligned + pages; 3183 if (eidx <= sidx) { 3184 return; 3185 } 3186 } 3187 eidx_aligned = P2ALIGN(eidx, pages); 3188 if (sidx < eidx_aligned) { 3189 anon_free_pages(ahp, sidx, 3190 (eidx_aligned - sidx) << PAGESHIFT, 3191 amp->a_szc); 3192 sidx = eidx_aligned; 3193 } 3194 ASSERT(sidx == eidx_aligned); 3195 if (eidx == eidx_aligned) { 3196 return; 3197 } 3198 tidx = eidx; 3199 if (eidx != ahp->size && anon_get_next_ptr(ahp, &tidx) != NULL && 3200 tidx - sidx < pages) { 3201 if (!anon_try_demote_pages(ahp, sidx, amp->a_szc, NULL, 0)) { 3202 panic("anon_shmap_free_pages: demote failed"); 3203 } 3204 size = (eidx - sidx) << PAGESHIFT; 3205 anon_free(ahp, sidx, size); 3206 } else { 3207 anon_free_pages(ahp, sidx, pages << PAGESHIFT, amp->a_szc); 3208 } 3209 } 3210 3211 /* 3212 * Allocate and initialize an anon_map structure for seg 3213 * associating the given swap reservation with the new anon_map. 3214 */ 3215 struct anon_map * 3216 anonmap_alloc(size_t size, size_t swresv, int flags) 3217 { 3218 struct anon_map *amp; 3219 int kmflags = (flags & ANON_NOSLEEP) ? 
KM_NOSLEEP : KM_SLEEP; 3220 3221 amp = kmem_cache_alloc(anonmap_cache, kmflags); 3222 if (amp == NULL) { 3223 ASSERT(kmflags == KM_NOSLEEP); 3224 return (NULL); 3225 } 3226 3227 amp->ahp = anon_create(btopr(size), flags); 3228 if (amp->ahp == NULL) { 3229 ASSERT(flags == ANON_NOSLEEP); 3230 kmem_cache_free(anonmap_cache, amp); 3231 return (NULL); 3232 } 3233 amp->refcnt = 1; 3234 amp->size = size; 3235 amp->swresv = swresv; 3236 amp->locality = 0; 3237 amp->a_szc = 0; 3238 amp->a_sp = NULL; 3239 return (amp); 3240 } 3241 3242 void 3243 anonmap_free(struct anon_map *amp) 3244 { 3245 ASSERT(amp->ahp); 3246 ASSERT(amp->refcnt == 0); 3247 3248 lgrp_shm_policy_fini(amp, NULL); 3249 anon_release(amp->ahp, btopr(amp->size)); 3250 kmem_cache_free(anonmap_cache, amp); 3251 } 3252 3253 /* 3254 * Returns true if the app array has some empty slots. 3255 * The offp and lenp parameters are in/out parameters. On entry 3256 * these values represent the starting offset and length of the 3257 * mapping. When true is returned, these values may be modified 3258 * to be the largest range which includes empty slots. 3259 */ 3260 int 3261 non_anon(struct anon_hdr *ahp, ulong_t anon_idx, u_offset_t *offp, 3262 size_t *lenp) 3263 { 3264 ulong_t i, el; 3265 ssize_t low, high; 3266 struct anon *ap; 3267 3268 low = -1; 3269 for (i = 0, el = *lenp; i < el; i += PAGESIZE, anon_idx++) { 3270 ap = anon_get_ptr(ahp, anon_idx); 3271 if (ap == NULL) { 3272 if (low == -1) 3273 low = i; 3274 high = i; 3275 } 3276 } 3277 if (low != -1) { 3278 /* 3279 * Found at least one non-anon page. 3280 * Set up the off and len return values. 3281 */ 3282 if (low != 0) 3283 *offp += low; 3284 *lenp = high - low + PAGESIZE; 3285 return (1); 3286 } 3287 return (0); 3288 } 3289 3290 /* 3291 * Return a count of the number of existing anon pages in the anon array 3292 * app in the range (off, off+len). The array and slots must be guaranteed 3293 * stable by the caller. 
3294 */ 3295 pgcnt_t 3296 anon_pages(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots) 3297 { 3298 pgcnt_t cnt = 0; 3299 3300 while (nslots-- > 0) { 3301 if ((anon_get_ptr(ahp, anon_index)) != NULL) 3302 cnt++; 3303 anon_index++; 3304 } 3305 return (cnt); 3306 } 3307 3308 /* 3309 * Move reserved phys swap into memory swap (unreserve phys swap 3310 * and reserve mem swap by the same amount). 3311 * Used by segspt when it needs to lock reserved swap npages in memory 3312 */ 3313 int 3314 anon_swap_adjust(pgcnt_t npages) 3315 { 3316 pgcnt_t unlocked_mem_swap; 3317 3318 mutex_enter(&anoninfo_lock); 3319 3320 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 3321 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 3322 3323 unlocked_mem_swap = k_anoninfo.ani_mem_resv 3324 - k_anoninfo.ani_locked_swap; 3325 if (npages > unlocked_mem_swap) { 3326 spgcnt_t adjusted_swap = npages - unlocked_mem_swap; 3327 3328 /* 3329 * if there is not enough unlocked mem swap we take missing 3330 * amount from phys swap and give it to mem swap 3331 */ 3332 if (!page_reclaim_mem(adjusted_swap, segspt_minfree, 1)) { 3333 mutex_exit(&anoninfo_lock); 3334 return (ENOMEM); 3335 } 3336 3337 k_anoninfo.ani_mem_resv += adjusted_swap; 3338 ASSERT(k_anoninfo.ani_phys_resv >= adjusted_swap); 3339 k_anoninfo.ani_phys_resv -= adjusted_swap; 3340 3341 ANI_ADD(adjusted_swap); 3342 } 3343 k_anoninfo.ani_locked_swap += npages; 3344 3345 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 3346 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 3347 3348 mutex_exit(&anoninfo_lock); 3349 3350 return (0); 3351 } 3352 3353 /* 3354 * 'unlocked' reserved mem swap so when it is unreserved it 3355 * can be moved back phys (disk) swap 3356 */ 3357 void 3358 anon_swap_restore(pgcnt_t npages) 3359 { 3360 mutex_enter(&anoninfo_lock); 3361 3362 ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv); 3363 3364 ASSERT(k_anoninfo.ani_locked_swap >= npages); 3365 
k_anoninfo.ani_locked_swap -= npages; 3366 3367 ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv); 3368 3369 mutex_exit(&anoninfo_lock); 3370 } 3371 3372 /* 3373 * Return the pointer from the list for a 3374 * specified anon index. 3375 */ 3376 ulong_t * 3377 anon_get_slot(struct anon_hdr *ahp, ulong_t an_idx) 3378 { 3379 struct anon **app; 3380 void **ppp; 3381 3382 ASSERT(an_idx < ahp->size); 3383 3384 /* 3385 * Single level case. 3386 */ 3387 if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 3388 return ((ulong_t *)&ahp->array_chunk[an_idx]); 3389 } else { 3390 3391 /* 3392 * 2 level case. 3393 */ 3394 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 3395 if (*ppp == NULL) { 3396 mutex_enter(&ahp->serial_lock); 3397 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 3398 if (*ppp == NULL) 3399 *ppp = kmem_zalloc(PAGESIZE, KM_SLEEP); 3400 mutex_exit(&ahp->serial_lock); 3401 } 3402 app = *ppp; 3403 return ((ulong_t *)&app[an_idx & ANON_CHUNK_OFF]); 3404 } 3405 } 3406 3407 void 3408 anon_array_enter(struct anon_map *amp, ulong_t an_idx, anon_sync_obj_t *sobj) 3409 { 3410 ulong_t *ap_slot; 3411 kmutex_t *mtx; 3412 kcondvar_t *cv; 3413 int hash; 3414 3415 /* 3416 * Use szc to determine anon slot(s) to appear atomic. 3417 * If szc = 0, then lock the anon slot and mark it busy. 3418 * If szc > 0, then lock the range of slots by getting the 3419 * anon_array_lock for the first anon slot, and mark only the 3420 * first anon slot busy to represent whole range being busy. 
3421 */ 3422 3423 ASSERT(RW_READ_HELD(&->a_rwlock)); 3424 an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc)); 3425 hash = ANON_ARRAY_HASH(amp, an_idx); 3426 sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex; 3427 sobj->sync_cv = cv = &anon_array_cv[hash]; 3428 mutex_enter(mtx); 3429 ap_slot = anon_get_slot(amp->ahp, an_idx); 3430 while (ANON_ISBUSY(ap_slot)) 3431 cv_wait(cv, mtx); 3432 ANON_SETBUSY(ap_slot); 3433 sobj->sync_data = ap_slot; 3434 mutex_exit(mtx); 3435 } 3436 3437 int 3438 anon_array_try_enter(struct anon_map *amp, ulong_t an_idx, 3439 anon_sync_obj_t *sobj) 3440 { 3441 ulong_t *ap_slot; 3442 kmutex_t *mtx; 3443 int hash; 3444 3445 /* 3446 * Try to lock a range of anon slots. 3447 * Use szc to determine anon slot(s) to appear atomic. 3448 * If szc = 0, then lock the anon slot and mark it busy. 3449 * If szc > 0, then lock the range of slots by getting the 3450 * anon_array_lock for the first anon slot, and mark only the 3451 * first anon slot busy to represent whole range being busy. 3452 * Fail if the mutex or the anon_array are busy. 3453 */ 3454 3455 ASSERT(RW_READ_HELD(&->a_rwlock)); 3456 an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc)); 3457 hash = ANON_ARRAY_HASH(amp, an_idx); 3458 sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex; 3459 sobj->sync_cv = &anon_array_cv[hash]; 3460 if (!mutex_tryenter(mtx)) { 3461 return (EWOULDBLOCK); 3462 } 3463 ap_slot = anon_get_slot(amp->ahp, an_idx); 3464 if (ANON_ISBUSY(ap_slot)) { 3465 mutex_exit(mtx); 3466 return (EWOULDBLOCK); 3467 } 3468 ANON_SETBUSY(ap_slot); 3469 sobj->sync_data = ap_slot; 3470 mutex_exit(mtx); 3471 return (0); 3472 } 3473 3474 void 3475 anon_array_exit(anon_sync_obj_t *sobj) 3476 { 3477 mutex_enter(sobj->sync_mutex); 3478 ASSERT(ANON_ISBUSY(sobj->sync_data)); 3479 ANON_CLRBUSY(sobj->sync_data); 3480 if (CV_HAS_WAITERS(sobj->sync_cv)) 3481 cv_broadcast(sobj->sync_cv); 3482 mutex_exit(sobj->sync_mutex); 3483 } 3484