/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989  AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * VM - anonymous pages.
 *
 * This layer sits immediately above the vm_swap layer.  It manages
 * physical pages that have no permanent identity in the file system
 * name space, using the services of the vm_swap layer to allocate
 * backing storage for these pages.  Since these pages have no external
 * identity, they are discarded when the last reference is removed.
 *
 * An important function of this layer is to manage low-level sharing
 * of pages that are logically distinct but that happen to be
 * physically identical (e.g., the corresponding pages of the processes
 * resulting from a fork before one process or the other changes their
 * contents).  This pseudo-sharing is present only as an optimization
 * and is not to be confused with true sharing in which multiple
 * address spaces deliberately contain references to the same object;
 * such sharing is managed at a higher level.
 *
 * The key data structure here is the anon struct, which contains a
 * reference count for its associated physical page and a hint about
 * the identity of that page.  Anon structs typically live in arrays,
 * with an instance's position in its array determining where the
 * corresponding backing storage is allocated; however, the swap_xlate()
 * routine abstracts away this representation information so that the
 * rest of the anon layer need not know it.  (See the swap layer for
 * more details on anon struct layout.)
 *
 * In future versions of the system, the association between an
 * anon struct and its position on backing store will change so that
 * we don't require backing store for all anonymous pages in the system.
 * This is an important consideration for large memory systems.
 * We can also use this technique to delay binding physical locations
 * to anonymous pages until pageout/swapout time, when we can make
 * smarter allocation decisions to improve anonymous klustering.
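 *
 * As a hedged usage sketch (hypothetical client code, not compiled as
 * part of this file), a segment driver that manages its own anon
 * pointer array typically moves through the interfaces below roughly
 * like this:
 *
 *	struct anon_hdr *ahp = anon_create(npages, ANON_SLEEP);
 *	struct anon *ap = anon_alloc(NULL, 0);
 *	(void) anon_set_ptr(ahp, idx, ap, ANON_SLEEP);
 *	...
 *	ap = anon_get_ptr(ahp, idx);
 *	...
 *	anon_free(ahp, 0, ptob(npages));	-- drop the references
 *	anon_release(ahp, npages);		-- free the pointer array
 *
 * Real callers (e.g. segvn) also reserve swap with anon_resvmem() and
 * serialize with the appropriate anon_map locks; those details are
 * omitted from this sketch.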
 *
 * Many of the routines defined here take a (struct anon **) argument,
 * which allows the code at this level to manage anon pages directly,
 * so that callers can regard anon structs as opaque objects and not be
 * concerned with assigning or inspecting their contents.
 *
 * Clients of this layer refer to anon pages indirectly.  That is, they
 * maintain arrays of pointers to anon structs rather than maintaining
 * anon structs themselves.  The (struct anon **) arguments mentioned
 * above are pointers to entries in these arrays.  It is these arrays
 * that capture the mapping between offsets within a given segment and
 * the corresponding anonymous backing storage address.
 */

#ifdef DEBUG
#define	ANON_DEBUG
#endif

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/cred.h>
#include <sys/thread.h>
#include <sys/vnode.h>
#include <sys/cpuvar.h>
#include <sys/swap.h>
#include <sys/cmn_err.h>
#include <sys/vtrace.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/bitmap.h>
#include <sys/vmsystm.h>
#include <sys/debug.h>
#include <sys/fs/swapnode.h>
#include <sys/tnf_probe.h>
#include <sys/lgrp.h>
#include <sys/policy.h>
#include <sys/condvar_impl.h>
#include <sys/mutex_impl.h>

#include <vm/as.h>
#include <vm/hat.h>
#include <vm/anon.h>
#include <vm/page.h>
#include <vm/vpage.h>
#include <vm/seg.h>
#include <vm/rm.h>

#include <fs/fs_subr.h>

struct vnode *anon_vp;

int anon_debug;

kmutex_t	anoninfo_lock;
struct		k_anoninfo k_anoninfo;
ani_free_t	ani_free_pool[ANI_MAX_POOL];
pad_mutex_t	anon_array_lock[ANON_LOCKSIZE];
kcondvar_t	anon_array_cv[ANON_LOCKSIZE];

/*
 * Global hash table for (vp, off) -> anon slot
 */
extern	int swap_maxcontig;
size_t	anon_hash_size;
struct anon	**anon_hash;

static struct kmem_cache *anon_cache;
static struct kmem_cache *anonmap_cache;

#ifdef VM_STATS
static struct anonvmstats_str {
	ulong_t	getpages[30];
	ulong_t	privatepages[10];
	ulong_t	demotepages[9];
	ulong_t	decrefpages[9];
	ulong_t	dupfillholes[4];
	ulong_t	freepages[1];
} anonvmstats;
#endif /* VM_STATS */


/*ARGSUSED*/
static int
anonmap_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct anon_map *amp = buf;

	rw_init(&amp->a_rwlock, NULL, RW_DEFAULT, NULL);
	return (0);
}

/*ARGSUSED1*/
static void
anonmap_cache_destructor(void *buf, void *cdrarg)
{
	struct anon_map *amp = buf;

	rw_destroy(&amp->a_rwlock);
}

kmutex_t	anonhash_lock[AH_LOCK_SIZE];
kmutex_t	anonpages_hash_lock[AH_LOCK_SIZE];

void
anon_init(void)
{
	int i;

	anon_hash_size = 1L << highbit(physmem / ANON_HASHAVELEN);

	for (i = 0; i < AH_LOCK_SIZE; i++) {
		mutex_init(&anonhash_lock[i], NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&anonpages_hash_lock[i], NULL, MUTEX_DEFAULT, NULL);
	}

	for (i = 0; i < ANON_LOCKSIZE; i++) {
		mutex_init(&anon_array_lock[i].pad_mutex, NULL,
		    MUTEX_DEFAULT, NULL);
		cv_init(&anon_array_cv[i], NULL, CV_DEFAULT, NULL);
	}

	anon_hash = (struct anon **)
	    kmem_zalloc(sizeof (struct anon *) * anon_hash_size, KM_SLEEP);
	anon_cache = kmem_cache_create("anon_cache", sizeof (struct anon),
AN_CACHE_ALIGN, NULL, NULL, NULL, NULL, NULL, 0); 204 anonmap_cache = kmem_cache_create("anonmap_cache", 205 sizeof (struct anon_map), 0, 206 anonmap_cache_constructor, anonmap_cache_destructor, NULL, 207 NULL, NULL, 0); 208 swap_maxcontig = (1024 * 1024) >> PAGESHIFT; /* 1MB of pages */ 209 210 anon_vp = vn_alloc(KM_SLEEP); 211 vn_setops(anon_vp, swap_vnodeops); 212 anon_vp->v_type = VREG; 213 anon_vp->v_flag |= (VISSWAP|VISSWAPFS); 214 } 215 216 /* 217 * Global anon slot hash table manipulation. 218 */ 219 220 static void 221 anon_addhash(struct anon *ap) 222 { 223 int index; 224 225 ASSERT(MUTEX_HELD(&anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)])); 226 index = ANON_HASH(ap->an_vp, ap->an_off); 227 ap->an_hash = anon_hash[index]; 228 anon_hash[index] = ap; 229 } 230 231 static void 232 anon_rmhash(struct anon *ap) 233 { 234 struct anon **app; 235 236 ASSERT(MUTEX_HELD(&anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)])); 237 238 for (app = &anon_hash[ANON_HASH(ap->an_vp, ap->an_off)]; 239 *app; app = &((*app)->an_hash)) { 240 if (*app == ap) { 241 *app = ap->an_hash; 242 break; 243 } 244 } 245 } 246 247 /* 248 * The anon array interfaces. Functions allocating, 249 * freeing array of pointers, and returning/setting 250 * entries in the array of pointers for a given offset. 251 * 252 * Create the list of pointers 253 */ 254 struct anon_hdr * 255 anon_create(pgcnt_t npages, int flags) 256 { 257 struct anon_hdr *ahp; 258 ulong_t nchunks; 259 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 260 261 if ((ahp = kmem_zalloc(sizeof (struct anon_hdr), kmemflags)) == NULL) { 262 return (NULL); 263 } 264 265 mutex_init(&ahp->serial_lock, NULL, MUTEX_DEFAULT, NULL); 266 /* 267 * Single level case. 268 */ 269 ahp->size = npages; 270 if (npages <= ANON_CHUNK_SIZE || (flags & ANON_ALLOC_FORCE)) { 271 272 if (flags & ANON_ALLOC_FORCE) 273 ahp->flags |= ANON_ALLOC_FORCE; 274 275 ahp->array_chunk = kmem_zalloc( 276 ahp->size * sizeof (struct anon *), kmemflags); 277 278 if (ahp->array_chunk == NULL) { 279 kmem_free(ahp, sizeof (struct anon_hdr)); 280 return (NULL); 281 } 282 } else { 283 /* 284 * 2 Level case. 285 */ 286 nchunks = (ahp->size + ANON_CHUNK_OFF) >> ANON_CHUNK_SHIFT; 287 288 ahp->array_chunk = kmem_zalloc(nchunks * sizeof (ulong_t *), 289 kmemflags); 290 291 if (ahp->array_chunk == NULL) { 292 kmem_free(ahp, sizeof (struct anon_hdr)); 293 return (NULL); 294 } 295 } 296 return (ahp); 297 } 298 299 /* 300 * Free the array of pointers 301 */ 302 void 303 anon_release(struct anon_hdr *ahp, pgcnt_t npages) 304 { 305 ulong_t i; 306 void **ppp; 307 ulong_t nchunks; 308 309 ASSERT(npages == ahp->size); 310 311 /* 312 * Single level case. 313 */ 314 if (npages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 315 kmem_free(ahp->array_chunk, ahp->size * sizeof (struct anon *)); 316 } else { 317 /* 318 * 2 level case. 319 */ 320 nchunks = (ahp->size + ANON_CHUNK_OFF) >> ANON_CHUNK_SHIFT; 321 for (i = 0; i < nchunks; i++) { 322 ppp = &ahp->array_chunk[i]; 323 if (*ppp != NULL) 324 kmem_free(*ppp, PAGESIZE); 325 } 326 kmem_free(ahp->array_chunk, nchunks * sizeof (ulong_t *)); 327 } 328 mutex_destroy(&ahp->serial_lock); 329 kmem_free(ahp, sizeof (struct anon_hdr)); 330 } 331 332 /* 333 * Return the pointer from the list for a 334 * specified anon index. 335 */ 336 struct anon * 337 anon_get_ptr(struct anon_hdr *ahp, ulong_t an_idx) 338 { 339 struct anon **app; 340 341 ASSERT(an_idx < ahp->size); 342 343 /* 344 * Single level case. 
345 */ 346 if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 347 return ((struct anon *) 348 ((uintptr_t)ahp->array_chunk[an_idx] & ANON_PTRMASK)); 349 } else { 350 351 /* 352 * 2 level case. 353 */ 354 app = ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 355 if (app) { 356 return ((struct anon *) 357 ((uintptr_t)app[an_idx & ANON_CHUNK_OFF] & 358 ANON_PTRMASK)); 359 } else { 360 return (NULL); 361 } 362 } 363 } 364 365 /* 366 * Return the anon pointer for the first valid entry in the anon list, 367 * starting from the given index. 368 */ 369 struct anon * 370 anon_get_next_ptr(struct anon_hdr *ahp, ulong_t *index) 371 { 372 struct anon *ap; 373 struct anon **app; 374 ulong_t chunkoff; 375 ulong_t i; 376 ulong_t j; 377 pgcnt_t size; 378 379 i = *index; 380 size = ahp->size; 381 382 ASSERT(i < size); 383 384 if ((size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 385 /* 386 * 1 level case 387 */ 388 while (i < size) { 389 ap = (struct anon *) 390 ((uintptr_t)ahp->array_chunk[i] & ANON_PTRMASK); 391 if (ap) { 392 *index = i; 393 return (ap); 394 } 395 i++; 396 } 397 } else { 398 /* 399 * 2 level case 400 */ 401 chunkoff = i & ANON_CHUNK_OFF; 402 while (i < size) { 403 app = ahp->array_chunk[i >> ANON_CHUNK_SHIFT]; 404 if (app) 405 for (j = chunkoff; j < ANON_CHUNK_SIZE; j++) { 406 ap = (struct anon *) 407 ((uintptr_t)app[j] & 408 ANON_PTRMASK); 409 if (ap) { 410 *index = i + (j - chunkoff); 411 return (ap); 412 } 413 } 414 chunkoff = 0; 415 i = (i + ANON_CHUNK_SIZE) & ~ANON_CHUNK_OFF; 416 } 417 } 418 *index = size; 419 return (NULL); 420 } 421 422 /* 423 * Set list entry with a given pointer for a specified offset 424 */ 425 int 426 anon_set_ptr(struct anon_hdr *ahp, ulong_t an_idx, struct anon *ap, int flags) 427 { 428 void **ppp; 429 struct anon **app; 430 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 431 uintptr_t *ap_addr; 432 433 ASSERT(an_idx < ahp->size); 434 435 /* 436 * Single level case. 437 */ 438 if (ahp->size <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 439 ap_addr = (uintptr_t *)&ahp->array_chunk[an_idx]; 440 } else { 441 442 /* 443 * 2 level case. 444 */ 445 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 446 447 ASSERT(ppp != NULL); 448 if (*ppp == NULL) { 449 mutex_enter(&ahp->serial_lock); 450 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 451 if (*ppp == NULL) { 452 *ppp = kmem_zalloc(PAGESIZE, kmemflags); 453 if (*ppp == NULL) { 454 mutex_exit(&ahp->serial_lock); 455 return (ENOMEM); 456 } 457 } 458 mutex_exit(&ahp->serial_lock); 459 } 460 app = *ppp; 461 ap_addr = (uintptr_t *)&app[an_idx & ANON_CHUNK_OFF]; 462 } 463 *ap_addr = (*ap_addr & ~ANON_PTRMASK) | (uintptr_t)ap; 464 return (0); 465 } 466 467 /* 468 * Copy anon array into a given new anon array 469 */ 470 int 471 anon_copy_ptr(struct anon_hdr *sahp, ulong_t s_idx, 472 struct anon_hdr *dahp, ulong_t d_idx, 473 pgcnt_t npages, int flags) 474 { 475 void **sapp, **dapp; 476 void *ap; 477 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 478 479 ASSERT((s_idx < sahp->size) && (d_idx < dahp->size)); 480 ASSERT((npages <= sahp->size) && (npages <= dahp->size)); 481 482 /* 483 * Both arrays are 1 level. 
484 */ 485 if (((sahp->size <= ANON_CHUNK_SIZE) && 486 (dahp->size <= ANON_CHUNK_SIZE)) || 487 ((sahp->flags & ANON_ALLOC_FORCE) && 488 (dahp->flags & ANON_ALLOC_FORCE))) { 489 490 bcopy(&sahp->array_chunk[s_idx], &dahp->array_chunk[d_idx], 491 npages * sizeof (struct anon *)); 492 return (0); 493 } 494 495 /* 496 * Both arrays are 2 levels. 497 */ 498 if (sahp->size > ANON_CHUNK_SIZE && 499 dahp->size > ANON_CHUNK_SIZE && 500 ((sahp->flags & ANON_ALLOC_FORCE) == 0) && 501 ((dahp->flags & ANON_ALLOC_FORCE) == 0)) { 502 503 ulong_t sapidx, dapidx; 504 ulong_t *sap, *dap; 505 ulong_t chknp; 506 507 while (npages != 0) { 508 509 sapidx = s_idx & ANON_CHUNK_OFF; 510 dapidx = d_idx & ANON_CHUNK_OFF; 511 chknp = ANON_CHUNK_SIZE - MAX(sapidx, dapidx); 512 if (chknp > npages) 513 chknp = npages; 514 515 sapp = &sahp->array_chunk[s_idx >> ANON_CHUNK_SHIFT]; 516 if ((sap = *sapp) != NULL) { 517 dapp = &dahp->array_chunk[d_idx 518 >> ANON_CHUNK_SHIFT]; 519 if ((dap = *dapp) == NULL) { 520 *dapp = kmem_zalloc(PAGESIZE, 521 kmemflags); 522 if ((dap = *dapp) == NULL) 523 return (ENOMEM); 524 } 525 bcopy((sap + sapidx), (dap + dapidx), 526 chknp << ANON_PTRSHIFT); 527 } 528 s_idx += chknp; 529 d_idx += chknp; 530 npages -= chknp; 531 } 532 return (0); 533 } 534 535 /* 536 * At least one of the arrays is 2 level. 537 */ 538 while (npages--) { 539 if ((ap = anon_get_ptr(sahp, s_idx)) != NULL) { 540 ASSERT(!ANON_ISBUSY(anon_get_slot(sahp, s_idx))); 541 if (anon_set_ptr(dahp, d_idx, ap, flags) == ENOMEM) 542 return (ENOMEM); 543 } 544 s_idx++; 545 d_idx++; 546 } 547 return (0); 548 } 549 550 551 /* 552 * ANON_INITBUF is a convenience macro for anon_grow() below. It 553 * takes a buffer dst, which is at least as large as buffer src. It 554 * does a bcopy from src into dst, and then bzeros the extra bytes 555 * of dst. If tail is set, the data in src is tail aligned within 556 * dst instead of head aligned. 557 */ 558 559 #define ANON_INITBUF(src, srclen, dst, dstsize, tail) \ 560 if (tail) { \ 561 bzero((dst), (dstsize) - (srclen)); \ 562 bcopy((src), (char *)(dst) + (dstsize) - (srclen), (srclen)); \ 563 } else { \ 564 bcopy((src), (dst), (srclen)); \ 565 bzero((char *)(dst) + (srclen), (dstsize) - (srclen)); \ 566 } 567 568 #define ANON_1_LEVEL_INC (ANON_CHUNK_SIZE / 8) 569 #define ANON_2_LEVEL_INC (ANON_1_LEVEL_INC * ANON_CHUNK_SIZE) 570 571 /* 572 * anon_grow() is used to efficiently extend an existing anon array. 573 * startidx_p points to the index into the anon array of the first page 574 * that is in use. oldseg_pgs is the number of pages in use, starting at 575 * *startidx_p. newpages is the number of additional pages desired. 576 * 577 * If startidx_p == NULL, startidx is taken to be 0 and cannot be changed. 578 * 579 * The growth is done by creating a new top level of the anon array, 580 * and (if the array is 2-level) reusing the existing second level arrays. 581 * 582 * flags can be used to specify ANON_NOSLEEP and ANON_GROWDOWN. 583 * 584 * Returns the new number of pages in the anon array. 585 */ 586 pgcnt_t 587 anon_grow(struct anon_hdr *ahp, ulong_t *startidx_p, pgcnt_t oldseg_pgs, 588 pgcnt_t newseg_pgs, int flags) 589 { 590 ulong_t startidx = startidx_p ? *startidx_p : 0; 591 pgcnt_t oldamp_pgs = ahp->size, newamp_pgs; 592 pgcnt_t oelems, nelems, totpages; 593 void **level1; 594 int kmemflags = (flags & ANON_NOSLEEP) ? 
KM_NOSLEEP : KM_SLEEP; 595 int growdown = (flags & ANON_GROWDOWN); 596 size_t newarrsz, oldarrsz; 597 void *level2; 598 599 ASSERT(!(startidx_p == NULL && growdown)); 600 ASSERT(startidx + oldseg_pgs <= ahp->size); 601 602 /* 603 * Determine the total number of pages needed in the new 604 * anon array. If growing down, totpages is all pages from 605 * startidx through the end of the array, plus <newseg_pgs> 606 * pages. If growing up, keep all pages from page 0 through 607 * the last page currently in use, plus <newseg_pgs> pages. 608 */ 609 if (growdown) 610 totpages = oldamp_pgs - startidx + newseg_pgs; 611 else 612 totpages = startidx + oldseg_pgs + newseg_pgs; 613 614 /* If the array is already large enough, just return. */ 615 616 if (oldamp_pgs >= totpages) { 617 if (growdown) 618 *startidx_p = oldamp_pgs - totpages; 619 return (oldamp_pgs); 620 } 621 622 /* 623 * oldamp_pgs/newamp_pgs are the total numbers of pages represented 624 * by the corresponding arrays. 625 * oelems/nelems are the number of pointers in the top level arrays 626 * which may be either level 1 or level 2. 627 * Will the new anon array be one level or two levels? 628 */ 629 if (totpages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 630 newamp_pgs = P2ROUNDUP(totpages, ANON_1_LEVEL_INC); 631 oelems = oldamp_pgs; 632 nelems = newamp_pgs; 633 } else { 634 newamp_pgs = P2ROUNDUP(totpages, ANON_2_LEVEL_INC); 635 oelems = (oldamp_pgs + ANON_CHUNK_OFF) >> ANON_CHUNK_SHIFT; 636 nelems = newamp_pgs >> ANON_CHUNK_SHIFT; 637 } 638 639 newarrsz = nelems * sizeof (void *); 640 level1 = kmem_alloc(newarrsz, kmemflags); 641 if (level1 == NULL) 642 return (0); 643 644 /* Are we converting from a one level to a two level anon array? */ 645 646 if (newamp_pgs > ANON_CHUNK_SIZE && oldamp_pgs <= ANON_CHUNK_SIZE && 647 !(ahp->flags & ANON_ALLOC_FORCE)) { 648 649 /* 650 * Yes, we're converting to a two level. Reuse old level 1 651 * as new level 2 if it is exactly PAGESIZE. Otherwise 652 * alloc a new level 2 and copy the old level 1 data into it. 653 */ 654 if (oldamp_pgs == ANON_CHUNK_SIZE) { 655 level2 = (void *)ahp->array_chunk; 656 } else { 657 level2 = kmem_alloc(PAGESIZE, kmemflags); 658 if (level2 == NULL) { 659 kmem_free(level1, newarrsz); 660 return (0); 661 } 662 oldarrsz = oldamp_pgs * sizeof (void *); 663 664 ANON_INITBUF(ahp->array_chunk, oldarrsz, 665 level2, PAGESIZE, growdown); 666 kmem_free(ahp->array_chunk, oldarrsz); 667 } 668 bzero(level1, newarrsz); 669 if (growdown) 670 level1[nelems - 1] = level2; 671 else 672 level1[0] = level2; 673 } else { 674 oldarrsz = oelems * sizeof (void *); 675 676 ANON_INITBUF(ahp->array_chunk, oldarrsz, 677 level1, newarrsz, growdown); 678 kmem_free(ahp->array_chunk, oldarrsz); 679 } 680 681 ahp->array_chunk = level1; 682 ahp->size = newamp_pgs; 683 if (growdown) { 684 *startidx_p = newamp_pgs - totpages; 685 if (oldamp_pgs > ANON_CHUNK_SIZE) 686 *startidx_p -= P2NPHASE(oldseg_pgs, ANON_CHUNK_SIZE); 687 } 688 return (newamp_pgs); 689 } 690 691 692 /* 693 * Called from clock handler to sync ani_free value. 694 */ 695 696 void 697 set_anoninfo(void) 698 { 699 int ix; 700 pgcnt_t total = 0; 701 702 for (ix = 0; ix < ANI_MAX_POOL; ix++) { 703 total += ani_free_pool[ix].ani_count; 704 } 705 k_anoninfo.ani_free = total; 706 } 707 708 /* 709 * Reserve anon space. 710 * 711 * It's no longer simply a matter of incrementing ani_resv to 712 * reserve swap space, we need to check memory-based as well 713 * as disk-backed (physical) swap. 
The following algorithm 714 * is used: 715 * Check the space on physical swap 716 * i.e. amount needed < ani_max - ani_phys_resv 717 * If we are swapping on swapfs check 718 * amount needed < (availrmem - swapfs_minfree) 719 * Since the algorithm to check for the quantity of swap space is 720 * almost the same as that for reserving it, we'll just use anon_resvmem 721 * with a flag to decrement availrmem. 722 * 723 * Return non-zero on success. 724 */ 725 int 726 anon_resvmem(size_t size, uint_t takemem) 727 { 728 pgcnt_t npages = btopr(size); 729 pgcnt_t mswap_pages = 0; 730 pgcnt_t pswap_pages = 0; 731 732 mutex_enter(&anoninfo_lock); 733 734 /* 735 * pswap_pages is the number of pages we can take from 736 * physical (i.e. disk-backed) swap. 737 */ 738 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 739 pswap_pages = k_anoninfo.ani_max - k_anoninfo.ani_phys_resv; 740 741 ANON_PRINT(A_RESV, 742 ("anon_resvmem: npages %lu takemem %u pswap %lu caller %p\n", 743 npages, takemem, pswap_pages, (void *)caller())); 744 745 if (npages <= pswap_pages) { 746 /* 747 * we have enough space on a physical swap 748 */ 749 if (takemem) 750 k_anoninfo.ani_phys_resv += npages; 751 mutex_exit(&anoninfo_lock); 752 return (1); 753 } else if (pswap_pages != 0) { 754 /* 755 * we have some space on a physical swap 756 */ 757 if (takemem) { 758 /* 759 * use up remainder of phys swap 760 */ 761 k_anoninfo.ani_phys_resv += pswap_pages; 762 ASSERT(k_anoninfo.ani_phys_resv == k_anoninfo.ani_max); 763 } 764 } 765 /* 766 * since (npages > pswap_pages) we need mem swap 767 * mswap_pages is the number of pages needed from availrmem 768 */ 769 ASSERT(npages > pswap_pages); 770 mswap_pages = npages - pswap_pages; 771 772 ANON_PRINT(A_RESV, ("anon_resvmem: need %ld pages from memory\n", 773 mswap_pages)); 774 775 /* 776 * priv processes can reserve memory as swap as long as availrmem 777 * remains greater than swapfs_minfree; in the case of non-priv 778 * processes, memory can be reserved as swap only if availrmem 779 * doesn't fall below (swapfs_minfree + swapfs_reserve). Thus, 780 * swapfs_reserve amount of memswap is not available to non-priv 781 * processes. This protects daemons such as automounter dying 782 * as a result of application processes eating away almost entire 783 * membased swap. This safeguard becomes useless if apps are run 784 * with root access. 785 * 786 * swapfs_reserve is minimum of 4Mb or 1/16 of physmem. 787 * 788 */ 789 mutex_enter(&freemem_lock); 790 if (availrmem > (swapfs_minfree + swapfs_reserve + mswap_pages) || 791 (availrmem > (swapfs_minfree + mswap_pages) && 792 secpolicy_resource(CRED()) == 0)) { 793 794 if (takemem) { 795 /* 796 * Take the memory from the rest of the system. 797 */ 798 availrmem -= mswap_pages; 799 mutex_exit(&freemem_lock); 800 k_anoninfo.ani_mem_resv += mswap_pages; 801 ANI_ADD(mswap_pages); 802 ANON_PRINT((A_RESV | A_MRESV), 803 ("anon_resvmem: took %ld pages of availrmem\n", 804 mswap_pages)); 805 } else { 806 mutex_exit(&freemem_lock); 807 } 808 809 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 810 mutex_exit(&anoninfo_lock); 811 return (1); 812 813 } else { 814 /* 815 * Fail if not enough memory 816 */ 817 818 if (takemem) { 819 k_anoninfo.ani_phys_resv -= pswap_pages; 820 } 821 822 mutex_exit(&freemem_lock); 823 mutex_exit(&anoninfo_lock); 824 ANON_PRINT(A_RESV, 825 ("anon_resvmem: not enough space from swapfs\n")); 826 return (0); 827 } 828 } 829 830 831 /* 832 * Give back an anon reservation. 
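 *
 * As a hedged usage note (hypothetical caller shown for illustration
 * only): a client that successfully reserved swap at map time with
 *
 *	if (anon_resvmem(len, 1) == 0)
 *		return (EAGAIN);
 *
 * is expected to undo that reservation with
 *
 *	anon_unresv(len);
 *
 * for the same length when the mapping is torn down, so that the
 * ani_mem_resv/ani_phys_resv accounting below stays balanced.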
833 */ 834 void 835 anon_unresv(size_t size) 836 { 837 pgcnt_t npages = btopr(size); 838 spgcnt_t mem_free_pages = 0; 839 pgcnt_t phys_free_slots; 840 #ifdef ANON_DEBUG 841 pgcnt_t mem_resv; 842 #endif 843 844 mutex_enter(&anoninfo_lock); 845 846 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 847 /* 848 * If some of this reservation belonged to swapfs 849 * give it back to availrmem. 850 * ani_mem_resv is the amount of availrmem swapfs has reserved. 851 * but some of that memory could be locked by segspt so we can only 852 * return non locked ani_mem_resv back to availrmem 853 */ 854 if (k_anoninfo.ani_mem_resv > k_anoninfo.ani_locked_swap) { 855 ANON_PRINT((A_RESV | A_MRESV), 856 ("anon_unresv: growing availrmem by %ld pages\n", 857 MIN(k_anoninfo.ani_mem_resv, npages))); 858 859 mem_free_pages = MIN((spgcnt_t)(k_anoninfo.ani_mem_resv - 860 k_anoninfo.ani_locked_swap), npages); 861 mutex_enter(&freemem_lock); 862 availrmem += mem_free_pages; 863 mutex_exit(&freemem_lock); 864 k_anoninfo.ani_mem_resv -= mem_free_pages; 865 866 ANI_ADD(-mem_free_pages); 867 } 868 /* 869 * The remainder of the pages is returned to phys swap 870 */ 871 ASSERT(npages >= mem_free_pages); 872 phys_free_slots = npages - mem_free_pages; 873 874 if (phys_free_slots) { 875 k_anoninfo.ani_phys_resv -= phys_free_slots; 876 } 877 878 #ifdef ANON_DEBUG 879 mem_resv = k_anoninfo.ani_mem_resv; 880 #endif 881 882 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 883 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 884 885 mutex_exit(&anoninfo_lock); 886 887 ANON_PRINT(A_RESV, ("anon_unresv: %lu, tot %lu, caller %p\n", 888 npages, mem_resv, (void *)caller())); 889 } 890 891 /* 892 * Allocate an anon slot and return it with the lock held. 893 */ 894 struct anon * 895 anon_alloc(struct vnode *vp, anoff_t off) 896 { 897 struct anon *ap; 898 kmutex_t *ahm; 899 900 ap = kmem_cache_alloc(anon_cache, KM_SLEEP); 901 if (vp == NULL) { 902 swap_alloc(ap); 903 } else { 904 ap->an_vp = vp; 905 ap->an_off = off; 906 } 907 ap->an_refcnt = 1; 908 ap->an_pvp = NULL; 909 ap->an_poff = 0; 910 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 911 mutex_enter(ahm); 912 anon_addhash(ap); 913 mutex_exit(ahm); 914 ANI_ADD(-1); 915 ANON_PRINT(A_ANON, ("anon_alloc: returning ap %p, vp %p\n", 916 (void *)ap, (ap ? (void *)ap->an_vp : NULL))); 917 return (ap); 918 } 919 920 /* 921 * Decrement the reference count of an anon page. 922 * If reference count goes to zero, free it and 923 * its associated page (if any). 924 */ 925 void 926 anon_decref(struct anon *ap) 927 { 928 page_t *pp; 929 struct vnode *vp; 930 anoff_t off; 931 kmutex_t *ahm; 932 933 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 934 mutex_enter(ahm); 935 ASSERT(ap->an_refcnt != 0); 936 if (ap->an_refcnt == 0) 937 panic("anon_decref: slot count 0"); 938 if (--ap->an_refcnt == 0) { 939 swap_xlate(ap, &vp, &off); 940 mutex_exit(ahm); 941 942 /* 943 * If there is a page for this anon slot we will need to 944 * call VN_DISPOSE to get rid of the vp association and 945 * put the page back on the free list as really free. 946 * Acquire the "exclusive" lock to ensure that any 947 * pending i/o always completes before the swap slot 948 * is freed. 949 */ 950 pp = page_lookup(vp, (u_offset_t)off, SE_EXCL); 951 952 /* 953 * If there was a page, we've synchronized on it (getting 954 * the exclusive lock is as good as gettting the iolock) 955 * so now we can free the physical backing store. 
Also, this 956 * is where we would free the name of the anonymous page 957 * (swap_free(ap)), a no-op in the current implementation. 958 */ 959 mutex_enter(ahm); 960 ASSERT(ap->an_refcnt == 0); 961 anon_rmhash(ap); 962 if (ap->an_pvp) 963 swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE); 964 mutex_exit(ahm); 965 966 if (pp != NULL) { 967 /*LINTED: constant in conditional context */ 968 VN_DISPOSE(pp, B_INVAL, 0, kcred); 969 } 970 ANON_PRINT(A_ANON, ("anon_decref: free ap %p, vp %p\n", 971 (void *)ap, (void *)ap->an_vp)); 972 kmem_cache_free(anon_cache, ap); 973 974 ANI_ADD(1); 975 } else { 976 mutex_exit(ahm); 977 } 978 } 979 980 static int 981 anon_share(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots) 982 { 983 struct anon *ap; 984 985 while (nslots-- > 0) { 986 if ((ap = anon_get_ptr(ahp, anon_index)) != NULL && 987 ap->an_refcnt > 1) 988 return (1); 989 anon_index++; 990 } 991 992 return (0); 993 } 994 995 static void 996 anon_decref_pages( 997 struct anon_hdr *ahp, 998 ulong_t an_idx, 999 uint_t szc) 1000 { 1001 struct anon *ap = anon_get_ptr(ahp, an_idx); 1002 kmutex_t *ahmpages = NULL; 1003 page_t *pp; 1004 pgcnt_t pgcnt = page_get_pagecnt(szc); 1005 pgcnt_t i; 1006 struct vnode *vp; 1007 anoff_t off; 1008 kmutex_t *ahm; 1009 #ifdef DEBUG 1010 int refcnt = 1; 1011 #endif 1012 1013 ASSERT(szc != 0); 1014 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1015 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1016 1017 VM_STAT_ADD(anonvmstats.decrefpages[0]); 1018 1019 if (ap != NULL) { 1020 ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1021 mutex_enter(ahmpages); 1022 ASSERT((refcnt = ap->an_refcnt) != 0); 1023 VM_STAT_ADD(anonvmstats.decrefpages[1]); 1024 if (ap->an_refcnt == 1) { 1025 VM_STAT_ADD(anonvmstats.decrefpages[2]); 1026 ASSERT(!anon_share(ahp, an_idx, pgcnt)); 1027 mutex_exit(ahmpages); 1028 ahmpages = NULL; 1029 } 1030 } 1031 1032 i = 0; 1033 while (i < pgcnt) { 1034 if ((ap = anon_get_ptr(ahp, an_idx + i)) == NULL) { 1035 ASSERT(refcnt == 1 && ahmpages == NULL); 1036 i++; 1037 continue; 1038 } 1039 ASSERT(ap->an_refcnt == refcnt); 1040 ASSERT(ahmpages != NULL || ap->an_refcnt == 1); 1041 ASSERT(ahmpages == NULL || ap->an_refcnt > 1); 1042 1043 if (ahmpages == NULL) { 1044 swap_xlate(ap, &vp, &off); 1045 pp = page_lookup(vp, (u_offset_t)off, SE_EXCL); 1046 if (pp == NULL || pp->p_szc == 0) { 1047 VM_STAT_ADD(anonvmstats.decrefpages[3]); 1048 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, 1049 ap->an_off)]; 1050 (void) anon_set_ptr(ahp, an_idx + i, NULL, 1051 ANON_SLEEP); 1052 mutex_enter(ahm); 1053 ap->an_refcnt--; 1054 ASSERT(ap->an_refcnt == 0); 1055 anon_rmhash(ap); 1056 if (ap->an_pvp) 1057 swap_phys_free(ap->an_pvp, ap->an_poff, 1058 PAGESIZE); 1059 mutex_exit(ahm); 1060 if (pp != NULL) { 1061 VM_STAT_ADD(anonvmstats.decrefpages[4]); 1062 /*LINTED*/ 1063 VN_DISPOSE(pp, B_INVAL, 0, kcred); 1064 } 1065 kmem_cache_free(anon_cache, ap); 1066 ANI_ADD(1); 1067 i++; 1068 } else { 1069 pgcnt_t j; 1070 pgcnt_t curpgcnt = 1071 page_get_pagecnt(pp->p_szc); 1072 size_t ppasize = curpgcnt * sizeof (page_t *); 1073 page_t **ppa = kmem_alloc(ppasize, KM_SLEEP); 1074 int dispose = 0; 1075 1076 VM_STAT_ADD(anonvmstats.decrefpages[5]); 1077 1078 ASSERT(pp->p_szc <= szc); 1079 ASSERT(IS_P2ALIGNED(curpgcnt, curpgcnt)); 1080 ASSERT(IS_P2ALIGNED(i, curpgcnt)); 1081 ASSERT(i + curpgcnt <= pgcnt); 1082 ASSERT(!(page_pptonum(pp) & (curpgcnt - 1))); 1083 ppa[0] = pp; 1084 for (j = i + 1; j < i + curpgcnt; j++) { 1085 ap = anon_get_ptr(ahp, an_idx + j); 1086 ASSERT(ap != NULL && 1087 ap->an_refcnt 
== 1); 1088 swap_xlate(ap, &vp, &off); 1089 pp = page_lookup(vp, (u_offset_t)off, 1090 SE_EXCL); 1091 if (pp == NULL) 1092 panic("anon_decref_pages: " 1093 "no page"); 1094 1095 (void) hat_pageunload(pp, 1096 HAT_FORCE_PGUNLOAD); 1097 ASSERT(pp->p_szc == ppa[0]->p_szc); 1098 ASSERT(page_pptonum(pp) - 1 == 1099 page_pptonum(ppa[j - i - 1])); 1100 ppa[j - i] = pp; 1101 if (ap->an_pvp != NULL && 1102 !vn_matchopval(ap->an_pvp, 1103 VOPNAME_DISPOSE, 1104 (fs_generic_func_p)fs_dispose)) 1105 dispose = 1; 1106 } 1107 if (!dispose) { 1108 VM_STAT_ADD(anonvmstats.decrefpages[6]); 1109 page_destroy_pages(ppa[0]); 1110 } else { 1111 VM_STAT_ADD(anonvmstats.decrefpages[7]); 1112 for (j = 0; j < curpgcnt; j++) { 1113 ASSERT(PAGE_EXCL(ppa[j])); 1114 ppa[j]->p_szc = 0; 1115 } 1116 for (j = 0; j < curpgcnt; j++) { 1117 ASSERT(!hat_page_is_mapped( 1118 ppa[j])); 1119 /*LINTED*/ 1120 VN_DISPOSE(ppa[j], B_INVAL, 0, 1121 kcred); 1122 } 1123 } 1124 kmem_free(ppa, ppasize); 1125 for (j = i; j < i + curpgcnt; j++) { 1126 ap = anon_get_ptr(ahp, an_idx + j); 1127 ASSERT(ap != NULL && 1128 ap->an_refcnt == 1); 1129 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, 1130 ap->an_off)]; 1131 (void) anon_set_ptr(ahp, an_idx + j, 1132 NULL, ANON_SLEEP); 1133 mutex_enter(ahm); 1134 ap->an_refcnt--; 1135 ASSERT(ap->an_refcnt == 0); 1136 anon_rmhash(ap); 1137 if (ap->an_pvp) 1138 swap_phys_free(ap->an_pvp, 1139 ap->an_poff, PAGESIZE); 1140 mutex_exit(ahm); 1141 kmem_cache_free(anon_cache, ap); 1142 ANI_ADD(1); 1143 } 1144 i += curpgcnt; 1145 } 1146 } else { 1147 VM_STAT_ADD(anonvmstats.decrefpages[8]); 1148 (void) anon_set_ptr(ahp, an_idx + i, NULL, ANON_SLEEP); 1149 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1150 mutex_enter(ahm); 1151 ap->an_refcnt--; 1152 mutex_exit(ahm); 1153 i++; 1154 } 1155 } 1156 1157 if (ahmpages != NULL) { 1158 mutex_exit(ahmpages); 1159 } 1160 } 1161 1162 /* 1163 * Duplicate references to size bytes worth of anon pages. 1164 * Used when duplicating a segment that contains private anon pages. 1165 * This code assumes that procedure calling this one has already used 1166 * hat_chgprot() to disable write access to the range of addresses that 1167 * that *old actually refers to. 1168 */ 1169 void 1170 anon_dup(struct anon_hdr *old, ulong_t old_idx, struct anon_hdr *new, 1171 ulong_t new_idx, size_t size) 1172 { 1173 spgcnt_t npages; 1174 kmutex_t *ahm; 1175 struct anon *ap; 1176 ulong_t off; 1177 ulong_t index; 1178 1179 npages = btopr(size); 1180 while (npages > 0) { 1181 index = old_idx; 1182 if ((ap = anon_get_next_ptr(old, &index)) == NULL) 1183 break; 1184 1185 ASSERT(!ANON_ISBUSY(anon_get_slot(old, index))); 1186 off = index - old_idx; 1187 npages -= off; 1188 if (npages <= 0) 1189 break; 1190 1191 (void) anon_set_ptr(new, new_idx + off, ap, ANON_SLEEP); 1192 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1193 1194 mutex_enter(ahm); 1195 ap->an_refcnt++; 1196 mutex_exit(ahm); 1197 1198 off++; 1199 new_idx += off; 1200 old_idx += off; 1201 npages--; 1202 } 1203 } 1204 1205 /* 1206 * Just like anon_dup but also guarantees there are no holes (unallocated anon 1207 * slots) within any large page region. That means if a large page region is 1208 * empty in the old array it will skip it. If there are 1 or more valid slots 1209 * in the large page region of the old array it will make sure to fill in any 1210 * unallocated ones and also copy them to the new array. If noalloc is 1 large 1211 * page region should either have no valid anon slots or all slots should be 1212 * valid. 
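 *
 * A small worked example (illustrative only): with szc describing a
 * 4-page large page region, an old-array region of
 *
 *	{ ap0, NULL, NULL, ap3 }
 *
 * is first completed in place to
 *
 *	{ ap0, apA, apB, ap3 }
 *
 * by allocating fresh slots (anon_alloc(NULL, 0)) for the holes, and the
 * whole region is then copied into the new array with each slot's
 * an_refcnt incremented.  With noalloc set, finding such a partially
 * filled region is instead treated as a fatal error.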
1213 */ 1214 void 1215 anon_dup_fill_holes( 1216 struct anon_hdr *old, 1217 ulong_t old_idx, 1218 struct anon_hdr *new, 1219 ulong_t new_idx, 1220 size_t size, 1221 uint_t szc, 1222 int noalloc) 1223 { 1224 struct anon *ap; 1225 spgcnt_t npages; 1226 kmutex_t *ahm, *ahmpages = NULL; 1227 pgcnt_t pgcnt, i; 1228 ulong_t index, off; 1229 #ifdef DEBUG 1230 int refcnt; 1231 #endif 1232 1233 ASSERT(szc != 0); 1234 pgcnt = page_get_pagecnt(szc); 1235 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1236 npages = btopr(size); 1237 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1238 ASSERT(IS_P2ALIGNED(old_idx, pgcnt)); 1239 1240 VM_STAT_ADD(anonvmstats.dupfillholes[0]); 1241 1242 while (npages > 0) { 1243 index = old_idx; 1244 1245 /* 1246 * Find the next valid slot. 1247 */ 1248 if (anon_get_next_ptr(old, &index) == NULL) 1249 break; 1250 1251 ASSERT(!ANON_ISBUSY(anon_get_slot(old, index))); 1252 /* 1253 * Now backup index to the beginning of the 1254 * current large page region of the old array. 1255 */ 1256 index = P2ALIGN(index, pgcnt); 1257 off = index - old_idx; 1258 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1259 npages -= off; 1260 if (npages <= 0) 1261 break; 1262 1263 /* 1264 * Fill and copy a large page regions worth 1265 * of anon slots. 1266 */ 1267 for (i = 0; i < pgcnt; i++) { 1268 if ((ap = anon_get_ptr(old, index + i)) == NULL) { 1269 if (noalloc) { 1270 panic("anon_dup_fill_holes: " 1271 "empty anon slot\n"); 1272 } 1273 VM_STAT_ADD(anonvmstats.dupfillholes[1]); 1274 ap = anon_alloc(NULL, 0); 1275 (void) anon_set_ptr(old, index + i, ap, 1276 ANON_SLEEP); 1277 } else if (i == 0) { 1278 /* 1279 * make the increment of all refcnts of all 1280 * anon slots of a large page appear atomic by 1281 * getting an anonpages_hash_lock for the 1282 * first anon slot of a large page. 1283 */ 1284 int hash = AH_LOCK(ap->an_vp, ap->an_off); 1285 1286 VM_STAT_ADD(anonvmstats.dupfillholes[2]); 1287 1288 ahmpages = &anonpages_hash_lock[hash]; 1289 mutex_enter(ahmpages); 1290 /*LINTED*/ 1291 ASSERT(refcnt = ap->an_refcnt); 1292 1293 VM_STAT_COND_ADD(ap->an_refcnt > 1, 1294 anonvmstats.dupfillholes[3]); 1295 } 1296 (void) anon_set_ptr(new, new_idx + off + i, ap, 1297 ANON_SLEEP); 1298 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1299 mutex_enter(ahm); 1300 ASSERT(ahmpages != NULL || ap->an_refcnt == 1); 1301 ASSERT(i == 0 || ahmpages == NULL || 1302 refcnt == ap->an_refcnt); 1303 ap->an_refcnt++; 1304 mutex_exit(ahm); 1305 } 1306 if (ahmpages != NULL) { 1307 mutex_exit(ahmpages); 1308 ahmpages = NULL; 1309 } 1310 off += pgcnt; 1311 new_idx += off; 1312 old_idx += off; 1313 npages -= pgcnt; 1314 } 1315 } 1316 1317 /* 1318 * Used when a segment with a vnode changes szc. similarly to 1319 * anon_dup_fill_holes() makes sure each large page region either has no anon 1320 * slots or all of them. but new slots are created by COWing the file 1321 * pages. on entrance no anon slots should be shared. 
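 *
 * Per hole, the work below amounts to roughly the following sketch
 * (a summary of the code in this function, not additional logic):
 *
 *	err = VOP_GETPAGE(vp, vp_off, PAGESIZE, NULL, pl, PAGESIZE,
 *	    seg, addr, S_READ, cred);
 *	pp = anon_private(&ap, seg, addr, prot, pl[0], pageflags, cred);
 *	(void) anon_set_ptr(ahp, an_idx, ap, ANON_SLEEP);
 *	page_unlock(pp);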
1322 */ 1323 int 1324 anon_fill_cow_holes( 1325 struct seg *seg, 1326 caddr_t addr, 1327 struct anon_hdr *ahp, 1328 ulong_t an_idx, 1329 struct vnode *vp, 1330 u_offset_t vp_off, 1331 size_t size, 1332 uint_t szc, 1333 uint_t prot, 1334 struct vpage vpage[], 1335 struct cred *cred) 1336 { 1337 struct anon *ap; 1338 spgcnt_t npages; 1339 pgcnt_t pgcnt, i; 1340 ulong_t index, off; 1341 int err = 0; 1342 int pageflags = 0; 1343 1344 ASSERT(szc != 0); 1345 pgcnt = page_get_pagecnt(szc); 1346 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1347 npages = btopr(size); 1348 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1349 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1350 1351 while (npages > 0) { 1352 index = an_idx; 1353 1354 /* 1355 * Find the next valid slot. 1356 */ 1357 if (anon_get_next_ptr(ahp, &index) == NULL) { 1358 break; 1359 } 1360 1361 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1362 /* 1363 * Now backup index to the beginning of the 1364 * current large page region of the anon array. 1365 */ 1366 index = P2ALIGN(index, pgcnt); 1367 off = index - an_idx; 1368 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1369 npages -= off; 1370 if (npages <= 0) 1371 break; 1372 an_idx += off; 1373 vp_off += ptob(off); 1374 addr += ptob(off); 1375 if (vpage != NULL) { 1376 vpage += off; 1377 } 1378 1379 for (i = 0; i < pgcnt; i++, an_idx++, vp_off += PAGESIZE) { 1380 if ((ap = anon_get_ptr(ahp, an_idx)) == NULL) { 1381 page_t *pl[1 + 1]; 1382 page_t *pp; 1383 1384 err = VOP_GETPAGE(vp, vp_off, PAGESIZE, NULL, 1385 pl, PAGESIZE, seg, addr, S_READ, cred); 1386 if (err) { 1387 break; 1388 } 1389 if (vpage != NULL) { 1390 prot = VPP_PROT(vpage); 1391 pageflags = VPP_ISPPLOCK(vpage) ? 1392 LOCK_PAGE : 0; 1393 } 1394 pp = anon_private(&ap, seg, addr, prot, pl[0], 1395 pageflags, cred); 1396 if (pp == NULL) { 1397 err = ENOMEM; 1398 break; 1399 } 1400 (void) anon_set_ptr(ahp, an_idx, ap, 1401 ANON_SLEEP); 1402 page_unlock(pp); 1403 } 1404 ASSERT(ap->an_refcnt == 1); 1405 addr += PAGESIZE; 1406 if (vpage != NULL) { 1407 vpage++; 1408 } 1409 } 1410 npages -= pgcnt; 1411 } 1412 1413 return (err); 1414 } 1415 1416 /* 1417 * Free a group of "size" anon pages, size in bytes, 1418 * and clear out the pointers to the anon entries. 1419 */ 1420 void 1421 anon_free(struct anon_hdr *ahp, ulong_t index, size_t size) 1422 { 1423 spgcnt_t npages; 1424 struct anon *ap; 1425 ulong_t old; 1426 1427 npages = btopr(size); 1428 1429 while (npages > 0) { 1430 old = index; 1431 if ((ap = anon_get_next_ptr(ahp, &index)) == NULL) 1432 break; 1433 1434 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1435 npages -= index - old; 1436 if (npages <= 0) 1437 break; 1438 1439 (void) anon_set_ptr(ahp, index, NULL, ANON_SLEEP); 1440 anon_decref(ap); 1441 /* 1442 * Bump index and decrement page count 1443 */ 1444 index++; 1445 npages--; 1446 } 1447 } 1448 1449 void 1450 anon_free_pages( 1451 struct anon_hdr *ahp, 1452 ulong_t an_idx, 1453 size_t size, 1454 uint_t szc) 1455 { 1456 spgcnt_t npages; 1457 pgcnt_t pgcnt; 1458 ulong_t index, off; 1459 1460 ASSERT(szc != 0); 1461 pgcnt = page_get_pagecnt(szc); 1462 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1463 npages = btopr(size); 1464 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1465 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1466 1467 VM_STAT_ADD(anonvmstats.freepages[0]); 1468 1469 while (npages > 0) { 1470 index = an_idx; 1471 1472 /* 1473 * Find the next valid slot. 
		 */
		if (anon_get_next_ptr(ahp, &index) == NULL)
			break;

		ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index)));
		/*
		 * Now backup index to the beginning of the
		 * current large page region of the old array.
		 */
		index = P2ALIGN(index, pgcnt);
		off = index - an_idx;
		ASSERT(IS_P2ALIGNED(off, pgcnt));
		npages -= off;
		if (npages <= 0)
			break;

		anon_decref_pages(ahp, index, szc);

		off += pgcnt;
		an_idx += off;
		npages -= pgcnt;
	}
}

/*
 * Make anonymous pages discardable
 */
void
anon_disclaim(struct anon_map *amp, ulong_t index, size_t size, int flags)
{
	spgcnt_t npages = btopr(size);
	struct anon *ap;
	struct vnode *vp;
	anoff_t off;
	page_t *pp, *root_pp;
	kmutex_t *ahm;
	pgcnt_t pgcnt;
	ulong_t old_idx, idx, i;
	struct anon_hdr *ahp = amp->ahp;
	anon_sync_obj_t cookie;

	ASSERT(RW_READ_HELD(&amp->a_rwlock));
	pgcnt = 1;
	for (; npages > 0; index = (pgcnt == 1) ? index + 1 :
	    P2ROUNDUP(index + 1, pgcnt), npages -= pgcnt) {

		/*
		 * get anon pointer and index for the first valid entry
		 * in the anon list, starting from "index"
		 */
		old_idx = index;
		if ((ap = anon_get_next_ptr(ahp, &index)) == NULL)
			break;

		/*
		 * decrement npages by number of NULL anon slots we skipped
		 */
		npages -= index - old_idx;
		if (npages <= 0)
			break;

		anon_array_enter(amp, index, &cookie);
		ap = anon_get_ptr(ahp, index);
		ASSERT(ap != NULL);

		/*
		 * Get the anonymous page and try to lock it SE_EXCL;
		 * in the non-blocking case, if we couldn't grab the lock
		 * we skip to the next page.
		 * In the blocking case (ANON_PGLOOKUP_BLK) block
		 * until we grab the SE_EXCL lock.
		 */
		swap_xlate(ap, &vp, &off);
		if (flags & ANON_PGLOOKUP_BLK)
			pp = page_lookup_create(vp, (u_offset_t)off,
			    SE_EXCL, NULL, NULL, SE_EXCL_WANTED);
		else
			pp = page_lookup_nowait(vp, (u_offset_t)off, SE_EXCL);
		if (pp == NULL) {
			segadvstat.MADV_FREE_miss.value.ul++;
			pgcnt = 1;
			anon_array_exit(&cookie);
			continue;
		}
		pgcnt = page_get_pagecnt(pp->p_szc);

		/*
		 * We cannot free a page which is permanently locked.
		 * The page_struct_lock need not be acquired to examine
		 * these fields since the page has an "exclusive" lock.
		 */
		if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
			page_unlock(pp);
			segadvstat.MADV_FREE_miss.value.ul++;
			anon_array_exit(&cookie);
			continue;
		}

		ahm = &anonhash_lock[AH_LOCK(vp, off)];
		mutex_enter(ahm);
		ASSERT(ap->an_refcnt != 0);
		/*
		 * Skip this one if copy-on-write is not yet broken.
		 */
		if (ap->an_refcnt > 1) {
			mutex_exit(ahm);
			page_unlock(pp);
			segadvstat.MADV_FREE_miss.value.ul++;
			anon_array_exit(&cookie);
			continue;
		}

		if (pp->p_szc == 0) {
			pgcnt = 1;

			/*
			 * Free the swap slot.
			 */
			if (ap->an_pvp) {
				swap_phys_free(ap->an_pvp, ap->an_poff,
				    PAGESIZE);
				ap->an_pvp = NULL;
				ap->an_poff = 0;
			}
			mutex_exit(ahm);
			segadvstat.MADV_FREE_hit.value.ul++;

			/*
			 * While we are at it, unload all the translations
			 * and attempt to free the page.
1604 */ 1605 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1606 /*LINTED: constant in conditional context */ 1607 VN_DISPOSE(pp, B_FREE, 0, kcred); 1608 anon_array_exit(&cookie); 1609 continue; 1610 } 1611 1612 pgcnt = page_get_pagecnt(pp->p_szc); 1613 if (!IS_P2ALIGNED(index, pgcnt)) { 1614 if (!page_try_demote_pages(pp)) { 1615 mutex_exit(ahm); 1616 page_unlock(pp); 1617 segadvstat.MADV_FREE_miss.value.ul++; 1618 anon_array_exit(&cookie); 1619 continue; 1620 } else { 1621 pgcnt = 1; 1622 if (ap->an_pvp) { 1623 swap_phys_free(ap->an_pvp, 1624 ap->an_poff, PAGESIZE); 1625 ap->an_pvp = NULL; 1626 ap->an_poff = 0; 1627 } 1628 mutex_exit(ahm); 1629 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1630 /*LINTED*/ 1631 VN_DISPOSE(pp, B_FREE, 0, kcred); 1632 segadvstat.MADV_FREE_hit.value.ul++; 1633 anon_array_exit(&cookie); 1634 continue; 1635 } 1636 } 1637 mutex_exit(ahm); 1638 root_pp = pp; 1639 1640 /* 1641 * try to lock remaining pages 1642 */ 1643 for (idx = 1; idx < pgcnt; idx++) { 1644 pp++; 1645 if (!page_trylock(pp, SE_EXCL)) 1646 break; 1647 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1648 page_unlock(pp); 1649 break; 1650 } 1651 } 1652 1653 if (idx == pgcnt) { 1654 for (i = 0; i < pgcnt; i++) { 1655 ap = anon_get_ptr(ahp, index + i); 1656 if (ap == NULL) 1657 break; 1658 swap_xlate(ap, &vp, &off); 1659 ahm = &anonhash_lock[AH_LOCK(vp, off)]; 1660 mutex_enter(ahm); 1661 ASSERT(ap->an_refcnt != 0); 1662 1663 /* 1664 * skip this one if copy-on-write 1665 * is not yet broken. 1666 */ 1667 if (ap->an_refcnt > 1) { 1668 mutex_exit(ahm); 1669 goto skiplp; 1670 } 1671 if (ap->an_pvp) { 1672 swap_phys_free(ap->an_pvp, 1673 ap->an_poff, PAGESIZE); 1674 ap->an_pvp = NULL; 1675 ap->an_poff = 0; 1676 } 1677 mutex_exit(ahm); 1678 } 1679 page_destroy_pages(root_pp); 1680 segadvstat.MADV_FREE_hit.value.ul += pgcnt; 1681 anon_array_exit(&cookie); 1682 continue; 1683 } 1684 skiplp: 1685 segadvstat.MADV_FREE_miss.value.ul += pgcnt; 1686 for (i = 0, pp = root_pp; i < idx; pp++, i++) 1687 page_unlock(pp); 1688 anon_array_exit(&cookie); 1689 } 1690 } 1691 1692 /* 1693 * Return the kept page(s) and protections back to the segment driver. 1694 */ 1695 int 1696 anon_getpage( 1697 struct anon **app, 1698 uint_t *protp, 1699 page_t *pl[], 1700 size_t plsz, 1701 struct seg *seg, 1702 caddr_t addr, 1703 enum seg_rw rw, 1704 struct cred *cred) 1705 { 1706 page_t *pp; 1707 struct anon *ap = *app; 1708 struct vnode *vp; 1709 anoff_t off; 1710 int err; 1711 kmutex_t *ahm; 1712 1713 swap_xlate(ap, &vp, &off); 1714 1715 /* 1716 * Lookup the page. If page is being paged in, 1717 * wait for it to finish as we must return a list of 1718 * pages since this routine acts like the VOP_GETPAGE 1719 * routine does. 1720 */ 1721 if (pl != NULL && (pp = page_lookup(vp, (u_offset_t)off, SE_SHARED))) { 1722 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1723 mutex_enter(ahm); 1724 if (ap->an_refcnt == 1) 1725 *protp = PROT_ALL; 1726 else 1727 *protp = PROT_ALL & ~PROT_WRITE; 1728 mutex_exit(ahm); 1729 pl[0] = pp; 1730 pl[1] = NULL; 1731 return (0); 1732 } 1733 1734 /* 1735 * Simply treat it as a vnode fault on the anon vp. 
 */

	TRACE_3(TR_FAC_VM, TR_ANON_GETPAGE,
	    "anon_getpage:seg %x addr %x vp %x",
	    seg, addr, vp);

	err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, protp, pl, plsz,
	    seg, addr, rw, cred);

	if (err == 0 && pl != NULL) {
		ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)];
		mutex_enter(ahm);
		if (ap->an_refcnt != 1)
			*protp &= ~PROT_WRITE;	/* make read-only */
		mutex_exit(ahm);
	}
	return (err);
}

/*
 * Creates or returns kept pages to the segment driver.  Returns -1 if a large
 * page cannot be allocated.  Returns -2 if some other process has allocated a
 * larger page.
 *
 * For a cow fault it will allocate any size pages to fill the requested area
 * to avoid partially overwriting anon slots (i.e. sharing only some of the
 * anon slots within a large page with other processes).  This policy greatly
 * simplifies large page freeing (a large page is only freed when all of its
 * anon slot refcnts are 0).
 */
int
anon_map_getpages(
	struct anon_map *amp,
	ulong_t	start_idx,
	uint_t	szc,
	struct seg *seg,
	caddr_t	addr,
	uint_t prot,
	uint_t *protp,
	page_t	*ppa[],
	uint_t	*ppa_szc,
	struct vpage vpage[],
	enum seg_rw rw,
	int brkcow,
	int anypgsz,
	struct cred *cred)
{
	pgcnt_t pgcnt;
	struct anon *ap;
	struct vnode *vp;
	anoff_t off;
	page_t *pp, *pl[2], *conpp = NULL;
	caddr_t vaddr;
	ulong_t pg_idx, an_idx, i;
	spgcnt_t nreloc = 0;
	int prealloc = 1;
	int err, slotcreate;
	uint_t vpprot;

#if !defined(__i386) && !defined(__amd64)
	ASSERT(seg->s_szc != 0);
#endif
	ASSERT(szc <= seg->s_szc);
	ASSERT(ppa_szc != NULL);
	ASSERT(rw != S_CREATE);

	*protp = PROT_ALL;

	VM_STAT_ADD(anonvmstats.getpages[0]);

	if (szc == 0) {
		VM_STAT_ADD(anonvmstats.getpages[1]);
		if ((ap = anon_get_ptr(amp->ahp, start_idx)) != NULL) {
			err = anon_getpage(&ap, protp, pl, PAGESIZE, seg,
			    addr, rw, cred);
			if (err)
				return (err);
			ppa[0] = pl[0];
			if (brkcow == 0 || (*protp & PROT_WRITE)) {
				VM_STAT_ADD(anonvmstats.getpages[2]);
				if (ppa[0]->p_szc != 0) {
					VM_STAT_ADD(anonvmstats.getpages[3]);
					*ppa_szc = ppa[0]->p_szc;
					page_unlock(ppa[0]);
					return (-2);
				}
				return (0);
			}
			panic("anon_map_getpages: cowfault for szc 0");
		} else {
			VM_STAT_ADD(anonvmstats.getpages[4]);
			ppa[0] = anon_zero(seg, addr, &ap, cred);
			if (ppa[0] == NULL)
				return (ENOMEM);
			(void) anon_set_ptr(amp->ahp, start_idx, ap,
			    ANON_SLEEP);
			return (0);
		}
	}

	pgcnt = page_get_pagecnt(szc);
	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
	ASSERT(IS_P2ALIGNED(start_idx, pgcnt));

	/*
	 * First we check for the case that the requested large
	 * page or larger page already exists in the system.
	 * Actually we only check if the first constituent page
	 * exists and only preallocate if it's not found.
1845 */ 1846 ap = anon_get_ptr(amp->ahp, start_idx); 1847 if (ap) { 1848 uint_t pszc; 1849 swap_xlate(ap, &vp, &off); 1850 if (page_exists_forreal(vp, (u_offset_t)off, &pszc)) { 1851 if (pszc > szc) { 1852 *ppa_szc = pszc; 1853 return (-2); 1854 } 1855 if (pszc == szc) { 1856 prealloc = 0; 1857 } 1858 } 1859 } 1860 1861 VM_STAT_COND_ADD(prealloc == 0, anonvmstats.getpages[5]); 1862 VM_STAT_COND_ADD(prealloc != 0, anonvmstats.getpages[6]); 1863 1864 top: 1865 /* 1866 * If a smaller page or no page at all was found, 1867 * grab a large page off the freelist. 1868 */ 1869 if (prealloc) { 1870 ASSERT(conpp == NULL); 1871 if (page_alloc_pages(anon_vp, seg, addr, NULL, ppa, 1872 szc, 0) != 0) { 1873 VM_STAT_ADD(anonvmstats.getpages[7]); 1874 if (brkcow == 0 || 1875 !anon_share(amp->ahp, start_idx, pgcnt)) { 1876 /* 1877 * If the refcnt's of all anon slots are <= 1 1878 * they can't increase since we are holding 1879 * the address space's lock. So segvn can 1880 * safely decrease szc without risking to 1881 * generate a cow fault for the region smaller 1882 * than the segment's largest page size. 1883 */ 1884 VM_STAT_ADD(anonvmstats.getpages[8]); 1885 return (-1); 1886 } 1887 docow: 1888 /* 1889 * This is a cow fault. Copy away the entire 1 large 1890 * page region of this segment. 1891 */ 1892 if (szc != seg->s_szc) 1893 panic("anon_map_getpages: cowfault for szc %d", 1894 szc); 1895 vaddr = addr; 1896 for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt; 1897 pg_idx++, an_idx++, vaddr += PAGESIZE) { 1898 if ((ap = anon_get_ptr(amp->ahp, an_idx)) != 1899 NULL) { 1900 err = anon_getpage(&ap, &vpprot, pl, 1901 PAGESIZE, seg, vaddr, rw, cred); 1902 if (err) { 1903 for (i = 0; i < pg_idx; i++) { 1904 if ((pp = ppa[i]) != 1905 NULL) 1906 page_unlock(pp); 1907 } 1908 return (err); 1909 } 1910 ppa[pg_idx] = pl[0]; 1911 } else { 1912 /* 1913 * Since this is a cowfault we know 1914 * that this address space has a 1915 * parent or children which means 1916 * anon_dup_fill_holes() has initialized 1917 * all anon slots within a large page 1918 * region that had at least one anon 1919 * slot at the time of fork(). 1920 */ 1921 panic("anon_map_getpages: " 1922 "cowfault but anon slot is empty"); 1923 } 1924 } 1925 VM_STAT_ADD(anonvmstats.getpages[9]); 1926 *protp = PROT_ALL; 1927 return (anon_map_privatepages(amp, start_idx, szc, seg, 1928 addr, prot, ppa, vpage, anypgsz, cred)); 1929 } 1930 } 1931 1932 VM_STAT_ADD(anonvmstats.getpages[10]); 1933 1934 an_idx = start_idx; 1935 pg_idx = 0; 1936 vaddr = addr; 1937 while (pg_idx < pgcnt) { 1938 slotcreate = 0; 1939 if ((ap = anon_get_ptr(amp->ahp, an_idx)) == NULL) { 1940 VM_STAT_ADD(anonvmstats.getpages[11]); 1941 /* 1942 * For us to have decided not to preallocate 1943 * would have meant that a large page 1944 * was found. Which also means that all of the 1945 * anon slots for that page would have been 1946 * already created for us. 1947 */ 1948 if (prealloc == 0) 1949 panic("anon_map_getpages: prealloc = 0"); 1950 1951 slotcreate = 1; 1952 ap = anon_alloc(NULL, 0); 1953 } 1954 swap_xlate(ap, &vp, &off); 1955 1956 /* 1957 * Now setup our preallocated page to pass down 1958 * to swap_getpage(). 1959 */ 1960 if (prealloc) { 1961 ASSERT(ppa[pg_idx]->p_szc == szc); 1962 conpp = ppa[pg_idx]; 1963 } 1964 ASSERT(prealloc || conpp == NULL); 1965 1966 /* 1967 * If we just created this anon slot then call 1968 * with S_CREATE to prevent doing IO on the page. 1969 * Similar to the anon_zero case. 
1970 */ 1971 err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE, 1972 NULL, pl, PAGESIZE, conpp, &nreloc, seg, vaddr, 1973 slotcreate == 1 ? S_CREATE : rw, cred); 1974 1975 if (err) { 1976 VM_STAT_ADD(anonvmstats.getpages[12]); 1977 ASSERT(slotcreate == 0); 1978 goto io_err; 1979 } 1980 1981 pp = pl[0]; 1982 1983 if (pp->p_szc != szc) { 1984 VM_STAT_ADD(anonvmstats.getpages[13]); 1985 ASSERT(slotcreate == 0); 1986 ASSERT(prealloc == 0); 1987 ASSERT(pg_idx == 0); 1988 if (pp->p_szc > szc) { 1989 page_unlock(pp); 1990 VM_STAT_ADD(anonvmstats.getpages[14]); 1991 return (-2); 1992 } 1993 page_unlock(pp); 1994 prealloc = 1; 1995 goto top; 1996 } 1997 1998 /* 1999 * If we decided to preallocate but VOP_GETPAGE 2000 * found a page in the system that satisfies our 2001 * request then free up our preallocated large page 2002 * and continue looping accross the existing large 2003 * page via VOP_GETPAGE. 2004 */ 2005 if (prealloc && pp != ppa[pg_idx]) { 2006 VM_STAT_ADD(anonvmstats.getpages[15]); 2007 ASSERT(slotcreate == 0); 2008 ASSERT(pg_idx == 0); 2009 conpp = NULL; 2010 prealloc = 0; 2011 page_free_pages(ppa[0]); 2012 } 2013 2014 if (prealloc && nreloc > 1) { 2015 /* 2016 * we have relocated out of a smaller large page. 2017 * skip npgs - 1 iterations and continue which will 2018 * increment by one the loop indices. 2019 */ 2020 spgcnt_t npgs = nreloc; 2021 2022 VM_STAT_ADD(anonvmstats.getpages[16]); 2023 2024 ASSERT(pp == ppa[pg_idx]); 2025 ASSERT(slotcreate == 0); 2026 ASSERT(pg_idx + npgs <= pgcnt); 2027 if ((*protp & PROT_WRITE) && 2028 anon_share(amp->ahp, an_idx, npgs)) { 2029 *protp &= ~PROT_WRITE; 2030 } 2031 pg_idx += npgs; 2032 an_idx += npgs; 2033 vaddr += PAGESIZE * npgs; 2034 continue; 2035 } 2036 2037 VM_STAT_ADD(anonvmstats.getpages[17]); 2038 2039 /* 2040 * Anon_zero case. 2041 */ 2042 if (slotcreate) { 2043 ASSERT(prealloc); 2044 pagezero(pp, 0, PAGESIZE); 2045 CPU_STATS_ADD_K(vm, zfod, 1); 2046 hat_setrefmod(pp); 2047 } 2048 2049 ASSERT(prealloc == 0 || ppa[pg_idx] == pp); 2050 ASSERT(prealloc != 0 || PAGE_SHARED(pp)); 2051 ASSERT(prealloc == 0 || PAGE_EXCL(pp)); 2052 2053 if (pg_idx > 0 && 2054 ((page_pptonum(pp) != page_pptonum(ppa[pg_idx - 1]) + 1) || 2055 (pp->p_szc != ppa[pg_idx - 1]->p_szc))) 2056 panic("anon_map_getpages: unexpected page"); 2057 2058 if (prealloc == 0) { 2059 ppa[pg_idx] = pp; 2060 } 2061 2062 if (ap->an_refcnt > 1) { 2063 VM_STAT_ADD(anonvmstats.getpages[18]); 2064 *protp &= ~PROT_WRITE; 2065 } 2066 2067 /* 2068 * If this is a new anon slot then initialize 2069 * the anon array entry. 2070 */ 2071 if (slotcreate) { 2072 (void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP); 2073 } 2074 pg_idx++; 2075 an_idx++; 2076 vaddr += PAGESIZE; 2077 } 2078 2079 /* 2080 * Since preallocated pages come off the freelist 2081 * they are locked SE_EXCL. Simply downgrade and return. 2082 */ 2083 if (prealloc) { 2084 VM_STAT_ADD(anonvmstats.getpages[19]); 2085 conpp = NULL; 2086 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { 2087 page_downgrade(ppa[pg_idx]); 2088 } 2089 } 2090 ASSERT(conpp == NULL); 2091 2092 if (brkcow == 0 || (*protp & PROT_WRITE)) { 2093 VM_STAT_ADD(anonvmstats.getpages[20]); 2094 return (0); 2095 } 2096 2097 if (szc < seg->s_szc) 2098 panic("anon_map_getpages: cowfault for szc %d", szc); 2099 2100 VM_STAT_ADD(anonvmstats.getpages[21]); 2101 2102 *protp = PROT_ALL; 2103 return (anon_map_privatepages(amp, start_idx, szc, seg, addr, prot, 2104 ppa, vpage, anypgsz, cred)); 2105 io_err: 2106 /* 2107 * We got an IO error somewhere in our large page. 
2108 * If we were using a preallocated page then just demote 2109 * all the constituent pages that we've succeeded with sofar 2110 * to PAGESIZE pages and leave them in the system 2111 * unlocked. 2112 */ 2113 2114 ASSERT(err != -2 || pg_idx == 0); 2115 2116 VM_STAT_COND_ADD(err > 0, anonvmstats.getpages[22]); 2117 VM_STAT_COND_ADD(err == -1, anonvmstats.getpages[23]); 2118 VM_STAT_COND_ADD(err == -2, anonvmstats.getpages[24]); 2119 2120 if (prealloc) { 2121 conpp = NULL; 2122 if (pg_idx > 0) { 2123 VM_STAT_ADD(anonvmstats.getpages[25]); 2124 for (i = 0; i < pgcnt; i++) { 2125 pp = ppa[i]; 2126 ASSERT(PAGE_EXCL(pp)); 2127 ASSERT(pp->p_szc == szc); 2128 pp->p_szc = 0; 2129 } 2130 for (i = 0; i < pg_idx; i++) { 2131 ASSERT(!hat_page_is_mapped(ppa[i])); 2132 page_unlock(ppa[i]); 2133 } 2134 /* 2135 * Now free up the remaining unused constituent 2136 * pages. 2137 */ 2138 while (pg_idx < pgcnt) { 2139 ASSERT(!hat_page_is_mapped(ppa[pg_idx])); 2140 page_free(ppa[pg_idx], 0); 2141 pg_idx++; 2142 } 2143 } else { 2144 VM_STAT_ADD(anonvmstats.getpages[26]); 2145 page_free_pages(ppa[0]); 2146 } 2147 } else { 2148 VM_STAT_ADD(anonvmstats.getpages[27]); 2149 ASSERT(err > 0); 2150 for (i = 0; i < pg_idx; i++) 2151 page_unlock(ppa[i]); 2152 } 2153 ASSERT(conpp == NULL); 2154 if (err != -1) 2155 return (err); 2156 /* 2157 * we are here because we failed to relocate. 2158 */ 2159 ASSERT(prealloc); 2160 if (brkcow == 0 || !anon_share(amp->ahp, start_idx, pgcnt)) { 2161 VM_STAT_ADD(anonvmstats.getpages[28]); 2162 return (-1); 2163 } 2164 VM_STAT_ADD(anonvmstats.getpages[29]); 2165 goto docow; 2166 } 2167 2168 2169 /* 2170 * Turn a reference to an object or shared anon page 2171 * into a private page with a copy of the data from the 2172 * original page which is always locked by the caller. 2173 * This routine unloads the translation and unlocks the 2174 * original page, if it isn't being stolen, before returning 2175 * to the caller. 2176 * 2177 * NOTE: The original anon slot is not freed by this routine 2178 * It must be freed by the caller while holding the 2179 * "anon_map" lock to prevent races which can occur if 2180 * a process has multiple lwps in its address space. 2181 */ 2182 page_t * 2183 anon_private( 2184 struct anon **app, 2185 struct seg *seg, 2186 caddr_t addr, 2187 uint_t prot, 2188 page_t *opp, 2189 int oppflags, 2190 struct cred *cred) 2191 { 2192 struct anon *old = *app; 2193 struct anon *new; 2194 page_t *pp = NULL; 2195 struct vnode *vp; 2196 anoff_t off; 2197 page_t *anon_pl[1 + 1]; 2198 int err; 2199 2200 if (oppflags & STEAL_PAGE) 2201 ASSERT(PAGE_EXCL(opp)); 2202 else 2203 ASSERT(PAGE_LOCKED(opp)); 2204 2205 CPU_STATS_ADD_K(vm, cow_fault, 1); 2206 2207 /* Kernel probe */ 2208 TNF_PROBE_1(anon_private, "vm pagefault", /* CSTYLED */, 2209 tnf_opaque, address, addr); 2210 2211 *app = new = anon_alloc(NULL, 0); 2212 swap_xlate(new, &vp, &off); 2213 2214 if (oppflags & STEAL_PAGE) { 2215 page_rename(opp, vp, (u_offset_t)off); 2216 pp = opp; 2217 TRACE_5(TR_FAC_VM, TR_ANON_PRIVATE, 2218 "anon_private:seg %p addr %x pp %p vp %p off %lx", 2219 seg, addr, pp, vp, off); 2220 hat_setmod(pp); 2221 2222 /* bug 4026339 */ 2223 page_downgrade(pp); 2224 return (pp); 2225 } 2226 2227 /* 2228 * Call the VOP_GETPAGE routine to create the page, thereby 2229 * enabling the vnode driver to allocate any filesystem 2230 * space (e.g., disk block allocation for UFS). This also 2231 * prevents more than one page from being added to the 2232 * vnode at the same time. 
2233 */ 2234 err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, NULL, 2235 anon_pl, PAGESIZE, seg, addr, S_CREATE, cred); 2236 if (err) 2237 goto out; 2238 2239 pp = anon_pl[0]; 2240 2241 /* 2242 * If the original page was locked, we need to move the lock 2243 * to the new page by transfering 'cowcnt/lckcnt' of the original 2244 * page to 'cowcnt/lckcnt' of the new page. 2245 * 2246 * See Statement at the beginning of segvn_lockop() and 2247 * comments in page_pp_useclaim() regarding the way 2248 * cowcnts/lckcnts are handled. 2249 * 2250 * Also availrmem must be decremented up front for read only mapping 2251 * before calling page_pp_useclaim. page_pp_useclaim will bump it back 2252 * if availrmem did not need to be decremented after all. 2253 */ 2254 if (oppflags & LOCK_PAGE) { 2255 if ((prot & PROT_WRITE) == 0) { 2256 mutex_enter(&freemem_lock); 2257 if (availrmem > pages_pp_maximum) { 2258 availrmem--; 2259 pages_useclaim++; 2260 } else { 2261 mutex_exit(&freemem_lock); 2262 goto out; 2263 } 2264 mutex_exit(&freemem_lock); 2265 } 2266 page_pp_useclaim(opp, pp, prot & PROT_WRITE); 2267 } 2268 2269 /* 2270 * Now copy the contents from the original page, 2271 * which is locked and loaded in the MMU by 2272 * the caller to prevent yet another page fault. 2273 */ 2274 ppcopy(opp, pp); /* XXX - should set mod bit in here */ 2275 2276 hat_setrefmod(pp); /* mark as modified */ 2277 2278 /* 2279 * Unload the old translation. 2280 */ 2281 hat_unload(seg->s_as->a_hat, addr, PAGESIZE, HAT_UNLOAD); 2282 2283 /* 2284 * Free unmapped, unmodified original page. 2285 * or release the lock on the original page, 2286 * otherwise the process will sleep forever in 2287 * anon_decref() waiting for the "exclusive" lock 2288 * on the page. 2289 */ 2290 (void) page_release(opp, 1); 2291 2292 /* 2293 * we are done with page creation so downgrade the new 2294 * page's selock to shared, this helps when multiple 2295 * as_fault(...SOFTLOCK...) are done to the same 2296 * page(aio) 2297 */ 2298 page_downgrade(pp); 2299 2300 /* 2301 * NOTE: The original anon slot must be freed by the 2302 * caller while holding the "anon_map" lock, if we 2303 * copied away from an anonymous page. 2304 */ 2305 return (pp); 2306 2307 out: 2308 *app = old; 2309 if (pp) 2310 page_unlock(pp); 2311 anon_decref(new); 2312 page_unlock(opp); 2313 return ((page_t *)NULL); 2314 } 2315 2316 int 2317 anon_map_privatepages( 2318 struct anon_map *amp, 2319 ulong_t start_idx, 2320 uint_t szc, 2321 struct seg *seg, 2322 caddr_t addr, 2323 uint_t prot, 2324 page_t *ppa[], 2325 struct vpage vpage[], 2326 int anypgsz, 2327 struct cred *cred) 2328 { 2329 pgcnt_t pgcnt; 2330 struct vnode *vp; 2331 anoff_t off; 2332 page_t *pl[2], *conpp = NULL; 2333 int err; 2334 int prealloc = 1; 2335 struct anon *ap, *oldap; 2336 caddr_t vaddr; 2337 page_t *pplist, *pp; 2338 ulong_t pg_idx, an_idx; 2339 spgcnt_t nreloc = 0; 2340 int pagelock = 0; 2341 kmutex_t *ahmpages = NULL; 2342 #ifdef DEBUG 2343 int refcnt; 2344 #endif 2345 2346 ASSERT(szc != 0); 2347 ASSERT(szc == seg->s_szc); 2348 2349 VM_STAT_ADD(anonvmstats.privatepages[0]); 2350 2351 pgcnt = page_get_pagecnt(szc); 2352 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 2353 ASSERT(IS_P2ALIGNED(start_idx, pgcnt)); 2354 2355 ASSERT(amp != NULL); 2356 ap = anon_get_ptr(amp->ahp, start_idx); 2357 ASSERT(ap == NULL || ap->an_refcnt >= 1); 2358 2359 VM_STAT_COND_ADD(ap == NULL, anonvmstats.privatepages[1]); 2360 2361 /* 2362 * Now try and allocate the large page. If we fail then just 2363 * let VOP_GETPAGE give us PAGESIZE pages. 
Normally we let 2364 * the caller make this decision but to avoid added complexity 2365 * it's simplier to handle that case here. 2366 */ 2367 if (anypgsz == -1) { 2368 VM_STAT_ADD(anonvmstats.privatepages[2]); 2369 prealloc = 0; 2370 } else if (page_alloc_pages(anon_vp, seg, addr, &pplist, NULL, szc, 2371 anypgsz) != 0) { 2372 VM_STAT_ADD(anonvmstats.privatepages[3]); 2373 prealloc = 0; 2374 } 2375 2376 /* 2377 * make the decrement of all refcnts of all 2378 * anon slots of a large page appear atomic by 2379 * getting an anonpages_hash_lock for the 2380 * first anon slot of a large page. 2381 */ 2382 if (ap != NULL) { 2383 ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, 2384 ap->an_off)]; 2385 mutex_enter(ahmpages); 2386 if (ap->an_refcnt == 1) { 2387 VM_STAT_ADD(anonvmstats.privatepages[4]); 2388 ASSERT(!anon_share(amp->ahp, start_idx, pgcnt)); 2389 mutex_exit(ahmpages); 2390 2391 if (prealloc) { 2392 page_free_replacement_page(pplist); 2393 page_create_putback(pgcnt); 2394 } 2395 ASSERT(ppa[0]->p_szc <= szc); 2396 if (ppa[0]->p_szc == szc) { 2397 VM_STAT_ADD(anonvmstats.privatepages[5]); 2398 return (0); 2399 } 2400 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { 2401 ASSERT(ppa[pg_idx] != NULL); 2402 page_unlock(ppa[pg_idx]); 2403 } 2404 return (-1); 2405 } 2406 } 2407 2408 /* 2409 * If we are passed in the vpage array and this is 2410 * not PROT_WRITE then we need to decrement availrmem 2411 * up front before we try anything. If we need to and 2412 * can't decrement availrmem then its better to fail now 2413 * than in the middle of processing the new large page. 2414 * page_pp_usclaim() on behalf of each constituent page 2415 * below will adjust availrmem back for the cases not needed. 2416 */ 2417 if (vpage != NULL && (prot & PROT_WRITE) == 0) { 2418 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { 2419 if (VPP_ISPPLOCK(&vpage[pg_idx])) { 2420 pagelock = 1; 2421 break; 2422 } 2423 } 2424 if (pagelock) { 2425 VM_STAT_ADD(anonvmstats.privatepages[6]); 2426 mutex_enter(&freemem_lock); 2427 if (availrmem >= pages_pp_maximum + pgcnt) { 2428 availrmem -= pgcnt; 2429 pages_useclaim += pgcnt; 2430 } else { 2431 VM_STAT_ADD(anonvmstats.privatepages[7]); 2432 mutex_exit(&freemem_lock); 2433 if (ahmpages != NULL) { 2434 mutex_exit(ahmpages); 2435 } 2436 if (prealloc) { 2437 page_free_replacement_page(pplist); 2438 page_create_putback(pgcnt); 2439 } 2440 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) 2441 if (ppa[pg_idx] != NULL) 2442 page_unlock(ppa[pg_idx]); 2443 return (ENOMEM); 2444 } 2445 mutex_exit(&freemem_lock); 2446 } 2447 } 2448 2449 CPU_STATS_ADD_K(vm, cow_fault, pgcnt); 2450 2451 VM_STAT_ADD(anonvmstats.privatepages[8]); 2452 2453 an_idx = start_idx; 2454 pg_idx = 0; 2455 vaddr = addr; 2456 for (; pg_idx < pgcnt; pg_idx++, an_idx++, vaddr += PAGESIZE) { 2457 ASSERT(ppa[pg_idx] != NULL); 2458 oldap = anon_get_ptr(amp->ahp, an_idx); 2459 ASSERT(ahmpages != NULL || oldap == NULL); 2460 ASSERT(ahmpages == NULL || oldap != NULL); 2461 ASSERT(ahmpages == NULL || oldap->an_refcnt > 1); 2462 ASSERT(ahmpages == NULL || pg_idx != 0 || 2463 (refcnt = oldap->an_refcnt)); 2464 ASSERT(ahmpages == NULL || pg_idx == 0 || 2465 refcnt == oldap->an_refcnt); 2466 2467 ap = anon_alloc(NULL, 0); 2468 2469 swap_xlate(ap, &vp, &off); 2470 2471 /* 2472 * Now setup our preallocated page to pass down to 2473 * swap_getpage(). 
2474 */ 2475 if (prealloc) { 2476 pp = pplist; 2477 page_sub(&pplist, pp); 2478 conpp = pp; 2479 } 2480 2481 err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE, NULL, pl, 2482 PAGESIZE, conpp, &nreloc, seg, vaddr, S_CREATE, cred); 2483 2484 /* 2485 * Impossible to fail this is S_CREATE. 2486 */ 2487 if (err) 2488 panic("anon_map_privatepages: VOP_GETPAGE failed"); 2489 2490 ASSERT(prealloc ? pp == pl[0] : pl[0]->p_szc == 0); 2491 ASSERT(prealloc == 0 || nreloc == 1); 2492 2493 pp = pl[0]; 2494 2495 /* 2496 * If the original page was locked, we need to move 2497 * the lock to the new page by transfering 2498 * 'cowcnt/lckcnt' of the original page to 'cowcnt/lckcnt' 2499 * of the new page. pg_idx can be used to index 2500 * into the vpage array since the caller will guarentee 2501 * that vpage struct passed in corresponds to addr 2502 * and forward. 2503 */ 2504 if (vpage != NULL && VPP_ISPPLOCK(&vpage[pg_idx])) { 2505 page_pp_useclaim(ppa[pg_idx], pp, prot & PROT_WRITE); 2506 } else if (pagelock) { 2507 mutex_enter(&freemem_lock); 2508 availrmem++; 2509 pages_useclaim--; 2510 mutex_exit(&freemem_lock); 2511 } 2512 2513 /* 2514 * Now copy the contents from the original page. 2515 */ 2516 ppcopy(ppa[pg_idx], pp); 2517 2518 hat_setrefmod(pp); /* mark as modified */ 2519 2520 /* 2521 * Release the lock on the original page, 2522 * derement the old slot, and down grade the lock 2523 * on the new copy. 2524 */ 2525 page_unlock(ppa[pg_idx]); 2526 2527 if (!prealloc) 2528 page_downgrade(pp); 2529 2530 ppa[pg_idx] = pp; 2531 2532 /* 2533 * Now reflect the copy in the new anon array. 2534 */ 2535 ASSERT(ahmpages == NULL || oldap->an_refcnt > 1); 2536 if (oldap != NULL) 2537 anon_decref(oldap); 2538 (void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP); 2539 } 2540 if (ahmpages != NULL) { 2541 mutex_exit(ahmpages); 2542 } 2543 ASSERT(prealloc == 0 || pplist == NULL); 2544 if (prealloc) { 2545 VM_STAT_ADD(anonvmstats.privatepages[9]); 2546 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { 2547 page_downgrade(ppa[pg_idx]); 2548 } 2549 } 2550 2551 /* 2552 * Unload the old large page translation. 2553 */ 2554 hat_unload(seg->s_as->a_hat, addr, pgcnt << PAGESHIFT, HAT_UNLOAD); 2555 return (0); 2556 } 2557 2558 /* 2559 * Allocate a private zero-filled anon page. 2560 */ 2561 page_t * 2562 anon_zero(struct seg *seg, caddr_t addr, struct anon **app, struct cred *cred) 2563 { 2564 struct anon *ap; 2565 page_t *pp; 2566 struct vnode *vp; 2567 anoff_t off; 2568 page_t *anon_pl[1 + 1]; 2569 int err; 2570 2571 /* Kernel probe */ 2572 TNF_PROBE_1(anon_zero, "vm pagefault", /* CSTYLED */, 2573 tnf_opaque, address, addr); 2574 2575 *app = ap = anon_alloc(NULL, 0); 2576 swap_xlate(ap, &vp, &off); 2577 2578 /* 2579 * Call the VOP_GETPAGE routine to create the page, thereby 2580 * enabling the vnode driver to allocate any filesystem 2581 * dependent structures (e.g., disk block allocation for UFS). 2582 * This also prevents more than on page from being added to 2583 * the vnode at the same time since it is locked. 
2584 */ 2585 err = VOP_GETPAGE(vp, off, PAGESIZE, NULL, 2586 anon_pl, PAGESIZE, seg, addr, S_CREATE, cred); 2587 if (err) { 2588 *app = NULL; 2589 anon_decref(ap); 2590 return (NULL); 2591 } 2592 pp = anon_pl[0]; 2593 2594 pagezero(pp, 0, PAGESIZE); /* XXX - should set mod bit */ 2595 page_downgrade(pp); 2596 CPU_STATS_ADD_K(vm, zfod, 1); 2597 hat_setrefmod(pp); /* mark as modified so pageout writes back */ 2598 return (pp); 2599 } 2600 2601 2602 /* 2603 * Allocate array of private zero-filled anon pages for empty slots 2604 * and kept pages for non empty slots within given range. 2605 * 2606 * NOTE: This rontine will try and use large pages 2607 * if available and supported by underlying platform. 2608 */ 2609 int 2610 anon_map_createpages( 2611 struct anon_map *amp, 2612 ulong_t start_index, 2613 size_t len, 2614 page_t *ppa[], 2615 struct seg *seg, 2616 caddr_t addr, 2617 enum seg_rw rw, 2618 struct cred *cred) 2619 { 2620 2621 struct anon *ap; 2622 struct vnode *ap_vp; 2623 page_t *pp, *pplist, *anon_pl[1 + 1], *conpp = NULL; 2624 int err = 0; 2625 ulong_t p_index, index; 2626 pgcnt_t npgs, pg_cnt; 2627 spgcnt_t nreloc = 0; 2628 uint_t l_szc, szc, prot; 2629 anoff_t ap_off; 2630 size_t pgsz; 2631 lgrp_t *lgrp; 2632 2633 /* 2634 * XXX For now only handle S_CREATE. 2635 */ 2636 ASSERT(rw == S_CREATE); 2637 2638 index = start_index; 2639 p_index = 0; 2640 npgs = btopr(len); 2641 2642 /* 2643 * If this platform supports multiple page sizes 2644 * then try and allocate directly from the free 2645 * list for pages larger than PAGESIZE. 2646 * 2647 * NOTE:When we have page_create_ru we can stop 2648 * directly allocating from the freelist. 2649 */ 2650 l_szc = seg->s_szc; 2651 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 2652 while (npgs) { 2653 2654 /* 2655 * if anon slot already exists 2656 * (means page has been created) 2657 * so 1) look up the page 2658 * 2) if the page is still in memory, get it. 2659 * 3) if not, create a page and 2660 * page in from physical swap device. 2661 * These are done in anon_getpage(). 2662 */ 2663 ap = anon_get_ptr(amp->ahp, index); 2664 if (ap) { 2665 err = anon_getpage(&ap, &prot, anon_pl, PAGESIZE, 2666 seg, addr, S_READ, cred); 2667 if (err) { 2668 ANON_LOCK_EXIT(&->a_rwlock); 2669 panic("anon_map_createpages: anon_getpage"); 2670 } 2671 pp = anon_pl[0]; 2672 ppa[p_index++] = pp; 2673 2674 addr += PAGESIZE; 2675 index++; 2676 npgs--; 2677 continue; 2678 } 2679 /* 2680 * Now try and allocate the largest page possible 2681 * for the current address and range. 2682 * Keep dropping down in page size until: 2683 * 2684 * 1) Properly aligned 2685 * 2) Does not overlap existing anon pages 2686 * 3) Fits in remaining range. 2687 * 4) able to allocate one. 2688 * 2689 * NOTE: XXX When page_create_ru is completed this code 2690 * will change. 2691 */ 2692 szc = l_szc; 2693 pplist = NULL; 2694 pg_cnt = 0; 2695 while (szc) { 2696 pgsz = page_get_pagesize(szc); 2697 pg_cnt = pgsz >> PAGESHIFT; 2698 if (IS_P2ALIGNED(addr, pgsz) && pg_cnt <= npgs && 2699 anon_pages(amp->ahp, index, pg_cnt) == 0) { 2700 /* 2701 * XXX 2702 * Since we are faking page_create() 2703 * we also need to do the freemem and 2704 * pcf accounting. 
2705 */ 2706 (void) page_create_wait(pg_cnt, PG_WAIT); 2707 2708 /* 2709 * Get lgroup to allocate next page of shared 2710 * memory from and use it to specify where to 2711 * allocate the physical memory 2712 */ 2713 lgrp = lgrp_mem_choose(seg, addr, pgsz); 2714 2715 pplist = page_get_freelist( 2716 anon_vp, (u_offset_t)0, seg, 2717 addr, pgsz, 0, lgrp); 2718 2719 if (pplist == NULL) { 2720 page_create_putback(pg_cnt); 2721 } 2722 2723 /* 2724 * If a request for a page of size 2725 * larger than PAGESIZE failed 2726 * then don't try that size anymore. 2727 */ 2728 if (pplist == NULL) { 2729 l_szc = szc - 1; 2730 } else { 2731 break; 2732 } 2733 } 2734 szc--; 2735 } 2736 2737 /* 2738 * If just using PAGESIZE pages then don't 2739 * directly allocate from the free list. 2740 */ 2741 if (pplist == NULL) { 2742 ASSERT(szc == 0); 2743 pp = anon_zero(seg, addr, &ap, cred); 2744 if (pp == NULL) { 2745 ANON_LOCK_EXIT(&->a_rwlock); 2746 panic("anon_map_createpages: anon_zero"); 2747 } 2748 ppa[p_index++] = pp; 2749 2750 ASSERT(anon_get_ptr(amp->ahp, index) == NULL); 2751 (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP); 2752 2753 addr += PAGESIZE; 2754 index++; 2755 npgs--; 2756 continue; 2757 } 2758 2759 /* 2760 * pplist is a list of pg_cnt PAGESIZE pages. 2761 * These pages are locked SE_EXCL since they 2762 * came directly off the free list. 2763 */ 2764 ASSERT(IS_P2ALIGNED(pg_cnt, pg_cnt)); 2765 ASSERT(IS_P2ALIGNED(index, pg_cnt)); 2766 ASSERT(conpp == NULL); 2767 while (pg_cnt--) { 2768 2769 ap = anon_alloc(NULL, 0); 2770 swap_xlate(ap, &ap_vp, &ap_off); 2771 2772 ASSERT(pplist != NULL); 2773 pp = pplist; 2774 page_sub(&pplist, pp); 2775 PP_CLRFREE(pp); 2776 PP_CLRAGED(pp); 2777 conpp = pp; 2778 2779 err = swap_getconpage(ap_vp, ap_off, PAGESIZE, 2780 (uint_t *)NULL, anon_pl, PAGESIZE, conpp, &nreloc, 2781 seg, addr, S_CREATE, cred); 2782 2783 if (err) { 2784 ANON_LOCK_EXIT(&->a_rwlock); 2785 panic("anon_map_createpages: S_CREATE"); 2786 } 2787 2788 ASSERT(anon_pl[0] == pp); 2789 ASSERT(nreloc == 1); 2790 pagezero(pp, 0, PAGESIZE); 2791 CPU_STATS_ADD_K(vm, zfod, 1); 2792 hat_setrefmod(pp); 2793 2794 ASSERT(anon_get_ptr(amp->ahp, index) == NULL); 2795 (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP); 2796 2797 ppa[p_index++] = pp; 2798 2799 addr += PAGESIZE; 2800 index++; 2801 npgs--; 2802 } 2803 conpp = NULL; 2804 pg_cnt = pgsz >> PAGESHIFT; 2805 p_index = p_index - pg_cnt; 2806 while (pg_cnt--) { 2807 page_downgrade(ppa[p_index++]); 2808 } 2809 } 2810 ANON_LOCK_EXIT(&->a_rwlock); 2811 return (0); 2812 } 2813 2814 int 2815 anon_map_demotepages( 2816 struct anon_map *amp, 2817 ulong_t start_idx, 2818 struct seg *seg, 2819 caddr_t addr, 2820 uint_t prot, 2821 struct vpage vpage[], 2822 struct cred *cred) 2823 { 2824 struct anon *ap; 2825 uint_t szc = seg->s_szc; 2826 pgcnt_t pgcnt = page_get_pagecnt(szc); 2827 size_t ppasize = pgcnt * sizeof (page_t *); 2828 page_t **ppa = kmem_alloc(ppasize, KM_SLEEP); 2829 page_t *pp; 2830 page_t *pl[2]; 2831 pgcnt_t i, pg_idx; 2832 ulong_t an_idx; 2833 caddr_t vaddr; 2834 kmutex_t *ahmpages = NULL; 2835 int err; 2836 int retry = 0; 2837 uint_t vpprot; 2838 2839 ASSERT(RW_WRITE_HELD(&->a_rwlock)); 2840 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 2841 ASSERT(IS_P2ALIGNED(start_idx, pgcnt)); 2842 ASSERT(ppa != NULL); 2843 2844 VM_STAT_ADD(anonvmstats.demotepages[0]); 2845 2846 ap = anon_get_ptr(amp->ahp, start_idx); 2847 if (ap != NULL) { 2848 VM_STAT_ADD(anonvmstats.demotepages[1]); 2849 ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 2850 
mutex_enter(ahmpages); 2851 } 2852 top: 2853 if (ap == NULL || ap->an_refcnt <= 1) { 2854 int root = 0; 2855 pgcnt_t npgs, curnpgs = 0; 2856 2857 VM_STAT_ADD(anonvmstats.demotepages[2]); 2858 2859 ASSERT(retry == 0 || ap != NULL); 2860 2861 if (ahmpages != NULL) 2862 mutex_exit(ahmpages); 2863 an_idx = start_idx; 2864 for (i = 0; i < pgcnt; i++, an_idx++) { 2865 ap = anon_get_ptr(amp->ahp, an_idx); 2866 if (ap != NULL) { 2867 ASSERT(ap->an_refcnt == 1); 2868 pp = ppa[i] = page_lookup(ap->an_vp, ap->an_off, 2869 SE_EXCL); 2870 if (pp != NULL) { 2871 (void) hat_pageunload(pp, 2872 HAT_FORCE_PGUNLOAD); 2873 } 2874 } else { 2875 ppa[i] = NULL; 2876 } 2877 } 2878 for (i = 0; i < pgcnt; i++) { 2879 if ((pp = ppa[i]) != NULL && pp->p_szc != 0) { 2880 ASSERT(pp->p_szc <= szc); 2881 if (!root) { 2882 VM_STAT_ADD(anonvmstats.demotepages[3]); 2883 if (curnpgs != 0) 2884 panic("anon_map_demotepages: " 2885 "bad large page"); 2886 2887 root = 1; 2888 curnpgs = npgs = 2889 page_get_pagecnt(pp->p_szc); 2890 2891 ASSERT(npgs <= pgcnt); 2892 ASSERT(IS_P2ALIGNED(npgs, npgs)); 2893 ASSERT(!(page_pptonum(pp) & 2894 (npgs - 1))); 2895 } else { 2896 ASSERT(i > 0); 2897 ASSERT(page_pptonum(pp) - 1 == 2898 page_pptonum(ppa[i - 1])); 2899 if ((page_pptonum(pp) & (npgs - 1)) == 2900 npgs - 1) 2901 root = 0; 2902 } 2903 ASSERT(PAGE_EXCL(pp)); 2904 pp->p_szc = 0; 2905 curnpgs--; 2906 } 2907 } 2908 if (root != 0 || curnpgs != 0) 2909 panic("anon_map_demotepages: bad large page"); 2910 2911 for (i = 0; i < pgcnt; i++) { 2912 if ((pp = ppa[i]) != NULL) { 2913 ASSERT(!hat_page_is_mapped(pp)); 2914 ASSERT(pp->p_szc == 0); 2915 page_unlock(pp); 2916 } 2917 } 2918 kmem_free(ppa, ppasize); 2919 return (0); 2920 } 2921 ASSERT(ahmpages != NULL); 2922 mutex_exit(ahmpages); 2923 ahmpages = NULL; 2924 2925 VM_STAT_ADD(anonvmstats.demotepages[4]); 2926 2927 ASSERT(retry == 0); /* we can be here only once */ 2928 2929 vaddr = addr; 2930 for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt; 2931 pg_idx++, an_idx++, vaddr += PAGESIZE) { 2932 ap = anon_get_ptr(amp->ahp, an_idx); 2933 if (ap == NULL) 2934 panic("anon_map_demotepages: no anon slot"); 2935 err = anon_getpage(&ap, &vpprot, pl, PAGESIZE, seg, vaddr, 2936 S_READ, cred); 2937 if (err) { 2938 for (i = 0; i < pg_idx; i++) { 2939 if ((pp = ppa[i]) != NULL) 2940 page_unlock(pp); 2941 } 2942 kmem_free(ppa, ppasize); 2943 return (err); 2944 } 2945 ppa[pg_idx] = pl[0]; 2946 } 2947 2948 err = anon_map_privatepages(amp, start_idx, szc, seg, addr, prot, ppa, 2949 vpage, -1, cred); 2950 if (err > 0) { 2951 VM_STAT_ADD(anonvmstats.demotepages[5]); 2952 kmem_free(ppa, ppasize); 2953 return (err); 2954 } 2955 ASSERT(err == 0 || err == -1); 2956 if (err == -1) { 2957 VM_STAT_ADD(anonvmstats.demotepages[6]); 2958 retry = 1; 2959 goto top; 2960 } 2961 for (i = 0; i < pgcnt; i++) { 2962 ASSERT(ppa[i] != NULL); 2963 if (ppa[i]->p_szc != 0) 2964 retry = 1; 2965 page_unlock(ppa[i]); 2966 } 2967 if (retry) { 2968 VM_STAT_ADD(anonvmstats.demotepages[7]); 2969 goto top; 2970 } 2971 2972 VM_STAT_ADD(anonvmstats.demotepages[8]); 2973 2974 kmem_free(ppa, ppasize); 2975 2976 return (0); 2977 } 2978 2979 /* 2980 * Allocate and initialize an anon_map structure for seg 2981 * associating the given swap reservation with the new anon_map. 
 */
struct anon_map *
anonmap_alloc(size_t size, size_t swresv)
{
        struct anon_map *amp;

        amp = kmem_cache_alloc(anonmap_cache, KM_SLEEP);

        amp->refcnt = 1;
        amp->size = size;

        amp->ahp = anon_create(btopr(size), ANON_SLEEP);
        amp->swresv = swresv;
        amp->locality = 0;
        amp->a_szc = 0;
        return (amp);
}

void
anonmap_free(struct anon_map *amp)
{
        ASSERT(amp->ahp);
        ASSERT(amp->refcnt == 0);

        lgrp_shm_policy_fini(amp, NULL);
        anon_release(amp->ahp, btopr(amp->size));
        kmem_cache_free(anonmap_cache, amp);
}

/*
 * Returns true if the ahp array has some empty slots.
 * The offp and lenp parameters are in/out parameters.  On entry
 * these values represent the starting offset and length of the
 * mapping.  When true is returned, these values may be modified
 * to be the largest range which includes empty slots.
 */
int
non_anon(struct anon_hdr *ahp, ulong_t anon_idx, u_offset_t *offp,
        size_t *lenp)
{
        ulong_t i, el;
        ssize_t low, high;
        struct anon *ap;

        low = -1;
        for (i = 0, el = *lenp; i < el; i += PAGESIZE, anon_idx++) {
                ap = anon_get_ptr(ahp, anon_idx);
                if (ap == NULL) {
                        if (low == -1)
                                low = i;
                        high = i;
                }
        }
        if (low != -1) {
                /*
                 * Found at least one non-anon page.
                 * Set up the off and len return values.
                 */
                if (low != 0)
                        *offp += low;
                *lenp = high - low + PAGESIZE;
                return (1);
        }
        return (0);
}

/*
 * Return a count of the number of existing anon pages in the anon array
 * ahp in the slot range (anon_index, anon_index + nslots).  The array and
 * slots must be guaranteed stable by the caller.
 */
pgcnt_t
anon_pages(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots)
{
        pgcnt_t cnt = 0;

        while (nslots-- > 0) {
                if ((anon_get_ptr(ahp, anon_index)) != NULL)
                        cnt++;
                anon_index++;
        }
        return (cnt);
}
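
/*
 * Illustrative sketch (not part of the original source): one way a caller
 * might combine non_anon() and anon_pages() to classify a range of anon
 * slots as fully populated, partially populated, or empty.  The guard
 * macro ANON_USAGE_SKETCHES and the function name below are hypothetical.
 */
#ifdef ANON_USAGE_SKETCHES
static int
anon_range_state(struct anon_hdr *ahp, ulong_t an_idx, pgcnt_t nslots,
    u_offset_t off)
{
        pgcnt_t filled;
        u_offset_t hole_off = off;
        size_t hole_len = ptob(nslots);

        /*
         * non_anon() reports whether the range has any empty slots and,
         * if so, narrows (hole_off, hole_len) to the largest range that
         * still includes them.
         */
        if (!non_anon(ahp, an_idx, &hole_off, &hole_len))
                return (0);             /* fully populated */

        /* Count how many slots are already backed by anon structs. */
        filled = anon_pages(ahp, an_idx, nslots);
        return (filled == 0 ? 2 : 1);   /* 2 = empty, 1 = partial */
}
#endif  /* ANON_USAGE_SKETCHES */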

/*
 * Move reserved phys swap into memory swap (unreserve phys swap
 * and reserve mem swap by the same amount).
 * Used by segspt when it needs to lock reserved swap npages in memory.
 */
int
anon_swap_adjust(pgcnt_t npages)
{
        pgcnt_t unlocked_mem_swap;

        mutex_enter(&anoninfo_lock);

        ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
        ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);

        unlocked_mem_swap = k_anoninfo.ani_mem_resv
            - k_anoninfo.ani_locked_swap;
        if (npages > unlocked_mem_swap) {
                spgcnt_t adjusted_swap = npages - unlocked_mem_swap;

                /*
                 * if there is not enough unlocked mem swap we take missing
                 * amount from phys swap and give it to mem swap
                 */
                mutex_enter(&freemem_lock);
                if (availrmem < adjusted_swap + segspt_minfree) {
                        mutex_exit(&freemem_lock);
                        mutex_exit(&anoninfo_lock);
                        return (ENOMEM);
                }
                availrmem -= adjusted_swap;
                mutex_exit(&freemem_lock);

                k_anoninfo.ani_mem_resv += adjusted_swap;
                ASSERT(k_anoninfo.ani_phys_resv >= adjusted_swap);
                k_anoninfo.ani_phys_resv -= adjusted_swap;

                ANI_ADD(adjusted_swap);
        }
        k_anoninfo.ani_locked_swap += npages;

        ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
        ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);

        mutex_exit(&anoninfo_lock);

        return (0);
}

/*
 * 'Unlock' reserved mem swap so that when it is unreserved it
 * can be moved back to phys (disk) swap.
 */
void
anon_swap_restore(pgcnt_t npages)
{
        mutex_enter(&anoninfo_lock);

        ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv);

        ASSERT(k_anoninfo.ani_locked_swap >= npages);
        k_anoninfo.ani_locked_swap -= npages;

        ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv);

        mutex_exit(&anoninfo_lock);
}

/*
 * Return the pointer from the list for a
 * specified anon index.
 */
ulong_t *
anon_get_slot(struct anon_hdr *ahp, ulong_t an_idx)
{
        struct anon **app;
        void **ppp;

        ASSERT(an_idx < ahp->size);

        /*
         * Single level case.
         */
        if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) {
                return ((ulong_t *)&ahp->array_chunk[an_idx]);
        } else {

                /*
                 * 2 level case.
                 */
                ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
                if (*ppp == NULL) {
                        mutex_enter(&ahp->serial_lock);
                        ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
                        if (*ppp == NULL)
                                *ppp = kmem_zalloc(PAGESIZE, KM_SLEEP);
                        mutex_exit(&ahp->serial_lock);
                }
                app = *ppp;
                return ((ulong_t *)&app[an_idx & ANON_CHUNK_OFF]);
        }
}
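
/*
 * Illustrative sketch (not part of the original source): the pairing a
 * client such as segspt is expected to follow when it wants npages of
 * already-reserved swap treated as locked memory swap.  The guard macro
 * ANON_USAGE_SKETCHES and the function name are hypothetical.
 */
#ifdef ANON_USAGE_SKETCHES
static int
anon_lock_swap_sketch(pgcnt_t npages)
{
        int err;

        /*
         * Move npages of reserved phys swap over to mem swap and count
         * it as locked; fails with ENOMEM if availrmem is too low.
         */
        if ((err = anon_swap_adjust(npages)) != 0)
                return (err);

        /*
         * ... lock the pages in memory here ...
         *
         * When the pages are later unlocked, the same count must be
         * given back so the reservation can migrate to phys swap again.
         */
        anon_swap_restore(npages);
        return (0);
}
#endif  /* ANON_USAGE_SKETCHES */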

void
anon_array_enter(struct anon_map *amp, ulong_t an_idx, anon_sync_obj_t *sobj)
{
        ulong_t *ap_slot;
        kmutex_t *mtx;
        kcondvar_t *cv;
        int hash;

        /*
         * Use szc to determine anon slot(s) to appear atomic.
         * If szc = 0, then lock the anon slot and mark it busy.
         * If szc > 0, then lock the range of slots by getting the
         * anon_array_lock for the first anon slot, and mark only the
         * first anon slot busy to represent whole range being busy.
         */

        ASSERT(RW_READ_HELD(&amp->a_rwlock));
        an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc));
        hash = ANON_ARRAY_HASH(amp, an_idx);
        sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex;
        sobj->sync_cv = cv = &anon_array_cv[hash];
        mutex_enter(mtx);
        ap_slot = anon_get_slot(amp->ahp, an_idx);
        while (ANON_ISBUSY(ap_slot))
                cv_wait(cv, mtx);
        ANON_SETBUSY(ap_slot);
        sobj->sync_data = ap_slot;
        mutex_exit(mtx);
}

void
anon_array_exit(anon_sync_obj_t *sobj)
{
        mutex_enter(sobj->sync_mutex);
        ASSERT(ANON_ISBUSY(sobj->sync_data));
        ANON_CLRBUSY(sobj->sync_data);
        if (CV_HAS_WAITERS(sobj->sync_cv))
                cv_broadcast(sobj->sync_cv);
        mutex_exit(sobj->sync_mutex);
}
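
/*
 * Illustrative sketch (not part of the original source): the bracket a
 * caller typically places around work on a single anon slot so that
 * concurrent lwps operating on the same slot are serialized.  The guard
 * macro ANON_USAGE_SKETCHES and the function name are hypothetical;
 * error handling is omitted.
 */
#ifdef ANON_USAGE_SKETCHES
static struct anon *
anon_slot_lookup_sketch(struct anon_map *amp, ulong_t an_idx)
{
        anon_sync_obj_t cookie;
        struct anon *ap;

        /* Caller is assumed to hold amp->a_rwlock at least as reader. */
        anon_array_enter(amp, an_idx, &cookie);

        /*
         * While the slot is marked busy, other threads calling
         * anon_array_enter() for the same (possibly large-page aligned)
         * index block on the hashed condition variable.
         */
        ap = anon_get_ptr(amp->ahp, an_idx);

        anon_array_exit(&cookie);
        return (ap);
}
#endif  /* ANON_USAGE_SKETCHES */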