/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

/*
 * VM - anonymous pages.
 *
 * This layer sits immediately above the vm_swap layer.  It manages
 * physical pages that have no permanent identity in the file system
 * name space, using the services of the vm_swap layer to allocate
 * backing storage for these pages.  Since these pages have no external
 * identity, they are discarded when the last reference is removed.
 *
 * An important function of this layer is to manage low-level sharing
 * of pages that are logically distinct but that happen to be
 * physically identical (e.g., the corresponding pages of the processes
 * resulting from a fork before one process or the other changes their
 * contents).  This pseudo-sharing is present only as an optimization
 * and is not to be confused with true sharing in which multiple
 * address spaces deliberately contain references to the same object;
 * such sharing is managed at a higher level.
 *
 * The key data structure here is the anon struct, which contains a
 * reference count for its associated physical page and a hint about
 * the identity of that page.  Anon structs typically live in arrays,
 * with an instance's position in its array determining where the
 * corresponding backing storage is allocated; however, the swap_xlate()
 * routine abstracts away this representation information so that the
 * rest of the anon layer need not know it.  (See the swap layer for
 * more details on anon struct layout.)
 *
 * In future versions of the system, the association between an
 * anon struct and its position on backing store will change so that
 * we don't require backing store for all anonymous pages in the system.
 * This is an important consideration for large memory systems.
 * We can also use this technique to delay binding physical locations
 * to anonymous pages until pageout/swapout time, when we can make
 * smarter allocation decisions to improve anonymous klustering.
 *
 * Many of the routines defined here take a (struct anon **) argument,
 * which allows the code at this level to manage anon pages directly,
 * so that callers can regard anon structs as opaque objects and not be
 * concerned with assigning or inspecting their contents.
 *
 * Clients of this layer refer to anon pages indirectly.  That is, they
 * maintain arrays of pointers to anon structs rather than maintaining
 * anon structs themselves.  The (struct anon **) arguments mentioned
 * above are pointers to entries in these arrays.  It is these arrays
 * that capture the mapping between offsets within a given segment and
 * the corresponding anonymous backing storage address.
 */
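
/*
 * Illustrative sketch (not taken from an actual caller): the typical
 * life cycle of an anon pointer array as driven by a segment driver.
 * The names ahp, ap, an_idx and npages are placeholders; real clients
 * such as seg_vn keep the array inside an anon_map and add their own
 * locking around these calls.
 *
 *	struct anon_hdr *ahp;
 *	struct anon *ap;
 *
 *	ahp = anon_create(npages, ANON_SLEEP);		// pointer array
 *	ap = anon_alloc(NULL, 0);			// new anonymous slot
 *	(void) anon_set_ptr(ahp, an_idx, ap, ANON_SLEEP);
 *	...
 *	ap = anon_get_ptr(ahp, an_idx);			// look the slot back up
 *	if (ap != NULL)
 *		anon_decref(ap);			// drop the reference
 *	anon_release(ahp, npages);			// free the pointer array
 */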

#ifdef DEBUG
#define	ANON_DEBUG
#endif

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/cred.h>
#include <sys/thread.h>
#include <sys/vnode.h>
#include <sys/cpuvar.h>
#include <sys/swap.h>
#include <sys/cmn_err.h>
#include <sys/vtrace.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/bitmap.h>
#include <sys/vmsystm.h>
#include <sys/debug.h>
#include <sys/fs/swapnode.h>
#include <sys/tnf_probe.h>
#include <sys/lgrp.h>
#include <sys/policy.h>
#include <sys/condvar_impl.h>
#include <sys/mutex_impl.h>

#include <vm/as.h>
#include <vm/hat.h>
#include <vm/anon.h>
#include <vm/page.h>
#include <vm/vpage.h>
#include <vm/seg.h>
#include <vm/rm.h>

#include <fs/fs_subr.h>

struct vnode *anon_vp;

int anon_debug;

kmutex_t	anoninfo_lock;
struct		k_anoninfo k_anoninfo;
ani_free_t	ani_free_pool[ANI_MAX_POOL];
pad_mutex_t	anon_array_lock[ANON_LOCKSIZE];
kcondvar_t	anon_array_cv[ANON_LOCKSIZE];

/*
 * Global hash table for (vp, off) -> anon slot
 */
extern	int swap_maxcontig;
size_t	anon_hash_size;
struct anon	**anon_hash;

static struct kmem_cache *anon_cache;
static struct kmem_cache *anonmap_cache;

#ifdef VM_STATS
static struct anonvmstats_str {
	ulong_t getpages[30];
	ulong_t privatepages[10];
	ulong_t demotepages[9];
	ulong_t decrefpages[9];
	ulong_t	dupfillholes[4];
	ulong_t freepages[1];
} anonvmstats;
#endif /* VM_STATS */


/*ARGSUSED*/
static int
anonmap_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct anon_map *amp = buf;

	rw_init(&amp->a_rwlock, NULL, RW_DEFAULT, NULL);
	return (0);
}

/*ARGSUSED1*/
static void
anonmap_cache_destructor(void *buf, void *cdrarg)
{
	struct anon_map *amp = buf;

	rw_destroy(&amp->a_rwlock);
}

kmutex_t	anonhash_lock[AH_LOCK_SIZE];
kmutex_t	anonpages_hash_lock[AH_LOCK_SIZE];

void
anon_init(void)
{
	int i;

	anon_hash_size = 1L << highbit(physmem / ANON_HASHAVELEN);

	for (i = 0; i < AH_LOCK_SIZE; i++) {
		mutex_init(&anonhash_lock[i], NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&anonpages_hash_lock[i], NULL, MUTEX_DEFAULT, NULL);
	}

	for (i = 0; i < ANON_LOCKSIZE; i++) {
		mutex_init(&anon_array_lock[i].pad_mutex, NULL,
		    MUTEX_DEFAULT, NULL);
		cv_init(&anon_array_cv[i], NULL, CV_DEFAULT, NULL);
	}

	anon_hash = (struct anon **)
	    kmem_zalloc(sizeof (struct anon *) * anon_hash_size, KM_SLEEP);
	anon_cache = kmem_cache_create("anon_cache", sizeof (struct anon),
AN_CACHE_ALIGN, NULL, NULL, NULL, NULL, NULL, 0); 203 anonmap_cache = kmem_cache_create("anonmap_cache", 204 sizeof (struct anon_map), 0, 205 anonmap_cache_constructor, anonmap_cache_destructor, NULL, 206 NULL, NULL, 0); 207 swap_maxcontig = (1024 * 1024) >> PAGESHIFT; /* 1MB of pages */ 208 209 anon_vp = vn_alloc(KM_SLEEP); 210 vn_setops(anon_vp, swap_vnodeops); 211 anon_vp->v_type = VREG; 212 anon_vp->v_flag |= (VISSWAP|VISSWAPFS); 213 } 214 215 /* 216 * Global anon slot hash table manipulation. 217 */ 218 219 static void 220 anon_addhash(struct anon *ap) 221 { 222 int index; 223 224 ASSERT(MUTEX_HELD(&anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)])); 225 index = ANON_HASH(ap->an_vp, ap->an_off); 226 ap->an_hash = anon_hash[index]; 227 anon_hash[index] = ap; 228 } 229 230 static void 231 anon_rmhash(struct anon *ap) 232 { 233 struct anon **app; 234 235 ASSERT(MUTEX_HELD(&anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)])); 236 237 for (app = &anon_hash[ANON_HASH(ap->an_vp, ap->an_off)]; 238 *app; app = &((*app)->an_hash)) { 239 if (*app == ap) { 240 *app = ap->an_hash; 241 break; 242 } 243 } 244 } 245 246 /* 247 * The anon array interfaces. Functions allocating, 248 * freeing array of pointers, and returning/setting 249 * entries in the array of pointers for a given offset. 250 * 251 * Create the list of pointers 252 */ 253 struct anon_hdr * 254 anon_create(pgcnt_t npages, int flags) 255 { 256 struct anon_hdr *ahp; 257 ulong_t nchunks; 258 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 259 260 if ((ahp = kmem_zalloc(sizeof (struct anon_hdr), kmemflags)) == NULL) { 261 return (NULL); 262 } 263 264 mutex_init(&ahp->serial_lock, NULL, MUTEX_DEFAULT, NULL); 265 /* 266 * Single level case. 267 */ 268 ahp->size = npages; 269 if (npages <= ANON_CHUNK_SIZE || (flags & ANON_ALLOC_FORCE)) { 270 271 if (flags & ANON_ALLOC_FORCE) 272 ahp->flags |= ANON_ALLOC_FORCE; 273 274 ahp->array_chunk = kmem_zalloc( 275 ahp->size * sizeof (struct anon *), kmemflags); 276 277 if (ahp->array_chunk == NULL) { 278 kmem_free(ahp, sizeof (struct anon_hdr)); 279 return (NULL); 280 } 281 } else { 282 /* 283 * 2 Level case. 284 * anon hdr size needs to be rounded off to be a multiple 285 * of ANON_CHUNK_SIZE. This is important as various anon 286 * related functions depend on this. 287 * NOTE - 288 * anon_grow() makes anon hdr size a multiple of 289 * ANON_CHUNK_SIZE. 290 * amp size is <= anon hdr size. 291 * anon_index + seg_pgs <= anon hdr size. 292 */ 293 ahp->size = P2ROUNDUP(npages, ANON_CHUNK_SIZE); 294 nchunks = ahp->size >> ANON_CHUNK_SHIFT; 295 296 ahp->array_chunk = kmem_zalloc(nchunks * sizeof (ulong_t *), 297 kmemflags); 298 299 if (ahp->array_chunk == NULL) { 300 kmem_free(ahp, sizeof (struct anon_hdr)); 301 return (NULL); 302 } 303 } 304 return (ahp); 305 } 306 307 /* 308 * Free the array of pointers 309 */ 310 void 311 anon_release(struct anon_hdr *ahp, pgcnt_t npages) 312 { 313 ulong_t i; 314 void **ppp; 315 ulong_t nchunks; 316 317 ASSERT(npages <= ahp->size); 318 319 /* 320 * Single level case. 321 */ 322 if (npages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 323 kmem_free(ahp->array_chunk, ahp->size * sizeof (struct anon *)); 324 } else { 325 /* 326 * 2 level case. 
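		 * Each level 1 entry points to a PAGESIZE chunk of anon
		 * pointers, so slot i is located as
		 *	array_chunk[i >> ANON_CHUNK_SHIFT][i & ANON_CHUNK_OFF]
		 * (see anon_get_ptr() below).  Free every chunk that was
		 * actually allocated, then the level 1 array itself.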
327 */ 328 nchunks = ahp->size >> ANON_CHUNK_SHIFT; 329 for (i = 0; i < nchunks; i++) { 330 ppp = &ahp->array_chunk[i]; 331 if (*ppp != NULL) 332 kmem_free(*ppp, PAGESIZE); 333 } 334 kmem_free(ahp->array_chunk, nchunks * sizeof (ulong_t *)); 335 } 336 mutex_destroy(&ahp->serial_lock); 337 kmem_free(ahp, sizeof (struct anon_hdr)); 338 } 339 340 /* 341 * Return the pointer from the list for a 342 * specified anon index. 343 */ 344 struct anon * 345 anon_get_ptr(struct anon_hdr *ahp, ulong_t an_idx) 346 { 347 struct anon **app; 348 349 ASSERT(an_idx < ahp->size); 350 351 /* 352 * Single level case. 353 */ 354 if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 355 return ((struct anon *) 356 ((uintptr_t)ahp->array_chunk[an_idx] & ANON_PTRMASK)); 357 } else { 358 359 /* 360 * 2 level case. 361 */ 362 app = ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 363 if (app) { 364 return ((struct anon *) 365 ((uintptr_t)app[an_idx & ANON_CHUNK_OFF] & 366 ANON_PTRMASK)); 367 } else { 368 return (NULL); 369 } 370 } 371 } 372 373 /* 374 * Return the anon pointer for the first valid entry in the anon list, 375 * starting from the given index. 376 */ 377 struct anon * 378 anon_get_next_ptr(struct anon_hdr *ahp, ulong_t *index) 379 { 380 struct anon *ap; 381 struct anon **app; 382 ulong_t chunkoff; 383 ulong_t i; 384 ulong_t j; 385 pgcnt_t size; 386 387 i = *index; 388 size = ahp->size; 389 390 ASSERT(i < size); 391 392 if ((size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 393 /* 394 * 1 level case 395 */ 396 while (i < size) { 397 ap = (struct anon *) 398 ((uintptr_t)ahp->array_chunk[i] & ANON_PTRMASK); 399 if (ap) { 400 *index = i; 401 return (ap); 402 } 403 i++; 404 } 405 } else { 406 /* 407 * 2 level case 408 */ 409 chunkoff = i & ANON_CHUNK_OFF; 410 while (i < size) { 411 app = ahp->array_chunk[i >> ANON_CHUNK_SHIFT]; 412 if (app) 413 for (j = chunkoff; j < ANON_CHUNK_SIZE; j++) { 414 ap = (struct anon *) 415 ((uintptr_t)app[j] & 416 ANON_PTRMASK); 417 if (ap) { 418 *index = i + (j - chunkoff); 419 return (ap); 420 } 421 } 422 chunkoff = 0; 423 i = (i + ANON_CHUNK_SIZE) & ~ANON_CHUNK_OFF; 424 } 425 } 426 *index = size; 427 return (NULL); 428 } 429 430 /* 431 * Set list entry with a given pointer for a specified offset 432 */ 433 int 434 anon_set_ptr(struct anon_hdr *ahp, ulong_t an_idx, struct anon *ap, int flags) 435 { 436 void **ppp; 437 struct anon **app; 438 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 439 uintptr_t *ap_addr; 440 441 ASSERT(an_idx < ahp->size); 442 443 /* 444 * Single level case. 445 */ 446 if (ahp->size <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 447 ap_addr = (uintptr_t *)&ahp->array_chunk[an_idx]; 448 } else { 449 450 /* 451 * 2 level case. 
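		 * The level 2 chunk for this index is allocated lazily;
		 * serial_lock together with the re-check of *ppp below
		 * keeps two threads from each installing a chunk for the
		 * same slot range.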
452 */ 453 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 454 455 ASSERT(ppp != NULL); 456 if (*ppp == NULL) { 457 mutex_enter(&ahp->serial_lock); 458 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 459 if (*ppp == NULL) { 460 *ppp = kmem_zalloc(PAGESIZE, kmemflags); 461 if (*ppp == NULL) { 462 mutex_exit(&ahp->serial_lock); 463 return (ENOMEM); 464 } 465 } 466 mutex_exit(&ahp->serial_lock); 467 } 468 app = *ppp; 469 ap_addr = (uintptr_t *)&app[an_idx & ANON_CHUNK_OFF]; 470 } 471 *ap_addr = (*ap_addr & ~ANON_PTRMASK) | (uintptr_t)ap; 472 return (0); 473 } 474 475 /* 476 * Copy anon array into a given new anon array 477 */ 478 int 479 anon_copy_ptr(struct anon_hdr *sahp, ulong_t s_idx, 480 struct anon_hdr *dahp, ulong_t d_idx, 481 pgcnt_t npages, int flags) 482 { 483 void **sapp, **dapp; 484 void *ap; 485 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 486 487 ASSERT((s_idx < sahp->size) && (d_idx < dahp->size)); 488 ASSERT((npages <= sahp->size) && (npages <= dahp->size)); 489 490 /* 491 * Both arrays are 1 level. 492 */ 493 if (((sahp->size <= ANON_CHUNK_SIZE) && 494 (dahp->size <= ANON_CHUNK_SIZE)) || 495 ((sahp->flags & ANON_ALLOC_FORCE) && 496 (dahp->flags & ANON_ALLOC_FORCE))) { 497 498 bcopy(&sahp->array_chunk[s_idx], &dahp->array_chunk[d_idx], 499 npages * sizeof (struct anon *)); 500 return (0); 501 } 502 503 /* 504 * Both arrays are 2 levels. 505 */ 506 if (sahp->size > ANON_CHUNK_SIZE && 507 dahp->size > ANON_CHUNK_SIZE && 508 ((sahp->flags & ANON_ALLOC_FORCE) == 0) && 509 ((dahp->flags & ANON_ALLOC_FORCE) == 0)) { 510 511 ulong_t sapidx, dapidx; 512 ulong_t *sap, *dap; 513 ulong_t chknp; 514 515 while (npages != 0) { 516 517 sapidx = s_idx & ANON_CHUNK_OFF; 518 dapidx = d_idx & ANON_CHUNK_OFF; 519 chknp = ANON_CHUNK_SIZE - MAX(sapidx, dapidx); 520 if (chknp > npages) 521 chknp = npages; 522 523 sapp = &sahp->array_chunk[s_idx >> ANON_CHUNK_SHIFT]; 524 if ((sap = *sapp) != NULL) { 525 dapp = &dahp->array_chunk[d_idx 526 >> ANON_CHUNK_SHIFT]; 527 if ((dap = *dapp) == NULL) { 528 *dapp = kmem_zalloc(PAGESIZE, 529 kmemflags); 530 if ((dap = *dapp) == NULL) 531 return (ENOMEM); 532 } 533 bcopy((sap + sapidx), (dap + dapidx), 534 chknp << ANON_PTRSHIFT); 535 } 536 s_idx += chknp; 537 d_idx += chknp; 538 npages -= chknp; 539 } 540 return (0); 541 } 542 543 /* 544 * At least one of the arrays is 2 level. 545 */ 546 while (npages--) { 547 if ((ap = anon_get_ptr(sahp, s_idx)) != NULL) { 548 ASSERT(!ANON_ISBUSY(anon_get_slot(sahp, s_idx))); 549 if (anon_set_ptr(dahp, d_idx, ap, flags) == ENOMEM) 550 return (ENOMEM); 551 } 552 s_idx++; 553 d_idx++; 554 } 555 return (0); 556 } 557 558 559 /* 560 * ANON_INITBUF is a convenience macro for anon_grow() below. It 561 * takes a buffer dst, which is at least as large as buffer src. It 562 * does a bcopy from src into dst, and then bzeros the extra bytes 563 * of dst. If tail is set, the data in src is tail aligned within 564 * dst instead of head aligned. 565 */ 566 567 #define ANON_INITBUF(src, srclen, dst, dstsize, tail) \ 568 if (tail) { \ 569 bzero((dst), (dstsize) - (srclen)); \ 570 bcopy((src), (char *)(dst) + (dstsize) - (srclen), (srclen)); \ 571 } else { \ 572 bcopy((src), (dst), (srclen)); \ 573 bzero((char *)(dst) + (srclen), (dstsize) - (srclen)); \ 574 } 575 576 #define ANON_1_LEVEL_INC (ANON_CHUNK_SIZE / 8) 577 #define ANON_2_LEVEL_INC (ANON_1_LEVEL_INC * ANON_CHUNK_SIZE) 578 579 /* 580 * anon_grow() is used to efficiently extend an existing anon array. 
581 * startidx_p points to the index into the anon array of the first page 582 * that is in use. oldseg_pgs is the number of pages in use, starting at 583 * *startidx_p. newpages is the number of additional pages desired. 584 * 585 * If startidx_p == NULL, startidx is taken to be 0 and cannot be changed. 586 * 587 * The growth is done by creating a new top level of the anon array, 588 * and (if the array is 2-level) reusing the existing second level arrays. 589 * 590 * flags can be used to specify ANON_NOSLEEP and ANON_GROWDOWN. 591 * 592 * Returns the new number of pages in the anon array. 593 */ 594 pgcnt_t 595 anon_grow(struct anon_hdr *ahp, ulong_t *startidx_p, pgcnt_t oldseg_pgs, 596 pgcnt_t newseg_pgs, int flags) 597 { 598 ulong_t startidx = startidx_p ? *startidx_p : 0; 599 pgcnt_t oldamp_pgs = ahp->size, newamp_pgs; 600 pgcnt_t oelems, nelems, totpages; 601 void **level1; 602 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 603 int growdown = (flags & ANON_GROWDOWN); 604 size_t newarrsz, oldarrsz; 605 void *level2; 606 607 ASSERT(!(startidx_p == NULL && growdown)); 608 ASSERT(startidx + oldseg_pgs <= ahp->size); 609 610 /* 611 * Determine the total number of pages needed in the new 612 * anon array. If growing down, totpages is all pages from 613 * startidx through the end of the array, plus <newseg_pgs> 614 * pages. If growing up, keep all pages from page 0 through 615 * the last page currently in use, plus <newseg_pgs> pages. 616 */ 617 if (growdown) 618 totpages = oldamp_pgs - startidx + newseg_pgs; 619 else 620 totpages = startidx + oldseg_pgs + newseg_pgs; 621 622 /* If the array is already large enough, just return. */ 623 624 if (oldamp_pgs >= totpages) { 625 if (growdown) 626 *startidx_p = oldamp_pgs - totpages; 627 return (oldamp_pgs); 628 } 629 630 /* 631 * oldamp_pgs/newamp_pgs are the total numbers of pages represented 632 * by the corresponding arrays. 633 * oelems/nelems are the number of pointers in the top level arrays 634 * which may be either level 1 or level 2. 635 * Will the new anon array be one level or two levels? 636 */ 637 if (totpages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 638 newamp_pgs = P2ROUNDUP(totpages, ANON_1_LEVEL_INC); 639 oelems = oldamp_pgs; 640 nelems = newamp_pgs; 641 } else { 642 newamp_pgs = P2ROUNDUP(totpages, ANON_2_LEVEL_INC); 643 oelems = (oldamp_pgs + ANON_CHUNK_OFF) >> ANON_CHUNK_SHIFT; 644 nelems = newamp_pgs >> ANON_CHUNK_SHIFT; 645 } 646 647 newarrsz = nelems * sizeof (void *); 648 level1 = kmem_alloc(newarrsz, kmemflags); 649 if (level1 == NULL) 650 return (0); 651 652 /* Are we converting from a one level to a two level anon array? */ 653 654 if (newamp_pgs > ANON_CHUNK_SIZE && oldamp_pgs <= ANON_CHUNK_SIZE && 655 !(ahp->flags & ANON_ALLOC_FORCE)) { 656 657 /* 658 * Yes, we're converting to a two level. Reuse old level 1 659 * as new level 2 if it is exactly PAGESIZE. Otherwise 660 * alloc a new level 2 and copy the old level 1 data into it. 
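		 * (A level 1 array of ANON_CHUNK_SIZE entries occupies the
		 * same space as a level 2 chunk, which is what makes the
		 * reuse below safe.)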
661 */ 662 if (oldamp_pgs == ANON_CHUNK_SIZE) { 663 level2 = (void *)ahp->array_chunk; 664 } else { 665 level2 = kmem_alloc(PAGESIZE, kmemflags); 666 if (level2 == NULL) { 667 kmem_free(level1, newarrsz); 668 return (0); 669 } 670 oldarrsz = oldamp_pgs * sizeof (void *); 671 672 ANON_INITBUF(ahp->array_chunk, oldarrsz, 673 level2, PAGESIZE, growdown); 674 kmem_free(ahp->array_chunk, oldarrsz); 675 } 676 bzero(level1, newarrsz); 677 if (growdown) 678 level1[nelems - 1] = level2; 679 else 680 level1[0] = level2; 681 } else { 682 oldarrsz = oelems * sizeof (void *); 683 684 ANON_INITBUF(ahp->array_chunk, oldarrsz, 685 level1, newarrsz, growdown); 686 kmem_free(ahp->array_chunk, oldarrsz); 687 } 688 689 ahp->array_chunk = level1; 690 ahp->size = newamp_pgs; 691 if (growdown) 692 *startidx_p = newamp_pgs - totpages; 693 694 return (newamp_pgs); 695 } 696 697 698 /* 699 * Called from clock handler to sync ani_free value. 700 */ 701 702 void 703 set_anoninfo(void) 704 { 705 int ix; 706 pgcnt_t total = 0; 707 708 for (ix = 0; ix < ANI_MAX_POOL; ix++) { 709 total += ani_free_pool[ix].ani_count; 710 } 711 k_anoninfo.ani_free = total; 712 } 713 714 /* 715 * Reserve anon space. 716 * 717 * It's no longer simply a matter of incrementing ani_resv to 718 * reserve swap space, we need to check memory-based as well 719 * as disk-backed (physical) swap. The following algorithm 720 * is used: 721 * Check the space on physical swap 722 * i.e. amount needed < ani_max - ani_phys_resv 723 * If we are swapping on swapfs check 724 * amount needed < (availrmem - swapfs_minfree) 725 * Since the algorithm to check for the quantity of swap space is 726 * almost the same as that for reserving it, we'll just use anon_resvmem 727 * with a flag to decrement availrmem. 728 * 729 * Return non-zero on success. 730 */ 731 int 732 anon_resvmem(size_t size, uint_t takemem) 733 { 734 pgcnt_t npages = btopr(size); 735 pgcnt_t mswap_pages = 0; 736 pgcnt_t pswap_pages = 0; 737 738 mutex_enter(&anoninfo_lock); 739 740 /* 741 * pswap_pages is the number of pages we can take from 742 * physical (i.e. disk-backed) swap. 743 */ 744 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 745 pswap_pages = k_anoninfo.ani_max - k_anoninfo.ani_phys_resv; 746 747 ANON_PRINT(A_RESV, 748 ("anon_resvmem: npages %lu takemem %u pswap %lu caller %p\n", 749 npages, takemem, pswap_pages, (void *)caller())); 750 751 if (npages <= pswap_pages) { 752 /* 753 * we have enough space on a physical swap 754 */ 755 if (takemem) 756 k_anoninfo.ani_phys_resv += npages; 757 mutex_exit(&anoninfo_lock); 758 return (1); 759 } else if (pswap_pages != 0) { 760 /* 761 * we have some space on a physical swap 762 */ 763 if (takemem) { 764 /* 765 * use up remainder of phys swap 766 */ 767 k_anoninfo.ani_phys_resv += pswap_pages; 768 ASSERT(k_anoninfo.ani_phys_resv == k_anoninfo.ani_max); 769 } 770 } 771 /* 772 * since (npages > pswap_pages) we need mem swap 773 * mswap_pages is the number of pages needed from availrmem 774 */ 775 ASSERT(npages > pswap_pages); 776 mswap_pages = npages - pswap_pages; 777 778 ANON_PRINT(A_RESV, ("anon_resvmem: need %ld pages from memory\n", 779 mswap_pages)); 780 781 /* 782 * priv processes can reserve memory as swap as long as availrmem 783 * remains greater than swapfs_minfree; in the case of non-priv 784 * processes, memory can be reserved as swap only if availrmem 785 * doesn't fall below (swapfs_minfree + swapfs_reserve). Thus, 786 * swapfs_reserve amount of memswap is not available to non-priv 787 * processes. 
This protects daemons such as automounter dying 788 * as a result of application processes eating away almost entire 789 * membased swap. This safeguard becomes useless if apps are run 790 * with root access. 791 * 792 * swapfs_reserve is minimum of 4Mb or 1/16 of physmem. 793 * 794 */ 795 mutex_enter(&freemem_lock); 796 if (availrmem > (swapfs_minfree + swapfs_reserve + mswap_pages) || 797 (availrmem > (swapfs_minfree + mswap_pages) && 798 secpolicy_resource(CRED()) == 0)) { 799 800 if (takemem) { 801 /* 802 * Take the memory from the rest of the system. 803 */ 804 availrmem -= mswap_pages; 805 mutex_exit(&freemem_lock); 806 k_anoninfo.ani_mem_resv += mswap_pages; 807 ANI_ADD(mswap_pages); 808 ANON_PRINT((A_RESV | A_MRESV), 809 ("anon_resvmem: took %ld pages of availrmem\n", 810 mswap_pages)); 811 } else { 812 mutex_exit(&freemem_lock); 813 } 814 815 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 816 mutex_exit(&anoninfo_lock); 817 return (1); 818 819 } else { 820 /* 821 * Fail if not enough memory 822 */ 823 824 if (takemem) { 825 k_anoninfo.ani_phys_resv -= pswap_pages; 826 } 827 828 mutex_exit(&freemem_lock); 829 mutex_exit(&anoninfo_lock); 830 ANON_PRINT(A_RESV, 831 ("anon_resvmem: not enough space from swapfs\n")); 832 return (0); 833 } 834 } 835 836 837 /* 838 * Give back an anon reservation. 839 */ 840 void 841 anon_unresv(size_t size) 842 { 843 pgcnt_t npages = btopr(size); 844 spgcnt_t mem_free_pages = 0; 845 pgcnt_t phys_free_slots; 846 #ifdef ANON_DEBUG 847 pgcnt_t mem_resv; 848 #endif 849 850 mutex_enter(&anoninfo_lock); 851 852 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 853 /* 854 * If some of this reservation belonged to swapfs 855 * give it back to availrmem. 856 * ani_mem_resv is the amount of availrmem swapfs has reserved. 857 * but some of that memory could be locked by segspt so we can only 858 * return non locked ani_mem_resv back to availrmem 859 */ 860 if (k_anoninfo.ani_mem_resv > k_anoninfo.ani_locked_swap) { 861 ANON_PRINT((A_RESV | A_MRESV), 862 ("anon_unresv: growing availrmem by %ld pages\n", 863 MIN(k_anoninfo.ani_mem_resv, npages))); 864 865 mem_free_pages = MIN((spgcnt_t)(k_anoninfo.ani_mem_resv - 866 k_anoninfo.ani_locked_swap), npages); 867 mutex_enter(&freemem_lock); 868 availrmem += mem_free_pages; 869 mutex_exit(&freemem_lock); 870 k_anoninfo.ani_mem_resv -= mem_free_pages; 871 872 ANI_ADD(-mem_free_pages); 873 } 874 /* 875 * The remainder of the pages is returned to phys swap 876 */ 877 ASSERT(npages >= mem_free_pages); 878 phys_free_slots = npages - mem_free_pages; 879 880 if (phys_free_slots) { 881 k_anoninfo.ani_phys_resv -= phys_free_slots; 882 } 883 884 #ifdef ANON_DEBUG 885 mem_resv = k_anoninfo.ani_mem_resv; 886 #endif 887 888 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 889 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 890 891 mutex_exit(&anoninfo_lock); 892 893 ANON_PRINT(A_RESV, ("anon_unresv: %lu, tot %lu, caller %p\n", 894 npages, mem_resv, (void *)caller())); 895 } 896 897 /* 898 * Allocate an anon slot and return it with the lock held. 
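 * (The hash lock taken to insert the slot is released again before the
 * return; the caller receives a slot with an_refcnt set to 1.)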
899 */ 900 struct anon * 901 anon_alloc(struct vnode *vp, anoff_t off) 902 { 903 struct anon *ap; 904 kmutex_t *ahm; 905 906 ap = kmem_cache_alloc(anon_cache, KM_SLEEP); 907 if (vp == NULL) { 908 swap_alloc(ap); 909 } else { 910 ap->an_vp = vp; 911 ap->an_off = off; 912 } 913 ap->an_refcnt = 1; 914 ap->an_pvp = NULL; 915 ap->an_poff = 0; 916 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 917 mutex_enter(ahm); 918 anon_addhash(ap); 919 mutex_exit(ahm); 920 ANI_ADD(-1); 921 ANON_PRINT(A_ANON, ("anon_alloc: returning ap %p, vp %p\n", 922 (void *)ap, (ap ? (void *)ap->an_vp : NULL))); 923 return (ap); 924 } 925 926 /* 927 * Decrement the reference count of an anon page. 928 * If reference count goes to zero, free it and 929 * its associated page (if any). 930 */ 931 void 932 anon_decref(struct anon *ap) 933 { 934 page_t *pp; 935 struct vnode *vp; 936 anoff_t off; 937 kmutex_t *ahm; 938 939 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 940 mutex_enter(ahm); 941 ASSERT(ap->an_refcnt != 0); 942 if (ap->an_refcnt == 0) 943 panic("anon_decref: slot count 0"); 944 if (--ap->an_refcnt == 0) { 945 swap_xlate(ap, &vp, &off); 946 mutex_exit(ahm); 947 948 /* 949 * If there is a page for this anon slot we will need to 950 * call VN_DISPOSE to get rid of the vp association and 951 * put the page back on the free list as really free. 952 * Acquire the "exclusive" lock to ensure that any 953 * pending i/o always completes before the swap slot 954 * is freed. 955 */ 956 pp = page_lookup(vp, (u_offset_t)off, SE_EXCL); 957 958 /* 959 * If there was a page, we've synchronized on it (getting 960 * the exclusive lock is as good as gettting the iolock) 961 * so now we can free the physical backing store. Also, this 962 * is where we would free the name of the anonymous page 963 * (swap_free(ap)), a no-op in the current implementation. 
964 */ 965 mutex_enter(ahm); 966 ASSERT(ap->an_refcnt == 0); 967 anon_rmhash(ap); 968 if (ap->an_pvp) 969 swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE); 970 mutex_exit(ahm); 971 972 if (pp != NULL) { 973 /*LINTED: constant in conditional context */ 974 VN_DISPOSE(pp, B_INVAL, 0, kcred); 975 } 976 ANON_PRINT(A_ANON, ("anon_decref: free ap %p, vp %p\n", 977 (void *)ap, (void *)ap->an_vp)); 978 kmem_cache_free(anon_cache, ap); 979 980 ANI_ADD(1); 981 } else { 982 mutex_exit(ahm); 983 } 984 } 985 986 static int 987 anon_share(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots) 988 { 989 struct anon *ap; 990 991 while (nslots-- > 0) { 992 if ((ap = anon_get_ptr(ahp, anon_index)) != NULL && 993 ap->an_refcnt > 1) 994 return (1); 995 anon_index++; 996 } 997 998 return (0); 999 } 1000 1001 static void 1002 anon_decref_pages( 1003 struct anon_hdr *ahp, 1004 ulong_t an_idx, 1005 uint_t szc) 1006 { 1007 struct anon *ap = anon_get_ptr(ahp, an_idx); 1008 kmutex_t *ahmpages = NULL; 1009 page_t *pp; 1010 pgcnt_t pgcnt = page_get_pagecnt(szc); 1011 pgcnt_t i; 1012 struct vnode *vp; 1013 anoff_t off; 1014 kmutex_t *ahm; 1015 #ifdef DEBUG 1016 int refcnt = 1; 1017 #endif 1018 1019 ASSERT(szc != 0); 1020 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1021 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1022 1023 VM_STAT_ADD(anonvmstats.decrefpages[0]); 1024 1025 if (ap != NULL) { 1026 ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1027 mutex_enter(ahmpages); 1028 ASSERT((refcnt = ap->an_refcnt) != 0); 1029 VM_STAT_ADD(anonvmstats.decrefpages[1]); 1030 if (ap->an_refcnt == 1) { 1031 VM_STAT_ADD(anonvmstats.decrefpages[2]); 1032 ASSERT(!anon_share(ahp, an_idx, pgcnt)); 1033 mutex_exit(ahmpages); 1034 ahmpages = NULL; 1035 } 1036 } 1037 1038 i = 0; 1039 while (i < pgcnt) { 1040 if ((ap = anon_get_ptr(ahp, an_idx + i)) == NULL) { 1041 ASSERT(refcnt == 1 && ahmpages == NULL); 1042 i++; 1043 continue; 1044 } 1045 ASSERT(ap->an_refcnt == refcnt); 1046 ASSERT(ahmpages != NULL || ap->an_refcnt == 1); 1047 ASSERT(ahmpages == NULL || ap->an_refcnt > 1); 1048 1049 if (ahmpages == NULL) { 1050 swap_xlate(ap, &vp, &off); 1051 pp = page_lookup(vp, (u_offset_t)off, SE_EXCL); 1052 if (pp == NULL || pp->p_szc == 0) { 1053 VM_STAT_ADD(anonvmstats.decrefpages[3]); 1054 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, 1055 ap->an_off)]; 1056 (void) anon_set_ptr(ahp, an_idx + i, NULL, 1057 ANON_SLEEP); 1058 mutex_enter(ahm); 1059 ap->an_refcnt--; 1060 ASSERT(ap->an_refcnt == 0); 1061 anon_rmhash(ap); 1062 if (ap->an_pvp) 1063 swap_phys_free(ap->an_pvp, ap->an_poff, 1064 PAGESIZE); 1065 mutex_exit(ahm); 1066 if (pp != NULL) { 1067 VM_STAT_ADD(anonvmstats.decrefpages[4]); 1068 /*LINTED*/ 1069 VN_DISPOSE(pp, B_INVAL, 0, kcred); 1070 } 1071 kmem_cache_free(anon_cache, ap); 1072 ANI_ADD(1); 1073 i++; 1074 } else { 1075 pgcnt_t j; 1076 pgcnt_t curpgcnt = 1077 page_get_pagecnt(pp->p_szc); 1078 size_t ppasize = curpgcnt * sizeof (page_t *); 1079 page_t **ppa = kmem_alloc(ppasize, KM_SLEEP); 1080 int dispose = 0; 1081 1082 VM_STAT_ADD(anonvmstats.decrefpages[5]); 1083 1084 ASSERT(pp->p_szc <= szc); 1085 ASSERT(IS_P2ALIGNED(curpgcnt, curpgcnt)); 1086 ASSERT(IS_P2ALIGNED(i, curpgcnt)); 1087 ASSERT(i + curpgcnt <= pgcnt); 1088 ASSERT(!(page_pptonum(pp) & (curpgcnt - 1))); 1089 ppa[0] = pp; 1090 for (j = i + 1; j < i + curpgcnt; j++) { 1091 ap = anon_get_ptr(ahp, an_idx + j); 1092 ASSERT(ap != NULL && 1093 ap->an_refcnt == 1); 1094 swap_xlate(ap, &vp, &off); 1095 pp = page_lookup(vp, (u_offset_t)off, 1096 SE_EXCL); 1097 if (pp == NULL) 1098 
panic("anon_decref_pages: " 1099 "no page"); 1100 1101 (void) hat_pageunload(pp, 1102 HAT_FORCE_PGUNLOAD); 1103 ASSERT(pp->p_szc == ppa[0]->p_szc); 1104 ASSERT(page_pptonum(pp) - 1 == 1105 page_pptonum(ppa[j - i - 1])); 1106 ppa[j - i] = pp; 1107 if (ap->an_pvp != NULL && 1108 !vn_matchopval(ap->an_pvp, 1109 VOPNAME_DISPOSE, 1110 (fs_generic_func_p)fs_dispose)) 1111 dispose = 1; 1112 } 1113 if (!dispose) { 1114 VM_STAT_ADD(anonvmstats.decrefpages[6]); 1115 page_destroy_pages(ppa[0]); 1116 } else { 1117 VM_STAT_ADD(anonvmstats.decrefpages[7]); 1118 for (j = 0; j < curpgcnt; j++) { 1119 ASSERT(PAGE_EXCL(ppa[j])); 1120 ppa[j]->p_szc = 0; 1121 } 1122 for (j = 0; j < curpgcnt; j++) { 1123 ASSERT(!hat_page_is_mapped( 1124 ppa[j])); 1125 /*LINTED*/ 1126 VN_DISPOSE(ppa[j], B_INVAL, 0, 1127 kcred); 1128 } 1129 } 1130 kmem_free(ppa, ppasize); 1131 for (j = i; j < i + curpgcnt; j++) { 1132 ap = anon_get_ptr(ahp, an_idx + j); 1133 ASSERT(ap != NULL && 1134 ap->an_refcnt == 1); 1135 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, 1136 ap->an_off)]; 1137 (void) anon_set_ptr(ahp, an_idx + j, 1138 NULL, ANON_SLEEP); 1139 mutex_enter(ahm); 1140 ap->an_refcnt--; 1141 ASSERT(ap->an_refcnt == 0); 1142 anon_rmhash(ap); 1143 if (ap->an_pvp) 1144 swap_phys_free(ap->an_pvp, 1145 ap->an_poff, PAGESIZE); 1146 mutex_exit(ahm); 1147 kmem_cache_free(anon_cache, ap); 1148 ANI_ADD(1); 1149 } 1150 i += curpgcnt; 1151 } 1152 } else { 1153 VM_STAT_ADD(anonvmstats.decrefpages[8]); 1154 (void) anon_set_ptr(ahp, an_idx + i, NULL, ANON_SLEEP); 1155 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1156 mutex_enter(ahm); 1157 ap->an_refcnt--; 1158 mutex_exit(ahm); 1159 i++; 1160 } 1161 } 1162 1163 if (ahmpages != NULL) { 1164 mutex_exit(ahmpages); 1165 } 1166 } 1167 1168 /* 1169 * Duplicate references to size bytes worth of anon pages. 1170 * Used when duplicating a segment that contains private anon pages. 1171 * This code assumes that procedure calling this one has already used 1172 * hat_chgprot() to disable write access to the range of addresses that 1173 * that *old actually refers to. 1174 */ 1175 void 1176 anon_dup(struct anon_hdr *old, ulong_t old_idx, struct anon_hdr *new, 1177 ulong_t new_idx, size_t size) 1178 { 1179 spgcnt_t npages; 1180 kmutex_t *ahm; 1181 struct anon *ap; 1182 ulong_t off; 1183 ulong_t index; 1184 1185 npages = btopr(size); 1186 while (npages > 0) { 1187 index = old_idx; 1188 if ((ap = anon_get_next_ptr(old, &index)) == NULL) 1189 break; 1190 1191 ASSERT(!ANON_ISBUSY(anon_get_slot(old, index))); 1192 off = index - old_idx; 1193 npages -= off; 1194 if (npages <= 0) 1195 break; 1196 1197 (void) anon_set_ptr(new, new_idx + off, ap, ANON_SLEEP); 1198 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1199 1200 mutex_enter(ahm); 1201 ap->an_refcnt++; 1202 mutex_exit(ahm); 1203 1204 off++; 1205 new_idx += off; 1206 old_idx += off; 1207 npages--; 1208 } 1209 } 1210 1211 /* 1212 * Just like anon_dup but also guarantees there are no holes (unallocated anon 1213 * slots) within any large page region. That means if a large page region is 1214 * empty in the old array it will skip it. If there are 1 or more valid slots 1215 * in the large page region of the old array it will make sure to fill in any 1216 * unallocated ones and also copy them to the new array. If noalloc is 1 large 1217 * page region should either have no valid anon slots or all slots should be 1218 * valid. 
1219 */ 1220 void 1221 anon_dup_fill_holes( 1222 struct anon_hdr *old, 1223 ulong_t old_idx, 1224 struct anon_hdr *new, 1225 ulong_t new_idx, 1226 size_t size, 1227 uint_t szc, 1228 int noalloc) 1229 { 1230 struct anon *ap; 1231 spgcnt_t npages; 1232 kmutex_t *ahm, *ahmpages = NULL; 1233 pgcnt_t pgcnt, i; 1234 ulong_t index, off; 1235 #ifdef DEBUG 1236 int refcnt; 1237 #endif 1238 1239 ASSERT(szc != 0); 1240 pgcnt = page_get_pagecnt(szc); 1241 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1242 npages = btopr(size); 1243 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1244 ASSERT(IS_P2ALIGNED(old_idx, pgcnt)); 1245 1246 VM_STAT_ADD(anonvmstats.dupfillholes[0]); 1247 1248 while (npages > 0) { 1249 index = old_idx; 1250 1251 /* 1252 * Find the next valid slot. 1253 */ 1254 if (anon_get_next_ptr(old, &index) == NULL) 1255 break; 1256 1257 ASSERT(!ANON_ISBUSY(anon_get_slot(old, index))); 1258 /* 1259 * Now backup index to the beginning of the 1260 * current large page region of the old array. 1261 */ 1262 index = P2ALIGN(index, pgcnt); 1263 off = index - old_idx; 1264 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1265 npages -= off; 1266 if (npages <= 0) 1267 break; 1268 1269 /* 1270 * Fill and copy a large page regions worth 1271 * of anon slots. 1272 */ 1273 for (i = 0; i < pgcnt; i++) { 1274 if ((ap = anon_get_ptr(old, index + i)) == NULL) { 1275 if (noalloc) { 1276 panic("anon_dup_fill_holes: " 1277 "empty anon slot\n"); 1278 } 1279 VM_STAT_ADD(anonvmstats.dupfillholes[1]); 1280 ap = anon_alloc(NULL, 0); 1281 (void) anon_set_ptr(old, index + i, ap, 1282 ANON_SLEEP); 1283 } else if (i == 0) { 1284 /* 1285 * make the increment of all refcnts of all 1286 * anon slots of a large page appear atomic by 1287 * getting an anonpages_hash_lock for the 1288 * first anon slot of a large page. 1289 */ 1290 int hash = AH_LOCK(ap->an_vp, ap->an_off); 1291 1292 VM_STAT_ADD(anonvmstats.dupfillholes[2]); 1293 1294 ahmpages = &anonpages_hash_lock[hash]; 1295 mutex_enter(ahmpages); 1296 /*LINTED*/ 1297 ASSERT(refcnt = ap->an_refcnt); 1298 1299 VM_STAT_COND_ADD(ap->an_refcnt > 1, 1300 anonvmstats.dupfillholes[3]); 1301 } 1302 (void) anon_set_ptr(new, new_idx + off + i, ap, 1303 ANON_SLEEP); 1304 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1305 mutex_enter(ahm); 1306 ASSERT(ahmpages != NULL || ap->an_refcnt == 1); 1307 ASSERT(i == 0 || ahmpages == NULL || 1308 refcnt == ap->an_refcnt); 1309 ap->an_refcnt++; 1310 mutex_exit(ahm); 1311 } 1312 if (ahmpages != NULL) { 1313 mutex_exit(ahmpages); 1314 ahmpages = NULL; 1315 } 1316 off += pgcnt; 1317 new_idx += off; 1318 old_idx += off; 1319 npages -= pgcnt; 1320 } 1321 } 1322 1323 /* 1324 * Used when a segment with a vnode changes szc. similarly to 1325 * anon_dup_fill_holes() makes sure each large page region either has no anon 1326 * slots or all of them. but new slots are created by COWing the file 1327 * pages. on entrance no anon slots should be shared. 
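 * (That precondition is enforced below: every slot this routine touches
 * is asserted to have an_refcnt == 1.)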
1328 */ 1329 int 1330 anon_fill_cow_holes( 1331 struct seg *seg, 1332 caddr_t addr, 1333 struct anon_hdr *ahp, 1334 ulong_t an_idx, 1335 struct vnode *vp, 1336 u_offset_t vp_off, 1337 size_t size, 1338 uint_t szc, 1339 uint_t prot, 1340 struct vpage vpage[], 1341 struct cred *cred) 1342 { 1343 struct anon *ap; 1344 spgcnt_t npages; 1345 pgcnt_t pgcnt, i; 1346 ulong_t index, off; 1347 int err = 0; 1348 int pageflags = 0; 1349 1350 ASSERT(szc != 0); 1351 pgcnt = page_get_pagecnt(szc); 1352 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1353 npages = btopr(size); 1354 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1355 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1356 1357 while (npages > 0) { 1358 index = an_idx; 1359 1360 /* 1361 * Find the next valid slot. 1362 */ 1363 if (anon_get_next_ptr(ahp, &index) == NULL) { 1364 break; 1365 } 1366 1367 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1368 /* 1369 * Now backup index to the beginning of the 1370 * current large page region of the anon array. 1371 */ 1372 index = P2ALIGN(index, pgcnt); 1373 off = index - an_idx; 1374 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1375 npages -= off; 1376 if (npages <= 0) 1377 break; 1378 an_idx += off; 1379 vp_off += ptob(off); 1380 addr += ptob(off); 1381 if (vpage != NULL) { 1382 vpage += off; 1383 } 1384 1385 for (i = 0; i < pgcnt; i++, an_idx++, vp_off += PAGESIZE) { 1386 if ((ap = anon_get_ptr(ahp, an_idx)) == NULL) { 1387 page_t *pl[1 + 1]; 1388 page_t *pp; 1389 1390 err = VOP_GETPAGE(vp, vp_off, PAGESIZE, NULL, 1391 pl, PAGESIZE, seg, addr, S_READ, cred); 1392 if (err) { 1393 break; 1394 } 1395 if (vpage != NULL) { 1396 prot = VPP_PROT(vpage); 1397 pageflags = VPP_ISPPLOCK(vpage) ? 1398 LOCK_PAGE : 0; 1399 } 1400 pp = anon_private(&ap, seg, addr, prot, pl[0], 1401 pageflags, cred); 1402 if (pp == NULL) { 1403 err = ENOMEM; 1404 break; 1405 } 1406 (void) anon_set_ptr(ahp, an_idx, ap, 1407 ANON_SLEEP); 1408 page_unlock(pp); 1409 } 1410 ASSERT(ap->an_refcnt == 1); 1411 addr += PAGESIZE; 1412 if (vpage != NULL) { 1413 vpage++; 1414 } 1415 } 1416 npages -= pgcnt; 1417 } 1418 1419 return (err); 1420 } 1421 1422 /* 1423 * Free a group of "size" anon pages, size in bytes, 1424 * and clear out the pointers to the anon entries. 1425 */ 1426 void 1427 anon_free(struct anon_hdr *ahp, ulong_t index, size_t size) 1428 { 1429 spgcnt_t npages; 1430 struct anon *ap; 1431 ulong_t old; 1432 1433 npages = btopr(size); 1434 1435 while (npages > 0) { 1436 old = index; 1437 if ((ap = anon_get_next_ptr(ahp, &index)) == NULL) 1438 break; 1439 1440 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1441 npages -= index - old; 1442 if (npages <= 0) 1443 break; 1444 1445 (void) anon_set_ptr(ahp, index, NULL, ANON_SLEEP); 1446 anon_decref(ap); 1447 /* 1448 * Bump index and decrement page count 1449 */ 1450 index++; 1451 npages--; 1452 } 1453 } 1454 1455 void 1456 anon_free_pages( 1457 struct anon_hdr *ahp, 1458 ulong_t an_idx, 1459 size_t size, 1460 uint_t szc) 1461 { 1462 spgcnt_t npages; 1463 pgcnt_t pgcnt; 1464 ulong_t index, off; 1465 1466 ASSERT(szc != 0); 1467 pgcnt = page_get_pagecnt(szc); 1468 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1469 npages = btopr(size); 1470 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1471 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1472 1473 VM_STAT_ADD(anonvmstats.freepages[0]); 1474 1475 while (npages > 0) { 1476 index = an_idx; 1477 1478 /* 1479 * Find the next valid slot. 
		 */
		if (anon_get_next_ptr(ahp, &index) == NULL)
			break;

		ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index)));
		/*
		 * Now backup index to the beginning of the
		 * current large page region of the old array.
		 */
		index = P2ALIGN(index, pgcnt);
		off = index - an_idx;
		ASSERT(IS_P2ALIGNED(off, pgcnt));
		npages -= off;
		if (npages <= 0)
			break;

		anon_decref_pages(ahp, index, szc);

		off += pgcnt;
		an_idx += off;
		npages -= pgcnt;
	}
}

/*
 * Make anonymous pages discardable
 */
void
anon_disclaim(struct anon_map *amp, ulong_t index, size_t size, int flags)
{
	spgcnt_t npages = btopr(size);
	struct anon *ap;
	struct vnode *vp;
	anoff_t off;
	page_t *pp, *root_pp;
	kmutex_t *ahm;
	pgcnt_t pgcnt;
	ulong_t old_idx, idx, i;
	struct anon_hdr *ahp = amp->ahp;
	anon_sync_obj_t cookie;

	ASSERT(RW_READ_HELD(&amp->a_rwlock));
	pgcnt = 1;
	for (; npages > 0; index = (pgcnt == 1) ? index + 1 :
	    P2ROUNDUP(index + 1, pgcnt), npages -= pgcnt) {

		/*
		 * get anon pointer and index for the first valid entry
		 * in the anon list, starting from "index"
		 */
		old_idx = index;
		if ((ap = anon_get_next_ptr(ahp, &index)) == NULL)
			break;

		/*
		 * decrement npages by number of NULL anon slots we skipped
		 */
		npages -= index - old_idx;
		if (npages <= 0)
			break;

		anon_array_enter(amp, index, &cookie);
		ap = anon_get_ptr(ahp, index);
		ASSERT(ap != NULL);

		/*
		 * Get anonymous page and try to lock it SE_EXCL;
		 * For non blocking case if we couldn't grab the lock
		 * we skip to next page.
		 * For blocking case (ANON_PGLOOKUP_BLK) block
		 * until we grab SE_EXCL lock.
		 */
		swap_xlate(ap, &vp, &off);
		if (flags & ANON_PGLOOKUP_BLK)
			pp = page_lookup_create(vp, (u_offset_t)off,
			    SE_EXCL, NULL, NULL, SE_EXCL_WANTED);
		else
			pp = page_lookup_nowait(vp, (u_offset_t)off, SE_EXCL);
		if (pp == NULL) {
			segadvstat.MADV_FREE_miss.value.ul++;
			pgcnt = 1;
			anon_array_exit(&cookie);
			continue;
		}
		pgcnt = page_get_pagecnt(pp->p_szc);

		/*
		 * we cannot free a page which is permanently locked.
		 * The page_struct_lock need not be acquired to examine
		 * these fields since the page has an "exclusive" lock.
		 */
		if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
			page_unlock(pp);
			segadvstat.MADV_FREE_miss.value.ul++;
			anon_array_exit(&cookie);
			continue;
		}

		ahm = &anonhash_lock[AH_LOCK(vp, off)];
		mutex_enter(ahm);
		ASSERT(ap->an_refcnt != 0);
		/*
		 * skip this one if copy-on-write is not yet broken.
		 */
		if (ap->an_refcnt > 1) {
			mutex_exit(ahm);
			page_unlock(pp);
			segadvstat.MADV_FREE_miss.value.ul++;
			anon_array_exit(&cookie);
			continue;
		}

		if (pp->p_szc == 0) {
			pgcnt = 1;

			/*
			 * free swap slot;
			 */
			if (ap->an_pvp) {
				swap_phys_free(ap->an_pvp, ap->an_poff,
				    PAGESIZE);
				ap->an_pvp = NULL;
				ap->an_poff = 0;
			}
			mutex_exit(ahm);
			segadvstat.MADV_FREE_hit.value.ul++;

			/*
			 * while we are at it, unload all the translations
			 * and attempt to free the page.
1610 */ 1611 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1612 /*LINTED: constant in conditional context */ 1613 VN_DISPOSE(pp, B_FREE, 0, kcred); 1614 anon_array_exit(&cookie); 1615 continue; 1616 } 1617 1618 pgcnt = page_get_pagecnt(pp->p_szc); 1619 if (!IS_P2ALIGNED(index, pgcnt)) { 1620 if (!page_try_demote_pages(pp)) { 1621 mutex_exit(ahm); 1622 page_unlock(pp); 1623 segadvstat.MADV_FREE_miss.value.ul++; 1624 anon_array_exit(&cookie); 1625 continue; 1626 } else { 1627 pgcnt = 1; 1628 if (ap->an_pvp) { 1629 swap_phys_free(ap->an_pvp, 1630 ap->an_poff, PAGESIZE); 1631 ap->an_pvp = NULL; 1632 ap->an_poff = 0; 1633 } 1634 mutex_exit(ahm); 1635 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1636 /*LINTED*/ 1637 VN_DISPOSE(pp, B_FREE, 0, kcred); 1638 segadvstat.MADV_FREE_hit.value.ul++; 1639 anon_array_exit(&cookie); 1640 continue; 1641 } 1642 } 1643 mutex_exit(ahm); 1644 root_pp = pp; 1645 1646 /* 1647 * try to lock remaining pages 1648 */ 1649 for (idx = 1; idx < pgcnt; idx++) { 1650 pp++; 1651 if (!page_trylock(pp, SE_EXCL)) 1652 break; 1653 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1654 page_unlock(pp); 1655 break; 1656 } 1657 } 1658 1659 if (idx == pgcnt) { 1660 for (i = 0; i < pgcnt; i++) { 1661 ap = anon_get_ptr(ahp, index + i); 1662 if (ap == NULL) 1663 break; 1664 swap_xlate(ap, &vp, &off); 1665 ahm = &anonhash_lock[AH_LOCK(vp, off)]; 1666 mutex_enter(ahm); 1667 ASSERT(ap->an_refcnt != 0); 1668 1669 /* 1670 * skip this one if copy-on-write 1671 * is not yet broken. 1672 */ 1673 if (ap->an_refcnt > 1) { 1674 mutex_exit(ahm); 1675 goto skiplp; 1676 } 1677 if (ap->an_pvp) { 1678 swap_phys_free(ap->an_pvp, 1679 ap->an_poff, PAGESIZE); 1680 ap->an_pvp = NULL; 1681 ap->an_poff = 0; 1682 } 1683 mutex_exit(ahm); 1684 } 1685 page_destroy_pages(root_pp); 1686 segadvstat.MADV_FREE_hit.value.ul += pgcnt; 1687 anon_array_exit(&cookie); 1688 continue; 1689 } 1690 skiplp: 1691 segadvstat.MADV_FREE_miss.value.ul += pgcnt; 1692 for (i = 0, pp = root_pp; i < idx; pp++, i++) 1693 page_unlock(pp); 1694 anon_array_exit(&cookie); 1695 } 1696 } 1697 1698 /* 1699 * Return the kept page(s) and protections back to the segment driver. 1700 */ 1701 int 1702 anon_getpage( 1703 struct anon **app, 1704 uint_t *protp, 1705 page_t *pl[], 1706 size_t plsz, 1707 struct seg *seg, 1708 caddr_t addr, 1709 enum seg_rw rw, 1710 struct cred *cred) 1711 { 1712 page_t *pp; 1713 struct anon *ap = *app; 1714 struct vnode *vp; 1715 anoff_t off; 1716 int err; 1717 kmutex_t *ahm; 1718 1719 swap_xlate(ap, &vp, &off); 1720 1721 /* 1722 * Lookup the page. If page is being paged in, 1723 * wait for it to finish as we must return a list of 1724 * pages since this routine acts like the VOP_GETPAGE 1725 * routine does. 1726 */ 1727 if (pl != NULL && (pp = page_lookup(vp, (u_offset_t)off, SE_SHARED))) { 1728 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1729 mutex_enter(ahm); 1730 if (ap->an_refcnt == 1) 1731 *protp = PROT_ALL; 1732 else 1733 *protp = PROT_ALL & ~PROT_WRITE; 1734 mutex_exit(ahm); 1735 pl[0] = pp; 1736 pl[1] = NULL; 1737 return (0); 1738 } 1739 1740 /* 1741 * Simply treat it as a vnode fault on the anon vp. 
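	 * (The vnode returned by swap_xlate() is normally anon_vp, which
	 * anon_init() associated with swap_vnodeops, so the VOP_GETPAGE
	 * below is serviced by swapfs.)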
1742 */ 1743 1744 TRACE_3(TR_FAC_VM, TR_ANON_GETPAGE, 1745 "anon_getpage:seg %x addr %x vp %x", 1746 seg, addr, vp); 1747 1748 err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, protp, pl, plsz, 1749 seg, addr, rw, cred); 1750 1751 if (err == 0 && pl != NULL) { 1752 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1753 mutex_enter(ahm); 1754 if (ap->an_refcnt != 1) 1755 *protp &= ~PROT_WRITE; /* make read-only */ 1756 mutex_exit(ahm); 1757 } 1758 return (err); 1759 } 1760 1761 /* 1762 * Creates or returns kept pages to the segment driver. returns -1 if a large 1763 * page cannot be allocated. returns -2 if some other process has allocated a 1764 * larger page. 1765 * 1766 * For cowfault it will alocate any size pages to fill the requested area to 1767 * avoid partially overwritting anon slots (i.e. sharing only some of the anon 1768 * slots within a large page with other processes). This policy greatly 1769 * simplifies large page freeing (which is only freed when all anon slot 1770 * refcnts are 0). 1771 */ 1772 int 1773 anon_map_getpages( 1774 struct anon_map *amp, 1775 ulong_t start_idx, 1776 uint_t szc, 1777 struct seg *seg, 1778 caddr_t addr, 1779 uint_t prot, 1780 uint_t *protp, 1781 page_t *ppa[], 1782 uint_t *ppa_szc, 1783 struct vpage vpage[], 1784 enum seg_rw rw, 1785 int brkcow, 1786 int anypgsz, 1787 struct cred *cred) 1788 { 1789 pgcnt_t pgcnt; 1790 struct anon *ap; 1791 struct vnode *vp; 1792 anoff_t off; 1793 page_t *pp, *pl[2], *conpp = NULL; 1794 caddr_t vaddr; 1795 ulong_t pg_idx, an_idx, i; 1796 spgcnt_t nreloc = 0; 1797 int prealloc = 1; 1798 int err, slotcreate; 1799 uint_t vpprot; 1800 1801 #if !defined(__i386) && !defined(__amd64) 1802 ASSERT(seg->s_szc != 0); 1803 #endif 1804 ASSERT(szc <= seg->s_szc); 1805 ASSERT(ppa_szc != NULL); 1806 ASSERT(rw != S_CREATE); 1807 1808 *protp = PROT_ALL; 1809 1810 VM_STAT_ADD(anonvmstats.getpages[0]); 1811 1812 if (szc == 0) { 1813 VM_STAT_ADD(anonvmstats.getpages[1]); 1814 if ((ap = anon_get_ptr(amp->ahp, start_idx)) != NULL) { 1815 err = anon_getpage(&ap, protp, pl, PAGESIZE, seg, 1816 addr, rw, cred); 1817 if (err) 1818 return (err); 1819 ppa[0] = pl[0]; 1820 if (brkcow == 0 || (*protp & PROT_WRITE)) { 1821 VM_STAT_ADD(anonvmstats.getpages[2]); 1822 if (ppa[0]->p_szc != 0) { 1823 VM_STAT_ADD(anonvmstats.getpages[3]); 1824 *ppa_szc = ppa[0]->p_szc; 1825 page_unlock(ppa[0]); 1826 return (-2); 1827 } 1828 return (0); 1829 } 1830 panic("anon_map_getpages: cowfault for szc 0"); 1831 } else { 1832 VM_STAT_ADD(anonvmstats.getpages[4]); 1833 ppa[0] = anon_zero(seg, addr, &ap, cred); 1834 if (ppa[0] == NULL) 1835 return (ENOMEM); 1836 (void) anon_set_ptr(amp->ahp, start_idx, ap, 1837 ANON_SLEEP); 1838 return (0); 1839 } 1840 } 1841 1842 pgcnt = page_get_pagecnt(szc); 1843 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1844 ASSERT(IS_P2ALIGNED(start_idx, pgcnt)); 1845 1846 /* 1847 * First we check for the case that the requtested large 1848 * page or larger page already exists in the system. 1849 * Actually we only check if the first constituent page 1850 * exists and only preallocate if it's not found. 
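	 * If the page found is larger than the requested size we return -2
	 * so the caller can retry with that size; if it is exactly the
	 * requested size there is no need to preallocate.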
1851 */ 1852 ap = anon_get_ptr(amp->ahp, start_idx); 1853 if (ap) { 1854 uint_t pszc; 1855 swap_xlate(ap, &vp, &off); 1856 if (page_exists_forreal(vp, (u_offset_t)off, &pszc)) { 1857 if (pszc > szc) { 1858 *ppa_szc = pszc; 1859 return (-2); 1860 } 1861 if (pszc == szc) { 1862 prealloc = 0; 1863 } 1864 } 1865 } 1866 1867 VM_STAT_COND_ADD(prealloc == 0, anonvmstats.getpages[5]); 1868 VM_STAT_COND_ADD(prealloc != 0, anonvmstats.getpages[6]); 1869 1870 top: 1871 /* 1872 * If a smaller page or no page at all was found, 1873 * grab a large page off the freelist. 1874 */ 1875 if (prealloc) { 1876 ASSERT(conpp == NULL); 1877 if (page_alloc_pages(anon_vp, seg, addr, NULL, ppa, 1878 szc, 0) != 0) { 1879 VM_STAT_ADD(anonvmstats.getpages[7]); 1880 if (brkcow == 0 || 1881 !anon_share(amp->ahp, start_idx, pgcnt)) { 1882 /* 1883 * If the refcnt's of all anon slots are <= 1 1884 * they can't increase since we are holding 1885 * the address space's lock. So segvn can 1886 * safely decrease szc without risking to 1887 * generate a cow fault for the region smaller 1888 * than the segment's largest page size. 1889 */ 1890 VM_STAT_ADD(anonvmstats.getpages[8]); 1891 return (-1); 1892 } 1893 docow: 1894 /* 1895 * This is a cow fault. Copy away the entire 1 large 1896 * page region of this segment. 1897 */ 1898 if (szc != seg->s_szc) 1899 panic("anon_map_getpages: cowfault for szc %d", 1900 szc); 1901 vaddr = addr; 1902 for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt; 1903 pg_idx++, an_idx++, vaddr += PAGESIZE) { 1904 if ((ap = anon_get_ptr(amp->ahp, an_idx)) != 1905 NULL) { 1906 err = anon_getpage(&ap, &vpprot, pl, 1907 PAGESIZE, seg, vaddr, rw, cred); 1908 if (err) { 1909 for (i = 0; i < pg_idx; i++) { 1910 if ((pp = ppa[i]) != 1911 NULL) 1912 page_unlock(pp); 1913 } 1914 return (err); 1915 } 1916 ppa[pg_idx] = pl[0]; 1917 } else { 1918 /* 1919 * Since this is a cowfault we know 1920 * that this address space has a 1921 * parent or children which means 1922 * anon_dup_fill_holes() has initialized 1923 * all anon slots within a large page 1924 * region that had at least one anon 1925 * slot at the time of fork(). 1926 */ 1927 panic("anon_map_getpages: " 1928 "cowfault but anon slot is empty"); 1929 } 1930 } 1931 VM_STAT_ADD(anonvmstats.getpages[9]); 1932 *protp = PROT_ALL; 1933 return (anon_map_privatepages(amp, start_idx, szc, seg, 1934 addr, prot, ppa, vpage, anypgsz, cred)); 1935 } 1936 } 1937 1938 VM_STAT_ADD(anonvmstats.getpages[10]); 1939 1940 an_idx = start_idx; 1941 pg_idx = 0; 1942 vaddr = addr; 1943 while (pg_idx < pgcnt) { 1944 slotcreate = 0; 1945 if ((ap = anon_get_ptr(amp->ahp, an_idx)) == NULL) { 1946 VM_STAT_ADD(anonvmstats.getpages[11]); 1947 /* 1948 * For us to have decided not to preallocate 1949 * would have meant that a large page 1950 * was found. Which also means that all of the 1951 * anon slots for that page would have been 1952 * already created for us. 1953 */ 1954 if (prealloc == 0) 1955 panic("anon_map_getpages: prealloc = 0"); 1956 1957 slotcreate = 1; 1958 ap = anon_alloc(NULL, 0); 1959 } 1960 swap_xlate(ap, &vp, &off); 1961 1962 /* 1963 * Now setup our preallocated page to pass down 1964 * to swap_getpage(). 1965 */ 1966 if (prealloc) { 1967 ASSERT(ppa[pg_idx]->p_szc == szc); 1968 conpp = ppa[pg_idx]; 1969 } 1970 ASSERT(prealloc || conpp == NULL); 1971 1972 /* 1973 * If we just created this anon slot then call 1974 * with S_CREATE to prevent doing IO on the page. 1975 * Similar to the anon_zero case. 
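		 * (A just-created slot has nothing on backing store yet, so
		 * S_CREATE keeps swap_getconpage() from doing I/O; the page
		 * is zeroed further down in the slotcreate case.)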
1976 */ 1977 err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE, 1978 NULL, pl, PAGESIZE, conpp, &nreloc, seg, vaddr, 1979 slotcreate == 1 ? S_CREATE : rw, cred); 1980 1981 if (err) { 1982 VM_STAT_ADD(anonvmstats.getpages[12]); 1983 ASSERT(slotcreate == 0); 1984 goto io_err; 1985 } 1986 1987 pp = pl[0]; 1988 1989 if (pp->p_szc != szc) { 1990 VM_STAT_ADD(anonvmstats.getpages[13]); 1991 ASSERT(slotcreate == 0); 1992 ASSERT(prealloc == 0); 1993 ASSERT(pg_idx == 0); 1994 if (pp->p_szc > szc) { 1995 page_unlock(pp); 1996 VM_STAT_ADD(anonvmstats.getpages[14]); 1997 return (-2); 1998 } 1999 page_unlock(pp); 2000 prealloc = 1; 2001 goto top; 2002 } 2003 2004 /* 2005 * If we decided to preallocate but VOP_GETPAGE 2006 * found a page in the system that satisfies our 2007 * request then free up our preallocated large page 2008 * and continue looping accross the existing large 2009 * page via VOP_GETPAGE. 2010 */ 2011 if (prealloc && pp != ppa[pg_idx]) { 2012 VM_STAT_ADD(anonvmstats.getpages[15]); 2013 ASSERT(slotcreate == 0); 2014 ASSERT(pg_idx == 0); 2015 conpp = NULL; 2016 prealloc = 0; 2017 page_free_pages(ppa[0]); 2018 } 2019 2020 if (prealloc && nreloc > 1) { 2021 /* 2022 * we have relocated out of a smaller large page. 2023 * skip npgs - 1 iterations and continue which will 2024 * increment by one the loop indices. 2025 */ 2026 spgcnt_t npgs = nreloc; 2027 2028 VM_STAT_ADD(anonvmstats.getpages[16]); 2029 2030 ASSERT(pp == ppa[pg_idx]); 2031 ASSERT(slotcreate == 0); 2032 ASSERT(pg_idx + npgs <= pgcnt); 2033 if ((*protp & PROT_WRITE) && 2034 anon_share(amp->ahp, an_idx, npgs)) { 2035 *protp &= ~PROT_WRITE; 2036 } 2037 pg_idx += npgs; 2038 an_idx += npgs; 2039 vaddr += PAGESIZE * npgs; 2040 continue; 2041 } 2042 2043 VM_STAT_ADD(anonvmstats.getpages[17]); 2044 2045 /* 2046 * Anon_zero case. 2047 */ 2048 if (slotcreate) { 2049 ASSERT(prealloc); 2050 pagezero(pp, 0, PAGESIZE); 2051 CPU_STATS_ADD_K(vm, zfod, 1); 2052 hat_setrefmod(pp); 2053 } 2054 2055 ASSERT(prealloc == 0 || ppa[pg_idx] == pp); 2056 ASSERT(prealloc != 0 || PAGE_SHARED(pp)); 2057 ASSERT(prealloc == 0 || PAGE_EXCL(pp)); 2058 2059 if (pg_idx > 0 && 2060 ((page_pptonum(pp) != page_pptonum(ppa[pg_idx - 1]) + 1) || 2061 (pp->p_szc != ppa[pg_idx - 1]->p_szc))) 2062 panic("anon_map_getpages: unexpected page"); 2063 2064 if (prealloc == 0) { 2065 ppa[pg_idx] = pp; 2066 } 2067 2068 if (ap->an_refcnt > 1) { 2069 VM_STAT_ADD(anonvmstats.getpages[18]); 2070 *protp &= ~PROT_WRITE; 2071 } 2072 2073 /* 2074 * If this is a new anon slot then initialize 2075 * the anon array entry. 2076 */ 2077 if (slotcreate) { 2078 (void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP); 2079 } 2080 pg_idx++; 2081 an_idx++; 2082 vaddr += PAGESIZE; 2083 } 2084 2085 /* 2086 * Since preallocated pages come off the freelist 2087 * they are locked SE_EXCL. Simply downgrade and return. 2088 */ 2089 if (prealloc) { 2090 VM_STAT_ADD(anonvmstats.getpages[19]); 2091 conpp = NULL; 2092 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { 2093 page_downgrade(ppa[pg_idx]); 2094 } 2095 } 2096 ASSERT(conpp == NULL); 2097 2098 if (brkcow == 0 || (*protp & PROT_WRITE)) { 2099 VM_STAT_ADD(anonvmstats.getpages[20]); 2100 return (0); 2101 } 2102 2103 if (szc < seg->s_szc) 2104 panic("anon_map_getpages: cowfault for szc %d", szc); 2105 2106 VM_STAT_ADD(anonvmstats.getpages[21]); 2107 2108 *protp = PROT_ALL; 2109 return (anon_map_privatepages(amp, start_idx, szc, seg, addr, prot, 2110 ppa, vpage, anypgsz, cred)); 2111 io_err: 2112 /* 2113 * We got an IO error somewhere in our large page. 
	 * If we were using a preallocated page then just demote
	 * all the constituent pages that we've succeeded with so far
	 * to PAGESIZE pages and leave them in the system
	 * unlocked.
	 */

	ASSERT(err != -2 || pg_idx == 0);

	VM_STAT_COND_ADD(err > 0, anonvmstats.getpages[22]);
	VM_STAT_COND_ADD(err == -1, anonvmstats.getpages[23]);
	VM_STAT_COND_ADD(err == -2, anonvmstats.getpages[24]);

	if (prealloc) {
		conpp = NULL;
		if (pg_idx > 0) {
			VM_STAT_ADD(anonvmstats.getpages[25]);
			for (i = 0; i < pgcnt; i++) {
				pp = ppa[i];
				ASSERT(PAGE_EXCL(pp));
				ASSERT(pp->p_szc == szc);
				pp->p_szc = 0;
			}
			for (i = 0; i < pg_idx; i++) {
				ASSERT(!hat_page_is_mapped(ppa[i]));
				page_unlock(ppa[i]);
			}
			/*
			 * Now free up the remaining unused constituent
			 * pages.
			 */
			while (pg_idx < pgcnt) {
				ASSERT(!hat_page_is_mapped(ppa[pg_idx]));
				page_free(ppa[pg_idx], 0);
				pg_idx++;
			}
		} else {
			VM_STAT_ADD(anonvmstats.getpages[26]);
			page_free_pages(ppa[0]);
		}
	} else {
		VM_STAT_ADD(anonvmstats.getpages[27]);
		ASSERT(err > 0);
		for (i = 0; i < pg_idx; i++)
			page_unlock(ppa[i]);
	}
	ASSERT(conpp == NULL);
	if (err != -1)
		return (err);
	/*
	 * We are here because we failed to relocate.
	 */
	ASSERT(prealloc);
	if (brkcow == 0 || !anon_share(amp->ahp, start_idx, pgcnt)) {
		VM_STAT_ADD(anonvmstats.getpages[28]);
		return (-1);
	}
	VM_STAT_ADD(anonvmstats.getpages[29]);
	goto docow;
}


/*
 * Turn a reference to an object or shared anon page
 * into a private page with a copy of the data from the
 * original page which is always locked by the caller.
 * This routine unloads the translation and unlocks the
 * original page, if it isn't being stolen, before returning
 * to the caller.
 *
 * NOTE:  The original anon slot is not freed by this routine.
 *	  It must be freed by the caller while holding the
 *	  "anon_map" lock to prevent races which can occur if
 *	  a process has multiple lwps in its address space.
 */
page_t *
anon_private(
	struct anon **app,
	struct seg *seg,
	caddr_t addr,
	uint_t prot,
	page_t *opp,
	int oppflags,
	struct cred *cred)
{
	struct anon *old = *app;
	struct anon *new;
	page_t *pp = NULL;
	struct vnode *vp;
	anoff_t off;
	page_t *anon_pl[1 + 1];
	int err;

	if (oppflags & STEAL_PAGE)
		ASSERT(PAGE_EXCL(opp));
	else
		ASSERT(PAGE_LOCKED(opp));

	CPU_STATS_ADD_K(vm, cow_fault, 1);

	/* Kernel probe */
	TNF_PROBE_1(anon_private, "vm pagefault", /* CSTYLED */,
	    tnf_opaque, address, addr);

	*app = new = anon_alloc(NULL, 0);
	swap_xlate(new, &vp, &off);

	if (oppflags & STEAL_PAGE) {
		page_rename(opp, vp, (u_offset_t)off);
		pp = opp;
		TRACE_5(TR_FAC_VM, TR_ANON_PRIVATE,
		    "anon_private:seg %p addr %x pp %p vp %p off %lx",
		    seg, addr, pp, vp, off);
		hat_setmod(pp);

		/* bug 4026339 */
		page_downgrade(pp);
		return (pp);
	}

	/*
	 * Call the VOP_GETPAGE routine to create the page, thereby
	 * enabling the vnode driver to allocate any filesystem
	 * space (e.g., disk block allocation for UFS).  This also
	 * prevents more than one page from being added to the
	 * vnode at the same time.
	 */
	err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, NULL,
	    anon_pl, PAGESIZE, seg, addr, S_CREATE, cred);
	if (err)
		goto out;

	pp = anon_pl[0];

	/*
	 * If the original page was locked, we need to move the lock
	 * to the new page by transferring 'cowcnt/lckcnt' of the original
	 * page to 'cowcnt/lckcnt' of the new page.
	 *
	 * See Statement at the beginning of segvn_lockop() and
	 * comments in page_pp_useclaim() regarding the way
	 * cowcnts/lckcnts are handled.
	 *
	 * Also availrmem must be decremented up front for read only mapping
	 * before calling page_pp_useclaim. page_pp_useclaim will bump it back
	 * if availrmem did not need to be decremented after all.
	 */
	if (oppflags & LOCK_PAGE) {
		if ((prot & PROT_WRITE) == 0) {
			mutex_enter(&freemem_lock);
			if (availrmem > pages_pp_maximum) {
				availrmem--;
				pages_useclaim++;
			} else {
				mutex_exit(&freemem_lock);
				goto out;
			}
			mutex_exit(&freemem_lock);
		}
		page_pp_useclaim(opp, pp, prot & PROT_WRITE);
	}

	/*
	 * Now copy the contents from the original page,
	 * which is locked and loaded in the MMU by
	 * the caller to prevent yet another page fault.
	 */
	ppcopy(opp, pp);		/* XXX - should set mod bit in here */

	hat_setrefmod(pp);		/* mark as modified */

	/*
	 * Unload the old translation.
	 */
	hat_unload(seg->s_as->a_hat, addr, PAGESIZE, HAT_UNLOAD);

	/*
	 * Free the unmapped, unmodified original page,
	 * or release the lock on the original page,
	 * otherwise the process will sleep forever in
	 * anon_decref() waiting for the "exclusive" lock
	 * on the page.
	 */
	(void) page_release(opp, 1);

	/*
	 * We are done with page creation, so downgrade the new
	 * page's selock to shared. This helps when multiple
	 * as_fault(...SOFTLOCK...) calls are done to the same
	 * page (aio).
	 */
	page_downgrade(pp);

	/*
	 * NOTE:  The original anon slot must be freed by the
	 * caller while holding the "anon_map" lock, if we
	 * copied away from an anonymous page.
	 */
	return (pp);

out:
	*app = old;
	if (pp)
		page_unlock(pp);
	anon_decref(new);
	page_unlock(opp);
	return ((page_t *)NULL);
}

int
anon_map_privatepages(
	struct anon_map *amp,
	ulong_t start_idx,
	uint_t szc,
	struct seg *seg,
	caddr_t addr,
	uint_t prot,
	page_t *ppa[],
	struct vpage vpage[],
	int anypgsz,
	struct cred *cred)
{
	pgcnt_t pgcnt;
	struct vnode *vp;
	anoff_t off;
	page_t *pl[2], *conpp = NULL;
	int err;
	int prealloc = 1;
	struct anon *ap, *oldap;
	caddr_t vaddr;
	page_t *pplist, *pp;
	ulong_t pg_idx, an_idx;
	spgcnt_t nreloc = 0;
	int pagelock = 0;
	kmutex_t *ahmpages = NULL;
#ifdef DEBUG
	int refcnt;
#endif

	ASSERT(szc != 0);
	ASSERT(szc == seg->s_szc);

	VM_STAT_ADD(anonvmstats.privatepages[0]);

	pgcnt = page_get_pagecnt(szc);
	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
	ASSERT(IS_P2ALIGNED(start_idx, pgcnt));

	ASSERT(amp != NULL);
	ap = anon_get_ptr(amp->ahp, start_idx);
	ASSERT(ap == NULL || ap->an_refcnt >= 1);

	VM_STAT_COND_ADD(ap == NULL, anonvmstats.privatepages[1]);

	/*
	 * Now try and allocate the large page. If we fail then just
	 * let VOP_GETPAGE give us PAGESIZE pages.  Normally we let
	 * the caller make this decision but to avoid added complexity
	 * it's simpler to handle that case here.
	 */
	if (anypgsz == -1) {
		VM_STAT_ADD(anonvmstats.privatepages[2]);
		prealloc = 0;
	} else if (page_alloc_pages(anon_vp, seg, addr, &pplist, NULL, szc,
	    anypgsz) != 0) {
		VM_STAT_ADD(anonvmstats.privatepages[3]);
		prealloc = 0;
	}

	/*
	 * Make the decrement of all refcnts of all
	 * anon slots of a large page appear atomic by
	 * getting an anonpages_hash_lock for the
	 * first anon slot of a large page.
	 */
	if (ap != NULL) {
		ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp,
		    ap->an_off)];
		mutex_enter(ahmpages);
		if (ap->an_refcnt == 1) {
			VM_STAT_ADD(anonvmstats.privatepages[4]);
			ASSERT(!anon_share(amp->ahp, start_idx, pgcnt));
			mutex_exit(ahmpages);

			if (prealloc) {
				page_free_replacement_page(pplist);
				page_create_putback(pgcnt);
			}
			ASSERT(ppa[0]->p_szc <= szc);
			if (ppa[0]->p_szc == szc) {
				VM_STAT_ADD(anonvmstats.privatepages[5]);
				return (0);
			}
			for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
				ASSERT(ppa[pg_idx] != NULL);
				page_unlock(ppa[pg_idx]);
			}
			return (-1);
		}
	}

	/*
	 * If we are passed in the vpage array and this is
	 * not PROT_WRITE then we need to decrement availrmem
	 * up front before we try anything. If we need to and
	 * can't decrement availrmem then it's better to fail now
	 * than in the middle of processing the new large page.
	 * page_pp_useclaim() on behalf of each constituent page
	 * below will adjust availrmem back for the cases not needed.
	 */
	if (vpage != NULL && (prot & PROT_WRITE) == 0) {
		for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
			if (VPP_ISPPLOCK(&vpage[pg_idx])) {
				pagelock = 1;
				break;
			}
		}
		if (pagelock) {
			VM_STAT_ADD(anonvmstats.privatepages[6]);
			mutex_enter(&freemem_lock);
			if (availrmem >= pages_pp_maximum + pgcnt) {
				availrmem -= pgcnt;
				pages_useclaim += pgcnt;
			} else {
				VM_STAT_ADD(anonvmstats.privatepages[7]);
				mutex_exit(&freemem_lock);
				if (ahmpages != NULL) {
					mutex_exit(ahmpages);
				}
				if (prealloc) {
					page_free_replacement_page(pplist);
					page_create_putback(pgcnt);
				}
				for (pg_idx = 0; pg_idx < pgcnt; pg_idx++)
					if (ppa[pg_idx] != NULL)
						page_unlock(ppa[pg_idx]);
				return (ENOMEM);
			}
			mutex_exit(&freemem_lock);
		}
	}

	CPU_STATS_ADD_K(vm, cow_fault, pgcnt);

	VM_STAT_ADD(anonvmstats.privatepages[8]);

	an_idx = start_idx;
	pg_idx = 0;
	vaddr = addr;
	for (; pg_idx < pgcnt; pg_idx++, an_idx++, vaddr += PAGESIZE) {
		ASSERT(ppa[pg_idx] != NULL);
		oldap = anon_get_ptr(amp->ahp, an_idx);
		ASSERT(ahmpages != NULL || oldap == NULL);
		ASSERT(ahmpages == NULL || oldap != NULL);
		ASSERT(ahmpages == NULL || oldap->an_refcnt > 1);
		ASSERT(ahmpages == NULL || pg_idx != 0 ||
		    (refcnt = oldap->an_refcnt));
		ASSERT(ahmpages == NULL || pg_idx == 0 ||
		    refcnt == oldap->an_refcnt);

		ap = anon_alloc(NULL, 0);

		swap_xlate(ap, &vp, &off);

		/*
		 * Now setup our preallocated page to pass down to
		 * swap_getpage().
		 */
		if (prealloc) {
			pp = pplist;
			page_sub(&pplist, pp);
			conpp = pp;
		}

		err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE, NULL, pl,
		    PAGESIZE, conpp, &nreloc, seg, vaddr, S_CREATE, cred);

		/*
		 * Impossible to fail since this is S_CREATE.
		 */
		if (err)
			panic("anon_map_privatepages: VOP_GETPAGE failed");

		ASSERT(prealloc ? pp == pl[0] : pl[0]->p_szc == 0);
		ASSERT(prealloc == 0 || nreloc == 1);

		pp = pl[0];

		/*
		 * If the original page was locked, we need to move
		 * the lock to the new page by transferring
		 * 'cowcnt/lckcnt' of the original page to 'cowcnt/lckcnt'
		 * of the new page. pg_idx can be used to index
		 * into the vpage array since the caller will guarantee
		 * that the vpage struct passed in corresponds to addr
		 * and forward.
		 */
		if (vpage != NULL && VPP_ISPPLOCK(&vpage[pg_idx])) {
			page_pp_useclaim(ppa[pg_idx], pp, prot & PROT_WRITE);
		} else if (pagelock) {
			mutex_enter(&freemem_lock);
			availrmem++;
			pages_useclaim--;
			mutex_exit(&freemem_lock);
		}

		/*
		 * Now copy the contents from the original page.
		 */
		ppcopy(ppa[pg_idx], pp);

		hat_setrefmod(pp);		/* mark as modified */

		/*
		 * Release the lock on the original page,
		 * decrement the old slot, and downgrade the lock
		 * on the new copy.
		 */
		page_unlock(ppa[pg_idx]);

		if (!prealloc)
			page_downgrade(pp);

		ppa[pg_idx] = pp;

		/*
		 * Now reflect the copy in the new anon array.
		 */
		ASSERT(ahmpages == NULL || oldap->an_refcnt > 1);
		if (oldap != NULL)
			anon_decref(oldap);
		(void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP);
	}
	if (ahmpages != NULL) {
		mutex_exit(ahmpages);
	}
	ASSERT(prealloc == 0 || pplist == NULL);
	if (prealloc) {
		VM_STAT_ADD(anonvmstats.privatepages[9]);
		for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
			page_downgrade(ppa[pg_idx]);
		}
	}

	/*
	 * Unload the old large page translation.
	 */
	hat_unload(seg->s_as->a_hat, addr, pgcnt << PAGESHIFT, HAT_UNLOAD);
	return (0);
}

/*
 * Allocate a private zero-filled anon page.
 */
page_t *
anon_zero(struct seg *seg, caddr_t addr, struct anon **app, struct cred *cred)
{
	struct anon *ap;
	page_t *pp;
	struct vnode *vp;
	anoff_t off;
	page_t *anon_pl[1 + 1];
	int err;

	/* Kernel probe */
	TNF_PROBE_1(anon_zero, "vm pagefault", /* CSTYLED */,
	    tnf_opaque, address, addr);

	*app = ap = anon_alloc(NULL, 0);
	swap_xlate(ap, &vp, &off);

	/*
	 * Call the VOP_GETPAGE routine to create the page, thereby
	 * enabling the vnode driver to allocate any filesystem
	 * dependent structures (e.g., disk block allocation for UFS).
	 * This also prevents more than one page from being added to
	 * the vnode at the same time since it is locked.
	 */
	err = VOP_GETPAGE(vp, off, PAGESIZE, NULL,
	    anon_pl, PAGESIZE, seg, addr, S_CREATE, cred);
	if (err) {
		*app = NULL;
		anon_decref(ap);
		return (NULL);
	}
	pp = anon_pl[0];

	pagezero(pp, 0, PAGESIZE);	/* XXX - should set mod bit */
	page_downgrade(pp);
	CPU_STATS_ADD_K(vm, zfod, 1);
	hat_setrefmod(pp);	/* mark as modified so pageout writes back */
	return (pp);
}


/*
 * Allocate an array of private zero-filled anon pages for empty slots
 * and kept pages for non-empty slots within the given range.
 *
 * NOTE: This routine will try and use large pages
 *	 if available and supported by the underlying platform.
 */
int
anon_map_createpages(
	struct anon_map *amp,
	ulong_t start_index,
	size_t len,
	page_t *ppa[],
	struct seg *seg,
	caddr_t addr,
	enum seg_rw rw,
	struct cred *cred)
{

	struct anon *ap;
	struct vnode *ap_vp;
	page_t *pp, *pplist, *anon_pl[1 + 1], *conpp = NULL;
	int err = 0;
	ulong_t p_index, index;
	pgcnt_t npgs, pg_cnt;
	spgcnt_t nreloc = 0;
	uint_t l_szc, szc, prot;
	anoff_t ap_off;
	size_t pgsz;
	lgrp_t *lgrp;

	/*
	 * XXX For now only handle S_CREATE.
	 */
	ASSERT(rw == S_CREATE);

	index = start_index;
	p_index = 0;
	npgs = btopr(len);

	/*
	 * If this platform supports multiple page sizes
	 * then try and allocate directly from the free
	 * list for pages larger than PAGESIZE.
	 *
	 * NOTE: When we have page_create_ru we can stop
	 *	 directly allocating from the freelist.
	 */
	l_szc = seg->s_szc;
	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
	while (npgs) {

		/*
		 * If the anon slot already exists (i.e., the page has
		 * been created), then:
		 *	1) look up the page
		 *	2) if the page is still in memory, get it
		 *	3) if not, create a page and page it in from the
		 *	   physical swap device.
		 * These are done in anon_getpage().
		 */
		ap = anon_get_ptr(amp->ahp, index);
		if (ap) {
			err = anon_getpage(&ap, &prot, anon_pl, PAGESIZE,
			    seg, addr, S_READ, cred);
			if (err) {
				ANON_LOCK_EXIT(&amp->a_rwlock);
				panic("anon_map_createpages: anon_getpage");
			}
			pp = anon_pl[0];
			ppa[p_index++] = pp;

			addr += PAGESIZE;
			index++;
			npgs--;
			continue;
		}
		/*
		 * Now try and allocate the largest page possible
		 * for the current address and range.
		 * Keep dropping down in page size until:
		 *
		 *	1) Properly aligned
		 *	2) Does not overlap existing anon pages
		 *	3) Fits in remaining range
		 *	4) Able to allocate one.
		 *
		 * NOTE: XXX When page_create_ru is completed this code
		 *	 will change.
		 */
		szc = l_szc;
		pplist = NULL;
		pg_cnt = 0;
		while (szc) {
			pgsz = page_get_pagesize(szc);
			pg_cnt = pgsz >> PAGESHIFT;
			if (IS_P2ALIGNED(addr, pgsz) && pg_cnt <= npgs &&
			    anon_pages(amp->ahp, index, pg_cnt) == 0) {
				/*
				 * XXX
				 * Since we are faking page_create()
				 * we also need to do the freemem and
				 * pcf accounting.
				 */
				(void) page_create_wait(pg_cnt, PG_WAIT);

				/*
				 * Get the lgroup to allocate the next page of
				 * shared memory from and use it to specify
				 * where to allocate the physical memory.
				 */
				lgrp = lgrp_mem_choose(seg, addr, pgsz);

				pplist = page_get_freelist(
				    anon_vp, (u_offset_t)0, seg,
				    addr, pgsz, 0, lgrp);

				if (pplist == NULL) {
					page_create_putback(pg_cnt);
				}

				/*
				 * If a request for a page of size
				 * larger than PAGESIZE failed
				 * then don't try that size anymore.
				 */
				if (pplist == NULL) {
					l_szc = szc - 1;
				} else {
					break;
				}
			}
			szc--;
		}

		/*
		 * If just using PAGESIZE pages then don't
		 * directly allocate from the free list.
		 */
		if (pplist == NULL) {
			ASSERT(szc == 0);
			pp = anon_zero(seg, addr, &ap, cred);
			if (pp == NULL) {
				ANON_LOCK_EXIT(&amp->a_rwlock);
				panic("anon_map_createpages: anon_zero");
			}
			ppa[p_index++] = pp;

			ASSERT(anon_get_ptr(amp->ahp, index) == NULL);
			(void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP);

			addr += PAGESIZE;
			index++;
			npgs--;
			continue;
		}

		/*
		 * pplist is a list of pg_cnt PAGESIZE pages.
		 * These pages are locked SE_EXCL since they
		 * came directly off the free list.
		 */
		ASSERT(IS_P2ALIGNED(pg_cnt, pg_cnt));
		ASSERT(IS_P2ALIGNED(index, pg_cnt));
		ASSERT(conpp == NULL);
		while (pg_cnt--) {

			ap = anon_alloc(NULL, 0);
			swap_xlate(ap, &ap_vp, &ap_off);

			ASSERT(pplist != NULL);
			pp = pplist;
			page_sub(&pplist, pp);
			PP_CLRFREE(pp);
			PP_CLRAGED(pp);
			conpp = pp;

			err = swap_getconpage(ap_vp, ap_off, PAGESIZE,
			    (uint_t *)NULL, anon_pl, PAGESIZE, conpp, &nreloc,
			    seg, addr, S_CREATE, cred);

			if (err) {
				ANON_LOCK_EXIT(&amp->a_rwlock);
				panic("anon_map_createpages: S_CREATE");
			}

			ASSERT(anon_pl[0] == pp);
			ASSERT(nreloc == 1);
			pagezero(pp, 0, PAGESIZE);
			CPU_STATS_ADD_K(vm, zfod, 1);
			hat_setrefmod(pp);

			ASSERT(anon_get_ptr(amp->ahp, index) == NULL);
			(void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP);

			ppa[p_index++] = pp;

			addr += PAGESIZE;
			index++;
			npgs--;
		}
		conpp = NULL;
		pg_cnt = pgsz >> PAGESHIFT;
		p_index = p_index - pg_cnt;
		while (pg_cnt--) {
			page_downgrade(ppa[p_index++]);
		}
	}
	ANON_LOCK_EXIT(&amp->a_rwlock);
	return (0);
}

int
anon_map_demotepages(
	struct anon_map *amp,
	ulong_t start_idx,
	struct seg *seg,
	caddr_t addr,
	uint_t prot,
	struct vpage vpage[],
	struct cred *cred)
{
	struct anon *ap;
	uint_t szc = seg->s_szc;
	pgcnt_t pgcnt = page_get_pagecnt(szc);
	size_t ppasize = pgcnt * sizeof (page_t *);
	page_t **ppa = kmem_alloc(ppasize, KM_SLEEP);
	page_t *pp;
	page_t *pl[2];
	pgcnt_t i, pg_idx;
	ulong_t an_idx;
	caddr_t vaddr;
	kmutex_t *ahmpages = NULL;
	int err;
	int retry = 0;
	uint_t vpprot;

	ASSERT(RW_WRITE_HELD(&amp->a_rwlock));
	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
	ASSERT(IS_P2ALIGNED(start_idx, pgcnt));
	ASSERT(ppa != NULL);

	VM_STAT_ADD(anonvmstats.demotepages[0]);

	ap = anon_get_ptr(amp->ahp, start_idx);
	if (ap != NULL) {
		VM_STAT_ADD(anonvmstats.demotepages[1]);
		ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, ap->an_off)];
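		/*
		 * As in anon_map_privatepages() above, taking the
		 * anonpages_hash_lock for the first (root) anon slot is
		 * what keeps the an_refcnt check below stable for the
		 * whole large page while we decide whether it can simply
		 * be demoted in place.
		 */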
		mutex_enter(ahmpages);
	}
top:
	if (ap == NULL || ap->an_refcnt <= 1) {
		int root = 0;
		pgcnt_t npgs, curnpgs = 0;

		VM_STAT_ADD(anonvmstats.demotepages[2]);

		ASSERT(retry == 0 || ap != NULL);

		if (ahmpages != NULL)
			mutex_exit(ahmpages);
		an_idx = start_idx;
		for (i = 0; i < pgcnt; i++, an_idx++) {
			ap = anon_get_ptr(amp->ahp, an_idx);
			if (ap != NULL) {
				ASSERT(ap->an_refcnt == 1);
				pp = ppa[i] = page_lookup(ap->an_vp, ap->an_off,
				    SE_EXCL);
				if (pp != NULL) {
					(void) hat_pageunload(pp,
					    HAT_FORCE_PGUNLOAD);
				}
			} else {
				ppa[i] = NULL;
			}
		}
		for (i = 0; i < pgcnt; i++) {
			if ((pp = ppa[i]) != NULL && pp->p_szc != 0) {
				ASSERT(pp->p_szc <= szc);
				if (!root) {
					VM_STAT_ADD(anonvmstats.demotepages[3]);
					if (curnpgs != 0)
						panic("anon_map_demotepages: "
						    "bad large page");

					root = 1;
					curnpgs = npgs =
					    page_get_pagecnt(pp->p_szc);

					ASSERT(npgs <= pgcnt);
					ASSERT(IS_P2ALIGNED(npgs, npgs));
					ASSERT(!(page_pptonum(pp) &
					    (npgs - 1)));
				} else {
					ASSERT(i > 0);
					ASSERT(page_pptonum(pp) - 1 ==
					    page_pptonum(ppa[i - 1]));
					if ((page_pptonum(pp) & (npgs - 1)) ==
					    npgs - 1)
						root = 0;
				}
				ASSERT(PAGE_EXCL(pp));
				pp->p_szc = 0;
				curnpgs--;
			}
		}
		if (root != 0 || curnpgs != 0)
			panic("anon_map_demotepages: bad large page");

		for (i = 0; i < pgcnt; i++) {
			if ((pp = ppa[i]) != NULL) {
				ASSERT(!hat_page_is_mapped(pp));
				ASSERT(pp->p_szc == 0);
				page_unlock(pp);
			}
		}
		kmem_free(ppa, ppasize);
		return (0);
	}
	ASSERT(ahmpages != NULL);
	mutex_exit(ahmpages);
	ahmpages = NULL;

	VM_STAT_ADD(anonvmstats.demotepages[4]);

	ASSERT(retry == 0);		/* we can be here only once */

	vaddr = addr;
	for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt;
	    pg_idx++, an_idx++, vaddr += PAGESIZE) {
		ap = anon_get_ptr(amp->ahp, an_idx);
		if (ap == NULL)
			panic("anon_map_demotepages: no anon slot");
		err = anon_getpage(&ap, &vpprot, pl, PAGESIZE, seg, vaddr,
		    S_READ, cred);
		if (err) {
			for (i = 0; i < pg_idx; i++) {
				if ((pp = ppa[i]) != NULL)
					page_unlock(pp);
			}
			kmem_free(ppa, ppasize);
			return (err);
		}
		ppa[pg_idx] = pl[0];
	}

	err = anon_map_privatepages(amp, start_idx, szc, seg, addr, prot, ppa,
	    vpage, -1, cred);
	if (err > 0) {
		VM_STAT_ADD(anonvmstats.demotepages[5]);
		kmem_free(ppa, ppasize);
		return (err);
	}
	ASSERT(err == 0 || err == -1);
	if (err == -1) {
		VM_STAT_ADD(anonvmstats.demotepages[6]);
		retry = 1;
		goto top;
	}
	for (i = 0; i < pgcnt; i++) {
		ASSERT(ppa[i] != NULL);
		if (ppa[i]->p_szc != 0)
			retry = 1;
		page_unlock(ppa[i]);
	}
	if (retry) {
		VM_STAT_ADD(anonvmstats.demotepages[7]);
		goto top;
	}

	VM_STAT_ADD(anonvmstats.demotepages[8]);

	kmem_free(ppa, ppasize);

	return (0);
}

/*
 * Allocate and initialize an anon_map structure for seg,
 * associating the given swap reservation with the new anon_map.
 */
struct anon_map *
anonmap_alloc(size_t size, size_t swresv)
{
	struct anon_map *amp;

	amp = kmem_cache_alloc(anonmap_cache, KM_SLEEP);

	amp->refcnt = 1;
	amp->size = size;

	amp->ahp = anon_create(btopr(size), ANON_SLEEP);
	amp->swresv = swresv;
	amp->locality = 0;
	amp->a_szc = 0;
	return (amp);
}

void
anonmap_free(struct anon_map *amp)
{
	ASSERT(amp->ahp);
	ASSERT(amp->refcnt == 0);

	lgrp_shm_policy_fini(amp, NULL);
	anon_release(amp->ahp, btopr(amp->size));
	kmem_cache_free(anonmap_cache, amp);
}

/*
 * Returns true if the app array has some empty slots.
 * The offp and lenp parameters are in/out parameters.  On entry
 * these values represent the starting offset and length of the
 * mapping.  When true is returned, these values may be modified
 * to be the largest range which includes empty slots.
 */
int
non_anon(struct anon_hdr *ahp, ulong_t anon_idx, u_offset_t *offp,
    size_t *lenp)
{
	ulong_t i, el;
	ssize_t low, high;
	struct anon *ap;

	low = -1;
	for (i = 0, el = *lenp; i < el; i += PAGESIZE, anon_idx++) {
		ap = anon_get_ptr(ahp, anon_idx);
		if (ap == NULL) {
			if (low == -1)
				low = i;
			high = i;
		}
	}
	if (low != -1) {
		/*
		 * Found at least one non-anon page.
		 * Set up the off and len return values.
		 */
		if (low != 0)
			*offp += low;
		*lenp = high - low + PAGESIZE;
		return (1);
	}
	return (0);
}

/*
 * Return a count of the number of existing anon pages in the anon array
 * app in the range (off, off+len). The array and slots must be guaranteed
 * stable by the caller.
 */
pgcnt_t
anon_pages(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots)
{
	pgcnt_t cnt = 0;

	while (nslots-- > 0) {
		if ((anon_get_ptr(ahp, anon_index)) != NULL)
			cnt++;
		anon_index++;
	}
	return (cnt);
}

/*
 * Move reserved phys swap into memory swap (unreserve phys swap
 * and reserve mem swap by the same amount).
 * Used by segspt when it needs to lock reserved swap npages in memory.
 */
int
anon_swap_adjust(pgcnt_t npages)
{
	pgcnt_t unlocked_mem_swap;

	mutex_enter(&anoninfo_lock);

	ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
	ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);

	unlocked_mem_swap = k_anoninfo.ani_mem_resv
	    - k_anoninfo.ani_locked_swap;
	if (npages > unlocked_mem_swap) {
		spgcnt_t adjusted_swap = npages - unlocked_mem_swap;

		/*
		 * If there is not enough unlocked mem swap we take the
		 * missing amount from phys swap and give it to mem swap.
		 */
		mutex_enter(&freemem_lock);
		if (availrmem < adjusted_swap + segspt_minfree) {
			mutex_exit(&freemem_lock);
			mutex_exit(&anoninfo_lock);
			return (ENOMEM);
		}
		availrmem -= adjusted_swap;
		mutex_exit(&freemem_lock);

		k_anoninfo.ani_mem_resv += adjusted_swap;
		ASSERT(k_anoninfo.ani_phys_resv >= adjusted_swap);
		k_anoninfo.ani_phys_resv -= adjusted_swap;

		ANI_ADD(adjusted_swap);
	}
	k_anoninfo.ani_locked_swap += npages;

	ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
	ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);

	mutex_exit(&anoninfo_lock);

	return (0);
}

/*
 * 'Unlock' reserved mem swap so that when it is unreserved it
 * can be moved back to phys (disk) swap.
 */
void
anon_swap_restore(pgcnt_t npages)
{
	mutex_enter(&anoninfo_lock);

	ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv);

	ASSERT(k_anoninfo.ani_locked_swap >= npages);
	k_anoninfo.ani_locked_swap -= npages;

	ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv);

	mutex_exit(&anoninfo_lock);
}

/*
 * Return the pointer from the list for a
 * specified anon index.
 */
ulong_t *
anon_get_slot(struct anon_hdr *ahp, ulong_t an_idx)
{
	struct anon **app;
	void **ppp;

	ASSERT(an_idx < ahp->size);

	/*
	 * Single level case.
	 */
	if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) {
		return ((ulong_t *)&ahp->array_chunk[an_idx]);
	} else {

		/*
		 * 2 level case.
		 */
		ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
		if (*ppp == NULL) {
			mutex_enter(&ahp->serial_lock);
			ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
			if (*ppp == NULL)
				*ppp = kmem_zalloc(PAGESIZE, KM_SLEEP);
			mutex_exit(&ahp->serial_lock);
		}
		app = *ppp;
		return ((ulong_t *)&app[an_idx & ANON_CHUNK_OFF]);
	}
}

void
anon_array_enter(struct anon_map *amp, ulong_t an_idx, anon_sync_obj_t *sobj)
{
	ulong_t *ap_slot;
	kmutex_t *mtx;
	kcondvar_t *cv;
	int hash;

	/*
	 * Use szc to determine the anon slot(s) that should appear atomic.
	 * If szc = 0, then lock the anon slot and mark it busy.
	 * If szc > 0, then lock the range of slots by getting the
	 * anon_array_lock for the first anon slot, and mark only the
	 * first anon slot busy to represent the whole range being busy.
	 */

	ASSERT(RW_READ_HELD(&amp->a_rwlock));
	an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc));
	hash = ANON_ARRAY_HASH(amp, an_idx);
	sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex;
	sobj->sync_cv = cv = &anon_array_cv[hash];
	mutex_enter(mtx);
	ap_slot = anon_get_slot(amp->ahp, an_idx);
	while (ANON_ISBUSY(ap_slot))
		cv_wait(cv, mtx);
	ANON_SETBUSY(ap_slot);
	sobj->sync_data = ap_slot;
	mutex_exit(mtx);
}

int
anon_array_try_enter(struct anon_map *amp, ulong_t an_idx,
    anon_sync_obj_t *sobj)
{
	ulong_t *ap_slot;
	kmutex_t *mtx;
	int hash;

	/*
	 * Try to lock a range of anon slots.
	 * Use szc to determine the anon slot(s) that should appear atomic.
	 * If szc = 0, then lock the anon slot and mark it busy.
	 * If szc > 0, then lock the range of slots by getting the
	 * anon_array_lock for the first anon slot, and mark only the
	 * first anon slot busy to represent the whole range being busy.
	 * Fail if the mutex or the anon_array are busy.
	 */

	ASSERT(RW_READ_HELD(&amp->a_rwlock));
	an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc));
	hash = ANON_ARRAY_HASH(amp, an_idx);
	sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex;
	sobj->sync_cv = &anon_array_cv[hash];
	if (!mutex_tryenter(mtx)) {
		return (EWOULDBLOCK);
	}
	ap_slot = anon_get_slot(amp->ahp, an_idx);
	if (ANON_ISBUSY(ap_slot)) {
		mutex_exit(mtx);
		return (EWOULDBLOCK);
	}
	ANON_SETBUSY(ap_slot);
	sobj->sync_data = ap_slot;
	mutex_exit(mtx);
	return (0);
}

void
anon_array_exit(anon_sync_obj_t *sobj)
{
	mutex_enter(sobj->sync_mutex);
	ASSERT(ANON_ISBUSY(sobj->sync_data));
	ANON_CLRBUSY(sobj->sync_data);
	if (CV_HAS_WAITERS(sobj->sync_cv))
		cv_broadcast(sobj->sync_cv);
	mutex_exit(sobj->sync_mutex);
}
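
/*
 * Illustrative sketch only: the function below is not referenced by the
 * rest of this layer; it merely demonstrates the anon_array_enter()/
 * anon_array_exit() protocol that clients such as segvn are expected to
 * follow when touching a single anon slot.  The caller is assumed to
 * already hold the anon_map's a_rwlock at least as reader.
 */
static int
anon_slot_refcnt_example(struct anon_map *amp, ulong_t an_idx)
{
	anon_sync_obj_t cookie;
	struct anon *ap;
	int refcnt = -1;

	ASSERT(RW_READ_HELD(&amp->a_rwlock));

	/* Mark the slot (or its large-page range) busy. */
	anon_array_enter(amp, an_idx, &cookie);

	/* The slot contents are now stable against concurrent lwps. */
	if ((ap = anon_get_ptr(amp->ahp, an_idx)) != NULL)
		refcnt = (int)ap->an_refcnt;

	/* Clear the busy bit and wake any waiters. */
	anon_array_exit(&cookie);

	return (refcnt);
}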