1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 26 /* All Rights Reserved */ 27 28 /* 29 * University Copyright- Copyright (c) 1982, 1986, 1988 30 * The Regents of the University of California 31 * All Rights Reserved 32 * 33 * University Acknowledgment- Portions of this document are derived from 34 * software developed by the University of California, Berkeley, and its 35 * contributors. 36 */ 37 38 /* 39 * VM - anonymous pages. 40 * 41 * This layer sits immediately above the vm_swap layer. It manages 42 * physical pages that have no permanent identity in the file system 43 * name space, using the services of the vm_swap layer to allocate 44 * backing storage for these pages. Since these pages have no external 45 * identity, they are discarded when the last reference is removed. 
 *
 * An important function of this layer is to manage low-level sharing
 * of pages that are logically distinct but that happen to be
 * physically identical (e.g., the corresponding pages of the processes
 * resulting from a fork before one process or the other changes their
 * contents).  This pseudo-sharing is present only as an optimization
 * and is not to be confused with true sharing in which multiple
 * address spaces deliberately contain references to the same object;
 * such sharing is managed at a higher level.
 *
 * The key data structure here is the anon struct, which contains a
 * reference count for its associated physical page and a hint about
 * the identity of that page.  Anon structs typically live in arrays,
 * with an instance's position in its array determining where the
 * corresponding backing storage is allocated; however, the swap_xlate()
 * routine abstracts away this representation information so that the
 * rest of the anon layer need not know it.  (See the swap layer for
 * more details on anon struct layout.)
 *
 * In future versions of the system, the association between an
 * anon struct and its position on backing store will change so that
 * we don't require backing store for all anonymous pages in the system.
 * This is an important consideration for large memory systems.
 * We can also use this technique to delay binding physical locations
 * to anonymous pages until pageout/swapout time where we can make
 * smarter allocation decisions to improve anonymous klustering.
 *
 * Many of the routines defined here take a (struct anon **) argument,
 * which allows the code at this level to manage anon pages directly,
 * so that callers can regard anon structs as opaque objects and not be
 * concerned with assigning or inspecting their contents.
 *
 * Clients of this layer refer to anon pages indirectly.
That is, they 79 * maintain arrays of pointers to anon structs rather than maintaining 80 * anon structs themselves. The (struct anon **) arguments mentioned 81 * above are pointers to entries in these arrays. It is these arrays 82 * that capture the mapping between offsets within a given segment and 83 * the corresponding anonymous backing storage address. 84 */ 85 86 #ifdef DEBUG 87 #define ANON_DEBUG 88 #endif 89 90 #include <sys/types.h> 91 #include <sys/t_lock.h> 92 #include <sys/param.h> 93 #include <sys/systm.h> 94 #include <sys/mman.h> 95 #include <sys/cred.h> 96 #include <sys/thread.h> 97 #include <sys/vnode.h> 98 #include <sys/cpuvar.h> 99 #include <sys/swap.h> 100 #include <sys/cmn_err.h> 101 #include <sys/vtrace.h> 102 #include <sys/kmem.h> 103 #include <sys/sysmacros.h> 104 #include <sys/bitmap.h> 105 #include <sys/vmsystm.h> 106 #include <sys/tuneable.h> 107 #include <sys/debug.h> 108 #include <sys/fs/swapnode.h> 109 #include <sys/tnf_probe.h> 110 #include <sys/lgrp.h> 111 #include <sys/policy.h> 112 #include <sys/condvar_impl.h> 113 #include <sys/mutex_impl.h> 114 #include <sys/rctl.h> 115 116 #include <vm/as.h> 117 #include <vm/hat.h> 118 #include <vm/anon.h> 119 #include <vm/page.h> 120 #include <vm/vpage.h> 121 #include <vm/seg.h> 122 #include <vm/rm.h> 123 124 #include <fs/fs_subr.h> 125 126 struct vnode *anon_vp; 127 128 int anon_debug; 129 130 kmutex_t anoninfo_lock; 131 struct k_anoninfo k_anoninfo; 132 ani_free_t ani_free_pool[ANI_MAX_POOL]; 133 pad_mutex_t anon_array_lock[ANON_LOCKSIZE]; 134 kcondvar_t anon_array_cv[ANON_LOCKSIZE]; 135 136 /* 137 * Global hash table for (vp, off) -> anon slot 138 */ 139 extern int swap_maxcontig; 140 size_t anon_hash_size; 141 struct anon **anon_hash; 142 143 static struct kmem_cache *anon_cache; 144 static struct kmem_cache *anonmap_cache; 145 146 #ifdef VM_STATS 147 static struct anonvmstats_str { 148 ulong_t getpages[30]; 149 ulong_t privatepages[10]; 150 ulong_t demotepages[9]; 151 ulong_t 
decrefpages[9]; 152 ulong_t dupfillholes[4]; 153 ulong_t freepages[1]; 154 } anonvmstats; 155 #endif /* VM_STATS */ 156 157 /*ARGSUSED*/ 158 static int 159 anonmap_cache_constructor(void *buf, void *cdrarg, int kmflags) 160 { 161 struct anon_map *amp = buf; 162 163 rw_init(&->a_rwlock, NULL, RW_DEFAULT, NULL); 164 cv_init(&->a_purgecv, NULL, CV_DEFAULT, NULL); 165 mutex_init(&->a_pmtx, NULL, MUTEX_DEFAULT, NULL); 166 mutex_init(&->a_purgemtx, NULL, MUTEX_DEFAULT, NULL); 167 return (0); 168 } 169 170 /*ARGSUSED1*/ 171 static void 172 anonmap_cache_destructor(void *buf, void *cdrarg) 173 { 174 struct anon_map *amp = buf; 175 176 rw_destroy(&->a_rwlock); 177 cv_destroy(&->a_purgecv); 178 mutex_destroy(&->a_pmtx); 179 mutex_destroy(&->a_purgemtx); 180 } 181 182 kmutex_t anonhash_lock[AH_LOCK_SIZE]; 183 kmutex_t anonpages_hash_lock[AH_LOCK_SIZE]; 184 185 void 186 anon_init(void) 187 { 188 int i; 189 190 anon_hash_size = 1L << highbit(physmem / ANON_HASHAVELEN); 191 192 for (i = 0; i < AH_LOCK_SIZE; i++) { 193 mutex_init(&anonhash_lock[i], NULL, MUTEX_DEFAULT, NULL); 194 mutex_init(&anonpages_hash_lock[i], NULL, MUTEX_DEFAULT, NULL); 195 } 196 197 for (i = 0; i < ANON_LOCKSIZE; i++) { 198 mutex_init(&anon_array_lock[i].pad_mutex, NULL, 199 MUTEX_DEFAULT, NULL); 200 cv_init(&anon_array_cv[i], NULL, CV_DEFAULT, NULL); 201 } 202 203 anon_hash = (struct anon **) 204 kmem_zalloc(sizeof (struct anon *) * anon_hash_size, KM_SLEEP); 205 anon_cache = kmem_cache_create("anon_cache", sizeof (struct anon), 206 AN_CACHE_ALIGN, NULL, NULL, NULL, NULL, NULL, KMC_PREFILL); 207 anonmap_cache = kmem_cache_create("anonmap_cache", 208 sizeof (struct anon_map), 0, 209 anonmap_cache_constructor, anonmap_cache_destructor, NULL, 210 NULL, NULL, 0); 211 swap_maxcontig = (1024 * 1024) >> PAGESHIFT; /* 1MB of pages */ 212 213 anon_vp = vn_alloc(KM_SLEEP); 214 vn_setops(anon_vp, swap_vnodeops); 215 anon_vp->v_type = VREG; 216 anon_vp->v_flag |= (VISSWAP|VISSWAPFS); 217 } 218 219 /* 220 * Global 
anon slot hash table manipulation. 221 */ 222 223 static void 224 anon_addhash(struct anon *ap) 225 { 226 int index; 227 228 ASSERT(MUTEX_HELD(&anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)])); 229 index = ANON_HASH(ap->an_vp, ap->an_off); 230 ap->an_hash = anon_hash[index]; 231 anon_hash[index] = ap; 232 } 233 234 static void 235 anon_rmhash(struct anon *ap) 236 { 237 struct anon **app; 238 239 ASSERT(MUTEX_HELD(&anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)])); 240 241 for (app = &anon_hash[ANON_HASH(ap->an_vp, ap->an_off)]; 242 *app; app = &((*app)->an_hash)) { 243 if (*app == ap) { 244 *app = ap->an_hash; 245 break; 246 } 247 } 248 } 249 250 /* 251 * The anon array interfaces. Functions allocating, 252 * freeing array of pointers, and returning/setting 253 * entries in the array of pointers for a given offset. 254 * 255 * Create the list of pointers 256 */ 257 struct anon_hdr * 258 anon_create(pgcnt_t npages, int flags) 259 { 260 struct anon_hdr *ahp; 261 ulong_t nchunks; 262 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 263 264 if ((ahp = kmem_zalloc(sizeof (struct anon_hdr), kmemflags)) == NULL) { 265 return (NULL); 266 } 267 268 mutex_init(&ahp->serial_lock, NULL, MUTEX_DEFAULT, NULL); 269 /* 270 * Single level case. 271 */ 272 ahp->size = npages; 273 if (npages <= ANON_CHUNK_SIZE || (flags & ANON_ALLOC_FORCE)) { 274 275 if (flags & ANON_ALLOC_FORCE) 276 ahp->flags |= ANON_ALLOC_FORCE; 277 278 ahp->array_chunk = kmem_zalloc( 279 ahp->size * sizeof (struct anon *), kmemflags); 280 281 if (ahp->array_chunk == NULL) { 282 kmem_free(ahp, sizeof (struct anon_hdr)); 283 return (NULL); 284 } 285 } else { 286 /* 287 * 2 Level case. 288 * anon hdr size needs to be rounded off to be a multiple 289 * of ANON_CHUNK_SIZE. This is important as various anon 290 * related functions depend on this. 291 * NOTE - 292 * anon_grow() makes anon hdr size a multiple of 293 * ANON_CHUNK_SIZE. 294 * amp size is <= anon hdr size. 
295 * anon_index + seg_pgs <= anon hdr size. 296 */ 297 ahp->size = P2ROUNDUP(npages, ANON_CHUNK_SIZE); 298 nchunks = ahp->size >> ANON_CHUNK_SHIFT; 299 300 ahp->array_chunk = kmem_zalloc(nchunks * sizeof (ulong_t *), 301 kmemflags); 302 303 if (ahp->array_chunk == NULL) { 304 kmem_free(ahp, sizeof (struct anon_hdr)); 305 return (NULL); 306 } 307 } 308 return (ahp); 309 } 310 311 /* 312 * Free the array of pointers 313 */ 314 void 315 anon_release(struct anon_hdr *ahp, pgcnt_t npages) 316 { 317 ulong_t i; 318 void **ppp; 319 ulong_t nchunks; 320 321 ASSERT(npages <= ahp->size); 322 323 /* 324 * Single level case. 325 */ 326 if (npages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 327 kmem_free(ahp->array_chunk, ahp->size * sizeof (struct anon *)); 328 } else { 329 /* 330 * 2 level case. 331 */ 332 nchunks = ahp->size >> ANON_CHUNK_SHIFT; 333 for (i = 0; i < nchunks; i++) { 334 ppp = &ahp->array_chunk[i]; 335 if (*ppp != NULL) 336 kmem_free(*ppp, PAGESIZE); 337 } 338 kmem_free(ahp->array_chunk, nchunks * sizeof (ulong_t *)); 339 } 340 mutex_destroy(&ahp->serial_lock); 341 kmem_free(ahp, sizeof (struct anon_hdr)); 342 } 343 344 /* 345 * Return the pointer from the list for a 346 * specified anon index. 347 */ 348 struct anon * 349 anon_get_ptr(struct anon_hdr *ahp, ulong_t an_idx) 350 { 351 struct anon **app; 352 353 ASSERT(an_idx < ahp->size); 354 355 /* 356 * Single level case. 357 */ 358 if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 359 return ((struct anon *) 360 ((uintptr_t)ahp->array_chunk[an_idx] & ANON_PTRMASK)); 361 } else { 362 363 /* 364 * 2 level case. 365 */ 366 app = ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 367 if (app) { 368 return ((struct anon *) 369 ((uintptr_t)app[an_idx & ANON_CHUNK_OFF] & 370 ANON_PTRMASK)); 371 } else { 372 return (NULL); 373 } 374 } 375 } 376 377 /* 378 * Return the anon pointer for the first valid entry in the anon list, 379 * starting from the given index. 
380 */ 381 struct anon * 382 anon_get_next_ptr(struct anon_hdr *ahp, ulong_t *index) 383 { 384 struct anon *ap; 385 struct anon **app; 386 ulong_t chunkoff; 387 ulong_t i; 388 ulong_t j; 389 pgcnt_t size; 390 391 i = *index; 392 size = ahp->size; 393 394 ASSERT(i < size); 395 396 if ((size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 397 /* 398 * 1 level case 399 */ 400 while (i < size) { 401 ap = (struct anon *) 402 ((uintptr_t)ahp->array_chunk[i] & ANON_PTRMASK); 403 if (ap) { 404 *index = i; 405 return (ap); 406 } 407 i++; 408 } 409 } else { 410 /* 411 * 2 level case 412 */ 413 chunkoff = i & ANON_CHUNK_OFF; 414 while (i < size) { 415 app = ahp->array_chunk[i >> ANON_CHUNK_SHIFT]; 416 if (app) 417 for (j = chunkoff; j < ANON_CHUNK_SIZE; j++) { 418 ap = (struct anon *) 419 ((uintptr_t)app[j] & ANON_PTRMASK); 420 if (ap) { 421 *index = i + (j - chunkoff); 422 return (ap); 423 } 424 } 425 chunkoff = 0; 426 i = (i + ANON_CHUNK_SIZE) & ~ANON_CHUNK_OFF; 427 } 428 } 429 *index = size; 430 return (NULL); 431 } 432 433 /* 434 * Set list entry with a given pointer for a specified offset 435 */ 436 int 437 anon_set_ptr(struct anon_hdr *ahp, ulong_t an_idx, struct anon *ap, int flags) 438 { 439 void **ppp; 440 struct anon **app; 441 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 442 uintptr_t *ap_addr; 443 444 ASSERT(an_idx < ahp->size); 445 446 /* 447 * Single level case. 448 */ 449 if (ahp->size <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 450 ap_addr = (uintptr_t *)&ahp->array_chunk[an_idx]; 451 } else { 452 453 /* 454 * 2 level case. 
455 */ 456 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 457 458 ASSERT(ppp != NULL); 459 if (*ppp == NULL) { 460 mutex_enter(&ahp->serial_lock); 461 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 462 if (*ppp == NULL) { 463 *ppp = kmem_zalloc(PAGESIZE, kmemflags); 464 if (*ppp == NULL) { 465 mutex_exit(&ahp->serial_lock); 466 return (ENOMEM); 467 } 468 } 469 mutex_exit(&ahp->serial_lock); 470 } 471 app = *ppp; 472 ap_addr = (uintptr_t *)&app[an_idx & ANON_CHUNK_OFF]; 473 } 474 *ap_addr = (*ap_addr & ~ANON_PTRMASK) | (uintptr_t)ap; 475 return (0); 476 } 477 478 /* 479 * Copy anon array into a given new anon array 480 */ 481 int 482 anon_copy_ptr(struct anon_hdr *sahp, ulong_t s_idx, 483 struct anon_hdr *dahp, ulong_t d_idx, 484 pgcnt_t npages, int flags) 485 { 486 void **sapp, **dapp; 487 void *ap; 488 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 489 490 ASSERT((s_idx < sahp->size) && (d_idx < dahp->size)); 491 ASSERT((npages <= sahp->size) && (npages <= dahp->size)); 492 493 /* 494 * Both arrays are 1 level. 495 */ 496 if (((sahp->size <= ANON_CHUNK_SIZE) && 497 (dahp->size <= ANON_CHUNK_SIZE)) || 498 ((sahp->flags & ANON_ALLOC_FORCE) && 499 (dahp->flags & ANON_ALLOC_FORCE))) { 500 501 bcopy(&sahp->array_chunk[s_idx], &dahp->array_chunk[d_idx], 502 npages * sizeof (struct anon *)); 503 return (0); 504 } 505 506 /* 507 * Both arrays are 2 levels. 
508 */ 509 if (sahp->size > ANON_CHUNK_SIZE && 510 dahp->size > ANON_CHUNK_SIZE && 511 ((sahp->flags & ANON_ALLOC_FORCE) == 0) && 512 ((dahp->flags & ANON_ALLOC_FORCE) == 0)) { 513 514 ulong_t sapidx, dapidx; 515 ulong_t *sap, *dap; 516 ulong_t chknp; 517 518 while (npages != 0) { 519 520 sapidx = s_idx & ANON_CHUNK_OFF; 521 dapidx = d_idx & ANON_CHUNK_OFF; 522 chknp = ANON_CHUNK_SIZE - MAX(sapidx, dapidx); 523 if (chknp > npages) 524 chknp = npages; 525 526 sapp = &sahp->array_chunk[s_idx >> ANON_CHUNK_SHIFT]; 527 if ((sap = *sapp) != NULL) { 528 dapp = &dahp->array_chunk[d_idx 529 >> ANON_CHUNK_SHIFT]; 530 if ((dap = *dapp) == NULL) { 531 *dapp = kmem_zalloc(PAGESIZE, 532 kmemflags); 533 if ((dap = *dapp) == NULL) 534 return (ENOMEM); 535 } 536 bcopy((sap + sapidx), (dap + dapidx), 537 chknp << ANON_PTRSHIFT); 538 } 539 s_idx += chknp; 540 d_idx += chknp; 541 npages -= chknp; 542 } 543 return (0); 544 } 545 546 /* 547 * At least one of the arrays is 2 level. 548 */ 549 while (npages--) { 550 if ((ap = anon_get_ptr(sahp, s_idx)) != NULL) { 551 ASSERT(!ANON_ISBUSY(anon_get_slot(sahp, s_idx))); 552 if (anon_set_ptr(dahp, d_idx, ap, flags) == ENOMEM) 553 return (ENOMEM); 554 } 555 s_idx++; 556 d_idx++; 557 } 558 return (0); 559 } 560 561 562 /* 563 * ANON_INITBUF is a convenience macro for anon_grow() below. It 564 * takes a buffer dst, which is at least as large as buffer src. It 565 * does a bcopy from src into dst, and then bzeros the extra bytes 566 * of dst. If tail is set, the data in src is tail aligned within 567 * dst instead of head aligned. 
568 */ 569 570 #define ANON_INITBUF(src, srclen, dst, dstsize, tail) \ 571 if (tail) { \ 572 bzero((dst), (dstsize) - (srclen)); \ 573 bcopy((src), (char *)(dst) + (dstsize) - (srclen), (srclen)); \ 574 } else { \ 575 bcopy((src), (dst), (srclen)); \ 576 bzero((char *)(dst) + (srclen), (dstsize) - (srclen)); \ 577 } 578 579 #define ANON_1_LEVEL_INC (ANON_CHUNK_SIZE / 8) 580 #define ANON_2_LEVEL_INC (ANON_1_LEVEL_INC * ANON_CHUNK_SIZE) 581 582 /* 583 * anon_grow() is used to efficiently extend an existing anon array. 584 * startidx_p points to the index into the anon array of the first page 585 * that is in use. oldseg_pgs is the number of pages in use, starting at 586 * *startidx_p. newpages is the number of additional pages desired. 587 * 588 * If startidx_p == NULL, startidx is taken to be 0 and cannot be changed. 589 * 590 * The growth is done by creating a new top level of the anon array, 591 * and (if the array is 2-level) reusing the existing second level arrays. 592 * 593 * flags can be used to specify ANON_NOSLEEP and ANON_GROWDOWN. 594 * 595 * Returns the new number of pages in the anon array. 596 */ 597 pgcnt_t 598 anon_grow(struct anon_hdr *ahp, ulong_t *startidx_p, pgcnt_t oldseg_pgs, 599 pgcnt_t newseg_pgs, int flags) 600 { 601 ulong_t startidx = startidx_p ? *startidx_p : 0; 602 pgcnt_t oldamp_pgs = ahp->size, newamp_pgs; 603 pgcnt_t oelems, nelems, totpages; 604 void **level1; 605 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 606 int growdown = (flags & ANON_GROWDOWN); 607 size_t newarrsz, oldarrsz; 608 void *level2; 609 610 ASSERT(!(startidx_p == NULL && growdown)); 611 ASSERT(startidx + oldseg_pgs <= ahp->size); 612 613 /* 614 * Determine the total number of pages needed in the new 615 * anon array. If growing down, totpages is all pages from 616 * startidx through the end of the array, plus <newseg_pgs> 617 * pages. If growing up, keep all pages from page 0 through 618 * the last page currently in use, plus <newseg_pgs> pages. 
619 */ 620 if (growdown) 621 totpages = oldamp_pgs - startidx + newseg_pgs; 622 else 623 totpages = startidx + oldseg_pgs + newseg_pgs; 624 625 /* If the array is already large enough, just return. */ 626 627 if (oldamp_pgs >= totpages) { 628 if (growdown) 629 *startidx_p = oldamp_pgs - totpages; 630 return (oldamp_pgs); 631 } 632 633 /* 634 * oldamp_pgs/newamp_pgs are the total numbers of pages represented 635 * by the corresponding arrays. 636 * oelems/nelems are the number of pointers in the top level arrays 637 * which may be either level 1 or level 2. 638 * Will the new anon array be one level or two levels? 639 */ 640 if (totpages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 641 newamp_pgs = P2ROUNDUP(totpages, ANON_1_LEVEL_INC); 642 oelems = oldamp_pgs; 643 nelems = newamp_pgs; 644 } else { 645 newamp_pgs = P2ROUNDUP(totpages, ANON_2_LEVEL_INC); 646 oelems = (oldamp_pgs + ANON_CHUNK_OFF) >> ANON_CHUNK_SHIFT; 647 nelems = newamp_pgs >> ANON_CHUNK_SHIFT; 648 } 649 650 newarrsz = nelems * sizeof (void *); 651 level1 = kmem_alloc(newarrsz, kmemflags); 652 if (level1 == NULL) 653 return (0); 654 655 /* Are we converting from a one level to a two level anon array? */ 656 657 if (newamp_pgs > ANON_CHUNK_SIZE && oldamp_pgs <= ANON_CHUNK_SIZE && 658 !(ahp->flags & ANON_ALLOC_FORCE)) { 659 660 /* 661 * Yes, we're converting to a two level. Reuse old level 1 662 * as new level 2 if it is exactly PAGESIZE. Otherwise 663 * alloc a new level 2 and copy the old level 1 data into it. 
664 */ 665 if (oldamp_pgs == ANON_CHUNK_SIZE) { 666 level2 = (void *)ahp->array_chunk; 667 } else { 668 level2 = kmem_alloc(PAGESIZE, kmemflags); 669 if (level2 == NULL) { 670 kmem_free(level1, newarrsz); 671 return (0); 672 } 673 oldarrsz = oldamp_pgs * sizeof (void *); 674 675 ANON_INITBUF(ahp->array_chunk, oldarrsz, 676 level2, PAGESIZE, growdown); 677 kmem_free(ahp->array_chunk, oldarrsz); 678 } 679 bzero(level1, newarrsz); 680 if (growdown) 681 level1[nelems - 1] = level2; 682 else 683 level1[0] = level2; 684 } else { 685 oldarrsz = oelems * sizeof (void *); 686 687 ANON_INITBUF(ahp->array_chunk, oldarrsz, 688 level1, newarrsz, growdown); 689 kmem_free(ahp->array_chunk, oldarrsz); 690 } 691 692 ahp->array_chunk = level1; 693 ahp->size = newamp_pgs; 694 if (growdown) 695 *startidx_p = newamp_pgs - totpages; 696 697 return (newamp_pgs); 698 } 699 700 701 /* 702 * Called from clock handler to sync ani_free value. 703 */ 704 705 void 706 set_anoninfo(void) 707 { 708 int ix; 709 pgcnt_t total = 0; 710 711 for (ix = 0; ix < ANI_MAX_POOL; ix++) { 712 total += ani_free_pool[ix].ani_count; 713 } 714 k_anoninfo.ani_free = total; 715 } 716 717 /* 718 * Reserve anon space. 719 * 720 * It's no longer simply a matter of incrementing ani_resv to 721 * reserve swap space, we need to check memory-based as well 722 * as disk-backed (physical) swap. The following algorithm 723 * is used: 724 * Check the space on physical swap 725 * i.e. amount needed < ani_max - ani_phys_resv 726 * If we are swapping on swapfs check 727 * amount needed < (availrmem - swapfs_minfree) 728 * Since the algorithm to check for the quantity of swap space is 729 * almost the same as that for reserving it, we'll just use anon_resvmem 730 * with a flag to decrement availrmem. 731 * 732 * Return non-zero on success. 
733 */ 734 int 735 anon_resvmem(size_t size, boolean_t takemem, zone_t *zone, int tryhard) 736 { 737 pgcnt_t npages = btopr(size); 738 pgcnt_t mswap_pages = 0; 739 pgcnt_t pswap_pages = 0; 740 proc_t *p = curproc; 741 742 if (zone != NULL && takemem) { 743 /* test zone.max-swap resource control */ 744 mutex_enter(&p->p_lock); 745 if (rctl_incr_swap(p, zone, ptob(npages)) != 0) { 746 mutex_exit(&p->p_lock); 747 return (0); 748 } 749 mutex_exit(&p->p_lock); 750 } 751 mutex_enter(&anoninfo_lock); 752 753 /* 754 * pswap_pages is the number of pages we can take from 755 * physical (i.e. disk-backed) swap. 756 */ 757 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 758 pswap_pages = k_anoninfo.ani_max - k_anoninfo.ani_phys_resv; 759 760 ANON_PRINT(A_RESV, 761 ("anon_resvmem: npages %lu takemem %u pswap %lu caller %p\n", 762 npages, takemem, pswap_pages, (void *)caller())); 763 764 if (npages <= pswap_pages) { 765 /* 766 * we have enough space on a physical swap 767 */ 768 if (takemem) 769 k_anoninfo.ani_phys_resv += npages; 770 mutex_exit(&anoninfo_lock); 771 return (1); 772 } else if (pswap_pages != 0) { 773 /* 774 * we have some space on a physical swap 775 */ 776 if (takemem) { 777 /* 778 * use up remainder of phys swap 779 */ 780 k_anoninfo.ani_phys_resv += pswap_pages; 781 ASSERT(k_anoninfo.ani_phys_resv == k_anoninfo.ani_max); 782 } 783 } 784 /* 785 * since (npages > pswap_pages) we need mem swap 786 * mswap_pages is the number of pages needed from availrmem 787 */ 788 ASSERT(npages > pswap_pages); 789 mswap_pages = npages - pswap_pages; 790 791 ANON_PRINT(A_RESV, ("anon_resvmem: need %ld pages from memory\n", 792 mswap_pages)); 793 794 /* 795 * priv processes can reserve memory as swap as long as availrmem 796 * remains greater than swapfs_minfree; in the case of non-priv 797 * processes, memory can be reserved as swap only if availrmem 798 * doesn't fall below (swapfs_minfree + swapfs_reserve). 
Thus, 799 * swapfs_reserve amount of memswap is not available to non-priv 800 * processes. This protects daemons such as automounter dying 801 * as a result of application processes eating away almost entire 802 * membased swap. This safeguard becomes useless if apps are run 803 * with root access. 804 * 805 * swapfs_reserve is minimum of 4Mb or 1/16 of physmem. 806 * 807 */ 808 if (tryhard) { 809 pgcnt_t floor_pages; 810 811 if (secpolicy_resource_anon_mem(CRED())) { 812 floor_pages = swapfs_minfree; 813 } else { 814 floor_pages = swapfs_minfree + swapfs_reserve; 815 } 816 817 mutex_exit(&anoninfo_lock); 818 (void) page_reclaim_mem(mswap_pages, floor_pages, 0); 819 mutex_enter(&anoninfo_lock); 820 } 821 822 mutex_enter(&freemem_lock); 823 if (availrmem > (swapfs_minfree + swapfs_reserve + mswap_pages) || 824 (availrmem > (swapfs_minfree + mswap_pages) && 825 secpolicy_resource(CRED()) == 0)) { 826 827 if (takemem) { 828 /* 829 * Take the memory from the rest of the system. 830 */ 831 availrmem -= mswap_pages; 832 mutex_exit(&freemem_lock); 833 k_anoninfo.ani_mem_resv += mswap_pages; 834 ANI_ADD(mswap_pages); 835 ANON_PRINT((A_RESV | A_MRESV), 836 ("anon_resvmem: took %ld pages of availrmem\n", 837 mswap_pages)); 838 } else { 839 mutex_exit(&freemem_lock); 840 } 841 842 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 843 mutex_exit(&anoninfo_lock); 844 return (1); 845 } else { 846 /* 847 * Fail if not enough memory 848 */ 849 if (takemem) { 850 k_anoninfo.ani_phys_resv -= pswap_pages; 851 } 852 853 mutex_exit(&freemem_lock); 854 mutex_exit(&anoninfo_lock); 855 ANON_PRINT(A_RESV, 856 ("anon_resvmem: not enough space from swapfs\n")); 857 if (zone != NULL && takemem) 858 rctl_decr_swap(zone, ptob(npages)); 859 return (0); 860 } 861 } 862 863 /* 864 * Give back an anon reservation. 
865 */ 866 void 867 anon_unresvmem(size_t size, zone_t *zone) 868 { 869 pgcnt_t npages = btopr(size); 870 spgcnt_t mem_free_pages = 0; 871 pgcnt_t phys_free_slots; 872 #ifdef ANON_DEBUG 873 pgcnt_t mem_resv; 874 #endif 875 if (zone != NULL) 876 rctl_decr_swap(zone, ptob(npages)); 877 878 mutex_enter(&anoninfo_lock); 879 880 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 881 882 /* 883 * If some of this reservation belonged to swapfs 884 * give it back to availrmem. 885 * ani_mem_resv is the amount of availrmem swapfs has reserved. 886 * but some of that memory could be locked by segspt so we can only 887 * return non locked ani_mem_resv back to availrmem 888 */ 889 if (k_anoninfo.ani_mem_resv > k_anoninfo.ani_locked_swap) { 890 ANON_PRINT((A_RESV | A_MRESV), 891 ("anon_unresv: growing availrmem by %ld pages\n", 892 MIN(k_anoninfo.ani_mem_resv, npages))); 893 894 mem_free_pages = MIN((spgcnt_t)(k_anoninfo.ani_mem_resv - 895 k_anoninfo.ani_locked_swap), npages); 896 mutex_enter(&freemem_lock); 897 availrmem += mem_free_pages; 898 mutex_exit(&freemem_lock); 899 k_anoninfo.ani_mem_resv -= mem_free_pages; 900 901 ANI_ADD(-mem_free_pages); 902 } 903 /* 904 * The remainder of the pages is returned to phys swap 905 */ 906 ASSERT(npages >= mem_free_pages); 907 phys_free_slots = npages - mem_free_pages; 908 909 if (phys_free_slots) { 910 k_anoninfo.ani_phys_resv -= phys_free_slots; 911 } 912 913 #ifdef ANON_DEBUG 914 mem_resv = k_anoninfo.ani_mem_resv; 915 #endif 916 917 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 918 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 919 920 mutex_exit(&anoninfo_lock); 921 922 ANON_PRINT(A_RESV, ("anon_unresv: %lu, tot %lu, caller %p\n", 923 npages, mem_resv, (void *)caller())); 924 } 925 926 /* 927 * Allocate an anon slot and return it with the lock held. 
928 */ 929 struct anon * 930 anon_alloc(struct vnode *vp, anoff_t off) 931 { 932 struct anon *ap; 933 kmutex_t *ahm; 934 935 ap = kmem_cache_alloc(anon_cache, KM_SLEEP); 936 if (vp == NULL) { 937 swap_alloc(ap); 938 } else { 939 ap->an_vp = vp; 940 ap->an_off = off; 941 } 942 ap->an_refcnt = 1; 943 ap->an_pvp = NULL; 944 ap->an_poff = 0; 945 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 946 mutex_enter(ahm); 947 anon_addhash(ap); 948 mutex_exit(ahm); 949 ANI_ADD(-1); 950 ANON_PRINT(A_ANON, ("anon_alloc: returning ap %p, vp %p\n", 951 (void *)ap, (ap ? (void *)ap->an_vp : NULL))); 952 return (ap); 953 } 954 955 /* 956 * Called for pages locked in memory via softlock/pagelock/mlock to make sure 957 * such pages don't consume any physical swap resources needed for swapping 958 * unlocked pages. 959 */ 960 void 961 anon_swap_free(struct anon *ap, page_t *pp) 962 { 963 kmutex_t *ahm; 964 965 ASSERT(ap != NULL); 966 ASSERT(pp != NULL); 967 ASSERT(PAGE_LOCKED(pp)); 968 ASSERT(pp->p_vnode != NULL); 969 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 970 ASSERT(ap->an_refcnt != 0); 971 ASSERT(pp->p_vnode == ap->an_vp); 972 ASSERT(pp->p_offset == ap->an_off); 973 974 if (ap->an_pvp == NULL) 975 return; 976 977 page_io_lock(pp); 978 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 979 mutex_enter(ahm); 980 981 ASSERT(ap->an_refcnt != 0); 982 ASSERT(pp->p_vnode == ap->an_vp); 983 ASSERT(pp->p_offset == ap->an_off); 984 985 if (ap->an_pvp != NULL) { 986 swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE); 987 ap->an_pvp = NULL; 988 ap->an_poff = 0; 989 mutex_exit(ahm); 990 hat_setmod(pp); 991 } else { 992 mutex_exit(ahm); 993 } 994 page_io_unlock(pp); 995 } 996 997 /* 998 * Decrement the reference count of an anon page. 999 * If reference count goes to zero, free it and 1000 * its associated page (if any). 
1001 */ 1002 void 1003 anon_decref(struct anon *ap) 1004 { 1005 page_t *pp; 1006 struct vnode *vp; 1007 anoff_t off; 1008 kmutex_t *ahm; 1009 1010 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1011 mutex_enter(ahm); 1012 ASSERT(ap->an_refcnt != 0); 1013 if (ap->an_refcnt == 0) 1014 panic("anon_decref: slot count 0"); 1015 if (--ap->an_refcnt == 0) { 1016 swap_xlate(ap, &vp, &off); 1017 anon_rmhash(ap); 1018 if (ap->an_pvp != NULL) 1019 swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE); 1020 mutex_exit(ahm); 1021 1022 /* 1023 * If there is a page for this anon slot we will need to 1024 * call VN_DISPOSE to get rid of the vp association and 1025 * put the page back on the free list as really free. 1026 * Acquire the "exclusive" lock to ensure that any 1027 * pending i/o always completes before the swap slot 1028 * is freed. 1029 */ 1030 pp = page_lookup(vp, (u_offset_t)off, SE_EXCL); 1031 if (pp != NULL) { 1032 /*LINTED: constant in conditional context */ 1033 VN_DISPOSE(pp, B_INVAL, 0, kcred); 1034 } 1035 ANON_PRINT(A_ANON, ("anon_decref: free ap %p, vp %p\n", 1036 (void *)ap, (void *)ap->an_vp)); 1037 1038 kmem_cache_free(anon_cache, ap); 1039 1040 ANI_ADD(1); 1041 } else { 1042 mutex_exit(ahm); 1043 } 1044 } 1045 1046 1047 /* 1048 * check an_refcnt of the root anon slot (anon_index argument is aligned at 1049 * seg->s_szc level) to determine whether COW processing is required. 1050 * anonpages_hash_lock[] held on the root ap ensures that if root's 1051 * refcnt is 1 all other refcnt's are 1 as well (and they can't increase 1052 * later since this process can't fork while its AS lock is held). 1053 * 1054 * returns 1 if the root anon slot has a refcnt > 1 otherwise returns 0. 
1055 */ 1056 int 1057 anon_szcshare(struct anon_hdr *ahp, ulong_t anon_index) 1058 { 1059 struct anon *ap; 1060 kmutex_t *ahmpages = NULL; 1061 1062 ap = anon_get_ptr(ahp, anon_index); 1063 if (ap == NULL) 1064 return (0); 1065 1066 ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1067 mutex_enter(ahmpages); 1068 ASSERT(ap->an_refcnt >= 1); 1069 if (ap->an_refcnt == 1) { 1070 mutex_exit(ahmpages); 1071 return (0); 1072 } 1073 mutex_exit(ahmpages); 1074 return (1); 1075 } 1076 /* 1077 * Check 'nslots' anon slots for refcnt > 1. 1078 * 1079 * returns 1 if any of the 'nslots' anon slots has a refcnt > 1 otherwise 1080 * returns 0. 1081 */ 1082 static int 1083 anon_share(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots) 1084 { 1085 struct anon *ap; 1086 1087 while (nslots-- > 0) { 1088 if ((ap = anon_get_ptr(ahp, anon_index)) != NULL && 1089 ap->an_refcnt > 1) 1090 return (1); 1091 anon_index++; 1092 } 1093 1094 return (0); 1095 } 1096 1097 static void 1098 anon_decref_pages( 1099 struct anon_hdr *ahp, 1100 ulong_t an_idx, 1101 uint_t szc) 1102 { 1103 struct anon *ap = anon_get_ptr(ahp, an_idx); 1104 kmutex_t *ahmpages = NULL; 1105 page_t *pp; 1106 pgcnt_t pgcnt = page_get_pagecnt(szc); 1107 pgcnt_t i; 1108 struct vnode *vp; 1109 anoff_t off; 1110 kmutex_t *ahm; 1111 #ifdef DEBUG 1112 int refcnt = 1; 1113 #endif 1114 1115 ASSERT(szc != 0); 1116 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1117 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1118 ASSERT(an_idx < ahp->size); 1119 1120 if (ahp->size - an_idx < pgcnt) { 1121 /* 1122 * In case of shared mappings total anon map size may not be 1123 * the largest page size aligned. 
1124 */ 1125 pgcnt = ahp->size - an_idx; 1126 } 1127 1128 VM_STAT_ADD(anonvmstats.decrefpages[0]); 1129 1130 if (ap != NULL) { 1131 ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1132 mutex_enter(ahmpages); 1133 ASSERT((refcnt = ap->an_refcnt) != 0); 1134 VM_STAT_ADD(anonvmstats.decrefpages[1]); 1135 if (ap->an_refcnt == 1) { 1136 VM_STAT_ADD(anonvmstats.decrefpages[2]); 1137 ASSERT(!anon_share(ahp, an_idx, pgcnt)); 1138 mutex_exit(ahmpages); 1139 ahmpages = NULL; 1140 } 1141 } 1142 1143 i = 0; 1144 while (i < pgcnt) { 1145 if ((ap = anon_get_ptr(ahp, an_idx + i)) == NULL) { 1146 ASSERT(refcnt == 1 && ahmpages == NULL); 1147 i++; 1148 continue; 1149 } 1150 ASSERT(ap->an_refcnt == refcnt); 1151 ASSERT(ahmpages != NULL || ap->an_refcnt == 1); 1152 ASSERT(ahmpages == NULL || ap->an_refcnt > 1); 1153 1154 if (ahmpages == NULL) { 1155 swap_xlate(ap, &vp, &off); 1156 pp = page_lookup(vp, (u_offset_t)off, SE_EXCL); 1157 if (pp == NULL || pp->p_szc == 0) { 1158 VM_STAT_ADD(anonvmstats.decrefpages[3]); 1159 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, 1160 ap->an_off)]; 1161 (void) anon_set_ptr(ahp, an_idx + i, NULL, 1162 ANON_SLEEP); 1163 mutex_enter(ahm); 1164 ap->an_refcnt--; 1165 ASSERT(ap->an_refcnt == 0); 1166 anon_rmhash(ap); 1167 if (ap->an_pvp) 1168 swap_phys_free(ap->an_pvp, ap->an_poff, 1169 PAGESIZE); 1170 mutex_exit(ahm); 1171 if (pp == NULL) { 1172 pp = page_lookup(vp, (u_offset_t)off, 1173 SE_EXCL); 1174 ASSERT(pp == NULL || pp->p_szc == 0); 1175 } 1176 if (pp != NULL) { 1177 VM_STAT_ADD(anonvmstats.decrefpages[4]); 1178 /*LINTED*/ 1179 VN_DISPOSE(pp, B_INVAL, 0, kcred); 1180 } 1181 kmem_cache_free(anon_cache, ap); 1182 ANI_ADD(1); 1183 i++; 1184 } else { 1185 pgcnt_t j; 1186 pgcnt_t curpgcnt = 1187 page_get_pagecnt(pp->p_szc); 1188 size_t ppasize = curpgcnt * sizeof (page_t *); 1189 page_t **ppa = kmem_alloc(ppasize, KM_SLEEP); 1190 int dispose = 0; 1191 1192 VM_STAT_ADD(anonvmstats.decrefpages[5]); 1193 1194 ASSERT(pp->p_szc <= szc); 1195 
ASSERT(IS_P2ALIGNED(curpgcnt, curpgcnt)); 1196 ASSERT(IS_P2ALIGNED(i, curpgcnt)); 1197 ASSERT(i + curpgcnt <= pgcnt); 1198 ASSERT(!(page_pptonum(pp) & (curpgcnt - 1))); 1199 ppa[0] = pp; 1200 for (j = i + 1; j < i + curpgcnt; j++) { 1201 ap = anon_get_ptr(ahp, an_idx + j); 1202 ASSERT(ap != NULL && 1203 ap->an_refcnt == 1); 1204 swap_xlate(ap, &vp, &off); 1205 pp = page_lookup(vp, (u_offset_t)off, 1206 SE_EXCL); 1207 if (pp == NULL) 1208 panic("anon_decref_pages: " 1209 "no page"); 1210 1211 (void) hat_pageunload(pp, 1212 HAT_FORCE_PGUNLOAD); 1213 ASSERT(pp->p_szc == ppa[0]->p_szc); 1214 ASSERT(page_pptonum(pp) - 1 == 1215 page_pptonum(ppa[j - i - 1])); 1216 ppa[j - i] = pp; 1217 if (ap->an_pvp != NULL && 1218 !vn_matchopval(ap->an_pvp, 1219 VOPNAME_DISPOSE, 1220 (fs_generic_func_p)fs_dispose)) 1221 dispose = 1; 1222 } 1223 for (j = i; j < i + curpgcnt; j++) { 1224 ap = anon_get_ptr(ahp, an_idx + j); 1225 ASSERT(ap != NULL && 1226 ap->an_refcnt == 1); 1227 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, 1228 ap->an_off)]; 1229 (void) anon_set_ptr(ahp, an_idx + j, 1230 NULL, ANON_SLEEP); 1231 mutex_enter(ahm); 1232 ap->an_refcnt--; 1233 ASSERT(ap->an_refcnt == 0); 1234 anon_rmhash(ap); 1235 if (ap->an_pvp) 1236 swap_phys_free(ap->an_pvp, 1237 ap->an_poff, PAGESIZE); 1238 mutex_exit(ahm); 1239 kmem_cache_free(anon_cache, ap); 1240 ANI_ADD(1); 1241 } 1242 if (!dispose) { 1243 VM_STAT_ADD(anonvmstats.decrefpages[6]); 1244 page_destroy_pages(ppa[0]); 1245 } else { 1246 VM_STAT_ADD(anonvmstats.decrefpages[7]); 1247 for (j = 0; j < curpgcnt; j++) { 1248 ASSERT(PAGE_EXCL(ppa[j])); 1249 ppa[j]->p_szc = 0; 1250 } 1251 for (j = 0; j < curpgcnt; j++) { 1252 ASSERT(!hat_page_is_mapped( 1253 ppa[j])); 1254 /*LINTED*/ 1255 VN_DISPOSE(ppa[j], B_INVAL, 0, 1256 kcred); 1257 } 1258 } 1259 kmem_free(ppa, ppasize); 1260 i += curpgcnt; 1261 } 1262 } else { 1263 VM_STAT_ADD(anonvmstats.decrefpages[8]); 1264 (void) anon_set_ptr(ahp, an_idx + i, NULL, ANON_SLEEP); 1265 ahm = 
&anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1266 mutex_enter(ahm); 1267 ap->an_refcnt--; 1268 mutex_exit(ahm); 1269 i++; 1270 } 1271 } 1272 1273 if (ahmpages != NULL) { 1274 mutex_exit(ahmpages); 1275 } 1276 } 1277 1278 /* 1279 * Duplicate references to size bytes worth of anon pages. 1280 * Used when duplicating a segment that contains private anon pages. 1281 * This code assumes that procedure calling this one has already used 1282 * hat_chgprot() to disable write access to the range of addresses that 1283 * that *old actually refers to. 1284 */ 1285 void 1286 anon_dup(struct anon_hdr *old, ulong_t old_idx, struct anon_hdr *new, 1287 ulong_t new_idx, size_t size) 1288 { 1289 spgcnt_t npages; 1290 kmutex_t *ahm; 1291 struct anon *ap; 1292 ulong_t off; 1293 ulong_t index; 1294 1295 npages = btopr(size); 1296 while (npages > 0) { 1297 index = old_idx; 1298 if ((ap = anon_get_next_ptr(old, &index)) == NULL) 1299 break; 1300 1301 ASSERT(!ANON_ISBUSY(anon_get_slot(old, index))); 1302 off = index - old_idx; 1303 npages -= off; 1304 if (npages <= 0) 1305 break; 1306 1307 (void) anon_set_ptr(new, new_idx + off, ap, ANON_SLEEP); 1308 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1309 1310 mutex_enter(ahm); 1311 ap->an_refcnt++; 1312 mutex_exit(ahm); 1313 1314 off++; 1315 new_idx += off; 1316 old_idx += off; 1317 npages--; 1318 } 1319 } 1320 1321 /* 1322 * Just like anon_dup but also guarantees there are no holes (unallocated anon 1323 * slots) within any large page region. That means if a large page region is 1324 * empty in the old array it will skip it. If there are 1 or more valid slots 1325 * in the large page region of the old array it will make sure to fill in any 1326 * unallocated ones and also copy them to the new array. If noalloc is 1 large 1327 * page region should either have no valid anon slots or all slots should be 1328 * valid. 
1329 */ 1330 void 1331 anon_dup_fill_holes( 1332 struct anon_hdr *old, 1333 ulong_t old_idx, 1334 struct anon_hdr *new, 1335 ulong_t new_idx, 1336 size_t size, 1337 uint_t szc, 1338 int noalloc) 1339 { 1340 struct anon *ap; 1341 spgcnt_t npages; 1342 kmutex_t *ahm, *ahmpages = NULL; 1343 pgcnt_t pgcnt, i; 1344 ulong_t index, off; 1345 #ifdef DEBUG 1346 int refcnt; 1347 #endif 1348 1349 ASSERT(szc != 0); 1350 pgcnt = page_get_pagecnt(szc); 1351 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1352 npages = btopr(size); 1353 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1354 ASSERT(IS_P2ALIGNED(old_idx, pgcnt)); 1355 1356 VM_STAT_ADD(anonvmstats.dupfillholes[0]); 1357 1358 while (npages > 0) { 1359 index = old_idx; 1360 1361 /* 1362 * Find the next valid slot. 1363 */ 1364 if (anon_get_next_ptr(old, &index) == NULL) 1365 break; 1366 1367 ASSERT(!ANON_ISBUSY(anon_get_slot(old, index))); 1368 /* 1369 * Now backup index to the beginning of the 1370 * current large page region of the old array. 1371 */ 1372 index = P2ALIGN(index, pgcnt); 1373 off = index - old_idx; 1374 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1375 npages -= off; 1376 if (npages <= 0) 1377 break; 1378 1379 /* 1380 * Fill and copy a large page regions worth 1381 * of anon slots. 1382 */ 1383 for (i = 0; i < pgcnt; i++) { 1384 if ((ap = anon_get_ptr(old, index + i)) == NULL) { 1385 if (noalloc) { 1386 panic("anon_dup_fill_holes: " 1387 "empty anon slot\n"); 1388 } 1389 VM_STAT_ADD(anonvmstats.dupfillholes[1]); 1390 ap = anon_alloc(NULL, 0); 1391 (void) anon_set_ptr(old, index + i, ap, 1392 ANON_SLEEP); 1393 } else if (i == 0) { 1394 /* 1395 * make the increment of all refcnts of all 1396 * anon slots of a large page appear atomic by 1397 * getting an anonpages_hash_lock for the 1398 * first anon slot of a large page. 
1399 */ 1400 int hash = AH_LOCK(ap->an_vp, ap->an_off); 1401 1402 VM_STAT_ADD(anonvmstats.dupfillholes[2]); 1403 1404 ahmpages = &anonpages_hash_lock[hash]; 1405 mutex_enter(ahmpages); 1406 /*LINTED*/ 1407 ASSERT(refcnt = ap->an_refcnt); 1408 1409 VM_STAT_COND_ADD(ap->an_refcnt > 1, 1410 anonvmstats.dupfillholes[3]); 1411 } 1412 (void) anon_set_ptr(new, new_idx + off + i, ap, 1413 ANON_SLEEP); 1414 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1415 mutex_enter(ahm); 1416 ASSERT(ahmpages != NULL || ap->an_refcnt == 1); 1417 ASSERT(i == 0 || ahmpages == NULL || 1418 refcnt == ap->an_refcnt); 1419 ap->an_refcnt++; 1420 mutex_exit(ahm); 1421 } 1422 if (ahmpages != NULL) { 1423 mutex_exit(ahmpages); 1424 ahmpages = NULL; 1425 } 1426 off += pgcnt; 1427 new_idx += off; 1428 old_idx += off; 1429 npages -= pgcnt; 1430 } 1431 } 1432 1433 /* 1434 * Used when a segment with a vnode changes szc. similarly to 1435 * anon_dup_fill_holes() makes sure each large page region either has no anon 1436 * slots or all of them. but new slots are created by COWing the file 1437 * pages. on entrance no anon slots should be shared. 1438 */ 1439 int 1440 anon_fill_cow_holes( 1441 struct seg *seg, 1442 caddr_t addr, 1443 struct anon_hdr *ahp, 1444 ulong_t an_idx, 1445 struct vnode *vp, 1446 u_offset_t vp_off, 1447 size_t size, 1448 uint_t szc, 1449 uint_t prot, 1450 struct vpage vpage[], 1451 struct cred *cred) 1452 { 1453 struct anon *ap; 1454 spgcnt_t npages; 1455 pgcnt_t pgcnt, i; 1456 ulong_t index, off; 1457 int err = 0; 1458 int pageflags = 0; 1459 1460 ASSERT(szc != 0); 1461 pgcnt = page_get_pagecnt(szc); 1462 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1463 npages = btopr(size); 1464 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1465 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1466 1467 while (npages > 0) { 1468 index = an_idx; 1469 1470 /* 1471 * Find the next valid slot. 
1472 */ 1473 if (anon_get_next_ptr(ahp, &index) == NULL) { 1474 break; 1475 } 1476 1477 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1478 /* 1479 * Now backup index to the beginning of the 1480 * current large page region of the anon array. 1481 */ 1482 index = P2ALIGN(index, pgcnt); 1483 off = index - an_idx; 1484 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1485 npages -= off; 1486 if (npages <= 0) 1487 break; 1488 an_idx += off; 1489 vp_off += ptob(off); 1490 addr += ptob(off); 1491 if (vpage != NULL) { 1492 vpage += off; 1493 } 1494 1495 for (i = 0; i < pgcnt; i++, an_idx++, vp_off += PAGESIZE) { 1496 if ((ap = anon_get_ptr(ahp, an_idx)) == NULL) { 1497 page_t *pl[1 + 1]; 1498 page_t *pp; 1499 1500 err = VOP_GETPAGE(vp, vp_off, PAGESIZE, NULL, 1501 pl, PAGESIZE, seg, addr, S_READ, cred, 1502 NULL); 1503 if (err) { 1504 break; 1505 } 1506 if (vpage != NULL) { 1507 prot = VPP_PROT(vpage); 1508 pageflags = VPP_ISPPLOCK(vpage) ? 1509 LOCK_PAGE : 0; 1510 } 1511 pp = anon_private(&ap, seg, addr, prot, pl[0], 1512 pageflags, cred); 1513 if (pp == NULL) { 1514 err = ENOMEM; 1515 break; 1516 } 1517 (void) anon_set_ptr(ahp, an_idx, ap, 1518 ANON_SLEEP); 1519 page_unlock(pp); 1520 } 1521 ASSERT(ap->an_refcnt == 1); 1522 addr += PAGESIZE; 1523 if (vpage != NULL) { 1524 vpage++; 1525 } 1526 } 1527 npages -= pgcnt; 1528 } 1529 1530 return (err); 1531 } 1532 1533 /* 1534 * Free a group of "size" anon pages, size in bytes, 1535 * and clear out the pointers to the anon entries. 
1536 */ 1537 void 1538 anon_free(struct anon_hdr *ahp, ulong_t index, size_t size) 1539 { 1540 spgcnt_t npages; 1541 struct anon *ap; 1542 ulong_t old; 1543 1544 npages = btopr(size); 1545 1546 while (npages > 0) { 1547 old = index; 1548 if ((ap = anon_get_next_ptr(ahp, &index)) == NULL) 1549 break; 1550 1551 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1552 npages -= index - old; 1553 if (npages <= 0) 1554 break; 1555 1556 (void) anon_set_ptr(ahp, index, NULL, ANON_SLEEP); 1557 anon_decref(ap); 1558 /* 1559 * Bump index and decrement page count 1560 */ 1561 index++; 1562 npages--; 1563 } 1564 } 1565 1566 void 1567 anon_free_pages( 1568 struct anon_hdr *ahp, 1569 ulong_t an_idx, 1570 size_t size, 1571 uint_t szc) 1572 { 1573 spgcnt_t npages; 1574 pgcnt_t pgcnt; 1575 ulong_t index, off; 1576 1577 ASSERT(szc != 0); 1578 pgcnt = page_get_pagecnt(szc); 1579 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1580 npages = btopr(size); 1581 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1582 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1583 ASSERT(an_idx < ahp->size); 1584 1585 VM_STAT_ADD(anonvmstats.freepages[0]); 1586 1587 while (npages > 0) { 1588 index = an_idx; 1589 1590 /* 1591 * Find the next valid slot. 1592 */ 1593 if (anon_get_next_ptr(ahp, &index) == NULL) 1594 break; 1595 1596 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1597 /* 1598 * Now backup index to the beginning of the 1599 * current large page region of the old array. 
1600 */ 1601 index = P2ALIGN(index, pgcnt); 1602 off = index - an_idx; 1603 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1604 npages -= off; 1605 if (npages <= 0) 1606 break; 1607 1608 anon_decref_pages(ahp, index, szc); 1609 1610 off += pgcnt; 1611 an_idx += off; 1612 npages -= pgcnt; 1613 } 1614 } 1615 1616 /* 1617 * Make anonymous pages discardable 1618 */ 1619 void 1620 anon_disclaim(struct anon_map *amp, ulong_t index, size_t size) 1621 { 1622 spgcnt_t npages = btopr(size); 1623 struct anon *ap; 1624 struct vnode *vp; 1625 anoff_t off; 1626 page_t *pp, *root_pp; 1627 kmutex_t *ahm; 1628 pgcnt_t pgcnt; 1629 ulong_t old_idx, idx, i; 1630 struct anon_hdr *ahp = amp->ahp; 1631 anon_sync_obj_t cookie; 1632 1633 ASSERT(RW_READ_HELD(&->a_rwlock)); 1634 pgcnt = 1; 1635 for (; npages > 0; index = (pgcnt == 1) ? index + 1 : 1636 P2ROUNDUP(index + 1, pgcnt), npages -= pgcnt) { 1637 1638 /* 1639 * get anon pointer and index for the first valid entry 1640 * in the anon list, starting from "index" 1641 */ 1642 old_idx = index; 1643 if ((ap = anon_get_next_ptr(ahp, &index)) == NULL) 1644 break; 1645 1646 /* 1647 * decrement npages by number of NULL anon slots we skipped 1648 */ 1649 npages -= index - old_idx; 1650 if (npages <= 0) 1651 break; 1652 1653 anon_array_enter(amp, index, &cookie); 1654 ap = anon_get_ptr(ahp, index); 1655 ASSERT(ap != NULL); 1656 1657 /* 1658 * Get anonymous page and try to lock it SE_EXCL; 1659 * if we couldn't grab the lock we skip to next page. 1660 */ 1661 swap_xlate(ap, &vp, &off); 1662 pp = page_lookup_nowait(vp, (u_offset_t)off, SE_EXCL); 1663 if (pp == NULL) { 1664 segadvstat.MADV_FREE_miss.value.ul++; 1665 pgcnt = 1; 1666 anon_array_exit(&cookie); 1667 continue; 1668 } 1669 pgcnt = page_get_pagecnt(pp->p_szc); 1670 1671 /* 1672 * we cannot free a page which is permanently locked. 1673 * The page_struct_lock need not be acquired to examine 1674 * these fields since the page has an "exclusive" lock. 
1675 */ 1676 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1677 page_unlock(pp); 1678 segadvstat.MADV_FREE_miss.value.ul++; 1679 anon_array_exit(&cookie); 1680 continue; 1681 } 1682 1683 ahm = &anonhash_lock[AH_LOCK(vp, off)]; 1684 mutex_enter(ahm); 1685 ASSERT(ap->an_refcnt != 0); 1686 /* 1687 * skip this one if copy-on-write is not yet broken. 1688 */ 1689 if (ap->an_refcnt > 1) { 1690 mutex_exit(ahm); 1691 page_unlock(pp); 1692 segadvstat.MADV_FREE_miss.value.ul++; 1693 anon_array_exit(&cookie); 1694 continue; 1695 } 1696 1697 if (pp->p_szc == 0) { 1698 pgcnt = 1; 1699 1700 /* 1701 * free swap slot; 1702 */ 1703 if (ap->an_pvp) { 1704 swap_phys_free(ap->an_pvp, ap->an_poff, 1705 PAGESIZE); 1706 ap->an_pvp = NULL; 1707 ap->an_poff = 0; 1708 } 1709 mutex_exit(ahm); 1710 segadvstat.MADV_FREE_hit.value.ul++; 1711 1712 /* 1713 * while we are at it, unload all the translations 1714 * and attempt to free the page. 1715 */ 1716 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1717 /*LINTED: constant in conditional context */ 1718 VN_DISPOSE(pp, B_FREE, 0, kcred); 1719 anon_array_exit(&cookie); 1720 continue; 1721 } 1722 1723 pgcnt = page_get_pagecnt(pp->p_szc); 1724 if (!IS_P2ALIGNED(index, pgcnt) || npages < pgcnt) { 1725 if (!page_try_demote_pages(pp)) { 1726 mutex_exit(ahm); 1727 page_unlock(pp); 1728 segadvstat.MADV_FREE_miss.value.ul++; 1729 anon_array_exit(&cookie); 1730 continue; 1731 } else { 1732 pgcnt = 1; 1733 if (ap->an_pvp) { 1734 swap_phys_free(ap->an_pvp, 1735 ap->an_poff, PAGESIZE); 1736 ap->an_pvp = NULL; 1737 ap->an_poff = 0; 1738 } 1739 mutex_exit(ahm); 1740 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1741 /*LINTED*/ 1742 VN_DISPOSE(pp, B_FREE, 0, kcred); 1743 segadvstat.MADV_FREE_hit.value.ul++; 1744 anon_array_exit(&cookie); 1745 continue; 1746 } 1747 } 1748 mutex_exit(ahm); 1749 root_pp = pp; 1750 1751 /* 1752 * try to lock remaining pages 1753 */ 1754 for (idx = 1; idx < pgcnt; idx++) { 1755 pp++; 1756 if (!page_trylock(pp, SE_EXCL)) 1757 break; 
1758 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1759 page_unlock(pp); 1760 break; 1761 } 1762 } 1763 1764 if (idx == pgcnt) { 1765 for (i = 0; i < pgcnt; i++) { 1766 ap = anon_get_ptr(ahp, index + i); 1767 if (ap == NULL) 1768 break; 1769 swap_xlate(ap, &vp, &off); 1770 ahm = &anonhash_lock[AH_LOCK(vp, off)]; 1771 mutex_enter(ahm); 1772 ASSERT(ap->an_refcnt != 0); 1773 1774 /* 1775 * skip this one if copy-on-write 1776 * is not yet broken. 1777 */ 1778 if (ap->an_refcnt > 1) { 1779 mutex_exit(ahm); 1780 goto skiplp; 1781 } 1782 if (ap->an_pvp) { 1783 swap_phys_free(ap->an_pvp, 1784 ap->an_poff, PAGESIZE); 1785 ap->an_pvp = NULL; 1786 ap->an_poff = 0; 1787 } 1788 mutex_exit(ahm); 1789 } 1790 page_destroy_pages(root_pp); 1791 segadvstat.MADV_FREE_hit.value.ul += pgcnt; 1792 anon_array_exit(&cookie); 1793 continue; 1794 } 1795 skiplp: 1796 segadvstat.MADV_FREE_miss.value.ul += pgcnt; 1797 for (i = 0, pp = root_pp; i < idx; pp++, i++) 1798 page_unlock(pp); 1799 anon_array_exit(&cookie); 1800 } 1801 } 1802 1803 /* 1804 * Return the kept page(s) and protections back to the segment driver. 1805 */ 1806 int 1807 anon_getpage( 1808 struct anon **app, 1809 uint_t *protp, 1810 page_t *pl[], 1811 size_t plsz, 1812 struct seg *seg, 1813 caddr_t addr, 1814 enum seg_rw rw, 1815 struct cred *cred) 1816 { 1817 page_t *pp; 1818 struct anon *ap = *app; 1819 struct vnode *vp; 1820 anoff_t off; 1821 int err; 1822 kmutex_t *ahm; 1823 1824 swap_xlate(ap, &vp, &off); 1825 1826 /* 1827 * Lookup the page. If page is being paged in, 1828 * wait for it to finish as we must return a list of 1829 * pages since this routine acts like the VOP_GETPAGE 1830 * routine does. 
1831 */ 1832 if (pl != NULL && (pp = page_lookup(vp, (u_offset_t)off, SE_SHARED))) { 1833 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1834 mutex_enter(ahm); 1835 if (ap->an_refcnt == 1) 1836 *protp = PROT_ALL; 1837 else 1838 *protp = PROT_ALL & ~PROT_WRITE; 1839 mutex_exit(ahm); 1840 pl[0] = pp; 1841 pl[1] = NULL; 1842 return (0); 1843 } 1844 1845 /* 1846 * Simply treat it as a vnode fault on the anon vp. 1847 */ 1848 1849 TRACE_3(TR_FAC_VM, TR_ANON_GETPAGE, 1850 "anon_getpage:seg %x addr %x vp %x", 1851 seg, addr, vp); 1852 1853 err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, protp, pl, plsz, 1854 seg, addr, rw, cred, NULL); 1855 1856 if (err == 0 && pl != NULL) { 1857 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1858 mutex_enter(ahm); 1859 if (ap->an_refcnt != 1) 1860 *protp &= ~PROT_WRITE; /* make read-only */ 1861 mutex_exit(ahm); 1862 } 1863 return (err); 1864 } 1865 1866 /* 1867 * Creates or returns kept pages to the segment driver. returns -1 if a large 1868 * page cannot be allocated. returns -2 if some other process has allocated a 1869 * larger page. 1870 * 1871 * For cowfault it will allocate any size pages to fill the requested area to 1872 * avoid partially overwriting anon slots (i.e. sharing only some of the anon 1873 * slots within a large page with other processes). This policy greatly 1874 * simplifies large page freeing (which is only freed when all anon slot 1875 * refcnts are 0). 
1876 */ 1877 int 1878 anon_map_getpages( 1879 struct anon_map *amp, 1880 ulong_t start_idx, 1881 uint_t szc, 1882 struct seg *seg, 1883 caddr_t addr, 1884 uint_t prot, 1885 uint_t *protp, 1886 page_t *ppa[], 1887 uint_t *ppa_szc, 1888 struct vpage vpage[], 1889 enum seg_rw rw, 1890 int brkcow, 1891 int anypgsz, 1892 int pgflags, 1893 struct cred *cred) 1894 { 1895 pgcnt_t pgcnt; 1896 struct anon *ap; 1897 struct vnode *vp; 1898 anoff_t off; 1899 page_t *pp, *pl[2], *conpp = NULL; 1900 caddr_t vaddr; 1901 ulong_t pg_idx, an_idx, i; 1902 spgcnt_t nreloc = 0; 1903 int prealloc = 1; 1904 int err, slotcreate; 1905 uint_t vpprot; 1906 int upsize = (szc < seg->s_szc); 1907 1908 #if !defined(__i386) && !defined(__amd64) 1909 ASSERT(seg->s_szc != 0); 1910 #endif 1911 ASSERT(szc <= seg->s_szc); 1912 ASSERT(ppa_szc != NULL); 1913 ASSERT(rw != S_CREATE); 1914 1915 *protp = PROT_ALL; 1916 1917 VM_STAT_ADD(anonvmstats.getpages[0]); 1918 1919 if (szc == 0) { 1920 VM_STAT_ADD(anonvmstats.getpages[1]); 1921 if ((ap = anon_get_ptr(amp->ahp, start_idx)) != NULL) { 1922 err = anon_getpage(&ap, protp, pl, PAGESIZE, seg, 1923 addr, rw, cred); 1924 if (err) 1925 return (err); 1926 ppa[0] = pl[0]; 1927 if (brkcow == 0 || (*protp & PROT_WRITE)) { 1928 VM_STAT_ADD(anonvmstats.getpages[2]); 1929 if (ppa[0]->p_szc != 0 && upsize) { 1930 VM_STAT_ADD(anonvmstats.getpages[3]); 1931 *ppa_szc = MIN(ppa[0]->p_szc, 1932 seg->s_szc); 1933 page_unlock(ppa[0]); 1934 return (-2); 1935 } 1936 return (0); 1937 } 1938 panic("anon_map_getpages: cowfault for szc 0"); 1939 } else { 1940 VM_STAT_ADD(anonvmstats.getpages[4]); 1941 ppa[0] = anon_zero(seg, addr, &ap, cred); 1942 if (ppa[0] == NULL) 1943 return (ENOMEM); 1944 (void) anon_set_ptr(amp->ahp, start_idx, ap, 1945 ANON_SLEEP); 1946 return (0); 1947 } 1948 } 1949 1950 pgcnt = page_get_pagecnt(szc); 1951 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1952 ASSERT(IS_P2ALIGNED(start_idx, pgcnt)); 1953 1954 /* 1955 * First we check for the case that the requtested 
large 1956 * page or larger page already exists in the system. 1957 * Actually we only check if the first constituent page 1958 * exists and only preallocate if it's not found. 1959 */ 1960 ap = anon_get_ptr(amp->ahp, start_idx); 1961 if (ap) { 1962 uint_t pszc; 1963 swap_xlate(ap, &vp, &off); 1964 if (page_exists_forreal(vp, (u_offset_t)off, &pszc)) { 1965 if (pszc > szc && upsize) { 1966 *ppa_szc = MIN(pszc, seg->s_szc); 1967 return (-2); 1968 } 1969 if (pszc >= szc) { 1970 prealloc = 0; 1971 } 1972 } 1973 } 1974 1975 VM_STAT_COND_ADD(prealloc == 0, anonvmstats.getpages[5]); 1976 VM_STAT_COND_ADD(prealloc != 0, anonvmstats.getpages[6]); 1977 1978 top: 1979 /* 1980 * If a smaller page or no page at all was found, 1981 * grab a large page off the freelist. 1982 */ 1983 if (prealloc) { 1984 ASSERT(conpp == NULL); 1985 if (page_alloc_pages(anon_vp, seg, addr, NULL, ppa, 1986 szc, 0, pgflags) != 0) { 1987 VM_STAT_ADD(anonvmstats.getpages[7]); 1988 if (brkcow == 0 || szc < seg->s_szc || 1989 !anon_szcshare(amp->ahp, start_idx)) { 1990 /* 1991 * If the refcnt's of all anon slots are <= 1 1992 * they can't increase since we are holding 1993 * the address space's lock. So segvn can 1994 * safely decrease szc without risking to 1995 * generate a cow fault for the region smaller 1996 * than the segment's largest page size. 1997 */ 1998 VM_STAT_ADD(anonvmstats.getpages[8]); 1999 return (-1); 2000 } 2001 docow: 2002 /* 2003 * This is a cow fault. Copy away the entire 1 large 2004 * page region of this segment. 
2005 */ 2006 if (szc != seg->s_szc) 2007 panic("anon_map_getpages: cowfault for szc %d", 2008 szc); 2009 vaddr = addr; 2010 for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt; 2011 pg_idx++, an_idx++, vaddr += PAGESIZE) { 2012 if ((ap = anon_get_ptr(amp->ahp, an_idx)) != 2013 NULL) { 2014 err = anon_getpage(&ap, &vpprot, pl, 2015 PAGESIZE, seg, vaddr, rw, cred); 2016 if (err) { 2017 for (i = 0; i < pg_idx; i++) { 2018 if ((pp = ppa[i]) != 2019 NULL) 2020 page_unlock(pp); 2021 } 2022 return (err); 2023 } 2024 ppa[pg_idx] = pl[0]; 2025 } else { 2026 /* 2027 * Since this is a cowfault we know 2028 * that this address space has a 2029 * parent or children which means 2030 * anon_dup_fill_holes() has initialized 2031 * all anon slots within a large page 2032 * region that had at least one anon 2033 * slot at the time of fork(). 2034 */ 2035 panic("anon_map_getpages: " 2036 "cowfault but anon slot is empty"); 2037 } 2038 } 2039 VM_STAT_ADD(anonvmstats.getpages[9]); 2040 *protp = PROT_ALL; 2041 return (anon_map_privatepages(amp, start_idx, szc, seg, 2042 addr, prot, ppa, vpage, anypgsz, pgflags, cred)); 2043 } 2044 } 2045 2046 VM_STAT_ADD(anonvmstats.getpages[10]); 2047 2048 an_idx = start_idx; 2049 pg_idx = 0; 2050 vaddr = addr; 2051 while (pg_idx < pgcnt) { 2052 slotcreate = 0; 2053 if ((ap = anon_get_ptr(amp->ahp, an_idx)) == NULL) { 2054 VM_STAT_ADD(anonvmstats.getpages[11]); 2055 /* 2056 * For us to have decided not to preallocate 2057 * would have meant that a large page 2058 * was found. Which also means that all of the 2059 * anon slots for that page would have been 2060 * already created for us. 2061 */ 2062 if (prealloc == 0) 2063 panic("anon_map_getpages: prealloc = 0"); 2064 2065 slotcreate = 1; 2066 ap = anon_alloc(NULL, 0); 2067 } 2068 swap_xlate(ap, &vp, &off); 2069 2070 /* 2071 * Now setup our preallocated page to pass down 2072 * to swap_getpage(). 
2073 */ 2074 if (prealloc) { 2075 ASSERT(ppa[pg_idx]->p_szc == szc); 2076 conpp = ppa[pg_idx]; 2077 } 2078 ASSERT(prealloc || conpp == NULL); 2079 2080 /* 2081 * If we just created this anon slot then call 2082 * with S_CREATE to prevent doing IO on the page. 2083 * Similar to the anon_zero case. 2084 */ 2085 err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE, 2086 NULL, pl, PAGESIZE, conpp, ppa_szc, &nreloc, seg, vaddr, 2087 slotcreate == 1 ? S_CREATE : rw, cred); 2088 2089 if (err) { 2090 ASSERT(err != -2 || upsize); 2091 VM_STAT_ADD(anonvmstats.getpages[12]); 2092 ASSERT(slotcreate == 0); 2093 goto io_err; 2094 } 2095 2096 pp = pl[0]; 2097 2098 if (pp->p_szc < szc || (pp->p_szc > szc && upsize)) { 2099 VM_STAT_ADD(anonvmstats.getpages[13]); 2100 ASSERT(slotcreate == 0); 2101 ASSERT(prealloc == 0); 2102 ASSERT(pg_idx == 0); 2103 if (pp->p_szc > szc) { 2104 ASSERT(upsize); 2105 *ppa_szc = MIN(pp->p_szc, seg->s_szc); 2106 page_unlock(pp); 2107 VM_STAT_ADD(anonvmstats.getpages[14]); 2108 return (-2); 2109 } 2110 page_unlock(pp); 2111 prealloc = 1; 2112 goto top; 2113 } 2114 2115 /* 2116 * If we decided to preallocate but VOP_GETPAGE 2117 * found a page in the system that satisfies our 2118 * request then free up our preallocated large page 2119 * and continue looping accross the existing large 2120 * page via VOP_GETPAGE. 2121 */ 2122 if (prealloc && pp != ppa[pg_idx]) { 2123 VM_STAT_ADD(anonvmstats.getpages[15]); 2124 ASSERT(slotcreate == 0); 2125 ASSERT(pg_idx == 0); 2126 conpp = NULL; 2127 prealloc = 0; 2128 page_free_pages(ppa[0]); 2129 } 2130 2131 if (prealloc && nreloc > 1) { 2132 /* 2133 * we have relocated out of a smaller large page. 2134 * skip npgs - 1 iterations and continue which will 2135 * increment by one the loop indices. 
2136 */ 2137 spgcnt_t npgs = nreloc; 2138 2139 VM_STAT_ADD(anonvmstats.getpages[16]); 2140 2141 ASSERT(pp == ppa[pg_idx]); 2142 ASSERT(slotcreate == 0); 2143 ASSERT(pg_idx + npgs <= pgcnt); 2144 if ((*protp & PROT_WRITE) && 2145 anon_share(amp->ahp, an_idx, npgs)) { 2146 *protp &= ~PROT_WRITE; 2147 } 2148 pg_idx += npgs; 2149 an_idx += npgs; 2150 vaddr += PAGESIZE * npgs; 2151 continue; 2152 } 2153 2154 VM_STAT_ADD(anonvmstats.getpages[17]); 2155 2156 /* 2157 * Anon_zero case. 2158 */ 2159 if (slotcreate) { 2160 ASSERT(prealloc); 2161 pagezero(pp, 0, PAGESIZE); 2162 CPU_STATS_ADD_K(vm, zfod, 1); 2163 hat_setrefmod(pp); 2164 } 2165 2166 ASSERT(prealloc == 0 || ppa[pg_idx] == pp); 2167 ASSERT(prealloc != 0 || PAGE_SHARED(pp)); 2168 ASSERT(prealloc == 0 || PAGE_EXCL(pp)); 2169 2170 if (pg_idx > 0 && 2171 ((page_pptonum(pp) != page_pptonum(ppa[pg_idx - 1]) + 1) || 2172 (pp->p_szc != ppa[pg_idx - 1]->p_szc))) { 2173 panic("anon_map_getpages: unexpected page"); 2174 } else if (pg_idx == 0 && (page_pptonum(pp) & (pgcnt - 1))) { 2175 panic("anon_map_getpages: unaligned page"); 2176 } 2177 2178 if (prealloc == 0) { 2179 ppa[pg_idx] = pp; 2180 } 2181 2182 if (ap->an_refcnt > 1) { 2183 VM_STAT_ADD(anonvmstats.getpages[18]); 2184 *protp &= ~PROT_WRITE; 2185 } 2186 2187 /* 2188 * If this is a new anon slot then initialize 2189 * the anon array entry. 2190 */ 2191 if (slotcreate) { 2192 (void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP); 2193 } 2194 pg_idx++; 2195 an_idx++; 2196 vaddr += PAGESIZE; 2197 } 2198 2199 /* 2200 * Since preallocated pages come off the freelist 2201 * they are locked SE_EXCL. Simply downgrade and return. 
2202 */ 2203 if (prealloc) { 2204 VM_STAT_ADD(anonvmstats.getpages[19]); 2205 conpp = NULL; 2206 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { 2207 page_downgrade(ppa[pg_idx]); 2208 } 2209 } 2210 ASSERT(conpp == NULL); 2211 2212 if (brkcow == 0 || (*protp & PROT_WRITE)) { 2213 VM_STAT_ADD(anonvmstats.getpages[20]); 2214 return (0); 2215 } 2216 2217 if (szc < seg->s_szc) 2218 panic("anon_map_getpages: cowfault for szc %d", szc); 2219 2220 VM_STAT_ADD(anonvmstats.getpages[21]); 2221 2222 *protp = PROT_ALL; 2223 return (anon_map_privatepages(amp, start_idx, szc, seg, addr, prot, 2224 ppa, vpage, anypgsz, pgflags, cred)); 2225 io_err: 2226 /* 2227 * We got an IO error somewhere in our large page. 2228 * If we were using a preallocated page then just demote 2229 * all the constituent pages that we've succeeded with sofar 2230 * to PAGESIZE pages and leave them in the system 2231 * unlocked. 2232 */ 2233 2234 ASSERT(err != -2 || ((pg_idx == 0) && upsize)); 2235 2236 VM_STAT_COND_ADD(err > 0, anonvmstats.getpages[22]); 2237 VM_STAT_COND_ADD(err == -1, anonvmstats.getpages[23]); 2238 VM_STAT_COND_ADD(err == -2, anonvmstats.getpages[24]); 2239 2240 if (prealloc) { 2241 conpp = NULL; 2242 if (pg_idx > 0) { 2243 VM_STAT_ADD(anonvmstats.getpages[25]); 2244 for (i = 0; i < pgcnt; i++) { 2245 pp = ppa[i]; 2246 ASSERT(PAGE_EXCL(pp)); 2247 ASSERT(pp->p_szc == szc); 2248 pp->p_szc = 0; 2249 } 2250 for (i = 0; i < pg_idx; i++) { 2251 ASSERT(!hat_page_is_mapped(ppa[i])); 2252 page_unlock(ppa[i]); 2253 } 2254 /* 2255 * Now free up the remaining unused constituent 2256 * pages. 
2257 */ 2258 while (pg_idx < pgcnt) { 2259 ASSERT(!hat_page_is_mapped(ppa[pg_idx])); 2260 page_free(ppa[pg_idx], 0); 2261 pg_idx++; 2262 } 2263 } else { 2264 VM_STAT_ADD(anonvmstats.getpages[26]); 2265 page_free_pages(ppa[0]); 2266 } 2267 } else { 2268 VM_STAT_ADD(anonvmstats.getpages[27]); 2269 ASSERT(err > 0); 2270 for (i = 0; i < pg_idx; i++) 2271 page_unlock(ppa[i]); 2272 } 2273 ASSERT(conpp == NULL); 2274 if (err != -1) 2275 return (err); 2276 /* 2277 * we are here because we failed to relocate. 2278 */ 2279 ASSERT(prealloc); 2280 if (brkcow == 0 || szc < seg->s_szc || 2281 !anon_szcshare(amp->ahp, start_idx)) { 2282 VM_STAT_ADD(anonvmstats.getpages[28]); 2283 return (-1); 2284 } 2285 VM_STAT_ADD(anonvmstats.getpages[29]); 2286 goto docow; 2287 } 2288 2289 2290 /* 2291 * Turn a reference to an object or shared anon page 2292 * into a private page with a copy of the data from the 2293 * original page which is always locked by the caller. 2294 * This routine unloads the translation and unlocks the 2295 * original page, if it isn't being stolen, before returning 2296 * to the caller. 2297 * 2298 * NOTE: The original anon slot is not freed by this routine 2299 * It must be freed by the caller while holding the 2300 * "anon_map" lock to prevent races which can occur if 2301 * a process has multiple lwps in its address space. 
*/
/*
 * Arguments, as used by the code below:
 *	app	 - in/out anon slot.  A freshly allocated anon is stored in
 *		   *app up front; on failure the previous value is restored.
 *	seg/addr - mapping whose PAGESIZE translation at addr is unloaded
 *		   after the copy.
 *	prot	 - only the PROT_WRITE bit is examined here.
 *	opp	 - the original page.  Asserted to be held SE_EXCL when
 *		   STEAL_PAGE is set, otherwise merely locked.
 *	oppflags - STEAL_PAGE: rename opp onto the new anon's (vp, off)
 *		   instead of copying it.  LOCK_PAGE: move the lock claim
 *		   from opp to the new page with page_pp_useclaim().
 *
 * Returns the private page after page_downgrade(), or NULL when
 * VOP_GETPAGE() fails or availrmem cannot be claimed for a locked,
 * read-only page.
 */
page_t *
anon_private(
	struct anon **app,
	struct seg *seg,
	caddr_t addr,
	uint_t prot,
	page_t *opp,
	int oppflags,
	struct cred *cred)
{
	struct anon *old = *app;
	struct anon *new;
	page_t *pp = NULL;
	struct vnode *vp;
	anoff_t off;
	page_t *anon_pl[1 + 1];
	int err;

	if (oppflags & STEAL_PAGE)
		ASSERT(PAGE_EXCL(opp));
	else
		ASSERT(PAGE_LOCKED(opp));

	CPU_STATS_ADD_K(vm, cow_fault, 1);

	/* Kernel probe */
	TNF_PROBE_1(anon_private, "vm pagefault", /* CSTYLED */,
	    tnf_opaque, address, addr);

	/* Install the new anon in the caller's slot; "old" is kept for undo. */
	*app = new = anon_alloc(NULL, 0);
	swap_xlate(new, &vp, &off);

	if (oppflags & STEAL_PAGE) {
		/*
		 * Stealing: no copy is made.  The original page simply takes
		 * on the identity of the new anon slot and is handed back.
		 */
		page_rename(opp, vp, (u_offset_t)off);
		pp = opp;
		TRACE_5(TR_FAC_VM, TR_ANON_PRIVATE,
		    "anon_private:seg %p addr %x pp %p vp %p off %lx",
		    seg, addr, pp, vp, off);
		hat_setmod(pp);

		/* bug 4026339 */
		page_downgrade(pp);
		return (pp);
	}

	/*
	 * Call the VOP_GETPAGE routine to create the page, thereby
	 * enabling the vnode driver to allocate any filesystem
	 * space (e.g., disk block allocation for UFS).  This also
	 * prevents more than one page from being added to the
	 * vnode at the same time.
	 */
	err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, NULL,
	    anon_pl, PAGESIZE, seg, addr, S_CREATE, cred, NULL);
	if (err)
		goto out;

	pp = anon_pl[0];

	/*
	 * If the original page was locked, we need to move the lock
	 * to the new page by transferring 'cowcnt/lckcnt' of the original
	 * page to 'cowcnt/lckcnt' of the new page.
	 *
	 * See Statement at the beginning of segvn_lockop() and
	 * comments in page_pp_useclaim() regarding the way
	 * cowcnts/lckcnts are handled.
	 *
	 * Also availrmem must be decremented up front for read only mapping
	 * before calling page_pp_useclaim. page_pp_useclaim will bump it back
	 * if availrmem did not need to be decremented after all.
	 */
	if (oppflags & LOCK_PAGE) {
		if ((prot & PROT_WRITE) == 0) {
			mutex_enter(&freemem_lock);
			if (availrmem > pages_pp_maximum) {
				availrmem--;
				pages_useclaim++;
			} else {
				/* No memory to claim: undo and fail. */
				mutex_exit(&freemem_lock);
				goto out;
			}
			mutex_exit(&freemem_lock);
		}
		page_pp_useclaim(opp, pp, prot & PROT_WRITE);
	}

	/*
	 * Now copy the contents from the original page,
	 * which is locked and loaded in the MMU by
	 * the caller to prevent yet another page fault.
	 */
	/* XXX - should set mod bit in here */
	if (ppcopy(opp, pp) == 0) {
		/*
		 * Before ppcopy could handle UE or other faults, we
		 * would have panicked here, and still have no option
		 * but to do so now.
		 */
		panic("anon_private, ppcopy failed, opp = 0x%p, pp = 0x%p",
		    (void *)opp, (void *)pp);
	}

	hat_setrefmod(pp);		/* mark as modified */

	/*
	 * Unload the old translation.
	 */
	hat_unload(seg->s_as->a_hat, addr, PAGESIZE, HAT_UNLOAD);

	/*
	 * Free unmapped, unmodified original page.
	 * or release the lock on the original page,
	 * otherwise the process will sleep forever in
	 * anon_decref() waiting for the "exclusive" lock
	 * on the page.
	 */
	(void) page_release(opp, 1);

	/*
	 * we are done with page creation so downgrade the new
	 * page's selock to shared, this helps when multiple
	 * as_fault(...SOFTLOCK...) are done to the same
	 * page(aio)
	 */
	page_downgrade(pp);

	/*
	 * NOTE:  The original anon slot must be freed by the
	 * caller while holding the "anon_map" lock, if we
	 * copied away from an anonymous page.
	 */
	return (pp);

out:
	/*
	 * Failure: put the caller's slot back, drop the new page (if one was
	 * created) and the new anon, and unlock the original page.
	 */
	*app = old;
	if (pp)
		page_unlock(pp);
	anon_decref(new);
	page_unlock(opp);
	return ((page_t *)NULL);
}

/*
 * Give the large-page region of "amp" starting at start_idx private copies
 * of the pgcnt constituent pages passed in ppa[].
 *
 * anypgsz == -1 means no large page is preallocated; otherwise
 * page_alloc_pages() is tried and, if it fails, the copy falls back to
 * pages obtained one at a time.
 *
 * Return values, as produced below:
 *	0	- either the first anon slot already had a single reference
 *		  and ppa[0] is already of size szc (nothing copied, ppa[]
 *		  pages left locked), or every page was copied: ppa[] now
 *		  holds the new pages, downgraded with page_downgrade(),
 *		  and the anon array points at the new anon slots.
 *	-1	- the first anon slot had a single reference but ppa[0] is
 *		  smaller than szc; all ppa[] pages have been unlocked.
 *	ENOMEM	- a page in the range is locked on a read-only mapping and
 *		  availrmem could not be claimed; ppa[] pages are unlocked.
 */
int
anon_map_privatepages(
	struct anon_map *amp,
	ulong_t start_idx,
	uint_t szc,
	struct seg *seg,
	caddr_t addr,
	uint_t prot,
	page_t *ppa[],
	struct vpage vpage[],
	int anypgsz,
	int pgflags,
	struct cred *cred)
{
	pgcnt_t pgcnt;
	struct vnode *vp;
	anoff_t off;
	page_t *pl[2], *conpp = NULL;
	int err;
	int prealloc = 1;
	struct anon *ap, *oldap;
	caddr_t vaddr;
	page_t *pplist, *pp;
	ulong_t pg_idx, an_idx;
	spgcnt_t nreloc = 0;
	int pagelock = 0;
	kmutex_t *ahmpages = NULL;
#ifdef DEBUG
	int refcnt;
#endif

	ASSERT(szc != 0);
	ASSERT(szc == seg->s_szc);

	VM_STAT_ADD(anonvmstats.privatepages[0]);

	pgcnt = page_get_pagecnt(szc);
	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
	ASSERT(IS_P2ALIGNED(start_idx, pgcnt));

	ASSERT(amp != NULL);
	ap = anon_get_ptr(amp->ahp, start_idx);
	ASSERT(ap == NULL || ap->an_refcnt >= 1);

	VM_STAT_COND_ADD(ap == NULL, anonvmstats.privatepages[1]);

	/*
	 * Now try and allocate the large page. If we fail then just
	 * let VOP_GETPAGE give us PAGESIZE pages. Normally we let
	 * the caller make this decision but to avoid added complexity
	 * it's simpler to handle that case here.
	 */
	if (anypgsz == -1) {
		VM_STAT_ADD(anonvmstats.privatepages[2]);
		prealloc = 0;
	} else if (page_alloc_pages(anon_vp, seg, addr, &pplist, NULL, szc,
	    anypgsz, pgflags) != 0) {
		VM_STAT_ADD(anonvmstats.privatepages[3]);
		prealloc = 0;
	}

	/*
	 * make the decrement of all refcnts of all
	 * anon slots of a large page appear atomic by
	 * getting an anonpages_hash_lock for the
	 * first anon slot of a large page.
	 */
	if (ap != NULL) {
		ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp,
		    ap->an_off)];
		mutex_enter(ahmpages);
		if (ap->an_refcnt == 1) {
			/*
			 * Only one reference to the first slot: nothing to
			 * copy.  Give back any preallocated large page.
			 */
			VM_STAT_ADD(anonvmstats.privatepages[4]);
			ASSERT(!anon_share(amp->ahp, start_idx, pgcnt));
			mutex_exit(ahmpages);

			if (prealloc) {
				page_free_replacement_page(pplist);
				page_create_putback(pgcnt);
			}
			ASSERT(ppa[0]->p_szc <= szc);
			if (ppa[0]->p_szc == szc) {
				VM_STAT_ADD(anonvmstats.privatepages[5]);
				return (0);
			}
			for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
				ASSERT(ppa[pg_idx] != NULL);
				page_unlock(ppa[pg_idx]);
			}
			return (-1);
		}
	}

	/*
	 * If we are passed in the vpage array and this is
	 * not PROT_WRITE then we need to decrement availrmem
	 * up front before we try anything. If we need to and
	 * can't decrement availrmem then its better to fail now
	 * than in the middle of processing the new large page.
	 * page_pp_useclaim() on behalf of each constituent page
	 * below will adjust availrmem back for the cases not needed.
	 */
	if (vpage != NULL && (prot & PROT_WRITE) == 0) {
		for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
			if (VPP_ISPPLOCK(&vpage[pg_idx])) {
				pagelock = 1;
				break;
			}
		}
		if (pagelock) {
			VM_STAT_ADD(anonvmstats.privatepages[6]);
			mutex_enter(&freemem_lock);
			if (availrmem >= pages_pp_maximum + pgcnt) {
				availrmem -= pgcnt;
				pages_useclaim += pgcnt;
			} else {
				/*
				 * Cannot claim the memory: release every
				 * lock and resource taken so far and fail.
				 */
				VM_STAT_ADD(anonvmstats.privatepages[7]);
				mutex_exit(&freemem_lock);
				if (ahmpages != NULL) {
					mutex_exit(ahmpages);
				}
				if (prealloc) {
					page_free_replacement_page(pplist);
					page_create_putback(pgcnt);
				}
				for (pg_idx = 0; pg_idx < pgcnt; pg_idx++)
					if (ppa[pg_idx] != NULL)
						page_unlock(ppa[pg_idx]);
				return (ENOMEM);
			}
			mutex_exit(&freemem_lock);
		}
	}

	CPU_STATS_ADD_K(vm, cow_fault, pgcnt);

	VM_STAT_ADD(anonvmstats.privatepages[8]);

	an_idx = start_idx;
	pg_idx = 0;
	vaddr = addr;
	for (; pg_idx < pgcnt; pg_idx++, an_idx++, vaddr += PAGESIZE) {
		ASSERT(ppa[pg_idx] != NULL);
		oldap = anon_get_ptr(amp->ahp, an_idx);
		/*
		 * DEBUG-only consistency checks: either no slot in the range
		 * has an anon (ahmpages == NULL) or all of them do and all
		 * share the reference count seen on the first slot.
		 */
		ASSERT(ahmpages != NULL || oldap == NULL);
		ASSERT(ahmpages == NULL || oldap != NULL);
		ASSERT(ahmpages == NULL || oldap->an_refcnt > 1);
		ASSERT(ahmpages == NULL || pg_idx != 0 ||
		    (refcnt = oldap->an_refcnt));
		ASSERT(ahmpages == NULL || pg_idx == 0 ||
		    refcnt == oldap->an_refcnt);

		ap = anon_alloc(NULL, 0);

		swap_xlate(ap, &vp, &off);

		/*
		 * Now setup our preallocated page to pass down to
		 * swap_getpage().
		 */
		if (prealloc) {
			pp = pplist;
			page_sub(&pplist, pp);
			conpp = pp;
		}

		err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE, NULL, pl,
		    PAGESIZE, conpp, NULL, &nreloc, seg, vaddr,
		    S_CREATE, cred);

		/*
		 * Impossible to fail this is S_CREATE.
		 */
		if (err)
			panic("anon_map_privatepages: VOP_GETPAGE failed");

		ASSERT(prealloc ? pp == pl[0] : pl[0]->p_szc == 0);
		ASSERT(prealloc == 0 || nreloc == 1);

		pp = pl[0];

		/*
		 * If the original page was locked, we need to move
		 * the lock to the new page by transferring
		 * 'cowcnt/lckcnt' of the original page to 'cowcnt/lckcnt'
		 * of the new page. pg_idx can be used to index
		 * into the vpage array since the caller will guarantee
		 * that vpage struct passed in corresponds to addr
		 * and forward.
		 */
		if (vpage != NULL && VPP_ISPPLOCK(&vpage[pg_idx])) {
			page_pp_useclaim(ppa[pg_idx], pp, prot & PROT_WRITE);
		} else if (pagelock) {
			/*
			 * This page was not locked after all: return the
			 * availrmem claimed for it up front.
			 */
			mutex_enter(&freemem_lock);
			availrmem++;
			pages_useclaim--;
			mutex_exit(&freemem_lock);
		}

		/*
		 * Now copy the contents from the original page.
		 */
		if (ppcopy(ppa[pg_idx], pp) == 0) {
			/*
			 * Before ppcopy could handle UE or other faults, we
			 * would have panicked here, and still have no option
			 * but to do so now.
			 */
			panic("anon_map_privatepages, ppcopy failed");
		}

		hat_setrefmod(pp);		/* mark as modified */

		/*
		 * Release the lock on the original page,
		 * decrement the old slot, and down grade the lock
		 * on the new copy.
		 */
		page_unlock(ppa[pg_idx]);

		/*
		 * Preallocated pages are downgraded together after the loop.
		 */
		if (!prealloc)
			page_downgrade(pp);

		ppa[pg_idx] = pp;

		/*
		 * Now reflect the copy in the new anon array.
		 */
		ASSERT(ahmpages == NULL || oldap->an_refcnt > 1);
		if (oldap != NULL)
			anon_decref(oldap);
		(void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP);
	}

	/*
	 * Unload the old large page translation.
	 */
	hat_unload(seg->s_as->a_hat, addr, pgcnt << PAGESHIFT, HAT_UNLOAD);

	if (ahmpages != NULL) {
		mutex_exit(ahmpages);
	}
	ASSERT(prealloc == 0 || pplist == NULL);
	if (prealloc) {
		VM_STAT_ADD(anonvmstats.privatepages[9]);
		for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
			page_downgrade(ppa[pg_idx]);
		}
	}

	return (0);
}

/*
 * Allocate a private zero-filled anon page.
 *
 * On success *app holds the new anon and the zeroed page is returned after
 * page_downgrade().  If VOP_GETPAGE() fails, *app is set to NULL, the anon
 * is released and NULL is returned.
 */
page_t *
anon_zero(struct seg *seg, caddr_t addr, struct anon **app, struct cred *cred)
{
	struct anon *ap;
	page_t *pp;
	struct vnode *vp;
	anoff_t off;
	page_t *anon_pl[1 + 1];
	int err;

	/* Kernel probe */
	TNF_PROBE_1(anon_zero, "vm pagefault", /* CSTYLED */,
	    tnf_opaque, address, addr);

	*app = ap = anon_alloc(NULL, 0);
	swap_xlate(ap, &vp, &off);

	/*
	 * Call the VOP_GETPAGE routine to create the page, thereby
	 * enabling the vnode driver to allocate any filesystem
	 * dependent structures (e.g., disk block allocation for UFS).
	 * This also prevents more than one page from being added to
	 * the vnode at the same time since it is locked.
	 */
	err = VOP_GETPAGE(vp, off, PAGESIZE, NULL,
	    anon_pl, PAGESIZE, seg, addr, S_CREATE, cred, NULL);
	if (err) {
		*app = NULL;
		anon_decref(ap);
		return (NULL);
	}
	pp = anon_pl[0];

	pagezero(pp, 0, PAGESIZE);	/* XXX - should set mod bit */
	page_downgrade(pp);
	CPU_STATS_ADD_K(vm, zfod, 1);
	hat_setrefmod(pp);	/* mark as modified so pageout writes back */
	return (pp);
}


/*
 * Allocate array of private zero-filled anon pages for empty slots
 * and kept pages for non empty slots within given range.
 *
 * NOTE: This routine will try and use large pages
 *	if available and supported by underlying platform.
2749 */ 2750 int 2751 anon_map_createpages( 2752 struct anon_map *amp, 2753 ulong_t start_index, 2754 size_t len, 2755 page_t *ppa[], 2756 struct seg *seg, 2757 caddr_t addr, 2758 enum seg_rw rw, 2759 struct cred *cred) 2760 { 2761 2762 struct anon *ap; 2763 struct vnode *ap_vp; 2764 page_t *pp, *pplist, *anon_pl[1 + 1], *conpp = NULL; 2765 int err = 0; 2766 ulong_t p_index, index; 2767 pgcnt_t npgs, pg_cnt; 2768 spgcnt_t nreloc = 0; 2769 uint_t l_szc, szc, prot; 2770 anoff_t ap_off; 2771 size_t pgsz; 2772 lgrp_t *lgrp; 2773 kmutex_t *ahm; 2774 2775 /* 2776 * XXX For now only handle S_CREATE. 2777 */ 2778 ASSERT(rw == S_CREATE); 2779 2780 index = start_index; 2781 p_index = 0; 2782 npgs = btopr(len); 2783 2784 /* 2785 * If this platform supports multiple page sizes 2786 * then try and allocate directly from the free 2787 * list for pages larger than PAGESIZE. 2788 * 2789 * NOTE:When we have page_create_ru we can stop 2790 * directly allocating from the freelist. 2791 */ 2792 l_szc = seg->s_szc; 2793 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 2794 while (npgs) { 2795 2796 /* 2797 * if anon slot already exists 2798 * (means page has been created) 2799 * so 1) look up the page 2800 * 2) if the page is still in memory, get it. 2801 * 3) if not, create a page and 2802 * page in from physical swap device. 2803 * These are done in anon_getpage(). 2804 */ 2805 ap = anon_get_ptr(amp->ahp, index); 2806 if (ap) { 2807 err = anon_getpage(&ap, &prot, anon_pl, PAGESIZE, 2808 seg, addr, S_READ, cred); 2809 if (err) { 2810 ANON_LOCK_EXIT(&->a_rwlock); 2811 panic("anon_map_createpages: anon_getpage"); 2812 } 2813 pp = anon_pl[0]; 2814 ppa[p_index++] = pp; 2815 2816 /* 2817 * an_pvp can become non-NULL after SysV's page was 2818 * paged out before ISM was attached to this SysV 2819 * shared memory segment. So free swap slot if needed. 
2820 */ 2821 if (ap->an_pvp != NULL) { 2822 page_io_lock(pp); 2823 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, 2824 ap->an_off)]; 2825 mutex_enter(ahm); 2826 if (ap->an_pvp != NULL) { 2827 swap_phys_free(ap->an_pvp, 2828 ap->an_poff, PAGESIZE); 2829 ap->an_pvp = NULL; 2830 ap->an_poff = 0; 2831 mutex_exit(ahm); 2832 hat_setmod(pp); 2833 } else { 2834 mutex_exit(ahm); 2835 } 2836 page_io_unlock(pp); 2837 } 2838 2839 addr += PAGESIZE; 2840 index++; 2841 npgs--; 2842 continue; 2843 } 2844 /* 2845 * Now try and allocate the largest page possible 2846 * for the current address and range. 2847 * Keep dropping down in page size until: 2848 * 2849 * 1) Properly aligned 2850 * 2) Does not overlap existing anon pages 2851 * 3) Fits in remaining range. 2852 * 4) able to allocate one. 2853 * 2854 * NOTE: XXX When page_create_ru is completed this code 2855 * will change. 2856 */ 2857 szc = l_szc; 2858 pplist = NULL; 2859 pg_cnt = 0; 2860 while (szc) { 2861 pgsz = page_get_pagesize(szc); 2862 pg_cnt = pgsz >> PAGESHIFT; 2863 if (IS_P2ALIGNED(addr, pgsz) && pg_cnt <= npgs && 2864 anon_pages(amp->ahp, index, pg_cnt) == 0) { 2865 /* 2866 * XXX 2867 * Since we are faking page_create() 2868 * we also need to do the freemem and 2869 * pcf accounting. 2870 */ 2871 (void) page_create_wait(pg_cnt, PG_WAIT); 2872 2873 /* 2874 * Get lgroup to allocate next page of shared 2875 * memory from and use it to specify where to 2876 * allocate the physical memory 2877 */ 2878 lgrp = lgrp_mem_choose(seg, addr, pgsz); 2879 2880 pplist = page_get_freelist( 2881 anon_vp, (u_offset_t)0, seg, 2882 addr, pgsz, 0, lgrp); 2883 2884 if (pplist == NULL) { 2885 page_create_putback(pg_cnt); 2886 } 2887 2888 /* 2889 * If a request for a page of size 2890 * larger than PAGESIZE failed 2891 * then don't try that size anymore. 
2892 */ 2893 if (pplist == NULL) { 2894 l_szc = szc - 1; 2895 } else { 2896 break; 2897 } 2898 } 2899 szc--; 2900 } 2901 2902 /* 2903 * If just using PAGESIZE pages then don't 2904 * directly allocate from the free list. 2905 */ 2906 if (pplist == NULL) { 2907 ASSERT(szc == 0); 2908 pp = anon_zero(seg, addr, &ap, cred); 2909 if (pp == NULL) { 2910 ANON_LOCK_EXIT(&->a_rwlock); 2911 panic("anon_map_createpages: anon_zero"); 2912 } 2913 ppa[p_index++] = pp; 2914 2915 ASSERT(anon_get_ptr(amp->ahp, index) == NULL); 2916 (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP); 2917 2918 addr += PAGESIZE; 2919 index++; 2920 npgs--; 2921 continue; 2922 } 2923 2924 /* 2925 * pplist is a list of pg_cnt PAGESIZE pages. 2926 * These pages are locked SE_EXCL since they 2927 * came directly off the free list. 2928 */ 2929 ASSERT(IS_P2ALIGNED(pg_cnt, pg_cnt)); 2930 ASSERT(IS_P2ALIGNED(index, pg_cnt)); 2931 ASSERT(conpp == NULL); 2932 while (pg_cnt--) { 2933 2934 ap = anon_alloc(NULL, 0); 2935 swap_xlate(ap, &ap_vp, &ap_off); 2936 2937 ASSERT(pplist != NULL); 2938 pp = pplist; 2939 page_sub(&pplist, pp); 2940 PP_CLRFREE(pp); 2941 PP_CLRAGED(pp); 2942 conpp = pp; 2943 2944 err = swap_getconpage(ap_vp, ap_off, PAGESIZE, 2945 (uint_t *)NULL, anon_pl, PAGESIZE, conpp, NULL, 2946 &nreloc, seg, addr, S_CREATE, cred); 2947 2948 if (err) { 2949 ANON_LOCK_EXIT(&->a_rwlock); 2950 panic("anon_map_createpages: S_CREATE"); 2951 } 2952 2953 ASSERT(anon_pl[0] == pp); 2954 ASSERT(nreloc == 1); 2955 pagezero(pp, 0, PAGESIZE); 2956 CPU_STATS_ADD_K(vm, zfod, 1); 2957 hat_setrefmod(pp); 2958 2959 ASSERT(anon_get_ptr(amp->ahp, index) == NULL); 2960 (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP); 2961 2962 ppa[p_index++] = pp; 2963 2964 addr += PAGESIZE; 2965 index++; 2966 npgs--; 2967 } 2968 conpp = NULL; 2969 pg_cnt = pgsz >> PAGESHIFT; 2970 p_index = p_index - pg_cnt; 2971 while (pg_cnt--) { 2972 page_downgrade(ppa[p_index++]); 2973 } 2974 } 2975 ANON_LOCK_EXIT(&->a_rwlock); 2976 return (0); 2977 
} 2978 2979 static int 2980 anon_try_demote_pages( 2981 struct anon_hdr *ahp, 2982 ulong_t sidx, 2983 uint_t szc, 2984 page_t **ppa, 2985 int private) 2986 { 2987 struct anon *ap; 2988 pgcnt_t pgcnt = page_get_pagecnt(szc); 2989 page_t *pp; 2990 pgcnt_t i; 2991 kmutex_t *ahmpages = NULL; 2992 int root = 0; 2993 pgcnt_t npgs; 2994 pgcnt_t curnpgs = 0; 2995 size_t ppasize = 0; 2996 2997 ASSERT(szc != 0); 2998 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 2999 ASSERT(IS_P2ALIGNED(sidx, pgcnt)); 3000 ASSERT(sidx < ahp->size); 3001 3002 if (ppa == NULL) { 3003 ppasize = pgcnt * sizeof (page_t *); 3004 ppa = kmem_alloc(ppasize, KM_SLEEP); 3005 } 3006 3007 ap = anon_get_ptr(ahp, sidx); 3008 if (ap != NULL && private) { 3009 VM_STAT_ADD(anonvmstats.demotepages[1]); 3010 ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 3011 mutex_enter(ahmpages); 3012 } 3013 3014 if (ap != NULL && ap->an_refcnt > 1) { 3015 if (ahmpages != NULL) { 3016 VM_STAT_ADD(anonvmstats.demotepages[2]); 3017 mutex_exit(ahmpages); 3018 } 3019 if (ppasize != 0) { 3020 kmem_free(ppa, ppasize); 3021 } 3022 return (0); 3023 } 3024 if (ahmpages != NULL) { 3025 mutex_exit(ahmpages); 3026 } 3027 if (ahp->size - sidx < pgcnt) { 3028 ASSERT(private == 0); 3029 pgcnt = ahp->size - sidx; 3030 } 3031 for (i = 0; i < pgcnt; i++, sidx++) { 3032 ap = anon_get_ptr(ahp, sidx); 3033 if (ap != NULL) { 3034 if (ap->an_refcnt != 1) { 3035 panic("anon_try_demote_pages: an_refcnt != 1"); 3036 } 3037 pp = ppa[i] = page_lookup(ap->an_vp, ap->an_off, 3038 SE_EXCL); 3039 if (pp != NULL) { 3040 (void) hat_pageunload(pp, 3041 HAT_FORCE_PGUNLOAD); 3042 } 3043 } else { 3044 ppa[i] = NULL; 3045 } 3046 } 3047 for (i = 0; i < pgcnt; i++) { 3048 if ((pp = ppa[i]) != NULL && pp->p_szc != 0) { 3049 ASSERT(pp->p_szc <= szc); 3050 if (!root) { 3051 VM_STAT_ADD(anonvmstats.demotepages[3]); 3052 if (curnpgs != 0) 3053 panic("anon_try_demote_pages: " 3054 "bad large page"); 3055 3056 root = 1; 3057 curnpgs = npgs = 3058 
page_get_pagecnt(pp->p_szc); 3059 3060 ASSERT(npgs <= pgcnt); 3061 ASSERT(IS_P2ALIGNED(npgs, npgs)); 3062 ASSERT(!(page_pptonum(pp) & (npgs - 1))); 3063 } else { 3064 ASSERT(i > 0); 3065 ASSERT(page_pptonum(pp) - 1 == 3066 page_pptonum(ppa[i - 1])); 3067 if ((page_pptonum(pp) & (npgs - 1)) == 3068 npgs - 1) 3069 root = 0; 3070 } 3071 ASSERT(PAGE_EXCL(pp)); 3072 pp->p_szc = 0; 3073 ASSERT(curnpgs > 0); 3074 curnpgs--; 3075 } 3076 } 3077 if (root != 0 || curnpgs != 0) 3078 panic("anon_try_demote_pages: bad large page"); 3079 3080 for (i = 0; i < pgcnt; i++) { 3081 if ((pp = ppa[i]) != NULL) { 3082 ASSERT(!hat_page_is_mapped(pp)); 3083 ASSERT(pp->p_szc == 0); 3084 page_unlock(pp); 3085 } 3086 } 3087 if (ppasize != 0) { 3088 kmem_free(ppa, ppasize); 3089 } 3090 return (1); 3091 } 3092 3093 /* 3094 * anon_map_demotepages() can only be called by MAP_PRIVATE segments. 3095 */ 3096 int 3097 anon_map_demotepages( 3098 struct anon_map *amp, 3099 ulong_t start_idx, 3100 struct seg *seg, 3101 caddr_t addr, 3102 uint_t prot, 3103 struct vpage vpage[], 3104 struct cred *cred) 3105 { 3106 struct anon *ap; 3107 uint_t szc = seg->s_szc; 3108 pgcnt_t pgcnt = page_get_pagecnt(szc); 3109 size_t ppasize = pgcnt * sizeof (page_t *); 3110 page_t **ppa = kmem_alloc(ppasize, KM_SLEEP); 3111 page_t *pp; 3112 page_t *pl[2]; 3113 pgcnt_t i, pg_idx; 3114 ulong_t an_idx; 3115 caddr_t vaddr; 3116 int err; 3117 int retry = 0; 3118 uint_t vpprot; 3119 3120 ASSERT(RW_WRITE_HELD(&->a_rwlock)); 3121 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 3122 ASSERT(IS_P2ALIGNED(start_idx, pgcnt)); 3123 ASSERT(ppa != NULL); 3124 ASSERT(szc != 0); 3125 ASSERT(szc == amp->a_szc); 3126 3127 VM_STAT_ADD(anonvmstats.demotepages[0]); 3128 3129 top: 3130 if (anon_try_demote_pages(amp->ahp, start_idx, szc, ppa, 1)) { 3131 kmem_free(ppa, ppasize); 3132 return (0); 3133 } 3134 3135 VM_STAT_ADD(anonvmstats.demotepages[4]); 3136 3137 ASSERT(retry == 0); /* we can be here only once */ 3138 3139 vaddr = addr; 3140 for (pg_idx = 0, 
an_idx = start_idx; pg_idx < pgcnt; 3141 pg_idx++, an_idx++, vaddr += PAGESIZE) { 3142 ap = anon_get_ptr(amp->ahp, an_idx); 3143 if (ap == NULL) 3144 panic("anon_map_demotepages: no anon slot"); 3145 err = anon_getpage(&ap, &vpprot, pl, PAGESIZE, seg, vaddr, 3146 S_READ, cred); 3147 if (err) { 3148 for (i = 0; i < pg_idx; i++) { 3149 if ((pp = ppa[i]) != NULL) 3150 page_unlock(pp); 3151 } 3152 kmem_free(ppa, ppasize); 3153 return (err); 3154 } 3155 ppa[pg_idx] = pl[0]; 3156 } 3157 3158 err = anon_map_privatepages(amp, start_idx, szc, seg, addr, prot, ppa, 3159 vpage, -1, 0, cred); 3160 if (err > 0) { 3161 VM_STAT_ADD(anonvmstats.demotepages[5]); 3162 kmem_free(ppa, ppasize); 3163 return (err); 3164 } 3165 ASSERT(err == 0 || err == -1); 3166 if (err == -1) { 3167 VM_STAT_ADD(anonvmstats.demotepages[6]); 3168 retry = 1; 3169 goto top; 3170 } 3171 for (i = 0; i < pgcnt; i++) { 3172 ASSERT(ppa[i] != NULL); 3173 if (ppa[i]->p_szc != 0) 3174 retry = 1; 3175 page_unlock(ppa[i]); 3176 } 3177 if (retry) { 3178 VM_STAT_ADD(anonvmstats.demotepages[7]); 3179 goto top; 3180 } 3181 3182 VM_STAT_ADD(anonvmstats.demotepages[8]); 3183 3184 kmem_free(ppa, ppasize); 3185 3186 return (0); 3187 } 3188 3189 /* 3190 * Free pages of shared anon map. It's assumed that anon maps don't share anon 3191 * structures with private anon maps. Therefore all anon structures should 3192 * have at most one reference at this point. This means underlying pages can 3193 * be exclusively locked and demoted or freed. If not freeing the entire 3194 * large pages demote the ends of the region we free to be able to free 3195 * subpages. Page roots correspond to aligned index positions in anon map. 
3196 */ 3197 void 3198 anon_shmap_free_pages(struct anon_map *amp, ulong_t sidx, size_t len) 3199 { 3200 ulong_t eidx = sidx + btopr(len); 3201 pgcnt_t pages = page_get_pagecnt(amp->a_szc); 3202 struct anon_hdr *ahp = amp->ahp; 3203 ulong_t tidx; 3204 size_t size; 3205 ulong_t sidx_aligned; 3206 ulong_t eidx_aligned; 3207 3208 ASSERT(ANON_WRITE_HELD(&->a_rwlock)); 3209 ASSERT(amp->refcnt <= 1); 3210 ASSERT(amp->a_szc > 0); 3211 ASSERT(eidx <= ahp->size); 3212 ASSERT(!anon_share(ahp, sidx, btopr(len))); 3213 3214 if (len == 0) { /* XXX */ 3215 return; 3216 } 3217 3218 sidx_aligned = P2ALIGN(sidx, pages); 3219 if (sidx_aligned != sidx || 3220 (eidx < sidx_aligned + pages && eidx < ahp->size)) { 3221 if (!anon_try_demote_pages(ahp, sidx_aligned, 3222 amp->a_szc, NULL, 0)) { 3223 panic("anon_shmap_free_pages: demote failed"); 3224 } 3225 size = (eidx <= sidx_aligned + pages) ? (eidx - sidx) : 3226 P2NPHASE(sidx, pages); 3227 size <<= PAGESHIFT; 3228 anon_free(ahp, sidx, size); 3229 sidx = sidx_aligned + pages; 3230 if (eidx <= sidx) { 3231 return; 3232 } 3233 } 3234 eidx_aligned = P2ALIGN(eidx, pages); 3235 if (sidx < eidx_aligned) { 3236 anon_free_pages(ahp, sidx, 3237 (eidx_aligned - sidx) << PAGESHIFT, 3238 amp->a_szc); 3239 sidx = eidx_aligned; 3240 } 3241 ASSERT(sidx == eidx_aligned); 3242 if (eidx == eidx_aligned) { 3243 return; 3244 } 3245 tidx = eidx; 3246 if (eidx != ahp->size && anon_get_next_ptr(ahp, &tidx) != NULL && 3247 tidx - sidx < pages) { 3248 if (!anon_try_demote_pages(ahp, sidx, amp->a_szc, NULL, 0)) { 3249 panic("anon_shmap_free_pages: demote failed"); 3250 } 3251 size = (eidx - sidx) << PAGESHIFT; 3252 anon_free(ahp, sidx, size); 3253 } else { 3254 anon_free_pages(ahp, sidx, pages << PAGESHIFT, amp->a_szc); 3255 } 3256 } 3257 3258 /* 3259 * This routine should be called with amp's writer lock when there're no other 3260 * users of amp. All pcache entries of this amp must have been already 3261 * inactivated. 
We must not drop a_rwlock here to prevent new users from
 * attaching to this amp.
 */
void
anonmap_purge(struct anon_map *amp)
{
	ASSERT(ANON_WRITE_HELD(&(amp->a_rwlock)));
	ASSERT(amp->refcnt <= 1);

	if (amp->a_softlockcnt != 0) {
		seg_ppurge(NULL, amp, 0);
	}

	/*
	 * Since all pcache entries were already inactive before this routine
	 * was called seg_ppurge() couldn't return while there're still
	 * entries that can be found via the list anchored at a_phead. So we
	 * can assert this list is empty now. a_softlockcnt may be still non 0
	 * if asynchronous thread that manages pcache already removed pcache
	 * entries but hasn't unlocked the pages yet. If a_softlockcnt is non
	 * 0 we just wait on a_purgecv for shamp_reclaim() to finish. Even if
	 * a_softlockcnt is 0 we grab a_purgemtx to avoid freeing anon map
	 * before shamp_reclaim() is done with it. a_purgemtx also taken by
	 * shamp_reclaim() while a_softlockcnt was still not 0 acts as a
	 * barrier that prevents anonmap_purge() to complete while
	 * shamp_reclaim() may still be referencing this amp.
	 */
	ASSERT(amp->a_phead.p_lnext == &(amp->a_phead));
	ASSERT(amp->a_phead.p_lprev == &(amp->a_phead));

	mutex_enter(&(amp->a_purgemtx));
	while (amp->a_softlockcnt != 0) {
		ASSERT(amp->a_phead.p_lnext == &(amp->a_phead));
		ASSERT(amp->a_phead.p_lprev == &(amp->a_phead));
		/* Record that a waiter exists before sleeping on a_purgecv. */
		amp->a_purgewait = 1;
		cv_wait(&(amp->a_purgecv), &(amp->a_purgemtx));
	}
	mutex_exit(&(amp->a_purgemtx));

	ASSERT(amp->a_phead.p_lnext == &(amp->a_phead));
	ASSERT(amp->a_phead.p_lprev == &(amp->a_phead));
	ASSERT(amp->a_softlockcnt == 0);
}

/*
 * Allocate and initialize an anon_map structure for seg
 * associating the given swap reservation with the new anon_map.
3308 */ 3309 struct anon_map * 3310 anonmap_alloc(size_t size, size_t swresv, int flags) 3311 { 3312 struct anon_map *amp; 3313 int kmflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 3314 3315 amp = kmem_cache_alloc(anonmap_cache, kmflags); 3316 if (amp == NULL) { 3317 ASSERT(kmflags == KM_NOSLEEP); 3318 return (NULL); 3319 } 3320 3321 amp->ahp = anon_create(btopr(size), flags); 3322 if (amp->ahp == NULL) { 3323 ASSERT(flags == ANON_NOSLEEP); 3324 kmem_cache_free(anonmap_cache, amp); 3325 return (NULL); 3326 } 3327 amp->refcnt = 1; 3328 amp->size = size; 3329 amp->swresv = swresv; 3330 amp->locality = 0; 3331 amp->a_szc = 0; 3332 amp->a_sp = NULL; 3333 amp->a_softlockcnt = 0; 3334 amp->a_purgewait = 0; 3335 amp->a_phead.p_lnext = &->a_phead; 3336 amp->a_phead.p_lprev = &->a_phead; 3337 3338 return (amp); 3339 } 3340 3341 void 3342 anonmap_free(struct anon_map *amp) 3343 { 3344 ASSERT(amp->ahp != NULL); 3345 ASSERT(amp->refcnt == 0); 3346 ASSERT(amp->a_softlockcnt == 0); 3347 ASSERT(amp->a_phead.p_lnext == &->a_phead); 3348 ASSERT(amp->a_phead.p_lprev == &->a_phead); 3349 3350 lgrp_shm_policy_fini(amp, NULL); 3351 anon_release(amp->ahp, btopr(amp->size)); 3352 kmem_cache_free(anonmap_cache, amp); 3353 } 3354 3355 /* 3356 * Returns true if the app array has some empty slots. 3357 * The offp and lenp parameters are in/out parameters. On entry 3358 * these values represent the starting offset and length of the 3359 * mapping. When true is returned, these values may be modified 3360 * to be the largest range which includes empty slots. 
3361 */ 3362 int 3363 non_anon(struct anon_hdr *ahp, ulong_t anon_idx, u_offset_t *offp, 3364 size_t *lenp) 3365 { 3366 ulong_t i, el; 3367 ssize_t low, high; 3368 struct anon *ap; 3369 3370 low = -1; 3371 for (i = 0, el = *lenp; i < el; i += PAGESIZE, anon_idx++) { 3372 ap = anon_get_ptr(ahp, anon_idx); 3373 if (ap == NULL) { 3374 if (low == -1) 3375 low = i; 3376 high = i; 3377 } 3378 } 3379 if (low != -1) { 3380 /* 3381 * Found at least one non-anon page. 3382 * Set up the off and len return values. 3383 */ 3384 if (low != 0) 3385 *offp += low; 3386 *lenp = high - low + PAGESIZE; 3387 return (1); 3388 } 3389 return (0); 3390 } 3391 3392 /* 3393 * Return a count of the number of existing anon pages in the anon array 3394 * app in the range (off, off+len). The array and slots must be guaranteed 3395 * stable by the caller. 3396 */ 3397 pgcnt_t 3398 anon_pages(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots) 3399 { 3400 pgcnt_t cnt = 0; 3401 3402 while (nslots-- > 0) { 3403 if ((anon_get_ptr(ahp, anon_index)) != NULL) 3404 cnt++; 3405 anon_index++; 3406 } 3407 return (cnt); 3408 } 3409 3410 /* 3411 * Move reserved phys swap into memory swap (unreserve phys swap 3412 * and reserve mem swap by the same amount). 
3413 * Used by segspt when it needs to lock reserved swap npages in memory 3414 */ 3415 int 3416 anon_swap_adjust(pgcnt_t npages) 3417 { 3418 pgcnt_t unlocked_mem_swap; 3419 3420 mutex_enter(&anoninfo_lock); 3421 3422 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 3423 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 3424 3425 unlocked_mem_swap = k_anoninfo.ani_mem_resv 3426 - k_anoninfo.ani_locked_swap; 3427 if (npages > unlocked_mem_swap) { 3428 spgcnt_t adjusted_swap = npages - unlocked_mem_swap; 3429 3430 /* 3431 * if there is not enough unlocked mem swap we take missing 3432 * amount from phys swap and give it to mem swap 3433 */ 3434 if (!page_reclaim_mem(adjusted_swap, segspt_minfree, 1)) { 3435 mutex_exit(&anoninfo_lock); 3436 return (ENOMEM); 3437 } 3438 3439 k_anoninfo.ani_mem_resv += adjusted_swap; 3440 ASSERT(k_anoninfo.ani_phys_resv >= adjusted_swap); 3441 k_anoninfo.ani_phys_resv -= adjusted_swap; 3442 3443 ANI_ADD(adjusted_swap); 3444 } 3445 k_anoninfo.ani_locked_swap += npages; 3446 3447 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 3448 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 3449 3450 mutex_exit(&anoninfo_lock); 3451 3452 return (0); 3453 } 3454 3455 /* 3456 * 'unlocked' reserved mem swap so when it is unreserved it 3457 * can be moved back phys (disk) swap 3458 */ 3459 void 3460 anon_swap_restore(pgcnt_t npages) 3461 { 3462 mutex_enter(&anoninfo_lock); 3463 3464 ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv); 3465 3466 ASSERT(k_anoninfo.ani_locked_swap >= npages); 3467 k_anoninfo.ani_locked_swap -= npages; 3468 3469 ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv); 3470 3471 mutex_exit(&anoninfo_lock); 3472 } 3473 3474 /* 3475 * Return the pointer from the list for a 3476 * specified anon index. 
3477 */ 3478 ulong_t * 3479 anon_get_slot(struct anon_hdr *ahp, ulong_t an_idx) 3480 { 3481 struct anon **app; 3482 void **ppp; 3483 3484 ASSERT(an_idx < ahp->size); 3485 3486 /* 3487 * Single level case. 3488 */ 3489 if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 3490 return ((ulong_t *)&ahp->array_chunk[an_idx]); 3491 } else { 3492 3493 /* 3494 * 2 level case. 3495 */ 3496 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 3497 if (*ppp == NULL) { 3498 mutex_enter(&ahp->serial_lock); 3499 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 3500 if (*ppp == NULL) 3501 *ppp = kmem_zalloc(PAGESIZE, KM_SLEEP); 3502 mutex_exit(&ahp->serial_lock); 3503 } 3504 app = *ppp; 3505 return ((ulong_t *)&app[an_idx & ANON_CHUNK_OFF]); 3506 } 3507 } 3508 3509 void 3510 anon_array_enter(struct anon_map *amp, ulong_t an_idx, anon_sync_obj_t *sobj) 3511 { 3512 ulong_t *ap_slot; 3513 kmutex_t *mtx; 3514 kcondvar_t *cv; 3515 int hash; 3516 3517 /* 3518 * Use szc to determine anon slot(s) to appear atomic. 3519 * If szc = 0, then lock the anon slot and mark it busy. 3520 * If szc > 0, then lock the range of slots by getting the 3521 * anon_array_lock for the first anon slot, and mark only the 3522 * first anon slot busy to represent whole range being busy. 3523 */ 3524 3525 ASSERT(RW_READ_HELD(&->a_rwlock)); 3526 an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc)); 3527 hash = ANON_ARRAY_HASH(amp, an_idx); 3528 sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex; 3529 sobj->sync_cv = cv = &anon_array_cv[hash]; 3530 mutex_enter(mtx); 3531 ap_slot = anon_get_slot(amp->ahp, an_idx); 3532 while (ANON_ISBUSY(ap_slot)) 3533 cv_wait(cv, mtx); 3534 ANON_SETBUSY(ap_slot); 3535 sobj->sync_data = ap_slot; 3536 mutex_exit(mtx); 3537 } 3538 3539 int 3540 anon_array_try_enter(struct anon_map *amp, ulong_t an_idx, 3541 anon_sync_obj_t *sobj) 3542 { 3543 ulong_t *ap_slot; 3544 kmutex_t *mtx; 3545 int hash; 3546 3547 /* 3548 * Try to lock a range of anon slots. 
3549 * Use szc to determine anon slot(s) to appear atomic. 3550 * If szc = 0, then lock the anon slot and mark it busy. 3551 * If szc > 0, then lock the range of slots by getting the 3552 * anon_array_lock for the first anon slot, and mark only the 3553 * first anon slot busy to represent whole range being busy. 3554 * Fail if the mutex or the anon_array are busy. 3555 */ 3556 3557 ASSERT(RW_READ_HELD(&->a_rwlock)); 3558 an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc)); 3559 hash = ANON_ARRAY_HASH(amp, an_idx); 3560 sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex; 3561 sobj->sync_cv = &anon_array_cv[hash]; 3562 if (!mutex_tryenter(mtx)) { 3563 return (EWOULDBLOCK); 3564 } 3565 ap_slot = anon_get_slot(amp->ahp, an_idx); 3566 if (ANON_ISBUSY(ap_slot)) { 3567 mutex_exit(mtx); 3568 return (EWOULDBLOCK); 3569 } 3570 ANON_SETBUSY(ap_slot); 3571 sobj->sync_data = ap_slot; 3572 mutex_exit(mtx); 3573 return (0); 3574 } 3575 3576 void 3577 anon_array_exit(anon_sync_obj_t *sobj) 3578 { 3579 mutex_enter(sobj->sync_mutex); 3580 ASSERT(ANON_ISBUSY(sobj->sync_data)); 3581 ANON_CLRBUSY(sobj->sync_data); 3582 if (CV_HAS_WAITERS(sobj->sync_cv)) 3583 cv_broadcast(sobj->sync_cv); 3584 mutex_exit(sobj->sync_mutex); 3585 } 3586