1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 37 */ 38 39 /* 40 * VM - anonymous pages. 41 * 42 * This layer sits immediately above the vm_swap layer. It manages 43 * physical pages that have no permanent identity in the file system 44 * name space, using the services of the vm_swap layer to allocate 45 * backing storage for these pages. Since these pages have no external 46 * identity, they are discarded when the last reference is removed. 
*
 * An important function of this layer is to manage low-level sharing
 * of pages that are logically distinct but that happen to be
 * physically identical (e.g., the corresponding pages of the processes
 * resulting from a fork before one process or the other changes their
 * contents).  This pseudo-sharing is present only as an optimization
 * and is not to be confused with true sharing in which multiple
 * address spaces deliberately contain references to the same object;
 * such sharing is managed at a higher level.
 *
 * The key data structure here is the anon struct, which contains a
 * reference count for its associated physical page and a hint about
 * the identity of that page.  Anon structs typically live in arrays,
 * with an instance's position in its array determining where the
 * corresponding backing storage is allocated; however, the swap_xlate()
 * routine abstracts away this representation information so that the
 * rest of the anon layer need not know it.  (See the swap layer for
 * more details on anon struct layout.)
 *
 * In future versions of the system, the association between an
 * anon struct and its position on backing store will change so that
 * we don't require backing store for all anonymous pages in the system.
 * This is an important consideration for large memory systems.
 * We can also use this technique to delay binding physical locations
 * to anonymous pages until pageout/swapout time where we can make
 * smarter allocation decisions to improve anonymous klustering.
 *
 * Many of the routines defined here take a (struct anon **) argument,
 * which allows the code at this level to manage anon pages directly,
 * so that callers can regard anon structs as opaque objects and not be
 * concerned with assigning or inspecting their contents.
 *
 * Clients of this layer refer to anon pages indirectly.
That is, they 80 * maintain arrays of pointers to anon structs rather than maintaining 81 * anon structs themselves. The (struct anon **) arguments mentioned 82 * above are pointers to entries in these arrays. It is these arrays 83 * that capture the mapping between offsets within a given segment and 84 * the corresponding anonymous backing storage address. 85 */ 86 87 #ifdef DEBUG 88 #define ANON_DEBUG 89 #endif 90 91 #include <sys/types.h> 92 #include <sys/t_lock.h> 93 #include <sys/param.h> 94 #include <sys/systm.h> 95 #include <sys/mman.h> 96 #include <sys/cred.h> 97 #include <sys/thread.h> 98 #include <sys/vnode.h> 99 #include <sys/cpuvar.h> 100 #include <sys/swap.h> 101 #include <sys/cmn_err.h> 102 #include <sys/vtrace.h> 103 #include <sys/kmem.h> 104 #include <sys/sysmacros.h> 105 #include <sys/bitmap.h> 106 #include <sys/vmsystm.h> 107 #include <sys/tuneable.h> 108 #include <sys/debug.h> 109 #include <sys/fs/swapnode.h> 110 #include <sys/tnf_probe.h> 111 #include <sys/lgrp.h> 112 #include <sys/policy.h> 113 #include <sys/condvar_impl.h> 114 #include <sys/mutex_impl.h> 115 #include <sys/rctl.h> 116 117 #include <vm/as.h> 118 #include <vm/hat.h> 119 #include <vm/anon.h> 120 #include <vm/page.h> 121 #include <vm/vpage.h> 122 #include <vm/seg.h> 123 #include <vm/rm.h> 124 125 #include <fs/fs_subr.h> 126 127 struct vnode *anon_vp; 128 129 int anon_debug; 130 131 kmutex_t anoninfo_lock; 132 struct k_anoninfo k_anoninfo; 133 ani_free_t ani_free_pool[ANI_MAX_POOL]; 134 pad_mutex_t anon_array_lock[ANON_LOCKSIZE]; 135 kcondvar_t anon_array_cv[ANON_LOCKSIZE]; 136 137 /* 138 * Global hash table for (vp, off) -> anon slot 139 */ 140 extern int swap_maxcontig; 141 size_t anon_hash_size; 142 struct anon **anon_hash; 143 144 static struct kmem_cache *anon_cache; 145 static struct kmem_cache *anonmap_cache; 146 147 #ifdef VM_STATS 148 static struct anonvmstats_str { 149 ulong_t getpages[30]; 150 ulong_t privatepages[10]; 151 ulong_t demotepages[9]; 152 ulong_t 
decrefpages[9]; 153 ulong_t dupfillholes[4]; 154 ulong_t freepages[1]; 155 } anonvmstats; 156 #endif /* VM_STATS */ 157 158 /*ARGSUSED*/ 159 static int 160 anonmap_cache_constructor(void *buf, void *cdrarg, int kmflags) 161 { 162 struct anon_map *amp = buf; 163 164 rw_init(&->a_rwlock, NULL, RW_DEFAULT, NULL); 165 cv_init(&->a_purgecv, NULL, CV_DEFAULT, NULL); 166 mutex_init(&->a_pmtx, NULL, MUTEX_DEFAULT, NULL); 167 mutex_init(&->a_purgemtx, NULL, MUTEX_DEFAULT, NULL); 168 return (0); 169 } 170 171 /*ARGSUSED1*/ 172 static void 173 anonmap_cache_destructor(void *buf, void *cdrarg) 174 { 175 struct anon_map *amp = buf; 176 177 rw_destroy(&->a_rwlock); 178 cv_destroy(&->a_purgecv); 179 mutex_destroy(&->a_pmtx); 180 mutex_destroy(&->a_purgemtx); 181 } 182 183 kmutex_t anonhash_lock[AH_LOCK_SIZE]; 184 kmutex_t anonpages_hash_lock[AH_LOCK_SIZE]; 185 186 void 187 anon_init(void) 188 { 189 int i; 190 191 anon_hash_size = 1L << highbit(physmem / ANON_HASHAVELEN); 192 193 for (i = 0; i < AH_LOCK_SIZE; i++) { 194 mutex_init(&anonhash_lock[i], NULL, MUTEX_DEFAULT, NULL); 195 mutex_init(&anonpages_hash_lock[i], NULL, MUTEX_DEFAULT, NULL); 196 } 197 198 for (i = 0; i < ANON_LOCKSIZE; i++) { 199 mutex_init(&anon_array_lock[i].pad_mutex, NULL, 200 MUTEX_DEFAULT, NULL); 201 cv_init(&anon_array_cv[i], NULL, CV_DEFAULT, NULL); 202 } 203 204 anon_hash = (struct anon **) 205 kmem_zalloc(sizeof (struct anon *) * anon_hash_size, KM_SLEEP); 206 anon_cache = kmem_cache_create("anon_cache", sizeof (struct anon), 207 AN_CACHE_ALIGN, NULL, NULL, NULL, NULL, NULL, 0); 208 anonmap_cache = kmem_cache_create("anonmap_cache", 209 sizeof (struct anon_map), 0, 210 anonmap_cache_constructor, anonmap_cache_destructor, NULL, 211 NULL, NULL, 0); 212 swap_maxcontig = (1024 * 1024) >> PAGESHIFT; /* 1MB of pages */ 213 214 anon_vp = vn_alloc(KM_SLEEP); 215 vn_setops(anon_vp, swap_vnodeops); 216 anon_vp->v_type = VREG; 217 anon_vp->v_flag |= (VISSWAP|VISSWAPFS); 218 } 219 220 /* 221 * Global anon slot 
hash table manipulation. 222 */ 223 224 static void 225 anon_addhash(struct anon *ap) 226 { 227 int index; 228 229 ASSERT(MUTEX_HELD(&anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)])); 230 index = ANON_HASH(ap->an_vp, ap->an_off); 231 ap->an_hash = anon_hash[index]; 232 anon_hash[index] = ap; 233 } 234 235 static void 236 anon_rmhash(struct anon *ap) 237 { 238 struct anon **app; 239 240 ASSERT(MUTEX_HELD(&anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)])); 241 242 for (app = &anon_hash[ANON_HASH(ap->an_vp, ap->an_off)]; 243 *app; app = &((*app)->an_hash)) { 244 if (*app == ap) { 245 *app = ap->an_hash; 246 break; 247 } 248 } 249 } 250 251 /* 252 * The anon array interfaces. Functions allocating, 253 * freeing array of pointers, and returning/setting 254 * entries in the array of pointers for a given offset. 255 * 256 * Create the list of pointers 257 */ 258 struct anon_hdr * 259 anon_create(pgcnt_t npages, int flags) 260 { 261 struct anon_hdr *ahp; 262 ulong_t nchunks; 263 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 264 265 if ((ahp = kmem_zalloc(sizeof (struct anon_hdr), kmemflags)) == NULL) { 266 return (NULL); 267 } 268 269 mutex_init(&ahp->serial_lock, NULL, MUTEX_DEFAULT, NULL); 270 /* 271 * Single level case. 272 */ 273 ahp->size = npages; 274 if (npages <= ANON_CHUNK_SIZE || (flags & ANON_ALLOC_FORCE)) { 275 276 if (flags & ANON_ALLOC_FORCE) 277 ahp->flags |= ANON_ALLOC_FORCE; 278 279 ahp->array_chunk = kmem_zalloc( 280 ahp->size * sizeof (struct anon *), kmemflags); 281 282 if (ahp->array_chunk == NULL) { 283 kmem_free(ahp, sizeof (struct anon_hdr)); 284 return (NULL); 285 } 286 } else { 287 /* 288 * 2 Level case. 289 * anon hdr size needs to be rounded off to be a multiple 290 * of ANON_CHUNK_SIZE. This is important as various anon 291 * related functions depend on this. 292 * NOTE - 293 * anon_grow() makes anon hdr size a multiple of 294 * ANON_CHUNK_SIZE. 295 * amp size is <= anon hdr size. 296 * anon_index + seg_pgs <= anon hdr size. 
297 */ 298 ahp->size = P2ROUNDUP(npages, ANON_CHUNK_SIZE); 299 nchunks = ahp->size >> ANON_CHUNK_SHIFT; 300 301 ahp->array_chunk = kmem_zalloc(nchunks * sizeof (ulong_t *), 302 kmemflags); 303 304 if (ahp->array_chunk == NULL) { 305 kmem_free(ahp, sizeof (struct anon_hdr)); 306 return (NULL); 307 } 308 } 309 return (ahp); 310 } 311 312 /* 313 * Free the array of pointers 314 */ 315 void 316 anon_release(struct anon_hdr *ahp, pgcnt_t npages) 317 { 318 ulong_t i; 319 void **ppp; 320 ulong_t nchunks; 321 322 ASSERT(npages <= ahp->size); 323 324 /* 325 * Single level case. 326 */ 327 if (npages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 328 kmem_free(ahp->array_chunk, ahp->size * sizeof (struct anon *)); 329 } else { 330 /* 331 * 2 level case. 332 */ 333 nchunks = ahp->size >> ANON_CHUNK_SHIFT; 334 for (i = 0; i < nchunks; i++) { 335 ppp = &ahp->array_chunk[i]; 336 if (*ppp != NULL) 337 kmem_free(*ppp, PAGESIZE); 338 } 339 kmem_free(ahp->array_chunk, nchunks * sizeof (ulong_t *)); 340 } 341 mutex_destroy(&ahp->serial_lock); 342 kmem_free(ahp, sizeof (struct anon_hdr)); 343 } 344 345 /* 346 * Return the pointer from the list for a 347 * specified anon index. 348 */ 349 struct anon * 350 anon_get_ptr(struct anon_hdr *ahp, ulong_t an_idx) 351 { 352 struct anon **app; 353 354 ASSERT(an_idx < ahp->size); 355 356 /* 357 * Single level case. 358 */ 359 if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 360 return ((struct anon *) 361 ((uintptr_t)ahp->array_chunk[an_idx] & ANON_PTRMASK)); 362 } else { 363 364 /* 365 * 2 level case. 366 */ 367 app = ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 368 if (app) { 369 return ((struct anon *) 370 ((uintptr_t)app[an_idx & ANON_CHUNK_OFF] & 371 ANON_PTRMASK)); 372 } else { 373 return (NULL); 374 } 375 } 376 } 377 378 /* 379 * Return the anon pointer for the first valid entry in the anon list, 380 * starting from the given index. 
381 */ 382 struct anon * 383 anon_get_next_ptr(struct anon_hdr *ahp, ulong_t *index) 384 { 385 struct anon *ap; 386 struct anon **app; 387 ulong_t chunkoff; 388 ulong_t i; 389 ulong_t j; 390 pgcnt_t size; 391 392 i = *index; 393 size = ahp->size; 394 395 ASSERT(i < size); 396 397 if ((size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 398 /* 399 * 1 level case 400 */ 401 while (i < size) { 402 ap = (struct anon *) 403 ((uintptr_t)ahp->array_chunk[i] & ANON_PTRMASK); 404 if (ap) { 405 *index = i; 406 return (ap); 407 } 408 i++; 409 } 410 } else { 411 /* 412 * 2 level case 413 */ 414 chunkoff = i & ANON_CHUNK_OFF; 415 while (i < size) { 416 app = ahp->array_chunk[i >> ANON_CHUNK_SHIFT]; 417 if (app) 418 for (j = chunkoff; j < ANON_CHUNK_SIZE; j++) { 419 ap = (struct anon *) 420 ((uintptr_t)app[j] & ANON_PTRMASK); 421 if (ap) { 422 *index = i + (j - chunkoff); 423 return (ap); 424 } 425 } 426 chunkoff = 0; 427 i = (i + ANON_CHUNK_SIZE) & ~ANON_CHUNK_OFF; 428 } 429 } 430 *index = size; 431 return (NULL); 432 } 433 434 /* 435 * Set list entry with a given pointer for a specified offset 436 */ 437 int 438 anon_set_ptr(struct anon_hdr *ahp, ulong_t an_idx, struct anon *ap, int flags) 439 { 440 void **ppp; 441 struct anon **app; 442 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 443 uintptr_t *ap_addr; 444 445 ASSERT(an_idx < ahp->size); 446 447 /* 448 * Single level case. 449 */ 450 if (ahp->size <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 451 ap_addr = (uintptr_t *)&ahp->array_chunk[an_idx]; 452 } else { 453 454 /* 455 * 2 level case. 
456 */ 457 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 458 459 ASSERT(ppp != NULL); 460 if (*ppp == NULL) { 461 mutex_enter(&ahp->serial_lock); 462 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 463 if (*ppp == NULL) { 464 *ppp = kmem_zalloc(PAGESIZE, kmemflags); 465 if (*ppp == NULL) { 466 mutex_exit(&ahp->serial_lock); 467 return (ENOMEM); 468 } 469 } 470 mutex_exit(&ahp->serial_lock); 471 } 472 app = *ppp; 473 ap_addr = (uintptr_t *)&app[an_idx & ANON_CHUNK_OFF]; 474 } 475 *ap_addr = (*ap_addr & ~ANON_PTRMASK) | (uintptr_t)ap; 476 return (0); 477 } 478 479 /* 480 * Copy anon array into a given new anon array 481 */ 482 int 483 anon_copy_ptr(struct anon_hdr *sahp, ulong_t s_idx, 484 struct anon_hdr *dahp, ulong_t d_idx, 485 pgcnt_t npages, int flags) 486 { 487 void **sapp, **dapp; 488 void *ap; 489 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 490 491 ASSERT((s_idx < sahp->size) && (d_idx < dahp->size)); 492 ASSERT((npages <= sahp->size) && (npages <= dahp->size)); 493 494 /* 495 * Both arrays are 1 level. 496 */ 497 if (((sahp->size <= ANON_CHUNK_SIZE) && 498 (dahp->size <= ANON_CHUNK_SIZE)) || 499 ((sahp->flags & ANON_ALLOC_FORCE) && 500 (dahp->flags & ANON_ALLOC_FORCE))) { 501 502 bcopy(&sahp->array_chunk[s_idx], &dahp->array_chunk[d_idx], 503 npages * sizeof (struct anon *)); 504 return (0); 505 } 506 507 /* 508 * Both arrays are 2 levels. 
509 */ 510 if (sahp->size > ANON_CHUNK_SIZE && 511 dahp->size > ANON_CHUNK_SIZE && 512 ((sahp->flags & ANON_ALLOC_FORCE) == 0) && 513 ((dahp->flags & ANON_ALLOC_FORCE) == 0)) { 514 515 ulong_t sapidx, dapidx; 516 ulong_t *sap, *dap; 517 ulong_t chknp; 518 519 while (npages != 0) { 520 521 sapidx = s_idx & ANON_CHUNK_OFF; 522 dapidx = d_idx & ANON_CHUNK_OFF; 523 chknp = ANON_CHUNK_SIZE - MAX(sapidx, dapidx); 524 if (chknp > npages) 525 chknp = npages; 526 527 sapp = &sahp->array_chunk[s_idx >> ANON_CHUNK_SHIFT]; 528 if ((sap = *sapp) != NULL) { 529 dapp = &dahp->array_chunk[d_idx 530 >> ANON_CHUNK_SHIFT]; 531 if ((dap = *dapp) == NULL) { 532 *dapp = kmem_zalloc(PAGESIZE, 533 kmemflags); 534 if ((dap = *dapp) == NULL) 535 return (ENOMEM); 536 } 537 bcopy((sap + sapidx), (dap + dapidx), 538 chknp << ANON_PTRSHIFT); 539 } 540 s_idx += chknp; 541 d_idx += chknp; 542 npages -= chknp; 543 } 544 return (0); 545 } 546 547 /* 548 * At least one of the arrays is 2 level. 549 */ 550 while (npages--) { 551 if ((ap = anon_get_ptr(sahp, s_idx)) != NULL) { 552 ASSERT(!ANON_ISBUSY(anon_get_slot(sahp, s_idx))); 553 if (anon_set_ptr(dahp, d_idx, ap, flags) == ENOMEM) 554 return (ENOMEM); 555 } 556 s_idx++; 557 d_idx++; 558 } 559 return (0); 560 } 561 562 563 /* 564 * ANON_INITBUF is a convenience macro for anon_grow() below. It 565 * takes a buffer dst, which is at least as large as buffer src. It 566 * does a bcopy from src into dst, and then bzeros the extra bytes 567 * of dst. If tail is set, the data in src is tail aligned within 568 * dst instead of head aligned. 
569 */ 570 571 #define ANON_INITBUF(src, srclen, dst, dstsize, tail) \ 572 if (tail) { \ 573 bzero((dst), (dstsize) - (srclen)); \ 574 bcopy((src), (char *)(dst) + (dstsize) - (srclen), (srclen)); \ 575 } else { \ 576 bcopy((src), (dst), (srclen)); \ 577 bzero((char *)(dst) + (srclen), (dstsize) - (srclen)); \ 578 } 579 580 #define ANON_1_LEVEL_INC (ANON_CHUNK_SIZE / 8) 581 #define ANON_2_LEVEL_INC (ANON_1_LEVEL_INC * ANON_CHUNK_SIZE) 582 583 /* 584 * anon_grow() is used to efficiently extend an existing anon array. 585 * startidx_p points to the index into the anon array of the first page 586 * that is in use. oldseg_pgs is the number of pages in use, starting at 587 * *startidx_p. newpages is the number of additional pages desired. 588 * 589 * If startidx_p == NULL, startidx is taken to be 0 and cannot be changed. 590 * 591 * The growth is done by creating a new top level of the anon array, 592 * and (if the array is 2-level) reusing the existing second level arrays. 593 * 594 * flags can be used to specify ANON_NOSLEEP and ANON_GROWDOWN. 595 * 596 * Returns the new number of pages in the anon array. 597 */ 598 pgcnt_t 599 anon_grow(struct anon_hdr *ahp, ulong_t *startidx_p, pgcnt_t oldseg_pgs, 600 pgcnt_t newseg_pgs, int flags) 601 { 602 ulong_t startidx = startidx_p ? *startidx_p : 0; 603 pgcnt_t oldamp_pgs = ahp->size, newamp_pgs; 604 pgcnt_t oelems, nelems, totpages; 605 void **level1; 606 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 607 int growdown = (flags & ANON_GROWDOWN); 608 size_t newarrsz, oldarrsz; 609 void *level2; 610 611 ASSERT(!(startidx_p == NULL && growdown)); 612 ASSERT(startidx + oldseg_pgs <= ahp->size); 613 614 /* 615 * Determine the total number of pages needed in the new 616 * anon array. If growing down, totpages is all pages from 617 * startidx through the end of the array, plus <newseg_pgs> 618 * pages. If growing up, keep all pages from page 0 through 619 * the last page currently in use, plus <newseg_pgs> pages. 
620 */ 621 if (growdown) 622 totpages = oldamp_pgs - startidx + newseg_pgs; 623 else 624 totpages = startidx + oldseg_pgs + newseg_pgs; 625 626 /* If the array is already large enough, just return. */ 627 628 if (oldamp_pgs >= totpages) { 629 if (growdown) 630 *startidx_p = oldamp_pgs - totpages; 631 return (oldamp_pgs); 632 } 633 634 /* 635 * oldamp_pgs/newamp_pgs are the total numbers of pages represented 636 * by the corresponding arrays. 637 * oelems/nelems are the number of pointers in the top level arrays 638 * which may be either level 1 or level 2. 639 * Will the new anon array be one level or two levels? 640 */ 641 if (totpages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 642 newamp_pgs = P2ROUNDUP(totpages, ANON_1_LEVEL_INC); 643 oelems = oldamp_pgs; 644 nelems = newamp_pgs; 645 } else { 646 newamp_pgs = P2ROUNDUP(totpages, ANON_2_LEVEL_INC); 647 oelems = (oldamp_pgs + ANON_CHUNK_OFF) >> ANON_CHUNK_SHIFT; 648 nelems = newamp_pgs >> ANON_CHUNK_SHIFT; 649 } 650 651 newarrsz = nelems * sizeof (void *); 652 level1 = kmem_alloc(newarrsz, kmemflags); 653 if (level1 == NULL) 654 return (0); 655 656 /* Are we converting from a one level to a two level anon array? */ 657 658 if (newamp_pgs > ANON_CHUNK_SIZE && oldamp_pgs <= ANON_CHUNK_SIZE && 659 !(ahp->flags & ANON_ALLOC_FORCE)) { 660 661 /* 662 * Yes, we're converting to a two level. Reuse old level 1 663 * as new level 2 if it is exactly PAGESIZE. Otherwise 664 * alloc a new level 2 and copy the old level 1 data into it. 
665 */ 666 if (oldamp_pgs == ANON_CHUNK_SIZE) { 667 level2 = (void *)ahp->array_chunk; 668 } else { 669 level2 = kmem_alloc(PAGESIZE, kmemflags); 670 if (level2 == NULL) { 671 kmem_free(level1, newarrsz); 672 return (0); 673 } 674 oldarrsz = oldamp_pgs * sizeof (void *); 675 676 ANON_INITBUF(ahp->array_chunk, oldarrsz, 677 level2, PAGESIZE, growdown); 678 kmem_free(ahp->array_chunk, oldarrsz); 679 } 680 bzero(level1, newarrsz); 681 if (growdown) 682 level1[nelems - 1] = level2; 683 else 684 level1[0] = level2; 685 } else { 686 oldarrsz = oelems * sizeof (void *); 687 688 ANON_INITBUF(ahp->array_chunk, oldarrsz, 689 level1, newarrsz, growdown); 690 kmem_free(ahp->array_chunk, oldarrsz); 691 } 692 693 ahp->array_chunk = level1; 694 ahp->size = newamp_pgs; 695 if (growdown) 696 *startidx_p = newamp_pgs - totpages; 697 698 return (newamp_pgs); 699 } 700 701 702 /* 703 * Called from clock handler to sync ani_free value. 704 */ 705 706 void 707 set_anoninfo(void) 708 { 709 int ix; 710 pgcnt_t total = 0; 711 712 for (ix = 0; ix < ANI_MAX_POOL; ix++) { 713 total += ani_free_pool[ix].ani_count; 714 } 715 k_anoninfo.ani_free = total; 716 } 717 718 /* 719 * Reserve anon space. 720 * 721 * It's no longer simply a matter of incrementing ani_resv to 722 * reserve swap space, we need to check memory-based as well 723 * as disk-backed (physical) swap. The following algorithm 724 * is used: 725 * Check the space on physical swap 726 * i.e. amount needed < ani_max - ani_phys_resv 727 * If we are swapping on swapfs check 728 * amount needed < (availrmem - swapfs_minfree) 729 * Since the algorithm to check for the quantity of swap space is 730 * almost the same as that for reserving it, we'll just use anon_resvmem 731 * with a flag to decrement availrmem. 732 * 733 * Return non-zero on success. 
734 */ 735 int 736 anon_resvmem(size_t size, boolean_t takemem, zone_t *zone, int tryhard) 737 { 738 pgcnt_t npages = btopr(size); 739 pgcnt_t mswap_pages = 0; 740 pgcnt_t pswap_pages = 0; 741 proc_t *p = curproc; 742 743 if (zone != NULL && takemem) { 744 /* test zone.max-swap resource control */ 745 mutex_enter(&p->p_lock); 746 if (rctl_incr_swap(p, zone, ptob(npages)) != 0) { 747 mutex_exit(&p->p_lock); 748 return (0); 749 } 750 mutex_exit(&p->p_lock); 751 } 752 mutex_enter(&anoninfo_lock); 753 754 /* 755 * pswap_pages is the number of pages we can take from 756 * physical (i.e. disk-backed) swap. 757 */ 758 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 759 pswap_pages = k_anoninfo.ani_max - k_anoninfo.ani_phys_resv; 760 761 ANON_PRINT(A_RESV, 762 ("anon_resvmem: npages %lu takemem %u pswap %lu caller %p\n", 763 npages, takemem, pswap_pages, (void *)caller())); 764 765 if (npages <= pswap_pages) { 766 /* 767 * we have enough space on a physical swap 768 */ 769 if (takemem) 770 k_anoninfo.ani_phys_resv += npages; 771 mutex_exit(&anoninfo_lock); 772 return (1); 773 } else if (pswap_pages != 0) { 774 /* 775 * we have some space on a physical swap 776 */ 777 if (takemem) { 778 /* 779 * use up remainder of phys swap 780 */ 781 k_anoninfo.ani_phys_resv += pswap_pages; 782 ASSERT(k_anoninfo.ani_phys_resv == k_anoninfo.ani_max); 783 } 784 } 785 /* 786 * since (npages > pswap_pages) we need mem swap 787 * mswap_pages is the number of pages needed from availrmem 788 */ 789 ASSERT(npages > pswap_pages); 790 mswap_pages = npages - pswap_pages; 791 792 ANON_PRINT(A_RESV, ("anon_resvmem: need %ld pages from memory\n", 793 mswap_pages)); 794 795 /* 796 * priv processes can reserve memory as swap as long as availrmem 797 * remains greater than swapfs_minfree; in the case of non-priv 798 * processes, memory can be reserved as swap only if availrmem 799 * doesn't fall below (swapfs_minfree + swapfs_reserve). 
Thus, 800 * swapfs_reserve amount of memswap is not available to non-priv 801 * processes. This protects daemons such as automounter dying 802 * as a result of application processes eating away almost entire 803 * membased swap. This safeguard becomes useless if apps are run 804 * with root access. 805 * 806 * swapfs_reserve is minimum of 4Mb or 1/16 of physmem. 807 * 808 */ 809 if (tryhard) { 810 pgcnt_t floor_pages; 811 812 if (secpolicy_resource_anon_mem(CRED())) { 813 floor_pages = swapfs_minfree; 814 } else { 815 floor_pages = swapfs_minfree + swapfs_reserve; 816 } 817 818 mutex_exit(&anoninfo_lock); 819 (void) page_reclaim_mem(mswap_pages, floor_pages, 0); 820 mutex_enter(&anoninfo_lock); 821 } 822 823 mutex_enter(&freemem_lock); 824 if (availrmem > (swapfs_minfree + swapfs_reserve + mswap_pages) || 825 (availrmem > (swapfs_minfree + mswap_pages) && 826 secpolicy_resource(CRED()) == 0)) { 827 828 if (takemem) { 829 /* 830 * Take the memory from the rest of the system. 831 */ 832 availrmem -= mswap_pages; 833 mutex_exit(&freemem_lock); 834 k_anoninfo.ani_mem_resv += mswap_pages; 835 ANI_ADD(mswap_pages); 836 ANON_PRINT((A_RESV | A_MRESV), 837 ("anon_resvmem: took %ld pages of availrmem\n", 838 mswap_pages)); 839 } else { 840 mutex_exit(&freemem_lock); 841 } 842 843 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 844 mutex_exit(&anoninfo_lock); 845 return (1); 846 } else { 847 /* 848 * Fail if not enough memory 849 */ 850 if (takemem) { 851 k_anoninfo.ani_phys_resv -= pswap_pages; 852 } 853 854 mutex_exit(&freemem_lock); 855 mutex_exit(&anoninfo_lock); 856 ANON_PRINT(A_RESV, 857 ("anon_resvmem: not enough space from swapfs\n")); 858 if (zone != NULL && takemem) 859 rctl_decr_swap(zone, ptob(npages)); 860 return (0); 861 } 862 } 863 864 /* 865 * Give back an anon reservation. 
866 */ 867 void 868 anon_unresvmem(size_t size, zone_t *zone) 869 { 870 pgcnt_t npages = btopr(size); 871 spgcnt_t mem_free_pages = 0; 872 pgcnt_t phys_free_slots; 873 #ifdef ANON_DEBUG 874 pgcnt_t mem_resv; 875 #endif 876 if (zone != NULL) 877 rctl_decr_swap(zone, ptob(npages)); 878 879 mutex_enter(&anoninfo_lock); 880 881 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 882 883 /* 884 * If some of this reservation belonged to swapfs 885 * give it back to availrmem. 886 * ani_mem_resv is the amount of availrmem swapfs has reserved. 887 * but some of that memory could be locked by segspt so we can only 888 * return non locked ani_mem_resv back to availrmem 889 */ 890 if (k_anoninfo.ani_mem_resv > k_anoninfo.ani_locked_swap) { 891 ANON_PRINT((A_RESV | A_MRESV), 892 ("anon_unresv: growing availrmem by %ld pages\n", 893 MIN(k_anoninfo.ani_mem_resv, npages))); 894 895 mem_free_pages = MIN((spgcnt_t)(k_anoninfo.ani_mem_resv - 896 k_anoninfo.ani_locked_swap), npages); 897 mutex_enter(&freemem_lock); 898 availrmem += mem_free_pages; 899 mutex_exit(&freemem_lock); 900 k_anoninfo.ani_mem_resv -= mem_free_pages; 901 902 ANI_ADD(-mem_free_pages); 903 } 904 /* 905 * The remainder of the pages is returned to phys swap 906 */ 907 ASSERT(npages >= mem_free_pages); 908 phys_free_slots = npages - mem_free_pages; 909 910 if (phys_free_slots) { 911 k_anoninfo.ani_phys_resv -= phys_free_slots; 912 } 913 914 #ifdef ANON_DEBUG 915 mem_resv = k_anoninfo.ani_mem_resv; 916 #endif 917 918 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 919 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 920 921 mutex_exit(&anoninfo_lock); 922 923 ANON_PRINT(A_RESV, ("anon_unresv: %lu, tot %lu, caller %p\n", 924 npages, mem_resv, (void *)caller())); 925 } 926 927 /* 928 * Allocate an anon slot and return it with the lock held. 
929 */ 930 struct anon * 931 anon_alloc(struct vnode *vp, anoff_t off) 932 { 933 struct anon *ap; 934 kmutex_t *ahm; 935 936 ap = kmem_cache_alloc(anon_cache, KM_SLEEP); 937 if (vp == NULL) { 938 swap_alloc(ap); 939 } else { 940 ap->an_vp = vp; 941 ap->an_off = off; 942 } 943 ap->an_refcnt = 1; 944 ap->an_pvp = NULL; 945 ap->an_poff = 0; 946 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 947 mutex_enter(ahm); 948 anon_addhash(ap); 949 mutex_exit(ahm); 950 ANI_ADD(-1); 951 ANON_PRINT(A_ANON, ("anon_alloc: returning ap %p, vp %p\n", 952 (void *)ap, (ap ? (void *)ap->an_vp : NULL))); 953 return (ap); 954 } 955 956 /* 957 * Called for pages locked in memory via softlock/pagelock/mlock to make sure 958 * such pages don't consume any physical swap resources needed for swapping 959 * unlocked pages. 960 */ 961 void 962 anon_swap_free(struct anon *ap, page_t *pp) 963 { 964 kmutex_t *ahm; 965 966 ASSERT(ap != NULL); 967 ASSERT(pp != NULL); 968 ASSERT(PAGE_LOCKED(pp)); 969 ASSERT(pp->p_vnode != NULL); 970 ASSERT(IS_SWAPFSVP(pp->p_vnode)); 971 ASSERT(ap->an_refcnt != 0); 972 ASSERT(pp->p_vnode == ap->an_vp); 973 ASSERT(pp->p_offset == ap->an_off); 974 975 if (ap->an_pvp == NULL) 976 return; 977 978 page_io_lock(pp); 979 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 980 mutex_enter(ahm); 981 982 ASSERT(ap->an_refcnt != 0); 983 ASSERT(pp->p_vnode == ap->an_vp); 984 ASSERT(pp->p_offset == ap->an_off); 985 986 if (ap->an_pvp != NULL) { 987 swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE); 988 ap->an_pvp = NULL; 989 ap->an_poff = 0; 990 mutex_exit(ahm); 991 hat_setmod(pp); 992 } else { 993 mutex_exit(ahm); 994 } 995 page_io_unlock(pp); 996 } 997 998 /* 999 * Decrement the reference count of an anon page. 1000 * If reference count goes to zero, free it and 1001 * its associated page (if any). 
1002 */ 1003 void 1004 anon_decref(struct anon *ap) 1005 { 1006 page_t *pp; 1007 struct vnode *vp; 1008 anoff_t off; 1009 kmutex_t *ahm; 1010 1011 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1012 mutex_enter(ahm); 1013 ASSERT(ap->an_refcnt != 0); 1014 if (ap->an_refcnt == 0) 1015 panic("anon_decref: slot count 0"); 1016 if (--ap->an_refcnt == 0) { 1017 swap_xlate(ap, &vp, &off); 1018 anon_rmhash(ap); 1019 if (ap->an_pvp != NULL) 1020 swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE); 1021 mutex_exit(ahm); 1022 1023 /* 1024 * If there is a page for this anon slot we will need to 1025 * call VN_DISPOSE to get rid of the vp association and 1026 * put the page back on the free list as really free. 1027 * Acquire the "exclusive" lock to ensure that any 1028 * pending i/o always completes before the swap slot 1029 * is freed. 1030 */ 1031 pp = page_lookup(vp, (u_offset_t)off, SE_EXCL); 1032 if (pp != NULL) { 1033 /*LINTED: constant in conditional context */ 1034 VN_DISPOSE(pp, B_INVAL, 0, kcred); 1035 } 1036 ANON_PRINT(A_ANON, ("anon_decref: free ap %p, vp %p\n", 1037 (void *)ap, (void *)ap->an_vp)); 1038 1039 kmem_cache_free(anon_cache, ap); 1040 1041 ANI_ADD(1); 1042 } else { 1043 mutex_exit(ahm); 1044 } 1045 } 1046 1047 1048 /* 1049 * check an_refcnt of the root anon slot (anon_index argument is aligned at 1050 * seg->s_szc level) to determine whether COW processing is required. 1051 * anonpages_hash_lock[] held on the root ap ensures that if root's 1052 * refcnt is 1 all other refcnt's are 1 as well (and they can't increase 1053 * later since this process can't fork while its AS lock is held). 1054 * 1055 * returns 1 if the root anon slot has a refcnt > 1 otherwise returns 0. 
1056 */ 1057 int 1058 anon_szcshare(struct anon_hdr *ahp, ulong_t anon_index) 1059 { 1060 struct anon *ap; 1061 kmutex_t *ahmpages = NULL; 1062 1063 ap = anon_get_ptr(ahp, anon_index); 1064 if (ap == NULL) 1065 return (0); 1066 1067 ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1068 mutex_enter(ahmpages); 1069 ASSERT(ap->an_refcnt >= 1); 1070 if (ap->an_refcnt == 1) { 1071 mutex_exit(ahmpages); 1072 return (0); 1073 } 1074 mutex_exit(ahmpages); 1075 return (1); 1076 } 1077 /* 1078 * Check 'nslots' anon slots for refcnt > 1. 1079 * 1080 * returns 1 if any of the 'nslots' anon slots has a refcnt > 1 otherwise 1081 * returns 0. 1082 */ 1083 static int 1084 anon_share(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots) 1085 { 1086 struct anon *ap; 1087 1088 while (nslots-- > 0) { 1089 if ((ap = anon_get_ptr(ahp, anon_index)) != NULL && 1090 ap->an_refcnt > 1) 1091 return (1); 1092 anon_index++; 1093 } 1094 1095 return (0); 1096 } 1097 1098 static void 1099 anon_decref_pages( 1100 struct anon_hdr *ahp, 1101 ulong_t an_idx, 1102 uint_t szc) 1103 { 1104 struct anon *ap = anon_get_ptr(ahp, an_idx); 1105 kmutex_t *ahmpages = NULL; 1106 page_t *pp; 1107 pgcnt_t pgcnt = page_get_pagecnt(szc); 1108 pgcnt_t i; 1109 struct vnode *vp; 1110 anoff_t off; 1111 kmutex_t *ahm; 1112 #ifdef DEBUG 1113 int refcnt = 1; 1114 #endif 1115 1116 ASSERT(szc != 0); 1117 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1118 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1119 ASSERT(an_idx < ahp->size); 1120 1121 if (ahp->size - an_idx < pgcnt) { 1122 /* 1123 * In case of shared mappings total anon map size may not be 1124 * the largest page size aligned. 
1125 */ 1126 pgcnt = ahp->size - an_idx; 1127 } 1128 1129 VM_STAT_ADD(anonvmstats.decrefpages[0]); 1130 1131 if (ap != NULL) { 1132 ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1133 mutex_enter(ahmpages); 1134 ASSERT((refcnt = ap->an_refcnt) != 0); 1135 VM_STAT_ADD(anonvmstats.decrefpages[1]); 1136 if (ap->an_refcnt == 1) { 1137 VM_STAT_ADD(anonvmstats.decrefpages[2]); 1138 ASSERT(!anon_share(ahp, an_idx, pgcnt)); 1139 mutex_exit(ahmpages); 1140 ahmpages = NULL; 1141 } 1142 } 1143 1144 i = 0; 1145 while (i < pgcnt) { 1146 if ((ap = anon_get_ptr(ahp, an_idx + i)) == NULL) { 1147 ASSERT(refcnt == 1 && ahmpages == NULL); 1148 i++; 1149 continue; 1150 } 1151 ASSERT(ap->an_refcnt == refcnt); 1152 ASSERT(ahmpages != NULL || ap->an_refcnt == 1); 1153 ASSERT(ahmpages == NULL || ap->an_refcnt > 1); 1154 1155 if (ahmpages == NULL) { 1156 swap_xlate(ap, &vp, &off); 1157 pp = page_lookup(vp, (u_offset_t)off, SE_EXCL); 1158 if (pp == NULL || pp->p_szc == 0) { 1159 VM_STAT_ADD(anonvmstats.decrefpages[3]); 1160 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, 1161 ap->an_off)]; 1162 (void) anon_set_ptr(ahp, an_idx + i, NULL, 1163 ANON_SLEEP); 1164 mutex_enter(ahm); 1165 ap->an_refcnt--; 1166 ASSERT(ap->an_refcnt == 0); 1167 anon_rmhash(ap); 1168 if (ap->an_pvp) 1169 swap_phys_free(ap->an_pvp, ap->an_poff, 1170 PAGESIZE); 1171 mutex_exit(ahm); 1172 if (pp == NULL) { 1173 pp = page_lookup(vp, (u_offset_t)off, 1174 SE_EXCL); 1175 ASSERT(pp == NULL || pp->p_szc == 0); 1176 } 1177 if (pp != NULL) { 1178 VM_STAT_ADD(anonvmstats.decrefpages[4]); 1179 /*LINTED*/ 1180 VN_DISPOSE(pp, B_INVAL, 0, kcred); 1181 } 1182 kmem_cache_free(anon_cache, ap); 1183 ANI_ADD(1); 1184 i++; 1185 } else { 1186 pgcnt_t j; 1187 pgcnt_t curpgcnt = 1188 page_get_pagecnt(pp->p_szc); 1189 size_t ppasize = curpgcnt * sizeof (page_t *); 1190 page_t **ppa = kmem_alloc(ppasize, KM_SLEEP); 1191 int dispose = 0; 1192 1193 VM_STAT_ADD(anonvmstats.decrefpages[5]); 1194 1195 ASSERT(pp->p_szc <= szc); 1196 
ASSERT(IS_P2ALIGNED(curpgcnt, curpgcnt)); 1197 ASSERT(IS_P2ALIGNED(i, curpgcnt)); 1198 ASSERT(i + curpgcnt <= pgcnt); 1199 ASSERT(!(page_pptonum(pp) & (curpgcnt - 1))); 1200 ppa[0] = pp; 1201 for (j = i + 1; j < i + curpgcnt; j++) { 1202 ap = anon_get_ptr(ahp, an_idx + j); 1203 ASSERT(ap != NULL && 1204 ap->an_refcnt == 1); 1205 swap_xlate(ap, &vp, &off); 1206 pp = page_lookup(vp, (u_offset_t)off, 1207 SE_EXCL); 1208 if (pp == NULL) 1209 panic("anon_decref_pages: " 1210 "no page"); 1211 1212 (void) hat_pageunload(pp, 1213 HAT_FORCE_PGUNLOAD); 1214 ASSERT(pp->p_szc == ppa[0]->p_szc); 1215 ASSERT(page_pptonum(pp) - 1 == 1216 page_pptonum(ppa[j - i - 1])); 1217 ppa[j - i] = pp; 1218 if (ap->an_pvp != NULL && 1219 !vn_matchopval(ap->an_pvp, 1220 VOPNAME_DISPOSE, 1221 (fs_generic_func_p)fs_dispose)) 1222 dispose = 1; 1223 } 1224 for (j = i; j < i + curpgcnt; j++) { 1225 ap = anon_get_ptr(ahp, an_idx + j); 1226 ASSERT(ap != NULL && 1227 ap->an_refcnt == 1); 1228 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, 1229 ap->an_off)]; 1230 (void) anon_set_ptr(ahp, an_idx + j, 1231 NULL, ANON_SLEEP); 1232 mutex_enter(ahm); 1233 ap->an_refcnt--; 1234 ASSERT(ap->an_refcnt == 0); 1235 anon_rmhash(ap); 1236 if (ap->an_pvp) 1237 swap_phys_free(ap->an_pvp, 1238 ap->an_poff, PAGESIZE); 1239 mutex_exit(ahm); 1240 kmem_cache_free(anon_cache, ap); 1241 ANI_ADD(1); 1242 } 1243 if (!dispose) { 1244 VM_STAT_ADD(anonvmstats.decrefpages[6]); 1245 page_destroy_pages(ppa[0]); 1246 } else { 1247 VM_STAT_ADD(anonvmstats.decrefpages[7]); 1248 for (j = 0; j < curpgcnt; j++) { 1249 ASSERT(PAGE_EXCL(ppa[j])); 1250 ppa[j]->p_szc = 0; 1251 } 1252 for (j = 0; j < curpgcnt; j++) { 1253 ASSERT(!hat_page_is_mapped( 1254 ppa[j])); 1255 /*LINTED*/ 1256 VN_DISPOSE(ppa[j], B_INVAL, 0, 1257 kcred); 1258 } 1259 } 1260 kmem_free(ppa, ppasize); 1261 i += curpgcnt; 1262 } 1263 } else { 1264 VM_STAT_ADD(anonvmstats.decrefpages[8]); 1265 (void) anon_set_ptr(ahp, an_idx + i, NULL, ANON_SLEEP); 1266 ahm = 
&anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1267 mutex_enter(ahm); 1268 ap->an_refcnt--; 1269 mutex_exit(ahm); 1270 i++; 1271 } 1272 } 1273 1274 if (ahmpages != NULL) { 1275 mutex_exit(ahmpages); 1276 } 1277 } 1278 1279 /* 1280 * Duplicate references to size bytes worth of anon pages. 1281 * Used when duplicating a segment that contains private anon pages. 1282 * This code assumes that procedure calling this one has already used 1283 * hat_chgprot() to disable write access to the range of addresses that 1284 * that *old actually refers to. 1285 */ 1286 void 1287 anon_dup(struct anon_hdr *old, ulong_t old_idx, struct anon_hdr *new, 1288 ulong_t new_idx, size_t size) 1289 { 1290 spgcnt_t npages; 1291 kmutex_t *ahm; 1292 struct anon *ap; 1293 ulong_t off; 1294 ulong_t index; 1295 1296 npages = btopr(size); 1297 while (npages > 0) { 1298 index = old_idx; 1299 if ((ap = anon_get_next_ptr(old, &index)) == NULL) 1300 break; 1301 1302 ASSERT(!ANON_ISBUSY(anon_get_slot(old, index))); 1303 off = index - old_idx; 1304 npages -= off; 1305 if (npages <= 0) 1306 break; 1307 1308 (void) anon_set_ptr(new, new_idx + off, ap, ANON_SLEEP); 1309 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1310 1311 mutex_enter(ahm); 1312 ap->an_refcnt++; 1313 mutex_exit(ahm); 1314 1315 off++; 1316 new_idx += off; 1317 old_idx += off; 1318 npages--; 1319 } 1320 } 1321 1322 /* 1323 * Just like anon_dup but also guarantees there are no holes (unallocated anon 1324 * slots) within any large page region. That means if a large page region is 1325 * empty in the old array it will skip it. If there are 1 or more valid slots 1326 * in the large page region of the old array it will make sure to fill in any 1327 * unallocated ones and also copy them to the new array. If noalloc is 1 large 1328 * page region should either have no valid anon slots or all slots should be 1329 * valid. 
1330 */ 1331 void 1332 anon_dup_fill_holes( 1333 struct anon_hdr *old, 1334 ulong_t old_idx, 1335 struct anon_hdr *new, 1336 ulong_t new_idx, 1337 size_t size, 1338 uint_t szc, 1339 int noalloc) 1340 { 1341 struct anon *ap; 1342 spgcnt_t npages; 1343 kmutex_t *ahm, *ahmpages = NULL; 1344 pgcnt_t pgcnt, i; 1345 ulong_t index, off; 1346 #ifdef DEBUG 1347 int refcnt; 1348 #endif 1349 1350 ASSERT(szc != 0); 1351 pgcnt = page_get_pagecnt(szc); 1352 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1353 npages = btopr(size); 1354 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1355 ASSERT(IS_P2ALIGNED(old_idx, pgcnt)); 1356 1357 VM_STAT_ADD(anonvmstats.dupfillholes[0]); 1358 1359 while (npages > 0) { 1360 index = old_idx; 1361 1362 /* 1363 * Find the next valid slot. 1364 */ 1365 if (anon_get_next_ptr(old, &index) == NULL) 1366 break; 1367 1368 ASSERT(!ANON_ISBUSY(anon_get_slot(old, index))); 1369 /* 1370 * Now backup index to the beginning of the 1371 * current large page region of the old array. 1372 */ 1373 index = P2ALIGN(index, pgcnt); 1374 off = index - old_idx; 1375 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1376 npages -= off; 1377 if (npages <= 0) 1378 break; 1379 1380 /* 1381 * Fill and copy a large page regions worth 1382 * of anon slots. 1383 */ 1384 for (i = 0; i < pgcnt; i++) { 1385 if ((ap = anon_get_ptr(old, index + i)) == NULL) { 1386 if (noalloc) { 1387 panic("anon_dup_fill_holes: " 1388 "empty anon slot\n"); 1389 } 1390 VM_STAT_ADD(anonvmstats.dupfillholes[1]); 1391 ap = anon_alloc(NULL, 0); 1392 (void) anon_set_ptr(old, index + i, ap, 1393 ANON_SLEEP); 1394 } else if (i == 0) { 1395 /* 1396 * make the increment of all refcnts of all 1397 * anon slots of a large page appear atomic by 1398 * getting an anonpages_hash_lock for the 1399 * first anon slot of a large page. 
1400 */ 1401 int hash = AH_LOCK(ap->an_vp, ap->an_off); 1402 1403 VM_STAT_ADD(anonvmstats.dupfillholes[2]); 1404 1405 ahmpages = &anonpages_hash_lock[hash]; 1406 mutex_enter(ahmpages); 1407 /*LINTED*/ 1408 ASSERT(refcnt = ap->an_refcnt); 1409 1410 VM_STAT_COND_ADD(ap->an_refcnt > 1, 1411 anonvmstats.dupfillholes[3]); 1412 } 1413 (void) anon_set_ptr(new, new_idx + off + i, ap, 1414 ANON_SLEEP); 1415 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1416 mutex_enter(ahm); 1417 ASSERT(ahmpages != NULL || ap->an_refcnt == 1); 1418 ASSERT(i == 0 || ahmpages == NULL || 1419 refcnt == ap->an_refcnt); 1420 ap->an_refcnt++; 1421 mutex_exit(ahm); 1422 } 1423 if (ahmpages != NULL) { 1424 mutex_exit(ahmpages); 1425 ahmpages = NULL; 1426 } 1427 off += pgcnt; 1428 new_idx += off; 1429 old_idx += off; 1430 npages -= pgcnt; 1431 } 1432 } 1433 1434 /* 1435 * Used when a segment with a vnode changes szc. similarly to 1436 * anon_dup_fill_holes() makes sure each large page region either has no anon 1437 * slots or all of them. but new slots are created by COWing the file 1438 * pages. on entrance no anon slots should be shared. 1439 */ 1440 int 1441 anon_fill_cow_holes( 1442 struct seg *seg, 1443 caddr_t addr, 1444 struct anon_hdr *ahp, 1445 ulong_t an_idx, 1446 struct vnode *vp, 1447 u_offset_t vp_off, 1448 size_t size, 1449 uint_t szc, 1450 uint_t prot, 1451 struct vpage vpage[], 1452 struct cred *cred) 1453 { 1454 struct anon *ap; 1455 spgcnt_t npages; 1456 pgcnt_t pgcnt, i; 1457 ulong_t index, off; 1458 int err = 0; 1459 int pageflags = 0; 1460 1461 ASSERT(szc != 0); 1462 pgcnt = page_get_pagecnt(szc); 1463 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1464 npages = btopr(size); 1465 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1466 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1467 1468 while (npages > 0) { 1469 index = an_idx; 1470 1471 /* 1472 * Find the next valid slot. 
1473 */ 1474 if (anon_get_next_ptr(ahp, &index) == NULL) { 1475 break; 1476 } 1477 1478 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1479 /* 1480 * Now backup index to the beginning of the 1481 * current large page region of the anon array. 1482 */ 1483 index = P2ALIGN(index, pgcnt); 1484 off = index - an_idx; 1485 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1486 npages -= off; 1487 if (npages <= 0) 1488 break; 1489 an_idx += off; 1490 vp_off += ptob(off); 1491 addr += ptob(off); 1492 if (vpage != NULL) { 1493 vpage += off; 1494 } 1495 1496 for (i = 0; i < pgcnt; i++, an_idx++, vp_off += PAGESIZE) { 1497 if ((ap = anon_get_ptr(ahp, an_idx)) == NULL) { 1498 page_t *pl[1 + 1]; 1499 page_t *pp; 1500 1501 err = VOP_GETPAGE(vp, vp_off, PAGESIZE, NULL, 1502 pl, PAGESIZE, seg, addr, S_READ, cred, 1503 NULL); 1504 if (err) { 1505 break; 1506 } 1507 if (vpage != NULL) { 1508 prot = VPP_PROT(vpage); 1509 pageflags = VPP_ISPPLOCK(vpage) ? 1510 LOCK_PAGE : 0; 1511 } 1512 pp = anon_private(&ap, seg, addr, prot, pl[0], 1513 pageflags, cred); 1514 if (pp == NULL) { 1515 err = ENOMEM; 1516 break; 1517 } 1518 (void) anon_set_ptr(ahp, an_idx, ap, 1519 ANON_SLEEP); 1520 page_unlock(pp); 1521 } 1522 ASSERT(ap->an_refcnt == 1); 1523 addr += PAGESIZE; 1524 if (vpage != NULL) { 1525 vpage++; 1526 } 1527 } 1528 npages -= pgcnt; 1529 } 1530 1531 return (err); 1532 } 1533 1534 /* 1535 * Free a group of "size" anon pages, size in bytes, 1536 * and clear out the pointers to the anon entries. 
1537 */ 1538 void 1539 anon_free(struct anon_hdr *ahp, ulong_t index, size_t size) 1540 { 1541 spgcnt_t npages; 1542 struct anon *ap; 1543 ulong_t old; 1544 1545 npages = btopr(size); 1546 1547 while (npages > 0) { 1548 old = index; 1549 if ((ap = anon_get_next_ptr(ahp, &index)) == NULL) 1550 break; 1551 1552 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1553 npages -= index - old; 1554 if (npages <= 0) 1555 break; 1556 1557 (void) anon_set_ptr(ahp, index, NULL, ANON_SLEEP); 1558 anon_decref(ap); 1559 /* 1560 * Bump index and decrement page count 1561 */ 1562 index++; 1563 npages--; 1564 } 1565 } 1566 1567 void 1568 anon_free_pages( 1569 struct anon_hdr *ahp, 1570 ulong_t an_idx, 1571 size_t size, 1572 uint_t szc) 1573 { 1574 spgcnt_t npages; 1575 pgcnt_t pgcnt; 1576 ulong_t index, off; 1577 1578 ASSERT(szc != 0); 1579 pgcnt = page_get_pagecnt(szc); 1580 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1581 npages = btopr(size); 1582 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1583 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1584 ASSERT(an_idx < ahp->size); 1585 1586 VM_STAT_ADD(anonvmstats.freepages[0]); 1587 1588 while (npages > 0) { 1589 index = an_idx; 1590 1591 /* 1592 * Find the next valid slot. 1593 */ 1594 if (anon_get_next_ptr(ahp, &index) == NULL) 1595 break; 1596 1597 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1598 /* 1599 * Now backup index to the beginning of the 1600 * current large page region of the old array. 
1601 */ 1602 index = P2ALIGN(index, pgcnt); 1603 off = index - an_idx; 1604 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1605 npages -= off; 1606 if (npages <= 0) 1607 break; 1608 1609 anon_decref_pages(ahp, index, szc); 1610 1611 off += pgcnt; 1612 an_idx += off; 1613 npages -= pgcnt; 1614 } 1615 } 1616 1617 /* 1618 * Make anonymous pages discardable 1619 */ 1620 void 1621 anon_disclaim(struct anon_map *amp, ulong_t index, size_t size) 1622 { 1623 spgcnt_t npages = btopr(size); 1624 struct anon *ap; 1625 struct vnode *vp; 1626 anoff_t off; 1627 page_t *pp, *root_pp; 1628 kmutex_t *ahm; 1629 pgcnt_t pgcnt; 1630 ulong_t old_idx, idx, i; 1631 struct anon_hdr *ahp = amp->ahp; 1632 anon_sync_obj_t cookie; 1633 1634 ASSERT(RW_READ_HELD(&->a_rwlock)); 1635 pgcnt = 1; 1636 for (; npages > 0; index = (pgcnt == 1) ? index + 1 : 1637 P2ROUNDUP(index + 1, pgcnt), npages -= pgcnt) { 1638 1639 /* 1640 * get anon pointer and index for the first valid entry 1641 * in the anon list, starting from "index" 1642 */ 1643 old_idx = index; 1644 if ((ap = anon_get_next_ptr(ahp, &index)) == NULL) 1645 break; 1646 1647 /* 1648 * decrement npages by number of NULL anon slots we skipped 1649 */ 1650 npages -= index - old_idx; 1651 if (npages <= 0) 1652 break; 1653 1654 anon_array_enter(amp, index, &cookie); 1655 ap = anon_get_ptr(ahp, index); 1656 ASSERT(ap != NULL); 1657 1658 /* 1659 * Get anonymous page and try to lock it SE_EXCL; 1660 * if we couldn't grab the lock we skip to next page. 1661 */ 1662 swap_xlate(ap, &vp, &off); 1663 pp = page_lookup_nowait(vp, (u_offset_t)off, SE_EXCL); 1664 if (pp == NULL) { 1665 segadvstat.MADV_FREE_miss.value.ul++; 1666 pgcnt = 1; 1667 anon_array_exit(&cookie); 1668 continue; 1669 } 1670 pgcnt = page_get_pagecnt(pp->p_szc); 1671 1672 /* 1673 * we cannot free a page which is permanently locked. 1674 * The page_struct_lock need not be acquired to examine 1675 * these fields since the page has an "exclusive" lock. 
1676 */ 1677 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1678 page_unlock(pp); 1679 segadvstat.MADV_FREE_miss.value.ul++; 1680 anon_array_exit(&cookie); 1681 continue; 1682 } 1683 1684 ahm = &anonhash_lock[AH_LOCK(vp, off)]; 1685 mutex_enter(ahm); 1686 ASSERT(ap->an_refcnt != 0); 1687 /* 1688 * skip this one if copy-on-write is not yet broken. 1689 */ 1690 if (ap->an_refcnt > 1) { 1691 mutex_exit(ahm); 1692 page_unlock(pp); 1693 segadvstat.MADV_FREE_miss.value.ul++; 1694 anon_array_exit(&cookie); 1695 continue; 1696 } 1697 1698 if (pp->p_szc == 0) { 1699 pgcnt = 1; 1700 1701 /* 1702 * free swap slot; 1703 */ 1704 if (ap->an_pvp) { 1705 swap_phys_free(ap->an_pvp, ap->an_poff, 1706 PAGESIZE); 1707 ap->an_pvp = NULL; 1708 ap->an_poff = 0; 1709 } 1710 mutex_exit(ahm); 1711 segadvstat.MADV_FREE_hit.value.ul++; 1712 1713 /* 1714 * while we are at it, unload all the translations 1715 * and attempt to free the page. 1716 */ 1717 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1718 /*LINTED: constant in conditional context */ 1719 VN_DISPOSE(pp, B_FREE, 0, kcred); 1720 anon_array_exit(&cookie); 1721 continue; 1722 } 1723 1724 pgcnt = page_get_pagecnt(pp->p_szc); 1725 if (!IS_P2ALIGNED(index, pgcnt) || npages < pgcnt) { 1726 if (!page_try_demote_pages(pp)) { 1727 mutex_exit(ahm); 1728 page_unlock(pp); 1729 segadvstat.MADV_FREE_miss.value.ul++; 1730 anon_array_exit(&cookie); 1731 continue; 1732 } else { 1733 pgcnt = 1; 1734 if (ap->an_pvp) { 1735 swap_phys_free(ap->an_pvp, 1736 ap->an_poff, PAGESIZE); 1737 ap->an_pvp = NULL; 1738 ap->an_poff = 0; 1739 } 1740 mutex_exit(ahm); 1741 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1742 /*LINTED*/ 1743 VN_DISPOSE(pp, B_FREE, 0, kcred); 1744 segadvstat.MADV_FREE_hit.value.ul++; 1745 anon_array_exit(&cookie); 1746 continue; 1747 } 1748 } 1749 mutex_exit(ahm); 1750 root_pp = pp; 1751 1752 /* 1753 * try to lock remaining pages 1754 */ 1755 for (idx = 1; idx < pgcnt; idx++) { 1756 pp++; 1757 if (!page_trylock(pp, SE_EXCL)) 1758 break; 
1759 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1760 page_unlock(pp); 1761 break; 1762 } 1763 } 1764 1765 if (idx == pgcnt) { 1766 for (i = 0; i < pgcnt; i++) { 1767 ap = anon_get_ptr(ahp, index + i); 1768 if (ap == NULL) 1769 break; 1770 swap_xlate(ap, &vp, &off); 1771 ahm = &anonhash_lock[AH_LOCK(vp, off)]; 1772 mutex_enter(ahm); 1773 ASSERT(ap->an_refcnt != 0); 1774 1775 /* 1776 * skip this one if copy-on-write 1777 * is not yet broken. 1778 */ 1779 if (ap->an_refcnt > 1) { 1780 mutex_exit(ahm); 1781 goto skiplp; 1782 } 1783 if (ap->an_pvp) { 1784 swap_phys_free(ap->an_pvp, 1785 ap->an_poff, PAGESIZE); 1786 ap->an_pvp = NULL; 1787 ap->an_poff = 0; 1788 } 1789 mutex_exit(ahm); 1790 } 1791 page_destroy_pages(root_pp); 1792 segadvstat.MADV_FREE_hit.value.ul += pgcnt; 1793 anon_array_exit(&cookie); 1794 continue; 1795 } 1796 skiplp: 1797 segadvstat.MADV_FREE_miss.value.ul += pgcnt; 1798 for (i = 0, pp = root_pp; i < idx; pp++, i++) 1799 page_unlock(pp); 1800 anon_array_exit(&cookie); 1801 } 1802 } 1803 1804 /* 1805 * Return the kept page(s) and protections back to the segment driver. 1806 */ 1807 int 1808 anon_getpage( 1809 struct anon **app, 1810 uint_t *protp, 1811 page_t *pl[], 1812 size_t plsz, 1813 struct seg *seg, 1814 caddr_t addr, 1815 enum seg_rw rw, 1816 struct cred *cred) 1817 { 1818 page_t *pp; 1819 struct anon *ap = *app; 1820 struct vnode *vp; 1821 anoff_t off; 1822 int err; 1823 kmutex_t *ahm; 1824 1825 swap_xlate(ap, &vp, &off); 1826 1827 /* 1828 * Lookup the page. If page is being paged in, 1829 * wait for it to finish as we must return a list of 1830 * pages since this routine acts like the VOP_GETPAGE 1831 * routine does. 
1832 */ 1833 if (pl != NULL && (pp = page_lookup(vp, (u_offset_t)off, SE_SHARED))) { 1834 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1835 mutex_enter(ahm); 1836 if (ap->an_refcnt == 1) 1837 *protp = PROT_ALL; 1838 else 1839 *protp = PROT_ALL & ~PROT_WRITE; 1840 mutex_exit(ahm); 1841 pl[0] = pp; 1842 pl[1] = NULL; 1843 return (0); 1844 } 1845 1846 /* 1847 * Simply treat it as a vnode fault on the anon vp. 1848 */ 1849 1850 TRACE_3(TR_FAC_VM, TR_ANON_GETPAGE, 1851 "anon_getpage:seg %x addr %x vp %x", 1852 seg, addr, vp); 1853 1854 err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, protp, pl, plsz, 1855 seg, addr, rw, cred, NULL); 1856 1857 if (err == 0 && pl != NULL) { 1858 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1859 mutex_enter(ahm); 1860 if (ap->an_refcnt != 1) 1861 *protp &= ~PROT_WRITE; /* make read-only */ 1862 mutex_exit(ahm); 1863 } 1864 return (err); 1865 } 1866 1867 /* 1868 * Creates or returns kept pages to the segment driver. returns -1 if a large 1869 * page cannot be allocated. returns -2 if some other process has allocated a 1870 * larger page. 1871 * 1872 * For cowfault it will allocate any size pages to fill the requested area to 1873 * avoid partially overwriting anon slots (i.e. sharing only some of the anon 1874 * slots within a large page with other processes). This policy greatly 1875 * simplifies large page freeing (which is only freed when all anon slot 1876 * refcnts are 0). 
1877 */ 1878 int 1879 anon_map_getpages( 1880 struct anon_map *amp, 1881 ulong_t start_idx, 1882 uint_t szc, 1883 struct seg *seg, 1884 caddr_t addr, 1885 uint_t prot, 1886 uint_t *protp, 1887 page_t *ppa[], 1888 uint_t *ppa_szc, 1889 struct vpage vpage[], 1890 enum seg_rw rw, 1891 int brkcow, 1892 int anypgsz, 1893 int pgflags, 1894 struct cred *cred) 1895 { 1896 pgcnt_t pgcnt; 1897 struct anon *ap; 1898 struct vnode *vp; 1899 anoff_t off; 1900 page_t *pp, *pl[2], *conpp = NULL; 1901 caddr_t vaddr; 1902 ulong_t pg_idx, an_idx, i; 1903 spgcnt_t nreloc = 0; 1904 int prealloc = 1; 1905 int err, slotcreate; 1906 uint_t vpprot; 1907 int upsize = (szc < seg->s_szc); 1908 1909 #if !defined(__i386) && !defined(__amd64) 1910 ASSERT(seg->s_szc != 0); 1911 #endif 1912 ASSERT(szc <= seg->s_szc); 1913 ASSERT(ppa_szc != NULL); 1914 ASSERT(rw != S_CREATE); 1915 1916 *protp = PROT_ALL; 1917 1918 VM_STAT_ADD(anonvmstats.getpages[0]); 1919 1920 if (szc == 0) { 1921 VM_STAT_ADD(anonvmstats.getpages[1]); 1922 if ((ap = anon_get_ptr(amp->ahp, start_idx)) != NULL) { 1923 err = anon_getpage(&ap, protp, pl, PAGESIZE, seg, 1924 addr, rw, cred); 1925 if (err) 1926 return (err); 1927 ppa[0] = pl[0]; 1928 if (brkcow == 0 || (*protp & PROT_WRITE)) { 1929 VM_STAT_ADD(anonvmstats.getpages[2]); 1930 if (ppa[0]->p_szc != 0 && upsize) { 1931 VM_STAT_ADD(anonvmstats.getpages[3]); 1932 *ppa_szc = MIN(ppa[0]->p_szc, 1933 seg->s_szc); 1934 page_unlock(ppa[0]); 1935 return (-2); 1936 } 1937 return (0); 1938 } 1939 panic("anon_map_getpages: cowfault for szc 0"); 1940 } else { 1941 VM_STAT_ADD(anonvmstats.getpages[4]); 1942 ppa[0] = anon_zero(seg, addr, &ap, cred); 1943 if (ppa[0] == NULL) 1944 return (ENOMEM); 1945 (void) anon_set_ptr(amp->ahp, start_idx, ap, 1946 ANON_SLEEP); 1947 return (0); 1948 } 1949 } 1950 1951 pgcnt = page_get_pagecnt(szc); 1952 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1953 ASSERT(IS_P2ALIGNED(start_idx, pgcnt)); 1954 1955 /* 1956 * First we check for the case that the requtested 
large 1957 * page or larger page already exists in the system. 1958 * Actually we only check if the first constituent page 1959 * exists and only preallocate if it's not found. 1960 */ 1961 ap = anon_get_ptr(amp->ahp, start_idx); 1962 if (ap) { 1963 uint_t pszc; 1964 swap_xlate(ap, &vp, &off); 1965 if (page_exists_forreal(vp, (u_offset_t)off, &pszc)) { 1966 if (pszc > szc && upsize) { 1967 *ppa_szc = MIN(pszc, seg->s_szc); 1968 return (-2); 1969 } 1970 if (pszc >= szc) { 1971 prealloc = 0; 1972 } 1973 } 1974 } 1975 1976 VM_STAT_COND_ADD(prealloc == 0, anonvmstats.getpages[5]); 1977 VM_STAT_COND_ADD(prealloc != 0, anonvmstats.getpages[6]); 1978 1979 top: 1980 /* 1981 * If a smaller page or no page at all was found, 1982 * grab a large page off the freelist. 1983 */ 1984 if (prealloc) { 1985 ASSERT(conpp == NULL); 1986 if (page_alloc_pages(anon_vp, seg, addr, NULL, ppa, 1987 szc, 0, pgflags) != 0) { 1988 VM_STAT_ADD(anonvmstats.getpages[7]); 1989 if (brkcow == 0 || szc < seg->s_szc || 1990 !anon_szcshare(amp->ahp, start_idx)) { 1991 /* 1992 * If the refcnt's of all anon slots are <= 1 1993 * they can't increase since we are holding 1994 * the address space's lock. So segvn can 1995 * safely decrease szc without risking to 1996 * generate a cow fault for the region smaller 1997 * than the segment's largest page size. 1998 */ 1999 VM_STAT_ADD(anonvmstats.getpages[8]); 2000 return (-1); 2001 } 2002 docow: 2003 /* 2004 * This is a cow fault. Copy away the entire 1 large 2005 * page region of this segment. 
2006 */ 2007 if (szc != seg->s_szc) 2008 panic("anon_map_getpages: cowfault for szc %d", 2009 szc); 2010 vaddr = addr; 2011 for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt; 2012 pg_idx++, an_idx++, vaddr += PAGESIZE) { 2013 if ((ap = anon_get_ptr(amp->ahp, an_idx)) != 2014 NULL) { 2015 err = anon_getpage(&ap, &vpprot, pl, 2016 PAGESIZE, seg, vaddr, rw, cred); 2017 if (err) { 2018 for (i = 0; i < pg_idx; i++) { 2019 if ((pp = ppa[i]) != 2020 NULL) 2021 page_unlock(pp); 2022 } 2023 return (err); 2024 } 2025 ppa[pg_idx] = pl[0]; 2026 } else { 2027 /* 2028 * Since this is a cowfault we know 2029 * that this address space has a 2030 * parent or children which means 2031 * anon_dup_fill_holes() has initialized 2032 * all anon slots within a large page 2033 * region that had at least one anon 2034 * slot at the time of fork(). 2035 */ 2036 panic("anon_map_getpages: " 2037 "cowfault but anon slot is empty"); 2038 } 2039 } 2040 VM_STAT_ADD(anonvmstats.getpages[9]); 2041 *protp = PROT_ALL; 2042 return (anon_map_privatepages(amp, start_idx, szc, seg, 2043 addr, prot, ppa, vpage, anypgsz, pgflags, cred)); 2044 } 2045 } 2046 2047 VM_STAT_ADD(anonvmstats.getpages[10]); 2048 2049 an_idx = start_idx; 2050 pg_idx = 0; 2051 vaddr = addr; 2052 while (pg_idx < pgcnt) { 2053 slotcreate = 0; 2054 if ((ap = anon_get_ptr(amp->ahp, an_idx)) == NULL) { 2055 VM_STAT_ADD(anonvmstats.getpages[11]); 2056 /* 2057 * For us to have decided not to preallocate 2058 * would have meant that a large page 2059 * was found. Which also means that all of the 2060 * anon slots for that page would have been 2061 * already created for us. 2062 */ 2063 if (prealloc == 0) 2064 panic("anon_map_getpages: prealloc = 0"); 2065 2066 slotcreate = 1; 2067 ap = anon_alloc(NULL, 0); 2068 } 2069 swap_xlate(ap, &vp, &off); 2070 2071 /* 2072 * Now setup our preallocated page to pass down 2073 * to swap_getpage(). 
2074 */ 2075 if (prealloc) { 2076 ASSERT(ppa[pg_idx]->p_szc == szc); 2077 conpp = ppa[pg_idx]; 2078 } 2079 ASSERT(prealloc || conpp == NULL); 2080 2081 /* 2082 * If we just created this anon slot then call 2083 * with S_CREATE to prevent doing IO on the page. 2084 * Similar to the anon_zero case. 2085 */ 2086 err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE, 2087 NULL, pl, PAGESIZE, conpp, ppa_szc, &nreloc, seg, vaddr, 2088 slotcreate == 1 ? S_CREATE : rw, cred); 2089 2090 if (err) { 2091 ASSERT(err != -2 || upsize); 2092 VM_STAT_ADD(anonvmstats.getpages[12]); 2093 ASSERT(slotcreate == 0); 2094 goto io_err; 2095 } 2096 2097 pp = pl[0]; 2098 2099 if (pp->p_szc < szc || (pp->p_szc > szc && upsize)) { 2100 VM_STAT_ADD(anonvmstats.getpages[13]); 2101 ASSERT(slotcreate == 0); 2102 ASSERT(prealloc == 0); 2103 ASSERT(pg_idx == 0); 2104 if (pp->p_szc > szc) { 2105 ASSERT(upsize); 2106 *ppa_szc = MIN(pp->p_szc, seg->s_szc); 2107 page_unlock(pp); 2108 VM_STAT_ADD(anonvmstats.getpages[14]); 2109 return (-2); 2110 } 2111 page_unlock(pp); 2112 prealloc = 1; 2113 goto top; 2114 } 2115 2116 /* 2117 * If we decided to preallocate but VOP_GETPAGE 2118 * found a page in the system that satisfies our 2119 * request then free up our preallocated large page 2120 * and continue looping accross the existing large 2121 * page via VOP_GETPAGE. 2122 */ 2123 if (prealloc && pp != ppa[pg_idx]) { 2124 VM_STAT_ADD(anonvmstats.getpages[15]); 2125 ASSERT(slotcreate == 0); 2126 ASSERT(pg_idx == 0); 2127 conpp = NULL; 2128 prealloc = 0; 2129 page_free_pages(ppa[0]); 2130 } 2131 2132 if (prealloc && nreloc > 1) { 2133 /* 2134 * we have relocated out of a smaller large page. 2135 * skip npgs - 1 iterations and continue which will 2136 * increment by one the loop indices. 
2137 */ 2138 spgcnt_t npgs = nreloc; 2139 2140 VM_STAT_ADD(anonvmstats.getpages[16]); 2141 2142 ASSERT(pp == ppa[pg_idx]); 2143 ASSERT(slotcreate == 0); 2144 ASSERT(pg_idx + npgs <= pgcnt); 2145 if ((*protp & PROT_WRITE) && 2146 anon_share(amp->ahp, an_idx, npgs)) { 2147 *protp &= ~PROT_WRITE; 2148 } 2149 pg_idx += npgs; 2150 an_idx += npgs; 2151 vaddr += PAGESIZE * npgs; 2152 continue; 2153 } 2154 2155 VM_STAT_ADD(anonvmstats.getpages[17]); 2156 2157 /* 2158 * Anon_zero case. 2159 */ 2160 if (slotcreate) { 2161 ASSERT(prealloc); 2162 pagezero(pp, 0, PAGESIZE); 2163 CPU_STATS_ADD_K(vm, zfod, 1); 2164 hat_setrefmod(pp); 2165 } 2166 2167 ASSERT(prealloc == 0 || ppa[pg_idx] == pp); 2168 ASSERT(prealloc != 0 || PAGE_SHARED(pp)); 2169 ASSERT(prealloc == 0 || PAGE_EXCL(pp)); 2170 2171 if (pg_idx > 0 && 2172 ((page_pptonum(pp) != page_pptonum(ppa[pg_idx - 1]) + 1) || 2173 (pp->p_szc != ppa[pg_idx - 1]->p_szc))) { 2174 panic("anon_map_getpages: unexpected page"); 2175 } else if (pg_idx == 0 && (page_pptonum(pp) & (pgcnt - 1))) { 2176 panic("anon_map_getpages: unaligned page"); 2177 } 2178 2179 if (prealloc == 0) { 2180 ppa[pg_idx] = pp; 2181 } 2182 2183 if (ap->an_refcnt > 1) { 2184 VM_STAT_ADD(anonvmstats.getpages[18]); 2185 *protp &= ~PROT_WRITE; 2186 } 2187 2188 /* 2189 * If this is a new anon slot then initialize 2190 * the anon array entry. 2191 */ 2192 if (slotcreate) { 2193 (void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP); 2194 } 2195 pg_idx++; 2196 an_idx++; 2197 vaddr += PAGESIZE; 2198 } 2199 2200 /* 2201 * Since preallocated pages come off the freelist 2202 * they are locked SE_EXCL. Simply downgrade and return. 
2203 */ 2204 if (prealloc) { 2205 VM_STAT_ADD(anonvmstats.getpages[19]); 2206 conpp = NULL; 2207 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { 2208 page_downgrade(ppa[pg_idx]); 2209 } 2210 } 2211 ASSERT(conpp == NULL); 2212 2213 if (brkcow == 0 || (*protp & PROT_WRITE)) { 2214 VM_STAT_ADD(anonvmstats.getpages[20]); 2215 return (0); 2216 } 2217 2218 if (szc < seg->s_szc) 2219 panic("anon_map_getpages: cowfault for szc %d", szc); 2220 2221 VM_STAT_ADD(anonvmstats.getpages[21]); 2222 2223 *protp = PROT_ALL; 2224 return (anon_map_privatepages(amp, start_idx, szc, seg, addr, prot, 2225 ppa, vpage, anypgsz, pgflags, cred)); 2226 io_err: 2227 /* 2228 * We got an IO error somewhere in our large page. 2229 * If we were using a preallocated page then just demote 2230 * all the constituent pages that we've succeeded with sofar 2231 * to PAGESIZE pages and leave them in the system 2232 * unlocked. 2233 */ 2234 2235 ASSERT(err != -2 || ((pg_idx == 0) && upsize)); 2236 2237 VM_STAT_COND_ADD(err > 0, anonvmstats.getpages[22]); 2238 VM_STAT_COND_ADD(err == -1, anonvmstats.getpages[23]); 2239 VM_STAT_COND_ADD(err == -2, anonvmstats.getpages[24]); 2240 2241 if (prealloc) { 2242 conpp = NULL; 2243 if (pg_idx > 0) { 2244 VM_STAT_ADD(anonvmstats.getpages[25]); 2245 for (i = 0; i < pgcnt; i++) { 2246 pp = ppa[i]; 2247 ASSERT(PAGE_EXCL(pp)); 2248 ASSERT(pp->p_szc == szc); 2249 pp->p_szc = 0; 2250 } 2251 for (i = 0; i < pg_idx; i++) { 2252 ASSERT(!hat_page_is_mapped(ppa[i])); 2253 page_unlock(ppa[i]); 2254 } 2255 /* 2256 * Now free up the remaining unused constituent 2257 * pages. 
2258 */ 2259 while (pg_idx < pgcnt) { 2260 ASSERT(!hat_page_is_mapped(ppa[pg_idx])); 2261 page_free(ppa[pg_idx], 0); 2262 pg_idx++; 2263 } 2264 } else { 2265 VM_STAT_ADD(anonvmstats.getpages[26]); 2266 page_free_pages(ppa[0]); 2267 } 2268 } else { 2269 VM_STAT_ADD(anonvmstats.getpages[27]); 2270 ASSERT(err > 0); 2271 for (i = 0; i < pg_idx; i++) 2272 page_unlock(ppa[i]); 2273 } 2274 ASSERT(conpp == NULL); 2275 if (err != -1) 2276 return (err); 2277 /* 2278 * we are here because we failed to relocate. 2279 */ 2280 ASSERT(prealloc); 2281 if (brkcow == 0 || szc < seg->s_szc || 2282 !anon_szcshare(amp->ahp, start_idx)) { 2283 VM_STAT_ADD(anonvmstats.getpages[28]); 2284 return (-1); 2285 } 2286 VM_STAT_ADD(anonvmstats.getpages[29]); 2287 goto docow; 2288 } 2289 2290 2291 /* 2292 * Turn a reference to an object or shared anon page 2293 * into a private page with a copy of the data from the 2294 * original page which is always locked by the caller. 2295 * This routine unloads the translation and unlocks the 2296 * original page, if it isn't being stolen, before returning 2297 * to the caller. 2298 * 2299 * NOTE: The original anon slot is not freed by this routine 2300 * It must be freed by the caller while holding the 2301 * "anon_map" lock to prevent races which can occur if 2302 * a process has multiple lwps in its address space. 
 *
 * Returns the private page (opp itself when STEAL_PAGE is set) after its
 * lock has been downgraded with page_downgrade(), or NULL on failure.  On
 * failure *app is put back to its original value and opp is unlocked.
 */
page_t *
anon_private(
	struct anon **app,
	struct seg *seg,
	caddr_t addr,
	uint_t prot,
	page_t *opp,
	int oppflags,
	struct cred *cred)
{
	struct anon *old = *app;
	struct anon *new;
	page_t *pp = NULL;
	struct vnode *vp;
	anoff_t off;
	page_t *anon_pl[1 + 1];
	int err;

	if (oppflags & STEAL_PAGE)
		ASSERT(PAGE_EXCL(opp));
	else
		ASSERT(PAGE_LOCKED(opp));

	CPU_STATS_ADD_K(vm, cow_fault, 1);

	/* Kernel probe */
	TNF_PROBE_1(anon_private, "vm pagefault", /* CSTYLED */,
	    tnf_opaque, address, addr);

	/*
	 * Allocate the new anon slot and hand it to the caller through
	 * *app right away; the error path at "out" restores the old value.
	 */
	*app = new = anon_alloc(NULL, 0);
	swap_xlate(new, &vp, &off);

	/*
	 * When stealing, no copy is made: the original page is renamed
	 * to the identity (vp, off) of the new slot and returned.
	 */
	if (oppflags & STEAL_PAGE) {
		page_rename(opp, vp, (u_offset_t)off);
		pp = opp;
		TRACE_5(TR_FAC_VM, TR_ANON_PRIVATE,
		    "anon_private:seg %p addr %x pp %p vp %p off %lx",
		    seg, addr, pp, vp, off);
		hat_setmod(pp);

		/* bug 4026339 */
		page_downgrade(pp);
		return (pp);
	}

	/*
	 * Call the VOP_GETPAGE routine to create the page, thereby
	 * enabling the vnode driver to allocate any filesystem
	 * space (e.g., disk block allocation for UFS).  This also
	 * prevents more than one page from being added to the
	 * vnode at the same time.
	 */
	err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, NULL,
	    anon_pl, PAGESIZE, seg, addr, S_CREATE, cred, NULL);
	if (err)
		goto out;

	pp = anon_pl[0];

	/*
	 * If the original page was locked, we need to move the lock
	 * to the new page by transferring 'cowcnt/lckcnt' of the original
	 * page to 'cowcnt/lckcnt' of the new page.
	 *
	 * See Statement at the beginning of segvn_lockop() and
	 * comments in page_pp_useclaim() regarding the way
	 * cowcnts/lckcnts are handled.
	 *
	 * Also availrmem must be decremented up front for read only mapping
	 * before calling page_pp_useclaim. page_pp_useclaim will bump it back
	 * if availrmem did not need to be decremented after all.
	 */
	if (oppflags & LOCK_PAGE) {
		if ((prot & PROT_WRITE) == 0) {
			mutex_enter(&freemem_lock);
			if (availrmem > pages_pp_maximum) {
				availrmem--;
				pages_useclaim++;
			} else {
				/* no memory to reserve: fail the request */
				mutex_exit(&freemem_lock);
				goto out;
			}
			mutex_exit(&freemem_lock);
		}
		page_pp_useclaim(opp, pp, prot & PROT_WRITE);
	}

	/*
	 * Now copy the contents from the original page,
	 * which is locked and loaded in the MMU by
	 * the caller to prevent yet another page fault.
	 */
	/* XXX - should set mod bit in here */
	if (ppcopy(opp, pp) == 0) {
		/*
		 * Before ppcopy could handle UE or other faults, we
		 * would have panicked here, and still have no option
		 * but to do so now.
		 */
		panic("anon_private, ppcopy failed, opp = 0x%p, pp = 0x%p",
		    (void *)opp, (void *)pp);
	}

	hat_setrefmod(pp);		/* mark as modified */

	/*
	 * Unload the old translation.
	 */
	hat_unload(seg->s_as->a_hat, addr, PAGESIZE, HAT_UNLOAD);

	/*
	 * Free unmapped, unmodified original page.
	 * or release the lock on the original page,
	 * otherwise the process will sleep forever in
	 * anon_decref() waiting for the "exclusive" lock
	 * on the page.
	 */
	(void) page_release(opp, 1);

	/*
	 * we are done with page creation so downgrade the new
	 * page's selock to shared, this helps when multiple
	 * as_fault(...SOFTLOCK...) are done to the same
	 * page(aio)
	 */
	page_downgrade(pp);

	/*
	 * NOTE:  The original anon slot must be freed by the
	 * caller while holding the "anon_map" lock, if we
	 * copied away from an anonymous page.
	 */
	return (pp);

out:
	/*
	 * Failure: give the caller back its original anon pointer, unlock
	 * the new page if one was created, drop our reference on the new
	 * anon slot and unlock the original page.
	 */
	*app = old;
	if (pp)
		page_unlock(pp);
	anon_decref(new);
	page_unlock(opp);
	return ((page_t *)NULL);
}

/*
 * Large page counterpart of anon_private(): give the pgcnt
 * (= page_get_pagecnt(szc)) anon slots starting at start_idx private
 * copies of the pages passed in ppa[].  For every slot a new anon
 * structure is allocated, a new page is created for it, the contents of
 * ppa[pg_idx] are copied over, the old anon slot (if any) is decref'ed
 * and ppa[pg_idx] is replaced with the new page.
 *
 * When anypgsz is -1, or when page_alloc_pages() fails, no large page is
 * preallocated and PAGESIZE pages are created instead.
 *
 * Returns:
 *	0	the copies were made, or the first slot had a single
 *		reference and ppa[0] already has size code szc (nothing
 *		is copied in that case).
 *	-1	the first slot had a single reference but ppa[0]->p_szc
 *		is smaller than szc; all ppa[] pages have been unlocked.
 *	ENOMEM	availrmem could not be reserved for locked pages of a
 *		non-writable mapping; all non-NULL ppa[] pages have been
 *		unlocked.
 */
int
anon_map_privatepages(
	struct anon_map *amp,
	ulong_t start_idx,
	uint_t szc,
	struct seg *seg,
	caddr_t addr,
	uint_t prot,
	page_t *ppa[],
	struct vpage vpage[],
	int anypgsz,
	int pgflags,
	struct cred *cred)
{
	pgcnt_t pgcnt;
	struct vnode *vp;
	anoff_t off;
	page_t *pl[2], *conpp = NULL;
	int err;
	int prealloc = 1;
	struct anon *ap, *oldap;
	caddr_t vaddr;
	page_t *pplist, *pp;
	ulong_t pg_idx, an_idx;
	spgcnt_t nreloc = 0;
	int pagelock = 0;
	kmutex_t *ahmpages = NULL;
#ifdef DEBUG
	int refcnt;
#endif

	ASSERT(szc != 0);
	ASSERT(szc == seg->s_szc);

	VM_STAT_ADD(anonvmstats.privatepages[0]);

	pgcnt = page_get_pagecnt(szc);
	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
	ASSERT(IS_P2ALIGNED(start_idx, pgcnt));

	ASSERT(amp != NULL);
	ap = anon_get_ptr(amp->ahp, start_idx);
	ASSERT(ap == NULL || ap->an_refcnt >= 1);

	VM_STAT_COND_ADD(ap == NULL, anonvmstats.privatepages[1]);

	/*
	 * Now try and allocate the large page. If we fail then just
	 * let VOP_GETPAGE give us PAGESIZE pages. Normally we let
	 * the caller make this decision but to avoid added complexity
	 * it's simpler to handle that case here.
	 */
	if (anypgsz == -1) {
		VM_STAT_ADD(anonvmstats.privatepages[2]);
		prealloc = 0;
	} else if (page_alloc_pages(anon_vp, seg, addr, &pplist, NULL, szc,
	    anypgsz, pgflags) != 0) {
		VM_STAT_ADD(anonvmstats.privatepages[3]);
		prealloc = 0;
	}

	/*
	 * make the decrement of all refcnts of all
	 * anon slots of a large page appear atomic by
	 * getting an anonpages_hash_lock for the
	 * first anon slot of a large page.
	 */
	if (ap != NULL) {
		ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp,
		    ap->an_off)];
		mutex_enter(ahmpages);
		if (ap->an_refcnt == 1) {
			/*
			 * The slot is not shared any more, so no copy is
			 * needed.  Give back the preallocated page and
			 * tell the caller whether ppa[] is already a
			 * page of the requested size.
			 */
			VM_STAT_ADD(anonvmstats.privatepages[4]);
			ASSERT(!anon_share(amp->ahp, start_idx, pgcnt));
			mutex_exit(ahmpages);

			if (prealloc) {
				page_free_replacement_page(pplist);
				page_create_putback(pgcnt);
			}
			ASSERT(ppa[0]->p_szc <= szc);
			if (ppa[0]->p_szc == szc) {
				VM_STAT_ADD(anonvmstats.privatepages[5]);
				return (0);
			}
			for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
				ASSERT(ppa[pg_idx] != NULL);
				page_unlock(ppa[pg_idx]);
			}
			return (-1);
		}
	}

	/*
	 * If we are passed in the vpage array and this is
	 * not PROT_WRITE then we need to decrement availrmem
	 * up front before we try anything. If we need to and
	 * can't decrement availrmem then it's better to fail now
	 * than in the middle of processing the new large page.
	 * page_pp_useclaim() on behalf of each constituent page
	 * below will adjust availrmem back for the cases not needed.
	 */
	if (vpage != NULL && (prot & PROT_WRITE) == 0) {
		for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
			if (VPP_ISPPLOCK(&vpage[pg_idx])) {
				pagelock = 1;
				break;
			}
		}
		if (pagelock) {
			VM_STAT_ADD(anonvmstats.privatepages[6]);
			mutex_enter(&freemem_lock);
			if (availrmem >= pages_pp_maximum + pgcnt) {
				availrmem -= pgcnt;
				pages_useclaim += pgcnt;
			} else {
				/* undo everything acquired so far and fail */
				VM_STAT_ADD(anonvmstats.privatepages[7]);
				mutex_exit(&freemem_lock);
				if (ahmpages != NULL) {
					mutex_exit(ahmpages);
				}
				if (prealloc) {
					page_free_replacement_page(pplist);
					page_create_putback(pgcnt);
				}
				for (pg_idx = 0; pg_idx < pgcnt; pg_idx++)
					if (ppa[pg_idx] != NULL)
						page_unlock(ppa[pg_idx]);
				return (ENOMEM);
			}
			mutex_exit(&freemem_lock);
		}
	}

	CPU_STATS_ADD_K(vm, cow_fault, pgcnt);

	VM_STAT_ADD(anonvmstats.privatepages[8]);

	an_idx = start_idx;
	pg_idx = 0;
	vaddr = addr;
	for (; pg_idx < pgcnt; pg_idx++, an_idx++, vaddr += PAGESIZE) {
		ASSERT(ppa[pg_idx] != NULL);
		oldap = anon_get_ptr(amp->ahp, an_idx);
		/*
		 * DEBUG only: ahmpages is held exactly when the range has
		 * old anon slots, and every old slot in the range carries
		 * the same refcnt (> 1) as the first one.
		 */
		ASSERT(ahmpages != NULL || oldap == NULL);
		ASSERT(ahmpages == NULL || oldap != NULL);
		ASSERT(ahmpages == NULL || oldap->an_refcnt > 1);
		ASSERT(ahmpages == NULL || pg_idx != 0 ||
		    (refcnt = oldap->an_refcnt));
		ASSERT(ahmpages == NULL || pg_idx == 0 ||
		    refcnt == oldap->an_refcnt);

		ap = anon_alloc(NULL, 0);

		swap_xlate(ap, &vp, &off);

		/*
		 * Now setup our preallocated page to pass down to
		 * swap_getpage().
		 */
		if (prealloc) {
			pp = pplist;
			page_sub(&pplist, pp);
			conpp = pp;
		}

		err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE, NULL, pl,
		    PAGESIZE, conpp, NULL, &nreloc, seg, vaddr,
		    S_CREATE, cred);

		/*
		 * Impossible to fail this is S_CREATE.
		 */
		if (err)
			panic("anon_map_privatepages: VOP_GETPAGE failed");

		ASSERT(prealloc ? pp == pl[0] : pl[0]->p_szc == 0);
		ASSERT(prealloc == 0 || nreloc == 1);

		pp = pl[0];

		/*
		 * If the original page was locked, we need to move
		 * the lock to the new page by transferring
		 * 'cowcnt/lckcnt' of the original page to 'cowcnt/lckcnt'
		 * of the new page. pg_idx can be used to index
		 * into the vpage array since the caller will guarantee
		 * that vpage struct passed in corresponds to addr
		 * and forward.
		 */
		if (vpage != NULL && VPP_ISPPLOCK(&vpage[pg_idx])) {
			page_pp_useclaim(ppa[pg_idx], pp, prot & PROT_WRITE);
		} else if (pagelock) {
			/*
			 * This constituent page is not locked: hand back
			 * the availrmem reserved for it up front.
			 */
			mutex_enter(&freemem_lock);
			availrmem++;
			pages_useclaim--;
			mutex_exit(&freemem_lock);
		}

		/*
		 * Now copy the contents from the original page.
		 */
		if (ppcopy(ppa[pg_idx], pp) == 0) {
			/*
			 * Before ppcopy could handle UE or other faults, we
			 * would have panicked here, and still have no option
			 * but to do so now.
			 */
			panic("anon_map_privatepages, ppcopy failed");
		}

		hat_setrefmod(pp);		/* mark as modified */

		/*
		 * Release the lock on the original page,
		 * decrement the old slot, and down grade the lock
		 * on the new copy.
		 */
		page_unlock(ppa[pg_idx]);

		/*
		 * Preallocated pages are downgraded in one pass after
		 * the loop instead.
		 */
		if (!prealloc)
			page_downgrade(pp);

		ppa[pg_idx] = pp;

		/*
		 * Now reflect the copy in the new anon array.
		 */
		ASSERT(ahmpages == NULL || oldap->an_refcnt > 1);
		if (oldap != NULL)
			anon_decref(oldap);
		(void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP);
	}

	/*
	 * Unload the old large page translation.
	 */
	hat_unload(seg->s_as->a_hat, addr, pgcnt << PAGESHIFT, HAT_UNLOAD);

	if (ahmpages != NULL) {
		mutex_exit(ahmpages);
	}
	ASSERT(prealloc == 0 || pplist == NULL);
	if (prealloc) {
		VM_STAT_ADD(anonvmstats.privatepages[9]);
		for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
			page_downgrade(ppa[pg_idx]);
		}
	}

	return (0);
}

/*
 * Allocate a private zero-filled anon page.
 *
 * A new anon slot is allocated and returned through *app.  Returns the
 * new page after page_downgrade(), or NULL (with *app set to NULL and
 * the new slot released) if VOP_GETPAGE fails.
 */
page_t *
anon_zero(struct seg *seg, caddr_t addr, struct anon **app, struct cred *cred)
{
	struct anon *ap;
	page_t *pp;
	struct vnode *vp;
	anoff_t off;
	page_t *anon_pl[1 + 1];
	int err;

	/* Kernel probe */
	TNF_PROBE_1(anon_zero, "vm pagefault", /* CSTYLED */,
	    tnf_opaque, address, addr);

	*app = ap = anon_alloc(NULL, 0);
	swap_xlate(ap, &vp, &off);

	/*
	 * Call the VOP_GETPAGE routine to create the page, thereby
	 * enabling the vnode driver to allocate any filesystem
	 * dependent structures (e.g., disk block allocation for UFS).
	 * This also prevents more than one page from being added to
	 * the vnode at the same time since it is locked.
	 */
	err = VOP_GETPAGE(vp, off, PAGESIZE, NULL,
	    anon_pl, PAGESIZE, seg, addr, S_CREATE, cred, NULL);
	if (err) {
		*app = NULL;
		anon_decref(ap);
		return (NULL);
	}
	pp = anon_pl[0];

	pagezero(pp, 0, PAGESIZE);	/* XXX - should set mod bit */
	page_downgrade(pp);
	CPU_STATS_ADD_K(vm, zfod, 1);
	hat_setrefmod(pp);	/* mark as modified so pageout writes back */
	return (pp);
}


/*
 * Allocate array of private zero-filled anon pages for empty slots
 * and kept pages for non empty slots within given range.
 *
 * NOTE: This routine will try and use large pages
 *	if available and supported by underlying platform.
2750 */ 2751 int 2752 anon_map_createpages( 2753 struct anon_map *amp, 2754 ulong_t start_index, 2755 size_t len, 2756 page_t *ppa[], 2757 struct seg *seg, 2758 caddr_t addr, 2759 enum seg_rw rw, 2760 struct cred *cred) 2761 { 2762 2763 struct anon *ap; 2764 struct vnode *ap_vp; 2765 page_t *pp, *pplist, *anon_pl[1 + 1], *conpp = NULL; 2766 int err = 0; 2767 ulong_t p_index, index; 2768 pgcnt_t npgs, pg_cnt; 2769 spgcnt_t nreloc = 0; 2770 uint_t l_szc, szc, prot; 2771 anoff_t ap_off; 2772 size_t pgsz; 2773 lgrp_t *lgrp; 2774 kmutex_t *ahm; 2775 2776 /* 2777 * XXX For now only handle S_CREATE. 2778 */ 2779 ASSERT(rw == S_CREATE); 2780 2781 index = start_index; 2782 p_index = 0; 2783 npgs = btopr(len); 2784 2785 /* 2786 * If this platform supports multiple page sizes 2787 * then try and allocate directly from the free 2788 * list for pages larger than PAGESIZE. 2789 * 2790 * NOTE:When we have page_create_ru we can stop 2791 * directly allocating from the freelist. 2792 */ 2793 l_szc = seg->s_szc; 2794 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 2795 while (npgs) { 2796 2797 /* 2798 * if anon slot already exists 2799 * (means page has been created) 2800 * so 1) look up the page 2801 * 2) if the page is still in memory, get it. 2802 * 3) if not, create a page and 2803 * page in from physical swap device. 2804 * These are done in anon_getpage(). 2805 */ 2806 ap = anon_get_ptr(amp->ahp, index); 2807 if (ap) { 2808 err = anon_getpage(&ap, &prot, anon_pl, PAGESIZE, 2809 seg, addr, S_READ, cred); 2810 if (err) { 2811 ANON_LOCK_EXIT(&->a_rwlock); 2812 panic("anon_map_createpages: anon_getpage"); 2813 } 2814 pp = anon_pl[0]; 2815 ppa[p_index++] = pp; 2816 2817 /* 2818 * an_pvp can become non-NULL after SysV's page was 2819 * paged out before ISM was attached to this SysV 2820 * shared memory segment. So free swap slot if needed. 
2821 */ 2822 if (ap->an_pvp != NULL) { 2823 page_io_lock(pp); 2824 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, 2825 ap->an_off)]; 2826 mutex_enter(ahm); 2827 if (ap->an_pvp != NULL) { 2828 swap_phys_free(ap->an_pvp, 2829 ap->an_poff, PAGESIZE); 2830 ap->an_pvp = NULL; 2831 ap->an_poff = 0; 2832 mutex_exit(ahm); 2833 hat_setmod(pp); 2834 } else { 2835 mutex_exit(ahm); 2836 } 2837 page_io_unlock(pp); 2838 } 2839 2840 addr += PAGESIZE; 2841 index++; 2842 npgs--; 2843 continue; 2844 } 2845 /* 2846 * Now try and allocate the largest page possible 2847 * for the current address and range. 2848 * Keep dropping down in page size until: 2849 * 2850 * 1) Properly aligned 2851 * 2) Does not overlap existing anon pages 2852 * 3) Fits in remaining range. 2853 * 4) able to allocate one. 2854 * 2855 * NOTE: XXX When page_create_ru is completed this code 2856 * will change. 2857 */ 2858 szc = l_szc; 2859 pplist = NULL; 2860 pg_cnt = 0; 2861 while (szc) { 2862 pgsz = page_get_pagesize(szc); 2863 pg_cnt = pgsz >> PAGESHIFT; 2864 if (IS_P2ALIGNED(addr, pgsz) && pg_cnt <= npgs && 2865 anon_pages(amp->ahp, index, pg_cnt) == 0) { 2866 /* 2867 * XXX 2868 * Since we are faking page_create() 2869 * we also need to do the freemem and 2870 * pcf accounting. 2871 */ 2872 (void) page_create_wait(pg_cnt, PG_WAIT); 2873 2874 /* 2875 * Get lgroup to allocate next page of shared 2876 * memory from and use it to specify where to 2877 * allocate the physical memory 2878 */ 2879 lgrp = lgrp_mem_choose(seg, addr, pgsz); 2880 2881 pplist = page_get_freelist( 2882 anon_vp, (u_offset_t)0, seg, 2883 addr, pgsz, 0, lgrp); 2884 2885 if (pplist == NULL) { 2886 page_create_putback(pg_cnt); 2887 } 2888 2889 /* 2890 * If a request for a page of size 2891 * larger than PAGESIZE failed 2892 * then don't try that size anymore. 
2893 */ 2894 if (pplist == NULL) { 2895 l_szc = szc - 1; 2896 } else { 2897 break; 2898 } 2899 } 2900 szc--; 2901 } 2902 2903 /* 2904 * If just using PAGESIZE pages then don't 2905 * directly allocate from the free list. 2906 */ 2907 if (pplist == NULL) { 2908 ASSERT(szc == 0); 2909 pp = anon_zero(seg, addr, &ap, cred); 2910 if (pp == NULL) { 2911 ANON_LOCK_EXIT(&->a_rwlock); 2912 panic("anon_map_createpages: anon_zero"); 2913 } 2914 ppa[p_index++] = pp; 2915 2916 ASSERT(anon_get_ptr(amp->ahp, index) == NULL); 2917 (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP); 2918 2919 addr += PAGESIZE; 2920 index++; 2921 npgs--; 2922 continue; 2923 } 2924 2925 /* 2926 * pplist is a list of pg_cnt PAGESIZE pages. 2927 * These pages are locked SE_EXCL since they 2928 * came directly off the free list. 2929 */ 2930 ASSERT(IS_P2ALIGNED(pg_cnt, pg_cnt)); 2931 ASSERT(IS_P2ALIGNED(index, pg_cnt)); 2932 ASSERT(conpp == NULL); 2933 while (pg_cnt--) { 2934 2935 ap = anon_alloc(NULL, 0); 2936 swap_xlate(ap, &ap_vp, &ap_off); 2937 2938 ASSERT(pplist != NULL); 2939 pp = pplist; 2940 page_sub(&pplist, pp); 2941 PP_CLRFREE(pp); 2942 PP_CLRAGED(pp); 2943 conpp = pp; 2944 2945 err = swap_getconpage(ap_vp, ap_off, PAGESIZE, 2946 (uint_t *)NULL, anon_pl, PAGESIZE, conpp, NULL, 2947 &nreloc, seg, addr, S_CREATE, cred); 2948 2949 if (err) { 2950 ANON_LOCK_EXIT(&->a_rwlock); 2951 panic("anon_map_createpages: S_CREATE"); 2952 } 2953 2954 ASSERT(anon_pl[0] == pp); 2955 ASSERT(nreloc == 1); 2956 pagezero(pp, 0, PAGESIZE); 2957 CPU_STATS_ADD_K(vm, zfod, 1); 2958 hat_setrefmod(pp); 2959 2960 ASSERT(anon_get_ptr(amp->ahp, index) == NULL); 2961 (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP); 2962 2963 ppa[p_index++] = pp; 2964 2965 addr += PAGESIZE; 2966 index++; 2967 npgs--; 2968 } 2969 conpp = NULL; 2970 pg_cnt = pgsz >> PAGESHIFT; 2971 p_index = p_index - pg_cnt; 2972 while (pg_cnt--) { 2973 page_downgrade(ppa[p_index++]); 2974 } 2975 } 2976 ANON_LOCK_EXIT(&->a_rwlock); 2977 return (0); 2978 
} 2979 2980 static int 2981 anon_try_demote_pages( 2982 struct anon_hdr *ahp, 2983 ulong_t sidx, 2984 uint_t szc, 2985 page_t **ppa, 2986 int private) 2987 { 2988 struct anon *ap; 2989 pgcnt_t pgcnt = page_get_pagecnt(szc); 2990 page_t *pp; 2991 pgcnt_t i; 2992 kmutex_t *ahmpages = NULL; 2993 int root = 0; 2994 pgcnt_t npgs; 2995 pgcnt_t curnpgs = 0; 2996 size_t ppasize = 0; 2997 2998 ASSERT(szc != 0); 2999 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 3000 ASSERT(IS_P2ALIGNED(sidx, pgcnt)); 3001 ASSERT(sidx < ahp->size); 3002 3003 if (ppa == NULL) { 3004 ppasize = pgcnt * sizeof (page_t *); 3005 ppa = kmem_alloc(ppasize, KM_SLEEP); 3006 } 3007 3008 ap = anon_get_ptr(ahp, sidx); 3009 if (ap != NULL && private) { 3010 VM_STAT_ADD(anonvmstats.demotepages[1]); 3011 ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 3012 mutex_enter(ahmpages); 3013 } 3014 3015 if (ap != NULL && ap->an_refcnt > 1) { 3016 if (ahmpages != NULL) { 3017 VM_STAT_ADD(anonvmstats.demotepages[2]); 3018 mutex_exit(ahmpages); 3019 } 3020 if (ppasize != 0) { 3021 kmem_free(ppa, ppasize); 3022 } 3023 return (0); 3024 } 3025 if (ahmpages != NULL) { 3026 mutex_exit(ahmpages); 3027 } 3028 if (ahp->size - sidx < pgcnt) { 3029 ASSERT(private == 0); 3030 pgcnt = ahp->size - sidx; 3031 } 3032 for (i = 0; i < pgcnt; i++, sidx++) { 3033 ap = anon_get_ptr(ahp, sidx); 3034 if (ap != NULL) { 3035 if (ap->an_refcnt != 1) { 3036 panic("anon_try_demote_pages: an_refcnt != 1"); 3037 } 3038 pp = ppa[i] = page_lookup(ap->an_vp, ap->an_off, 3039 SE_EXCL); 3040 if (pp != NULL) { 3041 (void) hat_pageunload(pp, 3042 HAT_FORCE_PGUNLOAD); 3043 } 3044 } else { 3045 ppa[i] = NULL; 3046 } 3047 } 3048 for (i = 0; i < pgcnt; i++) { 3049 if ((pp = ppa[i]) != NULL && pp->p_szc != 0) { 3050 ASSERT(pp->p_szc <= szc); 3051 if (!root) { 3052 VM_STAT_ADD(anonvmstats.demotepages[3]); 3053 if (curnpgs != 0) 3054 panic("anon_try_demote_pages: " 3055 "bad large page"); 3056 3057 root = 1; 3058 curnpgs = npgs = 3059 
page_get_pagecnt(pp->p_szc); 3060 3061 ASSERT(npgs <= pgcnt); 3062 ASSERT(IS_P2ALIGNED(npgs, npgs)); 3063 ASSERT(!(page_pptonum(pp) & (npgs - 1))); 3064 } else { 3065 ASSERT(i > 0); 3066 ASSERT(page_pptonum(pp) - 1 == 3067 page_pptonum(ppa[i - 1])); 3068 if ((page_pptonum(pp) & (npgs - 1)) == 3069 npgs - 1) 3070 root = 0; 3071 } 3072 ASSERT(PAGE_EXCL(pp)); 3073 pp->p_szc = 0; 3074 ASSERT(curnpgs > 0); 3075 curnpgs--; 3076 } 3077 } 3078 if (root != 0 || curnpgs != 0) 3079 panic("anon_try_demote_pages: bad large page"); 3080 3081 for (i = 0; i < pgcnt; i++) { 3082 if ((pp = ppa[i]) != NULL) { 3083 ASSERT(!hat_page_is_mapped(pp)); 3084 ASSERT(pp->p_szc == 0); 3085 page_unlock(pp); 3086 } 3087 } 3088 if (ppasize != 0) { 3089 kmem_free(ppa, ppasize); 3090 } 3091 return (1); 3092 } 3093 3094 /* 3095 * anon_map_demotepages() can only be called by MAP_PRIVATE segments. 3096 */ 3097 int 3098 anon_map_demotepages( 3099 struct anon_map *amp, 3100 ulong_t start_idx, 3101 struct seg *seg, 3102 caddr_t addr, 3103 uint_t prot, 3104 struct vpage vpage[], 3105 struct cred *cred) 3106 { 3107 struct anon *ap; 3108 uint_t szc = seg->s_szc; 3109 pgcnt_t pgcnt = page_get_pagecnt(szc); 3110 size_t ppasize = pgcnt * sizeof (page_t *); 3111 page_t **ppa = kmem_alloc(ppasize, KM_SLEEP); 3112 page_t *pp; 3113 page_t *pl[2]; 3114 pgcnt_t i, pg_idx; 3115 ulong_t an_idx; 3116 caddr_t vaddr; 3117 int err; 3118 int retry = 0; 3119 uint_t vpprot; 3120 3121 ASSERT(RW_WRITE_HELD(&->a_rwlock)); 3122 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 3123 ASSERT(IS_P2ALIGNED(start_idx, pgcnt)); 3124 ASSERT(ppa != NULL); 3125 ASSERT(szc != 0); 3126 ASSERT(szc == amp->a_szc); 3127 3128 VM_STAT_ADD(anonvmstats.demotepages[0]); 3129 3130 top: 3131 if (anon_try_demote_pages(amp->ahp, start_idx, szc, ppa, 1)) { 3132 kmem_free(ppa, ppasize); 3133 return (0); 3134 } 3135 3136 VM_STAT_ADD(anonvmstats.demotepages[4]); 3137 3138 ASSERT(retry == 0); /* we can be here only once */ 3139 3140 vaddr = addr; 3141 for (pg_idx = 0, 
an_idx = start_idx; pg_idx < pgcnt; 3142 pg_idx++, an_idx++, vaddr += PAGESIZE) { 3143 ap = anon_get_ptr(amp->ahp, an_idx); 3144 if (ap == NULL) 3145 panic("anon_map_demotepages: no anon slot"); 3146 err = anon_getpage(&ap, &vpprot, pl, PAGESIZE, seg, vaddr, 3147 S_READ, cred); 3148 if (err) { 3149 for (i = 0; i < pg_idx; i++) { 3150 if ((pp = ppa[i]) != NULL) 3151 page_unlock(pp); 3152 } 3153 kmem_free(ppa, ppasize); 3154 return (err); 3155 } 3156 ppa[pg_idx] = pl[0]; 3157 } 3158 3159 err = anon_map_privatepages(amp, start_idx, szc, seg, addr, prot, ppa, 3160 vpage, -1, 0, cred); 3161 if (err > 0) { 3162 VM_STAT_ADD(anonvmstats.demotepages[5]); 3163 kmem_free(ppa, ppasize); 3164 return (err); 3165 } 3166 ASSERT(err == 0 || err == -1); 3167 if (err == -1) { 3168 VM_STAT_ADD(anonvmstats.demotepages[6]); 3169 retry = 1; 3170 goto top; 3171 } 3172 for (i = 0; i < pgcnt; i++) { 3173 ASSERT(ppa[i] != NULL); 3174 if (ppa[i]->p_szc != 0) 3175 retry = 1; 3176 page_unlock(ppa[i]); 3177 } 3178 if (retry) { 3179 VM_STAT_ADD(anonvmstats.demotepages[7]); 3180 goto top; 3181 } 3182 3183 VM_STAT_ADD(anonvmstats.demotepages[8]); 3184 3185 kmem_free(ppa, ppasize); 3186 3187 return (0); 3188 } 3189 3190 /* 3191 * Free pages of shared anon map. It's assumed that anon maps don't share anon 3192 * structures with private anon maps. Therefore all anon structures should 3193 * have at most one reference at this point. This means underlying pages can 3194 * be exclusively locked and demoted or freed. If not freeing the entire 3195 * large pages demote the ends of the region we free to be able to free 3196 * subpages. Page roots correspond to aligned index positions in anon map. 
3197 */ 3198 void 3199 anon_shmap_free_pages(struct anon_map *amp, ulong_t sidx, size_t len) 3200 { 3201 ulong_t eidx = sidx + btopr(len); 3202 pgcnt_t pages = page_get_pagecnt(amp->a_szc); 3203 struct anon_hdr *ahp = amp->ahp; 3204 ulong_t tidx; 3205 size_t size; 3206 ulong_t sidx_aligned; 3207 ulong_t eidx_aligned; 3208 3209 ASSERT(ANON_WRITE_HELD(&->a_rwlock)); 3210 ASSERT(amp->refcnt <= 1); 3211 ASSERT(amp->a_szc > 0); 3212 ASSERT(eidx <= ahp->size); 3213 ASSERT(!anon_share(ahp, sidx, btopr(len))); 3214 3215 if (len == 0) { /* XXX */ 3216 return; 3217 } 3218 3219 sidx_aligned = P2ALIGN(sidx, pages); 3220 if (sidx_aligned != sidx || 3221 (eidx < sidx_aligned + pages && eidx < ahp->size)) { 3222 if (!anon_try_demote_pages(ahp, sidx_aligned, 3223 amp->a_szc, NULL, 0)) { 3224 panic("anon_shmap_free_pages: demote failed"); 3225 } 3226 size = (eidx <= sidx_aligned + pages) ? (eidx - sidx) : 3227 P2NPHASE(sidx, pages); 3228 size <<= PAGESHIFT; 3229 anon_free(ahp, sidx, size); 3230 sidx = sidx_aligned + pages; 3231 if (eidx <= sidx) { 3232 return; 3233 } 3234 } 3235 eidx_aligned = P2ALIGN(eidx, pages); 3236 if (sidx < eidx_aligned) { 3237 anon_free_pages(ahp, sidx, 3238 (eidx_aligned - sidx) << PAGESHIFT, 3239 amp->a_szc); 3240 sidx = eidx_aligned; 3241 } 3242 ASSERT(sidx == eidx_aligned); 3243 if (eidx == eidx_aligned) { 3244 return; 3245 } 3246 tidx = eidx; 3247 if (eidx != ahp->size && anon_get_next_ptr(ahp, &tidx) != NULL && 3248 tidx - sidx < pages) { 3249 if (!anon_try_demote_pages(ahp, sidx, amp->a_szc, NULL, 0)) { 3250 panic("anon_shmap_free_pages: demote failed"); 3251 } 3252 size = (eidx - sidx) << PAGESHIFT; 3253 anon_free(ahp, sidx, size); 3254 } else { 3255 anon_free_pages(ahp, sidx, pages << PAGESHIFT, amp->a_szc); 3256 } 3257 } 3258 3259 /* 3260 * This routine should be called with amp's writer lock when there're no other 3261 * users of amp. All pcache entries of this amp must have been already 3262 * inactivated. 
We must not drop a_rwlock here to prevent new users from 3263 * attaching to this amp. 3264 */ 3265 void 3266 anonmap_purge(struct anon_map *amp) 3267 { 3268 ASSERT(ANON_WRITE_HELD(&->a_rwlock)); 3269 ASSERT(amp->refcnt <= 1); 3270 3271 if (amp->a_softlockcnt != 0) { 3272 seg_ppurge(NULL, amp, 0); 3273 } 3274 3275 /* 3276 * Since all pcache entries were already inactive before this routine 3277 * was called seg_ppurge() couldn't return while there're still 3278 * entries that can be found via the list anchored at a_phead. So we 3279 * can assert this list is empty now. a_softlockcnt may be still non 0 3280 * if asynchronous thread that manages pcache already removed pcache 3281 * entries but hasn't unlocked the pages yet. If a_softlockcnt is non 3282 * 0 we just wait on a_purgecv for shamp_reclaim() to finish. Even if 3283 * a_softlockcnt is 0 we grab a_purgemtx to avoid freeing anon map 3284 * before shamp_reclaim() is done with it. a_purgemtx also taken by 3285 * shamp_reclaim() while a_softlockcnt was still not 0 acts as a 3286 * barrier that prevents anonmap_purge() to complete while 3287 * shamp_reclaim() may still be referencing this amp. 3288 */ 3289 ASSERT(amp->a_phead.p_lnext == &->a_phead); 3290 ASSERT(amp->a_phead.p_lprev == &->a_phead); 3291 3292 mutex_enter(&->a_purgemtx); 3293 while (amp->a_softlockcnt != 0) { 3294 ASSERT(amp->a_phead.p_lnext == &->a_phead); 3295 ASSERT(amp->a_phead.p_lprev == &->a_phead); 3296 amp->a_purgewait = 1; 3297 cv_wait(&->a_purgecv, &->a_purgemtx); 3298 } 3299 mutex_exit(&->a_purgemtx); 3300 3301 ASSERT(amp->a_phead.p_lnext == &->a_phead); 3302 ASSERT(amp->a_phead.p_lprev == &->a_phead); 3303 ASSERT(amp->a_softlockcnt == 0); 3304 } 3305 3306 /* 3307 * Allocate and initialize an anon_map structure for seg 3308 * associating the given swap reservation with the new anon_map. 
3309 */ 3310 struct anon_map * 3311 anonmap_alloc(size_t size, size_t swresv, int flags) 3312 { 3313 struct anon_map *amp; 3314 int kmflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 3315 3316 amp = kmem_cache_alloc(anonmap_cache, kmflags); 3317 if (amp == NULL) { 3318 ASSERT(kmflags == KM_NOSLEEP); 3319 return (NULL); 3320 } 3321 3322 amp->ahp = anon_create(btopr(size), flags); 3323 if (amp->ahp == NULL) { 3324 ASSERT(flags == ANON_NOSLEEP); 3325 kmem_cache_free(anonmap_cache, amp); 3326 return (NULL); 3327 } 3328 amp->refcnt = 1; 3329 amp->size = size; 3330 amp->swresv = swresv; 3331 amp->locality = 0; 3332 amp->a_szc = 0; 3333 amp->a_sp = NULL; 3334 amp->a_softlockcnt = 0; 3335 amp->a_purgewait = 0; 3336 amp->a_phead.p_lnext = &->a_phead; 3337 amp->a_phead.p_lprev = &->a_phead; 3338 3339 return (amp); 3340 } 3341 3342 void 3343 anonmap_free(struct anon_map *amp) 3344 { 3345 ASSERT(amp->ahp != NULL); 3346 ASSERT(amp->refcnt == 0); 3347 ASSERT(amp->a_softlockcnt == 0); 3348 ASSERT(amp->a_phead.p_lnext == &->a_phead); 3349 ASSERT(amp->a_phead.p_lprev == &->a_phead); 3350 3351 lgrp_shm_policy_fini(amp, NULL); 3352 anon_release(amp->ahp, btopr(amp->size)); 3353 kmem_cache_free(anonmap_cache, amp); 3354 } 3355 3356 /* 3357 * Returns true if the app array has some empty slots. 3358 * The offp and lenp parameters are in/out parameters. On entry 3359 * these values represent the starting offset and length of the 3360 * mapping. When true is returned, these values may be modified 3361 * to be the largest range which includes empty slots. 
3362 */ 3363 int 3364 non_anon(struct anon_hdr *ahp, ulong_t anon_idx, u_offset_t *offp, 3365 size_t *lenp) 3366 { 3367 ulong_t i, el; 3368 ssize_t low, high; 3369 struct anon *ap; 3370 3371 low = -1; 3372 for (i = 0, el = *lenp; i < el; i += PAGESIZE, anon_idx++) { 3373 ap = anon_get_ptr(ahp, anon_idx); 3374 if (ap == NULL) { 3375 if (low == -1) 3376 low = i; 3377 high = i; 3378 } 3379 } 3380 if (low != -1) { 3381 /* 3382 * Found at least one non-anon page. 3383 * Set up the off and len return values. 3384 */ 3385 if (low != 0) 3386 *offp += low; 3387 *lenp = high - low + PAGESIZE; 3388 return (1); 3389 } 3390 return (0); 3391 } 3392 3393 /* 3394 * Return a count of the number of existing anon pages in the anon array 3395 * app in the range (off, off+len). The array and slots must be guaranteed 3396 * stable by the caller. 3397 */ 3398 pgcnt_t 3399 anon_pages(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots) 3400 { 3401 pgcnt_t cnt = 0; 3402 3403 while (nslots-- > 0) { 3404 if ((anon_get_ptr(ahp, anon_index)) != NULL) 3405 cnt++; 3406 anon_index++; 3407 } 3408 return (cnt); 3409 } 3410 3411 /* 3412 * Move reserved phys swap into memory swap (unreserve phys swap 3413 * and reserve mem swap by the same amount). 
3414 * Used by segspt when it needs to lock reserved swap npages in memory 3415 */ 3416 int 3417 anon_swap_adjust(pgcnt_t npages) 3418 { 3419 pgcnt_t unlocked_mem_swap; 3420 3421 mutex_enter(&anoninfo_lock); 3422 3423 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 3424 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 3425 3426 unlocked_mem_swap = k_anoninfo.ani_mem_resv 3427 - k_anoninfo.ani_locked_swap; 3428 if (npages > unlocked_mem_swap) { 3429 spgcnt_t adjusted_swap = npages - unlocked_mem_swap; 3430 3431 /* 3432 * if there is not enough unlocked mem swap we take missing 3433 * amount from phys swap and give it to mem swap 3434 */ 3435 if (!page_reclaim_mem(adjusted_swap, segspt_minfree, 1)) { 3436 mutex_exit(&anoninfo_lock); 3437 return (ENOMEM); 3438 } 3439 3440 k_anoninfo.ani_mem_resv += adjusted_swap; 3441 ASSERT(k_anoninfo.ani_phys_resv >= adjusted_swap); 3442 k_anoninfo.ani_phys_resv -= adjusted_swap; 3443 3444 ANI_ADD(adjusted_swap); 3445 } 3446 k_anoninfo.ani_locked_swap += npages; 3447 3448 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 3449 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 3450 3451 mutex_exit(&anoninfo_lock); 3452 3453 return (0); 3454 } 3455 3456 /* 3457 * 'unlocked' reserved mem swap so when it is unreserved it 3458 * can be moved back phys (disk) swap 3459 */ 3460 void 3461 anon_swap_restore(pgcnt_t npages) 3462 { 3463 mutex_enter(&anoninfo_lock); 3464 3465 ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv); 3466 3467 ASSERT(k_anoninfo.ani_locked_swap >= npages); 3468 k_anoninfo.ani_locked_swap -= npages; 3469 3470 ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv); 3471 3472 mutex_exit(&anoninfo_lock); 3473 } 3474 3475 /* 3476 * Return the pointer from the list for a 3477 * specified anon index. 
3478 */ 3479 ulong_t * 3480 anon_get_slot(struct anon_hdr *ahp, ulong_t an_idx) 3481 { 3482 struct anon **app; 3483 void **ppp; 3484 3485 ASSERT(an_idx < ahp->size); 3486 3487 /* 3488 * Single level case. 3489 */ 3490 if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 3491 return ((ulong_t *)&ahp->array_chunk[an_idx]); 3492 } else { 3493 3494 /* 3495 * 2 level case. 3496 */ 3497 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 3498 if (*ppp == NULL) { 3499 mutex_enter(&ahp->serial_lock); 3500 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 3501 if (*ppp == NULL) 3502 *ppp = kmem_zalloc(PAGESIZE, KM_SLEEP); 3503 mutex_exit(&ahp->serial_lock); 3504 } 3505 app = *ppp; 3506 return ((ulong_t *)&app[an_idx & ANON_CHUNK_OFF]); 3507 } 3508 } 3509 3510 void 3511 anon_array_enter(struct anon_map *amp, ulong_t an_idx, anon_sync_obj_t *sobj) 3512 { 3513 ulong_t *ap_slot; 3514 kmutex_t *mtx; 3515 kcondvar_t *cv; 3516 int hash; 3517 3518 /* 3519 * Use szc to determine anon slot(s) to appear atomic. 3520 * If szc = 0, then lock the anon slot and mark it busy. 3521 * If szc > 0, then lock the range of slots by getting the 3522 * anon_array_lock for the first anon slot, and mark only the 3523 * first anon slot busy to represent whole range being busy. 3524 */ 3525 3526 ASSERT(RW_READ_HELD(&->a_rwlock)); 3527 an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc)); 3528 hash = ANON_ARRAY_HASH(amp, an_idx); 3529 sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex; 3530 sobj->sync_cv = cv = &anon_array_cv[hash]; 3531 mutex_enter(mtx); 3532 ap_slot = anon_get_slot(amp->ahp, an_idx); 3533 while (ANON_ISBUSY(ap_slot)) 3534 cv_wait(cv, mtx); 3535 ANON_SETBUSY(ap_slot); 3536 sobj->sync_data = ap_slot; 3537 mutex_exit(mtx); 3538 } 3539 3540 int 3541 anon_array_try_enter(struct anon_map *amp, ulong_t an_idx, 3542 anon_sync_obj_t *sobj) 3543 { 3544 ulong_t *ap_slot; 3545 kmutex_t *mtx; 3546 int hash; 3547 3548 /* 3549 * Try to lock a range of anon slots. 
3550 * Use szc to determine anon slot(s) to appear atomic. 3551 * If szc = 0, then lock the anon slot and mark it busy. 3552 * If szc > 0, then lock the range of slots by getting the 3553 * anon_array_lock for the first anon slot, and mark only the 3554 * first anon slot busy to represent whole range being busy. 3555 * Fail if the mutex or the anon_array are busy. 3556 */ 3557 3558 ASSERT(RW_READ_HELD(&->a_rwlock)); 3559 an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc)); 3560 hash = ANON_ARRAY_HASH(amp, an_idx); 3561 sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex; 3562 sobj->sync_cv = &anon_array_cv[hash]; 3563 if (!mutex_tryenter(mtx)) { 3564 return (EWOULDBLOCK); 3565 } 3566 ap_slot = anon_get_slot(amp->ahp, an_idx); 3567 if (ANON_ISBUSY(ap_slot)) { 3568 mutex_exit(mtx); 3569 return (EWOULDBLOCK); 3570 } 3571 ANON_SETBUSY(ap_slot); 3572 sobj->sync_data = ap_slot; 3573 mutex_exit(mtx); 3574 return (0); 3575 } 3576 3577 void 3578 anon_array_exit(anon_sync_obj_t *sobj) 3579 { 3580 mutex_enter(sobj->sync_mutex); 3581 ASSERT(ANON_ISBUSY(sobj->sync_data)); 3582 ANON_CLRBUSY(sobj->sync_data); 3583 if (CV_HAS_WAITERS(sobj->sync_cv)) 3584 cv_broadcast(sobj->sync_cv); 3585 mutex_exit(sobj->sync_mutex); 3586 } 3587