/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*	Copyright (c) 1984, 1986, 1987, 1988, 1989  AT&T	*/
/*	  All Rights Reserved	*/

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * VM - anonymous pages.
 *
 * This layer sits immediately above the vm_swap layer.  It manages
 * physical pages that have no permanent identity in the file system
 * name space, using the services of the vm_swap layer to allocate
 * backing storage for these pages.  Since these pages have no external
 * identity, they are discarded when the last reference is removed.
 *
 * An important function of this layer is to manage low-level sharing
 * of pages that are logically distinct but that happen to be
 * physically identical (e.g., the corresponding pages of the processes
 * resulting from a fork before one process or the other changes their
 * contents).  This pseudo-sharing is present only as an optimization
 * and is not to be confused with true sharing in which multiple
 * address spaces deliberately contain references to the same object;
 * such sharing is managed at a higher level.
 *
 * The key data structure here is the anon struct, which contains a
 * reference count for its associated physical page and a hint about
 * the identity of that page.  Anon structs typically live in arrays,
 * with an instance's position in its array determining where the
 * corresponding backing storage is allocated; however, the swap_xlate()
 * routine abstracts away this representation information so that the
 * rest of the anon layer need not know it.  (See the swap layer for
 * more details on anon struct layout.)
 *
 * In future versions of the system, the association between an
 * anon struct and its position on backing store will change so that
 * we don't require backing store for all anonymous pages in the system.
 * This is an important consideration for large memory systems.
 * We can also use this technique to delay binding physical locations
 * to anonymous pages until pageout/swapout time where we can make
 * smarter allocation decisions to improve anonymous klustering.
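 *
 * For example, a client that needs the backing store name behind an
 * anon slot can translate it with swap_xlate() and then look the page
 * up by the resulting <vnode, offset> pair; a minimal sketch, with
 * locking and error handling omitted:
 *
 *	struct vnode *vp;
 *	anoff_t off;
 *	page_t *pp;
 *
 *	swap_xlate(ap, &vp, &off);
 *	pp = page_lookup(vp, (u_offset_t)off, SE_SHARED);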
 *
 * Many of the routines defined here take a (struct anon **) argument,
 * which allows the code at this level to manage anon pages directly,
 * so that callers can regard anon structs as opaque objects and not be
 * concerned with assigning or inspecting their contents.
 *
 * Clients of this layer refer to anon pages indirectly.  That is, they
 * maintain arrays of pointers to anon structs rather than maintaining
 * anon structs themselves.  The (struct anon **) arguments mentioned
 * above are pointers to entries in these arrays.  It is these arrays
 * that capture the mapping between offsets within a given segment and
 * the corresponding anonymous backing storage address.
 */

#ifdef DEBUG
#define	ANON_DEBUG
#endif

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/cred.h>
#include <sys/thread.h>
#include <sys/vnode.h>
#include <sys/cpuvar.h>
#include <sys/swap.h>
#include <sys/cmn_err.h>
#include <sys/vtrace.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/bitmap.h>
#include <sys/vmsystm.h>
#include <sys/debug.h>
#include <sys/tnf_probe.h>
#include <sys/lgrp.h>
#include <sys/policy.h>
#include <sys/condvar_impl.h>
#include <sys/mutex_impl.h>

#include <vm/as.h>
#include <vm/hat.h>
#include <vm/anon.h>
#include <vm/page.h>
#include <vm/vpage.h>
#include <vm/seg.h>
#include <vm/rm.h>

#include <fs/fs_subr.h>

int anon_debug;

kmutex_t	anoninfo_lock;
struct		k_anoninfo k_anoninfo;
ani_free_t	ani_free_pool[ANI_MAX_POOL];
pad_mutex_t	anon_array_lock[ANON_LOCKSIZE];
kcondvar_t	anon_array_cv[ANON_LOCKSIZE];

/*
 * Global hash table for (vp, off) -> anon slot
 */
extern	int swap_maxcontig;
size_t	anon_hash_size;
struct anon	**anon_hash;

static struct kmem_cache *anon_cache;
static struct kmem_cache *anonmap_cache;

#ifdef VM_STATS
static struct anonvmstats_str {
	ulong_t getpages[30];
	ulong_t privatepages[10];
	ulong_t demotepages[9];
	ulong_t decrefpages[9];
	ulong_t	dupfillholes[4];
	ulong_t freepages[1];
} anonvmstats;
#endif /* VM_STATS */


/*ARGSUSED*/
static int
anonmap_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct anon_map *amp = buf;

	rw_init(&amp->a_rwlock, NULL, RW_DEFAULT, NULL);
	return (0);
}

/*ARGSUSED1*/
static void
anonmap_cache_destructor(void *buf, void *cdrarg)
{
	struct anon_map *amp = buf;

	rw_destroy(&amp->a_rwlock);
}

kmutex_t	anonhash_lock[AH_LOCK_SIZE];
kmutex_t	anonpages_hash_lock[AH_LOCK_SIZE];

void
anon_init(void)
{
	int i;

	anon_hash_size = 1L << highbit(physmem / ANON_HASHAVELEN);

	for (i = 0; i < AH_LOCK_SIZE; i++) {
		mutex_init(&anonhash_lock[i], NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&anonpages_hash_lock[i], NULL, MUTEX_DEFAULT, NULL);
	}

	for (i = 0; i < ANON_LOCKSIZE; i++) {
		mutex_init(&anon_array_lock[i].pad_mutex, NULL,
		    MUTEX_DEFAULT, NULL);
		cv_init(&anon_array_cv[i], NULL, CV_DEFAULT, NULL);
	}

	anon_hash = (struct anon **)
	    kmem_zalloc(sizeof (struct anon *) * anon_hash_size, KM_SLEEP);
	anon_cache = kmem_cache_create("anon_cache", sizeof (struct anon),
	    AN_CACHE_ALIGN, NULL, NULL, NULL, NULL, NULL, 0);
	anonmap_cache =
kmem_cache_create("anonmap_cache", 202 sizeof (struct anon_map), 0, 203 anonmap_cache_constructor, anonmap_cache_destructor, NULL, 204 NULL, NULL, 0); 205 swap_maxcontig = (1024 * 1024) >> PAGESHIFT; /* 1MB of pages */ 206 } 207 208 /* 209 * Global anon slot hash table manipulation. 210 */ 211 212 static void 213 anon_addhash(struct anon *ap) 214 { 215 int index; 216 217 ASSERT(MUTEX_HELD(&anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)])); 218 index = ANON_HASH(ap->an_vp, ap->an_off); 219 ap->an_hash = anon_hash[index]; 220 anon_hash[index] = ap; 221 } 222 223 static void 224 anon_rmhash(struct anon *ap) 225 { 226 struct anon **app; 227 228 ASSERT(MUTEX_HELD(&anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)])); 229 230 for (app = &anon_hash[ANON_HASH(ap->an_vp, ap->an_off)]; 231 *app; app = &((*app)->an_hash)) { 232 if (*app == ap) { 233 *app = ap->an_hash; 234 break; 235 } 236 } 237 } 238 239 /* 240 * The anon array interfaces. Functions allocating, 241 * freeing array of pointers, and returning/setting 242 * entries in the array of pointers for a given offset. 243 * 244 * Create the list of pointers 245 */ 246 struct anon_hdr * 247 anon_create(pgcnt_t npages, int flags) 248 { 249 struct anon_hdr *ahp; 250 ulong_t nchunks; 251 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 252 253 if ((ahp = kmem_zalloc(sizeof (struct anon_hdr), kmemflags)) == NULL) { 254 return (NULL); 255 } 256 257 mutex_init(&ahp->serial_lock, NULL, MUTEX_DEFAULT, NULL); 258 /* 259 * Single level case. 260 */ 261 ahp->size = npages; 262 if (npages <= ANON_CHUNK_SIZE || (flags & ANON_ALLOC_FORCE)) { 263 264 if (flags & ANON_ALLOC_FORCE) 265 ahp->flags |= ANON_ALLOC_FORCE; 266 267 ahp->array_chunk = kmem_zalloc( 268 ahp->size * sizeof (struct anon *), kmemflags); 269 270 if (ahp->array_chunk == NULL) { 271 kmem_free(ahp, sizeof (struct anon_hdr)); 272 return (NULL); 273 } 274 } else { 275 /* 276 * 2 Level case. 277 */ 278 nchunks = (ahp->size + ANON_CHUNK_OFF) >> ANON_CHUNK_SHIFT; 279 280 ahp->array_chunk = kmem_zalloc(nchunks * sizeof (ulong_t *), 281 kmemflags); 282 283 if (ahp->array_chunk == NULL) { 284 kmem_free(ahp, sizeof (struct anon_hdr)); 285 return (NULL); 286 } 287 } 288 return (ahp); 289 } 290 291 /* 292 * Free the array of pointers 293 */ 294 void 295 anon_release(struct anon_hdr *ahp, pgcnt_t npages) 296 { 297 ulong_t i; 298 void **ppp; 299 ulong_t nchunks; 300 301 ASSERT(npages == ahp->size); 302 303 /* 304 * Single level case. 305 */ 306 if (npages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 307 kmem_free(ahp->array_chunk, ahp->size * sizeof (struct anon *)); 308 } else { 309 /* 310 * 2 level case. 311 */ 312 nchunks = (ahp->size + ANON_CHUNK_OFF) >> ANON_CHUNK_SHIFT; 313 for (i = 0; i < nchunks; i++) { 314 ppp = &ahp->array_chunk[i]; 315 if (*ppp != NULL) 316 kmem_free(*ppp, PAGESIZE); 317 } 318 kmem_free(ahp->array_chunk, nchunks * sizeof (ulong_t *)); 319 } 320 mutex_destroy(&ahp->serial_lock); 321 kmem_free(ahp, sizeof (struct anon_hdr)); 322 } 323 324 /* 325 * Return the pointer from the list for a 326 * specified anon index. 327 */ 328 struct anon * 329 anon_get_ptr(struct anon_hdr *ahp, ulong_t an_idx) 330 { 331 struct anon **app; 332 333 ASSERT(an_idx < ahp->size); 334 335 /* 336 * Single level case. 337 */ 338 if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 339 return ((struct anon *) 340 ((uintptr_t)ahp->array_chunk[an_idx] & ANON_PTRMASK)); 341 } else { 342 343 /* 344 * 2 level case. 
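		 * (an_idx >> ANON_CHUNK_SHIFT selects the second level
		 * chunk and an_idx & ANON_CHUNK_OFF selects the slot
		 * within that chunk)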
345 */ 346 app = ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 347 if (app) { 348 return ((struct anon *) 349 ((uintptr_t)app[an_idx & ANON_CHUNK_OFF] & 350 ANON_PTRMASK)); 351 } else { 352 return (NULL); 353 } 354 } 355 } 356 357 /* 358 * Return the anon pointer for the first valid entry in the anon list, 359 * starting from the given index. 360 */ 361 struct anon * 362 anon_get_next_ptr(struct anon_hdr *ahp, ulong_t *index) 363 { 364 struct anon *ap; 365 struct anon **app; 366 ulong_t chunkoff; 367 ulong_t i; 368 ulong_t j; 369 pgcnt_t size; 370 371 i = *index; 372 size = ahp->size; 373 374 ASSERT(i < size); 375 376 if ((size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 377 /* 378 * 1 level case 379 */ 380 while (i < size) { 381 ap = (struct anon *) 382 ((uintptr_t)ahp->array_chunk[i] & ANON_PTRMASK); 383 if (ap) { 384 *index = i; 385 return (ap); 386 } 387 i++; 388 } 389 } else { 390 /* 391 * 2 level case 392 */ 393 chunkoff = i & ANON_CHUNK_OFF; 394 while (i < size) { 395 app = ahp->array_chunk[i >> ANON_CHUNK_SHIFT]; 396 if (app) 397 for (j = chunkoff; j < ANON_CHUNK_SIZE; j++) { 398 ap = (struct anon *) 399 ((uintptr_t)app[j] & 400 ANON_PTRMASK); 401 if (ap) { 402 *index = i + (j - chunkoff); 403 return (ap); 404 } 405 } 406 chunkoff = 0; 407 i = (i + ANON_CHUNK_SIZE) & ~ANON_CHUNK_OFF; 408 } 409 } 410 *index = size; 411 return (NULL); 412 } 413 414 /* 415 * Set list entry with a given pointer for a specified offset 416 */ 417 int 418 anon_set_ptr(struct anon_hdr *ahp, ulong_t an_idx, struct anon *ap, int flags) 419 { 420 void **ppp; 421 struct anon **app; 422 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 423 uintptr_t *ap_addr; 424 425 ASSERT(an_idx < ahp->size); 426 427 /* 428 * Single level case. 429 */ 430 if (ahp->size <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 431 ap_addr = (uintptr_t *)&ahp->array_chunk[an_idx]; 432 } else { 433 434 /* 435 * 2 level case. 436 */ 437 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 438 439 ASSERT(ppp != NULL); 440 if (*ppp == NULL) { 441 mutex_enter(&ahp->serial_lock); 442 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 443 if (*ppp == NULL) { 444 *ppp = kmem_zalloc(PAGESIZE, kmemflags); 445 if (*ppp == NULL) { 446 mutex_exit(&ahp->serial_lock); 447 return (ENOMEM); 448 } 449 } 450 mutex_exit(&ahp->serial_lock); 451 } 452 app = *ppp; 453 ap_addr = (uintptr_t *)&app[an_idx & ANON_CHUNK_OFF]; 454 } 455 *ap_addr = (*ap_addr & ~ANON_PTRMASK) | (uintptr_t)ap; 456 return (0); 457 } 458 459 /* 460 * Copy anon array into a given new anon array 461 */ 462 int 463 anon_copy_ptr(struct anon_hdr *sahp, ulong_t s_idx, 464 struct anon_hdr *dahp, ulong_t d_idx, 465 pgcnt_t npages, int flags) 466 { 467 void **sapp, **dapp; 468 void *ap; 469 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 470 471 ASSERT((s_idx < sahp->size) && (d_idx < dahp->size)); 472 ASSERT((npages <= sahp->size) && (npages <= dahp->size)); 473 474 /* 475 * Both arrays are 1 level. 476 */ 477 if (((sahp->size <= ANON_CHUNK_SIZE) && 478 (dahp->size <= ANON_CHUNK_SIZE)) || 479 ((sahp->flags & ANON_ALLOC_FORCE) && 480 (dahp->flags & ANON_ALLOC_FORCE))) { 481 482 bcopy(&sahp->array_chunk[s_idx], &dahp->array_chunk[d_idx], 483 npages * sizeof (struct anon *)); 484 return (0); 485 } 486 487 /* 488 * Both arrays are 2 levels. 
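	 * Copy chunk by chunk: each pass moves at most the slots left
	 * before whichever of the source or destination chunk ends
	 * first, so whole runs can be copied with a single bcopy().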
489 */ 490 if (sahp->size > ANON_CHUNK_SIZE && 491 dahp->size > ANON_CHUNK_SIZE && 492 ((sahp->flags & ANON_ALLOC_FORCE) == 0) && 493 ((dahp->flags & ANON_ALLOC_FORCE) == 0)) { 494 495 ulong_t sapidx, dapidx; 496 ulong_t *sap, *dap; 497 ulong_t chknp; 498 499 while (npages != 0) { 500 501 sapidx = s_idx & ANON_CHUNK_OFF; 502 dapidx = d_idx & ANON_CHUNK_OFF; 503 chknp = ANON_CHUNK_SIZE - MAX(sapidx, dapidx); 504 if (chknp > npages) 505 chknp = npages; 506 507 sapp = &sahp->array_chunk[s_idx >> ANON_CHUNK_SHIFT]; 508 if ((sap = *sapp) != NULL) { 509 dapp = &dahp->array_chunk[d_idx 510 >> ANON_CHUNK_SHIFT]; 511 if ((dap = *dapp) == NULL) { 512 *dapp = kmem_zalloc(PAGESIZE, 513 kmemflags); 514 if ((dap = *dapp) == NULL) 515 return (ENOMEM); 516 } 517 bcopy((sap + sapidx), (dap + dapidx), 518 chknp << ANON_PTRSHIFT); 519 } 520 s_idx += chknp; 521 d_idx += chknp; 522 npages -= chknp; 523 } 524 return (0); 525 } 526 527 /* 528 * At least one of the arrays is 2 level. 529 */ 530 while (npages--) { 531 if ((ap = anon_get_ptr(sahp, s_idx)) != NULL) { 532 ASSERT(!ANON_ISBUSY(anon_get_slot(sahp, s_idx))); 533 if (anon_set_ptr(dahp, d_idx, ap, flags) == ENOMEM) 534 return (ENOMEM); 535 } 536 s_idx++; 537 d_idx++; 538 } 539 return (0); 540 } 541 542 543 /* 544 * ANON_INITBUF is a convenience macro for anon_grow() below. It 545 * takes a buffer dst, which is at least as large as buffer src. It 546 * does a bcopy from src into dst, and then bzeros the extra bytes 547 * of dst. If tail is set, the data in src is tail aligned within 548 * dst instead of head aligned. 549 */ 550 551 #define ANON_INITBUF(src, srclen, dst, dstsize, tail) \ 552 if (tail) { \ 553 bzero((dst), (dstsize) - (srclen)); \ 554 bcopy((src), (char *)(dst) + (dstsize) - (srclen), (srclen)); \ 555 } else { \ 556 bcopy((src), (dst), (srclen)); \ 557 bzero((char *)(dst) + (srclen), (dstsize) - (srclen)); \ 558 } 559 560 #define ANON_1_LEVEL_INC (ANON_CHUNK_SIZE / 8) 561 #define ANON_2_LEVEL_INC (ANON_1_LEVEL_INC * ANON_CHUNK_SIZE) 562 563 /* 564 * anon_grow() is used to efficiently extend an existing anon array. 565 * startidx_p points to the index into the anon array of the first page 566 * that is in use. oldseg_pgs is the number of pages in use, starting at 567 * *startidx_p. newpages is the number of additional pages desired. 568 * 569 * If startidx_p == NULL, startidx is taken to be 0 and cannot be changed. 570 * 571 * The growth is done by creating a new top level of the anon array, 572 * and (if the array is 2-level) reusing the existing second level arrays. 573 * 574 * flags can be used to specify ANON_NOSLEEP and ANON_GROWDOWN. 575 * 576 * Returns the new number of pages in the anon array. 577 */ 578 pgcnt_t 579 anon_grow(struct anon_hdr *ahp, ulong_t *startidx_p, pgcnt_t oldseg_pgs, 580 pgcnt_t newseg_pgs, int flags) 581 { 582 ulong_t startidx = startidx_p ? *startidx_p : 0; 583 pgcnt_t oldamp_pgs = ahp->size, newamp_pgs; 584 pgcnt_t oelems, nelems, totpages; 585 void **level1; 586 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 587 int growdown = (flags & ANON_GROWDOWN); 588 size_t newarrsz, oldarrsz; 589 void *level2; 590 591 ASSERT(!(startidx_p == NULL && growdown)); 592 ASSERT(startidx + oldseg_pgs <= ahp->size); 593 594 /* 595 * Determine the total number of pages needed in the new 596 * anon array. If growing down, totpages is all pages from 597 * startidx through the end of the array, plus <newseg_pgs> 598 * pages. 
If growing up, keep all pages from page 0 through 599 * the last page currently in use, plus <newseg_pgs> pages. 600 */ 601 if (growdown) 602 totpages = oldamp_pgs - startidx + newseg_pgs; 603 else 604 totpages = startidx + oldseg_pgs + newseg_pgs; 605 606 /* If the array is already large enough, just return. */ 607 608 if (oldamp_pgs >= totpages) { 609 if (growdown) 610 *startidx_p = oldamp_pgs - totpages; 611 return (oldamp_pgs); 612 } 613 614 /* 615 * oldamp_pgs/newamp_pgs are the total numbers of pages represented 616 * by the corresponding arrays. 617 * oelems/nelems are the number of pointers in the top level arrays 618 * which may be either level 1 or level 2. 619 * Will the new anon array be one level or two levels? 620 */ 621 if (totpages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 622 newamp_pgs = P2ROUNDUP(totpages, ANON_1_LEVEL_INC); 623 oelems = oldamp_pgs; 624 nelems = newamp_pgs; 625 } else { 626 newamp_pgs = P2ROUNDUP(totpages, ANON_2_LEVEL_INC); 627 oelems = (oldamp_pgs + ANON_CHUNK_OFF) >> ANON_CHUNK_SHIFT; 628 nelems = newamp_pgs >> ANON_CHUNK_SHIFT; 629 } 630 631 newarrsz = nelems * sizeof (void *); 632 level1 = kmem_alloc(newarrsz, kmemflags); 633 if (level1 == NULL) 634 return (0); 635 636 /* Are we converting from a one level to a two level anon array? */ 637 638 if (newamp_pgs > ANON_CHUNK_SIZE && oldamp_pgs <= ANON_CHUNK_SIZE && 639 !(ahp->flags & ANON_ALLOC_FORCE)) { 640 641 /* 642 * Yes, we're converting to a two level. Reuse old level 1 643 * as new level 2 if it is exactly PAGESIZE. Otherwise 644 * alloc a new level 2 and copy the old level 1 data into it. 645 */ 646 if (oldamp_pgs == ANON_CHUNK_SIZE) { 647 level2 = (void *)ahp->array_chunk; 648 } else { 649 level2 = kmem_alloc(PAGESIZE, kmemflags); 650 if (level2 == NULL) { 651 kmem_free(level1, newarrsz); 652 return (0); 653 } 654 oldarrsz = oldamp_pgs * sizeof (void *); 655 656 ANON_INITBUF(ahp->array_chunk, oldarrsz, 657 level2, PAGESIZE, growdown); 658 kmem_free(ahp->array_chunk, oldarrsz); 659 } 660 bzero(level1, newarrsz); 661 if (growdown) 662 level1[nelems - 1] = level2; 663 else 664 level1[0] = level2; 665 } else { 666 oldarrsz = oelems * sizeof (void *); 667 668 ANON_INITBUF(ahp->array_chunk, oldarrsz, 669 level1, newarrsz, growdown); 670 kmem_free(ahp->array_chunk, oldarrsz); 671 } 672 673 ahp->array_chunk = level1; 674 ahp->size = newamp_pgs; 675 if (growdown) { 676 *startidx_p = newamp_pgs - totpages; 677 if (oldamp_pgs > ANON_CHUNK_SIZE) 678 *startidx_p -= P2NPHASE(oldseg_pgs, ANON_CHUNK_SIZE); 679 } 680 return (newamp_pgs); 681 } 682 683 684 /* 685 * Called from clock handler to sync ani_free value. 686 */ 687 688 void 689 set_anoninfo(void) 690 { 691 int ix; 692 pgcnt_t total = 0; 693 694 for (ix = 0; ix < ANI_MAX_POOL; ix++) { 695 total += ani_free_pool[ix].ani_count; 696 } 697 k_anoninfo.ani_free = total; 698 } 699 700 /* 701 * Reserve anon space. 702 * 703 * It's no longer simply a matter of incrementing ani_resv to 704 * reserve swap space, we need to check memory-based as well 705 * as disk-backed (physical) swap. The following algorithm 706 * is used: 707 * Check the space on physical swap 708 * i.e. amount needed < ani_max - ani_phys_resv 709 * If we are swapping on swapfs check 710 * amount needed < (availrmem - swapfs_minfree) 711 * Since the algorithm to check for the quantity of swap space is 712 * almost the same as that for reserving it, we'll just use anon_resvmem 713 * with a flag to decrement availrmem. 714 * 715 * Return non-zero on success. 
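 *
 * A typical caller reserves swap before setting up an anonymous
 * mapping and releases the reservation when the mapping is torn down;
 * a minimal sketch, assuming the caller tracks "size" itself:
 *
 *	if (anon_resvmem(size, 1) == 0)
 *		return (EAGAIN);
 *	...
 *	anon_unresv(size);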
716 */ 717 int 718 anon_resvmem(size_t size, uint_t takemem) 719 { 720 pgcnt_t npages = btopr(size); 721 pgcnt_t mswap_pages = 0; 722 pgcnt_t pswap_pages = 0; 723 724 mutex_enter(&anoninfo_lock); 725 726 /* 727 * pswap_pages is the number of pages we can take from 728 * physical (i.e. disk-backed) swap. 729 */ 730 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 731 pswap_pages = k_anoninfo.ani_max - k_anoninfo.ani_phys_resv; 732 733 ANON_PRINT(A_RESV, 734 ("anon_resvmem: npages %lu takemem %u pswap %lu caller %p\n", 735 npages, takemem, pswap_pages, (void *)caller())); 736 737 if (npages <= pswap_pages) { 738 /* 739 * we have enough space on a physical swap 740 */ 741 if (takemem) 742 k_anoninfo.ani_phys_resv += npages; 743 mutex_exit(&anoninfo_lock); 744 return (1); 745 } else if (pswap_pages != 0) { 746 /* 747 * we have some space on a physical swap 748 */ 749 if (takemem) { 750 /* 751 * use up remainder of phys swap 752 */ 753 k_anoninfo.ani_phys_resv += pswap_pages; 754 ASSERT(k_anoninfo.ani_phys_resv == k_anoninfo.ani_max); 755 } 756 } 757 /* 758 * since (npages > pswap_pages) we need mem swap 759 * mswap_pages is the number of pages needed from availrmem 760 */ 761 ASSERT(npages > pswap_pages); 762 mswap_pages = npages - pswap_pages; 763 764 ANON_PRINT(A_RESV, ("anon_resvmem: need %ld pages from memory\n", 765 mswap_pages)); 766 767 /* 768 * priv processes can reserve memory as swap as long as availrmem 769 * remains greater than swapfs_minfree; in the case of non-priv 770 * processes, memory can be reserved as swap only if availrmem 771 * doesn't fall below (swapfs_minfree + swapfs_reserve). Thus, 772 * swapfs_reserve amount of memswap is not available to non-priv 773 * processes. This protects daemons such as automounter dying 774 * as a result of application processes eating away almost entire 775 * membased swap. This safeguard becomes useless if apps are run 776 * with root access. 777 * 778 * swapfs_reserve is minimum of 4Mb or 1/16 of physmem. 779 * 780 */ 781 mutex_enter(&freemem_lock); 782 if (availrmem > (swapfs_minfree + swapfs_reserve + mswap_pages) || 783 (availrmem > (swapfs_minfree + mswap_pages) && 784 secpolicy_resource(CRED()) == 0)) { 785 786 if (takemem) { 787 /* 788 * Take the memory from the rest of the system. 789 */ 790 availrmem -= mswap_pages; 791 mutex_exit(&freemem_lock); 792 k_anoninfo.ani_mem_resv += mswap_pages; 793 ANI_ADD(mswap_pages); 794 ANON_PRINT((A_RESV | A_MRESV), 795 ("anon_resvmem: took %ld pages of availrmem\n", 796 mswap_pages)); 797 } else { 798 mutex_exit(&freemem_lock); 799 } 800 801 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 802 mutex_exit(&anoninfo_lock); 803 return (1); 804 805 } else { 806 /* 807 * Fail if not enough memory 808 */ 809 810 if (takemem) { 811 k_anoninfo.ani_phys_resv -= pswap_pages; 812 } 813 814 mutex_exit(&freemem_lock); 815 mutex_exit(&anoninfo_lock); 816 ANON_PRINT(A_RESV, 817 ("anon_resvmem: not enough space from swapfs\n")); 818 return (0); 819 } 820 } 821 822 823 /* 824 * Give back an anon reservation. 825 */ 826 void 827 anon_unresv(size_t size) 828 { 829 pgcnt_t npages = btopr(size); 830 spgcnt_t mem_free_pages = 0; 831 pgcnt_t phys_free_slots; 832 #ifdef ANON_DEBUG 833 pgcnt_t mem_resv; 834 #endif 835 836 mutex_enter(&anoninfo_lock); 837 838 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 839 /* 840 * If some of this reservation belonged to swapfs 841 * give it back to availrmem. 842 * ani_mem_resv is the amount of availrmem swapfs has reserved. 
 * but some of that memory could be locked by segspt so we can only
 * return non locked ani_mem_resv back to availrmem
 */
	if (k_anoninfo.ani_mem_resv > k_anoninfo.ani_locked_swap) {
		ANON_PRINT((A_RESV | A_MRESV),
		    ("anon_unresv: growing availrmem by %ld pages\n",
		    MIN(k_anoninfo.ani_mem_resv, npages)));

		mem_free_pages = MIN((spgcnt_t)(k_anoninfo.ani_mem_resv -
		    k_anoninfo.ani_locked_swap), npages);
		mutex_enter(&freemem_lock);
		availrmem += mem_free_pages;
		mutex_exit(&freemem_lock);
		k_anoninfo.ani_mem_resv -= mem_free_pages;

		ANI_ADD(-mem_free_pages);
	}
	/*
	 * The remainder of the pages is returned to phys swap
	 */
	ASSERT(npages >= mem_free_pages);
	phys_free_slots = npages - mem_free_pages;

	if (phys_free_slots) {
		k_anoninfo.ani_phys_resv -= phys_free_slots;
	}

#ifdef ANON_DEBUG
	mem_resv = k_anoninfo.ani_mem_resv;
#endif

	ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
	ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);

	mutex_exit(&anoninfo_lock);

	ANON_PRINT(A_RESV, ("anon_unresv: %lu, tot %lu, caller %p\n",
	    npages, mem_resv, (void *)caller()));
}

/*
 * Allocate an anon slot and return it with the lock held.
 */
struct anon *
anon_alloc(struct vnode *vp, anoff_t off)
{
	struct anon	*ap;
	kmutex_t	*ahm;

	ap = kmem_cache_alloc(anon_cache, KM_SLEEP);
	if (vp == NULL) {
		swap_alloc(ap);
	} else {
		ap->an_vp = vp;
		ap->an_off = off;
	}
	ap->an_refcnt = 1;
	ap->an_pvp = NULL;
	ap->an_poff = 0;
	ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)];
	mutex_enter(ahm);
	anon_addhash(ap);
	mutex_exit(ahm);
	ANI_ADD(-1);
	ANON_PRINT(A_ANON, ("anon_alloc: returning ap %p, vp %p\n",
	    (void *)ap, (ap ? (void *)ap->an_vp : NULL)));
	return (ap);
}

/*
 * Decrement the reference count of an anon page.
 * If reference count goes to zero, free it and
 * its associated page (if any).
 */
void
anon_decref(struct anon *ap)
{
	page_t *pp;
	struct vnode *vp;
	anoff_t off;
	kmutex_t *ahm;

	ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)];
	mutex_enter(ahm);
	ASSERT(ap->an_refcnt != 0);
	if (ap->an_refcnt == 0)
		panic("anon_decref: slot count 0");
	if (--ap->an_refcnt == 0) {
		swap_xlate(ap, &vp, &off);
		mutex_exit(ahm);

		/*
		 * If there is a page for this anon slot we will need to
		 * call VN_DISPOSE to get rid of the vp association and
		 * put the page back on the free list as really free.
		 * Acquire the "exclusive" lock to ensure that any
		 * pending i/o always completes before the swap slot
		 * is freed.
		 */
		pp = page_lookup(vp, (u_offset_t)off, SE_EXCL);

		/*
		 * If there was a page, we've synchronized on it (getting
		 * the exclusive lock is as good as getting the iolock)
		 * so now we can free the physical backing store.  Also, this
		 * is where we would free the name of the anonymous page
		 * (swap_free(ap)), a no-op in the current implementation.
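		 * With the hash lock reacquired we unlink the slot from
		 * the global hash, release any physical swap behind it,
		 * and finally dispose of the page itself.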
950 */ 951 mutex_enter(ahm); 952 ASSERT(ap->an_refcnt == 0); 953 anon_rmhash(ap); 954 if (ap->an_pvp) 955 swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE); 956 mutex_exit(ahm); 957 958 if (pp != NULL) { 959 /*LINTED: constant in conditional context */ 960 VN_DISPOSE(pp, B_INVAL, 0, kcred); 961 } 962 ANON_PRINT(A_ANON, ("anon_decref: free ap %p, vp %p\n", 963 (void *)ap, (void *)ap->an_vp)); 964 kmem_cache_free(anon_cache, ap); 965 966 ANI_ADD(1); 967 } else { 968 mutex_exit(ahm); 969 } 970 } 971 972 static int 973 anon_share(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots) 974 { 975 struct anon *ap; 976 977 while (nslots-- > 0) { 978 if ((ap = anon_get_ptr(ahp, anon_index)) != NULL && 979 ap->an_refcnt > 1) 980 return (1); 981 anon_index++; 982 } 983 984 return (0); 985 } 986 987 static void 988 anon_decref_pages( 989 struct anon_hdr *ahp, 990 ulong_t an_idx, 991 uint_t szc) 992 { 993 struct anon *ap = anon_get_ptr(ahp, an_idx); 994 kmutex_t *ahmpages = NULL; 995 page_t *pp; 996 pgcnt_t pgcnt = page_get_pagecnt(szc); 997 pgcnt_t i; 998 struct vnode *vp; 999 anoff_t off; 1000 kmutex_t *ahm; 1001 #ifdef DEBUG 1002 int refcnt = 1; 1003 #endif 1004 1005 ASSERT(szc != 0); 1006 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1007 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1008 1009 VM_STAT_ADD(anonvmstats.decrefpages[0]); 1010 1011 if (ap != NULL) { 1012 ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1013 mutex_enter(ahmpages); 1014 ASSERT((refcnt = ap->an_refcnt) != 0); 1015 VM_STAT_ADD(anonvmstats.decrefpages[1]); 1016 if (ap->an_refcnt == 1) { 1017 VM_STAT_ADD(anonvmstats.decrefpages[2]); 1018 ASSERT(!anon_share(ahp, an_idx, pgcnt)); 1019 mutex_exit(ahmpages); 1020 ahmpages = NULL; 1021 } 1022 } 1023 1024 i = 0; 1025 while (i < pgcnt) { 1026 if ((ap = anon_get_ptr(ahp, an_idx + i)) == NULL) { 1027 ASSERT(refcnt == 1 && ahmpages == NULL); 1028 i++; 1029 continue; 1030 } 1031 ASSERT(ap->an_refcnt == refcnt); 1032 ASSERT(ahmpages != NULL || ap->an_refcnt == 1); 1033 ASSERT(ahmpages == NULL || ap->an_refcnt > 1); 1034 1035 if (ahmpages == NULL) { 1036 swap_xlate(ap, &vp, &off); 1037 pp = page_lookup(vp, (u_offset_t)off, SE_EXCL); 1038 if (pp == NULL || pp->p_szc == 0) { 1039 VM_STAT_ADD(anonvmstats.decrefpages[3]); 1040 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, 1041 ap->an_off)]; 1042 (void) anon_set_ptr(ahp, an_idx + i, NULL, 1043 ANON_SLEEP); 1044 mutex_enter(ahm); 1045 ap->an_refcnt--; 1046 ASSERT(ap->an_refcnt == 0); 1047 anon_rmhash(ap); 1048 if (ap->an_pvp) 1049 swap_phys_free(ap->an_pvp, ap->an_poff, 1050 PAGESIZE); 1051 mutex_exit(ahm); 1052 if (pp != NULL) { 1053 VM_STAT_ADD(anonvmstats.decrefpages[4]); 1054 /*LINTED*/ 1055 VN_DISPOSE(pp, B_INVAL, 0, kcred); 1056 } 1057 kmem_cache_free(anon_cache, ap); 1058 ANI_ADD(1); 1059 i++; 1060 } else { 1061 pgcnt_t j; 1062 pgcnt_t curpgcnt = 1063 page_get_pagecnt(pp->p_szc); 1064 size_t ppasize = curpgcnt * sizeof (page_t *); 1065 page_t **ppa = kmem_alloc(ppasize, KM_SLEEP); 1066 int dispose = 0; 1067 1068 VM_STAT_ADD(anonvmstats.decrefpages[5]); 1069 1070 ASSERT(pp->p_szc <= szc); 1071 ASSERT(IS_P2ALIGNED(curpgcnt, curpgcnt)); 1072 ASSERT(IS_P2ALIGNED(i, curpgcnt)); 1073 ASSERT(i + curpgcnt <= pgcnt); 1074 ASSERT(!(page_pptonum(pp) & (curpgcnt - 1))); 1075 ppa[0] = pp; 1076 for (j = i + 1; j < i + curpgcnt; j++) { 1077 ap = anon_get_ptr(ahp, an_idx + j); 1078 ASSERT(ap != NULL && 1079 ap->an_refcnt == 1); 1080 swap_xlate(ap, &vp, &off); 1081 pp = page_lookup(vp, (u_offset_t)off, 1082 SE_EXCL); 1083 if (pp == NULL) 1084 
panic("anon_decref_pages: " 1085 "no page"); 1086 1087 (void) hat_pageunload(pp, 1088 HAT_FORCE_PGUNLOAD); 1089 ASSERT(pp->p_szc == ppa[0]->p_szc); 1090 ASSERT(page_pptonum(pp) - 1 == 1091 page_pptonum(ppa[j - i - 1])); 1092 ppa[j - i] = pp; 1093 if (ap->an_pvp != NULL && 1094 !vn_matchopval(ap->an_pvp, 1095 VOPNAME_DISPOSE, 1096 (fs_generic_func_p)fs_dispose)) 1097 dispose = 1; 1098 } 1099 if (!dispose) { 1100 VM_STAT_ADD(anonvmstats.decrefpages[6]); 1101 page_destroy_pages(ppa[0]); 1102 } else { 1103 VM_STAT_ADD(anonvmstats.decrefpages[7]); 1104 for (j = 0; j < curpgcnt; j++) { 1105 ASSERT(PAGE_EXCL(ppa[j])); 1106 ppa[j]->p_szc = 0; 1107 } 1108 for (j = 0; j < curpgcnt; j++) { 1109 ASSERT(!hat_page_is_mapped( 1110 ppa[j])); 1111 /*LINTED*/ 1112 VN_DISPOSE(ppa[j], B_INVAL, 0, 1113 kcred); 1114 } 1115 } 1116 kmem_free(ppa, ppasize); 1117 for (j = i; j < i + curpgcnt; j++) { 1118 ap = anon_get_ptr(ahp, an_idx + j); 1119 ASSERT(ap != NULL && 1120 ap->an_refcnt == 1); 1121 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, 1122 ap->an_off)]; 1123 (void) anon_set_ptr(ahp, an_idx + j, 1124 NULL, ANON_SLEEP); 1125 mutex_enter(ahm); 1126 ap->an_refcnt--; 1127 ASSERT(ap->an_refcnt == 0); 1128 anon_rmhash(ap); 1129 if (ap->an_pvp) 1130 swap_phys_free(ap->an_pvp, 1131 ap->an_poff, PAGESIZE); 1132 mutex_exit(ahm); 1133 kmem_cache_free(anon_cache, ap); 1134 ANI_ADD(1); 1135 } 1136 i += curpgcnt; 1137 } 1138 } else { 1139 VM_STAT_ADD(anonvmstats.decrefpages[8]); 1140 (void) anon_set_ptr(ahp, an_idx + i, NULL, ANON_SLEEP); 1141 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1142 mutex_enter(ahm); 1143 ap->an_refcnt--; 1144 mutex_exit(ahm); 1145 i++; 1146 } 1147 } 1148 1149 if (ahmpages != NULL) { 1150 mutex_exit(ahmpages); 1151 } 1152 } 1153 1154 /* 1155 * Duplicate references to size bytes worth of anon pages. 1156 * Used when duplicating a segment that contains private anon pages. 1157 * This code assumes that procedure calling this one has already used 1158 * hat_chgprot() to disable write access to the range of addresses that 1159 * that *old actually refers to. 1160 */ 1161 void 1162 anon_dup(struct anon_hdr *old, ulong_t old_idx, struct anon_hdr *new, 1163 ulong_t new_idx, size_t size) 1164 { 1165 spgcnt_t npages; 1166 kmutex_t *ahm; 1167 struct anon *ap; 1168 ulong_t off; 1169 ulong_t index; 1170 1171 npages = btopr(size); 1172 while (npages > 0) { 1173 index = old_idx; 1174 if ((ap = anon_get_next_ptr(old, &index)) == NULL) 1175 break; 1176 1177 ASSERT(!ANON_ISBUSY(anon_get_slot(old, index))); 1178 off = index - old_idx; 1179 npages -= off; 1180 if (npages <= 0) 1181 break; 1182 1183 (void) anon_set_ptr(new, new_idx + off, ap, ANON_SLEEP); 1184 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1185 1186 mutex_enter(ahm); 1187 ap->an_refcnt++; 1188 mutex_exit(ahm); 1189 1190 off++; 1191 new_idx += off; 1192 old_idx += off; 1193 npages--; 1194 } 1195 } 1196 1197 /* 1198 * Just like anon_dup but also guarantees there are no holes (unallocated anon 1199 * slots) within any large page region. That means if a large page region is 1200 * empty in the old array it will skip it. If there are 1 or more valid slots 1201 * in the large page region of the old array it will make sure to fill in any 1202 * unallocated ones and also copy them to the new array. If noalloc is 1 large 1203 * page region should either have no valid anon slots or all slots should be 1204 * valid. 
1205 */ 1206 void 1207 anon_dup_fill_holes( 1208 struct anon_hdr *old, 1209 ulong_t old_idx, 1210 struct anon_hdr *new, 1211 ulong_t new_idx, 1212 size_t size, 1213 uint_t szc, 1214 int noalloc) 1215 { 1216 struct anon *ap; 1217 spgcnt_t npages; 1218 kmutex_t *ahm, *ahmpages = NULL; 1219 pgcnt_t pgcnt, i; 1220 ulong_t index, off; 1221 #ifdef DEBUG 1222 int refcnt; 1223 #endif 1224 1225 ASSERT(szc != 0); 1226 pgcnt = page_get_pagecnt(szc); 1227 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1228 npages = btopr(size); 1229 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1230 ASSERT(IS_P2ALIGNED(old_idx, pgcnt)); 1231 1232 VM_STAT_ADD(anonvmstats.dupfillholes[0]); 1233 1234 while (npages > 0) { 1235 index = old_idx; 1236 1237 /* 1238 * Find the next valid slot. 1239 */ 1240 if (anon_get_next_ptr(old, &index) == NULL) 1241 break; 1242 1243 ASSERT(!ANON_ISBUSY(anon_get_slot(old, index))); 1244 /* 1245 * Now backup index to the beginning of the 1246 * current large page region of the old array. 1247 */ 1248 index = P2ALIGN(index, pgcnt); 1249 off = index - old_idx; 1250 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1251 npages -= off; 1252 if (npages <= 0) 1253 break; 1254 1255 /* 1256 * Fill and copy a large page regions worth 1257 * of anon slots. 1258 */ 1259 for (i = 0; i < pgcnt; i++) { 1260 if ((ap = anon_get_ptr(old, index + i)) == NULL) { 1261 if (noalloc) { 1262 panic("anon_dup_fill_holes: " 1263 "empty anon slot\n"); 1264 } 1265 VM_STAT_ADD(anonvmstats.dupfillholes[1]); 1266 ap = anon_alloc(NULL, 0); 1267 (void) anon_set_ptr(old, index + i, ap, 1268 ANON_SLEEP); 1269 } else if (i == 0) { 1270 /* 1271 * make the increment of all refcnts of all 1272 * anon slots of a large page appear atomic by 1273 * getting an anonpages_hash_lock for the 1274 * first anon slot of a large page. 1275 */ 1276 int hash = AH_LOCK(ap->an_vp, ap->an_off); 1277 1278 VM_STAT_ADD(anonvmstats.dupfillholes[2]); 1279 1280 ahmpages = &anonpages_hash_lock[hash]; 1281 mutex_enter(ahmpages); 1282 /*LINTED*/ 1283 ASSERT(refcnt = ap->an_refcnt); 1284 1285 VM_STAT_COND_ADD(ap->an_refcnt > 1, 1286 anonvmstats.dupfillholes[3]); 1287 } 1288 (void) anon_set_ptr(new, new_idx + off + i, ap, 1289 ANON_SLEEP); 1290 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1291 mutex_enter(ahm); 1292 ASSERT(ahmpages != NULL || ap->an_refcnt == 1); 1293 ASSERT(i == 0 || ahmpages == NULL || 1294 refcnt == ap->an_refcnt); 1295 ap->an_refcnt++; 1296 mutex_exit(ahm); 1297 } 1298 if (ahmpages != NULL) { 1299 mutex_exit(ahmpages); 1300 ahmpages = NULL; 1301 } 1302 off += pgcnt; 1303 new_idx += off; 1304 old_idx += off; 1305 npages -= pgcnt; 1306 } 1307 } 1308 1309 /* 1310 * Used when a segment with a vnode changes szc. similarly to 1311 * anon_dup_fill_holes() makes sure each large page region either has no anon 1312 * slots or all of them. but new slots are created by COWing the file 1313 * pages. on entrance no anon slots should be shared. 
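 * (each missing slot is created by reading the file page in with
 * VOP_GETPAGE() and then copying it away via anon_private())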
1314 */ 1315 int 1316 anon_fill_cow_holes( 1317 struct seg *seg, 1318 caddr_t addr, 1319 struct anon_hdr *ahp, 1320 ulong_t an_idx, 1321 struct vnode *vp, 1322 u_offset_t vp_off, 1323 size_t size, 1324 uint_t szc, 1325 uint_t prot, 1326 struct vpage vpage[], 1327 struct cred *cred) 1328 { 1329 struct anon *ap; 1330 spgcnt_t npages; 1331 pgcnt_t pgcnt, i; 1332 ulong_t index, off; 1333 int err = 0; 1334 int pageflags = 0; 1335 1336 ASSERT(szc != 0); 1337 pgcnt = page_get_pagecnt(szc); 1338 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1339 npages = btopr(size); 1340 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1341 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1342 1343 while (npages > 0) { 1344 index = an_idx; 1345 1346 /* 1347 * Find the next valid slot. 1348 */ 1349 if (anon_get_next_ptr(ahp, &index) == NULL) { 1350 break; 1351 } 1352 1353 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1354 /* 1355 * Now backup index to the beginning of the 1356 * current large page region of the anon array. 1357 */ 1358 index = P2ALIGN(index, pgcnt); 1359 off = index - an_idx; 1360 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1361 npages -= off; 1362 if (npages <= 0) 1363 break; 1364 an_idx += off; 1365 vp_off += ptob(off); 1366 addr += ptob(off); 1367 if (vpage != NULL) { 1368 vpage += off; 1369 } 1370 1371 for (i = 0; i < pgcnt; i++, an_idx++, vp_off += PAGESIZE) { 1372 if ((ap = anon_get_ptr(ahp, an_idx)) == NULL) { 1373 page_t *pl[1 + 1]; 1374 page_t *pp; 1375 1376 err = VOP_GETPAGE(vp, vp_off, PAGESIZE, NULL, 1377 pl, PAGESIZE, seg, addr, S_READ, cred); 1378 if (err) { 1379 break; 1380 } 1381 if (vpage != NULL) { 1382 prot = VPP_PROT(vpage); 1383 pageflags = VPP_ISPPLOCK(vpage) ? 1384 LOCK_PAGE : 0; 1385 } 1386 pp = anon_private(&ap, seg, addr, prot, pl[0], 1387 pageflags, cred); 1388 if (pp == NULL) { 1389 err = ENOMEM; 1390 break; 1391 } 1392 (void) anon_set_ptr(ahp, an_idx, ap, 1393 ANON_SLEEP); 1394 page_unlock(pp); 1395 } 1396 ASSERT(ap->an_refcnt == 1); 1397 addr += PAGESIZE; 1398 if (vpage != NULL) { 1399 vpage++; 1400 } 1401 } 1402 npages -= pgcnt; 1403 } 1404 1405 return (err); 1406 } 1407 1408 /* 1409 * Free a group of "size" anon pages, size in bytes, 1410 * and clear out the pointers to the anon entries. 1411 */ 1412 void 1413 anon_free(struct anon_hdr *ahp, ulong_t index, size_t size) 1414 { 1415 spgcnt_t npages; 1416 struct anon *ap; 1417 ulong_t old; 1418 1419 npages = btopr(size); 1420 1421 while (npages > 0) { 1422 old = index; 1423 if ((ap = anon_get_next_ptr(ahp, &index)) == NULL) 1424 break; 1425 1426 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1427 npages -= index - old; 1428 if (npages <= 0) 1429 break; 1430 1431 (void) anon_set_ptr(ahp, index, NULL, ANON_SLEEP); 1432 anon_decref(ap); 1433 /* 1434 * Bump index and decrement page count 1435 */ 1436 index++; 1437 npages--; 1438 } 1439 } 1440 1441 void 1442 anon_free_pages( 1443 struct anon_hdr *ahp, 1444 ulong_t an_idx, 1445 size_t size, 1446 uint_t szc) 1447 { 1448 spgcnt_t npages; 1449 pgcnt_t pgcnt; 1450 ulong_t index, off; 1451 1452 ASSERT(szc != 0); 1453 pgcnt = page_get_pagecnt(szc); 1454 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1455 npages = btopr(size); 1456 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1457 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1458 1459 VM_STAT_ADD(anonvmstats.freepages[0]); 1460 1461 while (npages > 0) { 1462 index = an_idx; 1463 1464 /* 1465 * Find the next valid slot. 
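		 * (anon_get_next_ptr() skips over any NULL slots and
		 * leaves "index" at the first allocated one)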
		 */
		if (anon_get_next_ptr(ahp, &index) == NULL)
			break;

		ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index)));
		/*
		 * Now backup index to the beginning of the
		 * current large page region of the old array.
		 */
		index = P2ALIGN(index, pgcnt);
		off = index - an_idx;
		ASSERT(IS_P2ALIGNED(off, pgcnt));
		npages -= off;
		if (npages <= 0)
			break;

		anon_decref_pages(ahp, index, szc);

		off += pgcnt;
		an_idx += off;
		npages -= pgcnt;
	}
}

/*
 * Make anonymous pages discardable
 */
void
anon_disclaim(struct anon_map *amp, ulong_t index, size_t size, int flags)
{
	spgcnt_t npages = btopr(size);
	struct anon *ap;
	struct vnode *vp;
	anoff_t off;
	page_t *pp, *root_pp;
	kmutex_t *ahm;
	pgcnt_t pgcnt;
	ulong_t old_idx, idx, i;
	struct anon_hdr *ahp = amp->ahp;
	anon_sync_obj_t cookie;

	ASSERT(RW_READ_HELD(&amp->a_rwlock));
	pgcnt = 1;
	for (; npages > 0; index = (pgcnt == 1) ? index + 1:
	    P2ROUNDUP(index + 1, pgcnt), npages -= pgcnt) {

		/*
		 * get anon pointer and index for the first valid entry
		 * in the anon list, starting from "index"
		 */
		old_idx = index;
		if ((ap = anon_get_next_ptr(ahp, &index)) == NULL)
			break;

		/*
		 * decrement npages by number of NULL anon slots we skipped
		 */
		npages -= index - old_idx;
		if (npages <= 0)
			break;

		anon_array_enter(amp, index, &cookie);
		ap = anon_get_ptr(ahp, index);
		ASSERT(ap != NULL);

		/*
		 * Get anonymous page and try to lock it SE_EXCL;
		 * For non blocking case if we couldn't grab the lock
		 * we skip to next page.
		 * For blocking case (ANON_PGLOOKUP_BLK) block
		 * until we grab SE_EXCL lock.
		 */
		swap_xlate(ap, &vp, &off);
		if (flags & ANON_PGLOOKUP_BLK)
			pp = page_lookup_create(vp, (u_offset_t)off,
			    SE_EXCL, NULL, NULL, SE_EXCL_WANTED);
		else
			pp = page_lookup_nowait(vp, (u_offset_t)off, SE_EXCL);
		if (pp == NULL) {
			segadvstat.MADV_FREE_miss.value.ul++;
			pgcnt = 1;
			anon_array_exit(&cookie);
			continue;
		}
		pgcnt = page_get_pagecnt(pp->p_szc);

		/*
		 * we cannot free a page which is permanently locked.
		 * The page_struct_lock need not be acquired to examine
		 * these fields since the page has an "exclusive" lock.
		 */
		if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
			page_unlock(pp);
			segadvstat.MADV_FREE_miss.value.ul++;
			anon_array_exit(&cookie);
			continue;
		}

		ahm = &anonhash_lock[AH_LOCK(vp, off)];
		mutex_enter(ahm);
		ASSERT(ap->an_refcnt != 0);
		/*
		 * skip this one if copy-on-write is not yet broken.
		 */
		if (ap->an_refcnt > 1) {
			mutex_exit(ahm);
			page_unlock(pp);
			segadvstat.MADV_FREE_miss.value.ul++;
			anon_array_exit(&cookie);
			continue;
		}

		if (pp->p_szc == 0) {
			pgcnt = 1;

			/*
			 * free swap slot;
			 */
			if (ap->an_pvp) {
				swap_phys_free(ap->an_pvp, ap->an_poff,
				    PAGESIZE);
				ap->an_pvp = NULL;
				ap->an_poff = 0;
			}
			mutex_exit(ahm);
			segadvstat.MADV_FREE_hit.value.ul++;

			/*
			 * while we are at it, unload all the translations
			 * and attempt to free the page.
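			 * (hat_pageunload() strips any remaining mappings so
			 * that VN_DISPOSE() can put the page back on the
			 * free list)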
1596 */ 1597 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1598 /*LINTED: constant in conditional context */ 1599 VN_DISPOSE(pp, B_FREE, 0, kcred); 1600 anon_array_exit(&cookie); 1601 continue; 1602 } 1603 1604 pgcnt = page_get_pagecnt(pp->p_szc); 1605 if (!IS_P2ALIGNED(index, pgcnt)) { 1606 if (!page_try_demote_pages(pp)) { 1607 mutex_exit(ahm); 1608 page_unlock(pp); 1609 segadvstat.MADV_FREE_miss.value.ul++; 1610 anon_array_exit(&cookie); 1611 continue; 1612 } else { 1613 pgcnt = 1; 1614 if (ap->an_pvp) { 1615 swap_phys_free(ap->an_pvp, 1616 ap->an_poff, PAGESIZE); 1617 ap->an_pvp = NULL; 1618 ap->an_poff = 0; 1619 } 1620 mutex_exit(ahm); 1621 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1622 /*LINTED*/ 1623 VN_DISPOSE(pp, B_FREE, 0, kcred); 1624 segadvstat.MADV_FREE_hit.value.ul++; 1625 anon_array_exit(&cookie); 1626 continue; 1627 } 1628 } 1629 mutex_exit(ahm); 1630 root_pp = pp; 1631 1632 /* 1633 * try to lock remaining pages 1634 */ 1635 for (idx = 1; idx < pgcnt; idx++) { 1636 pp++; 1637 if (!page_trylock(pp, SE_EXCL)) 1638 break; 1639 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1640 page_unlock(pp); 1641 break; 1642 } 1643 } 1644 1645 if (idx == pgcnt) { 1646 for (i = 0; i < pgcnt; i++) { 1647 ap = anon_get_ptr(ahp, index + i); 1648 if (ap == NULL) 1649 break; 1650 swap_xlate(ap, &vp, &off); 1651 ahm = &anonhash_lock[AH_LOCK(vp, off)]; 1652 mutex_enter(ahm); 1653 ASSERT(ap->an_refcnt != 0); 1654 1655 /* 1656 * skip this one if copy-on-write 1657 * is not yet broken. 1658 */ 1659 if (ap->an_refcnt > 1) { 1660 mutex_exit(ahm); 1661 goto skiplp; 1662 } 1663 if (ap->an_pvp) { 1664 swap_phys_free(ap->an_pvp, 1665 ap->an_poff, PAGESIZE); 1666 ap->an_pvp = NULL; 1667 ap->an_poff = 0; 1668 } 1669 mutex_exit(ahm); 1670 } 1671 page_destroy_pages(root_pp); 1672 segadvstat.MADV_FREE_hit.value.ul += pgcnt; 1673 anon_array_exit(&cookie); 1674 continue; 1675 } 1676 skiplp: 1677 segadvstat.MADV_FREE_miss.value.ul += pgcnt; 1678 for (i = 0, pp = root_pp; i < idx; pp++, i++) 1679 page_unlock(pp); 1680 anon_array_exit(&cookie); 1681 } 1682 } 1683 1684 /* 1685 * Return the kept page(s) and protections back to the segment driver. 1686 */ 1687 int 1688 anon_getpage( 1689 struct anon **app, 1690 uint_t *protp, 1691 page_t *pl[], 1692 size_t plsz, 1693 struct seg *seg, 1694 caddr_t addr, 1695 enum seg_rw rw, 1696 struct cred *cred) 1697 { 1698 page_t *pp; 1699 struct anon *ap = *app; 1700 struct vnode *vp; 1701 anoff_t off; 1702 int err; 1703 kmutex_t *ahm; 1704 1705 swap_xlate(ap, &vp, &off); 1706 1707 /* 1708 * Lookup the page. If page is being paged in, 1709 * wait for it to finish as we must return a list of 1710 * pages since this routine acts like the VOP_GETPAGE 1711 * routine does. 1712 */ 1713 if (pl != NULL && (pp = page_lookup(vp, (u_offset_t)off, SE_SHARED))) { 1714 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1715 mutex_enter(ahm); 1716 if (ap->an_refcnt == 1) 1717 *protp = PROT_ALL; 1718 else 1719 *protp = PROT_ALL & ~PROT_WRITE; 1720 mutex_exit(ahm); 1721 pl[0] = pp; 1722 pl[1] = NULL; 1723 return (0); 1724 } 1725 1726 /* 1727 * Simply treat it as a vnode fault on the anon vp. 
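	 * (the <vp, off> pair produced by swap_xlate() above names the
	 * backing store, so the request is serviced by the swap layer)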
	 */

	TRACE_3(TR_FAC_VM, TR_ANON_GETPAGE,
		"anon_getpage:seg %x addr %x vp %x",
		seg, addr, vp);

	err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, protp, pl, plsz,
	    seg, addr, rw, cred);

	if (err == 0 && pl != NULL) {
		ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)];
		mutex_enter(ahm);
		if (ap->an_refcnt != 1)
			*protp &= ~PROT_WRITE;	/* make read-only */
		mutex_exit(ahm);
	}
	return (err);
}

/*
 * Creates or returns kept pages to the segment driver.  Returns -1 if a large
 * page cannot be allocated.  Returns -2 if some other process has allocated a
 * larger page.
 *
 * For cowfault it will allocate any size pages to fill the requested area to
 * avoid partially overwriting anon slots (i.e. sharing only some of the anon
 * slots within a large page with other processes).  This policy greatly
 * simplifies large page freeing (which is only freed when all anon slot
 * refcnts are 0).
 */
int
anon_map_getpages(
	struct anon_map *amp,
	ulong_t	start_idx,
	uint_t	szc,
	struct seg *seg,
	caddr_t	addr,
	uint_t prot,
	uint_t *protp,
	page_t	*ppa[],
	uint_t	*ppa_szc,
	struct vpage vpage[],
	enum seg_rw rw,
	int brkcow,
	int anypgsz,
	struct cred *cred)
{
	pgcnt_t pgcnt;
	struct anon *ap;
	struct vnode *vp;
	anoff_t off;
	page_t *pp, *pl[2], *conpp = NULL;
	caddr_t vaddr;
	ulong_t pg_idx, an_idx, i;
	spgcnt_t nreloc = 0;
	int prealloc = 1;
	int err, slotcreate;
	uint_t vpprot;

#if !defined(__i386) && !defined(__amd64)
	ASSERT(seg->s_szc != 0);
#endif
	ASSERT(szc <= seg->s_szc);
	ASSERT(ppa_szc != NULL);
	ASSERT(rw != S_CREATE);

	*protp = PROT_ALL;

	VM_STAT_ADD(anonvmstats.getpages[0]);

	if (szc == 0) {
		VM_STAT_ADD(anonvmstats.getpages[1]);
		if ((ap = anon_get_ptr(amp->ahp, start_idx)) != NULL) {
			err = anon_getpage(&ap, protp, pl, PAGESIZE, seg,
			    addr, rw, cred);
			if (err)
				return (err);
			ppa[0] = pl[0];
			if (brkcow == 0 || (*protp & PROT_WRITE)) {
				VM_STAT_ADD(anonvmstats.getpages[2]);
				if (ppa[0]->p_szc != 0) {
					VM_STAT_ADD(anonvmstats.getpages[3]);
					*ppa_szc = ppa[0]->p_szc;
					page_unlock(ppa[0]);
					return (-2);
				}
				return (0);
			}
			panic("anon_map_getpages: cowfault for szc 0");
		} else {
			VM_STAT_ADD(anonvmstats.getpages[4]);
			ppa[0] = anon_zero(seg, addr, &ap, cred);
			if (ppa[0] == NULL)
				return (ENOMEM);
			(void) anon_set_ptr(amp->ahp, start_idx, ap,
			    ANON_SLEEP);
			return (0);
		}
	}

	pgcnt = page_get_pagecnt(szc);
	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
	ASSERT(IS_P2ALIGNED(start_idx, pgcnt));

	/*
	 * First we check for the case that the requested large
	 * page or larger page already exists in the system.
	 * Actually we only check if the first constituent page
	 * exists and only preallocate if it's not found.
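	 * (page_exists_forreal() reports the size code of any page it
	 * finds, so we can return -2 if somebody already has a larger
	 * page here and skip preallocation when it is exactly our size)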
1837 */ 1838 ap = anon_get_ptr(amp->ahp, start_idx); 1839 if (ap) { 1840 uint_t pszc; 1841 swap_xlate(ap, &vp, &off); 1842 if (page_exists_forreal(vp, (u_offset_t)off, &pszc)) { 1843 if (pszc > szc) { 1844 *ppa_szc = pszc; 1845 return (-2); 1846 } 1847 if (pszc == szc) { 1848 prealloc = 0; 1849 } 1850 } 1851 } 1852 1853 VM_STAT_COND_ADD(prealloc == 0, anonvmstats.getpages[5]); 1854 VM_STAT_COND_ADD(prealloc != 0, anonvmstats.getpages[6]); 1855 1856 top: 1857 /* 1858 * If a smaller page or no page at all was found, 1859 * grab a large page off the freelist. 1860 */ 1861 if (prealloc) { 1862 ASSERT(conpp == NULL); 1863 if (page_alloc_pages(seg, addr, NULL, ppa, szc, 0) != 0) { 1864 VM_STAT_ADD(anonvmstats.getpages[7]); 1865 if (brkcow == 0 || 1866 !anon_share(amp->ahp, start_idx, pgcnt)) { 1867 /* 1868 * If the refcnt's of all anon slots are <= 1 1869 * they can't increase since we are holding 1870 * the address space's lock. So segvn can 1871 * safely decrease szc without risking to 1872 * generate a cow fault for the region smaller 1873 * than the segment's largest page size. 1874 */ 1875 VM_STAT_ADD(anonvmstats.getpages[8]); 1876 return (-1); 1877 } 1878 docow: 1879 /* 1880 * This is a cow fault. Copy away the entire 1 large 1881 * page region of this segment. 1882 */ 1883 if (szc != seg->s_szc) 1884 panic("anon_map_getpages: cowfault for szc %d", 1885 szc); 1886 vaddr = addr; 1887 for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt; 1888 pg_idx++, an_idx++, vaddr += PAGESIZE) { 1889 if ((ap = anon_get_ptr(amp->ahp, an_idx)) != 1890 NULL) { 1891 err = anon_getpage(&ap, &vpprot, pl, 1892 PAGESIZE, seg, vaddr, rw, cred); 1893 if (err) { 1894 for (i = 0; i < pg_idx; i++) { 1895 if ((pp = ppa[i]) != 1896 NULL) 1897 page_unlock(pp); 1898 } 1899 return (err); 1900 } 1901 ppa[pg_idx] = pl[0]; 1902 } else { 1903 /* 1904 * Since this is a cowfault we know 1905 * that this address space has a 1906 * parent or children which means 1907 * anon_dup_fill_holes() has initialized 1908 * all anon slots within a large page 1909 * region that had at least one anon 1910 * slot at the time of fork(). 1911 */ 1912 panic("anon_map_getpages: " 1913 "cowfault but anon slot is empty"); 1914 } 1915 } 1916 VM_STAT_ADD(anonvmstats.getpages[9]); 1917 *protp = PROT_ALL; 1918 return (anon_map_privatepages(amp, start_idx, szc, seg, 1919 addr, prot, ppa, vpage, anypgsz, cred)); 1920 } 1921 } 1922 1923 VM_STAT_ADD(anonvmstats.getpages[10]); 1924 1925 an_idx = start_idx; 1926 pg_idx = 0; 1927 vaddr = addr; 1928 while (pg_idx < pgcnt) { 1929 slotcreate = 0; 1930 if ((ap = anon_get_ptr(amp->ahp, an_idx)) == NULL) { 1931 VM_STAT_ADD(anonvmstats.getpages[11]); 1932 /* 1933 * For us to have decided not to preallocate 1934 * would have meant that a large page 1935 * was found. Which also means that all of the 1936 * anon slots for that page would have been 1937 * already created for us. 1938 */ 1939 if (prealloc == 0) 1940 panic("anon_map_getpages: prealloc = 0"); 1941 1942 slotcreate = 1; 1943 ap = anon_alloc(NULL, 0); 1944 } 1945 swap_xlate(ap, &vp, &off); 1946 1947 /* 1948 * Now setup our preallocated page to pass down 1949 * to swap_getpage(). 1950 */ 1951 if (prealloc) { 1952 ASSERT(ppa[pg_idx]->p_szc == szc); 1953 conpp = ppa[pg_idx]; 1954 } 1955 ASSERT(prealloc || conpp == NULL); 1956 1957 /* 1958 * If we just created this anon slot then call 1959 * with S_CREATE to prevent doing IO on the page. 1960 * Similar to the anon_zero case. 
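		 * (a slot we just allocated has no backing store to read
		 * from, so S_CREATE simply hands back an empty page which
		 * is zeroed further below)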
1961 */ 1962 err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE, 1963 NULL, pl, PAGESIZE, conpp, &nreloc, seg, vaddr, 1964 slotcreate == 1 ? S_CREATE : rw, cred); 1965 1966 if (err) { 1967 VM_STAT_ADD(anonvmstats.getpages[12]); 1968 ASSERT(slotcreate == 0); 1969 goto io_err; 1970 } 1971 1972 pp = pl[0]; 1973 1974 if (pp->p_szc != szc) { 1975 VM_STAT_ADD(anonvmstats.getpages[13]); 1976 ASSERT(slotcreate == 0); 1977 ASSERT(prealloc == 0); 1978 ASSERT(pg_idx == 0); 1979 if (pp->p_szc > szc) { 1980 page_unlock(pp); 1981 VM_STAT_ADD(anonvmstats.getpages[14]); 1982 return (-2); 1983 } 1984 page_unlock(pp); 1985 prealloc = 1; 1986 goto top; 1987 } 1988 1989 /* 1990 * If we decided to preallocate but VOP_GETPAGE 1991 * found a page in the system that satisfies our 1992 * request then free up our preallocated large page 1993 * and continue looping accross the existing large 1994 * page via VOP_GETPAGE. 1995 */ 1996 if (prealloc && pp != ppa[pg_idx]) { 1997 VM_STAT_ADD(anonvmstats.getpages[15]); 1998 ASSERT(slotcreate == 0); 1999 ASSERT(pg_idx == 0); 2000 conpp = NULL; 2001 prealloc = 0; 2002 page_free_pages(ppa[0]); 2003 } 2004 2005 if (prealloc && nreloc > 1) { 2006 /* 2007 * we have relocated out of a smaller large page. 2008 * skip npgs - 1 iterations and continue which will 2009 * increment by one the loop indices. 2010 */ 2011 spgcnt_t npgs = nreloc; 2012 2013 VM_STAT_ADD(anonvmstats.getpages[16]); 2014 2015 ASSERT(pp == ppa[pg_idx]); 2016 ASSERT(slotcreate == 0); 2017 ASSERT(pg_idx + npgs <= pgcnt); 2018 if ((*protp & PROT_WRITE) && 2019 anon_share(amp->ahp, an_idx, npgs)) { 2020 *protp &= ~PROT_WRITE; 2021 } 2022 pg_idx += npgs; 2023 an_idx += npgs; 2024 vaddr += PAGESIZE * npgs; 2025 continue; 2026 } 2027 2028 VM_STAT_ADD(anonvmstats.getpages[17]); 2029 2030 /* 2031 * Anon_zero case. 2032 */ 2033 if (slotcreate) { 2034 ASSERT(prealloc); 2035 pagezero(pp, 0, PAGESIZE); 2036 CPU_STATS_ADD_K(vm, zfod, 1); 2037 hat_setrefmod(pp); 2038 } 2039 2040 ASSERT(prealloc == 0 || ppa[pg_idx] == pp); 2041 ASSERT(prealloc != 0 || PAGE_SHARED(pp)); 2042 ASSERT(prealloc == 0 || PAGE_EXCL(pp)); 2043 2044 if (pg_idx > 0 && 2045 ((page_pptonum(pp) != page_pptonum(ppa[pg_idx - 1]) + 1) || 2046 (pp->p_szc != ppa[pg_idx - 1]->p_szc))) 2047 panic("anon_map_getpages: unexpected page"); 2048 2049 if (prealloc == 0) { 2050 ppa[pg_idx] = pp; 2051 } 2052 2053 if (ap->an_refcnt > 1) { 2054 VM_STAT_ADD(anonvmstats.getpages[18]); 2055 *protp &= ~PROT_WRITE; 2056 } 2057 2058 /* 2059 * If this is a new anon slot then initialize 2060 * the anon array entry. 2061 */ 2062 if (slotcreate) { 2063 (void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP); 2064 } 2065 pg_idx++; 2066 an_idx++; 2067 vaddr += PAGESIZE; 2068 } 2069 2070 /* 2071 * Since preallocated pages come off the freelist 2072 * they are locked SE_EXCL. Simply downgrade and return. 2073 */ 2074 if (prealloc) { 2075 VM_STAT_ADD(anonvmstats.getpages[19]); 2076 conpp = NULL; 2077 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { 2078 page_downgrade(ppa[pg_idx]); 2079 } 2080 } 2081 ASSERT(conpp == NULL); 2082 2083 if (brkcow == 0 || (*protp & PROT_WRITE)) { 2084 VM_STAT_ADD(anonvmstats.getpages[20]); 2085 return (0); 2086 } 2087 2088 if (szc < seg->s_szc) 2089 panic("anon_map_getpages: cowfault for szc %d", szc); 2090 2091 VM_STAT_ADD(anonvmstats.getpages[21]); 2092 2093 *protp = PROT_ALL; 2094 return (anon_map_privatepages(amp, start_idx, szc, seg, addr, prot, 2095 ppa, vpage, anypgsz, cred)); 2096 io_err: 2097 /* 2098 * We got an IO error somewhere in our large page. 
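	 * (err > 0 is a real I/O error from swap_getconpage(); -1 means
	 * the relocation into our preallocated large page failed; -2
	 * means some other process already has a larger page here)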
2099 * If we were using a preallocated page then just demote 2100 * all the constituent pages that we've succeeded with sofar 2101 * to PAGESIZE pages and leave them in the system 2102 * unlocked. 2103 */ 2104 2105 ASSERT(err != -2 || pg_idx == 0); 2106 2107 VM_STAT_COND_ADD(err > 0, anonvmstats.getpages[22]); 2108 VM_STAT_COND_ADD(err == -1, anonvmstats.getpages[23]); 2109 VM_STAT_COND_ADD(err == -2, anonvmstats.getpages[24]); 2110 2111 if (prealloc) { 2112 conpp = NULL; 2113 if (pg_idx > 0) { 2114 VM_STAT_ADD(anonvmstats.getpages[25]); 2115 for (i = 0; i < pgcnt; i++) { 2116 pp = ppa[i]; 2117 ASSERT(PAGE_EXCL(pp)); 2118 ASSERT(pp->p_szc == szc); 2119 pp->p_szc = 0; 2120 } 2121 for (i = 0; i < pg_idx; i++) { 2122 ASSERT(!hat_page_is_mapped(ppa[i])); 2123 page_unlock(ppa[i]); 2124 } 2125 /* 2126 * Now free up the remaining unused constituent 2127 * pages. 2128 */ 2129 while (pg_idx < pgcnt) { 2130 ASSERT(!hat_page_is_mapped(ppa[pg_idx])); 2131 page_free(ppa[pg_idx], 0); 2132 pg_idx++; 2133 } 2134 } else { 2135 VM_STAT_ADD(anonvmstats.getpages[26]); 2136 page_free_pages(ppa[0]); 2137 } 2138 } else { 2139 VM_STAT_ADD(anonvmstats.getpages[27]); 2140 ASSERT(err > 0); 2141 for (i = 0; i < pg_idx; i++) 2142 page_unlock(ppa[i]); 2143 } 2144 ASSERT(conpp == NULL); 2145 if (err != -1) 2146 return (err); 2147 /* 2148 * we are here because we failed to relocate. 2149 */ 2150 ASSERT(prealloc); 2151 if (brkcow == 0 || !anon_share(amp->ahp, start_idx, pgcnt)) { 2152 VM_STAT_ADD(anonvmstats.getpages[28]); 2153 return (-1); 2154 } 2155 VM_STAT_ADD(anonvmstats.getpages[29]); 2156 goto docow; 2157 } 2158 2159 2160 /* 2161 * Turn a reference to an object or shared anon page 2162 * into a private page with a copy of the data from the 2163 * original page which is always locked by the caller. 2164 * This routine unloads the translation and unlocks the 2165 * original page, if it isn't being stolen, before returning 2166 * to the caller. 2167 * 2168 * NOTE: The original anon slot is not freed by this routine 2169 * It must be freed by the caller while holding the 2170 * "anon_map" lock to prevent races which can occur if 2171 * a process has multiple lwps in its address space. 2172 */ 2173 page_t * 2174 anon_private( 2175 struct anon **app, 2176 struct seg *seg, 2177 caddr_t addr, 2178 uint_t prot, 2179 page_t *opp, 2180 int oppflags, 2181 struct cred *cred) 2182 { 2183 struct anon *old = *app; 2184 struct anon *new; 2185 page_t *pp = NULL; 2186 struct vnode *vp; 2187 anoff_t off; 2188 page_t *anon_pl[1 + 1]; 2189 int err; 2190 2191 if (oppflags & STEAL_PAGE) 2192 ASSERT(PAGE_EXCL(opp)); 2193 else 2194 ASSERT(PAGE_LOCKED(opp)); 2195 2196 CPU_STATS_ADD_K(vm, cow_fault, 1); 2197 2198 /* Kernel probe */ 2199 TNF_PROBE_1(anon_private, "vm pagefault", /* CSTYLED */, 2200 tnf_opaque, address, addr); 2201 2202 *app = new = anon_alloc(NULL, 0); 2203 swap_xlate(new, &vp, &off); 2204 2205 if (oppflags & STEAL_PAGE) { 2206 page_rename(opp, vp, (u_offset_t)off); 2207 pp = opp; 2208 TRACE_5(TR_FAC_VM, TR_ANON_PRIVATE, 2209 "anon_private:seg %p addr %x pp %p vp %p off %lx", 2210 seg, addr, pp, vp, off); 2211 hat_setmod(pp); 2212 2213 /* bug 4026339 */ 2214 page_downgrade(pp); 2215 return (pp); 2216 } 2217 2218 /* 2219 * Call the VOP_GETPAGE routine to create the page, thereby 2220 * enabling the vnode driver to allocate any filesystem 2221 * space (e.g., disk block allocation for UFS). This also 2222 * prevents more than one page from being added to the 2223 * vnode at the same time. 
2224 */ 2225 err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, NULL, 2226 anon_pl, PAGESIZE, seg, addr, S_CREATE, cred); 2227 if (err) 2228 goto out; 2229 2230 pp = anon_pl[0]; 2231 2232 /* 2233 * If the original page was locked, we need to move the lock 2234 * to the new page by transfering 'cowcnt/lckcnt' of the original 2235 * page to 'cowcnt/lckcnt' of the new page. 2236 * 2237 * See Statement at the beginning of segvn_lockop() and 2238 * comments in page_pp_useclaim() regarding the way 2239 * cowcnts/lckcnts are handled. 2240 * 2241 * Also availrmem must be decremented up front for read only mapping 2242 * before calling page_pp_useclaim. page_pp_useclaim will bump it back 2243 * if availrmem did not need to be decremented after all. 2244 */ 2245 if (oppflags & LOCK_PAGE) { 2246 if ((prot & PROT_WRITE) == 0) { 2247 mutex_enter(&freemem_lock); 2248 if (availrmem > pages_pp_maximum) { 2249 availrmem--; 2250 pages_useclaim++; 2251 } else { 2252 mutex_exit(&freemem_lock); 2253 goto out; 2254 } 2255 mutex_exit(&freemem_lock); 2256 } 2257 page_pp_useclaim(opp, pp, prot & PROT_WRITE); 2258 } 2259 2260 /* 2261 * Now copy the contents from the original page, 2262 * which is locked and loaded in the MMU by 2263 * the caller to prevent yet another page fault. 2264 */ 2265 ppcopy(opp, pp); /* XXX - should set mod bit in here */ 2266 2267 hat_setrefmod(pp); /* mark as modified */ 2268 2269 /* 2270 * Unload the old translation. 2271 */ 2272 hat_unload(seg->s_as->a_hat, addr, PAGESIZE, HAT_UNLOAD); 2273 2274 /* 2275 * Free unmapped, unmodified original page. 2276 * or release the lock on the original page, 2277 * otherwise the process will sleep forever in 2278 * anon_decref() waiting for the "exclusive" lock 2279 * on the page. 2280 */ 2281 (void) page_release(opp, 1); 2282 2283 /* 2284 * we are done with page creation so downgrade the new 2285 * page's selock to shared, this helps when multiple 2286 * as_fault(...SOFTLOCK...) are done to the same 2287 * page(aio) 2288 */ 2289 page_downgrade(pp); 2290 2291 /* 2292 * NOTE: The original anon slot must be freed by the 2293 * caller while holding the "anon_map" lock, if we 2294 * copied away from an anonymous page. 2295 */ 2296 return (pp); 2297 2298 out: 2299 *app = old; 2300 if (pp) 2301 page_unlock(pp); 2302 anon_decref(new); 2303 page_unlock(opp); 2304 return ((page_t *)NULL); 2305 } 2306 2307 int 2308 anon_map_privatepages( 2309 struct anon_map *amp, 2310 ulong_t start_idx, 2311 uint_t szc, 2312 struct seg *seg, 2313 caddr_t addr, 2314 uint_t prot, 2315 page_t *ppa[], 2316 struct vpage vpage[], 2317 int anypgsz, 2318 struct cred *cred) 2319 { 2320 pgcnt_t pgcnt; 2321 struct vnode *vp; 2322 anoff_t off; 2323 page_t *pl[2], *conpp = NULL; 2324 int err; 2325 int prealloc = 1; 2326 struct anon *ap, *oldap; 2327 caddr_t vaddr; 2328 page_t *pplist, *pp; 2329 ulong_t pg_idx, an_idx; 2330 spgcnt_t nreloc = 0; 2331 int pagelock = 0; 2332 kmutex_t *ahmpages = NULL; 2333 #ifdef DEBUG 2334 int refcnt; 2335 #endif 2336 2337 ASSERT(szc != 0); 2338 ASSERT(szc == seg->s_szc); 2339 2340 VM_STAT_ADD(anonvmstats.privatepages[0]); 2341 2342 pgcnt = page_get_pagecnt(szc); 2343 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 2344 ASSERT(IS_P2ALIGNED(start_idx, pgcnt)); 2345 2346 ASSERT(amp != NULL); 2347 ap = anon_get_ptr(amp->ahp, start_idx); 2348 ASSERT(ap == NULL || ap->an_refcnt >= 1); 2349 2350 VM_STAT_COND_ADD(ap == NULL, anonvmstats.privatepages[1]); 2351 2352 /* 2353 * Now try and allocate the large page. If we fail then just 2354 * let VOP_GETPAGE give us PAGESIZE pages. 
Normally we let 2355 * the caller make this decision but to avoid added complexity 2356 * it's simpler to handle that case here. 2357 */
2358 if (anypgsz == -1) { 2359 VM_STAT_ADD(anonvmstats.privatepages[2]); 2360 prealloc = 0;
2361 } else if (page_alloc_pages(seg, addr, &pplist, NULL, szc, 2362 anypgsz) != 0) { 2363 VM_STAT_ADD(anonvmstats.privatepages[3]); 2364 prealloc = 0; 2365 } 2366
2367 /* 2368 * Make the decrement of all refcnts of all 2369 * anon slots of a large page appear atomic by 2370 * getting the anonpages_hash_lock for the 2371 * first anon slot of the large page. 2372 */
2373 if (ap != NULL) { 2374 ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, 2375 ap->an_off)]; 2376 mutex_enter(ahmpages);
2377 if (ap->an_refcnt == 1) { 2378 VM_STAT_ADD(anonvmstats.privatepages[4]); 2379 ASSERT(!anon_share(amp->ahp, start_idx, pgcnt)); 2380 mutex_exit(ahmpages); 2381
2382 if (prealloc) { 2383 page_free_replacement_page(pplist); 2384 page_create_putback(pgcnt); 2385 }
2386 ASSERT(ppa[0]->p_szc <= szc); 2387 if (ppa[0]->p_szc == szc) { 2388 VM_STAT_ADD(anonvmstats.privatepages[5]); 2389 return (0); 2390 }
2391 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { 2392 ASSERT(ppa[pg_idx] != NULL); 2393 page_unlock(ppa[pg_idx]); 2394 } 2395 return (-1); 2396 } 2397 } 2398
2399 /* 2400 * If we are passed in the vpage array and this is 2401 * not PROT_WRITE then we need to decrement availrmem 2402 * up front before we try anything. If we need to and 2403 * can't decrement availrmem then it's better to fail now 2404 * than in the middle of processing the new large page. 2405 * page_pp_useclaim() on behalf of each constituent page 2406 * below will adjust availrmem back for the cases where it is not needed. 2407 */
2408 if (vpage != NULL && (prot & PROT_WRITE) == 0) { 2409 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { 2410 if (VPP_ISPPLOCK(&vpage[pg_idx])) { 2411 pagelock = 1; 2412 break; 2413 } 2414 }
2415 if (pagelock) { 2416 VM_STAT_ADD(anonvmstats.privatepages[6]); 2417 mutex_enter(&freemem_lock); 2418 if (availrmem >= pages_pp_maximum + pgcnt) { 2419 availrmem -= pgcnt; 2420 pages_useclaim += pgcnt;
2421 } else { 2422 VM_STAT_ADD(anonvmstats.privatepages[7]); 2423 mutex_exit(&freemem_lock); 2424 if (ahmpages != NULL) { 2425 mutex_exit(ahmpages); 2426 }
2427 if (prealloc) { 2428 page_free_replacement_page(pplist); 2429 page_create_putback(pgcnt); 2430 }
2431 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) 2432 if (ppa[pg_idx] != NULL) 2433 page_unlock(ppa[pg_idx]); 2434 return (ENOMEM); 2435 } 2436 mutex_exit(&freemem_lock); 2437 } 2438 } 2439
2440 CPU_STATS_ADD_K(vm, cow_fault, pgcnt); 2441 2442 VM_STAT_ADD(anonvmstats.privatepages[8]); 2443
2444 an_idx = start_idx; 2445 pg_idx = 0; 2446 vaddr = addr;
2447 for (; pg_idx < pgcnt; pg_idx++, an_idx++, vaddr += PAGESIZE) { 2448 ASSERT(ppa[pg_idx] != NULL); 2449 oldap = anon_get_ptr(amp->ahp, an_idx);
2450 ASSERT(ahmpages != NULL || oldap == NULL); 2451 ASSERT(ahmpages == NULL || oldap != NULL); 2452 ASSERT(ahmpages == NULL || oldap->an_refcnt > 1);
2453 ASSERT(ahmpages == NULL || pg_idx != 0 || 2454 (refcnt = oldap->an_refcnt)); 2455 ASSERT(ahmpages == NULL || pg_idx == 0 || 2456 refcnt == oldap->an_refcnt); 2457
2458 ap = anon_alloc(NULL, 0); 2459 2460 swap_xlate(ap, &vp, &off); 2461
2462 /* 2463 * Now setup our preallocated page to pass down to 2464 * swap_getpage().
2465 */ 2466 if (prealloc) { 2467 pp = pplist; 2468 page_sub(&pplist, pp); 2469 conpp = pp; 2470 } 2471
2472 err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE, NULL, pl, 2473 PAGESIZE, conpp, &nreloc, seg, vaddr, S_CREATE, cred); 2474
2475 /* 2476 * Impossible to fail since this is S_CREATE. 2477 */ 2478 if (err) 2479 panic("anon_map_privatepages: VOP_GETPAGE failed"); 2480
2481 ASSERT(prealloc ? pp == pl[0] : pl[0]->p_szc == 0); 2482 ASSERT(prealloc == 0 || nreloc == 1); 2483 2484 pp = pl[0]; 2485
2486 /* 2487 * If the original page was locked, we need to move 2488 * the lock to the new page by transferring 2489 * 'cowcnt/lckcnt' of the original page to 'cowcnt/lckcnt' 2490 * of the new page. pg_idx can be used to index 2491 * into the vpage array since the caller will guarantee 2492 * that the vpage struct passed in corresponds to addr 2493 * and forward. 2494 */
2495 if (vpage != NULL && VPP_ISPPLOCK(&vpage[pg_idx])) { 2496 page_pp_useclaim(ppa[pg_idx], pp, prot & PROT_WRITE);
2497 } else if (pagelock) { 2498 mutex_enter(&freemem_lock); 2499 availrmem++; 2500 pages_useclaim--; 2501 mutex_exit(&freemem_lock); 2502 } 2503
2504 /* 2505 * Now copy the contents from the original page. 2506 */ 2507 ppcopy(ppa[pg_idx], pp); 2508 2509 hat_setrefmod(pp); /* mark as modified */ 2510
2511 /* 2512 * Release the lock on the original page, 2513 * decrement the old slot, and downgrade the lock 2514 * on the new copy. 2515 */
2516 page_unlock(ppa[pg_idx]); 2517 2518 if (!prealloc) 2519 page_downgrade(pp); 2520 2521 ppa[pg_idx] = pp; 2522
2523 /* 2524 * Now reflect the copy in the new anon array. 2525 */
2526 ASSERT(ahmpages == NULL || oldap->an_refcnt > 1); 2527 if (oldap != NULL) 2528 anon_decref(oldap); 2529 (void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP); 2530 }
2531 if (ahmpages != NULL) { 2532 mutex_exit(ahmpages); 2533 } 2534 ASSERT(prealloc == 0 || pplist == NULL);
2535 if (prealloc) { 2536 VM_STAT_ADD(anonvmstats.privatepages[9]); 2537 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { 2538 page_downgrade(ppa[pg_idx]); 2539 } 2540 } 2541
2542 /* 2543 * Unload the old large page translation. 2544 */ 2545 hat_unload(seg->s_as->a_hat, addr, pgcnt << PAGESHIFT, HAT_UNLOAD); 2546 return (0); 2547 } 2548
2549 /* 2550 * Allocate a private zero-filled anon page. 2551 */ 2552 page_t * 2553 anon_zero(struct seg *seg, caddr_t addr, struct anon **app, struct cred *cred) 2554 {
2555 struct anon *ap; 2556 page_t *pp; 2557 struct vnode *vp; 2558 anoff_t off; 2559 page_t *anon_pl[1 + 1]; 2560 int err; 2561
2562 /* Kernel probe */ 2563 TNF_PROBE_1(anon_zero, "vm pagefault", /* CSTYLED */, 2564 tnf_opaque, address, addr); 2565
2566 *app = ap = anon_alloc(NULL, 0); 2567 swap_xlate(ap, &vp, &off); 2568
2569 /* 2570 * Call the VOP_GETPAGE routine to create the page, thereby 2571 * enabling the vnode driver to allocate any filesystem 2572 * dependent structures (e.g., disk block allocation for UFS). 2573 * This also prevents more than one page from being added to 2574 * the vnode at the same time since it is locked.
2575 */ 2576 err = VOP_GETPAGE(vp, off, PAGESIZE, NULL, 2577 anon_pl, PAGESIZE, seg, addr, S_CREATE, cred);
2578 if (err) { 2579 *app = NULL; 2580 anon_decref(ap); 2581 return (NULL); 2582 } 2583 pp = anon_pl[0]; 2584
2585 pagezero(pp, 0, PAGESIZE); /* XXX - should set mod bit */ 2586 page_downgrade(pp); 2587 CPU_STATS_ADD_K(vm, zfod, 1); 2588 hat_setrefmod(pp); /* mark as modified so pageout writes back */ 2589 return (pp); 2590 } 2591 2592
2593 /* 2594 * Allocate array of private zero-filled anon pages for empty slots 2595 * and kept pages for non-empty slots within given range. 2596 * 2597 * NOTE: This routine will try and use large pages 2598 * if available and supported by the underlying platform. 2599 */
2600 int 2601 anon_map_createpages( 2602 struct anon_map *amp, 2603 ulong_t start_index, 2604 size_t len, 2605 page_t *ppa[], 2606 struct seg *seg, 2607 caddr_t addr, 2608 enum seg_rw rw, 2609 struct cred *cred) 2610 { 2611
2612 struct anon *ap; 2613 struct vnode *ap_vp; 2614 page_t *pp, *pplist, *anon_pl[1 + 1], *conpp = NULL; 2615 int err = 0; 2616 ulong_t p_index, index; 2617 pgcnt_t npgs, pg_cnt; 2618 spgcnt_t nreloc = 0; 2619 uint_t l_szc, szc, prot; 2620 anoff_t ap_off; 2621 size_t pgsz; 2622 lgrp_t *lgrp; 2623
2624 /* 2625 * XXX For now only handle S_CREATE. 2626 */ 2627 ASSERT(rw == S_CREATE); 2628
2629 index = start_index; 2630 p_index = 0; 2631 npgs = btopr(len); 2632
2633 /* 2634 * If this platform supports multiple page sizes 2635 * then try and allocate directly from the free 2636 * list for pages larger than PAGESIZE. 2637 * 2638 * NOTE: When we have page_create_ru we can stop 2639 * directly allocating from the freelist. 2640 */
2641 l_szc = seg->s_szc; 2642 ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER); 2643 while (npgs) { 2644
2645 /* 2646 * If the anon slot already exists 2647 * (meaning the page has been created), 2648 * then 1) look up the page, 2649 * 2) if the page is still in memory, get it, 2650 * 3) if not, create a page and 2651 * page it in from the physical swap device. 2652 * These are done in anon_getpage(). 2653 */
2654 ap = anon_get_ptr(amp->ahp, index); 2655 if (ap) { 2656 err = anon_getpage(&ap, &prot, anon_pl, PAGESIZE, 2657 seg, addr, S_READ, cred);
2658 if (err) { 2659 ANON_LOCK_EXIT(&amp->a_rwlock); 2660 panic("anon_map_createpages: anon_getpage"); 2661 }
2662 pp = anon_pl[0]; 2663 ppa[p_index++] = pp; 2664 2665 addr += PAGESIZE; 2666 index++; 2667 npgs--; 2668 continue; 2669 }
2670 /* 2671 * Now try and allocate the largest page possible 2672 * for the current address and range. 2673 * Keep dropping down in page size until: 2674 * 2675 * 1) Properly aligned 2676 * 2) Does not overlap existing anon pages 2677 * 3) Fits in remaining range. 2678 * 4) Able to allocate one. 2679 * 2680 * NOTE: XXX When page_create_ru is completed this code 2681 * will change. 2682 */
2683 szc = l_szc; 2684 pplist = NULL; 2685 pg_cnt = 0; 2686 while (szc) { 2687 pgsz = page_get_pagesize(szc); 2688 pg_cnt = pgsz >> PAGESHIFT;
2689 if (IS_P2ALIGNED(addr, pgsz) && pg_cnt <= npgs && 2690 anon_pages(amp->ahp, index, pg_cnt) == 0) {
2691 /* 2692 * XXX 2693 * Since we are faking page_create() 2694 * we also need to do the freemem and 2695 * pcf accounting.
2696 */ 2697 (void) page_create_wait(pg_cnt, PG_WAIT); 2698
2699 /* 2700 * Get lgroup to allocate next page of shared 2701 * memory from and use it to specify where to 2702 * allocate the physical memory. 2703 */ 2704 lgrp = lgrp_mem_choose(seg, addr, pgsz); 2705
2706 pplist = page_get_freelist( 2707 (struct vnode *)NULL, (u_offset_t)0, seg, 2708 addr, pgsz, 0, lgrp); 2709
2710 if (pplist == NULL) { 2711 page_create_putback(pg_cnt); 2712 } 2713
2714 /* 2715 * If a request for a page of size 2716 * larger than PAGESIZE failed 2717 * then don't try that size anymore. 2718 */
2719 if (pplist == NULL) { 2720 l_szc = szc - 1; 2721 } else { 2722 break; 2723 } 2724 } 2725 szc--; 2726 } 2727
2728 /* 2729 * If just using PAGESIZE pages then don't 2730 * directly allocate from the free list. 2731 */
2732 if (pplist == NULL) { 2733 ASSERT(szc == 0); 2734 pp = anon_zero(seg, addr, &ap, cred);
2735 if (pp == NULL) { 2736 ANON_LOCK_EXIT(&amp->a_rwlock); 2737 panic("anon_map_createpages: anon_zero"); 2738 }
2739 ppa[p_index++] = pp; 2740 2741 ASSERT(anon_get_ptr(amp->ahp, index) == NULL); 2742 (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP); 2743
2744 addr += PAGESIZE; 2745 index++; 2746 npgs--; 2747 continue; 2748 } 2749
2750 /* 2751 * pplist is a list of pg_cnt PAGESIZE pages. 2752 * These pages are locked SE_EXCL since they 2753 * came directly off the free list. 2754 */
2755 ASSERT(IS_P2ALIGNED(pg_cnt, pg_cnt)); 2756 ASSERT(IS_P2ALIGNED(index, pg_cnt)); 2757 ASSERT(conpp == NULL); 2758 while (pg_cnt--) { 2759
2760 ap = anon_alloc(NULL, 0); 2761 swap_xlate(ap, &ap_vp, &ap_off); 2762
2763 ASSERT(pplist != NULL); 2764 pp = pplist; 2765 page_sub(&pplist, pp); 2766 PP_CLRFREE(pp); 2767 PP_CLRAGED(pp); 2768 conpp = pp; 2769
2770 err = swap_getconpage(ap_vp, ap_off, PAGESIZE, 2771 (uint_t *)NULL, anon_pl, PAGESIZE, conpp, &nreloc, 2772 seg, addr, S_CREATE, cred); 2773
2774 if (err) { 2775 ANON_LOCK_EXIT(&amp->a_rwlock); 2776 panic("anon_map_createpages: S_CREATE"); 2777 } 2778
2779 ASSERT(anon_pl[0] == pp); 2780 ASSERT(nreloc == 1); 2781 pagezero(pp, 0, PAGESIZE); 2782 CPU_STATS_ADD_K(vm, zfod, 1); 2783 hat_setrefmod(pp); 2784
2785 ASSERT(anon_get_ptr(amp->ahp, index) == NULL); 2786 (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP); 2787
2788 ppa[p_index++] = pp; 2789 2790 addr += PAGESIZE; 2791 index++; 2792 npgs--; 2793 }
2794 conpp = NULL; 2795 pg_cnt = pgsz >> PAGESHIFT; 2796 p_index = p_index - pg_cnt; 2797 while (pg_cnt--) { 2798 page_downgrade(ppa[p_index++]); 2799 } 2800 }
2801 ANON_LOCK_EXIT(&amp->a_rwlock); 2802 return (0); 2803 } 2804
2805 int 2806 anon_map_demotepages( 2807 struct anon_map *amp, 2808 ulong_t start_idx, 2809 struct seg *seg, 2810 caddr_t addr, 2811 uint_t prot, 2812 struct vpage vpage[], 2813 struct cred *cred) 2814 {
2815 struct anon *ap; 2816 uint_t szc = seg->s_szc; 2817 pgcnt_t pgcnt = page_get_pagecnt(szc); 2818 size_t ppasize = pgcnt * sizeof (page_t *); 2819 page_t **ppa = kmem_alloc(ppasize, KM_SLEEP); 2820 page_t *pp; 2821 page_t *pl[2]; 2822 pgcnt_t i, pg_idx; 2823 ulong_t an_idx; 2824 caddr_t vaddr; 2825 kmutex_t *ahmpages = NULL; 2826 int err; 2827 int retry = 0; 2828 uint_t vpprot; 2829
2830 ASSERT(RW_WRITE_HELD(&amp->a_rwlock)); 2831 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 2832 ASSERT(IS_P2ALIGNED(start_idx, pgcnt)); 2833 ASSERT(ppa != NULL); 2834
2835 VM_STAT_ADD(anonvmstats.demotepages[0]); 2836
2837 ap = anon_get_ptr(amp->ahp, start_idx); 2838 if (ap != NULL) { 2839 VM_STAT_ADD(anonvmstats.demotepages[1]); 2840 ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp,
ap->an_off)]; 2841 mutex_enter(ahmpages); 2842 } 2843 top: 2844 if (ap == NULL || ap->an_refcnt <= 1) { 2845 int root = 0; 2846 pgcnt_t npgs, curnpgs = 0; 2847 2848 VM_STAT_ADD(anonvmstats.demotepages[2]); 2849 2850 ASSERT(retry == 0 || ap != NULL); 2851 2852 if (ahmpages != NULL) 2853 mutex_exit(ahmpages); 2854 an_idx = start_idx; 2855 for (i = 0; i < pgcnt; i++, an_idx++) { 2856 ap = anon_get_ptr(amp->ahp, an_idx); 2857 if (ap != NULL) { 2858 ASSERT(ap->an_refcnt == 1); 2859 pp = ppa[i] = page_lookup(ap->an_vp, ap->an_off, 2860 SE_EXCL); 2861 if (pp != NULL) { 2862 (void) hat_pageunload(pp, 2863 HAT_FORCE_PGUNLOAD); 2864 } 2865 } else { 2866 ppa[i] = NULL; 2867 } 2868 } 2869 for (i = 0; i < pgcnt; i++) { 2870 if ((pp = ppa[i]) != NULL && pp->p_szc != 0) { 2871 ASSERT(pp->p_szc <= szc); 2872 if (!root) { 2873 VM_STAT_ADD(anonvmstats.demotepages[3]); 2874 if (curnpgs != 0) 2875 panic("anon_map_demotepages: " 2876 "bad large page"); 2877 2878 root = 1; 2879 curnpgs = npgs = 2880 page_get_pagecnt(pp->p_szc); 2881 2882 ASSERT(npgs <= pgcnt); 2883 ASSERT(IS_P2ALIGNED(npgs, npgs)); 2884 ASSERT(!(page_pptonum(pp) & 2885 (npgs - 1))); 2886 } else { 2887 ASSERT(i > 0); 2888 ASSERT(page_pptonum(pp) - 1 == 2889 page_pptonum(ppa[i - 1])); 2890 if ((page_pptonum(pp) & (npgs - 1)) == 2891 npgs - 1) 2892 root = 0; 2893 } 2894 ASSERT(PAGE_EXCL(pp)); 2895 pp->p_szc = 0; 2896 curnpgs--; 2897 } 2898 } 2899 if (root != 0 || curnpgs != 0) 2900 panic("anon_map_demotepages: bad large page"); 2901 2902 for (i = 0; i < pgcnt; i++) { 2903 if ((pp = ppa[i]) != NULL) { 2904 ASSERT(!hat_page_is_mapped(pp)); 2905 ASSERT(pp->p_szc == 0); 2906 page_unlock(pp); 2907 } 2908 } 2909 kmem_free(ppa, ppasize); 2910 return (0); 2911 } 2912 ASSERT(ahmpages != NULL); 2913 mutex_exit(ahmpages); 2914 ahmpages = NULL; 2915 2916 VM_STAT_ADD(anonvmstats.demotepages[4]); 2917 2918 ASSERT(retry == 0); /* we can be here only once */ 2919 2920 vaddr = addr; 2921 for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt; 2922 pg_idx++, an_idx++, vaddr += PAGESIZE) { 2923 ap = anon_get_ptr(amp->ahp, an_idx); 2924 if (ap == NULL) 2925 panic("anon_map_demotepages: no anon slot"); 2926 err = anon_getpage(&ap, &vpprot, pl, PAGESIZE, seg, vaddr, 2927 S_READ, cred); 2928 if (err) { 2929 for (i = 0; i < pg_idx; i++) { 2930 if ((pp = ppa[i]) != NULL) 2931 page_unlock(pp); 2932 } 2933 kmem_free(ppa, ppasize); 2934 return (err); 2935 } 2936 ppa[pg_idx] = pl[0]; 2937 } 2938 2939 err = anon_map_privatepages(amp, start_idx, szc, seg, addr, prot, ppa, 2940 vpage, -1, cred); 2941 if (err > 0) { 2942 VM_STAT_ADD(anonvmstats.demotepages[5]); 2943 kmem_free(ppa, ppasize); 2944 return (err); 2945 } 2946 ASSERT(err == 0 || err == -1); 2947 if (err == -1) { 2948 VM_STAT_ADD(anonvmstats.demotepages[6]); 2949 retry = 1; 2950 goto top; 2951 } 2952 for (i = 0; i < pgcnt; i++) { 2953 ASSERT(ppa[i] != NULL); 2954 if (ppa[i]->p_szc != 0) 2955 retry = 1; 2956 page_unlock(ppa[i]); 2957 } 2958 if (retry) { 2959 VM_STAT_ADD(anonvmstats.demotepages[7]); 2960 goto top; 2961 } 2962 2963 VM_STAT_ADD(anonvmstats.demotepages[8]); 2964 2965 kmem_free(ppa, ppasize); 2966 2967 return (0); 2968 } 2969 2970 /* 2971 * Allocate and initialize an anon_map structure for seg 2972 * associating the given swap reservation with the new anon_map. 
2973 */ 2974 struct anon_map * 2975 anonmap_alloc(size_t size, size_t swresv) 2976 { 2977 struct anon_map *amp; 2978
2979 amp = kmem_cache_alloc(anonmap_cache, KM_SLEEP); 2980 2981 amp->refcnt = 1; 2982 amp->size = size; 2983
2984 amp->ahp = anon_create(btopr(size), ANON_SLEEP); 2985 amp->swresv = swresv; 2986 amp->locality = 0; 2987 amp->a_szc = 0; 2988 return (amp); 2989 } 2990
2991 void 2992 anonmap_free(struct anon_map *amp) 2993 { 2994 ASSERT(amp->ahp); 2995 ASSERT(amp->refcnt == 0); 2996
2997 lgrp_shm_policy_fini(amp, NULL); 2998 anon_release(amp->ahp, btopr(amp->size)); 2999 kmem_cache_free(anonmap_cache, amp); 3000 } 3001
3002 /* 3003 * Returns true if the ahp array has some empty slots. 3004 * The offp and lenp parameters are in/out parameters. On entry 3005 * these values represent the starting offset and length of the 3006 * mapping. When true is returned, these values may be modified 3007 * to be the largest range which includes empty slots. 3008 */
3009 int 3010 non_anon(struct anon_hdr *ahp, ulong_t anon_idx, u_offset_t *offp, 3011 size_t *lenp) 3012 { 3013 ulong_t i, el; 3014 ssize_t low, high; 3015 struct anon *ap; 3016
3017 low = -1; 3018 for (i = 0, el = *lenp; i < el; i += PAGESIZE, anon_idx++) { 3019 ap = anon_get_ptr(ahp, anon_idx);
3020 if (ap == NULL) { 3021 if (low == -1) 3022 low = i; 3023 high = i; 3024 } 3025 }
3026 if (low != -1) { 3027 /* 3028 * Found at least one non-anon page. 3029 * Set up the off and len return values. 3030 */
3031 if (low != 0) 3032 *offp += low; 3033 *lenp = high - low + PAGESIZE; 3034 return (1); 3035 } 3036 return (0); 3037 } 3038
3039 /* 3040 * Return a count of the number of existing anon pages in the anon array 3041 * ahp in the range (anon_index, anon_index + nslots). The array and slots 3042 * must be guaranteed stable by the caller. 3043 */
3044 pgcnt_t 3045 anon_pages(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots) 3046 { 3047 pgcnt_t cnt = 0; 3048
3049 while (nslots-- > 0) { 3050 if ((anon_get_ptr(ahp, anon_index)) != NULL) 3051 cnt++; 3052 anon_index++; 3053 } 3054 return (cnt); 3055 } 3056
3057 /* 3058 * Move reserved phys swap into memory swap (unreserve phys swap 3059 * and reserve mem swap by the same amount).
3060 * Used by segspt when it needs to lock reserved swap npages in memory 3061 */
3062 int 3063 anon_swap_adjust(pgcnt_t npages) 3064 { 3065 pgcnt_t unlocked_mem_swap; 3066
3067 mutex_enter(&anoninfo_lock); 3068
3069 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 3070 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 3071
3072 unlocked_mem_swap = k_anoninfo.ani_mem_resv 3073 - k_anoninfo.ani_locked_swap;
3074 if (npages > unlocked_mem_swap) { 3075 spgcnt_t adjusted_swap = npages - unlocked_mem_swap; 3076
3077 /* 3078 * If there is not enough unlocked mem swap, take the missing 3079 * amount from phys swap and give it to mem swap. 3080 */
3081 mutex_enter(&freemem_lock); 3082 if (availrmem < adjusted_swap + segspt_minfree) { 3083 mutex_exit(&freemem_lock); 3084 mutex_exit(&anoninfo_lock); 3085 return (ENOMEM); 3086 }
3087 availrmem -= adjusted_swap; 3088 mutex_exit(&freemem_lock); 3089
3090 k_anoninfo.ani_mem_resv += adjusted_swap; 3091 ASSERT(k_anoninfo.ani_phys_resv >= adjusted_swap); 3092 k_anoninfo.ani_phys_resv -= adjusted_swap; 3093
3094 ANI_ADD(adjusted_swap); 3095 } 3096 k_anoninfo.ani_locked_swap += npages; 3097
3098 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 3099 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 3100
3101 mutex_exit(&anoninfo_lock); 3102 3103 return (0); 3104 } 3105
3106 /* 3107 * 'Unlock' reserved mem swap so that when it is unreserved it 3108 * can be moved back to phys (disk) swap. 3109 */
3110 void 3111 anon_swap_restore(pgcnt_t npages) 3112 { 3113 mutex_enter(&anoninfo_lock); 3114
3115 ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv); 3116
3117 ASSERT(k_anoninfo.ani_locked_swap >= npages); 3118 k_anoninfo.ani_locked_swap -= npages; 3119
3120 ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv); 3121 3122 mutex_exit(&anoninfo_lock); 3123 } 3124
3125 /* 3126 * Return a pointer to the anon array slot for a 3127 * specified anon index. 3128 */
3129 ulong_t * 3130 anon_get_slot(struct anon_hdr *ahp, ulong_t an_idx) 3131 { 3132 struct anon **app; 3133 void **ppp; 3134
3135 ASSERT(an_idx < ahp->size); 3136
3137 /* 3138 * Single level case. 3139 */ 3140 if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 3141 return ((ulong_t *)&ahp->array_chunk[an_idx]); 3142 } else { 3143
3144 /* 3145 * Two level case. 3146 */ 3147 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
3148 if (*ppp == NULL) { 3149 mutex_enter(&ahp->serial_lock); 3150 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 3151 if (*ppp == NULL) 3152 *ppp = kmem_zalloc(PAGESIZE, KM_SLEEP); 3153 mutex_exit(&ahp->serial_lock); 3154 }
3155 app = *ppp; 3156 return ((ulong_t *)&app[an_idx & ANON_CHUNK_OFF]); 3157 } 3158 } 3159
3160 void 3161 anon_array_enter(struct anon_map *amp, ulong_t an_idx, anon_sync_obj_t *sobj) 3162 { 3163 ulong_t *ap_slot; 3164 kmutex_t *mtx; 3165 kcondvar_t *cv; 3166 int hash; 3167
3168 /* 3169 * Use szc to determine which anon slot(s) should appear atomic. 3170 * If szc = 0, then lock the anon slot and mark it busy. 3171 * If szc > 0, then lock the range of slots by getting the 3172 * anon_array_lock for the first anon slot, and mark only the 3173 * first anon slot busy to represent the whole range being busy.
3174 */ 3175 3176 ASSERT(RW_READ_HELD(&amp->a_rwlock)); 3177 an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc)); 3178 hash = ANON_ARRAY_HASH(amp, an_idx);
3179 sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex; 3180 sobj->sync_cv = cv = &anon_array_cv[hash]; 3181 mutex_enter(mtx); 3182 ap_slot = anon_get_slot(amp->ahp, an_idx);
3183 while (ANON_ISBUSY(ap_slot)) 3184 cv_wait(cv, mtx); 3185 ANON_SETBUSY(ap_slot); 3186 sobj->sync_data = ap_slot; 3187 mutex_exit(mtx); 3188 } 3189
3190 void 3191 anon_array_exit(anon_sync_obj_t *sobj) 3192 { 3193 mutex_enter(sobj->sync_mutex); 3194 ASSERT(ANON_ISBUSY(sobj->sync_data)); 3195 ANON_CLRBUSY(sobj->sync_data);
3196 if (CV_HAS_WAITERS(sobj->sync_cv)) 3197 cv_broadcast(sobj->sync_cv); 3198 mutex_exit(sobj->sync_mutex); 3199 } 3200
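/*
 * Example (a minimal sketch, not taken from any caller in this file):
 * how a segment driver might use anon_array_enter()/anon_array_exit()
 * to make a lookup-or-allocate of a single anon slot appear atomic.
 * The variables amp and an_idx are hypothetical; the caller is assumed
 * to already hold amp->a_rwlock as reader, which anon_array_enter()
 * asserts.
 *
 *	anon_sync_obj_t cookie;
 *	struct anon *ap;
 *
 *	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
 *	anon_array_enter(amp, an_idx, &cookie);
 *	if ((ap = anon_get_ptr(amp->ahp, an_idx)) == NULL) {
 *		ap = anon_alloc(NULL, 0);
 *		(void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP);
 *	}
 *	anon_array_exit(&cookie);
 *	ANON_LOCK_EXIT(&amp->a_rwlock);
 */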