/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * VM - anonymous pages.
 *
 * This layer sits immediately above the vm_swap layer.  It manages
 * physical pages that have no permanent identity in the file system
 * name space, using the services of the vm_swap layer to allocate
 * backing storage for these pages.  Since these pages have no external
 * identity, they are discarded when the last reference is removed.
 *
 * An important function of this layer is to manage low-level sharing
 * of pages that are logically distinct but that happen to be
 * physically identical (e.g., the corresponding pages of the processes
 * resulting from a fork before one process or the other changes their
 * contents).  This pseudo-sharing is present only as an optimization
 * and is not to be confused with true sharing in which multiple
 * address spaces deliberately contain references to the same object;
 * such sharing is managed at a higher level.
 *
 * The key data structure here is the anon struct, which contains a
 * reference count for its associated physical page and a hint about
 * the identity of that page.  Anon structs typically live in arrays,
 * with an instance's position in its array determining where the
 * corresponding backing storage is allocated; however, the swap_xlate()
 * routine abstracts away this representation information so that the
 * rest of the anon layer need not know it.  (See the swap layer for
 * more details on anon struct layout.)
 *
 * In future versions of the system, the association between an
 * anon struct and its position on backing store will change so that
 * we don't require backing store for all anonymous pages in the system.
 * This is an important consideration for large memory systems.
 * We can also use this technique to delay binding physical locations
 * to anonymous pages until pageout/swapout time, where we can make
 * smarter allocation decisions to improve anonymous klustering.
 *
 * Many of the routines defined here take a (struct anon **) argument,
 * which allows the code at this level to manage anon pages directly,
 * so that callers can regard anon structs as opaque objects and not be
 * concerned with assigning or inspecting their contents.
 *
 * Clients of this layer refer to anon pages indirectly.  That is, they
 * maintain arrays of pointers to anon structs rather than maintaining
 * anon structs themselves.  The (struct anon **) arguments mentioned
 * above are pointers to entries in these arrays.  It is these arrays
 * that capture the mapping between offsets within a given segment and
 * the corresponding anonymous backing storage address.
 */
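
/*
 * Illustrative sketch (comment only, not compiled): a client of this
 * layer keeps an array of pointers to anon structs and maps a
 * page-aligned offset within its segment to an index into that array.
 * The helper name and arguments below are hypothetical; real clients go
 * through the anon_hdr interfaces (anon_create(), anon_get_ptr(),
 * anon_set_ptr()) defined later in this file rather than indexing a
 * bare array.
 *
 *	static struct anon **
 *	anon_slot_of(struct anon **anon_array, caddr_t seg_base, caddr_t addr)
 *	{
 *		ulong_t idx = ((uintptr_t)addr - (uintptr_t)seg_base) >>
 *		    PAGESHIFT;
 *
 *		return (&anon_array[idx]);
 *	}
 */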

#ifdef DEBUG
#define	ANON_DEBUG
#endif

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/cred.h>
#include <sys/thread.h>
#include <sys/vnode.h>
#include <sys/cpuvar.h>
#include <sys/swap.h>
#include <sys/cmn_err.h>
#include <sys/vtrace.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/bitmap.h>
#include <sys/vmsystm.h>
#include <sys/debug.h>
#include <sys/tnf_probe.h>
#include <sys/lgrp.h>
#include <sys/policy.h>
#include <sys/condvar_impl.h>
#include <sys/mutex_impl.h>

#include <vm/as.h>
#include <vm/hat.h>
#include <vm/anon.h>
#include <vm/page.h>
#include <vm/vpage.h>
#include <vm/seg.h>
#include <vm/rm.h>

#include <fs/fs_subr.h>

int anon_debug;

kmutex_t	anoninfo_lock;
struct		k_anoninfo k_anoninfo;
ani_free_t	ani_free_pool[ANI_MAX_POOL];
pad_mutex_t	anon_array_lock[ANON_LOCKSIZE];
kcondvar_t	anon_array_cv[ANON_LOCKSIZE];

/*
 * Global hash table for (vp, off) -> anon slot
 */
extern	int swap_maxcontig;
size_t	anon_hash_size;
struct anon **anon_hash;

static struct kmem_cache *anon_cache;
static struct kmem_cache *anonmap_cache;

#ifdef VM_STATS
static struct anonvmstats_str {
	ulong_t getpages[30];
	ulong_t privatepages[10];
	ulong_t demotepages[9];
	ulong_t decrefpages[9];
	ulong_t dupfillholes[4];
	ulong_t freepages[1];
} anonvmstats;
#endif /* VM_STATS */


/*ARGSUSED*/
static int
anonmap_cache_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct anon_map *amp = buf;

	rw_init(&amp->a_rwlock, NULL, RW_DEFAULT, NULL);
	return (0);
}

/*ARGSUSED1*/
static void
anonmap_cache_destructor(void *buf, void *cdrarg)
{
	struct anon_map *amp = buf;

	rw_destroy(&amp->a_rwlock);
}

kmutex_t	anonhash_lock[AH_LOCK_SIZE];
kmutex_t	anonpages_hash_lock[AH_LOCK_SIZE];

void
anon_init(void)
{
	int i;

	anon_hash_size = 1L << highbit(physmem / ANON_HASHAVELEN);

	for (i = 0; i < AH_LOCK_SIZE; i++) {
		mutex_init(&anonhash_lock[i], NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&anonpages_hash_lock[i], NULL,
		    MUTEX_DEFAULT, NULL);
	}

	for (i = 0; i < ANON_LOCKSIZE; i++) {
		mutex_init(&anon_array_lock[i].pad_mutex, NULL,
		    MUTEX_DEFAULT, NULL);
		cv_init(&anon_array_cv[i], NULL, CV_DEFAULT, NULL);
	}

	anon_hash = (struct anon **)
	    kmem_zalloc(sizeof (struct anon *) * anon_hash_size, KM_SLEEP);
	anon_cache = kmem_cache_create("anon_cache", sizeof (struct anon),
	    AN_CACHE_ALIGN, NULL, NULL, NULL, NULL, NULL, 0);
	anonmap_cache = kmem_cache_create("anonmap_cache",
	    sizeof (struct anon_map), 0,
	    anonmap_cache_constructor, anonmap_cache_destructor, NULL,
	    NULL, NULL, 0);
	swap_maxcontig = (1024 * 1024) >> PAGESHIFT;	/* 1MB of pages */
}
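
/*
 * Index arithmetic used by the anon_hdr routines below, shown as an
 * illustrative sketch (comment only, not compiled); the local names
 * chunk, slot and ap are hypothetical.  An array of at most
 * ANON_CHUNK_SIZE entries, or one created with ANON_ALLOC_FORCE, is a
 * single flat chunk indexed directly by an_idx.  Larger arrays are two
 * levels deep: the top level holds pointers to PAGESIZE chunks, and an
 * index splits into a chunk number and a slot within that chunk:
 *
 *	chunk = an_idx >> ANON_CHUNK_SHIFT;
 *	slot = an_idx & ANON_CHUNK_OFF;
 *	ap = (struct anon *)((uintptr_t)
 *	    ((struct anon **)ahp->array_chunk[chunk])[slot] & ANON_PTRMASK);
 *
 * Entries carry flag bits in their low-order bits, so lookups mask with
 * ANON_PTRMASK before using the pointer (see anon_get_ptr() below).
 */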

/*
 * Global anon slot hash table manipulation.
 */

static void
anon_addhash(struct anon *ap)
{
	int index;

	ASSERT(MUTEX_HELD(&anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]));
	index = ANON_HASH(ap->an_vp, ap->an_off);
	ap->an_hash = anon_hash[index];
	anon_hash[index] = ap;
}

static void
anon_rmhash(struct anon *ap)
{
	struct anon **app;

	ASSERT(MUTEX_HELD(&anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]));

	for (app = &anon_hash[ANON_HASH(ap->an_vp, ap->an_off)];
	    *app; app = &((*app)->an_hash)) {
		if (*app == ap) {
			*app = ap->an_hash;
			break;
		}
	}
}

/*
 * The anon array interfaces. Functions allocating,
 * freeing array of pointers, and returning/setting
 * entries in the array of pointers for a given offset.
 *
 * Create the list of pointers
 */
struct anon_hdr *
anon_create(pgcnt_t npages, int flags)
{
	struct anon_hdr *ahp;
	ulong_t nchunks;
	int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;

	if ((ahp = kmem_zalloc(sizeof (struct anon_hdr), kmemflags)) == NULL) {
		return (NULL);
	}

	mutex_init(&ahp->serial_lock, NULL, MUTEX_DEFAULT, NULL);
	/*
	 * Single level case.
	 */
	ahp->size = npages;
	if (npages <= ANON_CHUNK_SIZE || (flags & ANON_ALLOC_FORCE)) {

		if (flags & ANON_ALLOC_FORCE)
			ahp->flags |= ANON_ALLOC_FORCE;

		ahp->array_chunk = kmem_zalloc(
		    ahp->size * sizeof (struct anon *), kmemflags);

		if (ahp->array_chunk == NULL) {
			kmem_free(ahp, sizeof (struct anon_hdr));
			return (NULL);
		}
	} else {
		/*
		 * 2 Level case.
		 */
		nchunks = (ahp->size + ANON_CHUNK_OFF) >> ANON_CHUNK_SHIFT;

		ahp->array_chunk = kmem_zalloc(nchunks * sizeof (ulong_t *),
		    kmemflags);

		if (ahp->array_chunk == NULL) {
			kmem_free(ahp, sizeof (struct anon_hdr));
			return (NULL);
		}
	}
	return (ahp);
}

/*
 * Free the array of pointers
 */
void
anon_release(struct anon_hdr *ahp, pgcnt_t npages)
{
	ulong_t i;
	void **ppp;
	ulong_t nchunks;

	ASSERT(npages == ahp->size);

	/*
	 * Single level case.
	 */
	if (npages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) {
		kmem_free(ahp->array_chunk, ahp->size * sizeof (struct anon *));
	} else {
		/*
		 * 2 level case.
		 */
		nchunks = (ahp->size + ANON_CHUNK_OFF) >> ANON_CHUNK_SHIFT;
		for (i = 0; i < nchunks; i++) {
			ppp = &ahp->array_chunk[i];
			if (*ppp != NULL)
				kmem_free(*ppp, PAGESIZE);
		}
		kmem_free(ahp->array_chunk, nchunks * sizeof (ulong_t *));
	}
	mutex_destroy(&ahp->serial_lock);
	kmem_free(ahp, sizeof (struct anon_hdr));
}

/*
 * Return the pointer from the list for a
 * specified anon index.
 */
struct anon *
anon_get_ptr(struct anon_hdr *ahp, ulong_t an_idx)
{
	struct anon **app;

	ASSERT(an_idx < ahp->size);

	/*
	 * Single level case.
	 */
	if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) {
		return ((struct anon *)
		    ((uintptr_t)ahp->array_chunk[an_idx] & ANON_PTRMASK));
	} else {

		/*
		 * 2 level case.
345 */ 346 app = ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 347 if (app) { 348 return ((struct anon *) 349 ((uintptr_t)app[an_idx & ANON_CHUNK_OFF] & 350 ANON_PTRMASK)); 351 } else { 352 return (NULL); 353 } 354 } 355 } 356 357 /* 358 * Return the anon pointer for the first valid entry in the anon list, 359 * starting from the given index. 360 */ 361 struct anon * 362 anon_get_next_ptr(struct anon_hdr *ahp, ulong_t *index) 363 { 364 struct anon *ap; 365 struct anon **app; 366 ulong_t chunkoff; 367 ulong_t i; 368 ulong_t j; 369 pgcnt_t size; 370 371 i = *index; 372 size = ahp->size; 373 374 ASSERT(i < size); 375 376 if ((size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 377 /* 378 * 1 level case 379 */ 380 while (i < size) { 381 ap = (struct anon *) 382 ((uintptr_t)ahp->array_chunk[i] & ANON_PTRMASK); 383 if (ap) { 384 *index = i; 385 return (ap); 386 } 387 i++; 388 } 389 } else { 390 /* 391 * 2 level case 392 */ 393 chunkoff = i & ANON_CHUNK_OFF; 394 while (i < size) { 395 app = ahp->array_chunk[i >> ANON_CHUNK_SHIFT]; 396 if (app) 397 for (j = chunkoff; j < ANON_CHUNK_SIZE; j++) { 398 ap = (struct anon *) 399 ((uintptr_t)app[j] & 400 ANON_PTRMASK); 401 if (ap) { 402 *index = i + (j - chunkoff); 403 return (ap); 404 } 405 } 406 chunkoff = 0; 407 i = (i + ANON_CHUNK_SIZE) & ~ANON_CHUNK_OFF; 408 } 409 } 410 *index = size; 411 return (NULL); 412 } 413 414 /* 415 * Set list entry with a given pointer for a specified offset 416 */ 417 int 418 anon_set_ptr(struct anon_hdr *ahp, ulong_t an_idx, struct anon *ap, int flags) 419 { 420 void **ppp; 421 struct anon **app; 422 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 423 uintptr_t *ap_addr; 424 425 ASSERT(an_idx < ahp->size); 426 427 /* 428 * Single level case. 429 */ 430 if (ahp->size <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 431 ap_addr = (uintptr_t *)&ahp->array_chunk[an_idx]; 432 } else { 433 434 /* 435 * 2 level case. 436 */ 437 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 438 439 ASSERT(ppp != NULL); 440 if (*ppp == NULL) { 441 mutex_enter(&ahp->serial_lock); 442 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 443 if (*ppp == NULL) { 444 *ppp = kmem_zalloc(PAGESIZE, kmemflags); 445 if (*ppp == NULL) { 446 mutex_exit(&ahp->serial_lock); 447 return (ENOMEM); 448 } 449 } 450 mutex_exit(&ahp->serial_lock); 451 } 452 app = *ppp; 453 ap_addr = (uintptr_t *)&app[an_idx & ANON_CHUNK_OFF]; 454 } 455 *ap_addr = (*ap_addr & ~ANON_PTRMASK) | (uintptr_t)ap; 456 return (0); 457 } 458 459 /* 460 * Copy anon array into a given new anon array 461 */ 462 int 463 anon_copy_ptr(struct anon_hdr *sahp, ulong_t s_idx, 464 struct anon_hdr *dahp, ulong_t d_idx, 465 pgcnt_t npages, int flags) 466 { 467 void **sapp, **dapp; 468 void *ap; 469 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 470 471 ASSERT((s_idx < sahp->size) && (d_idx < dahp->size)); 472 ASSERT((npages <= sahp->size) && (npages <= dahp->size)); 473 474 /* 475 * Both arrays are 1 level. 476 */ 477 if (((sahp->size <= ANON_CHUNK_SIZE) && 478 (dahp->size <= ANON_CHUNK_SIZE)) || 479 ((sahp->flags & ANON_ALLOC_FORCE) && 480 (dahp->flags & ANON_ALLOC_FORCE))) { 481 482 bcopy(&sahp->array_chunk[s_idx], &dahp->array_chunk[d_idx], 483 npages * sizeof (struct anon *)); 484 return (0); 485 } 486 487 /* 488 * Both arrays are 2 levels. 
489 */ 490 if (sahp->size > ANON_CHUNK_SIZE && 491 dahp->size > ANON_CHUNK_SIZE && 492 ((sahp->flags & ANON_ALLOC_FORCE) == 0) && 493 ((dahp->flags & ANON_ALLOC_FORCE) == 0)) { 494 495 ulong_t sapidx, dapidx; 496 ulong_t *sap, *dap; 497 ulong_t chknp; 498 499 while (npages != 0) { 500 501 sapidx = s_idx & ANON_CHUNK_OFF; 502 dapidx = d_idx & ANON_CHUNK_OFF; 503 chknp = ANON_CHUNK_SIZE - MAX(sapidx, dapidx); 504 if (chknp > npages) 505 chknp = npages; 506 507 sapp = &sahp->array_chunk[s_idx >> ANON_CHUNK_SHIFT]; 508 if ((sap = *sapp) != NULL) { 509 dapp = &dahp->array_chunk[d_idx 510 >> ANON_CHUNK_SHIFT]; 511 if ((dap = *dapp) == NULL) { 512 *dapp = kmem_zalloc(PAGESIZE, 513 kmemflags); 514 if ((dap = *dapp) == NULL) 515 return (ENOMEM); 516 } 517 bcopy((sap + sapidx), (dap + dapidx), 518 chknp << ANON_PTRSHIFT); 519 } 520 s_idx += chknp; 521 d_idx += chknp; 522 npages -= chknp; 523 } 524 return (0); 525 } 526 527 /* 528 * At least one of the arrays is 2 level. 529 */ 530 while (npages--) { 531 if ((ap = anon_get_ptr(sahp, s_idx)) != NULL) { 532 ASSERT(!ANON_ISBUSY(anon_get_slot(sahp, s_idx))); 533 if (anon_set_ptr(dahp, d_idx, ap, flags) == ENOMEM) 534 return (ENOMEM); 535 } 536 s_idx++; 537 d_idx++; 538 } 539 return (0); 540 } 541 542 543 /* 544 * ANON_INITBUF is a convenience macro for anon_grow() below. It 545 * takes a buffer dst, which is at least as large as buffer src. It 546 * does a bcopy from src into dst, and then bzeros the extra bytes 547 * of dst. If tail is set, the data in src is tail aligned within 548 * dst instead of head aligned. 549 */ 550 551 #define ANON_INITBUF(src, srclen, dst, dstsize, tail) \ 552 if (tail) { \ 553 bzero((dst), (dstsize) - (srclen)); \ 554 bcopy((src), (char *)(dst) + (dstsize) - (srclen), (srclen)); \ 555 } else { \ 556 bcopy((src), (dst), (srclen)); \ 557 bzero((char *)(dst) + (srclen), (dstsize) - (srclen)); \ 558 } 559 560 #define ANON_1_LEVEL_INC (ANON_CHUNK_SIZE / 8) 561 #define ANON_2_LEVEL_INC (ANON_1_LEVEL_INC * ANON_CHUNK_SIZE) 562 563 /* 564 * anon_grow() is used to efficiently extend an existing anon array. 565 * startidx_p points to the index into the anon array of the first page 566 * that is in use. curpages is the number of pages in use, starting at 567 * *startidx_p. newpages is the number of additional pages desired. 568 * 569 * If startidx_p == NULL, startidx is taken to be 0 and cannot be changed. 570 * 571 * The growth is done by creating a new top level of the anon array, 572 * and (if the array is 2-level) reusing the existing second level arrays. 573 * 574 * flags can be used to specify ANON_NOSLEEP and ANON_GROWDOWN. 575 * 576 * Returns the new number of pages in the anon array. 577 */ 578 579 pgcnt_t 580 anon_grow(struct anon_hdr *ahp, ulong_t *startidx_p, pgcnt_t curpages, 581 pgcnt_t newpages, int flags) 582 { 583 ulong_t startidx = startidx_p ? *startidx_p : 0; 584 pgcnt_t osz = ahp->size, nsz; 585 pgcnt_t oelems, nelems, totpages; 586 void **level1; 587 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 588 int growdown = (flags & ANON_GROWDOWN); 589 size_t newarrsz, oldarrsz; 590 void *level2; 591 592 ASSERT(!(startidx_p == NULL && growdown)); 593 ASSERT(startidx + curpages <= ahp->size); 594 595 /* 596 * Determine the total number of pages needed in the new 597 * anon array. If growing down, totpages is all pages from 598 * startidx through the end of the array, plus <newpages> 599 * pages. If growing up, keep all pages from page 0 through 600 * the last page currently in use, plus <newpages> pages. 
601 */ 602 603 if (growdown) 604 totpages = osz - startidx + newpages; 605 else 606 totpages = startidx + curpages + newpages; 607 608 /* If the array is already large enough, just return. */ 609 610 if (osz >= totpages) { 611 nsz = osz; 612 goto out; 613 } 614 615 /* 616 * osz/nsz are the total numbers of pages represented by the array. 617 * oelems/nelems are the number of pointers in the top level array. 618 * 619 * Will the new anon array be one level or two levels? 620 */ 621 622 if (totpages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 623 nsz = P2ROUNDUP(totpages, ANON_1_LEVEL_INC); 624 oelems = osz; 625 nelems = nsz; 626 } else { 627 nsz = P2ROUNDUP(totpages, ANON_2_LEVEL_INC); 628 oelems = (osz + ANON_CHUNK_OFF) >> ANON_CHUNK_SHIFT; 629 nelems = nsz >> ANON_CHUNK_SHIFT; 630 } 631 632 newarrsz = nelems * sizeof (void *); 633 level1 = kmem_alloc(newarrsz, kmemflags); 634 if (level1 == NULL) 635 return (0); 636 637 /* Are we converting from a one level to a two level anon array? */ 638 639 if (nsz > ANON_CHUNK_SIZE && osz <= ANON_CHUNK_SIZE && 640 !(ahp->flags & ANON_ALLOC_FORCE)) { 641 /* 642 * Yes, we're converting to a two level. Reuse old level 1 643 * as new level 2 if it is exactly PAGESIZE. Otherwise 644 * alloc a new level 2 and copy the old level 1 data into it. 645 */ 646 647 if (osz == ANON_CHUNK_SIZE) { 648 level2 = (void *)ahp->array_chunk; 649 } else { 650 level2 = kmem_alloc(PAGESIZE, kmemflags); 651 if (level2 == NULL) { 652 kmem_free(level1, newarrsz); 653 return (0); 654 } 655 oldarrsz = osz * sizeof (void *); 656 657 ANON_INITBUF(ahp->array_chunk, oldarrsz, 658 level2, PAGESIZE, growdown); 659 kmem_free(ahp->array_chunk, oldarrsz); 660 } 661 bzero(level1, newarrsz); 662 if (growdown) 663 level1[nelems - 1] = level2; 664 else 665 level1[0] = level2; 666 } else { 667 oldarrsz = oelems * sizeof (void *); 668 669 ANON_INITBUF(ahp->array_chunk, oldarrsz, 670 level1, newarrsz, growdown); 671 kmem_free(ahp->array_chunk, oldarrsz); 672 } 673 674 ahp->array_chunk = level1; 675 ahp->size = nsz; 676 out: 677 if (growdown) 678 *startidx_p = nsz - totpages; 679 return (nsz); 680 } 681 682 /* 683 * Called from clock handler to sync ani_free value. 684 */ 685 686 void 687 set_anoninfo(void) 688 { 689 int ix; 690 pgcnt_t total = 0; 691 692 for (ix = 0; ix < ANI_MAX_POOL; ix++) { 693 total += ani_free_pool[ix].ani_count; 694 } 695 k_anoninfo.ani_free = total; 696 } 697 698 /* 699 * Reserve anon space. 700 * 701 * It's no longer simply a matter of incrementing ani_resv to 702 * reserve swap space, we need to check memory-based as well 703 * as disk-backed (physical) swap. The following algorithm 704 * is used: 705 * Check the space on physical swap 706 * i.e. amount needed < ani_max - ani_phys_resv 707 * If we are swapping on swapfs check 708 * amount needed < (availrmem - swapfs_minfree) 709 * Since the algorithm to check for the quantity of swap space is 710 * almost the same as that for reserving it, we'll just use anon_resvmem 711 * with a flag to decrement availrmem. 712 * 713 * Return non-zero on success. 714 */ 715 int 716 anon_resvmem(size_t size, uint_t takemem) 717 { 718 pgcnt_t npages = btopr(size); 719 pgcnt_t mswap_pages = 0; 720 pgcnt_t pswap_pages = 0; 721 722 mutex_enter(&anoninfo_lock); 723 724 /* 725 * pswap_pages is the number of pages we can take from 726 * physical (i.e. disk-backed) swap. 
727 */ 728 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 729 pswap_pages = k_anoninfo.ani_max - k_anoninfo.ani_phys_resv; 730 731 ANON_PRINT(A_RESV, 732 ("anon_resvmem: npages %lu takemem %u pswap %lu caller %p\n", 733 npages, takemem, pswap_pages, (void *)caller())); 734 735 if (npages <= pswap_pages) { 736 /* 737 * we have enough space on a physical swap 738 */ 739 if (takemem) 740 k_anoninfo.ani_phys_resv += npages; 741 mutex_exit(&anoninfo_lock); 742 return (1); 743 } else if (pswap_pages != 0) { 744 /* 745 * we have some space on a physical swap 746 */ 747 if (takemem) { 748 /* 749 * use up remainder of phys swap 750 */ 751 k_anoninfo.ani_phys_resv += pswap_pages; 752 ASSERT(k_anoninfo.ani_phys_resv == k_anoninfo.ani_max); 753 } 754 } 755 /* 756 * since (npages > pswap_pages) we need mem swap 757 * mswap_pages is the number of pages needed from availrmem 758 */ 759 ASSERT(npages > pswap_pages); 760 mswap_pages = npages - pswap_pages; 761 762 ANON_PRINT(A_RESV, ("anon_resvmem: need %ld pages from memory\n", 763 mswap_pages)); 764 765 /* 766 * priv processes can reserve memory as swap as long as availrmem 767 * remains greater than swapfs_minfree; in the case of non-priv 768 * processes, memory can be reserved as swap only if availrmem 769 * doesn't fall below (swapfs_minfree + swapfs_reserve). Thus, 770 * swapfs_reserve amount of memswap is not available to non-priv 771 * processes. This protects daemons such as automounter dying 772 * as a result of application processes eating away almost entire 773 * membased swap. This safeguard becomes useless if apps are run 774 * with root access. 775 * 776 * swapfs_reserve is minimum of 4Mb or 1/16 of physmem. 777 * 778 */ 779 mutex_enter(&freemem_lock); 780 if (availrmem > (swapfs_minfree + swapfs_reserve + mswap_pages) || 781 (availrmem > (swapfs_minfree + mswap_pages) && 782 secpolicy_resource(CRED()) == 0)) { 783 784 if (takemem) { 785 /* 786 * Take the memory from the rest of the system. 787 */ 788 availrmem -= mswap_pages; 789 mutex_exit(&freemem_lock); 790 k_anoninfo.ani_mem_resv += mswap_pages; 791 ANI_ADD(mswap_pages); 792 ANON_PRINT((A_RESV | A_MRESV), 793 ("anon_resvmem: took %ld pages of availrmem\n", 794 mswap_pages)); 795 } else { 796 mutex_exit(&freemem_lock); 797 } 798 799 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 800 mutex_exit(&anoninfo_lock); 801 return (1); 802 803 } else { 804 /* 805 * Fail if not enough memory 806 */ 807 808 if (takemem) { 809 k_anoninfo.ani_phys_resv -= pswap_pages; 810 } 811 812 mutex_exit(&freemem_lock); 813 mutex_exit(&anoninfo_lock); 814 ANON_PRINT(A_RESV, 815 ("anon_resvmem: not enough space from swapfs\n")); 816 return (0); 817 } 818 } 819 820 821 /* 822 * Give back an anon reservation. 823 */ 824 void 825 anon_unresv(size_t size) 826 { 827 pgcnt_t npages = btopr(size); 828 spgcnt_t mem_free_pages = 0; 829 pgcnt_t phys_free_slots; 830 #ifdef ANON_DEBUG 831 pgcnt_t mem_resv; 832 #endif 833 834 mutex_enter(&anoninfo_lock); 835 836 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 837 /* 838 * If some of this reservation belonged to swapfs 839 * give it back to availrmem. 840 * ani_mem_resv is the amount of availrmem swapfs has reserved. 
841 * but some of that memory could be locked by segspt so we can only 842 * return non locked ani_mem_resv back to availrmem 843 */ 844 if (k_anoninfo.ani_mem_resv > k_anoninfo.ani_locked_swap) { 845 ANON_PRINT((A_RESV | A_MRESV), 846 ("anon_unresv: growing availrmem by %ld pages\n", 847 MIN(k_anoninfo.ani_mem_resv, npages))); 848 849 mem_free_pages = MIN((spgcnt_t)(k_anoninfo.ani_mem_resv - 850 k_anoninfo.ani_locked_swap), npages); 851 mutex_enter(&freemem_lock); 852 availrmem += mem_free_pages; 853 mutex_exit(&freemem_lock); 854 k_anoninfo.ani_mem_resv -= mem_free_pages; 855 856 ANI_ADD(-mem_free_pages); 857 } 858 /* 859 * The remainder of the pages is returned to phys swap 860 */ 861 ASSERT(npages >= mem_free_pages); 862 phys_free_slots = npages - mem_free_pages; 863 864 if (phys_free_slots) { 865 k_anoninfo.ani_phys_resv -= phys_free_slots; 866 } 867 868 #ifdef ANON_DEBUG 869 mem_resv = k_anoninfo.ani_mem_resv; 870 #endif 871 872 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 873 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 874 875 mutex_exit(&anoninfo_lock); 876 877 ANON_PRINT(A_RESV, ("anon_unresv: %lu, tot %lu, caller %p\n", 878 npages, mem_resv, (void *)caller())); 879 } 880 881 /* 882 * Allocate an anon slot and return it with the lock held. 883 */ 884 struct anon * 885 anon_alloc(struct vnode *vp, anoff_t off) 886 { 887 struct anon *ap; 888 kmutex_t *ahm; 889 890 ap = kmem_cache_alloc(anon_cache, KM_SLEEP); 891 if (vp == NULL) { 892 swap_alloc(ap); 893 } else { 894 ap->an_vp = vp; 895 ap->an_off = off; 896 } 897 ap->an_refcnt = 1; 898 ap->an_pvp = NULL; 899 ap->an_poff = 0; 900 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 901 mutex_enter(ahm); 902 anon_addhash(ap); 903 mutex_exit(ahm); 904 ANI_ADD(-1); 905 ANON_PRINT(A_ANON, ("anon_alloc: returning ap %p, vp %p\n", 906 (void *)ap, (ap ? (void *)ap->an_vp : NULL))); 907 return (ap); 908 } 909 910 /* 911 * Decrement the reference count of an anon page. 912 * If reference count goes to zero, free it and 913 * its associated page (if any). 914 */ 915 void 916 anon_decref(struct anon *ap) 917 { 918 page_t *pp; 919 struct vnode *vp; 920 anoff_t off; 921 kmutex_t *ahm; 922 923 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 924 mutex_enter(ahm); 925 ASSERT(ap->an_refcnt != 0); 926 if (ap->an_refcnt == 0) 927 panic("anon_decref: slot count 0"); 928 if (--ap->an_refcnt == 0) { 929 swap_xlate(ap, &vp, &off); 930 mutex_exit(ahm); 931 932 /* 933 * If there is a page for this anon slot we will need to 934 * call VN_DISPOSE to get rid of the vp association and 935 * put the page back on the free list as really free. 936 * Acquire the "exclusive" lock to ensure that any 937 * pending i/o always completes before the swap slot 938 * is freed. 939 */ 940 pp = page_lookup(vp, (u_offset_t)off, SE_EXCL); 941 942 /* 943 * If there was a page, we've synchronized on it (getting 944 * the exclusive lock is as good as gettting the iolock) 945 * so now we can free the physical backing store. Also, this 946 * is where we would free the name of the anonymous page 947 * (swap_free(ap)), a no-op in the current implementation. 
948 */ 949 mutex_enter(ahm); 950 ASSERT(ap->an_refcnt == 0); 951 anon_rmhash(ap); 952 if (ap->an_pvp) 953 swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE); 954 mutex_exit(ahm); 955 956 if (pp != NULL) { 957 /*LINTED: constant in conditional context */ 958 VN_DISPOSE(pp, B_INVAL, 0, kcred); 959 } 960 ANON_PRINT(A_ANON, ("anon_decref: free ap %p, vp %p\n", 961 (void *)ap, (void *)ap->an_vp)); 962 kmem_cache_free(anon_cache, ap); 963 964 ANI_ADD(1); 965 } else { 966 mutex_exit(ahm); 967 } 968 } 969 970 static int 971 anon_share(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots) 972 { 973 struct anon *ap; 974 975 while (nslots-- > 0) { 976 if ((ap = anon_get_ptr(ahp, anon_index)) != NULL && 977 ap->an_refcnt > 1) 978 return (1); 979 anon_index++; 980 } 981 982 return (0); 983 } 984 985 static void 986 anon_decref_pages( 987 struct anon_hdr *ahp, 988 ulong_t an_idx, 989 uint_t szc) 990 { 991 struct anon *ap = anon_get_ptr(ahp, an_idx); 992 kmutex_t *ahmpages = NULL; 993 page_t *pp; 994 pgcnt_t pgcnt = page_get_pagecnt(szc); 995 pgcnt_t i; 996 struct vnode *vp; 997 anoff_t off; 998 kmutex_t *ahm; 999 #ifdef DEBUG 1000 int refcnt = 1; 1001 #endif 1002 1003 ASSERT(szc != 0); 1004 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1005 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1006 1007 VM_STAT_ADD(anonvmstats.decrefpages[0]); 1008 1009 if (ap != NULL) { 1010 ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1011 mutex_enter(ahmpages); 1012 ASSERT((refcnt = ap->an_refcnt) != 0); 1013 VM_STAT_ADD(anonvmstats.decrefpages[1]); 1014 if (ap->an_refcnt == 1) { 1015 VM_STAT_ADD(anonvmstats.decrefpages[2]); 1016 ASSERT(!anon_share(ahp, an_idx, pgcnt)); 1017 mutex_exit(ahmpages); 1018 ahmpages = NULL; 1019 } 1020 } 1021 1022 i = 0; 1023 while (i < pgcnt) { 1024 if ((ap = anon_get_ptr(ahp, an_idx + i)) == NULL) { 1025 ASSERT(refcnt == 1 && ahmpages == NULL); 1026 i++; 1027 continue; 1028 } 1029 ASSERT(ap->an_refcnt == refcnt); 1030 ASSERT(ahmpages != NULL || ap->an_refcnt == 1); 1031 ASSERT(ahmpages == NULL || ap->an_refcnt > 1); 1032 1033 if (ahmpages == NULL) { 1034 swap_xlate(ap, &vp, &off); 1035 pp = page_lookup(vp, (u_offset_t)off, SE_EXCL); 1036 if (pp == NULL || pp->p_szc == 0) { 1037 VM_STAT_ADD(anonvmstats.decrefpages[3]); 1038 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, 1039 ap->an_off)]; 1040 (void) anon_set_ptr(ahp, an_idx + i, NULL, 1041 ANON_SLEEP); 1042 mutex_enter(ahm); 1043 ap->an_refcnt--; 1044 ASSERT(ap->an_refcnt == 0); 1045 anon_rmhash(ap); 1046 if (ap->an_pvp) 1047 swap_phys_free(ap->an_pvp, ap->an_poff, 1048 PAGESIZE); 1049 mutex_exit(ahm); 1050 if (pp != NULL) { 1051 VM_STAT_ADD(anonvmstats.decrefpages[4]); 1052 /*LINTED*/ 1053 VN_DISPOSE(pp, B_INVAL, 0, kcred); 1054 } 1055 kmem_cache_free(anon_cache, ap); 1056 ANI_ADD(1); 1057 i++; 1058 } else { 1059 pgcnt_t j; 1060 pgcnt_t curpgcnt = 1061 page_get_pagecnt(pp->p_szc); 1062 size_t ppasize = curpgcnt * sizeof (page_t *); 1063 page_t **ppa = kmem_alloc(ppasize, KM_SLEEP); 1064 int dispose = 0; 1065 1066 VM_STAT_ADD(anonvmstats.decrefpages[5]); 1067 1068 ASSERT(pp->p_szc <= szc); 1069 ASSERT(IS_P2ALIGNED(curpgcnt, curpgcnt)); 1070 ASSERT(IS_P2ALIGNED(i, curpgcnt)); 1071 ASSERT(i + curpgcnt <= pgcnt); 1072 ASSERT(!(page_pptonum(pp) & (curpgcnt - 1))); 1073 ppa[0] = pp; 1074 for (j = i + 1; j < i + curpgcnt; j++) { 1075 ap = anon_get_ptr(ahp, an_idx + j); 1076 ASSERT(ap != NULL && 1077 ap->an_refcnt == 1); 1078 swap_xlate(ap, &vp, &off); 1079 pp = page_lookup(vp, (u_offset_t)off, 1080 SE_EXCL); 1081 if (pp == NULL) 1082 
panic("anon_decref_pages: " 1083 "no page"); 1084 1085 (void) hat_pageunload(pp, 1086 HAT_FORCE_PGUNLOAD); 1087 ASSERT(pp->p_szc == ppa[0]->p_szc); 1088 ASSERT(page_pptonum(pp) - 1 == 1089 page_pptonum(ppa[j - i - 1])); 1090 ppa[j - i] = pp; 1091 if (ap->an_pvp != NULL && 1092 !vn_matchopval(ap->an_pvp, 1093 VOPNAME_DISPOSE, 1094 (fs_generic_func_p)fs_dispose)) 1095 dispose = 1; 1096 } 1097 if (!dispose) { 1098 VM_STAT_ADD(anonvmstats.decrefpages[6]); 1099 page_destroy_pages(ppa[0]); 1100 } else { 1101 VM_STAT_ADD(anonvmstats.decrefpages[7]); 1102 for (j = 0; j < curpgcnt; j++) { 1103 ASSERT(PAGE_EXCL(ppa[j])); 1104 ppa[j]->p_szc = 0; 1105 } 1106 for (j = 0; j < curpgcnt; j++) { 1107 ASSERT(!hat_page_is_mapped( 1108 ppa[j])); 1109 /*LINTED*/ 1110 VN_DISPOSE(ppa[j], B_INVAL, 0, 1111 kcred); 1112 } 1113 } 1114 kmem_free(ppa, ppasize); 1115 for (j = i; j < i + curpgcnt; j++) { 1116 ap = anon_get_ptr(ahp, an_idx + j); 1117 ASSERT(ap != NULL && 1118 ap->an_refcnt == 1); 1119 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, 1120 ap->an_off)]; 1121 (void) anon_set_ptr(ahp, an_idx + j, 1122 NULL, ANON_SLEEP); 1123 mutex_enter(ahm); 1124 ap->an_refcnt--; 1125 ASSERT(ap->an_refcnt == 0); 1126 anon_rmhash(ap); 1127 if (ap->an_pvp) 1128 swap_phys_free(ap->an_pvp, 1129 ap->an_poff, PAGESIZE); 1130 mutex_exit(ahm); 1131 kmem_cache_free(anon_cache, ap); 1132 ANI_ADD(1); 1133 } 1134 i += curpgcnt; 1135 } 1136 } else { 1137 VM_STAT_ADD(anonvmstats.decrefpages[8]); 1138 (void) anon_set_ptr(ahp, an_idx + i, NULL, ANON_SLEEP); 1139 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1140 mutex_enter(ahm); 1141 ap->an_refcnt--; 1142 mutex_exit(ahm); 1143 i++; 1144 } 1145 } 1146 1147 if (ahmpages != NULL) { 1148 mutex_exit(ahmpages); 1149 } 1150 } 1151 1152 /* 1153 * Duplicate references to size bytes worth of anon pages. 1154 * Used when duplicating a segment that contains private anon pages. 1155 * This code assumes that procedure calling this one has already used 1156 * hat_chgprot() to disable write access to the range of addresses that 1157 * that *old actually refers to. 1158 */ 1159 void 1160 anon_dup(struct anon_hdr *old, ulong_t old_idx, struct anon_hdr *new, 1161 ulong_t new_idx, size_t size) 1162 { 1163 spgcnt_t npages; 1164 kmutex_t *ahm; 1165 struct anon *ap; 1166 ulong_t off; 1167 ulong_t index; 1168 1169 npages = btopr(size); 1170 while (npages > 0) { 1171 index = old_idx; 1172 if ((ap = anon_get_next_ptr(old, &index)) == NULL) 1173 break; 1174 1175 ASSERT(!ANON_ISBUSY(anon_get_slot(old, index))); 1176 off = index - old_idx; 1177 npages -= off; 1178 if (npages <= 0) 1179 break; 1180 1181 (void) anon_set_ptr(new, new_idx + off, ap, ANON_SLEEP); 1182 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1183 1184 mutex_enter(ahm); 1185 ap->an_refcnt++; 1186 mutex_exit(ahm); 1187 1188 off++; 1189 new_idx += off; 1190 old_idx += off; 1191 npages--; 1192 } 1193 } 1194 1195 /* 1196 * Just like anon_dup but also guarantees there are no holes (unallocated anon 1197 * slots) within any large page region. That means if a large page region is 1198 * empty in the old array it will skip it. If there are 1 or more valid slots 1199 * in the large page region of the old array it will make sure to fill in any 1200 * unallocated ones and also copy them to the new array. If noalloc is 1 large 1201 * page region should either have no valid anon slots or all slots should be 1202 * valid. 
1203 */ 1204 void 1205 anon_dup_fill_holes( 1206 struct anon_hdr *old, 1207 ulong_t old_idx, 1208 struct anon_hdr *new, 1209 ulong_t new_idx, 1210 size_t size, 1211 uint_t szc, 1212 int noalloc) 1213 { 1214 struct anon *ap; 1215 spgcnt_t npages; 1216 kmutex_t *ahm, *ahmpages = NULL; 1217 pgcnt_t pgcnt, i; 1218 ulong_t index, off; 1219 #ifdef DEBUG 1220 int refcnt; 1221 #endif 1222 1223 ASSERT(szc != 0); 1224 pgcnt = page_get_pagecnt(szc); 1225 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1226 npages = btopr(size); 1227 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1228 ASSERT(IS_P2ALIGNED(old_idx, pgcnt)); 1229 1230 VM_STAT_ADD(anonvmstats.dupfillholes[0]); 1231 1232 while (npages > 0) { 1233 index = old_idx; 1234 1235 /* 1236 * Find the next valid slot. 1237 */ 1238 if (anon_get_next_ptr(old, &index) == NULL) 1239 break; 1240 1241 ASSERT(!ANON_ISBUSY(anon_get_slot(old, index))); 1242 /* 1243 * Now backup index to the beginning of the 1244 * current large page region of the old array. 1245 */ 1246 index = P2ALIGN(index, pgcnt); 1247 off = index - old_idx; 1248 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1249 npages -= off; 1250 if (npages <= 0) 1251 break; 1252 1253 /* 1254 * Fill and copy a large page regions worth 1255 * of anon slots. 1256 */ 1257 for (i = 0; i < pgcnt; i++) { 1258 if ((ap = anon_get_ptr(old, index + i)) == NULL) { 1259 if (noalloc) { 1260 panic("anon_dup_fill_holes: " 1261 "empty anon slot\n"); 1262 } 1263 VM_STAT_ADD(anonvmstats.dupfillholes[1]); 1264 ap = anon_alloc(NULL, 0); 1265 (void) anon_set_ptr(old, index + i, ap, 1266 ANON_SLEEP); 1267 } else if (i == 0) { 1268 /* 1269 * make the increment of all refcnts of all 1270 * anon slots of a large page appear atomic by 1271 * getting an anonpages_hash_lock for the 1272 * first anon slot of a large page. 1273 */ 1274 int hash = AH_LOCK(ap->an_vp, ap->an_off); 1275 1276 VM_STAT_ADD(anonvmstats.dupfillholes[2]); 1277 1278 ahmpages = &anonpages_hash_lock[hash]; 1279 mutex_enter(ahmpages); 1280 /*LINTED*/ 1281 ASSERT(refcnt = ap->an_refcnt); 1282 1283 VM_STAT_COND_ADD(ap->an_refcnt > 1, 1284 anonvmstats.dupfillholes[3]); 1285 } 1286 (void) anon_set_ptr(new, new_idx + off + i, ap, 1287 ANON_SLEEP); 1288 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1289 mutex_enter(ahm); 1290 ASSERT(ahmpages != NULL || ap->an_refcnt == 1); 1291 ASSERT(i == 0 || ahmpages == NULL || 1292 refcnt == ap->an_refcnt); 1293 ap->an_refcnt++; 1294 mutex_exit(ahm); 1295 } 1296 if (ahmpages != NULL) { 1297 mutex_exit(ahmpages); 1298 ahmpages = NULL; 1299 } 1300 off += pgcnt; 1301 new_idx += off; 1302 old_idx += off; 1303 npages -= pgcnt; 1304 } 1305 } 1306 1307 /* 1308 * Used when a segment with a vnode changes szc. similarly to 1309 * anon_dup_fill_holes() makes sure each large page region either has no anon 1310 * slots or all of them. but new slots are created by COWing the file 1311 * pages. on entrance no anon slots should be shared. 
1312 */ 1313 int 1314 anon_fill_cow_holes( 1315 struct seg *seg, 1316 caddr_t addr, 1317 struct anon_hdr *ahp, 1318 ulong_t an_idx, 1319 struct vnode *vp, 1320 u_offset_t vp_off, 1321 size_t size, 1322 uint_t szc, 1323 uint_t prot, 1324 struct vpage vpage[], 1325 struct cred *cred) 1326 { 1327 struct anon *ap; 1328 spgcnt_t npages; 1329 pgcnt_t pgcnt, i; 1330 ulong_t index, off; 1331 int err = 0; 1332 int pageflags = 0; 1333 1334 ASSERT(szc != 0); 1335 pgcnt = page_get_pagecnt(szc); 1336 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1337 npages = btopr(size); 1338 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1339 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1340 1341 while (npages > 0) { 1342 index = an_idx; 1343 1344 /* 1345 * Find the next valid slot. 1346 */ 1347 if (anon_get_next_ptr(ahp, &index) == NULL) { 1348 break; 1349 } 1350 1351 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1352 /* 1353 * Now backup index to the beginning of the 1354 * current large page region of the anon array. 1355 */ 1356 index = P2ALIGN(index, pgcnt); 1357 off = index - an_idx; 1358 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1359 npages -= off; 1360 if (npages <= 0) 1361 break; 1362 an_idx += off; 1363 vp_off += ptob(off); 1364 addr += ptob(off); 1365 if (vpage != NULL) { 1366 vpage += off; 1367 } 1368 1369 for (i = 0; i < pgcnt; i++, an_idx++, vp_off += PAGESIZE) { 1370 if ((ap = anon_get_ptr(ahp, an_idx)) == NULL) { 1371 page_t *pl[1 + 1]; 1372 page_t *pp; 1373 1374 err = VOP_GETPAGE(vp, vp_off, PAGESIZE, NULL, 1375 pl, PAGESIZE, seg, addr, S_READ, cred); 1376 if (err) { 1377 break; 1378 } 1379 if (vpage != NULL) { 1380 prot = VPP_PROT(vpage); 1381 pageflags = VPP_ISPPLOCK(vpage) ? 1382 LOCK_PAGE : 0; 1383 } 1384 pp = anon_private(&ap, seg, addr, prot, pl[0], 1385 pageflags, cred); 1386 if (pp == NULL) { 1387 err = ENOMEM; 1388 break; 1389 } 1390 (void) anon_set_ptr(ahp, an_idx, ap, 1391 ANON_SLEEP); 1392 page_unlock(pp); 1393 } 1394 ASSERT(ap->an_refcnt == 1); 1395 addr += PAGESIZE; 1396 if (vpage != NULL) { 1397 vpage++; 1398 } 1399 } 1400 npages -= pgcnt; 1401 } 1402 1403 return (err); 1404 } 1405 1406 /* 1407 * Free a group of "size" anon pages, size in bytes, 1408 * and clear out the pointers to the anon entries. 1409 */ 1410 void 1411 anon_free(struct anon_hdr *ahp, ulong_t index, size_t size) 1412 { 1413 spgcnt_t npages; 1414 struct anon *ap; 1415 ulong_t old; 1416 1417 npages = btopr(size); 1418 1419 while (npages > 0) { 1420 old = index; 1421 if ((ap = anon_get_next_ptr(ahp, &index)) == NULL) 1422 break; 1423 1424 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1425 npages -= index - old; 1426 if (npages <= 0) 1427 break; 1428 1429 (void) anon_set_ptr(ahp, index, NULL, ANON_SLEEP); 1430 anon_decref(ap); 1431 /* 1432 * Bump index and decrement page count 1433 */ 1434 index++; 1435 npages--; 1436 } 1437 } 1438 1439 void 1440 anon_free_pages( 1441 struct anon_hdr *ahp, 1442 ulong_t an_idx, 1443 size_t size, 1444 uint_t szc) 1445 { 1446 spgcnt_t npages; 1447 pgcnt_t pgcnt; 1448 ulong_t index, off; 1449 1450 ASSERT(szc != 0); 1451 pgcnt = page_get_pagecnt(szc); 1452 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1453 npages = btopr(size); 1454 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1455 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1456 1457 VM_STAT_ADD(anonvmstats.freepages[0]); 1458 1459 while (npages > 0) { 1460 index = an_idx; 1461 1462 /* 1463 * Find the next valid slot. 
		 */
		if (anon_get_next_ptr(ahp, &index) == NULL)
			break;

		ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index)));
		/*
		 * Now backup index to the beginning of the
		 * current large page region of the old array.
		 */
		index = P2ALIGN(index, pgcnt);
		off = index - an_idx;
		ASSERT(IS_P2ALIGNED(off, pgcnt));
		npages -= off;
		if (npages <= 0)
			break;

		anon_decref_pages(ahp, index, szc);

		off += pgcnt;
		an_idx += off;
		npages -= pgcnt;
	}
}

/*
 * Make anonymous pages discardable
 */
void
anon_disclaim(struct anon_map *amp, ulong_t index, size_t size, int flags)
{
	spgcnt_t npages = btopr(size);
	struct anon *ap;
	struct vnode *vp;
	anoff_t off;
	page_t *pp, *root_pp;
	kmutex_t *ahm;
	pgcnt_t pgcnt;
	ulong_t old_idx, idx, i;
	struct anon_hdr *ahp = amp->ahp;
	anon_sync_obj_t cookie;

	ASSERT(RW_READ_HELD(&amp->a_rwlock));
	pgcnt = 1;
	for (; npages > 0; index = (pgcnt == 1) ? index + 1 :
	    P2ROUNDUP(index + 1, pgcnt), npages -= pgcnt) {

		/*
		 * get anon pointer and index for the first valid entry
		 * in the anon list, starting from "index"
		 */
		old_idx = index;
		if ((ap = anon_get_next_ptr(ahp, &index)) == NULL)
			break;

		/*
		 * decrement npages by number of NULL anon slots we skipped
		 */
		npages -= index - old_idx;
		if (npages <= 0)
			break;

		anon_array_enter(amp, index, &cookie);
		ap = anon_get_ptr(ahp, index);
		ASSERT(ap != NULL);

		/*
		 * Get anonymous page and try to lock it SE_EXCL;
		 * For non blocking case if we couldn't grab the lock
		 * we skip to next page.
		 * For blocking case (ANON_PGLOOKUP_BLK) block
		 * until we grab SE_EXCL lock.
		 */
		swap_xlate(ap, &vp, &off);
		if (flags & ANON_PGLOOKUP_BLK)
			pp = page_lookup_create(vp, (u_offset_t)off,
			    SE_EXCL, NULL, NULL, SE_EXCL_WANTED);
		else
			pp = page_lookup_nowait(vp, (u_offset_t)off, SE_EXCL);
		if (pp == NULL) {
			segadvstat.MADV_FREE_miss.value.ul++;
			pgcnt = 1;
			anon_array_exit(&cookie);
			continue;
		}
		pgcnt = page_get_pagecnt(pp->p_szc);

		/*
		 * we cannot free a page which is permanently locked.
		 * The page_struct_lock need not be acquired to examine
		 * these fields since the page has an "exclusive" lock.
		 */
		if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
			page_unlock(pp);
			segadvstat.MADV_FREE_miss.value.ul++;
			anon_array_exit(&cookie);
			continue;
		}

		ahm = &anonhash_lock[AH_LOCK(vp, off)];
		mutex_enter(ahm);
		ASSERT(ap->an_refcnt != 0);
		/*
		 * skip this one if copy-on-write is not yet broken.
		 */
		if (ap->an_refcnt > 1) {
			mutex_exit(ahm);
			page_unlock(pp);
			segadvstat.MADV_FREE_miss.value.ul++;
			anon_array_exit(&cookie);
			continue;
		}

		if (pp->p_szc == 0) {
			pgcnt = 1;

			/*
			 * free swap slot;
			 */
			if (ap->an_pvp) {
				swap_phys_free(ap->an_pvp, ap->an_poff,
				    PAGESIZE);
				ap->an_pvp = NULL;
				ap->an_poff = 0;
			}
			mutex_exit(ahm);
			segadvstat.MADV_FREE_hit.value.ul++;

			/*
			 * while we are at it, unload all the translations
			 * and attempt to free the page.
1594 */ 1595 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1596 /*LINTED: constant in conditional context */ 1597 VN_DISPOSE(pp, B_FREE, 0, kcred); 1598 anon_array_exit(&cookie); 1599 continue; 1600 } 1601 1602 pgcnt = page_get_pagecnt(pp->p_szc); 1603 if (!IS_P2ALIGNED(index, pgcnt)) { 1604 if (!page_try_demote_pages(pp)) { 1605 mutex_exit(ahm); 1606 page_unlock(pp); 1607 segadvstat.MADV_FREE_miss.value.ul++; 1608 anon_array_exit(&cookie); 1609 continue; 1610 } else { 1611 pgcnt = 1; 1612 if (ap->an_pvp) { 1613 swap_phys_free(ap->an_pvp, 1614 ap->an_poff, PAGESIZE); 1615 ap->an_pvp = NULL; 1616 ap->an_poff = 0; 1617 } 1618 mutex_exit(ahm); 1619 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1620 /*LINTED*/ 1621 VN_DISPOSE(pp, B_FREE, 0, kcred); 1622 segadvstat.MADV_FREE_hit.value.ul++; 1623 anon_array_exit(&cookie); 1624 continue; 1625 } 1626 } 1627 mutex_exit(ahm); 1628 root_pp = pp; 1629 1630 /* 1631 * try to lock remaining pages 1632 */ 1633 for (idx = 1; idx < pgcnt; idx++) { 1634 pp = page_next(pp); 1635 if (!page_trylock(pp, SE_EXCL)) 1636 break; 1637 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1638 page_unlock(pp); 1639 break; 1640 } 1641 } 1642 1643 if (idx == pgcnt) { 1644 for (i = 0; i < pgcnt; i++) { 1645 ap = anon_get_ptr(ahp, index + i); 1646 if (ap == NULL) 1647 break; 1648 swap_xlate(ap, &vp, &off); 1649 ahm = &anonhash_lock[AH_LOCK(vp, off)]; 1650 mutex_enter(ahm); 1651 ASSERT(ap->an_refcnt != 0); 1652 1653 /* 1654 * skip this one if copy-on-write 1655 * is not yet broken. 1656 */ 1657 if (ap->an_refcnt > 1) { 1658 mutex_exit(ahm); 1659 goto skiplp; 1660 } 1661 if (ap->an_pvp) { 1662 swap_phys_free(ap->an_pvp, 1663 ap->an_poff, PAGESIZE); 1664 ap->an_pvp = NULL; 1665 ap->an_poff = 0; 1666 } 1667 mutex_exit(ahm); 1668 } 1669 page_destroy_pages(root_pp); 1670 segadvstat.MADV_FREE_hit.value.ul += pgcnt; 1671 anon_array_exit(&cookie); 1672 continue; 1673 } 1674 skiplp: 1675 segadvstat.MADV_FREE_miss.value.ul += pgcnt; 1676 for (i = 0, pp = root_pp; i < idx; pp = page_next(pp), i++) 1677 page_unlock(pp); 1678 anon_array_exit(&cookie); 1679 } 1680 } 1681 1682 /* 1683 * Return the kept page(s) and protections back to the segment driver. 1684 */ 1685 int 1686 anon_getpage( 1687 struct anon **app, 1688 uint_t *protp, 1689 page_t *pl[], 1690 size_t plsz, 1691 struct seg *seg, 1692 caddr_t addr, 1693 enum seg_rw rw, 1694 struct cred *cred) 1695 { 1696 page_t *pp; 1697 struct anon *ap = *app; 1698 struct vnode *vp; 1699 anoff_t off; 1700 int err; 1701 kmutex_t *ahm; 1702 1703 swap_xlate(ap, &vp, &off); 1704 1705 /* 1706 * Lookup the page. If page is being paged in, 1707 * wait for it to finish as we must return a list of 1708 * pages since this routine acts like the VOP_GETPAGE 1709 * routine does. 1710 */ 1711 if (pl != NULL && (pp = page_lookup(vp, (u_offset_t)off, SE_SHARED))) { 1712 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1713 mutex_enter(ahm); 1714 if (ap->an_refcnt == 1) 1715 *protp = PROT_ALL; 1716 else 1717 *protp = PROT_ALL & ~PROT_WRITE; 1718 mutex_exit(ahm); 1719 pl[0] = pp; 1720 pl[1] = NULL; 1721 return (0); 1722 } 1723 1724 /* 1725 * Simply treat it as a vnode fault on the anon vp. 
1726 */ 1727 1728 TRACE_3(TR_FAC_VM, TR_ANON_GETPAGE, 1729 "anon_getpage:seg %x addr %x vp %x", 1730 seg, addr, vp); 1731 1732 err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, protp, pl, plsz, 1733 seg, addr, rw, cred); 1734 1735 if (err == 0 && pl != NULL) { 1736 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1737 mutex_enter(ahm); 1738 if (ap->an_refcnt != 1) 1739 *protp &= ~PROT_WRITE; /* make read-only */ 1740 mutex_exit(ahm); 1741 } 1742 return (err); 1743 } 1744 1745 /* 1746 * Creates or returns kept pages to the segment driver. returns -1 if a large 1747 * page cannot be allocated. returns -2 if some other process has allocated a 1748 * larger page. 1749 * 1750 * For cowfault it will alocate any size pages to fill the requested area to 1751 * avoid partially overwritting anon slots (i.e. sharing only some of the anon 1752 * slots within a large page with other processes). This policy greatly 1753 * simplifies large page freeing (which is only freed when all anon slot 1754 * refcnts are 0). 1755 */ 1756 int 1757 anon_map_getpages( 1758 struct anon_map *amp, 1759 ulong_t start_idx, 1760 uint_t szc, 1761 struct seg *seg, 1762 caddr_t addr, 1763 uint_t prot, 1764 uint_t *protp, 1765 page_t *ppa[], 1766 uint_t *ppa_szc, 1767 struct vpage vpage[], 1768 enum seg_rw rw, 1769 int brkcow, 1770 int anypgsz, 1771 struct cred *cred) 1772 { 1773 pgcnt_t pgcnt; 1774 struct anon *ap; 1775 struct vnode *vp; 1776 anoff_t off; 1777 page_t *pp, *pl[2], *conpp = NULL; 1778 caddr_t vaddr; 1779 ulong_t pg_idx, an_idx, i; 1780 spgcnt_t nreloc = 0; 1781 int prealloc = 1; 1782 int err, slotcreate; 1783 uint_t vpprot; 1784 1785 #if !defined(__i386) && !defined(__amd64) 1786 ASSERT(seg->s_szc != 0); 1787 #endif 1788 ASSERT(szc <= seg->s_szc); 1789 ASSERT(ppa_szc != NULL); 1790 ASSERT(rw != S_CREATE); 1791 1792 *protp = PROT_ALL; 1793 1794 VM_STAT_ADD(anonvmstats.getpages[0]); 1795 1796 if (szc == 0) { 1797 VM_STAT_ADD(anonvmstats.getpages[1]); 1798 if ((ap = anon_get_ptr(amp->ahp, start_idx)) != NULL) { 1799 err = anon_getpage(&ap, protp, pl, PAGESIZE, seg, 1800 addr, rw, cred); 1801 if (err) 1802 return (err); 1803 ppa[0] = pl[0]; 1804 if (brkcow == 0 || (*protp & PROT_WRITE)) { 1805 VM_STAT_ADD(anonvmstats.getpages[2]); 1806 if (ppa[0]->p_szc != 0) { 1807 VM_STAT_ADD(anonvmstats.getpages[3]); 1808 *ppa_szc = ppa[0]->p_szc; 1809 page_unlock(ppa[0]); 1810 return (-2); 1811 } 1812 return (0); 1813 } 1814 panic("anon_map_getpages: cowfault for szc 0"); 1815 } else { 1816 VM_STAT_ADD(anonvmstats.getpages[4]); 1817 ppa[0] = anon_zero(seg, addr, &ap, cred); 1818 if (ppa[0] == NULL) 1819 return (ENOMEM); 1820 (void) anon_set_ptr(amp->ahp, start_idx, ap, 1821 ANON_SLEEP); 1822 return (0); 1823 } 1824 } 1825 1826 pgcnt = page_get_pagecnt(szc); 1827 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1828 ASSERT(IS_P2ALIGNED(start_idx, pgcnt)); 1829 1830 /* 1831 * First we check for the case that the requtested large 1832 * page or larger page already exists in the system. 1833 * Actually we only check if the first constituent page 1834 * exists and only preallocate if it's not found. 
1835 */ 1836 ap = anon_get_ptr(amp->ahp, start_idx); 1837 if (ap) { 1838 uint_t pszc; 1839 swap_xlate(ap, &vp, &off); 1840 if (page_exists_forreal(vp, (u_offset_t)off, &pszc)) { 1841 if (pszc > szc) { 1842 *ppa_szc = pszc; 1843 return (-2); 1844 } 1845 if (pszc == szc) { 1846 prealloc = 0; 1847 } 1848 } 1849 } 1850 1851 VM_STAT_COND_ADD(prealloc == 0, anonvmstats.getpages[5]); 1852 VM_STAT_COND_ADD(prealloc != 0, anonvmstats.getpages[6]); 1853 1854 top: 1855 /* 1856 * If a smaller page or no page at all was found, 1857 * grab a large page off the freelist. 1858 */ 1859 if (prealloc) { 1860 ASSERT(conpp == NULL); 1861 if (page_alloc_pages(seg, addr, NULL, ppa, szc, 0) != 0) { 1862 VM_STAT_ADD(anonvmstats.getpages[7]); 1863 if (brkcow == 0 || 1864 !anon_share(amp->ahp, start_idx, pgcnt)) { 1865 /* 1866 * If the refcnt's of all anon slots are <= 1 1867 * they can't increase since we are holding 1868 * the address space's lock. So segvn can 1869 * safely decrease szc without risking to 1870 * generate a cow fault for the region smaller 1871 * than the segment's largest page size. 1872 */ 1873 VM_STAT_ADD(anonvmstats.getpages[8]); 1874 return (-1); 1875 } 1876 docow: 1877 /* 1878 * This is a cow fault. Copy away the entire 1 large 1879 * page region of this segment. 1880 */ 1881 if (szc != seg->s_szc) 1882 panic("anon_map_getpages: cowfault for szc %d", 1883 szc); 1884 vaddr = addr; 1885 for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt; 1886 pg_idx++, an_idx++, vaddr += PAGESIZE) { 1887 if ((ap = anon_get_ptr(amp->ahp, an_idx)) != 1888 NULL) { 1889 err = anon_getpage(&ap, &vpprot, pl, 1890 PAGESIZE, seg, vaddr, rw, cred); 1891 if (err) { 1892 for (i = 0; i < pg_idx; i++) { 1893 if ((pp = ppa[i]) != 1894 NULL) 1895 page_unlock(pp); 1896 } 1897 return (err); 1898 } 1899 ppa[pg_idx] = pl[0]; 1900 } else { 1901 /* 1902 * Since this is a cowfault we know 1903 * that this address space has a 1904 * parent or children which means 1905 * anon_dup_fill_holes() has initialized 1906 * all anon slots within a large page 1907 * region that had at least one anon 1908 * slot at the time of fork(). 1909 */ 1910 panic("anon_map_getpages: " 1911 "cowfault but anon slot is empty"); 1912 } 1913 } 1914 VM_STAT_ADD(anonvmstats.getpages[9]); 1915 *protp = PROT_ALL; 1916 return (anon_map_privatepages(amp, start_idx, szc, seg, 1917 addr, prot, ppa, vpage, anypgsz, cred)); 1918 } 1919 } 1920 1921 VM_STAT_ADD(anonvmstats.getpages[10]); 1922 1923 an_idx = start_idx; 1924 pg_idx = 0; 1925 vaddr = addr; 1926 while (pg_idx < pgcnt) { 1927 slotcreate = 0; 1928 if ((ap = anon_get_ptr(amp->ahp, an_idx)) == NULL) { 1929 VM_STAT_ADD(anonvmstats.getpages[11]); 1930 /* 1931 * For us to have decided not to preallocate 1932 * would have meant that a large page 1933 * was found. Which also means that all of the 1934 * anon slots for that page would have been 1935 * already created for us. 1936 */ 1937 if (prealloc == 0) 1938 panic("anon_map_getpages: prealloc = 0"); 1939 1940 slotcreate = 1; 1941 ap = anon_alloc(NULL, 0); 1942 } 1943 swap_xlate(ap, &vp, &off); 1944 1945 /* 1946 * Now setup our preallocated page to pass down 1947 * to swap_getpage(). 1948 */ 1949 if (prealloc) { 1950 ASSERT(ppa[pg_idx]->p_szc == szc); 1951 conpp = ppa[pg_idx]; 1952 } 1953 ASSERT(prealloc || conpp == NULL); 1954 1955 /* 1956 * If we just created this anon slot then call 1957 * with S_CREATE to prevent doing IO on the page. 1958 * Similar to the anon_zero case. 
1959 */ 1960 err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE, 1961 NULL, pl, PAGESIZE, conpp, &nreloc, seg, vaddr, 1962 slotcreate == 1 ? S_CREATE : rw, cred); 1963 1964 if (err) { 1965 VM_STAT_ADD(anonvmstats.getpages[12]); 1966 ASSERT(slotcreate == 0); 1967 goto io_err; 1968 } 1969 1970 pp = pl[0]; 1971 1972 if (pp->p_szc != szc) { 1973 VM_STAT_ADD(anonvmstats.getpages[13]); 1974 ASSERT(slotcreate == 0); 1975 ASSERT(prealloc == 0); 1976 ASSERT(pg_idx == 0); 1977 if (pp->p_szc > szc) { 1978 page_unlock(pp); 1979 VM_STAT_ADD(anonvmstats.getpages[14]); 1980 return (-2); 1981 } 1982 page_unlock(pp); 1983 prealloc = 1; 1984 goto top; 1985 } 1986 1987 /* 1988 * If we decided to preallocate but VOP_GETPAGE 1989 * found a page in the system that satisfies our 1990 * request then free up our preallocated large page 1991 * and continue looping accross the existing large 1992 * page via VOP_GETPAGE. 1993 */ 1994 if (prealloc && pp != ppa[pg_idx]) { 1995 VM_STAT_ADD(anonvmstats.getpages[15]); 1996 ASSERT(slotcreate == 0); 1997 ASSERT(pg_idx == 0); 1998 conpp = NULL; 1999 prealloc = 0; 2000 page_free_pages(ppa[0]); 2001 } 2002 2003 if (prealloc && nreloc > 1) { 2004 /* 2005 * we have relocated out of a smaller large page. 2006 * skip npgs - 1 iterations and continue which will 2007 * increment by one the loop indices. 2008 */ 2009 spgcnt_t npgs = nreloc; 2010 2011 VM_STAT_ADD(anonvmstats.getpages[16]); 2012 2013 ASSERT(pp == ppa[pg_idx]); 2014 ASSERT(slotcreate == 0); 2015 ASSERT(pg_idx + npgs <= pgcnt); 2016 if ((*protp & PROT_WRITE) && 2017 anon_share(amp->ahp, an_idx, npgs)) { 2018 *protp &= ~PROT_WRITE; 2019 } 2020 pg_idx += npgs; 2021 an_idx += npgs; 2022 vaddr += PAGESIZE * npgs; 2023 continue; 2024 } 2025 2026 VM_STAT_ADD(anonvmstats.getpages[17]); 2027 2028 /* 2029 * Anon_zero case. 2030 */ 2031 if (slotcreate) { 2032 ASSERT(prealloc); 2033 pagezero(pp, 0, PAGESIZE); 2034 CPU_STATS_ADD_K(vm, zfod, 1); 2035 hat_setrefmod(pp); 2036 } 2037 2038 ASSERT(prealloc == 0 || ppa[pg_idx] == pp); 2039 ASSERT(prealloc != 0 || PAGE_SHARED(pp)); 2040 ASSERT(prealloc == 0 || PAGE_EXCL(pp)); 2041 2042 if (pg_idx > 0 && 2043 ((page_pptonum(pp) != page_pptonum(ppa[pg_idx - 1]) + 1) || 2044 (pp->p_szc != ppa[pg_idx - 1]->p_szc))) 2045 panic("anon_map_getpages: unexpected page"); 2046 2047 if (prealloc == 0) { 2048 ppa[pg_idx] = pp; 2049 } 2050 2051 if (ap->an_refcnt > 1) { 2052 VM_STAT_ADD(anonvmstats.getpages[18]); 2053 *protp &= ~PROT_WRITE; 2054 } 2055 2056 /* 2057 * If this is a new anon slot then initialize 2058 * the anon array entry. 2059 */ 2060 if (slotcreate) { 2061 (void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP); 2062 } 2063 pg_idx++; 2064 an_idx++; 2065 vaddr += PAGESIZE; 2066 } 2067 2068 /* 2069 * Since preallocated pages come off the freelist 2070 * they are locked SE_EXCL. Simply downgrade and return. 2071 */ 2072 if (prealloc) { 2073 VM_STAT_ADD(anonvmstats.getpages[19]); 2074 conpp = NULL; 2075 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { 2076 page_downgrade(ppa[pg_idx]); 2077 } 2078 } 2079 ASSERT(conpp == NULL); 2080 2081 if (brkcow == 0 || (*protp & PROT_WRITE)) { 2082 VM_STAT_ADD(anonvmstats.getpages[20]); 2083 return (0); 2084 } 2085 2086 if (szc < seg->s_szc) 2087 panic("anon_map_getpages: cowfault for szc %d", szc); 2088 2089 VM_STAT_ADD(anonvmstats.getpages[21]); 2090 2091 *protp = PROT_ALL; 2092 return (anon_map_privatepages(amp, start_idx, szc, seg, addr, prot, 2093 ppa, vpage, anypgsz, cred)); 2094 io_err: 2095 /* 2096 * We got an IO error somewhere in our large page. 
2097 * If we were using a preallocated page then just demote 2098 * all the constituent pages that we've succeeded with sofar 2099 * to PAGESIZE pages and leave them in the system 2100 * unlocked. 2101 */ 2102 2103 ASSERT(err != -2 || pg_idx == 0); 2104 2105 VM_STAT_COND_ADD(err > 0, anonvmstats.getpages[22]); 2106 VM_STAT_COND_ADD(err == -1, anonvmstats.getpages[23]); 2107 VM_STAT_COND_ADD(err == -2, anonvmstats.getpages[24]); 2108 2109 if (prealloc) { 2110 conpp = NULL; 2111 if (pg_idx > 0) { 2112 VM_STAT_ADD(anonvmstats.getpages[25]); 2113 for (i = 0; i < pgcnt; i++) { 2114 pp = ppa[i]; 2115 ASSERT(PAGE_EXCL(pp)); 2116 ASSERT(pp->p_szc == szc); 2117 pp->p_szc = 0; 2118 } 2119 for (i = 0; i < pg_idx; i++) { 2120 ASSERT(!hat_page_is_mapped(ppa[i])); 2121 page_unlock(ppa[i]); 2122 } 2123 /* 2124 * Now free up the remaining unused constituent 2125 * pages. 2126 */ 2127 while (pg_idx < pgcnt) { 2128 ASSERT(!hat_page_is_mapped(ppa[pg_idx])); 2129 page_free(ppa[pg_idx], 0); 2130 pg_idx++; 2131 } 2132 } else { 2133 VM_STAT_ADD(anonvmstats.getpages[26]); 2134 page_free_pages(ppa[0]); 2135 } 2136 } else { 2137 VM_STAT_ADD(anonvmstats.getpages[27]); 2138 ASSERT(err > 0); 2139 for (i = 0; i < pg_idx; i++) 2140 page_unlock(ppa[i]); 2141 } 2142 ASSERT(conpp == NULL); 2143 if (err != -1) 2144 return (err); 2145 /* 2146 * we are here because we failed to relocate. 2147 */ 2148 ASSERT(prealloc); 2149 if (brkcow == 0 || !anon_share(amp->ahp, start_idx, pgcnt)) { 2150 VM_STAT_ADD(anonvmstats.getpages[28]); 2151 return (-1); 2152 } 2153 VM_STAT_ADD(anonvmstats.getpages[29]); 2154 goto docow; 2155 } 2156 2157 2158 /* 2159 * Turn a reference to an object or shared anon page 2160 * into a private page with a copy of the data from the 2161 * original page which is always locked by the caller. 2162 * This routine unloads the translation and unlocks the 2163 * original page, if it isn't being stolen, before returning 2164 * to the caller. 2165 * 2166 * NOTE: The original anon slot is not freed by this routine 2167 * It must be freed by the caller while holding the 2168 * "anon_map" lock to prevent races which can occur if 2169 * a process has multiple lwps in its address space. 2170 */ 2171 page_t * 2172 anon_private( 2173 struct anon **app, 2174 struct seg *seg, 2175 caddr_t addr, 2176 uint_t prot, 2177 page_t *opp, 2178 int oppflags, 2179 struct cred *cred) 2180 { 2181 struct anon *old = *app; 2182 struct anon *new; 2183 page_t *pp = NULL; 2184 struct vnode *vp; 2185 anoff_t off; 2186 page_t *anon_pl[1 + 1]; 2187 int err; 2188 2189 if (oppflags & STEAL_PAGE) 2190 ASSERT(PAGE_EXCL(opp)); 2191 else 2192 ASSERT(PAGE_LOCKED(opp)); 2193 2194 CPU_STATS_ADD_K(vm, cow_fault, 1); 2195 2196 /* Kernel probe */ 2197 TNF_PROBE_1(anon_private, "vm pagefault", /* CSTYLED */, 2198 tnf_opaque, address, addr); 2199 2200 *app = new = anon_alloc(NULL, 0); 2201 swap_xlate(new, &vp, &off); 2202 2203 if (oppflags & STEAL_PAGE) { 2204 page_rename(opp, vp, (u_offset_t)off); 2205 pp = opp; 2206 TRACE_5(TR_FAC_VM, TR_ANON_PRIVATE, 2207 "anon_private:seg %p addr %x pp %p vp %p off %lx", 2208 seg, addr, pp, vp, off); 2209 hat_setmod(pp); 2210 2211 /* bug 4026339 */ 2212 page_downgrade(pp); 2213 return (pp); 2214 } 2215 2216 /* 2217 * Call the VOP_GETPAGE routine to create the page, thereby 2218 * enabling the vnode driver to allocate any filesystem 2219 * space (e.g., disk block allocation for UFS). This also 2220 * prevents more than one page from being added to the 2221 * vnode at the same time. 
2222 */ 2223 err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, NULL, 2224 anon_pl, PAGESIZE, seg, addr, S_CREATE, cred); 2225 if (err) 2226 goto out; 2227 2228 pp = anon_pl[0]; 2229 2230 /* 2231 * If the original page was locked, we need to move the lock 2232 * to the new page by transfering 'cowcnt/lckcnt' of the original 2233 * page to 'cowcnt/lckcnt' of the new page. 2234 * 2235 * See Statement at the beginning of segvn_lockop() and 2236 * comments in page_pp_useclaim() regarding the way 2237 * cowcnts/lckcnts are handled. 2238 * 2239 * Also availrmem must be decremented up front for read only mapping 2240 * before calling page_pp_useclaim. page_pp_useclaim will bump it back 2241 * if availrmem did not need to be decremented after all. 2242 */ 2243 if (oppflags & LOCK_PAGE) { 2244 if ((prot & PROT_WRITE) == 0) { 2245 mutex_enter(&freemem_lock); 2246 if (availrmem > pages_pp_maximum) { 2247 availrmem--; 2248 pages_useclaim++; 2249 } else { 2250 mutex_exit(&freemem_lock); 2251 goto out; 2252 } 2253 mutex_exit(&freemem_lock); 2254 } 2255 page_pp_useclaim(opp, pp, prot & PROT_WRITE); 2256 } 2257 2258 /* 2259 * Now copy the contents from the original page, 2260 * which is locked and loaded in the MMU by 2261 * the caller to prevent yet another page fault. 2262 */ 2263 ppcopy(opp, pp); /* XXX - should set mod bit in here */ 2264 2265 hat_setrefmod(pp); /* mark as modified */ 2266 2267 /* 2268 * Unload the old translation. 2269 */ 2270 hat_unload(seg->s_as->a_hat, addr, PAGESIZE, HAT_UNLOAD); 2271 2272 /* 2273 * Free unmapped, unmodified original page. 2274 * or release the lock on the original page, 2275 * otherwise the process will sleep forever in 2276 * anon_decref() waiting for the "exclusive" lock 2277 * on the page. 2278 */ 2279 (void) page_release(opp, 1); 2280 2281 /* 2282 * we are done with page creation so downgrade the new 2283 * page's selock to shared, this helps when multiple 2284 * as_fault(...SOFTLOCK...) are done to the same 2285 * page(aio) 2286 */ 2287 page_downgrade(pp); 2288 2289 /* 2290 * NOTE: The original anon slot must be freed by the 2291 * caller while holding the "anon_map" lock, if we 2292 * copied away from an anonymous page. 2293 */ 2294 return (pp); 2295 2296 out: 2297 *app = old; 2298 if (pp) 2299 page_unlock(pp); 2300 anon_decref(new); 2301 page_unlock(opp); 2302 return ((page_t *)NULL); 2303 } 2304 2305 int 2306 anon_map_privatepages( 2307 struct anon_map *amp, 2308 ulong_t start_idx, 2309 uint_t szc, 2310 struct seg *seg, 2311 caddr_t addr, 2312 uint_t prot, 2313 page_t *ppa[], 2314 struct vpage vpage[], 2315 int anypgsz, 2316 struct cred *cred) 2317 { 2318 pgcnt_t pgcnt; 2319 struct vnode *vp; 2320 anoff_t off; 2321 page_t *pl[2], *conpp = NULL; 2322 int err; 2323 int prealloc = 1; 2324 struct anon *ap, *oldap; 2325 caddr_t vaddr; 2326 page_t *pplist, *pp; 2327 ulong_t pg_idx, an_idx; 2328 spgcnt_t nreloc = 0; 2329 int pagelock = 0; 2330 kmutex_t *ahmpages = NULL; 2331 #ifdef DEBUG 2332 int refcnt; 2333 #endif 2334 2335 ASSERT(szc != 0); 2336 ASSERT(szc == seg->s_szc); 2337 2338 VM_STAT_ADD(anonvmstats.privatepages[0]); 2339 2340 pgcnt = page_get_pagecnt(szc); 2341 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 2342 ASSERT(IS_P2ALIGNED(start_idx, pgcnt)); 2343 2344 ASSERT(amp != NULL); 2345 ap = anon_get_ptr(amp->ahp, start_idx); 2346 ASSERT(ap == NULL || ap->an_refcnt >= 1); 2347 2348 VM_STAT_COND_ADD(ap == NULL, anonvmstats.privatepages[1]); 2349 2350 /* 2351 * Now try and allocate the large page. If we fail then just 2352 * let VOP_GETPAGE give us PAGESIZE pages. 
Normally we let 2353 * the caller make this decision but to avoid added complexity 2354 * it's simplier to handle that case here. 2355 */ 2356 if (anypgsz == -1) { 2357 VM_STAT_ADD(anonvmstats.privatepages[2]); 2358 prealloc = 0; 2359 } else if (page_alloc_pages(seg, addr, &pplist, NULL, szc, 2360 anypgsz) != 0) { 2361 VM_STAT_ADD(anonvmstats.privatepages[3]); 2362 prealloc = 0; 2363 } 2364 2365 /* 2366 * make the decrement of all refcnts of all 2367 * anon slots of a large page appear atomic by 2368 * getting an anonpages_hash_lock for the 2369 * first anon slot of a large page. 2370 */ 2371 if (ap != NULL) { 2372 ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, 2373 ap->an_off)]; 2374 mutex_enter(ahmpages); 2375 if (ap->an_refcnt == 1) { 2376 VM_STAT_ADD(anonvmstats.privatepages[4]); 2377 ASSERT(!anon_share(amp->ahp, start_idx, pgcnt)); 2378 mutex_exit(ahmpages); 2379 2380 if (prealloc) { 2381 page_free_replacement_page(pplist); 2382 page_create_putback(pgcnt); 2383 } 2384 ASSERT(ppa[0]->p_szc <= szc); 2385 if (ppa[0]->p_szc == szc) { 2386 VM_STAT_ADD(anonvmstats.privatepages[5]); 2387 return (0); 2388 } 2389 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { 2390 ASSERT(ppa[pg_idx] != NULL); 2391 page_unlock(ppa[pg_idx]); 2392 } 2393 return (-1); 2394 } 2395 } 2396 2397 /* 2398 * If we are passed in the vpage array and this is 2399 * not PROT_WRITE then we need to decrement availrmem 2400 * up front before we try anything. If we need to and 2401 * can't decrement availrmem then its better to fail now 2402 * than in the middle of processing the new large page. 2403 * page_pp_usclaim() on behalf of each constituent page 2404 * below will adjust availrmem back for the cases not needed. 2405 */ 2406 if (vpage != NULL && (prot & PROT_WRITE) == 0) { 2407 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { 2408 if (VPP_ISPPLOCK(&vpage[pg_idx])) { 2409 pagelock = 1; 2410 break; 2411 } 2412 } 2413 if (pagelock) { 2414 VM_STAT_ADD(anonvmstats.privatepages[6]); 2415 mutex_enter(&freemem_lock); 2416 if (availrmem >= pages_pp_maximum + pgcnt) { 2417 availrmem -= pgcnt; 2418 pages_useclaim += pgcnt; 2419 } else { 2420 VM_STAT_ADD(anonvmstats.privatepages[7]); 2421 mutex_exit(&freemem_lock); 2422 if (ahmpages != NULL) { 2423 mutex_exit(ahmpages); 2424 } 2425 if (prealloc) { 2426 page_free_replacement_page(pplist); 2427 page_create_putback(pgcnt); 2428 } 2429 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) 2430 if (ppa[pg_idx] != NULL) 2431 page_unlock(ppa[pg_idx]); 2432 return (ENOMEM); 2433 } 2434 mutex_exit(&freemem_lock); 2435 } 2436 } 2437 2438 CPU_STATS_ADD_K(vm, cow_fault, pgcnt); 2439 2440 VM_STAT_ADD(anonvmstats.privatepages[8]); 2441 2442 an_idx = start_idx; 2443 pg_idx = 0; 2444 vaddr = addr; 2445 for (; pg_idx < pgcnt; pg_idx++, an_idx++, vaddr += PAGESIZE) { 2446 ASSERT(ppa[pg_idx] != NULL); 2447 oldap = anon_get_ptr(amp->ahp, an_idx); 2448 ASSERT(ahmpages != NULL || oldap == NULL); 2449 ASSERT(ahmpages == NULL || oldap != NULL); 2450 ASSERT(ahmpages == NULL || oldap->an_refcnt > 1); 2451 ASSERT(ahmpages == NULL || pg_idx != 0 || 2452 (refcnt = oldap->an_refcnt)); 2453 ASSERT(ahmpages == NULL || pg_idx == 0 || 2454 refcnt == oldap->an_refcnt); 2455 2456 ap = anon_alloc(NULL, 0); 2457 2458 swap_xlate(ap, &vp, &off); 2459 2460 /* 2461 * Now setup our preallocated page to pass down to 2462 * swap_getpage(). 
2463 */ 2464 if (prealloc) { 2465 pp = pplist; 2466 page_sub(&pplist, pp); 2467 conpp = pp; 2468 } 2469 2470 err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE, NULL, pl, 2471 PAGESIZE, conpp, &nreloc, seg, vaddr, S_CREATE, cred); 2472 2473 /* 2474 * Impossible to fail this is S_CREATE. 2475 */ 2476 if (err) 2477 panic("anon_map_privatepages: VOP_GETPAGE failed"); 2478 2479 ASSERT(prealloc ? pp == pl[0] : pl[0]->p_szc == 0); 2480 ASSERT(prealloc == 0 || nreloc == 1); 2481 2482 pp = pl[0]; 2483 2484 /* 2485 * If the original page was locked, we need to move 2486 * the lock to the new page by transfering 2487 * 'cowcnt/lckcnt' of the original page to 'cowcnt/lckcnt' 2488 * of the new page. pg_idx can be used to index 2489 * into the vpage array since the caller will guarentee 2490 * that vpage struct passed in corresponds to addr 2491 * and forward. 2492 */ 2493 if (vpage != NULL && VPP_ISPPLOCK(&vpage[pg_idx])) { 2494 page_pp_useclaim(ppa[pg_idx], pp, prot & PROT_WRITE); 2495 } else if (pagelock) { 2496 mutex_enter(&freemem_lock); 2497 availrmem++; 2498 pages_useclaim--; 2499 mutex_exit(&freemem_lock); 2500 } 2501 2502 /* 2503 * Now copy the contents from the original page. 2504 */ 2505 ppcopy(ppa[pg_idx], pp); 2506 2507 hat_setrefmod(pp); /* mark as modified */ 2508 2509 /* 2510 * Release the lock on the original page, 2511 * derement the old slot, and down grade the lock 2512 * on the new copy. 2513 */ 2514 page_unlock(ppa[pg_idx]); 2515 2516 if (!prealloc) 2517 page_downgrade(pp); 2518 2519 ppa[pg_idx] = pp; 2520 2521 /* 2522 * Now reflect the copy in the new anon array. 2523 */ 2524 ASSERT(ahmpages == NULL || oldap->an_refcnt > 1); 2525 if (oldap != NULL) 2526 anon_decref(oldap); 2527 (void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP); 2528 } 2529 if (ahmpages != NULL) { 2530 mutex_exit(ahmpages); 2531 } 2532 ASSERT(prealloc == 0 || pplist == NULL); 2533 if (prealloc) { 2534 VM_STAT_ADD(anonvmstats.privatepages[9]); 2535 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { 2536 page_downgrade(ppa[pg_idx]); 2537 } 2538 } 2539 2540 /* 2541 * Unload the old large page translation. 2542 */ 2543 hat_unload(seg->s_as->a_hat, addr, pgcnt << PAGESHIFT, HAT_UNLOAD); 2544 return (0); 2545 } 2546 2547 /* 2548 * Allocate a private zero-filled anon page. 2549 */ 2550 page_t * 2551 anon_zero(struct seg *seg, caddr_t addr, struct anon **app, struct cred *cred) 2552 { 2553 struct anon *ap; 2554 page_t *pp; 2555 struct vnode *vp; 2556 anoff_t off; 2557 page_t *anon_pl[1 + 1]; 2558 int err; 2559 2560 /* Kernel probe */ 2561 TNF_PROBE_1(anon_zero, "vm pagefault", /* CSTYLED */, 2562 tnf_opaque, address, addr); 2563 2564 *app = ap = anon_alloc(NULL, 0); 2565 swap_xlate(ap, &vp, &off); 2566 2567 /* 2568 * Call the VOP_GETPAGE routine to create the page, thereby 2569 * enabling the vnode driver to allocate any filesystem 2570 * dependent structures (e.g., disk block allocation for UFS). 2571 * This also prevents more than on page from being added to 2572 * the vnode at the same time since it is locked. 
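	 *
	 * For reference, an illustrative sketch of how a caller pairs this
	 * routine with anon_set_ptr() to install the new slot (amp, an_idx
	 * and the error handling are the caller's own; anon_map_createpages()
	 * below follows the same pattern for its PAGESIZE case):
	 *
	 *	pp = anon_zero(seg, addr, &ap, cred);
	 *	if (pp == NULL)
	 *		return (ENOMEM);
	 *	(void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP);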
2573 */ 2574 err = VOP_GETPAGE(vp, off, PAGESIZE, NULL, 2575 anon_pl, PAGESIZE, seg, addr, S_CREATE, cred); 2576 if (err) { 2577 *app = NULL; 2578 anon_decref(ap); 2579 return (NULL); 2580 } 2581 pp = anon_pl[0]; 2582 2583 pagezero(pp, 0, PAGESIZE); /* XXX - should set mod bit */ 2584 page_downgrade(pp); 2585 CPU_STATS_ADD_K(vm, zfod, 1); 2586 hat_setrefmod(pp); /* mark as modified so pageout writes back */ 2587 return (pp); 2588 } 2589 2590 2591 /* 2592 * Allocate array of private zero-filled anon pages for empty slots 2593 * and kept pages for non empty slots within given range. 2594 * 2595 * NOTE: This rontine will try and use large pages 2596 * if available and supported by underlying platform. 2597 */ 2598 int 2599 anon_map_createpages( 2600 struct anon_map *amp, 2601 ulong_t start_index, 2602 size_t len, 2603 page_t *ppa[], 2604 struct seg *seg, 2605 caddr_t addr, 2606 enum seg_rw rw, 2607 struct cred *cred) 2608 { 2609 2610 struct anon *ap; 2611 struct vnode *ap_vp; 2612 page_t *pp, *pplist, *anon_pl[1 + 1], *conpp = NULL; 2613 int err = 0; 2614 ulong_t p_index, index; 2615 pgcnt_t npgs, pg_cnt; 2616 spgcnt_t nreloc = 0; 2617 uint_t l_szc, szc, prot; 2618 anoff_t ap_off; 2619 size_t pgsz; 2620 lgrp_t *lgrp; 2621 2622 /* 2623 * XXX For now only handle S_CREATE. 2624 */ 2625 ASSERT(rw == S_CREATE); 2626 2627 index = start_index; 2628 p_index = 0; 2629 npgs = btopr(len); 2630 2631 /* 2632 * If this platform supports multiple page sizes 2633 * then try and allocate directly from the free 2634 * list for pages larger than PAGESIZE. 2635 * 2636 * NOTE:When we have page_create_ru we can stop 2637 * directly allocating from the freelist. 2638 */ 2639 l_szc = seg->s_szc; 2640 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 2641 while (npgs) { 2642 2643 /* 2644 * if anon slot already exists 2645 * (means page has been created) 2646 * so 1) look up the page 2647 * 2) if the page is still in memory, get it. 2648 * 3) if not, create a page and 2649 * page in from physical swap device. 2650 * These are done in anon_getpage(). 2651 */ 2652 ap = anon_get_ptr(amp->ahp, index); 2653 if (ap) { 2654 err = anon_getpage(&ap, &prot, anon_pl, PAGESIZE, 2655 seg, addr, S_READ, cred); 2656 if (err) { 2657 ANON_LOCK_EXIT(&->a_rwlock); 2658 panic("anon_map_createpages: anon_getpage"); 2659 } 2660 pp = anon_pl[0]; 2661 ppa[p_index++] = pp; 2662 2663 addr += PAGESIZE; 2664 index++; 2665 npgs--; 2666 continue; 2667 } 2668 /* 2669 * Now try and allocate the largest page possible 2670 * for the current address and range. 2671 * Keep dropping down in page size until: 2672 * 2673 * 1) Properly aligned 2674 * 2) Does not overlap existing anon pages 2675 * 3) Fits in remaining range. 2676 * 4) able to allocate one. 2677 * 2678 * NOTE: XXX When page_create_ru is completed this code 2679 * will change. 2680 */ 2681 szc = l_szc; 2682 pplist = NULL; 2683 pg_cnt = 0; 2684 while (szc) { 2685 pgsz = page_get_pagesize(szc); 2686 pg_cnt = pgsz >> PAGESHIFT; 2687 if (IS_P2ALIGNED(addr, pgsz) && pg_cnt <= npgs && 2688 anon_pages(amp->ahp, index, pg_cnt) == 0) { 2689 /* 2690 * XXX 2691 * Since we are faking page_create() 2692 * we also need to do the freemem and 2693 * pcf accounting. 
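				 *
				 * The pairing, in outline (an illustrative
				 * sketch of the code just below):
				 *
				 *	(void) page_create_wait(pg_cnt, PG_WAIT);
				 *	pplist = page_get_freelist(NULL, 0,
				 *	    seg, addr, pgsz, 0, lgrp);
				 *	if (pplist == NULL)
				 *		page_create_putback(pg_cnt);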
2694 */ 2695 (void) page_create_wait(pg_cnt, PG_WAIT); 2696 2697 /* 2698 * Get lgroup to allocate next page of shared 2699 * memory from and use it to specify where to 2700 * allocate the physical memory 2701 */ 2702 lgrp = lgrp_mem_choose(seg, addr, pgsz); 2703 2704 pplist = page_get_freelist( 2705 (struct vnode *)NULL, (u_offset_t)0, seg, 2706 addr, pgsz, 0, lgrp); 2707 2708 if (pplist == NULL) { 2709 page_create_putback(pg_cnt); 2710 } 2711 2712 /* 2713 * If a request for a page of size 2714 * larger than PAGESIZE failed 2715 * then don't try that size anymore. 2716 */ 2717 if (pplist == NULL) { 2718 l_szc = szc - 1; 2719 } else { 2720 break; 2721 } 2722 } 2723 szc--; 2724 } 2725 2726 /* 2727 * If just using PAGESIZE pages then don't 2728 * directly allocate from the free list. 2729 */ 2730 if (pplist == NULL) { 2731 ASSERT(szc == 0); 2732 pp = anon_zero(seg, addr, &ap, cred); 2733 if (pp == NULL) { 2734 ANON_LOCK_EXIT(&->a_rwlock); 2735 panic("anon_map_createpages: anon_zero"); 2736 } 2737 ppa[p_index++] = pp; 2738 2739 ASSERT(anon_get_ptr(amp->ahp, index) == NULL); 2740 (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP); 2741 2742 addr += PAGESIZE; 2743 index++; 2744 npgs--; 2745 continue; 2746 } 2747 2748 /* 2749 * pplist is a list of pg_cnt PAGESIZE pages. 2750 * These pages are locked SE_EXCL since they 2751 * came directly off the free list. 2752 */ 2753 ASSERT(IS_P2ALIGNED(pg_cnt, pg_cnt)); 2754 ASSERT(IS_P2ALIGNED(index, pg_cnt)); 2755 ASSERT(conpp == NULL); 2756 while (pg_cnt--) { 2757 2758 ap = anon_alloc(NULL, 0); 2759 swap_xlate(ap, &ap_vp, &ap_off); 2760 2761 ASSERT(pplist != NULL); 2762 pp = pplist; 2763 page_sub(&pplist, pp); 2764 PP_CLRFREE(pp); 2765 PP_CLRAGED(pp); 2766 conpp = pp; 2767 2768 err = swap_getconpage(ap_vp, ap_off, PAGESIZE, 2769 (uint_t *)NULL, anon_pl, PAGESIZE, conpp, &nreloc, 2770 seg, addr, S_CREATE, cred); 2771 2772 if (err) { 2773 ANON_LOCK_EXIT(&->a_rwlock); 2774 panic("anon_map_createpages: S_CREATE"); 2775 } 2776 2777 ASSERT(anon_pl[0] == pp); 2778 ASSERT(nreloc == 1); 2779 pagezero(pp, 0, PAGESIZE); 2780 CPU_STATS_ADD_K(vm, zfod, 1); 2781 hat_setrefmod(pp); 2782 2783 ASSERT(anon_get_ptr(amp->ahp, index) == NULL); 2784 (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP); 2785 2786 ppa[p_index++] = pp; 2787 2788 addr += PAGESIZE; 2789 index++; 2790 npgs--; 2791 } 2792 conpp = NULL; 2793 pg_cnt = pgsz >> PAGESHIFT; 2794 p_index = p_index - pg_cnt; 2795 while (pg_cnt--) { 2796 page_downgrade(ppa[p_index++]); 2797 } 2798 } 2799 ANON_LOCK_EXIT(&->a_rwlock); 2800 return (0); 2801 } 2802 2803 int 2804 anon_map_demotepages( 2805 struct anon_map *amp, 2806 ulong_t start_idx, 2807 struct seg *seg, 2808 caddr_t addr, 2809 uint_t prot, 2810 struct vpage vpage[], 2811 struct cred *cred) 2812 { 2813 struct anon *ap; 2814 uint_t szc = seg->s_szc; 2815 pgcnt_t pgcnt = page_get_pagecnt(szc); 2816 size_t ppasize = pgcnt * sizeof (page_t *); 2817 page_t **ppa = kmem_alloc(ppasize, KM_SLEEP); 2818 page_t *pp; 2819 page_t *pl[2]; 2820 pgcnt_t i, pg_idx; 2821 ulong_t an_idx; 2822 caddr_t vaddr; 2823 kmutex_t *ahmpages = NULL; 2824 int err; 2825 int retry = 0; 2826 uint_t vpprot; 2827 2828 ASSERT(RW_WRITE_HELD(&->a_rwlock)); 2829 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 2830 ASSERT(IS_P2ALIGNED(start_idx, pgcnt)); 2831 ASSERT(ppa != NULL); 2832 2833 VM_STAT_ADD(anonvmstats.demotepages[0]); 2834 2835 ap = anon_get_ptr(amp->ahp, start_idx); 2836 if (ap != NULL) { 2837 VM_STAT_ADD(anonvmstats.demotepages[1]); 2838 ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, 
ap->an_off)]; 2839 mutex_enter(ahmpages); 2840 } 2841 top: 2842 if (ap == NULL || ap->an_refcnt <= 1) { 2843 int root = 0; 2844 pgcnt_t npgs, curnpgs = 0; 2845 2846 VM_STAT_ADD(anonvmstats.demotepages[2]); 2847 2848 ASSERT(retry == 0 || ap != NULL); 2849 2850 if (ahmpages != NULL) 2851 mutex_exit(ahmpages); 2852 an_idx = start_idx; 2853 for (i = 0; i < pgcnt; i++, an_idx++) { 2854 ap = anon_get_ptr(amp->ahp, an_idx); 2855 if (ap != NULL) { 2856 ASSERT(ap->an_refcnt == 1); 2857 pp = ppa[i] = page_lookup(ap->an_vp, ap->an_off, 2858 SE_EXCL); 2859 if (pp != NULL) { 2860 (void) hat_pageunload(pp, 2861 HAT_FORCE_PGUNLOAD); 2862 } 2863 } else { 2864 ppa[i] = NULL; 2865 } 2866 } 2867 for (i = 0; i < pgcnt; i++) { 2868 if ((pp = ppa[i]) != NULL && pp->p_szc != 0) { 2869 ASSERT(pp->p_szc <= szc); 2870 if (!root) { 2871 VM_STAT_ADD(anonvmstats.demotepages[3]); 2872 if (curnpgs != 0) 2873 panic("anon_map_demotepages: " 2874 "bad large page"); 2875 2876 root = 1; 2877 curnpgs = npgs = 2878 page_get_pagecnt(pp->p_szc); 2879 2880 ASSERT(npgs <= pgcnt); 2881 ASSERT(IS_P2ALIGNED(npgs, npgs)); 2882 ASSERT(!(page_pptonum(pp) & 2883 (npgs - 1))); 2884 } else { 2885 ASSERT(i > 0); 2886 ASSERT(page_pptonum(pp) - 1 == 2887 page_pptonum(ppa[i - 1])); 2888 if ((page_pptonum(pp) & (npgs - 1)) == 2889 npgs - 1) 2890 root = 0; 2891 } 2892 ASSERT(PAGE_EXCL(pp)); 2893 pp->p_szc = 0; 2894 curnpgs--; 2895 } 2896 } 2897 if (root != 0 || curnpgs != 0) 2898 panic("anon_map_demotepages: bad large page"); 2899 2900 for (i = 0; i < pgcnt; i++) { 2901 if ((pp = ppa[i]) != NULL) { 2902 ASSERT(!hat_page_is_mapped(pp)); 2903 ASSERT(pp->p_szc == 0); 2904 page_unlock(pp); 2905 } 2906 } 2907 kmem_free(ppa, ppasize); 2908 return (0); 2909 } 2910 ASSERT(ahmpages != NULL); 2911 mutex_exit(ahmpages); 2912 ahmpages = NULL; 2913 2914 VM_STAT_ADD(anonvmstats.demotepages[4]); 2915 2916 ASSERT(retry == 0); /* we can be here only once */ 2917 2918 vaddr = addr; 2919 for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt; 2920 pg_idx++, an_idx++, vaddr += PAGESIZE) { 2921 ap = anon_get_ptr(amp->ahp, an_idx); 2922 if (ap == NULL) 2923 panic("anon_map_demotepages: no anon slot"); 2924 err = anon_getpage(&ap, &vpprot, pl, PAGESIZE, seg, vaddr, 2925 S_READ, cred); 2926 if (err) { 2927 for (i = 0; i < pg_idx; i++) { 2928 if ((pp = ppa[i]) != NULL) 2929 page_unlock(pp); 2930 } 2931 kmem_free(ppa, ppasize); 2932 return (err); 2933 } 2934 ppa[pg_idx] = pl[0]; 2935 } 2936 2937 err = anon_map_privatepages(amp, start_idx, szc, seg, addr, prot, ppa, 2938 vpage, -1, cred); 2939 if (err > 0) { 2940 VM_STAT_ADD(anonvmstats.demotepages[5]); 2941 kmem_free(ppa, ppasize); 2942 return (err); 2943 } 2944 ASSERT(err == 0 || err == -1); 2945 if (err == -1) { 2946 VM_STAT_ADD(anonvmstats.demotepages[6]); 2947 retry = 1; 2948 goto top; 2949 } 2950 for (i = 0; i < pgcnt; i++) { 2951 ASSERT(ppa[i] != NULL); 2952 if (ppa[i]->p_szc != 0) 2953 retry = 1; 2954 page_unlock(ppa[i]); 2955 } 2956 if (retry) { 2957 VM_STAT_ADD(anonvmstats.demotepages[7]); 2958 goto top; 2959 } 2960 2961 VM_STAT_ADD(anonvmstats.demotepages[8]); 2962 2963 kmem_free(ppa, ppasize); 2964 2965 return (0); 2966 } 2967 2968 /* 2969 * Allocate and initialize an anon_map structure for seg 2970 * associating the given swap reservation with the new anon_map. 
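 *
 * An illustrative sketch of a hypothetical caller ("size" and "swresv"
 * stand for values the caller has already computed, and the caller is
 * assumed to have reserved the swap, e.g. with anon_resv()):
 *
 *	struct anon_map *amp;
 *
 *	amp = anonmap_alloc(size, swresv);
 *
 * The map comes back with refcnt set to 1.  When the last reference is
 * dropped (under a_rwlock, as writer), the remaining anon slots are
 * released and the map itself is destroyed:
 *
 *	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
 *	if (--amp->refcnt == 0) {
 *		... anon_free()/anon_decref() the remaining slots ...
 *		ANON_LOCK_EXIT(&amp->a_rwlock);
 *		anonmap_free(amp);
 *	} else {
 *		ANON_LOCK_EXIT(&amp->a_rwlock);
 *	}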
 */
struct anon_map *
anonmap_alloc(size_t size, size_t swresv)
{
	struct anon_map *amp;

	amp = kmem_cache_alloc(anonmap_cache, KM_SLEEP);

	amp->refcnt = 1;
	amp->size = size;

	amp->ahp = anon_create(btopr(size), ANON_SLEEP);
	amp->swresv = swresv;
	amp->locality = 0;
	amp->a_szc = 0;
	return (amp);
}

void
anonmap_free(struct anon_map *amp)
{
	ASSERT(amp->ahp);
	ASSERT(amp->refcnt == 0);

	lgrp_shm_policy_fini(amp, NULL);
	anon_release(amp->ahp, btopr(amp->size));
	kmem_cache_free(anonmap_cache, amp);
}

/*
 * Returns true if the ahp array has some empty slots.
 * The offp and lenp parameters are in/out parameters.  On entry
 * these values represent the starting offset and length of the
 * mapping.  When true is returned, these values may be modified
 * to be the largest range which includes empty slots.
 */
int
non_anon(struct anon_hdr *ahp, ulong_t anon_idx, u_offset_t *offp,
    size_t *lenp)
{
	ulong_t i, el;
	ssize_t low, high;
	struct anon *ap;

	low = -1;
	for (i = 0, el = *lenp; i < el; i += PAGESIZE, anon_idx++) {
		ap = anon_get_ptr(ahp, anon_idx);
		if (ap == NULL) {
			if (low == -1)
				low = i;
			high = i;
		}
	}
	if (low != -1) {
		/*
		 * Found at least one non-anon page.
		 * Set up the off and len return values.
		 */
		if (low != 0)
			*offp += low;
		*lenp = high - low + PAGESIZE;
		return (1);
	}
	return (0);
}

/*
 * Return a count of the number of existing anon pages in the anon array
 * ahp in the slot range [anon_index, anon_index + nslots).  The array and
 * slots must be guaranteed stable by the caller.
 */
pgcnt_t
anon_pages(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots)
{
	pgcnt_t cnt = 0;

	while (nslots-- > 0) {
		if ((anon_get_ptr(ahp, anon_index)) != NULL)
			cnt++;
		anon_index++;
	}
	return (cnt);
}

/*
 * Move reserved phys swap into memory swap (unreserve phys swap
 * and reserve mem swap by the same amount).
 * Used by segspt when it needs to lock reserved swap npages in memory.
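 *
 * In outline, a client such as segspt pairs the two calls (an
 * illustrative sketch; the actual page locking and error handling
 * belong to the caller):
 *
 *	if (anon_swap_adjust(npages) != 0)
 *		return (ENOMEM);
 *	... lock the npages pages in memory ...
 *	anon_swap_restore(npages);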
 */
int
anon_swap_adjust(pgcnt_t npages)
{
	pgcnt_t unlocked_mem_swap;

	mutex_enter(&anoninfo_lock);

	ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
	ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);

	unlocked_mem_swap = k_anoninfo.ani_mem_resv
	    - k_anoninfo.ani_locked_swap;
	if (npages > unlocked_mem_swap) {
		spgcnt_t adjusted_swap = npages - unlocked_mem_swap;

		/*
		 * If there is not enough unlocked mem swap, take the
		 * missing amount from phys swap and give it to mem swap.
		 */
		mutex_enter(&freemem_lock);
		if (availrmem < adjusted_swap + segspt_minfree) {
			mutex_exit(&freemem_lock);
			mutex_exit(&anoninfo_lock);
			return (ENOMEM);
		}
		availrmem -= adjusted_swap;
		mutex_exit(&freemem_lock);

		k_anoninfo.ani_mem_resv += adjusted_swap;
		ASSERT(k_anoninfo.ani_phys_resv >= adjusted_swap);
		k_anoninfo.ani_phys_resv -= adjusted_swap;

		ANI_ADD(adjusted_swap);
	}
	k_anoninfo.ani_locked_swap += npages;

	ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
	ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);

	mutex_exit(&anoninfo_lock);

	return (0);
}

/*
 * 'Unlock' reserved mem swap so that, when it is unreserved, it
 * can be moved back to phys (disk) swap.
 */
void
anon_swap_restore(pgcnt_t npages)
{
	mutex_enter(&anoninfo_lock);

	ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv);

	ASSERT(k_anoninfo.ani_locked_swap >= npages);
	k_anoninfo.ani_locked_swap -= npages;

	ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv);

	mutex_exit(&anoninfo_lock);
}

/*
 * Return the pointer from the list for a
 * specified anon index.
 */
ulong_t *
anon_get_slot(struct anon_hdr *ahp, ulong_t an_idx)
{
	struct anon **app;
	void **ppp;

	ASSERT(an_idx < ahp->size);

	/*
	 * Single level case.
	 */
	if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) {
		return ((ulong_t *)&ahp->array_chunk[an_idx]);
	} else {

		/*
		 * 2 level case.
		 */
		ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
		if (*ppp == NULL) {
			mutex_enter(&ahp->serial_lock);
			ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
			if (*ppp == NULL)
				*ppp = kmem_zalloc(PAGESIZE, KM_SLEEP);
			mutex_exit(&ahp->serial_lock);
		}
		app = *ppp;
		return ((ulong_t *)&app[an_idx & ANON_CHUNK_OFF]);
	}
}

void
anon_array_enter(struct anon_map *amp, ulong_t an_idx, anon_sync_obj_t *sobj)
{
	ulong_t *ap_slot;
	kmutex_t *mtx;
	kcondvar_t *cv;
	int hash;

	/*
	 * Use szc to determine the anon slot(s) that must appear atomic.
	 * If szc = 0, then lock the anon slot and mark it busy.
	 * If szc > 0, then lock the range of slots by getting the
	 * anon_array_lock for the first anon slot, and mark only the
	 * first anon slot busy to represent the whole range being busy.
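	 *
	 * For example (illustrative numbers only), with
	 * page_get_pagecnt(amp->a_szc) == 8:
	 *
	 *	an_idx = P2ALIGN(13, 8);	yields 8
	 *
	 * so slot 8 alone is marked busy on behalf of slots 8 through 15.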
	 */

	ASSERT(RW_READ_HELD(&amp->a_rwlock));
	an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc));
	hash = ANON_ARRAY_HASH(amp, an_idx);
	sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex;
	sobj->sync_cv = cv = &anon_array_cv[hash];
	mutex_enter(mtx);
	ap_slot = anon_get_slot(amp->ahp, an_idx);
	while (ANON_ISBUSY(ap_slot))
		cv_wait(cv, mtx);
	ANON_SETBUSY(ap_slot);
	sobj->sync_data = ap_slot;
	mutex_exit(mtx);
}

void
anon_array_exit(anon_sync_obj_t *sobj)
{
	mutex_enter(sobj->sync_mutex);
	ASSERT(ANON_ISBUSY(sobj->sync_data));
	ANON_CLRBUSY(sobj->sync_data);
	if (CV_HAS_WAITERS(sobj->sync_cv))
		cv_broadcast(sobj->sync_cv);
	mutex_exit(sobj->sync_mutex);
}
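
#ifdef ANON_EXAMPLE	/* hypothetical guard; the sketch below is not built */
/*
 * Illustrative sketch, not part of the original source: the lookup
 * protocol a segment driver typically follows around a single anon slot.
 * The caller is assumed to hold amp->a_rwlock at least as reader, as
 * anon_array_enter() asserts, and the helper name is hypothetical.
 */
static struct anon *
anon_slot_lookup_example(struct anon_map *amp, ulong_t an_idx)
{
	anon_sync_obj_t cookie;
	struct anon *ap;

	anon_array_enter(amp, an_idx, &cookie);	/* mark the slot busy */
	ap = anon_get_ptr(amp->ahp, an_idx);	/* stable while marked busy */
	anon_array_exit(&cookie);		/* clear busy, wake waiters */
	return (ap);
}
#endif	/* ANON_EXAMPLE */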