1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 37 */ 38 39 #pragma ident "%Z%%M% %I% %E% SMI" 40 41 /* 42 * VM - anonymous pages. 43 * 44 * This layer sits immediately above the vm_swap layer. It manages 45 * physical pages that have no permanent identity in the file system 46 * name space, using the services of the vm_swap layer to allocate 47 * backing storage for these pages. Since these pages have no external 48 * identity, they are discarded when the last reference is removed. 
49 * 50 * An important function of this layer is to manage low-level sharing 51 * of pages that are logically distinct but that happen to be 52 * physically identical (e.g., the corresponding pages of the processes 53 * resulting from a fork before one process or the other changes their 54 * contents). This pseudo-sharing is present only as an optimization 55 * and is not to be confused with true sharing in which multiple 56 * address spaces deliberately contain references to the same object; 57 * such sharing is managed at a higher level. 58 * 59 * The key data structure here is the anon struct, which contains a 60 * reference count for its associated physical page and a hint about 61 * the identity of that page. Anon structs typically live in arrays, 62 * with an instance's position in its array determining where the 63 * corresponding backing storage is allocated; however, the swap_xlate() 64 * routine abstracts away this representation information so that the 65 * rest of the anon layer need not know it. (See the swap layer for 66 * more details on anon struct layout.) 67 * 68 * In future versions of the system, the association between an 69 * anon struct and its position on backing store will change so that 70 * we don't require backing store for all anonymous pages in the system. 71 * This is an important consideration for large memory systems. 72 * We can also use this technique to delay binding physical locations 73 * to anonymous pages until pageout/swapout time where we can make 74 * smarter allocation decisions to improve anonymous klustering. 75 * 76 * Many of the routines defined here take a (struct anon **) argument, 77 * which allows the code at this level to manage anon pages directly, 78 * so that callers can regard anon structs as opaque objects and not be 79 * concerned with assigning or inspecting their contents. 80 * 81 * Clients of this layer refer to anon pages indirectly.
That is, they 82 * maintain arrays of pointers to anon structs rather than maintaining 83 * anon structs themselves. The (struct anon **) arguments mentioned 84 * above are pointers to entries in these arrays. It is these arrays 85 * that capture the mapping between offsets within a given segment and 86 * the corresponding anonymous backing storage address. 87 */ 88 89 #ifdef DEBUG 90 #define ANON_DEBUG 91 #endif 92 93 #include <sys/types.h> 94 #include <sys/t_lock.h> 95 #include <sys/param.h> 96 #include <sys/systm.h> 97 #include <sys/mman.h> 98 #include <sys/cred.h> 99 #include <sys/thread.h> 100 #include <sys/vnode.h> 101 #include <sys/cpuvar.h> 102 #include <sys/swap.h> 103 #include <sys/cmn_err.h> 104 #include <sys/vtrace.h> 105 #include <sys/kmem.h> 106 #include <sys/sysmacros.h> 107 #include <sys/bitmap.h> 108 #include <sys/vmsystm.h> 109 #include <sys/debug.h> 110 #include <sys/fs/swapnode.h> 111 #include <sys/tnf_probe.h> 112 #include <sys/lgrp.h> 113 #include <sys/policy.h> 114 #include <sys/condvar_impl.h> 115 #include <sys/mutex_impl.h> 116 #include <sys/rctl.h> 117 118 #include <vm/as.h> 119 #include <vm/hat.h> 120 #include <vm/anon.h> 121 #include <vm/page.h> 122 #include <vm/vpage.h> 123 #include <vm/seg.h> 124 #include <vm/rm.h> 125 126 #include <fs/fs_subr.h> 127 128 struct vnode *anon_vp; 129 130 int anon_debug; 131 132 kmutex_t anoninfo_lock; 133 struct k_anoninfo k_anoninfo; 134 ani_free_t ani_free_pool[ANI_MAX_POOL]; 135 pad_mutex_t anon_array_lock[ANON_LOCKSIZE]; 136 kcondvar_t anon_array_cv[ANON_LOCKSIZE]; 137 138 /* 139 * Global hash table for (vp, off) -> anon slot 140 */ 141 extern int swap_maxcontig; 142 size_t anon_hash_size; 143 struct anon **anon_hash; 144 145 static struct kmem_cache *anon_cache; 146 static struct kmem_cache *anonmap_cache; 147 148 #ifdef VM_STATS 149 static struct anonvmstats_str { 150 ulong_t getpages[30]; 151 ulong_t privatepages[10]; 152 ulong_t demotepages[9]; 153 ulong_t decrefpages[9]; 154 ulong_t 
dupfillholes[4]; 155 ulong_t freepages[1]; 156 } anonvmstats; 157 #endif /* VM_STATS */ 158 159 160 /*ARGSUSED*/ 161 static int 162 anonmap_cache_constructor(void *buf, void *cdrarg, int kmflags) 163 { 164 struct anon_map *amp = buf; 165 166 rw_init(&->a_rwlock, NULL, RW_DEFAULT, NULL); 167 return (0); 168 } 169 170 /*ARGSUSED1*/ 171 static void 172 anonmap_cache_destructor(void *buf, void *cdrarg) 173 { 174 struct anon_map *amp = buf; 175 176 rw_destroy(&->a_rwlock); 177 } 178 179 kmutex_t anonhash_lock[AH_LOCK_SIZE]; 180 kmutex_t anonpages_hash_lock[AH_LOCK_SIZE]; 181 182 void 183 anon_init(void) 184 { 185 int i; 186 187 anon_hash_size = 1L << highbit(physmem / ANON_HASHAVELEN); 188 189 for (i = 0; i < AH_LOCK_SIZE; i++) { 190 mutex_init(&anonhash_lock[i], NULL, MUTEX_DEFAULT, NULL); 191 mutex_init(&anonpages_hash_lock[i], NULL, MUTEX_DEFAULT, NULL); 192 } 193 194 for (i = 0; i < ANON_LOCKSIZE; i++) { 195 mutex_init(&anon_array_lock[i].pad_mutex, NULL, 196 MUTEX_DEFAULT, NULL); 197 cv_init(&anon_array_cv[i], NULL, CV_DEFAULT, NULL); 198 } 199 200 anon_hash = (struct anon **) 201 kmem_zalloc(sizeof (struct anon *) * anon_hash_size, KM_SLEEP); 202 anon_cache = kmem_cache_create("anon_cache", sizeof (struct anon), 203 AN_CACHE_ALIGN, NULL, NULL, NULL, NULL, NULL, 0); 204 anonmap_cache = kmem_cache_create("anonmap_cache", 205 sizeof (struct anon_map), 0, 206 anonmap_cache_constructor, anonmap_cache_destructor, NULL, 207 NULL, NULL, 0); 208 swap_maxcontig = (1024 * 1024) >> PAGESHIFT; /* 1MB of pages */ 209 210 anon_vp = vn_alloc(KM_SLEEP); 211 vn_setops(anon_vp, swap_vnodeops); 212 anon_vp->v_type = VREG; 213 anon_vp->v_flag |= (VISSWAP|VISSWAPFS); 214 } 215 216 /* 217 * Global anon slot hash table manipulation. 
218 */ 219 220 static void 221 anon_addhash(struct anon *ap) 222 { 223 int index; 224 225 ASSERT(MUTEX_HELD(&anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)])); 226 index = ANON_HASH(ap->an_vp, ap->an_off); 227 ap->an_hash = anon_hash[index]; 228 anon_hash[index] = ap; 229 } 230 231 static void 232 anon_rmhash(struct anon *ap) 233 { 234 struct anon **app; 235 236 ASSERT(MUTEX_HELD(&anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)])); 237 238 for (app = &anon_hash[ANON_HASH(ap->an_vp, ap->an_off)]; 239 *app; app = &((*app)->an_hash)) { 240 if (*app == ap) { 241 *app = ap->an_hash; 242 break; 243 } 244 } 245 } 246 247 /* 248 * The anon array interfaces. Functions allocating, 249 * freeing array of pointers, and returning/setting 250 * entries in the array of pointers for a given offset. 251 * 252 * Create the list of pointers 253 */ 254 struct anon_hdr * 255 anon_create(pgcnt_t npages, int flags) 256 { 257 struct anon_hdr *ahp; 258 ulong_t nchunks; 259 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 260 261 if ((ahp = kmem_zalloc(sizeof (struct anon_hdr), kmemflags)) == NULL) { 262 return (NULL); 263 } 264 265 mutex_init(&ahp->serial_lock, NULL, MUTEX_DEFAULT, NULL); 266 /* 267 * Single level case. 268 */ 269 ahp->size = npages; 270 if (npages <= ANON_CHUNK_SIZE || (flags & ANON_ALLOC_FORCE)) { 271 272 if (flags & ANON_ALLOC_FORCE) 273 ahp->flags |= ANON_ALLOC_FORCE; 274 275 ahp->array_chunk = kmem_zalloc( 276 ahp->size * sizeof (struct anon *), kmemflags); 277 278 if (ahp->array_chunk == NULL) { 279 kmem_free(ahp, sizeof (struct anon_hdr)); 280 return (NULL); 281 } 282 } else { 283 /* 284 * 2 Level case. 285 * anon hdr size needs to be rounded off to be a multiple 286 * of ANON_CHUNK_SIZE. This is important as various anon 287 * related functions depend on this. 288 * NOTE - 289 * anon_grow() makes anon hdr size a multiple of 290 * ANON_CHUNK_SIZE. 291 * amp size is <= anon hdr size. 292 * anon_index + seg_pgs <= anon hdr size. 
293 */ 294 ahp->size = P2ROUNDUP(npages, ANON_CHUNK_SIZE); 295 nchunks = ahp->size >> ANON_CHUNK_SHIFT; 296 297 ahp->array_chunk = kmem_zalloc(nchunks * sizeof (ulong_t *), 298 kmemflags); 299 300 if (ahp->array_chunk == NULL) { 301 kmem_free(ahp, sizeof (struct anon_hdr)); 302 return (NULL); 303 } 304 } 305 return (ahp); 306 } 307 308 /* 309 * Free the array of pointers 310 */ 311 void 312 anon_release(struct anon_hdr *ahp, pgcnt_t npages) 313 { 314 ulong_t i; 315 void **ppp; 316 ulong_t nchunks; 317 318 ASSERT(npages <= ahp->size); 319 320 /* 321 * Single level case. 322 */ 323 if (npages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 324 kmem_free(ahp->array_chunk, ahp->size * sizeof (struct anon *)); 325 } else { 326 /* 327 * 2 level case. 328 */ 329 nchunks = ahp->size >> ANON_CHUNK_SHIFT; 330 for (i = 0; i < nchunks; i++) { 331 ppp = &ahp->array_chunk[i]; 332 if (*ppp != NULL) 333 kmem_free(*ppp, PAGESIZE); 334 } 335 kmem_free(ahp->array_chunk, nchunks * sizeof (ulong_t *)); 336 } 337 mutex_destroy(&ahp->serial_lock); 338 kmem_free(ahp, sizeof (struct anon_hdr)); 339 } 340 341 /* 342 * Return the pointer from the list for a 343 * specified anon index. 344 */ 345 struct anon * 346 anon_get_ptr(struct anon_hdr *ahp, ulong_t an_idx) 347 { 348 struct anon **app; 349 350 ASSERT(an_idx < ahp->size); 351 352 /* 353 * Single level case. 354 */ 355 if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 356 return ((struct anon *) 357 ((uintptr_t)ahp->array_chunk[an_idx] & ANON_PTRMASK)); 358 } else { 359 360 /* 361 * 2 level case. 362 */ 363 app = ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 364 if (app) { 365 return ((struct anon *) 366 ((uintptr_t)app[an_idx & ANON_CHUNK_OFF] & 367 ANON_PTRMASK)); 368 } else { 369 return (NULL); 370 } 371 } 372 } 373 374 /* 375 * Return the anon pointer for the first valid entry in the anon list, 376 * starting from the given index. 
377 */ 378 struct anon * 379 anon_get_next_ptr(struct anon_hdr *ahp, ulong_t *index) 380 { 381 struct anon *ap; 382 struct anon **app; 383 ulong_t chunkoff; 384 ulong_t i; 385 ulong_t j; 386 pgcnt_t size; 387 388 i = *index; 389 size = ahp->size; 390 391 ASSERT(i < size); 392 393 if ((size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 394 /* 395 * 1 level case 396 */ 397 while (i < size) { 398 ap = (struct anon *) 399 ((uintptr_t)ahp->array_chunk[i] & ANON_PTRMASK); 400 if (ap) { 401 *index = i; 402 return (ap); 403 } 404 i++; 405 } 406 } else { 407 /* 408 * 2 level case 409 */ 410 chunkoff = i & ANON_CHUNK_OFF; 411 while (i < size) { 412 app = ahp->array_chunk[i >> ANON_CHUNK_SHIFT]; 413 if (app) 414 for (j = chunkoff; j < ANON_CHUNK_SIZE; j++) { 415 ap = (struct anon *) 416 ((uintptr_t)app[j] & 417 ANON_PTRMASK); 418 if (ap) { 419 *index = i + (j - chunkoff); 420 return (ap); 421 } 422 } 423 chunkoff = 0; 424 i = (i + ANON_CHUNK_SIZE) & ~ANON_CHUNK_OFF; 425 } 426 } 427 *index = size; 428 return (NULL); 429 } 430 431 /* 432 * Set list entry with a given pointer for a specified offset 433 */ 434 int 435 anon_set_ptr(struct anon_hdr *ahp, ulong_t an_idx, struct anon *ap, int flags) 436 { 437 void **ppp; 438 struct anon **app; 439 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 440 uintptr_t *ap_addr; 441 442 ASSERT(an_idx < ahp->size); 443 444 /* 445 * Single level case. 446 */ 447 if (ahp->size <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 448 ap_addr = (uintptr_t *)&ahp->array_chunk[an_idx]; 449 } else { 450 451 /* 452 * 2 level case. 
453 */ 454 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 455 456 ASSERT(ppp != NULL); 457 if (*ppp == NULL) { 458 mutex_enter(&ahp->serial_lock); 459 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 460 if (*ppp == NULL) { 461 *ppp = kmem_zalloc(PAGESIZE, kmemflags); 462 if (*ppp == NULL) { 463 mutex_exit(&ahp->serial_lock); 464 return (ENOMEM); 465 } 466 } 467 mutex_exit(&ahp->serial_lock); 468 } 469 app = *ppp; 470 ap_addr = (uintptr_t *)&app[an_idx & ANON_CHUNK_OFF]; 471 } 472 *ap_addr = (*ap_addr & ~ANON_PTRMASK) | (uintptr_t)ap; 473 return (0); 474 } 475 476 /* 477 * Copy anon array into a given new anon array 478 */ 479 int 480 anon_copy_ptr(struct anon_hdr *sahp, ulong_t s_idx, 481 struct anon_hdr *dahp, ulong_t d_idx, 482 pgcnt_t npages, int flags) 483 { 484 void **sapp, **dapp; 485 void *ap; 486 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 487 488 ASSERT((s_idx < sahp->size) && (d_idx < dahp->size)); 489 ASSERT((npages <= sahp->size) && (npages <= dahp->size)); 490 491 /* 492 * Both arrays are 1 level. 493 */ 494 if (((sahp->size <= ANON_CHUNK_SIZE) && 495 (dahp->size <= ANON_CHUNK_SIZE)) || 496 ((sahp->flags & ANON_ALLOC_FORCE) && 497 (dahp->flags & ANON_ALLOC_FORCE))) { 498 499 bcopy(&sahp->array_chunk[s_idx], &dahp->array_chunk[d_idx], 500 npages * sizeof (struct anon *)); 501 return (0); 502 } 503 504 /* 505 * Both arrays are 2 levels. 
506 */ 507 if (sahp->size > ANON_CHUNK_SIZE && 508 dahp->size > ANON_CHUNK_SIZE && 509 ((sahp->flags & ANON_ALLOC_FORCE) == 0) && 510 ((dahp->flags & ANON_ALLOC_FORCE) == 0)) { 511 512 ulong_t sapidx, dapidx; 513 ulong_t *sap, *dap; 514 ulong_t chknp; 515 516 while (npages != 0) { 517 518 sapidx = s_idx & ANON_CHUNK_OFF; 519 dapidx = d_idx & ANON_CHUNK_OFF; 520 chknp = ANON_CHUNK_SIZE - MAX(sapidx, dapidx); 521 if (chknp > npages) 522 chknp = npages; 523 524 sapp = &sahp->array_chunk[s_idx >> ANON_CHUNK_SHIFT]; 525 if ((sap = *sapp) != NULL) { 526 dapp = &dahp->array_chunk[d_idx 527 >> ANON_CHUNK_SHIFT]; 528 if ((dap = *dapp) == NULL) { 529 *dapp = kmem_zalloc(PAGESIZE, 530 kmemflags); 531 if ((dap = *dapp) == NULL) 532 return (ENOMEM); 533 } 534 bcopy((sap + sapidx), (dap + dapidx), 535 chknp << ANON_PTRSHIFT); 536 } 537 s_idx += chknp; 538 d_idx += chknp; 539 npages -= chknp; 540 } 541 return (0); 542 } 543 544 /* 545 * At least one of the arrays is 2 level. 546 */ 547 while (npages--) { 548 if ((ap = anon_get_ptr(sahp, s_idx)) != NULL) { 549 ASSERT(!ANON_ISBUSY(anon_get_slot(sahp, s_idx))); 550 if (anon_set_ptr(dahp, d_idx, ap, flags) == ENOMEM) 551 return (ENOMEM); 552 } 553 s_idx++; 554 d_idx++; 555 } 556 return (0); 557 } 558 559 560 /* 561 * ANON_INITBUF is a convenience macro for anon_grow() below. It 562 * takes a buffer dst, which is at least as large as buffer src. It 563 * does a bcopy from src into dst, and then bzeros the extra bytes 564 * of dst. If tail is set, the data in src is tail aligned within 565 * dst instead of head aligned. 
566 */ 567 568 #define ANON_INITBUF(src, srclen, dst, dstsize, tail) \ 569 if (tail) { \ 570 bzero((dst), (dstsize) - (srclen)); \ 571 bcopy((src), (char *)(dst) + (dstsize) - (srclen), (srclen)); \ 572 } else { \ 573 bcopy((src), (dst), (srclen)); \ 574 bzero((char *)(dst) + (srclen), (dstsize) - (srclen)); \ 575 } 576 577 #define ANON_1_LEVEL_INC (ANON_CHUNK_SIZE / 8) 578 #define ANON_2_LEVEL_INC (ANON_1_LEVEL_INC * ANON_CHUNK_SIZE) 579 580 /* 581 * anon_grow() is used to efficiently extend an existing anon array. 582 * startidx_p points to the index into the anon array of the first page 583 * that is in use. oldseg_pgs is the number of pages in use, starting at 584 * *startidx_p. newpages is the number of additional pages desired. 585 * 586 * If startidx_p == NULL, startidx is taken to be 0 and cannot be changed. 587 * 588 * The growth is done by creating a new top level of the anon array, 589 * and (if the array is 2-level) reusing the existing second level arrays. 590 * 591 * flags can be used to specify ANON_NOSLEEP and ANON_GROWDOWN. 592 * 593 * Returns the new number of pages in the anon array. 594 */ 595 pgcnt_t 596 anon_grow(struct anon_hdr *ahp, ulong_t *startidx_p, pgcnt_t oldseg_pgs, 597 pgcnt_t newseg_pgs, int flags) 598 { 599 ulong_t startidx = startidx_p ? *startidx_p : 0; 600 pgcnt_t oldamp_pgs = ahp->size, newamp_pgs; 601 pgcnt_t oelems, nelems, totpages; 602 void **level1; 603 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 604 int growdown = (flags & ANON_GROWDOWN); 605 size_t newarrsz, oldarrsz; 606 void *level2; 607 608 ASSERT(!(startidx_p == NULL && growdown)); 609 ASSERT(startidx + oldseg_pgs <= ahp->size); 610 611 /* 612 * Determine the total number of pages needed in the new 613 * anon array. If growing down, totpages is all pages from 614 * startidx through the end of the array, plus <newseg_pgs> 615 * pages. If growing up, keep all pages from page 0 through 616 * the last page currently in use, plus <newseg_pgs> pages. 
617 */ 618 if (growdown) 619 totpages = oldamp_pgs - startidx + newseg_pgs; 620 else 621 totpages = startidx + oldseg_pgs + newseg_pgs; 622 623 /* If the array is already large enough, just return. */ 624 625 if (oldamp_pgs >= totpages) { 626 if (growdown) 627 *startidx_p = oldamp_pgs - totpages; 628 return (oldamp_pgs); 629 } 630 631 /* 632 * oldamp_pgs/newamp_pgs are the total numbers of pages represented 633 * by the corresponding arrays. 634 * oelems/nelems are the number of pointers in the top level arrays 635 * which may be either level 1 or level 2. 636 * Will the new anon array be one level or two levels? 637 */ 638 if (totpages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 639 newamp_pgs = P2ROUNDUP(totpages, ANON_1_LEVEL_INC); 640 oelems = oldamp_pgs; 641 nelems = newamp_pgs; 642 } else { 643 newamp_pgs = P2ROUNDUP(totpages, ANON_2_LEVEL_INC); 644 oelems = (oldamp_pgs + ANON_CHUNK_OFF) >> ANON_CHUNK_SHIFT; 645 nelems = newamp_pgs >> ANON_CHUNK_SHIFT; 646 } 647 648 newarrsz = nelems * sizeof (void *); 649 level1 = kmem_alloc(newarrsz, kmemflags); 650 if (level1 == NULL) 651 return (0); 652 653 /* Are we converting from a one level to a two level anon array? */ 654 655 if (newamp_pgs > ANON_CHUNK_SIZE && oldamp_pgs <= ANON_CHUNK_SIZE && 656 !(ahp->flags & ANON_ALLOC_FORCE)) { 657 658 /* 659 * Yes, we're converting to a two level. Reuse old level 1 660 * as new level 2 if it is exactly PAGESIZE. Otherwise 661 * alloc a new level 2 and copy the old level 1 data into it. 
662 */ 663 if (oldamp_pgs == ANON_CHUNK_SIZE) { 664 level2 = (void *)ahp->array_chunk; 665 } else { 666 level2 = kmem_alloc(PAGESIZE, kmemflags); 667 if (level2 == NULL) { 668 kmem_free(level1, newarrsz); 669 return (0); 670 } 671 oldarrsz = oldamp_pgs * sizeof (void *); 672 673 ANON_INITBUF(ahp->array_chunk, oldarrsz, 674 level2, PAGESIZE, growdown); 675 kmem_free(ahp->array_chunk, oldarrsz); 676 } 677 bzero(level1, newarrsz); 678 if (growdown) 679 level1[nelems - 1] = level2; 680 else 681 level1[0] = level2; 682 } else { 683 oldarrsz = oelems * sizeof (void *); 684 685 ANON_INITBUF(ahp->array_chunk, oldarrsz, 686 level1, newarrsz, growdown); 687 kmem_free(ahp->array_chunk, oldarrsz); 688 } 689 690 ahp->array_chunk = level1; 691 ahp->size = newamp_pgs; 692 if (growdown) 693 *startidx_p = newamp_pgs - totpages; 694 695 return (newamp_pgs); 696 } 697 698 699 /* 700 * Called from clock handler to sync ani_free value. 701 */ 702 703 void 704 set_anoninfo(void) 705 { 706 int ix; 707 pgcnt_t total = 0; 708 709 for (ix = 0; ix < ANI_MAX_POOL; ix++) { 710 total += ani_free_pool[ix].ani_count; 711 } 712 k_anoninfo.ani_free = total; 713 } 714 715 /* 716 * Reserve anon space. 717 * 718 * It's no longer simply a matter of incrementing ani_resv to 719 * reserve swap space, we need to check memory-based as well 720 * as disk-backed (physical) swap. The following algorithm 721 * is used: 722 * Check the space on physical swap 723 * i.e. amount needed < ani_max - ani_phys_resv 724 * If we are swapping on swapfs check 725 * amount needed < (availrmem - swapfs_minfree) 726 * Since the algorithm to check for the quantity of swap space is 727 * almost the same as that for reserving it, we'll just use anon_resvmem 728 * with a flag to decrement availrmem. 729 * 730 * Return non-zero on success. 
731 */ 732 int 733 anon_resvmem(size_t size, boolean_t takemem, zone_t *zone, int tryhard) 734 { 735 pgcnt_t npages = btopr(size); 736 pgcnt_t mswap_pages = 0; 737 pgcnt_t pswap_pages = 0; 738 proc_t *p = curproc; 739 740 if (zone != NULL && takemem) { 741 /* test zone.max-swap resource control */ 742 mutex_enter(&p->p_lock); 743 if (rctl_incr_swap(p, zone, ptob(npages)) != 0) { 744 mutex_exit(&p->p_lock); 745 return (0); 746 } 747 mutex_exit(&p->p_lock); 748 } 749 mutex_enter(&anoninfo_lock); 750 751 /* 752 * pswap_pages is the number of pages we can take from 753 * physical (i.e. disk-backed) swap. 754 */ 755 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 756 pswap_pages = k_anoninfo.ani_max - k_anoninfo.ani_phys_resv; 757 758 ANON_PRINT(A_RESV, 759 ("anon_resvmem: npages %lu takemem %u pswap %lu caller %p\n", 760 npages, takemem, pswap_pages, (void *)caller())); 761 762 if (npages <= pswap_pages) { 763 /* 764 * we have enough space on a physical swap 765 */ 766 if (takemem) 767 k_anoninfo.ani_phys_resv += npages; 768 mutex_exit(&anoninfo_lock); 769 return (1); 770 } else if (pswap_pages != 0) { 771 /* 772 * we have some space on a physical swap 773 */ 774 if (takemem) { 775 /* 776 * use up remainder of phys swap 777 */ 778 k_anoninfo.ani_phys_resv += pswap_pages; 779 ASSERT(k_anoninfo.ani_phys_resv == k_anoninfo.ani_max); 780 } 781 } 782 /* 783 * since (npages > pswap_pages) we need mem swap 784 * mswap_pages is the number of pages needed from availrmem 785 */ 786 ASSERT(npages > pswap_pages); 787 mswap_pages = npages - pswap_pages; 788 789 ANON_PRINT(A_RESV, ("anon_resvmem: need %ld pages from memory\n", 790 mswap_pages)); 791 792 /* 793 * priv processes can reserve memory as swap as long as availrmem 794 * remains greater than swapfs_minfree; in the case of non-priv 795 * processes, memory can be reserved as swap only if availrmem 796 * doesn't fall below (swapfs_minfree + swapfs_reserve). 
Thus, 797 * swapfs_reserve amount of memswap is not available to non-priv 798 * processes. This protects daemons such as automounter dying 799 * as a result of application processes eating away almost entire 800 * membased swap. This safeguard becomes useless if apps are run 801 * with root access. 802 * 803 * swapfs_reserve is minimum of 4Mb or 1/16 of physmem. 804 * 805 */ 806 if (tryhard) { 807 mutex_exit(&anoninfo_lock); 808 (void) page_reclaim_mem(mswap_pages, 809 swapfs_minfree + swapfs_reserve, 0); 810 mutex_enter(&anoninfo_lock); 811 } 812 813 mutex_enter(&freemem_lock); 814 if (availrmem > (swapfs_minfree + swapfs_reserve + mswap_pages) || 815 (availrmem > (swapfs_minfree + mswap_pages) && 816 secpolicy_resource(CRED()) == 0)) { 817 818 if (takemem) { 819 /* 820 * Take the memory from the rest of the system. 821 */ 822 availrmem -= mswap_pages; 823 mutex_exit(&freemem_lock); 824 k_anoninfo.ani_mem_resv += mswap_pages; 825 ANI_ADD(mswap_pages); 826 ANON_PRINT((A_RESV | A_MRESV), 827 ("anon_resvmem: took %ld pages of availrmem\n", 828 mswap_pages)); 829 } else { 830 mutex_exit(&freemem_lock); 831 } 832 833 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 834 mutex_exit(&anoninfo_lock); 835 return (1); 836 837 } else { 838 /* 839 * Fail if not enough memory 840 */ 841 842 if (takemem) { 843 k_anoninfo.ani_phys_resv -= pswap_pages; 844 } 845 846 mutex_exit(&freemem_lock); 847 mutex_exit(&anoninfo_lock); 848 ANON_PRINT(A_RESV, 849 ("anon_resvmem: not enough space from swapfs\n")); 850 if (zone != NULL && takemem) 851 rctl_decr_swap(zone, ptob(npages)); 852 return (0); 853 } 854 } 855 856 /* 857 * Give back an anon reservation. 
858 */ 859 void 860 anon_unresvmem(size_t size, zone_t *zone) 861 { 862 pgcnt_t npages = btopr(size); 863 spgcnt_t mem_free_pages = 0; 864 pgcnt_t phys_free_slots; 865 #ifdef ANON_DEBUG 866 pgcnt_t mem_resv; 867 #endif 868 if (zone != NULL) 869 rctl_decr_swap(zone, ptob(npages)); 870 871 mutex_enter(&anoninfo_lock); 872 873 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 874 /* 875 * If some of this reservation belonged to swapfs 876 * give it back to availrmem. 877 * ani_mem_resv is the amount of availrmem swapfs has reserved. 878 * but some of that memory could be locked by segspt so we can only 879 * return non locked ani_mem_resv back to availrmem 880 */ 881 if (k_anoninfo.ani_mem_resv > k_anoninfo.ani_locked_swap) { 882 ANON_PRINT((A_RESV | A_MRESV), 883 ("anon_unresv: growing availrmem by %ld pages\n", 884 MIN(k_anoninfo.ani_mem_resv, npages))); 885 886 mem_free_pages = MIN((spgcnt_t)(k_anoninfo.ani_mem_resv - 887 k_anoninfo.ani_locked_swap), npages); 888 mutex_enter(&freemem_lock); 889 availrmem += mem_free_pages; 890 mutex_exit(&freemem_lock); 891 k_anoninfo.ani_mem_resv -= mem_free_pages; 892 893 ANI_ADD(-mem_free_pages); 894 } 895 /* 896 * The remainder of the pages is returned to phys swap 897 */ 898 ASSERT(npages >= mem_free_pages); 899 phys_free_slots = npages - mem_free_pages; 900 901 if (phys_free_slots) { 902 k_anoninfo.ani_phys_resv -= phys_free_slots; 903 } 904 905 #ifdef ANON_DEBUG 906 mem_resv = k_anoninfo.ani_mem_resv; 907 #endif 908 909 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 910 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 911 912 mutex_exit(&anoninfo_lock); 913 914 ANON_PRINT(A_RESV, ("anon_unresv: %lu, tot %lu, caller %p\n", 915 npages, mem_resv, (void *)caller())); 916 } 917 918 /* 919 * Allocate an anon slot and return it with the lock held. 
920 */ 921 struct anon * 922 anon_alloc(struct vnode *vp, anoff_t off) 923 { 924 struct anon *ap; 925 kmutex_t *ahm; 926 927 ap = kmem_cache_alloc(anon_cache, KM_SLEEP); 928 if (vp == NULL) { 929 swap_alloc(ap); 930 } else { 931 ap->an_vp = vp; 932 ap->an_off = off; 933 } 934 ap->an_refcnt = 1; 935 ap->an_pvp = NULL; 936 ap->an_poff = 0; 937 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 938 mutex_enter(ahm); 939 anon_addhash(ap); 940 mutex_exit(ahm); 941 ANI_ADD(-1); 942 ANON_PRINT(A_ANON, ("anon_alloc: returning ap %p, vp %p\n", 943 (void *)ap, (ap ? (void *)ap->an_vp : NULL))); 944 return (ap); 945 } 946 947 /* 948 * Decrement the reference count of an anon page. 949 * If reference count goes to zero, free it and 950 * its associated page (if any). 951 */ 952 void 953 anon_decref(struct anon *ap) 954 { 955 page_t *pp; 956 struct vnode *vp; 957 anoff_t off; 958 kmutex_t *ahm; 959 960 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 961 mutex_enter(ahm); 962 ASSERT(ap->an_refcnt != 0); 963 if (ap->an_refcnt == 0) 964 panic("anon_decref: slot count 0"); 965 if (--ap->an_refcnt == 0) { 966 swap_xlate(ap, &vp, &off); 967 mutex_exit(ahm); 968 969 /* 970 * If there is a page for this anon slot we will need to 971 * call VN_DISPOSE to get rid of the vp association and 972 * put the page back on the free list as really free. 973 * Acquire the "exclusive" lock to ensure that any 974 * pending i/o always completes before the swap slot 975 * is freed. 976 */ 977 pp = page_lookup(vp, (u_offset_t)off, SE_EXCL); 978 979 /* 980 * If there was a page, we've synchronized on it (getting 981 * the exclusive lock is as good as gettting the iolock) 982 * so now we can free the physical backing store. Also, this 983 * is where we would free the name of the anonymous page 984 * (swap_free(ap)), a no-op in the current implementation. 
985 */ 986 mutex_enter(ahm); 987 ASSERT(ap->an_refcnt == 0); 988 anon_rmhash(ap); 989 if (ap->an_pvp) 990 swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE); 991 mutex_exit(ahm); 992 993 if (pp != NULL) { 994 /*LINTED: constant in conditional context */ 995 VN_DISPOSE(pp, B_INVAL, 0, kcred); 996 } 997 ANON_PRINT(A_ANON, ("anon_decref: free ap %p, vp %p\n", 998 (void *)ap, (void *)ap->an_vp)); 999 kmem_cache_free(anon_cache, ap); 1000 1001 ANI_ADD(1); 1002 } else { 1003 mutex_exit(ahm); 1004 } 1005 } 1006 1007 static int 1008 anon_share(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots) 1009 { 1010 struct anon *ap; 1011 1012 while (nslots-- > 0) { 1013 if ((ap = anon_get_ptr(ahp, anon_index)) != NULL && 1014 ap->an_refcnt > 1) 1015 return (1); 1016 anon_index++; 1017 } 1018 1019 return (0); 1020 } 1021 1022 static void 1023 anon_decref_pages( 1024 struct anon_hdr *ahp, 1025 ulong_t an_idx, 1026 uint_t szc) 1027 { 1028 struct anon *ap = anon_get_ptr(ahp, an_idx); 1029 kmutex_t *ahmpages = NULL; 1030 page_t *pp; 1031 pgcnt_t pgcnt = page_get_pagecnt(szc); 1032 pgcnt_t i; 1033 struct vnode *vp; 1034 anoff_t off; 1035 kmutex_t *ahm; 1036 #ifdef DEBUG 1037 int refcnt = 1; 1038 #endif 1039 1040 ASSERT(szc != 0); 1041 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1042 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1043 ASSERT(an_idx < ahp->size); 1044 1045 if (ahp->size - an_idx < pgcnt) { 1046 /* 1047 * In case of shared mappings total anon map size may not be 1048 * the largest page size aligned. 
1049 */ 1050 pgcnt = ahp->size - an_idx; 1051 } 1052 1053 VM_STAT_ADD(anonvmstats.decrefpages[0]); 1054 1055 if (ap != NULL) { 1056 ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1057 mutex_enter(ahmpages); 1058 ASSERT((refcnt = ap->an_refcnt) != 0); 1059 VM_STAT_ADD(anonvmstats.decrefpages[1]); 1060 if (ap->an_refcnt == 1) { 1061 VM_STAT_ADD(anonvmstats.decrefpages[2]); 1062 ASSERT(!anon_share(ahp, an_idx, pgcnt)); 1063 mutex_exit(ahmpages); 1064 ahmpages = NULL; 1065 } 1066 } 1067 1068 i = 0; 1069 while (i < pgcnt) { 1070 if ((ap = anon_get_ptr(ahp, an_idx + i)) == NULL) { 1071 ASSERT(refcnt == 1 && ahmpages == NULL); 1072 i++; 1073 continue; 1074 } 1075 ASSERT(ap->an_refcnt == refcnt); 1076 ASSERT(ahmpages != NULL || ap->an_refcnt == 1); 1077 ASSERT(ahmpages == NULL || ap->an_refcnt > 1); 1078 1079 if (ahmpages == NULL) { 1080 swap_xlate(ap, &vp, &off); 1081 pp = page_lookup(vp, (u_offset_t)off, SE_EXCL); 1082 if (pp == NULL || pp->p_szc == 0) { 1083 VM_STAT_ADD(anonvmstats.decrefpages[3]); 1084 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, 1085 ap->an_off)]; 1086 (void) anon_set_ptr(ahp, an_idx + i, NULL, 1087 ANON_SLEEP); 1088 mutex_enter(ahm); 1089 ap->an_refcnt--; 1090 ASSERT(ap->an_refcnt == 0); 1091 anon_rmhash(ap); 1092 if (ap->an_pvp) 1093 swap_phys_free(ap->an_pvp, ap->an_poff, 1094 PAGESIZE); 1095 mutex_exit(ahm); 1096 if (pp != NULL) { 1097 VM_STAT_ADD(anonvmstats.decrefpages[4]); 1098 /*LINTED*/ 1099 VN_DISPOSE(pp, B_INVAL, 0, kcred); 1100 } 1101 kmem_cache_free(anon_cache, ap); 1102 ANI_ADD(1); 1103 i++; 1104 } else { 1105 pgcnt_t j; 1106 pgcnt_t curpgcnt = 1107 page_get_pagecnt(pp->p_szc); 1108 size_t ppasize = curpgcnt * sizeof (page_t *); 1109 page_t **ppa = kmem_alloc(ppasize, KM_SLEEP); 1110 int dispose = 0; 1111 1112 VM_STAT_ADD(anonvmstats.decrefpages[5]); 1113 1114 ASSERT(pp->p_szc <= szc); 1115 ASSERT(IS_P2ALIGNED(curpgcnt, curpgcnt)); 1116 ASSERT(IS_P2ALIGNED(i, curpgcnt)); 1117 ASSERT(i + curpgcnt <= pgcnt); 1118 
ASSERT(!(page_pptonum(pp) & (curpgcnt - 1))); 1119 ppa[0] = pp; 1120 for (j = i + 1; j < i + curpgcnt; j++) { 1121 ap = anon_get_ptr(ahp, an_idx + j); 1122 ASSERT(ap != NULL && 1123 ap->an_refcnt == 1); 1124 swap_xlate(ap, &vp, &off); 1125 pp = page_lookup(vp, (u_offset_t)off, 1126 SE_EXCL); 1127 if (pp == NULL) 1128 panic("anon_decref_pages: " 1129 "no page"); 1130 1131 (void) hat_pageunload(pp, 1132 HAT_FORCE_PGUNLOAD); 1133 ASSERT(pp->p_szc == ppa[0]->p_szc); 1134 ASSERT(page_pptonum(pp) - 1 == 1135 page_pptonum(ppa[j - i - 1])); 1136 ppa[j - i] = pp; 1137 if (ap->an_pvp != NULL && 1138 !vn_matchopval(ap->an_pvp, 1139 VOPNAME_DISPOSE, 1140 (fs_generic_func_p)fs_dispose)) 1141 dispose = 1; 1142 } 1143 if (!dispose) { 1144 VM_STAT_ADD(anonvmstats.decrefpages[6]); 1145 page_destroy_pages(ppa[0]); 1146 } else { 1147 VM_STAT_ADD(anonvmstats.decrefpages[7]); 1148 for (j = 0; j < curpgcnt; j++) { 1149 ASSERT(PAGE_EXCL(ppa[j])); 1150 ppa[j]->p_szc = 0; 1151 } 1152 for (j = 0; j < curpgcnt; j++) { 1153 ASSERT(!hat_page_is_mapped( 1154 ppa[j])); 1155 /*LINTED*/ 1156 VN_DISPOSE(ppa[j], B_INVAL, 0, 1157 kcred); 1158 } 1159 } 1160 kmem_free(ppa, ppasize); 1161 for (j = i; j < i + curpgcnt; j++) { 1162 ap = anon_get_ptr(ahp, an_idx + j); 1163 ASSERT(ap != NULL && 1164 ap->an_refcnt == 1); 1165 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, 1166 ap->an_off)]; 1167 (void) anon_set_ptr(ahp, an_idx + j, 1168 NULL, ANON_SLEEP); 1169 mutex_enter(ahm); 1170 ap->an_refcnt--; 1171 ASSERT(ap->an_refcnt == 0); 1172 anon_rmhash(ap); 1173 if (ap->an_pvp) 1174 swap_phys_free(ap->an_pvp, 1175 ap->an_poff, PAGESIZE); 1176 mutex_exit(ahm); 1177 kmem_cache_free(anon_cache, ap); 1178 ANI_ADD(1); 1179 } 1180 i += curpgcnt; 1181 } 1182 } else { 1183 VM_STAT_ADD(anonvmstats.decrefpages[8]); 1184 (void) anon_set_ptr(ahp, an_idx + i, NULL, ANON_SLEEP); 1185 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1186 mutex_enter(ahm); 1187 ap->an_refcnt--; 1188 mutex_exit(ahm); 1189 i++; 1190 } 1191 } 1192 
1193 if (ahmpages != NULL) { 1194 mutex_exit(ahmpages); 1195 } 1196 } 1197 1198 /* 1199 * Duplicate references to size bytes worth of anon pages. 1200 * Used when duplicating a segment that contains private anon pages. 1201 * This code assumes that procedure calling this one has already used 1202 * hat_chgprot() to disable write access to the range of addresses that 1203 * that *old actually refers to. 1204 */ 1205 void 1206 anon_dup(struct anon_hdr *old, ulong_t old_idx, struct anon_hdr *new, 1207 ulong_t new_idx, size_t size) 1208 { 1209 spgcnt_t npages; 1210 kmutex_t *ahm; 1211 struct anon *ap; 1212 ulong_t off; 1213 ulong_t index; 1214 1215 npages = btopr(size); 1216 while (npages > 0) { 1217 index = old_idx; 1218 if ((ap = anon_get_next_ptr(old, &index)) == NULL) 1219 break; 1220 1221 ASSERT(!ANON_ISBUSY(anon_get_slot(old, index))); 1222 off = index - old_idx; 1223 npages -= off; 1224 if (npages <= 0) 1225 break; 1226 1227 (void) anon_set_ptr(new, new_idx + off, ap, ANON_SLEEP); 1228 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1229 1230 mutex_enter(ahm); 1231 ap->an_refcnt++; 1232 mutex_exit(ahm); 1233 1234 off++; 1235 new_idx += off; 1236 old_idx += off; 1237 npages--; 1238 } 1239 } 1240 1241 /* 1242 * Just like anon_dup but also guarantees there are no holes (unallocated anon 1243 * slots) within any large page region. That means if a large page region is 1244 * empty in the old array it will skip it. If there are 1 or more valid slots 1245 * in the large page region of the old array it will make sure to fill in any 1246 * unallocated ones and also copy them to the new array. If noalloc is 1 large 1247 * page region should either have no valid anon slots or all slots should be 1248 * valid. 
1249 */ 1250 void 1251 anon_dup_fill_holes( 1252 struct anon_hdr *old, 1253 ulong_t old_idx, 1254 struct anon_hdr *new, 1255 ulong_t new_idx, 1256 size_t size, 1257 uint_t szc, 1258 int noalloc) 1259 { 1260 struct anon *ap; 1261 spgcnt_t npages; 1262 kmutex_t *ahm, *ahmpages = NULL; 1263 pgcnt_t pgcnt, i; 1264 ulong_t index, off; 1265 #ifdef DEBUG 1266 int refcnt; 1267 #endif 1268 1269 ASSERT(szc != 0); 1270 pgcnt = page_get_pagecnt(szc); 1271 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1272 npages = btopr(size); 1273 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1274 ASSERT(IS_P2ALIGNED(old_idx, pgcnt)); 1275 1276 VM_STAT_ADD(anonvmstats.dupfillholes[0]); 1277 1278 while (npages > 0) { 1279 index = old_idx; 1280 1281 /* 1282 * Find the next valid slot. 1283 */ 1284 if (anon_get_next_ptr(old, &index) == NULL) 1285 break; 1286 1287 ASSERT(!ANON_ISBUSY(anon_get_slot(old, index))); 1288 /* 1289 * Now backup index to the beginning of the 1290 * current large page region of the old array. 1291 */ 1292 index = P2ALIGN(index, pgcnt); 1293 off = index - old_idx; 1294 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1295 npages -= off; 1296 if (npages <= 0) 1297 break; 1298 1299 /* 1300 * Fill and copy a large page regions worth 1301 * of anon slots. 1302 */ 1303 for (i = 0; i < pgcnt; i++) { 1304 if ((ap = anon_get_ptr(old, index + i)) == NULL) { 1305 if (noalloc) { 1306 panic("anon_dup_fill_holes: " 1307 "empty anon slot\n"); 1308 } 1309 VM_STAT_ADD(anonvmstats.dupfillholes[1]); 1310 ap = anon_alloc(NULL, 0); 1311 (void) anon_set_ptr(old, index + i, ap, 1312 ANON_SLEEP); 1313 } else if (i == 0) { 1314 /* 1315 * make the increment of all refcnts of all 1316 * anon slots of a large page appear atomic by 1317 * getting an anonpages_hash_lock for the 1318 * first anon slot of a large page. 
1319 */ 1320 int hash = AH_LOCK(ap->an_vp, ap->an_off); 1321 1322 VM_STAT_ADD(anonvmstats.dupfillholes[2]); 1323 1324 ahmpages = &anonpages_hash_lock[hash]; 1325 mutex_enter(ahmpages); 1326 /*LINTED*/ 1327 ASSERT(refcnt = ap->an_refcnt); 1328 1329 VM_STAT_COND_ADD(ap->an_refcnt > 1, 1330 anonvmstats.dupfillholes[3]); 1331 } 1332 (void) anon_set_ptr(new, new_idx + off + i, ap, 1333 ANON_SLEEP); 1334 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1335 mutex_enter(ahm); 1336 ASSERT(ahmpages != NULL || ap->an_refcnt == 1); 1337 ASSERT(i == 0 || ahmpages == NULL || 1338 refcnt == ap->an_refcnt); 1339 ap->an_refcnt++; 1340 mutex_exit(ahm); 1341 } 1342 if (ahmpages != NULL) { 1343 mutex_exit(ahmpages); 1344 ahmpages = NULL; 1345 } 1346 off += pgcnt; 1347 new_idx += off; 1348 old_idx += off; 1349 npages -= pgcnt; 1350 } 1351 } 1352 1353 /* 1354 * Used when a segment with a vnode changes szc. similarly to 1355 * anon_dup_fill_holes() makes sure each large page region either has no anon 1356 * slots or all of them. but new slots are created by COWing the file 1357 * pages. on entrance no anon slots should be shared. 1358 */ 1359 int 1360 anon_fill_cow_holes( 1361 struct seg *seg, 1362 caddr_t addr, 1363 struct anon_hdr *ahp, 1364 ulong_t an_idx, 1365 struct vnode *vp, 1366 u_offset_t vp_off, 1367 size_t size, 1368 uint_t szc, 1369 uint_t prot, 1370 struct vpage vpage[], 1371 struct cred *cred) 1372 { 1373 struct anon *ap; 1374 spgcnt_t npages; 1375 pgcnt_t pgcnt, i; 1376 ulong_t index, off; 1377 int err = 0; 1378 int pageflags = 0; 1379 1380 ASSERT(szc != 0); 1381 pgcnt = page_get_pagecnt(szc); 1382 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1383 npages = btopr(size); 1384 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1385 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1386 1387 while (npages > 0) { 1388 index = an_idx; 1389 1390 /* 1391 * Find the next valid slot. 
1392 */ 1393 if (anon_get_next_ptr(ahp, &index) == NULL) { 1394 break; 1395 } 1396 1397 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1398 /* 1399 * Now backup index to the beginning of the 1400 * current large page region of the anon array. 1401 */ 1402 index = P2ALIGN(index, pgcnt); 1403 off = index - an_idx; 1404 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1405 npages -= off; 1406 if (npages <= 0) 1407 break; 1408 an_idx += off; 1409 vp_off += ptob(off); 1410 addr += ptob(off); 1411 if (vpage != NULL) { 1412 vpage += off; 1413 } 1414 1415 for (i = 0; i < pgcnt; i++, an_idx++, vp_off += PAGESIZE) { 1416 if ((ap = anon_get_ptr(ahp, an_idx)) == NULL) { 1417 page_t *pl[1 + 1]; 1418 page_t *pp; 1419 1420 err = VOP_GETPAGE(vp, vp_off, PAGESIZE, NULL, 1421 pl, PAGESIZE, seg, addr, S_READ, cred); 1422 if (err) { 1423 break; 1424 } 1425 if (vpage != NULL) { 1426 prot = VPP_PROT(vpage); 1427 pageflags = VPP_ISPPLOCK(vpage) ? 1428 LOCK_PAGE : 0; 1429 } 1430 pp = anon_private(&ap, seg, addr, prot, pl[0], 1431 pageflags, cred); 1432 if (pp == NULL) { 1433 err = ENOMEM; 1434 break; 1435 } 1436 (void) anon_set_ptr(ahp, an_idx, ap, 1437 ANON_SLEEP); 1438 page_unlock(pp); 1439 } 1440 ASSERT(ap->an_refcnt == 1); 1441 addr += PAGESIZE; 1442 if (vpage != NULL) { 1443 vpage++; 1444 } 1445 } 1446 npages -= pgcnt; 1447 } 1448 1449 return (err); 1450 } 1451 1452 /* 1453 * Free a group of "size" anon pages, size in bytes, 1454 * and clear out the pointers to the anon entries. 
1455 */ 1456 void 1457 anon_free(struct anon_hdr *ahp, ulong_t index, size_t size) 1458 { 1459 spgcnt_t npages; 1460 struct anon *ap; 1461 ulong_t old; 1462 1463 npages = btopr(size); 1464 1465 while (npages > 0) { 1466 old = index; 1467 if ((ap = anon_get_next_ptr(ahp, &index)) == NULL) 1468 break; 1469 1470 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1471 npages -= index - old; 1472 if (npages <= 0) 1473 break; 1474 1475 (void) anon_set_ptr(ahp, index, NULL, ANON_SLEEP); 1476 anon_decref(ap); 1477 /* 1478 * Bump index and decrement page count 1479 */ 1480 index++; 1481 npages--; 1482 } 1483 } 1484 1485 void 1486 anon_free_pages( 1487 struct anon_hdr *ahp, 1488 ulong_t an_idx, 1489 size_t size, 1490 uint_t szc) 1491 { 1492 spgcnt_t npages; 1493 pgcnt_t pgcnt; 1494 ulong_t index, off; 1495 1496 ASSERT(szc != 0); 1497 pgcnt = page_get_pagecnt(szc); 1498 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1499 npages = btopr(size); 1500 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1501 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1502 ASSERT(an_idx < ahp->size); 1503 1504 VM_STAT_ADD(anonvmstats.freepages[0]); 1505 1506 while (npages > 0) { 1507 index = an_idx; 1508 1509 /* 1510 * Find the next valid slot. 1511 */ 1512 if (anon_get_next_ptr(ahp, &index) == NULL) 1513 break; 1514 1515 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1516 /* 1517 * Now backup index to the beginning of the 1518 * current large page region of the old array. 
1519 */ 1520 index = P2ALIGN(index, pgcnt); 1521 off = index - an_idx; 1522 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1523 npages -= off; 1524 if (npages <= 0) 1525 break; 1526 1527 anon_decref_pages(ahp, index, szc); 1528 1529 off += pgcnt; 1530 an_idx += off; 1531 npages -= pgcnt; 1532 } 1533 } 1534 1535 /* 1536 * Make anonymous pages discardable 1537 */ 1538 void 1539 anon_disclaim(struct anon_map *amp, ulong_t index, size_t size, int flags) 1540 { 1541 spgcnt_t npages = btopr(size); 1542 struct anon *ap; 1543 struct vnode *vp; 1544 anoff_t off; 1545 page_t *pp, *root_pp; 1546 kmutex_t *ahm; 1547 pgcnt_t pgcnt; 1548 ulong_t old_idx, idx, i; 1549 struct anon_hdr *ahp = amp->ahp; 1550 anon_sync_obj_t cookie; 1551 1552 ASSERT(RW_READ_HELD(&->a_rwlock)); 1553 pgcnt = 1; 1554 for (; npages > 0; index = (pgcnt == 1) ? index + 1: 1555 P2ROUNDUP(index + 1, pgcnt), npages -= pgcnt) { 1556 1557 /* 1558 * get anon pointer and index for the first valid entry 1559 * in the anon list, starting from "index" 1560 */ 1561 old_idx = index; 1562 if ((ap = anon_get_next_ptr(ahp, &index)) == NULL) 1563 break; 1564 1565 /* 1566 * decrement npages by number of NULL anon slots we skipped 1567 */ 1568 npages -= index - old_idx; 1569 if (npages <= 0) 1570 break; 1571 1572 anon_array_enter(amp, index, &cookie); 1573 ap = anon_get_ptr(ahp, index); 1574 ASSERT(ap != NULL); 1575 1576 /* 1577 * Get anonymous page and try to lock it SE_EXCL; 1578 * For non blocking case if we couldn't grab the lock 1579 * we skip to next page. 1580 * For blocking case (ANON_PGLOOKUP_BLK) block 1581 * until we grab SE_EXCL lock. 
1582 */ 1583 swap_xlate(ap, &vp, &off); 1584 if (flags & ANON_PGLOOKUP_BLK) 1585 pp = page_lookup_create(vp, (u_offset_t)off, 1586 SE_EXCL, NULL, NULL, SE_EXCL_WANTED); 1587 else 1588 pp = page_lookup_nowait(vp, (u_offset_t)off, SE_EXCL); 1589 if (pp == NULL) { 1590 segadvstat.MADV_FREE_miss.value.ul++; 1591 pgcnt = 1; 1592 anon_array_exit(&cookie); 1593 continue; 1594 } 1595 pgcnt = page_get_pagecnt(pp->p_szc); 1596 1597 /* 1598 * we cannot free a page which is permanently locked. 1599 * The page_struct_lock need not be acquired to examine 1600 * these fields since the page has an "exclusive" lock. 1601 */ 1602 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1603 page_unlock(pp); 1604 segadvstat.MADV_FREE_miss.value.ul++; 1605 anon_array_exit(&cookie); 1606 continue; 1607 } 1608 1609 ahm = &anonhash_lock[AH_LOCK(vp, off)]; 1610 mutex_enter(ahm); 1611 ASSERT(ap->an_refcnt != 0); 1612 /* 1613 * skip this one if copy-on-write is not yet broken. 1614 */ 1615 if (ap->an_refcnt > 1) { 1616 mutex_exit(ahm); 1617 page_unlock(pp); 1618 segadvstat.MADV_FREE_miss.value.ul++; 1619 anon_array_exit(&cookie); 1620 continue; 1621 } 1622 1623 if (pp->p_szc == 0) { 1624 pgcnt = 1; 1625 1626 /* 1627 * free swap slot; 1628 */ 1629 if (ap->an_pvp) { 1630 swap_phys_free(ap->an_pvp, ap->an_poff, 1631 PAGESIZE); 1632 ap->an_pvp = NULL; 1633 ap->an_poff = 0; 1634 } 1635 mutex_exit(ahm); 1636 segadvstat.MADV_FREE_hit.value.ul++; 1637 1638 /* 1639 * while we are at it, unload all the translations 1640 * and attempt to free the page. 
1641 */ 1642 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1643 /*LINTED: constant in conditional context */ 1644 VN_DISPOSE(pp, B_FREE, 0, kcred); 1645 anon_array_exit(&cookie); 1646 continue; 1647 } 1648 1649 pgcnt = page_get_pagecnt(pp->p_szc); 1650 if (!IS_P2ALIGNED(index, pgcnt) || npages < pgcnt) { 1651 if (!page_try_demote_pages(pp)) { 1652 mutex_exit(ahm); 1653 page_unlock(pp); 1654 segadvstat.MADV_FREE_miss.value.ul++; 1655 anon_array_exit(&cookie); 1656 continue; 1657 } else { 1658 pgcnt = 1; 1659 if (ap->an_pvp) { 1660 swap_phys_free(ap->an_pvp, 1661 ap->an_poff, PAGESIZE); 1662 ap->an_pvp = NULL; 1663 ap->an_poff = 0; 1664 } 1665 mutex_exit(ahm); 1666 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1667 /*LINTED*/ 1668 VN_DISPOSE(pp, B_FREE, 0, kcred); 1669 segadvstat.MADV_FREE_hit.value.ul++; 1670 anon_array_exit(&cookie); 1671 continue; 1672 } 1673 } 1674 mutex_exit(ahm); 1675 root_pp = pp; 1676 1677 /* 1678 * try to lock remaining pages 1679 */ 1680 for (idx = 1; idx < pgcnt; idx++) { 1681 pp++; 1682 if (!page_trylock(pp, SE_EXCL)) 1683 break; 1684 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1685 page_unlock(pp); 1686 break; 1687 } 1688 } 1689 1690 if (idx == pgcnt) { 1691 for (i = 0; i < pgcnt; i++) { 1692 ap = anon_get_ptr(ahp, index + i); 1693 if (ap == NULL) 1694 break; 1695 swap_xlate(ap, &vp, &off); 1696 ahm = &anonhash_lock[AH_LOCK(vp, off)]; 1697 mutex_enter(ahm); 1698 ASSERT(ap->an_refcnt != 0); 1699 1700 /* 1701 * skip this one if copy-on-write 1702 * is not yet broken. 
1703 */ 1704 if (ap->an_refcnt > 1) { 1705 mutex_exit(ahm); 1706 goto skiplp; 1707 } 1708 if (ap->an_pvp) { 1709 swap_phys_free(ap->an_pvp, 1710 ap->an_poff, PAGESIZE); 1711 ap->an_pvp = NULL; 1712 ap->an_poff = 0; 1713 } 1714 mutex_exit(ahm); 1715 } 1716 page_destroy_pages(root_pp); 1717 segadvstat.MADV_FREE_hit.value.ul += pgcnt; 1718 anon_array_exit(&cookie); 1719 continue; 1720 } 1721 skiplp: 1722 segadvstat.MADV_FREE_miss.value.ul += pgcnt; 1723 for (i = 0, pp = root_pp; i < idx; pp++, i++) 1724 page_unlock(pp); 1725 anon_array_exit(&cookie); 1726 } 1727 } 1728 1729 /* 1730 * Return the kept page(s) and protections back to the segment driver. 1731 */ 1732 int 1733 anon_getpage( 1734 struct anon **app, 1735 uint_t *protp, 1736 page_t *pl[], 1737 size_t plsz, 1738 struct seg *seg, 1739 caddr_t addr, 1740 enum seg_rw rw, 1741 struct cred *cred) 1742 { 1743 page_t *pp; 1744 struct anon *ap = *app; 1745 struct vnode *vp; 1746 anoff_t off; 1747 int err; 1748 kmutex_t *ahm; 1749 1750 swap_xlate(ap, &vp, &off); 1751 1752 /* 1753 * Lookup the page. If page is being paged in, 1754 * wait for it to finish as we must return a list of 1755 * pages since this routine acts like the VOP_GETPAGE 1756 * routine does. 1757 */ 1758 if (pl != NULL && (pp = page_lookup(vp, (u_offset_t)off, SE_SHARED))) { 1759 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1760 mutex_enter(ahm); 1761 if (ap->an_refcnt == 1) 1762 *protp = PROT_ALL; 1763 else 1764 *protp = PROT_ALL & ~PROT_WRITE; 1765 mutex_exit(ahm); 1766 pl[0] = pp; 1767 pl[1] = NULL; 1768 return (0); 1769 } 1770 1771 /* 1772 * Simply treat it as a vnode fault on the anon vp. 
1773 */ 1774 1775 TRACE_3(TR_FAC_VM, TR_ANON_GETPAGE, 1776 "anon_getpage:seg %x addr %x vp %x", 1777 seg, addr, vp); 1778 1779 err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, protp, pl, plsz, 1780 seg, addr, rw, cred); 1781 1782 if (err == 0 && pl != NULL) { 1783 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1784 mutex_enter(ahm); 1785 if (ap->an_refcnt != 1) 1786 *protp &= ~PROT_WRITE; /* make read-only */ 1787 mutex_exit(ahm); 1788 } 1789 return (err); 1790 } 1791 1792 /* 1793 * Creates or returns kept pages to the segment driver. returns -1 if a large 1794 * page cannot be allocated. returns -2 if some other process has allocated a 1795 * larger page. 1796 * 1797 * For cowfault it will alocate any size pages to fill the requested area to 1798 * avoid partially overwritting anon slots (i.e. sharing only some of the anon 1799 * slots within a large page with other processes). This policy greatly 1800 * simplifies large page freeing (which is only freed when all anon slot 1801 * refcnts are 0). 
1802 */ 1803 int 1804 anon_map_getpages( 1805 struct anon_map *amp, 1806 ulong_t start_idx, 1807 uint_t szc, 1808 struct seg *seg, 1809 caddr_t addr, 1810 uint_t prot, 1811 uint_t *protp, 1812 page_t *ppa[], 1813 uint_t *ppa_szc, 1814 struct vpage vpage[], 1815 enum seg_rw rw, 1816 int brkcow, 1817 int anypgsz, 1818 int pgflags, 1819 struct cred *cred) 1820 { 1821 pgcnt_t pgcnt; 1822 struct anon *ap; 1823 struct vnode *vp; 1824 anoff_t off; 1825 page_t *pp, *pl[2], *conpp = NULL; 1826 caddr_t vaddr; 1827 ulong_t pg_idx, an_idx, i; 1828 spgcnt_t nreloc = 0; 1829 int prealloc = 1; 1830 int err, slotcreate; 1831 uint_t vpprot; 1832 int upsize = (szc < seg->s_szc); 1833 1834 #if !defined(__i386) && !defined(__amd64) 1835 ASSERT(seg->s_szc != 0); 1836 #endif 1837 ASSERT(szc <= seg->s_szc); 1838 ASSERT(ppa_szc != NULL); 1839 ASSERT(rw != S_CREATE); 1840 1841 *protp = PROT_ALL; 1842 1843 VM_STAT_ADD(anonvmstats.getpages[0]); 1844 1845 if (szc == 0) { 1846 VM_STAT_ADD(anonvmstats.getpages[1]); 1847 if ((ap = anon_get_ptr(amp->ahp, start_idx)) != NULL) { 1848 err = anon_getpage(&ap, protp, pl, PAGESIZE, seg, 1849 addr, rw, cred); 1850 if (err) 1851 return (err); 1852 ppa[0] = pl[0]; 1853 if (brkcow == 0 || (*protp & PROT_WRITE)) { 1854 VM_STAT_ADD(anonvmstats.getpages[2]); 1855 if (ppa[0]->p_szc != 0 && upsize) { 1856 VM_STAT_ADD(anonvmstats.getpages[3]); 1857 *ppa_szc = MIN(ppa[0]->p_szc, 1858 seg->s_szc); 1859 page_unlock(ppa[0]); 1860 return (-2); 1861 } 1862 return (0); 1863 } 1864 panic("anon_map_getpages: cowfault for szc 0"); 1865 } else { 1866 VM_STAT_ADD(anonvmstats.getpages[4]); 1867 ppa[0] = anon_zero(seg, addr, &ap, cred); 1868 if (ppa[0] == NULL) 1869 return (ENOMEM); 1870 (void) anon_set_ptr(amp->ahp, start_idx, ap, 1871 ANON_SLEEP); 1872 return (0); 1873 } 1874 } 1875 1876 pgcnt = page_get_pagecnt(szc); 1877 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1878 ASSERT(IS_P2ALIGNED(start_idx, pgcnt)); 1879 1880 /* 1881 * First we check for the case that the requtested 
large 1882 * page or larger page already exists in the system. 1883 * Actually we only check if the first constituent page 1884 * exists and only preallocate if it's not found. 1885 */ 1886 ap = anon_get_ptr(amp->ahp, start_idx); 1887 if (ap) { 1888 uint_t pszc; 1889 swap_xlate(ap, &vp, &off); 1890 if (page_exists_forreal(vp, (u_offset_t)off, &pszc)) { 1891 if (pszc > szc && upsize) { 1892 *ppa_szc = MIN(pszc, seg->s_szc); 1893 return (-2); 1894 } 1895 if (pszc >= szc) { 1896 prealloc = 0; 1897 } 1898 } 1899 } 1900 1901 VM_STAT_COND_ADD(prealloc == 0, anonvmstats.getpages[5]); 1902 VM_STAT_COND_ADD(prealloc != 0, anonvmstats.getpages[6]); 1903 1904 top: 1905 /* 1906 * If a smaller page or no page at all was found, 1907 * grab a large page off the freelist. 1908 */ 1909 if (prealloc) { 1910 ASSERT(conpp == NULL); 1911 if (page_alloc_pages(anon_vp, seg, addr, NULL, ppa, 1912 szc, 0, pgflags) != 0) { 1913 VM_STAT_ADD(anonvmstats.getpages[7]); 1914 if (brkcow == 0 || 1915 !anon_share(amp->ahp, start_idx, pgcnt)) { 1916 /* 1917 * If the refcnt's of all anon slots are <= 1 1918 * they can't increase since we are holding 1919 * the address space's lock. So segvn can 1920 * safely decrease szc without risking to 1921 * generate a cow fault for the region smaller 1922 * than the segment's largest page size. 1923 */ 1924 VM_STAT_ADD(anonvmstats.getpages[8]); 1925 return (-1); 1926 } 1927 docow: 1928 /* 1929 * This is a cow fault. Copy away the entire 1 large 1930 * page region of this segment. 
1931 */ 1932 if (szc != seg->s_szc) 1933 panic("anon_map_getpages: cowfault for szc %d", 1934 szc); 1935 vaddr = addr; 1936 for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt; 1937 pg_idx++, an_idx++, vaddr += PAGESIZE) { 1938 if ((ap = anon_get_ptr(amp->ahp, an_idx)) != 1939 NULL) { 1940 err = anon_getpage(&ap, &vpprot, pl, 1941 PAGESIZE, seg, vaddr, rw, cred); 1942 if (err) { 1943 for (i = 0; i < pg_idx; i++) { 1944 if ((pp = ppa[i]) != 1945 NULL) 1946 page_unlock(pp); 1947 } 1948 return (err); 1949 } 1950 ppa[pg_idx] = pl[0]; 1951 } else { 1952 /* 1953 * Since this is a cowfault we know 1954 * that this address space has a 1955 * parent or children which means 1956 * anon_dup_fill_holes() has initialized 1957 * all anon slots within a large page 1958 * region that had at least one anon 1959 * slot at the time of fork(). 1960 */ 1961 panic("anon_map_getpages: " 1962 "cowfault but anon slot is empty"); 1963 } 1964 } 1965 VM_STAT_ADD(anonvmstats.getpages[9]); 1966 *protp = PROT_ALL; 1967 return (anon_map_privatepages(amp, start_idx, szc, seg, 1968 addr, prot, ppa, vpage, anypgsz, pgflags, cred)); 1969 } 1970 } 1971 1972 VM_STAT_ADD(anonvmstats.getpages[10]); 1973 1974 an_idx = start_idx; 1975 pg_idx = 0; 1976 vaddr = addr; 1977 while (pg_idx < pgcnt) { 1978 slotcreate = 0; 1979 if ((ap = anon_get_ptr(amp->ahp, an_idx)) == NULL) { 1980 VM_STAT_ADD(anonvmstats.getpages[11]); 1981 /* 1982 * For us to have decided not to preallocate 1983 * would have meant that a large page 1984 * was found. Which also means that all of the 1985 * anon slots for that page would have been 1986 * already created for us. 1987 */ 1988 if (prealloc == 0) 1989 panic("anon_map_getpages: prealloc = 0"); 1990 1991 slotcreate = 1; 1992 ap = anon_alloc(NULL, 0); 1993 } 1994 swap_xlate(ap, &vp, &off); 1995 1996 /* 1997 * Now setup our preallocated page to pass down 1998 * to swap_getpage(). 
1999 */ 2000 if (prealloc) { 2001 ASSERT(ppa[pg_idx]->p_szc == szc); 2002 conpp = ppa[pg_idx]; 2003 } 2004 ASSERT(prealloc || conpp == NULL); 2005 2006 /* 2007 * If we just created this anon slot then call 2008 * with S_CREATE to prevent doing IO on the page. 2009 * Similar to the anon_zero case. 2010 */ 2011 err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE, 2012 NULL, pl, PAGESIZE, conpp, ppa_szc, &nreloc, seg, vaddr, 2013 slotcreate == 1 ? S_CREATE : rw, cred); 2014 2015 if (err) { 2016 ASSERT(err != -2 || upsize); 2017 VM_STAT_ADD(anonvmstats.getpages[12]); 2018 ASSERT(slotcreate == 0); 2019 goto io_err; 2020 } 2021 2022 pp = pl[0]; 2023 2024 if (pp->p_szc < szc || (pp->p_szc > szc && upsize)) { 2025 VM_STAT_ADD(anonvmstats.getpages[13]); 2026 ASSERT(slotcreate == 0); 2027 ASSERT(prealloc == 0); 2028 ASSERT(pg_idx == 0); 2029 if (pp->p_szc > szc) { 2030 ASSERT(upsize); 2031 *ppa_szc = MIN(pp->p_szc, seg->s_szc); 2032 page_unlock(pp); 2033 VM_STAT_ADD(anonvmstats.getpages[14]); 2034 return (-2); 2035 } 2036 page_unlock(pp); 2037 prealloc = 1; 2038 goto top; 2039 } 2040 2041 /* 2042 * If we decided to preallocate but VOP_GETPAGE 2043 * found a page in the system that satisfies our 2044 * request then free up our preallocated large page 2045 * and continue looping accross the existing large 2046 * page via VOP_GETPAGE. 2047 */ 2048 if (prealloc && pp != ppa[pg_idx]) { 2049 VM_STAT_ADD(anonvmstats.getpages[15]); 2050 ASSERT(slotcreate == 0); 2051 ASSERT(pg_idx == 0); 2052 conpp = NULL; 2053 prealloc = 0; 2054 page_free_pages(ppa[0]); 2055 } 2056 2057 if (prealloc && nreloc > 1) { 2058 /* 2059 * we have relocated out of a smaller large page. 2060 * skip npgs - 1 iterations and continue which will 2061 * increment by one the loop indices. 
2062 */ 2063 spgcnt_t npgs = nreloc; 2064 2065 VM_STAT_ADD(anonvmstats.getpages[16]); 2066 2067 ASSERT(pp == ppa[pg_idx]); 2068 ASSERT(slotcreate == 0); 2069 ASSERT(pg_idx + npgs <= pgcnt); 2070 if ((*protp & PROT_WRITE) && 2071 anon_share(amp->ahp, an_idx, npgs)) { 2072 *protp &= ~PROT_WRITE; 2073 } 2074 pg_idx += npgs; 2075 an_idx += npgs; 2076 vaddr += PAGESIZE * npgs; 2077 continue; 2078 } 2079 2080 VM_STAT_ADD(anonvmstats.getpages[17]); 2081 2082 /* 2083 * Anon_zero case. 2084 */ 2085 if (slotcreate) { 2086 ASSERT(prealloc); 2087 pagezero(pp, 0, PAGESIZE); 2088 CPU_STATS_ADD_K(vm, zfod, 1); 2089 hat_setrefmod(pp); 2090 } 2091 2092 ASSERT(prealloc == 0 || ppa[pg_idx] == pp); 2093 ASSERT(prealloc != 0 || PAGE_SHARED(pp)); 2094 ASSERT(prealloc == 0 || PAGE_EXCL(pp)); 2095 2096 if (pg_idx > 0 && 2097 ((page_pptonum(pp) != page_pptonum(ppa[pg_idx - 1]) + 1) || 2098 (pp->p_szc != ppa[pg_idx - 1]->p_szc))) { 2099 panic("anon_map_getpages: unexpected page"); 2100 } else if (pg_idx == 0 && (page_pptonum(pp) & (pgcnt - 1))) { 2101 panic("anon_map_getpages: unaligned page"); 2102 } 2103 2104 if (prealloc == 0) { 2105 ppa[pg_idx] = pp; 2106 } 2107 2108 if (ap->an_refcnt > 1) { 2109 VM_STAT_ADD(anonvmstats.getpages[18]); 2110 *protp &= ~PROT_WRITE; 2111 } 2112 2113 /* 2114 * If this is a new anon slot then initialize 2115 * the anon array entry. 2116 */ 2117 if (slotcreate) { 2118 (void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP); 2119 } 2120 pg_idx++; 2121 an_idx++; 2122 vaddr += PAGESIZE; 2123 } 2124 2125 /* 2126 * Since preallocated pages come off the freelist 2127 * they are locked SE_EXCL. Simply downgrade and return. 
2128 */ 2129 if (prealloc) { 2130 VM_STAT_ADD(anonvmstats.getpages[19]); 2131 conpp = NULL; 2132 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { 2133 page_downgrade(ppa[pg_idx]); 2134 } 2135 } 2136 ASSERT(conpp == NULL); 2137 2138 if (brkcow == 0 || (*protp & PROT_WRITE)) { 2139 VM_STAT_ADD(anonvmstats.getpages[20]); 2140 return (0); 2141 } 2142 2143 if (szc < seg->s_szc) 2144 panic("anon_map_getpages: cowfault for szc %d", szc); 2145 2146 VM_STAT_ADD(anonvmstats.getpages[21]); 2147 2148 *protp = PROT_ALL; 2149 return (anon_map_privatepages(amp, start_idx, szc, seg, addr, prot, 2150 ppa, vpage, anypgsz, pgflags, cred)); 2151 io_err: 2152 /* 2153 * We got an IO error somewhere in our large page. 2154 * If we were using a preallocated page then just demote 2155 * all the constituent pages that we've succeeded with sofar 2156 * to PAGESIZE pages and leave them in the system 2157 * unlocked. 2158 */ 2159 2160 ASSERT(err != -2 || ((pg_idx == 0) && upsize)); 2161 2162 VM_STAT_COND_ADD(err > 0, anonvmstats.getpages[22]); 2163 VM_STAT_COND_ADD(err == -1, anonvmstats.getpages[23]); 2164 VM_STAT_COND_ADD(err == -2, anonvmstats.getpages[24]); 2165 2166 if (prealloc) { 2167 conpp = NULL; 2168 if (pg_idx > 0) { 2169 VM_STAT_ADD(anonvmstats.getpages[25]); 2170 for (i = 0; i < pgcnt; i++) { 2171 pp = ppa[i]; 2172 ASSERT(PAGE_EXCL(pp)); 2173 ASSERT(pp->p_szc == szc); 2174 pp->p_szc = 0; 2175 } 2176 for (i = 0; i < pg_idx; i++) { 2177 ASSERT(!hat_page_is_mapped(ppa[i])); 2178 page_unlock(ppa[i]); 2179 } 2180 /* 2181 * Now free up the remaining unused constituent 2182 * pages. 
2183 */ 2184 while (pg_idx < pgcnt) { 2185 ASSERT(!hat_page_is_mapped(ppa[pg_idx])); 2186 page_free(ppa[pg_idx], 0); 2187 pg_idx++; 2188 } 2189 } else { 2190 VM_STAT_ADD(anonvmstats.getpages[26]); 2191 page_free_pages(ppa[0]); 2192 } 2193 } else { 2194 VM_STAT_ADD(anonvmstats.getpages[27]); 2195 ASSERT(err > 0); 2196 for (i = 0; i < pg_idx; i++) 2197 page_unlock(ppa[i]); 2198 } 2199 ASSERT(conpp == NULL); 2200 if (err != -1) 2201 return (err); 2202 /* 2203 * we are here because we failed to relocate. 2204 */ 2205 ASSERT(prealloc); 2206 if (brkcow == 0 || !anon_share(amp->ahp, start_idx, pgcnt)) { 2207 VM_STAT_ADD(anonvmstats.getpages[28]); 2208 return (-1); 2209 } 2210 VM_STAT_ADD(anonvmstats.getpages[29]); 2211 goto docow; 2212 } 2213 2214 2215 /* 2216 * Turn a reference to an object or shared anon page 2217 * into a private page with a copy of the data from the 2218 * original page which is always locked by the caller. 2219 * This routine unloads the translation and unlocks the 2220 * original page, if it isn't being stolen, before returning 2221 * to the caller. 2222 * 2223 * NOTE: The original anon slot is not freed by this routine 2224 * It must be freed by the caller while holding the 2225 * "anon_map" lock to prevent races which can occur if 2226 * a process has multiple lwps in its address space. 
2227 */ 2228 page_t * 2229 anon_private( 2230 struct anon **app, 2231 struct seg *seg, 2232 caddr_t addr, 2233 uint_t prot, 2234 page_t *opp, 2235 int oppflags, 2236 struct cred *cred) 2237 { 2238 struct anon *old = *app; 2239 struct anon *new; 2240 page_t *pp = NULL; 2241 struct vnode *vp; 2242 anoff_t off; 2243 page_t *anon_pl[1 + 1]; 2244 int err; 2245 2246 if (oppflags & STEAL_PAGE) 2247 ASSERT(PAGE_EXCL(opp)); 2248 else 2249 ASSERT(PAGE_LOCKED(opp)); 2250 2251 CPU_STATS_ADD_K(vm, cow_fault, 1); 2252 2253 /* Kernel probe */ 2254 TNF_PROBE_1(anon_private, "vm pagefault", /* CSTYLED */, 2255 tnf_opaque, address, addr); 2256 2257 *app = new = anon_alloc(NULL, 0); 2258 swap_xlate(new, &vp, &off); 2259 2260 if (oppflags & STEAL_PAGE) { 2261 page_rename(opp, vp, (u_offset_t)off); 2262 pp = opp; 2263 TRACE_5(TR_FAC_VM, TR_ANON_PRIVATE, 2264 "anon_private:seg %p addr %x pp %p vp %p off %lx", 2265 seg, addr, pp, vp, off); 2266 hat_setmod(pp); 2267 2268 /* bug 4026339 */ 2269 page_downgrade(pp); 2270 return (pp); 2271 } 2272 2273 /* 2274 * Call the VOP_GETPAGE routine to create the page, thereby 2275 * enabling the vnode driver to allocate any filesystem 2276 * space (e.g., disk block allocation for UFS). This also 2277 * prevents more than one page from being added to the 2278 * vnode at the same time. 2279 */ 2280 err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, NULL, 2281 anon_pl, PAGESIZE, seg, addr, S_CREATE, cred); 2282 if (err) 2283 goto out; 2284 2285 pp = anon_pl[0]; 2286 2287 /* 2288 * If the original page was locked, we need to move the lock 2289 * to the new page by transfering 'cowcnt/lckcnt' of the original 2290 * page to 'cowcnt/lckcnt' of the new page. 2291 * 2292 * See Statement at the beginning of segvn_lockop() and 2293 * comments in page_pp_useclaim() regarding the way 2294 * cowcnts/lckcnts are handled. 2295 * 2296 * Also availrmem must be decremented up front for read only mapping 2297 * before calling page_pp_useclaim. 
page_pp_useclaim will bump it back 2298 * if availrmem did not need to be decremented after all. 2299 */ 2300 if (oppflags & LOCK_PAGE) { 2301 if ((prot & PROT_WRITE) == 0) { 2302 mutex_enter(&freemem_lock); 2303 if (availrmem > pages_pp_maximum) { 2304 availrmem--; 2305 pages_useclaim++; 2306 } else { 2307 mutex_exit(&freemem_lock); 2308 goto out; 2309 } 2310 mutex_exit(&freemem_lock); 2311 } 2312 page_pp_useclaim(opp, pp, prot & PROT_WRITE); 2313 } 2314 2315 /* 2316 * Now copy the contents from the original page, 2317 * which is locked and loaded in the MMU by 2318 * the caller to prevent yet another page fault. 2319 */ 2320 /* XXX - should set mod bit in here */ 2321 if (ppcopy(opp, pp) == 0) { 2322 /* 2323 * Before ppcopy could hanlde UE or other faults, we 2324 * would have panicked here, and still have no option 2325 * but to do so now. 2326 */ 2327 panic("anon_private, ppcopy failed, opp = 0x%p, pp = 0x%p", 2328 opp, pp); 2329 } 2330 2331 hat_setrefmod(pp); /* mark as modified */ 2332 2333 /* 2334 * Unload the old translation. 2335 */ 2336 hat_unload(seg->s_as->a_hat, addr, PAGESIZE, HAT_UNLOAD); 2337 2338 /* 2339 * Free unmapped, unmodified original page. 2340 * or release the lock on the original page, 2341 * otherwise the process will sleep forever in 2342 * anon_decref() waiting for the "exclusive" lock 2343 * on the page. 2344 */ 2345 (void) page_release(opp, 1); 2346 2347 /* 2348 * we are done with page creation so downgrade the new 2349 * page's selock to shared, this helps when multiple 2350 * as_fault(...SOFTLOCK...) are done to the same 2351 * page(aio) 2352 */ 2353 page_downgrade(pp); 2354 2355 /* 2356 * NOTE: The original anon slot must be freed by the 2357 * caller while holding the "anon_map" lock, if we 2358 * copied away from an anonymous page. 
2359 */ 2360 return (pp); 2361 2362 out: 2363 *app = old; 2364 if (pp) 2365 page_unlock(pp); 2366 anon_decref(new); 2367 page_unlock(opp); 2368 return ((page_t *)NULL); 2369 } 2370 2371 int 2372 anon_map_privatepages( 2373 struct anon_map *amp, 2374 ulong_t start_idx, 2375 uint_t szc, 2376 struct seg *seg, 2377 caddr_t addr, 2378 uint_t prot, 2379 page_t *ppa[], 2380 struct vpage vpage[], 2381 int anypgsz, 2382 int pgflags, 2383 struct cred *cred) 2384 { 2385 pgcnt_t pgcnt; 2386 struct vnode *vp; 2387 anoff_t off; 2388 page_t *pl[2], *conpp = NULL; 2389 int err; 2390 int prealloc = 1; 2391 struct anon *ap, *oldap; 2392 caddr_t vaddr; 2393 page_t *pplist, *pp; 2394 ulong_t pg_idx, an_idx; 2395 spgcnt_t nreloc = 0; 2396 int pagelock = 0; 2397 kmutex_t *ahmpages = NULL; 2398 #ifdef DEBUG 2399 int refcnt; 2400 #endif 2401 2402 ASSERT(szc != 0); 2403 ASSERT(szc == seg->s_szc); 2404 2405 VM_STAT_ADD(anonvmstats.privatepages[0]); 2406 2407 pgcnt = page_get_pagecnt(szc); 2408 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 2409 ASSERT(IS_P2ALIGNED(start_idx, pgcnt)); 2410 2411 ASSERT(amp != NULL); 2412 ap = anon_get_ptr(amp->ahp, start_idx); 2413 ASSERT(ap == NULL || ap->an_refcnt >= 1); 2414 2415 VM_STAT_COND_ADD(ap == NULL, anonvmstats.privatepages[1]); 2416 2417 /* 2418 * Now try and allocate the large page. If we fail then just 2419 * let VOP_GETPAGE give us PAGESIZE pages. Normally we let 2420 * the caller make this decision but to avoid added complexity 2421 * it's simplier to handle that case here. 2422 */ 2423 if (anypgsz == -1) { 2424 VM_STAT_ADD(anonvmstats.privatepages[2]); 2425 prealloc = 0; 2426 } else if (page_alloc_pages(anon_vp, seg, addr, &pplist, NULL, szc, 2427 anypgsz, pgflags) != 0) { 2428 VM_STAT_ADD(anonvmstats.privatepages[3]); 2429 prealloc = 0; 2430 } 2431 2432 /* 2433 * make the decrement of all refcnts of all 2434 * anon slots of a large page appear atomic by 2435 * getting an anonpages_hash_lock for the 2436 * first anon slot of a large page. 
2437 */ 2438 if (ap != NULL) { 2439 ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, 2440 ap->an_off)]; 2441 mutex_enter(ahmpages); 2442 if (ap->an_refcnt == 1) { 2443 VM_STAT_ADD(anonvmstats.privatepages[4]); 2444 ASSERT(!anon_share(amp->ahp, start_idx, pgcnt)); 2445 mutex_exit(ahmpages); 2446 2447 if (prealloc) { 2448 page_free_replacement_page(pplist); 2449 page_create_putback(pgcnt); 2450 } 2451 ASSERT(ppa[0]->p_szc <= szc); 2452 if (ppa[0]->p_szc == szc) { 2453 VM_STAT_ADD(anonvmstats.privatepages[5]); 2454 return (0); 2455 } 2456 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { 2457 ASSERT(ppa[pg_idx] != NULL); 2458 page_unlock(ppa[pg_idx]); 2459 } 2460 return (-1); 2461 } 2462 } 2463 2464 /* 2465 * If we are passed in the vpage array and this is 2466 * not PROT_WRITE then we need to decrement availrmem 2467 * up front before we try anything. If we need to and 2468 * can't decrement availrmem then its better to fail now 2469 * than in the middle of processing the new large page. 2470 * page_pp_usclaim() on behalf of each constituent page 2471 * below will adjust availrmem back for the cases not needed. 
2472 */ 2473 if (vpage != NULL && (prot & PROT_WRITE) == 0) { 2474 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { 2475 if (VPP_ISPPLOCK(&vpage[pg_idx])) { 2476 pagelock = 1; 2477 break; 2478 } 2479 } 2480 if (pagelock) { 2481 VM_STAT_ADD(anonvmstats.privatepages[6]); 2482 mutex_enter(&freemem_lock); 2483 if (availrmem >= pages_pp_maximum + pgcnt) { 2484 availrmem -= pgcnt; 2485 pages_useclaim += pgcnt; 2486 } else { 2487 VM_STAT_ADD(anonvmstats.privatepages[7]); 2488 mutex_exit(&freemem_lock); 2489 if (ahmpages != NULL) { 2490 mutex_exit(ahmpages); 2491 } 2492 if (prealloc) { 2493 page_free_replacement_page(pplist); 2494 page_create_putback(pgcnt); 2495 } 2496 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) 2497 if (ppa[pg_idx] != NULL) 2498 page_unlock(ppa[pg_idx]); 2499 return (ENOMEM); 2500 } 2501 mutex_exit(&freemem_lock); 2502 } 2503 } 2504 2505 CPU_STATS_ADD_K(vm, cow_fault, pgcnt); 2506 2507 VM_STAT_ADD(anonvmstats.privatepages[8]); 2508 2509 an_idx = start_idx; 2510 pg_idx = 0; 2511 vaddr = addr; 2512 for (; pg_idx < pgcnt; pg_idx++, an_idx++, vaddr += PAGESIZE) { 2513 ASSERT(ppa[pg_idx] != NULL); 2514 oldap = anon_get_ptr(amp->ahp, an_idx); 2515 ASSERT(ahmpages != NULL || oldap == NULL); 2516 ASSERT(ahmpages == NULL || oldap != NULL); 2517 ASSERT(ahmpages == NULL || oldap->an_refcnt > 1); 2518 ASSERT(ahmpages == NULL || pg_idx != 0 || 2519 (refcnt = oldap->an_refcnt)); 2520 ASSERT(ahmpages == NULL || pg_idx == 0 || 2521 refcnt == oldap->an_refcnt); 2522 2523 ap = anon_alloc(NULL, 0); 2524 2525 swap_xlate(ap, &vp, &off); 2526 2527 /* 2528 * Now setup our preallocated page to pass down to 2529 * swap_getpage(). 2530 */ 2531 if (prealloc) { 2532 pp = pplist; 2533 page_sub(&pplist, pp); 2534 conpp = pp; 2535 } 2536 2537 err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE, NULL, pl, 2538 PAGESIZE, conpp, NULL, &nreloc, seg, vaddr, 2539 S_CREATE, cred); 2540 2541 /* 2542 * Impossible to fail this is S_CREATE. 
2543 */ 2544 if (err) 2545 panic("anon_map_privatepages: VOP_GETPAGE failed"); 2546 2547 ASSERT(prealloc ? pp == pl[0] : pl[0]->p_szc == 0); 2548 ASSERT(prealloc == 0 || nreloc == 1); 2549 2550 pp = pl[0]; 2551 2552 /* 2553 * If the original page was locked, we need to move 2554 * the lock to the new page by transfering 2555 * 'cowcnt/lckcnt' of the original page to 'cowcnt/lckcnt' 2556 * of the new page. pg_idx can be used to index 2557 * into the vpage array since the caller will guarentee 2558 * that vpage struct passed in corresponds to addr 2559 * and forward. 2560 */ 2561 if (vpage != NULL && VPP_ISPPLOCK(&vpage[pg_idx])) { 2562 page_pp_useclaim(ppa[pg_idx], pp, prot & PROT_WRITE); 2563 } else if (pagelock) { 2564 mutex_enter(&freemem_lock); 2565 availrmem++; 2566 pages_useclaim--; 2567 mutex_exit(&freemem_lock); 2568 } 2569 2570 /* 2571 * Now copy the contents from the original page. 2572 */ 2573 if (ppcopy(ppa[pg_idx], pp) == 0) { 2574 /* 2575 * Before ppcopy could hanlde UE or other faults, we 2576 * would have panicked here, and still have no option 2577 * but to do so now. 2578 */ 2579 panic("anon_map_privatepages, ppcopy failed"); 2580 } 2581 2582 hat_setrefmod(pp); /* mark as modified */ 2583 2584 /* 2585 * Release the lock on the original page, 2586 * derement the old slot, and down grade the lock 2587 * on the new copy. 2588 */ 2589 page_unlock(ppa[pg_idx]); 2590 2591 if (!prealloc) 2592 page_downgrade(pp); 2593 2594 ppa[pg_idx] = pp; 2595 2596 /* 2597 * Now reflect the copy in the new anon array. 
2598 */ 2599 ASSERT(ahmpages == NULL || oldap->an_refcnt > 1); 2600 if (oldap != NULL) 2601 anon_decref(oldap); 2602 (void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP); 2603 } 2604 if (ahmpages != NULL) { 2605 mutex_exit(ahmpages); 2606 } 2607 ASSERT(prealloc == 0 || pplist == NULL); 2608 if (prealloc) { 2609 VM_STAT_ADD(anonvmstats.privatepages[9]); 2610 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) { 2611 page_downgrade(ppa[pg_idx]); 2612 } 2613 } 2614 2615 /* 2616 * Unload the old large page translation. 2617 */ 2618 hat_unload(seg->s_as->a_hat, addr, pgcnt << PAGESHIFT, HAT_UNLOAD); 2619 return (0); 2620 } 2621 2622 /* 2623 * Allocate a private zero-filled anon page. 2624 */ 2625 page_t * 2626 anon_zero(struct seg *seg, caddr_t addr, struct anon **app, struct cred *cred) 2627 { 2628 struct anon *ap; 2629 page_t *pp; 2630 struct vnode *vp; 2631 anoff_t off; 2632 page_t *anon_pl[1 + 1]; 2633 int err; 2634 2635 /* Kernel probe */ 2636 TNF_PROBE_1(anon_zero, "vm pagefault", /* CSTYLED */, 2637 tnf_opaque, address, addr); 2638 2639 *app = ap = anon_alloc(NULL, 0); 2640 swap_xlate(ap, &vp, &off); 2641 2642 /* 2643 * Call the VOP_GETPAGE routine to create the page, thereby 2644 * enabling the vnode driver to allocate any filesystem 2645 * dependent structures (e.g., disk block allocation for UFS). 2646 * This also prevents more than on page from being added to 2647 * the vnode at the same time since it is locked. 
2648 */ 2649 err = VOP_GETPAGE(vp, off, PAGESIZE, NULL, 2650 anon_pl, PAGESIZE, seg, addr, S_CREATE, cred); 2651 if (err) { 2652 *app = NULL; 2653 anon_decref(ap); 2654 return (NULL); 2655 } 2656 pp = anon_pl[0]; 2657 2658 pagezero(pp, 0, PAGESIZE); /* XXX - should set mod bit */ 2659 page_downgrade(pp); 2660 CPU_STATS_ADD_K(vm, zfod, 1); 2661 hat_setrefmod(pp); /* mark as modified so pageout writes back */ 2662 return (pp); 2663 } 2664 2665 2666 /* 2667 * Allocate array of private zero-filled anon pages for empty slots 2668 * and kept pages for non empty slots within given range. 2669 * 2670 * NOTE: This rontine will try and use large pages 2671 * if available and supported by underlying platform. 2672 */ 2673 int 2674 anon_map_createpages( 2675 struct anon_map *amp, 2676 ulong_t start_index, 2677 size_t len, 2678 page_t *ppa[], 2679 struct seg *seg, 2680 caddr_t addr, 2681 enum seg_rw rw, 2682 struct cred *cred) 2683 { 2684 2685 struct anon *ap; 2686 struct vnode *ap_vp; 2687 page_t *pp, *pplist, *anon_pl[1 + 1], *conpp = NULL; 2688 int err = 0; 2689 ulong_t p_index, index; 2690 pgcnt_t npgs, pg_cnt; 2691 spgcnt_t nreloc = 0; 2692 uint_t l_szc, szc, prot; 2693 anoff_t ap_off; 2694 size_t pgsz; 2695 lgrp_t *lgrp; 2696 kmutex_t *ahm; 2697 2698 /* 2699 * XXX For now only handle S_CREATE. 2700 */ 2701 ASSERT(rw == S_CREATE); 2702 2703 index = start_index; 2704 p_index = 0; 2705 npgs = btopr(len); 2706 2707 /* 2708 * If this platform supports multiple page sizes 2709 * then try and allocate directly from the free 2710 * list for pages larger than PAGESIZE. 2711 * 2712 * NOTE:When we have page_create_ru we can stop 2713 * directly allocating from the freelist. 2714 */ 2715 l_szc = seg->s_szc; 2716 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 2717 while (npgs) { 2718 2719 /* 2720 * if anon slot already exists 2721 * (means page has been created) 2722 * so 1) look up the page 2723 * 2) if the page is still in memory, get it. 
2724 * 3) if not, create a page and 2725 * page in from physical swap device. 2726 * These are done in anon_getpage(). 2727 */ 2728 ap = anon_get_ptr(amp->ahp, index); 2729 if (ap) { 2730 err = anon_getpage(&ap, &prot, anon_pl, PAGESIZE, 2731 seg, addr, S_READ, cred); 2732 if (err) { 2733 ANON_LOCK_EXIT(&->a_rwlock); 2734 panic("anon_map_createpages: anon_getpage"); 2735 } 2736 pp = anon_pl[0]; 2737 ppa[p_index++] = pp; 2738 2739 /* 2740 * an_pvp can become non-NULL after SysV's page was 2741 * paged out before ISM was attached to this SysV 2742 * shared memory segment. So free swap slot if needed. 2743 */ 2744 if (ap->an_pvp != NULL) { 2745 page_io_lock(pp); 2746 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, 2747 ap->an_off)]; 2748 mutex_enter(ahm); 2749 if (ap->an_pvp != NULL) { 2750 swap_phys_free(ap->an_pvp, 2751 ap->an_poff, PAGESIZE); 2752 ap->an_pvp = NULL; 2753 ap->an_poff = 0; 2754 mutex_exit(ahm); 2755 hat_setmod(pp); 2756 } else { 2757 mutex_exit(ahm); 2758 } 2759 page_io_unlock(pp); 2760 } 2761 2762 addr += PAGESIZE; 2763 index++; 2764 npgs--; 2765 continue; 2766 } 2767 /* 2768 * Now try and allocate the largest page possible 2769 * for the current address and range. 2770 * Keep dropping down in page size until: 2771 * 2772 * 1) Properly aligned 2773 * 2) Does not overlap existing anon pages 2774 * 3) Fits in remaining range. 2775 * 4) able to allocate one. 2776 * 2777 * NOTE: XXX When page_create_ru is completed this code 2778 * will change. 2779 */ 2780 szc = l_szc; 2781 pplist = NULL; 2782 pg_cnt = 0; 2783 while (szc) { 2784 pgsz = page_get_pagesize(szc); 2785 pg_cnt = pgsz >> PAGESHIFT; 2786 if (IS_P2ALIGNED(addr, pgsz) && pg_cnt <= npgs && 2787 anon_pages(amp->ahp, index, pg_cnt) == 0) { 2788 /* 2789 * XXX 2790 * Since we are faking page_create() 2791 * we also need to do the freemem and 2792 * pcf accounting. 
2793 */ 2794 (void) page_create_wait(pg_cnt, PG_WAIT); 2795 2796 /* 2797 * Get lgroup to allocate next page of shared 2798 * memory from and use it to specify where to 2799 * allocate the physical memory 2800 */ 2801 lgrp = lgrp_mem_choose(seg, addr, pgsz); 2802 2803 pplist = page_get_freelist( 2804 anon_vp, (u_offset_t)0, seg, 2805 addr, pgsz, 0, lgrp); 2806 2807 if (pplist == NULL) { 2808 page_create_putback(pg_cnt); 2809 } 2810 2811 /* 2812 * If a request for a page of size 2813 * larger than PAGESIZE failed 2814 * then don't try that size anymore. 2815 */ 2816 if (pplist == NULL) { 2817 l_szc = szc - 1; 2818 } else { 2819 break; 2820 } 2821 } 2822 szc--; 2823 } 2824 2825 /* 2826 * If just using PAGESIZE pages then don't 2827 * directly allocate from the free list. 2828 */ 2829 if (pplist == NULL) { 2830 ASSERT(szc == 0); 2831 pp = anon_zero(seg, addr, &ap, cred); 2832 if (pp == NULL) { 2833 ANON_LOCK_EXIT(&->a_rwlock); 2834 panic("anon_map_createpages: anon_zero"); 2835 } 2836 ppa[p_index++] = pp; 2837 2838 ASSERT(anon_get_ptr(amp->ahp, index) == NULL); 2839 (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP); 2840 2841 addr += PAGESIZE; 2842 index++; 2843 npgs--; 2844 continue; 2845 } 2846 2847 /* 2848 * pplist is a list of pg_cnt PAGESIZE pages. 2849 * These pages are locked SE_EXCL since they 2850 * came directly off the free list. 
2851 */ 2852 ASSERT(IS_P2ALIGNED(pg_cnt, pg_cnt)); 2853 ASSERT(IS_P2ALIGNED(index, pg_cnt)); 2854 ASSERT(conpp == NULL); 2855 while (pg_cnt--) { 2856 2857 ap = anon_alloc(NULL, 0); 2858 swap_xlate(ap, &ap_vp, &ap_off); 2859 2860 ASSERT(pplist != NULL); 2861 pp = pplist; 2862 page_sub(&pplist, pp); 2863 PP_CLRFREE(pp); 2864 PP_CLRAGED(pp); 2865 conpp = pp; 2866 2867 err = swap_getconpage(ap_vp, ap_off, PAGESIZE, 2868 (uint_t *)NULL, anon_pl, PAGESIZE, conpp, NULL, 2869 &nreloc, seg, addr, S_CREATE, cred); 2870 2871 if (err) { 2872 ANON_LOCK_EXIT(&->a_rwlock); 2873 panic("anon_map_createpages: S_CREATE"); 2874 } 2875 2876 ASSERT(anon_pl[0] == pp); 2877 ASSERT(nreloc == 1); 2878 pagezero(pp, 0, PAGESIZE); 2879 CPU_STATS_ADD_K(vm, zfod, 1); 2880 hat_setrefmod(pp); 2881 2882 ASSERT(anon_get_ptr(amp->ahp, index) == NULL); 2883 (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP); 2884 2885 ppa[p_index++] = pp; 2886 2887 addr += PAGESIZE; 2888 index++; 2889 npgs--; 2890 } 2891 conpp = NULL; 2892 pg_cnt = pgsz >> PAGESHIFT; 2893 p_index = p_index - pg_cnt; 2894 while (pg_cnt--) { 2895 page_downgrade(ppa[p_index++]); 2896 } 2897 } 2898 ANON_LOCK_EXIT(&->a_rwlock); 2899 return (0); 2900 } 2901 2902 static int 2903 anon_try_demote_pages( 2904 struct anon_hdr *ahp, 2905 ulong_t sidx, 2906 uint_t szc, 2907 page_t **ppa, 2908 int private) 2909 { 2910 struct anon *ap; 2911 pgcnt_t pgcnt = page_get_pagecnt(szc); 2912 page_t *pp; 2913 pgcnt_t i; 2914 kmutex_t *ahmpages = NULL; 2915 int root = 0; 2916 pgcnt_t npgs; 2917 pgcnt_t curnpgs = 0; 2918 size_t ppasize = 0; 2919 2920 ASSERT(szc != 0); 2921 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 2922 ASSERT(IS_P2ALIGNED(sidx, pgcnt)); 2923 ASSERT(sidx < ahp->size); 2924 2925 if (ppa == NULL) { 2926 ppasize = pgcnt * sizeof (page_t *); 2927 ppa = kmem_alloc(ppasize, KM_SLEEP); 2928 } 2929 2930 ap = anon_get_ptr(ahp, sidx); 2931 if (ap != NULL && private) { 2932 VM_STAT_ADD(anonvmstats.demotepages[1]); 2933 ahmpages = 
&anonpages_hash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 2934 mutex_enter(ahmpages); 2935 } 2936 2937 if (ap != NULL && ap->an_refcnt > 1) { 2938 if (ahmpages != NULL) { 2939 VM_STAT_ADD(anonvmstats.demotepages[2]); 2940 mutex_exit(ahmpages); 2941 } 2942 if (ppasize != 0) { 2943 kmem_free(ppa, ppasize); 2944 } 2945 return (0); 2946 } 2947 if (ahmpages != NULL) { 2948 mutex_exit(ahmpages); 2949 } 2950 if (ahp->size - sidx < pgcnt) { 2951 ASSERT(private == 0); 2952 pgcnt = ahp->size - sidx; 2953 } 2954 for (i = 0; i < pgcnt; i++, sidx++) { 2955 ap = anon_get_ptr(ahp, sidx); 2956 if (ap != NULL) { 2957 if (ap->an_refcnt != 1) { 2958 panic("anon_try_demote_pages: an_refcnt != 1"); 2959 } 2960 pp = ppa[i] = page_lookup(ap->an_vp, ap->an_off, 2961 SE_EXCL); 2962 if (pp != NULL) { 2963 (void) hat_pageunload(pp, 2964 HAT_FORCE_PGUNLOAD); 2965 } 2966 } else { 2967 ppa[i] = NULL; 2968 } 2969 } 2970 for (i = 0; i < pgcnt; i++) { 2971 if ((pp = ppa[i]) != NULL && pp->p_szc != 0) { 2972 ASSERT(pp->p_szc <= szc); 2973 if (!root) { 2974 VM_STAT_ADD(anonvmstats.demotepages[3]); 2975 if (curnpgs != 0) 2976 panic("anon_try_demote_pages: " 2977 "bad large page"); 2978 2979 root = 1; 2980 curnpgs = npgs = 2981 page_get_pagecnt(pp->p_szc); 2982 2983 ASSERT(npgs <= pgcnt); 2984 ASSERT(IS_P2ALIGNED(npgs, npgs)); 2985 ASSERT(!(page_pptonum(pp) & 2986 (npgs - 1))); 2987 } else { 2988 ASSERT(i > 0); 2989 ASSERT(page_pptonum(pp) - 1 == 2990 page_pptonum(ppa[i - 1])); 2991 if ((page_pptonum(pp) & (npgs - 1)) == 2992 npgs - 1) 2993 root = 0; 2994 } 2995 ASSERT(PAGE_EXCL(pp)); 2996 pp->p_szc = 0; 2997 ASSERT(curnpgs > 0); 2998 curnpgs--; 2999 } 3000 } 3001 if (root != 0 || curnpgs != 0) 3002 panic("anon_try_demote_pages: bad large page"); 3003 3004 for (i = 0; i < pgcnt; i++) { 3005 if ((pp = ppa[i]) != NULL) { 3006 ASSERT(!hat_page_is_mapped(pp)); 3007 ASSERT(pp->p_szc == 0); 3008 page_unlock(pp); 3009 } 3010 } 3011 if (ppasize != 0) { 3012 kmem_free(ppa, ppasize); 3013 } 3014 return (1); 3015 } 
3016 3017 /* 3018 * anon_map_demotepages() can only be called by MAP_PRIVATE segments. 3019 */ 3020 int 3021 anon_map_demotepages( 3022 struct anon_map *amp, 3023 ulong_t start_idx, 3024 struct seg *seg, 3025 caddr_t addr, 3026 uint_t prot, 3027 struct vpage vpage[], 3028 struct cred *cred) 3029 { 3030 struct anon *ap; 3031 uint_t szc = seg->s_szc; 3032 pgcnt_t pgcnt = page_get_pagecnt(szc); 3033 size_t ppasize = pgcnt * sizeof (page_t *); 3034 page_t **ppa = kmem_alloc(ppasize, KM_SLEEP); 3035 page_t *pp; 3036 page_t *pl[2]; 3037 pgcnt_t i, pg_idx; 3038 ulong_t an_idx; 3039 caddr_t vaddr; 3040 int err; 3041 int retry = 0; 3042 uint_t vpprot; 3043 3044 ASSERT(RW_WRITE_HELD(&->a_rwlock)); 3045 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 3046 ASSERT(IS_P2ALIGNED(start_idx, pgcnt)); 3047 ASSERT(ppa != NULL); 3048 ASSERT(szc != 0); 3049 ASSERT(szc == amp->a_szc); 3050 3051 VM_STAT_ADD(anonvmstats.demotepages[0]); 3052 3053 top: 3054 if (anon_try_demote_pages(amp->ahp, start_idx, szc, ppa, 1)) { 3055 kmem_free(ppa, ppasize); 3056 return (0); 3057 } 3058 3059 VM_STAT_ADD(anonvmstats.demotepages[4]); 3060 3061 ASSERT(retry == 0); /* we can be here only once */ 3062 3063 vaddr = addr; 3064 for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt; 3065 pg_idx++, an_idx++, vaddr += PAGESIZE) { 3066 ap = anon_get_ptr(amp->ahp, an_idx); 3067 if (ap == NULL) 3068 panic("anon_map_demotepages: no anon slot"); 3069 err = anon_getpage(&ap, &vpprot, pl, PAGESIZE, seg, vaddr, 3070 S_READ, cred); 3071 if (err) { 3072 for (i = 0; i < pg_idx; i++) { 3073 if ((pp = ppa[i]) != NULL) 3074 page_unlock(pp); 3075 } 3076 kmem_free(ppa, ppasize); 3077 return (err); 3078 } 3079 ppa[pg_idx] = pl[0]; 3080 } 3081 3082 err = anon_map_privatepages(amp, start_idx, szc, seg, addr, prot, ppa, 3083 vpage, -1, 0, cred); 3084 if (err > 0) { 3085 VM_STAT_ADD(anonvmstats.demotepages[5]); 3086 kmem_free(ppa, ppasize); 3087 return (err); 3088 } 3089 ASSERT(err == 0 || err == -1); 3090 if (err == -1) { 3091 
VM_STAT_ADD(anonvmstats.demotepages[6]); 3092 retry = 1; 3093 goto top; 3094 } 3095 for (i = 0; i < pgcnt; i++) { 3096 ASSERT(ppa[i] != NULL); 3097 if (ppa[i]->p_szc != 0) 3098 retry = 1; 3099 page_unlock(ppa[i]); 3100 } 3101 if (retry) { 3102 VM_STAT_ADD(anonvmstats.demotepages[7]); 3103 goto top; 3104 } 3105 3106 VM_STAT_ADD(anonvmstats.demotepages[8]); 3107 3108 kmem_free(ppa, ppasize); 3109 3110 return (0); 3111 } 3112 3113 /* 3114 * Free pages of shared anon map. It's assumed that anon maps don't share anon 3115 * structures with private anon maps. Therefore all anon structures should 3116 * have at most one reference at this point. This means underlying pages can 3117 * be exclusively locked and demoted or freed. If not freeing the entire 3118 * large pages demote the ends of the region we free to be able to free 3119 * subpages. Page roots correspend to aligned index positions in anon map. 3120 */ 3121 void 3122 anon_shmap_free_pages(struct anon_map *amp, ulong_t sidx, size_t len) 3123 { 3124 ulong_t eidx = sidx + btopr(len); 3125 pgcnt_t pages = page_get_pagecnt(amp->a_szc); 3126 struct anon_hdr *ahp = amp->ahp; 3127 ulong_t tidx; 3128 size_t size; 3129 ulong_t sidx_aligned; 3130 ulong_t eidx_aligned; 3131 3132 ASSERT(RW_WRITE_HELD(&->a_rwlock)); 3133 ASSERT(amp->refcnt <= 1); 3134 ASSERT(amp->a_szc > 0); 3135 ASSERT(eidx <= ahp->size); 3136 ASSERT(!anon_share(ahp, sidx, btopr(len))); 3137 3138 if (len == 0) { /* XXX */ 3139 return; 3140 } 3141 3142 sidx_aligned = P2ALIGN(sidx, pages); 3143 if (sidx_aligned != sidx || 3144 (eidx < sidx_aligned + pages && eidx < ahp->size)) { 3145 if (!anon_try_demote_pages(ahp, sidx_aligned, 3146 amp->a_szc, NULL, 0)) { 3147 panic("anon_shmap_free_pages: demote failed"); 3148 } 3149 size = (eidx <= sidx_aligned + pages) ? 
(eidx - sidx) : 3150 P2NPHASE(sidx, pages); 3151 size <<= PAGESHIFT; 3152 anon_free(ahp, sidx, size); 3153 sidx = sidx_aligned + pages; 3154 if (eidx <= sidx) { 3155 return; 3156 } 3157 } 3158 eidx_aligned = P2ALIGN(eidx, pages); 3159 if (sidx < eidx_aligned) { 3160 anon_free_pages(ahp, sidx, 3161 (eidx_aligned - sidx) << PAGESHIFT, 3162 amp->a_szc); 3163 sidx = eidx_aligned; 3164 } 3165 ASSERT(sidx == eidx_aligned); 3166 if (eidx == eidx_aligned) { 3167 return; 3168 } 3169 tidx = eidx; 3170 if (eidx != ahp->size && anon_get_next_ptr(ahp, &tidx) != NULL && 3171 tidx - sidx < pages) { 3172 if (!anon_try_demote_pages(ahp, sidx, amp->a_szc, NULL, 0)) { 3173 panic("anon_shmap_free_pages: demote failed"); 3174 } 3175 size = (eidx - sidx) << PAGESHIFT; 3176 anon_free(ahp, sidx, size); 3177 } else { 3178 anon_free_pages(ahp, sidx, pages << PAGESHIFT, amp->a_szc); 3179 } 3180 } 3181 3182 /* 3183 * Allocate and initialize an anon_map structure for seg 3184 * associating the given swap reservation with the new anon_map. 3185 */ 3186 struct anon_map * 3187 anonmap_alloc(size_t size, size_t swresv, int flags) 3188 { 3189 struct anon_map *amp; 3190 int kmflags = (flags & ANON_NOSLEEP) ? 
KM_NOSLEEP : KM_SLEEP; 3191 3192 amp = kmem_cache_alloc(anonmap_cache, kmflags); 3193 if (amp == NULL) { 3194 ASSERT(kmflags == KM_NOSLEEP); 3195 return (NULL); 3196 } 3197 3198 amp->ahp = anon_create(btopr(size), flags); 3199 if (amp->ahp == NULL) { 3200 ASSERT(flags == ANON_NOSLEEP); 3201 kmem_cache_free(anonmap_cache, amp); 3202 return (NULL); 3203 } 3204 amp->refcnt = 1; 3205 amp->size = size; 3206 amp->swresv = swresv; 3207 amp->locality = 0; 3208 amp->a_szc = 0; 3209 amp->a_sp = NULL; 3210 return (amp); 3211 } 3212 3213 void 3214 anonmap_free(struct anon_map *amp) 3215 { 3216 ASSERT(amp->ahp); 3217 ASSERT(amp->refcnt == 0); 3218 3219 lgrp_shm_policy_fini(amp, NULL); 3220 anon_release(amp->ahp, btopr(amp->size)); 3221 kmem_cache_free(anonmap_cache, amp); 3222 } 3223 3224 /* 3225 * Returns true if the app array has some empty slots. 3226 * The offp and lenp paramters are in/out paramters. On entry 3227 * these values represent the starting offset and length of the 3228 * mapping. When true is returned, these values may be modified 3229 * to be the largest range which includes empty slots. 3230 */ 3231 int 3232 non_anon(struct anon_hdr *ahp, ulong_t anon_idx, u_offset_t *offp, 3233 size_t *lenp) 3234 { 3235 ulong_t i, el; 3236 ssize_t low, high; 3237 struct anon *ap; 3238 3239 low = -1; 3240 for (i = 0, el = *lenp; i < el; i += PAGESIZE, anon_idx++) { 3241 ap = anon_get_ptr(ahp, anon_idx); 3242 if (ap == NULL) { 3243 if (low == -1) 3244 low = i; 3245 high = i; 3246 } 3247 } 3248 if (low != -1) { 3249 /* 3250 * Found at least one non-anon page. 3251 * Set up the off and len return values. 3252 */ 3253 if (low != 0) 3254 *offp += low; 3255 *lenp = high - low + PAGESIZE; 3256 return (1); 3257 } 3258 return (0); 3259 } 3260 3261 /* 3262 * Return a count of the number of existing anon pages in the anon array 3263 * app in the range (off, off+len). The array and slots must be guaranteed 3264 * stable by the caller. 
3265 */ 3266 pgcnt_t 3267 anon_pages(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots) 3268 { 3269 pgcnt_t cnt = 0; 3270 3271 while (nslots-- > 0) { 3272 if ((anon_get_ptr(ahp, anon_index)) != NULL) 3273 cnt++; 3274 anon_index++; 3275 } 3276 return (cnt); 3277 } 3278 3279 /* 3280 * Move reserved phys swap into memory swap (unreserve phys swap 3281 * and reserve mem swap by the same amount). 3282 * Used by segspt when it needs to lock resrved swap npages in memory 3283 */ 3284 int 3285 anon_swap_adjust(pgcnt_t npages) 3286 { 3287 pgcnt_t unlocked_mem_swap; 3288 3289 mutex_enter(&anoninfo_lock); 3290 3291 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 3292 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 3293 3294 unlocked_mem_swap = k_anoninfo.ani_mem_resv 3295 - k_anoninfo.ani_locked_swap; 3296 if (npages > unlocked_mem_swap) { 3297 spgcnt_t adjusted_swap = npages - unlocked_mem_swap; 3298 3299 /* 3300 * if there is not enough unlocked mem swap we take missing 3301 * amount from phys swap and give it to mem swap 3302 */ 3303 if (!page_reclaim_mem(adjusted_swap, segspt_minfree, 1)) { 3304 mutex_exit(&anoninfo_lock); 3305 return (ENOMEM); 3306 } 3307 3308 k_anoninfo.ani_mem_resv += adjusted_swap; 3309 ASSERT(k_anoninfo.ani_phys_resv >= adjusted_swap); 3310 k_anoninfo.ani_phys_resv -= adjusted_swap; 3311 3312 ANI_ADD(adjusted_swap); 3313 } 3314 k_anoninfo.ani_locked_swap += npages; 3315 3316 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 3317 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 3318 3319 mutex_exit(&anoninfo_lock); 3320 3321 return (0); 3322 } 3323 3324 /* 3325 * 'unlocked' reserved mem swap so when it is unreserved it 3326 * can be moved back phys (disk) swap 3327 */ 3328 void 3329 anon_swap_restore(pgcnt_t npages) 3330 { 3331 mutex_enter(&anoninfo_lock); 3332 3333 ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv); 3334 3335 ASSERT(k_anoninfo.ani_locked_swap >= npages); 3336 
k_anoninfo.ani_locked_swap -= npages; 3337 3338 ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv); 3339 3340 mutex_exit(&anoninfo_lock); 3341 } 3342 3343 /* 3344 * Return the pointer from the list for a 3345 * specified anon index. 3346 */ 3347 ulong_t * 3348 anon_get_slot(struct anon_hdr *ahp, ulong_t an_idx) 3349 { 3350 struct anon **app; 3351 void **ppp; 3352 3353 ASSERT(an_idx < ahp->size); 3354 3355 /* 3356 * Single level case. 3357 */ 3358 if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 3359 return ((ulong_t *)&ahp->array_chunk[an_idx]); 3360 } else { 3361 3362 /* 3363 * 2 level case. 3364 */ 3365 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 3366 if (*ppp == NULL) { 3367 mutex_enter(&ahp->serial_lock); 3368 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 3369 if (*ppp == NULL) 3370 *ppp = kmem_zalloc(PAGESIZE, KM_SLEEP); 3371 mutex_exit(&ahp->serial_lock); 3372 } 3373 app = *ppp; 3374 return ((ulong_t *)&app[an_idx & ANON_CHUNK_OFF]); 3375 } 3376 } 3377 3378 void 3379 anon_array_enter(struct anon_map *amp, ulong_t an_idx, anon_sync_obj_t *sobj) 3380 { 3381 ulong_t *ap_slot; 3382 kmutex_t *mtx; 3383 kcondvar_t *cv; 3384 int hash; 3385 3386 /* 3387 * Use szc to determine anon slot(s) to appear atomic. 3388 * If szc = 0, then lock the anon slot and mark it busy. 3389 * If szc > 0, then lock the range of slots by getting the 3390 * anon_array_lock for the first anon slot, and mark only the 3391 * first anon slot busy to represent whole range being busy. 
3392 */ 3393 3394 ASSERT(RW_READ_HELD(&->a_rwlock)); 3395 an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc)); 3396 hash = ANON_ARRAY_HASH(amp, an_idx); 3397 sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex; 3398 sobj->sync_cv = cv = &anon_array_cv[hash]; 3399 mutex_enter(mtx); 3400 ap_slot = anon_get_slot(amp->ahp, an_idx); 3401 while (ANON_ISBUSY(ap_slot)) 3402 cv_wait(cv, mtx); 3403 ANON_SETBUSY(ap_slot); 3404 sobj->sync_data = ap_slot; 3405 mutex_exit(mtx); 3406 } 3407 3408 int 3409 anon_array_try_enter(struct anon_map *amp, ulong_t an_idx, 3410 anon_sync_obj_t *sobj) 3411 { 3412 ulong_t *ap_slot; 3413 kmutex_t *mtx; 3414 int hash; 3415 3416 /* 3417 * Try to lock a range of anon slots. 3418 * Use szc to determine anon slot(s) to appear atomic. 3419 * If szc = 0, then lock the anon slot and mark it busy. 3420 * If szc > 0, then lock the range of slots by getting the 3421 * anon_array_lock for the first anon slot, and mark only the 3422 * first anon slot busy to represent whole range being busy. 3423 * Fail if the mutex or the anon_array are busy. 3424 */ 3425 3426 ASSERT(RW_READ_HELD(&->a_rwlock)); 3427 an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc)); 3428 hash = ANON_ARRAY_HASH(amp, an_idx); 3429 sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex; 3430 sobj->sync_cv = &anon_array_cv[hash]; 3431 if (!mutex_tryenter(mtx)) { 3432 return (EWOULDBLOCK); 3433 } 3434 ap_slot = anon_get_slot(amp->ahp, an_idx); 3435 if (ANON_ISBUSY(ap_slot)) { 3436 mutex_exit(mtx); 3437 return (EWOULDBLOCK); 3438 } 3439 ANON_SETBUSY(ap_slot); 3440 sobj->sync_data = ap_slot; 3441 mutex_exit(mtx); 3442 return (0); 3443 } 3444 3445 void 3446 anon_array_exit(anon_sync_obj_t *sobj) 3447 { 3448 mutex_enter(sobj->sync_mutex); 3449 ASSERT(ANON_ISBUSY(sobj->sync_data)); 3450 ANON_CLRBUSY(sobj->sync_data); 3451 if (CV_HAS_WAITERS(sobj->sync_cv)) 3452 cv_broadcast(sobj->sync_cv); 3453 mutex_exit(sobj->sync_mutex); 3454 } 3455