1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 29 /* 30 * University Copyright- Copyright (c) 1982, 1986, 1988 31 * The Regents of the University of California 32 * All Rights Reserved 33 * 34 * University Acknowledgment- Portions of this document are derived from 35 * software developed by the University of California, Berkeley, and its 36 * contributors. 37 */ 38 39 #pragma ident "%Z%%M% %I% %E% SMI" 40 41 /* 42 * VM - anonymous pages. 43 * 44 * This layer sits immediately above the vm_swap layer. It manages 45 * physical pages that have no permanent identity in the file system 46 * name space, using the services of the vm_swap layer to allocate 47 * backing storage for these pages. Since these pages have no external 48 * identity, they are discarded when the last reference is removed. 
 *
 * An important function of this layer is to manage low-level sharing
 * of pages that are logically distinct but that happen to be
 * physically identical (e.g., the corresponding pages of the processes
 * resulting from a fork before one process or the other changes their
 * contents).  This pseudo-sharing is present only as an optimization
 * and is not to be confused with true sharing in which multiple
 * address spaces deliberately contain references to the same object;
 * such sharing is managed at a higher level.
 *
 * The key data structure here is the anon struct, which contains a
 * reference count for its associated physical page and a hint about
 * the identity of that page.  Anon structs typically live in arrays,
 * with an instance's position in its array determining where the
 * corresponding backing storage is allocated; however, the swap_xlate()
 * routine abstracts away this representation information so that the
 * rest of the anon layer need not know it.  (See the swap layer for
 * more details on anon struct layout.)
 *
 * In future versions of the system, the association between an
 * anon struct and its position on backing store will change so that
 * we don't require backing store for all anonymous pages in the system.
 * This is an important consideration for large memory systems.
 * We can also use this technique to delay binding physical locations
 * to anonymous pages until pageout/swapout time where we can make
 * smarter allocation decisions to improve anonymous klustering.
 *
 * Many of the routines defined here take a (struct anon **) argument,
 * which allows the code at this level to manage anon pages directly,
 * so that callers can regard anon structs as opaque objects and not be
 * concerned with assigning or inspecting their contents.
 *
 * Clients of this layer refer to anon pages indirectly.
That is, they 82 * maintain arrays of pointers to anon structs rather than maintaining 83 * anon structs themselves. The (struct anon **) arguments mentioned 84 * above are pointers to entries in these arrays. It is these arrays 85 * that capture the mapping between offsets within a given segment and 86 * the corresponding anonymous backing storage address. 87 */ 88 89 #ifdef DEBUG 90 #define ANON_DEBUG 91 #endif 92 93 #include <sys/types.h> 94 #include <sys/t_lock.h> 95 #include <sys/param.h> 96 #include <sys/systm.h> 97 #include <sys/mman.h> 98 #include <sys/cred.h> 99 #include <sys/thread.h> 100 #include <sys/vnode.h> 101 #include <sys/cpuvar.h> 102 #include <sys/swap.h> 103 #include <sys/cmn_err.h> 104 #include <sys/vtrace.h> 105 #include <sys/kmem.h> 106 #include <sys/sysmacros.h> 107 #include <sys/bitmap.h> 108 #include <sys/vmsystm.h> 109 #include <sys/debug.h> 110 #include <sys/fs/swapnode.h> 111 #include <sys/tnf_probe.h> 112 #include <sys/lgrp.h> 113 #include <sys/policy.h> 114 #include <sys/condvar_impl.h> 115 #include <sys/mutex_impl.h> 116 117 #include <vm/as.h> 118 #include <vm/hat.h> 119 #include <vm/anon.h> 120 #include <vm/page.h> 121 #include <vm/vpage.h> 122 #include <vm/seg.h> 123 #include <vm/rm.h> 124 125 #include <fs/fs_subr.h> 126 127 struct vnode *anon_vp; 128 129 int anon_debug; 130 131 kmutex_t anoninfo_lock; 132 struct k_anoninfo k_anoninfo; 133 ani_free_t ani_free_pool[ANI_MAX_POOL]; 134 pad_mutex_t anon_array_lock[ANON_LOCKSIZE]; 135 kcondvar_t anon_array_cv[ANON_LOCKSIZE]; 136 137 /* 138 * Global hash table for (vp, off) -> anon slot 139 */ 140 extern int swap_maxcontig; 141 size_t anon_hash_size; 142 struct anon **anon_hash; 143 144 static struct kmem_cache *anon_cache; 145 static struct kmem_cache *anonmap_cache; 146 147 #ifdef VM_STATS 148 static struct anonvmstats_str { 149 ulong_t getpages[30]; 150 ulong_t privatepages[10]; 151 ulong_t demotepages[9]; 152 ulong_t decrefpages[9]; 153 ulong_t dupfillholes[4]; 154 ulong_t 
freepages[1]; 155 } anonvmstats; 156 #endif /* VM_STATS */ 157 158 159 /*ARGSUSED*/ 160 static int 161 anonmap_cache_constructor(void *buf, void *cdrarg, int kmflags) 162 { 163 struct anon_map *amp = buf; 164 165 rw_init(&->a_rwlock, NULL, RW_DEFAULT, NULL); 166 return (0); 167 } 168 169 /*ARGSUSED1*/ 170 static void 171 anonmap_cache_destructor(void *buf, void *cdrarg) 172 { 173 struct anon_map *amp = buf; 174 175 rw_destroy(&->a_rwlock); 176 } 177 178 kmutex_t anonhash_lock[AH_LOCK_SIZE]; 179 kmutex_t anonpages_hash_lock[AH_LOCK_SIZE]; 180 181 void 182 anon_init(void) 183 { 184 int i; 185 186 anon_hash_size = 1L << highbit(physmem / ANON_HASHAVELEN); 187 188 for (i = 0; i < AH_LOCK_SIZE; i++) { 189 mutex_init(&anonhash_lock[i], NULL, MUTEX_DEFAULT, NULL); 190 mutex_init(&anonpages_hash_lock[i], NULL, MUTEX_DEFAULT, NULL); 191 } 192 193 for (i = 0; i < ANON_LOCKSIZE; i++) { 194 mutex_init(&anon_array_lock[i].pad_mutex, NULL, 195 MUTEX_DEFAULT, NULL); 196 cv_init(&anon_array_cv[i], NULL, CV_DEFAULT, NULL); 197 } 198 199 anon_hash = (struct anon **) 200 kmem_zalloc(sizeof (struct anon *) * anon_hash_size, KM_SLEEP); 201 anon_cache = kmem_cache_create("anon_cache", sizeof (struct anon), 202 AN_CACHE_ALIGN, NULL, NULL, NULL, NULL, NULL, 0); 203 anonmap_cache = kmem_cache_create("anonmap_cache", 204 sizeof (struct anon_map), 0, 205 anonmap_cache_constructor, anonmap_cache_destructor, NULL, 206 NULL, NULL, 0); 207 swap_maxcontig = (1024 * 1024) >> PAGESHIFT; /* 1MB of pages */ 208 209 anon_vp = vn_alloc(KM_SLEEP); 210 vn_setops(anon_vp, swap_vnodeops); 211 anon_vp->v_type = VREG; 212 anon_vp->v_flag |= (VISSWAP|VISSWAPFS); 213 } 214 215 /* 216 * Global anon slot hash table manipulation. 
217 */ 218 219 static void 220 anon_addhash(struct anon *ap) 221 { 222 int index; 223 224 ASSERT(MUTEX_HELD(&anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)])); 225 index = ANON_HASH(ap->an_vp, ap->an_off); 226 ap->an_hash = anon_hash[index]; 227 anon_hash[index] = ap; 228 } 229 230 static void 231 anon_rmhash(struct anon *ap) 232 { 233 struct anon **app; 234 235 ASSERT(MUTEX_HELD(&anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)])); 236 237 for (app = &anon_hash[ANON_HASH(ap->an_vp, ap->an_off)]; 238 *app; app = &((*app)->an_hash)) { 239 if (*app == ap) { 240 *app = ap->an_hash; 241 break; 242 } 243 } 244 } 245 246 /* 247 * The anon array interfaces. Functions allocating, 248 * freeing array of pointers, and returning/setting 249 * entries in the array of pointers for a given offset. 250 * 251 * Create the list of pointers 252 */ 253 struct anon_hdr * 254 anon_create(pgcnt_t npages, int flags) 255 { 256 struct anon_hdr *ahp; 257 ulong_t nchunks; 258 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 259 260 if ((ahp = kmem_zalloc(sizeof (struct anon_hdr), kmemflags)) == NULL) { 261 return (NULL); 262 } 263 264 mutex_init(&ahp->serial_lock, NULL, MUTEX_DEFAULT, NULL); 265 /* 266 * Single level case. 267 */ 268 ahp->size = npages; 269 if (npages <= ANON_CHUNK_SIZE || (flags & ANON_ALLOC_FORCE)) { 270 271 if (flags & ANON_ALLOC_FORCE) 272 ahp->flags |= ANON_ALLOC_FORCE; 273 274 ahp->array_chunk = kmem_zalloc( 275 ahp->size * sizeof (struct anon *), kmemflags); 276 277 if (ahp->array_chunk == NULL) { 278 kmem_free(ahp, sizeof (struct anon_hdr)); 279 return (NULL); 280 } 281 } else { 282 /* 283 * 2 Level case. 284 * anon hdr size needs to be rounded off to be a multiple 285 * of ANON_CHUNK_SIZE. This is important as various anon 286 * related functions depend on this. 287 * NOTE - 288 * anon_grow() makes anon hdr size a multiple of 289 * ANON_CHUNK_SIZE. 290 * amp size is <= anon hdr size. 291 * anon_index + seg_pgs <= anon hdr size. 
292 */ 293 ahp->size = P2ROUNDUP(npages, ANON_CHUNK_SIZE); 294 nchunks = ahp->size >> ANON_CHUNK_SHIFT; 295 296 ahp->array_chunk = kmem_zalloc(nchunks * sizeof (ulong_t *), 297 kmemflags); 298 299 if (ahp->array_chunk == NULL) { 300 kmem_free(ahp, sizeof (struct anon_hdr)); 301 return (NULL); 302 } 303 } 304 return (ahp); 305 } 306 307 /* 308 * Free the array of pointers 309 */ 310 void 311 anon_release(struct anon_hdr *ahp, pgcnt_t npages) 312 { 313 ulong_t i; 314 void **ppp; 315 ulong_t nchunks; 316 317 ASSERT(npages <= ahp->size); 318 319 /* 320 * Single level case. 321 */ 322 if (npages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 323 kmem_free(ahp->array_chunk, ahp->size * sizeof (struct anon *)); 324 } else { 325 /* 326 * 2 level case. 327 */ 328 nchunks = ahp->size >> ANON_CHUNK_SHIFT; 329 for (i = 0; i < nchunks; i++) { 330 ppp = &ahp->array_chunk[i]; 331 if (*ppp != NULL) 332 kmem_free(*ppp, PAGESIZE); 333 } 334 kmem_free(ahp->array_chunk, nchunks * sizeof (ulong_t *)); 335 } 336 mutex_destroy(&ahp->serial_lock); 337 kmem_free(ahp, sizeof (struct anon_hdr)); 338 } 339 340 /* 341 * Return the pointer from the list for a 342 * specified anon index. 343 */ 344 struct anon * 345 anon_get_ptr(struct anon_hdr *ahp, ulong_t an_idx) 346 { 347 struct anon **app; 348 349 ASSERT(an_idx < ahp->size); 350 351 /* 352 * Single level case. 353 */ 354 if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 355 return ((struct anon *) 356 ((uintptr_t)ahp->array_chunk[an_idx] & ANON_PTRMASK)); 357 } else { 358 359 /* 360 * 2 level case. 361 */ 362 app = ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 363 if (app) { 364 return ((struct anon *) 365 ((uintptr_t)app[an_idx & ANON_CHUNK_OFF] & 366 ANON_PTRMASK)); 367 } else { 368 return (NULL); 369 } 370 } 371 } 372 373 /* 374 * Return the anon pointer for the first valid entry in the anon list, 375 * starting from the given index. 
376 */ 377 struct anon * 378 anon_get_next_ptr(struct anon_hdr *ahp, ulong_t *index) 379 { 380 struct anon *ap; 381 struct anon **app; 382 ulong_t chunkoff; 383 ulong_t i; 384 ulong_t j; 385 pgcnt_t size; 386 387 i = *index; 388 size = ahp->size; 389 390 ASSERT(i < size); 391 392 if ((size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 393 /* 394 * 1 level case 395 */ 396 while (i < size) { 397 ap = (struct anon *) 398 ((uintptr_t)ahp->array_chunk[i] & ANON_PTRMASK); 399 if (ap) { 400 *index = i; 401 return (ap); 402 } 403 i++; 404 } 405 } else { 406 /* 407 * 2 level case 408 */ 409 chunkoff = i & ANON_CHUNK_OFF; 410 while (i < size) { 411 app = ahp->array_chunk[i >> ANON_CHUNK_SHIFT]; 412 if (app) 413 for (j = chunkoff; j < ANON_CHUNK_SIZE; j++) { 414 ap = (struct anon *) 415 ((uintptr_t)app[j] & 416 ANON_PTRMASK); 417 if (ap) { 418 *index = i + (j - chunkoff); 419 return (ap); 420 } 421 } 422 chunkoff = 0; 423 i = (i + ANON_CHUNK_SIZE) & ~ANON_CHUNK_OFF; 424 } 425 } 426 *index = size; 427 return (NULL); 428 } 429 430 /* 431 * Set list entry with a given pointer for a specified offset 432 */ 433 int 434 anon_set_ptr(struct anon_hdr *ahp, ulong_t an_idx, struct anon *ap, int flags) 435 { 436 void **ppp; 437 struct anon **app; 438 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 439 uintptr_t *ap_addr; 440 441 ASSERT(an_idx < ahp->size); 442 443 /* 444 * Single level case. 445 */ 446 if (ahp->size <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 447 ap_addr = (uintptr_t *)&ahp->array_chunk[an_idx]; 448 } else { 449 450 /* 451 * 2 level case. 
452 */ 453 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 454 455 ASSERT(ppp != NULL); 456 if (*ppp == NULL) { 457 mutex_enter(&ahp->serial_lock); 458 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 459 if (*ppp == NULL) { 460 *ppp = kmem_zalloc(PAGESIZE, kmemflags); 461 if (*ppp == NULL) { 462 mutex_exit(&ahp->serial_lock); 463 return (ENOMEM); 464 } 465 } 466 mutex_exit(&ahp->serial_lock); 467 } 468 app = *ppp; 469 ap_addr = (uintptr_t *)&app[an_idx & ANON_CHUNK_OFF]; 470 } 471 *ap_addr = (*ap_addr & ~ANON_PTRMASK) | (uintptr_t)ap; 472 return (0); 473 } 474 475 /* 476 * Copy anon array into a given new anon array 477 */ 478 int 479 anon_copy_ptr(struct anon_hdr *sahp, ulong_t s_idx, 480 struct anon_hdr *dahp, ulong_t d_idx, 481 pgcnt_t npages, int flags) 482 { 483 void **sapp, **dapp; 484 void *ap; 485 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 486 487 ASSERT((s_idx < sahp->size) && (d_idx < dahp->size)); 488 ASSERT((npages <= sahp->size) && (npages <= dahp->size)); 489 490 /* 491 * Both arrays are 1 level. 492 */ 493 if (((sahp->size <= ANON_CHUNK_SIZE) && 494 (dahp->size <= ANON_CHUNK_SIZE)) || 495 ((sahp->flags & ANON_ALLOC_FORCE) && 496 (dahp->flags & ANON_ALLOC_FORCE))) { 497 498 bcopy(&sahp->array_chunk[s_idx], &dahp->array_chunk[d_idx], 499 npages * sizeof (struct anon *)); 500 return (0); 501 } 502 503 /* 504 * Both arrays are 2 levels. 
505 */ 506 if (sahp->size > ANON_CHUNK_SIZE && 507 dahp->size > ANON_CHUNK_SIZE && 508 ((sahp->flags & ANON_ALLOC_FORCE) == 0) && 509 ((dahp->flags & ANON_ALLOC_FORCE) == 0)) { 510 511 ulong_t sapidx, dapidx; 512 ulong_t *sap, *dap; 513 ulong_t chknp; 514 515 while (npages != 0) { 516 517 sapidx = s_idx & ANON_CHUNK_OFF; 518 dapidx = d_idx & ANON_CHUNK_OFF; 519 chknp = ANON_CHUNK_SIZE - MAX(sapidx, dapidx); 520 if (chknp > npages) 521 chknp = npages; 522 523 sapp = &sahp->array_chunk[s_idx >> ANON_CHUNK_SHIFT]; 524 if ((sap = *sapp) != NULL) { 525 dapp = &dahp->array_chunk[d_idx 526 >> ANON_CHUNK_SHIFT]; 527 if ((dap = *dapp) == NULL) { 528 *dapp = kmem_zalloc(PAGESIZE, 529 kmemflags); 530 if ((dap = *dapp) == NULL) 531 return (ENOMEM); 532 } 533 bcopy((sap + sapidx), (dap + dapidx), 534 chknp << ANON_PTRSHIFT); 535 } 536 s_idx += chknp; 537 d_idx += chknp; 538 npages -= chknp; 539 } 540 return (0); 541 } 542 543 /* 544 * At least one of the arrays is 2 level. 545 */ 546 while (npages--) { 547 if ((ap = anon_get_ptr(sahp, s_idx)) != NULL) { 548 ASSERT(!ANON_ISBUSY(anon_get_slot(sahp, s_idx))); 549 if (anon_set_ptr(dahp, d_idx, ap, flags) == ENOMEM) 550 return (ENOMEM); 551 } 552 s_idx++; 553 d_idx++; 554 } 555 return (0); 556 } 557 558 559 /* 560 * ANON_INITBUF is a convenience macro for anon_grow() below. It 561 * takes a buffer dst, which is at least as large as buffer src. It 562 * does a bcopy from src into dst, and then bzeros the extra bytes 563 * of dst. If tail is set, the data in src is tail aligned within 564 * dst instead of head aligned. 
565 */ 566 567 #define ANON_INITBUF(src, srclen, dst, dstsize, tail) \ 568 if (tail) { \ 569 bzero((dst), (dstsize) - (srclen)); \ 570 bcopy((src), (char *)(dst) + (dstsize) - (srclen), (srclen)); \ 571 } else { \ 572 bcopy((src), (dst), (srclen)); \ 573 bzero((char *)(dst) + (srclen), (dstsize) - (srclen)); \ 574 } 575 576 #define ANON_1_LEVEL_INC (ANON_CHUNK_SIZE / 8) 577 #define ANON_2_LEVEL_INC (ANON_1_LEVEL_INC * ANON_CHUNK_SIZE) 578 579 /* 580 * anon_grow() is used to efficiently extend an existing anon array. 581 * startidx_p points to the index into the anon array of the first page 582 * that is in use. oldseg_pgs is the number of pages in use, starting at 583 * *startidx_p. newpages is the number of additional pages desired. 584 * 585 * If startidx_p == NULL, startidx is taken to be 0 and cannot be changed. 586 * 587 * The growth is done by creating a new top level of the anon array, 588 * and (if the array is 2-level) reusing the existing second level arrays. 589 * 590 * flags can be used to specify ANON_NOSLEEP and ANON_GROWDOWN. 591 * 592 * Returns the new number of pages in the anon array. 593 */ 594 pgcnt_t 595 anon_grow(struct anon_hdr *ahp, ulong_t *startidx_p, pgcnt_t oldseg_pgs, 596 pgcnt_t newseg_pgs, int flags) 597 { 598 ulong_t startidx = startidx_p ? *startidx_p : 0; 599 pgcnt_t oldamp_pgs = ahp->size, newamp_pgs; 600 pgcnt_t oelems, nelems, totpages; 601 void **level1; 602 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP; 603 int growdown = (flags & ANON_GROWDOWN); 604 size_t newarrsz, oldarrsz; 605 void *level2; 606 607 ASSERT(!(startidx_p == NULL && growdown)); 608 ASSERT(startidx + oldseg_pgs <= ahp->size); 609 610 /* 611 * Determine the total number of pages needed in the new 612 * anon array. If growing down, totpages is all pages from 613 * startidx through the end of the array, plus <newseg_pgs> 614 * pages. If growing up, keep all pages from page 0 through 615 * the last page currently in use, plus <newseg_pgs> pages. 
616 */ 617 if (growdown) 618 totpages = oldamp_pgs - startidx + newseg_pgs; 619 else 620 totpages = startidx + oldseg_pgs + newseg_pgs; 621 622 /* If the array is already large enough, just return. */ 623 624 if (oldamp_pgs >= totpages) { 625 if (growdown) 626 *startidx_p = oldamp_pgs - totpages; 627 return (oldamp_pgs); 628 } 629 630 /* 631 * oldamp_pgs/newamp_pgs are the total numbers of pages represented 632 * by the corresponding arrays. 633 * oelems/nelems are the number of pointers in the top level arrays 634 * which may be either level 1 or level 2. 635 * Will the new anon array be one level or two levels? 636 */ 637 if (totpages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) { 638 newamp_pgs = P2ROUNDUP(totpages, ANON_1_LEVEL_INC); 639 oelems = oldamp_pgs; 640 nelems = newamp_pgs; 641 } else { 642 newamp_pgs = P2ROUNDUP(totpages, ANON_2_LEVEL_INC); 643 oelems = (oldamp_pgs + ANON_CHUNK_OFF) >> ANON_CHUNK_SHIFT; 644 nelems = newamp_pgs >> ANON_CHUNK_SHIFT; 645 } 646 647 newarrsz = nelems * sizeof (void *); 648 level1 = kmem_alloc(newarrsz, kmemflags); 649 if (level1 == NULL) 650 return (0); 651 652 /* Are we converting from a one level to a two level anon array? */ 653 654 if (newamp_pgs > ANON_CHUNK_SIZE && oldamp_pgs <= ANON_CHUNK_SIZE && 655 !(ahp->flags & ANON_ALLOC_FORCE)) { 656 657 /* 658 * Yes, we're converting to a two level. Reuse old level 1 659 * as new level 2 if it is exactly PAGESIZE. Otherwise 660 * alloc a new level 2 and copy the old level 1 data into it. 
661 */ 662 if (oldamp_pgs == ANON_CHUNK_SIZE) { 663 level2 = (void *)ahp->array_chunk; 664 } else { 665 level2 = kmem_alloc(PAGESIZE, kmemflags); 666 if (level2 == NULL) { 667 kmem_free(level1, newarrsz); 668 return (0); 669 } 670 oldarrsz = oldamp_pgs * sizeof (void *); 671 672 ANON_INITBUF(ahp->array_chunk, oldarrsz, 673 level2, PAGESIZE, growdown); 674 kmem_free(ahp->array_chunk, oldarrsz); 675 } 676 bzero(level1, newarrsz); 677 if (growdown) 678 level1[nelems - 1] = level2; 679 else 680 level1[0] = level2; 681 } else { 682 oldarrsz = oelems * sizeof (void *); 683 684 ANON_INITBUF(ahp->array_chunk, oldarrsz, 685 level1, newarrsz, growdown); 686 kmem_free(ahp->array_chunk, oldarrsz); 687 } 688 689 ahp->array_chunk = level1; 690 ahp->size = newamp_pgs; 691 if (growdown) 692 *startidx_p = newamp_pgs - totpages; 693 694 return (newamp_pgs); 695 } 696 697 698 /* 699 * Called from clock handler to sync ani_free value. 700 */ 701 702 void 703 set_anoninfo(void) 704 { 705 int ix; 706 pgcnt_t total = 0; 707 708 for (ix = 0; ix < ANI_MAX_POOL; ix++) { 709 total += ani_free_pool[ix].ani_count; 710 } 711 k_anoninfo.ani_free = total; 712 } 713 714 /* 715 * Reserve anon space. 716 * 717 * It's no longer simply a matter of incrementing ani_resv to 718 * reserve swap space, we need to check memory-based as well 719 * as disk-backed (physical) swap. The following algorithm 720 * is used: 721 * Check the space on physical swap 722 * i.e. amount needed < ani_max - ani_phys_resv 723 * If we are swapping on swapfs check 724 * amount needed < (availrmem - swapfs_minfree) 725 * Since the algorithm to check for the quantity of swap space is 726 * almost the same as that for reserving it, we'll just use anon_resvmem 727 * with a flag to decrement availrmem. 728 * 729 * Return non-zero on success. 
730 */ 731 int 732 anon_resvmem(size_t size, uint_t takemem) 733 { 734 pgcnt_t npages = btopr(size); 735 pgcnt_t mswap_pages = 0; 736 pgcnt_t pswap_pages = 0; 737 738 mutex_enter(&anoninfo_lock); 739 740 /* 741 * pswap_pages is the number of pages we can take from 742 * physical (i.e. disk-backed) swap. 743 */ 744 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 745 pswap_pages = k_anoninfo.ani_max - k_anoninfo.ani_phys_resv; 746 747 ANON_PRINT(A_RESV, 748 ("anon_resvmem: npages %lu takemem %u pswap %lu caller %p\n", 749 npages, takemem, pswap_pages, (void *)caller())); 750 751 if (npages <= pswap_pages) { 752 /* 753 * we have enough space on a physical swap 754 */ 755 if (takemem) 756 k_anoninfo.ani_phys_resv += npages; 757 mutex_exit(&anoninfo_lock); 758 return (1); 759 } else if (pswap_pages != 0) { 760 /* 761 * we have some space on a physical swap 762 */ 763 if (takemem) { 764 /* 765 * use up remainder of phys swap 766 */ 767 k_anoninfo.ani_phys_resv += pswap_pages; 768 ASSERT(k_anoninfo.ani_phys_resv == k_anoninfo.ani_max); 769 } 770 } 771 /* 772 * since (npages > pswap_pages) we need mem swap 773 * mswap_pages is the number of pages needed from availrmem 774 */ 775 ASSERT(npages > pswap_pages); 776 mswap_pages = npages - pswap_pages; 777 778 ANON_PRINT(A_RESV, ("anon_resvmem: need %ld pages from memory\n", 779 mswap_pages)); 780 781 /* 782 * priv processes can reserve memory as swap as long as availrmem 783 * remains greater than swapfs_minfree; in the case of non-priv 784 * processes, memory can be reserved as swap only if availrmem 785 * doesn't fall below (swapfs_minfree + swapfs_reserve). Thus, 786 * swapfs_reserve amount of memswap is not available to non-priv 787 * processes. This protects daemons such as automounter dying 788 * as a result of application processes eating away almost entire 789 * membased swap. This safeguard becomes useless if apps are run 790 * with root access. 791 * 792 * swapfs_reserve is minimum of 4Mb or 1/16 of physmem. 
793 * 794 */ 795 mutex_exit(&anoninfo_lock); 796 (void) page_reclaim_mem(mswap_pages, 797 swapfs_minfree + swapfs_reserve, 0); 798 mutex_enter(&anoninfo_lock); 799 800 mutex_enter(&freemem_lock); 801 if (availrmem > (swapfs_minfree + swapfs_reserve + mswap_pages) || 802 (availrmem > (swapfs_minfree + mswap_pages) && 803 secpolicy_resource(CRED()) == 0)) { 804 805 if (takemem) { 806 /* 807 * Take the memory from the rest of the system. 808 */ 809 availrmem -= mswap_pages; 810 mutex_exit(&freemem_lock); 811 k_anoninfo.ani_mem_resv += mswap_pages; 812 ANI_ADD(mswap_pages); 813 ANON_PRINT((A_RESV | A_MRESV), 814 ("anon_resvmem: took %ld pages of availrmem\n", 815 mswap_pages)); 816 } else { 817 mutex_exit(&freemem_lock); 818 } 819 820 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 821 mutex_exit(&anoninfo_lock); 822 return (1); 823 824 } else { 825 /* 826 * Fail if not enough memory 827 */ 828 829 if (takemem) { 830 k_anoninfo.ani_phys_resv -= pswap_pages; 831 } 832 833 mutex_exit(&freemem_lock); 834 mutex_exit(&anoninfo_lock); 835 ANON_PRINT(A_RESV, 836 ("anon_resvmem: not enough space from swapfs\n")); 837 return (0); 838 } 839 } 840 841 842 /* 843 * Give back an anon reservation. 844 */ 845 void 846 anon_unresv(size_t size) 847 { 848 pgcnt_t npages = btopr(size); 849 spgcnt_t mem_free_pages = 0; 850 pgcnt_t phys_free_slots; 851 #ifdef ANON_DEBUG 852 pgcnt_t mem_resv; 853 #endif 854 855 mutex_enter(&anoninfo_lock); 856 857 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 858 /* 859 * If some of this reservation belonged to swapfs 860 * give it back to availrmem. 861 * ani_mem_resv is the amount of availrmem swapfs has reserved. 
862 * but some of that memory could be locked by segspt so we can only 863 * return non locked ani_mem_resv back to availrmem 864 */ 865 if (k_anoninfo.ani_mem_resv > k_anoninfo.ani_locked_swap) { 866 ANON_PRINT((A_RESV | A_MRESV), 867 ("anon_unresv: growing availrmem by %ld pages\n", 868 MIN(k_anoninfo.ani_mem_resv, npages))); 869 870 mem_free_pages = MIN((spgcnt_t)(k_anoninfo.ani_mem_resv - 871 k_anoninfo.ani_locked_swap), npages); 872 mutex_enter(&freemem_lock); 873 availrmem += mem_free_pages; 874 mutex_exit(&freemem_lock); 875 k_anoninfo.ani_mem_resv -= mem_free_pages; 876 877 ANI_ADD(-mem_free_pages); 878 } 879 /* 880 * The remainder of the pages is returned to phys swap 881 */ 882 ASSERT(npages >= mem_free_pages); 883 phys_free_slots = npages - mem_free_pages; 884 885 if (phys_free_slots) { 886 k_anoninfo.ani_phys_resv -= phys_free_slots; 887 } 888 889 #ifdef ANON_DEBUG 890 mem_resv = k_anoninfo.ani_mem_resv; 891 #endif 892 893 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 894 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 895 896 mutex_exit(&anoninfo_lock); 897 898 ANON_PRINT(A_RESV, ("anon_unresv: %lu, tot %lu, caller %p\n", 899 npages, mem_resv, (void *)caller())); 900 } 901 902 /* 903 * Allocate an anon slot and return it with the lock held. 904 */ 905 struct anon * 906 anon_alloc(struct vnode *vp, anoff_t off) 907 { 908 struct anon *ap; 909 kmutex_t *ahm; 910 911 ap = kmem_cache_alloc(anon_cache, KM_SLEEP); 912 if (vp == NULL) { 913 swap_alloc(ap); 914 } else { 915 ap->an_vp = vp; 916 ap->an_off = off; 917 } 918 ap->an_refcnt = 1; 919 ap->an_pvp = NULL; 920 ap->an_poff = 0; 921 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 922 mutex_enter(ahm); 923 anon_addhash(ap); 924 mutex_exit(ahm); 925 ANI_ADD(-1); 926 ANON_PRINT(A_ANON, ("anon_alloc: returning ap %p, vp %p\n", 927 (void *)ap, (ap ? (void *)ap->an_vp : NULL))); 928 return (ap); 929 } 930 931 /* 932 * Decrement the reference count of an anon page. 
 * If reference count goes to zero, free it and
 * its associated page (if any).
 *
 * Panics if the reference count is already zero on entry.
 */
void
anon_decref(struct anon *ap)
{
	page_t *pp;
	struct vnode *vp;
	anoff_t off;
	kmutex_t *ahm;

	ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)];
	mutex_enter(ahm);
	ASSERT(ap->an_refcnt != 0);
	if (ap->an_refcnt == 0)
		panic("anon_decref: slot count 0");
	if (--ap->an_refcnt == 0) {
		swap_xlate(ap, &vp, &off);
		/* The hash lock is dropped across page_lookup(). */
		mutex_exit(ahm);

		/*
		 * If there is a page for this anon slot we will need to
		 * call VN_DISPOSE to get rid of the vp association and
		 * put the page back on the free list as really free.
		 * Acquire the "exclusive" lock to ensure that any
		 * pending i/o always completes before the swap slot
		 * is freed.
		 */
		pp = page_lookup(vp, (u_offset_t)off, SE_EXCL);

		/*
		 * If there was a page, we've synchronized on it (getting
		 * the exclusive lock is as good as getting the iolock)
		 * so now we can free the physical backing store. Also, this
		 * is where we would free the name of the anonymous page
		 * (swap_free(ap)), a no-op in the current implementation.
		 */
		mutex_enter(ahm);
		ASSERT(ap->an_refcnt == 0);
		anon_rmhash(ap);
		if (ap->an_pvp)
			swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE);
		mutex_exit(ahm);

		if (pp != NULL) {
			/*LINTED: constant in conditional context */
			VN_DISPOSE(pp, B_INVAL, 0, kcred);
		}
		ANON_PRINT(A_ANON, ("anon_decref: free ap %p, vp %p\n",
		    (void *)ap, (void *)ap->an_vp));
		kmem_cache_free(anon_cache, ap);

		ANI_ADD(1);
	} else {
		mutex_exit(ahm);
	}
}

/*
 * Return 1 if any of the nslots entries of ahp starting at anon_index
 * holds an anon slot whose reference count is greater than 1,
 * otherwise return 0.  Empty entries are ignored.
 */
static int
anon_share(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots)
{
	struct anon *ap;

	while (nslots-- > 0) {
		if ((ap = anon_get_ptr(ahp, anon_index)) != NULL &&
		    ap->an_refcnt > 1)
			return (1);
		anon_index++;
	}

	return (0);
}

/*
 * Drop one reference from every anon slot in the large page region of
 * ahp that starts at an_idx, clearing each array entry as it goes.
 * The region covers page_get_pagecnt(szc) slots, clipped to the end of
 * the array.
 *
 * The reference count of the first slot selects the mode for the whole
 * region:
 *	- count of 1 (or first slot empty): each slot is freed outright;
 *	  it is removed from the hash, its physical swap is released and
 *	  its page, if any, is disposed of.
 *	- count above 1: the counts are only decremented, with the
 *	  anonpages_hash_lock of the first slot held across the loop.
 */
static void
anon_decref_pages(
	struct anon_hdr *ahp,
	ulong_t an_idx,
	uint_t szc)
{
	struct anon *ap = anon_get_ptr(ahp, an_idx);
	kmutex_t *ahmpages = NULL;
	page_t *pp;
	pgcnt_t pgcnt = page_get_pagecnt(szc);
	pgcnt_t i;
	struct vnode *vp;
	anoff_t off;
	kmutex_t *ahm;
#ifdef DEBUG
	int refcnt = 1;
#endif

	ASSERT(szc != 0);
	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
	ASSERT(IS_P2ALIGNED(an_idx, pgcnt));
	ASSERT(an_idx < ahp->size);

	if (ahp->size - an_idx < pgcnt) {
		/*
		 * In case of shared mappings total anon map size may not be
		 * the largest page size aligned.
		 */
		pgcnt = ahp->size - an_idx;
	}

	VM_STAT_ADD(anonvmstats.decrefpages[0]);

	if (ap != NULL) {
		ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp, ap->an_off)];
		mutex_enter(ahmpages);
		/* refcnt only exists (and is only assigned) under DEBUG. */
		ASSERT((refcnt = ap->an_refcnt) != 0);
		VM_STAT_ADD(anonvmstats.decrefpages[1]);
		if (ap->an_refcnt == 1) {
			/*
			 * Last reference: the region lock is not kept;
			 * ahmpages == NULL marks the "free" mode below.
			 */
			VM_STAT_ADD(anonvmstats.decrefpages[2]);
			ASSERT(!anon_share(ahp, an_idx, pgcnt));
			mutex_exit(ahmpages);
			ahmpages = NULL;
		}
	}

	i = 0;
	while (i < pgcnt) {
		if ((ap = anon_get_ptr(ahp, an_idx + i)) == NULL) {
			ASSERT(refcnt == 1 && ahmpages == NULL);
			i++;
			continue;
		}
		ASSERT(ap->an_refcnt == refcnt);
		ASSERT(ahmpages != NULL || ap->an_refcnt == 1);
		ASSERT(ahmpages == NULL || ap->an_refcnt > 1);

		if (ahmpages == NULL) {
			swap_xlate(ap, &vp, &off);
			pp = page_lookup(vp, (u_offset_t)off, SE_EXCL);
			if (pp == NULL || pp->p_szc == 0) {
				/*
				 * No page, or a page with p_szc of 0:
				 * free this one slot.
				 */
				VM_STAT_ADD(anonvmstats.decrefpages[3]);
				ahm = &anonhash_lock[AH_LOCK(ap->an_vp,
				    ap->an_off)];
				(void) anon_set_ptr(ahp, an_idx + i, NULL,
				    ANON_SLEEP);
				mutex_enter(ahm);
				ap->an_refcnt--;
				ASSERT(ap->an_refcnt == 0);
				anon_rmhash(ap);
				if (ap->an_pvp)
					swap_phys_free(ap->an_pvp, ap->an_poff,
					    PAGESIZE);
				mutex_exit(ahm);
				if (pp != NULL) {
					VM_STAT_ADD(anonvmstats.decrefpages[4]);
					/*LINTED*/
					VN_DISPOSE(pp, B_INVAL, 0, kcred);
				}
				kmem_cache_free(anon_cache, ap);
				ANI_ADD(1);
				i++;
			} else {
				/*
				 * Page with a non-zero p_szc: gather all
				 * page_get_pagecnt(p_szc) constituent pages
				 * in ppa[] and free them as a unit.
				 */
				pgcnt_t j;
				pgcnt_t curpgcnt =
				    page_get_pagecnt(pp->p_szc);
				size_t ppasize = curpgcnt * sizeof (page_t *);
				page_t **ppa = kmem_alloc(ppasize, KM_SLEEP);
				int dispose = 0;

				VM_STAT_ADD(anonvmstats.decrefpages[5]);

				ASSERT(pp->p_szc <= szc);
				ASSERT(IS_P2ALIGNED(curpgcnt, curpgcnt));
				ASSERT(IS_P2ALIGNED(i, curpgcnt));
				ASSERT(i + curpgcnt <= pgcnt);
				ASSERT(!(page_pptonum(pp) & (curpgcnt - 1)));
				ppa[0] = pp;
				/*
				 * NOTE(review): this loop starts at i + 1, so
				 * the first page (ppa[0]) is not passed to
				 * hat_pageunload() here and the first slot's
				 * an_pvp is not consulted for the dispose
				 * decision -- confirm that this is intended.
				 */
				for (j = i + 1; j < i + curpgcnt; j++) {
					ap = anon_get_ptr(ahp, an_idx + j);
					ASSERT(ap != NULL &&
					    ap->an_refcnt == 1);
					swap_xlate(ap, &vp, &off);
					pp = page_lookup(vp, (u_offset_t)off,
					    SE_EXCL);
					if (pp == NULL)
						panic("anon_decref_pages: "
						    "no page");

					(void) hat_pageunload(pp,
					    HAT_FORCE_PGUNLOAD);
					ASSERT(pp->p_szc == ppa[0]->p_szc);
					ASSERT(page_pptonum(pp) - 1 ==
					    page_pptonum(ppa[j - i - 1]));
					ppa[j - i] = pp;
					/*
					 * A backing vnode whose dispose op is
					 * not fs_dispose forces the per-page
					 * VN_DISPOSE path below.
					 */
					if (ap->an_pvp != NULL &&
					    !vn_matchopval(ap->an_pvp,
						VOPNAME_DISPOSE,
						(fs_generic_func_p)fs_dispose))
						dispose = 1;
				}
				if (!dispose) {
					VM_STAT_ADD(anonvmstats.decrefpages[6]);
					page_destroy_pages(ppa[0]);
				} else {
					VM_STAT_ADD(anonvmstats.decrefpages[7]);
					/* Reset p_szc, then dispose singly. */
					for (j = 0; j < curpgcnt; j++) {
						ASSERT(PAGE_EXCL(ppa[j]));
						ppa[j]->p_szc = 0;
					}
					for (j = 0; j < curpgcnt; j++) {
						ASSERT(!hat_page_is_mapped(
						    ppa[j]));
						/*LINTED*/
						VN_DISPOSE(ppa[j], B_INVAL, 0,
						    kcred);
					}
				}
				kmem_free(ppa, ppasize);
				/* Now free the anon slots themselves. */
				for (j = i; j < i + curpgcnt; j++) {
					ap = anon_get_ptr(ahp, an_idx + j);
					ASSERT(ap != NULL &&
					    ap->an_refcnt == 1);
					ahm = &anonhash_lock[AH_LOCK(ap->an_vp,
					    ap->an_off)];
					(void) anon_set_ptr(ahp, an_idx + j,
					    NULL, ANON_SLEEP);
					mutex_enter(ahm);
					ap->an_refcnt--;
					ASSERT(ap->an_refcnt == 0);
					anon_rmhash(ap);
					if (ap->an_pvp)
						swap_phys_free(ap->an_pvp,
						    ap->an_poff, PAGESIZE);
					mutex_exit(ahm);
					kmem_cache_free(anon_cache, ap);
					ANI_ADD(1);
				}
				i += curpgcnt;
			}
		} else {
			/*
			 * Other references remain: clear the array entry
			 * and drop this reference only.
			 */
			VM_STAT_ADD(anonvmstats.decrefpages[8]);
			(void) anon_set_ptr(ahp, an_idx + i, NULL, ANON_SLEEP);
			ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)];
			mutex_enter(ahm);
			ap->an_refcnt--;
			mutex_exit(ahm);
			i++;
		}
	}

	if (ahmpages != NULL) {
		mutex_exit(ahmpages);
	}
}

/*
 * Duplicate references to size bytes worth of anon pages.
 * Used when duplicating a segment that contains private anon pages.
 * This code assumes that procedure calling this one has already used
 * hat_chgprot() to disable write access to the range of addresses
 * that *old actually refers to.
 *
 * Every valid slot found in old from old_idx onwards, within the
 * btopr(size) page range, is stored at the corresponding offset from
 * new_idx in new and has its reference count incremented.
 */
void
anon_dup(struct anon_hdr *old, ulong_t old_idx, struct anon_hdr *new,
			ulong_t new_idx, size_t size)
{
	spgcnt_t npages;
	kmutex_t *ahm;
	struct anon *ap;
	ulong_t off;
	ulong_t index;

	npages = btopr(size);
	while (npages > 0) {
		index = old_idx;
		if ((ap = anon_get_next_ptr(old, &index)) == NULL)
			break;

		ASSERT(!ANON_ISBUSY(anon_get_slot(old, index)));
		/* off is the number of empty slots that were skipped. */
		off = index - old_idx;
		npages -= off;
		if (npages <= 0)
			break;

		(void) anon_set_ptr(new, new_idx + off, ap, ANON_SLEEP);
		ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)];

		mutex_enter(ahm);
		ap->an_refcnt++;
		mutex_exit(ahm);

		/* Step past the slot that was just duplicated. */
		off++;
		new_idx += off;
		old_idx += off;
		npages--;
	}
}

/*
 * Just like anon_dup but also guarantees there are no holes (unallocated anon
 * slots) within any large page region. That means if a large page region is
 * empty in the old array it will skip it. If there are 1 or more valid slots
 * in the large page region of the old array it will make sure to fill in any
 * unallocated ones and also copy them to the new array. If noalloc is 1 large
 * page region should either have no valid anon slots or all slots should be
 * valid.
1233 */ 1234 void 1235 anon_dup_fill_holes( 1236 struct anon_hdr *old, 1237 ulong_t old_idx, 1238 struct anon_hdr *new, 1239 ulong_t new_idx, 1240 size_t size, 1241 uint_t szc, 1242 int noalloc) 1243 { 1244 struct anon *ap; 1245 spgcnt_t npages; 1246 kmutex_t *ahm, *ahmpages = NULL; 1247 pgcnt_t pgcnt, i; 1248 ulong_t index, off; 1249 #ifdef DEBUG 1250 int refcnt; 1251 #endif 1252 1253 ASSERT(szc != 0); 1254 pgcnt = page_get_pagecnt(szc); 1255 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1256 npages = btopr(size); 1257 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1258 ASSERT(IS_P2ALIGNED(old_idx, pgcnt)); 1259 1260 VM_STAT_ADD(anonvmstats.dupfillholes[0]); 1261 1262 while (npages > 0) { 1263 index = old_idx; 1264 1265 /* 1266 * Find the next valid slot. 1267 */ 1268 if (anon_get_next_ptr(old, &index) == NULL) 1269 break; 1270 1271 ASSERT(!ANON_ISBUSY(anon_get_slot(old, index))); 1272 /* 1273 * Now backup index to the beginning of the 1274 * current large page region of the old array. 1275 */ 1276 index = P2ALIGN(index, pgcnt); 1277 off = index - old_idx; 1278 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1279 npages -= off; 1280 if (npages <= 0) 1281 break; 1282 1283 /* 1284 * Fill and copy a large page regions worth 1285 * of anon slots. 1286 */ 1287 for (i = 0; i < pgcnt; i++) { 1288 if ((ap = anon_get_ptr(old, index + i)) == NULL) { 1289 if (noalloc) { 1290 panic("anon_dup_fill_holes: " 1291 "empty anon slot\n"); 1292 } 1293 VM_STAT_ADD(anonvmstats.dupfillholes[1]); 1294 ap = anon_alloc(NULL, 0); 1295 (void) anon_set_ptr(old, index + i, ap, 1296 ANON_SLEEP); 1297 } else if (i == 0) { 1298 /* 1299 * make the increment of all refcnts of all 1300 * anon slots of a large page appear atomic by 1301 * getting an anonpages_hash_lock for the 1302 * first anon slot of a large page. 
1303 */ 1304 int hash = AH_LOCK(ap->an_vp, ap->an_off); 1305 1306 VM_STAT_ADD(anonvmstats.dupfillholes[2]); 1307 1308 ahmpages = &anonpages_hash_lock[hash]; 1309 mutex_enter(ahmpages); 1310 /*LINTED*/ 1311 ASSERT(refcnt = ap->an_refcnt); 1312 1313 VM_STAT_COND_ADD(ap->an_refcnt > 1, 1314 anonvmstats.dupfillholes[3]); 1315 } 1316 (void) anon_set_ptr(new, new_idx + off + i, ap, 1317 ANON_SLEEP); 1318 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1319 mutex_enter(ahm); 1320 ASSERT(ahmpages != NULL || ap->an_refcnt == 1); 1321 ASSERT(i == 0 || ahmpages == NULL || 1322 refcnt == ap->an_refcnt); 1323 ap->an_refcnt++; 1324 mutex_exit(ahm); 1325 } 1326 if (ahmpages != NULL) { 1327 mutex_exit(ahmpages); 1328 ahmpages = NULL; 1329 } 1330 off += pgcnt; 1331 new_idx += off; 1332 old_idx += off; 1333 npages -= pgcnt; 1334 } 1335 } 1336 1337 /* 1338 * Used when a segment with a vnode changes szc. similarly to 1339 * anon_dup_fill_holes() makes sure each large page region either has no anon 1340 * slots or all of them. but new slots are created by COWing the file 1341 * pages. on entrance no anon slots should be shared. 1342 */ 1343 int 1344 anon_fill_cow_holes( 1345 struct seg *seg, 1346 caddr_t addr, 1347 struct anon_hdr *ahp, 1348 ulong_t an_idx, 1349 struct vnode *vp, 1350 u_offset_t vp_off, 1351 size_t size, 1352 uint_t szc, 1353 uint_t prot, 1354 struct vpage vpage[], 1355 struct cred *cred) 1356 { 1357 struct anon *ap; 1358 spgcnt_t npages; 1359 pgcnt_t pgcnt, i; 1360 ulong_t index, off; 1361 int err = 0; 1362 int pageflags = 0; 1363 1364 ASSERT(szc != 0); 1365 pgcnt = page_get_pagecnt(szc); 1366 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1367 npages = btopr(size); 1368 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1369 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1370 1371 while (npages > 0) { 1372 index = an_idx; 1373 1374 /* 1375 * Find the next valid slot. 
1376 */ 1377 if (anon_get_next_ptr(ahp, &index) == NULL) { 1378 break; 1379 } 1380 1381 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1382 /* 1383 * Now backup index to the beginning of the 1384 * current large page region of the anon array. 1385 */ 1386 index = P2ALIGN(index, pgcnt); 1387 off = index - an_idx; 1388 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1389 npages -= off; 1390 if (npages <= 0) 1391 break; 1392 an_idx += off; 1393 vp_off += ptob(off); 1394 addr += ptob(off); 1395 if (vpage != NULL) { 1396 vpage += off; 1397 } 1398 1399 for (i = 0; i < pgcnt; i++, an_idx++, vp_off += PAGESIZE) { 1400 if ((ap = anon_get_ptr(ahp, an_idx)) == NULL) { 1401 page_t *pl[1 + 1]; 1402 page_t *pp; 1403 1404 err = VOP_GETPAGE(vp, vp_off, PAGESIZE, NULL, 1405 pl, PAGESIZE, seg, addr, S_READ, cred); 1406 if (err) { 1407 break; 1408 } 1409 if (vpage != NULL) { 1410 prot = VPP_PROT(vpage); 1411 pageflags = VPP_ISPPLOCK(vpage) ? 1412 LOCK_PAGE : 0; 1413 } 1414 pp = anon_private(&ap, seg, addr, prot, pl[0], 1415 pageflags, cred); 1416 if (pp == NULL) { 1417 err = ENOMEM; 1418 break; 1419 } 1420 (void) anon_set_ptr(ahp, an_idx, ap, 1421 ANON_SLEEP); 1422 page_unlock(pp); 1423 } 1424 ASSERT(ap->an_refcnt == 1); 1425 addr += PAGESIZE; 1426 if (vpage != NULL) { 1427 vpage++; 1428 } 1429 } 1430 npages -= pgcnt; 1431 } 1432 1433 return (err); 1434 } 1435 1436 /* 1437 * Free a group of "size" anon pages, size in bytes, 1438 * and clear out the pointers to the anon entries. 
1439 */ 1440 void 1441 anon_free(struct anon_hdr *ahp, ulong_t index, size_t size) 1442 { 1443 spgcnt_t npages; 1444 struct anon *ap; 1445 ulong_t old; 1446 1447 npages = btopr(size); 1448 1449 while (npages > 0) { 1450 old = index; 1451 if ((ap = anon_get_next_ptr(ahp, &index)) == NULL) 1452 break; 1453 1454 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1455 npages -= index - old; 1456 if (npages <= 0) 1457 break; 1458 1459 (void) anon_set_ptr(ahp, index, NULL, ANON_SLEEP); 1460 anon_decref(ap); 1461 /* 1462 * Bump index and decrement page count 1463 */ 1464 index++; 1465 npages--; 1466 } 1467 } 1468 1469 void 1470 anon_free_pages( 1471 struct anon_hdr *ahp, 1472 ulong_t an_idx, 1473 size_t size, 1474 uint_t szc) 1475 { 1476 spgcnt_t npages; 1477 pgcnt_t pgcnt; 1478 ulong_t index, off; 1479 1480 ASSERT(szc != 0); 1481 pgcnt = page_get_pagecnt(szc); 1482 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 1483 npages = btopr(size); 1484 ASSERT(IS_P2ALIGNED(npages, pgcnt)); 1485 ASSERT(IS_P2ALIGNED(an_idx, pgcnt)); 1486 ASSERT(an_idx < ahp->size); 1487 1488 VM_STAT_ADD(anonvmstats.freepages[0]); 1489 1490 while (npages > 0) { 1491 index = an_idx; 1492 1493 /* 1494 * Find the next valid slot. 1495 */ 1496 if (anon_get_next_ptr(ahp, &index) == NULL) 1497 break; 1498 1499 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index))); 1500 /* 1501 * Now backup index to the beginning of the 1502 * current large page region of the old array. 
1503 */ 1504 index = P2ALIGN(index, pgcnt); 1505 off = index - an_idx; 1506 ASSERT(IS_P2ALIGNED(off, pgcnt)); 1507 npages -= off; 1508 if (npages <= 0) 1509 break; 1510 1511 anon_decref_pages(ahp, index, szc); 1512 1513 off += pgcnt; 1514 an_idx += off; 1515 npages -= pgcnt; 1516 } 1517 } 1518 1519 /* 1520 * Make anonymous pages discardable 1521 */ 1522 void 1523 anon_disclaim(struct anon_map *amp, ulong_t index, size_t size, int flags) 1524 { 1525 spgcnt_t npages = btopr(size); 1526 struct anon *ap; 1527 struct vnode *vp; 1528 anoff_t off; 1529 page_t *pp, *root_pp; 1530 kmutex_t *ahm; 1531 pgcnt_t pgcnt; 1532 ulong_t old_idx, idx, i; 1533 struct anon_hdr *ahp = amp->ahp; 1534 anon_sync_obj_t cookie; 1535 1536 ASSERT(RW_READ_HELD(&->a_rwlock)); 1537 pgcnt = 1; 1538 for (; npages > 0; index = (pgcnt == 1) ? index + 1: 1539 P2ROUNDUP(index + 1, pgcnt), npages -= pgcnt) { 1540 1541 /* 1542 * get anon pointer and index for the first valid entry 1543 * in the anon list, starting from "index" 1544 */ 1545 old_idx = index; 1546 if ((ap = anon_get_next_ptr(ahp, &index)) == NULL) 1547 break; 1548 1549 /* 1550 * decrement npages by number of NULL anon slots we skipped 1551 */ 1552 npages -= index - old_idx; 1553 if (npages <= 0) 1554 break; 1555 1556 anon_array_enter(amp, index, &cookie); 1557 ap = anon_get_ptr(ahp, index); 1558 ASSERT(ap != NULL); 1559 1560 /* 1561 * Get anonymous page and try to lock it SE_EXCL; 1562 * For non blocking case if we couldn't grab the lock 1563 * we skip to next page. 1564 * For blocking case (ANON_PGLOOKUP_BLK) block 1565 * until we grab SE_EXCL lock. 
1566 */ 1567 swap_xlate(ap, &vp, &off); 1568 if (flags & ANON_PGLOOKUP_BLK) 1569 pp = page_lookup_create(vp, (u_offset_t)off, 1570 SE_EXCL, NULL, NULL, SE_EXCL_WANTED); 1571 else 1572 pp = page_lookup_nowait(vp, (u_offset_t)off, SE_EXCL); 1573 if (pp == NULL) { 1574 segadvstat.MADV_FREE_miss.value.ul++; 1575 pgcnt = 1; 1576 anon_array_exit(&cookie); 1577 continue; 1578 } 1579 pgcnt = page_get_pagecnt(pp->p_szc); 1580 1581 /* 1582 * we cannot free a page which is permanently locked. 1583 * The page_struct_lock need not be acquired to examine 1584 * these fields since the page has an "exclusive" lock. 1585 */ 1586 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1587 page_unlock(pp); 1588 segadvstat.MADV_FREE_miss.value.ul++; 1589 anon_array_exit(&cookie); 1590 continue; 1591 } 1592 1593 ahm = &anonhash_lock[AH_LOCK(vp, off)]; 1594 mutex_enter(ahm); 1595 ASSERT(ap->an_refcnt != 0); 1596 /* 1597 * skip this one if copy-on-write is not yet broken. 1598 */ 1599 if (ap->an_refcnt > 1) { 1600 mutex_exit(ahm); 1601 page_unlock(pp); 1602 segadvstat.MADV_FREE_miss.value.ul++; 1603 anon_array_exit(&cookie); 1604 continue; 1605 } 1606 1607 if (pp->p_szc == 0) { 1608 pgcnt = 1; 1609 1610 /* 1611 * free swap slot; 1612 */ 1613 if (ap->an_pvp) { 1614 swap_phys_free(ap->an_pvp, ap->an_poff, 1615 PAGESIZE); 1616 ap->an_pvp = NULL; 1617 ap->an_poff = 0; 1618 } 1619 mutex_exit(ahm); 1620 segadvstat.MADV_FREE_hit.value.ul++; 1621 1622 /* 1623 * while we are at it, unload all the translations 1624 * and attempt to free the page. 
1625 */ 1626 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1627 /*LINTED: constant in conditional context */ 1628 VN_DISPOSE(pp, B_FREE, 0, kcred); 1629 anon_array_exit(&cookie); 1630 continue; 1631 } 1632 1633 pgcnt = page_get_pagecnt(pp->p_szc); 1634 if (!IS_P2ALIGNED(index, pgcnt) || npages < pgcnt) { 1635 if (!page_try_demote_pages(pp)) { 1636 mutex_exit(ahm); 1637 page_unlock(pp); 1638 segadvstat.MADV_FREE_miss.value.ul++; 1639 anon_array_exit(&cookie); 1640 continue; 1641 } else { 1642 pgcnt = 1; 1643 if (ap->an_pvp) { 1644 swap_phys_free(ap->an_pvp, 1645 ap->an_poff, PAGESIZE); 1646 ap->an_pvp = NULL; 1647 ap->an_poff = 0; 1648 } 1649 mutex_exit(ahm); 1650 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD); 1651 /*LINTED*/ 1652 VN_DISPOSE(pp, B_FREE, 0, kcred); 1653 segadvstat.MADV_FREE_hit.value.ul++; 1654 anon_array_exit(&cookie); 1655 continue; 1656 } 1657 } 1658 mutex_exit(ahm); 1659 root_pp = pp; 1660 1661 /* 1662 * try to lock remaining pages 1663 */ 1664 for (idx = 1; idx < pgcnt; idx++) { 1665 pp++; 1666 if (!page_trylock(pp, SE_EXCL)) 1667 break; 1668 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) { 1669 page_unlock(pp); 1670 break; 1671 } 1672 } 1673 1674 if (idx == pgcnt) { 1675 for (i = 0; i < pgcnt; i++) { 1676 ap = anon_get_ptr(ahp, index + i); 1677 if (ap == NULL) 1678 break; 1679 swap_xlate(ap, &vp, &off); 1680 ahm = &anonhash_lock[AH_LOCK(vp, off)]; 1681 mutex_enter(ahm); 1682 ASSERT(ap->an_refcnt != 0); 1683 1684 /* 1685 * skip this one if copy-on-write 1686 * is not yet broken. 
1687 */ 1688 if (ap->an_refcnt > 1) { 1689 mutex_exit(ahm); 1690 goto skiplp; 1691 } 1692 if (ap->an_pvp) { 1693 swap_phys_free(ap->an_pvp, 1694 ap->an_poff, PAGESIZE); 1695 ap->an_pvp = NULL; 1696 ap->an_poff = 0; 1697 } 1698 mutex_exit(ahm); 1699 } 1700 page_destroy_pages(root_pp); 1701 segadvstat.MADV_FREE_hit.value.ul += pgcnt; 1702 anon_array_exit(&cookie); 1703 continue; 1704 } 1705 skiplp: 1706 segadvstat.MADV_FREE_miss.value.ul += pgcnt; 1707 for (i = 0, pp = root_pp; i < idx; pp++, i++) 1708 page_unlock(pp); 1709 anon_array_exit(&cookie); 1710 } 1711 } 1712 1713 /* 1714 * Return the kept page(s) and protections back to the segment driver. 1715 */ 1716 int 1717 anon_getpage( 1718 struct anon **app, 1719 uint_t *protp, 1720 page_t *pl[], 1721 size_t plsz, 1722 struct seg *seg, 1723 caddr_t addr, 1724 enum seg_rw rw, 1725 struct cred *cred) 1726 { 1727 page_t *pp; 1728 struct anon *ap = *app; 1729 struct vnode *vp; 1730 anoff_t off; 1731 int err; 1732 kmutex_t *ahm; 1733 1734 swap_xlate(ap, &vp, &off); 1735 1736 /* 1737 * Lookup the page. If page is being paged in, 1738 * wait for it to finish as we must return a list of 1739 * pages since this routine acts like the VOP_GETPAGE 1740 * routine does. 1741 */ 1742 if (pl != NULL && (pp = page_lookup(vp, (u_offset_t)off, SE_SHARED))) { 1743 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1744 mutex_enter(ahm); 1745 if (ap->an_refcnt == 1) 1746 *protp = PROT_ALL; 1747 else 1748 *protp = PROT_ALL & ~PROT_WRITE; 1749 mutex_exit(ahm); 1750 pl[0] = pp; 1751 pl[1] = NULL; 1752 return (0); 1753 } 1754 1755 /* 1756 * Simply treat it as a vnode fault on the anon vp. 
1757 */ 1758 1759 TRACE_3(TR_FAC_VM, TR_ANON_GETPAGE, 1760 "anon_getpage:seg %x addr %x vp %x", 1761 seg, addr, vp); 1762 1763 err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, protp, pl, plsz, 1764 seg, addr, rw, cred); 1765 1766 if (err == 0 && pl != NULL) { 1767 ahm = &anonhash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 1768 mutex_enter(ahm); 1769 if (ap->an_refcnt != 1) 1770 *protp &= ~PROT_WRITE; /* make read-only */ 1771 mutex_exit(ahm); 1772 } 1773 return (err); 1774 } 1775 1776 /* 1777 * Creates or returns kept pages to the segment driver. returns -1 if a large 1778 * page cannot be allocated. returns -2 if some other process has allocated a 1779 * larger page. 1780 * 1781 * For cowfault it will alocate any size pages to fill the requested area to 1782 * avoid partially overwritting anon slots (i.e. sharing only some of the anon 1783 * slots within a large page with other processes). This policy greatly 1784 * simplifies large page freeing (which is only freed when all anon slot 1785 * refcnts are 0). 
/*
 * Creates or returns kept pages to the segment driver.  The pages for the
 * pgcnt slots starting at start_idx are returned in ppa[], and *protp is
 * set to the protections the caller may use (PROT_WRITE is removed when
 * any slot is shared).
 *
 * Return values:
 *	 0	success
 *	-1	a large page cannot be allocated
 *	-2	some other process has allocated a larger page; *ppa_szc is
 *		set to the size code the caller should retry with
 *	>0	error returned by anon_getpage()/swap_getconpage(), or ENOMEM
 *		when anon_zero() fails
 *
 * For cowfault it will allocate any size pages to fill the requested area to
 * avoid partially overwriting anon slots (i.e. sharing only some of the anon
 * slots within a large page with other processes).  This policy greatly
 * simplifies large page freeing (which is only freed when all anon slot
 * refcnts are 0).
 */
int
anon_map_getpages(
	struct anon_map *amp,
	ulong_t	start_idx,
	uint_t	szc,
	struct seg *seg,
	caddr_t	addr,
	uint_t prot,
	uint_t *protp,
	page_t	*ppa[],
	uint_t	*ppa_szc,
	struct vpage vpage[],
	enum seg_rw rw,
	int brkcow,
	int anypgsz,
	struct cred *cred)
{
	pgcnt_t		pgcnt;
	struct anon	*ap;
	struct vnode	*vp;
	anoff_t		off;
	page_t		*pp, *pl[2], *conpp = NULL;
	caddr_t		vaddr;
	ulong_t		pg_idx, an_idx, i;
	spgcnt_t	nreloc = 0;
	int		prealloc = 1;
	int		err, slotcreate;
	uint_t		vpprot;
	/* nonzero when the request is smaller than the segment's page size */
	int		upsize = (szc < seg->s_szc);

#if !defined(__i386) && !defined(__amd64)
	ASSERT(seg->s_szc != 0);
#endif
	ASSERT(szc <= seg->s_szc);
	ASSERT(ppa_szc != NULL);
	ASSERT(rw != S_CREATE);

	*protp = PROT_ALL;

	VM_STAT_ADD(anonvmstats.getpages[0]);

	/*
	 * Single PAGESIZE page request: either fetch the existing page
	 * or create a zero-filled one.
	 */
	if (szc == 0) {
		VM_STAT_ADD(anonvmstats.getpages[1]);
		if ((ap = anon_get_ptr(amp->ahp, start_idx)) != NULL) {
			err = anon_getpage(&ap, protp, pl, PAGESIZE, seg,
			    addr, rw, cred);
			if (err)
				return (err);
			ppa[0] = pl[0];
			if (brkcow == 0 || (*protp & PROT_WRITE)) {
				VM_STAT_ADD(anonvmstats.getpages[2]);
				/*
				 * The page found is part of a large page and
				 * the segment could use a larger size: tell
				 * the caller to retry with that size.
				 */
				if (ppa[0]->p_szc != 0 && upsize) {
					VM_STAT_ADD(anonvmstats.getpages[3]);
					*ppa_szc = MIN(ppa[0]->p_szc,
					    seg->s_szc);
					page_unlock(ppa[0]);
					return (-2);
				}
				return (0);
			}
			panic("anon_map_getpages: cowfault for szc 0");
		} else {
			VM_STAT_ADD(anonvmstats.getpages[4]);
			ppa[0] = anon_zero(seg, addr, &ap, cred);
			if (ppa[0] == NULL)
				return (ENOMEM);
			(void) anon_set_ptr(amp->ahp, start_idx, ap,
			    ANON_SLEEP);
			return (0);
		}
	}

	pgcnt = page_get_pagecnt(szc);
	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
	ASSERT(IS_P2ALIGNED(start_idx, pgcnt));

	/*
	 * First we check for the case that the requested large
	 * page or larger page already exists in the system.
	 * Actually we only check if the first constituent page
	 * exists and only preallocate if it's not found.
	 */
	ap = anon_get_ptr(amp->ahp, start_idx);
	if (ap) {
		uint_t pszc;
		swap_xlate(ap, &vp, &off);
		if (page_exists_forreal(vp, (u_offset_t)off, &pszc)) {
			if (pszc > szc && upsize) {
				*ppa_szc = MIN(pszc, seg->s_szc);
				return (-2);
			}
			if (pszc >= szc) {
				prealloc = 0;
			}
		}
	}

	VM_STAT_COND_ADD(prealloc == 0, anonvmstats.getpages[5]);
	VM_STAT_COND_ADD(prealloc != 0, anonvmstats.getpages[6]);

top:
	/*
	 * If a smaller page or no page at all was found,
	 * grab a large page off the freelist.
	 */
	if (prealloc) {
		ASSERT(conpp == NULL);
		if (page_alloc_pages(anon_vp, seg, addr, NULL, ppa,
		    szc, 0) != 0) {
			VM_STAT_ADD(anonvmstats.getpages[7]);
			if (brkcow == 0 ||
			    !anon_share(amp->ahp, start_idx, pgcnt)) {
				/*
				 * If the refcnt's of all anon slots are <= 1
				 * they can't increase since we are holding
				 * the address space's lock. So segvn can
				 * safely decrease szc without risking to
				 * generate a cow fault for the region smaller
				 * than the segment's largest page size.
				 */
				VM_STAT_ADD(anonvmstats.getpages[8]);
				return (-1);
			}
		docow:
			/*
			 * This is a cow fault. Copy away the entire 1 large
			 * page region of this segment.
			 */
			if (szc != seg->s_szc)
				panic("anon_map_getpages: cowfault for szc %d",
				    szc);
			vaddr = addr;
			for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt;
			    pg_idx++, an_idx++, vaddr += PAGESIZE) {
				if ((ap = anon_get_ptr(amp->ahp, an_idx)) !=
				    NULL) {
					err = anon_getpage(&ap, &vpprot, pl,
					    PAGESIZE, seg, vaddr, rw, cred);
					if (err) {
						/* unlock pages fetched so far */
						for (i = 0; i < pg_idx; i++) {
							if ((pp = ppa[i]) !=
							    NULL)
								page_unlock(pp);
						}
						return (err);
					}
					ppa[pg_idx] = pl[0];
				} else {
					/*
					 * Since this is a cowfault we know
					 * that this address space has a
					 * parent or children which means
					 * anon_dup_fill_holes() has initialized
					 * all anon slots within a large page
					 * region that had at least one anon
					 * slot at the time of fork().
					 */
					panic("anon_map_getpages: "
					    "cowfault but anon slot is empty");
				}
			}
			VM_STAT_ADD(anonvmstats.getpages[9]);
			*protp = PROT_ALL;
			return (anon_map_privatepages(amp, start_idx, szc, seg,
			    addr, prot, ppa, vpage, anypgsz, cred));
		}
	}

	VM_STAT_ADD(anonvmstats.getpages[10]);

	/*
	 * Walk the large page region one constituent page at a time,
	 * creating anon slots where none exist yet.
	 */
	an_idx = start_idx;
	pg_idx = 0;
	vaddr = addr;
	while (pg_idx < pgcnt) {
		slotcreate = 0;
		if ((ap = anon_get_ptr(amp->ahp, an_idx)) == NULL) {
			VM_STAT_ADD(anonvmstats.getpages[11]);
			/*
			 * For us to have decided not to preallocate
			 * would have meant that a large page
			 * was found. Which also means that all of the
			 * anon slots for that page would have been
			 * already created for us.
			 */
			if (prealloc == 0)
				panic("anon_map_getpages: prealloc = 0");

			slotcreate = 1;
			ap = anon_alloc(NULL, 0);
		}
		swap_xlate(ap, &vp, &off);

		/*
		 * Now setup our preallocated page to pass down
		 * to swap_getpage().
		 */
		if (prealloc) {
			ASSERT(ppa[pg_idx]->p_szc == szc);
			conpp = ppa[pg_idx];
		}
		ASSERT(prealloc || conpp == NULL);

		/*
		 * If we just created this anon slot then call
		 * with S_CREATE to prevent doing IO on the page.
		 * Similar to the anon_zero case.
		 */
		err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE,
		    NULL, pl, PAGESIZE, conpp, ppa_szc, &nreloc, seg, vaddr,
		    slotcreate == 1 ? S_CREATE : rw, cred);

		if (err) {
			ASSERT(err != -2 || upsize);
			VM_STAT_ADD(anonvmstats.getpages[12]);
			ASSERT(slotcreate == 0);
			goto io_err;
		}

		pp = pl[0];

		/*
		 * The page that came back has a different size than was
		 * expected when deciding not to preallocate: either report
		 * the larger size to the caller or retry with a
		 * preallocated large page.
		 */
		if (pp->p_szc < szc || (pp->p_szc > szc && upsize)) {
			VM_STAT_ADD(anonvmstats.getpages[13]);
			ASSERT(slotcreate == 0);
			ASSERT(prealloc == 0);
			ASSERT(pg_idx == 0);
			if (pp->p_szc > szc) {
				ASSERT(upsize);
				*ppa_szc = MIN(pp->p_szc, seg->s_szc);
				page_unlock(pp);
				VM_STAT_ADD(anonvmstats.getpages[14]);
				return (-2);
			}
			page_unlock(pp);
			prealloc = 1;
			goto top;
		}

		/*
		 * If we decided to preallocate but VOP_GETPAGE
		 * found a page in the system that satisfies our
		 * request then free up our preallocated large page
		 * and continue looping across the existing large
		 * page via VOP_GETPAGE.
		 */
		if (prealloc && pp != ppa[pg_idx]) {
			VM_STAT_ADD(anonvmstats.getpages[15]);
			ASSERT(slotcreate == 0);
			ASSERT(pg_idx == 0);
			conpp = NULL;
			prealloc = 0;
			page_free_pages(ppa[0]);
		}

		if (prealloc && nreloc > 1) {
			/*
			 * we have relocated out of a smaller large page.
			 * skip npgs - 1 iterations and continue which will
			 * increment by one the loop indices.
			 */
			spgcnt_t npgs = nreloc;

			VM_STAT_ADD(anonvmstats.getpages[16]);

			ASSERT(pp == ppa[pg_idx]);
			ASSERT(slotcreate == 0);
			ASSERT(pg_idx + npgs <= pgcnt);
			if ((*protp & PROT_WRITE) &&
			    anon_share(amp->ahp, an_idx, npgs)) {
				*protp &= ~PROT_WRITE;
			}
			pg_idx += npgs;
			an_idx += npgs;
			vaddr += PAGESIZE * npgs;
			continue;
		}

		VM_STAT_ADD(anonvmstats.getpages[17]);

		/*
		 * Anon_zero case.
		 */
		if (slotcreate) {
			ASSERT(prealloc);
			pagezero(pp, 0, PAGESIZE);
			CPU_STATS_ADD_K(vm, zfod, 1);
			hat_setrefmod(pp);
		}

		ASSERT(prealloc == 0 || ppa[pg_idx] == pp);
		ASSERT(prealloc != 0 || PAGE_SHARED(pp));
		ASSERT(prealloc == 0 || PAGE_EXCL(pp));

		/*
		 * The constituent pages must be physically contiguous, of
		 * the same size code, and start on a large page boundary.
		 */
		if (pg_idx > 0 &&
		    ((page_pptonum(pp) != page_pptonum(ppa[pg_idx - 1]) + 1) ||
		    (pp->p_szc != ppa[pg_idx - 1]->p_szc))) {
			panic("anon_map_getpages: unexpected page");
		} else if (pg_idx == 0 && (page_pptonum(pp) & (pgcnt - 1))) {
			panic("anon_map_getpages: unaligned page");
		}

		if (prealloc == 0) {
			ppa[pg_idx] = pp;
		}

		/* a shared slot makes the whole region read-only */
		if (ap->an_refcnt > 1) {
			VM_STAT_ADD(anonvmstats.getpages[18]);
			*protp &= ~PROT_WRITE;
		}

		/*
		 * If this is a new anon slot then initialize
		 * the anon array entry.
		 */
		if (slotcreate) {
			(void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP);
		}
		pg_idx++;
		an_idx++;
		vaddr += PAGESIZE;
	}

	/*
	 * Since preallocated pages come off the freelist
	 * they are locked SE_EXCL. Simply downgrade and return.
	 */
	if (prealloc) {
		VM_STAT_ADD(anonvmstats.getpages[19]);
		conpp = NULL;
		for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
			page_downgrade(ppa[pg_idx]);
		}
	}
	ASSERT(conpp == NULL);

	if (brkcow == 0 || (*protp & PROT_WRITE)) {
		VM_STAT_ADD(anonvmstats.getpages[20]);
		return (0);
	}

	/*
	 * Copy-on-write must be broken and at least one slot is shared:
	 * make private copies of the whole large page region.
	 */
	if (szc < seg->s_szc)
		panic("anon_map_getpages: cowfault for szc %d", szc);

	VM_STAT_ADD(anonvmstats.getpages[21]);

	*protp = PROT_ALL;
	return (anon_map_privatepages(amp, start_idx, szc, seg, addr, prot,
	    ppa, vpage, anypgsz, cred));
io_err:
	/*
	 * We got an IO error somewhere in our large page.
	 * If we were using a preallocated page then just demote
	 * all the constituent pages that we've succeeded with so far
	 * to PAGESIZE pages and leave them in the system
	 * unlocked.
	 */

	ASSERT(err != -2 || ((pg_idx == 0) && upsize));

	VM_STAT_COND_ADD(err > 0, anonvmstats.getpages[22]);
	VM_STAT_COND_ADD(err == -1, anonvmstats.getpages[23]);
	VM_STAT_COND_ADD(err == -2, anonvmstats.getpages[24]);

	if (prealloc) {
		conpp = NULL;
		if (pg_idx > 0) {
			VM_STAT_ADD(anonvmstats.getpages[25]);
			for (i = 0; i < pgcnt; i++) {
				pp = ppa[i];
				ASSERT(PAGE_EXCL(pp));
				ASSERT(pp->p_szc == szc);
				pp->p_szc = 0;
			}
			for (i = 0; i < pg_idx; i++) {
				ASSERT(!hat_page_is_mapped(ppa[i]));
				page_unlock(ppa[i]);
			}
			/*
			 * Now free up the remaining unused constituent
			 * pages.
			 */
			while (pg_idx < pgcnt) {
				ASSERT(!hat_page_is_mapped(ppa[pg_idx]));
				page_free(ppa[pg_idx], 0);
				pg_idx++;
			}
		} else {
			/* nothing was used yet: free the whole large page */
			VM_STAT_ADD(anonvmstats.getpages[26]);
			page_free_pages(ppa[0]);
		}
	} else {
		VM_STAT_ADD(anonvmstats.getpages[27]);
		ASSERT(err > 0);
		for (i = 0; i < pg_idx; i++)
			page_unlock(ppa[i]);
	}
	ASSERT(conpp == NULL);
	if (err != -1)
		return (err);
	/*
	 * we are here because we failed to relocate.
	 */
	ASSERT(prealloc);
	if (brkcow == 0 || !anon_share(amp->ahp, start_idx, pgcnt)) {
		VM_STAT_ADD(anonvmstats.getpages[28]);
		return (-1);
	}
	VM_STAT_ADD(anonvmstats.getpages[29]);
	goto docow;
}
2210 */ 2211 page_t * 2212 anon_private( 2213 struct anon **app, 2214 struct seg *seg, 2215 caddr_t addr, 2216 uint_t prot, 2217 page_t *opp, 2218 int oppflags, 2219 struct cred *cred) 2220 { 2221 struct anon *old = *app; 2222 struct anon *new; 2223 page_t *pp = NULL; 2224 struct vnode *vp; 2225 anoff_t off; 2226 page_t *anon_pl[1 + 1]; 2227 int err; 2228 2229 if (oppflags & STEAL_PAGE) 2230 ASSERT(PAGE_EXCL(opp)); 2231 else 2232 ASSERT(PAGE_LOCKED(opp)); 2233 2234 CPU_STATS_ADD_K(vm, cow_fault, 1); 2235 2236 /* Kernel probe */ 2237 TNF_PROBE_1(anon_private, "vm pagefault", /* CSTYLED */, 2238 tnf_opaque, address, addr); 2239 2240 *app = new = anon_alloc(NULL, 0); 2241 swap_xlate(new, &vp, &off); 2242 2243 if (oppflags & STEAL_PAGE) { 2244 page_rename(opp, vp, (u_offset_t)off); 2245 pp = opp; 2246 TRACE_5(TR_FAC_VM, TR_ANON_PRIVATE, 2247 "anon_private:seg %p addr %x pp %p vp %p off %lx", 2248 seg, addr, pp, vp, off); 2249 hat_setmod(pp); 2250 2251 /* bug 4026339 */ 2252 page_downgrade(pp); 2253 return (pp); 2254 } 2255 2256 /* 2257 * Call the VOP_GETPAGE routine to create the page, thereby 2258 * enabling the vnode driver to allocate any filesystem 2259 * space (e.g., disk block allocation for UFS). This also 2260 * prevents more than one page from being added to the 2261 * vnode at the same time. 2262 */ 2263 err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, NULL, 2264 anon_pl, PAGESIZE, seg, addr, S_CREATE, cred); 2265 if (err) 2266 goto out; 2267 2268 pp = anon_pl[0]; 2269 2270 /* 2271 * If the original page was locked, we need to move the lock 2272 * to the new page by transfering 'cowcnt/lckcnt' of the original 2273 * page to 'cowcnt/lckcnt' of the new page. 2274 * 2275 * See Statement at the beginning of segvn_lockop() and 2276 * comments in page_pp_useclaim() regarding the way 2277 * cowcnts/lckcnts are handled. 2278 * 2279 * Also availrmem must be decremented up front for read only mapping 2280 * before calling page_pp_useclaim. 
page_pp_useclaim will bump it back 2281 * if availrmem did not need to be decremented after all. 2282 */ 2283 if (oppflags & LOCK_PAGE) { 2284 if ((prot & PROT_WRITE) == 0) { 2285 mutex_enter(&freemem_lock); 2286 if (availrmem > pages_pp_maximum) { 2287 availrmem--; 2288 pages_useclaim++; 2289 } else { 2290 mutex_exit(&freemem_lock); 2291 goto out; 2292 } 2293 mutex_exit(&freemem_lock); 2294 } 2295 page_pp_useclaim(opp, pp, prot & PROT_WRITE); 2296 } 2297 2298 /* 2299 * Now copy the contents from the original page, 2300 * which is locked and loaded in the MMU by 2301 * the caller to prevent yet another page fault. 2302 */ 2303 ppcopy(opp, pp); /* XXX - should set mod bit in here */ 2304 2305 hat_setrefmod(pp); /* mark as modified */ 2306 2307 /* 2308 * Unload the old translation. 2309 */ 2310 hat_unload(seg->s_as->a_hat, addr, PAGESIZE, HAT_UNLOAD); 2311 2312 /* 2313 * Free unmapped, unmodified original page. 2314 * or release the lock on the original page, 2315 * otherwise the process will sleep forever in 2316 * anon_decref() waiting for the "exclusive" lock 2317 * on the page. 2318 */ 2319 (void) page_release(opp, 1); 2320 2321 /* 2322 * we are done with page creation so downgrade the new 2323 * page's selock to shared, this helps when multiple 2324 * as_fault(...SOFTLOCK...) are done to the same 2325 * page(aio) 2326 */ 2327 page_downgrade(pp); 2328 2329 /* 2330 * NOTE: The original anon slot must be freed by the 2331 * caller while holding the "anon_map" lock, if we 2332 * copied away from an anonymous page. 
*/
	return (pp);

out:
	*app = old;
	if (pp)
		page_unlock(pp);
	anon_decref(new);
	page_unlock(opp);
	return ((page_t *)NULL);
}

/*
 * Give the caller private copies of all constituent pages of the large
 * page (size code szc) whose first anon slot is start_idx in amp.
 *
 * ppa[] holds the pgcnt original pages on entry.  When the pages are
 * copied, every ppa[] entry is replaced by the newly created page, the
 * original page is unlocked, the old anon slot (if any) is decref'ed and
 * a freshly allocated anon slot is installed in amp->ahp.
 *
 * anypgsz == -1 skips the large page preallocation altogether; any other
 * value is handed to page_alloc_pages().  If no large page was
 * preallocated the new pages are PAGESIZE pages created one by one by
 * swap_getconpage().
 *
 * vpage, when not NULL, is indexed with the same pg_idx as ppa[] and is
 * consulted with VPP_ISPPLOCK() to move page locks to the new pages.
 *
 * Return values as implemented below:
 *	0	the pages were copied, or the first slot had an_refcnt == 1
 *		and ppa[0] already has p_szc == szc (nothing is copied and
 *		ppa[] is left untouched).
 *	-1	the first slot had an_refcnt == 1 but ppa[0]->p_szc < szc;
 *		every ppa[] page has been unlocked.
 *	ENOMEM	availrmem could not be lowered by pgcnt for locked pages;
 *		every non-NULL ppa[] page has been unlocked.
 */
int
anon_map_privatepages(
	struct anon_map *amp,
	ulong_t	start_idx,
	uint_t	szc,
	struct seg *seg,
	caddr_t addr,
	uint_t	prot,
	page_t	*ppa[],
	struct vpage vpage[],
	int anypgsz,
	struct cred *cred)
{
	pgcnt_t		pgcnt;
	struct vnode	*vp;
	anoff_t		off;
	page_t		*pl[2], *conpp = NULL;
	int		err;
	int		prealloc = 1;
	struct anon	*ap, *oldap;
	caddr_t		vaddr;
	page_t		*pplist, *pp;
	ulong_t		pg_idx, an_idx;
	spgcnt_t	nreloc = 0;
	int		pagelock = 0;
	kmutex_t	*ahmpages = NULL;
#ifdef DEBUG
	/* Only referenced from ASSERT()s in the copy loop below. */
	int		refcnt;
#endif

	ASSERT(szc != 0);
	ASSERT(szc == seg->s_szc);

	VM_STAT_ADD(anonvmstats.privatepages[0]);

	pgcnt = page_get_pagecnt(szc);
	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
	ASSERT(IS_P2ALIGNED(start_idx, pgcnt));

	ASSERT(amp != NULL);
	ap = anon_get_ptr(amp->ahp, start_idx);
	ASSERT(ap == NULL || ap->an_refcnt >= 1);

	VM_STAT_COND_ADD(ap == NULL, anonvmstats.privatepages[1]);

	/*
	 * Now try and allocate the large page. If we fail then just
	 * let VOP_GETPAGE give us PAGESIZE pages. Normally we let
	 * the caller make this decision but to avoid added complexity
	 * it's simpler to handle that case here.
	 */
	if (anypgsz == -1) {
		VM_STAT_ADD(anonvmstats.privatepages[2]);
		prealloc = 0;
	} else if (page_alloc_pages(anon_vp, seg, addr, &pplist, NULL, szc,
	    anypgsz) != 0) {
		VM_STAT_ADD(anonvmstats.privatepages[3]);
		prealloc = 0;
	}

	/*
	 * make the decrement of all refcnts of all
	 * anon slots of a large page appear atomic by
	 * getting an anonpages_hash_lock for the
	 * first anon slot of a large page.
	 */
	if (ap != NULL) {
		ahmpages = &anonpages_hash_lock[AH_LOCK(ap->an_vp,
		    ap->an_off)];
		mutex_enter(ahmpages);
		if (ap->an_refcnt == 1) {
			/*
			 * The first slot is not shared, so there is
			 * nothing to copy.  Give back the preallocated
			 * large page and tell the caller whether ppa[]
			 * already is a page of the requested size.
			 */
			VM_STAT_ADD(anonvmstats.privatepages[4]);
			ASSERT(!anon_share(amp->ahp, start_idx, pgcnt));
			mutex_exit(ahmpages);

			if (prealloc) {
				page_free_replacement_page(pplist);
				page_create_putback(pgcnt);
			}
			ASSERT(ppa[0]->p_szc <= szc);
			if (ppa[0]->p_szc == szc) {
				VM_STAT_ADD(anonvmstats.privatepages[5]);
				return (0);
			}
			for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
				ASSERT(ppa[pg_idx] != NULL);
				page_unlock(ppa[pg_idx]);
			}
			return (-1);
		}
	}

	/*
	 * If we are passed in the vpage array and this is
	 * not PROT_WRITE then we need to decrement availrmem
	 * up front before we try anything. If we need to and
	 * can't decrement availrmem then its better to fail now
	 * than in the middle of processing the new large page.
	 * page_pp_useclaim() on behalf of each constituent page
	 * below will adjust availrmem back for the cases not needed.
	 */
	if (vpage != NULL && (prot & PROT_WRITE) == 0) {
		for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
			if (VPP_ISPPLOCK(&vpage[pg_idx])) {
				pagelock = 1;
				break;
			}
		}
		if (pagelock) {
			VM_STAT_ADD(anonvmstats.privatepages[6]);
			mutex_enter(&freemem_lock);
			if (availrmem >= pages_pp_maximum + pgcnt) {
				availrmem -= pgcnt;
				pages_useclaim += pgcnt;
			} else {
				/*
				 * Failure path: drop every lock and
				 * resource taken so far before returning.
				 */
				VM_STAT_ADD(anonvmstats.privatepages[7]);
				mutex_exit(&freemem_lock);
				if (ahmpages != NULL) {
					mutex_exit(ahmpages);
				}
				if (prealloc) {
					page_free_replacement_page(pplist);
					page_create_putback(pgcnt);
				}
				for (pg_idx = 0; pg_idx < pgcnt; pg_idx++)
					if (ppa[pg_idx] != NULL)
						page_unlock(ppa[pg_idx]);
				return (ENOMEM);
			}
			mutex_exit(&freemem_lock);
		}
	}

	CPU_STATS_ADD_K(vm, cow_fault, pgcnt);

	VM_STAT_ADD(anonvmstats.privatepages[8]);

	/*
	 * Copy loop: pg_idx walks ppa[]/vpage[], an_idx walks the anon
	 * array and vaddr walks the virtual range, all one page at a time.
	 * ahmpages, if taken above, stays held across the whole loop.
	 */
	an_idx = start_idx;
	pg_idx = 0;
	vaddr = addr;
	for (; pg_idx < pgcnt; pg_idx++, an_idx++, vaddr += PAGESIZE) {
		ASSERT(ppa[pg_idx] != NULL);
		oldap = anon_get_ptr(amp->ahp, an_idx);
		ASSERT(ahmpages != NULL || oldap == NULL);
		ASSERT(ahmpages == NULL || oldap != NULL);
		ASSERT(ahmpages == NULL || oldap->an_refcnt > 1);
		/*
		 * DEBUG only: remember the first slot's refcnt and check
		 * that every later slot carries the same value.
		 */
		ASSERT(ahmpages == NULL || pg_idx != 0 ||
		    (refcnt = oldap->an_refcnt));
		ASSERT(ahmpages == NULL || pg_idx == 0 ||
		    refcnt == oldap->an_refcnt);

		ap = anon_alloc(NULL, 0);

		swap_xlate(ap, &vp, &off);

		/*
		 * Now setup our preallocated page to pass down to
		 * swap_getpage().
		 */
		if (prealloc) {
			pp = pplist;
			page_sub(&pplist, pp);
			conpp = pp;
		}

		err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE, NULL, pl,
		    PAGESIZE, conpp, NULL, &nreloc, seg, vaddr,
		    S_CREATE, cred);

		/*
		 * Impossible to fail this is S_CREATE.
		 */
		if (err)
			panic("anon_map_privatepages: VOP_GETPAGE failed");

		ASSERT(prealloc ? pp == pl[0] : pl[0]->p_szc == 0);
		ASSERT(prealloc == 0 || nreloc == 1);

		pp = pl[0];

		/*
		 * If the original page was locked, we need to move
		 * the lock to the new page by transferring
		 * 'cowcnt/lckcnt' of the original page to 'cowcnt/lckcnt'
		 * of the new page. pg_idx can be used to index
		 * into the vpage array since the caller will guarantee
		 * that vpage struct passed in corresponds to addr
		 * and forward.
		 */
		if (vpage != NULL && VPP_ISPPLOCK(&vpage[pg_idx])) {
			page_pp_useclaim(ppa[pg_idx], pp, prot & PROT_WRITE);
		} else if (pagelock) {
			/*
			 * This page is not locked: hand back the one page
			 * of availrmem that was claimed up front for it.
			 */
			mutex_enter(&freemem_lock);
			availrmem++;
			pages_useclaim--;
			mutex_exit(&freemem_lock);
		}

		/*
		 * Now copy the contents from the original page.
		 */
		ppcopy(ppa[pg_idx], pp);

		hat_setrefmod(pp);		/* mark as modified */

		/*
		 * Release the lock on the original page,
		 * decrement the old slot, and downgrade the lock
		 * on the new copy.
		 */
		page_unlock(ppa[pg_idx]);

		/*
		 * Preallocated pages are downgraded together after the
		 * loop instead.
		 */
		if (!prealloc)
			page_downgrade(pp);

		ppa[pg_idx] = pp;

		/*
		 * Now reflect the copy in the new anon array.
		 */
		ASSERT(ahmpages == NULL || oldap->an_refcnt > 1);
		if (oldap != NULL)
			anon_decref(oldap);
		(void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP);
	}
	if (ahmpages != NULL) {
		mutex_exit(ahmpages);
	}
	ASSERT(prealloc == 0 || pplist == NULL);
	if (prealloc) {
		VM_STAT_ADD(anonvmstats.privatepages[9]);
		for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
			page_downgrade(ppa[pg_idx]);
		}
	}

	/*
	 * Unload the old large page translation.
	 */
	hat_unload(seg->s_as->a_hat, addr, pgcnt << PAGESHIFT, HAT_UNLOAD);
	return (0);
}

/*
 * Allocate a private zero-filled anon page.
2590 */ 2591 page_t * 2592 anon_zero(struct seg *seg, caddr_t addr, struct anon **app, struct cred *cred) 2593 { 2594 struct anon *ap; 2595 page_t *pp; 2596 struct vnode *vp; 2597 anoff_t off; 2598 page_t *anon_pl[1 + 1]; 2599 int err; 2600 2601 /* Kernel probe */ 2602 TNF_PROBE_1(anon_zero, "vm pagefault", /* CSTYLED */, 2603 tnf_opaque, address, addr); 2604 2605 *app = ap = anon_alloc(NULL, 0); 2606 swap_xlate(ap, &vp, &off); 2607 2608 /* 2609 * Call the VOP_GETPAGE routine to create the page, thereby 2610 * enabling the vnode driver to allocate any filesystem 2611 * dependent structures (e.g., disk block allocation for UFS). 2612 * This also prevents more than on page from being added to 2613 * the vnode at the same time since it is locked. 2614 */ 2615 err = VOP_GETPAGE(vp, off, PAGESIZE, NULL, 2616 anon_pl, PAGESIZE, seg, addr, S_CREATE, cred); 2617 if (err) { 2618 *app = NULL; 2619 anon_decref(ap); 2620 return (NULL); 2621 } 2622 pp = anon_pl[0]; 2623 2624 pagezero(pp, 0, PAGESIZE); /* XXX - should set mod bit */ 2625 page_downgrade(pp); 2626 CPU_STATS_ADD_K(vm, zfod, 1); 2627 hat_setrefmod(pp); /* mark as modified so pageout writes back */ 2628 return (pp); 2629 } 2630 2631 2632 /* 2633 * Allocate array of private zero-filled anon pages for empty slots 2634 * and kept pages for non empty slots within given range. 2635 * 2636 * NOTE: This rontine will try and use large pages 2637 * if available and supported by underlying platform. 
2638 */ 2639 int 2640 anon_map_createpages( 2641 struct anon_map *amp, 2642 ulong_t start_index, 2643 size_t len, 2644 page_t *ppa[], 2645 struct seg *seg, 2646 caddr_t addr, 2647 enum seg_rw rw, 2648 struct cred *cred) 2649 { 2650 2651 struct anon *ap; 2652 struct vnode *ap_vp; 2653 page_t *pp, *pplist, *anon_pl[1 + 1], *conpp = NULL; 2654 int err = 0; 2655 ulong_t p_index, index; 2656 pgcnt_t npgs, pg_cnt; 2657 spgcnt_t nreloc = 0; 2658 uint_t l_szc, szc, prot; 2659 anoff_t ap_off; 2660 size_t pgsz; 2661 lgrp_t *lgrp; 2662 2663 /* 2664 * XXX For now only handle S_CREATE. 2665 */ 2666 ASSERT(rw == S_CREATE); 2667 2668 index = start_index; 2669 p_index = 0; 2670 npgs = btopr(len); 2671 2672 /* 2673 * If this platform supports multiple page sizes 2674 * then try and allocate directly from the free 2675 * list for pages larger than PAGESIZE. 2676 * 2677 * NOTE:When we have page_create_ru we can stop 2678 * directly allocating from the freelist. 2679 */ 2680 l_szc = seg->s_szc; 2681 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER); 2682 while (npgs) { 2683 2684 /* 2685 * if anon slot already exists 2686 * (means page has been created) 2687 * so 1) look up the page 2688 * 2) if the page is still in memory, get it. 2689 * 3) if not, create a page and 2690 * page in from physical swap device. 2691 * These are done in anon_getpage(). 2692 */ 2693 ap = anon_get_ptr(amp->ahp, index); 2694 if (ap) { 2695 err = anon_getpage(&ap, &prot, anon_pl, PAGESIZE, 2696 seg, addr, S_READ, cred); 2697 if (err) { 2698 ANON_LOCK_EXIT(&->a_rwlock); 2699 panic("anon_map_createpages: anon_getpage"); 2700 } 2701 pp = anon_pl[0]; 2702 ppa[p_index++] = pp; 2703 2704 addr += PAGESIZE; 2705 index++; 2706 npgs--; 2707 continue; 2708 } 2709 /* 2710 * Now try and allocate the largest page possible 2711 * for the current address and range. 2712 * Keep dropping down in page size until: 2713 * 2714 * 1) Properly aligned 2715 * 2) Does not overlap existing anon pages 2716 * 3) Fits in remaining range. 
2717 * 4) able to allocate one. 2718 * 2719 * NOTE: XXX When page_create_ru is completed this code 2720 * will change. 2721 */ 2722 szc = l_szc; 2723 pplist = NULL; 2724 pg_cnt = 0; 2725 while (szc) { 2726 pgsz = page_get_pagesize(szc); 2727 pg_cnt = pgsz >> PAGESHIFT; 2728 if (IS_P2ALIGNED(addr, pgsz) && pg_cnt <= npgs && 2729 anon_pages(amp->ahp, index, pg_cnt) == 0) { 2730 /* 2731 * XXX 2732 * Since we are faking page_create() 2733 * we also need to do the freemem and 2734 * pcf accounting. 2735 */ 2736 (void) page_create_wait(pg_cnt, PG_WAIT); 2737 2738 /* 2739 * Get lgroup to allocate next page of shared 2740 * memory from and use it to specify where to 2741 * allocate the physical memory 2742 */ 2743 lgrp = lgrp_mem_choose(seg, addr, pgsz); 2744 2745 pplist = page_get_freelist( 2746 anon_vp, (u_offset_t)0, seg, 2747 addr, pgsz, 0, lgrp); 2748 2749 if (pplist == NULL) { 2750 page_create_putback(pg_cnt); 2751 } 2752 2753 /* 2754 * If a request for a page of size 2755 * larger than PAGESIZE failed 2756 * then don't try that size anymore. 2757 */ 2758 if (pplist == NULL) { 2759 l_szc = szc - 1; 2760 } else { 2761 break; 2762 } 2763 } 2764 szc--; 2765 } 2766 2767 /* 2768 * If just using PAGESIZE pages then don't 2769 * directly allocate from the free list. 2770 */ 2771 if (pplist == NULL) { 2772 ASSERT(szc == 0); 2773 pp = anon_zero(seg, addr, &ap, cred); 2774 if (pp == NULL) { 2775 ANON_LOCK_EXIT(&->a_rwlock); 2776 panic("anon_map_createpages: anon_zero"); 2777 } 2778 ppa[p_index++] = pp; 2779 2780 ASSERT(anon_get_ptr(amp->ahp, index) == NULL); 2781 (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP); 2782 2783 addr += PAGESIZE; 2784 index++; 2785 npgs--; 2786 continue; 2787 } 2788 2789 /* 2790 * pplist is a list of pg_cnt PAGESIZE pages. 2791 * These pages are locked SE_EXCL since they 2792 * came directly off the free list. 
2793 */ 2794 ASSERT(IS_P2ALIGNED(pg_cnt, pg_cnt)); 2795 ASSERT(IS_P2ALIGNED(index, pg_cnt)); 2796 ASSERT(conpp == NULL); 2797 while (pg_cnt--) { 2798 2799 ap = anon_alloc(NULL, 0); 2800 swap_xlate(ap, &ap_vp, &ap_off); 2801 2802 ASSERT(pplist != NULL); 2803 pp = pplist; 2804 page_sub(&pplist, pp); 2805 PP_CLRFREE(pp); 2806 PP_CLRAGED(pp); 2807 conpp = pp; 2808 2809 err = swap_getconpage(ap_vp, ap_off, PAGESIZE, 2810 (uint_t *)NULL, anon_pl, PAGESIZE, conpp, NULL, 2811 &nreloc, seg, addr, S_CREATE, cred); 2812 2813 if (err) { 2814 ANON_LOCK_EXIT(&->a_rwlock); 2815 panic("anon_map_createpages: S_CREATE"); 2816 } 2817 2818 ASSERT(anon_pl[0] == pp); 2819 ASSERT(nreloc == 1); 2820 pagezero(pp, 0, PAGESIZE); 2821 CPU_STATS_ADD_K(vm, zfod, 1); 2822 hat_setrefmod(pp); 2823 2824 ASSERT(anon_get_ptr(amp->ahp, index) == NULL); 2825 (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP); 2826 2827 ppa[p_index++] = pp; 2828 2829 addr += PAGESIZE; 2830 index++; 2831 npgs--; 2832 } 2833 conpp = NULL; 2834 pg_cnt = pgsz >> PAGESHIFT; 2835 p_index = p_index - pg_cnt; 2836 while (pg_cnt--) { 2837 page_downgrade(ppa[p_index++]); 2838 } 2839 } 2840 ANON_LOCK_EXIT(&->a_rwlock); 2841 return (0); 2842 } 2843 2844 static int 2845 anon_try_demote_pages( 2846 struct anon_hdr *ahp, 2847 ulong_t sidx, 2848 uint_t szc, 2849 page_t **ppa, 2850 int private) 2851 { 2852 struct anon *ap; 2853 pgcnt_t pgcnt = page_get_pagecnt(szc); 2854 page_t *pp; 2855 pgcnt_t i; 2856 kmutex_t *ahmpages = NULL; 2857 int root = 0; 2858 pgcnt_t npgs; 2859 pgcnt_t curnpgs = 0; 2860 size_t ppasize = 0; 2861 2862 ASSERT(szc != 0); 2863 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 2864 ASSERT(IS_P2ALIGNED(sidx, pgcnt)); 2865 ASSERT(sidx < ahp->size); 2866 2867 if (ppa == NULL) { 2868 ppasize = pgcnt * sizeof (page_t *); 2869 ppa = kmem_alloc(ppasize, KM_SLEEP); 2870 } 2871 2872 ap = anon_get_ptr(ahp, sidx); 2873 if (ap != NULL && private) { 2874 VM_STAT_ADD(anonvmstats.demotepages[1]); 2875 ahmpages = 
&anonpages_hash_lock[AH_LOCK(ap->an_vp, ap->an_off)]; 2876 mutex_enter(ahmpages); 2877 } 2878 2879 if (ap != NULL && ap->an_refcnt > 1) { 2880 if (ahmpages != NULL) { 2881 VM_STAT_ADD(anonvmstats.demotepages[2]); 2882 mutex_exit(ahmpages); 2883 } 2884 if (ppasize != 0) { 2885 kmem_free(ppa, ppasize); 2886 } 2887 return (0); 2888 } 2889 if (ahmpages != NULL) { 2890 mutex_exit(ahmpages); 2891 } 2892 if (ahp->size - sidx < pgcnt) { 2893 ASSERT(private == 0); 2894 pgcnt = ahp->size - sidx; 2895 } 2896 for (i = 0; i < pgcnt; i++, sidx++) { 2897 ap = anon_get_ptr(ahp, sidx); 2898 if (ap != NULL) { 2899 if (ap->an_refcnt != 1) { 2900 panic("anon_try_demote_pages: an_refcnt != 1"); 2901 } 2902 pp = ppa[i] = page_lookup(ap->an_vp, ap->an_off, 2903 SE_EXCL); 2904 if (pp != NULL) { 2905 (void) hat_pageunload(pp, 2906 HAT_FORCE_PGUNLOAD); 2907 } 2908 } else { 2909 ppa[i] = NULL; 2910 } 2911 } 2912 for (i = 0; i < pgcnt; i++) { 2913 if ((pp = ppa[i]) != NULL && pp->p_szc != 0) { 2914 ASSERT(pp->p_szc <= szc); 2915 if (!root) { 2916 VM_STAT_ADD(anonvmstats.demotepages[3]); 2917 if (curnpgs != 0) 2918 panic("anon_try_demote_pages: " 2919 "bad large page"); 2920 2921 root = 1; 2922 curnpgs = npgs = 2923 page_get_pagecnt(pp->p_szc); 2924 2925 ASSERT(npgs <= pgcnt); 2926 ASSERT(IS_P2ALIGNED(npgs, npgs)); 2927 ASSERT(!(page_pptonum(pp) & 2928 (npgs - 1))); 2929 } else { 2930 ASSERT(i > 0); 2931 ASSERT(page_pptonum(pp) - 1 == 2932 page_pptonum(ppa[i - 1])); 2933 if ((page_pptonum(pp) & (npgs - 1)) == 2934 npgs - 1) 2935 root = 0; 2936 } 2937 ASSERT(PAGE_EXCL(pp)); 2938 pp->p_szc = 0; 2939 ASSERT(curnpgs > 0); 2940 curnpgs--; 2941 } 2942 } 2943 if (root != 0 || curnpgs != 0) 2944 panic("anon_try_demote_pages: bad large page"); 2945 2946 for (i = 0; i < pgcnt; i++) { 2947 if ((pp = ppa[i]) != NULL) { 2948 ASSERT(!hat_page_is_mapped(pp)); 2949 ASSERT(pp->p_szc == 0); 2950 page_unlock(pp); 2951 } 2952 } 2953 if (ppasize != 0) { 2954 kmem_free(ppa, ppasize); 2955 } 2956 return (1); 2957 } 
2958 2959 /* 2960 * anon_map_demotepages() can only be called by MAP_PRIVATE segments. 2961 */ 2962 int 2963 anon_map_demotepages( 2964 struct anon_map *amp, 2965 ulong_t start_idx, 2966 struct seg *seg, 2967 caddr_t addr, 2968 uint_t prot, 2969 struct vpage vpage[], 2970 struct cred *cred) 2971 { 2972 struct anon *ap; 2973 uint_t szc = seg->s_szc; 2974 pgcnt_t pgcnt = page_get_pagecnt(szc); 2975 size_t ppasize = pgcnt * sizeof (page_t *); 2976 page_t **ppa = kmem_alloc(ppasize, KM_SLEEP); 2977 page_t *pp; 2978 page_t *pl[2]; 2979 pgcnt_t i, pg_idx; 2980 ulong_t an_idx; 2981 caddr_t vaddr; 2982 int err; 2983 int retry = 0; 2984 uint_t vpprot; 2985 2986 ASSERT(RW_WRITE_HELD(&->a_rwlock)); 2987 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt)); 2988 ASSERT(IS_P2ALIGNED(start_idx, pgcnt)); 2989 ASSERT(ppa != NULL); 2990 ASSERT(szc != 0); 2991 ASSERT(szc == amp->a_szc); 2992 2993 VM_STAT_ADD(anonvmstats.demotepages[0]); 2994 2995 top: 2996 if (anon_try_demote_pages(amp->ahp, start_idx, szc, ppa, 1)) { 2997 kmem_free(ppa, ppasize); 2998 return (0); 2999 } 3000 3001 VM_STAT_ADD(anonvmstats.demotepages[4]); 3002 3003 ASSERT(retry == 0); /* we can be here only once */ 3004 3005 vaddr = addr; 3006 for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt; 3007 pg_idx++, an_idx++, vaddr += PAGESIZE) { 3008 ap = anon_get_ptr(amp->ahp, an_idx); 3009 if (ap == NULL) 3010 panic("anon_map_demotepages: no anon slot"); 3011 err = anon_getpage(&ap, &vpprot, pl, PAGESIZE, seg, vaddr, 3012 S_READ, cred); 3013 if (err) { 3014 for (i = 0; i < pg_idx; i++) { 3015 if ((pp = ppa[i]) != NULL) 3016 page_unlock(pp); 3017 } 3018 kmem_free(ppa, ppasize); 3019 return (err); 3020 } 3021 ppa[pg_idx] = pl[0]; 3022 } 3023 3024 err = anon_map_privatepages(amp, start_idx, szc, seg, addr, prot, ppa, 3025 vpage, -1, cred); 3026 if (err > 0) { 3027 VM_STAT_ADD(anonvmstats.demotepages[5]); 3028 kmem_free(ppa, ppasize); 3029 return (err); 3030 } 3031 ASSERT(err == 0 || err == -1); 3032 if (err == -1) { 3033 
VM_STAT_ADD(anonvmstats.demotepages[6]); 3034 retry = 1; 3035 goto top; 3036 } 3037 for (i = 0; i < pgcnt; i++) { 3038 ASSERT(ppa[i] != NULL); 3039 if (ppa[i]->p_szc != 0) 3040 retry = 1; 3041 page_unlock(ppa[i]); 3042 } 3043 if (retry) { 3044 VM_STAT_ADD(anonvmstats.demotepages[7]); 3045 goto top; 3046 } 3047 3048 VM_STAT_ADD(anonvmstats.demotepages[8]); 3049 3050 kmem_free(ppa, ppasize); 3051 3052 return (0); 3053 } 3054 3055 /* 3056 * Free pages of shared anon map. It's assumed that anon maps don't share anon 3057 * structures with private anon maps. Therefore all anon structures should 3058 * have at most one reference at this point. This means underlying pages can 3059 * be exclusively locked and demoted or freed. If not freeing the entire 3060 * large pages demote the ends of the region we free to be able to free 3061 * subpages. Page roots correspend to aligned index positions in anon map. 3062 */ 3063 void 3064 anon_shmap_free_pages(struct anon_map *amp, ulong_t sidx, size_t len) 3065 { 3066 ulong_t eidx = sidx + btopr(len); 3067 pgcnt_t pages = page_get_pagecnt(amp->a_szc); 3068 struct anon_hdr *ahp = amp->ahp; 3069 ulong_t tidx; 3070 size_t size; 3071 ulong_t sidx_aligned; 3072 ulong_t eidx_aligned; 3073 3074 ASSERT(RW_WRITE_HELD(&->a_rwlock)); 3075 ASSERT(amp->refcnt <= 1); 3076 ASSERT(amp->a_szc > 0); 3077 ASSERT(eidx <= ahp->size); 3078 ASSERT(!anon_share(ahp, sidx, btopr(len))); 3079 3080 if (len == 0) { /* XXX */ 3081 return; 3082 } 3083 3084 sidx_aligned = P2ALIGN(sidx, pages); 3085 if (sidx_aligned != sidx || 3086 (eidx < sidx_aligned + pages && eidx < ahp->size)) { 3087 if (!anon_try_demote_pages(ahp, sidx_aligned, 3088 amp->a_szc, NULL, 0)) { 3089 panic("anon_shmap_free_pages: demote failed"); 3090 } 3091 size = (eidx <= sidx_aligned + pages) ? 
(eidx - sidx) : 3092 P2NPHASE(sidx, pages); 3093 size <<= PAGESHIFT; 3094 anon_free(ahp, sidx, size); 3095 sidx = sidx_aligned + pages; 3096 if (eidx <= sidx) { 3097 return; 3098 } 3099 } 3100 eidx_aligned = P2ALIGN(eidx, pages); 3101 if (sidx < eidx_aligned) { 3102 anon_free_pages(ahp, sidx, 3103 (eidx_aligned - sidx) << PAGESHIFT, 3104 amp->a_szc); 3105 sidx = eidx_aligned; 3106 } 3107 ASSERT(sidx == eidx_aligned); 3108 if (eidx == eidx_aligned) { 3109 return; 3110 } 3111 tidx = eidx; 3112 if (eidx != ahp->size && anon_get_next_ptr(ahp, &tidx) != NULL && 3113 tidx - sidx < pages) { 3114 if (!anon_try_demote_pages(ahp, sidx, amp->a_szc, NULL, 0)) { 3115 panic("anon_shmap_free_pages: demote failed"); 3116 } 3117 size = (eidx - sidx) << PAGESHIFT; 3118 anon_free(ahp, sidx, size); 3119 } else { 3120 anon_free_pages(ahp, sidx, pages << PAGESHIFT, amp->a_szc); 3121 } 3122 } 3123 3124 /* 3125 * Allocate and initialize an anon_map structure for seg 3126 * associating the given swap reservation with the new anon_map. 3127 */ 3128 struct anon_map * 3129 anonmap_alloc(size_t size, size_t swresv) 3130 { 3131 struct anon_map *amp; 3132 3133 amp = kmem_cache_alloc(anonmap_cache, KM_SLEEP); 3134 3135 amp->refcnt = 1; 3136 amp->size = size; 3137 3138 amp->ahp = anon_create(btopr(size), ANON_SLEEP); 3139 amp->swresv = swresv; 3140 amp->locality = 0; 3141 amp->a_szc = 0; 3142 return (amp); 3143 } 3144 3145 void 3146 anonmap_free(struct anon_map *amp) 3147 { 3148 ASSERT(amp->ahp); 3149 ASSERT(amp->refcnt == 0); 3150 3151 lgrp_shm_policy_fini(amp, NULL); 3152 anon_release(amp->ahp, btopr(amp->size)); 3153 kmem_cache_free(anonmap_cache, amp); 3154 } 3155 3156 /* 3157 * Returns true if the app array has some empty slots. 3158 * The offp and lenp paramters are in/out paramters. On entry 3159 * these values represent the starting offset and length of the 3160 * mapping. When true is returned, these values may be modified 3161 * to be the largest range which includes empty slots. 
3162 */ 3163 int 3164 non_anon(struct anon_hdr *ahp, ulong_t anon_idx, u_offset_t *offp, 3165 size_t *lenp) 3166 { 3167 ulong_t i, el; 3168 ssize_t low, high; 3169 struct anon *ap; 3170 3171 low = -1; 3172 for (i = 0, el = *lenp; i < el; i += PAGESIZE, anon_idx++) { 3173 ap = anon_get_ptr(ahp, anon_idx); 3174 if (ap == NULL) { 3175 if (low == -1) 3176 low = i; 3177 high = i; 3178 } 3179 } 3180 if (low != -1) { 3181 /* 3182 * Found at least one non-anon page. 3183 * Set up the off and len return values. 3184 */ 3185 if (low != 0) 3186 *offp += low; 3187 *lenp = high - low + PAGESIZE; 3188 return (1); 3189 } 3190 return (0); 3191 } 3192 3193 /* 3194 * Return a count of the number of existing anon pages in the anon array 3195 * app in the range (off, off+len). The array and slots must be guaranteed 3196 * stable by the caller. 3197 */ 3198 pgcnt_t 3199 anon_pages(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots) 3200 { 3201 pgcnt_t cnt = 0; 3202 3203 while (nslots-- > 0) { 3204 if ((anon_get_ptr(ahp, anon_index)) != NULL) 3205 cnt++; 3206 anon_index++; 3207 } 3208 return (cnt); 3209 } 3210 3211 /* 3212 * Move reserved phys swap into memory swap (unreserve phys swap 3213 * and reserve mem swap by the same amount). 
3214 * Used by segspt when it needs to lock resrved swap npages in memory 3215 */ 3216 int 3217 anon_swap_adjust(pgcnt_t npages) 3218 { 3219 pgcnt_t unlocked_mem_swap; 3220 3221 mutex_enter(&anoninfo_lock); 3222 3223 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 3224 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 3225 3226 unlocked_mem_swap = k_anoninfo.ani_mem_resv 3227 - k_anoninfo.ani_locked_swap; 3228 if (npages > unlocked_mem_swap) { 3229 spgcnt_t adjusted_swap = npages - unlocked_mem_swap; 3230 3231 /* 3232 * if there is not enough unlocked mem swap we take missing 3233 * amount from phys swap and give it to mem swap 3234 */ 3235 if (!page_reclaim_mem(adjusted_swap, segspt_minfree, 1)) { 3236 mutex_exit(&anoninfo_lock); 3237 return (ENOMEM); 3238 } 3239 3240 k_anoninfo.ani_mem_resv += adjusted_swap; 3241 ASSERT(k_anoninfo.ani_phys_resv >= adjusted_swap); 3242 k_anoninfo.ani_phys_resv -= adjusted_swap; 3243 3244 ANI_ADD(adjusted_swap); 3245 } 3246 k_anoninfo.ani_locked_swap += npages; 3247 3248 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap); 3249 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv); 3250 3251 mutex_exit(&anoninfo_lock); 3252 3253 return (0); 3254 } 3255 3256 /* 3257 * 'unlocked' reserved mem swap so when it is unreserved it 3258 * can be moved back phys (disk) swap 3259 */ 3260 void 3261 anon_swap_restore(pgcnt_t npages) 3262 { 3263 mutex_enter(&anoninfo_lock); 3264 3265 ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv); 3266 3267 ASSERT(k_anoninfo.ani_locked_swap >= npages); 3268 k_anoninfo.ani_locked_swap -= npages; 3269 3270 ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv); 3271 3272 mutex_exit(&anoninfo_lock); 3273 } 3274 3275 /* 3276 * Return the pointer from the list for a 3277 * specified anon index. 
3278 */ 3279 ulong_t * 3280 anon_get_slot(struct anon_hdr *ahp, ulong_t an_idx) 3281 { 3282 struct anon **app; 3283 void **ppp; 3284 3285 ASSERT(an_idx < ahp->size); 3286 3287 /* 3288 * Single level case. 3289 */ 3290 if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) { 3291 return ((ulong_t *)&ahp->array_chunk[an_idx]); 3292 } else { 3293 3294 /* 3295 * 2 level case. 3296 */ 3297 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 3298 if (*ppp == NULL) { 3299 mutex_enter(&ahp->serial_lock); 3300 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT]; 3301 if (*ppp == NULL) 3302 *ppp = kmem_zalloc(PAGESIZE, KM_SLEEP); 3303 mutex_exit(&ahp->serial_lock); 3304 } 3305 app = *ppp; 3306 return ((ulong_t *)&app[an_idx & ANON_CHUNK_OFF]); 3307 } 3308 } 3309 3310 void 3311 anon_array_enter(struct anon_map *amp, ulong_t an_idx, anon_sync_obj_t *sobj) 3312 { 3313 ulong_t *ap_slot; 3314 kmutex_t *mtx; 3315 kcondvar_t *cv; 3316 int hash; 3317 3318 /* 3319 * Use szc to determine anon slot(s) to appear atomic. 3320 * If szc = 0, then lock the anon slot and mark it busy. 3321 * If szc > 0, then lock the range of slots by getting the 3322 * anon_array_lock for the first anon slot, and mark only the 3323 * first anon slot busy to represent whole range being busy. 3324 */ 3325 3326 ASSERT(RW_READ_HELD(&->a_rwlock)); 3327 an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc)); 3328 hash = ANON_ARRAY_HASH(amp, an_idx); 3329 sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex; 3330 sobj->sync_cv = cv = &anon_array_cv[hash]; 3331 mutex_enter(mtx); 3332 ap_slot = anon_get_slot(amp->ahp, an_idx); 3333 while (ANON_ISBUSY(ap_slot)) 3334 cv_wait(cv, mtx); 3335 ANON_SETBUSY(ap_slot); 3336 sobj->sync_data = ap_slot; 3337 mutex_exit(mtx); 3338 } 3339 3340 int 3341 anon_array_try_enter(struct anon_map *amp, ulong_t an_idx, 3342 anon_sync_obj_t *sobj) 3343 { 3344 ulong_t *ap_slot; 3345 kmutex_t *mtx; 3346 int hash; 3347 3348 /* 3349 * Try to lock a range of anon slots. 
3350 * Use szc to determine anon slot(s) to appear atomic. 3351 * If szc = 0, then lock the anon slot and mark it busy. 3352 * If szc > 0, then lock the range of slots by getting the 3353 * anon_array_lock for the first anon slot, and mark only the 3354 * first anon slot busy to represent whole range being busy. 3355 * Fail if the mutex or the anon_array are busy. 3356 */ 3357 3358 ASSERT(RW_READ_HELD(&->a_rwlock)); 3359 an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc)); 3360 hash = ANON_ARRAY_HASH(amp, an_idx); 3361 sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex; 3362 sobj->sync_cv = &anon_array_cv[hash]; 3363 if (!mutex_tryenter(mtx)) { 3364 return (EWOULDBLOCK); 3365 } 3366 ap_slot = anon_get_slot(amp->ahp, an_idx); 3367 if (ANON_ISBUSY(ap_slot)) { 3368 mutex_exit(mtx); 3369 return (EWOULDBLOCK); 3370 } 3371 ANON_SETBUSY(ap_slot); 3372 sobj->sync_data = ap_slot; 3373 mutex_exit(mtx); 3374 return (0); 3375 } 3376 3377 void 3378 anon_array_exit(anon_sync_obj_t *sobj) 3379 { 3380 mutex_enter(sobj->sync_mutex); 3381 ASSERT(ANON_ISBUSY(sobj->sync_data)); 3382 ANON_CLRBUSY(sobj->sync_data); 3383 if (CV_HAS_WAITERS(sobj->sync_cv)) 3384 cv_broadcast(sobj->sync_cv); 3385 mutex_exit(sobj->sync_mutex); 3386 } 3387