1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2015, Joyent, Inc. All rights reserved.
24 */
25
26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 /* All Rights Reserved */
28
29 /*
30 * University Copyright- Copyright (c) 1982, 1986, 1988
31 * The Regents of the University of California
32 * All Rights Reserved
33 *
34 * University Acknowledgment- Portions of this document are derived from
35 * software developed by the University of California, Berkeley, and its
36 * contributors.
37 */
38
39 /*
40 * VM - anonymous pages.
41 *
42 * This layer sits immediately above the vm_swap layer. It manages
43 * physical pages that have no permanent identity in the file system
44 * name space, using the services of the vm_swap layer to allocate
45 * backing storage for these pages. Since these pages have no external
46 * identity, they are discarded when the last reference is removed.
47 *
48 * An important function of this layer is to manage low-level sharing
49 * of pages that are logically distinct but that happen to be
50 * physically identical (e.g., the corresponding pages of the processes
51 * resulting from a fork before one process or the other changes their
52 * contents). This pseudo-sharing is present only as an optimization
53 * and is not to be confused with true sharing in which multiple
54 * address spaces deliberately contain references to the same object;
55 * such sharing is managed at a higher level.
56 *
57 * The key data structure here is the anon struct, which contains a
58 * reference count for its associated physical page and a hint about
59 * the identity of that page. Anon structs typically live in arrays,
60 * with an instance's position in its array determining where the
61 * corresponding backing storage is allocated; however, the swap_xlate()
62 * routine abstracts away this representation information so that the
63 * rest of the anon layer need not know it. (See the swap layer for
64 * more details on anon struct layout.)
65 *
 * In future versions of the system, the association between an
 * anon struct and its position on backing store will change so that
 * we don't require backing store for all anonymous pages in the system.
 * This is an important consideration for large memory systems.
 * We can also use this technique to delay binding physical locations
 * to anonymous pages until pageout/swapout time, when we can make
 * smarter allocation decisions to improve anonymous klustering.
73 *
74 * Many of the routines defined here take a (struct anon **) argument,
75 * which allows the code at this level to manage anon pages directly,
76 * so that callers can regard anon structs as opaque objects and not be
77 * concerned with assigning or inspecting their contents.
78 *
79 * Clients of this layer refer to anon pages indirectly. That is, they
80 * maintain arrays of pointers to anon structs rather than maintaining
81 * anon structs themselves. The (struct anon **) arguments mentioned
82 * above are pointers to entries in these arrays. It is these arrays
83 * that capture the mapping between offsets within a given segment and
84 * the corresponding anonymous backing storage address.
85 */
86
87 #ifdef DEBUG
88 #define ANON_DEBUG
89 #endif
90
91 #include <sys/types.h>
92 #include <sys/t_lock.h>
93 #include <sys/param.h>
94 #include <sys/systm.h>
95 #include <sys/mman.h>
96 #include <sys/cred.h>
97 #include <sys/thread.h>
98 #include <sys/vnode.h>
99 #include <sys/cpuvar.h>
100 #include <sys/swap.h>
101 #include <sys/cmn_err.h>
102 #include <sys/vtrace.h>
103 #include <sys/kmem.h>
104 #include <sys/sysmacros.h>
105 #include <sys/bitmap.h>
106 #include <sys/vmsystm.h>
107 #include <sys/tuneable.h>
108 #include <sys/debug.h>
109 #include <sys/fs/swapnode.h>
110 #include <sys/tnf_probe.h>
111 #include <sys/lgrp.h>
112 #include <sys/policy.h>
113 #include <sys/condvar_impl.h>
114 #include <sys/mutex_impl.h>
115 #include <sys/rctl.h>
116
117 #include <vm/as.h>
118 #include <vm/hat.h>
119 #include <vm/anon.h>
120 #include <vm/page.h>
121 #include <vm/vpage.h>
122 #include <vm/seg.h>
123 #include <vm/rm.h>
124
125 #include <fs/fs_subr.h>
126
127 struct vnode *anon_vp;
128
129 int anon_debug;
130
131 kmutex_t anoninfo_lock;
132 struct k_anoninfo k_anoninfo;
133 ani_free_t *ani_free_pool;
134 pad_mutex_t anon_array_lock[ANON_LOCKSIZE];
135 kcondvar_t anon_array_cv[ANON_LOCKSIZE];
136
137 /*
138 * Global hash table for (vp, off) -> anon slot
139 */
140 extern int swap_maxcontig;
141 size_t anon_hash_size;
142 unsigned int anon_hash_shift;
143 struct anon **anon_hash;
144
145 static struct kmem_cache *anon_cache;
146 static struct kmem_cache *anonmap_cache;
147
148 pad_mutex_t *anonhash_lock;
149
150 /*
151 * Used to make the increment of all refcnts of all anon slots of a large
152 * page appear to be atomic. The lock is grabbed for the first anon slot of
153 * a large page.
154 */
155 pad_mutex_t *anonpages_hash_lock;
156
157 #define APH_MUTEX(vp, off) \
158 (&anonpages_hash_lock[(ANON_HASH((vp), (off)) & \
159 (AH_LOCK_SIZE - 1))].pad_mutex)
160
161 #ifdef VM_STATS
162 static struct anonvmstats_str {
163 ulong_t getpages[30];
164 ulong_t privatepages[10];
165 ulong_t demotepages[9];
166 ulong_t decrefpages[9];
167 ulong_t dupfillholes[4];
168 ulong_t freepages[1];
169 } anonvmstats;
170 #endif /* VM_STATS */
171
172 /*ARGSUSED*/
173 static int
anonmap_cache_constructor(void * buf,void * cdrarg,int kmflags)174 anonmap_cache_constructor(void *buf, void *cdrarg, int kmflags)
175 {
176 struct anon_map *amp = buf;
177
178 rw_init(&->a_rwlock, NULL, RW_DEFAULT, NULL);
179 cv_init(&->a_purgecv, NULL, CV_DEFAULT, NULL);
180 mutex_init(&->a_pmtx, NULL, MUTEX_DEFAULT, NULL);
181 mutex_init(&->a_purgemtx, NULL, MUTEX_DEFAULT, NULL);
182 return (0);
183 }
184
185 /*ARGSUSED1*/
186 static void
anonmap_cache_destructor(void * buf,void * cdrarg)187 anonmap_cache_destructor(void *buf, void *cdrarg)
188 {
189 struct anon_map *amp = buf;
190
191 rw_destroy(&->a_rwlock);
192 cv_destroy(&->a_purgecv);
193 mutex_destroy(&->a_pmtx);
194 mutex_destroy(&->a_purgemtx);
195 }
196
197 void
anon_init(void)198 anon_init(void)
199 {
200 int i;
201 pad_mutex_t *tmp;
202
203 /* These both need to be powers of 2 so round up to the next power */
204 anon_hash_shift = highbit((physmem / ANON_HASHAVELEN) - 1);
205 anon_hash_size = 1L << anon_hash_shift;
206
207 /*
208 * We need to align the anonhash_lock and anonpages_hash_lock arrays
209 * to a 64B boundary to avoid false sharing. We add 63B to our
210 * allocation so that we can get a 64B aligned address to use.
211 * We allocate both of these together to avoid wasting an additional
212 * 63B.
213 */
214 tmp = kmem_zalloc((2 * AH_LOCK_SIZE * sizeof (pad_mutex_t)) + 63,
215 KM_SLEEP);
216 anonhash_lock = (pad_mutex_t *)P2ROUNDUP((uintptr_t)tmp, 64);
217 anonpages_hash_lock = anonhash_lock + AH_LOCK_SIZE;
218
219 for (i = 0; i < AH_LOCK_SIZE; i++) {
220 mutex_init(&anonhash_lock[i].pad_mutex, NULL, MUTEX_DEFAULT,
221 NULL);
222 mutex_init(&anonpages_hash_lock[i].pad_mutex, NULL,
223 MUTEX_DEFAULT, NULL);
224 }
225
226 for (i = 0; i < ANON_LOCKSIZE; i++) {
227 mutex_init(&anon_array_lock[i].pad_mutex, NULL,
228 MUTEX_DEFAULT, NULL);
229 cv_init(&anon_array_cv[i], NULL, CV_DEFAULT, NULL);
230 }
231
232 anon_hash = (struct anon **)
233 kmem_zalloc(sizeof (struct anon *) * anon_hash_size, KM_SLEEP);
234 anon_cache = kmem_cache_create("anon_cache", sizeof (struct anon),
235 AN_CACHE_ALIGN, NULL, NULL, NULL, NULL, NULL, KMC_PREFILL);
236 anonmap_cache = kmem_cache_create("anonmap_cache",
237 sizeof (struct anon_map), 0,
238 anonmap_cache_constructor, anonmap_cache_destructor, NULL,
239 NULL, NULL, 0);
240 swap_maxcontig = (1024 * 1024) >> PAGESHIFT; /* 1MB of pages */
241
242 tmp = kmem_zalloc((ANI_MAX_POOL * sizeof (ani_free_t)) + 63, KM_SLEEP);
243 /* Round ani_free_pool to cacheline boundary to avoid false sharing. */
244 ani_free_pool = (ani_free_t *)P2ROUNDUP((uintptr_t)tmp, 64);
245
246 anon_vp = vn_alloc(KM_SLEEP);
247 vn_setops(anon_vp, swap_vnodeops);
248 anon_vp->v_type = VREG;
249 anon_vp->v_flag |= (VISSWAP|VISSWAPFS);
250 }
251
252 /*
253 * Global anon slot hash table manipulation.
254 */
255
256 static void
anon_addhash(struct anon * ap)257 anon_addhash(struct anon *ap)
258 {
259 int index;
260
261 ASSERT(MUTEX_HELD(AH_MUTEX(ap->an_vp, ap->an_off)));
262 index = ANON_HASH(ap->an_vp, ap->an_off);
263 ap->an_hash = anon_hash[index];
264 anon_hash[index] = ap;
265 }
266
267 static void
anon_rmhash(struct anon * ap)268 anon_rmhash(struct anon *ap)
269 {
270 struct anon **app;
271
272 ASSERT(MUTEX_HELD(AH_MUTEX(ap->an_vp, ap->an_off)));
273
274 for (app = &anon_hash[ANON_HASH(ap->an_vp, ap->an_off)];
275 *app; app = &((*app)->an_hash)) {
276 if (*app == ap) {
277 *app = ap->an_hash;
278 break;
279 }
280 }
281 }
282
283 /*
284 * The anon array interfaces. Functions allocating,
285 * freeing array of pointers, and returning/setting
286 * entries in the array of pointers for a given offset.
287 *
288 * Create the list of pointers
289 */
290 struct anon_hdr *
anon_create(pgcnt_t npages,int flags)291 anon_create(pgcnt_t npages, int flags)
292 {
293 struct anon_hdr *ahp;
294 ulong_t nchunks;
295 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
296
297 if ((ahp = kmem_zalloc(sizeof (struct anon_hdr), kmemflags)) == NULL) {
298 return (NULL);
299 }
300
301 mutex_init(&ahp->serial_lock, NULL, MUTEX_DEFAULT, NULL);
302 /*
303 * Single level case.
304 */
305 ahp->size = npages;
306 if (npages <= ANON_CHUNK_SIZE || (flags & ANON_ALLOC_FORCE)) {
307
308 if (flags & ANON_ALLOC_FORCE)
309 ahp->flags |= ANON_ALLOC_FORCE;
310
311 ahp->array_chunk = kmem_zalloc(
312 ahp->size * sizeof (struct anon *), kmemflags);
313
314 if (ahp->array_chunk == NULL) {
315 kmem_free(ahp, sizeof (struct anon_hdr));
316 return (NULL);
317 }
318 } else {
319 /*
320 * 2 Level case.
321 * anon hdr size needs to be rounded off to be a multiple
322 * of ANON_CHUNK_SIZE. This is important as various anon
323 * related functions depend on this.
324 * NOTE -
325 * anon_grow() makes anon hdr size a multiple of
326 * ANON_CHUNK_SIZE.
327 * amp size is <= anon hdr size.
328 * anon_index + seg_pgs <= anon hdr size.
329 */
330 ahp->size = P2ROUNDUP(npages, ANON_CHUNK_SIZE);
331 nchunks = ahp->size >> ANON_CHUNK_SHIFT;
332
333 ahp->array_chunk = kmem_zalloc(nchunks * sizeof (ulong_t *),
334 kmemflags);
335
336 if (ahp->array_chunk == NULL) {
337 kmem_free(ahp, sizeof (struct anon_hdr));
338 return (NULL);
339 }
340 }
341 return (ahp);
342 }
343
344 /*
345 * Free the array of pointers
346 */
347 void
anon_release(struct anon_hdr * ahp,pgcnt_t npages)348 anon_release(struct anon_hdr *ahp, pgcnt_t npages)
349 {
350 ulong_t i;
351 void **ppp;
352 ulong_t nchunks;
353
354 ASSERT(npages <= ahp->size);
355
356 /*
357 * Single level case.
358 */
359 if (npages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) {
360 kmem_free(ahp->array_chunk, ahp->size * sizeof (struct anon *));
361 } else {
362 /*
363 * 2 level case.
364 */
365 nchunks = ahp->size >> ANON_CHUNK_SHIFT;
366 for (i = 0; i < nchunks; i++) {
367 ppp = &ahp->array_chunk[i];
368 if (*ppp != NULL)
369 kmem_free(*ppp, PAGESIZE);
370 }
371 kmem_free(ahp->array_chunk, nchunks * sizeof (ulong_t *));
372 }
373 mutex_destroy(&ahp->serial_lock);
374 kmem_free(ahp, sizeof (struct anon_hdr));
375 }
376
377 /*
378 * Return the pointer from the list for a
379 * specified anon index.
380 */
381 struct anon *
anon_get_ptr(struct anon_hdr * ahp,ulong_t an_idx)382 anon_get_ptr(struct anon_hdr *ahp, ulong_t an_idx)
383 {
384 struct anon **app;
385
386 ASSERT(an_idx < ahp->size);
387
388 /*
389 * Single level case.
390 */
391 if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) {
392 return ((struct anon *)
393 ((uintptr_t)ahp->array_chunk[an_idx] & ANON_PTRMASK));
394 } else {
395
396 /*
397 * 2 level case.
398 */
399 app = ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
400 if (app) {
401 return ((struct anon *)
402 ((uintptr_t)app[an_idx & ANON_CHUNK_OFF] &
403 ANON_PTRMASK));
404 } else {
405 return (NULL);
406 }
407 }
408 }
409
410 /*
411 * Return the anon pointer for the first valid entry in the anon list,
412 * starting from the given index.
413 */
414 struct anon *
anon_get_next_ptr(struct anon_hdr * ahp,ulong_t * index)415 anon_get_next_ptr(struct anon_hdr *ahp, ulong_t *index)
416 {
417 struct anon *ap;
418 struct anon **app;
419 ulong_t chunkoff;
420 ulong_t i;
421 ulong_t j;
422 pgcnt_t size;
423
424 i = *index;
425 size = ahp->size;
426
427 ASSERT(i < size);
428
429 if ((size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) {
430 /*
431 * 1 level case
432 */
433 while (i < size) {
434 ap = (struct anon *)
435 ((uintptr_t)ahp->array_chunk[i] & ANON_PTRMASK);
436 if (ap) {
437 *index = i;
438 return (ap);
439 }
440 i++;
441 }
442 } else {
443 /*
444 * 2 level case
445 */
446 chunkoff = i & ANON_CHUNK_OFF;
447 while (i < size) {
448 app = ahp->array_chunk[i >> ANON_CHUNK_SHIFT];
449 if (app)
450 for (j = chunkoff; j < ANON_CHUNK_SIZE; j++) {
451 ap = (struct anon *)
452 ((uintptr_t)app[j] & ANON_PTRMASK);
453 if (ap) {
454 *index = i + (j - chunkoff);
455 return (ap);
456 }
457 }
458 chunkoff = 0;
459 i = (i + ANON_CHUNK_SIZE) & ~ANON_CHUNK_OFF;
460 }
461 }
462 *index = size;
463 return (NULL);
464 }
465
466 /*
467 * Set list entry with a given pointer for a specified offset
468 */
469 int
anon_set_ptr(struct anon_hdr * ahp,ulong_t an_idx,struct anon * ap,int flags)470 anon_set_ptr(struct anon_hdr *ahp, ulong_t an_idx, struct anon *ap, int flags)
471 {
472 void **ppp;
473 struct anon **app;
474 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
475 uintptr_t *ap_addr;
476
477 ASSERT(an_idx < ahp->size);
478
479 /*
480 * Single level case.
481 */
482 if (ahp->size <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) {
483 ap_addr = (uintptr_t *)&ahp->array_chunk[an_idx];
484 } else {
485
486 /*
487 * 2 level case.
488 */
489 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
490
491 ASSERT(ppp != NULL);
492 if (*ppp == NULL) {
493 mutex_enter(&ahp->serial_lock);
494 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
495 if (*ppp == NULL) {
496 *ppp = kmem_zalloc(PAGESIZE, kmemflags);
497 if (*ppp == NULL) {
498 mutex_exit(&ahp->serial_lock);
499 return (ENOMEM);
500 }
501 }
502 mutex_exit(&ahp->serial_lock);
503 }
504 app = *ppp;
505 ap_addr = (uintptr_t *)&app[an_idx & ANON_CHUNK_OFF];
506 }
507 *ap_addr = (*ap_addr & ~ANON_PTRMASK) | (uintptr_t)ap;
508 return (0);
509 }
510
511 /*
512 * Copy anon array into a given new anon array
513 */
514 int
anon_copy_ptr(struct anon_hdr * sahp,ulong_t s_idx,struct anon_hdr * dahp,ulong_t d_idx,pgcnt_t npages,int flags)515 anon_copy_ptr(struct anon_hdr *sahp, ulong_t s_idx,
516 struct anon_hdr *dahp, ulong_t d_idx,
517 pgcnt_t npages, int flags)
518 {
519 void **sapp, **dapp;
520 void *ap;
521 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
522
523 ASSERT((s_idx < sahp->size) && (d_idx < dahp->size));
524 ASSERT((npages <= sahp->size) && (npages <= dahp->size));
525
526 /*
527 * Both arrays are 1 level.
528 */
529 if (((sahp->size <= ANON_CHUNK_SIZE) &&
530 (dahp->size <= ANON_CHUNK_SIZE)) ||
531 ((sahp->flags & ANON_ALLOC_FORCE) &&
532 (dahp->flags & ANON_ALLOC_FORCE))) {
533
534 bcopy(&sahp->array_chunk[s_idx], &dahp->array_chunk[d_idx],
535 npages * sizeof (struct anon *));
536 return (0);
537 }
538
539 /*
540 * Both arrays are 2 levels.
541 */
542 if (sahp->size > ANON_CHUNK_SIZE &&
543 dahp->size > ANON_CHUNK_SIZE &&
544 ((sahp->flags & ANON_ALLOC_FORCE) == 0) &&
545 ((dahp->flags & ANON_ALLOC_FORCE) == 0)) {
546
547 ulong_t sapidx, dapidx;
548 ulong_t *sap, *dap;
549 ulong_t chknp;
550
551 while (npages != 0) {
552
553 sapidx = s_idx & ANON_CHUNK_OFF;
554 dapidx = d_idx & ANON_CHUNK_OFF;
555 chknp = ANON_CHUNK_SIZE - MAX(sapidx, dapidx);
556 if (chknp > npages)
557 chknp = npages;
558
559 sapp = &sahp->array_chunk[s_idx >> ANON_CHUNK_SHIFT];
560 if ((sap = *sapp) != NULL) {
561 dapp = &dahp->array_chunk[d_idx
562 >> ANON_CHUNK_SHIFT];
563 if ((dap = *dapp) == NULL) {
564 *dapp = kmem_zalloc(PAGESIZE,
565 kmemflags);
566 if ((dap = *dapp) == NULL)
567 return (ENOMEM);
568 }
569 bcopy((sap + sapidx), (dap + dapidx),
570 chknp << ANON_PTRSHIFT);
571 }
572 s_idx += chknp;
573 d_idx += chknp;
574 npages -= chknp;
575 }
576 return (0);
577 }
578
579 /*
580 * At least one of the arrays is 2 level.
581 */
582 while (npages--) {
583 if ((ap = anon_get_ptr(sahp, s_idx)) != NULL) {
584 ASSERT(!ANON_ISBUSY(anon_get_slot(sahp, s_idx)));
585 if (anon_set_ptr(dahp, d_idx, ap, flags) == ENOMEM)
586 return (ENOMEM);
587 }
588 s_idx++;
589 d_idx++;
590 }
591 return (0);
592 }
593
594
/*
 * ANON_INITBUF is a convenience macro for anon_grow() below. It
 * takes a buffer dst, which is at least as large as buffer src. It
 * does a bcopy from src into dst, and then bzeros the extra bytes
 * of dst. If tail is set, the data in src is tail aligned within
 * dst instead of head aligned.
 *
 * srclen and dstsize are byte counts.  The body is wrapped in
 * do { } while (0) so that the macro expands to a single statement and
 * can safely be used as the unbraced body of an if/else.
 */

#define	ANON_INITBUF(src, srclen, dst, dstsize, tail)			\
	do {								\
		if ((tail)) {						\
			bzero((dst), (dstsize) - (srclen));		\
			bcopy((src),					\
			    (char *)(dst) + (dstsize) - (srclen),	\
			    (srclen));					\
		} else {						\
			bcopy((src), (dst), (srclen));			\
			bzero((char *)(dst) + (srclen),			\
			    (dstsize) - (srclen));			\
		}							\
	/* CONSTCOND */							\
	} while (0)

#define	ANON_1_LEVEL_INC	(ANON_CHUNK_SIZE / 8)
#define	ANON_2_LEVEL_INC	(ANON_1_LEVEL_INC * ANON_CHUNK_SIZE)
614
615 /*
616 * anon_grow() is used to efficiently extend an existing anon array.
617 * startidx_p points to the index into the anon array of the first page
618 * that is in use. oldseg_pgs is the number of pages in use, starting at
619 * *startidx_p. newpages is the number of additional pages desired.
620 *
621 * If startidx_p == NULL, startidx is taken to be 0 and cannot be changed.
622 *
623 * The growth is done by creating a new top level of the anon array,
624 * and (if the array is 2-level) reusing the existing second level arrays.
625 *
626 * flags can be used to specify ANON_NOSLEEP and ANON_GROWDOWN.
627 *
628 * Returns the new number of pages in the anon array.
629 */
630 pgcnt_t
anon_grow(struct anon_hdr * ahp,ulong_t * startidx_p,pgcnt_t oldseg_pgs,pgcnt_t newseg_pgs,int flags)631 anon_grow(struct anon_hdr *ahp, ulong_t *startidx_p, pgcnt_t oldseg_pgs,
632 pgcnt_t newseg_pgs, int flags)
633 {
634 ulong_t startidx = startidx_p ? *startidx_p : 0;
635 pgcnt_t oldamp_pgs = ahp->size, newamp_pgs;
636 pgcnt_t oelems, nelems, totpages;
637 void **level1;
638 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
639 int growdown = (flags & ANON_GROWDOWN);
640 size_t newarrsz, oldarrsz;
641 void *level2;
642
643 ASSERT(!(startidx_p == NULL && growdown));
644 ASSERT(startidx + oldseg_pgs <= ahp->size);
645
646 /*
647 * Determine the total number of pages needed in the new
648 * anon array. If growing down, totpages is all pages from
649 * startidx through the end of the array, plus <newseg_pgs>
650 * pages. If growing up, keep all pages from page 0 through
651 * the last page currently in use, plus <newseg_pgs> pages.
652 */
653 if (growdown)
654 totpages = oldamp_pgs - startidx + newseg_pgs;
655 else
656 totpages = startidx + oldseg_pgs + newseg_pgs;
657
658 /* If the array is already large enough, just return. */
659
660 if (oldamp_pgs >= totpages) {
661 if (growdown)
662 *startidx_p = oldamp_pgs - totpages;
663 return (oldamp_pgs);
664 }
665
666 /*
667 * oldamp_pgs/newamp_pgs are the total numbers of pages represented
668 * by the corresponding arrays.
669 * oelems/nelems are the number of pointers in the top level arrays
670 * which may be either level 1 or level 2.
671 * Will the new anon array be one level or two levels?
672 */
673 if (totpages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) {
674 newamp_pgs = P2ROUNDUP(totpages, ANON_1_LEVEL_INC);
675 oelems = oldamp_pgs;
676 nelems = newamp_pgs;
677 } else {
678 newamp_pgs = P2ROUNDUP(totpages, ANON_2_LEVEL_INC);
679 oelems = (oldamp_pgs + ANON_CHUNK_OFF) >> ANON_CHUNK_SHIFT;
680 nelems = newamp_pgs >> ANON_CHUNK_SHIFT;
681 }
682
683 newarrsz = nelems * sizeof (void *);
684 level1 = kmem_alloc(newarrsz, kmemflags);
685 if (level1 == NULL)
686 return (0);
687
688 /* Are we converting from a one level to a two level anon array? */
689
690 if (newamp_pgs > ANON_CHUNK_SIZE && oldamp_pgs <= ANON_CHUNK_SIZE &&
691 !(ahp->flags & ANON_ALLOC_FORCE)) {
692
693 /*
694 * Yes, we're converting to a two level. Reuse old level 1
695 * as new level 2 if it is exactly PAGESIZE. Otherwise
696 * alloc a new level 2 and copy the old level 1 data into it.
697 */
698 if (oldamp_pgs == ANON_CHUNK_SIZE) {
699 level2 = (void *)ahp->array_chunk;
700 } else {
701 level2 = kmem_alloc(PAGESIZE, kmemflags);
702 if (level2 == NULL) {
703 kmem_free(level1, newarrsz);
704 return (0);
705 }
706 oldarrsz = oldamp_pgs * sizeof (void *);
707
708 ANON_INITBUF(ahp->array_chunk, oldarrsz,
709 level2, PAGESIZE, growdown);
710 kmem_free(ahp->array_chunk, oldarrsz);
711 }
712 bzero(level1, newarrsz);
713 if (growdown)
714 level1[nelems - 1] = level2;
715 else
716 level1[0] = level2;
717 } else {
718 oldarrsz = oelems * sizeof (void *);
719
720 ANON_INITBUF(ahp->array_chunk, oldarrsz,
721 level1, newarrsz, growdown);
722 kmem_free(ahp->array_chunk, oldarrsz);
723 }
724
725 ahp->array_chunk = level1;
726 ahp->size = newamp_pgs;
727 if (growdown)
728 *startidx_p = newamp_pgs - totpages;
729
730 return (newamp_pgs);
731 }
732
733
734 /*
735 * Called to sync ani_free value.
736 */
737
738 void
set_anoninfo(void)739 set_anoninfo(void)
740 {
741 processorid_t ix, max_seqid;
742 pgcnt_t total = 0;
743 static clock_t last_time;
744 clock_t new_time;
745
746 if (ani_free_pool == NULL)
747 return;
748
749 /*
750 * Recompute ani_free at most once per tick. Use max_cpu_seqid_ever to
751 * identify the maximum number of CPUs were ever online.
752 */
753 new_time = ddi_get_lbolt();
754 if (new_time > last_time) {
755
756 max_seqid = max_cpu_seqid_ever;
757 ASSERT(ANI_MAX_POOL > max_seqid);
758 for (ix = 0; ix <= max_seqid; ix++)
759 total += ani_free_pool[ix].ani_count;
760
761 last_time = new_time;
762 k_anoninfo.ani_free = total;
763 }
764 }
765
766 /*
767 * Reserve anon space.
768 *
769 * It's no longer simply a matter of incrementing ani_resv to
770 * reserve swap space, we need to check memory-based as well
771 * as disk-backed (physical) swap. The following algorithm
772 * is used:
773 * Check the space on physical swap
774 * i.e. amount needed < ani_max - ani_phys_resv
775 * If we are swapping on swapfs check
776 * amount needed < (availrmem - swapfs_minfree)
777 * Since the algorithm to check for the quantity of swap space is
778 * almost the same as that for reserving it, we'll just use anon_resvmem
779 * with a flag to decrement availrmem.
780 *
781 * Return non-zero on success.
782 */
783 int
anon_resvmem(size_t size,boolean_t takemem,zone_t * zone,int tryhard)784 anon_resvmem(size_t size, boolean_t takemem, zone_t *zone, int tryhard)
785 {
786 pgcnt_t npages = btopr(size);
787 pgcnt_t mswap_pages = 0;
788 pgcnt_t pswap_pages = 0;
789 proc_t *p = curproc;
790
791 if (zone != NULL && takemem) {
792 /* test zone.max-swap resource control */
793 mutex_enter(&p->p_lock);
794 if (rctl_incr_swap(p, zone, ptob(npages)) != 0) {
795 mutex_exit(&p->p_lock);
796 return (0);
797 }
798 mutex_exit(&p->p_lock);
799 }
800 mutex_enter(&anoninfo_lock);
801
802 /*
803 * pswap_pages is the number of pages we can take from
804 * physical (i.e. disk-backed) swap.
805 */
806 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
807 pswap_pages = k_anoninfo.ani_max - k_anoninfo.ani_phys_resv;
808
809 ANON_PRINT(A_RESV,
810 ("anon_resvmem: npages %lu takemem %u pswap %lu caller %p\n",
811 npages, takemem, pswap_pages, (void *)caller()));
812
813 if (npages <= pswap_pages) {
814 /*
815 * we have enough space on a physical swap
816 */
817 if (takemem)
818 k_anoninfo.ani_phys_resv += npages;
819 mutex_exit(&anoninfo_lock);
820 return (1);
821 } else if (pswap_pages != 0) {
822 /*
823 * we have some space on a physical swap
824 */
825 if (takemem) {
826 /*
827 * use up remainder of phys swap
828 */
829 k_anoninfo.ani_phys_resv += pswap_pages;
830 ASSERT(k_anoninfo.ani_phys_resv == k_anoninfo.ani_max);
831 }
832 }
833 /*
834 * since (npages > pswap_pages) we need mem swap
835 * mswap_pages is the number of pages needed from availrmem
836 */
837 ASSERT(npages > pswap_pages);
838 mswap_pages = npages - pswap_pages;
839
840 ANON_PRINT(A_RESV, ("anon_resvmem: need %ld pages from memory\n",
841 mswap_pages));
842
843 /*
844 * priv processes can reserve memory as swap as long as availrmem
845 * remains greater than swapfs_minfree; in the case of non-priv
846 * processes, memory can be reserved as swap only if availrmem
847 * doesn't fall below (swapfs_minfree + swapfs_reserve). Thus,
848 * swapfs_reserve amount of memswap is not available to non-priv
849 * processes. This protects daemons such as automounter dying
850 * as a result of application processes eating away almost entire
851 * membased swap. This safeguard becomes useless if apps are run
852 * with root access.
853 *
854 * swapfs_reserve is minimum of 4Mb or 1/16 of physmem.
855 *
856 */
857 if (tryhard) {
858 pgcnt_t floor_pages;
859
860 if (secpolicy_resource_anon_mem(CRED())) {
861 floor_pages = swapfs_minfree;
862 } else {
863 floor_pages = swapfs_minfree + swapfs_reserve;
864 }
865
866 mutex_exit(&anoninfo_lock);
867 (void) page_reclaim_mem(mswap_pages, floor_pages, 0);
868 mutex_enter(&anoninfo_lock);
869 }
870
871 mutex_enter(&freemem_lock);
872 if (availrmem > (swapfs_minfree + swapfs_reserve + mswap_pages) ||
873 (availrmem > (swapfs_minfree + mswap_pages) &&
874 secpolicy_resource(CRED()) == 0)) {
875
876 if (takemem) {
877 /*
878 * Take the memory from the rest of the system.
879 */
880 availrmem -= mswap_pages;
881 mutex_exit(&freemem_lock);
882 k_anoninfo.ani_mem_resv += mswap_pages;
883 ANI_ADD(mswap_pages);
884 ANON_PRINT((A_RESV | A_MRESV),
885 ("anon_resvmem: took %ld pages of availrmem\n",
886 mswap_pages));
887 } else {
888 mutex_exit(&freemem_lock);
889 }
890
891 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
892 mutex_exit(&anoninfo_lock);
893 return (1);
894 } else {
895 /*
896 * Fail if not enough memory
897 */
898 if (takemem) {
899 k_anoninfo.ani_phys_resv -= pswap_pages;
900 }
901
902 mutex_exit(&freemem_lock);
903 mutex_exit(&anoninfo_lock);
904 ANON_PRINT(A_RESV,
905 ("anon_resvmem: not enough space from swapfs\n"));
906 if (zone != NULL && takemem)
907 rctl_decr_swap(zone, ptob(npages));
908 return (0);
909 }
910 }
911
912 /*
913 * Give back an anon reservation.
914 */
915 void
anon_unresvmem(size_t size,zone_t * zone)916 anon_unresvmem(size_t size, zone_t *zone)
917 {
918 pgcnt_t npages = btopr(size);
919 spgcnt_t mem_free_pages = 0;
920 pgcnt_t phys_free_slots;
921 #ifdef ANON_DEBUG
922 pgcnt_t mem_resv;
923 #endif
924 if (zone != NULL)
925 rctl_decr_swap(zone, ptob(npages));
926
927 mutex_enter(&anoninfo_lock);
928
929 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
930
931 /*
932 * If some of this reservation belonged to swapfs
933 * give it back to availrmem.
934 * ani_mem_resv is the amount of availrmem swapfs has reserved.
935 * but some of that memory could be locked by segspt so we can only
936 * return non locked ani_mem_resv back to availrmem
937 */
938 if (k_anoninfo.ani_mem_resv > k_anoninfo.ani_locked_swap) {
939 ANON_PRINT((A_RESV | A_MRESV),
940 ("anon_unresv: growing availrmem by %ld pages\n",
941 MIN(k_anoninfo.ani_mem_resv, npages)));
942
943 mem_free_pages = MIN((spgcnt_t)(k_anoninfo.ani_mem_resv -
944 k_anoninfo.ani_locked_swap), npages);
945 mutex_enter(&freemem_lock);
946 availrmem += mem_free_pages;
947 mutex_exit(&freemem_lock);
948 k_anoninfo.ani_mem_resv -= mem_free_pages;
949
950 ANI_ADD(-mem_free_pages);
951 }
952 /*
953 * The remainder of the pages is returned to phys swap
954 */
955 ASSERT(npages >= mem_free_pages);
956 phys_free_slots = npages - mem_free_pages;
957
958 if (phys_free_slots) {
959 k_anoninfo.ani_phys_resv -= phys_free_slots;
960 }
961
962 #ifdef ANON_DEBUG
963 mem_resv = k_anoninfo.ani_mem_resv;
964 #endif
965
966 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
967 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
968
969 mutex_exit(&anoninfo_lock);
970
971 ANON_PRINT(A_RESV, ("anon_unresv: %lu, tot %lu, caller %p\n",
972 npages, mem_resv, (void *)caller()));
973 }
974
975 /*
976 * Allocate an anon slot and return it with the lock held.
977 */
978 struct anon *
anon_alloc(struct vnode * vp,anoff_t off)979 anon_alloc(struct vnode *vp, anoff_t off)
980 {
981 struct anon *ap;
982 kmutex_t *ahm;
983
984 ap = kmem_cache_alloc(anon_cache, KM_SLEEP);
985 if (vp == NULL) {
986 swap_alloc(ap);
987 } else {
988 ap->an_vp = vp;
989 ap->an_off = off;
990 }
991 ap->an_refcnt = 1;
992 ap->an_pvp = NULL;
993 ap->an_poff = 0;
994 ahm = AH_MUTEX(ap->an_vp, ap->an_off);
995 mutex_enter(ahm);
996 anon_addhash(ap);
997 mutex_exit(ahm);
998 ANI_ADD(-1);
999 ANON_PRINT(A_ANON, ("anon_alloc: returning ap %p, vp %p\n",
1000 (void *)ap, (ap ? (void *)ap->an_vp : NULL)));
1001 return (ap);
1002 }
1003
1004 /*
1005 * Called for pages locked in memory via softlock/pagelock/mlock to make sure
1006 * such pages don't consume any physical swap resources needed for swapping
1007 * unlocked pages.
1008 */
1009 void
anon_swap_free(struct anon * ap,page_t * pp)1010 anon_swap_free(struct anon *ap, page_t *pp)
1011 {
1012 kmutex_t *ahm;
1013
1014 ASSERT(ap != NULL);
1015 ASSERT(pp != NULL);
1016 ASSERT(PAGE_LOCKED(pp));
1017 ASSERT(pp->p_vnode != NULL);
1018 ASSERT(IS_SWAPFSVP(pp->p_vnode));
1019 ASSERT(ap->an_refcnt != 0);
1020 ASSERT(pp->p_vnode == ap->an_vp);
1021 ASSERT(pp->p_offset == ap->an_off);
1022
1023 if (ap->an_pvp == NULL)
1024 return;
1025
1026 page_io_lock(pp);
1027 ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1028 mutex_enter(ahm);
1029
1030 ASSERT(ap->an_refcnt != 0);
1031 ASSERT(pp->p_vnode == ap->an_vp);
1032 ASSERT(pp->p_offset == ap->an_off);
1033
1034 if (ap->an_pvp != NULL) {
1035 swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE);
1036 ap->an_pvp = NULL;
1037 ap->an_poff = 0;
1038 mutex_exit(ahm);
1039 hat_setmod(pp);
1040 } else {
1041 mutex_exit(ahm);
1042 }
1043 page_io_unlock(pp);
1044 }
1045
1046 /*
1047 * Decrement the reference count of an anon page.
1048 * If reference count goes to zero, free it and
1049 * its associated page (if any).
1050 */
1051 void
anon_decref(struct anon * ap)1052 anon_decref(struct anon *ap)
1053 {
1054 page_t *pp;
1055 struct vnode *vp;
1056 anoff_t off;
1057 kmutex_t *ahm;
1058
1059 ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1060 mutex_enter(ahm);
1061 ASSERT(ap->an_refcnt != 0);
1062 if (ap->an_refcnt == 0)
1063 panic("anon_decref: slot count 0");
1064 if (--ap->an_refcnt == 0) {
1065 swap_xlate(ap, &vp, &off);
1066 anon_rmhash(ap);
1067 if (ap->an_pvp != NULL)
1068 swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE);
1069 mutex_exit(ahm);
1070
1071 /*
1072 * If there is a page for this anon slot we will need to
1073 * call VN_DISPOSE to get rid of the vp association and
1074 * put the page back on the free list as really free.
1075 * Acquire the "exclusive" lock to ensure that any
1076 * pending i/o always completes before the swap slot
1077 * is freed.
1078 */
1079 pp = page_lookup(vp, (u_offset_t)off, SE_EXCL);
1080 if (pp != NULL) {
1081 /*LINTED: constant in conditional context */
1082 VN_DISPOSE(pp, B_INVAL, 0, kcred);
1083 }
1084 ANON_PRINT(A_ANON, ("anon_decref: free ap %p, vp %p\n",
1085 (void *)ap, (void *)ap->an_vp));
1086
1087 kmem_cache_free(anon_cache, ap);
1088
1089 ANI_ADD(1);
1090 } else {
1091 mutex_exit(ahm);
1092 }
1093 }
1094
1095
1096 /*
1097 * check an_refcnt of the root anon slot (anon_index argument is aligned at
1098 * seg->s_szc level) to determine whether COW processing is required.
1099 * anonpages_hash_lock[] held on the root ap ensures that if root's
1100 * refcnt is 1 all other refcnt's are 1 as well (and they can't increase
1101 * later since this process can't fork while its AS lock is held).
1102 *
1103 * returns 1 if the root anon slot has a refcnt > 1 otherwise returns 0.
1104 */
1105 int
anon_szcshare(struct anon_hdr * ahp,ulong_t anon_index)1106 anon_szcshare(struct anon_hdr *ahp, ulong_t anon_index)
1107 {
1108 struct anon *ap;
1109 kmutex_t *ahmpages = NULL;
1110
1111 ap = anon_get_ptr(ahp, anon_index);
1112 if (ap == NULL)
1113 return (0);
1114
1115 ahmpages = APH_MUTEX(ap->an_vp, ap->an_off);
1116 mutex_enter(ahmpages);
1117 ASSERT(ap->an_refcnt >= 1);
1118 if (ap->an_refcnt == 1) {
1119 mutex_exit(ahmpages);
1120 return (0);
1121 }
1122 mutex_exit(ahmpages);
1123 return (1);
1124 }
1125 /*
1126 * Check 'nslots' anon slots for refcnt > 1.
1127 *
1128 * returns 1 if any of the 'nslots' anon slots has a refcnt > 1 otherwise
1129 * returns 0.
1130 */
1131 static int
anon_share(struct anon_hdr * ahp,ulong_t anon_index,pgcnt_t nslots)1132 anon_share(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots)
1133 {
1134 struct anon *ap;
1135
1136 while (nslots-- > 0) {
1137 if ((ap = anon_get_ptr(ahp, anon_index)) != NULL &&
1138 ap->an_refcnt > 1)
1139 return (1);
1140 anon_index++;
1141 }
1142
1143 return (0);
1144 }
1145
/*
 * Drop one reference from every anon slot of the large page region of ahp
 * that starts at an_idx, and clear those slots in the anon array.  The
 * region spans page_get_pagecnt(szc) slots, clipped to the end of the array.
 *
 * If the root slot's refcnt is greater than 1, the root's APH_MUTEX lock is
 * held across the whole loop and each slot's refcnt is simply decremented.
 * Otherwise every slot's refcnt drops to zero: the anon structure is removed
 * from the hash, its physical swap (if any) is released, it is freed back to
 * anon_cache, and the page(s) found for it are disposed of or destroyed.
 */
static void
anon_decref_pages(
	struct anon_hdr *ahp,
	ulong_t an_idx,
	uint_t szc)
{
	struct anon *ap = anon_get_ptr(ahp, an_idx);
	kmutex_t *ahmpages = NULL;
	page_t *pp;
	pgcnt_t pgcnt = page_get_pagecnt(szc);
	pgcnt_t i;
	struct vnode *vp;
	anoff_t off;
	kmutex_t *ahm;
#ifdef DEBUG
	/* Only referenced from ASSERT()s: the refcnt every slot should have. */
	int refcnt = 1;
#endif

	ASSERT(szc != 0);
	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
	ASSERT(IS_P2ALIGNED(an_idx, pgcnt));
	ASSERT(an_idx < ahp->size);

	if (ahp->size - an_idx < pgcnt) {
		/*
		 * In case of shared mappings total anon map size may not be
		 * the largest page size aligned.
		 */
		pgcnt = ahp->size - an_idx;
	}

	VM_STAT_ADD(anonvmstats.decrefpages[0]);

	/*
	 * Take the root slot's APH_MUTEX lock to sample its refcnt.  The lock
	 * is kept (ahmpages != NULL) only when the region is shared, i.e. the
	 * root refcnt is greater than 1; otherwise it is dropped right away.
	 */
	if (ap != NULL) {
		ahmpages = APH_MUTEX(ap->an_vp, ap->an_off);
		mutex_enter(ahmpages);
		ASSERT((refcnt = ap->an_refcnt) != 0);
		VM_STAT_ADD(anonvmstats.decrefpages[1]);
		if (ap->an_refcnt == 1) {
			VM_STAT_ADD(anonvmstats.decrefpages[2]);
			ASSERT(!anon_share(ahp, an_idx, pgcnt));
			mutex_exit(ahmpages);
			ahmpages = NULL;
		}
	}

	i = 0;
	while (i < pgcnt) {
		/* Empty slots are only expected in the unshared case. */
		if ((ap = anon_get_ptr(ahp, an_idx + i)) == NULL) {
			ASSERT(refcnt == 1 && ahmpages == NULL);
			i++;
			continue;
		}
		ASSERT(ap->an_refcnt == refcnt);
		ASSERT(ahmpages != NULL || ap->an_refcnt == 1);
		ASSERT(ahmpages == NULL || ap->an_refcnt > 1);

		if (ahmpages == NULL) {
			/* Unshared: this decrement frees the slot. */
			swap_xlate(ap, &vp, &off);
			pp = page_lookup(vp, (u_offset_t)off, SE_EXCL);
			if (pp == NULL || pp->p_szc == 0) {
				/*
				 * No page, or a base (szc 0) page: free this
				 * single slot on its own.
				 */
				VM_STAT_ADD(anonvmstats.decrefpages[3]);
				ahm = AH_MUTEX(ap->an_vp, ap->an_off);
				(void) anon_set_ptr(ahp, an_idx + i, NULL,
				    ANON_SLEEP);
				mutex_enter(ahm);
				ap->an_refcnt--;
				ASSERT(ap->an_refcnt == 0);
				anon_rmhash(ap);
				if (ap->an_pvp)
					swap_phys_free(ap->an_pvp, ap->an_poff,
					    PAGESIZE);
				mutex_exit(ahm);
				/*
				 * The first lookup found nothing; look again
				 * now that the slot has been unhashed.
				 */
				if (pp == NULL) {
					pp = page_lookup(vp, (u_offset_t)off,
					    SE_EXCL);
					ASSERT(pp == NULL || pp->p_szc == 0);
				}
				if (pp != NULL) {
					VM_STAT_ADD(anonvmstats.decrefpages[4]);
					/*LINTED*/
					VN_DISPOSE(pp, B_INVAL, 0, kcred);
				}
				kmem_cache_free(anon_cache, ap);
				ANI_ADD(1);
				i++;
			} else {
				/*
				 * pp is the first constituent of a large page
				 * of curpgcnt pages; handle all of its slots
				 * together.  ppa[] collects the constituent
				 * pages, each held SE_EXCL.
				 */
				pgcnt_t j;
				pgcnt_t curpgcnt =
				    page_get_pagecnt(pp->p_szc);
				size_t ppasize = curpgcnt * sizeof (page_t *);
				page_t **ppa = kmem_alloc(ppasize, KM_SLEEP);
				int dispose = 0;

				VM_STAT_ADD(anonvmstats.decrefpages[5]);

				ASSERT(pp->p_szc <= szc);
				ASSERT(IS_P2ALIGNED(curpgcnt, curpgcnt));
				ASSERT(IS_P2ALIGNED(i, curpgcnt));
				ASSERT(i + curpgcnt <= pgcnt);
				ASSERT(!(page_pptonum(pp) & (curpgcnt - 1)));
				ppa[0] = pp;
				/*
				 * Lock the remaining constituent pages and
				 * unload their translations.  dispose is set
				 * when a slot has physical swap on a vnode
				 * for which vn_matchopval() reports that the
				 * dispose operation is not fs_dispose.
				 */
				for (j = i + 1; j < i + curpgcnt; j++) {
					ap = anon_get_ptr(ahp, an_idx + j);
					ASSERT(ap != NULL &&
					    ap->an_refcnt == 1);
					swap_xlate(ap, &vp, &off);
					pp = page_lookup(vp, (u_offset_t)off,
					    SE_EXCL);
					if (pp == NULL)
						panic("anon_decref_pages: "
						    "no page");

					(void) hat_pageunload(pp,
					    HAT_FORCE_PGUNLOAD);
					ASSERT(pp->p_szc == ppa[0]->p_szc);
					ASSERT(page_pptonum(pp) - 1 ==
					    page_pptonum(ppa[j - i - 1]));
					ppa[j - i] = pp;
					if (ap->an_pvp != NULL &&
					    !vn_matchopval(ap->an_pvp,
					    VOPNAME_DISPOSE,
					    (fs_generic_func_p)fs_dispose))
						dispose = 1;
				}
				/*
				 * Clear and free every anon slot covered by
				 * the large page.
				 */
				for (j = i; j < i + curpgcnt; j++) {
					ap = anon_get_ptr(ahp, an_idx + j);
					ASSERT(ap != NULL &&
					    ap->an_refcnt == 1);
					ahm = AH_MUTEX(ap->an_vp, ap->an_off);
					(void) anon_set_ptr(ahp, an_idx + j,
					    NULL, ANON_SLEEP);
					mutex_enter(ahm);
					ap->an_refcnt--;
					ASSERT(ap->an_refcnt == 0);
					anon_rmhash(ap);
					if (ap->an_pvp)
						swap_phys_free(ap->an_pvp,
						    ap->an_poff, PAGESIZE);
					mutex_exit(ahm);
					kmem_cache_free(anon_cache, ap);
					ANI_ADD(1);
				}
				/*
				 * Either destroy the large page as a unit, or
				 * reset each constituent to szc 0 and dispose
				 * of the pages one by one.
				 */
				if (!dispose) {
					VM_STAT_ADD(anonvmstats.decrefpages[6]);
					page_destroy_pages(ppa[0]);
				} else {
					VM_STAT_ADD(anonvmstats.decrefpages[7]);
					for (j = 0; j < curpgcnt; j++) {
						ASSERT(PAGE_EXCL(ppa[j]));
						ppa[j]->p_szc = 0;
					}
					for (j = 0; j < curpgcnt; j++) {
						ASSERT(!hat_page_is_mapped(
						    ppa[j]));
						/*LINTED*/
						VN_DISPOSE(ppa[j], B_INVAL, 0,
						    kcred);
					}
				}
				kmem_free(ppa, ppasize);
				i += curpgcnt;
			}
		} else {
			/*
			 * Shared region: other references remain, so only
			 * clear the array slot and drop this reference.
			 */
			VM_STAT_ADD(anonvmstats.decrefpages[8]);
			(void) anon_set_ptr(ahp, an_idx + i, NULL, ANON_SLEEP);
			ahm = AH_MUTEX(ap->an_vp, ap->an_off);
			mutex_enter(ahm);
			ap->an_refcnt--;
			mutex_exit(ahm);
			i++;
		}
	}

	/* Release the root's APH_MUTEX lock if it was kept for the loop. */
	if (ahmpages != NULL) {
		mutex_exit(ahmpages);
	}
}
1324
1325 /*
1326 * Duplicate references to size bytes worth of anon pages.
1327 * Used when duplicating a segment that contains private anon pages.
1328 * This code assumes that procedure calling this one has already used
1329 * hat_chgprot() to disable write access to the range of addresses that
1330 * that *old actually refers to.
1331 */
1332 void
anon_dup(struct anon_hdr * old,ulong_t old_idx,struct anon_hdr * new,ulong_t new_idx,size_t size)1333 anon_dup(struct anon_hdr *old, ulong_t old_idx, struct anon_hdr *new,
1334 ulong_t new_idx, size_t size)
1335 {
1336 spgcnt_t npages;
1337 kmutex_t *ahm;
1338 struct anon *ap;
1339 ulong_t off;
1340 ulong_t index;
1341
1342 npages = btopr(size);
1343 while (npages > 0) {
1344 index = old_idx;
1345 if ((ap = anon_get_next_ptr(old, &index)) == NULL)
1346 break;
1347
1348 ASSERT(!ANON_ISBUSY(anon_get_slot(old, index)));
1349 off = index - old_idx;
1350 npages -= off;
1351 if (npages <= 0)
1352 break;
1353
1354 (void) anon_set_ptr(new, new_idx + off, ap, ANON_SLEEP);
1355 ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1356
1357 mutex_enter(ahm);
1358 ap->an_refcnt++;
1359 mutex_exit(ahm);
1360
1361 off++;
1362 new_idx += off;
1363 old_idx += off;
1364 npages--;
1365 }
1366 }
1367
1368 /*
1369 * Just like anon_dup but also guarantees there are no holes (unallocated anon
1370 * slots) within any large page region. That means if a large page region is
1371 * empty in the old array it will skip it. If there are 1 or more valid slots
1372 * in the large page region of the old array it will make sure to fill in any
1373 * unallocated ones and also copy them to the new array. If noalloc is 1 large
1374 * page region should either have no valid anon slots or all slots should be
1375 * valid.
1376 */
1377 void
anon_dup_fill_holes(struct anon_hdr * old,ulong_t old_idx,struct anon_hdr * new,ulong_t new_idx,size_t size,uint_t szc,int noalloc)1378 anon_dup_fill_holes(
1379 struct anon_hdr *old,
1380 ulong_t old_idx,
1381 struct anon_hdr *new,
1382 ulong_t new_idx,
1383 size_t size,
1384 uint_t szc,
1385 int noalloc)
1386 {
1387 struct anon *ap;
1388 spgcnt_t npages;
1389 kmutex_t *ahm, *ahmpages = NULL;
1390 pgcnt_t pgcnt, i;
1391 ulong_t index, off;
1392 #ifdef DEBUG
1393 int refcnt;
1394 #endif
1395
1396 ASSERT(szc != 0);
1397 pgcnt = page_get_pagecnt(szc);
1398 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
1399 npages = btopr(size);
1400 ASSERT(IS_P2ALIGNED(npages, pgcnt));
1401 ASSERT(IS_P2ALIGNED(old_idx, pgcnt));
1402
1403 VM_STAT_ADD(anonvmstats.dupfillholes[0]);
1404
1405 while (npages > 0) {
1406 index = old_idx;
1407
1408 /*
1409 * Find the next valid slot.
1410 */
1411 if (anon_get_next_ptr(old, &index) == NULL)
1412 break;
1413
1414 ASSERT(!ANON_ISBUSY(anon_get_slot(old, index)));
1415 /*
1416 * Now backup index to the beginning of the
1417 * current large page region of the old array.
1418 */
1419 index = P2ALIGN(index, pgcnt);
1420 off = index - old_idx;
1421 ASSERT(IS_P2ALIGNED(off, pgcnt));
1422 npages -= off;
1423 if (npages <= 0)
1424 break;
1425
1426 /*
1427 * Fill and copy a large page regions worth
1428 * of anon slots.
1429 */
1430 for (i = 0; i < pgcnt; i++) {
1431 if ((ap = anon_get_ptr(old, index + i)) == NULL) {
1432 if (noalloc) {
1433 panic("anon_dup_fill_holes: "
1434 "empty anon slot\n");
1435 }
1436 VM_STAT_ADD(anonvmstats.dupfillholes[1]);
1437 ap = anon_alloc(NULL, 0);
1438 (void) anon_set_ptr(old, index + i, ap,
1439 ANON_SLEEP);
1440 } else if (i == 0) {
1441 /*
1442 * make the increment of all refcnts of all
1443 * anon slots of a large page appear atomic by
1444 * getting an anonpages_hash_lock for the
1445 * first anon slot of a large page.
1446 */
1447 VM_STAT_ADD(anonvmstats.dupfillholes[2]);
1448
1449 ahmpages = APH_MUTEX(ap->an_vp, ap->an_off);
1450 mutex_enter(ahmpages);
1451 /*LINTED*/
1452 ASSERT(refcnt = ap->an_refcnt);
1453
1454 VM_STAT_COND_ADD(ap->an_refcnt > 1,
1455 anonvmstats.dupfillholes[3]);
1456 }
1457 (void) anon_set_ptr(new, new_idx + off + i, ap,
1458 ANON_SLEEP);
1459 ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1460 mutex_enter(ahm);
1461 ASSERT(ahmpages != NULL || ap->an_refcnt == 1);
1462 ASSERT(i == 0 || ahmpages == NULL ||
1463 refcnt == ap->an_refcnt);
1464 ap->an_refcnt++;
1465 mutex_exit(ahm);
1466 }
1467 if (ahmpages != NULL) {
1468 mutex_exit(ahmpages);
1469 ahmpages = NULL;
1470 }
1471 off += pgcnt;
1472 new_idx += off;
1473 old_idx += off;
1474 npages -= pgcnt;
1475 }
1476 }
1477
1478 /*
1479 * Used when a segment with a vnode changes szc. similarly to
1480 * anon_dup_fill_holes() makes sure each large page region either has no anon
1481 * slots or all of them. but new slots are created by COWing the file
1482 * pages. on entrance no anon slots should be shared.
1483 */
1484 int
anon_fill_cow_holes(struct seg * seg,caddr_t addr,struct anon_hdr * ahp,ulong_t an_idx,struct vnode * vp,u_offset_t vp_off,size_t size,uint_t szc,uint_t prot,struct vpage vpage[],struct cred * cred)1485 anon_fill_cow_holes(
1486 struct seg *seg,
1487 caddr_t addr,
1488 struct anon_hdr *ahp,
1489 ulong_t an_idx,
1490 struct vnode *vp,
1491 u_offset_t vp_off,
1492 size_t size,
1493 uint_t szc,
1494 uint_t prot,
1495 struct vpage vpage[],
1496 struct cred *cred)
1497 {
1498 struct anon *ap;
1499 spgcnt_t npages;
1500 pgcnt_t pgcnt, i;
1501 ulong_t index, off;
1502 int err = 0;
1503 int pageflags = 0;
1504
1505 ASSERT(szc != 0);
1506 pgcnt = page_get_pagecnt(szc);
1507 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
1508 npages = btopr(size);
1509 ASSERT(IS_P2ALIGNED(npages, pgcnt));
1510 ASSERT(IS_P2ALIGNED(an_idx, pgcnt));
1511
1512 while (npages > 0) {
1513 index = an_idx;
1514
1515 /*
1516 * Find the next valid slot.
1517 */
1518 if (anon_get_next_ptr(ahp, &index) == NULL) {
1519 break;
1520 }
1521
1522 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index)));
1523 /*
1524 * Now backup index to the beginning of the
1525 * current large page region of the anon array.
1526 */
1527 index = P2ALIGN(index, pgcnt);
1528 off = index - an_idx;
1529 ASSERT(IS_P2ALIGNED(off, pgcnt));
1530 npages -= off;
1531 if (npages <= 0)
1532 break;
1533 an_idx += off;
1534 vp_off += ptob(off);
1535 addr += ptob(off);
1536 if (vpage != NULL) {
1537 vpage += off;
1538 }
1539
1540 for (i = 0; i < pgcnt; i++, an_idx++, vp_off += PAGESIZE) {
1541 if ((ap = anon_get_ptr(ahp, an_idx)) == NULL) {
1542 page_t *pl[1 + 1];
1543 page_t *pp;
1544
1545 err = VOP_GETPAGE(vp, vp_off, PAGESIZE, NULL,
1546 pl, PAGESIZE, seg, addr, S_READ, cred,
1547 NULL);
1548 if (err) {
1549 break;
1550 }
1551 if (vpage != NULL) {
1552 prot = VPP_PROT(vpage);
1553 pageflags = VPP_ISPPLOCK(vpage) ?
1554 LOCK_PAGE : 0;
1555 }
1556 pp = anon_private(&ap, seg, addr, prot, pl[0],
1557 pageflags, cred);
1558 if (pp == NULL) {
1559 err = ENOMEM;
1560 break;
1561 }
1562 (void) anon_set_ptr(ahp, an_idx, ap,
1563 ANON_SLEEP);
1564 page_unlock(pp);
1565 }
1566 ASSERT(ap->an_refcnt == 1);
1567 addr += PAGESIZE;
1568 if (vpage != NULL) {
1569 vpage++;
1570 }
1571 }
1572 npages -= pgcnt;
1573 }
1574
1575 return (err);
1576 }
1577
1578 /*
1579 * Free a group of "size" anon pages, size in bytes,
1580 * and clear out the pointers to the anon entries.
1581 */
1582 void
anon_free(struct anon_hdr * ahp,ulong_t index,size_t size)1583 anon_free(struct anon_hdr *ahp, ulong_t index, size_t size)
1584 {
1585 spgcnt_t npages;
1586 struct anon *ap;
1587 ulong_t old;
1588
1589 npages = btopr(size);
1590
1591 while (npages > 0) {
1592 old = index;
1593 if ((ap = anon_get_next_ptr(ahp, &index)) == NULL)
1594 break;
1595
1596 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index)));
1597 npages -= index - old;
1598 if (npages <= 0)
1599 break;
1600
1601 (void) anon_set_ptr(ahp, index, NULL, ANON_SLEEP);
1602 anon_decref(ap);
1603 /*
1604 * Bump index and decrement page count
1605 */
1606 index++;
1607 npages--;
1608 }
1609 }
1610
1611 void
anon_free_pages(struct anon_hdr * ahp,ulong_t an_idx,size_t size,uint_t szc)1612 anon_free_pages(
1613 struct anon_hdr *ahp,
1614 ulong_t an_idx,
1615 size_t size,
1616 uint_t szc)
1617 {
1618 spgcnt_t npages;
1619 pgcnt_t pgcnt;
1620 ulong_t index, off;
1621
1622 ASSERT(szc != 0);
1623 pgcnt = page_get_pagecnt(szc);
1624 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
1625 npages = btopr(size);
1626 ASSERT(IS_P2ALIGNED(npages, pgcnt));
1627 ASSERT(IS_P2ALIGNED(an_idx, pgcnt));
1628 ASSERT(an_idx < ahp->size);
1629
1630 VM_STAT_ADD(anonvmstats.freepages[0]);
1631
1632 while (npages > 0) {
1633 index = an_idx;
1634
1635 /*
1636 * Find the next valid slot.
1637 */
1638 if (anon_get_next_ptr(ahp, &index) == NULL)
1639 break;
1640
1641 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index)));
1642 /*
1643 * Now backup index to the beginning of the
1644 * current large page region of the old array.
1645 */
1646 index = P2ALIGN(index, pgcnt);
1647 off = index - an_idx;
1648 ASSERT(IS_P2ALIGNED(off, pgcnt));
1649 npages -= off;
1650 if (npages <= 0)
1651 break;
1652
1653 anon_decref_pages(ahp, index, szc);
1654
1655 off += pgcnt;
1656 an_idx += off;
1657 npages -= pgcnt;
1658 }
1659 }
1660
1661 /*
1662 * Make anonymous pages discardable
1663 */
1664 int
anon_disclaim(struct anon_map * amp,ulong_t index,size_t size,uint_t behav,pgcnt_t * purged)1665 anon_disclaim(struct anon_map *amp, ulong_t index, size_t size,
1666 uint_t behav, pgcnt_t *purged)
1667 {
1668 spgcnt_t npages = btopr(size);
1669 struct anon *ap;
1670 struct vnode *vp;
1671 anoff_t off;
1672 page_t *pp, *root_pp;
1673 kmutex_t *ahm;
1674 pgcnt_t pgcnt, npurged = 0;
1675 ulong_t old_idx, idx, i;
1676 struct anon_hdr *ahp = amp->ahp;
1677 anon_sync_obj_t cookie;
1678 int err = 0;
1679
1680 VERIFY(behav == MADV_FREE || behav == MADV_PURGE);
1681 ASSERT(RW_READ_HELD(&->a_rwlock));
1682 pgcnt = 1;
1683 for (; npages > 0; index = (pgcnt == 1) ? index + 1 :
1684 P2ROUNDUP(index + 1, pgcnt), npages -= pgcnt) {
1685
1686 /*
1687 * get anon pointer and index for the first valid entry
1688 * in the anon list, starting from "index"
1689 */
1690 old_idx = index;
1691 if ((ap = anon_get_next_ptr(ahp, &index)) == NULL)
1692 break;
1693
1694 /*
1695 * decrement npages by number of NULL anon slots we skipped
1696 */
1697 npages -= index - old_idx;
1698 if (npages <= 0)
1699 break;
1700
1701 anon_array_enter(amp, index, &cookie);
1702 ap = anon_get_ptr(ahp, index);
1703 ASSERT(ap != NULL);
1704
1705 /*
1706 * Get anonymous page and try to lock it SE_EXCL;
1707 * if we couldn't grab the lock we skip to next page.
1708 */
1709 swap_xlate(ap, &vp, &off);
1710 pp = page_lookup_nowait(vp, (u_offset_t)off, SE_EXCL);
1711 if (pp == NULL) {
1712 segadvstat.MADV_FREE_miss.value.ul++;
1713 pgcnt = 1;
1714 anon_array_exit(&cookie);
1715 continue;
1716 }
1717 pgcnt = page_get_pagecnt(pp->p_szc);
1718
1719 /*
1720 * we cannot free a page which is permanently locked.
1721 * The page_struct_lock need not be acquired to examine
1722 * these fields since the page has an "exclusive" lock.
1723 */
1724 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1725 page_unlock(pp);
1726 segadvstat.MADV_FREE_miss.value.ul++;
1727 anon_array_exit(&cookie);
1728 err = EBUSY;
1729 continue;
1730 }
1731
1732 ahm = AH_MUTEX(vp, off);
1733 mutex_enter(ahm);
1734 ASSERT(ap->an_refcnt != 0);
1735 /*
1736 * skip this one if copy-on-write is not yet broken.
1737 */
1738 if (ap->an_refcnt > 1) {
1739 mutex_exit(ahm);
1740 page_unlock(pp);
1741 segadvstat.MADV_FREE_miss.value.ul++;
1742 anon_array_exit(&cookie);
1743 continue;
1744 }
1745
1746 if (behav == MADV_PURGE && pp->p_szc != 0) {
1747 /*
1748 * If we're purging and we have a large page, simplify
1749 * things a bit by demoting ourselves into the base
1750 * page case.
1751 */
1752 (void) page_try_demote_pages(pp);
1753 }
1754
1755 if (pp->p_szc == 0) {
1756 pgcnt = 1;
1757
1758 /*
1759 * free swap slot;
1760 */
1761 if (ap->an_pvp) {
1762 swap_phys_free(ap->an_pvp, ap->an_poff,
1763 PAGESIZE);
1764 ap->an_pvp = NULL;
1765 ap->an_poff = 0;
1766 }
1767
1768 if (behav == MADV_PURGE) {
1769 /*
1770 * If we're purging (instead of merely freeing),
1771 * rip out this anon structure entirely to
1772 * assure that any subsequent fault pulls from
1773 * the backing vnode (if any).
1774 */
1775 if (--ap->an_refcnt == 0)
1776 anon_rmhash(ap);
1777
1778 mutex_exit(ahm);
1779 (void) anon_set_ptr(ahp, index,
1780 NULL, ANON_SLEEP);
1781 npurged++;
1782 ANI_ADD(1);
1783 kmem_cache_free(anon_cache, ap);
1784 } else {
1785 mutex_exit(ahm);
1786 }
1787
1788 segadvstat.MADV_FREE_hit.value.ul++;
1789
1790 /*
1791 * while we are at it, unload all the translations
1792 * and attempt to free the page.
1793 */
1794 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1795 /*LINTED: constant in conditional context */
1796 VN_DISPOSE(pp,
1797 behav == MADV_FREE ? B_FREE : B_INVAL, 0, kcred);
1798
1799 anon_array_exit(&cookie);
1800 continue;
1801 }
1802
1803 pgcnt = page_get_pagecnt(pp->p_szc);
1804 if (!IS_P2ALIGNED(index, pgcnt) || npages < pgcnt) {
1805 if (!page_try_demote_pages(pp)) {
1806 mutex_exit(ahm);
1807 page_unlock(pp);
1808 segadvstat.MADV_FREE_miss.value.ul++;
1809 anon_array_exit(&cookie);
1810 err = EBUSY;
1811 continue;
1812 } else {
1813 pgcnt = 1;
1814 if (ap->an_pvp) {
1815 swap_phys_free(ap->an_pvp,
1816 ap->an_poff, PAGESIZE);
1817 ap->an_pvp = NULL;
1818 ap->an_poff = 0;
1819 }
1820 mutex_exit(ahm);
1821 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1822 /*LINTED*/
1823 VN_DISPOSE(pp, B_FREE, 0, kcred);
1824 segadvstat.MADV_FREE_hit.value.ul++;
1825 anon_array_exit(&cookie);
1826 continue;
1827 }
1828 }
1829 mutex_exit(ahm);
1830 root_pp = pp;
1831
1832 /*
1833 * try to lock remaining pages
1834 */
1835 for (idx = 1; idx < pgcnt; idx++) {
1836 pp++;
1837 if (!page_trylock(pp, SE_EXCL))
1838 break;
1839 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1840 page_unlock(pp);
1841 break;
1842 }
1843 }
1844
1845 if (idx == pgcnt) {
1846 for (i = 0; i < pgcnt; i++) {
1847 ap = anon_get_ptr(ahp, index + i);
1848 if (ap == NULL)
1849 break;
1850 swap_xlate(ap, &vp, &off);
1851 ahm = AH_MUTEX(vp, off);
1852 mutex_enter(ahm);
1853 ASSERT(ap->an_refcnt != 0);
1854
1855 /*
1856 * skip this one if copy-on-write
1857 * is not yet broken.
1858 */
1859 if (ap->an_refcnt > 1) {
1860 mutex_exit(ahm);
1861 goto skiplp;
1862 }
1863 if (ap->an_pvp) {
1864 swap_phys_free(ap->an_pvp,
1865 ap->an_poff, PAGESIZE);
1866 ap->an_pvp = NULL;
1867 ap->an_poff = 0;
1868 }
1869 mutex_exit(ahm);
1870 }
1871 page_destroy_pages(root_pp);
1872 segadvstat.MADV_FREE_hit.value.ul += pgcnt;
1873 anon_array_exit(&cookie);
1874 continue;
1875 }
1876 skiplp:
1877 segadvstat.MADV_FREE_miss.value.ul += pgcnt;
1878 for (i = 0, pp = root_pp; i < idx; pp++, i++)
1879 page_unlock(pp);
1880 anon_array_exit(&cookie);
1881 }
1882
1883 if (purged != NULL)
1884 *purged = npurged;
1885
1886 return (err);
1887 }
1888
1889 /*
1890 * Return the kept page(s) and protections back to the segment driver.
1891 */
1892 int
anon_getpage(struct anon ** app,uint_t * protp,page_t * pl[],size_t plsz,struct seg * seg,caddr_t addr,enum seg_rw rw,struct cred * cred)1893 anon_getpage(
1894 struct anon **app,
1895 uint_t *protp,
1896 page_t *pl[],
1897 size_t plsz,
1898 struct seg *seg,
1899 caddr_t addr,
1900 enum seg_rw rw,
1901 struct cred *cred)
1902 {
1903 page_t *pp;
1904 struct anon *ap = *app;
1905 struct vnode *vp;
1906 anoff_t off;
1907 int err;
1908 kmutex_t *ahm;
1909
1910 swap_xlate(ap, &vp, &off);
1911
1912 /*
1913 * Lookup the page. If page is being paged in,
1914 * wait for it to finish as we must return a list of
1915 * pages since this routine acts like the VOP_GETPAGE
1916 * routine does.
1917 */
1918 if (pl != NULL && (pp = page_lookup(vp, (u_offset_t)off, SE_SHARED))) {
1919 ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1920 mutex_enter(ahm);
1921 if (ap->an_refcnt == 1)
1922 *protp = PROT_ALL;
1923 else
1924 *protp = PROT_ALL & ~PROT_WRITE;
1925 mutex_exit(ahm);
1926 pl[0] = pp;
1927 pl[1] = NULL;
1928 return (0);
1929 }
1930
1931 /*
1932 * Simply treat it as a vnode fault on the anon vp.
1933 */
1934
1935 TRACE_3(TR_FAC_VM, TR_ANON_GETPAGE,
1936 "anon_getpage:seg %x addr %x vp %x",
1937 seg, addr, vp);
1938
1939 err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, protp, pl, plsz,
1940 seg, addr, rw, cred, NULL);
1941
1942 if (err == 0 && pl != NULL) {
1943 ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1944 mutex_enter(ahm);
1945 if (ap->an_refcnt != 1)
1946 *protp &= ~PROT_WRITE; /* make read-only */
1947 mutex_exit(ahm);
1948 }
1949 return (err);
1950 }
1951
1952 /*
1953 * Creates or returns kept pages to the segment driver. returns -1 if a large
1954 * page cannot be allocated. returns -2 if some other process has allocated a
1955 * larger page.
1956 *
1957 * For cowfault it will allocate any size pages to fill the requested area to
1958 * avoid partially overwriting anon slots (i.e. sharing only some of the anon
1959 * slots within a large page with other processes). This policy greatly
1960 * simplifies large page freeing (which is only freed when all anon slot
1961 * refcnts are 0).
1962 */
1963 int
anon_map_getpages(struct anon_map * amp,ulong_t start_idx,uint_t szc,struct seg * seg,caddr_t addr,uint_t prot,uint_t * protp,page_t * ppa[],uint_t * ppa_szc,struct vpage vpage[],enum seg_rw rw,int brkcow,int anypgsz,int pgflags,struct cred * cred)1964 anon_map_getpages(
1965 struct anon_map *amp,
1966 ulong_t start_idx,
1967 uint_t szc,
1968 struct seg *seg,
1969 caddr_t addr,
1970 uint_t prot,
1971 uint_t *protp,
1972 page_t *ppa[],
1973 uint_t *ppa_szc,
1974 struct vpage vpage[],
1975 enum seg_rw rw,
1976 int brkcow,
1977 int anypgsz,
1978 int pgflags,
1979 struct cred *cred)
1980 {
1981 pgcnt_t pgcnt;
1982 struct anon *ap;
1983 struct vnode *vp;
1984 anoff_t off;
1985 page_t *pp, *pl[2], *conpp = NULL;
1986 caddr_t vaddr;
1987 ulong_t pg_idx, an_idx, i;
1988 spgcnt_t nreloc = 0;
1989 int prealloc = 1;
1990 int err, slotcreate;
1991 uint_t vpprot;
1992 int upsize = (szc < seg->s_szc);
1993
1994 #if !defined(__i386) && !defined(__amd64)
1995 ASSERT(seg->s_szc != 0);
1996 #endif
1997 ASSERT(szc <= seg->s_szc);
1998 ASSERT(ppa_szc != NULL);
1999 ASSERT(rw != S_CREATE);
2000
2001 *protp = PROT_ALL;
2002
2003 VM_STAT_ADD(anonvmstats.getpages[0]);
2004
2005 if (szc == 0) {
2006 VM_STAT_ADD(anonvmstats.getpages[1]);
2007 if ((ap = anon_get_ptr(amp->ahp, start_idx)) != NULL) {
2008 err = anon_getpage(&ap, protp, pl, PAGESIZE, seg,
2009 addr, rw, cred);
2010 if (err)
2011 return (err);
2012 ppa[0] = pl[0];
2013 if (brkcow == 0 || (*protp & PROT_WRITE)) {
2014 VM_STAT_ADD(anonvmstats.getpages[2]);
2015 if (ppa[0]->p_szc != 0 && upsize) {
2016 VM_STAT_ADD(anonvmstats.getpages[3]);
2017 *ppa_szc = MIN(ppa[0]->p_szc,
2018 seg->s_szc);
2019 page_unlock(ppa[0]);
2020 return (-2);
2021 }
2022 return (0);
2023 }
2024 panic("anon_map_getpages: cowfault for szc 0");
2025 } else {
2026 VM_STAT_ADD(anonvmstats.getpages[4]);
2027 ppa[0] = anon_zero(seg, addr, &ap, cred);
2028 if (ppa[0] == NULL)
2029 return (ENOMEM);
2030 (void) anon_set_ptr(amp->ahp, start_idx, ap,
2031 ANON_SLEEP);
2032 return (0);
2033 }
2034 }
2035
2036 pgcnt = page_get_pagecnt(szc);
2037 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
2038 ASSERT(IS_P2ALIGNED(start_idx, pgcnt));
2039
2040 /*
2041 * First we check for the case that the requtested large
2042 * page or larger page already exists in the system.
2043 * Actually we only check if the first constituent page
2044 * exists and only preallocate if it's not found.
2045 */
2046 ap = anon_get_ptr(amp->ahp, start_idx);
2047 if (ap) {
2048 uint_t pszc;
2049 swap_xlate(ap, &vp, &off);
2050 if (page_exists_forreal(vp, (u_offset_t)off, &pszc)) {
2051 if (pszc > szc && upsize) {
2052 *ppa_szc = MIN(pszc, seg->s_szc);
2053 return (-2);
2054 }
2055 if (pszc >= szc) {
2056 prealloc = 0;
2057 }
2058 }
2059 }
2060
2061 VM_STAT_COND_ADD(prealloc == 0, anonvmstats.getpages[5]);
2062 VM_STAT_COND_ADD(prealloc != 0, anonvmstats.getpages[6]);
2063
2064 top:
2065 /*
2066 * If a smaller page or no page at all was found,
2067 * grab a large page off the freelist.
2068 */
2069 if (prealloc) {
2070 ASSERT(conpp == NULL);
2071 if (page_alloc_pages(anon_vp, seg, addr, NULL, ppa,
2072 szc, 0, pgflags) != 0) {
2073 VM_STAT_ADD(anonvmstats.getpages[7]);
2074 if (brkcow == 0 || szc < seg->s_szc ||
2075 !anon_szcshare(amp->ahp, start_idx)) {
2076 /*
2077 * If the refcnt's of all anon slots are <= 1
2078 * they can't increase since we are holding
2079 * the address space's lock. So segvn can
2080 * safely decrease szc without risking to
2081 * generate a cow fault for the region smaller
2082 * than the segment's largest page size.
2083 */
2084 VM_STAT_ADD(anonvmstats.getpages[8]);
2085 return (-1);
2086 }
2087 docow:
2088 /*
2089 * This is a cow fault. Copy away the entire 1 large
2090 * page region of this segment.
2091 */
2092 if (szc != seg->s_szc)
2093 panic("anon_map_getpages: cowfault for szc %d",
2094 szc);
2095 vaddr = addr;
2096 for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt;
2097 pg_idx++, an_idx++, vaddr += PAGESIZE) {
2098 if ((ap = anon_get_ptr(amp->ahp, an_idx)) !=
2099 NULL) {
2100 err = anon_getpage(&ap, &vpprot, pl,
2101 PAGESIZE, seg, vaddr, rw, cred);
2102 if (err) {
2103 for (i = 0; i < pg_idx; i++) {
2104 if ((pp = ppa[i]) !=
2105 NULL)
2106 page_unlock(pp);
2107 }
2108 return (err);
2109 }
2110 ppa[pg_idx] = pl[0];
2111 } else {
2112 /*
2113 * Since this is a cowfault we know
2114 * that this address space has a
2115 * parent or children which means
2116 * anon_dup_fill_holes() has initialized
2117 * all anon slots within a large page
2118 * region that had at least one anon
2119 * slot at the time of fork().
2120 */
2121 panic("anon_map_getpages: "
2122 "cowfault but anon slot is empty");
2123 }
2124 }
2125 VM_STAT_ADD(anonvmstats.getpages[9]);
2126 *protp = PROT_ALL;
2127 return (anon_map_privatepages(amp, start_idx, szc, seg,
2128 addr, prot, ppa, vpage, anypgsz, pgflags, cred));
2129 }
2130 }
2131
2132 VM_STAT_ADD(anonvmstats.getpages[10]);
2133
2134 an_idx = start_idx;
2135 pg_idx = 0;
2136 vaddr = addr;
2137 while (pg_idx < pgcnt) {
2138 slotcreate = 0;
2139 if ((ap = anon_get_ptr(amp->ahp, an_idx)) == NULL) {
2140 VM_STAT_ADD(anonvmstats.getpages[11]);
2141 /*
2142 * For us to have decided not to preallocate
2143 * would have meant that a large page
2144 * was found. Which also means that all of the
2145 * anon slots for that page would have been
2146 * already created for us.
2147 */
2148 if (prealloc == 0)
2149 panic("anon_map_getpages: prealloc = 0");
2150
2151 slotcreate = 1;
2152 ap = anon_alloc(NULL, 0);
2153 }
2154 swap_xlate(ap, &vp, &off);
2155
2156 /*
2157 * Now setup our preallocated page to pass down
2158 * to swap_getpage().
2159 */
2160 if (prealloc) {
2161 ASSERT(ppa[pg_idx]->p_szc == szc);
2162 conpp = ppa[pg_idx];
2163 }
2164 ASSERT(prealloc || conpp == NULL);
2165
2166 /*
2167 * If we just created this anon slot then call
2168 * with S_CREATE to prevent doing IO on the page.
2169 * Similar to the anon_zero case.
2170 */
2171 err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE,
2172 NULL, pl, PAGESIZE, conpp, ppa_szc, &nreloc, seg, vaddr,
2173 slotcreate == 1 ? S_CREATE : rw, cred);
2174
2175 if (err) {
2176 ASSERT(err != -2 || upsize);
2177 VM_STAT_ADD(anonvmstats.getpages[12]);
2178 ASSERT(slotcreate == 0);
2179 goto io_err;
2180 }
2181
2182 pp = pl[0];
2183
2184 if (pp->p_szc < szc || (pp->p_szc > szc && upsize)) {
2185 VM_STAT_ADD(anonvmstats.getpages[13]);
2186 ASSERT(slotcreate == 0);
2187 ASSERT(prealloc == 0);
2188 ASSERT(pg_idx == 0);
2189 if (pp->p_szc > szc) {
2190 ASSERT(upsize);
2191 *ppa_szc = MIN(pp->p_szc, seg->s_szc);
2192 page_unlock(pp);
2193 VM_STAT_ADD(anonvmstats.getpages[14]);
2194 return (-2);
2195 }
2196 page_unlock(pp);
2197 prealloc = 1;
2198 goto top;
2199 }
2200
2201 /*
2202 * If we decided to preallocate but VOP_GETPAGE
2203 * found a page in the system that satisfies our
2204 * request then free up our preallocated large page
2205 * and continue looping accross the existing large
2206 * page via VOP_GETPAGE.
2207 */
2208 if (prealloc && pp != ppa[pg_idx]) {
2209 VM_STAT_ADD(anonvmstats.getpages[15]);
2210 ASSERT(slotcreate == 0);
2211 ASSERT(pg_idx == 0);
2212 conpp = NULL;
2213 prealloc = 0;
2214 page_free_pages(ppa[0]);
2215 }
2216
2217 if (prealloc && nreloc > 1) {
2218 /*
2219 * we have relocated out of a smaller large page.
2220 * skip npgs - 1 iterations and continue which will
2221 * increment by one the loop indices.
2222 */
2223 spgcnt_t npgs = nreloc;
2224
2225 VM_STAT_ADD(anonvmstats.getpages[16]);
2226
2227 ASSERT(pp == ppa[pg_idx]);
2228 ASSERT(slotcreate == 0);
2229 ASSERT(pg_idx + npgs <= pgcnt);
2230 if ((*protp & PROT_WRITE) &&
2231 anon_share(amp->ahp, an_idx, npgs)) {
2232 *protp &= ~PROT_WRITE;
2233 }
2234 pg_idx += npgs;
2235 an_idx += npgs;
2236 vaddr += PAGESIZE * npgs;
2237 continue;
2238 }
2239
2240 VM_STAT_ADD(anonvmstats.getpages[17]);
2241
2242 /*
2243 * Anon_zero case.
2244 */
2245 if (slotcreate) {
2246 ASSERT(prealloc);
2247 pagezero(pp, 0, PAGESIZE);
2248 CPU_STATS_ADD_K(vm, zfod, 1);
2249 hat_setrefmod(pp);
2250 }
2251
2252 ASSERT(prealloc == 0 || ppa[pg_idx] == pp);
2253 ASSERT(prealloc != 0 || PAGE_SHARED(pp));
2254 ASSERT(prealloc == 0 || PAGE_EXCL(pp));
2255
2256 if (pg_idx > 0 &&
2257 ((page_pptonum(pp) != page_pptonum(ppa[pg_idx - 1]) + 1) ||
2258 (pp->p_szc != ppa[pg_idx - 1]->p_szc))) {
2259 panic("anon_map_getpages: unexpected page");
2260 } else if (pg_idx == 0 && (page_pptonum(pp) & (pgcnt - 1))) {
2261 panic("anon_map_getpages: unaligned page");
2262 }
2263
2264 if (prealloc == 0) {
2265 ppa[pg_idx] = pp;
2266 }
2267
2268 if (ap->an_refcnt > 1) {
2269 VM_STAT_ADD(anonvmstats.getpages[18]);
2270 *protp &= ~PROT_WRITE;
2271 }
2272
2273 /*
2274 * If this is a new anon slot then initialize
2275 * the anon array entry.
2276 */
2277 if (slotcreate) {
2278 (void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP);
2279 }
2280 pg_idx++;
2281 an_idx++;
2282 vaddr += PAGESIZE;
2283 }
2284
2285 /*
2286 * Since preallocated pages come off the freelist
2287 * they are locked SE_EXCL. Simply downgrade and return.
2288 */
2289 if (prealloc) {
2290 VM_STAT_ADD(anonvmstats.getpages[19]);
2291 conpp = NULL;
2292 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
2293 page_downgrade(ppa[pg_idx]);
2294 }
2295 }
2296 ASSERT(conpp == NULL);
2297
2298 if (brkcow == 0 || (*protp & PROT_WRITE)) {
2299 VM_STAT_ADD(anonvmstats.getpages[20]);
2300 return (0);
2301 }
2302
2303 if (szc < seg->s_szc)
2304 panic("anon_map_getpages: cowfault for szc %d", szc);
2305
2306 VM_STAT_ADD(anonvmstats.getpages[21]);
2307
2308 *protp = PROT_ALL;
2309 return (anon_map_privatepages(amp, start_idx, szc, seg, addr, prot,
2310 ppa, vpage, anypgsz, pgflags, cred));
2311 io_err:
2312 /*
2313 * We got an IO error somewhere in our large page.
2314 * If we were using a preallocated page then just demote
	 * all the constituent pages that we've succeeded with so far
2316 * to PAGESIZE pages and leave them in the system
2317 * unlocked.
2318 */
2319
2320 ASSERT(err != -2 || ((pg_idx == 0) && upsize));
2321
2322 VM_STAT_COND_ADD(err > 0, anonvmstats.getpages[22]);
2323 VM_STAT_COND_ADD(err == -1, anonvmstats.getpages[23]);
2324 VM_STAT_COND_ADD(err == -2, anonvmstats.getpages[24]);
2325
2326 if (prealloc) {
2327 conpp = NULL;
2328 if (pg_idx > 0) {
2329 VM_STAT_ADD(anonvmstats.getpages[25]);
2330 for (i = 0; i < pgcnt; i++) {
2331 pp = ppa[i];
2332 ASSERT(PAGE_EXCL(pp));
2333 ASSERT(pp->p_szc == szc);
2334 pp->p_szc = 0;
2335 }
2336 for (i = 0; i < pg_idx; i++) {
2337 ASSERT(!hat_page_is_mapped(ppa[i]));
2338 page_unlock(ppa[i]);
2339 }
2340 /*
2341 * Now free up the remaining unused constituent
2342 * pages.
2343 */
2344 while (pg_idx < pgcnt) {
2345 ASSERT(!hat_page_is_mapped(ppa[pg_idx]));
2346 page_free(ppa[pg_idx], 0);
2347 pg_idx++;
2348 }
2349 } else {
2350 VM_STAT_ADD(anonvmstats.getpages[26]);
2351 page_free_pages(ppa[0]);
2352 }
2353 } else {
2354 VM_STAT_ADD(anonvmstats.getpages[27]);
2355 ASSERT(err > 0);
2356 for (i = 0; i < pg_idx; i++)
2357 page_unlock(ppa[i]);
2358 }
2359 ASSERT(conpp == NULL);
2360 if (err != -1)
2361 return (err);
2362 /*
2363 * we are here because we failed to relocate.
2364 */
2365 ASSERT(prealloc);
2366 if (brkcow == 0 || szc < seg->s_szc ||
2367 !anon_szcshare(amp->ahp, start_idx)) {
2368 VM_STAT_ADD(anonvmstats.getpages[28]);
2369 return (-1);
2370 }
2371 VM_STAT_ADD(anonvmstats.getpages[29]);
2372 goto docow;
2373 }
2374
2375
2376 /*
2377 * Turn a reference to an object or shared anon page
2378 * into a private page with a copy of the data from the
2379 * original page which is always locked by the caller.
2380 * This routine unloads the translation and unlocks the
2381 * original page, if it isn't being stolen, before returning
2382 * to the caller.
2383 *
2384 * NOTE: The original anon slot is not freed by this routine
2385 * It must be freed by the caller while holding the
2386 * "anon_map" lock to prevent races which can occur if
2387 * a process has multiple lwps in its address space.
2388 */
2389 page_t *
anon_private(struct anon ** app,struct seg * seg,caddr_t addr,uint_t prot,page_t * opp,int oppflags,struct cred * cred)2390 anon_private(
2391 struct anon **app,
2392 struct seg *seg,
2393 caddr_t addr,
2394 uint_t prot,
2395 page_t *opp,
2396 int oppflags,
2397 struct cred *cred)
2398 {
2399 struct anon *old = *app;
2400 struct anon *new;
2401 page_t *pp = NULL;
2402 struct vnode *vp;
2403 anoff_t off;
2404 page_t *anon_pl[1 + 1];
2405 int err;
2406
2407 if (oppflags & STEAL_PAGE)
2408 ASSERT(PAGE_EXCL(opp));
2409 else
2410 ASSERT(PAGE_LOCKED(opp));
2411
2412 CPU_STATS_ADD_K(vm, cow_fault, 1);
2413
2414 /* Kernel probe */
2415 TNF_PROBE_1(anon_private, "vm pagefault", /* CSTYLED */,
2416 tnf_opaque, address, addr);
2417
2418 *app = new = anon_alloc(NULL, 0);
2419 swap_xlate(new, &vp, &off);
2420
2421 if (oppflags & STEAL_PAGE) {
2422 page_rename(opp, vp, (u_offset_t)off);
2423 pp = opp;
2424 TRACE_5(TR_FAC_VM, TR_ANON_PRIVATE,
2425 "anon_private:seg %p addr %x pp %p vp %p off %lx",
2426 seg, addr, pp, vp, off);
2427 hat_setmod(pp);
2428
2429 /* bug 4026339 */
2430 page_downgrade(pp);
2431 return (pp);
2432 }
2433
2434 /*
2435 * Call the VOP_GETPAGE routine to create the page, thereby
2436 * enabling the vnode driver to allocate any filesystem
2437 * space (e.g., disk block allocation for UFS). This also
2438 * prevents more than one page from being added to the
2439 * vnode at the same time.
2440 */
2441 err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, NULL,
2442 anon_pl, PAGESIZE, seg, addr, S_CREATE, cred, NULL);
2443 if (err)
2444 goto out;
2445
2446 pp = anon_pl[0];
2447
2448 /*
2449 * If the original page was locked, we need to move the lock
2450 * to the new page by transfering 'cowcnt/lckcnt' of the original
2451 * page to 'cowcnt/lckcnt' of the new page.
2452 *
2453 * See Statement at the beginning of segvn_lockop() and
2454 * comments in page_pp_useclaim() regarding the way
2455 * cowcnts/lckcnts are handled.
2456 *
2457 * Also availrmem must be decremented up front for read only mapping
2458 * before calling page_pp_useclaim. page_pp_useclaim will bump it back
2459 * if availrmem did not need to be decremented after all.
2460 */
2461 if (oppflags & LOCK_PAGE) {
2462 if ((prot & PROT_WRITE) == 0) {
2463 mutex_enter(&freemem_lock);
2464 if (availrmem > pages_pp_maximum) {
2465 availrmem--;
2466 pages_useclaim++;
2467 } else {
2468 mutex_exit(&freemem_lock);
2469 goto out;
2470 }
2471 mutex_exit(&freemem_lock);
2472 }
2473 page_pp_useclaim(opp, pp, prot & PROT_WRITE);
2474 }
2475
2476 /*
2477 * Now copy the contents from the original page,
2478 * which is locked and loaded in the MMU by
2479 * the caller to prevent yet another page fault.
2480 */
2481 /* XXX - should set mod bit in here */
2482 if (ppcopy(opp, pp) == 0) {
2483 /*
2484 * Before ppcopy could hanlde UE or other faults, we
2485 * would have panicked here, and still have no option
2486 * but to do so now.
2487 */
2488 panic("anon_private, ppcopy failed, opp = 0x%p, pp = 0x%p",
2489 (void *)opp, (void *)pp);
2490 }
2491
2492 hat_setrefmod(pp); /* mark as modified */
2493
2494 /*
2495 * Unload the old translation.
2496 */
2497 hat_unload(seg->s_as->a_hat, addr, PAGESIZE, HAT_UNLOAD);
2498
2499 /*
2500 * Free unmapped, unmodified original page.
2501 * or release the lock on the original page,
2502 * otherwise the process will sleep forever in
2503 * anon_decref() waiting for the "exclusive" lock
2504 * on the page.
2505 */
2506 (void) page_release(opp, 1);
2507
2508 /*
2509 * we are done with page creation so downgrade the new
2510 * page's selock to shared, this helps when multiple
2511 * as_fault(...SOFTLOCK...) are done to the same
2512 * page(aio)
2513 */
2514 page_downgrade(pp);
2515
2516 /*
2517 * NOTE: The original anon slot must be freed by the
2518 * caller while holding the "anon_map" lock, if we
2519 * copied away from an anonymous page.
2520 */
2521 return (pp);
2522
2523 out:
2524 *app = old;
2525 if (pp)
2526 page_unlock(pp);
2527 anon_decref(new);
2528 page_unlock(opp);
2529 return ((page_t *)NULL);
2530 }
2531
/*
 * Large-page counterpart of anon_private(): give the caller private
 * copies of all constituent pages of the large page of size code szc
 * that starts at anon index start_idx / address addr.
 *
 * On entry ppa[] holds the locked original pages.  For each constituent
 * page a new anon slot and a new page are created, the original is
 * copied, the original page is unlocked, the old anon slot (if any) is
 * decref'ed and the new slot is installed in amp->ahp.  On success
 * ppa[] holds the new pages, locked shared.
 *
 * anypgsz == -1 disables preallocation of a large page; in that case
 * (or when page_alloc_pages() fails) PAGESIZE pages are used.
 *
 * vpage, when not NULL, is consulted (indexed from addr forward) to
 * move page locks (cowcnt/lckcnt) from the old pages to the new ones.
 *
 * Returns:
 *	0	success, or the first anon slot has a single reference
 *		and ppa[0] already has size code szc (nothing to do).
 *	-1	the first anon slot has a single reference but ppa[0] is
 *		smaller than szc; all pages in ppa[] have been unlocked.
 *	ENOMEM	availrmem could not be reserved for locked pages; all
 *		non-NULL pages in ppa[] have been unlocked.
 */
int
anon_map_privatepages(
	struct anon_map *amp,
	ulong_t	start_idx,
	uint_t	szc,
	struct seg *seg,
	caddr_t addr,
	uint_t	prot,
	page_t	*ppa[],
	struct vpage vpage[],
	int anypgsz,
	int pgflags,
	struct cred *cred)
{
	pgcnt_t		pgcnt;
	struct vnode	*vp;
	anoff_t		off;
	page_t		*pl[2], *conpp = NULL;
	int		err;
	int		prealloc = 1;
	struct anon	*ap, *oldap;
	caddr_t		vaddr;
	page_t		*pplist, *pp;
	ulong_t		pg_idx, an_idx;
	spgcnt_t	nreloc = 0;
	int		pagelock = 0;
	kmutex_t	*ahmpages = NULL;
#ifdef DEBUG
	int		refcnt;		/* only referenced from ASSERTs */
#endif

	ASSERT(szc != 0);
	ASSERT(szc == seg->s_szc);

	VM_STAT_ADD(anonvmstats.privatepages[0]);

	pgcnt = page_get_pagecnt(szc);
	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
	ASSERT(IS_P2ALIGNED(start_idx, pgcnt));

	ASSERT(amp != NULL);
	ap = anon_get_ptr(amp->ahp, start_idx);
	ASSERT(ap == NULL || ap->an_refcnt >= 1);

	VM_STAT_COND_ADD(ap == NULL, anonvmstats.privatepages[1]);

	/*
	 * Now try and allocate the large page. If we fail then just
	 * let VOP_GETPAGE give us PAGESIZE pages. Normally we let
	 * the caller make this decision but to avoid added complexity
	 * it's simpler to handle that case here.
	 */
	if (anypgsz == -1) {
		VM_STAT_ADD(anonvmstats.privatepages[2]);
		prealloc = 0;
	} else if (page_alloc_pages(anon_vp, seg, addr, &pplist, NULL, szc,
	    anypgsz, pgflags) != 0) {
		VM_STAT_ADD(anonvmstats.privatepages[3]);
		prealloc = 0;
	}

	/*
	 * make the decrement of all refcnts of all
	 * anon slots of a large page appear atomic by
	 * getting an anonpages_hash_lock for the
	 * first anon slot of a large page.
	 *
	 * When the slot is shared (an_refcnt > 1) this mutex stays held
	 * until after the copy loop and hat_unload() below.
	 */
	if (ap != NULL) {
		ahmpages = APH_MUTEX(ap->an_vp, ap->an_off);
		mutex_enter(ahmpages);
		if (ap->an_refcnt == 1) {
			/*
			 * Not shared: nothing to copy.  Give back the
			 * preallocated large page, if any, and return.
			 */
			VM_STAT_ADD(anonvmstats.privatepages[4]);
			ASSERT(!anon_share(amp->ahp, start_idx, pgcnt));
			mutex_exit(ahmpages);

			if (prealloc) {
				page_free_replacement_page(pplist);
				page_create_putback(pgcnt);
			}
			ASSERT(ppa[0]->p_szc <= szc);
			if (ppa[0]->p_szc == szc) {
				VM_STAT_ADD(anonvmstats.privatepages[5]);
				return (0);
			}
			/* Smaller pages than requested: unlock and fail. */
			for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
				ASSERT(ppa[pg_idx] != NULL);
				page_unlock(ppa[pg_idx]);
			}
			return (-1);
		}
	}

	/*
	 * If we are passed in the vpage array and this is
	 * not PROT_WRITE then we need to decrement availrmem
	 * up front before we try anything. If we need to and
	 * can't decrement availrmem then it's better to fail now
	 * than in the middle of processing the new large page.
	 * page_pp_useclaim() on behalf of each constituent page
	 * below will adjust availrmem back for the cases not needed.
	 */
	if (vpage != NULL && (prot & PROT_WRITE) == 0) {
		/* Is any constituent page locked? */
		for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
			if (VPP_ISPPLOCK(&vpage[pg_idx])) {
				pagelock = 1;
				break;
			}
		}
		if (pagelock) {
			VM_STAT_ADD(anonvmstats.privatepages[6]);
			mutex_enter(&freemem_lock);
			if (availrmem >= pages_pp_maximum + pgcnt) {
				availrmem -= pgcnt;
				pages_useclaim += pgcnt;
			} else {
				/*
				 * Cannot reserve: drop every lock and
				 * resource acquired so far and fail.
				 */
				VM_STAT_ADD(anonvmstats.privatepages[7]);
				mutex_exit(&freemem_lock);
				if (ahmpages != NULL) {
					mutex_exit(ahmpages);
				}
				if (prealloc) {
					page_free_replacement_page(pplist);
					page_create_putback(pgcnt);
				}
				for (pg_idx = 0; pg_idx < pgcnt; pg_idx++)
					if (ppa[pg_idx] != NULL)
						page_unlock(ppa[pg_idx]);
				return (ENOMEM);
			}
			mutex_exit(&freemem_lock);
		}
	}

	CPU_STATS_ADD_K(vm, cow_fault, pgcnt);

	VM_STAT_ADD(anonvmstats.privatepages[8]);

	an_idx = start_idx;
	pg_idx = 0;
	vaddr = addr;
	for (; pg_idx < pgcnt; pg_idx++, an_idx++, vaddr += PAGESIZE) {
		ASSERT(ppa[pg_idx] != NULL);
		oldap = anon_get_ptr(amp->ahp, an_idx);
		/*
		 * Either every slot of the large page exists and is
		 * shared with the same refcnt, or none exists.
		 */
		ASSERT(ahmpages != NULL || oldap == NULL);
		ASSERT(ahmpages == NULL || oldap != NULL);
		ASSERT(ahmpages == NULL || oldap->an_refcnt > 1);
		ASSERT(ahmpages == NULL || pg_idx != 0 ||
		    (refcnt = oldap->an_refcnt));
		ASSERT(ahmpages == NULL || pg_idx == 0 ||
		    refcnt == oldap->an_refcnt);

		ap = anon_alloc(NULL, 0);

		swap_xlate(ap, &vp, &off);

		/*
		 * Now setup our preallocated page to pass down to
		 * swap_getpage().
		 */
		if (prealloc) {
			pp = pplist;
			page_sub(&pplist, pp);
			conpp = pp;
		}

		err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE, NULL, pl,
		    PAGESIZE, conpp, NULL, &nreloc, seg, vaddr,
		    S_CREATE, cred);

		/*
		 * Impossible to fail this is S_CREATE.
		 */
		if (err)
			panic("anon_map_privatepages: VOP_GETPAGE failed");

		ASSERT(prealloc ? pp == pl[0] : pl[0]->p_szc == 0);
		ASSERT(prealloc == 0 || nreloc == 1);

		pp = pl[0];

		/*
		 * If the original page was locked, we need to move
		 * the lock to the new page by transferring
		 * 'cowcnt/lckcnt' of the original page to 'cowcnt/lckcnt'
		 * of the new page. pg_idx can be used to index
		 * into the vpage array since the caller will guarantee
		 * that vpage struct passed in corresponds to addr
		 * and forward.
		 *
		 * If this page is not locked but availrmem was reserved
		 * above for the whole large page, give this page's share
		 * back.
		 */
		if (vpage != NULL && VPP_ISPPLOCK(&vpage[pg_idx])) {
			page_pp_useclaim(ppa[pg_idx], pp, prot & PROT_WRITE);
		} else if (pagelock) {
			mutex_enter(&freemem_lock);
			availrmem++;
			pages_useclaim--;
			mutex_exit(&freemem_lock);
		}

		/*
		 * Now copy the contents from the original page.
		 */
		if (ppcopy(ppa[pg_idx], pp) == 0) {
			/*
			 * Before ppcopy could handle UE or other faults, we
			 * would have panicked here, and still have no option
			 * but to do so now.
			 */
			panic("anon_map_privatepages, ppcopy failed");
		}

		hat_setrefmod(pp);		/* mark as modified */

		/*
		 * Release the lock on the original page,
		 * decrement the old slot, and downgrade the lock
		 * on the new copy.
		 *
		 * Preallocated pages are downgraded together after the
		 * loop instead.
		 */
		page_unlock(ppa[pg_idx]);

		if (!prealloc)
			page_downgrade(pp);

		ppa[pg_idx] = pp;

		/*
		 * Now reflect the copy in the new anon array.
		 */
		ASSERT(ahmpages == NULL || oldap->an_refcnt > 1);
		if (oldap != NULL)
			anon_decref(oldap);
		(void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP);
	}

	/*
	 * Unload the old large page translation.
	 */
	hat_unload(seg->s_as->a_hat, addr, pgcnt << PAGESHIFT, HAT_UNLOAD);

	if (ahmpages != NULL) {
		mutex_exit(ahmpages);
	}
	/* Every page of the preallocated list must have been consumed. */
	ASSERT(prealloc == 0 || pplist == NULL);
	if (prealloc) {
		VM_STAT_ADD(anonvmstats.privatepages[9]);
		for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
			page_downgrade(ppa[pg_idx]);
		}
	}

	return (0);
}
2783
2784 /*
2785 * Allocate a private zero-filled anon page.
2786 */
2787 page_t *
anon_zero(struct seg * seg,caddr_t addr,struct anon ** app,struct cred * cred)2788 anon_zero(struct seg *seg, caddr_t addr, struct anon **app, struct cred *cred)
2789 {
2790 struct anon *ap;
2791 page_t *pp;
2792 struct vnode *vp;
2793 anoff_t off;
2794 page_t *anon_pl[1 + 1];
2795 int err;
2796
2797 /* Kernel probe */
2798 TNF_PROBE_1(anon_zero, "vm pagefault", /* CSTYLED */,
2799 tnf_opaque, address, addr);
2800
2801 *app = ap = anon_alloc(NULL, 0);
2802 swap_xlate(ap, &vp, &off);
2803
2804 /*
2805 * Call the VOP_GETPAGE routine to create the page, thereby
2806 * enabling the vnode driver to allocate any filesystem
2807 * dependent structures (e.g., disk block allocation for UFS).
2808 * This also prevents more than on page from being added to
2809 * the vnode at the same time since it is locked.
2810 */
2811 err = VOP_GETPAGE(vp, off, PAGESIZE, NULL,
2812 anon_pl, PAGESIZE, seg, addr, S_CREATE, cred, NULL);
2813 if (err) {
2814 *app = NULL;
2815 anon_decref(ap);
2816 return (NULL);
2817 }
2818 pp = anon_pl[0];
2819
2820 pagezero(pp, 0, PAGESIZE); /* XXX - should set mod bit */
2821 page_downgrade(pp);
2822 CPU_STATS_ADD_K(vm, zfod, 1);
2823 hat_setrefmod(pp); /* mark as modified so pageout writes back */
2824 return (pp);
2825 }
2826
2827
2828 /*
2829 * Allocate array of private zero-filled anon pages for empty slots
2830 * and kept pages for non empty slots within given range.
2831 *
2832 * NOTE: This rontine will try and use large pages
2833 * if available and supported by underlying platform.
2834 */
2835 int
anon_map_createpages(struct anon_map * amp,ulong_t start_index,size_t len,page_t * ppa[],struct seg * seg,caddr_t addr,enum seg_rw rw,struct cred * cred)2836 anon_map_createpages(
2837 struct anon_map *amp,
2838 ulong_t start_index,
2839 size_t len,
2840 page_t *ppa[],
2841 struct seg *seg,
2842 caddr_t addr,
2843 enum seg_rw rw,
2844 struct cred *cred)
2845 {
2846
2847 struct anon *ap;
2848 struct vnode *ap_vp;
2849 page_t *pp, *pplist, *anon_pl[1 + 1], *conpp = NULL;
2850 int err = 0;
2851 ulong_t p_index, index;
2852 pgcnt_t npgs, pg_cnt;
2853 spgcnt_t nreloc = 0;
2854 uint_t l_szc, szc, prot;
2855 anoff_t ap_off;
2856 size_t pgsz;
2857 lgrp_t *lgrp;
2858 kmutex_t *ahm;
2859
2860 /*
2861 * XXX For now only handle S_CREATE.
2862 */
2863 ASSERT(rw == S_CREATE);
2864
2865 index = start_index;
2866 p_index = 0;
2867 npgs = btopr(len);
2868
2869 /*
2870 * If this platform supports multiple page sizes
2871 * then try and allocate directly from the free
2872 * list for pages larger than PAGESIZE.
2873 *
2874 * NOTE:When we have page_create_ru we can stop
2875 * directly allocating from the freelist.
2876 */
2877 l_szc = seg->s_szc;
2878 ANON_LOCK_ENTER(&->a_rwlock, RW_WRITER);
2879 while (npgs) {
2880
2881 /*
2882 * if anon slot already exists
2883 * (means page has been created)
2884 * so 1) look up the page
2885 * 2) if the page is still in memory, get it.
2886 * 3) if not, create a page and
2887 * page in from physical swap device.
2888 * These are done in anon_getpage().
2889 */
2890 ap = anon_get_ptr(amp->ahp, index);
2891 if (ap) {
2892 err = anon_getpage(&ap, &prot, anon_pl, PAGESIZE,
2893 seg, addr, S_READ, cred);
2894 if (err) {
2895 ANON_LOCK_EXIT(&->a_rwlock);
2896 panic("anon_map_createpages: anon_getpage");
2897 }
2898 pp = anon_pl[0];
2899 ppa[p_index++] = pp;
2900
2901 /*
2902 * an_pvp can become non-NULL after SysV's page was
2903 * paged out before ISM was attached to this SysV
2904 * shared memory segment. So free swap slot if needed.
2905 */
2906 if (ap->an_pvp != NULL) {
2907 page_io_lock(pp);
2908 ahm = AH_MUTEX(ap->an_vp, ap->an_off);
2909 mutex_enter(ahm);
2910 if (ap->an_pvp != NULL) {
2911 swap_phys_free(ap->an_pvp,
2912 ap->an_poff, PAGESIZE);
2913 ap->an_pvp = NULL;
2914 ap->an_poff = 0;
2915 mutex_exit(ahm);
2916 hat_setmod(pp);
2917 } else {
2918 mutex_exit(ahm);
2919 }
2920 page_io_unlock(pp);
2921 }
2922
2923 addr += PAGESIZE;
2924 index++;
2925 npgs--;
2926 continue;
2927 }
2928 /*
2929 * Now try and allocate the largest page possible
2930 * for the current address and range.
2931 * Keep dropping down in page size until:
2932 *
2933 * 1) Properly aligned
2934 * 2) Does not overlap existing anon pages
2935 * 3) Fits in remaining range.
2936 * 4) able to allocate one.
2937 *
2938 * NOTE: XXX When page_create_ru is completed this code
2939 * will change.
2940 */
2941 szc = l_szc;
2942 pplist = NULL;
2943 pg_cnt = 0;
2944 while (szc) {
2945 pgsz = page_get_pagesize(szc);
2946 pg_cnt = pgsz >> PAGESHIFT;
2947 if (IS_P2ALIGNED(addr, pgsz) && pg_cnt <= npgs &&
2948 anon_pages(amp->ahp, index, pg_cnt) == 0) {
2949 /*
2950 * XXX
2951 * Since we are faking page_create()
2952 * we also need to do the freemem and
2953 * pcf accounting.
2954 */
2955 (void) page_create_wait(pg_cnt, PG_WAIT);
2956
2957 /*
2958 * Get lgroup to allocate next page of shared
2959 * memory from and use it to specify where to
2960 * allocate the physical memory
2961 */
2962 lgrp = lgrp_mem_choose(seg, addr, pgsz);
2963
2964 pplist = page_get_freelist(
2965 anon_vp, (u_offset_t)0, seg,
2966 addr, pgsz, 0, lgrp);
2967
2968 if (pplist == NULL) {
2969 page_create_putback(pg_cnt);
2970 }
2971
2972 /*
2973 * If a request for a page of size
2974 * larger than PAGESIZE failed
2975 * then don't try that size anymore.
2976 */
2977 if (pplist == NULL) {
2978 l_szc = szc - 1;
2979 } else {
2980 break;
2981 }
2982 }
2983 szc--;
2984 }
2985
2986 /*
2987 * If just using PAGESIZE pages then don't
2988 * directly allocate from the free list.
2989 */
2990 if (pplist == NULL) {
2991 ASSERT(szc == 0);
2992 pp = anon_zero(seg, addr, &ap, cred);
2993 if (pp == NULL) {
2994 ANON_LOCK_EXIT(&->a_rwlock);
2995 panic("anon_map_createpages: anon_zero");
2996 }
2997 ppa[p_index++] = pp;
2998
2999 ASSERT(anon_get_ptr(amp->ahp, index) == NULL);
3000 (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP);
3001
3002 addr += PAGESIZE;
3003 index++;
3004 npgs--;
3005 continue;
3006 }
3007
3008 /*
3009 * pplist is a list of pg_cnt PAGESIZE pages.
3010 * These pages are locked SE_EXCL since they
3011 * came directly off the free list.
3012 */
3013 ASSERT(IS_P2ALIGNED(pg_cnt, pg_cnt));
3014 ASSERT(IS_P2ALIGNED(index, pg_cnt));
3015 ASSERT(conpp == NULL);
3016 while (pg_cnt--) {
3017
3018 ap = anon_alloc(NULL, 0);
3019 swap_xlate(ap, &ap_vp, &ap_off);
3020
3021 ASSERT(pplist != NULL);
3022 pp = pplist;
3023 page_sub(&pplist, pp);
3024 PP_CLRFREE(pp);
3025 PP_CLRAGED(pp);
3026 conpp = pp;
3027
3028 err = swap_getconpage(ap_vp, ap_off, PAGESIZE,
3029 (uint_t *)NULL, anon_pl, PAGESIZE, conpp, NULL,
3030 &nreloc, seg, addr, S_CREATE, cred);
3031
3032 if (err) {
3033 ANON_LOCK_EXIT(&->a_rwlock);
3034 panic("anon_map_createpages: S_CREATE");
3035 }
3036
3037 ASSERT(anon_pl[0] == pp);
3038 ASSERT(nreloc == 1);
3039 pagezero(pp, 0, PAGESIZE);
3040 CPU_STATS_ADD_K(vm, zfod, 1);
3041 hat_setrefmod(pp);
3042
3043 ASSERT(anon_get_ptr(amp->ahp, index) == NULL);
3044 (void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP);
3045
3046 ppa[p_index++] = pp;
3047
3048 addr += PAGESIZE;
3049 index++;
3050 npgs--;
3051 }
3052 conpp = NULL;
3053 pg_cnt = pgsz >> PAGESHIFT;
3054 p_index = p_index - pg_cnt;
3055 while (pg_cnt--) {
3056 page_downgrade(ppa[p_index++]);
3057 }
3058 }
3059 ANON_LOCK_EXIT(&->a_rwlock);
3060 return (0);
3061 }
3062
3063 static int
anon_try_demote_pages(struct anon_hdr * ahp,ulong_t sidx,uint_t szc,page_t ** ppa,int private)3064 anon_try_demote_pages(
3065 struct anon_hdr *ahp,
3066 ulong_t sidx,
3067 uint_t szc,
3068 page_t **ppa,
3069 int private)
3070 {
3071 struct anon *ap;
3072 pgcnt_t pgcnt = page_get_pagecnt(szc);
3073 page_t *pp;
3074 pgcnt_t i;
3075 kmutex_t *ahmpages = NULL;
3076 int root = 0;
3077 pgcnt_t npgs;
3078 pgcnt_t curnpgs = 0;
3079 size_t ppasize = 0;
3080
3081 ASSERT(szc != 0);
3082 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
3083 ASSERT(IS_P2ALIGNED(sidx, pgcnt));
3084 ASSERT(sidx < ahp->size);
3085
3086 if (ppa == NULL) {
3087 ppasize = pgcnt * sizeof (page_t *);
3088 ppa = kmem_alloc(ppasize, KM_SLEEP);
3089 }
3090
3091 ap = anon_get_ptr(ahp, sidx);
3092 if (ap != NULL && private) {
3093 VM_STAT_ADD(anonvmstats.demotepages[1]);
3094 ahmpages = APH_MUTEX(ap->an_vp, ap->an_off);
3095 mutex_enter(ahmpages);
3096 }
3097
3098 if (ap != NULL && ap->an_refcnt > 1) {
3099 if (ahmpages != NULL) {
3100 VM_STAT_ADD(anonvmstats.demotepages[2]);
3101 mutex_exit(ahmpages);
3102 }
3103 if (ppasize != 0) {
3104 kmem_free(ppa, ppasize);
3105 }
3106 return (0);
3107 }
3108 if (ahmpages != NULL) {
3109 mutex_exit(ahmpages);
3110 }
3111 if (ahp->size - sidx < pgcnt) {
3112 ASSERT(private == 0);
3113 pgcnt = ahp->size - sidx;
3114 }
3115 for (i = 0; i < pgcnt; i++, sidx++) {
3116 ap = anon_get_ptr(ahp, sidx);
3117 if (ap != NULL) {
3118 if (ap->an_refcnt != 1) {
3119 panic("anon_try_demote_pages: an_refcnt != 1");
3120 }
3121 pp = ppa[i] = page_lookup(ap->an_vp, ap->an_off,
3122 SE_EXCL);
3123 if (pp != NULL) {
3124 (void) hat_pageunload(pp,
3125 HAT_FORCE_PGUNLOAD);
3126 }
3127 } else {
3128 ppa[i] = NULL;
3129 }
3130 }
3131 for (i = 0; i < pgcnt; i++) {
3132 if ((pp = ppa[i]) != NULL && pp->p_szc != 0) {
3133 ASSERT(pp->p_szc <= szc);
3134 if (!root) {
3135 VM_STAT_ADD(anonvmstats.demotepages[3]);
3136 if (curnpgs != 0)
3137 panic("anon_try_demote_pages: "
3138 "bad large page");
3139
3140 root = 1;
3141 curnpgs = npgs =
3142 page_get_pagecnt(pp->p_szc);
3143
3144 ASSERT(npgs <= pgcnt);
3145 ASSERT(IS_P2ALIGNED(npgs, npgs));
3146 ASSERT(!(page_pptonum(pp) & (npgs - 1)));
3147 } else {
3148 ASSERT(i > 0);
3149 ASSERT(page_pptonum(pp) - 1 ==
3150 page_pptonum(ppa[i - 1]));
3151 if ((page_pptonum(pp) & (npgs - 1)) ==
3152 npgs - 1)
3153 root = 0;
3154 }
3155 ASSERT(PAGE_EXCL(pp));
3156 pp->p_szc = 0;
3157 ASSERT(curnpgs > 0);
3158 curnpgs--;
3159 }
3160 }
3161 if (root != 0 || curnpgs != 0)
3162 panic("anon_try_demote_pages: bad large page");
3163
3164 for (i = 0; i < pgcnt; i++) {
3165 if ((pp = ppa[i]) != NULL) {
3166 ASSERT(!hat_page_is_mapped(pp));
3167 ASSERT(pp->p_szc == 0);
3168 page_unlock(pp);
3169 }
3170 }
3171 if (ppasize != 0) {
3172 kmem_free(ppa, ppasize);
3173 }
3174 return (1);
3175 }
3176
3177 /*
3178 * anon_map_demotepages() can only be called by MAP_PRIVATE segments.
3179 */
3180 int
anon_map_demotepages(struct anon_map * amp,ulong_t start_idx,struct seg * seg,caddr_t addr,uint_t prot,struct vpage vpage[],struct cred * cred)3181 anon_map_demotepages(
3182 struct anon_map *amp,
3183 ulong_t start_idx,
3184 struct seg *seg,
3185 caddr_t addr,
3186 uint_t prot,
3187 struct vpage vpage[],
3188 struct cred *cred)
3189 {
3190 struct anon *ap;
3191 uint_t szc = seg->s_szc;
3192 pgcnt_t pgcnt = page_get_pagecnt(szc);
3193 size_t ppasize = pgcnt * sizeof (page_t *);
3194 page_t **ppa = kmem_alloc(ppasize, KM_SLEEP);
3195 page_t *pp;
3196 page_t *pl[2];
3197 pgcnt_t i, pg_idx;
3198 ulong_t an_idx;
3199 caddr_t vaddr;
3200 int err;
3201 int retry = 0;
3202 uint_t vpprot;
3203
3204 ASSERT(RW_WRITE_HELD(&->a_rwlock));
3205 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
3206 ASSERT(IS_P2ALIGNED(start_idx, pgcnt));
3207 ASSERT(ppa != NULL);
3208 ASSERT(szc != 0);
3209 ASSERT(szc == amp->a_szc);
3210
3211 VM_STAT_ADD(anonvmstats.demotepages[0]);
3212
3213 top:
3214 if (anon_try_demote_pages(amp->ahp, start_idx, szc, ppa, 1)) {
3215 kmem_free(ppa, ppasize);
3216 return (0);
3217 }
3218
3219 VM_STAT_ADD(anonvmstats.demotepages[4]);
3220
3221 ASSERT(retry == 0); /* we can be here only once */
3222
3223 vaddr = addr;
3224 for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt;
3225 pg_idx++, an_idx++, vaddr += PAGESIZE) {
3226 ap = anon_get_ptr(amp->ahp, an_idx);
3227 if (ap == NULL)
3228 panic("anon_map_demotepages: no anon slot");
3229 err = anon_getpage(&ap, &vpprot, pl, PAGESIZE, seg, vaddr,
3230 S_READ, cred);
3231 if (err) {
3232 for (i = 0; i < pg_idx; i++) {
3233 if ((pp = ppa[i]) != NULL)
3234 page_unlock(pp);
3235 }
3236 kmem_free(ppa, ppasize);
3237 return (err);
3238 }
3239 ppa[pg_idx] = pl[0];
3240 }
3241
3242 err = anon_map_privatepages(amp, start_idx, szc, seg, addr, prot, ppa,
3243 vpage, -1, 0, cred);
3244 if (err > 0) {
3245 VM_STAT_ADD(anonvmstats.demotepages[5]);
3246 kmem_free(ppa, ppasize);
3247 return (err);
3248 }
3249 ASSERT(err == 0 || err == -1);
3250 if (err == -1) {
3251 VM_STAT_ADD(anonvmstats.demotepages[6]);
3252 retry = 1;
3253 goto top;
3254 }
3255 for (i = 0; i < pgcnt; i++) {
3256 ASSERT(ppa[i] != NULL);
3257 if (ppa[i]->p_szc != 0)
3258 retry = 1;
3259 page_unlock(ppa[i]);
3260 }
3261 if (retry) {
3262 VM_STAT_ADD(anonvmstats.demotepages[7]);
3263 goto top;
3264 }
3265
3266 VM_STAT_ADD(anonvmstats.demotepages[8]);
3267
3268 kmem_free(ppa, ppasize);
3269
3270 return (0);
3271 }
3272
3273 /*
3274 * Free pages of shared anon map. It's assumed that anon maps don't share anon
3275 * structures with private anon maps. Therefore all anon structures should
3276 * have at most one reference at this point. This means underlying pages can
3277 * be exclusively locked and demoted or freed. If not freeing the entire
3278 * large pages demote the ends of the region we free to be able to free
3279 * subpages. Page roots correspond to aligned index positions in anon map.
3280 */
3281 void
anon_shmap_free_pages(struct anon_map * amp,ulong_t sidx,size_t len)3282 anon_shmap_free_pages(struct anon_map *amp, ulong_t sidx, size_t len)
3283 {
3284 ulong_t eidx = sidx + btopr(len);
3285 pgcnt_t pages = page_get_pagecnt(amp->a_szc);
3286 struct anon_hdr *ahp = amp->ahp;
3287 ulong_t tidx;
3288 size_t size;
3289 ulong_t sidx_aligned;
3290 ulong_t eidx_aligned;
3291
3292 ASSERT(ANON_WRITE_HELD(&->a_rwlock));
3293 ASSERT(amp->refcnt <= 1);
3294 ASSERT(amp->a_szc > 0);
3295 ASSERT(eidx <= ahp->size);
3296 ASSERT(!anon_share(ahp, sidx, btopr(len)));
3297
3298 if (len == 0) { /* XXX */
3299 return;
3300 }
3301
3302 sidx_aligned = P2ALIGN(sidx, pages);
3303 if (sidx_aligned != sidx ||
3304 (eidx < sidx_aligned + pages && eidx < ahp->size)) {
3305 if (!anon_try_demote_pages(ahp, sidx_aligned,
3306 amp->a_szc, NULL, 0)) {
3307 panic("anon_shmap_free_pages: demote failed");
3308 }
3309 size = (eidx <= sidx_aligned + pages) ? (eidx - sidx) :
3310 P2NPHASE(sidx, pages);
3311 size <<= PAGESHIFT;
3312 anon_free(ahp, sidx, size);
3313 sidx = sidx_aligned + pages;
3314 if (eidx <= sidx) {
3315 return;
3316 }
3317 }
3318 eidx_aligned = P2ALIGN(eidx, pages);
3319 if (sidx < eidx_aligned) {
3320 anon_free_pages(ahp, sidx,
3321 (eidx_aligned - sidx) << PAGESHIFT,
3322 amp->a_szc);
3323 sidx = eidx_aligned;
3324 }
3325 ASSERT(sidx == eidx_aligned);
3326 if (eidx == eidx_aligned) {
3327 return;
3328 }
3329 tidx = eidx;
3330 if (eidx != ahp->size && anon_get_next_ptr(ahp, &tidx) != NULL &&
3331 tidx - sidx < pages) {
3332 if (!anon_try_demote_pages(ahp, sidx, amp->a_szc, NULL, 0)) {
3333 panic("anon_shmap_free_pages: demote failed");
3334 }
3335 size = (eidx - sidx) << PAGESHIFT;
3336 anon_free(ahp, sidx, size);
3337 } else {
3338 anon_free_pages(ahp, sidx, pages << PAGESHIFT, amp->a_szc);
3339 }
3340 }
3341
3342 /*
3343 * This routine should be called with amp's writer lock when there're no other
3344 * users of amp. All pcache entries of this amp must have been already
3345 * inactivated. We must not drop a_rwlock here to prevent new users from
3346 * attaching to this amp.
3347 */
3348 void
anonmap_purge(struct anon_map * amp)3349 anonmap_purge(struct anon_map *amp)
3350 {
3351 ASSERT(ANON_WRITE_HELD(&->a_rwlock));
3352 ASSERT(amp->refcnt <= 1);
3353
3354 if (amp->a_softlockcnt != 0) {
3355 seg_ppurge(NULL, amp, 0);
3356 }
3357
3358 /*
3359 * Since all pcache entries were already inactive before this routine
3360 * was called seg_ppurge() couldn't return while there're still
3361 * entries that can be found via the list anchored at a_phead. So we
3362 * can assert this list is empty now. a_softlockcnt may be still non 0
3363 * if asynchronous thread that manages pcache already removed pcache
3364 * entries but hasn't unlocked the pages yet. If a_softlockcnt is non
3365 * 0 we just wait on a_purgecv for shamp_reclaim() to finish. Even if
3366 * a_softlockcnt is 0 we grab a_purgemtx to avoid freeing anon map
3367 * before shamp_reclaim() is done with it. a_purgemtx also taken by
3368 * shamp_reclaim() while a_softlockcnt was still not 0 acts as a
3369 * barrier that prevents anonmap_purge() to complete while
3370 * shamp_reclaim() may still be referencing this amp.
3371 */
3372 ASSERT(amp->a_phead.p_lnext == &->a_phead);
3373 ASSERT(amp->a_phead.p_lprev == &->a_phead);
3374
3375 mutex_enter(&->a_purgemtx);
3376 while (amp->a_softlockcnt != 0) {
3377 ASSERT(amp->a_phead.p_lnext == &->a_phead);
3378 ASSERT(amp->a_phead.p_lprev == &->a_phead);
3379 amp->a_purgewait = 1;
3380 cv_wait(&->a_purgecv, &->a_purgemtx);
3381 }
3382 mutex_exit(&->a_purgemtx);
3383
3384 ASSERT(amp->a_phead.p_lnext == &->a_phead);
3385 ASSERT(amp->a_phead.p_lprev == &->a_phead);
3386 ASSERT(amp->a_softlockcnt == 0);
3387 }
3388
3389 /*
3390 * Allocate and initialize an anon_map structure for seg
3391 * associating the given swap reservation with the new anon_map.
3392 */
3393 struct anon_map *
anonmap_alloc(size_t size,size_t swresv,int flags)3394 anonmap_alloc(size_t size, size_t swresv, int flags)
3395 {
3396 struct anon_map *amp;
3397 int kmflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
3398
3399 amp = kmem_cache_alloc(anonmap_cache, kmflags);
3400 if (amp == NULL) {
3401 ASSERT(kmflags == KM_NOSLEEP);
3402 return (NULL);
3403 }
3404
3405 amp->ahp = anon_create(btopr(size), flags);
3406 if (amp->ahp == NULL) {
3407 ASSERT(flags == ANON_NOSLEEP);
3408 kmem_cache_free(anonmap_cache, amp);
3409 return (NULL);
3410 }
3411 amp->refcnt = 1;
3412 amp->size = size;
3413 amp->swresv = swresv;
3414 amp->locality = 0;
3415 amp->a_szc = 0;
3416 amp->a_sp = NULL;
3417 amp->a_softlockcnt = 0;
3418 amp->a_purgewait = 0;
3419 amp->a_phead.p_lnext = &->a_phead;
3420 amp->a_phead.p_lprev = &->a_phead;
3421
3422 return (amp);
3423 }
3424
3425 void
anonmap_free(struct anon_map * amp)3426 anonmap_free(struct anon_map *amp)
3427 {
3428 ASSERT(amp->ahp != NULL);
3429 ASSERT(amp->refcnt == 0);
3430 ASSERT(amp->a_softlockcnt == 0);
3431 ASSERT(amp->a_phead.p_lnext == &->a_phead);
3432 ASSERT(amp->a_phead.p_lprev == &->a_phead);
3433
3434 lgrp_shm_policy_fini(amp, NULL);
3435 anon_release(amp->ahp, btopr(amp->size));
3436 kmem_cache_free(anonmap_cache, amp);
3437 }
3438
3439 /*
3440 * Returns true if the app array has some empty slots.
3441 * The offp and lenp parameters are in/out parameters. On entry
3442 * these values represent the starting offset and length of the
3443 * mapping. When true is returned, these values may be modified
3444 * to be the largest range which includes empty slots.
3445 */
3446 int
non_anon(struct anon_hdr * ahp,ulong_t anon_idx,u_offset_t * offp,size_t * lenp)3447 non_anon(struct anon_hdr *ahp, ulong_t anon_idx, u_offset_t *offp,
3448 size_t *lenp)
3449 {
3450 ulong_t i, el;
3451 ssize_t low, high;
3452 struct anon *ap;
3453
3454 low = -1;
3455 for (i = 0, el = *lenp; i < el; i += PAGESIZE, anon_idx++) {
3456 ap = anon_get_ptr(ahp, anon_idx);
3457 if (ap == NULL) {
3458 if (low == -1)
3459 low = i;
3460 high = i;
3461 }
3462 }
3463 if (low != -1) {
3464 /*
3465 * Found at least one non-anon page.
3466 * Set up the off and len return values.
3467 */
3468 if (low != 0)
3469 *offp += low;
3470 *lenp = high - low + PAGESIZE;
3471 return (1);
3472 }
3473 return (0);
3474 }
3475
3476 /*
3477 * Return a count of the number of existing anon pages in the anon array
3478 * app in the range (off, off+len). The array and slots must be guaranteed
3479 * stable by the caller.
3480 */
3481 pgcnt_t
anon_pages(struct anon_hdr * ahp,ulong_t anon_index,pgcnt_t nslots)3482 anon_pages(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots)
3483 {
3484 pgcnt_t cnt = 0;
3485
3486 while (nslots-- > 0) {
3487 if ((anon_get_ptr(ahp, anon_index)) != NULL)
3488 cnt++;
3489 anon_index++;
3490 }
3491 return (cnt);
3492 }
3493
3494 /*
3495 * Move reserved phys swap into memory swap (unreserve phys swap
3496 * and reserve mem swap by the same amount).
3497 * Used by segspt when it needs to lock reserved swap npages in memory
3498 */
3499 int
anon_swap_adjust(pgcnt_t npages)3500 anon_swap_adjust(pgcnt_t npages)
3501 {
3502 pgcnt_t unlocked_mem_swap;
3503
3504 mutex_enter(&anoninfo_lock);
3505
3506 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
3507 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
3508
3509 unlocked_mem_swap = k_anoninfo.ani_mem_resv
3510 - k_anoninfo.ani_locked_swap;
3511 if (npages > unlocked_mem_swap) {
3512 spgcnt_t adjusted_swap = npages - unlocked_mem_swap;
3513
3514 /*
3515 * if there is not enough unlocked mem swap we take missing
3516 * amount from phys swap and give it to mem swap
3517 */
3518 if (!page_reclaim_mem(adjusted_swap, segspt_minfree, 1)) {
3519 mutex_exit(&anoninfo_lock);
3520 return (ENOMEM);
3521 }
3522
3523 k_anoninfo.ani_mem_resv += adjusted_swap;
3524 ASSERT(k_anoninfo.ani_phys_resv >= adjusted_swap);
3525 k_anoninfo.ani_phys_resv -= adjusted_swap;
3526
3527 ANI_ADD(adjusted_swap);
3528 }
3529 k_anoninfo.ani_locked_swap += npages;
3530
3531 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
3532 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
3533
3534 mutex_exit(&anoninfo_lock);
3535
3536 return (0);
3537 }
3538
3539 /*
3540 * 'unlocked' reserved mem swap so when it is unreserved it
3541 * can be moved back phys (disk) swap
3542 */
3543 void
anon_swap_restore(pgcnt_t npages)3544 anon_swap_restore(pgcnt_t npages)
3545 {
3546 mutex_enter(&anoninfo_lock);
3547
3548 ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv);
3549
3550 ASSERT(k_anoninfo.ani_locked_swap >= npages);
3551 k_anoninfo.ani_locked_swap -= npages;
3552
3553 ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv);
3554
3555 mutex_exit(&anoninfo_lock);
3556 }
3557
3558 /*
3559 * Return the pointer from the list for a
3560 * specified anon index.
3561 */
3562 ulong_t *
anon_get_slot(struct anon_hdr * ahp,ulong_t an_idx)3563 anon_get_slot(struct anon_hdr *ahp, ulong_t an_idx)
3564 {
3565 struct anon **app;
3566 void **ppp;
3567
3568 ASSERT(an_idx < ahp->size);
3569
3570 /*
3571 * Single level case.
3572 */
3573 if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) {
3574 return ((ulong_t *)&ahp->array_chunk[an_idx]);
3575 } else {
3576
3577 /*
3578 * 2 level case.
3579 */
3580 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
3581 if (*ppp == NULL) {
3582 mutex_enter(&ahp->serial_lock);
3583 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
3584 if (*ppp == NULL)
3585 *ppp = kmem_zalloc(PAGESIZE, KM_SLEEP);
3586 mutex_exit(&ahp->serial_lock);
3587 }
3588 app = *ppp;
3589 return ((ulong_t *)&app[an_idx & ANON_CHUNK_OFF]);
3590 }
3591 }
3592
3593 void
anon_array_enter(struct anon_map * amp,ulong_t an_idx,anon_sync_obj_t * sobj)3594 anon_array_enter(struct anon_map *amp, ulong_t an_idx, anon_sync_obj_t *sobj)
3595 {
3596 ulong_t *ap_slot;
3597 kmutex_t *mtx;
3598 kcondvar_t *cv;
3599 int hash;
3600
3601 /*
3602 * Use szc to determine anon slot(s) to appear atomic.
3603 * If szc = 0, then lock the anon slot and mark it busy.
3604 * If szc > 0, then lock the range of slots by getting the
3605 * anon_array_lock for the first anon slot, and mark only the
3606 * first anon slot busy to represent whole range being busy.
3607 */
3608
3609 ASSERT(RW_READ_HELD(&->a_rwlock));
3610 an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc));
3611 hash = ANON_ARRAY_HASH(amp, an_idx);
3612 sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex;
3613 sobj->sync_cv = cv = &anon_array_cv[hash];
3614 mutex_enter(mtx);
3615 ap_slot = anon_get_slot(amp->ahp, an_idx);
3616 while (ANON_ISBUSY(ap_slot))
3617 cv_wait(cv, mtx);
3618 ANON_SETBUSY(ap_slot);
3619 sobj->sync_data = ap_slot;
3620 mutex_exit(mtx);
3621 }
3622
3623 int
anon_array_try_enter(struct anon_map * amp,ulong_t an_idx,anon_sync_obj_t * sobj)3624 anon_array_try_enter(struct anon_map *amp, ulong_t an_idx,
3625 anon_sync_obj_t *sobj)
3626 {
3627 ulong_t *ap_slot;
3628 kmutex_t *mtx;
3629 int hash;
3630
3631 /*
3632 * Try to lock a range of anon slots.
3633 * Use szc to determine anon slot(s) to appear atomic.
3634 * If szc = 0, then lock the anon slot and mark it busy.
3635 * If szc > 0, then lock the range of slots by getting the
3636 * anon_array_lock for the first anon slot, and mark only the
3637 * first anon slot busy to represent whole range being busy.
3638 * Fail if the mutex or the anon_array are busy.
3639 */
3640
3641 ASSERT(RW_READ_HELD(&->a_rwlock));
3642 an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc));
3643 hash = ANON_ARRAY_HASH(amp, an_idx);
3644 sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex;
3645 sobj->sync_cv = &anon_array_cv[hash];
3646 if (!mutex_tryenter(mtx)) {
3647 return (EWOULDBLOCK);
3648 }
3649 ap_slot = anon_get_slot(amp->ahp, an_idx);
3650 if (ANON_ISBUSY(ap_slot)) {
3651 mutex_exit(mtx);
3652 return (EWOULDBLOCK);
3653 }
3654 ANON_SETBUSY(ap_slot);
3655 sobj->sync_data = ap_slot;
3656 mutex_exit(mtx);
3657 return (0);
3658 }
3659
3660 void
anon_array_exit(anon_sync_obj_t * sobj)3661 anon_array_exit(anon_sync_obj_t *sobj)
3662 {
3663 mutex_enter(sobj->sync_mutex);
3664 ASSERT(ANON_ISBUSY(sobj->sync_data));
3665 ANON_CLRBUSY(sobj->sync_data);
3666 if (CV_HAS_WAITERS(sobj->sync_cv))
3667 cv_broadcast(sobj->sync_cv);
3668 mutex_exit(sobj->sync_mutex);
3669 }
3670