1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2015, Joyent, Inc. All rights reserved.
24 */
25
26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 /* All Rights Reserved */
28
29 /*
30 * University Copyright- Copyright (c) 1982, 1986, 1988
31 * The Regents of the University of California
32 * All Rights Reserved
33 *
34 * University Acknowledgment- Portions of this document are derived from
35 * software developed by the University of California, Berkeley, and its
36 * contributors.
37 */
38
39 /*
40 * VM - anonymous pages.
41 *
42 * This layer sits immediately above the vm_swap layer. It manages
43 * physical pages that have no permanent identity in the file system
44 * name space, using the services of the vm_swap layer to allocate
45 * backing storage for these pages. Since these pages have no external
46 * identity, they are discarded when the last reference is removed.
47 *
48 * An important function of this layer is to manage low-level sharing
49 * of pages that are logically distinct but that happen to be
50 * physically identical (e.g., the corresponding pages of the processes
51 * resulting from a fork before one process or the other changes their
52 * contents). This pseudo-sharing is present only as an optimization
53 * and is not to be confused with true sharing in which multiple
54 * address spaces deliberately contain references to the same object;
55 * such sharing is managed at a higher level.
56 *
57 * The key data structure here is the anon struct, which contains a
58 * reference count for its associated physical page and a hint about
59 * the identity of that page. Anon structs typically live in arrays,
60 * with an instance's position in its array determining where the
61 * corresponding backing storage is allocated; however, the swap_xlate()
62 * routine abstracts away this representation information so that the
63 * rest of the anon layer need not know it. (See the swap layer for
64 * more details on anon struct layout.)
65 *
66 * In future versions of the system, the association between an
67 * anon struct and its position on backing store will change so that
68 * we don't require backing store for all anonymous pages in the
69 * system. This is an important consideration for large memory systems.
70 * We can also use this technique to delay binding physical locations
71 * to anonymous pages until pageout/swapout time where we can make
72 * smarter allocation decisions to improve anonymous klustering.
73 *
74 * Many of the routines defined here take a (struct anon **) argument,
75 * which allows the code at this level to manage anon pages directly,
76 * so that callers can regard anon structs as opaque objects and not be
77 * concerned with assigning or inspecting their contents.
78 *
79 * Clients of this layer refer to anon pages indirectly. That is, they
80 * maintain arrays of pointers to anon structs rather than maintaining
81 * anon structs themselves. The (struct anon **) arguments mentioned
82 * above are pointers to entries in these arrays. It is these arrays
83 * that capture the mapping between offsets within a given segment and
84 * the corresponding anonymous backing storage address.
85 */
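/*
 * Illustrative sketch (not part of this file's code): a client such as a
 * segment driver typically resolves an entry in its anon pointer array to
 * the backing page roughly as follows, assuming it already holds whatever
 * locks the routines below require:
 *
 *	struct anon *ap;
 *	struct vnode *vp;
 *	anoff_t off;
 *	page_t *pp;
 *
 *	ap = anon_get_ptr(amp->ahp, an_idx);
 *	if (ap != NULL) {
 *		swap_xlate(ap, &vp, &off);
 *		pp = page_lookup(vp, (u_offset_t)off, SE_SHARED);
 *	}
 */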
86
87 #ifdef DEBUG
88 #define ANON_DEBUG
89 #endif
90
91 #include <sys/types.h>
92 #include <sys/t_lock.h>
93 #include <sys/param.h>
94 #include <sys/systm.h>
95 #include <sys/mman.h>
96 #include <sys/cred.h>
97 #include <sys/thread.h>
98 #include <sys/vnode.h>
99 #include <sys/cpuvar.h>
100 #include <sys/swap.h>
101 #include <sys/cmn_err.h>
102 #include <sys/vtrace.h>
103 #include <sys/kmem.h>
104 #include <sys/sysmacros.h>
105 #include <sys/bitmap.h>
106 #include <sys/vmsystm.h>
107 #include <sys/tuneable.h>
108 #include <sys/debug.h>
109 #include <sys/fs/swapnode.h>
110 #include <sys/lgrp.h>
111 #include <sys/policy.h>
112 #include <sys/condvar_impl.h>
113 #include <sys/mutex_impl.h>
114 #include <sys/rctl.h>
115
116 #include <vm/as.h>
117 #include <vm/hat.h>
118 #include <vm/anon.h>
119 #include <vm/page.h>
120 #include <vm/vpage.h>
121 #include <vm/seg.h>
122 #include <vm/rm.h>
123
124 #include <fs/fs_subr.h>
125
126 struct vnode *anon_vp;
127
128 int anon_debug;
129
130 kmutex_t anoninfo_lock;
131 struct k_anoninfo k_anoninfo;
132 ani_free_t *ani_free_pool;
133 pad_mutex_t anon_array_lock[ANON_LOCKSIZE];
134 kcondvar_t anon_array_cv[ANON_LOCKSIZE];
135
136 /*
137 * Global hash table for (vp, off) -> anon slot
138 */
139 extern int swap_maxcontig;
140 size_t anon_hash_size;
141 unsigned int anon_hash_shift;
142 struct anon **anon_hash;
143
144 static struct kmem_cache *anon_cache;
145 static struct kmem_cache *anonmap_cache;
146
147 pad_mutex_t *anonhash_lock;
148
149 /*
150 * Used to make the increment of all refcnts of all anon slots of a large
151 * page appear to be atomic. The lock is grabbed for the first anon slot of
152 * a large page.
153 */
154 pad_mutex_t *anonpages_hash_lock;
155
156 #define APH_MUTEX(vp, off) \
157 (&anonpages_hash_lock[(ANON_HASH((vp), (off)) & \
158 (AH_LOCK_SIZE - 1))].pad_mutex)
159
160 #ifdef VM_STATS
161 static struct anonvmstats_str {
162 ulong_t getpages[30];
163 ulong_t privatepages[10];
164 ulong_t demotepages[9];
165 ulong_t decrefpages[9];
166 ulong_t dupfillholes[4];
167 ulong_t freepages[1];
168 } anonvmstats;
169 #endif /* VM_STATS */
170
171 /*ARGSUSED*/
172 static int
173 anonmap_cache_constructor(void *buf, void *cdrarg, int kmflags)
174 {
175 struct anon_map *amp = buf;
176
177	rw_init(&amp->a_rwlock, NULL, RW_DEFAULT, NULL);
178	cv_init(&amp->a_purgecv, NULL, CV_DEFAULT, NULL);
179	mutex_init(&amp->a_pmtx, NULL, MUTEX_DEFAULT, NULL);
180	mutex_init(&amp->a_purgemtx, NULL, MUTEX_DEFAULT, NULL);
181 return (0);
182 }
183
184 /*ARGSUSED1*/
185 static void
186 anonmap_cache_destructor(void *buf, void *cdrarg)
187 {
188 struct anon_map *amp = buf;
189
190	rw_destroy(&amp->a_rwlock);
191	cv_destroy(&amp->a_purgecv);
192	mutex_destroy(&amp->a_pmtx);
193	mutex_destroy(&amp->a_purgemtx);
194 }
195
196 void
197 anon_init(void)
198 {
199 int i;
200 pad_mutex_t *tmp;
201
202 /* These both need to be powers of 2 so round up to the next power */
203 anon_hash_shift = highbit((physmem / ANON_HASHAVELEN) - 1);
204 anon_hash_size = 1L << anon_hash_shift;
205
206 /*
207 * We need to align the anonhash_lock and anonpages_hash_lock arrays
208 * to a 64B boundary to avoid false sharing. We add 63B to our
209 * allocation so that we can get a 64B aligned address to use.
210 * We allocate both of these together to avoid wasting an additional
211 * 63B.
212 */
213 tmp = kmem_zalloc((2 * AH_LOCK_SIZE * sizeof (pad_mutex_t)) + 63,
214 KM_SLEEP);
215 anonhash_lock = (pad_mutex_t *)P2ROUNDUP((uintptr_t)tmp, 64);
216 anonpages_hash_lock = anonhash_lock + AH_LOCK_SIZE;
217
218 for (i = 0; i < AH_LOCK_SIZE; i++) {
219 mutex_init(&anonhash_lock[i].pad_mutex, NULL, MUTEX_DEFAULT,
220 NULL);
221 mutex_init(&anonpages_hash_lock[i].pad_mutex, NULL,
222 MUTEX_DEFAULT, NULL);
223 }
224
225 for (i = 0; i < ANON_LOCKSIZE; i++) {
226 mutex_init(&anon_array_lock[i].pad_mutex, NULL,
227 MUTEX_DEFAULT, NULL);
228 cv_init(&anon_array_cv[i], NULL, CV_DEFAULT, NULL);
229 }
230
231 anon_hash = (struct anon **)
232 kmem_zalloc(sizeof (struct anon *) * anon_hash_size, KM_SLEEP);
233 anon_cache = kmem_cache_create("anon_cache", sizeof (struct anon),
234 AN_CACHE_ALIGN, NULL, NULL, NULL, NULL, NULL, KMC_PREFILL);
235 anonmap_cache = kmem_cache_create("anonmap_cache",
236 sizeof (struct anon_map), 0,
237 anonmap_cache_constructor, anonmap_cache_destructor, NULL,
238 NULL, NULL, 0);
239 swap_maxcontig = (1024 * 1024) >> PAGESHIFT; /* 1MB of pages */
240
241 tmp = kmem_zalloc((ANI_MAX_POOL * sizeof (ani_free_t)) + 63, KM_SLEEP);
242 /* Round ani_free_pool to cacheline boundary to avoid false sharing. */
243 ani_free_pool = (ani_free_t *)P2ROUNDUP((uintptr_t)tmp, 64);
244
245 anon_vp = vn_alloc(KM_SLEEP);
246 vn_setops(anon_vp, swap_vnodeops);
247 anon_vp->v_type = VREG;
248 anon_vp->v_flag |= (VISSWAP|VISSWAPFS);
249 }
250
251 /*
252 * Global anon slot hash table manipulation.
253 */
254
255 static void
256 anon_addhash(struct anon *ap)
257 {
258 int index;
259
260 ASSERT(MUTEX_HELD(AH_MUTEX(ap->an_vp, ap->an_off)));
261 index = ANON_HASH(ap->an_vp, ap->an_off);
262 ap->an_hash = anon_hash[index];
263 anon_hash[index] = ap;
264 }
265
266 static void
267 anon_rmhash(struct anon *ap)
268 {
269 struct anon **app;
270
271 ASSERT(MUTEX_HELD(AH_MUTEX(ap->an_vp, ap->an_off)));
272
273 for (app = &anon_hash[ANON_HASH(ap->an_vp, ap->an_off)];
274 *app; app = &((*app)->an_hash)) {
275 if (*app == ap) {
276 *app = ap->an_hash;
277 break;
278 }
279 }
280 }
281
282 /*
283 * The anon array interfaces. Functions for allocating and freeing
284 * the array of pointers, and for returning/setting entries in the
285 * array of pointers for a given offset.
286 *
287 * Create the list of pointers
288 */
289 struct anon_hdr *
290 anon_create(pgcnt_t npages, int flags)
291 {
292 struct anon_hdr *ahp;
293 ulong_t nchunks;
294 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
295
296 if ((ahp = kmem_zalloc(sizeof (struct anon_hdr), kmemflags)) == NULL) {
297 return (NULL);
298 }
299
300 mutex_init(&ahp->serial_lock, NULL, MUTEX_DEFAULT, NULL);
301 /*
302 * Single level case.
303 */
304 ahp->size = npages;
305 if (npages <= ANON_CHUNK_SIZE || (flags & ANON_ALLOC_FORCE)) {
306
307 if (flags & ANON_ALLOC_FORCE)
308 ahp->flags |= ANON_ALLOC_FORCE;
309
310 ahp->array_chunk = kmem_zalloc(
311 ahp->size * sizeof (struct anon *), kmemflags);
312
313 if (ahp->array_chunk == NULL) {
314 kmem_free(ahp, sizeof (struct anon_hdr));
315 return (NULL);
316 }
317 } else {
318 /*
319 * 2 Level case.
320 * anon hdr size needs to be rounded up to a multiple
321 * of ANON_CHUNK_SIZE. This is important as various anon
322 * related functions depend on this.
323 * NOTE -
324 * anon_grow() makes anon hdr size a multiple of
325 * ANON_CHUNK_SIZE.
326 * amp size is <= anon hdr size.
327 * anon_index + seg_pgs <= anon hdr size.
328 */
329 ahp->size = P2ROUNDUP(npages, ANON_CHUNK_SIZE);
330 nchunks = ahp->size >> ANON_CHUNK_SHIFT;
331
332 ahp->array_chunk = kmem_zalloc(nchunks * sizeof (ulong_t *),
333 kmemflags);
334
335 if (ahp->array_chunk == NULL) {
336 kmem_free(ahp, sizeof (struct anon_hdr));
337 return (NULL);
338 }
339 }
340 return (ahp);
341 }
342
343 /*
344 * Free the array of pointers
345 */
346 void
347 anon_release(struct anon_hdr *ahp, pgcnt_t npages)
348 {
349 ulong_t i;
350 void **ppp;
351 ulong_t nchunks;
352
353 ASSERT(npages <= ahp->size);
354
355 /*
356 * Single level case.
357 */
358 if (npages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) {
359 kmem_free(ahp->array_chunk, ahp->size * sizeof (struct anon *));
360 } else {
361 /*
362 * 2 level case.
363 */
364 nchunks = ahp->size >> ANON_CHUNK_SHIFT;
365 for (i = 0; i < nchunks; i++) {
366 ppp = &ahp->array_chunk[i];
367 if (*ppp != NULL)
368 kmem_free(*ppp, PAGESIZE);
369 }
370 kmem_free(ahp->array_chunk, nchunks * sizeof (ulong_t *));
371 }
372 mutex_destroy(&ahp->serial_lock);
373 kmem_free(ahp, sizeof (struct anon_hdr));
374 }
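/*
 * A minimal usage sketch for the pair above (sizes are assumed values
 * chosen for illustration): with a default ANON_CHUNK_SIZE, a 2000-page
 * header takes the two-level path in anon_create().
 *
 *	struct anon_hdr *ahp;
 *
 *	ahp = anon_create(2000, ANON_SLEEP);
 *	...
 *	anon_release(ahp, 2000);
 */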
375
376 /*
377 * Return the pointer from the list for a
378 * specified anon index.
379 */
380 struct anon *
381 anon_get_ptr(struct anon_hdr *ahp, ulong_t an_idx)
382 {
383 struct anon **app;
384
385 ASSERT(an_idx < ahp->size);
386
387 /*
388 * Single level case.
389 */
390 if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) {
391 return ((struct anon *)
392 ((uintptr_t)ahp->array_chunk[an_idx] & ANON_PTRMASK));
393 } else {
394
395 /*
396 * 2 level case.
397 */
398 app = ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
399 if (app) {
400 return ((struct anon *)
401 ((uintptr_t)app[an_idx & ANON_CHUNK_OFF] &
402 ANON_PTRMASK));
403 } else {
404 return (NULL);
405 }
406 }
407 }
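/*
 * Worked example of the two-level lookup above (chunk size is an
 * assumed value for illustration): with ANON_CHUNK_SIZE == 1024,
 * an_idx == 1500 decomposes as
 *
 *	ahp->array_chunk[1500 >> ANON_CHUNK_SHIFT]	(chunk 1)
 *	chunk[1500 & ANON_CHUNK_OFF]			(slot 476)
 *
 * The & ANON_PTRMASK strips the low-order flag bits (see ANON_ISBUSY)
 * so that only the anon struct pointer is returned.
 */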
408
409 /*
410 * Return the anon pointer for the first valid entry in the anon list,
411 * starting from the given index.
412 */
413 struct anon *
414 anon_get_next_ptr(struct anon_hdr *ahp, ulong_t *index)
415 {
416 struct anon *ap;
417 struct anon **app;
418 ulong_t chunkoff;
419 ulong_t i;
420 ulong_t j;
421 pgcnt_t size;
422
423 i = *index;
424 size = ahp->size;
425
426 ASSERT(i < size);
427
428 if ((size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) {
429 /*
430 * 1 level case
431 */
432 while (i < size) {
433 ap = (struct anon *)
434 ((uintptr_t)ahp->array_chunk[i] & ANON_PTRMASK);
435 if (ap) {
436 *index = i;
437 return (ap);
438 }
439 i++;
440 }
441 } else {
442 /*
443 * 2 level case
444 */
445 chunkoff = i & ANON_CHUNK_OFF;
446 while (i < size) {
447 app = ahp->array_chunk[i >> ANON_CHUNK_SHIFT];
448 if (app)
449 for (j = chunkoff; j < ANON_CHUNK_SIZE; j++) {
450 ap = (struct anon *)
451 ((uintptr_t)app[j] & ANON_PTRMASK);
452 if (ap) {
453 *index = i + (j - chunkoff);
454 return (ap);
455 }
456 }
457 chunkoff = 0;
458 i = (i + ANON_CHUNK_SIZE) & ~ANON_CHUNK_OFF;
459 }
460 }
461 *index = size;
462 return (NULL);
463 }
464
465 /*
466 * Set list entry with a given pointer for a specified offset
467 */
468 int
469 anon_set_ptr(struct anon_hdr *ahp, ulong_t an_idx, struct anon *ap, int flags)
470 {
471 void **ppp;
472 struct anon **app;
473 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
474 uintptr_t *ap_addr;
475
476 ASSERT(an_idx < ahp->size);
477
478 /*
479 * Single level case.
480 */
481 if (ahp->size <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) {
482 ap_addr = (uintptr_t *)&ahp->array_chunk[an_idx];
483 } else {
484
485 /*
486 * 2 level case.
487 */
488 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
489
490 ASSERT(ppp != NULL);
491 if (*ppp == NULL) {
492 mutex_enter(&ahp->serial_lock);
493 ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
494 if (*ppp == NULL) {
495 *ppp = kmem_zalloc(PAGESIZE, kmemflags);
496 if (*ppp == NULL) {
497 mutex_exit(&ahp->serial_lock);
498 return (ENOMEM);
499 }
500 }
501 mutex_exit(&ahp->serial_lock);
502 }
503 app = *ppp;
504 ap_addr = (uintptr_t *)&app[an_idx & ANON_CHUNK_OFF];
505 }
506 *ap_addr = (*ap_addr & ~ANON_PTRMASK) | (uintptr_t)ap;
507 return (0);
508 }
509
510 /*
511 * Copy anon array into a given new anon array
512 */
513 int
514 anon_copy_ptr(struct anon_hdr *sahp, ulong_t s_idx, struct anon_hdr *dahp,
515 ulong_t d_idx, pgcnt_t npages, int flags)
516 {
517 void **sapp, **dapp;
518 void *ap;
519 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
520
521 ASSERT((s_idx < sahp->size) && (d_idx < dahp->size));
522 ASSERT((npages <= sahp->size) && (npages <= dahp->size));
523
524 /*
525 * Both arrays are 1 level.
526 */
527 if (((sahp->size <= ANON_CHUNK_SIZE) &&
528 (dahp->size <= ANON_CHUNK_SIZE)) ||
529 ((sahp->flags & ANON_ALLOC_FORCE) &&
530 (dahp->flags & ANON_ALLOC_FORCE))) {
531
532 bcopy(&sahp->array_chunk[s_idx], &dahp->array_chunk[d_idx],
533 npages * sizeof (struct anon *));
534 return (0);
535 }
536
537 /*
538 * Both arrays are 2 levels.
539 */
540 if (sahp->size > ANON_CHUNK_SIZE &&
541 dahp->size > ANON_CHUNK_SIZE &&
542 ((sahp->flags & ANON_ALLOC_FORCE) == 0) &&
543 ((dahp->flags & ANON_ALLOC_FORCE) == 0)) {
544
545 ulong_t sapidx, dapidx;
546 ulong_t *sap, *dap;
547 ulong_t chknp;
548
549 while (npages != 0) {
550
551 sapidx = s_idx & ANON_CHUNK_OFF;
552 dapidx = d_idx & ANON_CHUNK_OFF;
553 chknp = ANON_CHUNK_SIZE - MAX(sapidx, dapidx);
554 if (chknp > npages)
555 chknp = npages;
556
557 sapp = &sahp->array_chunk[s_idx >> ANON_CHUNK_SHIFT];
558 if ((sap = *sapp) != NULL) {
559 dapp = &dahp->array_chunk[d_idx
560 >> ANON_CHUNK_SHIFT];
561 if ((dap = *dapp) == NULL) {
562 *dapp = kmem_zalloc(PAGESIZE,
563 kmemflags);
564 if ((dap = *dapp) == NULL)
565 return (ENOMEM);
566 }
567 bcopy((sap + sapidx), (dap + dapidx),
568 chknp << ANON_PTRSHIFT);
569 }
570 s_idx += chknp;
571 d_idx += chknp;
572 npages -= chknp;
573 }
574 return (0);
575 }
576
577 /*
578 * At least one of the arrays is 2 level.
579 */
580 while (npages--) {
581 if ((ap = anon_get_ptr(sahp, s_idx)) != NULL) {
582 ASSERT(!ANON_ISBUSY(anon_get_slot(sahp, s_idx)));
583 if (anon_set_ptr(dahp, d_idx, ap, flags) == ENOMEM)
584 return (ENOMEM);
585 }
586 s_idx++;
587 d_idx++;
588 }
589 return (0);
590 }
591
592
593 /*
594 * ANON_INITBUF is a convenience macro for anon_grow() below. It
595 * takes a buffer dst, which is at least as large as buffer src. It
596 * does a bcopy from src into dst, and then bzeros the extra bytes
597 * of dst. If tail is set, the data in src is tail aligned within
598 * dst instead of head aligned.
599 */
600
601 #define ANON_INITBUF(src, srclen, dst, dstsize, tail) \
602 if (tail) { \
603 bzero((dst), (dstsize) - (srclen)); \
604 bcopy((src), (char *)(dst) + (dstsize) - (srclen), (srclen)); \
605 } else { \
606 bcopy((src), (dst), (srclen)); \
607 bzero((char *)(dst) + (srclen), (dstsize) - (srclen)); \
608 }
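/*
 * For illustration, with srclen equal to two pointers and dstsize equal
 * to four: tail == 0 yields { s0, s1, 0, 0 } (head aligned), while
 * tail != 0 yields { 0, 0, s0, s1 } (tail aligned, the layout used
 * below for ANON_GROWDOWN segments such as stacks).
 */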
609
610 #define ANON_1_LEVEL_INC (ANON_CHUNK_SIZE / 8)
611 #define ANON_2_LEVEL_INC (ANON_1_LEVEL_INC * ANON_CHUNK_SIZE)
612
613 /*
614 * anon_grow() is used to efficiently extend an existing anon array.
615 * startidx_p points to the index into the anon array of the first page
616 * that is in use. oldseg_pgs is the number of pages in use, starting at
617 * *startidx_p. newpages is the number of additional pages desired.
618 *
619 * If startidx_p == NULL, startidx is taken to be 0 and cannot be changed.
620 *
621 * The growth is done by creating a new top level of the anon array,
622 * and (if the array is 2-level) reusing the existing second level arrays.
623 *
624 * flags can be used to specify ANON_NOSLEEP and ANON_GROWDOWN.
625 *
626 * Returns the new number of pages in the anon array.
627 */
628 pgcnt_t
629 anon_grow(struct anon_hdr *ahp, ulong_t *startidx_p, pgcnt_t oldseg_pgs,
630 pgcnt_t newseg_pgs, int flags)
631 {
632 ulong_t startidx = startidx_p ? *startidx_p : 0;
633 pgcnt_t oldamp_pgs = ahp->size, newamp_pgs;
634 pgcnt_t oelems, nelems, totpages;
635 void **level1;
636 int kmemflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;
637 int growdown = (flags & ANON_GROWDOWN);
638 size_t newarrsz, oldarrsz;
639 void *level2;
640
641 ASSERT(!(startidx_p == NULL && growdown));
642 ASSERT(startidx + oldseg_pgs <= ahp->size);
643
644 /*
645 * Determine the total number of pages needed in the new
646 * anon array. If growing down, totpages is all pages from
647 * startidx through the end of the array, plus <newseg_pgs>
648 * pages. If growing up, keep all pages from page 0 through
649 * the last page currently in use, plus <newseg_pgs> pages.
650 */
651 if (growdown)
652 totpages = oldamp_pgs - startidx + newseg_pgs;
653 else
654 totpages = startidx + oldseg_pgs + newseg_pgs;
655
656 /* If the array is already large enough, just return. */
657
658 if (oldamp_pgs >= totpages) {
659 if (growdown)
660 *startidx_p = oldamp_pgs - totpages;
661 return (oldamp_pgs);
662 }
663
664 /*
665 * oldamp_pgs/newamp_pgs are the total numbers of pages represented
666 * by the corresponding arrays.
667 * oelems/nelems are the number of pointers in the top level arrays
668 * which may be either level 1 or level 2.
669 * Will the new anon array be one level or two levels?
670 */
671 if (totpages <= ANON_CHUNK_SIZE || (ahp->flags & ANON_ALLOC_FORCE)) {
672 newamp_pgs = P2ROUNDUP(totpages, ANON_1_LEVEL_INC);
673 oelems = oldamp_pgs;
674 nelems = newamp_pgs;
675 } else {
676 newamp_pgs = P2ROUNDUP(totpages, ANON_2_LEVEL_INC);
677 oelems = (oldamp_pgs + ANON_CHUNK_OFF) >> ANON_CHUNK_SHIFT;
678 nelems = newamp_pgs >> ANON_CHUNK_SHIFT;
679 }
680
681 newarrsz = nelems * sizeof (void *);
682 level1 = kmem_alloc(newarrsz, kmemflags);
683 if (level1 == NULL)
684 return (0);
685
686 /* Are we converting from a one level to a two level anon array? */
687
688 if (newamp_pgs > ANON_CHUNK_SIZE && oldamp_pgs <= ANON_CHUNK_SIZE &&
689 !(ahp->flags & ANON_ALLOC_FORCE)) {
690
691 /*
692 * Yes, we're converting to a two level. Reuse old level 1
693 * as new level 2 if it is exactly PAGESIZE. Otherwise
694 * alloc a new level 2 and copy the old level 1 data into it.
695 */
696 if (oldamp_pgs == ANON_CHUNK_SIZE) {
697 level2 = (void *)ahp->array_chunk;
698 } else {
699 level2 = kmem_alloc(PAGESIZE, kmemflags);
700 if (level2 == NULL) {
701 kmem_free(level1, newarrsz);
702 return (0);
703 }
704 oldarrsz = oldamp_pgs * sizeof (void *);
705
706 ANON_INITBUF(ahp->array_chunk, oldarrsz,
707 level2, PAGESIZE, growdown);
708 kmem_free(ahp->array_chunk, oldarrsz);
709 }
710 bzero(level1, newarrsz);
711 if (growdown)
712 level1[nelems - 1] = level2;
713 else
714 level1[0] = level2;
715 } else {
716 oldarrsz = oelems * sizeof (void *);
717
718 ANON_INITBUF(ahp->array_chunk, oldarrsz,
719 level1, newarrsz, growdown);
720 kmem_free(ahp->array_chunk, oldarrsz);
721 }
722
723 ahp->array_chunk = level1;
724 ahp->size = newamp_pgs;
725 if (growdown)
726 *startidx_p = newamp_pgs - totpages;
727
728 return (newamp_pgs);
729 }
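/*
 * Worked example with assumed values: growing down from oldamp_pgs == 8
 * with startidx == 2 and newseg_pgs == 4 gives totpages == 8 - 2 + 4 == 10.
 * Staying one level, newamp_pgs is 10 rounded up to ANON_1_LEVEL_INC, the
 * old entries are tail aligned into the new level 1 array (ANON_INITBUF
 * with tail set), and *startidx_p becomes newamp_pgs - 10, preserving each
 * in-use slot's distance from the end of the array.
 */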
730
731
732 /*
733 * Called to sync ani_free value.
734 */
735
736 void
737 set_anoninfo(void)
738 {
739 processorid_t ix, max_seqid;
740 pgcnt_t total = 0;
741 static clock_t last_time;
742 clock_t new_time;
743
744 if (ani_free_pool == NULL)
745 return;
746
747 /*
748 * Recompute ani_free at most once per tick. Use max_cpu_seqid_ever to
749 * identify the maximum number of CPUs that were ever online.
750 */
751 new_time = ddi_get_lbolt();
752 if (new_time > last_time) {
753
754 max_seqid = max_cpu_seqid_ever;
755 ASSERT(ANI_MAX_POOL > max_seqid);
756 for (ix = 0; ix <= max_seqid; ix++)
757 total += ani_free_pool[ix].ani_count;
758
759 last_time = new_time;
760 k_anoninfo.ani_free = total;
761 }
762 }
763
764 /*
765 * Reserve anon space.
766 *
767 * It's no longer simply a matter of incrementing ani_resv to
768 * reserve swap space; we need to check memory-based as well
769 * as disk-backed (physical) swap. The following algorithm
770 * is used:
771 * Check the space on physical swap
772 * i.e. amount needed < ani_max - ani_phys_resv
773 * If we are swapping on swapfs check
774 * amount needed < (availrmem - swapfs_minfree)
775 * Since the algorithm to check for the quantity of swap space is
776 * almost the same as that for reserving it, we'll just use anon_resvmem
777 * with a flag to decrement availrmem.
778 *
779 * Return non-zero on success.
780 */
781 int
782 anon_resvmem(size_t size, boolean_t takemem, zone_t *zone, int tryhard)
783 {
784 pgcnt_t npages = btopr(size);
785 pgcnt_t mswap_pages = 0;
786 pgcnt_t pswap_pages = 0;
787 proc_t *p = curproc;
788
789 if (zone != NULL) {
790 /* test zone.max-swap resource control */
791 mutex_enter(&p->p_lock);
792 if (rctl_incr_swap(p, zone, ptob(npages)) != 0) {
793 mutex_exit(&p->p_lock);
794
795 if (takemem)
796 atomic_add_64(&zone->zone_anon_alloc_fail, 1);
797
798 return (0);
799 }
800
801 if (!takemem)
802 rctl_decr_swap(zone, ptob(npages));
803
804 mutex_exit(&p->p_lock);
805 }
806 mutex_enter(&anoninfo_lock);
807
808 /*
809 * pswap_pages is the number of pages we can take from
810 * physical (i.e. disk-backed) swap.
811 */
812 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
813 pswap_pages = k_anoninfo.ani_max - k_anoninfo.ani_phys_resv;
814
815 ANON_PRINT(A_RESV,
816 ("anon_resvmem: npages %lu takemem %u pswap %lu caller %p\n",
817 npages, takemem, pswap_pages, (void *)caller()));
818
819 if (npages <= pswap_pages) {
820 /*
821 * we have enough space on a physical swap
822 */
823 if (takemem)
824 k_anoninfo.ani_phys_resv += npages;
825 mutex_exit(&anoninfo_lock);
826 return (1);
827 } else if (pswap_pages != 0) {
828 /*
829 * we have some space on a physical swap
830 */
831 if (takemem) {
832 /*
833 * use up remainder of phys swap
834 */
835 k_anoninfo.ani_phys_resv += pswap_pages;
836 ASSERT(k_anoninfo.ani_phys_resv == k_anoninfo.ani_max);
837 }
838 }
839 /*
840 * since (npages > pswap_pages) we need mem swap
841 * mswap_pages is the number of pages needed from availrmem
842 */
843 ASSERT(npages > pswap_pages);
844 mswap_pages = npages - pswap_pages;
845
846 ANON_PRINT(A_RESV, ("anon_resvmem: need %ld pages from memory\n",
847 mswap_pages));
848
849 /*
850 * priv processes can reserve memory as swap as long as availrmem
851 * remains greater than swapfs_minfree; in the case of non-priv
852 * processes, memory can be reserved as swap only if availrmem
853 * doesn't fall below (swapfs_minfree + swapfs_reserve). Thus,
854 * swapfs_reserve amount of memswap is not available to non-priv
855 * processes. This protects daemons such as the automounter from
856 * dying as a result of application processes eating away almost
857 * the entire memory-based swap. This safeguard becomes useless if
858 * apps are run with root access.
859 *
860 * swapfs_reserve is the minimum of 4MB or 1/16 of physmem.
861 *
862 */
863 if (tryhard) {
864 pgcnt_t floor_pages;
865
866 if (secpolicy_resource_anon_mem(CRED())) {
867 floor_pages = swapfs_minfree;
868 } else {
869 floor_pages = swapfs_minfree + swapfs_reserve;
870 }
871
872 mutex_exit(&anoninfo_lock);
873 (void) page_reclaim_mem(mswap_pages, floor_pages, 0);
874 mutex_enter(&anoninfo_lock);
875 }
876
877 mutex_enter(&freemem_lock);
878 if (availrmem > (swapfs_minfree + swapfs_reserve + mswap_pages) ||
879 (availrmem > (swapfs_minfree + mswap_pages) &&
880 secpolicy_resource(CRED()) == 0)) {
881
882 if (takemem) {
883 /*
884 * Take the memory from the rest of the system.
885 */
886 availrmem -= mswap_pages;
887 mutex_exit(&freemem_lock);
888 k_anoninfo.ani_mem_resv += mswap_pages;
889 ANI_ADD(mswap_pages);
890 ANON_PRINT((A_RESV | A_MRESV),
891 ("anon_resvmem: took %ld pages of availrmem\n",
892 mswap_pages));
893 } else {
894 mutex_exit(&freemem_lock);
895 }
896
897 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
898 mutex_exit(&anoninfo_lock);
899 return (1);
900 } else {
901 /*
902 * Fail if not enough memory
903 */
904 if (takemem) {
905 k_anoninfo.ani_phys_resv -= pswap_pages;
906 }
907
908 mutex_exit(&freemem_lock);
909 mutex_exit(&anoninfo_lock);
910 ANON_PRINT(A_RESV,
911 ("anon_resvmem: not enough space from swapfs\n"));
912 if (zone != NULL && takemem)
913 rctl_decr_swap(zone, ptob(npages));
914 return (0);
915 }
916 }
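/*
 * Illustrative caller sketch (names assumed, not from this file): a
 * segment driver reserving swap for a private mapping, and releasing it
 * again on unmap, might do:
 *
 *	if (anon_resvmem(seg_size, B_TRUE, zone, 0) == 0)
 *		return (EAGAIN);
 *	...
 *	anon_unresvmem(seg_size, zone);
 */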
917
918 /*
919 * Give back an anon reservation.
920 */
921 void
922 anon_unresvmem(size_t size, zone_t *zone)
923 {
924 pgcnt_t npages = btopr(size);
925 spgcnt_t mem_free_pages = 0;
926 pgcnt_t phys_free_slots;
927 #ifdef ANON_DEBUG
928 pgcnt_t mem_resv;
929 #endif
930 if (zone != NULL)
931 rctl_decr_swap(zone, ptob(npages));
932
933 mutex_enter(&anoninfo_lock);
934
935 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
936
937 /*
938 * If some of this reservation belonged to swapfs,
939 * give it back to availrmem.
940 * ani_mem_resv is the amount of availrmem swapfs has reserved,
941 * but some of that memory could be locked by segspt, so we can only
942 * return the non-locked portion of ani_mem_resv back to availrmem.
943 */
944 if (k_anoninfo.ani_mem_resv > k_anoninfo.ani_locked_swap) {
945 ANON_PRINT((A_RESV | A_MRESV),
946 ("anon_unresv: growing availrmem by %ld pages\n",
947 MIN(k_anoninfo.ani_mem_resv, npages)));
948
949 mem_free_pages = MIN((spgcnt_t)(k_anoninfo.ani_mem_resv -
950 k_anoninfo.ani_locked_swap), npages);
951 mutex_enter(&freemem_lock);
952 availrmem += mem_free_pages;
953 mutex_exit(&freemem_lock);
954 k_anoninfo.ani_mem_resv -= mem_free_pages;
955
956 ANI_ADD(-mem_free_pages);
957 }
958 /*
959 * The remainder of the pages is returned to phys swap
960 */
961 ASSERT(npages >= mem_free_pages);
962 phys_free_slots = npages - mem_free_pages;
963
964 if (phys_free_slots) {
965 k_anoninfo.ani_phys_resv -= phys_free_slots;
966 }
967
968 #ifdef ANON_DEBUG
969 mem_resv = k_anoninfo.ani_mem_resv;
970 #endif
971
972 ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
973 ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);
974
975 mutex_exit(&anoninfo_lock);
976
977 ANON_PRINT(A_RESV, ("anon_unresv: %lu, tot %lu, caller %p\n",
978 npages, mem_resv, (void *)caller()));
979 }
980
981 /*
982 * Allocate an anon slot and return it with the lock held.
983 */
984 struct anon *
985 anon_alloc(struct vnode *vp, anoff_t off)
986 {
987 struct anon *ap;
988 kmutex_t *ahm;
989
990 ap = kmem_cache_alloc(anon_cache, KM_SLEEP);
991 if (vp == NULL) {
992 swap_alloc(ap);
993 } else {
994 ap->an_vp = vp;
995 ap->an_off = off;
996 }
997 ap->an_refcnt = 1;
998 ap->an_pvp = NULL;
999 ap->an_poff = 0;
1000 ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1001 mutex_enter(ahm);
1002 anon_addhash(ap);
1003 mutex_exit(ahm);
1004 ANI_ADD(-1);
1005 ANON_PRINT(A_ANON, ("anon_alloc: returning ap %p, vp %p\n",
1006 (void *)ap, (ap ? (void *)ap->an_vp : NULL)));
1007 return (ap);
1008 }
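/*
 * Sketch of the common allocation pattern (illustrative): callers that
 * want a fresh slot whose identity is assigned by swap_alloc() pass a
 * NULL vnode, and later drop their reference with anon_decref():
 *
 *	struct anon *ap = anon_alloc(NULL, 0);
 *	...
 *	anon_decref(ap);
 */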
1009
1010 /*
1011 * Called for pages locked in memory via softlock/pagelock/mlock to make sure
1012 * such pages don't consume any physical swap resources needed for swapping
1013 * unlocked pages.
1014 */
1015 void
1016 anon_swap_free(struct anon *ap, page_t *pp)
1017 {
1018 kmutex_t *ahm;
1019
1020 ASSERT(ap != NULL);
1021 ASSERT(pp != NULL);
1022 ASSERT(PAGE_LOCKED(pp));
1023 ASSERT(pp->p_vnode != NULL);
1024 ASSERT(IS_SWAPFSVP(pp->p_vnode));
1025 ASSERT(ap->an_refcnt != 0);
1026 ASSERT(pp->p_vnode == ap->an_vp);
1027 ASSERT(pp->p_offset == ap->an_off);
1028
1029 if (ap->an_pvp == NULL)
1030 return;
1031
1032 page_io_lock(pp);
1033 ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1034 mutex_enter(ahm);
1035
1036 ASSERT(ap->an_refcnt != 0);
1037 ASSERT(pp->p_vnode == ap->an_vp);
1038 ASSERT(pp->p_offset == ap->an_off);
1039
1040 if (ap->an_pvp != NULL) {
1041 swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE);
1042 ap->an_pvp = NULL;
1043 ap->an_poff = 0;
1044 mutex_exit(ahm);
1045 hat_setmod(pp);
1046 } else {
1047 mutex_exit(ahm);
1048 }
1049 page_io_unlock(pp);
1050 }
1051
1052 /*
1053 * Decrement the reference count of an anon page.
1054 * If reference count goes to zero, free it and
1055 * its associated page (if any).
1056 */
1057 void
1058 anon_decref(struct anon *ap)
1059 {
1060 page_t *pp;
1061 struct vnode *vp;
1062 anoff_t off;
1063 kmutex_t *ahm;
1064
1065 ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1066 mutex_enter(ahm);
1067 ASSERT(ap->an_refcnt != 0);
1068 if (ap->an_refcnt == 0)
1069 panic("anon_decref: slot count 0");
1070 if (--ap->an_refcnt == 0) {
1071 swap_xlate(ap, &vp, &off);
1072 anon_rmhash(ap);
1073 if (ap->an_pvp != NULL)
1074 swap_phys_free(ap->an_pvp, ap->an_poff, PAGESIZE);
1075 mutex_exit(ahm);
1076
1077 /*
1078 * If there is a page for this anon slot we will need to
1079 * call VN_DISPOSE to get rid of the vp association and
1080 * put the page back on the free list as really free.
1081 * Acquire the "exclusive" lock to ensure that any
1082 * pending i/o always completes before the swap slot
1083 * is freed.
1084 */
1085 pp = page_lookup(vp, (u_offset_t)off, SE_EXCL);
1086 if (pp != NULL) {
1087 /*LINTED: constant in conditional context */
1088 VN_DISPOSE(pp, B_INVAL, 0, kcred);
1089 }
1090 ANON_PRINT(A_ANON, ("anon_decref: free ap %p, vp %p\n",
1091 (void *)ap, (void *)ap->an_vp));
1092
1093 kmem_cache_free(anon_cache, ap);
1094
1095 ANI_ADD(1);
1096 } else {
1097 mutex_exit(ahm);
1098 }
1099 }
1100
1101
1102 /*
1103 * Check an_refcnt of the root anon slot (anon_index argument is aligned at
1104 * seg->s_szc level) to determine whether COW processing is required.
1105 * anonpages_hash_lock[] held on the root ap ensures that if root's
1106 * refcnt is 1 all other refcnt's are 1 as well (and they can't increase
1107 * later since this process can't fork while its AS lock is held).
1108 *
1109 * Returns 1 if the root anon slot has a refcnt > 1; otherwise returns 0.
1110 */
1111 int
1112 anon_szcshare(struct anon_hdr *ahp, ulong_t anon_index)
1113 {
1114 struct anon *ap;
1115 kmutex_t *ahmpages = NULL;
1116
1117 ap = anon_get_ptr(ahp, anon_index);
1118 if (ap == NULL)
1119 return (0);
1120
1121 ahmpages = APH_MUTEX(ap->an_vp, ap->an_off);
1122 mutex_enter(ahmpages);
1123 ASSERT(ap->an_refcnt >= 1);
1124 if (ap->an_refcnt == 1) {
1125 mutex_exit(ahmpages);
1126 return (0);
1127 }
1128 mutex_exit(ahmpages);
1129 return (1);
1130 }
1131 /*
1132 * Check 'nslots' anon slots for refcnt > 1.
1133 *
1134 * Returns 1 if any of the 'nslots' anon slots has a refcnt > 1; otherwise
1135 * returns 0.
1136 */
1137 static int
1138 anon_share(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots)
1139 {
1140 struct anon *ap;
1141
1142 while (nslots-- > 0) {
1143 if ((ap = anon_get_ptr(ahp, anon_index)) != NULL &&
1144 ap->an_refcnt > 1)
1145 return (1);
1146 anon_index++;
1147 }
1148
1149 return (0);
1150 }
1151
1152 static void
1153 anon_decref_pages(
1154 struct anon_hdr *ahp,
1155 ulong_t an_idx,
1156 uint_t szc)
1157 {
1158 struct anon *ap = anon_get_ptr(ahp, an_idx);
1159 kmutex_t *ahmpages = NULL;
1160 page_t *pp;
1161 pgcnt_t pgcnt = page_get_pagecnt(szc);
1162 pgcnt_t i;
1163 struct vnode *vp;
1164 anoff_t off;
1165 kmutex_t *ahm;
1166 #ifdef DEBUG
1167 int refcnt = 1;
1168 #endif
1169
1170 ASSERT(szc != 0);
1171 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
1172 ASSERT(IS_P2ALIGNED(an_idx, pgcnt));
1173 ASSERT(an_idx < ahp->size);
1174
1175 if (ahp->size - an_idx < pgcnt) {
1176 /*
1177 * In the case of shared mappings, the total anon map size may
1178 * not be aligned to the largest page size.
1179 */
1180 pgcnt = ahp->size - an_idx;
1181 }
1182
1183 VM_STAT_ADD(anonvmstats.decrefpages[0]);
1184
1185 if (ap != NULL) {
1186 ahmpages = APH_MUTEX(ap->an_vp, ap->an_off);
1187 mutex_enter(ahmpages);
1188 ASSERT((refcnt = ap->an_refcnt) != 0);
1189 VM_STAT_ADD(anonvmstats.decrefpages[1]);
1190 if (ap->an_refcnt == 1) {
1191 VM_STAT_ADD(anonvmstats.decrefpages[2]);
1192 ASSERT(!anon_share(ahp, an_idx, pgcnt));
1193 mutex_exit(ahmpages);
1194 ahmpages = NULL;
1195 }
1196 }
1197
1198 i = 0;
1199 while (i < pgcnt) {
1200 if ((ap = anon_get_ptr(ahp, an_idx + i)) == NULL) {
1201 ASSERT(refcnt == 1 && ahmpages == NULL);
1202 i++;
1203 continue;
1204 }
1205 ASSERT(ap->an_refcnt == refcnt);
1206 ASSERT(ahmpages != NULL || ap->an_refcnt == 1);
1207 ASSERT(ahmpages == NULL || ap->an_refcnt > 1);
1208
1209 if (ahmpages == NULL) {
1210 swap_xlate(ap, &vp, &off);
1211 pp = page_lookup(vp, (u_offset_t)off, SE_EXCL);
1212 if (pp == NULL || pp->p_szc == 0) {
1213 VM_STAT_ADD(anonvmstats.decrefpages[3]);
1214 ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1215 (void) anon_set_ptr(ahp, an_idx + i, NULL,
1216 ANON_SLEEP);
1217 mutex_enter(ahm);
1218 ap->an_refcnt--;
1219 ASSERT(ap->an_refcnt == 0);
1220 anon_rmhash(ap);
1221 if (ap->an_pvp)
1222 swap_phys_free(ap->an_pvp, ap->an_poff,
1223 PAGESIZE);
1224 mutex_exit(ahm);
1225 if (pp == NULL) {
1226 pp = page_lookup(vp, (u_offset_t)off,
1227 SE_EXCL);
1228 ASSERT(pp == NULL || pp->p_szc == 0);
1229 }
1230 if (pp != NULL) {
1231 VM_STAT_ADD(anonvmstats.decrefpages[4]);
1232 /*LINTED*/
1233 VN_DISPOSE(pp, B_INVAL, 0, kcred);
1234 }
1235 kmem_cache_free(anon_cache, ap);
1236 ANI_ADD(1);
1237 i++;
1238 } else {
1239 pgcnt_t j;
1240 pgcnt_t curpgcnt =
1241 page_get_pagecnt(pp->p_szc);
1242 size_t ppasize = curpgcnt * sizeof (page_t *);
1243 page_t **ppa = kmem_alloc(ppasize, KM_SLEEP);
1244 int dispose = 0;
1245
1246 VM_STAT_ADD(anonvmstats.decrefpages[5]);
1247
1248 ASSERT(pp->p_szc <= szc);
1249 ASSERT(IS_P2ALIGNED(curpgcnt, curpgcnt));
1250 ASSERT(IS_P2ALIGNED(i, curpgcnt));
1251 ASSERT(i + curpgcnt <= pgcnt);
1252 ASSERT(!(page_pptonum(pp) & (curpgcnt - 1)));
1253 ppa[0] = pp;
1254 for (j = i + 1; j < i + curpgcnt; j++) {
1255 ap = anon_get_ptr(ahp, an_idx + j);
1256 ASSERT(ap != NULL &&
1257 ap->an_refcnt == 1);
1258 swap_xlate(ap, &vp, &off);
1259 pp = page_lookup(vp, (u_offset_t)off,
1260 SE_EXCL);
1261 if (pp == NULL)
1262 panic("anon_decref_pages: "
1263 "no page");
1264
1265 (void) hat_pageunload(pp,
1266 HAT_FORCE_PGUNLOAD);
1267 ASSERT(pp->p_szc == ppa[0]->p_szc);
1268 ASSERT(page_pptonum(pp) - 1 ==
1269 page_pptonum(ppa[j - i - 1]));
1270 ppa[j - i] = pp;
1271 if (ap->an_pvp != NULL &&
1272 !vn_matchopval(ap->an_pvp,
1273 VOPNAME_DISPOSE,
1274 (fs_generic_func_p)(uintptr_t)
1275 fs_dispose))
1276 dispose = 1;
1277 }
1278 for (j = i; j < i + curpgcnt; j++) {
1279 ap = anon_get_ptr(ahp, an_idx + j);
1280 ASSERT(ap != NULL &&
1281 ap->an_refcnt == 1);
1282 ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1283 (void) anon_set_ptr(ahp, an_idx + j,
1284 NULL, ANON_SLEEP);
1285 mutex_enter(ahm);
1286 ap->an_refcnt--;
1287 ASSERT(ap->an_refcnt == 0);
1288 anon_rmhash(ap);
1289 if (ap->an_pvp)
1290 swap_phys_free(ap->an_pvp,
1291 ap->an_poff, PAGESIZE);
1292 mutex_exit(ahm);
1293 kmem_cache_free(anon_cache, ap);
1294 ANI_ADD(1);
1295 }
1296 if (!dispose) {
1297 VM_STAT_ADD(anonvmstats.decrefpages[6]);
1298 page_destroy_pages(ppa[0]);
1299 } else {
1300 VM_STAT_ADD(anonvmstats.decrefpages[7]);
1301 for (j = 0; j < curpgcnt; j++) {
1302 ASSERT(PAGE_EXCL(ppa[j]));
1303 ppa[j]->p_szc = 0;
1304 }
1305 for (j = 0; j < curpgcnt; j++) {
1306 ASSERT(!hat_page_is_mapped(
1307 ppa[j]));
1308 /*LINTED*/
1309 VN_DISPOSE(ppa[j], B_INVAL, 0,
1310 kcred);
1311 }
1312 }
1313 kmem_free(ppa, ppasize);
1314 i += curpgcnt;
1315 }
1316 } else {
1317 VM_STAT_ADD(anonvmstats.decrefpages[8]);
1318 (void) anon_set_ptr(ahp, an_idx + i, NULL, ANON_SLEEP);
1319 ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1320 mutex_enter(ahm);
1321 ap->an_refcnt--;
1322 mutex_exit(ahm);
1323 i++;
1324 }
1325 }
1326
1327 if (ahmpages != NULL) {
1328 mutex_exit(ahmpages);
1329 }
1330 }
1331
1332 /*
1333 * Duplicate references to size bytes worth of anon pages.
1334 * Used when duplicating a segment that contains private anon pages.
1335 * This code assumes that the procedure calling this one has already used
1336 * hat_chgprot() to disable write access to the range of addresses
1337 * that *old actually refers to.
1338 */
1339 void
1340 anon_dup(struct anon_hdr *old, ulong_t old_idx, struct anon_hdr *new,
1341 ulong_t new_idx, size_t size)
1342 {
1343 spgcnt_t npages;
1344 kmutex_t *ahm;
1345 struct anon *ap;
1346 ulong_t off;
1347 ulong_t index;
1348
1349 npages = btopr(size);
1350 while (npages > 0) {
1351 index = old_idx;
1352 if ((ap = anon_get_next_ptr(old, &index)) == NULL)
1353 break;
1354
1355 ASSERT(!ANON_ISBUSY(anon_get_slot(old, index)));
1356 off = index - old_idx;
1357 npages -= off;
1358 if (npages <= 0)
1359 break;
1360
1361 (void) anon_set_ptr(new, new_idx + off, ap, ANON_SLEEP);
1362 ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1363
1364 mutex_enter(ahm);
1365 ap->an_refcnt++;
1366 mutex_exit(ahm);
1367
1368 off++;
1369 new_idx += off;
1370 old_idx += off;
1371 npages--;
1372 }
1373 }
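/*
 * Illustrative fork-time sketch (caller names assumed): after
 * write-protecting the parent's mappings with hat_chgprot(), a segment
 * driver shares the parent's anon slots with the child, so that later
 * write faults break COW via anon_private():
 *
 *	anon_dup(pamp->ahp, pidx, camp->ahp, cidx, seg_size);
 */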
1374
1375 /*
1376 * Just like anon_dup but also guarantees there are no holes (unallocated anon
1377 * slots) within any large page region. That means if a large page region is
1378 * empty in the old array it will skip it. If there are one or more valid
1379 * slots in the large page region of the old array it will make sure to
1380 * fill in any unallocated ones and also copy them to the new array. If
1381 * noalloc is 1, each large page region should either have no valid anon
1382 * slots or all slots should be valid.
1383 */
1384 void
1385 anon_dup_fill_holes(
1386 struct anon_hdr *old,
1387 ulong_t old_idx,
1388 struct anon_hdr *new,
1389 ulong_t new_idx,
1390 size_t size,
1391 uint_t szc,
1392 int noalloc)
1393 {
1394 struct anon *ap;
1395 spgcnt_t npages;
1396 kmutex_t *ahm, *ahmpages = NULL;
1397 pgcnt_t pgcnt, i;
1398 ulong_t index, off;
1399 #ifdef DEBUG
1400 int refcnt;
1401 #endif
1402
1403 ASSERT(szc != 0);
1404 pgcnt = page_get_pagecnt(szc);
1405 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
1406 npages = btopr(size);
1407 ASSERT(IS_P2ALIGNED(npages, pgcnt));
1408 ASSERT(IS_P2ALIGNED(old_idx, pgcnt));
1409
1410 VM_STAT_ADD(anonvmstats.dupfillholes[0]);
1411
1412 while (npages > 0) {
1413 index = old_idx;
1414
1415 /*
1416 * Find the next valid slot.
1417 */
1418 if (anon_get_next_ptr(old, &index) == NULL)
1419 break;
1420
1421 ASSERT(!ANON_ISBUSY(anon_get_slot(old, index)));
1422 /*
1423 * Now backup index to the beginning of the
1424 * current large page region of the old array.
1425 */
1426 index = P2ALIGN(index, pgcnt);
1427 off = index - old_idx;
1428 ASSERT(IS_P2ALIGNED(off, pgcnt));
1429 npages -= off;
1430 if (npages <= 0)
1431 break;
1432
1433 /*
1434 * Fill and copy a large page regions worth
1435 * of anon slots.
1436 */
1437 for (i = 0; i < pgcnt; i++) {
1438 if ((ap = anon_get_ptr(old, index + i)) == NULL) {
1439 if (noalloc) {
1440 panic("anon_dup_fill_holes: "
1441 "empty anon slot\n");
1442 }
1443 VM_STAT_ADD(anonvmstats.dupfillholes[1]);
1444 ap = anon_alloc(NULL, 0);
1445 (void) anon_set_ptr(old, index + i, ap,
1446 ANON_SLEEP);
1447 } else if (i == 0) {
1448 /*
1449 * make the increment of all refcnts of all
1450 * anon slots of a large page appear atomic by
1451 * getting an anonpages_hash_lock for the
1452 * first anon slot of a large page.
1453 */
1454 VM_STAT_ADD(anonvmstats.dupfillholes[2]);
1455
1456 ahmpages = APH_MUTEX(ap->an_vp, ap->an_off);
1457 mutex_enter(ahmpages);
1458 /*LINTED*/
1459 ASSERT(refcnt = ap->an_refcnt);
1460
1461 VM_STAT_COND_ADD(ap->an_refcnt > 1,
1462 anonvmstats.dupfillholes[3]);
1463 }
1464 (void) anon_set_ptr(new, new_idx + off + i, ap,
1465 ANON_SLEEP);
1466 ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1467 mutex_enter(ahm);
1468 ASSERT(ahmpages != NULL || ap->an_refcnt == 1);
1469 ASSERT(i == 0 || ahmpages == NULL ||
1470 refcnt == ap->an_refcnt);
1471 ap->an_refcnt++;
1472 mutex_exit(ahm);
1473 }
1474 if (ahmpages != NULL) {
1475 mutex_exit(ahmpages);
1476 ahmpages = NULL;
1477 }
1478 off += pgcnt;
1479 new_idx += off;
1480 old_idx += off;
1481 npages -= pgcnt;
1482 }
1483 }
1484
1485 /*
1486 * Used when a segment with a vnode changes szc. Similarly to
1487 * anon_dup_fill_holes(), this makes sure each large page region either has
1488 * no anon slots or all of them, but new slots are created by COWing the
1489 * file pages. On entry, no anon slots should be shared.
1490 */
1491 int
1492 anon_fill_cow_holes(
1493 struct seg *seg,
1494 caddr_t addr,
1495 struct anon_hdr *ahp,
1496 ulong_t an_idx,
1497 struct vnode *vp,
1498 u_offset_t vp_off,
1499 size_t size,
1500 uint_t szc,
1501 uint_t prot,
1502 struct vpage vpage[],
1503 struct cred *cred)
1504 {
1505 struct anon *ap;
1506 spgcnt_t npages;
1507 pgcnt_t pgcnt, i;
1508 ulong_t index, off;
1509 int err = 0;
1510 int pageflags = 0;
1511
1512 ASSERT(szc != 0);
1513 pgcnt = page_get_pagecnt(szc);
1514 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
1515 npages = btopr(size);
1516 ASSERT(IS_P2ALIGNED(npages, pgcnt));
1517 ASSERT(IS_P2ALIGNED(an_idx, pgcnt));
1518
1519 while (npages > 0) {
1520 index = an_idx;
1521
1522 /*
1523 * Find the next valid slot.
1524 */
1525 if (anon_get_next_ptr(ahp, &index) == NULL) {
1526 break;
1527 }
1528
1529 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index)));
1530 /*
1531 * Now backup index to the beginning of the
1532 * current large page region of the anon array.
1533 */
1534 index = P2ALIGN(index, pgcnt);
1535 off = index - an_idx;
1536 ASSERT(IS_P2ALIGNED(off, pgcnt));
1537 npages -= off;
1538 if (npages <= 0)
1539 break;
1540 an_idx += off;
1541 vp_off += ptob(off);
1542 addr += ptob(off);
1543 if (vpage != NULL) {
1544 vpage += off;
1545 }
1546
1547 for (i = 0; i < pgcnt; i++, an_idx++, vp_off += PAGESIZE) {
1548 if ((ap = anon_get_ptr(ahp, an_idx)) == NULL) {
1549 page_t *pl[1 + 1];
1550 page_t *pp;
1551
1552 err = VOP_GETPAGE(vp, vp_off, PAGESIZE, NULL,
1553 pl, PAGESIZE, seg, addr, S_READ, cred,
1554 NULL);
1555 if (err) {
1556 break;
1557 }
1558 if (vpage != NULL) {
1559 prot = VPP_PROT(vpage);
1560 pageflags = VPP_ISPPLOCK(vpage) ?
1561 LOCK_PAGE : 0;
1562 }
1563 pp = anon_private(&ap, seg, addr, prot, pl[0],
1564 pageflags, cred);
1565 if (pp == NULL) {
1566 err = ENOMEM;
1567 break;
1568 }
1569 (void) anon_set_ptr(ahp, an_idx, ap,
1570 ANON_SLEEP);
1571 page_unlock(pp);
1572 }
1573 ASSERT(ap->an_refcnt == 1);
1574 addr += PAGESIZE;
1575 if (vpage != NULL) {
1576 vpage++;
1577 }
1578 }
1579 npages -= pgcnt;
1580 }
1581
1582 return (err);
1583 }
1584
1585 /*
1586 * Free a group of "size" anon pages, size in bytes,
1587 * and clear out the pointers to the anon entries.
1588 */
1589 void
1590 anon_free(struct anon_hdr *ahp, ulong_t index, size_t size)
1591 {
1592 spgcnt_t npages;
1593 struct anon *ap;
1594 ulong_t old;
1595
1596 npages = btopr(size);
1597
1598 while (npages > 0) {
1599 old = index;
1600 if ((ap = anon_get_next_ptr(ahp, &index)) == NULL)
1601 break;
1602
1603 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index)));
1604 npages -= index - old;
1605 if (npages <= 0)
1606 break;
1607
1608 (void) anon_set_ptr(ahp, index, NULL, ANON_SLEEP);
1609 anon_decref(ap);
1610 /*
1611 * Bump index and decrement page count
1612 */
1613 index++;
1614 npages--;
1615 }
1616 }
1617
1618 void
1619 anon_free_pages(
1620 struct anon_hdr *ahp,
1621 ulong_t an_idx,
1622 size_t size,
1623 uint_t szc)
1624 {
1625 spgcnt_t npages;
1626 pgcnt_t pgcnt;
1627 ulong_t index, off;
1628
1629 ASSERT(szc != 0);
1630 pgcnt = page_get_pagecnt(szc);
1631 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
1632 npages = btopr(size);
1633 ASSERT(IS_P2ALIGNED(npages, pgcnt));
1634 ASSERT(IS_P2ALIGNED(an_idx, pgcnt));
1635 ASSERT(an_idx < ahp->size);
1636
1637 VM_STAT_ADD(anonvmstats.freepages[0]);
1638
1639 while (npages > 0) {
1640 index = an_idx;
1641
1642 /*
1643 * Find the next valid slot.
1644 */
1645 if (anon_get_next_ptr(ahp, &index) == NULL)
1646 break;
1647
1648 ASSERT(!ANON_ISBUSY(anon_get_slot(ahp, index)));
1649 /*
1650 * Now backup index to the beginning of the
1651 * current large page region of the old array.
1652 */
1653 index = P2ALIGN(index, pgcnt);
1654 off = index - an_idx;
1655 ASSERT(IS_P2ALIGNED(off, pgcnt));
1656 npages -= off;
1657 if (npages <= 0)
1658 break;
1659
1660 anon_decref_pages(ahp, index, szc);
1661
1662 off += pgcnt;
1663 an_idx += off;
1664 npages -= pgcnt;
1665 }
1666 }
1667
1668 /*
1669 * Make anonymous pages discardable
1670 */
1671 int
1672 anon_disclaim(struct anon_map *amp, ulong_t index, size_t size,
1673 uint_t behav, pgcnt_t *purged)
1674 {
1675 spgcnt_t npages = btopr(size);
1676 struct anon *ap;
1677 struct vnode *vp;
1678 anoff_t off;
1679 page_t *pp, *root_pp;
1680 kmutex_t *ahm;
1681 pgcnt_t pgcnt, npurged = 0;
1682 ulong_t old_idx, idx, i;
1683 struct anon_hdr *ahp = amp->ahp;
1684 anon_sync_obj_t cookie;
1685 int err = 0;
1686
1687 VERIFY(behav == MADV_FREE || behav == MADV_PURGE);
1688	ASSERT(RW_READ_HELD(&amp->a_rwlock));
1689 pgcnt = 1;
1690 for (; npages > 0; index = (pgcnt == 1) ? index + 1 :
1691 P2ROUNDUP(index + 1, pgcnt), npages -= pgcnt) {
1692
1693 /*
1694 * get anon pointer and index for the first valid entry
1695 * in the anon list, starting from "index"
1696 */
1697 old_idx = index;
1698 if ((ap = anon_get_next_ptr(ahp, &index)) == NULL)
1699 break;
1700
1701 /*
1702 * decrement npages by number of NULL anon slots we skipped
1703 */
1704 npages -= index - old_idx;
1705 if (npages <= 0)
1706 break;
1707
1708 anon_array_enter(amp, index, &cookie);
1709 ap = anon_get_ptr(ahp, index);
1710 ASSERT(ap != NULL);
1711
1712 /*
1713 * Get anonymous page and try to lock it SE_EXCL;
1714 * if we couldn't grab the lock we skip to next page.
1715 */
1716 swap_xlate(ap, &vp, &off);
1717 pp = page_lookup_nowait(vp, (u_offset_t)off, SE_EXCL);
1718 if (pp == NULL) {
1719 segadvstat.MADV_FREE_miss.value.ul++;
1720 pgcnt = 1;
1721 anon_array_exit(&cookie);
1722 continue;
1723 }
1724 pgcnt = page_get_pagecnt(pp->p_szc);
1725
1726 /*
1727 * we cannot free a page which is permanently locked.
1728 * The page_struct_lock need not be acquired to examine
1729 * these fields since the page has an "exclusive" lock.
1730 */
1731 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1732 page_unlock(pp);
1733 segadvstat.MADV_FREE_miss.value.ul++;
1734 anon_array_exit(&cookie);
1735 err = EBUSY;
1736 continue;
1737 }
1738
1739 ahm = AH_MUTEX(vp, off);
1740 mutex_enter(ahm);
1741 ASSERT(ap->an_refcnt != 0);
1742 /*
1743 * skip this one if copy-on-write is not yet broken.
1744 */
1745 if (ap->an_refcnt > 1) {
1746 mutex_exit(ahm);
1747 page_unlock(pp);
1748 segadvstat.MADV_FREE_miss.value.ul++;
1749 anon_array_exit(&cookie);
1750 continue;
1751 }
1752
1753 if (behav == MADV_PURGE && pp->p_szc != 0) {
1754 /*
1755 * If we're purging and we have a large page, simplify
1756 * things a bit by demoting ourselves into the base
1757 * page case.
1758 */
1759 (void) page_try_demote_pages(pp);
1760 }
1761
1762 if (pp->p_szc == 0) {
1763 pgcnt = 1;
1764
1765 /*
1766 * free swap slot;
1767 */
1768 if (ap->an_pvp) {
1769 swap_phys_free(ap->an_pvp, ap->an_poff,
1770 PAGESIZE);
1771 ap->an_pvp = NULL;
1772 ap->an_poff = 0;
1773 }
1774
1775 if (behav == MADV_PURGE) {
1776 /*
1777 * If we're purging (instead of merely freeing),
1778 * rip out this anon structure entirely to
1779 * assure that any subsequent fault pulls from
1780 * the backing vnode (if any).
1781 */
1782 if (--ap->an_refcnt == 0)
1783 anon_rmhash(ap);
1784
1785 mutex_exit(ahm);
1786 (void) anon_set_ptr(ahp, index,
1787 NULL, ANON_SLEEP);
1788 npurged++;
1789 ANI_ADD(1);
1790 kmem_cache_free(anon_cache, ap);
1791 } else {
1792 mutex_exit(ahm);
1793 }
1794
1795 segadvstat.MADV_FREE_hit.value.ul++;
1796
1797 /*
1798 * while we are at it, unload all the translations
1799 * and attempt to free the page.
1800 */
1801 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1802 /*LINTED: constant in conditional context */
1803 VN_DISPOSE(pp,
1804 behav == MADV_FREE ? B_FREE : B_INVAL, 0, kcred);
1805
1806 anon_array_exit(&cookie);
1807 continue;
1808 }
1809
1810 pgcnt = page_get_pagecnt(pp->p_szc);
1811 if (!IS_P2ALIGNED(index, pgcnt) || npages < pgcnt) {
1812 if (!page_try_demote_pages(pp)) {
1813 mutex_exit(ahm);
1814 page_unlock(pp);
1815 segadvstat.MADV_FREE_miss.value.ul++;
1816 anon_array_exit(&cookie);
1817 err = EBUSY;
1818 continue;
1819 } else {
1820 pgcnt = 1;
1821 if (ap->an_pvp) {
1822 swap_phys_free(ap->an_pvp,
1823 ap->an_poff, PAGESIZE);
1824 ap->an_pvp = NULL;
1825 ap->an_poff = 0;
1826 }
1827 mutex_exit(ahm);
1828 (void) hat_pageunload(pp, HAT_FORCE_PGUNLOAD);
1829 /*LINTED*/
1830 VN_DISPOSE(pp, B_FREE, 0, kcred);
1831 segadvstat.MADV_FREE_hit.value.ul++;
1832 anon_array_exit(&cookie);
1833 continue;
1834 }
1835 }
1836 mutex_exit(ahm);
1837 root_pp = pp;
1838
1839 /*
1840 * try to lock remaining pages
1841 */
1842 for (idx = 1; idx < pgcnt; idx++) {
1843 pp++;
1844 if (!page_trylock(pp, SE_EXCL))
1845 break;
1846 if (pp->p_lckcnt != 0 || pp->p_cowcnt != 0) {
1847 page_unlock(pp);
1848 break;
1849 }
1850 }
1851
1852 if (idx == pgcnt) {
1853 for (i = 0; i < pgcnt; i++) {
1854 ap = anon_get_ptr(ahp, index + i);
1855 if (ap == NULL)
1856 break;
1857 swap_xlate(ap, &vp, &off);
1858 ahm = AH_MUTEX(vp, off);
1859 mutex_enter(ahm);
1860 ASSERT(ap->an_refcnt != 0);
1861
1862 /*
1863 * skip this one if copy-on-write
1864 * is not yet broken.
1865 */
1866 if (ap->an_refcnt > 1) {
1867 mutex_exit(ahm);
1868 goto skiplp;
1869 }
1870 if (ap->an_pvp) {
1871 swap_phys_free(ap->an_pvp,
1872 ap->an_poff, PAGESIZE);
1873 ap->an_pvp = NULL;
1874 ap->an_poff = 0;
1875 }
1876 mutex_exit(ahm);
1877 }
1878 page_destroy_pages(root_pp);
1879 segadvstat.MADV_FREE_hit.value.ul += pgcnt;
1880 anon_array_exit(&cookie);
1881 continue;
1882 }
1883 skiplp:
1884 segadvstat.MADV_FREE_miss.value.ul += pgcnt;
1885 for (i = 0, pp = root_pp; i < idx; pp++, i++)
1886 page_unlock(pp);
1887 anon_array_exit(&cookie);
1888 }
1889
1890 if (purged != NULL)
1891 *purged = npurged;
1892
1893 return (err);
1894 }
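/*
 * Illustrative caller sketch (assumed context): an madvise(MADV_PURGE)
 * style caller, holding amp->a_rwlock as reader as asserted above,
 * might invoke:
 *
 *	pgcnt_t purged;
 *	int err = anon_disclaim(amp, an_idx, len, MADV_PURGE, &purged);
 */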
1895
1896 /*
1897 * Return the kept page(s) and protections back to the segment driver.
1898 */
1899 int
1900 anon_getpage(
1901 struct anon **app,
1902 uint_t *protp,
1903 page_t *pl[],
1904 size_t plsz,
1905 struct seg *seg,
1906 caddr_t addr,
1907 enum seg_rw rw,
1908 struct cred *cred)
1909 {
1910 page_t *pp;
1911 struct anon *ap = *app;
1912 struct vnode *vp;
1913 anoff_t off;
1914 int err;
1915 kmutex_t *ahm;
1916
1917 swap_xlate(ap, &vp, &off);
1918
1919 /*
1920 * Lookup the page. If page is being paged in,
1921 * wait for it to finish as we must return a list of
1922 * pages since this routine acts like the VOP_GETPAGE
1923 * routine does.
1924 */
1925 if (pl != NULL && (pp = page_lookup(vp, (u_offset_t)off, SE_SHARED))) {
1926 ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1927 mutex_enter(ahm);
1928 if (ap->an_refcnt == 1)
1929 *protp = PROT_ALL;
1930 else
1931 *protp = PROT_ALL & ~PROT_WRITE;
1932 mutex_exit(ahm);
1933 pl[0] = pp;
1934 pl[1] = NULL;
1935 return (0);
1936 }
1937
1938 /*
1939 * Simply treat it as a vnode fault on the anon vp.
1940 */
1941
1942 TRACE_3(TR_FAC_VM, TR_ANON_GETPAGE,
1943 "anon_getpage:seg %x addr %x vp %x",
1944 seg, addr, vp);
1945
1946 err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, protp, pl, plsz,
1947 seg, addr, rw, cred, NULL);
1948
1949 if (err == 0 && pl != NULL) {
1950 ahm = AH_MUTEX(ap->an_vp, ap->an_off);
1951 mutex_enter(ahm);
1952 if (ap->an_refcnt != 1)
1953 *protp &= ~PROT_WRITE; /* make read-only */
1954 mutex_exit(ahm);
1955 }
1956 return (err);
1957 }
1958
1959 /*
1960 * Creates or returns kept pages to the segment driver. Returns -1 if a large
1961 * page cannot be allocated. Returns -2 if some other process has allocated a
1962 * larger page.
1963 *
1964 * For cowfault it will allocate any size pages to fill the requested area to
1965 * avoid partially overwriting anon slots (i.e. sharing only some of the anon
1966 * slots within a large page with other processes). This policy greatly
1967 * simplifies large page freeing (which is only freed when all anon slot
1968 * refcnts are 0).
1969 */
1970 int
1971 anon_map_getpages(
1972 struct anon_map *amp,
1973 ulong_t start_idx,
1974 uint_t szc,
1975 struct seg *seg,
1976 caddr_t addr,
1977 uint_t prot,
1978 uint_t *protp,
1979 page_t *ppa[],
1980 uint_t *ppa_szc,
1981 struct vpage vpage[],
1982 enum seg_rw rw,
1983 int brkcow,
1984 int anypgsz,
1985 int pgflags,
1986 struct cred *cred)
1987 {
1988 pgcnt_t pgcnt;
1989 struct anon *ap;
1990 struct vnode *vp;
1991 anoff_t off;
1992 page_t *pp, *pl[2], *conpp = NULL;
1993 caddr_t vaddr;
1994 ulong_t pg_idx, an_idx, i;
1995 spgcnt_t nreloc = 0;
1996 int prealloc = 1;
1997 int err, slotcreate;
1998 uint_t vpprot;
1999 int upsize = (szc < seg->s_szc);
2000
2001 #if !defined(__x86)
2002 ASSERT(seg->s_szc != 0);
2003 #endif
2004 ASSERT(szc <= seg->s_szc);
2005 ASSERT(ppa_szc != NULL);
2006 ASSERT(rw != S_CREATE);
2007
2008 *protp = PROT_ALL;
2009
2010 VM_STAT_ADD(anonvmstats.getpages[0]);
2011
2012 if (szc == 0) {
2013 VM_STAT_ADD(anonvmstats.getpages[1]);
2014 if ((ap = anon_get_ptr(amp->ahp, start_idx)) != NULL) {
2015 err = anon_getpage(&ap, protp, pl, PAGESIZE, seg,
2016 addr, rw, cred);
2017 if (err)
2018 return (err);
2019 ppa[0] = pl[0];
2020 if (brkcow == 0 || (*protp & PROT_WRITE)) {
2021 VM_STAT_ADD(anonvmstats.getpages[2]);
2022 if (ppa[0]->p_szc != 0 && upsize) {
2023 VM_STAT_ADD(anonvmstats.getpages[3]);
2024 *ppa_szc = MIN(ppa[0]->p_szc,
2025 seg->s_szc);
2026 page_unlock(ppa[0]);
2027 return (-2);
2028 }
2029 return (0);
2030 }
2031 panic("anon_map_getpages: cowfault for szc 0");
2032 } else {
2033 VM_STAT_ADD(anonvmstats.getpages[4]);
2034 ppa[0] = anon_zero(seg, addr, &ap, cred);
2035 if (ppa[0] == NULL)
2036 return (ENOMEM);
2037 (void) anon_set_ptr(amp->ahp, start_idx, ap,
2038 ANON_SLEEP);
2039 return (0);
2040 }
2041 }
2042
2043 pgcnt = page_get_pagecnt(szc);
2044 ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
2045 ASSERT(IS_P2ALIGNED(start_idx, pgcnt));
2046
2047 /*
2048 * First we check for the case that the requtested large
2049 * page or larger page already exists in the system.
2050 * Actually we only check if the first constituent page
2051 * exists and only preallocate if it's not found.
2052 */
2053 ap = anon_get_ptr(amp->ahp, start_idx);
2054 if (ap) {
2055 uint_t pszc;
2056 swap_xlate(ap, &vp, &off);
2057 if (page_exists_forreal(vp, (u_offset_t)off, &pszc)) {
2058 if (pszc > szc && upsize) {
2059 *ppa_szc = MIN(pszc, seg->s_szc);
2060 return (-2);
2061 }
2062 if (pszc >= szc) {
2063 prealloc = 0;
2064 }
2065 }
2066 }
2067
2068 VM_STAT_COND_ADD(prealloc == 0, anonvmstats.getpages[5]);
2069 VM_STAT_COND_ADD(prealloc != 0, anonvmstats.getpages[6]);
2070
2071 top:
2072 /*
2073 * If a smaller page or no page at all was found,
2074 * grab a large page off the freelist.
2075 */
2076 if (prealloc) {
2077 ASSERT(conpp == NULL);
2078 if (page_alloc_pages(anon_vp, seg, addr, NULL, ppa,
2079 szc, 0, pgflags) != 0) {
2080 VM_STAT_ADD(anonvmstats.getpages[7]);
2081 if (brkcow == 0 || szc < seg->s_szc ||
2082 !anon_szcshare(amp->ahp, start_idx)) {
2083 /*
2084 * If the refcnt's of all anon slots are <= 1
2085 * they can't increase since we are holding
2086 * the address space's lock. So segvn can
2087 * safely decrease szc without risking to
2088 * generate a cow fault for the region smaller
2089 * than the segment's largest page size.
2090 */
2091 VM_STAT_ADD(anonvmstats.getpages[8]);
2092 return (-1);
2093 }
2094 docow:
2095 /*
2096 * This is a cow fault. Copy away the entire 1 large
2097 * page region of this segment.
2098 */
2099 if (szc != seg->s_szc)
2100 panic("anon_map_getpages: cowfault for szc %d",
2101 szc);
2102 vaddr = addr;
2103 for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt;
2104 pg_idx++, an_idx++, vaddr += PAGESIZE) {
2105 if ((ap = anon_get_ptr(amp->ahp, an_idx)) !=
2106 NULL) {
2107 err = anon_getpage(&ap, &vpprot, pl,
2108 PAGESIZE, seg, vaddr, rw, cred);
2109 if (err) {
2110 for (i = 0; i < pg_idx; i++) {
2111 if ((pp = ppa[i]) !=
2112 NULL)
2113 page_unlock(pp);
2114 }
2115 return (err);
2116 }
2117 ppa[pg_idx] = pl[0];
2118 } else {
2119 /*
2120 * Since this is a cowfault we know
2121 * that this address space has a
2122 * parent or children which means
2123 * anon_dup_fill_holes() has initialized
2124 * all anon slots within a large page
2125 * region that had at least one anon
2126 * slot at the time of fork().
2127 */
2128 panic("anon_map_getpages: "
2129 "cowfault but anon slot is empty");
2130 }
2131 }
2132 VM_STAT_ADD(anonvmstats.getpages[9]);
2133 *protp = PROT_ALL;
2134 return (anon_map_privatepages(amp, start_idx, szc, seg,
2135 addr, prot, ppa, vpage, anypgsz, pgflags, cred));
2136 }
2137 }
2138
2139 VM_STAT_ADD(anonvmstats.getpages[10]);
2140
2141 an_idx = start_idx;
2142 pg_idx = 0;
2143 vaddr = addr;
2144 while (pg_idx < pgcnt) {
2145 slotcreate = 0;
2146 if ((ap = anon_get_ptr(amp->ahp, an_idx)) == NULL) {
2147 VM_STAT_ADD(anonvmstats.getpages[11]);
2148 /*
2149 * For us to have decided not to preallocate
2150 * would have meant that a large page
2151 * was found. Which also means that all of the
2152 * anon slots for that page would have been
2153 * already created for us.
2154 */
2155 if (prealloc == 0)
2156 panic("anon_map_getpages: prealloc = 0");
2157
2158 slotcreate = 1;
2159 ap = anon_alloc(NULL, 0);
2160 }
2161 swap_xlate(ap, &vp, &off);
2162
2163 /*
2164 * Now setup our preallocated page to pass down
2165 * to swap_getpage().
2166 */
2167 if (prealloc) {
2168 ASSERT(ppa[pg_idx]->p_szc == szc);
2169 conpp = ppa[pg_idx];
2170 }
2171 ASSERT(prealloc || conpp == NULL);
2172
2173 /*
2174 * If we just created this anon slot then call
2175 * with S_CREATE to prevent doing IO on the page.
2176 * Similar to the anon_zero case.
2177 */
2178 err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE,
2179 NULL, pl, PAGESIZE, conpp, ppa_szc, &nreloc, seg, vaddr,
2180 slotcreate == 1 ? S_CREATE : rw, cred);
2181
2182 if (err) {
2183 ASSERT(err != -2 || upsize);
2184 VM_STAT_ADD(anonvmstats.getpages[12]);
2185 ASSERT(slotcreate == 0);
2186 goto io_err;
2187 }
2188
2189 pp = pl[0];
2190
2191 if (pp->p_szc < szc || (pp->p_szc > szc && upsize)) {
2192 VM_STAT_ADD(anonvmstats.getpages[13]);
2193 ASSERT(slotcreate == 0);
2194 ASSERT(prealloc == 0);
2195 ASSERT(pg_idx == 0);
2196 if (pp->p_szc > szc) {
2197 ASSERT(upsize);
2198 *ppa_szc = MIN(pp->p_szc, seg->s_szc);
2199 page_unlock(pp);
2200 VM_STAT_ADD(anonvmstats.getpages[14]);
2201 return (-2);
2202 }
2203 page_unlock(pp);
2204 prealloc = 1;
2205 goto top;
2206 }
2207
2208 /*
2209 * If we decided to preallocate but VOP_GETPAGE
2210 * found a page in the system that satisfies our
2211 * request then free up our preallocated large page
2212 * and continue looping accross the existing large
2213 * page via VOP_GETPAGE.
2214 */
2215 if (prealloc && pp != ppa[pg_idx]) {
2216 VM_STAT_ADD(anonvmstats.getpages[15]);
2217 ASSERT(slotcreate == 0);
2218 ASSERT(pg_idx == 0);
2219 conpp = NULL;
2220 prealloc = 0;
2221 page_free_pages(ppa[0]);
2222 }
2223
2224 if (prealloc && nreloc > 1) {
2225 /*
2226 * we have relocated out of a smaller large page.
2227 * skip npgs - 1 iterations and continue which will
2228 * increment by one the loop indices.
2229 */
2230 spgcnt_t npgs = nreloc;
2231
2232 VM_STAT_ADD(anonvmstats.getpages[16]);
2233
2234 ASSERT(pp == ppa[pg_idx]);
2235 ASSERT(slotcreate == 0);
2236 ASSERT(pg_idx + npgs <= pgcnt);
2237 if ((*protp & PROT_WRITE) &&
2238 anon_share(amp->ahp, an_idx, npgs)) {
2239 *protp &= ~PROT_WRITE;
2240 }
2241 pg_idx += npgs;
2242 an_idx += npgs;
2243 vaddr += PAGESIZE * npgs;
2244 continue;
2245 }
2246
2247 VM_STAT_ADD(anonvmstats.getpages[17]);
2248
2249 /*
2250 * Anon_zero case.
2251 */
2252 if (slotcreate) {
2253 ASSERT(prealloc);
2254 pagezero(pp, 0, PAGESIZE);
2255 CPU_STATS_ADD_K(vm, zfod, 1);
2256 hat_setrefmod(pp);
2257 }
2258
2259 ASSERT(prealloc == 0 || ppa[pg_idx] == pp);
2260 ASSERT(prealloc != 0 || PAGE_SHARED(pp));
2261 ASSERT(prealloc == 0 || PAGE_EXCL(pp));
2262
2263 if (pg_idx > 0 &&
2264 ((page_pptonum(pp) != page_pptonum(ppa[pg_idx - 1]) + 1) ||
2265 (pp->p_szc != ppa[pg_idx - 1]->p_szc))) {
2266 panic("anon_map_getpages: unexpected page");
2267 } else if (pg_idx == 0 && (page_pptonum(pp) & (pgcnt - 1))) {
2268 panic("anon_map_getpages: unaligned page");
2269 }
2270
2271 if (prealloc == 0) {
2272 ppa[pg_idx] = pp;
2273 }
2274
2275 if (ap->an_refcnt > 1) {
2276 VM_STAT_ADD(anonvmstats.getpages[18]);
2277 *protp &= ~PROT_WRITE;
2278 }
2279
2280 /*
2281 * If this is a new anon slot then initialize
2282 * the anon array entry.
2283 */
2284 if (slotcreate) {
2285 (void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP);
2286 }
2287 pg_idx++;
2288 an_idx++;
2289 vaddr += PAGESIZE;
2290 }
2291
2292 /*
2293 * Since preallocated pages come off the freelist
2294 * they are locked SE_EXCL. Simply downgrade and return.
2295 */
2296 if (prealloc) {
2297 VM_STAT_ADD(anonvmstats.getpages[19]);
2298 conpp = NULL;
2299 for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
2300 page_downgrade(ppa[pg_idx]);
2301 }
2302 }
2303 ASSERT(conpp == NULL);
2304
2305 if (brkcow == 0 || (*protp & PROT_WRITE)) {
2306 VM_STAT_ADD(anonvmstats.getpages[20]);
2307 return (0);
2308 }
2309
2310 if (szc < seg->s_szc)
2311 panic("anon_map_getpages: cowfault for szc %d", szc);
2312
2313 VM_STAT_ADD(anonvmstats.getpages[21]);
2314
2315 *protp = PROT_ALL;
2316 return (anon_map_privatepages(amp, start_idx, szc, seg, addr, prot,
2317 ppa, vpage, anypgsz, pgflags, cred));
2318 io_err:
2319 /*
2320 * We got an IO error somewhere in our large page.
2321 * If we were using a preallocated page then just demote
2322 * all the constituent pages that we've succeeded with sofar
2323 * to PAGESIZE pages and leave them in the system
2324 * unlocked.
2325 */
2326
2327 ASSERT(err != -2 || ((pg_idx == 0) && upsize));
2328
2329 VM_STAT_COND_ADD(err > 0, anonvmstats.getpages[22]);
2330 VM_STAT_COND_ADD(err == -1, anonvmstats.getpages[23]);
2331 VM_STAT_COND_ADD(err == -2, anonvmstats.getpages[24]);
2332
2333 if (prealloc) {
2334 conpp = NULL;
2335 if (pg_idx > 0) {
2336 VM_STAT_ADD(anonvmstats.getpages[25]);
2337 for (i = 0; i < pgcnt; i++) {
2338 pp = ppa[i];
2339 ASSERT(PAGE_EXCL(pp));
2340 ASSERT(pp->p_szc == szc);
2341 pp->p_szc = 0;
2342 }
2343 for (i = 0; i < pg_idx; i++) {
2344 ASSERT(!hat_page_is_mapped(ppa[i]));
2345 page_unlock(ppa[i]);
2346 }
2347 /*
2348 * Now free up the remaining unused constituent
2349 * pages.
2350 */
2351 while (pg_idx < pgcnt) {
2352 ASSERT(!hat_page_is_mapped(ppa[pg_idx]));
2353 page_free(ppa[pg_idx], 0);
2354 pg_idx++;
2355 }
2356 } else {
2357 VM_STAT_ADD(anonvmstats.getpages[26]);
2358 page_free_pages(ppa[0]);
2359 }
2360 } else {
2361 VM_STAT_ADD(anonvmstats.getpages[27]);
2362 ASSERT(err > 0);
2363 for (i = 0; i < pg_idx; i++)
2364 page_unlock(ppa[i]);
2365 }
2366 ASSERT(conpp == NULL);
2367 if (err != -1)
2368 return (err);
2369 /*
2370 * we are here because we failed to relocate.
2371 */
2372 ASSERT(prealloc);
2373 if (brkcow == 0 || szc < seg->s_szc ||
2374 !anon_szcshare(amp->ahp, start_idx)) {
2375 VM_STAT_ADD(anonvmstats.getpages[28]);
2376 return (-1);
2377 }
2378 VM_STAT_ADD(anonvmstats.getpages[29]);
2379 goto docow;
2380 }
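
/*
 * A minimal caller-side sketch (hypothetical, loosely modeled on how
 * segvn consumes the return codes documented above): -1 means no large
 * page could be allocated, so retry with a smaller szc; -2 means a
 * larger page already exists, so retry at the size reported back
 * through ppa_szc. All other names are assumed to come from the
 * caller's fault context.
 *
 *	int res;
 *	uint_t pszc;
 *
 *	res = anon_map_getpages(amp, an_idx, szc, seg, addr, prot,
 *	    &vpprot, ppa, &pszc, vpage, rw, brkcow, anypgsz, 0, cred);
 *	if (res == -1)
 *		szc--;		// demote: retry with a smaller page size
 *	else if (res == -2)
 *		szc = pszc;	// retry at the size that already exists
 */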


/*
 * Turn a reference to an object or shared anon page
 * into a private page with a copy of the data from the
 * original page which is always locked by the caller.
 * This routine unloads the translation and unlocks the
 * original page, if it isn't being stolen, before returning
 * to the caller.
 *
 * NOTE:  The original anon slot is not freed by this routine.
 *	  It must be freed by the caller while holding the
 *	  "anon_map" lock to prevent races which can occur if
 *	  a process has multiple lwps in its address space.
 */
page_t *
anon_private(
	struct anon **app,
	struct seg *seg,
	caddr_t addr,
	uint_t	prot,
	page_t *opp,
	int oppflags,
	struct cred *cred)
{
	struct anon *old = *app;
	struct anon *new;
	page_t *pp = NULL;
	struct vnode *vp;
	anoff_t off;
	page_t *anon_pl[1 + 1];
	int err;

	if (oppflags & STEAL_PAGE)
		ASSERT(PAGE_EXCL(opp));
	else
		ASSERT(PAGE_LOCKED(opp));

	CPU_STATS_ADD_K(vm, cow_fault, 1);

	*app = new = anon_alloc(NULL, 0);
	swap_xlate(new, &vp, &off);

	if (oppflags & STEAL_PAGE) {
		page_rename(opp, vp, (u_offset_t)off);
		pp = opp;
		TRACE_5(TR_FAC_VM, TR_ANON_PRIVATE,
		    "anon_private:seg %p addr %x pp %p vp %p off %lx",
		    seg, addr, pp, vp, off);
		hat_setmod(pp);

		/* bug 4026339 */
		page_downgrade(pp);
		return (pp);
	}

	/*
	 * Call the VOP_GETPAGE routine to create the page, thereby
	 * enabling the vnode driver to allocate any filesystem
	 * space (e.g., disk block allocation for UFS). This also
	 * prevents more than one page from being added to the
	 * vnode at the same time.
	 */
	err = VOP_GETPAGE(vp, (u_offset_t)off, PAGESIZE, NULL,
	    anon_pl, PAGESIZE, seg, addr, S_CREATE, cred, NULL);
	if (err)
		goto out;

	pp = anon_pl[0];

	/*
	 * If the original page was locked, we need to move the lock
	 * to the new page by transferring 'cowcnt/lckcnt' of the original
	 * page to 'cowcnt/lckcnt' of the new page.
	 *
	 * See Statement at the beginning of segvn_lockop() and
	 * comments in page_pp_useclaim() regarding the way
	 * cowcnts/lckcnts are handled.
	 *
	 * Also availrmem must be decremented up front for read only mapping
	 * before calling page_pp_useclaim. page_pp_useclaim will bump it back
	 * if availrmem did not need to be decremented after all.
	 */
	if (oppflags & LOCK_PAGE) {
		if ((prot & PROT_WRITE) == 0) {
			mutex_enter(&freemem_lock);
			if (availrmem > pages_pp_maximum) {
				availrmem--;
				pages_useclaim++;
			} else {
				mutex_exit(&freemem_lock);
				goto out;
			}
			mutex_exit(&freemem_lock);
		}
		page_pp_useclaim(opp, pp, prot & PROT_WRITE);
	}

	/*
	 * Now copy the contents from the original page,
	 * which is locked and loaded in the MMU by
	 * the caller to prevent yet another page fault.
	 */
	/* XXX - should set mod bit in here */
	if (ppcopy(opp, pp) == 0) {
		/*
		 * Before ppcopy could handle UE or other faults, we
		 * would have panicked here, and still have no option
		 * but to do so now.
		 */
		panic("anon_private, ppcopy failed, opp = 0x%p, pp = 0x%p",
		    (void *)opp, (void *)pp);
	}

	hat_setrefmod(pp);		/* mark as modified */

	/*
	 * Unload the old translation.
	 */
	hat_unload(seg->s_as->a_hat, addr, PAGESIZE, HAT_UNLOAD);

	/*
	 * Free the unmapped, unmodified original page,
	 * or release the lock on the original page;
	 * otherwise the process will sleep forever in
	 * anon_decref() waiting for the "exclusive" lock
	 * on the page.
	 */
	(void) page_release(opp, 1);

	/*
	 * We are done with page creation so downgrade the new
	 * page's selock to shared; this helps when multiple
	 * as_fault(...SOFTLOCK...) are done to the same
	 * page (aio).
	 */
	page_downgrade(pp);

	/*
	 * NOTE:  The original anon slot must be freed by the
	 * caller while holding the "anon_map" lock, if we
	 * copied away from an anonymous page.
	 */
	return (pp);

out:
	*app = old;
	if (pp)
		page_unlock(pp);
	anon_decref(new);
	page_unlock(opp);
	return ((page_t *)NULL);
}
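
/*
 * A minimal caller-side sketch (hypothetical; the real callers are the
 * segment drivers' copy-on-write fault paths): privatize a shared anon
 * page and then, per the NOTE above, free the old slot while still
 * holding the "anon_map" lock. "amp", "an_idx", "opp" and the fault
 * context are assumed to come from the caller; oppflags may also be
 * STEAL_PAGE or LOCK_PAGE as handled above.
 *
 *	struct anon *ap = anon_get_ptr(amp->ahp, an_idx);
 *	struct anon *oldap = ap;
 *	page_t *pp;
 *
 *	pp = anon_private(&ap, seg, addr, prot, opp, 0, cred);
 *	if (pp == NULL)
 *		return (ENOMEM);	// opp was unlocked on failure
 *	(void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP);
 *	anon_decref(oldap);		// still under the anon_map lock
 */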

int
anon_map_privatepages(
	struct anon_map *amp,
	ulong_t	start_idx,
	uint_t	szc,
	struct seg *seg,
	caddr_t addr,
	uint_t	prot,
	page_t	*ppa[],
	struct vpage vpage[],
	int anypgsz,
	int pgflags,
	struct cred *cred)
{
	pgcnt_t		pgcnt;
	struct vnode	*vp;
	anoff_t		off;
	page_t		*pl[2], *conpp = NULL;
	int		err;
	int		prealloc = 1;
	struct anon	*ap, *oldap;
	caddr_t		vaddr;
	page_t		*pplist, *pp;
	ulong_t		pg_idx, an_idx;
	spgcnt_t	nreloc = 0;
	int		pagelock = 0;
	kmutex_t	*ahmpages = NULL;
#ifdef DEBUG
	int		refcnt;
#endif

	ASSERT(szc != 0);
	ASSERT(szc == seg->s_szc);

	VM_STAT_ADD(anonvmstats.privatepages[0]);

	pgcnt = page_get_pagecnt(szc);
	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
	ASSERT(IS_P2ALIGNED(start_idx, pgcnt));

	ASSERT(amp != NULL);
	ap = anon_get_ptr(amp->ahp, start_idx);
	ASSERT(ap == NULL || ap->an_refcnt >= 1);

	VM_STAT_COND_ADD(ap == NULL, anonvmstats.privatepages[1]);

	/*
	 * Now try and allocate the large page. If we fail then just
	 * let VOP_GETPAGE give us PAGESIZE pages. Normally we let
	 * the caller make this decision but to avoid added complexity
	 * it's simpler to handle that case here.
	 */
	if (anypgsz == -1) {
		VM_STAT_ADD(anonvmstats.privatepages[2]);
		prealloc = 0;
	} else if (page_alloc_pages(anon_vp, seg, addr, &pplist, NULL, szc,
	    anypgsz, pgflags) != 0) {
		VM_STAT_ADD(anonvmstats.privatepages[3]);
		prealloc = 0;
	}

	/*
	 * Make the decrement of all refcnts of all
	 * anon slots of a large page appear atomic by
	 * getting an anonpages_hash_lock for the
	 * first anon slot of a large page.
	 */
	if (ap != NULL) {
		ahmpages = APH_MUTEX(ap->an_vp, ap->an_off);
		mutex_enter(ahmpages);
		if (ap->an_refcnt == 1) {
			VM_STAT_ADD(anonvmstats.privatepages[4]);
			ASSERT(!anon_share(amp->ahp, start_idx, pgcnt));
			mutex_exit(ahmpages);

			if (prealloc) {
				page_free_replacement_page(pplist);
				page_create_putback(pgcnt);
			}
			ASSERT(ppa[0]->p_szc <= szc);
			if (ppa[0]->p_szc == szc) {
				VM_STAT_ADD(anonvmstats.privatepages[5]);
				return (0);
			}
			for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
				ASSERT(ppa[pg_idx] != NULL);
				page_unlock(ppa[pg_idx]);
			}
			return (-1);
		}
	}

	/*
	 * If we are passed in the vpage array and this is
	 * not PROT_WRITE then we need to decrement availrmem
	 * up front before we try anything. If we need to and
	 * can't decrement availrmem then it's better to fail now
	 * than in the middle of processing the new large page.
	 * page_pp_useclaim() on behalf of each constituent page
	 * below will adjust availrmem back for the cases not needed.
	 */
	if (vpage != NULL && (prot & PROT_WRITE) == 0) {
		for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
			if (VPP_ISPPLOCK(&vpage[pg_idx])) {
				pagelock = 1;
				break;
			}
		}
		if (pagelock) {
			VM_STAT_ADD(anonvmstats.privatepages[6]);
			mutex_enter(&freemem_lock);
			if (availrmem >= pages_pp_maximum + pgcnt) {
				availrmem -= pgcnt;
				pages_useclaim += pgcnt;
			} else {
				VM_STAT_ADD(anonvmstats.privatepages[7]);
				mutex_exit(&freemem_lock);
				if (ahmpages != NULL) {
					mutex_exit(ahmpages);
				}
				if (prealloc) {
					page_free_replacement_page(pplist);
					page_create_putback(pgcnt);
				}
				for (pg_idx = 0; pg_idx < pgcnt; pg_idx++)
					if (ppa[pg_idx] != NULL)
						page_unlock(ppa[pg_idx]);
				return (ENOMEM);
			}
			mutex_exit(&freemem_lock);
		}
	}

	CPU_STATS_ADD_K(vm, cow_fault, pgcnt);

	VM_STAT_ADD(anonvmstats.privatepages[8]);

	an_idx = start_idx;
	pg_idx = 0;
	vaddr = addr;
	for (; pg_idx < pgcnt; pg_idx++, an_idx++, vaddr += PAGESIZE) {
		ASSERT(ppa[pg_idx] != NULL);
		oldap = anon_get_ptr(amp->ahp, an_idx);
		ASSERT(ahmpages != NULL || oldap == NULL);
		ASSERT(ahmpages == NULL || oldap != NULL);
		ASSERT(ahmpages == NULL || oldap->an_refcnt > 1);
		ASSERT(ahmpages == NULL || pg_idx != 0 ||
		    (refcnt = oldap->an_refcnt));
		ASSERT(ahmpages == NULL || pg_idx == 0 ||
		    refcnt == oldap->an_refcnt);

		ap = anon_alloc(NULL, 0);

		swap_xlate(ap, &vp, &off);

		/*
		 * Now set up our preallocated page to pass down to
		 * swap_getpage().
		 */
		if (prealloc) {
			pp = pplist;
			page_sub(&pplist, pp);
			conpp = pp;
		}

		err = swap_getconpage(vp, (u_offset_t)off, PAGESIZE, NULL, pl,
		    PAGESIZE, conpp, NULL, &nreloc, seg, vaddr,
		    S_CREATE, cred);

		/*
		 * Impossible to fail since this is S_CREATE.
		 */
		if (err)
			panic("anon_map_privatepages: VOP_GETPAGE failed");

		ASSERT(prealloc ? pp == pl[0] : pl[0]->p_szc == 0);
		ASSERT(prealloc == 0 || nreloc == 1);

		pp = pl[0];

		/*
		 * If the original page was locked, we need to move
		 * the lock to the new page by transferring
		 * 'cowcnt/lckcnt' of the original page to 'cowcnt/lckcnt'
		 * of the new page. pg_idx can be used to index
		 * into the vpage array since the caller will guarantee
		 * that the vpage struct passed in corresponds to addr
		 * and forward.
		 */
		if (vpage != NULL && VPP_ISPPLOCK(&vpage[pg_idx])) {
			page_pp_useclaim(ppa[pg_idx], pp, prot & PROT_WRITE);
		} else if (pagelock) {
			mutex_enter(&freemem_lock);
			availrmem++;
			pages_useclaim--;
			mutex_exit(&freemem_lock);
		}

		/*
		 * Now copy the contents from the original page.
		 */
		if (ppcopy(ppa[pg_idx], pp) == 0) {
			/*
			 * Before ppcopy could handle UE or other faults, we
			 * would have panicked here, and still have no option
			 * but to do so now.
			 */
			panic("anon_map_privatepages, ppcopy failed");
		}

		hat_setrefmod(pp);		/* mark as modified */

		/*
		 * Release the lock on the original page,
		 * decrement the old slot, and downgrade the lock
		 * on the new copy.
		 */
		page_unlock(ppa[pg_idx]);

		if (!prealloc)
			page_downgrade(pp);

		ppa[pg_idx] = pp;

		/*
		 * Now reflect the copy in the new anon array.
		 */
		ASSERT(ahmpages == NULL || oldap->an_refcnt > 1);
		if (oldap != NULL)
			anon_decref(oldap);
		(void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP);
	}

	/*
	 * Unload the old large page translation.
	 */
	hat_unload(seg->s_as->a_hat, addr, pgcnt << PAGESHIFT, HAT_UNLOAD);

	if (ahmpages != NULL) {
		mutex_exit(ahmpages);
	}
	ASSERT(prealloc == 0 || pplist == NULL);
	if (prealloc) {
		VM_STAT_ADD(anonvmstats.privatepages[9]);
		for (pg_idx = 0; pg_idx < pgcnt; pg_idx++) {
			page_downgrade(ppa[pg_idx]);
		}
	}

	return (0);
}

/*
 * Allocate a private zero-filled anon page.
 */
page_t *
anon_zero(struct seg *seg, caddr_t addr, struct anon **app, struct cred *cred)
{
	struct anon *ap;
	page_t *pp;
	struct vnode *vp;
	anoff_t off;
	page_t *anon_pl[1 + 1];
	int err;

	*app = ap = anon_alloc(NULL, 0);
	swap_xlate(ap, &vp, &off);

	/*
	 * Call the VOP_GETPAGE routine to create the page, thereby
	 * enabling the vnode driver to allocate any filesystem
	 * dependent structures (e.g., disk block allocation for UFS).
	 * This also prevents more than one page from being added to
	 * the vnode at the same time since it is locked.
	 */
	err = VOP_GETPAGE(vp, off, PAGESIZE, NULL,
	    anon_pl, PAGESIZE, seg, addr, S_CREATE, cred, NULL);
	if (err) {
		*app = NULL;
		anon_decref(ap);
		return (NULL);
	}
	pp = anon_pl[0];

	pagezero(pp, 0, PAGESIZE);	/* XXX - should set mod bit */
	page_downgrade(pp);
	CPU_STATS_ADD_K(vm, zfod, 1);
	hat_setrefmod(pp);	/* mark as modified so pageout writes back */
	return (pp);
}
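
/*
 * A minimal usage sketch: this is the zero-fill-on-demand pattern that
 * the szc == 0 path of anon_map_getpages() above already uses. "amp"
 * and "an_idx" are assumed to come from the faulting segment.
 *
 *	struct anon *ap;
 *	page_t *pp;
 *
 *	pp = anon_zero(seg, addr, &ap, cred);
 *	if (pp == NULL)
 *		return (ENOMEM);
 *	(void) anon_set_ptr(amp->ahp, an_idx, ap, ANON_SLEEP);
 *	// pp comes back with its selock downgraded to shared;
 *	// unlock it once it has been mapped in
 */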


/*
 * Allocate array of private zero-filled anon pages for empty slots
 * and kept pages for non empty slots within given range.
 *
 * NOTE: This routine will try and use large pages
 *	 if available and supported by underlying platform.
 */
int
anon_map_createpages(
	struct anon_map *amp,
	ulong_t start_index,
	size_t len,
	page_t *ppa[],
	struct seg *seg,
	caddr_t addr,
	enum seg_rw rw,
	struct cred *cred)
{

	struct anon	*ap;
	struct vnode	*ap_vp;
	page_t		*pp, *pplist, *anon_pl[1 + 1], *conpp = NULL;
	int		err = 0;
	ulong_t		p_index, index;
	pgcnt_t		npgs, pg_cnt;
	spgcnt_t	nreloc = 0;
	uint_t		l_szc, szc, prot;
	anoff_t		ap_off;
	size_t		pgsz;
	lgrp_t		*lgrp;
	kmutex_t	*ahm;

	/*
	 * XXX For now only handle S_CREATE.
	 */
	ASSERT(rw == S_CREATE);

	index = start_index;
	p_index = 0;
	npgs = btopr(len);

	/*
	 * If this platform supports multiple page sizes
	 * then try and allocate directly from the free
	 * list for pages larger than PAGESIZE.
	 *
	 * NOTE: When we have page_create_ru we can stop
	 *	 directly allocating from the freelist.
	 */
	l_szc = seg->s_szc;
	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
	while (npgs) {

		/*
		 * if anon slot already exists
		 *   (means page has been created)
		 * so 1) look up the page
		 *    2) if the page is still in memory, get it.
		 *    3) if not, create a page and
		 *	 page in from physical swap device.
		 * These are done in anon_getpage().
		 */
		ap = anon_get_ptr(amp->ahp, index);
		if (ap) {
			err = anon_getpage(&ap, &prot, anon_pl, PAGESIZE,
			    seg, addr, S_READ, cred);
			if (err) {
				ANON_LOCK_EXIT(&amp->a_rwlock);
				panic("anon_map_createpages: anon_getpage");
			}
			pp = anon_pl[0];
			ppa[p_index++] = pp;

			/*
			 * an_pvp can become non-NULL after SysV's page was
			 * paged out before ISM was attached to this SysV
			 * shared memory segment. So free swap slot if needed.
			 */
			if (ap->an_pvp != NULL) {
				page_io_lock(pp);
				ahm = AH_MUTEX(ap->an_vp, ap->an_off);
				mutex_enter(ahm);
				if (ap->an_pvp != NULL) {
					swap_phys_free(ap->an_pvp,
					    ap->an_poff, PAGESIZE);
					ap->an_pvp = NULL;
					ap->an_poff = 0;
					mutex_exit(ahm);
					hat_setmod(pp);
				} else {
					mutex_exit(ahm);
				}
				page_io_unlock(pp);
			}

			addr += PAGESIZE;
			index++;
			npgs--;
			continue;
		}
		/*
		 * Now try and allocate the largest page possible
		 * for the current address and range.
		 * Keep dropping down in page size until:
		 *
		 *	1) Properly aligned
		 *	2) Does not overlap existing anon pages
		 *	3) Fits in remaining range.
		 *	4) Able to allocate one.
		 *
		 * NOTE: XXX When page_create_ru is completed this code
		 *	 will change.
		 */
		szc = l_szc;
		pplist = NULL;
		pg_cnt = 0;
		while (szc) {
			pgsz = page_get_pagesize(szc);
			pg_cnt = pgsz >> PAGESHIFT;
			if (IS_P2ALIGNED(addr, pgsz) && pg_cnt <= npgs &&
			    anon_pages(amp->ahp, index, pg_cnt) == 0) {
				/*
				 * XXX
				 * Since we are faking page_create()
				 * we also need to do the freemem and
				 * pcf accounting.
				 */
				(void) page_create_wait(pg_cnt, PG_WAIT);

				/*
				 * Get lgroup to allocate next page of shared
				 * memory from and use it to specify where to
				 * allocate the physical memory
				 */
				lgrp = lgrp_mem_choose(seg, addr, pgsz);

				pplist = page_get_freelist(
				    anon_vp, (u_offset_t)0, seg,
				    addr, pgsz, 0, lgrp);

				if (pplist == NULL) {
					page_create_putback(pg_cnt);
				}

				/*
				 * If a request for a page of size
				 * larger than PAGESIZE failed
				 * then don't try that size anymore.
				 */
				if (pplist == NULL) {
					l_szc = szc - 1;
				} else {
					break;
				}
			}
			szc--;
		}

		/*
		 * If just using PAGESIZE pages then don't
		 * directly allocate from the free list.
		 */
		if (pplist == NULL) {
			ASSERT(szc == 0);
			pp = anon_zero(seg, addr, &ap, cred);
			if (pp == NULL) {
				ANON_LOCK_EXIT(&amp->a_rwlock);
				panic("anon_map_createpages: anon_zero");
			}
			ppa[p_index++] = pp;

			ASSERT(anon_get_ptr(amp->ahp, index) == NULL);
			(void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP);

			addr += PAGESIZE;
			index++;
			npgs--;
			continue;
		}

		/*
		 * pplist is a list of pg_cnt PAGESIZE pages.
		 * These pages are locked SE_EXCL since they
		 * came directly off the free list.
		 */
		ASSERT(IS_P2ALIGNED(pg_cnt, pg_cnt));
		ASSERT(IS_P2ALIGNED(index, pg_cnt));
		ASSERT(conpp == NULL);
		while (pg_cnt--) {

			ap = anon_alloc(NULL, 0);
			swap_xlate(ap, &ap_vp, &ap_off);

			ASSERT(pplist != NULL);
			pp = pplist;
			page_sub(&pplist, pp);
			PP_CLRFREE(pp);
			PP_CLRAGED(pp);
			conpp = pp;

			err = swap_getconpage(ap_vp, ap_off, PAGESIZE,
			    (uint_t *)NULL, anon_pl, PAGESIZE, conpp, NULL,
			    &nreloc, seg, addr, S_CREATE, cred);

			if (err) {
				ANON_LOCK_EXIT(&amp->a_rwlock);
				panic("anon_map_createpages: S_CREATE");
			}

			ASSERT(anon_pl[0] == pp);
			ASSERT(nreloc == 1);
			pagezero(pp, 0, PAGESIZE);
			CPU_STATS_ADD_K(vm, zfod, 1);
			hat_setrefmod(pp);

			ASSERT(anon_get_ptr(amp->ahp, index) == NULL);
			(void) anon_set_ptr(amp->ahp, index, ap, ANON_SLEEP);

			ppa[p_index++] = pp;

			addr += PAGESIZE;
			index++;
			npgs--;
		}
		conpp = NULL;
		pg_cnt = pgsz >> PAGESHIFT;
		p_index = p_index - pg_cnt;
		while (pg_cnt--) {
			page_downgrade(ppa[p_index++]);
		}
	}
	ANON_LOCK_EXIT(&amp->a_rwlock);
	return (0);
}
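
/*
 * A minimal caller-side sketch (hypothetical; segspt follows this
 * general pattern when building ISM segments): create and keep every
 * page of a range up front. The ppa array must have room for
 * btopr(len) page pointers.
 *
 *	pgcnt_t npages = btopr(len);
 *	page_t **ppa = kmem_zalloc(npages * sizeof (page_t *), KM_SLEEP);
 *
 *	if (anon_map_createpages(amp, 0, len, ppa, seg, seg->s_base,
 *	    S_CREATE, cred) == 0) {
 *		// all npages pages are now kept and locked in ppa[]
 *	}
 */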

static int
anon_try_demote_pages(
	struct anon_hdr *ahp,
	ulong_t sidx,
	uint_t szc,
	page_t **ppa,
	int private)
{
	struct anon	*ap;
	pgcnt_t		pgcnt = page_get_pagecnt(szc);
	page_t		*pp;
	pgcnt_t		i;
	kmutex_t	*ahmpages = NULL;
	int		root = 0;
	pgcnt_t		npgs;
	pgcnt_t		curnpgs = 0;
	size_t		ppasize = 0;

	ASSERT(szc != 0);
	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
	ASSERT(IS_P2ALIGNED(sidx, pgcnt));
	ASSERT(sidx < ahp->size);

	if (ppa == NULL) {
		ppasize = pgcnt * sizeof (page_t *);
		ppa = kmem_alloc(ppasize, KM_SLEEP);
	}

	ap = anon_get_ptr(ahp, sidx);
	if (ap != NULL && private) {
		VM_STAT_ADD(anonvmstats.demotepages[1]);
		ahmpages = APH_MUTEX(ap->an_vp, ap->an_off);
		mutex_enter(ahmpages);
	}

	if (ap != NULL && ap->an_refcnt > 1) {
		if (ahmpages != NULL) {
			VM_STAT_ADD(anonvmstats.demotepages[2]);
			mutex_exit(ahmpages);
		}
		if (ppasize != 0) {
			kmem_free(ppa, ppasize);
		}
		return (0);
	}
	if (ahmpages != NULL) {
		mutex_exit(ahmpages);
	}
	if (ahp->size - sidx < pgcnt) {
		ASSERT(private == 0);
		pgcnt = ahp->size - sidx;
	}
	for (i = 0; i < pgcnt; i++, sidx++) {
		ap = anon_get_ptr(ahp, sidx);
		if (ap != NULL) {
			if (ap->an_refcnt != 1) {
				panic("anon_try_demote_pages: an_refcnt != 1");
			}
			pp = ppa[i] = page_lookup(ap->an_vp, ap->an_off,
			    SE_EXCL);
			if (pp != NULL) {
				(void) hat_pageunload(pp,
				    HAT_FORCE_PGUNLOAD);
			}
		} else {
			ppa[i] = NULL;
		}
	}
	for (i = 0; i < pgcnt; i++) {
		if ((pp = ppa[i]) != NULL && pp->p_szc != 0) {
			ASSERT(pp->p_szc <= szc);
			if (!root) {
				VM_STAT_ADD(anonvmstats.demotepages[3]);
				if (curnpgs != 0)
					panic("anon_try_demote_pages: "
					    "bad large page");

				root = 1;
				curnpgs = npgs =
				    page_get_pagecnt(pp->p_szc);

				ASSERT(npgs <= pgcnt);
				ASSERT(IS_P2ALIGNED(npgs, npgs));
				ASSERT(!(page_pptonum(pp) & (npgs - 1)));
			} else {
				ASSERT(i > 0);
				ASSERT(page_pptonum(pp) - 1 ==
				    page_pptonum(ppa[i - 1]));
				if ((page_pptonum(pp) & (npgs - 1)) ==
				    npgs - 1)
					root = 0;
			}
			ASSERT(PAGE_EXCL(pp));
			pp->p_szc = 0;
			ASSERT(curnpgs > 0);
			curnpgs--;
		}
	}
	if (root != 0 || curnpgs != 0)
		panic("anon_try_demote_pages: bad large page");

	for (i = 0; i < pgcnt; i++) {
		if ((pp = ppa[i]) != NULL) {
			ASSERT(!hat_page_is_mapped(pp));
			ASSERT(pp->p_szc == 0);
			page_unlock(pp);
		}
	}
	if (ppasize != 0) {
		kmem_free(ppa, ppasize);
	}
	return (1);
}

/*
 * anon_map_demotepages() can only be called by MAP_PRIVATE segments.
 */
int
anon_map_demotepages(
	struct anon_map *amp,
	ulong_t	start_idx,
	struct seg *seg,
	caddr_t addr,
	uint_t prot,
	struct vpage vpage[],
	struct cred *cred)
{
	struct anon	*ap;
	uint_t		szc = seg->s_szc;
	pgcnt_t		pgcnt = page_get_pagecnt(szc);
	size_t		ppasize = pgcnt * sizeof (page_t *);
	page_t		**ppa = kmem_alloc(ppasize, KM_SLEEP);
	page_t		*pp;
	page_t		*pl[2];
	pgcnt_t		i, pg_idx;
	ulong_t		an_idx;
	caddr_t		vaddr;
	int		err;
	int		retry = 0;
	uint_t		vpprot;

	ASSERT(RW_WRITE_HELD(&amp->a_rwlock));
	ASSERT(IS_P2ALIGNED(pgcnt, pgcnt));
	ASSERT(IS_P2ALIGNED(start_idx, pgcnt));
	ASSERT(ppa != NULL);
	ASSERT(szc != 0);
	ASSERT(szc == amp->a_szc);

	VM_STAT_ADD(anonvmstats.demotepages[0]);

top:
	if (anon_try_demote_pages(amp->ahp, start_idx, szc, ppa, 1)) {
		kmem_free(ppa, ppasize);
		return (0);
	}

	VM_STAT_ADD(anonvmstats.demotepages[4]);

	ASSERT(retry == 0);		/* we can be here only once */

	vaddr = addr;
	for (pg_idx = 0, an_idx = start_idx; pg_idx < pgcnt;
	    pg_idx++, an_idx++, vaddr += PAGESIZE) {
		ap = anon_get_ptr(amp->ahp, an_idx);
		if (ap == NULL)
			panic("anon_map_demotepages: no anon slot");
		err = anon_getpage(&ap, &vpprot, pl, PAGESIZE, seg, vaddr,
		    S_READ, cred);
		if (err) {
			for (i = 0; i < pg_idx; i++) {
				if ((pp = ppa[i]) != NULL)
					page_unlock(pp);
			}
			kmem_free(ppa, ppasize);
			return (err);
		}
		ppa[pg_idx] = pl[0];
	}

	err = anon_map_privatepages(amp, start_idx, szc, seg, addr, prot, ppa,
	    vpage, -1, 0, cred);
	if (err > 0) {
		VM_STAT_ADD(anonvmstats.demotepages[5]);
		kmem_free(ppa, ppasize);
		return (err);
	}
	ASSERT(err == 0 || err == -1);
	if (err == -1) {
		VM_STAT_ADD(anonvmstats.demotepages[6]);
		retry = 1;
		goto top;
	}
	for (i = 0; i < pgcnt; i++) {
		ASSERT(ppa[i] != NULL);
		if (ppa[i]->p_szc != 0)
			retry = 1;
		page_unlock(ppa[i]);
	}
	if (retry) {
		VM_STAT_ADD(anonvmstats.demotepages[7]);
		goto top;
	}

	VM_STAT_ADD(anonvmstats.demotepages[8]);

	kmem_free(ppa, ppasize);

	return (0);
}

/*
 * Free pages of shared anon map. It's assumed that anon maps don't share anon
 * structures with private anon maps. Therefore all anon structures should
 * have at most one reference at this point. This means underlying pages can
 * be exclusively locked and demoted or freed. If we are not freeing the
 * entire large pages, demote the ends of the region we free so that the
 * subpages can be freed. Page roots correspond to aligned index positions in
 * the anon map.
 */
void
anon_shmap_free_pages(struct anon_map *amp, ulong_t sidx, size_t len)
{
	ulong_t eidx = sidx + btopr(len);
	pgcnt_t pages = page_get_pagecnt(amp->a_szc);
	struct anon_hdr *ahp = amp->ahp;
	ulong_t tidx;
	size_t size;
	ulong_t sidx_aligned;
	ulong_t eidx_aligned;

	ASSERT(ANON_WRITE_HELD(&amp->a_rwlock));
	ASSERT(amp->refcnt <= 1);
	ASSERT(amp->a_szc > 0);
	ASSERT(eidx <= ahp->size);
	ASSERT(!anon_share(ahp, sidx, btopr(len)));

	if (len == 0) {		/* XXX */
		return;
	}

	sidx_aligned = P2ALIGN(sidx, pages);
	if (sidx_aligned != sidx ||
	    (eidx < sidx_aligned + pages && eidx < ahp->size)) {
		if (!anon_try_demote_pages(ahp, sidx_aligned,
		    amp->a_szc, NULL, 0)) {
			panic("anon_shmap_free_pages: demote failed");
		}
		size = (eidx <= sidx_aligned + pages) ? (eidx - sidx) :
		    P2NPHASE(sidx, pages);
		size <<= PAGESHIFT;
		anon_free(ahp, sidx, size);
		sidx = sidx_aligned + pages;
		if (eidx <= sidx) {
			return;
		}
	}
	eidx_aligned = P2ALIGN(eidx, pages);
	if (sidx < eidx_aligned) {
		anon_free_pages(ahp, sidx,
		    (eidx_aligned - sidx) << PAGESHIFT,
		    amp->a_szc);
		sidx = eidx_aligned;
	}
	ASSERT(sidx == eidx_aligned);
	if (eidx == eidx_aligned) {
		return;
	}
	tidx = eidx;
	if (eidx != ahp->size && anon_get_next_ptr(ahp, &tidx) != NULL &&
	    tidx - sidx < pages) {
		if (!anon_try_demote_pages(ahp, sidx, amp->a_szc, NULL, 0)) {
			panic("anon_shmap_free_pages: demote failed");
		}
		size = (eidx - sidx) << PAGESHIFT;
		anon_free(ahp, sidx, size);
	} else {
		anon_free_pages(ahp, sidx, pages << PAGESHIFT, amp->a_szc);
	}
}
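
/*
 * A worked example of the demote-the-ends logic (hypothetical numbers):
 * with amp->a_szc giving pages = 8, freeing sidx = 3 through eidx = 13
 * proceeds as follows. The head is not root-aligned (P2ALIGN(3, 8) = 0),
 * so the large page rooted at index 0 is demoted and slots 3..7
 * (P2NPHASE(3, 8) = 5 slots) are freed as PAGESIZE pages, advancing
 * sidx to 8. Then eidx_aligned = P2ALIGN(13, 8) = 8, so no whole large
 * pages are freed, and the partial tail [8, 13) is demoted (or the
 * whole large page at 8 is freed if no anon slot survives within it)
 * before anon_free() releases it.
 */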

/*
 * This routine should be called with amp's writer lock when there are no
 * other users of amp. All pcache entries of this amp must have been already
 * inactivated. We must not drop a_rwlock here to prevent new users from
 * attaching to this amp.
 */
void
anonmap_purge(struct anon_map *amp)
{
	ASSERT(ANON_WRITE_HELD(&amp->a_rwlock));
	ASSERT(amp->refcnt <= 1);

	if (amp->a_softlockcnt != 0) {
		seg_ppurge(NULL, amp, 0);
	}

	/*
	 * Since all pcache entries were already inactive before this routine
	 * was called, seg_ppurge() couldn't return while there are still
	 * entries that can be found via the list anchored at a_phead. So we
	 * can assert this list is empty now. a_softlockcnt may still be
	 * non-zero if the asynchronous thread that manages pcache already
	 * removed pcache entries but hasn't unlocked the pages yet. If
	 * a_softlockcnt is non-zero we just wait on a_purgecv for
	 * shamp_reclaim() to finish. Even if a_softlockcnt is 0 we grab
	 * a_purgemtx to avoid freeing the anon map before shamp_reclaim()
	 * is done with it. a_purgemtx, also taken by shamp_reclaim() while
	 * a_softlockcnt was still not 0, acts as a barrier that prevents
	 * anonmap_purge() from completing while shamp_reclaim() may still
	 * be referencing this amp.
	 */
	ASSERT(amp->a_phead.p_lnext == &amp->a_phead);
	ASSERT(amp->a_phead.p_lprev == &amp->a_phead);

	mutex_enter(&amp->a_purgemtx);
	while (amp->a_softlockcnt != 0) {
		ASSERT(amp->a_phead.p_lnext == &amp->a_phead);
		ASSERT(amp->a_phead.p_lprev == &amp->a_phead);
		amp->a_purgewait = 1;
		cv_wait(&amp->a_purgecv, &amp->a_purgemtx);
	}
	mutex_exit(&amp->a_purgemtx);

	ASSERT(amp->a_phead.p_lnext == &amp->a_phead);
	ASSERT(amp->a_phead.p_lprev == &amp->a_phead);
	ASSERT(amp->a_softlockcnt == 0);
}

/*
 * Allocate and initialize an anon_map structure for seg
 * associating the given swap reservation with the new anon_map.
 */
struct anon_map *
anonmap_alloc(size_t size, size_t swresv, int flags)
{
	struct anon_map *amp;
	int kmflags = (flags & ANON_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;

	amp = kmem_cache_alloc(anonmap_cache, kmflags);
	if (amp == NULL) {
		ASSERT(kmflags == KM_NOSLEEP);
		return (NULL);
	}

	amp->ahp = anon_create(btopr(size), flags);
	if (amp->ahp == NULL) {
		ASSERT(flags == ANON_NOSLEEP);
		kmem_cache_free(anonmap_cache, amp);
		return (NULL);
	}
	amp->refcnt = 1;
	amp->size = size;
	amp->swresv = swresv;
	amp->locality = 0;
	amp->a_szc = 0;
	amp->a_sp = NULL;
	amp->a_softlockcnt = 0;
	amp->a_purgewait = 0;
	amp->a_phead.p_lnext = &amp->a_phead;
	amp->a_phead.p_lprev = &amp->a_phead;

	return (amp);
}
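
/*
 * A minimal lifecycle sketch (hypothetical; segment drivers such as
 * segvn follow this general pattern): allocate the map with its swap
 * reservation, and on teardown drop the last reference under the
 * writer lock before freeing. The swap reservation step is shown only
 * as an assumption about the caller's setup.
 *
 *	struct anon_map *amp;
 *
 *	amp = anonmap_alloc(size, swresv, ANON_SLEEP);
 *	// ... use the map ...
 *	ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
 *	if (--amp->refcnt == 0) {
 *		ANON_LOCK_EXIT(&amp->a_rwlock);
 *		anonmap_free(amp);
 *	} else {
 *		ANON_LOCK_EXIT(&amp->a_rwlock);
 *	}
 */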

void
anonmap_free(struct anon_map *amp)
{
	ASSERT(amp->ahp != NULL);
	ASSERT(amp->refcnt == 0);
	ASSERT(amp->a_softlockcnt == 0);
	ASSERT(amp->a_phead.p_lnext == &amp->a_phead);
	ASSERT(amp->a_phead.p_lprev == &amp->a_phead);

	lgrp_shm_policy_fini(amp, NULL);
	anon_release(amp->ahp, btopr(amp->size));
	kmem_cache_free(anonmap_cache, amp);
}

/*
 * Returns true if the anon array has some empty slots.
 * The offp and lenp parameters are in/out parameters. On entry
 * these values represent the starting offset and length of the
 * mapping. When true is returned, these values may be modified
 * to be the largest range which includes empty slots.
 */
int
non_anon(struct anon_hdr *ahp, ulong_t anon_idx, u_offset_t *offp,
    size_t *lenp)
{
	ulong_t i, el;
	ssize_t low, high;
	struct anon *ap;

	low = -1;
	for (i = 0, el = *lenp; i < el; i += PAGESIZE, anon_idx++) {
		ap = anon_get_ptr(ahp, anon_idx);
		if (ap == NULL) {
			if (low == -1)
				low = i;
			high = i;
		}
	}
	if (low != -1) {
		/*
		 * Found at least one non-anon page.
		 * Set up the off and len return values.
		 */
		if (low != 0)
			*offp += low;
		*lenp = high - low + PAGESIZE;
		return (1);
	}
	return (0);
}
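
/*
 * A worked example (hypothetical numbers): with four slots covering
 * [*offp, *offp + 4 * PAGESIZE) where only slots 1 and 2 are empty,
 * non_anon() returns 1 with low = PAGESIZE and high = 2 * PAGESIZE,
 * so *offp is advanced by PAGESIZE and *lenp becomes
 * high - low + PAGESIZE = 2 * PAGESIZE: the smallest range bounded by
 * the first and last empty slots (any populated slots in between are
 * included in the range).
 */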

/*
 * Return a count of the number of existing anon pages in the anon array
 * in the slot range [anon_index, anon_index + nslots). The array and
 * slots must be guaranteed stable by the caller.
 */
pgcnt_t
anon_pages(struct anon_hdr *ahp, ulong_t anon_index, pgcnt_t nslots)
{
	pgcnt_t cnt = 0;

	while (nslots-- > 0) {
		if ((anon_get_ptr(ahp, anon_index)) != NULL)
			cnt++;
		anon_index++;
	}
	return (cnt);
}

/*
 * Move reserved phys swap into memory swap (unreserve phys swap
 * and reserve mem swap by the same amount).
 * Used by segspt when it needs to lock reserved swap npages in memory.
 */
int
anon_swap_adjust(pgcnt_t npages)
{
	pgcnt_t	unlocked_mem_swap;

	mutex_enter(&anoninfo_lock);

	ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
	ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);

	unlocked_mem_swap = k_anoninfo.ani_mem_resv
	    - k_anoninfo.ani_locked_swap;
	if (npages > unlocked_mem_swap) {
		spgcnt_t adjusted_swap = npages - unlocked_mem_swap;

		/*
		 * if there is not enough unlocked mem swap we take the
		 * missing amount from phys swap and give it to mem swap
		 */
		if (!page_reclaim_mem(adjusted_swap, segspt_minfree, 1)) {
			mutex_exit(&anoninfo_lock);
			return (ENOMEM);
		}

		k_anoninfo.ani_mem_resv += adjusted_swap;
		ASSERT(k_anoninfo.ani_phys_resv >= adjusted_swap);
		k_anoninfo.ani_phys_resv -= adjusted_swap;

		ANI_ADD(adjusted_swap);
	}
	k_anoninfo.ani_locked_swap += npages;

	ASSERT(k_anoninfo.ani_mem_resv >= k_anoninfo.ani_locked_swap);
	ASSERT(k_anoninfo.ani_max >= k_anoninfo.ani_phys_resv);

	mutex_exit(&anoninfo_lock);

	return (0);
}
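
/*
 * A worked example of the ledger arithmetic (hypothetical numbers):
 * suppose ani_mem_resv = 100, ani_locked_swap = 90, and a caller asks
 * to lock npages = 30. Only 100 - 90 = 10 pages of mem swap are
 * unlocked, so adjusted_swap = 20 pages are reclaimed and moved from
 * the phys-swap ledger to the mem-swap ledger: ani_mem_resv becomes
 * 120, ani_phys_resv drops by 20, and ani_locked_swap becomes 120,
 * preserving the invariant ani_mem_resv >= ani_locked_swap.
 */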

/*
 * 'Unlock' reserved mem swap so that when it is unreserved it
 * can be moved back to phys (disk) swap.
 */
void
anon_swap_restore(pgcnt_t npages)
{
	mutex_enter(&anoninfo_lock);

	ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv);

	ASSERT(k_anoninfo.ani_locked_swap >= npages);
	k_anoninfo.ani_locked_swap -= npages;

	ASSERT(k_anoninfo.ani_locked_swap <= k_anoninfo.ani_mem_resv);

	mutex_exit(&anoninfo_lock);
}

/*
 * Return the pointer from the list for a
 * specified anon index.
 */
ulong_t *
anon_get_slot(struct anon_hdr *ahp, ulong_t an_idx)
{
	struct anon	**app;
	void		**ppp;

	ASSERT(an_idx < ahp->size);

	/*
	 * Single level case.
	 */
	if ((ahp->size <= ANON_CHUNK_SIZE) || (ahp->flags & ANON_ALLOC_FORCE)) {
		return ((ulong_t *)&ahp->array_chunk[an_idx]);
	} else {

		/*
		 * 2 level case.
		 */
		ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
		if (*ppp == NULL) {
			mutex_enter(&ahp->serial_lock);
			ppp = &ahp->array_chunk[an_idx >> ANON_CHUNK_SHIFT];
			if (*ppp == NULL)
				*ppp = kmem_zalloc(PAGESIZE, KM_SLEEP);
			mutex_exit(&ahp->serial_lock);
		}
		app = *ppp;
		return ((ulong_t *)&app[an_idx & ANON_CHUNK_OFF]);
	}
}
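
/*
 * A worked example of the 2-level index math (assuming a chunk holds
 * 1024 entries, e.g. PAGESIZE / sizeof (struct anon *) with 4K pages
 * and 4-byte pointers): for an_idx = 2500, the first level is indexed
 * by an_idx >> ANON_CHUNK_SHIFT = 2 (the third chunk), and the slot
 * within that chunk is an_idx & ANON_CHUNK_OFF = 452, since
 * 2500 = 2 * 1024 + 452.
 */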

void
anon_array_enter(struct anon_map *amp, ulong_t an_idx, anon_sync_obj_t *sobj)
{
	ulong_t *ap_slot;
	kmutex_t *mtx;
	kcondvar_t *cv;
	int hash;

	/*
	 * Use szc to determine anon slot(s) to appear atomic.
	 * If szc = 0, then lock the anon slot and mark it busy.
	 * If szc > 0, then lock the range of slots by getting the
	 * anon_array_lock for the first anon slot, and mark only the
	 * first anon slot busy to represent whole range being busy.
	 */

	ASSERT(RW_READ_HELD(&amp->a_rwlock));
	an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc));
	hash = ANON_ARRAY_HASH(amp, an_idx);
	sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex;
	sobj->sync_cv = cv = &anon_array_cv[hash];
	mutex_enter(mtx);
	ap_slot = anon_get_slot(amp->ahp, an_idx);
	while (ANON_ISBUSY(ap_slot))
		cv_wait(cv, mtx);
	ANON_SETBUSY(ap_slot);
	sobj->sync_data = ap_slot;
	mutex_exit(mtx);
}

int
anon_array_try_enter(struct anon_map *amp, ulong_t an_idx,
    anon_sync_obj_t *sobj)
{
	ulong_t *ap_slot;
	kmutex_t *mtx;
	int hash;

	/*
	 * Try to lock a range of anon slots.
	 * Use szc to determine anon slot(s) to appear atomic.
	 * If szc = 0, then lock the anon slot and mark it busy.
	 * If szc > 0, then lock the range of slots by getting the
	 * anon_array_lock for the first anon slot, and mark only the
	 * first anon slot busy to represent whole range being busy.
	 * Fail if the mutex or the anon_array are busy.
	 */

	ASSERT(RW_READ_HELD(&amp->a_rwlock));
	an_idx = P2ALIGN(an_idx, page_get_pagecnt(amp->a_szc));
	hash = ANON_ARRAY_HASH(amp, an_idx);
	sobj->sync_mutex = mtx = &anon_array_lock[hash].pad_mutex;
	sobj->sync_cv = &anon_array_cv[hash];
	if (!mutex_tryenter(mtx)) {
		return (EWOULDBLOCK);
	}
	ap_slot = anon_get_slot(amp->ahp, an_idx);
	if (ANON_ISBUSY(ap_slot)) {
		mutex_exit(mtx);
		return (EWOULDBLOCK);
	}
	ANON_SETBUSY(ap_slot);
	sobj->sync_data = ap_slot;
	mutex_exit(mtx);
	return (0);
}

void
anon_array_exit(anon_sync_obj_t *sobj)
{
	mutex_enter(sobj->sync_mutex);
	ASSERT(ANON_ISBUSY(sobj->sync_data));
	ANON_CLRBUSY(sobj->sync_data);
	if (CV_HAS_WAITERS(sobj->sync_cv))
		cv_broadcast(sobj->sync_cv);
	mutex_exit(sobj->sync_mutex);
}

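/*
 * A minimal usage sketch: anon_array_enter()/anon_array_exit() bracket
 * any manipulation of an anon slot so it appears atomic; this is the
 * same anon_sync_obj_t cookie pattern used earlier in this file. A
 * reader hold on amp->a_rwlock is a precondition of both enter paths.
 *
 *	anon_sync_obj_t cookie;
 *	struct anon *ap;
 *
 *	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
 *	anon_array_enter(amp, an_idx, &cookie);
 *	ap = anon_get_ptr(amp->ahp, an_idx);
 *	// ... examine or replace the slot ...
 *	anon_array_exit(&cookie);
 *	ANON_LOCK_EXIT(&amp->a_rwlock);
 */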